Code
library(ggplot2)
library(dplyr)
library(broom)
library(margins)
library(ggeffects)
library(patchwork)
library(viridis)
#devtools::install_github("vdeminstitute/vdemdata")
library(vdemdata)
First, load the packages required for the examples.
ggplot2
ggplot2
is one of the most powerful and flexible packages in R for creating data visualizations. It follows the Grammar of Graphics, meaning it builds plots in layers.
We’ll use the built-in mpg
dataset for the basic examples.
Every ggplot2
plot starts with the ggplot()
function, where you specify:
aes()
This creates a plot object, but it won’t render a chart until you add a layer.
You must add a geometry layer (geom_*()
) to display the data.
Common geometries:
geom_point()
– scatterplotgeom_line()
– line plotgeom_col()
– bar chart (with values)geom_bar()
– bar chart (with counts)geom_boxplot()
– boxplotgeom_histogram()
– histogramYou can map variables to color, size, shape, or linetype inside aes()
.
Use labs()
to customize the title, subtitle, and axis labels.
Use facet_wrap()
to split the data into multiple panels by category.
Themes control the look of the plot: grid, font, background.
Popular themes:
theme_minimal()
theme_classic()
theme_light()
theme_bw()
theme_void()
Use scale_*()
functions to adjust axes, colors, or sizes.
For the following examples, we use the Varities of Democracy (V-Dem)
dataset.
Outcomes:
Continuous: e_gdppc
GDP per Capita
Binary: democracy_binary
(recoded from v2x_regime
)
Predictors:
v2x_jucon
(judicial constraints)
v2x_corr
(control of corruption)
v2x_libdem
Liberal Democracy Index
e_regionpol_6C
(region)
Get Data:
## OLS Model
What is the effect of liberal democracy on GDP?
ols <- lm(e_gdppc ~ v2x_libdem+v2x_corr+ as.factor(e_regionpol_6C), data = vdem)
broom::tidy(ols, conf.int = TRUE) %>%
mutate(term = recode(term,
"v2x_libdem" = "Liberal Democracy",
"v2x_corr" = "Corruption Control",
"as.factor(e_regionpol_6C)2" = "Latin America & Caribbean",
"as.factor(e_regionpol_6C)3" = "Middle East & North Africa",
"as.factor(e_regionpol_6C)4" = "Sub-Saharan Africa",
"as.factor(e_regionpol_6C)5" = "Western Europe & North America",
"as.factor(e_regionpol_6C)6" = "Asia & Pacific"
# Region 1 is the reference category — not shown in output
))%>%
mutate(term = factor(term, levels = rev(c(
"Liberal Democracy",
"Corruption Control",
"Latin America & Caribbean",
"Middle East & North Africa",
"Sub-Saharan Africa",
"Western Europe & North America",
"Asia & Pacific",
"(Intercept)"
)))) %>%
ggplot(aes(x = estimate, y = term)) +
geom_point() +
geom_errorbar(aes(xmin = conf.low, xmax = conf.high), width = 0.2) +
labs(
title = "OLS Coefficient Plot: GDP",
x = "Estimate", y = NULL
) +
theme_minimal()
ggplot2
, the order of categorical variables on axes follows the factor level order.recode()
changes the labels; factor(..., levels = ...)
sets the plotting order.Computes predicted values based on a regression model you’ve specified.
Predictions follow the functional form you impose (e.g., linear relationship, additive terms).
Based on estimated coefficients, it calculates:
Fitted/predicted values
Confidence intervals (e.g., 95% CI)
vdem_clean <- vdem %>%
filter(!is.na(v2x_libdem), !is.na(e_gdppc))
ggplot(vdem_clean, aes(x = v2x_libdem, y =e_gdppc)) +
geom_point(alpha = 0.2, size = 1) +
geom_smooth(method = "loess", se = TRUE, alpha = 0.3) +
labs(
title = "Raw Data Smoothing: GDP by Liberal Democracy",
x = "Liberal Democracy Index",
y = "Observed GDP per Capita"
) +
theme_minimal(base_size = 13)
`geom_smooth()` using formula = 'y ~ x'
ggplot(freexp_preds, aes(x = as.factor(x), y = predicted)) +
geom_point()+
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
coord_flip() +
scale_x_discrete(labels = c(
"1" = "Eastern Europe & Central Asia",
"2" = "Latin America & Caribbean",
"3" = "Middle East & North Africa",
"4" = "Sub-Saharan Africa",
"5" = "Western Europe & North America",
"6" = "Asia & Pacific"
)) + scale_y_continuous(limits = c(0, 60)) +
labs(
title = "Predicted GDP by Region",
subtitle ="Mean Democracy Index = 0.24",
x = NULL,
y = "Predicted Index Value"
) +
theme_minimal(base_size = 13)
freexp_preds <- predict_response(ols, terms = c("v2x_libdem", "e_regionpol_6C"))
region_labels <- c(
"1" = "Eastern Europe & Central Asia",
"2" = "Latin America & Caribbean",
"3" = "Middle East & North Africa",
"4" = "Sub-Saharan Africa",
"5" = "Western Europe & North America",
"6" = "Asia & Pacific"
)
ggplot(freexp_preds, aes(x = x, y = predicted, color=group)) +
geom_line(linewidth=1.1)+
labs(
title = "Predicted GDP by Democracy",
x = "Democracy Index",
y = "Predicted GDP Value"
) + scale_color_viridis_d(labels = region_labels, name = "Region") +
theme_minimal(base_size = 13)
What is the effect of Judicial Constraints on Executive on Democracy?
logit_model <- glm(democracy_binary ~ v2x_jucon + v2x_corr + as.factor(e_regionpol_6C),
data = vdem, family = binomial)
broom::tidy(logit_model, conf.int = TRUE)%>%
mutate(term = recode(term,
"v2x_jucon" = "Judicial Constraints",
"v2x_corr" = "Corruption Control",
"as.factor(e_regionpol_6C)2" = "Latin America & Caribbean",
"as.factor(e_regionpol_6C)3" = "Middle East & North Africa",
"as.factor(e_regionpol_6C)4" = "Sub-Saharan Africa",
"as.factor(e_regionpol_6C)5" = "Western Europe & North America",
"as.factor(e_regionpol_6C)6" = "Asia & Pacific"
# Region 1 is the reference category — not shown in output
)) %>%
mutate(term = factor(term, levels = rev(c(
"Judicial Constraints",
"Corruption Control",
"Latin America & Caribbean",
"Middle East & North Africa",
"Sub-Saharan Africa",
"Western Europe & North America",
"Asia & Pacific",
"(Intercept)"
)))) %>%
ggplot(aes(x = estimate, y = term)) +
geom_point() +
geom_errorbar(aes(xmin = conf.low, xmax = conf.high), width = 0.2) +
labs(
title = "Logit Coefficient Plot: Democracy (Binary)",
x = "Estimate (Log Odds)", y = NULL
) +
theme_minimal()
dem_preds <- ggpredict(logit_model, terms = c("v2x_jucon [all]", "e_regionpol_6C"))
# Plot using ggplot and viridis color scale
ggplot(dem_preds, aes(x = x, y = predicted, color = group)) +
geom_line(linewidth = 1) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.15, color = NA) +
scale_color_viridis_d(labels = region_labels, name = "Region", end = 0.9) +
scale_fill_viridis_d(labels = region_labels, name = "Region", end = 0.9) +
labs(
title = "Predicted Probability of Democracy by Region",
x = "Judicial Constraints on Executive (1=No Constraints)",
y = "Pr(Democracy)"
) +
theme_minimal(base_size = 13)
interact <- lm(e_gdppc ~ v2x_corr * democracy_binary,
data = vdem)
corr_region_pred <- ggpredict(interact,
terms = c("v2x_corr", "democracy_binary"))
# Relabel group variable (0/1) to "Autocracy"/"Democracy"
corr_region_pred$group <- factor(corr_region_pred$group,
levels = c("0", "1"),
labels = c("Autocracy", "Democracy")
)
ggplot(corr_region_pred, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
scale_color_viridis_d(name = "Regime Type", end = 0.8) +
scale_fill_viridis_d(name = "Regime Type", end = 0.8) +
labs(
title = "Interaction: Corruption × Regime Type on GDP per Capita",
subtitle = "0 = less corrupt, 1 = more corrupt",
x = "Corruption",
y = "Predicted GDP per Capita"
) +
theme_minimal(base_size = 13)
For more details and help interpreting regression models in more depth, I highly recommend the marginaleffects
package.
It allows you to:
Spatial data in R comes in two main forms:
Represents features as points, lines, or polygons.
Used for things like cities or geocoded events (points), roads (lines), or countries (polygons).
sf
(Simple Features) — modern and tidy-friendly{sf}
sp
and integrates well with dplyr
, ggplot2
, and tidyverse
Stores data in grids/cells, such as satellite imagery, elevation, or temperature.
SpatRaster
(in {terra}) or RasterLayer (older {raster})Extract world map
ggplot vs. base plot
Map a Continuous Variable (GDP per Capita)
# Use Gapminder data (filter for latest year)
gap2007 <- gapminder::gapminder %>%
filter(year == 2007)
# Join with spatial data
world_gap <- world %>%
left_join(gap2007, by = c("name" = "country"))
# Plot map
ggplot(world_gap) +
geom_sf(aes(fill = gdpPercap)) +
scale_fill_viridis_c(option = "C", na.value = "grey80") +
labs(title = "GDP per Capita (2007)", fill = "GDP per Capita") +
theme_minimal()
Map a Categorical Variable (Continent)
Extract a country
Map population estimates per country
Map geocoded event locations on U.S. Map
### 🧪 Simulate and Map Protest Events Inside U.S. State Borders
us <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") %>%
filter(admin == "United States of America")
# Simulate random points inside bounding box
set.seed(123)
n_events <- 1000
raw_points <- tibble(
long = runif(n_events, min = -125, max = -66),
lat = runif(n_events, min = 25, max = 49),
protest_mass = sample(50:5000, n_events, replace = TRUE)
)
# Convert to sf points and intersect with US shape
points_sf <- st_as_sf(raw_points, coords = c("long", "lat"), crs = 4326)
points_us <- points_sf[st_intersects(points_sf, us, sparse = FALSE), ]
# Plot result
ggplot() +
geom_sf(data = us, fill = "grey90", color = "white") +
geom_sf(data = points_us, aes(size = protest_mass), color = "red", alpha = 0.6) +
scale_size(range = c(0.25, 3), name = "Protesters") +
labs(
title = "Simulated Protest Events in the U.S.",
x = NULL, y = NULL
) +
coord_sf(xlim = c(-125, -66), ylim = c(25, 50)) +
theme_minimal()
In this example, we scrape prison locations from OSM and map den on an interactive map.
First, get and prepare the data.
# bounding box of where we want to query data
q <- opq(bbox = st_bbox(st_transform(germany, 4326)))
ger <- opq(bbox = "Germany")
# Build the query of location of prisons
#osmq <- add_osm_feature(q, key = "amenity", value = "prison")
osmq <- add_osm_feature(ger, key = "amenity", value = "prison")
# And then query the data
pris.osm <- osmdata_sf(osmq)
# Make unique points / polygons
pris.osm <- unique_osmdata(pris.osm)
# Get points and polygons
pris.points <- pris.osm$osm_points
# filter only points within Germany
pris.points <- pris.osm$osm_points |>
st_as_sf() |>
st_filter(germany)
# Get polygons
poly <- pris.osm$osm_polygons |>
st_as_sf() |>
st_filter(germany)
multipoly <- pris.osm$osm_multipolygons |>
st_as_sf() |> st_make_valid() |>
st_filter(germany)
# Harmonize columns (keep only common ones, or add missing if needed)
common_cols <- intersect(names(pris.points), names(poly)) |>
intersect(names(multipoly))
pris.points <- pris.points[, common_cols]
poly <- poly[, common_cols]
multipoly <- multipoly[, common_cols]
# Convert polygons and multipolygons to centroids
poly_pts <- st_centroid(poly)
multipoly_pts <- st_centroid(multipoly)
# Combine all as point geometries
pris_all <- bind_rows(pris.points, poly_pts, multipoly_pts)
Map the data. Specify the indicator with zcol = " "
BlackMarbleR
provides a simple way to use nighttime lights data from NASA’s Black Marble. Black Marble is a NASA Earth Science Data Systems (ESDS) project that provides a product suite of daily, monthly and yearly global nighttime lights.
Prior to using it, you need to register an account here and generate a token.
We are interested in Nighttime Light in Germany. For that, we first need to load a shapefile with the countryborder.
Second, we load the raster data and clip it to the country border. Choose your own date.
#### Make raster
#r <- bm_raster(roi_sf = roi_sf,
# product_id = "VNP46A3",
# date = "2021-10-01",
# bearer = bearer, check_all_tiles_exist = FALSE)
#### Prep data
r <- r |> terra::mask(roi_sf)
## Distribution is skewed, so log
r[] <- log(r[] + 1)
##### Map
ggplot() +
geom_spatraster(data = r) +
scale_fill_gradient2(low = "black",
mid = "yellow",
high = "red",
midpoint = 4.5,
na.value = "transparent") +
labs(title = "Germany Nighttime Lights: October 2021") +
coord_sf() +
theme_void() +
theme(plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "none")
We can also load yearly or monthly data and show e.g. the trend in different states.
#ntl_df <- bm_extract(roi_sf = roi_sf,
# product_id = "VNP46A4",
#date = 2019:2022,
#bearer = bearer, check_all_tiles_exist = FALSE)
ntl_df |>
ggplot() +
geom_col(aes(x = date,
y = ntl_mean),
fill = "darkorange") +
facet_wrap(~NAME_1) +
labs(x = NULL,
y = "NTL Luminosity",
title = "Germany Admin Level 1: Annual Average Nighttime Lights") +
scale_x_continuous(labels = seq(2019, 2022, 2),
breaks = seq(2019, 2022, 2)) +
theme_minimal() +
theme(strip.text = element_text(face = "bold"))