Code
library(dplyr)
library(ggplot2)
library(patchwork)
color_base <- "#007AB7"
color_accent <- "#E69F00"
set.seed(1234)library(dplyr)
library(ggplot2)
library(patchwork)
color_base <- "#007AB7"
color_accent <- "#E69F00"
set.seed(1234)1. Group Types and Policy
For groups \(g=1,\dots,G\), let the type \(U_g\) and policy \(W_g\) be defined as
\[ U_g \sim \mathcal N(0,\sigma_u^2),\qquad W_g \sim \mathrm{Bernoulli}\left(1/2\right). \]
By construction, \(U_g \perp W_g\). Note that this \(W_g\) is randomly assigned at the group level.
2. Policy-Affected Event Probability
Define the event probability (e.g., fertility/compliance) depending on group type \(U_g\) and policy assignment \(W_g\) as
\[ p_{g} := \Pr(E_{gi}=1\mid U_g, W_g) = \Lambda(a_0 + a_1 U_g + \kappa W_g), \quad \Lambda(x)=\frac{1}{1+e^{-x}}. \]
3. Model-based Outcome
\[ \tau_g = b_0 + b_1 U_g + \beta W_g + \xi_{g}, \quad \xi_{g} \sim \mathcal{N}(0, \sigma_{\xi}^2). \]
4. Microdata
For individuals \(i=1,\dots,n\) in group \(g\), let the event indicator be
\[ E_{g, i} \mid (W_g, U_g) \sim \mathrm{Bernoulli}(p_g). \]
Model outcome (e.g., difference in outcome \(\Delta Y\)) is generated as
\[ \Delta Y_{g, i} = \tau_g E_{g,i} + \varepsilon_{g,i},\quad \varepsilon_{g,i} \sim \mathcal N(0,\sigma_{\varepsilon}^2),\quad \varepsilon_{g,i} \perp (E_{g,i}, W_g, U_g). \]
GMM (1-Step Method)
\[ \Delta Y_{g, i} \sim \gamma_g + \alpha E_{g, i} + \beta E_{g, i} W_g \]
Minimum Distance (2-Step Method)
beta_true <- 0.1
gen_data <- function(
N = 1000,
n_g = 50,
sigma_u = 1,
sigma_xi = 0.1,
b0 = -0.2,
b1 = 0.6,
beta = beta_true,
a0 = 0.0,
a1 = 1.0,
kappa = 1.0,
sigma_delta = 1.0,
delta_u = 0.5
) {
tibble(
g = 1:n_g,
U = rnorm(n_g, 0, sigma_u),
W = rbinom(n_g, 1, 0.5),
tau = b0 + b1 * U + beta * W + rnorm(n_g, 0, sigma_xi),
p = plogis(a0 + a1 * U + kappa * W),
delta = delta_u * U + rnorm(n_g, 0, sigma_delta)
) |>
tidyr::uncount(weights = N, .id = "i") |>
mutate(
E = rbinom(dplyr::n(), 1, p),
dY = delta + tau * E + rnorm(dplyr::n(), 0, 1)
) |>
select(g, i, W, U, tau, p, E, dY)
}
est_md <- function(df) {
df_g <- df |>
summarize(
tau_hat = {
if (dplyr::n_distinct(E) < 2) {
NA_real_
} else {
fit_g <- lm(dY ~ E, data = pick(dY, E))
unname(coef(fit_g)["E"])
}
},
W = first(W),
.by = g
) |>
filter(!is.na(tau_hat))
if (nrow(df_g) == 0 || dplyr::n_distinct(df_g$W) < 2) {
return(NA_real_)
}
fit <- lm(tau_hat ~ W, data = df_g)
unname(coef(fit)["W"])
}
est_gmm <- function(df) {
fit <- fixest::feols(dY ~ E + E:W | g, data = df)
unname(coef(fit)["E:W"])
}
est_onestep <- function(df) {
tibble(
b_gmm = est_gmm(df),
b_md = est_md(df)
)
}
R_mc <- 250
df_mc <- purrr::map_dfr(seq_len(R_mc), ~ est_onestep(gen_data()))df_mc |>
tidyr::pivot_longer(everything(), names_to = "name", values_to = "value") |>
mutate(lbl = recode_factor(name, b_gmm = "GMM", b_md = "MD")) |>
ggplot(aes(x = value, color = lbl)) +
geom_density() +
geom_vline(xintercept = beta_true, linetype = "dashed") +
labs(
x = "Estimate",
y = "Density",
color = NULL
) +
scale_color_manual(values = c(color_base, color_accent)) +
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "inside",
legend.position.inside = c(0.9, 0.5)
)Why the 1-step method (GMM) is biased? In the 1-step regression, groups with more within-group variation in \(E_{g, i}\) contribute more to identifying the slope(s). For a binary \(E_{g, i}\), a simple proxy for this contribution is
\[ \widetilde{A}_g \propto n_g\,\mathrm{Var}(E_{g, i}\mid g) \approx n_g\,\widehat p_g(1-\widehat p_g), \]
where \(\widehat p_g\) is the realized share of \(E_{g, i}=1\) in group \(g\). If policy shifts \(p_g\) and \(\tau_g\) is related to \(p_g\) through \(U_g\), then treated and control groups get systematically different weights.
df_g <- gen_data(N = 1000, n_g = 100) |>
summarize(
W = first(W),
tau = first(tau),
p_hat = mean(E),
varE = var(E),
n = n(),
.by = g
) |>
mutate(
w_proxy = n * varE,
W_lbl = recode_factor(W, `1` = "Treated (W=1)", `0` = "Control (W=0)")
)
p1 <- df_g |>
ggplot(aes(x = p_hat, color = W_lbl)) +
geom_density() +
theme_minimal() +
labs(
x = "Event Rate",
y = "Density",
title = latex2exp::TeX("Realized event rate $\\hat{p}_g$"),
color = NULL
) +
scale_color_manual(values = c(color_base, color_accent)) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "inside",
legend.position.inside = c(0.2, 0.9)
)
p2 <- df_g |>
ggplot(aes(x = w_proxy, color = W_lbl)) +
geom_density() +
theme_minimal() +
labs(
x = "Weight",
y = "Density",
title = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
color = NULL
) +
scale_color_manual(values = c(color_base, color_accent)) +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "none"
)
p1 + p2We can find a different distribution of the proxy weight between treated and control groups. Finally, we check the relationship between the (true) group parameter \(\tau_g\) and the proxy weight.
df_g |>
ggplot(aes(x = w_proxy, y = tau, color = W_lbl)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
y = latex2exp::TeX("Group parameter $\\tau_g$"),
color = NULL
) +
scale_color_manual(values = c(color_base, color_accent)) +
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
legend.position = "inside",
legend.position.inside = c(0.2, 0.2)
)The solid lines are the linear fit by treatment status. Since the slopes are different, the endogenous weight shift causes bias in the GMM estimate.