1  A Toy Example

Code
library(dplyr)
library(ggplot2)
library(patchwork)

color_base <- "#007AB7"
color_accent <- "#E69F00"
set.seed(1234)

1.1 Data Generating Process

1. Group Types and Policy

For groups \(g=1,\dots,G\), let the type \(U_g\) and policy \(W_g\) be defined as

\[ U_g \sim \mathcal N(0,\sigma_u^2),\qquad W_g \sim \mathrm{Bernoulli}\left(1/2\right). \]

By construction, \(U_g \perp W_g\). Note that this \(W_g\) is randomly assigned at the group level.

2. Policy-Affected Event Probability

Define the event probability (e.g., fertility/compliance) depending on group type \(U_g\) and policy assignment \(W_g\) as

\[ p_{g} := \Pr(E_{gi}=1\mid U_g, W_g) = \Lambda(a_0 + a_1 U_g + \kappa W_g), \quad \Lambda(x)=\frac{1}{1+e^{-x}}. \]

3. Model-based Outcome

\[ \tau_g = b_0 + b_1 U_g + \beta W_g + \xi_{g}, \quad \xi_{g} \sim \mathcal{N}(0, \sigma_{\xi}^2). \]

  • Here, \(\beta\) is the true policy effect on the model-based outcome.

4. Microdata

For individuals \(i=1,\dots,n\) in group \(g\), let the event indicator be

\[ E_{g, i} \mid (W_g, U_g) \sim \mathrm{Bernoulli}(p_g). \]

Model outcome (e.g., difference in outcome \(\Delta Y\)) is generated as

\[ \Delta Y_{g, i} = \tau_g E_{g,i} + \varepsilon_{g,i},\quad \varepsilon_{g,i} \sim \mathcal N(0,\sigma_{\varepsilon}^2),\quad \varepsilon_{g,i} \perp (E_{g,i}, W_g, U_g). \]

1.2 Estimation Methods

GMM (1-Step Method)

\[ \Delta Y_{g, i} \sim \gamma_g + \alpha E_{g, i} + \beta E_{g, i} W_g \]

Minimum Distance (2-Step Method)

  1. \(\Delta Y_{g, i} \sim \gamma_g + \tau_g E_{g, i}\) (split by group \(g\))
  2. \(\widehat{\tau}_g \sim \alpha + \beta W_g\)

1.3 Monte Carlo Simulation

Code
beta_true <- 0.1
gen_data <- function(
  N = 1000,
  n_g = 50,
  sigma_u = 1,
  sigma_xi = 0.1,
  b0 = -0.2,
  b1 = 0.6,
  beta = beta_true,
  a0 = 0.0,
  a1 = 1.0,
  kappa = 1.0,
  sigma_delta = 1.0,
  delta_u = 0.5
) {
  tibble(
    g = 1:n_g,
    U = rnorm(n_g, 0, sigma_u),
    W = rbinom(n_g, 1, 0.5),
    tau = b0 + b1 * U + beta * W + rnorm(n_g, 0, sigma_xi),
    p = plogis(a0 + a1 * U + kappa * W),
    delta = delta_u * U + rnorm(n_g, 0, sigma_delta)
  ) |>
    tidyr::uncount(weights = N, .id = "i") |>
    mutate(
      E = rbinom(dplyr::n(), 1, p),
      dY = delta + tau * E + rnorm(dplyr::n(), 0, 1)
    ) |>
    select(g, i, W, U, tau, p, E, dY)
}

est_md <- function(df) {
  df_g <- df |>
    summarize(
      tau_hat = {
        if (dplyr::n_distinct(E) < 2) {
          NA_real_
        } else {
          fit_g <- lm(dY ~ E, data = pick(dY, E))
          unname(coef(fit_g)["E"])
        }
      },
      W = first(W),
      .by = g
    ) |>
    filter(!is.na(tau_hat))

  if (nrow(df_g) == 0 || dplyr::n_distinct(df_g$W) < 2) {
    return(NA_real_)
  }

  fit <- lm(tau_hat ~ W, data = df_g)
  unname(coef(fit)["W"])
}

est_gmm <- function(df) {
  fit <- fixest::feols(dY ~ E + E:W | g, data = df)
  unname(coef(fit)["E:W"])
}

est_onestep <- function(df) {
  tibble(
    b_gmm = est_gmm(df),
    b_md = est_md(df)
  )
}

R_mc <- 250
df_mc <- purrr::map_dfr(seq_len(R_mc), ~ est_onestep(gen_data()))
Code
df_mc |>
  tidyr::pivot_longer(everything(), names_to = "name", values_to = "value") |>
  mutate(lbl = recode_factor(name, b_gmm = "GMM", b_md = "MD")) |>
  ggplot(aes(x = value, color = lbl)) +
  geom_density() +
  geom_vline(xintercept = beta_true, linetype = "dashed") +
  labs(
    x = "Estimate",
    y = "Density",
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.9, 0.5)
  )
Figure 1.1: 1-Step (GMM) vs. 2-Step (MD) Estimates over 250 Monte Carlo Simulations.

1.4 Visualizing the Endogenous Weight Shift

Why the 1-step method (GMM) is biased? In the 1-step regression, groups with more within-group variation in \(E_{g, i}\) contribute more to identifying the slope(s). For a binary \(E_{g, i}\), a simple proxy for this contribution is

\[ \widetilde{A}_g \propto n_g\,\mathrm{Var}(E_{g, i}\mid g) \approx n_g\,\widehat p_g(1-\widehat p_g), \]

where \(\widehat p_g\) is the realized share of \(E_{g, i}=1\) in group \(g\). If policy shifts \(p_g\) and \(\tau_g\) is related to \(p_g\) through \(U_g\), then treated and control groups get systematically different weights.

Code
df_g <- gen_data(N = 1000, n_g = 100) |>
  summarize(
    W = first(W),
    tau = first(tau),
    p_hat = mean(E),
    varE = var(E),
    n = n(),
    .by = g
  ) |>
  mutate(
    w_proxy = n * varE,
    W_lbl = recode_factor(W, `1` = "Treated (W=1)", `0` = "Control (W=0)")
  )

p1 <- df_g |>
  ggplot(aes(x = p_hat, color = W_lbl)) +
  geom_density() +
  theme_minimal() +
  labs(
    x = "Event Rate",
    y = "Density",
    title = latex2exp::TeX("Realized event rate $\\hat{p}_g$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.2, 0.9)
  )

p2 <- df_g |>
  ggplot(aes(x = w_proxy, color = W_lbl)) +
  geom_density() +
  theme_minimal() +
  labs(
    x = "Weight",
    y = "Density",
    title = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )

p1 + p2
Figure 1.2: Distributions of the realized event rate and the proxy weight by treatment status.

We can find a different distribution of the proxy weight between treated and control groups. Finally, we check the relationship between the (true) group parameter \(\tau_g\) and the proxy weight.

Code
df_g |>
  ggplot(aes(x = w_proxy, y = tau, color = W_lbl)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
    y = latex2exp::TeX("Group parameter $\\tau_g$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.2, 0.2)
  )

The solid lines are the linear fit by treatment status. Since the slopes are different, the endogenous weight shift causes bias in the GMM estimate.