1 A Toy Example

Code

library(dplyr)
library(ggplot2)
library(patchwork)

color_base <- "#007AB7"
color_accent <- "#E69F00"
set.seed(1234)

1.1 Data Generating Process

1. Group Types and Policy

For groups $g=1,\dots,G$, let the type $U_g$ and policy $W_g$ be defined as

\[ U_g \sim \mathcal N(0,\sigma_u^2),\qquad W_g \sim \mathrm{Bernoulli}\left(1/2\right). \]

By construction, $U_g \perp W_g$. Note that this $W_g$ is randomly assigned at the group level.

2. Policy-Affected Event Probability

Define the event probability (e.g., fertility/compliance) depending on group type $U_g$ and policy assignment $W_g$ as

\[ p_{g} := \Pr(E_{gi}=1\mid U_g, W_g) = \Lambda(a_0 + a_1 U_g + \kappa W_g), \quad \Lambda(x)=\frac{1}{1+e^{-x}}. \]

3. Model-based Outcome

\[ \tau_g = b_0 + b_1 U_g + \beta W_g + \xi_{g}, \quad \xi_{g} \sim \mathcal{N}(0, \sigma_{\xi}^2). \]

Here, $\beta$ is the true policy effect on the model-based outcome.

4. Microdata

For individuals $i=1,\dots,n$ in group $g$, let the event indicator be

\[ E_{g, i} \mid (W_g, U_g) \sim \mathrm{Bernoulli}(p_g). \]

Model outcome (e.g., difference in outcome $\Delta Y$) is generated as

\[ \Delta Y_{g, i} = \tau_g E_{g,i} + \varepsilon_{g,i},\quad \varepsilon_{g,i} \sim \mathcal N(0,\sigma_{\varepsilon}^2),\quad \varepsilon_{g,i} \perp (E_{g,i}, W_g, U_g). \]

1.2 Estimation Methods

GMM (1-Step Method)

\[ \Delta Y_{g, i} \sim \gamma_g + \alpha E_{g, i} + \beta E_{g, i} W_g \]

Minimum Distance (2-Step Method)

$\Delta Y_{g, i} \sim \gamma_g + \tau_g E_{g, i}$ (split by group $g$)
$\widehat{\tau}_g \sim \alpha + \beta W_g$

1.3 Monte Carlo Simulation

Code

beta_true <- 0.1
gen_data <- function(
  N = 1000,
  n_g = 50,
  sigma_u = 1,
  sigma_xi = 0.1,
  b0 = -0.2,
  b1 = 0.6,
  beta = beta_true,
  a0 = 0.0,
  a1 = 1.0,
  kappa = 1.0,
  sigma_delta = 1.0,
  delta_u = 0.5
) {
  tibble(
    g = 1:n_g,
    U = rnorm(n_g, 0, sigma_u),
    W = rbinom(n_g, 1, 0.5),
    tau = b0 + b1 * U + beta * W + rnorm(n_g, 0, sigma_xi),
    p = plogis(a0 + a1 * U + kappa * W),
    delta = delta_u * U + rnorm(n_g, 0, sigma_delta)
  ) |>
    tidyr::uncount(weights = N, .id = "i") |>
    mutate(
      E = rbinom(dplyr::n(), 1, p),
      dY = delta + tau * E + rnorm(dplyr::n(), 0, 1)
    ) |>
    select(g, i, W, U, tau, p, E, dY)
}

est_md <- function(df) {
  df_g <- df |>
    summarize(
      tau_hat = {
        if (dplyr::n_distinct(E) < 2) {
          NA_real_
        } else {
          fit_g <- lm(dY ~ E, data = pick(dY, E))
          unname(coef(fit_g)["E"])
        }
      },
      W = first(W),
      .by = g
    ) |>
    filter(!is.na(tau_hat))

  if (nrow(df_g) == 0 || dplyr::n_distinct(df_g$W) < 2) {
    return(NA_real_)
  }

  fit <- lm(tau_hat ~ W, data = df_g)
  unname(coef(fit)["W"])
}

est_gmm <- function(df) {
  fit <- fixest::feols(dY ~ E + E:W | g, data = df)
  unname(coef(fit)["E:W"])
}

est_onestep <- function(df) {
  tibble(
    b_gmm = est_gmm(df),
    b_md = est_md(df)
  )
}

R_mc <- 250
df_mc <- purrr::map_dfr(seq_len(R_mc), ~ est_onestep(gen_data()))

Code

df_mc |>
  tidyr::pivot_longer(everything(), names_to = "name", values_to = "value") |>
  mutate(lbl = recode_factor(name, b_gmm = "GMM", b_md = "MD")) |>
  ggplot(aes(x = value, color = lbl)) +
  geom_density() +
  geom_vline(xintercept = beta_true, linetype = "dashed") +
  labs(
    x = "Estimate",
    y = "Density",
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.9, 0.5)
  )

Figure 1.1: 1-Step (GMM) vs. 2-Step (MD) Estimates over 250 Monte Carlo Simulations.

1.4 Visualizing the Endogenous Weight Shift

Why is the 1-step method (GMM) biased? In the 1-step regression, groups with more within-group variation in $E_{g, i}$ contribute more to identifying the slope(s). For a binary $E_{g, i}$, a simple proxy for this contribution is

\[ \widetilde{A}_g \propto n_g\,\mathrm{Var}(E_{g, i}\mid g) \approx n_g\,\widehat p_g(1-\widehat p_g), \]

where $\widehat p_g$ is the realized share of $E_{g, i}=1$ in group $g$. If policy shifts $p_g$ and $\tau_g$ is related to $p_g$ through $U_g$, then treated and control groups get systematically different weights.

Code

df_g <- gen_data(N = 1000, n_g = 100) |>
  summarize(
    W = first(W),
    tau = first(tau),
    p_hat = mean(E),
    varE = var(E),
    n = n(),
    .by = g
  ) |>
  mutate(
    w_proxy = n * varE,
    W_lbl = recode_factor(W, `1` = "Treated (W=1)", `0` = "Control (W=0)")
  )

p1 <- df_g |>
  ggplot(aes(x = p_hat, color = W_lbl)) +
  geom_density() +
  theme_minimal() +
  labs(
    x = "Event Rate",
    y = "Density",
    title = latex2exp::TeX("Realized event rate $\\hat{p}_g$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.2, 0.9)
  )

p2 <- df_g |>
  ggplot(aes(x = w_proxy, color = W_lbl)) +
  geom_density() +
  theme_minimal() +
  labs(
    x = "Weight",
    y = "Density",
    title = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none"
  )

p1 + p2

Figure 1.2: Distributions of the realized event rate and the proxy weight by treatment status.

We can find a different distribution of the proxy weight between treated and control groups. Finally, we check the relationship between the (true) group parameter $\tau_g$ and the proxy weight.

Code

df_g |>
  ggplot(aes(x = w_proxy, y = tau, color = W_lbl)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"),
    y = latex2exp::TeX("Group parameter $\\tau_g$"),
    color = NULL
  ) +
  scale_color_manual(values = c(color_base, color_accent)) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    legend.position = "inside",
    legend.position.inside = c(0.2, 0.2)
  )

The solid lines are the linear fit by treatment status. Since the slopes are different, the endogenous weight shift causes bias in the GMM estimate.

--- format: html: code-tools: true --- # A Toy Example ```{r} #| label: setup #| output: false library(dplyr) library(ggplot2) library(patchwork) color_base <- "#007AB7" color_accent <- "#E69F00" set.seed(1234) ``` ## Data Generating Process **1. Group Types and Policy** For groups $g=1,\dots,G$, let the type $U_g$ and policy $W_g$ be defined as $$ U_g \sim \mathcal N(0,\sigma_u^2),\qquad W_g \sim \mathrm{Bernoulli}\left(1/2\right). $$ By construction, $U_g \perp W_g$. Note that this $W_g$ is randomly assigned at the group level. **2. Policy-Affected Event Probability** Define the event probability (e.g., fertility/compliance) depending on group type $U_g$ and policy assignment $W_g$ as $$ p_{g} := \Pr(E_{gi}=1\mid U_g, W_g) = \Lambda(a_0 + a_1 U_g + \kappa W_g), \quad \Lambda(x)=\frac{1}{1+e^{-x}}. $$ **3. Model-based Outcome** $$ \tau_g = b_0 + b_1 U_g + \beta W_g + \xi_{g}, \quad \xi_{g} \sim \mathcal{N}(0, \sigma_{\xi}^2). $$ - Here, $\beta$ is the true policy effect on the model-based outcome. **4. Microdata** For individuals $i=1,\dots,n$ in group $g$, let the event indicator be $$ E_{g, i} \mid (W_g, U_g) \sim \mathrm{Bernoulli}(p_g). $$ Model outcome (e.g., difference in outcome $\Delta Y$) is generated as $$ \Delta Y_{g, i} = \tau_g E_{g,i} + \varepsilon_{g,i},\quad \varepsilon_{g,i} \sim \mathcal N(0,\sigma_{\varepsilon}^2),\quad \varepsilon_{g,i} \perp (E_{g,i}, W_g, U_g). $$ ## Estimation Methods **GMM** (1-Step Method) $$ \Delta Y_{g, i} \sim \gamma_g + \alpha E_{g, i} + \beta E_{g, i} W_g $$ **Minimum Distance** (2-Step Method) 1. $\Delta Y_{g, i} \sim \gamma_g + \tau_g E_{g, i}$ (split by group $g$) 1. $\widehat{\tau}_g \sim \alpha + \beta W_g$ ## Monte Carlo Simulation ```{r} #| label: monte-carlo beta_true <- 0.1 gen_data <- function( N = 1000, n_g = 50, sigma_u = 1, sigma_xi = 0.1, b0 = -0.2, b1 = 0.6, beta = beta_true, a0 = 0.0, a1 = 1.0, kappa = 1.0, sigma_delta = 1.0, delta_u = 0.5 ) { tibble( g = 1:n_g, U = rnorm(n_g, 0, sigma_u), W = rbinom(n_g, 1, 0.5), tau = b0 + b1 * U + beta * W + rnorm(n_g, 0, sigma_xi), p = plogis(a0 + a1 * U + kappa * W), delta = delta_u * U + rnorm(n_g, 0, sigma_delta) ) |> tidyr::uncount(weights = N, .id = "i") |> mutate( E = rbinom(dplyr::n(), 1, p), dY = delta + tau * E + rnorm(dplyr::n(), 0, 1) ) |> select(g, i, W, U, tau, p, E, dY) } est_md <- function(df) { df_g <- df |> summarize( tau_hat = { if (dplyr::n_distinct(E) < 2) { NA_real_ } else { fit_g <- lm(dY ~ E, data = pick(dY, E)) unname(coef(fit_g)["E"]) } }, W = first(W), .by = g ) |> filter(!is.na(tau_hat)) if (nrow(df_g) == 0 || dplyr::n_distinct(df_g$W) < 2) { return(NA_real_) } fit <- lm(tau_hat ~ W, data = df_g) unname(coef(fit)["W"]) } est_gmm <- function(df) { fit <- fixest::feols(dY ~ E + E:W | g, data = df) unname(coef(fit)["E:W"]) } est_onestep <- function(df) { tibble( b_gmm = est_gmm(df), b_md = est_md(df) ) } R_mc <- 250 df_mc <- purrr::map_dfr(seq_len(R_mc), ~ est_onestep(gen_data())) ``` ```{r} #| label: fig-mc #| fig-cap: "1-Step (GMM) vs. 2-Step (MD) Estimates over 250 Monte Carlo Simulations." df_mc |> tidyr::pivot_longer(everything(), names_to = "name", values_to = "value") |> mutate(lbl = recode_factor(name, b_gmm = "GMM", b_md = "MD")) |> ggplot(aes(x = value, color = lbl)) + geom_density() + geom_vline(xintercept = beta_true, linetype = "dashed") + labs( x = "Estimate", y = "Density", color = NULL ) + scale_color_manual(values = c(color_base, color_accent)) + theme_minimal() + theme( panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(), legend.position = "inside", legend.position.inside = c(0.9, 0.5) ) ``` ## Visualizing the Endogenous Weight Shift Why is the 1-step method (GMM) biased? In the 1-step regression, groups with more within-group variation in $E_{g, i}$ contribute more to identifying the slope(s). For a binary $E_{g, i}$, a simple proxy for this contribution is $$ \widetilde{A}_g \propto n_g\,\mathrm{Var}(E_{g, i}\mid g) \approx n_g\,\widehat p_g(1-\widehat p_g), $$ where $\widehat p_g$ is the realized share of $E_{g, i}=1$ in group $g$. If policy shifts $p_g$ and $\tau_g$ is related to $p_g$ through $U_g$, then treated and control groups get systematically different weights. ```{r} #| label: fig-weight #| fig-cap: "Distributions of the realized event rate and the proxy weight by treatment status." df_g <- gen_data(N = 1000, n_g = 100) |> summarize( W = first(W), tau = first(tau), p_hat = mean(E), varE = var(E), n = n(), .by = g ) |> mutate( w_proxy = n * varE, W_lbl = recode_factor(W, `1` = "Treated (W=1)", `0` = "Control (W=0)") ) p1 <- df_g |> ggplot(aes(x = p_hat, color = W_lbl)) + geom_density() + theme_minimal() + labs( x = "Event Rate", y = "Density", title = latex2exp::TeX("Realized event rate $\\hat{p}_g$"), color = NULL ) + scale_color_manual(values = c(color_base, color_accent)) + theme( panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(), legend.position = "inside", legend.position.inside = c(0.2, 0.9) ) p2 <- df_g |> ggplot(aes(x = w_proxy, color = W_lbl)) + geom_density() + theme_minimal() + labs( x = "Weight", y = "Density", title = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"), color = NULL ) + scale_color_manual(values = c(color_base, color_accent)) + theme( panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(), legend.position = "none" ) p1 + p2 ``` We can find a different distribution of the proxy weight between treated and control groups. Finally, we check the relationship between the (true) group parameter $\tau_g$ and the proxy weight. ```{r} #| label: plot-weight-tau #| message: false df_g |> ggplot(aes(x = w_proxy, y = tau, color = W_lbl)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs( x = latex2exp::TeX("Proxy weight $n_g * Var(E|g)$"), y = latex2exp::TeX("Group parameter $\\tau_g$"), color = NULL ) + scale_color_manual(values = c(color_base, color_accent)) + theme_minimal() + theme( panel.grid.minor = element_blank(), legend.position = "inside", legend.position.inside = c(0.2, 0.2) ) ``` The solid lines are the linear fit by treatment status. Since the slopes are different, the endogenous weight shift causes bias in the GMM estimate.