Individual-level Child Penalties with PSID

Author

Kazuharu Yanagimoto

Published

September 9, 2024

library(unitdid)
library(dplyr)
library(ggplot2)
library(purrr)

Panel Study of Income Dynamics (PSID)

The PSID data is a longitudinal household survey of the United States and one of the longest and most famous panel surveys in the world. The study began in 1968 with a nationally representative sample of over 18,000 individuals living in 5,000 families. The data is available here for the registered users.

To avoid the convoluted cleaning tasks, I mostly use the cleaning codes provided by the Comparative Panel File (CPF). Their cleaning codes are developed for creating a harmonized panel data from seven countries (Australia, Germany, the UK, South Korea, Russia, Switzerland, and the US).

To replicate this article, you need to run

  1. Run the code of the CPF only for the PSID
    • In step 4 (Add vars constant across all waves) of the CPF-Code/03_PSID/us_01_3_GEt_vars.do, add ER32024 (birth year of the first child)
    • us_01.dta-us_03.dta will be produced
  2. Run the following code to create a dataset for the child penalties
Code
library(haven)
cpf_us01 <- read_dta(here::here("vignettes/us_01.dta"))
cpf_us03 <- read_dta(here::here("vignettes/us_03_CPF.dta"))

ever_birth <- cpf_us01 |>
  select(pid, cyear = ER32024) |>
  filter(cyear != 9999) |>
  distinct()

cp_psid <- cpf_us03 |>
  filter(country == 3) |>
  select(id = pid,
         year = intyear,
         byear = yborn,
         gender = female,
         earn = incjobs_yg,
         hours = whweek,
         years_edu = eduy) |>
  mutate(gender = if_else(gender == 0, "Men", "Women"),
         particip = if_else(hours > 0, 1, 0),
         wage = if_else(particip == 1, earn / hours, 0),
         years_edu = if_else(years_edu < 0, NA_integer_, years_edu)) |>
  right_join(ever_birth, by = c("id" = "pid")) |>
  mutate(rel_time = year - cyear) |>
  filter(!is.na(earn))

save(cp_psid, file = here::here("vignettes/cp_psid.rds"))

Analysis of Child Penalties on Participation

load(here::here("vignettes/cp_psid.rds"))

cp_psid <- cp_psid |>
  mutate(eage = cyear - byear) |>
  filter(cyear >= 1968,
         year <= 1997, # The last year of the annual data
         between(eage, 22, 40))

mdl_particip <- cp_psid |>
  unitdid(yname = "particip",
        iname = "id",
        tname = "year",
        ename = "cyear",
        bname = "byear",
        by = c("gender"),
        compute_varcov = "var")

sum_particip <- summary(mdl_particip, agg = "event_age") |>
  filter(between(event_age, 26, 32))

df_plot <- sum_particip |>
  filter(rel_time == 0) |>
  mutate(rel_time = -1, mean = 0,
         var = 0) |>
  bind_rows(sum_particip) |>
  mutate(lbl_facet = paste0("Age ", event_age))

Mean of Child Penalties on Participation

Code
df_plot |>
  ggplot(aes(x = rel_time, y = mean,
             color = gender, shape = gender)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = -1, linetype = "dashed") +
  geom_hline(yintercept = 0) +
  facet_wrap(~lbl_facet) +
  scale_color_manual(values = c("#009F8C", "#B75C9D")) +
  labs(x = "Time to First Childbirth",
       y = NULL,
       color = NULL, shape = NULL) +
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "bottom")

Standard Deviation of Child Penalties on Participation

Code
df_plot |>
  ggplot(aes(x = rel_time, y = sqrt(var),
             color = gender, shape = gender)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = -1, linetype = "dashed") +
  geom_hline(yintercept = 0) +
  facet_wrap(~lbl_facet) +
  scale_color_manual(values = c("#009F8C", "#B75C9D")) +
  labs(x = "Time to First Childbirth",
       y = NULL, color = NULL, shape = NULL) +
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "bottom")

Scatter Plot with Covariates

To highlight the flexibility of the individual child penalties, let’s plot the relationship between the years of education and the child penalties.

unitdid_particip <- get_unitdid(mdl_particip)
Code
library(patchwork)

plot_scatter <- function(gender) {
  clr <- ifelse(gender == "Men", "#009F8C", "#B75C9D")

  unitdid_particip |>
    filter(gender == {{gender}}, between(rel_time, 0, 5)) |>
    mutate(lbl_facet = paste0("k = ", rel_time)) |>
    ggplot(aes(x = years_edu, y = particip_tilde)) +
    geom_point(color = clr, alpha = 0.1) +
    geom_smooth(method = "lm", color = clr, fill = clr) +
    facet_wrap(~lbl_facet, nrow = 1) +
    labs(x = "Years of Education",
         y = "CP on Participation",
         title = gender) +
    coord_cartesian(xlim = c(6, NA), ylim = c(-1.25, 0.5)) +
    theme_minimal()
}

p1 <- plot_scatter("Men")
p2 <- plot_scatter("Women")

p1 / p2

On Binscatter

Another way to visualize the relationship is to use the binscatter (Cattaneo et al. 2024).

library(binsreg)

compute_binsreg <- function(gender) {
  est <- unitdid_particip |>
    filter(gender == {{ gender }}) |>
    as.data.frame() |>
    binsreg(x = years_edu, y = particip_tilde,
            data = _, by = rel_time)

  map(0:5, ~{est$data.plot[[paste0("Group ", .x)]]$data.dots |>
               select(x, fit) |>
               mutate(rel_time = .x)}) |>
    list_rbind() |>
    mutate(gender = gender)
}

bs <- map(c("Men", "Women"), compute_binsreg) |> list_rbind()
Code
plot_binsreg <- function(gender) {
  clr <- ifelse(gender == "Men", "#009F8C", "#B75C9D")

  bs |>
    filter(gender == {{ gender }}) |>
    mutate(lbl_facet = paste0("k = ", rel_time)) |>
    ggplot(aes(x = x, y = fit)) +
    geom_point(color = clr) +
    facet_wrap(~lbl_facet, nrow = 1) +
    labs(x = "Years of Education",
         y = "CP on Participation",
         title = gender) +
    theme_minimal()
}

p1 <- plot_binsreg("Men")
p2 <- plot_binsreg("Women")

p1 / p2

References

Cattaneo, Matias D., Richard K. Crump, Max H. Farrell, and Yingjie Feng. 2024. “On Binscatter.” American Economic Review 114 (5): 1488–1514. https://doi.org/10.1257/aer.20221576.