Sample Selection Bias

2021/05/12

Series: causal-inference-basics

Causality Graph

DGP

set.seed(66)
# Generate Exogenous Variables
df <- data.frame(X = runif(300) * 10, 
                 W = runif(300) * 5, 
                 U = runif(300) * 2, 
                 Z = c(rep(0, 150), rep(1, 150))) %>%
  # God's Game
  # X (Visible) affects Y, W (Invisible) and U (Invisible) affect Y
  mutate(Y = .1 + .5*X + 0.8*W + 0.2*U + rnorm(300),
  # X (Visible) affects Y, W (Invisible) and U (Invisible) affect Latent Selection Function
  # Z (Visible) affect Latent Selection Function
         select_ = 10 + 3*X + 2*W + 1*U + .5*Z + rnorm(300),
  # Latent Selection Determines the True Selection
         select = ifelse(select_ > mean(select_), 1, 0)
         )

Run Heckman Regression

run_selection <- glm(select ~ X + Z, family = binomial( link = "probit" ), data = df)
df$IMR <- dnorm(run_selection$linear.predictors)/pnorm(run_selection$linear.predictors)
summary(run_selection)
## 
## Call:
## glm(formula = select ~ X + Z, family = binomial(link = "probit"), 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0059  -0.1287  -0.0013   0.1240   2.5195  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.4514     0.6613  -8.244   <2e-16 ***
## X             1.0163     0.1213   8.380   <2e-16 ***
## Z             0.6183     0.2704   2.287   0.0222 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 415.03  on 299  degrees of freedom
## Residual deviance: 113.84  on 297  degrees of freedom
## AIC: 119.84
## 
## Number of Fisher Scoring iterations: 8
run_outcome <- lm(Y ~ IMR + X, data = df[which(df$select == 1),])
summary(run_outcome)
## 
## Call:
## lm(formula = Y ~ IMR + X, data = df[which(df$select == 1), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4067 -0.8600  0.0496  0.7680  3.4331 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.4928     0.8875   2.809  0.00569 ** 
## IMR           0.9237     0.4398   2.100  0.03749 *  
## X             0.4922     0.1099   4.477 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.338 on 139 degrees of freedom
## Multiple R-squared:  0.1532, Adjusted R-squared:  0.141 
## F-statistic: 12.58 on 2 and 139 DF,  p-value: 9.55e-06

Visualization

Double Check

# Run OLS (which is biased)
run_biased <- lm(Y ~ X, data = df[which(df$select == 1),])
summary(run_biased)
## 
## Call:
## lm(formula = Y ~ X, data = df[which(df$select == 1), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5643 -0.9695  0.1065  0.7926  3.2590 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.0168     0.5172   7.766 1.55e-12 ***
## X             0.3114     0.0692   4.500 1.42e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.354 on 140 degrees of freedom
## Multiple R-squared:  0.1263, Adjusted R-squared:  0.1201 
## F-statistic: 20.25 on 1 and 140 DF,  p-value: 1.419e-05
# Check the Inclusion Restriction
run_test_instrument <- lm(Y ~ X + Z, data = df[which(df$select == 1),])
summary(run_test_instrument)
## 
## Call:
## lm(formula = Y ~ X + Z, data = df[which(df$select == 1), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3369 -0.9958  0.1245  0.8445  3.1289 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.71761    0.54715   6.794 2.93e-10 ***
## X            0.32192    0.06913   4.657 7.42e-06 ***
## Z            0.37117    0.23158   1.603    0.111    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.347 on 139 degrees of freedom
## Multiple R-squared:  0.1422, Adjusted R-squared:  0.1299 
## F-statistic: 11.52 on 2 and 139 DF,  p-value: 2.347e-05
# In Reality, Only God Can Run This Regression (by Knowing about the Truth and Seeing the Invisibles).
run_ture <- lm(Y ~ U + W + X, data = df)
summary(run_ture)
## 
## Call:
## lm(formula = Y ~ U + W + X, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.02525 -0.64811 -0.01175  0.56035  2.71664 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.008025   0.188084   0.043  0.96600    
## U           0.330793   0.099258   3.333  0.00097 ***
## W           0.773520   0.040014  19.331  < 2e-16 ***
## X           0.499843   0.020288  24.637  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9544 on 296 degrees of freedom
## Multiple R-squared:  0.7703, Adjusted R-squared:  0.768 
## F-statistic: 330.9 on 3 and 296 DF,  p-value: < 2.2e-16

« Previous Next »
Share on: