Causal Inference for ML

Name: causal-inference-ml
Rating: 88
Author: jlaws

Method Selection Decision Table

Scenario	Key Assumption	Method	Library
RCT / A/B test	Randomization	Difference-in-means	scipy
Observational, known confounders	Unconfoundedness	T/S/X-Learner, CausalForestDML	EconML
Observational, high-dim confounders	Unconfoundedness + sparsity	DML (Double ML)	EconML
Selection on observables	Strong ignorability	IPW / Hajek estimator	DoWhy
Unobserved confounders, instrument exists	Valid instrument	IV / 2SLS / DMLIV	EconML
Pre/post with control group	Parallel trends	Difference-in-Differences	statsmodels
Full causal graph validation	Graph structure known	DoWhy (identify, estimate, refute)	DoWhy
Targeting / personalization	Unconfoundedness	Uplift modeling + Qini curves	EconML

DoWhy Pipeline

python

from dowhy import CausalModel

def run_dowhy_pipeline(df, treatment, outcome, confounders,
                       instruments=None, effect_modifiers=None):
    """Full DoWhy pipeline: model, identify, estimate, refute."""
    model = CausalModel(
        data=df, treatment=treatment, outcome=outcome,
        common_causes=confounders, instruments=instruments,
        effect_modifiers=effect_modifiers,
    )
    # Identify estimand (backdoor, frontdoor, or IV)
    estimand = model.identify_effect(proceed_when_unidentifiable=True)

    # Estimate effect via T-Learner backend
    estimate = model.estimate_effect(
        estimand,
        method_name="backdoor.econml.metalearner_tlearner",
        method_params={"init_params": {"models": "GradientBoostingRegressor()"}},
    )

    # Refutation tests (critical -- skip at your peril)
    refutations = {}
    refutations["random_cause"] = model.refute_estimate(
        estimand, estimate, method_name="random_common_cause",
    )
    refutations["placebo"] = model.refute_estimate(
        estimand, estimate,
        method_name="placebo_treatment_refuter", placebo_type="permute",
    )
    refutations["subset"] = model.refute_estimate(
        estimand, estimate,
        method_name="data_subset_refuter", subset_fraction=0.8, num_simulations=5,
    )
    return estimate, refutations

EconML Meta-Learners

T-Learner, S-Learner, X-Learner

python

from econml.metalearners import TLearner, SLearner, XLearner
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

def compare_metalearners(X, T, Y):
    """Compare three meta-learner approaches for CATE estimation."""
    models = {
        # T-Learner: separate models per treatment group (heterogeneous effects)
        "T": TLearner(models=GradientBoostingRegressor(n_estimators=200)),
        # S-Learner: single model, treatment as feature (small effects)
        "S": SLearner(overall_model=GradientBoostingRegressor(n_estimators=200)),
        # X-Learner: cross-estimates (imbalanced treatment/control)
        "X": XLearner(
            models=GradientBoostingRegressor(n_estimators=200),
            propensity_model=GradientBoostingRegressor(n_estimators=100),
        ),
    }
    results = {}
    for name, model in models.items():
        model.fit(Y, T, X=X)
        cate = model.effect(X)
        results[name] = {"ate": np.mean(cate), "cate_std": np.std(cate), "cate": cate}
    return results

CausalForestDML

python

from econml.dml import CausalForestDML
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

def fit_causal_forest(X, T, Y, W):
    """Doubly-robust CATE with CIs. X: effect modifiers, W: confounders."""
    est = CausalForestDML(
        model_y=GradientBoostingRegressor(n_estimators=200),
        model_t=GradientBoostingClassifier(n_estimators=200),
        n_estimators=500, min_samples_leaf=5, cv=5, random_state=42,
    )
    est.fit(Y, T, X=X, W=W)
    cate = est.effect(X)
    lb, ub = est.effect_interval(X, alpha=0.05)  # 95% CI
    ate_inf = est.ate_inference(X=X)
    print(f"ATE: {ate_inf.point_estimate:.4f} "
          f"[{ate_inf.conf_int()[0]:.4f}, {ate_inf.conf_int()[1]:.4f}]")
    return est, cate, lb, ub, est.feature_importances_

Propensity Scoring with IPW

python

from sklearn.linear_model import LogisticRegression
import numpy as np

def ipw_ate(X, T, Y, estimator="hajek"):
    """Inverse Probability Weighting with Hajek stabilization."""
    ps_model = LogisticRegression(max_iter=1000, C=0.1)
    ps_model.fit(X, T)
    e = np.clip(ps_model.predict_proba(X)[:, 1], 0.01, 0.99)  # clip extremes

    if estimator == "horvitz_thompson":
        ate = np.mean(T * Y / e) - np.mean((1 - T) * Y / (1 - e))
    elif estimator == "hajek":
        # Normalized weights: lower variance than HT
        w1, w0 = T / e, (1 - T) / (1 - e)
        ate = np.sum(w1 * Y) / np.sum(w1) - np.sum(w0 * Y) / np.sum(w0)
    return ate, e

Instrumental Variables with DMLIV

python

from econml.iv.dml import DMLIV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def fit_dmliv(Y, T, Z, X, W):
    """DMLIV: Double ML + IV. Z: instrument, T: endogenous treatment."""
    est = DMLIV(
        model_y_xw=RandomForestRegressor(n_estimators=200),
        model_t_xw=RandomForestClassifier(n_estimators=200),
        model_t_xwz=RandomForestClassifier(n_estimators=200),
        model_final=RandomForestRegressor(n_estimators=100), cv=3,
    )
    est.fit(Y, T, Z=Z, X=X, W=W)
    return est, est.effect(X)

Difference-in-Differences

python

import statsmodels.formula.api as smf

def diff_in_diff(df, time_col, treat_col, outcome_col, pre_periods, post_periods):
    """DiD with parallel trends pre-test."""
    # Parallel trends test in pre-period
    pre_df = df[df[time_col].isin(pre_periods)].copy()
    pre_df["time_numeric"] = pre_df[time_col].rank(method="dense")
    trend_fit = smf.ols(
        f"{outcome_col} ~ {treat_col} * time_numeric", data=pre_df
    ).fit()
    interaction_pval = trend_fit.pvalues[f"{treat_col}:time_numeric"]
    if interaction_pval < 0.05:
        print(f"WARNING: Parallel trends violated (p={interaction_pval:.4f})")

    # DiD estimation with robust SEs
    df = df.copy()
    df["post"] = df[time_col].isin(post_periods).astype(int)
    did_fit = smf.ols(f"{outcome_col} ~ {treat_col} * post", data=df).fit(cov_type="HC1")
    att = did_fit.params[f"{treat_col}:post"]
    print(f"ATT: {att:.4f} (SE: {did_fit.bse[f'{treat_col}:post']:.4f})")
    return did_fit, att, interaction_pval

Uplift Modeling with Qini Curves

python

from econml.metalearners import TLearner
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

def uplift_with_qini(X_train, T_train, Y_train, X_test, T_test, Y_test):
    """Train uplift model and evaluate with Qini curve."""
    model = TLearner(models=GradientBoostingRegressor(n_estimators=200))
    model.fit(Y_train, T_train, X=X_train)
    scores = model.effect(X_test).flatten()

    # Compute Qini curve: cumulative incremental gain ranked by uplift
    order = np.argsort(-scores)
    n = len(order)
    cum_uplift = []
    for k in range(1, n + 1):
        idx = order[:k]
        t_mask, c_mask = T_test[idx] == 1, T_test[idx] == 0
        if t_mask.sum() == 0 or c_mask.sum() == 0:
            cum_uplift.append(0)
            continue
        gain = Y_test[idx][t_mask].sum() - (
            Y_test[idx][c_mask].sum() * t_mask.sum() / c_mask.sum())
        cum_uplift.append(gain)

    qini_coeff = np.trapz(cum_uplift, dx=1.0 / n) - cum_uplift[-1] / 2
    return scores, cum_uplift, qini_coeff

Gotchas

•Propensity score overlap: Scores near 0 or 1 cause exploding IPW weights. Always check overlap histograms and clip (0.01-0.99 minimum)
•DoWhy refutations are not optional: An estimate without refutation is meaningless. Random common cause and placebo tests catch specification errors
•T-Learner bias: Imbalanced treatment groups cause overfitting on the smaller group. Use X-Learner or CausalForestDML instead
•DiD parallel trends: Non-significant pre-trend test does not prove parallel trends -- it only fails to reject. Use multiple pre-periods and visual inspection
•IV strength: Weak instruments (first-stage F-stat < 10) cause severe bias. Always report first-stage F-statistic
•CATE vs ATE: Averaging CATE gives ATE only under correct specification. Report both with confidence intervals
•Cross-fitting matters: DML methods use cross-fitting to avoid overfitting bias. Fewer than 3 folds introduces regularization bias