diff --git a/causalml/metrics/rate.py b/causalml/metrics/rate.py index fe28b61f..783d0868 100644 --- a/causalml/metrics/rate.py +++ b/causalml/metrics/rate.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from matplotlib import pyplot as plt +from scipy import stats import logging plt.style.use("fivethirtyeight") @@ -11,6 +12,30 @@ logger = logging.getLogger("causalml") +def _compute_rate_from_toc(toc, weighting): + """Compute the RATE scalar from a TOC DataFrame. + + Args: + toc (pandas.DataFrame): TOC curve indexed by quantile q + weighting (str): one of ``"autoc"`` or ``"qini"`` + + Returns: + (pandas.Series): RATE score for each model column + """ + quantiles = toc.index.values + q_mid = (quantiles[:-1] + quantiles[1:]) / 2 + toc_mid = (toc.iloc[:-1].values + toc.iloc[1:].values) / 2 + if weighting == "autoc": + weights = 1.0 / q_mid + else: + weights = q_mid + weights = weights / weights.sum() + return pd.Series( + np.average(toc_mid, axis=0, weights=weights), + index=toc.columns, + ) + + def get_toc( df, outcome_col="y", @@ -158,6 +183,10 @@ def rate_score( treatment_effect_col="tau", weighting="autoc", normalize=False, + return_ci=False, + n_bootstrap=200, + alpha=0.05, + random_state=None, ): """Calculate the Rank-weighted Average Treatment Effect (RATE) score. @@ -182,6 +211,11 @@ def rate_score( so the absolute scale matches the TOC values but may differ slightly from the paper's continuous integral definition. Model rankings are preserved. + When return_ci=True, standard errors and confidence intervals are estimated via the + half-sample bootstrap (m = n // 2 draws without replacement), which gives valid + coverage for the RATE functional per the Yadlowsky et al. (2021) functional CLT. + The p-value tests H0: RATE = 0 (i.e. the model's prioritization is no better than + random) using a two-sided z-test. When using observed outcomes (without ``treatment_effect_col``), the underlying TOC estimates the subset ATE via naive difference-in-means, which is valid for RCTs but biased for observational data. For observational settings, pass AIPW pseudo-outcomes @@ -201,9 +235,21 @@ def rate_score( weighting (str, optional): the weighting scheme for the RATE integral. One of ``"autoc"`` (default) or ``"qini"``. normalize (bool, optional): whether to normalize the TOC curve before scoring + return_ci (bool, optional): whether to return bootstrap confidence intervals and + p-values. Default False. + n_bootstrap (int, optional): number of half-sample bootstrap iterations. + Only used when return_ci=True. Default 200. + alpha (float, optional): significance level for confidence intervals. + Only used when return_ci=True. Default 0.05. + random_state (int or None, optional): random seed for the bootstrap sampler. + Pass an integer for reproducible results. Default None. Returns: - (pandas.Series): RATE scores of model estimates + If return_ci=False: + (pandas.Series): RATE scores of model estimates + If return_ci=True: + (pandas.DataFrame): RATE score, standard error, CI lower bound, CI upper bound, + and p-value for each model estimate column """ assert weighting in ( "autoc", @@ -220,26 +266,55 @@ def rate_score( normalize=normalize, ) - quantiles = toc.index.values # includes 0 and 1 - - # Use midpoints to avoid division by zero for autoc at q=0 - q_mid = (quantiles[:-1] + quantiles[1:]) / 2 - toc_mid = (toc.iloc[:-1].values + toc.iloc[1:].values) / 2 - - if weighting == "autoc": - weights = 1.0 / q_mid - else: - weights = q_mid + rate = _compute_rate_from_toc(toc, weighting) + rate.name = "RATE ({})".format(weighting) - # Normalize weights so they sum to 1 over the integration domain - weights = weights / weights.sum() + if not return_ci: + return rate + + # Half-sample bootstrap for SE and p-value + n = len(df) + m = n // 2 + model_names = toc.columns.tolist() + boot_scores = {model: [] for model in model_names} + + rng = np.random.default_rng(random_state) + for _ in range(n_bootstrap): + idx = rng.choice(n, size=m, replace=False) + df_boot = df.iloc[idx].reset_index(drop=True) + toc_boot = get_toc( + df_boot, + outcome_col=outcome_col, + treatment_col=treatment_col, + treatment_effect_col=treatment_effect_col, + normalize=normalize, + ) + rate_boot = _compute_rate_from_toc(toc_boot, weighting) + for model in model_names: + boot_scores[model].append(rate_boot[model]) + + z_crit = stats.norm.ppf(1 - alpha / 2) + results = [] + for model in model_names: + point = rate[model] + boot = np.array(boot_scores[model]) + se = np.std(boot, ddof=1) + ci_lower = point - z_crit * se + ci_upper = point + z_crit * se + z_stat = point / se if se > 0 else np.inf + p_value = 2 * (1 - stats.norm.cdf(abs(z_stat))) + results.append( + { + "model": model, + "rate": point, + "se": se, + "ci_lower": ci_lower, + "ci_upper": ci_upper, + "p_value": p_value, + } + ) - rate = pd.Series( - np.average(toc_mid, axis=0, weights=weights), - index=toc.columns, - ) - rate.name = "RATE ({})".format(weighting) - return rate + return pd.DataFrame(results).set_index("model") def plot_toc( diff --git a/tests/test_rate.py b/tests/test_rate.py index 30585be0..6acd9666 100644 --- a/tests/test_rate.py +++ b/tests/test_rate.py @@ -159,3 +159,52 @@ def test_plot_toc(synthetic_df, monkeypatch): monkeypatch.setattr(plt, "show", lambda: None) ax = plot_toc(synthetic_df, treatment_effect_col="tau") assert isinstance(ax, plt.Axes) + + +def test_rate_score_return_ci_returns_dataframe(synthetic_df): + result = rate_score( + synthetic_df, + treatment_effect_col="tau", + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert isinstance(result, pd.DataFrame) + assert set(result.columns) == {"rate", "se", "ci_lower", "ci_upper", "p_value"} + + +def test_rate_score_ci_bounds_ordered(synthetic_df): + result = rate_score( + synthetic_df, + treatment_effect_col="tau", + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert (result["ci_lower"] < result["rate"]).all() + assert (result["ci_upper"] > result["rate"]).all() + + +def test_rate_score_p_value_range(synthetic_df): + result = rate_score( + synthetic_df, + treatment_effect_col="tau", + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert (result["p_value"] >= 0).all() + assert (result["p_value"] <= 1).all() + + +def test_rate_score_ci_qini_weighting(synthetic_df): + result = rate_score( + synthetic_df, + treatment_effect_col="tau", + weighting="qini", + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert isinstance(result, pd.DataFrame) + assert not result["se"].isnull().any()