Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 94 additions & 19 deletions causalml/metrics/rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import logging

plt.style.use("fivethirtyeight")
Expand All @@ -11,6 +12,30 @@
logger = logging.getLogger("causalml")


def _compute_rate_from_toc(toc, weighting):
"""Compute the RATE scalar from a TOC DataFrame.

Args:
toc (pandas.DataFrame): TOC curve indexed by quantile q
weighting (str): one of ``"autoc"`` or ``"qini"``

Returns:
(pandas.Series): RATE score for each model column
"""
quantiles = toc.index.values
q_mid = (quantiles[:-1] + quantiles[1:]) / 2
toc_mid = (toc.iloc[:-1].values + toc.iloc[1:].values) / 2
if weighting == "autoc":
weights = 1.0 / q_mid
else:
weights = q_mid
weights = weights / weights.sum()
return pd.Series(
np.average(toc_mid, axis=0, weights=weights),
index=toc.columns,
)


def get_toc(
df,
outcome_col="y",
Expand Down Expand Up @@ -158,6 +183,10 @@ def rate_score(
treatment_effect_col="tau",
weighting="autoc",
normalize=False,
return_ci=False,
n_bootstrap=200,
alpha=0.05,
random_state=None,
):
"""Calculate the Rank-weighted Average Treatment Effect (RATE) score.

Expand All @@ -182,6 +211,11 @@ def rate_score(
so the absolute scale matches the TOC values but may differ slightly from the paper's
continuous integral definition. Model rankings are preserved.

When return_ci=True, standard errors and confidence intervals are estimated via the
half-sample bootstrap (m = n // 2 draws without replacement), which gives valid
coverage for the RATE functional per the Yadlowsky et al. (2021) functional CLT.
The p-value tests H0: RATE = 0 (i.e. the model's prioritization is no better than
random) using a two-sided z-test.
When using observed outcomes (without ``treatment_effect_col``), the underlying TOC
estimates the subset ATE via naive difference-in-means, which is valid for RCTs but
biased for observational data. For observational settings, pass AIPW pseudo-outcomes
Expand All @@ -201,9 +235,21 @@ def rate_score(
weighting (str, optional): the weighting scheme for the RATE integral.
One of ``"autoc"`` (default) or ``"qini"``.
normalize (bool, optional): whether to normalize the TOC curve before scoring
return_ci (bool, optional): whether to return bootstrap confidence intervals and
p-values. Default False.
n_bootstrap (int, optional): number of half-sample bootstrap iterations.
Only used when return_ci=True. Default 200.
alpha (float, optional): significance level for confidence intervals.
Only used when return_ci=True. Default 0.05.
random_state (int or None, optional): random seed for the bootstrap sampler.
Pass an integer for reproducible results. Default None.

Returns:
(pandas.Series): RATE scores of model estimates
If return_ci=False:
(pandas.Series): RATE scores of model estimates
If return_ci=True:
(pandas.DataFrame): RATE score, standard error, CI lower bound, CI upper bound,
and p-value for each model estimate column
"""
assert weighting in (
"autoc",
Expand All @@ -220,26 +266,55 @@ def rate_score(
normalize=normalize,
)

quantiles = toc.index.values # includes 0 and 1

# Use midpoints to avoid division by zero for autoc at q=0
q_mid = (quantiles[:-1] + quantiles[1:]) / 2
toc_mid = (toc.iloc[:-1].values + toc.iloc[1:].values) / 2

if weighting == "autoc":
weights = 1.0 / q_mid
else:
weights = q_mid
rate = _compute_rate_from_toc(toc, weighting)
rate.name = "RATE ({})".format(weighting)

# Normalize weights so they sum to 1 over the integration domain
weights = weights / weights.sum()
if not return_ci:
return rate

# Half-sample bootstrap for SE and p-value
n = len(df)
m = n // 2
model_names = toc.columns.tolist()
boot_scores = {model: [] for model in model_names}

rng = np.random.default_rng(random_state)
for _ in range(n_bootstrap):
idx = rng.choice(n, size=m, replace=False)
df_boot = df.iloc[idx].reset_index(drop=True)
toc_boot = get_toc(
df_boot,
outcome_col=outcome_col,
treatment_col=treatment_col,
treatment_effect_col=treatment_effect_col,
normalize=normalize,
)
rate_boot = _compute_rate_from_toc(toc_boot, weighting)
for model in model_names:
boot_scores[model].append(rate_boot[model])

z_crit = stats.norm.ppf(1 - alpha / 2)
results = []
for model in model_names:
point = rate[model]
boot = np.array(boot_scores[model])
se = np.std(boot, ddof=1)
ci_lower = point - z_crit * se
ci_upper = point + z_crit * se
z_stat = point / se if se > 0 else np.inf
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
results.append(
{
"model": model,
"rate": point,
"se": se,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"p_value": p_value,
}
)

rate = pd.Series(
np.average(toc_mid, axis=0, weights=weights),
index=toc.columns,
)
rate.name = "RATE ({})".format(weighting)
return rate
return pd.DataFrame(results).set_index("model")


def plot_toc(
Expand Down
49 changes: 49 additions & 0 deletions tests/test_rate.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,52 @@ def test_plot_toc(synthetic_df, monkeypatch):
monkeypatch.setattr(plt, "show", lambda: None)
ax = plot_toc(synthetic_df, treatment_effect_col="tau")
assert isinstance(ax, plt.Axes)


def test_rate_score_return_ci_returns_dataframe(synthetic_df):
    """return_ci=True should produce a DataFrame with the full CI column set."""
    ci_table = rate_score(
        synthetic_df,
        treatment_effect_col="tau",
        return_ci=True,
        n_bootstrap=50,
        random_state=RANDOM_SEED,
    )
    assert isinstance(ci_table, pd.DataFrame)
    expected_columns = {"rate", "se", "ci_lower", "ci_upper", "p_value"}
    assert set(ci_table.columns) == expected_columns


def test_rate_score_ci_bounds_ordered(synthetic_df):
    """The bootstrap CI must strictly bracket the point estimate."""
    ci_table = rate_score(
        synthetic_df,
        treatment_effect_col="tau",
        return_ci=True,
        n_bootstrap=50,
        random_state=RANDOM_SEED,
    )
    point = ci_table["rate"]
    assert (ci_table["ci_lower"] < point).all()
    assert (point < ci_table["ci_upper"]).all()


def test_rate_score_p_value_range(synthetic_df):
    """Every bootstrap p-value must lie in the closed unit interval."""
    ci_table = rate_score(
        synthetic_df,
        treatment_effect_col="tau",
        return_ci=True,
        n_bootstrap=50,
        random_state=RANDOM_SEED,
    )
    pvals = ci_table["p_value"]
    assert pvals.between(0, 1).all()


def test_rate_score_ci_qini_weighting(synthetic_df):
    """CI output should also work under the qini weighting scheme."""
    ci_table = rate_score(
        synthetic_df,
        treatment_effect_col="tau",
        weighting="qini",
        return_ci=True,
        n_bootstrap=50,
        random_state=RANDOM_SEED,
    )
    assert isinstance(ci_table, pd.DataFrame)
    # Standard errors must be computed (no NaNs) for every model column.
    assert ci_table["se"].notnull().all()