Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 32 additions & 28 deletions src/scanpy/tools/_rank_genes_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,33 @@ def _select_top_n(scores: NDArray, n_top: int):
return global_indices


def _illico_results_to_iter(
illico_df: pd.DataFrame,
groups_order: NDArray,
ireference: int | None,
):
"""Yield per-group ``(index, z, p)`` tuples from illico's long-form output.

illico returns a DataFrame with a 2-level MultiIndex ``(pert, feature)``
and columns including ``z_score`` and ``p_value``. We stream one group
at a time via `pandas.Series.loc`, trusting illico_df groups are ordered
by ``var_name``.
"""
ref_label = None if ireference is None else groups_order[ireference]
z_series = illico_df["z_score"]
p_series = illico_df["p_value"]
illico_groups = set(illico_df.index.unique(level="pert"))
return (
(
group_index,
z_series.loc[group_name].to_numpy(),
p_series.loc[group_name].to_numpy(),
)
for group_index, group_name in enumerate(groups_order)
if group_name != ref_label and group_name in illico_groups
)


@njit
def rankdata(data: NDArray[np.number]) -> NDArray[np.float64]:
"""Parallelized version of scipy.stats.rankdata."""
Expand Down Expand Up @@ -425,7 +452,7 @@ def logreg(
if len(self.groups_order) <= 2:
break

def compute_statistics( # noqa: PLR0912, PLR0915
def compute_statistics( # noqa: PLR0912
self,
method: DETest,
*,
Expand Down Expand Up @@ -469,33 +496,10 @@ def compute_statistics( # noqa: PLR0912, PLR0915
alternative="two-sided",
use_rust=False,
)
# Generate a lookup of category -> result excluding the refernece if it is present.
generate_test_results_map = {
group_cat: (
group["z_score"].to_numpy(copy=True),
group["p_value"].to_numpy(copy=True),
)
for (_, group) in illico_df.groupby(level="pert")
if (
group_cat := np.unique(
group.index.get_level_values("pert").to_numpy(copy=True)
).item()
)
!= (
None
if self.ireference is None
else self.groups_order[self.ireference]
)
}
# Create the iterator that is expected by the other method-branches.
groups_order_list = self.groups_order.tolist()
generate_test_results = (
(
groups_order_list.index(group_cat),
*generate_test_results_map[group_cat],
)
for group_cat in self.groups_order
if group_cat in generate_test_results_map
generate_test_results = _illico_results_to_iter(
illico_df,
self.groups_order,
self.ireference,
)
else:
generate_test_results = self.wilcoxon(tie_correct=tie_correct)
Expand Down
62 changes: 55 additions & 7 deletions tests/test_rank_genes_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from scanpy._utils.random import _LegacyRng
from scanpy.get import rank_genes_groups_df
from scanpy.tools import rank_genes_groups
from scanpy.tools._rank_genes_groups import _RankGenes
from scanpy.tools._rank_genes_groups import _illico_results_to_iter, _RankGenes
from testing.scanpy._helpers import random_mask
from testing.scanpy._helpers.data import pbmc68k_reduced
from testing.scanpy._pytest.marks import needs
Expand Down Expand Up @@ -79,6 +79,25 @@ def get_true_scores(method: Literal["t-test", "wilcoxon"]) -> Expected:
return Expected(names=expected["names"].astype("T"), scores=expected["scores"])


def get_illico_results_df(
n_groups: int, n_genes: int, *, seed: int = 0
) -> pd.DataFrame:
"""Synthetic illico-shaped output used by the _illico_results_to_iter tests.

Features are in deliberately non-ascending to test ``var_name`` ordering.
"""
rng = np.random.default_rng(seed)
groups = [f"g{i}" for i in range(n_groups)]
features = [f"f{n_genes - 1 - j}" for j in range(n_genes)] # reversed -> not sorted
return pd.DataFrame(
{
"z_score": rng.normal(size=n_groups * n_genes),
"p_value": rng.uniform(size=n_groups * n_genes),
},
index=pd.MultiIndex.from_product([groups, features], names=["pert", "feature"]),
)


# TODO: Make dask compatible
@pytest.mark.parametrize("method", ["t-test", "wilcoxon"])
@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM)
Expand Down Expand Up @@ -353,9 +372,30 @@ def test_mask_not_equal():
assert not np.array_equal(no_mask, with_mask)


@pytest.mark.parametrize(
("groups_order", "ireference", "expected_indices"),
[
(["g0", "g1", "g2"], None, [0, 1, 2]),
(["g0", "g1", "g2"], 1, [0, 2]),
],
ids=["vs_rest", "vs_reference"],
)
def test_illico_iter(groups_order, ireference, expected_indices):
df = get_illico_results_df(n_groups=3, n_genes=4)
feature_order = df.index.unique(level="feature")
out = list(_illico_results_to_iter(df, np.array(groups_order), ireference))
assert sorted(t[0] for t in out) == sorted(expected_indices)
for gi, z, p in out:
sub = df.xs(groups_order[gi], level=0).reindex(feature_order)
np.testing.assert_array_equal(z, sub["z_score"].to_numpy())
np.testing.assert_array_equal(p, sub["p_value"].to_numpy())


@pytest.mark.parametrize("corr_method", ["benjamini-hochberg", "bonferroni"])
@pytest.mark.parametrize("test", ["ovo", "ovr"])
@pytest.mark.parametrize("exp_post_agg", [True, False], ids=["post_exp", "pre_exp"])
@pytest.mark.parametrize(
"mean_in_log_space", [True, False], ids=["log_space_mean", "linear_space_mean"]
)
@pytest.mark.parametrize(
"tie_correct", [True, False], ids=["tie_correct", "no_tie_correct"]
)
Expand All @@ -368,7 +408,7 @@ def test_illico(
subtests: pytest.Subtests,
groups: Literal["all"] | Sequence[str],
*,
exp_post_agg: bool,
mean_in_log_space: bool,
tie_correct: bool,
):

Expand All @@ -386,7 +426,7 @@ def test_illico(
n_genes=pbmc.n_vars,
tie_correct=tie_correct,
corr_method=corr_method,
exp_post_agg=exp_post_agg,
mean_in_log_space=mean_in_log_space,
groups=groups,
)

Expand All @@ -398,7 +438,7 @@ def test_illico(
n_genes=pbmc.n_vars,
tie_correct=tie_correct,
corr_method=corr_method,
exp_post_agg=exp_post_agg,
mean_in_log_space=mean_in_log_space,
groups=groups,
)
scanpy_results = pbmc.uns["rank_genes_groups"]
Expand Down Expand Up @@ -437,10 +477,18 @@ def test_illico(
(False, -1.0),
],
)
@pytest.mark.parametrize("method", ["wilcoxon", "t-test", "t-test_overestim_var"])
@pytest.mark.parametrize(
"method",
[
"wilcoxon",
"t-test",
"t-test_overestim_var",
pytest.param("wilcoxon_illico", marks=needs.illico),
],
)
def test_mean_in_log_space(
expected_logfc: float,
method: Literal["wilcoxon", "t-test", "t-test_overestim_var"],
method: Literal["wilcoxon", "wilcoxon_illico", "t-test", "t-test_overestim_var"],
*,
mean_in_log_space: bool,
):
Expand Down
Loading