diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bc26fee..43e21b6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,26 +15,21 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ['3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest - pip install . - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 + pip install ".[dev]" + - name: Lint with ruff run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + ruff check scnym/ tests/ - name: Test with pytest run: | pytest diff --git a/.gitignore b/.gitignore index b0f5e9b..296a2ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,34 @@ dist/ **/__pycache__/ +*.pyc .ipynb_checkpoints/ build/ *.egg-info/ +.claude/ +CLAUDE.md +data/ +tmp/ + +# Testing / linting caches +.pytest_cache/ +.ruff_cache/ +.mypy_cache/ +htmlcov/ +.coverage + +# Environment +.env +*.env + +# Editors +.vscode/ +.idea/ + +# OS +.DS_Store +Thumbs.db + +# Model outputs +*.pt +*.pth +hallmark.gmt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5aa0fe5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-merge-conflict + - id: check-added-large-files + args: ['--maxkb=500'] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.4 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..510b4af --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include VERSION +include requirements.txt diff --git a/README.md b/README.md index 4fec6a1..7e7400a 100644 --- a/README.md +++ b/README.md @@ -91,17 +91,17 @@ First, clone the repository: We recommend creating a virtual environment for use with `scNym`. This is easily accomplished using `virtualenv` or `conda`. -We recommend using `python=3.8` for `scNym`, as some of our dependencies don't currently support the newest Python versions. +We recommend using `python=3.10` or newer for `scNym`. 
```bash -$ python3 -m venv scnym_env # python3 is python3.8 +$ python3 -m venv scnym_env $ source scnym_env/bin/activate ``` or ```bash -$ conda create -n scnym_env -c conda-forge python=3.8 +$ conda create -n scnym_env -c conda-forge python=3.10 $ conda activate scnym_env ``` diff --git a/VERSION b/VERSION index 1c09c74..1d0ba9e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.3 +0.4.0 diff --git a/demo_script.sh b/demo_script.sh index dc7502b..cc8e954 100755 --- a/demo_script.sh +++ b/demo_script.sh @@ -26,7 +26,7 @@ mv 15467792 lung.h5ad # export metadata as a separate CSV for scNym echo "EXPORTING METADATA AND GENE NAMES" echo "NORMALIZING COUNTS TO LOG(CPM + 1)" -python -c "import anndata; import numpy as np; import scanpy.api as sc; a=anndata.read_h5ad('lung.h5ad'); a.obs.to_csv('metadata.csv'); np.savetxt('gene_names.csv', a.var_names, fmt='%s'); sc.pp.normalize_per_cell(a, counts_per_cell_after=1e6); sc.pp.log1p(a); a.write_h5ad('lung.h5ad')" +python -c "import anndata; import numpy as np; import scanpy as sc; a=anndata.read_h5ad('lung.h5ad'); a.obs.to_csv('metadata.csv'); np.savetxt('gene_names.csv', a.var_names, fmt='%s'); sc.pp.normalize_total(a, target_sum=1e6); sc.pp.log1p(a); a.write_h5ad('lung.h5ad')" # return to the original directory cd - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2ca4a0d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.ruff] +line-length = 127 +target-version = "py310" + +[tool.ruff.lint] +select = [ + "E9", # pycodestyle E9xx: I/O and syntax errors (already covered by "E" below) + "F63", # Pyflakes F63x: tuple asserts/ifs, `is` with literals, invalid print syntax (covered by "F") + "F7", # Pyflakes F7xx: misplaced statements, e.g. break/return outside loop/function (covered by "F") + "F82", # Pyflakes F82x: undefined names (covered by "F") + "F", # Pyflakes + "E", # pycodestyle errors + "W", # pycodestyle warnings +] +ignore = [ + "E501", # line too long (handled by formatter if desired) + "E741", # ambiguous variable name (common in scientific code) + "F401", # unused imports (star imports in losses) + "F403", # star imports + "F405", # may be undefined from star import + "E722", # bare except +] + 
+[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/requirements.txt b/requirements.txt index e895253..0608143 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,32 +1,16 @@ -anndata==0.8.* +anndata==0.11.* ConfigArgParse==1.1 -h5py==3.10.* -leidenalg==0.8.10 -louvain==0.7.2 -numba==0.60.* -numpy==1.26.* -numpy-groupies==0.10.* +leidenalg==0.11.* +numpy==2.2.* pandas==2.2.* -pytest==5.4.* -python-dateutil==2.8.* -PyYAML==5.3.* -requests==2.26.* -requests-cache==0.5.* -requests-oauthlib==1.3.* -requests-toolbelt==0.9.* -matplotlib==3.6.* -scanpy==1.9.* -scikit-learn==1.3.* -scikit-misc==0.2.* -scipy==1.14 -six==1.17.* -tensorboard==2.6.* -tensorboard-plugin-wit==1.6.* -tensorboardX==2.1 -torch==2.2.* -torchvision==0.17.* -tqdm==4.44.* -umap-learn==0.3.* -urllib3==1.26.* -protobuf==3.20.* - +PyYAML==6.0.* +requests==2.32.* +scanpy==1.11.* +scikit-learn==1.7.* +scikit-misc==0.5.* +scipy==1.15.* +tensorboardX==2.6.* +torch==2.6.* +torchvision==0.21.* +tqdm==4.67.* +umap-learn==0.5.* diff --git a/scnym/__init__.py b/scnym/__init__.py index 14f386b..ee2650f 100644 --- a/scnym/__init__.py +++ b/scnym/__init__.py @@ -1,6 +1,6 @@ __author__ = "Jacob C. Kimmel, David R. Kelley" __email__ = "jacobkimmel+scnym@gmail.com, drk@calicolabs.com" -__version__ = "0.3.4" +__version__ = "0.4.0" # populate the namespace so top level imports work # e.g. diff --git a/scnym/api.py b/scnym/api.py index bdfca1a..db26409 100644 --- a/scnym/api.py +++ b/scnym/api.py @@ -12,6 +12,7 @@ them onto a user supplied target dataset. 
""" from typing import Optional, Union, List, Tuple +import anndata from anndata import AnnData import scanpy as sc import numpy as np @@ -437,7 +438,7 @@ def scnym_train( # set all samples for training train_adata = adata # set no samples as `target_bidx` - target_bidx = np.zeros(adata.shape[0], dtype=np.bool) + target_bidx = np.zeros(adata.shape[0], dtype=bool) else: print(f"{n_unlabeled} unlabeled observations found.") print( @@ -583,7 +584,8 @@ def scnym_train( "traintest_idx": traintest_idx, "val_idx": val_idx, } - assert osp.exists(results["model_path"]) + if not osp.exists(results["model_path"]): + raise FileNotFoundError(f"Model path not found: {results['model_path']}") adata.uns["scNym_train_results"] = results @@ -909,8 +911,8 @@ def atlas2target( logger.info(msg) # join the target and atlas data - joint_adata = atlas.concatenate( - adata, + joint_adata = anndata.concat( + [atlas, adata], join="inner", ) diff --git a/scnym/dataprep.py b/scnym/dataprep.py index 1dbf1f0..f46e783 100644 --- a/scnym/dataprep.py +++ b/scnym/dataprep.py @@ -30,9 +30,9 @@ class SingleCellDS(Dataset): def __init__( self, - X: Union[sparse.csr.csr_matrix, np.ndarray], - y: Union[sparse.csr.csr_matrix, np.ndarray], - domain: Union[sparse.csr.csr_matrix, np.ndarray] = None, + X: Union[sparse.csr_matrix, np.ndarray], + y: Union[sparse.csr_matrix, np.ndarray], + domain: Union[sparse.csr_matrix, np.ndarray] = None, transform: Callable = None, num_classes: int = -1, num_domains: int = -1, @@ -139,7 +139,7 @@ def __getitem__( # retrieve relevant sample vector and associated label # store in a hash table for later manipulation and retrieval - # input_ is either an `np.ndarray` or `sparse.csr.csr_matrix` + # input_ is either an `np.ndarray` or `sparse.csr_matrix` input_ = self.X[idx, ...] 
# label is already a `torch.Tensor` label = self.y[idx] @@ -365,7 +365,7 @@ def __call__( if self.depth_ratio is None: # tile the specified depth for all cells depth = np.tile(np.array(self.depth).reshape(1, -1), (x.size(0), 1)).astype( - np.int + int ) else: # compute a range of depths based on the library size @@ -376,7 +376,7 @@ def __call__( np.ceil(self.depth_ratio[1] * size).reshape(-1, 1), ], axis=1, - ).astype(np.int) + ).astype(int) # sample from a multinomial # np.random.multinomial is ~100X faster than the native @@ -384,12 +384,10 @@ def __call__( m = np.zeros(x.size()) for i in range(x.size(0)): - d = int( - np.random.choice( - np.arange(depth[i, 0], depth[i, 1]), - size=1, - ) - ) + d = np.random.choice( + np.arange(depth[i, 0], depth[i, 1]), + size=1, + ).item() m[i, :] = np.random.multinomial( d, @@ -460,7 +458,7 @@ def __call__( np.arange(n_genes), size=int(np.floor(n_genes * p_drop)), replace=False, - ).astype(np.int) + ).astype(int) x[i, idx] = 0 sample["input"] = x diff --git a/scnym/interpret.py b/scnym/interpret.py index 70c9c98..15ce37a 100644 --- a/scnym/interpret.py +++ b/scnym/interpret.py @@ -241,7 +241,7 @@ class in `.class_names` for which to compute gradients. 
msg = f"{target_class} is not in `.class_names`" raise ValueError(msg) - target_idx = np.where(target_class == self.class_names)[0].astype(np.int) + target_idx = np.where(target_class == self.class_names)[0].astype(int) target_idx = int(target_idx) self.model.zero_grad() @@ -297,7 +297,7 @@ def rank_genes_by_saliency( s = self.get_saliency(**kwargs) sort_idx = torch.argsort(s) idx = sort_idx[0].numpy()[::-1] - return self.gene_names[idx.astype(np.int)] + return self.gene_names[idx.astype(int)] class IntegratedGradient(object): @@ -709,7 +709,7 @@ class in `self.class_names` and `adata.obs[groupby]` raise ValueError(msg) # get the indices for cells of the target class - cell_idx = np.where(adata.obs[groupby] == target_class)[0].astype(np.int) + cell_idx = np.where(adata.obs[groupby] == target_class)[0].astype(int) if n_cells is not None: if n_cells < len(cell_idx): # subset if a specific number of cells was specified @@ -1033,7 +1033,7 @@ class name for source class to use as reference cells for expected target_bidx = adata.obs[self.cell_type_col] == target if source in self.background_vals: - source_bidx = np.ones(adata.shape[0], dtype=np.bool) + source_bidx = np.ones(adata.shape[0], dtype=bool) # ensure target cells aren't in the source data source_bidx[target_bidx] = False else: diff --git a/scnym/losses.py b/scnym/losses.py index 8b950a4..310f8c9 100644 --- a/scnym/losses.py +++ b/scnym/losses.py @@ -371,7 +371,11 @@ def _update_teacher( # normalization statistics for m in self.teacher.modules(): if isinstance(m, nn.BatchNorm1d): - assert m.track_running_stats == self.teacher_bn_running_stats + if m.track_running_stats != self.teacher_bn_running_stats: + raise RuntimeError( + f"Teacher BatchNorm track_running_stats={m.track_running_stats} " + f"does not match expected={self.teacher_bn_running_stats}" + ) return @@ -398,7 +402,7 @@ def _update_teacher_params( # new parameters zipped_params = zip(self.teacher.parameters(), model.parameters()) for teacher_param, 
model_param in zipped_params: - (teacher_param.data.mul_(alpha).add_(1 - alpha, model_param.data)) + (teacher_param.data.mul_(alpha).add_(model_param.data, alpha=1 - alpha)) return def __call__( @@ -483,7 +487,8 @@ def __call__( mixed_output = F.softmax( model(mixed_sample["input"]), ) - assert mixed_output.requires_grad + if not mixed_output.requires_grad: + raise RuntimeError("mixed_output does not require grad") # set outputs as attributes for later access self.mixed_output = mixed_output @@ -1771,7 +1776,8 @@ def __init__( # if the prior_matrix was provided, always prefer it. self.prior_matrix = prior_matrix - assert self.prior_matrix is not None + if self.prior_matrix is None: + raise ValueError("prior_matrix must be set, either via argument or gene sets") return def _set_prior_matrix_from_gene_sets( diff --git a/scnym/main.py b/scnym/main.py index 59d84ee..b6ea24d 100644 --- a/scnym/main.py +++ b/scnym/main.py @@ -28,20 +28,6 @@ from .predict import Predicter from . import utils -# allow tensorboard outputs even though TF2 is installed -# TF2 broke the tensorboard/pytorch API, so we need to alias -# the old API endpoint below -try: - import tensorflow as tf - tfv = int(tf.__version__.split(".")[0]) -except ImportError: - print("tensorflow is not installed, assuming tensorboard is independent") - tfv = 1 - -if tfv > 1: - import tensorboard as tb - - tf.io.gfile = tb.compat.tensorflow_stub.io.gfile logger = logging.getLogger(__name__) @@ -83,7 +69,7 @@ def repeater(data_loader): def fit_model( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], y: np.ndarray, traintest_idx: Union[np.ndarray, tuple], val_idx: np.ndarray, @@ -705,7 +691,7 @@ def fit_model( def train_cv( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], y: np.ndarray, batch_size: int, n_epochs: int, @@ -821,7 +807,7 @@ def train_cv( def train_all( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, 
sparse.csr_matrix], y: np.ndarray, batch_size: int, n_epochs: int, @@ -930,7 +916,7 @@ def train_all( def train_tissue_independent_cv( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], metadata: pd.DataFrame, out_path: str, balanced_classes: bool = False, @@ -1055,7 +1041,7 @@ def train_tissue_independent_cv( def train_one_tissue_cv( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], metadata: pd.DataFrame, out_path: str, balanced_classes: bool = False, @@ -1171,7 +1157,7 @@ def train_one_tissue_cv( def predict_cell_types( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], model_path: str, out_path: str, upper_groups: Union[list, np.ndarray] = None, @@ -1182,7 +1168,7 @@ def predict_cell_types( Parameters ---------- - X : np.ndarray, sparse.csr.csr_matrix + X : np.ndarray, sparse.csr_matrix [Cells, Genes] of log1p transformed, normalized values. log1p and normalization performed using scanpy defaults. model_path : str @@ -1248,7 +1234,7 @@ def predict_cell_types( def load_data( path: str, -) -> Union[np.ndarray, sparse.csr.csr_matrix]: +) -> Union[np.ndarray, sparse.csr_matrix]: """Load a counts matrix from a file path. 
Parameters @@ -1557,7 +1543,7 @@ def main(): if args.ssl_config is not None: print(f"Loading Semi-Supervised Learning parameters for {args.ssl_method}") with open(args.ssl_config, "r") as f: - ssl_kwargs = yaml.load(f, Loader=yaml.Loader) + ssl_kwargs = yaml.safe_load(f) print("SSL kwargs:") for k, v in ssl_kwargs.items(): print(f"{k}\t\t:\t\t{v}") @@ -1586,7 +1572,7 @@ def main(): if args.unlabeled_domain is not None: unlabeled_domain = np.loadtxt( args.unlabeled_domain, - ).astype(np.int) + ).astype(int) else: unlabeled_domain = None else: diff --git a/scnym/predict.py b/scnym/predict.py index 8b84b42..fa84556 100644 --- a/scnym/predict.py +++ b/scnym/predict.py @@ -106,7 +106,7 @@ def __init__( def predict( self, - X: Union[np.ndarray, sparse.csr.csr_matrix, torch.FloatTensor], + X: Union[np.ndarray, sparse.csr_matrix, torch.FloatTensor], output: str = None, batch_size: int = 1024, **kwargs, @@ -116,7 +116,7 @@ def predict( Parameters ---------- - X : np.ndarray, sparse.csr.csr_matrix, torch.FloatTensor + X : np.ndarray, sparse.csr_matrix, torch.FloatTensor [Cells, Genes] output : str additional output to include as an optional third tuple. 
diff --git a/scnym/trainer.py b/scnym/trainer.py index c3c8522..fdd3216 100644 --- a/scnym/trainer.py +++ b/scnym/trainer.py @@ -7,15 +7,11 @@ import json import logging from typing import Callable, Iterable, Union, List -from .dataprep import SampleMixUp -from .utils import compute_entropy_of_mixing -from .model import CellTypeCLF, DANN -import copy -from torch.utils.tensorboard import SummaryWriter - from .dataprep import SampleMixUp from .utils import compute_entropy_of_mixing from .model import CellTypeCLF, DANN, AE +import copy +from tensorboardX import SummaryWriter from .losses import * diff --git a/scnym/utils.py b/scnym/utils.py index 1e4cab1..8e4316c 100644 --- a/scnym/utils.py +++ b/scnym/utils.py @@ -80,15 +80,15 @@ def l1_layer0( def append_categorical_to_data( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], categorical: np.ndarray, -) -> (Union[np.ndarray, sparse.csr.csr_matrix], np.ndarray): +) -> (Union[np.ndarray, sparse.csr_matrix], np.ndarray): """Convert `categorical` to a one-hot vector and append this vector to each sample in `X`. Parameters ---------- - X : np.ndarray, sparse.csr.csr_matrix + X : np.ndarray, sparse.csr_matrix [Cells, Features] categorical : np.ndarray [Cells,] @@ -126,7 +126,7 @@ def append_categorical_to_data( def get_adata_asarray( adata: anndata.AnnData, -) -> Union[np.ndarray, sparse.csr.csr_matrix]: +) -> Union[np.ndarray, sparse.csr_matrix]: """Get the gene expression matrix `.X` of an AnnData object as an array rather than a view. @@ -137,7 +137,7 @@ def get_adata_asarray( Returns ------- - X : np.ndarray, sparse.csr.csr_matrix + X : np.ndarray, sparse.csr_matrix [Cells, Genes] `.X` attribute as an array in memory. @@ -146,18 +146,18 @@ def get_adata_asarray( Returned `X` will match the type of `adata.X` view. 
""" if sparse.issparse(adata.X): - X = sparse.csr.csr_matrix(adata.X) + X = sparse.csr_matrix(adata.X) else: X = np.array(adata.X) return X def build_classification_matrix( - X: Union[np.ndarray, sparse.csr.csr_matrix], + X: Union[np.ndarray, sparse.csr_matrix], model_genes: np.ndarray, sample_genes: np.ndarray, gene_batch_size: int = 512, -) -> Union[np.ndarray, sparse.csr.csr_matrix]: +) -> Union[np.ndarray, sparse.csr_matrix]: """ Build a matrix for classification using only genes that overlap between the current sample and the pre-trained model. @@ -182,7 +182,7 @@ def build_classification_matrix( as zeros. `type(N)` will match `type(X)`. """ # check types - if type(X) not in (np.ndarray, sparse.csr.csr_matrix): + if type(X) not in (np.ndarray, sparse.csr_matrix): msg = f"X is type {type(X)}, must `np.ndarray` or `sparse.csr_matrix`" raise TypeError(msg) n_cells = X.shape[0] @@ -211,7 +211,7 @@ def build_classification_matrix( common_genes = 0 for i, g in tqdm.tqdm(enumerate(sample_genes), desc="mapping genes"): if np.sum(g == model_genes) > 0: - model_genes_indices.append(int(np.where(g == model_genes)[0])) + model_genes_indices.append(np.where(g == model_genes)[0].item()) sample_genes_indices.append( i, ) @@ -396,7 +396,7 @@ def __call__( self, distances: np.ndarray, ) -> np.ndarray: - """Generate a set of weights based on distances to a point + r"""Generate a set of weights based on distances to a point with a radial basis function kernel. 
Parameters @@ -667,13 +667,13 @@ def compute_entropy_of_mixing( def _optimize_clustering(adata, resolution: list = [0.1, 0.2, 0.3, 0.5, 1.0]): scores = [] for r in resolution: - sc.tl.leiden(adata, resolution=r) + sc.tl.leiden(adata, resolution=r, flavor="igraph", n_iterations=2) s = calinski_harabasz_score(adata.obsm["X_scnym"], adata.obs["leiden"]) scores.append(s) cl_opt_df = pd.DataFrame({"resolution": resolution, "score": scores}) best_idx = np.argmax(cl_opt_df["score"]) res = cl_opt_df.iloc[best_idx, 0] - sc.tl.leiden(adata, resolution=res) + sc.tl.leiden(adata, resolution=res, flavor="igraph", n_iterations=2) print("Best resolution: ", res) return cl_opt_df diff --git a/setup.py b/setup.py index f3566cd..d29cadb 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,38 @@ -import sys -if sys.version_info < (3,): - sys.exit('scnym requires Python >= 3.6') from pathlib import Path from setuptools import setup, find_packages +_here = Path(__file__).resolve().parent + try: from scnym import __author__, __email__ except ImportError: # Deps not yet installed __author__ = __email__ = '' +# Single source of truth for version +_version = (_here / 'VERSION').read_text('utf-8').strip() + setup( name='scnym', - version='0.3.3', - description="Semi supervised adversarial network networks for single cell classification", + version=_version, + description="Semi supervised adversarial neural networks for single cell classification", long_description="scNym uses the semi-supervised MixMatch framework and domain adversarial training to take advantage of information in both the labeled and unlabeled datasets.", url='http://github.com/calico/scnym', author=__author__, author_email=__email__, license='Apache', - python_requires='>=3.6', + python_requires='>=3.10', install_requires=[ l.strip() for l in - Path('requirements.txt').read_text('utf-8').splitlines() + (_here / 'requirements.txt').read_text('utf-8').splitlines() + if l.strip() ], + extras_require={ + 'dev': [ + 'pytest>=7.0', 
+ 'ruff', + ], + }, packages=find_packages(), entry_points=dict( console_scripts=['scnym=scnym.main:main', 'scnym_ad=scnym.scnym_ad:main'], diff --git a/tests/test_api.py b/tests/test_api.py index 06dac73..48fe5ef 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -121,23 +121,27 @@ def test_assumption_checking(): ) adata = adata[ridx, :].copy() - # test that an input anndata with duplicate genes - # throws an error + # test that duplicate genes are rejected — either by anndata + # (>= 0.11 raises at assignment) or by scnym's own check adata_dup_genes = adata.copy() - var_names_with_dups = np.array(adata_dup_genes.var_names) + var_names_with_dups = np.array(adata_dup_genes.var_names).copy() var_names_with_dups[-1] = var_names_with_dups[-2] - adata_dup_genes.var_names = var_names_with_dups - - with pytest.raises(ValueError, match="Duplicate Genes"): - # this should throw an error about duplicate genes - config = {"n_epochs": 1} - scnym_api( - adata=adata_dup_genes, - task="train", - groupby="cell", - out_path=str(sc.settings.datasetdir), - config=config, - ) + try: + adata_dup_genes.var_names = var_names_with_dups + except ValueError as exc: + # anndata >= 0.11 rejects duplicate var_names at assignment + assert "duplicate" in str(exc).lower() or "unique" in str(exc).lower() + else: + # older anndata accepted duplicates; scnym should catch them + with pytest.raises(ValueError, match="Duplicate Genes"): + config = {"n_epochs": 1} + scnym_api( + adata=adata_dup_genes, + task="train", + groupby="cell", + out_path=str(sc.settings.datasetdir), + config=config, + ) # test that an input anndata with `.X` formatted as something # other than log1p(CPM) will throw an error diff --git a/tests/test_guide.py b/tests/test_guide.py index a46f29b..739a876 100644 --- a/tests/test_guide.py +++ b/tests/test_guide.py @@ -101,13 +101,13 @@ def test_sparsity_loss(): # load 10x human PBMC data as a sample adata = sc.datasets.pbmc3k() - sc.pp.normalize_per_cell(adata, 
counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, n_top_genes=2000) sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) # generate clusters to use as class labels - sc.tl.leiden(adata, resolution=0.5, key_added="leiden") + sc.tl.leiden(adata, resolution=0.5, key_added="leiden", flavor="igraph", n_iterations=2) adata.obs["class"] = pd.Categorical( adata.obs["leiden"], @@ -281,13 +281,13 @@ def test_nonneg_guide(): # load 10x human PBMC data as a sample adata = sc.datasets.pbmc3k() - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, n_top_genes=2000) sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) # generate clusters to use as class labels - sc.tl.leiden(adata, resolution=0.5, key_added="leiden") + sc.tl.leiden(adata, resolution=0.5, key_added="leiden", flavor="igraph", n_iterations=2) adata.obs["class"] = pd.Categorical( adata.obs["leiden"], diff --git a/tests/test_interpret.py b/tests/test_interpret.py index ed7be93..96d2cc2 100644 --- a/tests/test_interpret.py +++ b/tests/test_interpret.py @@ -14,18 +14,18 @@ def _load_10x_pbmc(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, n_top_genes=3000) sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) - sc.tl.leiden(adata, resolution=0.3) + sc.tl.leiden(adata, resolution=0.3, flavor="igraph", n_iterations=2) # name one class T cell and one B cell cd4 = adata.obs_vector("CD4") cd22 = adata.obs_vector("CD22") leiden = adata.obs_vector("leiden") tmp = pd.DataFrame({"CD4": cd4, "CD22": cd22, "leiden": leiden}) - grp = tmp.groupby("leiden").mean().reset_index() + grp = tmp.groupby("leiden", 
observed=True).mean().reset_index() print(grp) t_cell_cl = grp.sort_values("CD4", ascending=False)["leiden"].tolist()[0] b_cell_cl = grp.sort_values("CD22", ascending=False)["leiden"].tolist()[0] diff --git a/tests/test_mixmatch.py b/tests/test_mixmatch.py index 6677ef3..0247042 100644 --- a/tests/test_mixmatch.py +++ b/tests/test_mixmatch.py @@ -82,7 +82,7 @@ def test_mixmatch_forward(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # generate fake class labels @@ -181,7 +181,7 @@ def test_mixmatch_forward_with_confthresh(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # generate fake class labels @@ -343,7 +343,7 @@ def test_mixmatch_forward_with_teacher_bn_runnning_stats(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # generate fake class labels @@ -435,7 +435,7 @@ def test_train_mixmatch(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # generate fake class labels diff --git a/tests/test_multitask.py b/tests/test_multitask.py index 398c2e6..94473c1 100644 --- a/tests/test_multitask.py +++ b/tests/test_multitask.py @@ -23,7 +23,7 @@ def test_multitask_mixmatch(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - 
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # generate fake class labels @@ -133,7 +133,7 @@ def test_multitask_dan(): adata = sc.datasets.pbmc3k() sc.pp.filter_cells(adata, min_counts=100) sc.pp.filter_genes(adata, min_cells=100) - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) # create dataloaders @@ -196,7 +196,7 @@ def test_multitask_trainer(): sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) # generate clusters to use as class labels - sc.tl.leiden(adata, resolution=0.5, key_added="leiden") + sc.tl.leiden(adata, resolution=0.5, key_added="leiden", flavor="igraph", n_iterations=2) adata.obs["class"] = pd.Categorical( adata.obs["leiden"], ).codes diff --git a/tests/test_reconstruction.py b/tests/test_reconstruction.py index 093ce8b..7ec7035 100644 --- a/tests/test_reconstruction.py +++ b/tests/test_reconstruction.py @@ -30,7 +30,7 @@ def test_reconstruction_loss(): sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) # generate clusters to use as class labels - sc.tl.leiden(adata, resolution=0.5, key_added="leiden") + sc.tl.leiden(adata, resolution=0.5, key_added="leiden", flavor="igraph", n_iterations=2) adata.obs["class"] = pd.Categorical( adata.obs["leiden"], ).codes diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 66fbb4d..6d5adfc 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -16,13 +16,13 @@ def test_trainer(): # load 10x human PBMC data as a sample adata = sc.datasets.pbmc3k() - sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6) + sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata) sc.pp.highly_variable_genes(adata, n_top_genes=2000) sc.pp.pca(adata) sc.pp.neighbors(adata, n_neighbors=15) # generate clusters to use as class labels - sc.tl.leiden(adata, resolution=0.5, key_added="leiden") + sc.tl.leiden(adata, resolution=0.5, 
key_added="leiden", flavor="igraph", n_iterations=2) adata.obs["class"] = pd.Categorical( adata.obs["leiden"], diff --git a/tests/test_utils.py b/tests/test_utils.py index 6724172..262b715 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -34,7 +34,7 @@ def test_build_classification_matrix_dense(): # X should have the genes of B in the order of A for i, g in enumerate(A_genes): - j = int(np.where(B_genes == g)[0]) + j = np.where(B_genes == g)[0].item() assert np.all(X[:, i] == B[:, j]) return @@ -68,7 +68,7 @@ def test_build_classification_matrix_sparse(): # X should have the genes of B in the order of A for i, g in enumerate(A_genes): - j = int(np.where(B_genes == g)[0]) + j = np.where(B_genes == g)[0].item() assert np.all(X[:, i].toarray() == B[:, j].toarray()) return @@ -78,7 +78,7 @@ def test_get_adata_asarray(): # test getting a dense matrix import scnym - adata = anndata.AnnData(X=np.random.random((100, 100))) + adata = anndata.AnnData(X=np.random.random((100, 100)).astype(np.float32)) X = scnym.utils.get_adata_asarray(adata=adata) assert type(X) == np.ndarray @@ -87,7 +87,7 @@ def test_get_adata_asarray(): ridx = np.random.choice(A.size, size=1000, replace=True) A.flat[ridx] = 1 A = sparse.csr_matrix(A) - adata = anndata.AnnData(X=A) + adata = anndata.AnnData(X=A.astype(np.float32)) X = scnym.utils.get_adata_asarray(adata=adata) assert sparse.issparse(X) return