Source code for stlearn.embedding.pca

import numpy as np
import scanpy
from anndata import AnnData
from numpy.random.mtrand import RandomState
from scipy.sparse import spmatrix


def run_pca(
    data: AnnData | np.ndarray | spmatrix,
    n_comps: int = 50,
    zero_center: bool | None = True,
    svd_solver: str = "arpack",
    random_state: int | RandomState | None = 0,
    return_info: bool = False,
    use_highly_variable: bool | None = None,
    dtype: str = "float32",
    copy: bool = False,
    chunked: bool = False,
    chunk_size: int | None = None,
) -> AnnData | np.ndarray | spmatrix | None:
    """\
    Wrap function scanpy.pp.pca

    Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition.
    Uses the implementation of *scikit-learn* [Pedregosa11]_.

    All arguments are forwarded unchanged to :func:`scanpy.pp.pca`; this
    wrapper only adds a completion message on standard output.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If `False`, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows to handle sparse input efficiently.
        Passing `None` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        - `'arpack'` (the default - deterministic) for the ARPACK wrapper
          in SciPy (:func:`~scipy.sparse.linalg.svds`)
        - `'randomized'` for the randomized algorithm due to Halko (2009).
        - `'auto'` chooses automatically depending on the size of the problem.
    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        `.var['highly_variable']`.
        By default, uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If `True`, perform an incremental PCA on segments of `chunk_size`.
        The incremental PCA automatically zero centers and ignores settings of
        `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if `chunked=True` was passed.

    Returns
    -------
    X_pca: :class:`~scipy.sparse.spmatrix`, :class:`~numpy.ndarray`
        If `data` is array-like and `return_info=False` was passed,
        this function only returns `X_pca`.
    adata: anndata.AnnData
        Otherwise if `copy=True` it returns or else adds fields to `adata`:

        - `.obsm['X_pca']` - PCA representation of data.
        - `.varm['PCs']` - The principal components containing the loadings.
        - `.uns['pca']['variance_ratio']` - Ratio of explained variance.
        - `.uns['pca']['variance']` - Explained variance, equivalent to the
          eigenvalues of the covariance matrix.
    """
    # The result is whatever scanpy hands back: per the contract above this
    # may be an AnnData copy, an array (for array-like input), or None when
    # an AnnData was annotated in place -- hence the neutral local name.
    result = scanpy.pp.pca(
        data,
        n_comps=n_comps,
        zero_center=zero_center,
        svd_solver=svd_solver,
        random_state=random_state,
        return_info=return_info,
        use_highly_variable=use_highly_variable,
        dtype=dtype,
        copy=copy,
        chunked=chunked,
        chunk_size=chunk_size,
    )

    if isinstance(data, AnnData):
        # Only AnnData input gets the .obsm/.uns/.varm fields; pointing the
        # user at them for plain array input would be misleading.
        print(
            "PCA is done! Generated in adata.obsm['X_pca'], adata.uns['pca'] and "
            + "adata.varm['PCs']"
        )
    else:
        print("PCA is done!")

    return result