From 49349e336d8233c5e7603d301c3f7c79cecf2352 Mon Sep 17 00:00:00 2001
From: Dan Yuan
Date: Tue, 13 Aug 2024 15:38:13 -0700
Subject: [PATCH 01/13] Added an example script and updated README

---
 README.md                 |   6 ++-
 examples/example_usage.py | 104 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 examples/example_usage.py

diff --git a/README.md b/README.md
index 0016035..797bcfa 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,14 @@ Installing
 ----------
 For usage and installation instructions, see the [install guide](docs/install.rst).
 
+Examples
+--------
+You can find example usage of this package in the examples directory.
+
 Contributing
 ------------
 We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!
 
 Level of Support
 ----------------
-We are planning on occasional updating this tool with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
\ No newline at end of file
+We are planning on occasionally updating this tool with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
diff --git a/examples/example_usage.py b/examples/example_usage.py
new file mode 100644
index 0000000..5caba6e
--- /dev/null
+++ b/examples/example_usage.py
@@ -0,0 +1,104 @@
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import transcriptomic_clustering
+from transcriptomic_clustering.iterative_clustering import (
+    build_cluster_dict, iter_clust, OnestepKwargs
+)
+
+# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
+adata=transcriptomic_clustering.normalize(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0) +adata.obsm['X_scVI'] = np.asarray(scvi) + +# Set up the clustering parameters +def setup_transcriptomic_clustering(): + means_vars_kwargs = { + 'low_thresh': 0.6931472, + 'min_cells': 4 + } + highly_variable_kwargs = { + 'max_genes': 4000 + } + pca_kwargs = { # not used if using a scvi latent space + 'cell_select': 30000, + 'n_comps': 50, + 'svd_solver': 'randomized' + } + filter_pcs_kwargs = { # not used if using a scvi latent space + 'known_components': None, + 'similarity_threshold': 0.7, + 'method': 'zscore', + 'zth': 2, + 'max_pcs': None, + } + filter_known_modes_kwargs = { + # 'known_modes': known_modes_df, + 'similarity_threshold': 0.7 + } + latent_kwargs = { + 'latent_component': "X_scVI" + } + cluster_louvain_kwargs = { + 'k': 15, + 'nn_measure': 'euclidean', + 'knn_method': 'annoy', + 'louvain_method': 'taynaud', + 'weighting_method': 'jaccard', + 'n_jobs': 30, + 'resolution': 1.0, + } + merge_clusters_kwargs = { + 'thresholds': { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 + }, + 'k': 4, + 'de_method': 'ebayes' + } + onestep_kwargs = OnestepKwargs( + means_vars_kwargs = means_vars_kwargs, + highly_variable_kwargs = highly_variable_kwargs, + pca_kwargs = pca_kwargs, + filter_pcs_kwargs = filter_pcs_kwargs, + filter_known_modes_kwargs = filter_known_modes_kwargs, + latent_kwargs = latent_kwargs, + cluster_louvain_kwargs = cluster_louvain_kwargs, + merge_clusters_kwargs = merge_clusters_kwargs + ) + return onestep_kwargs + +onestep_kwargs = setup_transcriptomic_clustering() + +# run the iterative clustering. need a tmp folder to store intermediate results +clusters, markers = iter_clust( + adata, + min_samples=10, + onestep_kwargs=onestep_kwargs, + random_seed=123, + tmp_dir="/path/to/your/tmp" +) + +cluster_dict = build_cluster_dict(clusters) + +adata.obs["scrattch_py_cluster"] = "" +for cluster in cluster_dict.keys(): + adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster + +adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category') + +# save the clustering results +res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster}) +res.to_csv('/path/to/your/clustering_results.csv') From 20467e3612bbf54a16d989f4d5be3d0910973245 Mon Sep 17 00:00:00 2001 From: dyuan1111 Date: Fri, 11 Oct 2024 12:56:02 -0700 Subject: [PATCH 02/13] Modified the final_merge function --- transcriptomic_clustering/__init__.py | 3 +- transcriptomic_clustering/de_ebayes.py | 90 ++++++++++ transcriptomic_clustering/final_merging.py | 194 +++++++++++++-------- transcriptomic_clustering/markers.py | 12 +- transcriptomic_clustering/merging.py | 17 +- 5 files changed, 234 insertions(+), 82 deletions(-) diff --git a/transcriptomic_clustering/__init__.py b/transcriptomic_clustering/__init__.py index e69b5a1..68b7e5f 100644 --- a/transcriptomic_clustering/__init__.py +++ b/transcriptomic_clustering/__init__.py @@ -29,5 +29,6 @@ from .cluster_means import get_cluster_means from .merging import merge_clusters from .diff_expression import de_pairs_chisq, vec_chisq_test -from .de_ebayes import de_pairs_ebayes +from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel from .merging import merge_clusters +from .final_merging import final_merge diff --git a/transcriptomic_clustering/de_ebayes.py b/transcriptomic_clustering/de_ebayes.py index 
ba9514d..240b6d6 100644
--- a/transcriptomic_clustering/de_ebayes.py
+++ b/transcriptomic_clustering/de_ebayes.py
@@ -15,6 +15,9 @@
 from .diff_expression import get_qdiff, filter_gene_stats, calc_de_score
 
+import multiprocessing as mp
+from functools import partial
+
 logger = logging.getLogger(__name__)
 
 """
@@ -232,3 +235,90 @@ def de_pairs_ebayes(
 
     de_pairs = pd.DataFrame(de_pairs).T
     return de_pairs
+
+def process_pair(cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds, pair):
+    """Compute eBayes-moderated DE statistics and scores for a single cluster pair."""
+    cluster_a, cluster_b = pair
+    # t-test with ebayes adjusted variances
+    means_diff = cl_means.loc[cluster_a] - cl_means.loc[cluster_b]
+    means_diff = means_diff.to_frame()
+    stdev_unscaled_comb = np.sqrt(np.sum(stdev_unscaled.loc[[cluster_a, cluster_b]] ** 2))[0]
+
+    df_total = df + df_prior
+    df_pooled = np.sum(df)
+    df_total = min(df_total, df_pooled)
+
+    t_vals = means_diff / np.sqrt(sigma_sq_post) / stdev_unscaled_comb
+
+    p_adj = np.ones((len(t_vals),))
+    p_vals = 2 * stats.t.sf(np.abs(t_vals[0]), df_total)
+    reject, p_adj, alphacSidak, alphacBonf = multipletests(p_vals, alpha=de_thresholds['padj_thresh'], method='holm')
+    lfc = means_diff
+
+    # Get DE score
+    de_pair_stats = pd.DataFrame(index=cl_means.columns)
+    de_pair_stats['p_value'] = p_vals
+    de_pair_stats['p_adj'] = p_adj
+    de_pair_stats['lfc'] = lfc
+    de_pair_stats["meanA"] = cl_means.loc[cluster_a]
+    de_pair_stats["meanB"] = cl_means.loc[cluster_b]
+    de_pair_stats["q1"] = cl_present.loc[cluster_a]
+    de_pair_stats["q2"] = cl_present.loc[cluster_b]
+    de_pair_stats["qdiff"] = get_qdiff(cl_present.loc[cluster_a], cl_present.loc[cluster_b])
+
+    de_pair_up = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='up-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    up_score = calc_de_score(de_pair_up['p_adj'].values)
+
+    de_pair_down = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='down-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    down_score = calc_de_score(de_pair_down['p_adj'].values)
+
+    return (pair, {
+        'score': up_score + down_score,
+        'up_score': up_score,
+        'down_score': down_score,
+        'up_genes': de_pair_up.index.to_list(),
+        'down_genes': de_pair_down.index.to_list(),
+        'up_num': len(de_pair_up.index),
+        'down_num': len(de_pair_down.index),
+        'num': len(de_pair_up.index) + len(de_pair_down.index)
+    })
+
+def de_pairs_ebayes_parallel(
+        pairs: List[Tuple[Any, Any]],
+        cl_means: pd.DataFrame,
+        cl_vars: pd.DataFrame,
+        cl_present: pd.DataFrame,
+        cl_size: Dict[Any, int],
+        de_thresholds: Dict[str, Any],
+        n_jobs: int = 1
+    ) -> pd.DataFrame:
+    """Parallel version of de_pairs_ebayes; distributes cluster pairs across n_jobs worker processes."""
+
+    logger.info('Fitting Variances')
+    sigma_sq, df, stdev_unscaled = get_linear_fit_vals(cl_vars, cl_size)
+    logger.info('Moderating Variances')
+    sigma_sq_post, var_prior, df_prior = moderate_variances(sigma_sq, df)
+
+    logger.info(f'Comparing {len(pairs)} pairs')
+
+    partial_process = partial(process_pair, cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds)
+
+    # Use multiprocessing to parallelize the process
+    with mp.Pool(processes=n_jobs) as pool:
+        results = pool.map(partial_process, pairs)
+
+    de_pairs = {pair: result for pair, result in results}
+    de_pairs = pd.DataFrame(de_pairs).T
+    return de_pairs
\ No newline at end of file
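The new de_pairs_ebayes_parallel keeps the de_pairs_ebayes interface and adds n_jobs. A minimal usage sketch (the cl_* inputs are assumed to be the per-cluster means/variances/present-means and the cluster-size dict used elsewhere in this series):

    from itertools import combinations
    import transcriptomic_clustering as tc

    pairs = list(combinations(cl_means.index, 2))
    de_df = tc.de_pairs_ebayes_parallel(
        pairs, cl_means, cl_vars, cl_present, cl_size,
        de_thresholds=thresholds, n_jobs=8,
    )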
diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index 0ed9471..1f8d170 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -1,4 +1,4 @@
-fromt typing import Optional, Dict, Set, List, Any
+from typing import Optional, Dict, Set, List, Any
 from collections import defaultdict
 from dataclasses import dataclass, field
 
@@ -13,7 +13,8 @@
     build_cluster_dict, summarize_final_clusters
 )
-
+import logging
+import time
 
 logger = logging.getLogger(__name__)
 
@@ -25,6 +26,7 @@ class FinalMergeKwargs:
     filter_known_modes_kwargs: Dict = field(default_factory = lambda: ({}))
     project_kwargs: Dict = field(default_factory = lambda: ({}))
     merge_clusters_kwargs: Dict = field(default_factory = lambda: ({}))
+    latent_kwargs: Dict = field(default_factory = lambda: ({}))
 
 
 def sample_clusters(
@@ -38,7 +40,10 @@ def sample_clusters(
 
     Parameters
     ----------
-    Adata
+    adata
+        AnnData object
+    cluster_dict
+        Dictionary of cluster assignments. Values are lists of integer cell indices
 
     Returns
     -------
@@ -46,13 +51,15 @@
     """
     rng = default_rng(random_seed)
 
-    cell_samples = []
+    cell_samples_ids = []
     for k, v in cluster_dict.items():
         if len(v) > n_samples_per_clust:
             choices = rng.choice(v, size=(n_samples_per_clust,))
         else:
             choices = v
-        cell_samples.extend(choices)
+        cell_samples_ids.extend(choices)
+
+    cell_samples = adata.obs_names[cell_samples_ids]
 
     cell_mask = pd.Series(
         index=adata.obs.index,
@@ -62,112 +69,151 @@
     return cell_mask
 
 
+def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
+    """
+    Convert a dictionary of cluster assignments to a list of cluster assignments
+    """
+    # Find the total number of observations
+    max_index = max(max(indices) for indices in obs_by_cluster.values())
+
+    # Initialize the list of clusters with None (or some other default value)
+    cluster_by_obs = [None] * (max_index + 1)
+
+    # Fill in the list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+
 def final_merge(
     adata: ad.AnnData,
-    cluster_assignments: pd.Series,
+    cluster_assignments: List,
     marker_genes: Set,
     n_samples_per_clust: int = 20,
     random_seed: Optional[int]=None,
+    n_jobs: Optional[int] = 1,
+    return_markers_df: Optional[bool] = False,
     final_merge_kwargs: FinalMergeKwargs = FinalMergeKwargs(),
 ) -> pd.DataFrame:
     """
     Runs a final merging step on cluster assignment results
+    Step 1: Using a pre-defined latent space or compute PCA as below:
     * Do PCA on random sample of cells per cluster and selected marker genes
     * Filter PCA results to select top eigenvectors
     * Project to reduced space
     * remove known eigen vector
-    * Do differential expression merge
+    Step 2: Do differential expression merge
 
     Parameters
     ----------
+    adata
+        AnnData object
+    cluster_assignments
+        List of arrays of cell ids, one array per cluster.
This is the result returned by onestep_clust/iter_clust """ - # Quick data rearranging for convenience - cluster_dict = defaultdict(lambda: []) - for cell_name, cl_label in cluster_assignments.iteritems(): - cluster_dict[row].append(cell_name) - - cell_mask = sample_clusters( - adata=adata, - cluster_dict=cluster_dict, - n_samples_per_clust=n_samples_per_clust, - random_seed=random_seed - ) - gene_mask = pd.Series( - index=adata.var.index, - dtype=bool - ) - gene_mask[markers] = True + obs_by_cluster = defaultdict(lambda: []) + for i, cell_ids in enumerate(cluster_assignments): + obs_by_cluster[i] = cell_ids + + cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster) - # Do PCA on cell samples and marker genes - logger.info('Computing PCA on cell samples and marker genes') - tic = time.perf_counter() - (components, explained_variance_ratio, explained_variance, means) = tc.pca( - adata, - gene_mask=gene_mask, - cell_select=cell_mask, - random_state=random_seed, - **final_merge_kwargs.pca_kwargs - ) - logger.info(f'Computed {components.shape[1]} principal components') - toc = time.perf_counter() - logger.info(f'PCA Elapsed Time: {toc - tic}') + if final_merge_kwargs.latent_kwargs.get("latent_component") is None: - # Filter PCA - logger.info('Filtering PCA Components') - tic = time.perf_counter() - components = tc.dimension_reduction.filter_components( - components, - explained_variance, - explained_variance_ratio, - **final_merge_kwargs.filter_pcs_kwargs - ) - logger.info(f'Filtered to {components.shape[1]} principal components') - toc = time.perf_counter() - logger.info(f'Filter PCA Elapsed Time: {toc - tic}') - - # Project - logger.info("Projecting into PCA space") - tic = time.perf_counter() - projected_adata = tc.project( - adata, components, means, - **final_merge_kwargs.project_kwargs - ) - logger.info(f'Projected Adata Dimensions: {projected_adata.shape}') - toc = time.perf_counter() - logger.info(f'Projection Elapsed Time: {toc - tic}') + cell_mask = sample_clusters( + adata=adata, + cluster_dict=obs_by_cluster, + n_samples_per_clust=n_samples_per_clust, + random_seed=random_seed + ) + gene_mask = pd.Series( + index=adata.var.index, + dtype=bool + ) + gene_mask[marker_genes] = True - # Filter Projection - #Filter Known Modes - if final_merge_kwargs.filter_known_modes_kwargs: - logger.info('Filtering Known Modes') + # Do PCA on cell samples and marker genes + logger.info('Computing PCA on cell samples and marker genes') tic = time.perf_counter() + (components, explained_variance_ratio, explained_variance, means) = tc.pca( + adata, + gene_mask=gene_mask, + cell_select=cell_mask, + random_state=random_seed, + **final_merge_kwargs.pca_kwargs + ) + logger.info(f'Computed {components.shape[1]} principal components') + toc = time.perf_counter() + logger.info(f'PCA Elapsed Time: {toc - tic}') - projected_adata = tc.filter_known_modes( - projected_adata, - **final_merge_kwargs.filter_known_modes_kwargs + # Filter PCA + logger.info('Filtering PCA Components') + tic = time.perf_counter() + components = tc.dimension_reduction.filter_components( + components, + explained_variance, + explained_variance_ratio, + **final_merge_kwargs.filter_pcs_kwargs ) + logger.info(f'Filtered to {components.shape[1]} principal components') + toc = time.perf_counter() + logger.info(f'Filter PCA Elapsed Time: {toc - tic}') - logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}') + # Project + logger.info("Projecting into PCA space") + tic = time.perf_counter() + 
projected_adata = tc.project( + adata, components, means, + **final_merge_kwargs.project_kwargs + ) + logger.info(f'Projected Adata Dimensions: {projected_adata.shape}') toc = time.perf_counter() - logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') - else: - logger.info('No known modes, skipping Filter Known Modes') + logger.info(f'Projection Elapsed Time: {toc - tic}') + + # Filter Projection + #Filter Known Modes + if final_merge_kwargs.filter_known_modes_kwargs: + logger.info('Filtering Known Modes') + tic = time.perf_counter() + + projected_adata = tc.filter_known_modes( + projected_adata, + **final_merge_kwargs.filter_known_modes_kwargs + ) + + logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}') + toc = time.perf_counter() + logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') + else: + logger.info('No known modes, skipping Filter Known Modes') + else: + logger.info('Extracting latent dims') + tic = time.perf_counter() + + ## Extract latent dimensions + projected_adata = tc.latent_project(adata, **final_merge_kwargs.latent_kwargs) + + toc = time.perf_counter() + logger.info(f'Extracting latent dims Elapsed Time: {toc - tic}') # Merging logger.info('Starting Cluster Merging') tic = time.perf_counter() - cluster_assignments_after_merging = tc.merge_clusters( + cluster_assignments_after_merging, markers = tc.merge_clusters( adata_norm=adata, adata_reduced=projected_adata, - cluster_assignments=cluster_dict, - cluster_by_obs=cluster_assignments, + cluster_assignments=obs_by_cluster, + cluster_by_obs=cluster_by_obs, + return_markers_df=return_markers_df, + n_jobs=n_jobs, **final_merge_kwargs.merge_clusters_kwargs ) logger.info(f'Completed Merging') toc = time.perf_counter() logger.info(f'Merging Elapsed Time: {toc - tic}') - return cluster_assignments_after_merging + return cluster_assignments_after_merging, markers diff --git a/transcriptomic_clustering/markers.py b/transcriptomic_clustering/markers.py index 1ac0700..04b8c56 100644 --- a/transcriptomic_clustering/markers.py +++ b/transcriptomic_clustering/markers.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Set, Literal, Any, Optional +from typing import Dict, List, Set, Literal, Any, Optional, Union import logging from itertools import combinations @@ -20,7 +20,9 @@ def select_marker_genes( thresholds: Dict[str, Any], n_markers: int = 20, de_method: Optional[Literal['ebayes', 'chisq']] = 'ebayes', -) -> Set: + return_markers_df: Optional[bool] = False, + n_jobs: Optional[int] = 1 +) -> Union[pd.DataFrame, set]: """ Selects n up genes and n down genes from the differentially expressed genes between each pair of clusters, and saves the combined set for all cluster pairs. 
@@ -58,13 +60,14 @@ def select_marker_genes(
     thresholds.pop('score_thresh', None)
 
     neighbor_pairs = list(combinations(cl_names, 2))
     if de_method == 'ebayes':
-        de_df = tc.de_pairs_ebayes(
+        de_df = tc.de_pairs_ebayes_parallel(
             neighbor_pairs,
             cluster_means,
             cluster_variances,
             present_cluster_means,
             cl_size,
             thresholds,
+            n_jobs = n_jobs
         )
     elif de_method == 'chisq':
         de_df = tc.de_pairs_chisq(
@@ -77,6 +80,9 @@
     else:
         raise ValueError(f'Unknown de_method {de_method}, must be one of [chisq, ebayes]')
 
+    if return_markers_df:
+        return de_df
+
     markers = set()
     for pair, row in de_df.iterrows():
         markers.update(row.up_genes[:n_markers])
diff --git a/transcriptomic_clustering/merging.py b/transcriptomic_clustering/merging.py
index 97c469c..6c7ce9b 100644
--- a/transcriptomic_clustering/merging.py
+++ b/transcriptomic_clustering/merging.py
@@ -34,7 +34,9 @@ def merge_clusters(
     k: Optional[int] = 2,
     de_method: Optional[str] = 'ebayes',
     n_markers: Optional[int] = 20,
-    chunk_size: Optional[int] = None
+    chunk_size: Optional[int] = None,
+    return_markers_df: Optional[bool] = False,
+    n_jobs: Optional[int] = 1
 ) -> Tuple[Dict[Any, np.ndarray], Set]:
     """
     Merge clusters based on size and differential gene expression score
@@ -63,6 +65,10 @@
         if 0 or None will skip
     chunk_size:
         number of observations to process in a single chunk
+    return_markers_df:
+        if True, return the full pairwise DE results as a dataframe instead of a marker set
+    n_jobs:
+        number of jobs to run in parallel
 
     Returns
     -------
@@ -132,7 +138,9 @@
     logger.info(f'Merging DE Elapsed Time: {toc - tic}')
 
     # Select marker genes
-    if n_markers:
+    if return_markers_df is False and n_markers is None:
+        markers = None
+    else:
         logger.info('Starting Marker Selection')
         tic = time.perf_counter()
         markers = select_marker_genes(
@@ -143,12 +151,13 @@
             thresholds=thresholds,
             n_markers=n_markers,
             de_method=de_method,
+            return_markers_df=return_markers_df,
+            n_jobs=n_jobs
         )
         logger.info('Completed Marker Selection')
         toc = time.perf_counter()
         logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
-    else:
-        markers = None
+
 
     return cluster_assignments_merge, markers

From 4fe87f6b109d87caa357085c8039d9f08f1d60b5 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 13:24:11 -0700
Subject: [PATCH 03/13] Updated the example scripts

---
 examples/clustering_example.py    | 124 ++++++++++++++++++++++++++++++
 examples/final_merging_example.py |  95 ++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 examples/clustering_example.py
 create mode 100644 examples/final_merging_example.py

diff --git a/examples/clustering_example.py b/examples/clustering_example.py
new file mode 100644
index 0000000..839c2f6
--- /dev/null
+++ b/examples/clustering_example.py
@@ -0,0 +1,124 @@
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import pickle
+import sys
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+import transcriptomic_clustering
+from transcriptomic_clustering.iterative_clustering import (
+    build_cluster_dict, iter_clust, OnestepKwargs
+)
+
+# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
This is the same as using sc.normalize_total(adata, target_sum=1e6) and sc.pp.log1p(adata) +adata=transcriptomic_clustering.normalize(adata) + +# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present. +scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0) +adata.obsm['X_scVI'] = np.asarray(scvi) + +# Set up the clustering parameters +def setup_transcriptomic_clustering(): + means_vars_kwargs = { + 'low_thresh': 0.6931472, + 'min_cells': 4 + } + highly_variable_kwargs = { + 'max_genes': 4000 + } + pca_kwargs = { # not used if using a scvi latent space + 'cell_select': 30000, + 'n_comps': 50, + 'svd_solver': 'randomized' + } + filter_pcs_kwargs = { # not used if using a scvi latent space + 'known_components': None, + 'similarity_threshold': 0.7, + 'method': 'zscore', + 'zth': 2, + 'max_pcs': None, + } + filter_known_modes_kwargs = { + # 'known_modes': known_modes_df, + 'similarity_threshold': 0.7 + } + latent_kwargs = { + 'latent_component': "X_scVI" + } + cluster_louvain_kwargs = { + 'k': 15, + 'nn_measure': 'euclidean', + 'knn_method': 'annoy', + 'louvain_method': 'taynaud', + 'weighting_method': 'jaccard', + 'n_jobs': 30, + 'resolution': 1.0, + } + merge_clusters_kwargs = { + 'thresholds': { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 + }, + 'k': 4, + 'de_method': 'ebayes' + } + onestep_kwargs = OnestepKwargs( + means_vars_kwargs = means_vars_kwargs, + highly_variable_kwargs = highly_variable_kwargs, + pca_kwargs = pca_kwargs, + filter_pcs_kwargs = filter_pcs_kwargs, + filter_known_modes_kwargs = filter_known_modes_kwargs, + latent_kwargs = latent_kwargs, + cluster_louvain_kwargs = cluster_louvain_kwargs, + merge_clusters_kwargs = merge_clusters_kwargs + ) + return onestep_kwargs + +onestep_kwargs = setup_transcriptomic_clustering() + +# run the iterative clustering. need a tmp folder to store intermediate results +clusters, markers = iter_clust( + adata, + min_samples=10, + onestep_kwargs=onestep_kwargs, + random_seed=123, + tmp_dir="/path/to/your/tmp" +) + +# save the results in .pkl format, which will be used for final merging +with open('clustering_results.pkl', 'wb') as f: + pickle.dump(clusters, f) + +with open('markers.pkl', 'wb') as f: + pickle.dump(markers, f) + +# (Optional) save the clustering results in a data frame. 
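+# Note: build_cluster_dict is assumed to map each cluster label to a list of
+# integer positions into adata.obs_names. A more explicit positional variant
+# of the label assignment below would be:
+#   labels = np.empty(adata.n_obs, dtype=object)
+#   for cluster, idx in cluster_dict.items():
+#       labels[idx] = cluster
+#   adata.obs["scrattch_py_cluster"] = labels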
+cluster_dict = build_cluster_dict(clusters)
+
+adata.obs["scrattch_py_cluster"] = ""
+for cluster in cluster_dict.keys():
+    adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster
+
+adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')
+
+res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
+res.to_csv('/path/to/your/clustering_results.csv')
diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
new file mode 100644
index 0000000..957a387
--- /dev/null
+++ b/examples/final_merging_example.py
@@ -0,0 +1,95 @@
+import sys
+import os
+import pickle
+import pandas as pd
+import scanpy as sc
+import importlib
+import time
+import anndata as ad
+import numpy as np
+import math
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+import transcriptomic_clustering as tc
+from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
+
+# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
+sc.pp.normalize_total(adata, target_sum=1e6)
+sc.pp.log1p(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
+adata.obsm['scVI'] = np.asarray(scvi)
+
+# loading clustering results
+cl_pth = "/path/to/clustering_results"
+with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
+    clusters = pickle.load(f)
+
+with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
+    markers = pickle.load(f)
+
+# The first 4 are for PCA. modify latent_kwargs if using a pre-computed latent space
+def setup_merging():
+    pca_kwargs ={
+        # 'cell_select': 30000, # should not set this for final merging, as we need to sample from each cluster if computing PCA
+        'n_comps': 50,
+        'svd_solver': 'randomized'
+    }
+    filter_pcs_kwargs = {
+        'known_components': None,
+        'similarity_threshold': 0.7,
+        'method': 'zscore',
+        'zth': 2,
+        'max_pcs': None}
+    filter_known_modes_kwargs = {
+        'known_modes': None,
+        'similarity_threshold': 0.7}
+    project_kwargs = {}
+    merge_clusters_kwargs = {
+        'thresholds': {
+            'q1_thresh': 0.5,
+            'q2_thresh': None,
+            'cluster_size_thresh': 10,
+            'qdiff_thresh': 0.7,
+            'padj_thresh': 0.05,
+            'lfc_thresh': 1,
+            'score_thresh': 100,
+            'low_thresh': 0.6931472,
+            'min_genes': 5
+        },
+        'k': 4,
+        'de_method': 'ebayes'
+    }
+    latent_kwargs = { # if None: default is running pca, else use the latent_component in adata.obsm
+        'latent_component': "scVI"
+    }
+
+    merge_kwargs = FinalMergeKwargs(
+        pca_kwargs = pca_kwargs,
+        filter_pcs_kwargs = filter_pcs_kwargs,
+        filter_known_modes_kwargs = filter_known_modes_kwargs,
+        project_kwargs = project_kwargs,
+        merge_clusters_kwargs = merge_clusters_kwargs,
+        latent_kwargs = latent_kwargs
+    )
+    return merge_kwargs
+
+merge_kwargs = setup_merging()
+
+# Run the final merging
+clusters_after_merging, markers = final_merge(
+    adata,
+    clusters,
+    markers,
+    n_samples_per_clust=20,
+    random_seed=2024,
+    final_merge_kwargs=merge_kwargs,
+    n_jobs = 30, # modify this to the number of cores you want to use
+    return_markers_df = True # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
+)
\ No newline at end of file

From 5d6b2ba95563292a4f41ac9dbfe0d29ad55e0641 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 13:28:13 -0700
Subject: [PATCH 04/13] Deleted the original example_usage.py

---
 examples/example_usage.py | 104 --------------------------------------
 1 file changed, 104 deletions(-)
 delete mode 100644 examples/example_usage.py

diff --git a/examples/example_usage.py b/examples/example_usage.py
deleted file mode 100644
index 5caba6e..0000000
--- a/examples/example_usage.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import scanpy as sc
-import pandas as pd
-import numpy as np
-import transcriptomic_clustering
-from transcriptomic_clustering.iterative_clustering import (
-    build_cluster_dict, iter_clust, OnestepKwargs
-)
-
-# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
-adata = sc.read('path/to/your/data.h5ad')
-
-# Normalize the data. Skip if adata.X is already normalized.
-adata=transcriptomic_clustering.normalize(adata)
-
-# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
-scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
-adata.obsm['X_scVI'] = np.asarray(scvi)
-
-# Set up the clustering parameters
-def setup_transcriptomic_clustering():
-    means_vars_kwargs = {
-        'low_thresh': 0.6931472,
-        'min_cells': 4
-    }
-    highly_variable_kwargs = {
-        'max_genes': 4000
-    }
-    pca_kwargs = { # not used if using a scvi latent space
-        'cell_select': 30000,
-        'n_comps': 50,
-        'svd_solver': 'randomized'
-    }
-    filter_pcs_kwargs = { # not used if using a scvi latent space
-        'known_components': None,
-        'similarity_threshold': 0.7,
-        'method': 'zscore',
-        'zth': 2,
-        'max_pcs': None,
-    }
-    filter_known_modes_kwargs = {
-        # 'known_modes': known_modes_df,
-        'similarity_threshold': 0.7
-    }
-    latent_kwargs = {
-        'latent_component': "X_scVI"
-    }
-    cluster_louvain_kwargs = {
-        'k': 15,
-        'nn_measure': 'euclidean',
-        'knn_method': 'annoy',
-        'louvain_method': 'taynaud',
-        'weighting_method': 'jaccard',
-        'n_jobs': 30,
-        'resolution': 1.0,
-    }
-    merge_clusters_kwargs = {
-        'thresholds': {
-            'q1_thresh': 0.5,
-            'q2_thresh': None,
-            'cluster_size_thresh': 10,
-            'qdiff_thresh': 0.7,
-            'padj_thresh': 0.05,
-            'lfc_thresh': 1,
-            'score_thresh': 100,
-            'low_thresh': 0.6931472,
-            'min_genes': 5
-        },
-        'k': 4,
-        'de_method': 'ebayes'
-    }
-    onestep_kwargs = OnestepKwargs(
-        means_vars_kwargs = means_vars_kwargs,
-        highly_variable_kwargs = highly_variable_kwargs,
-        pca_kwargs = pca_kwargs,
-        filter_pcs_kwargs = filter_pcs_kwargs,
-        filter_known_modes_kwargs = filter_known_modes_kwargs,
-        latent_kwargs = latent_kwargs,
-        cluster_louvain_kwargs = cluster_louvain_kwargs,
-        merge_clusters_kwargs = merge_clusters_kwargs
-    )
-    return onestep_kwargs
-
-onestep_kwargs = setup_transcriptomic_clustering()
-
-# run the iterative clustering. need a tmp folder to store intermediate results
-clusters, markers = iter_clust(
-    adata,
-    min_samples=10,
-    onestep_kwargs=onestep_kwargs,
-    random_seed=123,
-    tmp_dir="/path/to/your/tmp"
-)
-
-cluster_dict = build_cluster_dict(clusters)
-
-adata.obs["scrattch_py_cluster"] = ""
-for cluster in cluster_dict.keys():
-    adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster
-
-adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')
-
-# save the clustering results
-res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
-res.to_csv('/path/to/your/clustering_results.csv')

From 93147f562ec484a5432b7552d3d00e9ef3764d82 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 14:38:48 -0700
Subject: [PATCH 05/13] modified return data type in final_merge()

---
 transcriptomic_clustering/final_merging.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index 1f8d170..e9f18e9 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -96,7 +96,7 @@ def final_merge(
     n_jobs: Optional[int] = 1,
     return_markers_df: Optional[bool] = False,
     final_merge_kwargs: FinalMergeKwargs = FinalMergeKwargs(),
-) -> pd.DataFrame:
+) -> Tuple[List[List[int]], Union[pd.DataFrame, set]]:
     """
     Runs a final merging step on cluster assignment results

From 1d35005d85a739d9f44bcffca62a03a77e355521 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 16:14:19 -0700
Subject: [PATCH 06/13] modified final_merge() to allow not providing marker genes

---
 transcriptomic_clustering/final_merging.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index e9f18e9..f64080f 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Set, List, Any
+from typing import Optional, Dict, Set, List, Any, Tuple, Union
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -9,9 +9,6 @@
 import anndata as ad
 
 import transcriptomic_clustering as tc
-from transcriptomic_clustering.iterative_clustering import (
-    build_cluster_dict, summarize_final_clusters
-)
 
 import logging
 import time
@@ -90,7 +87,7 @@ def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
 def final_merge(
     adata: ad.AnnData,
     cluster_assignments: List,
-    marker_genes: Set,
+    marker_genes: Optional[set] = None,
     n_samples_per_clust: int = 20,
     random_seed: Optional[int]=None,
     n_jobs: Optional[int] = 1,
@@ -123,6 +120,9 @@
 
     if final_merge_kwargs.latent_kwargs.get("latent_component") is None:
 
+        if marker_genes is None:
+            raise ValueError("Need marker genes to run PCA")
+
         cell_mask = sample_clusters(
             adata=adata,
             cluster_dict=obs_by_cluster,
@@ -216,4 +216,6 @@
     toc = time.perf_counter()
     logger.info(f'Merging Elapsed Time: {toc - tic}')
 
+    cluster_assignments_after_merging = list(cluster_assignments_after_merging.values())
+
     return cluster_assignments_after_merging, markers
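With marker_genes now optional, a latent-space run can skip markers entirely. A minimal sketch under the kwargs defined in final_merging.py (the 'X_scVI' obsm key is an assumed entry, as in the example scripts):

    from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

    merge_kwargs = FinalMergeKwargs(latent_kwargs={'latent_component': 'X_scVI'})
    clusters_merged, markers = final_merge(
        adata, clusters, n_jobs=8, final_merge_kwargs=merge_kwargs,
    )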
From 5eb4083646db2c1df9a5e50413a3f3b783a77062 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Mon, 14 Oct 2024 13:53:48 -0700
Subject: [PATCH 07/13] Modify example scripts

---
 examples/final_merging_example.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 957a387..667dc7f 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -31,6 +31,7 @@
 with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
     clusters = pickle.load(f)
 
+# marker genes are only needed for computing PCA
 with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
     markers = pickle.load(f)
 
@@ -86,7 +87,7 @@ def setup_merging():
 clusters_after_merging, markers = final_merge(
     adata,
     clusters,
-    markers,
+    markers, # required for PCA, but optional if using a pre-computed latent space
     n_samples_per_clust=20,
     random_seed=2024,
     final_merge_kwargs=merge_kwargs,

From 21845d4ce1e9616e87d2bba83ed12a717f4202d7 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Mon, 14 Oct 2024 13:58:11 -0700
Subject: [PATCH 08/13] Modified example scripts

---
 examples/final_merging_example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 667dc7f..e648b34 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -12,7 +12,6 @@
 # Skip this line if transcriptomic_clustering is installed
 sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
 
-import transcriptomic_clustering as tc
 from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
 
 # Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.

From 6438d8f4c81a6b6f42f806b39f5eed7718d093ff Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Tue, 26 Nov 2024 19:35:33 -0800
Subject: [PATCH 09/13] fixed a bug in pca(); changed the input to a str for filter_known_modes()

---
 transcriptomic_clustering/dimension_reduction.py | 2 +-
 transcriptomic_clustering/filter_known_modes.py  | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/transcriptomic_clustering/dimension_reduction.py b/transcriptomic_clustering/dimension_reduction.py
index 499e083..2a39f39 100644
--- a/transcriptomic_clustering/dimension_reduction.py
+++ b/transcriptomic_clustering/dimension_reduction.py
@@ -116,7 +116,7 @@ def pca(
     _, vidx = adata._normalize_indices((slice(None), gene_mask))  # handle gene mask like anndata would
     vidx_bool = np.zeros((adata.n_vars,), dtype=bool)
     vidx_bool[vidx] = True
-    n_genes = len(vidx)
+    n_genes = sum(vidx)
 
     # select n_comps
     max_comps = min(n_cells, n_genes) - 1
diff --git a/transcriptomic_clustering/filter_known_modes.py b/transcriptomic_clustering/filter_known_modes.py
index 7b17c43..98ce71c 100644
--- a/transcriptomic_clustering/filter_known_modes.py
+++ b/transcriptomic_clustering/filter_known_modes.py
@@ -10,7 +10,7 @@
 
 def filter_known_modes(
         projected_adata: ad.AnnData,
-        known_modes: Union[pd.DataFrame, pd.Series],
+        known_modes: Optional[str] = None,
         similarity_threshold: Optional[float] = 0.7):
     """
     Filters out principal components which correlate strongly with the known modes
@@ -27,6 +27,12 @@
         projected_adata: after filtering out correlated principal components
 
     """
+    # determine if known_modes is in adata.obs
+    if known_modes in projected_adata.obs.columns:
+        known_modes = projected_adata.obs[known_modes]
+    else:
+        raise ValueError(f'{known_modes} not found in adata.obs')
+
     if isinstance(known_modes, pd.Series):
         known_modes = known_modes.to_frame()
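filter_known_modes now takes the name of a column in adata.obs rather than a DataFrame/Series. One assumed way to build such a per-cell covariate (the 'log2ngene' name matches the example updated in the next patch):

    import numpy as np
    import scanpy as sc

    # log2 of the number of detected genes per cell, stored where
    # filter_known_modes can look it up by name
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
    adata.obs['log2ngene'] = np.log2(adata.obs['n_genes_by_counts'] + 1)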
From 096165b988d8df2144fadd39446c4bba81692576 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Tue, 26 Nov 2024 19:48:11 -0800
Subject: [PATCH 10/13] update the example script for final merging

---
 examples/final_merging_example.py | 40 ++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index e648b34..0a63c4d 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -34,7 +34,7 @@
 with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
     markers = pickle.load(f)
 
-# The first 4 are for PCA. modify latent_kwargs if using a pre-computed latent space
+# The first 4 are for PCA only. modify latent_kwargs if using a pre-computed latent space
 def setup_merging():
     pca_kwargs ={
         # 'cell_select': 30000, # should not set this for final merging, as we need to sample from each cluster if computing PCA
         'n_comps': 50,
         'svd_solver': 'randomized'
     }
@@ -48,7 +48,7 @@ def setup_merging():
         'zth': 2,
         'max_pcs': None}
     filter_known_modes_kwargs = {
-        'known_modes': None,
+        'known_modes': 'log2ngene',
         'similarity_threshold': 0.7}
     project_kwargs = {}
     merge_clusters_kwargs = {
@@ -64,10 +64,11 @@ def setup_merging():
         },
         'k': 4,
-        'de_method': 'ebayes'
+        'de_method': 'ebayes',
+        # 'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
     }
-    latent_kwargs = { # if None: default is running pca, else use the latent_component in adata.obsm
-        'latent_component': "scVI"
+    latent_kwargs = {
+        'latent_component': "scVI" # None or an obsm in adata. if None: default is running pca, else use the latent_component in adata.obsm
     }
 
     merge_kwargs = FinalMergeKwargs(
@@ -89,7 +90,30 @@ def setup_merging():
     markers, # required for PCA, but optional if using a pre-computed latent space
     n_samples_per_clust=20,
     random_seed=2024,
-    final_merge_kwargs=merge_kwargs,
     n_jobs = 30, # modify this to the number of cores you want to use
-    return_markers_df = True # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
-)
\ No newline at end of file
+    return_markers_df = False, # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
+    final_merge_kwargs=merge_kwargs
+)
+
+out_dir = "/path/to/output"
+
+with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
+    pickle.dump(clusters_after_merging, f)
+
+# determine datatype for markers_after_merging and save
+if markers_after_merging is None:
+    print("Skipped calculating markers. Did not save markers.")
+elif isinstance(markers_after_merging, pd.DataFrame):
+    markers_after_merging.to_csv(os.path.join(out_dir,'markers_after_merging.csv'))
+else:
+    with open(os.path.join(out_dir,'markers_after_merging.pkl'), 'wb') as f:
+        pickle.dump(markers_after_merging, f)
+
+# convert the clustering results to a .csv file
+n_cells = sum(len(i) for i in clusters_after_merging)
+cl = ['unknown']*n_cells
+for i in range(len(clusters_after_merging)):
+    for j in clusters_after_merging[i]:
+        cl[j] = i+1
+res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
+res.to_csv(os.path.join(out_dir,'clustering_results_after_merging.csv'))
\ No newline at end of file

From 026edbab4e05392eea15b5801860365a29faf71f Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Wed, 16 Jul 2025 11:40:39 -0700
Subject: [PATCH 11/13] update example

---
 examples/final_merging_example.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 0a63c4d..9917c91 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -15,7 +15,7 @@
 from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
 
 # Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
-adata = sc.read('path/to/your/data.h5ad')
+adata = sc.read('path/to/your/adata.h5ad')
 
 # Normalize the data. Skip if adata.X is already normalized.
 sc.pp.normalize_total(adata, target_sum=1e6)
 sc.pp.log1p(adata)
@@ -25,7 +25,7 @@
 scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
 adata.obsm['scVI'] = np.asarray(scvi)
 
-# loading clustering results
+# Loading clustering results
 cl_pth = "/path/to/clustering_results"
 with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
     clusters = pickle.load(f)
@@ -65,7 +65,7 @@ def setup_merging():
         'de_method': 'ebayes',
-        # 'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
+        'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
     }
     latent_kwargs = {
         'latent_component': "scVI" # None or an obsm in adata.
if None: default is running pca, else use the latent_component in adata.obsm @@ -84,7 +84,7 @@ def setup_merging(): merge_kwargs = setup_merging() # Run the final merging -clusters_after_merging, markers = final_merge( +clusters_after_merging, markers_after_merging = final_merge( adata, clusters, markers, # required for PCA, but optional if using a pre-computed latent space From 5c43281d95164608703bba96dcf8d4ccecfe4a16 Mon Sep 17 00:00:00 2001 From: dyuan1111 Date: Thu, 31 Jul 2025 20:38:50 -0700 Subject: [PATCH 12/13] add de pair function --- examples/pairwiseDE_example.py | 45 ++++++++++++ transcriptomic_clustering/pairwise_DEGs.py | 79 ++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 examples/pairwiseDE_example.py create mode 100644 transcriptomic_clustering/pairwise_DEGs.py diff --git a/examples/pairwiseDE_example.py b/examples/pairwiseDE_example.py new file mode 100644 index 0000000..48f9a10 --- /dev/null +++ b/examples/pairwiseDE_example.py @@ -0,0 +1,45 @@ +# this example script is used to get pair-wise DEGs for all clusters given the normalized count and the cluster assignment +import sys +import scanpy as sc +import pickle +from collections import defaultdict + +sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/') +from transcriptomic_clustering.pairwise_DEGs import pairwise_degs + +thresholds = { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 +} + +adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad') + +clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl' +with open(clusters_pth, 'rb') as f: + clusters = pickle.load(f) +obs_by_cluster = defaultdict(lambda: []) +for i, cell_ids in enumerate(clusters): + obs_by_cluster[i] = cell_ids + +# subset the adata and obs_by_cluster to only include two clusters for testing +adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy() + +obs_by_cluster_subset = {} +obs_by_cluster_subset[0] = [i for i in range(len(obs_by_cluster[0]))] +obs_by_cluster_subset[1] = [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))] + +degs = pairwise_degs( + adata, + obs_by_cluster_subset, + thresholds, + n_markers=20, + de_method='ebayes', + n_jobs=30 + ) \ No newline at end of file diff --git a/transcriptomic_clustering/pairwise_DEGs.py b/transcriptomic_clustering/pairwise_DEGs.py new file mode 100644 index 0000000..6c28e95 --- /dev/null +++ b/transcriptomic_clustering/pairwise_DEGs.py @@ -0,0 +1,79 @@ +import anndata as ad +import scanpy as sc +import time +from typing import Dict, List, Any, Optional +import transcriptomic_clustering as tc +from transcriptomic_clustering.markers import select_marker_genes +import logging + +logger = logging.getLogger(__name__) + +def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]: + """ + Convert a dictionary of cluster assignments to a list of cluster assignments + """ + # Find the total number of observations + max_index = max(max(indices) for indices in obs_by_cluster.values()) + + # Initialize the list of clusters with None (or some other default value) + cluster_by_obs = [None] * (max_index + 1) + + # Fill in the 
list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+def pairwise_degs(
+        adata_norm: ad.AnnData,
+        obs_by_cluster: Dict[Any, List], # a dictionary with keys as cluster names and values as lists of cell indices
+        thresholds: Dict[str, Any],
+        n_markers: int = 20,
+        de_method: str = 'ebayes',
+        n_jobs: int = 30,
+        chunk_size: Optional[int] = None
+    ):
+    """
+    Perform pairwise differential expression analysis on clusters in an AnnData object.
+    Parameters
+    ----------
+    adata_norm : AnnData
+        The normalized AnnData object containing the data.
+    obs_by_cluster : Dict[Any, List]
+        A dictionary where keys are cluster names and values are lists of cell indices belonging to those clusters
+    chunk_size : Optional[int]
+        The size of chunks to process at a time. If None, the entire dataset is processed
+    thresholds : Dict[str, Any]
+    n_markers : int
+        The number of up-regulated and down-regulated markers to return for all pairs of clusters
+    """
+    cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster)
+    logger.info("Computing Cluster Means")
+    tic = time.perf_counter()
+    cl_means, present_cl_means, cl_vars = tc.get_cluster_means(adata_norm,
+                                                               obs_by_cluster,
+                                                               cluster_by_obs,
+                                                               chunk_size,
+                                                               low_th=thresholds['low_thresh'])
+    logger.info(f'Completed Cluster Means')
+    toc = time.perf_counter()
+    logger.info(f'Cluster Means Elapsed Time: {toc - tic}')
+
+    logger.info('Starting Marker Selection')
+    tic = time.perf_counter()
+    markers = select_marker_genes(
+        cluster_assignments=obs_by_cluster,
+        cluster_means=cl_means,
+        cluster_variances=cl_vars,
+        present_cluster_means=present_cl_means,
+        thresholds=thresholds,
+        n_markers=n_markers,
+        de_method=de_method,
+        n_jobs=n_jobs
+    )
+    logger.info('Completed Marker Selection')
+    toc = time.perf_counter()
+    logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
+
+    return markers
\ No newline at end of file

From bbd457933919498e8115711d96b988a31767ce40 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Thu, 31 Jul 2025 20:54:13 -0700
Subject: [PATCH 13/13] update de example

---
 examples/pairwiseDE_example.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/pairwiseDE_example.py b/examples/pairwiseDE_example.py
index 48f9a10..b9496e7 100644
--- a/examples/pairwiseDE_example.py
+++ b/examples/pairwiseDE_example.py
@@ -19,22 +19,28 @@
     'min_genes': 5
 }
 
+# reading in the adata whose adata.X is already normalized
 adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad')
 
+# reading in the clustering results, a list of lists of cell indices in adata
 clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl'
 with open(clusters_pth, 'rb') as f:
     clusters = pickle.load(f)
+
+# convert to a dictionary of cluster id to cell indices
 obs_by_cluster = defaultdict(lambda: [])
 for i, cell_ids in enumerate(clusters):
     obs_by_cluster[i] = cell_ids
 
-# subset the adata and obs_by_cluster to only include two clusters for testing
+# subset the adata to include only the first two clusters for testing
 adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy()
 
+# create the new obs_by_cluster of cluster id to the cell indices in the subsetted adata
 obs_by_cluster_subset = {}
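+# (assumption: after the .copy() above, AnnData keeps rows in the order of the
+# concatenated index lists, so cluster-0 cells occupy positions 0..len-1 and
+# cluster-1 cells follow; the renumbering below relies on that ordering)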
 obs_by_cluster_subset[0] = [i for i in range(len(obs_by_cluster[0]))]
 obs_by_cluster_subset[1] = [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))]
 
+# returns a set of markers (20 up-regulated and 20 down-regulated)
 degs = pairwise_degs(
     adata,
     obs_by_cluster_subset,