6 changes: 5 additions & 1 deletion README.md
@@ -10,10 +10,14 @@ Installing
----------
For usage and installation instructions, see the [install guide](docs/install.rst).

Examples
--------
Example usage of this package can be found in the `examples` directory:

- `clustering_example.py`: iterative clustering of an AnnData object
- `final_merging_example.py`: final merging of the resulting clusters
- `pairwiseDE_example.py`: pair-wise differential expression between cluster pairs

Contributing
------------
We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!

Level of Support
----------------
We plan to update this tool occasionally, with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
115 changes: 115 additions & 0 deletions examples/clustering_example.py
@@ -0,0 +1,115 @@
import pickle
import sys

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

import transcriptomic_clustering
from transcriptomic_clustering.iterative_clustering import (
    build_cluster_dict, iter_clust, OnestepKwargs
)

# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
adata = sc.read('path/to/your/data.h5ad')

# Normalize the data. Skip if adata.X is already normalized. This is equivalent to sc.pp.normalize_total(adata, target_sum=1e6) followed by sc.pp.log1p(adata).
adata = transcriptomic_clustering.normalize(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['X_scVI'] = np.asarray(scvi)
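# Note (assumption): this takes the CSV rows to be in the same order as adata.obs_names;
# reindex first (e.g., np.asarray(scvi.loc[adata.obs_names])) if that is not guaranteed.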

# Set up the clustering parameters
def setup_transcriptomic_clustering():
    means_vars_kwargs = {
        'low_thresh': 0.6931472,
        'min_cells': 4
    }
    highly_variable_kwargs = {
        'max_genes': 4000
    }
    pca_kwargs = {  # not used if using a scVI latent space
        'cell_select': 30000,
        'n_comps': 50,
        'svd_solver': 'randomized'
    }
    filter_pcs_kwargs = {  # not used if using a scVI latent space
        'known_components': None,
        'similarity_threshold': 0.7,
        'method': 'zscore',
        'zth': 2,
        'max_pcs': None,
    }
    filter_known_modes_kwargs = {
        # 'known_modes': known_modes_df,
        'similarity_threshold': 0.7
    }
    latent_kwargs = {
        'latent_component': "X_scVI"
    }
    cluster_louvain_kwargs = {
        'k': 15,
        'nn_measure': 'euclidean',
        'knn_method': 'annoy',
        'louvain_method': 'taynaud',
        'weighting_method': 'jaccard',
        'n_jobs': 30,
        'resolution': 1.0,
    }
    merge_clusters_kwargs = {
        'thresholds': {
            'q1_thresh': 0.5,
            'q2_thresh': None,
            'cluster_size_thresh': 10,
            'qdiff_thresh': 0.7,
            'padj_thresh': 0.05,
            'lfc_thresh': 1,
            'score_thresh': 100,
            'low_thresh': 0.6931472,
            'min_genes': 5
        },
        'k': 4,
        'de_method': 'ebayes'
    }
    onestep_kwargs = OnestepKwargs(
        means_vars_kwargs=means_vars_kwargs,
        highly_variable_kwargs=highly_variable_kwargs,
        pca_kwargs=pca_kwargs,
        filter_pcs_kwargs=filter_pcs_kwargs,
        filter_known_modes_kwargs=filter_known_modes_kwargs,
        latent_kwargs=latent_kwargs,
        cluster_louvain_kwargs=cluster_louvain_kwargs,
        merge_clusters_kwargs=merge_clusters_kwargs
    )
    return onestep_kwargs

onestep_kwargs = setup_transcriptomic_clustering()

# Run the iterative clustering. A tmp folder is needed to store intermediate results.
clusters, markers = iter_clust(
    adata,
    min_samples=10,
    onestep_kwargs=onestep_kwargs,
    random_seed=123,
    tmp_dir="/path/to/your/tmp"
)
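
# Quick sanity check (a sketch): `clusters` is a list of lists of cell indices,
# which is how the downstream examples consume it
print(f"Found {len(clusters)} clusters; sizes: {[len(c) for c in clusters]}")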

# Save the results in .pkl format, which will be used for final merging
with open('clustering_results.pkl', 'wb') as f:
    pickle.dump(clusters, f)

with open('markers.pkl', 'wb') as f:
    pickle.dump(markers, f)

# (Optional) save the clustering results in a data frame.
cluster_dict = build_cluster_dict(clusters)

# Assign each cell its cluster label (avoids pandas chained-assignment issues)
cluster_labels = pd.Series("", index=adata.obs_names, dtype=object)
for cluster, cell_indices in cluster_dict.items():
    cluster_labels.iloc[cell_indices] = cluster
adata.obs["scrattch_py_cluster"] = cluster_labels.astype('category')

res = pd.DataFrame({'sample_id': adata.obs_names, 'cl': adata.obs["scrattch_py_cluster"]})
res.to_csv('/path/to/your/clustering_results.csv')
119 changes: 119 additions & 0 deletions examples/final_merging_example.py
@@ -0,0 +1,119 @@
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
adata = sc.read('path/to/your/adata.h5ad')

# Normalize the data. Skip if adata.X is already normalized.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['scVI'] = np.asarray(scvi)
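# Note (assumption): as in the clustering example, the CSV rows are taken to be
# in the same order as adata.obs_names; reindex first if that is not guaranteed.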

# Loading clustering results
cl_pth = "/path/to/clustering_results"
with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
    clusters = pickle.load(f)

# marker genes are only needed for computing PCA
with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
    markers = pickle.load(f)

# The first four kwargs blocks configure PCA only; modify latent_kwargs if using a pre-computed latent space
def setup_merging():
    pca_kwargs = {
        # 'cell_select': 30000,  # do not set this for final merging; cells must be sampled from each cluster when computing PCA
        'n_comps': 50,
        'svd_solver': 'randomized'
    }
    filter_pcs_kwargs = {
        'known_components': None,
        'similarity_threshold': 0.7,
        'method': 'zscore',
        'zth': 2,
        'max_pcs': None
    }
    filter_known_modes_kwargs = {
        'known_modes': 'log2ngene',
        'similarity_threshold': 0.7
    }
    project_kwargs = {}
    merge_clusters_kwargs = {
        'thresholds': {
            'q1_thresh': 0.5,
            'q2_thresh': None,
            'cluster_size_thresh': 10,
            'qdiff_thresh': 0.7,
            'padj_thresh': 0.05,
            'lfc_thresh': 1,
            'score_thresh': 100,
            'low_thresh': 0.6931472,
            'min_genes': 5
        },
        'k': 4,
        'de_method': 'ebayes',
        'n_markers': None,  # if None, the marker calculation step (the time-consuming step) is bypassed
    }
    latent_kwargs = {
        'latent_component': "scVI"  # None or an obsm key in adata; if None, PCA is run, otherwise the given adata.obsm entry is used
    }

    merge_kwargs = FinalMergeKwargs(
        pca_kwargs=pca_kwargs,
        filter_pcs_kwargs=filter_pcs_kwargs,
        filter_known_modes_kwargs=filter_known_modes_kwargs,
        project_kwargs=project_kwargs,
        merge_clusters_kwargs=merge_clusters_kwargs,
        latent_kwargs=latent_kwargs
    )
    return merge_kwargs

merge_kwargs = setup_merging()

# Run the final merging
clusters_after_merging, markers_after_merging = final_merge(
    adata,
    clusters,
    markers,  # required for PCA, but optional if using a pre-computed latent space
    n_samples_per_clust=20,
    random_seed=2024,
    n_jobs=30,  # set this to the number of cores you want to use
    return_markers_df=False,  # if True, return the pair-wise DE results for each cluster pair; if False (default), return only a set of markers (top 20 up- and down-regulated genes per pairwise comparison)
    final_merge_kwargs=merge_kwargs
)
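
# Quick summary (a sketch, assuming both results are lists of clusters)
print(f"{len(clusters)} clusters before merging, {len(clusters_after_merging)} after merging")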

out_dir = "/path/to/output"

with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
    pickle.dump(clusters_after_merging, f)

# determine the datatype of markers_after_merging and save accordingly
if markers_after_merging is None:
    print("Skipped calculating markers. Did not save markers.")
elif isinstance(markers_after_merging, pd.DataFrame):
    markers_after_merging.to_csv(os.path.join(out_dir, 'markers_after_merging.csv'))
else:
    with open(os.path.join(out_dir, 'markers_after_merging.pkl'), 'wb') as f:
        pickle.dump(markers_after_merging, f)

# convert the clustering results to a .csv file
n_cells = sum(len(c) for c in clusters_after_merging)
cl = ['unknown'] * n_cells
for i, cluster_cells in enumerate(clusters_after_merging):
    for j in cluster_cells:
        cl[j] = i + 1
res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
res.to_csv(os.path.join(out_dir, 'clustering_results_after_merging.csv'))
51 changes: 51 additions & 0 deletions examples/pairwiseDE_example.py
@@ -0,0 +1,51 @@
# This example script computes pair-wise DEGs for all clusters, given normalized counts and a cluster assignment
import sys
import scanpy as sc
import pickle
from collections import defaultdict

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.pairwise_DEGs import pairwise_degs

thresholds = {
    'q1_thresh': 0.5,
    'q2_thresh': None,
    'cluster_size_thresh': 10,
    'qdiff_thresh': 0.7,
    'padj_thresh': 0.05,
    'lfc_thresh': 1,
    'score_thresh': 100,
    'low_thresh': 0.6931472,
    'min_genes': 5
}

# read in the adata whose adata.X is already normalized
adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad')

# read in the clustering results: a list of lists of cell indices into adata
clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl'
with open(clusters_pth, 'rb') as f:
    clusters = pickle.load(f)

# convert to a dictionary of cluster id to cell indices
obs_by_cluster = defaultdict(list)
for i, cell_ids in enumerate(clusters):
    obs_by_cluster[i] = cell_ids

# subset the adata to include only the first two clusters for testing
adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy()

# create the new obs_by_cluster mapping cluster id to cell indices in the subsetted adata
obs_by_cluster_subset = {
    0: list(range(len(obs_by_cluster[0]))),
    1: [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))],
}
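
# Sanity check (a sketch): the subset should contain exactly the cells of the two clusters
assert adata.n_obs == len(obs_by_cluster_subset[0]) + len(obs_by_cluster_subset[1])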

# returns a set of markers (20 up-regulated and 20 down-regulated genes per pair)
degs = pairwise_degs(
    adata,
    obs_by_cluster_subset,
    thresholds,
    n_markers=20,
    de_method='ebayes',
    n_jobs=30
)
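
# Persist the marker set for downstream use (a sketch; the output filename is an arbitrary choice)
with open('pairwise_degs.pkl', 'wb') as f:
    pickle.dump(degs, f)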
3 changes: 2 additions & 1 deletion transcriptomic_clustering/__init__.py
@@ -29,5 +29,6 @@
from .cluster_means import get_cluster_means
from .merging import merge_clusters
from .diff_expression import de_pairs_chisq, vec_chisq_test
from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel
from .merging import merge_clusters
Copilot AI commented on Jul 16, 2025:
Duplicate import of merge_clusters; remove the redundant line to keep the module `__init__.py` clean.

Suggested change:
- from .merging import merge_clusters
from .final_merging import final_merge