6 changes: 5 additions & 1 deletion README.md
@@ -10,10 +10,14 @@ Installing
----------
For usage and installation instructions, see the [install guide](docs/install.rst).

Examples
--------
Example usage of this package can be found in the `examples` directory:

- `clustering_example.py`: iterative clustering of an AnnData object
- `final_merging_example.py`: final merging of the resulting clusters
- `pairwiseDE_example.py`: pair-wise differential expression between cluster pairs

Contributing
------------
We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!

Level of Support
----------------
We plan to update this tool occasionally, with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
115 changes: 115 additions & 0 deletions examples/clustering_example.py
@@ -0,0 +1,115 @@
import pickle
import sys

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

import transcriptomic_clustering
from transcriptomic_clustering.iterative_clustering import (
    build_cluster_dict, iter_clust, OnestepKwargs
)

# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
adata = sc.read('path/to/your/data.h5ad')

# Normalize the data. Skip if adata.X is already normalized. This is equivalent to sc.pp.normalize_total(adata, target_sum=1e6) followed by sc.pp.log1p(adata).
adata = transcriptomic_clustering.normalize(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['X_scVI'] = np.asarray(scvi)
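# Note (assumption): this takes the CSV rows to be in the same order as adata.obs_names;
# reindex first (e.g., np.asarray(scvi.loc[adata.obs_names])) if that is not guaranteed.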

# Set up the clustering parameters
def setup_transcriptomic_clustering():
    means_vars_kwargs = {
        'low_thresh': 0.6931472,
        'min_cells': 4
    }
    highly_variable_kwargs = {
        'max_genes': 4000
    }
    pca_kwargs = {  # not used if using a scVI latent space
        'cell_select': 30000,
        'n_comps': 50,
        'svd_solver': 'randomized'
    }
    filter_pcs_kwargs = {  # not used if using a scVI latent space
        'known_components': None,
        'similarity_threshold': 0.7,
        'method': 'zscore',
        'zth': 2,
        'max_pcs': None,
    }
    filter_known_modes_kwargs = {
        # 'known_modes': known_modes_df,
        'similarity_threshold': 0.7
    }
    latent_kwargs = {
        'latent_component': "X_scVI"
    }
    cluster_louvain_kwargs = {
        'k': 15,
        'nn_measure': 'euclidean',
        'knn_method': 'annoy',
        'louvain_method': 'taynaud',
        'weighting_method': 'jaccard',
        'n_jobs': 30,
        'resolution': 1.0,
    }
    merge_clusters_kwargs = {
        'thresholds': {
            'q1_thresh': 0.5,
            'q2_thresh': None,
            'cluster_size_thresh': 10,
            'qdiff_thresh': 0.7,
            'padj_thresh': 0.05,
            'lfc_thresh': 1,
            'score_thresh': 100,
            'low_thresh': 0.6931472,
            'min_genes': 5
        },
        'k': 4,
        'de_method': 'ebayes'
    }
    onestep_kwargs = OnestepKwargs(
        means_vars_kwargs=means_vars_kwargs,
        highly_variable_kwargs=highly_variable_kwargs,
        pca_kwargs=pca_kwargs,
        filter_pcs_kwargs=filter_pcs_kwargs,
        filter_known_modes_kwargs=filter_known_modes_kwargs,
        latent_kwargs=latent_kwargs,
        cluster_louvain_kwargs=cluster_louvain_kwargs,
        merge_clusters_kwargs=merge_clusters_kwargs
    )
    return onestep_kwargs

onestep_kwargs = setup_transcriptomic_clustering()

# Run the iterative clustering. A tmp folder is needed to store intermediate results.
clusters, markers = iter_clust(
    adata,
    min_samples=10,
    onestep_kwargs=onestep_kwargs,
    random_seed=123,
    tmp_dir="/path/to/your/tmp"
)
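
# Quick sanity check (a sketch): `clusters` is a list of lists of cell indices,
# which is how the downstream examples consume it
print(f"Found {len(clusters)} clusters; sizes: {[len(c) for c in clusters]}")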

# Save the results in .pkl format, which will be used for final merging
with open('clustering_results.pkl', 'wb') as f:
    pickle.dump(clusters, f)

with open('markers.pkl', 'wb') as f:
    pickle.dump(markers, f)

# (Optional) save the clustering results in a data frame.
cluster_dict = build_cluster_dict(clusters)

# Assign each cell its cluster label (avoids pandas chained-assignment issues)
cluster_labels = pd.Series("", index=adata.obs_names, dtype=object)
for cluster, cell_indices in cluster_dict.items():
    cluster_labels.iloc[cell_indices] = cluster
adata.obs["scrattch_py_cluster"] = cluster_labels.astype('category')

res = pd.DataFrame({'sample_id': adata.obs_names, 'cl': adata.obs["scrattch_py_cluster"]})
res.to_csv('/path/to/your/clustering_results.csv')
119 changes: 119 additions & 0 deletions examples/final_merging_example.py
@@ -0,0 +1,119 @@
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scanpy as sc

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
adata = sc.read('path/to/your/adata.h5ad')

# Normalize the data. Skip if adata.X is already normalized.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
adata.obsm['scVI'] = np.asarray(scvi)
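# Note (assumption): as in the clustering example, the CSV rows are taken to be
# in the same order as adata.obs_names; reindex first if that is not guaranteed.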

# Loading clustering results
cl_pth = "/path/to/clustering_results"
with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
    clusters = pickle.load(f)

# marker genes are only needed for computing PCA
with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
    markers = pickle.load(f)

# The first four kwargs blocks configure PCA only; modify latent_kwargs if using a pre-computed latent space
def setup_merging():
    pca_kwargs = {
        # 'cell_select': 30000,  # do not set this for final merging; cells must be sampled from each cluster when computing PCA
        'n_comps': 50,
        'svd_solver': 'randomized'
    }
    filter_pcs_kwargs = {
        'known_components': None,
        'similarity_threshold': 0.7,
        'method': 'zscore',
        'zth': 2,
        'max_pcs': None
    }
    filter_known_modes_kwargs = {
        'known_modes': 'log2ngene',
        'similarity_threshold': 0.7
    }
    project_kwargs = {}
    merge_clusters_kwargs = {
        'thresholds': {
            'q1_thresh': 0.5,
            'q2_thresh': None,
            'cluster_size_thresh': 10,
            'qdiff_thresh': 0.7,
            'padj_thresh': 0.05,
            'lfc_thresh': 1,
            'score_thresh': 100,
            'low_thresh': 0.6931472,
            'min_genes': 5
        },
        'k': 4,
        'de_method': 'ebayes',
        'n_markers': None,  # if None, the marker calculation step (the time-consuming step) is bypassed
    }
    latent_kwargs = {
        'latent_component': "scVI"  # None or an obsm key in adata; if None, PCA is run, otherwise the given adata.obsm entry is used
    }

    merge_kwargs = FinalMergeKwargs(
        pca_kwargs=pca_kwargs,
        filter_pcs_kwargs=filter_pcs_kwargs,
        filter_known_modes_kwargs=filter_known_modes_kwargs,
        project_kwargs=project_kwargs,
        merge_clusters_kwargs=merge_clusters_kwargs,
        latent_kwargs=latent_kwargs
    )
    return merge_kwargs

merge_kwargs = setup_merging()

# Run the final merging
clusters_after_merging, markers_after_merging = final_merge(
    adata,
    clusters,
    markers,  # required for PCA, but optional if using a pre-computed latent space
    n_samples_per_clust=20,
    random_seed=2024,
    n_jobs=30,  # set this to the number of cores you want to use
    return_markers_df=False,  # if True, return the pair-wise DE results for each cluster pair; if False (default), return only a set of markers (top 20 up- and down-regulated genes per pairwise comparison)
    final_merge_kwargs=merge_kwargs
)
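
# Quick summary (a sketch, assuming both results are lists of clusters)
print(f"{len(clusters)} clusters before merging, {len(clusters_after_merging)} after merging")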

out_dir = "/path/to/output"

with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
    pickle.dump(clusters_after_merging, f)

# determine the datatype of markers_after_merging and save accordingly
if markers_after_merging is None:
    print("Skipped calculating markers. Did not save markers.")
elif isinstance(markers_after_merging, pd.DataFrame):
    markers_after_merging.to_csv(os.path.join(out_dir, 'markers_after_merging.csv'))
else:
    with open(os.path.join(out_dir, 'markers_after_merging.pkl'), 'wb') as f:
        pickle.dump(markers_after_merging, f)

# convert the clustering results to a .csv file
n_cells = sum(len(c) for c in clusters_after_merging)
cl = ['unknown'] * n_cells
for i, cluster_cells in enumerate(clusters_after_merging):
    for j in cluster_cells:
        cl[j] = i + 1
res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
res.to_csv(os.path.join(out_dir, 'clustering_results_after_merging.csv'))
51 changes: 51 additions & 0 deletions examples/pairwiseDE_example.py
@@ -0,0 +1,51 @@
# This example script computes pair-wise DEGs for all clusters, given normalized counts and a cluster assignment
import sys
import scanpy as sc
import pickle
from collections import defaultdict

# Skip this line if transcriptomic_clustering is installed
sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')

from transcriptomic_clustering.pairwise_DEGs import pairwise_degs

thresholds = {
    'q1_thresh': 0.5,
    'q2_thresh': None,
    'cluster_size_thresh': 10,
    'qdiff_thresh': 0.7,
    'padj_thresh': 0.05,
    'lfc_thresh': 1,
    'score_thresh': 100,
    'low_thresh': 0.6931472,
    'min_genes': 5
}

# read in the adata whose adata.X is already normalized
adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad')

# read in the clustering results: a list of lists of cell indices into adata
clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl'
with open(clusters_pth, 'rb') as f:
    clusters = pickle.load(f)

# convert to a dictionary of cluster id to cell indices
obs_by_cluster = defaultdict(list)
for i, cell_ids in enumerate(clusters):
    obs_by_cluster[i] = cell_ids

# subset the adata to include only the first two clusters for testing
adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy()

# create the new obs_by_cluster mapping cluster id to cell indices in the subsetted adata
obs_by_cluster_subset = {
    0: list(range(len(obs_by_cluster[0]))),
    1: [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))],
}
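
# Sanity check (a sketch): the subset should contain exactly the cells of the two clusters
assert adata.n_obs == len(obs_by_cluster_subset[0]) + len(obs_by_cluster_subset[1])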

# returns a set of markers (20 up-regulated and 20 down-regulated genes per pair)
degs = pairwise_degs(
    adata,
    obs_by_cluster_subset,
    thresholds,
    n_markers=20,
    de_method='ebayes',
    n_jobs=30
)
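
# Persist the marker set for downstream use (a sketch; the output filename is an arbitrary choice)
with open('pairwise_degs.pkl', 'wb') as f:
    pickle.dump(degs, f)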
3 changes: 2 additions & 1 deletion transcriptomic_clustering/__init__.py
@@ -29,5 +29,6 @@
from .cluster_means import get_cluster_means
from .merging import merge_clusters
from .diff_expression import de_pairs_chisq, vec_chisq_test
from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel
from .merging import merge_clusters
Copilot AI commented on Jul 16, 2025:
Duplicate import of merge_clusters; remove the redundant line to keep the module `__init__.py` clean.

Suggested change:
- from .merging import merge_clusters
from .final_merging import final_merge