From 49349e336d8233c5e7603d301c3f7c79cecf2352 Mon Sep 17 00:00:00 2001
From: Dan Yuan
Date: Tue, 13 Aug 2024 15:38:13 -0700
Subject: [PATCH 01/13] Added an example script and updated README

---
 README.md                 |   6 ++-
 examples/example_usage.py | 104 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 examples/example_usage.py

diff --git a/README.md b/README.md
index 0016035..797bcfa 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,14 @@ Installing
 ----------
 For usage and installation instructions, see the [install guide](docs/install.rst).
 
+Examples
+--------
+You can find example usage of this package in the examples directory.
+
 Contributing
 ------------
 We welcome contributions! Please see our [contribution guide](CONTRIBUTING.md) for more information. Thank you!
 
 Level of Support
 ----------------
-We are planning on occasional updating this tool with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
\ No newline at end of file
+We are planning on occasionally updating this tool with no fixed schedule. Community involvement is encouraged through both issues and pull requests.
diff --git a/examples/example_usage.py b/examples/example_usage.py
new file mode 100644
index 0000000..5caba6e
--- /dev/null
+++ b/examples/example_usage.py
@@ -0,0 +1,104 @@
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import transcriptomic_clustering
+from transcriptomic_clustering.iterative_clustering import (
+    build_cluster_dict, iter_clust, OnestepKwargs
+)
+
+# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
+adata=transcriptomic_clustering.normalize(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0) +adata.obsm['X_scVI'] = np.asarray(scvi) + +# Set up the clustering parameters +def setup_transcriptomic_clustering(): + means_vars_kwargs = { + 'low_thresh': 0.6931472, + 'min_cells': 4 + } + highly_variable_kwargs = { + 'max_genes': 4000 + } + pca_kwargs = { # not used if using a scvi latent space + 'cell_select': 30000, + 'n_comps': 50, + 'svd_solver': 'randomized' + } + filter_pcs_kwargs = { # not used if using a scvi latent space + 'known_components': None, + 'similarity_threshold': 0.7, + 'method': 'zscore', + 'zth': 2, + 'max_pcs': None, + } + filter_known_modes_kwargs = { + # 'known_modes': known_modes_df, + 'similarity_threshold': 0.7 + } + latent_kwargs = { + 'latent_component': "X_scVI" + } + cluster_louvain_kwargs = { + 'k': 15, + 'nn_measure': 'euclidean', + 'knn_method': 'annoy', + 'louvain_method': 'taynaud', + 'weighting_method': 'jaccard', + 'n_jobs': 30, + 'resolution': 1.0, + } + merge_clusters_kwargs = { + 'thresholds': { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 + }, + 'k': 4, + 'de_method': 'ebayes' + } + onestep_kwargs = OnestepKwargs( + means_vars_kwargs = means_vars_kwargs, + highly_variable_kwargs = highly_variable_kwargs, + pca_kwargs = pca_kwargs, + filter_pcs_kwargs = filter_pcs_kwargs, + filter_known_modes_kwargs = filter_known_modes_kwargs, + latent_kwargs = latent_kwargs, + cluster_louvain_kwargs = cluster_louvain_kwargs, + merge_clusters_kwargs = merge_clusters_kwargs + ) + return onestep_kwargs + +onestep_kwargs = setup_transcriptomic_clustering() + +# run the iterative clustering. need a tmp folder to store intermediate results +clusters, markers = iter_clust( + adata, + min_samples=10, + onestep_kwargs=onestep_kwargs, + random_seed=123, + tmp_dir="/path/to/your/tmp" +) + +cluster_dict = build_cluster_dict(clusters) + +adata.obs["scrattch_py_cluster"] = "" +for cluster in cluster_dict.keys(): + adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster + +adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category') + +# save the clustering results +res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster}) +res.to_csv('/path/to/your/clustering_results.csv') From 20467e3612bbf54a16d989f4d5be3d0910973245 Mon Sep 17 00:00:00 2001 From: dyuan1111 Date: Fri, 11 Oct 2024 12:56:02 -0700 Subject: [PATCH 02/13] Modified the final_merge function --- transcriptomic_clustering/__init__.py | 3 +- transcriptomic_clustering/de_ebayes.py | 90 ++++++++++ transcriptomic_clustering/final_merging.py | 194 +++++++++++++-------- transcriptomic_clustering/markers.py | 12 +- transcriptomic_clustering/merging.py | 17 +- 5 files changed, 234 insertions(+), 82 deletions(-) diff --git a/transcriptomic_clustering/__init__.py b/transcriptomic_clustering/__init__.py index e69b5a1..68b7e5f 100644 --- a/transcriptomic_clustering/__init__.py +++ b/transcriptomic_clustering/__init__.py @@ -29,5 +29,6 @@ from .cluster_means import get_cluster_means from .merging import merge_clusters from .diff_expression import de_pairs_chisq, vec_chisq_test -from .de_ebayes import de_pairs_ebayes +from .de_ebayes import de_pairs_ebayes, de_pairs_ebayes_parallel from .merging import merge_clusters +from .final_merging import final_merge diff --git a/transcriptomic_clustering/de_ebayes.py b/transcriptomic_clustering/de_ebayes.py index 
ba9514d..240b6d6 100644
--- a/transcriptomic_clustering/de_ebayes.py
+++ b/transcriptomic_clustering/de_ebayes.py
@@ -15,6 +15,9 @@
 from .diff_expression import get_qdiff, filter_gene_stats, calc_de_score
 
+import multiprocessing as mp
+from functools import partial
+
 logger = logging.getLogger(__name__)
 
 """
@@ -232,3 +235,90 @@ def de_pairs_ebayes(
 
     de_pairs = pd.DataFrame(de_pairs).T
     return de_pairs
+
+def process_pair(cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds, pair):
+    """Compute eBayes-moderated DE statistics and scores for a single cluster pair."""
+    cluster_a, cluster_b = pair
+    # t-test with ebayes adjusted variances
+    means_diff = cl_means.loc[cluster_a] - cl_means.loc[cluster_b]
+    means_diff = means_diff.to_frame()
+    stdev_unscaled_comb = np.sqrt(np.sum(stdev_unscaled.loc[[cluster_a, cluster_b]] ** 2))[0]
+
+    df_total = df + df_prior
+    df_pooled = np.sum(df)
+    df_total = min(df_total, df_pooled)
+
+    t_vals = means_diff / np.sqrt(sigma_sq_post) / stdev_unscaled_comb
+
+    p_adj = np.ones((len(t_vals),))
+    p_vals = 2 * stats.t.sf(np.abs(t_vals[0]), df_total)
+    reject, p_adj, alphacSidak, alphacBonf = multipletests(p_vals, alpha=de_thresholds['padj_thresh'], method='holm')
+    lfc = means_diff
+
+    # Get DE score
+    de_pair_stats = pd.DataFrame(index=cl_means.columns)
+    de_pair_stats['p_value'] = p_vals
+    de_pair_stats['p_adj'] = p_adj
+    de_pair_stats['lfc'] = lfc
+    de_pair_stats["meanA"] = cl_means.loc[cluster_a]
+    de_pair_stats["meanB"] = cl_means.loc[cluster_b]
+    de_pair_stats["q1"] = cl_present.loc[cluster_a]
+    de_pair_stats["q2"] = cl_present.loc[cluster_b]
+    de_pair_stats["qdiff"] = get_qdiff(cl_present.loc[cluster_a], cl_present.loc[cluster_b])
+
+    de_pair_up = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='up-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    up_score = calc_de_score(de_pair_up['p_adj'].values)
+
+    de_pair_down = filter_gene_stats(
+        de_stats=de_pair_stats,
+        gene_type='down-regulated',
+        cl1_size=cl_size[cluster_a],
+        cl2_size=cl_size[cluster_b],
+        **de_thresholds
+    )
+    down_score = calc_de_score(de_pair_down['p_adj'].values)
+
+    return (pair, {
+        'score': up_score + down_score,
+        'up_score': up_score,
+        'down_score': down_score,
+        'up_genes': de_pair_up.index.to_list(),
+        'down_genes': de_pair_down.index.to_list(),
+        'up_num': len(de_pair_up.index),
+        'down_num': len(de_pair_down.index),
+        'num': len(de_pair_up.index) + len(de_pair_down.index)
+    })
+
+def de_pairs_ebayes_parallel(
+        pairs: List[Tuple[Any, Any]],
+        cl_means: pd.DataFrame,
+        cl_vars: pd.DataFrame,
+        cl_present: pd.DataFrame,
+        cl_size: Dict[Any, int],
+        de_thresholds: Dict[str, Any],
+        n_jobs: int = 1
+    ) -> pd.DataFrame:
+    """Parallel version of de_pairs_ebayes; distributes cluster pairs across n_jobs worker processes."""
+
+    logger.info('Fitting Variances')
+    sigma_sq, df, stdev_unscaled = get_linear_fit_vals(cl_vars, cl_size)
+    logger.info('Moderating Variances')
+    sigma_sq_post, var_prior, df_prior = moderate_variances(sigma_sq, df)
+
+    logger.info(f'Comparing {len(pairs)} pairs')
+
+    partial_process = partial(process_pair, cl_means, cl_present, cl_size, stdev_unscaled, df, df_prior, sigma_sq_post, de_thresholds)
+
+    # Use multiprocessing to parallelize the process
+    with mp.Pool(processes=n_jobs) as pool:
+        results = pool.map(partial_process, pairs)
+
+    de_pairs = {pair: result for pair, result in results}
+    de_pairs = pd.DataFrame(de_pairs).T
+    return de_pairs
\ No newline at end of file
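The new de_pairs_ebayes_parallel keeps the de_pairs_ebayes interface and adds n_jobs. A minimal usage sketch (the cl_* inputs are assumed to be the per-cluster means/variances/present-means and the cluster-size dict used elsewhere in this series):

    from itertools import combinations
    import transcriptomic_clustering as tc

    pairs = list(combinations(cl_means.index, 2))
    de_df = tc.de_pairs_ebayes_parallel(
        pairs, cl_means, cl_vars, cl_present, cl_size,
        de_thresholds=thresholds, n_jobs=8,
    )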
diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index 0ed9471..1f8d170 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -1,4 +1,4 @@
-fromt typing import Optional, Dict, Set, List, Any
+from typing import Optional, Dict, Set, List, Any
 from collections import defaultdict
 from dataclasses import dataclass, field
 
@@ -13,7 +13,8 @@
     build_cluster_dict, summarize_final_clusters
 )
-
+import logging
+import time
 
 logger = logging.getLogger(__name__)
 
@@ -25,6 +26,7 @@ class FinalMergeKwargs:
     filter_known_modes_kwargs: Dict = field(default_factory = lambda: ({}))
     project_kwargs: Dict = field(default_factory = lambda: ({}))
     merge_clusters_kwargs: Dict = field(default_factory = lambda: ({}))
+    latent_kwargs: Dict = field(default_factory = lambda: ({}))
 
 
 def sample_clusters(
@@ -38,7 +40,10 @@ def sample_clusters(
 
     Parameters
     ----------
-    Adata
+    adata
+        AnnData object
+    cluster_dict
+        Dictionary of cluster assignments. Values are lists of integer cell indices
 
     Returns
     -------
@@ -46,13 +51,15 @@
     """
     rng = default_rng(random_seed)
 
-    cell_samples = []
+    cell_samples_ids = []
     for k, v in cluster_dict.items():
         if len(v) > n_samples_per_clust:
             choices = rng.choice(v, size=(n_samples_per_clust,))
         else:
             choices = v
-        cell_samples.extend(choices)
+        cell_samples_ids.extend(choices)
+
+    cell_samples = adata.obs_names[cell_samples_ids]
 
     cell_mask = pd.Series(
         index=adata.obs.index,
@@ -62,112 +69,151 @@
     return cell_mask
 
 
+def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
+    """
+    Convert a dictionary of cluster assignments to a list of cluster assignments
+    """
+    # Find the total number of observations
+    max_index = max(max(indices) for indices in obs_by_cluster.values())
+
+    # Initialize the list of clusters with None (or some other default value)
+    cluster_by_obs = [None] * (max_index + 1)
+
+    # Fill in the list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+
 def final_merge(
     adata: ad.AnnData,
-    cluster_assignments: pd.Series,
+    cluster_assignments: List,
     marker_genes: Set,
     n_samples_per_clust: int = 20,
     random_seed: Optional[int]=None,
+    n_jobs: Optional[int] = 1,
+    return_markers_df: Optional[bool] = False,
     final_merge_kwargs: FinalMergeKwargs = FinalMergeKwargs(),
 ) -> pd.DataFrame:
     """
     Runs a final merging step on cluster assignment results
+    Step 1: Using a pre-defined latent space or compute PCA as below:
     * Do PCA on random sample of cells per cluster and selected marker genes
     * Filter PCA results to select top eigenvectors
     * Project to reduced space
     * remove known eigen vector
-    * Do differential expression merge
+    Step 2: Do differential expression merge
 
     Parameters
     ----------
+    adata
+        AnnData object
+    cluster_assignments
+        List of arrays of cell ids, one array per cluster.
This is the result returned by onestep_clust/iter_clust """ - # Quick data rearranging for convenience - cluster_dict = defaultdict(lambda: []) - for cell_name, cl_label in cluster_assignments.iteritems(): - cluster_dict[row].append(cell_name) - - cell_mask = sample_clusters( - adata=adata, - cluster_dict=cluster_dict, - n_samples_per_clust=n_samples_per_clust, - random_seed=random_seed - ) - gene_mask = pd.Series( - index=adata.var.index, - dtype=bool - ) - gene_mask[markers] = True + obs_by_cluster = defaultdict(lambda: []) + for i, cell_ids in enumerate(cluster_assignments): + obs_by_cluster[i] = cell_ids + + cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster) - # Do PCA on cell samples and marker genes - logger.info('Computing PCA on cell samples and marker genes') - tic = time.perf_counter() - (components, explained_variance_ratio, explained_variance, means) = tc.pca( - adata, - gene_mask=gene_mask, - cell_select=cell_mask, - random_state=random_seed, - **final_merge_kwargs.pca_kwargs - ) - logger.info(f'Computed {components.shape[1]} principal components') - toc = time.perf_counter() - logger.info(f'PCA Elapsed Time: {toc - tic}') + if final_merge_kwargs.latent_kwargs.get("latent_component") is None: - # Filter PCA - logger.info('Filtering PCA Components') - tic = time.perf_counter() - components = tc.dimension_reduction.filter_components( - components, - explained_variance, - explained_variance_ratio, - **final_merge_kwargs.filter_pcs_kwargs - ) - logger.info(f'Filtered to {components.shape[1]} principal components') - toc = time.perf_counter() - logger.info(f'Filter PCA Elapsed Time: {toc - tic}') - - # Project - logger.info("Projecting into PCA space") - tic = time.perf_counter() - projected_adata = tc.project( - adata, components, means, - **final_merge_kwargs.project_kwargs - ) - logger.info(f'Projected Adata Dimensions: {projected_adata.shape}') - toc = time.perf_counter() - logger.info(f'Projection Elapsed Time: {toc - tic}') + cell_mask = sample_clusters( + adata=adata, + cluster_dict=obs_by_cluster, + n_samples_per_clust=n_samples_per_clust, + random_seed=random_seed + ) + gene_mask = pd.Series( + index=adata.var.index, + dtype=bool + ) + gene_mask[marker_genes] = True - # Filter Projection - #Filter Known Modes - if final_merge_kwargs.filter_known_modes_kwargs: - logger.info('Filtering Known Modes') + # Do PCA on cell samples and marker genes + logger.info('Computing PCA on cell samples and marker genes') tic = time.perf_counter() + (components, explained_variance_ratio, explained_variance, means) = tc.pca( + adata, + gene_mask=gene_mask, + cell_select=cell_mask, + random_state=random_seed, + **final_merge_kwargs.pca_kwargs + ) + logger.info(f'Computed {components.shape[1]} principal components') + toc = time.perf_counter() + logger.info(f'PCA Elapsed Time: {toc - tic}') - projected_adata = tc.filter_known_modes( - projected_adata, - **final_merge_kwargs.filter_known_modes_kwargs + # Filter PCA + logger.info('Filtering PCA Components') + tic = time.perf_counter() + components = tc.dimension_reduction.filter_components( + components, + explained_variance, + explained_variance_ratio, + **final_merge_kwargs.filter_pcs_kwargs ) + logger.info(f'Filtered to {components.shape[1]} principal components') + toc = time.perf_counter() + logger.info(f'Filter PCA Elapsed Time: {toc - tic}') - logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}') + # Project + logger.info("Projecting into PCA space") + tic = time.perf_counter() + 
projected_adata = tc.project( + adata, components, means, + **final_merge_kwargs.project_kwargs + ) + logger.info(f'Projected Adata Dimensions: {projected_adata.shape}') toc = time.perf_counter() - logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') - else: - logger.info('No known modes, skipping Filter Known Modes') + logger.info(f'Projection Elapsed Time: {toc - tic}') + + # Filter Projection + #Filter Known Modes + if final_merge_kwargs.filter_known_modes_kwargs: + logger.info('Filtering Known Modes') + tic = time.perf_counter() + + projected_adata = tc.filter_known_modes( + projected_adata, + **final_merge_kwargs.filter_known_modes_kwargs + ) + + logger.info(f'Projected Adata Dimensions after Filtering Known Modes: {projected_adata.shape}') + toc = time.perf_counter() + logger.info(f'Filter Known Modes Elapsed Time: {toc - tic}') + else: + logger.info('No known modes, skipping Filter Known Modes') + else: + logger.info('Extracting latent dims') + tic = time.perf_counter() + + ## Extract latent dimensions + projected_adata = tc.latent_project(adata, **final_merge_kwargs.latent_kwargs) + + toc = time.perf_counter() + logger.info(f'Extracting latent dims Elapsed Time: {toc - tic}') # Merging logger.info('Starting Cluster Merging') tic = time.perf_counter() - cluster_assignments_after_merging = tc.merge_clusters( + cluster_assignments_after_merging, markers = tc.merge_clusters( adata_norm=adata, adata_reduced=projected_adata, - cluster_assignments=cluster_dict, - cluster_by_obs=cluster_assignments, + cluster_assignments=obs_by_cluster, + cluster_by_obs=cluster_by_obs, + return_markers_df=return_markers_df, + n_jobs=n_jobs, **final_merge_kwargs.merge_clusters_kwargs ) logger.info(f'Completed Merging') toc = time.perf_counter() logger.info(f'Merging Elapsed Time: {toc - tic}') - return cluster_assignments_after_merging + return cluster_assignments_after_merging, markers diff --git a/transcriptomic_clustering/markers.py b/transcriptomic_clustering/markers.py index 1ac0700..04b8c56 100644 --- a/transcriptomic_clustering/markers.py +++ b/transcriptomic_clustering/markers.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Set, Literal, Any, Optional +from typing import Dict, List, Set, Literal, Any, Optional, Union import logging from itertools import combinations @@ -20,7 +20,9 @@ def select_marker_genes( thresholds: Dict[str, Any], n_markers: int = 20, de_method: Optional[Literal['ebayes', 'chisq']] = 'ebayes', -) -> Set: + return_markers_df: Optional[bool] = False, + n_jobs: Optional[int] = 1 +) -> Union[pd.DataFrame, set]: """ Selects n up genes and n down genes from the differentially expressed genes between each pair of clusters, and saves the combined set for all cluster pairs. 
@@ -58,13 +60,14 @@ def select_marker_genes(
     thresholds.pop('score_thresh', None)
 
     neighbor_pairs = list(combinations(cl_names, 2))
     if de_method == 'ebayes':
-        de_df = tc.de_pairs_ebayes(
+        de_df = tc.de_pairs_ebayes_parallel(
             neighbor_pairs,
             cluster_means,
             cluster_variances,
             present_cluster_means,
             cl_size,
             thresholds,
+            n_jobs = n_jobs
         )
     elif de_method == 'chisq':
         de_df = tc.de_pairs_chisq(
@@ -77,6 +80,9 @@
     else:
         raise ValueError(f'Unknown de_method {de_method}, must be one of [chisq, ebayes]')
 
+    if return_markers_df:
+        return de_df
+
     markers = set()
     for pair, row in de_df.iterrows():
         markers.update(row.up_genes[:n_markers])
diff --git a/transcriptomic_clustering/merging.py b/transcriptomic_clustering/merging.py
index 97c469c..6c7ce9b 100644
--- a/transcriptomic_clustering/merging.py
+++ b/transcriptomic_clustering/merging.py
@@ -34,7 +34,9 @@ def merge_clusters(
     k: Optional[int] = 2,
     de_method: Optional[str] = 'ebayes',
     n_markers: Optional[int] = 20,
-    chunk_size: Optional[int] = None
+    chunk_size: Optional[int] = None,
+    return_markers_df: Optional[bool] = False,
+    n_jobs: Optional[int] = 1
 ) -> Tuple[Dict[Any, np.ndarray], Set]:
     """
     Merge clusters based on size and differential gene expression score
@@ -63,6 +65,10 @@
         if 0 or None will skip
     chunk_size:
         number of observations to process in a single chunk
+    return_markers_df:
+        if True, return the full pairwise DE results as a dataframe instead of a marker set
+    n_jobs:
+        number of jobs to run in parallel
 
     Returns
     -------
@@ -132,7 +138,9 @@
     logger.info(f'Merging DE Elapsed Time: {toc - tic}')
 
     # Select marker genes
-    if n_markers:
+    if return_markers_df is False and n_markers is None:
+        markers = None
+    else:
         logger.info('Starting Marker Selection')
         tic = time.perf_counter()
         markers = select_marker_genes(
@@ -143,12 +151,13 @@
             thresholds=thresholds,
             n_markers=n_markers,
             de_method=de_method,
+            return_markers_df=return_markers_df,
+            n_jobs=n_jobs
         )
         logger.info('Completed Marker Selection')
         toc = time.perf_counter()
         logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
-    else:
-        markers = None
+
 
     return cluster_assignments_merge, markers

From 4fe87f6b109d87caa357085c8039d9f08f1d60b5 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 13:24:11 -0700
Subject: [PATCH 03/13] Updated the example scripts

---
 examples/clustering_example.py    | 124 ++++++++++++++++++++++++++++++
 examples/final_merging_example.py |  95 ++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 examples/clustering_example.py
 create mode 100644 examples/final_merging_example.py

diff --git a/examples/clustering_example.py b/examples/clustering_example.py
new file mode 100644
index 0000000..839c2f6
--- /dev/null
+++ b/examples/clustering_example.py
@@ -0,0 +1,124 @@
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import pickle
+import sys
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+import transcriptomic_clustering
+from transcriptomic_clustering.iterative_clustering import (
+    build_cluster_dict, iter_clust, OnestepKwargs
+)
+
+# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
This is the same as using sc.normalize_total(adata, target_sum=1e6) and sc.pp.log1p(adata) +adata=transcriptomic_clustering.normalize(adata) + +# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present. +scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0) +adata.obsm['X_scVI'] = np.asarray(scvi) + +# Set up the clustering parameters +def setup_transcriptomic_clustering(): + means_vars_kwargs = { + 'low_thresh': 0.6931472, + 'min_cells': 4 + } + highly_variable_kwargs = { + 'max_genes': 4000 + } + pca_kwargs = { # not used if using a scvi latent space + 'cell_select': 30000, + 'n_comps': 50, + 'svd_solver': 'randomized' + } + filter_pcs_kwargs = { # not used if using a scvi latent space + 'known_components': None, + 'similarity_threshold': 0.7, + 'method': 'zscore', + 'zth': 2, + 'max_pcs': None, + } + filter_known_modes_kwargs = { + # 'known_modes': known_modes_df, + 'similarity_threshold': 0.7 + } + latent_kwargs = { + 'latent_component': "X_scVI" + } + cluster_louvain_kwargs = { + 'k': 15, + 'nn_measure': 'euclidean', + 'knn_method': 'annoy', + 'louvain_method': 'taynaud', + 'weighting_method': 'jaccard', + 'n_jobs': 30, + 'resolution': 1.0, + } + merge_clusters_kwargs = { + 'thresholds': { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 + }, + 'k': 4, + 'de_method': 'ebayes' + } + onestep_kwargs = OnestepKwargs( + means_vars_kwargs = means_vars_kwargs, + highly_variable_kwargs = highly_variable_kwargs, + pca_kwargs = pca_kwargs, + filter_pcs_kwargs = filter_pcs_kwargs, + filter_known_modes_kwargs = filter_known_modes_kwargs, + latent_kwargs = latent_kwargs, + cluster_louvain_kwargs = cluster_louvain_kwargs, + merge_clusters_kwargs = merge_clusters_kwargs + ) + return onestep_kwargs + +onestep_kwargs = setup_transcriptomic_clustering() + +# run the iterative clustering. need a tmp folder to store intermediate results +clusters, markers = iter_clust( + adata, + min_samples=10, + onestep_kwargs=onestep_kwargs, + random_seed=123, + tmp_dir="/path/to/your/tmp" +) + +# save the results in .pkl format, which will be used for final merging +with open('clustering_results.pkl', 'wb') as f: + pickle.dump(clusters, f) + +with open('markers.pkl', 'wb') as f: + pickle.dump(markers, f) + +# (Optional) save the clustering results in a data frame. 
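+# Note: build_cluster_dict is assumed to map each cluster label to a list of
+# integer positions into adata.obs_names. A more explicit positional variant
+# of the label assignment below would be:
+#   labels = np.empty(adata.n_obs, dtype=object)
+#   for cluster, idx in cluster_dict.items():
+#       labels[idx] = cluster
+#   adata.obs["scrattch_py_cluster"] = labels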
+cluster_dict = build_cluster_dict(clusters)
+
+adata.obs["scrattch_py_cluster"] = ""
+for cluster in cluster_dict.keys():
+    adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster
+
+adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')
+
+res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
+res.to_csv('/path/to/your/clustering_results.csv')
diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
new file mode 100644
index 0000000..957a387
--- /dev/null
+++ b/examples/final_merging_example.py
@@ -0,0 +1,95 @@
+import sys
+import os
+import pickle
+import pandas as pd
+import scanpy as sc
+import importlib
+import time
+import anndata as ad
+import numpy as np
+import math
+
+# Skip this line if transcriptomic_clustering is installed
+sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
+
+import transcriptomic_clustering as tc
+from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
+
+# Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
+adata = sc.read('path/to/your/data.h5ad')
+
+# Normalize the data. Skip if adata.X is already normalized.
+sc.pp.normalize_total(adata, target_sum=1e6)
+sc.pp.log1p(adata)
+
+# Add scVI latent space to the adata object. Skip if adata.obsm['scVI'] is already present.
+scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
+adata.obsm['scVI'] = np.asarray(scvi)
+
+# loading clustering results
+cl_pth = "/path/to/clustering_results"
+with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
+    clusters = pickle.load(f)
+
+with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
+    markers = pickle.load(f)
+
+# The first 4 are for PCA. modify latent_kwargs if using a pre-computed latent space
+def setup_merging():
+    pca_kwargs ={
+        # 'cell_select': 30000, # should not set this for final merging, as we need to sample from each cluster if computing PCA
+        'n_comps': 50,
+        'svd_solver': 'randomized'
+    }
+    filter_pcs_kwargs = {
+        'known_components': None,
+        'similarity_threshold': 0.7,
+        'method': 'zscore',
+        'zth': 2,
+        'max_pcs': None}
+    filter_known_modes_kwargs = {
+        'known_modes': None,
+        'similarity_threshold': 0.7}
+    project_kwargs = {}
+    merge_clusters_kwargs = {
+        'thresholds': {
+            'q1_thresh': 0.5,
+            'q2_thresh': None,
+            'cluster_size_thresh': 10,
+            'qdiff_thresh': 0.7,
+            'padj_thresh': 0.05,
+            'lfc_thresh': 1,
+            'score_thresh': 100,
+            'low_thresh': 0.6931472,
+            'min_genes': 5
+        },
+        'k': 4,
+        'de_method': 'ebayes'
+    }
+    latent_kwargs = { # if None: default is running pca, else use the latent_component in adata.obsm
+        'latent_component': "scVI"
+    }
+
+    merge_kwargs = FinalMergeKwargs(
+        pca_kwargs = pca_kwargs,
+        filter_pcs_kwargs = filter_pcs_kwargs,
+        filter_known_modes_kwargs = filter_known_modes_kwargs,
+        project_kwargs = project_kwargs,
+        merge_clusters_kwargs = merge_clusters_kwargs,
+        latent_kwargs = latent_kwargs
+    )
+    return merge_kwargs
+
+merge_kwargs = setup_merging()
+
+# Run the final merging
+clusters_after_merging, markers = final_merge(
+    adata,
+    clusters,
+    markers,
+    n_samples_per_clust=20,
+    random_seed=2024,
+    final_merge_kwargs=merge_kwargs,
+    n_jobs = 30, # modify this to the number of cores you want to use
+    return_markers_df = True # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
+)
\ No newline at end of file

From 5d6b2ba95563292a4f41ac9dbfe0d29ad55e0641 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 13:28:13 -0700
Subject: [PATCH 04/13] Deleted the original example_usage.py

---
 examples/example_usage.py | 104 --------------------------------------
 1 file changed, 104 deletions(-)
 delete mode 100644 examples/example_usage.py

diff --git a/examples/example_usage.py b/examples/example_usage.py
deleted file mode 100644
index 5caba6e..0000000
--- a/examples/example_usage.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import scanpy as sc
-import pandas as pd
-import numpy as np
-import transcriptomic_clustering
-from transcriptomic_clustering.iterative_clustering import (
-    build_cluster_dict, iter_clust, OnestepKwargs
-)
-
-# Load the data that contains the raw counts in the 'X' slot. If using normalized data, skip the normalization step.
-adata = sc.read('path/to/your/data.h5ad')
-
-# Normalize the data. Skip if adata.X is already normalized.
-adata=transcriptomic_clustering.normalize(adata)
-
-# Add scVI latent space to the adata object. Skip if adata.obsm['X_scVI'] is already present.
-scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
-adata.obsm['X_scVI'] = np.asarray(scvi)
-
-# Set up the clustering parameters
-def setup_transcriptomic_clustering():
-    means_vars_kwargs = {
-        'low_thresh': 0.6931472,
-        'min_cells': 4
-    }
-    highly_variable_kwargs = {
-        'max_genes': 4000
-    }
-    pca_kwargs = { # not used if using a scvi latent space
-        'cell_select': 30000,
-        'n_comps': 50,
-        'svd_solver': 'randomized'
-    }
-    filter_pcs_kwargs = { # not used if using a scvi latent space
-        'known_components': None,
-        'similarity_threshold': 0.7,
-        'method': 'zscore',
-        'zth': 2,
-        'max_pcs': None,
-    }
-    filter_known_modes_kwargs = {
-        # 'known_modes': known_modes_df,
-        'similarity_threshold': 0.7
-    }
-    latent_kwargs = {
-        'latent_component': "X_scVI"
-    }
-    cluster_louvain_kwargs = {
-        'k': 15,
-        'nn_measure': 'euclidean',
-        'knn_method': 'annoy',
-        'louvain_method': 'taynaud',
-        'weighting_method': 'jaccard',
-        'n_jobs': 30,
-        'resolution': 1.0,
-    }
-    merge_clusters_kwargs = {
-        'thresholds': {
-            'q1_thresh': 0.5,
-            'q2_thresh': None,
-            'cluster_size_thresh': 10,
-            'qdiff_thresh': 0.7,
-            'padj_thresh': 0.05,
-            'lfc_thresh': 1,
-            'score_thresh': 100,
-            'low_thresh': 0.6931472,
-            'min_genes': 5
-        },
-        'k': 4,
-        'de_method': 'ebayes'
-    }
-    onestep_kwargs = OnestepKwargs(
-        means_vars_kwargs = means_vars_kwargs,
-        highly_variable_kwargs = highly_variable_kwargs,
-        pca_kwargs = pca_kwargs,
-        filter_pcs_kwargs = filter_pcs_kwargs,
-        filter_known_modes_kwargs = filter_known_modes_kwargs,
-        latent_kwargs = latent_kwargs,
-        cluster_louvain_kwargs = cluster_louvain_kwargs,
-        merge_clusters_kwargs = merge_clusters_kwargs
-    )
-    return onestep_kwargs
-
-onestep_kwargs = setup_transcriptomic_clustering()
-
-# run the iterative clustering. need a tmp folder to store intermediate results
-clusters, markers = iter_clust(
-    adata,
-    min_samples=10,
-    onestep_kwargs=onestep_kwargs,
-    random_seed=123,
-    tmp_dir="/path/to/your/tmp"
-)
-
-cluster_dict = build_cluster_dict(clusters)
-
-adata.obs["scrattch_py_cluster"] = ""
-for cluster in cluster_dict.keys():
-    adata.obs.scrattch_py_cluster[cluster_dict[cluster]] = cluster
-
-adata.obs.scrattch_py_cluster = adata.obs.scrattch_py_cluster.astype('category')
-
-# save the clustering results
-res = pd.DataFrame({'sample_id':adata.obs_names, 'cl': adata.obs.scrattch_py_cluster})
-res.to_csv('/path/to/your/clustering_results.csv')

From 93147f562ec484a5432b7552d3d00e9ef3764d82 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 14:38:48 -0700
Subject: [PATCH 05/13] modified return data type in final_merge()

---
 transcriptomic_clustering/final_merging.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index 1f8d170..e9f18e9 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -96,7 +96,7 @@ def final_merge(
     n_jobs: Optional[int] = 1,
     return_markers_df: Optional[bool] = False,
     final_merge_kwargs: FinalMergeKwargs = FinalMergeKwargs(),
-) -> pd.DataFrame:
+) -> Tuple[List[List[int]], Union[pd.DataFrame, set]]:
     """
     Runs a final merging step on cluster assignment results

From 1d35005d85a739d9f44bcffca62a03a77e355521 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Fri, 11 Oct 2024 16:14:19 -0700
Subject: [PATCH 06/13] modified final_merge() to allow not providing marker genes

---
 transcriptomic_clustering/final_merging.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/transcriptomic_clustering/final_merging.py b/transcriptomic_clustering/final_merging.py
index e9f18e9..f64080f 100644
--- a/transcriptomic_clustering/final_merging.py
+++ b/transcriptomic_clustering/final_merging.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Set, List, Any
+from typing import Optional, Dict, Set, List, Any, Tuple, Union
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -9,9 +9,6 @@
 import anndata as ad
 
 import transcriptomic_clustering as tc
-from transcriptomic_clustering.iterative_clustering import (
-    build_cluster_dict, summarize_final_clusters
-)
 
 import logging
 import time
@@ -90,7 +87,7 @@ def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]:
 def final_merge(
     adata: ad.AnnData,
     cluster_assignments: List,
-    marker_genes: Set,
+    marker_genes: Optional[set] = None,
     n_samples_per_clust: int = 20,
     random_seed: Optional[int]=None,
     n_jobs: Optional[int] = 1,
@@ -123,6 +120,9 @@
 
     if final_merge_kwargs.latent_kwargs.get("latent_component") is None:
 
+        if marker_genes is None:
+            raise ValueError("Need marker genes to run PCA")
+
         cell_mask = sample_clusters(
             adata=adata,
             cluster_dict=obs_by_cluster,
@@ -216,4 +216,6 @@
     toc = time.perf_counter()
     logger.info(f'Merging Elapsed Time: {toc - tic}')
 
+    cluster_assignments_after_merging = list(cluster_assignments_after_merging.values())
+
     return cluster_assignments_after_merging, markers
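With marker_genes now optional, a latent-space run can skip markers entirely. A minimal sketch under the kwargs defined in final_merging.py (the 'X_scVI' obsm key is an assumed entry, as in the example scripts):

    from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs

    merge_kwargs = FinalMergeKwargs(latent_kwargs={'latent_component': 'X_scVI'})
    clusters_merged, markers = final_merge(
        adata, clusters, n_jobs=8, final_merge_kwargs=merge_kwargs,
    )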
From 5eb4083646db2c1df9a5e50413a3f3b783a77062 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Mon, 14 Oct 2024 13:53:48 -0700
Subject: [PATCH 07/13] Modify example scripts

---
 examples/final_merging_example.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 957a387..667dc7f 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -31,6 +31,7 @@
 with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
     clusters = pickle.load(f)
 
+# marker genes are only needed for computing PCA
 with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
     markers = pickle.load(f)
 
@@ -86,7 +87,7 @@ def setup_merging():
 clusters_after_merging, markers = final_merge(
     adata,
     clusters,
-    markers,
+    markers, # required for PCA, but optional if using a pre-computed latent space
     n_samples_per_clust=20,
     random_seed=2024,
     final_merge_kwargs=merge_kwargs,

From 21845d4ce1e9616e87d2bba83ed12a717f4202d7 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Mon, 14 Oct 2024 13:58:11 -0700
Subject: [PATCH 08/13] Modified example scripts

---
 examples/final_merging_example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 667dc7f..e648b34 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -12,7 +12,6 @@
 # Skip this line if transcriptomic_clustering is installed
 sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/')
 
-import transcriptomic_clustering as tc
 from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
 
 # Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.

From 6438d8f4c81a6b6f42f806b39f5eed7718d093ff Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Tue, 26 Nov 2024 19:35:33 -0800
Subject: [PATCH 09/13] fixed a bug in pca(); changed the input to a str for filter_known_modes()

---
 transcriptomic_clustering/dimension_reduction.py | 2 +-
 transcriptomic_clustering/filter_known_modes.py  | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/transcriptomic_clustering/dimension_reduction.py b/transcriptomic_clustering/dimension_reduction.py
index 499e083..2a39f39 100644
--- a/transcriptomic_clustering/dimension_reduction.py
+++ b/transcriptomic_clustering/dimension_reduction.py
@@ -116,7 +116,7 @@ def pca(
     _, vidx = adata._normalize_indices((slice(None), gene_mask))  # handle gene mask like anndata would
     vidx_bool = np.zeros((adata.n_vars,), dtype=bool)
     vidx_bool[vidx] = True
-    n_genes = len(vidx)
+    n_genes = sum(vidx)
 
     # select n_comps
     max_comps = min(n_cells, n_genes) - 1
diff --git a/transcriptomic_clustering/filter_known_modes.py b/transcriptomic_clustering/filter_known_modes.py
index 7b17c43..98ce71c 100644
--- a/transcriptomic_clustering/filter_known_modes.py
+++ b/transcriptomic_clustering/filter_known_modes.py
@@ -10,7 +10,7 @@
 
 def filter_known_modes(
         projected_adata: ad.AnnData,
-        known_modes: Union[pd.DataFrame, pd.Series],
+        known_modes: Optional[str] = None,
         similarity_threshold: Optional[float] = 0.7):
     """
     Filters out principal components which correlate strongly with the known modes
@@ -27,6 +27,12 @@
         projected_adata: after filtering out correlated principal components
 
     """
+    # determine if known_modes is in adata.obs
+    if known_modes in projected_adata.obs.columns:
+        known_modes = projected_adata.obs[known_modes]
+    else:
+        raise ValueError(f'{known_modes} not found in adata.obs')
+
     if isinstance(known_modes, pd.Series):
         known_modes = known_modes.to_frame()
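filter_known_modes now takes the name of a column in adata.obs rather than a DataFrame/Series. One assumed way to build such a per-cell covariate (the 'log2ngene' name matches the example updated in the next patch):

    import numpy as np
    import scanpy as sc

    # log2 of the number of detected genes per cell, stored where
    # filter_known_modes can look it up by name
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
    adata.obs['log2ngene'] = np.log2(adata.obs['n_genes_by_counts'] + 1)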
From 096165b988d8df2144fadd39446c4bba81692576 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Tue, 26 Nov 2024 19:48:11 -0800
Subject: [PATCH 10/13] update the example script for final merging

---
 examples/final_merging_example.py | 40 ++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index e648b34..0a63c4d 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -34,7 +34,7 @@
 with open(os.path.join(cl_pth, 'markers.pkl'), 'rb') as f:
     markers = pickle.load(f)
 
-# The first 4 are for PCA. modify latent_kwargs if using a pre-computed latent space
+# The first 4 are for PCA only. modify latent_kwargs if using a pre-computed latent space
 def setup_merging():
     pca_kwargs ={
         # 'cell_select': 30000, # should not set this for final merging, as we need to sample from each cluster if computing PCA
         'n_comps': 50,
         'svd_solver': 'randomized'
     }
@@ -48,7 +48,7 @@ def setup_merging():
         'zth': 2,
         'max_pcs': None}
     filter_known_modes_kwargs = {
-        'known_modes': None,
+        'known_modes': 'log2ngene',
         'similarity_threshold': 0.7}
     project_kwargs = {}
     merge_clusters_kwargs = {
@@ -64,10 +64,11 @@ def setup_merging():
         },
         'k': 4,
-        'de_method': 'ebayes'
+        'de_method': 'ebayes',
+        # 'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
     }
-    latent_kwargs = { # if None: default is running pca, else use the latent_component in adata.obsm
-        'latent_component': "scVI"
+    latent_kwargs = {
+        'latent_component': "scVI" # None or an obsm in adata. if None: default is running pca, else use the latent_component in adata.obsm
     }
 
     merge_kwargs = FinalMergeKwargs(
@@ -89,7 +90,30 @@ def setup_merging():
     markers, # required for PCA, but optional if using a pre-computed latent space
     n_samples_per_clust=20,
     random_seed=2024,
-    final_merge_kwargs=merge_kwargs,
     n_jobs = 30, # modify this to the number of cores you want to use
-    return_markers_df = True # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
-)
\ No newline at end of file
+    return_markers_df = False, # return the pair-wise DE results for each cluster pair. If False (default), only return a set of markers (top 20 of up and down regulated genes in each pair comparison)
+    final_merge_kwargs=merge_kwargs
+)
+
+out_dir = "/path/to/output"
+
+with open(os.path.join(out_dir, "clustering_results_after_merging.pkl"), 'wb') as f:
+    pickle.dump(clusters_after_merging, f)
+
+# determine datatype for markers_after_merging and save
+if markers_after_merging is None:
+    print("Skipped calculating markers. Did not save markers.")
+elif isinstance(markers_after_merging, pd.DataFrame):
+    markers_after_merging.to_csv(os.path.join(out_dir,'markers_after_merging.csv'))
+else:
+    with open(os.path.join(out_dir,'markers_after_merging.pkl'), 'wb') as f:
+        pickle.dump(markers_after_merging, f)
+
+# convert the clustering results to a .csv file
+n_cells = sum(len(i) for i in clusters_after_merging)
+cl = ['unknown']*n_cells
+for i in range(len(clusters_after_merging)):
+    for j in clusters_after_merging[i]:
+        cl[j] = i+1
+res = pd.DataFrame({'cl': cl}, index=adata.obs_names)
+res.to_csv(os.path.join(out_dir,'clustering_results_after_merging.csv'))
\ No newline at end of file

From 026edbab4e05392eea15b5801860365a29faf71f Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Wed, 16 Jul 2025 11:40:39 -0700
Subject: [PATCH 11/13] update example

---
 examples/final_merging_example.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/final_merging_example.py b/examples/final_merging_example.py
index 0a63c4d..9917c91 100644
--- a/examples/final_merging_example.py
+++ b/examples/final_merging_example.py
@@ -15,7 +15,7 @@
 from transcriptomic_clustering.final_merging import final_merge, FinalMergeKwargs
 
 # Load the data that contains the raw counts in the 'X' slot. If adata.X is normalized, skip the next normalization step.
-adata = sc.read('path/to/your/data.h5ad')
+adata = sc.read('path/to/your/adata.h5ad')
 
 # Normalize the data. Skip if adata.X is already normalized.
 sc.pp.normalize_total(adata, target_sum=1e6)
 sc.pp.log1p(adata)
@@ -25,7 +25,7 @@
 scvi = pd.read_csv('path/to/scvi_latent_space.csv', index_col=0)
 adata.obsm['scVI'] = np.asarray(scvi)
 
-# loading clustering results
+# Loading clustering results
 cl_pth = "/path/to/clustering_results"
 with open(os.path.join(cl_pth, 'clustering_results.pkl'), 'rb') as f:
     clusters = pickle.load(f)
@@ -65,7 +65,7 @@ def setup_merging():
         'de_method': 'ebayes',
-        # 'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
+        'n_markers': None, # if set to None, will bypass the marker calculation step, which is the time-consuming step
     }
     latent_kwargs = {
         'latent_component': "scVI" # None or an obsm in adata.
if None: default is running pca, else use the latent_component in adata.obsm @@ -84,7 +84,7 @@ def setup_merging(): merge_kwargs = setup_merging() # Run the final merging -clusters_after_merging, markers = final_merge( +clusters_after_merging, markers_after_merging = final_merge( adata, clusters, markers, # required for PCA, but optional if using a pre-computed latent space From 5c43281d95164608703bba96dcf8d4ccecfe4a16 Mon Sep 17 00:00:00 2001 From: dyuan1111 Date: Thu, 31 Jul 2025 20:38:50 -0700 Subject: [PATCH 12/13] add de pair function --- examples/pairwiseDE_example.py | 45 ++++++++++++ transcriptomic_clustering/pairwise_DEGs.py | 79 ++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 examples/pairwiseDE_example.py create mode 100644 transcriptomic_clustering/pairwise_DEGs.py diff --git a/examples/pairwiseDE_example.py b/examples/pairwiseDE_example.py new file mode 100644 index 0000000..48f9a10 --- /dev/null +++ b/examples/pairwiseDE_example.py @@ -0,0 +1,45 @@ +# this example script is used to get pair-wise DEGs for all clusters given the normalized count and the cluster assignment +import sys +import scanpy as sc +import pickle +from collections import defaultdict + +sys.path.insert(1, '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/tool/transcriptomic_clustering/') +from transcriptomic_clustering.pairwise_DEGs import pairwise_degs + +thresholds = { + 'q1_thresh': 0.5, + 'q2_thresh': None, + 'cluster_size_thresh': 10, + 'qdiff_thresh': 0.7, + 'padj_thresh': 0.05, + 'lfc_thresh': 1, + 'score_thresh': 100, + 'low_thresh': 0.6931472, + 'min_genes': 5 +} + +adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad') + +clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl' +with open(clusters_pth, 'rb') as f: + clusters = pickle.load(f) +obs_by_cluster = defaultdict(lambda: []) +for i, cell_ids in enumerate(clusters): + obs_by_cluster[i] = cell_ids + +# subset the adata and obs_by_cluster to only include two clusters for testing +adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy() + +obs_by_cluster_subset = {} +obs_by_cluster_subset[0] = [i for i in range(len(obs_by_cluster[0]))] +obs_by_cluster_subset[1] = [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))] + +degs = pairwise_degs( + adata, + obs_by_cluster_subset, + thresholds, + n_markers=20, + de_method='ebayes', + n_jobs=30 + ) \ No newline at end of file diff --git a/transcriptomic_clustering/pairwise_DEGs.py b/transcriptomic_clustering/pairwise_DEGs.py new file mode 100644 index 0000000..6c28e95 --- /dev/null +++ b/transcriptomic_clustering/pairwise_DEGs.py @@ -0,0 +1,79 @@ +import anndata as ad +import scanpy as sc +import time +from typing import Dict, List, Any, Optional +import transcriptomic_clustering as tc +from transcriptomic_clustering.markers import select_marker_genes +import logging + +logger = logging.getLogger(__name__) + +def _cluster_obs_dict_to_list(obs_by_cluster: Dict[int, List[int]]) -> List[int]: + """ + Convert a dictionary of cluster assignments to a list of cluster assignments + """ + # Find the total number of observations + max_index = max(max(indices) for indices in obs_by_cluster.values()) + + # Initialize the list of clusters with None (or some other default value) + cluster_by_obs = [None] * (max_index + 1) + + # Fill in the 
list with the corresponding cluster for each observation
+    for cluster, cell_ids in obs_by_cluster.items():
+        for obs in cell_ids:
+            cluster_by_obs[obs] = cluster
+
+    return cluster_by_obs
+
+def pairwise_degs(
+        adata_norm: ad.AnnData,
+        obs_by_cluster: Dict[Any, List], # a dictionary with keys as cluster names and values as lists of cell indices
+        thresholds: Dict[str, Any],
+        n_markers: int = 20,
+        de_method: str = 'ebayes',
+        n_jobs: int = 30,
+        chunk_size: Optional[int] = None
+    ):
+    """
+    Perform pairwise differential expression analysis on clusters in an AnnData object.
+    Parameters
+    ----------
+    adata_norm : AnnData
+        The normalized AnnData object containing the data.
+    obs_by_cluster : Dict[Any, List]
+        A dictionary where keys are cluster names and values are lists of cell indices belonging to those clusters
+    chunk_size : Optional[int]
+        The size of chunks to process at a time. If None, the entire dataset is processed
+    thresholds : Dict[str, Any]
+    n_markers : int
+        The number of up-regulated and down-regulated markers to return for all pairs of clusters
+    """
+    cluster_by_obs = _cluster_obs_dict_to_list(obs_by_cluster)
+    logger.info("Computing Cluster Means")
+    tic = time.perf_counter()
+    cl_means, present_cl_means, cl_vars = tc.get_cluster_means(adata_norm,
+                                                               obs_by_cluster,
+                                                               cluster_by_obs,
+                                                               chunk_size,
+                                                               low_th=thresholds['low_thresh'])
+    logger.info(f'Completed Cluster Means')
+    toc = time.perf_counter()
+    logger.info(f'Cluster Means Elapsed Time: {toc - tic}')
+
+    logger.info('Starting Marker Selection')
+    tic = time.perf_counter()
+    markers = select_marker_genes(
+        cluster_assignments=obs_by_cluster,
+        cluster_means=cl_means,
+        cluster_variances=cl_vars,
+        present_cluster_means=present_cl_means,
+        thresholds=thresholds,
+        n_markers=n_markers,
+        de_method=de_method,
+        n_jobs=n_jobs
+    )
+    logger.info('Completed Marker Selection')
+    toc = time.perf_counter()
+    logger.info(f'Marker Selection Elapsed Time: {toc - tic}')
+
+    return markers
\ No newline at end of file

From bbd457933919498e8115711d96b988a31767ce40 Mon Sep 17 00:00:00 2001
From: dyuan1111
Date: Thu, 31 Jul 2025 20:54:13 -0700
Subject: [PATCH 13/13] update de example

---
 examples/pairwiseDE_example.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/pairwiseDE_example.py b/examples/pairwiseDE_example.py
index 48f9a10..b9496e7 100644
--- a/examples/pairwiseDE_example.py
+++ b/examples/pairwiseDE_example.py
@@ -19,22 +19,28 @@
     'min_genes': 5
 }
 
+# reading in the adata whose adata.X is already normalized
 adata = sc.read('/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/HMBA_analysis/iscANVI_mapping/troubleshoot/test_data/adata_query_MHGlut.h5ad')
 
+# reading in the clustering results, a list of lists of cell indices in adata
 clusters_pth = '/allen/programs/celltypes/workgroups/rnaseqanalysis/dyuan/custom_packages/mpi_tc/archive_versions/v3_githubVersion/test_70k_worked/out/clustering_results.pkl'
 with open(clusters_pth, 'rb') as f:
     clusters = pickle.load(f)
+
+# convert to a dictionary of cluster id to cell indices
 obs_by_cluster = defaultdict(lambda: [])
 for i, cell_ids in enumerate(clusters):
     obs_by_cluster[i] = cell_ids
 
-# subset the adata and obs_by_cluster to only include two clusters for testing
+# subset the adata to include only the first two clusters for testing
 adata = adata[obs_by_cluster[0] + obs_by_cluster[1], :].copy()
 
+# create the new obs_by_cluster of cluster id to the cell indices in the subsetted adata
 obs_by_cluster_subset = {}
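+# (assumption: after the .copy() above, AnnData keeps rows in the order of the
+# concatenated index lists, so cluster-0 cells occupy positions 0..len-1 and
+# cluster-1 cells follow; the renumbering below relies on that ordering)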
 obs_by_cluster_subset[0] = [i for i in range(len(obs_by_cluster[0]))]
 obs_by_cluster_subset[1] = [i + len(obs_by_cluster[0]) for i in range(len(obs_by_cluster[1]))]
 
+# returns a set of markers (20 up-regulated and 20 down-regulated)
 degs = pairwise_degs(
     adata,
     obs_by_cluster_subset,