From 6c670228b7abeeecfded4519a5fb1981d7f03065 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 31 Mar 2023 20:40:53 +0200 Subject: [PATCH 01/20] add cluster methods --- snapseed/cluster.py | 81 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 snapseed/cluster.py diff --git a/snapseed/cluster.py b/snapseed/cluster.py new file mode 100644 index 0000000..2b907bc --- /dev/null +++ b/snapseed/cluster.py @@ -0,0 +1,81 @@ +import math + +import numpy as np +import pandas as pd + +import scanpy as sc + +from snapseed.utils import read_yaml + + + +def annot_adata(adata, marker_genes, level_name='level_1', + has_na=False, res=1.0): + + corr_df = get_bulk_exp(adata, f"leiden_{res}").astype(float).corr() + corr_df = 1-corr_df + # cluster_to_compair = corr_df.idxmax().to_dict() + ntop = math.ceil(len(adata.obs[f"leiden_{res}"].unique())/10) + cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() + + result_df_zscore = pd.DataFrame(marker_genes.keys()) + result_df_zscore = result_df_zscore.rename(columns={0:level_name}) + + for cluster in adata.obs[f"leiden_{res}"].unique(): + adata0 = adata.copy() + adata0.obs[f"leiden_{res}"] = adata0.obs[f"leiden_{res}"].astype(str) + + # sc.tl.rank_genes_groups(adata0, f"leiden_{res}", groups=[cluster], + # reference=cluster_to_compair[cluster], method='wilcoxon') + + adata0.obs.loc[adata0.obs[f"leiden_{res}"].isin(cluster_to_compair[cluster]), + f"leiden_{res}"] = 'ref' + + sc.tl.rank_genes_groups(adata0, f"leiden_{high_res}", groups=[cluster], + reference='ref', method='wilcoxon') + + a= wrangle_ranks_from_anndata(adata0) + + z_scores=[] + adj_pvalss=[] + for i in marker_genes: + z_scores.append(a.loc[a.gene.isin(marker_genes[i]['marker_genes']), 'z_score'].max()) + adj_pvalss.append(-np.log10(a.loc[a.gene.isin(marker_genes[i]['marker_genes']), 'adj_pvals']).max()) + # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] + result_df_zscore[cluster]=z_scores + + z_df = result_df_zscore.set_index(level_name) + cluster2ct = z_df.idxmax().to_dict() + if has_na: + for i in cluster2ct: + if i in z_df[z_df.max(axis=1)<1].index: + cluster2ct[i]='na' + + adata.obs[level_name] = adata.obs[f"leiden_{res}"].map(cluster2ct) + + return adata + +def annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True): + level2={} + for i in adata.obs.level_1.unique(): + sub_adata = adata[adata.obs.level_1==i] + + if 'subtypes' not in marker_genes[i]: + level2.update(sub_adata.obs.level_1.to_dict()) + else: + sub_adata = sub_adata.copy() + sub_adata=annot_adata(sub_adata, marker_genes[i]['subtypes'], + level_name='level_2', has_na=True) + sub_adata = sub_adata[sub_adata.obs.level_1==i] + + level2.update(sub_adata.obs.level_2.to_dict()) + + adata.obs['level_2'] = adata.obs.index.map(level2) + + return adata + +res=1.0 +sc.tl.leiden(adata, resolution=res, key_added=f"leiden_{res}") + +adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=True) +adata=annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True) From 7a6c304964e3b666d77a8dec522118e37878cf16 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 31 Mar 2023 20:43:20 +0200 Subject: [PATCH 02/20] add wrangle_ranks --- snapseed/cluster.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/snapseed/cluster.py b/snapseed/cluster.py index 2b907bc..d68f035 100644 --- a/snapseed/cluster.py +++ b/snapseed/cluster.py @@ -8,6 +8,27 @@ from snapseed.utils import read_yaml +def wrangle_ranks_from_anndata(anndata): + """ + Wrangle results from the ranked_genes_groups function of Scanpy (Wolf et al., 2018) on louvain clusters. + """ + # Get number of top ranked genes per groups + nb_marker = len(anndata.uns['rank_genes_groups']['names']) + # Wrangle results into a table (pandas dataframe) + top_score = pd.DataFrame(anndata.uns['rank_genes_groups']['scores']).loc[:nb_marker] + top_adjpval = pd.DataFrame(anndata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] + top_gene = pd.DataFrame(anndata.uns['rank_genes_groups']['names']).loc[:nb_marker] + marker_df = pd.DataFrame() + # Order values + for i in top_score.columns: + concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) + concat['cluster_number'] = i + col = list(concat.columns) + col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' + concat.columns = col + marker_df = marker_df.append(concat) + return marker_df + def annot_adata(adata, marker_genes, level_name='level_1', has_na=False, res=1.0): From 5cdf88a95a9c34b0ae0b9f512dc64cd0bbdce965 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 31 Mar 2023 21:06:44 +0200 Subject: [PATCH 03/20] update method --- snapseed/annotate.py | 8 ++++- snapseed/cluster.py | 83 +++++++++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 33 deletions(-) diff --git a/snapseed/annotate.py b/snapseed/annotate.py index 96dc0e5..5bc14c0 100644 --- a/snapseed/annotate.py +++ b/snapseed/annotate.py @@ -2,6 +2,7 @@ from .trinarize import annotate_cytograph from .auroc import annotate_snap +from .cluster import annotate_cluster from .utils import get_markers, get_annot_df @@ -58,7 +59,8 @@ def annotate_levels( level += 1 level_name = "level_" + str(level) marker_dict = get_markers(marker_hierarchy) - assignments = annotate(adata, marker_dict, group_name, method=method, layer=layer) + assignments = annotate(adata, marker_dict, group_name, method=method, + layer=layer, level_name=level_name) if assignment_levels is None: assignment_levels = {} @@ -123,6 +125,10 @@ def annotate(adata, marker_dict, group_name, method="auroc", layer=None, **kwarg assignments = annotate_cytograph( adata, marker_dict, group_name, layer=layer, **kwargs ) + elif method == "cluster": + assignments = annotate_cluster( + adata, marker_dict, group_name, layer=layer, level_name=level_name, **kwargs + ) else: raise ValueError("Unknown annotation method.") # Join cluster-level results with adata diff --git a/snapseed/cluster.py b/snapseed/cluster.py index d68f035..f2b0412 100644 --- a/snapseed/cluster.py +++ b/snapseed/cluster.py @@ -8,49 +8,53 @@ from snapseed.utils import read_yaml -def wrangle_ranks_from_anndata(anndata): +def annotate_cluster(adata, + marker_dict, + group_name, + layer=None, + level_name=None +): """ - Wrangle results from the ranked_genes_groups function of Scanpy (Wolf et al., 2018) on louvain clusters. + Annotate cell types based on AUROC and expression of predefined marker genes. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression """ - # Get number of top ranked genes per groups - nb_marker = len(anndata.uns['rank_genes_groups']['names']) - # Wrangle results into a table (pandas dataframe) - top_score = pd.DataFrame(anndata.uns['rank_genes_groups']['scores']).loc[:nb_marker] - top_adjpval = pd.DataFrame(anndata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] - top_gene = pd.DataFrame(anndata.uns['rank_genes_groups']['names']).loc[:nb_marker] - marker_df = pd.DataFrame() - # Order values - for i in top_score.columns: - concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) - concat['cluster_number'] = i - col = list(concat.columns) - col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' - concat.columns = col - marker_df = marker_df.append(concat) - return marker_df + if level_name=='level_1': + adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=True) + else: + adata=annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True) def annot_adata(adata, marker_genes, level_name='level_1', - has_na=False, res=1.0): + has_na=False, group_name=group_name): - corr_df = get_bulk_exp(adata, f"leiden_{res}").astype(float).corr() + corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1-corr_df # cluster_to_compair = corr_df.idxmax().to_dict() - ntop = math.ceil(len(adata.obs[f"leiden_{res}"].unique())/10) + ntop = math.ceil(len(adata.obs[group_name].unique())/10) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() result_df_zscore = pd.DataFrame(marker_genes.keys()) result_df_zscore = result_df_zscore.rename(columns={0:level_name}) - for cluster in adata.obs[f"leiden_{res}"].unique(): + for cluster in adata.obs[group_name].unique(): adata0 = adata.copy() - adata0.obs[f"leiden_{res}"] = adata0.obs[f"leiden_{res}"].astype(str) + adata0.obs[group_name] = adata0.obs[group_name].astype(str) - # sc.tl.rank_genes_groups(adata0, f"leiden_{res}", groups=[cluster], + # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], # reference=cluster_to_compair[cluster], method='wilcoxon') - adata0.obs.loc[adata0.obs[f"leiden_{res}"].isin(cluster_to_compair[cluster]), - f"leiden_{res}"] = 'ref' + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + group_name] = 'ref' sc.tl.rank_genes_groups(adata0, f"leiden_{high_res}", groups=[cluster], reference='ref', method='wilcoxon') @@ -72,7 +76,7 @@ def annot_adata(adata, marker_genes, level_name='level_1', if i in z_df[z_df.max(axis=1)<1].index: cluster2ct[i]='na' - adata.obs[level_name] = adata.obs[f"leiden_{res}"].map(cluster2ct) + adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) return adata @@ -95,8 +99,23 @@ def annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True): return adata -res=1.0 -sc.tl.leiden(adata, resolution=res, key_added=f"leiden_{res}") - -adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=True) -adata=annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True) +def wrangle_ranks_from_anndata(anndata): + """ + Wrangle results from the ranked_genes_groups function of Scanpy (Wolf et al., 2018) on louvain clusters. + """ + # Get number of top ranked genes per groups + nb_marker = len(anndata.uns['rank_genes_groups']['names']) + # Wrangle results into a table (pandas dataframe) + top_score = pd.DataFrame(anndata.uns['rank_genes_groups']['scores']).loc[:nb_marker] + top_adjpval = pd.DataFrame(anndata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] + top_gene = pd.DataFrame(anndata.uns['rank_genes_groups']['names']).loc[:nb_marker] + marker_df = pd.DataFrame() + # Order values + for i in top_score.columns: + concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) + concat['cluster_number'] = i + col = list(concat.columns) + col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' + concat.columns = col + marker_df = marker_df.append(concat) + return marker_df From 1c8bb204168d4b64bb1f0f069fe3b9aeaa72be18 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Sat, 1 Apr 2023 10:09:27 +0200 Subject: [PATCH 04/20] fix method --- snapseed/annotate.py | 4 +- snapseed/{cluster.py => degenes.py} | 97 ++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 23 deletions(-) rename snapseed/{cluster.py => degenes.py} (54%) diff --git a/snapseed/annotate.py b/snapseed/annotate.py index 5bc14c0..82562a5 100644 --- a/snapseed/annotate.py +++ b/snapseed/annotate.py @@ -2,7 +2,7 @@ from .trinarize import annotate_cytograph from .auroc import annotate_snap -from .cluster import annotate_cluster +from .cluster import annotate_degenes from .utils import get_markers, get_annot_df @@ -126,7 +126,7 @@ def annotate(adata, marker_dict, group_name, method="auroc", layer=None, **kwarg adata, marker_dict, group_name, layer=layer, **kwargs ) elif method == "cluster": - assignments = annotate_cluster( + assignments = annotate_degenes( adata, marker_dict, group_name, layer=layer, level_name=level_name, **kwargs ) else: diff --git a/snapseed/cluster.py b/snapseed/degenes.py similarity index 54% rename from snapseed/cluster.py rename to snapseed/degenes.py index f2b0412..3e7c22e 100644 --- a/snapseed/cluster.py +++ b/snapseed/degenes.py @@ -8,14 +8,15 @@ from snapseed.utils import read_yaml -def annotate_cluster(adata, +def annotate_cluster( + adata, marker_dict, group_name, layer=None, level_name=None -): + ): """ - Annotate cell types based on AUROC and expression of predefined marker genes. + Annotate cell types based on differentially expressed (DE) marker genes. Parameters ---------- @@ -29,23 +30,31 @@ def annotate_cluster(adata, Layer in adata to use for expression """ if level_name=='level_1': - adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=True) + adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=False) else: - adata=annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True) + adata=annot_adata_lowlevel(adata, marker_genes, level_name=level_name, has_na=True) -def annot_adata(adata, marker_genes, level_name='level_1', - has_na=False, group_name=group_name): +def annot_adata( + adata, + marker_genes, + level_name='level_1', + has_na=False, + group_name=group_name + ): corr_df = get_bulk_exp(adata, group_name).astype(float).corr() - corr_df = 1-corr_df - # cluster_to_compair = corr_df.idxmax().to_dict() + corr_df = 1 - corr_df + ntop = math.ceil(len(adata.obs[group_name].unique())/10) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() result_df_zscore = pd.DataFrame(marker_genes.keys()) result_df_zscore = result_df_zscore.rename(columns={0:level_name}) + result_df_apvalue = pd.DataFrame(marker_genes.keys()) + result_df_apvalue = result_df_apvalue.rename(columns={0:level_name}) + for cluster in adata.obs[group_name].unique(): adata0 = adata.copy() adata0.obs[group_name] = adata0.obs[group_name].astype(str) @@ -59,28 +68,37 @@ def annot_adata(adata, marker_genes, level_name='level_1', sc.tl.rank_genes_groups(adata0, f"leiden_{high_res}", groups=[cluster], reference='ref', method='wilcoxon') - a= wrangle_ranks_from_anndata(adata0) + wranks = wrangle_ranks_from_adata(adata0) z_scores=[] adj_pvalss=[] for i in marker_genes: - z_scores.append(a.loc[a.gene.isin(marker_genes[i]['marker_genes']), 'z_score'].max()) - adj_pvalss.append(-np.log10(a.loc[a.gene.isin(marker_genes[i]['marker_genes']), 'adj_pvals']).max()) + z_scores.append(wranks.loc[wranks.gene.isin(marker_genes[i]['marker_genes']), 'z_score'].max()) + adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_genes[i]['marker_genes']), 'adj_pvals']).max()) + # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] result_df_zscore[cluster]=z_scores + result_df_apvalue[cluster]=adj_pvalss z_df = result_df_zscore.set_index(level_name) cluster2ct = z_df.idxmax().to_dict() + if has_na: for i in cluster2ct: if i in z_df[z_df.max(axis=1)<1].index: - cluster2ct[i]='na' + cluster2ct[i] = 'na' adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) return adata -def annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True): +def annot_adata_level2( + adata, + marker_genes, + level_name='level_2', + has_na=True + ): + level2={} for i in adata.obs.level_1.unique(): sub_adata = adata[adata.obs.level_1==i] @@ -89,7 +107,7 @@ def annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True): level2.update(sub_adata.obs.level_1.to_dict()) else: sub_adata = sub_adata.copy() - sub_adata=annot_adata(sub_adata, marker_genes[i]['subtypes'], + sub_adata = annot_adata(sub_adata, marker_genes[i]['subtypes'], level_name='level_2', has_na=True) sub_adata = sub_adata[sub_adata.obs.level_1==i] @@ -99,16 +117,42 @@ def annot_adata_level2(adata, marker_genes, level_name='level_2', has_na=True): return adata -def wrangle_ranks_from_anndata(anndata): +def annot_adata_lowlevel( + adata, + marker_genes, + level_name='level_2', + has_na=True + ): + + llevel = {} + up_level_name = f"level_{int(level_name.split('_')[-1])-1}" + + for i in adata.obs[up_level_name].unique(): + sub_adata = adata[adata.obs[up_level_name] == i].copy() + + if 'subtypes' not in marker_genes[i]: + llevel.update(sub_adata.obs[up_level_name].to_dict()) + else: + sub_adata = annot_adata(sub_adata, marker_genes[i]['subtypes'], + level_name=level_name, has_na=True) + sub_adata = sub_adata[sub_adata.obs[up_level_name] == i] + + llevel.update(sub_adata.obs[level_name].to_dict()) + + adata.obs[level_name] = adata.obs.index.map(llevel) + + return adata + +def wrangle_ranks_from_adata(adata): """ - Wrangle results from the ranked_genes_groups function of Scanpy (Wolf et al., 2018) on louvain clusters. + Wrangle results from the ranked_genes_groups function of Scanpy. """ # Get number of top ranked genes per groups - nb_marker = len(anndata.uns['rank_genes_groups']['names']) + nb_marker = len(adata.uns['rank_genes_groups']['names']) # Wrangle results into a table (pandas dataframe) - top_score = pd.DataFrame(anndata.uns['rank_genes_groups']['scores']).loc[:nb_marker] - top_adjpval = pd.DataFrame(anndata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] - top_gene = pd.DataFrame(anndata.uns['rank_genes_groups']['names']).loc[:nb_marker] + top_score = pd.DataFrame(adata.uns['rank_genes_groups']['scores']).loc[:nb_marker] + top_adjpval = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] + top_gene = pd.DataFrame(adata.uns['rank_genes_groups']['names']).loc[:nb_marker] marker_df = pd.DataFrame() # Order values for i in top_score.columns: @@ -119,3 +163,14 @@ def wrangle_ranks_from_anndata(anndata): concat.columns = col marker_df = marker_df.append(concat) return marker_df + +def get_bulk_exp(adata, bulk_labels): + res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) + + for clust in adata.obs[bulk_labels].cat.categories: + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) + + res.index=adata.obs[bulk_labels].cat.categories + + return res.T + From 42d07aafe94458ef7bcf14e63e103de16d02c5e0 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 3 Apr 2023 11:27:30 +0200 Subject: [PATCH 05/20] fix degenes method --- snapseed/annotate.py | 11 +++-- snapseed/degenes.py | 100 +++++++++++-------------------------------- 2 files changed, 29 insertions(+), 82 deletions(-) diff --git a/snapseed/annotate.py b/snapseed/annotate.py index 82562a5..930a893 100644 --- a/snapseed/annotate.py +++ b/snapseed/annotate.py @@ -2,7 +2,7 @@ from .trinarize import annotate_cytograph from .auroc import annotate_snap -from .cluster import annotate_degenes +from .degenes import annotate_degenes from .utils import get_markers, get_annot_df @@ -59,8 +59,7 @@ def annotate_levels( level += 1 level_name = "level_" + str(level) marker_dict = get_markers(marker_hierarchy) - assignments = annotate(adata, marker_dict, group_name, method=method, - layer=layer, level_name=level_name) + assignments = annotate(adata, marker_dict, group_name, method=method, layer=layer) if assignment_levels is None: assignment_levels = {} @@ -125,12 +124,12 @@ def annotate(adata, marker_dict, group_name, method="auroc", layer=None, **kwarg assignments = annotate_cytograph( adata, marker_dict, group_name, layer=layer, **kwargs ) - elif method == "cluster": + elif method == "degenes": assignments = annotate_degenes( - adata, marker_dict, group_name, layer=layer, level_name=level_name, **kwargs + adata, marker_dict, group_name, layer=layer, **kwargs ) else: - raise ValueError("Unknown annotation method.") + raise ValueError("Unknown annotation method.") # Join cluster-level results with adata assignments = assignments.reset_index(names=group_name) return assignments diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 3e7c22e..9011cd9 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -8,12 +8,11 @@ from snapseed.utils import read_yaml -def annotate_cluster( +def annotate_degenes( adata, marker_dict, group_name, layer=None, - level_name=None ): """ Annotate cell types based on differentially expressed (DE) marker genes. @@ -29,19 +28,7 @@ def annotate_cluster( layer Layer in adata to use for expression """ - if level_name=='level_1': - adata=annot_adata(adata, marker_genes, level_name='level_1', has_na=False) - else: - adata=annot_adata_lowlevel(adata, marker_genes, level_name=level_name, has_na=True) - - -def annot_adata( - adata, - marker_genes, - level_name='level_1', - has_na=False, - group_name=group_name - ): + # level_name = "level_" + str(level) corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1 - corr_df @@ -49,11 +36,11 @@ def annot_adata( ntop = math.ceil(len(adata.obs[group_name].unique())/10) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() - result_df_zscore = pd.DataFrame(marker_genes.keys()) - result_df_zscore = result_df_zscore.rename(columns={0:level_name}) + result_df_zscore = pd.DataFrame(marker_dict.keys()) + result_df_zscore = result_df_zscore.rename(columns={0:'level_name'}) - result_df_apvalue = pd.DataFrame(marker_genes.keys()) - result_df_apvalue = result_df_apvalue.rename(columns={0:level_name}) + result_df_apvalue = pd.DataFrame(marker_dict.keys()) + result_df_apvalue = result_df_apvalue.rename(columns={0:'level_name'}) for cluster in adata.obs[group_name].unique(): adata0 = adata.copy() @@ -65,83 +52,44 @@ def annot_adata( adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), group_name] = 'ref' - sc.tl.rank_genes_groups(adata0, f"leiden_{high_res}", groups=[cluster], + sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], reference='ref', method='wilcoxon') wranks = wrangle_ranks_from_adata(adata0) z_scores=[] adj_pvalss=[] - for i in marker_genes: - z_scores.append(wranks.loc[wranks.gene.isin(marker_genes[i]['marker_genes']), 'z_score'].max()) - adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_genes[i]['marker_genes']), 'adj_pvals']).max()) + for i in marker_dict: + z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) + adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) + # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) + # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] result_df_zscore[cluster]=z_scores result_df_apvalue[cluster]=adj_pvalss - z_df = result_df_zscore.set_index(level_name) + z_df = result_df_zscore.set_index('level_name') cluster2ct = z_df.idxmax().to_dict() + has_na = False + # TODO add has_na if has_na: for i in cluster2ct: if i in z_df[z_df.max(axis=1)<1].index: cluster2ct[i] = 'na' + + assign_df = pd.DataFrame(pd.Series(cluster2ct)) + assign_df = assign_df.rename(columns={0:'class'}) + assign_df['score'] = z_df.max() - adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) - - return adata - -def annot_adata_level2( - adata, - marker_genes, - level_name='level_2', - has_na=True - ): - - level2={} - for i in adata.obs.level_1.unique(): - sub_adata = adata[adata.obs.level_1==i] - - if 'subtypes' not in marker_genes[i]: - level2.update(sub_adata.obs.level_1.to_dict()) - else: - sub_adata = sub_adata.copy() - sub_adata = annot_adata(sub_adata, marker_genes[i]['subtypes'], - level_name='level_2', has_na=True) - sub_adata = sub_adata[sub_adata.obs.level_1==i] - - level2.update(sub_adata.obs.level_2.to_dict()) - - adata.obs['level_2'] = adata.obs.index.map(level2) - - return adata - -def annot_adata_lowlevel( - adata, - marker_genes, - level_name='level_2', - has_na=True - ): + # TODO magic way to avoid get_annot_df error + assign_df['expr'] = 1 - llevel = {} - up_level_name = f"level_{int(level_name.split('_')[-1])-1}" + return assign_df - for i in adata.obs[up_level_name].unique(): - sub_adata = adata[adata.obs[up_level_name] == i].copy() - - if 'subtypes' not in marker_genes[i]: - llevel.update(sub_adata.obs[up_level_name].to_dict()) - else: - sub_adata = annot_adata(sub_adata, marker_genes[i]['subtypes'], - level_name=level_name, has_na=True) - sub_adata = sub_adata[sub_adata.obs[up_level_name] == i] - - llevel.update(sub_adata.obs[level_name].to_dict()) - - adata.obs[level_name] = adata.obs.index.map(llevel) - - return adata + # adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) + # return adata def wrangle_ranks_from_adata(adata): """ From d1119189e4f1906f985a3d6c6c5c597ce74ebabd Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 3 Apr 2023 13:55:59 +0200 Subject: [PATCH 06/20] fix one cluster error --- snapseed/degenes.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 9011cd9..fcc75c7 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -4,6 +4,7 @@ import pandas as pd import scanpy as sc +from .auroc import annotate_snap from snapseed.utils import read_yaml @@ -30,6 +31,15 @@ def annotate_degenes( """ # level_name = "level_" + str(level) + # TODO magic way for adata only has one cluster + if len(adata.obs[group_name].unique()) == 1: + assign_df = annotate_snap( + adata, marker_dict, group_name, layer=layer + ) + # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + # assign_df.index=adata.obs[group_name].unique() + return assign_df + corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1 - corr_df From 245f084f6fbe3a761944b20814790fa13212e0d3 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 3 Apr 2023 15:13:58 +0200 Subject: [PATCH 07/20] fix na --- snapseed/annotate.py | 4 +++- snapseed/degenes.py | 17 +++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/snapseed/annotate.py b/snapseed/annotate.py index 930a893..638a229 100644 --- a/snapseed/annotate.py +++ b/snapseed/annotate.py @@ -72,7 +72,9 @@ def annotate_levels( ) for subtype in assignments["class"].unique(): - + if subtype == 'na': + continue + if "subtypes" not in marker_hierarchy[subtype].keys(): continue diff --git a/snapseed/degenes.py b/snapseed/degenes.py index fcc75c7..e182aa7 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -32,12 +32,17 @@ def annotate_degenes( # level_name = "level_" + str(level) # TODO magic way for adata only has one cluster - if len(adata.obs[group_name].unique()) == 1: - assign_df = annotate_snap( - adata, marker_dict, group_name, layer=layer - ) - # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) - # assign_df.index=adata.obs[group_name].unique() + print(adata.obs[group_name].unique()) + if len(adata.obs[group_name].unique()) <= 1: + # 1st way + # assign_df = annotate_snap( + # adata, marker_dict, group_name, layer=layer + # ) + # 2nd way + assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + assign_df.index=adata.obs[group_name].unique() + # 3rd way + # assign_df=pd.DataFrame() return assign_df corr_df = get_bulk_exp(adata, group_name).astype(float).corr() From c2e93a944c384b5116ad511c0a8f1ec932dcbf93 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 14 Apr 2023 10:46:57 +0200 Subject: [PATCH 08/20] add exp when de<1 --- snapseed/degenes.py | 50 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index e182aa7..627bc26 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -32,7 +32,6 @@ def annotate_degenes( # level_name = "level_" + str(level) # TODO magic way for adata only has one cluster - print(adata.obs[group_name].unique()) if len(adata.obs[group_name].unique()) <= 1: # 1st way # assign_df = annotate_snap( @@ -45,6 +44,7 @@ def annotate_degenes( # assign_df=pd.DataFrame() return assign_df + # cal max de corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1 - corr_df @@ -95,8 +95,38 @@ def annotate_degenes( cluster2ct[i] = 'na' assign_df = pd.DataFrame(pd.Series(cluster2ct)) - assign_df = assign_df.rename(columns={0:'class'}) - assign_df['score'] = z_df.max() + assign_df = assign_df.rename(columns={0:'max_de'}) + assign_df['de_score'] = z_df.max() + + # cal max exp + raw_bulk = get_bulk_exp(adata, group_name, 'raw') + max_exp=pd.DataFrame(index=raw_bulk.columns) + for cell in marker_dict: + if sum(raw_bulk.index.isin(marker_dict[cell]))>0: + good_markers = [i for i in marker_dict[cell] if i in raw_bulk.index] + max_exp[cell] = raw_bulk.loc[good_markers].max() + else: + max_exp[cell] = 0 + + s = max_exp.select_dtypes(include='object').columns + max_exp[s] = max_exp[s].astype("float") + max_exp_df = pd.DataFrame(max_exp.max(axis=1)) + max_exp_df = max_exp_df.rename(columns={0:'exp_score'}) + + max_exp_df['max_exp'] = max_exp.idxmax(axis=1) + + + # merge de and exp + assign_df = pd.merge(assign_df,max_exp_df, left_index=True, right_index=True) + + classs=[] + for index,row in assign_df.iterrows(): + if row['de_score']>1: + classs.append(row['max_de']) + else: + classs.append(row['max_exp']) + + assign_df['class'] = classs # TODO magic way to avoid get_annot_df error assign_df['expr'] = 1 @@ -127,11 +157,17 @@ def wrangle_ranks_from_adata(adata): marker_df = marker_df.append(concat) return marker_df -def get_bulk_exp(adata, bulk_labels): - res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) - +def get_bulk_exp(adata, bulk_labels, layer='var'): + if layer=='raw': + res = pd.DataFrame(columns=adata.raw.var_names, index=adata.obs[bulk_labels].cat.categories) + else: + res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) + for clust in adata.obs[bulk_labels].cat.categories: - res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) + if layer=='raw': + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].raw.X.mean(0) + else: + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) res.index=adata.obs[bulk_labels].cat.categories From cc345b155f85b570c95328610b3d17ef82bd20d4 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 14 Apr 2023 11:45:27 +0200 Subject: [PATCH 09/20] fix 1 cluster infer --- snapseed/degenes.py | 111 +++++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 52 deletions(-) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 627bc26..212e40a 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -32,17 +32,18 @@ def annotate_degenes( # level_name = "level_" + str(level) # TODO magic way for adata only has one cluster - if len(adata.obs[group_name].unique()) <= 1: - # 1st way - # assign_df = annotate_snap( - # adata, marker_dict, group_name, layer=layer - # ) - # 2nd way - assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) - assign_df.index=adata.obs[group_name].unique() - # 3rd way - # assign_df=pd.DataFrame() - return assign_df + # fixed + # if len(adata.obs[group_name].unique()) <= 1: + # # 1st way + # # assign_df = annotate_snap( + # # adata, marker_dict, group_name, layer=layer + # # ) + # # 2nd way + # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + # assign_df.index=adata.obs[group_name].unique() + # # 3rd way + # # assign_df=pd.DataFrame() + # return assign_df # cal max de corr_df = get_bulk_exp(adata, group_name).astype(float).corr() @@ -57,46 +58,52 @@ def annotate_degenes( result_df_apvalue = pd.DataFrame(marker_dict.keys()) result_df_apvalue = result_df_apvalue.rename(columns={0:'level_name'}) - for cluster in adata.obs[group_name].unique(): - adata0 = adata.copy() - adata0.obs[group_name] = adata0.obs[group_name].astype(str) - - # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], - # reference=cluster_to_compair[cluster], method='wilcoxon') - - adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), - group_name] = 'ref' - - sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], - reference='ref', method='wilcoxon') - - wranks = wrangle_ranks_from_adata(adata0) - - z_scores=[] - adj_pvalss=[] - for i in marker_dict: - z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) - adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) - # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) - # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) - - # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] - result_df_zscore[cluster]=z_scores - result_df_apvalue[cluster]=adj_pvalss - - z_df = result_df_zscore.set_index('level_name') - cluster2ct = z_df.idxmax().to_dict() - - has_na = False - # TODO add has_na - if has_na: - for i in cluster2ct: - if i in z_df[z_df.max(axis=1)<1].index: - cluster2ct[i] = 'na' - - assign_df = pd.DataFrame(pd.Series(cluster2ct)) - assign_df = assign_df.rename(columns={0:'max_de'}) - assign_df['de_score'] = z_df.max() + if len(adata.obs[group_name].unique()) == 1: + assign_df = pd.DataFrame(index=adata.obs[group_name].unique()) + assign_df['max_de'] = 'na' + assign_df['de_score'] = 0 + + else: + for cluster in adata.obs[group_name].unique(): + adata0 = adata.copy() + adata0.obs[group_name] = adata0.obs[group_name].astype(str) + + # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + # reference=cluster_to_compair[cluster], method='wilcoxon') + + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + group_name] = 'ref' + + sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + reference='ref', method='wilcoxon') + + wranks = wrangle_ranks_from_adata(adata0) + + z_scores=[] + adj_pvalss=[] + for i in marker_dict: + z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) + adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) + # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) + # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) + + # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] + result_df_zscore[cluster]=z_scores + result_df_apvalue[cluster]=adj_pvalss + + z_df = result_df_zscore.set_index('level_name') + cluster2ct = z_df.idxmax().to_dict() + + has_na = False + # TODO add has_na + if has_na: + for i in cluster2ct: + if i in z_df[z_df.max(axis=1)<1].index: + cluster2ct[i] = 'na' + + assign_df = pd.DataFrame(pd.Series(cluster2ct)) + assign_df = assign_df.rename(columns={0:'max_de'}) + assign_df['de_score'] = z_df.max() # cal max exp raw_bulk = get_bulk_exp(adata, group_name, 'raw') @@ -117,7 +124,7 @@ def annotate_degenes( # merge de and exp - assign_df = pd.merge(assign_df,max_exp_df, left_index=True, right_index=True) + assign_df = pd.merge(assign_df, max_exp_df, left_index=True, right_index=True) classs=[] for index,row in assign_df.iterrows(): From c2190b69787d12c38c69570803a6cf110a4ff82f Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Wed, 19 Apr 2023 09:11:53 +0200 Subject: [PATCH 10/20] change dist cluster pect --- snapseed/annotate.py | 6 +++--- snapseed/degenes.py | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/snapseed/annotate.py b/snapseed/annotate.py index 638a229..2c2a1ab 100644 --- a/snapseed/annotate.py +++ b/snapseed/annotate.py @@ -59,7 +59,7 @@ def annotate_levels( level += 1 level_name = "level_" + str(level) marker_dict = get_markers(marker_hierarchy) - assignments = annotate(adata, marker_dict, group_name, method=method, layer=layer) + assignments = annotate(adata, marker_dict, group_name, method=method, layer=layer, level=level) if assignment_levels is None: assignment_levels = {} @@ -98,7 +98,7 @@ def annotate_levels( return assignment_levels -def annotate(adata, marker_dict, group_name, method="auroc", layer=None, **kwargs): +def annotate(adata, marker_dict, group_name, method="auroc", layer=None, level=None, **kwargs): """ Annotate clusters based on a manually defined cell type markers. @@ -128,7 +128,7 @@ def annotate(adata, marker_dict, group_name, method="auroc", layer=None, **kwarg ) elif method == "degenes": assignments = annotate_degenes( - adata, marker_dict, group_name, layer=layer, **kwargs + adata, marker_dict, group_name, layer=layer, level=None, **kwargs ) else: raise ValueError("Unknown annotation method.") diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 212e40a..52887cd 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -14,6 +14,7 @@ def annotate_degenes( marker_dict, group_name, layer=None, + level=None, ): """ Annotate cell types based on differentially expressed (DE) marker genes. @@ -49,7 +50,11 @@ def annotate_degenes( corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1 - corr_df - ntop = math.ceil(len(adata.obs[group_name].unique())/10) + if level==1: + dist_pect=10 + else: + dist_pect=2 + ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() result_df_zscore = pd.DataFrame(marker_dict.keys()) @@ -128,11 +133,13 @@ def annotate_degenes( classs=[] for index,row in assign_df.iterrows(): - if row['de_score']>1: + if row['de_score'] > 1: classs.append(row['max_de']) - else: + elif row['exp_score'] > 0: classs.append(row['max_exp']) - + else: + classs.append('na') + assign_df['class'] = classs # TODO magic way to avoid get_annot_df error From cbab442332b8cadd396989d3ec1d5808118a799c Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Wed, 19 Apr 2023 09:12:23 +0200 Subject: [PATCH 11/20] change dist cluster pect --- snapseed/degenes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 52887cd..5be0247 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -54,6 +54,7 @@ def annotate_degenes( dist_pect=10 else: dist_pect=2 + ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() From fc3b97efece83902a856625561463b2eb3077d39 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Thu, 20 Apr 2023 20:49:35 +0200 Subject: [PATCH 12/20] fix bugs --- snapseed/degenes.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 5be0247..50fcded 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -46,6 +46,16 @@ def annotate_degenes( # # assign_df=pd.DataFrame() # return assign_df + + # for celltype in marker_dict['subtypes']: + # subgenes = marker_dict['subtypes'][celltype] + # if marker_dict['subtypes'][celltype]['marker_genes'] == []: + # markers_all=[] + # for i in marker_dict['subtypes']: + # for j in marker_dict['subtypes'][i]['marker_genes']: + # markers_all.append(j) + # marker_dict['subtypes'][celltype]['marker_genes'] = markers_all + # cal max de corr_df = get_bulk_exp(adata, group_name).astype(float).corr() corr_df = 1 - corr_df From 34c1f410d73eaee5106452d2fc7868ca5888a568 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Wed, 26 Apr 2023 17:07:42 +0200 Subject: [PATCH 13/20] fix max de exp --- snapseed/degenes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 50fcded..6243ef1 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -144,8 +144,12 @@ def annotate_degenes( classs=[] for index,row in assign_df.iterrows(): - if row['de_score'] > 1: + if row['de_score'] > 2: classs.append(row['max_de']) + elif row['de_score'] > 1 and row['exp_score'] < 2: + classs.append(row['max_de']) + elif row['de_score'] > 1 and row['exp_score'] > 2: + classs.append(row['max_exp']) elif row['exp_score'] > 0: classs.append(row['max_exp']) else: From 9ae9c061c4d07dc58126e6f7c862e15269fe9fe5 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 5 May 2023 14:32:33 +0200 Subject: [PATCH 14/20] fix score --- snapseed/degenes.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 6243ef1..20fbf5f 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -143,15 +143,30 @@ def annotate_degenes( assign_df = pd.merge(assign_df, max_exp_df, left_index=True, right_index=True) classs=[] + # for index,row in assign_df.iterrows(): + # if row['de_score'] > 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] < 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] > 2: + # classs.append(row['max_exp']) + # elif row['exp_score'] > 0: + # classs.append(row['max_exp']) + # else: + # classs.append('na') for index,row in assign_df.iterrows(): if row['de_score'] > 2: classs.append(row['max_de']) - elif row['de_score'] > 1 and row['exp_score'] < 2: - classs.append(row['max_de']) - elif row['de_score'] > 1 and row['exp_score'] > 2: - classs.append(row['max_exp']) - elif row['exp_score'] > 0: - classs.append(row['max_exp']) + elif row['de_score'] > 1: + if row['exp_score'] > 2: + classs.append(row['max_exp']) + else: + classs.append(row['max_de']) + elif row['de_score'] > 0: + if row['exp_score'] > 0.5: + classs.append(row['max_exp']) + else: + classs.append('na') else: classs.append('na') From c6c53a0531e0884ea437106c589eff1f39be0778 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 12 Jun 2023 14:30:44 +0200 Subject: [PATCH 15/20] add roc auc --- build/lib/snapseed/__init__.py | 1 + build/lib/snapseed/annotate.py | 137 ++++++++++++ build/lib/snapseed/auroc.py | 124 +++++++++++ build/lib/snapseed/degenes copy.py | 255 ++++++++++++++++++++++ build/lib/snapseed/degenes.py | 267 +++++++++++++++++++++++ build/lib/snapseed/trinarize.py | 152 +++++++++++++ build/lib/snapseed/utils.py | 113 ++++++++++ snapseed.egg-info/PKG-INFO | 40 ++++ snapseed.egg-info/SOURCES.txt | 14 ++ snapseed.egg-info/dependency_links.txt | 1 + snapseed.egg-info/requires.txt | 7 + snapseed.egg-info/top_level.txt | 1 + snapseed/degenes copy.py | 255 ++++++++++++++++++++++ snapseed/degenes.py | 286 +++++++++++++++---------- snapseed/utils.py | 42 +++- 15 files changed, 1572 insertions(+), 123 deletions(-) create mode 100644 build/lib/snapseed/__init__.py create mode 100644 build/lib/snapseed/annotate.py create mode 100644 build/lib/snapseed/auroc.py create mode 100644 build/lib/snapseed/degenes copy.py create mode 100644 build/lib/snapseed/degenes.py create mode 100644 build/lib/snapseed/trinarize.py create mode 100644 build/lib/snapseed/utils.py create mode 100644 snapseed.egg-info/PKG-INFO create mode 100644 snapseed.egg-info/SOURCES.txt create mode 100644 snapseed.egg-info/dependency_links.txt create mode 100644 snapseed.egg-info/requires.txt create mode 100644 snapseed.egg-info/top_level.txt create mode 100644 snapseed/degenes copy.py diff --git a/build/lib/snapseed/__init__.py b/build/lib/snapseed/__init__.py new file mode 100644 index 0000000..8b6ae0a --- /dev/null +++ b/build/lib/snapseed/__init__.py @@ -0,0 +1 @@ +from .annotate import annotate, annotate_hierarchy diff --git a/build/lib/snapseed/annotate.py b/build/lib/snapseed/annotate.py new file mode 100644 index 0000000..2c2a1ab --- /dev/null +++ b/build/lib/snapseed/annotate.py @@ -0,0 +1,137 @@ +import pandas as pd + +from .trinarize import annotate_cytograph +from .auroc import annotate_snap +from .degenes import annotate_degenes + +from .utils import get_markers, get_annot_df + + +def annotate_hierarchy( + adata, + marker_hierarchy, + group_name, + method="auroc", + layer=None, + min_expr=0.1, + **kwargs +): + """ + Annotate clusters based on a manually defined cell type and marker hierarchy. + + Parameters + ---------- + adata + AnnData object + marker_hierarchy + Dict with marker genes for each celltype arranged hierarchically. + group_name + Name of the column in adata.obs that contains the cluster labels + method + Method to use for annotation. Options are "auroc" and "trinatize". + layer + Layer in adata to use for expression + **kwargs + Additional arguments to pass to the annotation function. + """ + + # Annotate at each level of the hierarchy + assignment_hierarchy = annotate_levels( + adata, marker_hierarchy, group_name, method=method + ) + + return dict( + assignments=get_annot_df(assignment_hierarchy, group_name, min_expr=min_expr), + metrics=assignment_hierarchy, + ) + + +def annotate_levels( + adata, + marker_hierarchy, + group_name, + level=0, + assignment_levels=None, + method="auroc", + layer=None, +): + """Recursively annotatates all levels of a marker hierarchy.""" + level += 1 + level_name = "level_" + str(level) + marker_dict = get_markers(marker_hierarchy) + assignments = annotate(adata, marker_dict, group_name, method=method, layer=layer, level=level) + + if assignment_levels is None: + assignment_levels = {} + + if level_name not in assignment_levels.keys(): + assignment_levels[level_name] = pd.DataFrame() + + assignment_levels[level_name] = pd.concat( + [assignment_levels[level_name], assignments], axis=0 + ) + + for subtype in assignments["class"].unique(): + if subtype == 'na': + continue + + if "subtypes" not in marker_hierarchy[subtype].keys(): + continue + + # Subset adata + subtype_groups = assignments[group_name][ + assignments["class"] == subtype + ].astype(str) + subtype_adata = adata[adata.obs[group_name].isin(subtype_groups)] + + # Recursively annotate + assignment_levels = annotate_levels( + subtype_adata, + marker_hierarchy[subtype]["subtypes"], + group_name, + level=level, + assignment_levels=assignment_levels, + method=method, + layer=layer, + ) + + return assignment_levels + + +def annotate(adata, marker_dict, group_name, method="auroc", layer=None, level=None, **kwargs): + """ + Annotate clusters based on a manually defined cell type markers. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + method + Method to use for annotation. Options are "auroc" and "trinatize". + layer + Layer in adata to use for expression + **kwargs + Additional arguments to pass to the annotation function. + """ + + if method == "auroc": + assignments = annotate_snap( + adata, marker_dict, group_name, layer=layer, **kwargs + ) + elif method == "trinatize": + assignments = annotate_cytograph( + adata, marker_dict, group_name, layer=layer, **kwargs + ) + elif method == "degenes": + assignments = annotate_degenes( + adata, marker_dict, group_name, layer=layer, level=None, **kwargs + ) + else: + raise ValueError("Unknown annotation method.") + # Join cluster-level results with adata + assignments = assignments.reset_index(names=group_name) + return assignments diff --git a/build/lib/snapseed/auroc.py b/build/lib/snapseed/auroc.py new file mode 100644 index 0000000..bd1d787 --- /dev/null +++ b/build/lib/snapseed/auroc.py @@ -0,0 +1,124 @@ +import pandas as pd + +import jax +from jax import numpy as jnp + +from functools import partial +from sklearn import preprocessing + +from .utils import dict_to_binary, get_expr, frac_nonzero, masked_max + + +def annotate_snap( + adata, + marker_dict, + group_name, + layer=None, + auc_weight=0.5, + expr_weight=0.5, +): + """ + Annotate cell types based on AUROC and expression of predefined marker genes. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression + """ + # Reformat marker_dict into binary matrix + marker_mat = dict_to_binary(marker_dict) + + # Compute AUROC and fraction nonzero for marker features + features = marker_mat.columns + metrics = auc_expr(adata, group_name, features=features) + + marker_mat = marker_mat.loc[:, metrics["features"]] + auc_max = masked_max(metrics["auroc"], marker_mat.values) + expr_max = masked_max(metrics["frac_nonzero"], marker_mat.values) + # Combine metrics + assignment_scores = (auc_weight * auc_max + expr_weight * expr_max) / ( + auc_weight + expr_weight + ) + assign_idx = jnp.argmax(assignment_scores, axis=0) + # Mask out genes that are not expressed in any cell + assign_class = marker_mat.index[assign_idx] + + assign_df = pd.DataFrame( + { + "class": assign_class, + "score": assignment_scores[assign_idx, jnp.arange(auc_max.shape[1])], + "auc": auc_max[assign_idx, jnp.arange(auc_max.shape[1])], + "expr": expr_max[assign_idx, jnp.arange(expr_max.shape[1])], + }, + index=metrics["groups"], + ) + + return assign_df + + +def auc_expr(adata, group_name, features=None, layer=None): + """Computes AUROC and fraction nonzero for each gene in an adata object.""" + # Turn string groups into integers + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + + # Compute AUROC and fraction nonzero + groups = jnp.array(le.transform(adata.obs[group_name])) + expr, features = get_expr(adata, features=features, layer=layer) + auroc, frac_nonzero = expr_auroc_over_groups(expr, groups) + + return dict( + frac_nonzero=frac_nonzero, + auroc=auroc, + features=features, + groups=le.classes_, + ) + + +@jax.jit +@partial(jax.vmap, in_axes=[1, None]) +def jit_auroc(x, groups): + # TODO: compute frac nonzero here to avoid iterating twice + + # sort scores and corresponding truth values + desc_score_indices = jnp.argsort(x)[::-1] + x = x[desc_score_indices] + groups = groups[desc_score_indices] + + # x typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = jnp.array(jnp.diff(x) != 0, dtype=jnp.int32) + threshold_mask = jnp.r_[distinct_value_indices, 1] + + # accumulate the true positives with decreasing threshold + tps_ = jnp.cumsum(groups) + fps_ = 1 + jnp.arange(groups.size) - tps_ + + # mask out the values that are not distinct + tps = jnp.sort(tps_ * threshold_mask) + fps = jnp.sort(fps_ * threshold_mask) + tps = jnp.r_[0, tps] + fps = jnp.r_[0, fps] + fpr = fps / fps[-1] + tpr = tps / tps[-1] + area = jnp.trapz(tpr, fpr) + return area + + +def expr_auroc_over_groups(expr, groups): + """Computes AUROC for each group separately.""" + auroc = jnp.zeros((groups.max() + 1, expr.shape[1])) + frac_nz = jnp.zeros((groups.max() + 1, expr.shape[1])) + + for group in range(groups.max() + 1): + auroc = auroc.at[group, :].set(jit_auroc(expr, groups == group)) + frac_nz = frac_nz.at[group, :].set(frac_nonzero(expr[groups == group, :])) + + return auroc, frac_nz diff --git a/build/lib/snapseed/degenes copy.py b/build/lib/snapseed/degenes copy.py new file mode 100644 index 0000000..4d42df2 --- /dev/null +++ b/build/lib/snapseed/degenes copy.py @@ -0,0 +1,255 @@ +import math + +import numpy as np +import pandas as pd + +import scanpy as sc +from .auroc import annotate_snap + +from snapseed.utils import read_yaml + + +def annotate_degenes( + adata, + marker_dict, + group_name, + layer=None, + level=None, + ): + """ + Annotate cell types based on differentially expressed (DE) marker genes. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression + """ + # level_name = "level_" + str(level) + + # TODO magic way for adata only has one cluster + # fixed + # if len(adata.obs[group_name].unique()) <= 1: + # # 1st way + # # assign_df = annotate_snap( + # # adata, marker_dict, group_name, layer=layer + # # ) + # # 2nd way + # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + # assign_df.index=adata.obs[group_name].unique() + # # 3rd way + # # assign_df=pd.DataFrame() + # return assign_df + + + # for celltype in marker_dict['subtypes']: + # subgenes = marker_dict['subtypes'][celltype] + # if marker_dict['subtypes'][celltype]['marker_genes'] == []: + # markers_all=[] + # for i in marker_dict['subtypes']: + # for j in marker_dict['subtypes'][i]['marker_genes']: + # markers_all.append(j) + # marker_dict['subtypes'][celltype]['marker_genes'] = markers_all + + # cal max de + corr_df = get_bulk_exp(adata, group_name).astype(float).corr() + corr_df = 1 - corr_df + + if level==1: + dist_pect=10 + else: + dist_pect=2 + + ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) + cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() + + result_df_zscore = pd.DataFrame(marker_dict.keys()) + result_df_zscore = result_df_zscore.rename(columns={0:'level_name'}) + + result_df_apvalue = pd.DataFrame(marker_dict.keys()) + result_df_apvalue = result_df_apvalue.rename(columns={0:'level_name'}) + + if len(adata.obs[group_name].unique()) == 1: + assign_df = pd.DataFrame(index=adata.obs[group_name].unique()) + assign_df['max_de'] = 'na' + assign_df['de_score'] = 0 + z_df = pd.DataFrame(columns=adata.obs[group_name].unique(), + index=list(marker_dict.keys())).fillna(0) + + else: + for cluster in adata.obs[group_name].unique(): + adata0 = adata.copy() + adata0.obs[group_name] = adata0.obs[group_name].astype(str) + + # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + # reference=cluster_to_compair[cluster], method='wilcoxon') + + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + group_name] = 'ref' + + sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + reference='ref', method='wilcoxon') + + wranks = wrangle_ranks_from_adata(adata0) + + z_scores=[] + adj_pvalss=[] + for i in marker_dict: + z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) + adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) + # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) + # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) + + # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] + result_df_zscore[cluster]=z_scores + result_df_apvalue[cluster]=adj_pvalss + + z_df = result_df_zscore.set_index('level_name') + cluster2ct = z_df.idxmax().to_dict() + + has_na = False + # TODO add has_na + if has_na: + for i in cluster2ct: + if i in z_df[z_df.max(axis=1)<1].index: + cluster2ct[i] = 'na' + + assign_df = pd.DataFrame(pd.Series(cluster2ct)) + assign_df = assign_df.rename(columns={0:'max_de'}) + assign_df['de_score'] = z_df.max() + + # cal max exp + raw_bulk = get_bulk_exp(adata, group_name, 'raw') + max_exp=pd.DataFrame(index=raw_bulk.columns) + for cell in marker_dict: + if sum(raw_bulk.index.isin(marker_dict[cell]))>0: + good_markers = [i for i in marker_dict[cell] if i in raw_bulk.index] + max_exp[cell] = raw_bulk.loc[good_markers].max() + else: + max_exp[cell] = 0 + + s = max_exp.select_dtypes(include='object').columns + max_exp[s] = max_exp[s].astype("float") + max_exp_df = pd.DataFrame(max_exp.max(axis=1)) + max_exp_df = max_exp_df.rename(columns={0:'exp_score'}) + + max_exp_df['max_exp'] = max_exp.idxmax(axis=1) + + + # merge de and exp + assign_df = pd.merge(assign_df, max_exp_df, left_index=True, right_index=True) + + + + mt_results = z_df * max_exp.T.loc[z_df.index, z_df.columns] + + mt_results_df = pd.DataFrame(mt_results.max(axis=0)) + mt_results_df = mt_results_df.rename(columns={0:'mt_score'}) + + mt_results_df['max_mt'] = mt_results.idxmax(axis=0) + + assign_df = pd.merge(assign_df, mt_results_df, left_index=True, right_index=True) + + use_mt = True + na_cutoff = 0.1 + if use_mt: + classs=[] + for index,row in assign_df.iterrows(): + if row['mt_score'] <= na_cutoff: + classs.append(row['max_exp']) + else: + classs.append(row['max_mt']) + + # else: + # for index,row in assign_df.iterrows(): + # if row['mt_score'] < na_cutoff: + # if row['de_score'] > na_cutoff: + # classs.append(row['max_de']) + # elif row['exp_score'] > na_cutoff: + # classs.append(row['max_exp']) + # else: + # classs.append('na') + # else: + # classs.append(row['max_mt']) + + else: + classs=[] + # for index,row in assign_df.iterrows(): + # if row['de_score'] > 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] < 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] > 2: + # classs.append(row['max_exp']) + # elif row['exp_score'] > 0: + # classs.append(row['max_exp']) + # else: + # classs.append('na') + for index,row in assign_df.iterrows(): + if row['de_score'] > 2: + classs.append(row['max_de']) + elif row['de_score'] > 1: + if row['exp_score'] > 2: + classs.append(row['max_exp']) + else: + classs.append(row['max_de']) + elif row['de_score'] > 0: + if row['exp_score'] > 0.5: + classs.append(row['max_exp']) + else: + classs.append('na') + else: + classs.append('na') + + assign_df['class'] = classs + + # TODO magic way to avoid get_annot_df error + assign_df['expr'] = 1 + + return assign_df + + # adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) + # return adata + +def wrangle_ranks_from_adata(adata): + """ + Wrangle results from the ranked_genes_groups function of Scanpy. + """ + # Get number of top ranked genes per groups + nb_marker = len(adata.uns['rank_genes_groups']['names']) + # Wrangle results into a table (pandas dataframe) + top_score = pd.DataFrame(adata.uns['rank_genes_groups']['scores']).loc[:nb_marker] + top_adjpval = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] + top_gene = pd.DataFrame(adata.uns['rank_genes_groups']['names']).loc[:nb_marker] + marker_df = pd.DataFrame() + # Order values + for i in top_score.columns: + concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) + concat['cluster_number'] = i + col = list(concat.columns) + col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' + concat.columns = col + marker_df = marker_df.append(concat) + return marker_df + +def get_bulk_exp(adata, bulk_labels, layer='var'): + if layer=='raw': + res = pd.DataFrame(columns=adata.raw.var_names, index=adata.obs[bulk_labels].cat.categories) + else: + res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) + + for clust in adata.obs[bulk_labels].cat.categories: + if layer=='raw': + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].raw.X.mean(0) + else: + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) + + res.index=adata.obs[bulk_labels].cat.categories + + return res.T + diff --git a/build/lib/snapseed/degenes.py b/build/lib/snapseed/degenes.py new file mode 100644 index 0000000..546de3e --- /dev/null +++ b/build/lib/snapseed/degenes.py @@ -0,0 +1,267 @@ +import math +from functools import partial + +import numpy as np +import pandas as pd +from sklearn import preprocessing + +import jax +from jax import numpy as jnp + +import scanpy as sc + +from .auroc import annotate_snap + +# from snapseed.utils import read_yaml +from .utils import read_yaml, dict_to_binary, frac_nonzero, masked_max, match, to_dense + + +def annotate_degenes( + adata, + marker_dict, + group_name, + layer=None, + level=None, + auc_weight=0.5, + expr_weight=0.5 + ): + """ + Annotate cell types based on differentially expressed (DE) marker genes. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression + """ + # level_name = "level_" + str(level) + + # TODO magic way for adata only has one cluster + # fixed + # if len(adata.obs[group_name].unique()) <= 1: + # # 1st way + # # assign_df = annotate_snap( + # # adata, marker_dict, group_name, layer=layer + # # ) + # # 2nd way + # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + # assign_df.index=adata.obs[group_name].unique() + # # 3rd way + # # assign_df=pd.DataFrame() + # return assign_df + + + # for celltype in marker_dict['subtypes']: + # subgenes = marker_dict['subtypes'][celltype] + # if marker_dict['subtypes'][celltype]['marker_genes'] == []: + # markers_all=[] + # for i in marker_dict['subtypes']: + # for j in marker_dict['subtypes'][i]['marker_genes']: + # markers_all.append(j) + # marker_dict['subtypes'][celltype]['marker_genes'] = markers_all + + # cal max de + corr_df = get_bulk_exp(adata, group_name).astype(float).corr() + corr_df = 1 - corr_df + + if level==1: + dist_pect=10 + else: + dist_pect=2 + + ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) + cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() + for i in cluster_to_compair: + if i in cluster_to_compair[i]: + cluster_to_compair[i].remove(i) + + # Reformat marker_dict into binary matrix + marker_mat = dict_to_binary(marker_dict) + + # Compute AUROC and fraction nonzero for marker features + features = marker_mat.columns + + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + cell_groups = jnp.array(le.transform(adata.obs[group_name])) + + groups = le.classes_ + + expr, features = get_expr(adata, features=features, layer='raw') + + aurocs = jnp.zeros((len(groups), len(features))) + frac_nzs = jnp.zeros((len(groups), len(features))) + allgroups=[] + + aurocs = pd.DataFrame(aurocs).T + aurocs.columns=groups + aurocs.index = features + + frac_nzs = pd.DataFrame(frac_nzs).T + frac_nzs.columns=groups + frac_nzs.index = features + + for cluster in groups: + + cluster = str(cluster) + + adata0 = adata.copy() + adata0.obs[group_name] = adata0.obs[group_name].astype(str) + + if cluster not in cluster_to_compair: + adata0 = adata0[adata0.obs[group_name].isin([cluster])] + + else: + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + group_name] = 'ref' + adata0 = adata0[adata0.obs[group_name].isin(['ref', cluster])] + + metric = auc_expr(adata0, group_name, features=features) + aurocs[cluster] = metric['auroc'][metric['groups']==cluster][0] + frac_nzs[cluster] = metric['frac_nonzero'][metric['groups']==cluster][0] + allgroups.append(cluster) + + metrics={'frac_nonzero':frac_nzs, + 'auroc':aurocs, + 'features':features, + 'groups':groups} + + + marker_mat = marker_mat.loc[:, metrics["features"]] + + auc_max = pd.DataFrame() + for i in marker_dict: + auc_max[i] = aurocs.loc[[i for i in marker_dict[i] if i in aurocs.index],:].max() + + expr_max = pd.DataFrame() + for i in marker_dict: + expr_max[i] = frac_nzs.loc[[i for i in marker_dict[i] if i in frac_nzs.index],:].max() + + # Combine metrics + assignment_scores = (auc_weight * auc_max + expr_weight * expr_max) / ( + auc_weight + expr_weight + ) + + assign_class = assignment_scores.idxmax(1) + + assign_df = pd.DataFrame( + { + "class": assign_class, + "score": np.diag(assignment_scores[assign_class]), + "auc": np.diag(auc_max[assign_class]), + "expr": np.diag(expr_max[assign_class]), + }, + index=metrics["groups"], + ) + + return assign_df + + # adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) + # return adata + +def get_bulk_exp(adata, bulk_labels, layer='var'): + if layer=='raw': + res = pd.DataFrame(columns=adata.raw.var_names, index=adata.obs[bulk_labels].cat.categories) + else: + res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) + + for clust in adata.obs[bulk_labels].cat.categories: + if layer=='raw': + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].raw.X.mean(0) + else: + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) + + res.index=adata.obs[bulk_labels].cat.categories + + return res.T + + +def get_expr(adata, features=None, layer=None): + """Get expression matrix from adata object""" + if features is not None: + # intersect with adata features + features = list(set(features) & set(adata.var_names)) + adata = adata[:, match(features, adata.var_names.tolist())] + + + if layer == 'raw': + expr = jnp.array(to_dense(adata.raw.X)) + elif layer is not None: + expr = jnp.array(to_dense(adata.layers[layer])) + else: + expr = jnp.array(to_dense(adata.X)) + + return expr, features + +def auc_expr(adata, group_name, features=None, layer=None): + """Computes AUROC and fraction nonzero for each gene in an adata object.""" + # Turn string groups into integers + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + # Compute AUROC and fraction nonzero + cell_groups = jnp.array(le.transform(adata.obs[group_name])) + + groups = le.classes_ + + expr, features = get_expr(adata, features=features, layer=layer) + auroc, frac_nonzero = expr_auroc_over_groups(expr, cell_groups, groups) + + return dict( + frac_nonzero=frac_nonzero, + auroc=auroc, + features=features, + groups=le.classes_, + ) + + +@jax.jit +@partial(jax.vmap, in_axes=[1, None]) +def jit_auroc(x, groups): + # TODO: compute frac nonzero here to avoid iterating twice + + # sort scores and corresponding truth values + desc_score_indices = jnp.argsort(x)[::-1] + x = x[desc_score_indices] + groups = groups[desc_score_indices] + + # x typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = jnp.array(jnp.diff(x) != 0, dtype=jnp.int32) + threshold_mask = jnp.r_[distinct_value_indices, 1] + + # accumulate the true positives with decreasing threshold + tps_ = jnp.cumsum(groups) + fps_ = 1 + jnp.arange(groups.size) - tps_ + + # mask out the values that are not distinct + tps = jnp.sort(tps_ * threshold_mask) + fps = jnp.sort(fps_ * threshold_mask) + tps = jnp.r_[0, tps] + fps = jnp.r_[0, fps] + fpr = fps / fps[-1] + tpr = tps / tps[-1] + area = jnp.trapz(tpr, fpr) + return area + + +def expr_auroc_over_groups(expr, cell_groups, groups): + """Computes AUROC for each group separately.""" + auroc = jnp.zeros((len(groups), expr.shape[1])) + frac_nz = jnp.zeros((len(groups), expr.shape[1])) + + for group in groups: + if group == 'ref': + group = 0 + else: + group = int(group) + auroc = auroc.at[group, :].set(jit_auroc(expr, cell_groups == group)) + frac_nz = frac_nz.at[group, :].set(frac_nonzero(expr[cell_groups == group, :])) + + return auroc, frac_nz + diff --git a/build/lib/snapseed/trinarize.py b/build/lib/snapseed/trinarize.py new file mode 100644 index 0000000..3515b56 --- /dev/null +++ b/build/lib/snapseed/trinarize.py @@ -0,0 +1,152 @@ +import numba +import jax +from jax import numpy as jnp +import pandas as pd +import numpy as np + +from functools import partial +from sklearn import preprocessing + +from jax.scipy.special import gammaln, betainc, betaln + +from .utils import dict_to_binary, get_expr + + +def annotate_cytograph(adata, marker_dict, group_name, layer=None, f=0.2): + """ + Annotate clusters based on trinarization of marker gene expression. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression + """ + # Reformat marker_dict into binary matrix + marker_mat = dict_to_binary(marker_dict) + + features = marker_mat.columns + marker_mat = marker_mat.loc[:, features] + + # Compute trinaries + metrics = trinarize(adata, group_name, features=features, layer=layer, f=0.2) + + annot_probs = get_annot_probs(np.array(metrics["trinaries"]), marker_mat.values) + + annot_dict = {} + annot_scores = np.zeros(annot_probs.shape[1]) + for i in range(annot_probs.shape[1]): + annot_dict[metrics["groups"][i]] = marker_mat.index[ + np.argmax(annot_probs[:, i]) + ] + annot_scores[i] = np.max(annot_probs[:, i]) + + annot_tags = [ + tag if isinstance(tag, str) else ";".join(tag) for tag in annot_dict.values() + ] + + assign_df = pd.DataFrame( + {"class": annot_tags, "score": annot_scores}, + index=annot_dict.keys(), + ) + return assign_df + + +def trinarize(adata, group_name, features=None, layer=None, f=0.2): + """Compute the trinaries for each marker gene.""" + # Turn string groups into integers + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + + # Get groups and expression + groups = jnp.array(le.transform(adata.obs[group_name])) + n_groups = int(groups.max() + 1) + expr, features = get_expr(adata, features=features, layer=layer) + + # Trinatize + trinaries = betabinomial_trinarize_array(expr, groups, n_groups, f) + + return dict( + trinaries=trinaries, + features=features, + groups=le.classes_, + ) + + +@numba.jit +def get_annot_probs(trinaries, marker_array): + """Compute the annotaion probability foea each cell type.""" + group_probs = np.zeros((marker_array.shape[0], trinaries.shape[1])) + for i in np.arange(trinaries.shape[1]): + for j in np.arange(marker_array.shape[0]): + marker_trinaries = trinaries[:, i][np.nonzero(marker_array[j, :])] + if marker_trinaries.shape[0] == 0: + group_probs[j, i] = 0 + else: + group_probs[j, i] = np.mean(marker_trinaries) + return group_probs + + +@partial(jax.jit, static_argnames=["n_groups", "f"]) +@partial(jax.vmap, in_axes=[1, None, None, None]) +def betabinomial_trinarize_array(x, groups, n_groups, f): + """ + Trinarize a vector, grouped by groups, using a beta binomial model + Parameters + ---------- + x + The input expression vector. + groups + Group labels. + + Returns + ------- + ps + The posterior probability of xession in at least a fraction f + """ + x = jnp.round(jnp.array(x)) + n_by_group = jnp.bincount(groups, length=n_groups) + k_by_group = jnp.zeros(n_groups) + for g in range(n_groups): + group_mask = jnp.array(groups == g, dtype=jnp.int32) + k_by_group = k_by_group.at[g].set(jnp.count_nonzero(x * group_mask)) + ps = p_half(k_by_group, n_by_group, f) + return ps + + +@jax.jit +@partial(jax.vmap, in_axes=[0, 0, None]) +def p_half(k: int, n: int, f: float) -> float: + """ + Return probability that at least half the cells express, if we have observed k of n cells expressing: + + p|k,n = 1-(betainc(1+k, 1-k+n, f)*gamma(2+n)/(gamma(1+k)*gamma(1-k+n))/beta(1+k, 1-k+n) + + Parameters + ---------- + k + Number of observed positive cells + n + Total number of cells + """ + # These are the prior hyperparameters beta(a,b) + a = 1.5 + b = 2 + # We really want to calculate this: + # p = 1-(betainc(a+k, b-k+n, 0.5)*beta(a+k, b-k+n)*gamma(a+b+n)/(gamma(a+k)*gamma(b-k+n))) + # + # But it's numerically unstable, so we need to work on log scale (and special-case the incomplete beta) + incb = betainc(a + k, b - k + n, f) + p = 1.0 - jnp.exp( + jnp.log(incb) + + betaln(a + k, b - k + n) + + gammaln(a + b + n) + - gammaln(a + k) + - gammaln(b - k + n) + ) + return p diff --git a/build/lib/snapseed/utils.py b/build/lib/snapseed/utils.py new file mode 100644 index 0000000..dbcd84e --- /dev/null +++ b/build/lib/snapseed/utils.py @@ -0,0 +1,113 @@ +import yaml +import pandas as pd +import numba + +import jax +from jax import numpy as jnp +from jax.experimental.sparse import BCOO + +from functools import partial +from sklearn.metrics import roc_auc_score + +import scanpy as sc + +to_dense = lambda x: x.toarray() if hasattr(x, "toarray") else x + +def to_jax_array(x): + """Turn matrix to jax array.""" + if hasattr(x, "todense"): + # Currently this is not supported for really large matrices + # return BCOO.from_scipy_sparse(x).update_layout(n_batch=1, on_inefficient=None) + return jnp.asarray(x.todense()) + else: + return jnp.asarray(x) + + +@jax.jit +@partial(jax.vmap, in_axes=[None, 0]) +def masked_max(x, mask): + return jnp.max(x * mask, axis=1) + + +@jax.jit +@partial(jax.vmap, in_axes=[None, 0]) +def masked_mean(x, mask): + return jnp.sum(x * mask, axis=1) / jnp.sum(mask) + + +def frac_nonzero(x, axis=0): + return jnp.mean(x > 0, axis=axis) + + +jit_frac_nonzero = jax.jit(frac_nonzero) + + +def dict_to_binary(d): + df = pd.concat( + [pd.Series(v, name=k).astype(str) for k, v in d.items()], + axis=1, + ) + marker_mat = pd.get_dummies(df.stack()).groupby(level=1).sum().clip(upper=1) + return marker_mat + + +@numba.jit +def match(a, b): + return [b.index(x) if x in b else None for x in a] + + +def get_expr(adata, features=None, layer=None): + """Get expression matrix from adata object""" + if features is not None: + # intersect with adata features + features = list(set(features) & set(adata.var_names)) + adata = adata[ + :, + match( + numba.typed.List(features), numba.typed.List(adata.var_names.tolist()) + ), + ] + else: + features = adata.var_names.copy().tolist() + + if layer is not None: + expr = to_jax_array(adata.layers[layer]) + else: + expr = to_jax_array(adata.X) + + return expr, features + + +def get_markers(x): + return {n: v["marker_genes"] for n, v in x.items()} + + +def read_yaml(file): + with open(file, "r") as f: + marker_dict = yaml.safe_load(f) + return marker_dict + + +def get_annot_df(x, group_name, min_expr=0.1): + # Get valid annots from each level + annot_list = [] + for k, v in x.items(): + annot = v.set_index(group_name)["class"] + if min_expr > 0: + expr = v.set_index(group_name)["expr"] + annot = annot[expr > min_expr] + annot_list.append(annot) + # Concat annots + annot_df = pd.concat(annot_list, axis=1) + # Rename cols to levels + annot_df.columns = [str(i) for i in x.keys()] + return annot_df + + +def matrix_to_long_df(x, features, groups): + """Converts a matrix to a long dataframe""" + df = pd.DataFrame(x, index=groups, columns=features) + df = df.stack().reset_index() + df.columns = ["group", "feature", "value"] + return df + \ No newline at end of file diff --git a/snapseed.egg-info/PKG-INFO b/snapseed.egg-info/PKG-INFO new file mode 100644 index 0000000..2547740 --- /dev/null +++ b/snapseed.egg-info/PKG-INFO @@ -0,0 +1,40 @@ +Metadata-Version: 2.1 +Name: snapseed +Version: 0.1.0 +Summary: Superfast hierarchical annotation of large single-cell datasets +Author: Jonas Simon Fleck +Author-email: jonas.simon.fleck@gmail.com +License: MIT +Requires-Python: >=3.9 +Description-Content-Type: text/markdown + +# snapseed + +Snapseed annotates single-cell datasets based on manually defined sets of marker genes for individual cell types or cell type hierarchies. It is fast and simple to accelerate annotation of very large datasets. + + +## Quick start + +```python +import snapseed as snap +from snapseed.utils import read_yaml + +# Read in the marker genes +marker_genes = read_yaml("marker_genes.yaml") + +# Annotate anndata objects +snap.annotate( + adata, + marker_genes, + group_name="clusters", + layer="lognorm", +) + +# Or for more complex hierarchies +snap.annotate_hierarchy( + adata, + marker_genes, + group_name="clusters", + layer="lognorm", +) +``` diff --git a/snapseed.egg-info/SOURCES.txt b/snapseed.egg-info/SOURCES.txt new file mode 100644 index 0000000..0434dd2 --- /dev/null +++ b/snapseed.egg-info/SOURCES.txt @@ -0,0 +1,14 @@ +README.md +setup.py +snapseed/__init__.py +snapseed/annotate.py +snapseed/auroc.py +snapseed/degenes copy.py +snapseed/degenes.py +snapseed/trinarize.py +snapseed/utils.py +snapseed.egg-info/PKG-INFO +snapseed.egg-info/SOURCES.txt +snapseed.egg-info/dependency_links.txt +snapseed.egg-info/requires.txt +snapseed.egg-info/top_level.txt \ No newline at end of file diff --git a/snapseed.egg-info/dependency_links.txt b/snapseed.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/snapseed.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/snapseed.egg-info/requires.txt b/snapseed.egg-info/requires.txt new file mode 100644 index 0000000..412f00d --- /dev/null +++ b/snapseed.egg-info/requires.txt @@ -0,0 +1,7 @@ +jax>=0.2.26 +jaxlib>=0.1.75 +numpy>=1.18.5 +numba>=0.56.4 +scanpy>=1.9.1 +scikit-learn>=1.1.3 +pandas>=1.5.2 diff --git a/snapseed.egg-info/top_level.txt b/snapseed.egg-info/top_level.txt new file mode 100644 index 0000000..b7c5403 --- /dev/null +++ b/snapseed.egg-info/top_level.txt @@ -0,0 +1 @@ +snapseed diff --git a/snapseed/degenes copy.py b/snapseed/degenes copy.py new file mode 100644 index 0000000..4d42df2 --- /dev/null +++ b/snapseed/degenes copy.py @@ -0,0 +1,255 @@ +import math + +import numpy as np +import pandas as pd + +import scanpy as sc +from .auroc import annotate_snap + +from snapseed.utils import read_yaml + + +def annotate_degenes( + adata, + marker_dict, + group_name, + layer=None, + level=None, + ): + """ + Annotate cell types based on differentially expressed (DE) marker genes. + + Parameters + ---------- + adata + AnnData object + marker_dict + Dict with marker genes for each celltype + group_name + Name of the column in adata.obs that contains the cluster labels + layer + Layer in adata to use for expression + """ + # level_name = "level_" + str(level) + + # TODO magic way for adata only has one cluster + # fixed + # if len(adata.obs[group_name].unique()) <= 1: + # # 1st way + # # assign_df = annotate_snap( + # # adata, marker_dict, group_name, layer=layer + # # ) + # # 2nd way + # assign_df = pd.DataFrame({'class':['na'], 'score':[np.nan], 'expr':[1]}) + # assign_df.index=adata.obs[group_name].unique() + # # 3rd way + # # assign_df=pd.DataFrame() + # return assign_df + + + # for celltype in marker_dict['subtypes']: + # subgenes = marker_dict['subtypes'][celltype] + # if marker_dict['subtypes'][celltype]['marker_genes'] == []: + # markers_all=[] + # for i in marker_dict['subtypes']: + # for j in marker_dict['subtypes'][i]['marker_genes']: + # markers_all.append(j) + # marker_dict['subtypes'][celltype]['marker_genes'] = markers_all + + # cal max de + corr_df = get_bulk_exp(adata, group_name).astype(float).corr() + corr_df = 1 - corr_df + + if level==1: + dist_pect=10 + else: + dist_pect=2 + + ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) + cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() + + result_df_zscore = pd.DataFrame(marker_dict.keys()) + result_df_zscore = result_df_zscore.rename(columns={0:'level_name'}) + + result_df_apvalue = pd.DataFrame(marker_dict.keys()) + result_df_apvalue = result_df_apvalue.rename(columns={0:'level_name'}) + + if len(adata.obs[group_name].unique()) == 1: + assign_df = pd.DataFrame(index=adata.obs[group_name].unique()) + assign_df['max_de'] = 'na' + assign_df['de_score'] = 0 + z_df = pd.DataFrame(columns=adata.obs[group_name].unique(), + index=list(marker_dict.keys())).fillna(0) + + else: + for cluster in adata.obs[group_name].unique(): + adata0 = adata.copy() + adata0.obs[group_name] = adata0.obs[group_name].astype(str) + + # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + # reference=cluster_to_compair[cluster], method='wilcoxon') + + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + group_name] = 'ref' + + sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], + reference='ref', method='wilcoxon') + + wranks = wrangle_ranks_from_adata(adata0) + + z_scores=[] + adj_pvalss=[] + for i in marker_dict: + z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) + adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) + # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) + # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) + + # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] + result_df_zscore[cluster]=z_scores + result_df_apvalue[cluster]=adj_pvalss + + z_df = result_df_zscore.set_index('level_name') + cluster2ct = z_df.idxmax().to_dict() + + has_na = False + # TODO add has_na + if has_na: + for i in cluster2ct: + if i in z_df[z_df.max(axis=1)<1].index: + cluster2ct[i] = 'na' + + assign_df = pd.DataFrame(pd.Series(cluster2ct)) + assign_df = assign_df.rename(columns={0:'max_de'}) + assign_df['de_score'] = z_df.max() + + # cal max exp + raw_bulk = get_bulk_exp(adata, group_name, 'raw') + max_exp=pd.DataFrame(index=raw_bulk.columns) + for cell in marker_dict: + if sum(raw_bulk.index.isin(marker_dict[cell]))>0: + good_markers = [i for i in marker_dict[cell] if i in raw_bulk.index] + max_exp[cell] = raw_bulk.loc[good_markers].max() + else: + max_exp[cell] = 0 + + s = max_exp.select_dtypes(include='object').columns + max_exp[s] = max_exp[s].astype("float") + max_exp_df = pd.DataFrame(max_exp.max(axis=1)) + max_exp_df = max_exp_df.rename(columns={0:'exp_score'}) + + max_exp_df['max_exp'] = max_exp.idxmax(axis=1) + + + # merge de and exp + assign_df = pd.merge(assign_df, max_exp_df, left_index=True, right_index=True) + + + + mt_results = z_df * max_exp.T.loc[z_df.index, z_df.columns] + + mt_results_df = pd.DataFrame(mt_results.max(axis=0)) + mt_results_df = mt_results_df.rename(columns={0:'mt_score'}) + + mt_results_df['max_mt'] = mt_results.idxmax(axis=0) + + assign_df = pd.merge(assign_df, mt_results_df, left_index=True, right_index=True) + + use_mt = True + na_cutoff = 0.1 + if use_mt: + classs=[] + for index,row in assign_df.iterrows(): + if row['mt_score'] <= na_cutoff: + classs.append(row['max_exp']) + else: + classs.append(row['max_mt']) + + # else: + # for index,row in assign_df.iterrows(): + # if row['mt_score'] < na_cutoff: + # if row['de_score'] > na_cutoff: + # classs.append(row['max_de']) + # elif row['exp_score'] > na_cutoff: + # classs.append(row['max_exp']) + # else: + # classs.append('na') + # else: + # classs.append(row['max_mt']) + + else: + classs=[] + # for index,row in assign_df.iterrows(): + # if row['de_score'] > 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] < 2: + # classs.append(row['max_de']) + # elif row['de_score'] > 1 and row['exp_score'] > 2: + # classs.append(row['max_exp']) + # elif row['exp_score'] > 0: + # classs.append(row['max_exp']) + # else: + # classs.append('na') + for index,row in assign_df.iterrows(): + if row['de_score'] > 2: + classs.append(row['max_de']) + elif row['de_score'] > 1: + if row['exp_score'] > 2: + classs.append(row['max_exp']) + else: + classs.append(row['max_de']) + elif row['de_score'] > 0: + if row['exp_score'] > 0.5: + classs.append(row['max_exp']) + else: + classs.append('na') + else: + classs.append('na') + + assign_df['class'] = classs + + # TODO magic way to avoid get_annot_df error + assign_df['expr'] = 1 + + return assign_df + + # adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) + # return adata + +def wrangle_ranks_from_adata(adata): + """ + Wrangle results from the ranked_genes_groups function of Scanpy. + """ + # Get number of top ranked genes per groups + nb_marker = len(adata.uns['rank_genes_groups']['names']) + # Wrangle results into a table (pandas dataframe) + top_score = pd.DataFrame(adata.uns['rank_genes_groups']['scores']).loc[:nb_marker] + top_adjpval = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] + top_gene = pd.DataFrame(adata.uns['rank_genes_groups']['names']).loc[:nb_marker] + marker_df = pd.DataFrame() + # Order values + for i in top_score.columns: + concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) + concat['cluster_number'] = i + col = list(concat.columns) + col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' + concat.columns = col + marker_df = marker_df.append(concat) + return marker_df + +def get_bulk_exp(adata, bulk_labels, layer='var'): + if layer=='raw': + res = pd.DataFrame(columns=adata.raw.var_names, index=adata.obs[bulk_labels].cat.categories) + else: + res = pd.DataFrame(columns=adata.var_names, index=adata.obs[bulk_labels].cat.categories) + + for clust in adata.obs[bulk_labels].cat.categories: + if layer=='raw': + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].raw.X.mean(0) + else: + res.loc[clust] = adata[adata.obs[bulk_labels].isin([clust]),:].X.mean(0) + + res.index=adata.obs[bulk_labels].cat.categories + + return res.T + diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 20fbf5f..546de3e 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -1,12 +1,19 @@ import math +from functools import partial import numpy as np import pandas as pd +from sklearn import preprocessing + +import jax +from jax import numpy as jnp import scanpy as sc + from .auroc import annotate_snap -from snapseed.utils import read_yaml +# from snapseed.utils import read_yaml +from .utils import read_yaml, dict_to_binary, frac_nonzero, masked_max, match, to_dense def annotate_degenes( @@ -15,6 +22,8 @@ def annotate_degenes( group_name, layer=None, level=None, + auc_weight=0.5, + expr_weight=0.5 ): """ Annotate cell types based on differentially expressed (DE) marker genes. @@ -67,140 +76,94 @@ def annotate_degenes( ntop = math.ceil(len(adata.obs[group_name].unique())/dist_pect) cluster_to_compair = corr_df.apply(lambda s: s.abs().nlargest(ntop).index.tolist(), axis=1).to_dict() + for i in cluster_to_compair: + if i in cluster_to_compair[i]: + cluster_to_compair[i].remove(i) - result_df_zscore = pd.DataFrame(marker_dict.keys()) - result_df_zscore = result_df_zscore.rename(columns={0:'level_name'}) + # Reformat marker_dict into binary matrix + marker_mat = dict_to_binary(marker_dict) - result_df_apvalue = pd.DataFrame(marker_dict.keys()) - result_df_apvalue = result_df_apvalue.rename(columns={0:'level_name'}) + # Compute AUROC and fraction nonzero for marker features + features = marker_mat.columns - if len(adata.obs[group_name].unique()) == 1: - assign_df = pd.DataFrame(index=adata.obs[group_name].unique()) - assign_df['max_de'] = 'na' - assign_df['de_score'] = 0 + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + cell_groups = jnp.array(le.transform(adata.obs[group_name])) - else: - for cluster in adata.obs[group_name].unique(): - adata0 = adata.copy() - adata0.obs[group_name] = adata0.obs[group_name].astype(str) + groups = le.classes_ + + expr, features = get_expr(adata, features=features, layer='raw') + + aurocs = jnp.zeros((len(groups), len(features))) + frac_nzs = jnp.zeros((len(groups), len(features))) + allgroups=[] + + aurocs = pd.DataFrame(aurocs).T + aurocs.columns=groups + aurocs.index = features + + frac_nzs = pd.DataFrame(frac_nzs).T + frac_nzs.columns=groups + frac_nzs.index = features + + for cluster in groups: + + cluster = str(cluster) + + adata0 = adata.copy() + adata0.obs[group_name] = adata0.obs[group_name].astype(str) - # sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], - # reference=cluster_to_compair[cluster], method='wilcoxon') - + if cluster not in cluster_to_compair: + adata0 = adata0[adata0.obs[group_name].isin([cluster])] + + else: adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), group_name] = 'ref' + adata0 = adata0[adata0.obs[group_name].isin(['ref', cluster])] + + metric = auc_expr(adata0, group_name, features=features) + aurocs[cluster] = metric['auroc'][metric['groups']==cluster][0] + frac_nzs[cluster] = metric['frac_nonzero'][metric['groups']==cluster][0] + allgroups.append(cluster) + + metrics={'frac_nonzero':frac_nzs, + 'auroc':aurocs, + 'features':features, + 'groups':groups} - sc.tl.rank_genes_groups(adata0, group_name, groups=[cluster], - reference='ref', method='wilcoxon') - - wranks = wrangle_ranks_from_adata(adata0) - - z_scores=[] - adj_pvalss=[] - for i in marker_dict: - z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]), 'z_score'].max()) - adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]), 'adj_pvals']).max()) - # z_scores.append(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'z_score'].max()) - # adj_pvalss.append(-np.log10(wranks.loc[wranks.gene.isin(marker_dict[i]['marker_genes']), 'adj_pvals']).max()) - - # result_df_zscore[cluster]=[i*j for i,j in zip(z_scores,adj_pvalss)] - result_df_zscore[cluster]=z_scores - result_df_apvalue[cluster]=adj_pvalss - - z_df = result_df_zscore.set_index('level_name') - cluster2ct = z_df.idxmax().to_dict() - - has_na = False - # TODO add has_na - if has_na: - for i in cluster2ct: - if i in z_df[z_df.max(axis=1)<1].index: - cluster2ct[i] = 'na' - - assign_df = pd.DataFrame(pd.Series(cluster2ct)) - assign_df = assign_df.rename(columns={0:'max_de'}) - assign_df['de_score'] = z_df.max() - - # cal max exp - raw_bulk = get_bulk_exp(adata, group_name, 'raw') - max_exp=pd.DataFrame(index=raw_bulk.columns) - for cell in marker_dict: - if sum(raw_bulk.index.isin(marker_dict[cell]))>0: - good_markers = [i for i in marker_dict[cell] if i in raw_bulk.index] - max_exp[cell] = raw_bulk.loc[good_markers].max() - else: - max_exp[cell] = 0 - - s = max_exp.select_dtypes(include='object').columns - max_exp[s] = max_exp[s].astype("float") - max_exp_df = pd.DataFrame(max_exp.max(axis=1)) - max_exp_df = max_exp_df.rename(columns={0:'exp_score'}) - - max_exp_df['max_exp'] = max_exp.idxmax(axis=1) - - - # merge de and exp - assign_df = pd.merge(assign_df, max_exp_df, left_index=True, right_index=True) - - classs=[] - # for index,row in assign_df.iterrows(): - # if row['de_score'] > 2: - # classs.append(row['max_de']) - # elif row['de_score'] > 1 and row['exp_score'] < 2: - # classs.append(row['max_de']) - # elif row['de_score'] > 1 and row['exp_score'] > 2: - # classs.append(row['max_exp']) - # elif row['exp_score'] > 0: - # classs.append(row['max_exp']) - # else: - # classs.append('na') - for index,row in assign_df.iterrows(): - if row['de_score'] > 2: - classs.append(row['max_de']) - elif row['de_score'] > 1: - if row['exp_score'] > 2: - classs.append(row['max_exp']) - else: - classs.append(row['max_de']) - elif row['de_score'] > 0: - if row['exp_score'] > 0.5: - classs.append(row['max_exp']) - else: - classs.append('na') - else: - classs.append('na') - assign_df['class'] = classs + marker_mat = marker_mat.loc[:, metrics["features"]] - # TODO magic way to avoid get_annot_df error - assign_df['expr'] = 1 + auc_max = pd.DataFrame() + for i in marker_dict: + auc_max[i] = aurocs.loc[[i for i in marker_dict[i] if i in aurocs.index],:].max() + + expr_max = pd.DataFrame() + for i in marker_dict: + expr_max[i] = frac_nzs.loc[[i for i in marker_dict[i] if i in frac_nzs.index],:].max() + + # Combine metrics + assignment_scores = (auc_weight * auc_max + expr_weight * expr_max) / ( + auc_weight + expr_weight + ) + + assign_class = assignment_scores.idxmax(1) + + assign_df = pd.DataFrame( + { + "class": assign_class, + "score": np.diag(assignment_scores[assign_class]), + "auc": np.diag(auc_max[assign_class]), + "expr": np.diag(expr_max[assign_class]), + }, + index=metrics["groups"], + ) return assign_df # adata.obs[level_name] = adata.obs[group_name].map(cluster2ct) # return adata -def wrangle_ranks_from_adata(adata): - """ - Wrangle results from the ranked_genes_groups function of Scanpy. - """ - # Get number of top ranked genes per groups - nb_marker = len(adata.uns['rank_genes_groups']['names']) - # Wrangle results into a table (pandas dataframe) - top_score = pd.DataFrame(adata.uns['rank_genes_groups']['scores']).loc[:nb_marker] - top_adjpval = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj']).loc[:nb_marker] - top_gene = pd.DataFrame(adata.uns['rank_genes_groups']['names']).loc[:nb_marker] - marker_df = pd.DataFrame() - # Order values - for i in top_score.columns: - concat = pd.concat([top_score[[str(i)]], top_adjpval[str(i)], top_gene[[str(i)]]], axis=1, ignore_index=True) - concat['cluster_number'] = i - col = list(concat.columns) - col[0], col[1], col[-2] = 'z_score', 'adj_pvals', 'gene' - concat.columns = col - marker_df = marker_df.append(concat) - return marker_df - def get_bulk_exp(adata, bulk_labels, layer='var'): if layer=='raw': res = pd.DataFrame(columns=adata.raw.var_names, index=adata.obs[bulk_labels].cat.categories) @@ -217,3 +180,88 @@ def get_bulk_exp(adata, bulk_labels, layer='var'): return res.T + +def get_expr(adata, features=None, layer=None): + """Get expression matrix from adata object""" + if features is not None: + # intersect with adata features + features = list(set(features) & set(adata.var_names)) + adata = adata[:, match(features, adata.var_names.tolist())] + + + if layer == 'raw': + expr = jnp.array(to_dense(adata.raw.X)) + elif layer is not None: + expr = jnp.array(to_dense(adata.layers[layer])) + else: + expr = jnp.array(to_dense(adata.X)) + + return expr, features + +def auc_expr(adata, group_name, features=None, layer=None): + """Computes AUROC and fraction nonzero for each gene in an adata object.""" + # Turn string groups into integers + le = preprocessing.LabelEncoder() + le.fit(adata.obs[group_name]) + # Compute AUROC and fraction nonzero + cell_groups = jnp.array(le.transform(adata.obs[group_name])) + + groups = le.classes_ + + expr, features = get_expr(adata, features=features, layer=layer) + auroc, frac_nonzero = expr_auroc_over_groups(expr, cell_groups, groups) + + return dict( + frac_nonzero=frac_nonzero, + auroc=auroc, + features=features, + groups=le.classes_, + ) + + +@jax.jit +@partial(jax.vmap, in_axes=[1, None]) +def jit_auroc(x, groups): + # TODO: compute frac nonzero here to avoid iterating twice + + # sort scores and corresponding truth values + desc_score_indices = jnp.argsort(x)[::-1] + x = x[desc_score_indices] + groups = groups[desc_score_indices] + + # x typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = jnp.array(jnp.diff(x) != 0, dtype=jnp.int32) + threshold_mask = jnp.r_[distinct_value_indices, 1] + + # accumulate the true positives with decreasing threshold + tps_ = jnp.cumsum(groups) + fps_ = 1 + jnp.arange(groups.size) - tps_ + + # mask out the values that are not distinct + tps = jnp.sort(tps_ * threshold_mask) + fps = jnp.sort(fps_ * threshold_mask) + tps = jnp.r_[0, tps] + fps = jnp.r_[0, fps] + fpr = fps / fps[-1] + tpr = tps / tps[-1] + area = jnp.trapz(tpr, fpr) + return area + + +def expr_auroc_over_groups(expr, cell_groups, groups): + """Computes AUROC for each group separately.""" + auroc = jnp.zeros((len(groups), expr.shape[1])) + frac_nz = jnp.zeros((len(groups), expr.shape[1])) + + for group in groups: + if group == 'ref': + group = 0 + else: + group = int(group) + auroc = auroc.at[group, :].set(jit_auroc(expr, cell_groups == group)) + frac_nz = frac_nz.at[group, :].set(frac_nonzero(expr[cell_groups == group, :])) + + return auroc, frac_nz + diff --git a/snapseed/utils.py b/snapseed/utils.py index f56cbe9..dbcd84e 100644 --- a/snapseed/utils.py +++ b/snapseed/utils.py @@ -4,6 +4,7 @@ import jax from jax import numpy as jnp +from jax.experimental.sparse import BCOO from functools import partial from sklearn.metrics import roc_auc_score @@ -12,6 +13,15 @@ to_dense = lambda x: x.toarray() if hasattr(x, "toarray") else x +def to_jax_array(x): + """Turn matrix to jax array.""" + if hasattr(x, "todense"): + # Currently this is not supported for really large matrices + # return BCOO.from_scipy_sparse(x).update_layout(n_batch=1, on_inefficient=None) + return jnp.asarray(x.todense()) + else: + return jnp.asarray(x) + @jax.jit @partial(jax.vmap, in_axes=[None, 0]) @@ -20,10 +30,18 @@ def masked_max(x, mask): @jax.jit +@partial(jax.vmap, in_axes=[None, 0]) +def masked_mean(x, mask): + return jnp.sum(x * mask, axis=1) / jnp.sum(mask) + + def frac_nonzero(x, axis=0): return jnp.mean(x > 0, axis=axis) +jit_frac_nonzero = jax.jit(frac_nonzero) + + def dict_to_binary(d): df = pd.concat( [pd.Series(v, name=k).astype(str) for k, v in d.items()], @@ -43,12 +61,19 @@ def get_expr(adata, features=None, layer=None): if features is not None: # intersect with adata features features = list(set(features) & set(adata.var_names)) - adata = adata[:, match(features, adata.var_names.tolist())] + adata = adata[ + :, + match( + numba.typed.List(features), numba.typed.List(adata.var_names.tolist()) + ), + ] + else: + features = adata.var_names.copy().tolist() if layer is not None: - expr = jnp.array(to_dense(adata.layers[layer])) + expr = to_jax_array(adata.layers[layer]) else: - expr = jnp.array(to_dense(adata.X)) + expr = to_jax_array(adata.X) return expr, features @@ -63,7 +88,7 @@ def read_yaml(file): return marker_dict -def get_annot_df(x, group_name, min_expr=0.2): +def get_annot_df(x, group_name, min_expr=0.1): # Get valid annots from each level annot_list = [] for k, v in x.items(): @@ -77,3 +102,12 @@ def get_annot_df(x, group_name, min_expr=0.2): # Rename cols to levels annot_df.columns = [str(i) for i in x.keys()] return annot_df + + +def matrix_to_long_df(x, features, groups): + """Converts a matrix to a long dataframe""" + df = pd.DataFrame(x, index=groups, columns=features) + df = df.stack().reset_index() + df.columns = ["group", "feature", "value"] + return df + \ No newline at end of file From 30ea440db7656ade07145d388f00e1821964f16d Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 12 Jun 2023 15:00:55 +0200 Subject: [PATCH 16/20] fix bugs --- build/lib/snapseed/degenes.py | 31 ++++++++++++++++--------------- snapseed/degenes.py | 31 ++++++++++++++++--------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/build/lib/snapseed/degenes.py b/build/lib/snapseed/degenes.py index 546de3e..c0c5e43 100644 --- a/build/lib/snapseed/degenes.py +++ b/build/lib/snapseed/degenes.py @@ -106,25 +106,25 @@ def annotate_degenes( frac_nzs.columns=groups frac_nzs.index = features - for cluster in groups: + for group in groups: - cluster = str(cluster) + group = str(group) adata0 = adata.copy() adata0.obs[group_name] = adata0.obs[group_name].astype(str) - if cluster not in cluster_to_compair: - adata0 = adata0[adata0.obs[group_name].isin([cluster])] + if group not in cluster_to_compair: + adata0 = adata0[adata0.obs[group_name].isin([group])] else: - adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[group]), group_name] = 'ref' - adata0 = adata0[adata0.obs[group_name].isin(['ref', cluster])] + adata0 = adata0[adata0.obs[group_name].isin(['ref', group])] metric = auc_expr(adata0, group_name, features=features) - aurocs[cluster] = metric['auroc'][metric['groups']==cluster][0] - frac_nzs[cluster] = metric['frac_nonzero'][metric['groups']==cluster][0] - allgroups.append(cluster) + aurocs[group] = metric['auroc'][metric['groups']==group][0] + frac_nzs[group] = metric['frac_nonzero'][metric['groups']==group][0] + allgroups.append(group) metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, @@ -180,21 +180,22 @@ def get_bulk_exp(adata, bulk_labels, layer='var'): return res.T - def get_expr(adata, features=None, layer=None): """Get expression matrix from adata object""" + + if layer == 'raw' or layer == None: + adata = adata.raw.to_adata() + if features is not None: # intersect with adata features features = list(set(features) & set(adata.var_names)) adata = adata[:, match(features, adata.var_names.tolist())] + if layer == 'raw' or layer == None: + expr = jnp.array(to_dense(adata.X)) - if layer == 'raw': - expr = jnp.array(to_dense(adata.raw.X)) - elif layer is not None: - expr = jnp.array(to_dense(adata.layers[layer])) else: - expr = jnp.array(to_dense(adata.X)) + expr = jnp.array(to_dense(adata.layers[layer])) return expr, features diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 546de3e..c0c5e43 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -106,25 +106,25 @@ def annotate_degenes( frac_nzs.columns=groups frac_nzs.index = features - for cluster in groups: + for group in groups: - cluster = str(cluster) + group = str(group) adata0 = adata.copy() adata0.obs[group_name] = adata0.obs[group_name].astype(str) - if cluster not in cluster_to_compair: - adata0 = adata0[adata0.obs[group_name].isin([cluster])] + if group not in cluster_to_compair: + adata0 = adata0[adata0.obs[group_name].isin([group])] else: - adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[cluster]), + adata0.obs.loc[adata0.obs[group_name].isin(cluster_to_compair[group]), group_name] = 'ref' - adata0 = adata0[adata0.obs[group_name].isin(['ref', cluster])] + adata0 = adata0[adata0.obs[group_name].isin(['ref', group])] metric = auc_expr(adata0, group_name, features=features) - aurocs[cluster] = metric['auroc'][metric['groups']==cluster][0] - frac_nzs[cluster] = metric['frac_nonzero'][metric['groups']==cluster][0] - allgroups.append(cluster) + aurocs[group] = metric['auroc'][metric['groups']==group][0] + frac_nzs[group] = metric['frac_nonzero'][metric['groups']==group][0] + allgroups.append(group) metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, @@ -180,21 +180,22 @@ def get_bulk_exp(adata, bulk_labels, layer='var'): return res.T - def get_expr(adata, features=None, layer=None): """Get expression matrix from adata object""" + + if layer == 'raw' or layer == None: + adata = adata.raw.to_adata() + if features is not None: # intersect with adata features features = list(set(features) & set(adata.var_names)) adata = adata[:, match(features, adata.var_names.tolist())] + if layer == 'raw' or layer == None: + expr = jnp.array(to_dense(adata.X)) - if layer == 'raw': - expr = jnp.array(to_dense(adata.raw.X)) - elif layer is not None: - expr = jnp.array(to_dense(adata.layers[layer])) else: - expr = jnp.array(to_dense(adata.X)) + expr = jnp.array(to_dense(adata.layers[layer])) return expr, features From 02434744d56cb4a96f207ccdced85cc0fbe2be39 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Mon, 12 Jun 2023 15:25:31 +0200 Subject: [PATCH 17/20] fix bugs --- build/lib/snapseed/degenes.py | 2 +- snapseed/degenes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/lib/snapseed/degenes.py b/build/lib/snapseed/degenes.py index c0c5e43..4289365 100644 --- a/build/lib/snapseed/degenes.py +++ b/build/lib/snapseed/degenes.py @@ -92,7 +92,7 @@ def annotate_degenes( groups = le.classes_ - expr, features = get_expr(adata, features=features, layer='raw') + expr, features = get_expr(adata, features=features, layer=layer) aurocs = jnp.zeros((len(groups), len(features))) frac_nzs = jnp.zeros((len(groups), len(features))) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index c0c5e43..4289365 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -92,7 +92,7 @@ def annotate_degenes( groups = le.classes_ - expr, features = get_expr(adata, features=features, layer='raw') + expr, features = get_expr(adata, features=features, layer=layer) aurocs = jnp.zeros((len(groups), len(features))) frac_nzs = jnp.zeros((len(groups), len(features))) From 4f13c12c41500b3f3d8eca7c68065b931bf05a82 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 1 Dec 2023 12:18:18 +0100 Subject: [PATCH 18/20] magic one cluter --- snapseed/degenes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 4289365..9970bca 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -126,6 +126,8 @@ def annotate_degenes( frac_nzs[group] = metric['frac_nonzero'][metric['groups']==group][0] allgroups.append(group) + aurocs=aurocs.fillna(0.5) ### MAGIC TODO MAY PROBLEM + metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, 'features':features, From 113d04c6e87651904b12d665af4a33bc883614fe Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 1 Dec 2023 12:39:22 +0100 Subject: [PATCH 19/20] add error discribe --- build/lib/snapseed/degenes.py | 2 ++ snapseed/degenes.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/build/lib/snapseed/degenes.py b/build/lib/snapseed/degenes.py index 4289365..9970bca 100644 --- a/build/lib/snapseed/degenes.py +++ b/build/lib/snapseed/degenes.py @@ -126,6 +126,8 @@ def annotate_degenes( frac_nzs[group] = metric['frac_nonzero'][metric['groups']==group][0] allgroups.append(group) + aurocs=aurocs.fillna(0.5) ### MAGIC TODO MAY PROBLEM + metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, 'features':features, diff --git a/snapseed/degenes.py b/snapseed/degenes.py index 9970bca..6839829 100644 --- a/snapseed/degenes.py +++ b/snapseed/degenes.py @@ -127,7 +127,10 @@ def annotate_degenes( allgroups.append(group) aurocs=aurocs.fillna(0.5) ### MAGIC TODO MAY PROBLEM - + ### The PDAC Hwang_NatGenet_2022 GSE202051_003 sample + ### Level 2 mesenchyme only have 1 cluster lead error + ### KeyError: "None of [Float64Index([nan], dtype='float64')] are in the [columns]" + metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, 'features':features, From 680f81cde411c5dafb7b655680f7aae761f422b6 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Fri, 1 Dec 2023 12:42:56 +0100 Subject: [PATCH 20/20] no --- build/lib/snapseed/degenes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build/lib/snapseed/degenes.py b/build/lib/snapseed/degenes.py index 9970bca..6839829 100644 --- a/build/lib/snapseed/degenes.py +++ b/build/lib/snapseed/degenes.py @@ -127,7 +127,10 @@ def annotate_degenes( allgroups.append(group) aurocs=aurocs.fillna(0.5) ### MAGIC TODO MAY PROBLEM - + ### The PDAC Hwang_NatGenet_2022 GSE202051_003 sample + ### Level 2 mesenchyme only have 1 cluster lead error + ### KeyError: "None of [Float64Index([nan], dtype='float64')] are in the [columns]" + metrics={'frac_nonzero':frac_nzs, 'auroc':aurocs, 'features':features,