29 changes: 19 additions & 10 deletions bin/mut_density.py → bin/mut_density_adjusted.py
@@ -31,6 +31,8 @@ def get_correction_factor(sample_name, trinucleotide_counts_df, mutability_df, f
triplet_counts = np.array(l)

# genome length in Mb
# accounting for the fact that each position contributes 3*depth,
# one count of its depth per possible mutation at that position
genome_length = sum(triplet_counts) / (3 * 1e6)
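A minimal numeric check of the normalisation above, assuming (as the comment suggests) that `triplet_counts` holds depth-weighted 96-channel counts in which every position contributes its depth to three channels, one per possible alternate allele; the channel layout and numbers below are invented:

```python
import numpy as np

# hypothetical 96-channel counts: 2,000,000 positions, each sequenced at
# depth 1 and contributing one count to three different channels
triplet_counts = np.zeros(96)
triplet_counts[:3] = 2_000_000

genome_length = triplet_counts.sum() / (3 * 1e6)
print(genome_length)  # 2.0 -> an effective length of 2 Mb
```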

# vector of relative mutabilities in 96-channel canonical sorting
@@ -61,13 +63,17 @@ def mutation_density(sample_name, depths_file, somatic_mutations_file, mutabilit

for csqn, csqn_set in broadimpact_grouping_dict_with_synonymous.items():

for gene in panel_df['GENE'].unique():
for gene in list(panel_df['GENE'].unique()) + ["ALL_GENES"]:

# compute vector of sum of depths per trinucleotide context
# tailored to the specific gene-impact target
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set)) & (panel_df['GENE'] == gene)].copy()
if gene == 'ALL_GENES':
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set))][['CHROM', 'POS', 'REF', 'ALT']].drop_duplicates()
else:
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set)) & (panel_df['GENE'] == gene)][['CHROM', 'POS', 'REF', 'ALT']].drop_duplicates()

# counting every position once
# counting every position as many times as the number of possible
# mutations of the selected consequences at that position (1, 2 or 3)
dh = pd.merge(region_df[['CHROM', 'POS']],
depths_df[['CHROM', 'POS', 'CONTEXT', sample_name]],
on=['CHROM', 'POS'], how='left')
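A small sketch of how the merge above weights positions, with toy dataframes and a hypothetical sample column `S1`: a position listed with two qualifying ALTs contributes its depth twice, a position with a single ALT contributes it once.

```python
import pandas as pd

# toy region: position 100 has two qualifying ALTs, position 200 has one
region_df = pd.DataFrame({
    "CHROM": ["chr1", "chr1", "chr1"],
    "POS":   [100, 100, 200],
    "REF":   ["C", "C", "G"],
    "ALT":   ["A", "T", "A"],
})

# toy per-position depths for the hypothetical sample "S1"
depths_df = pd.DataFrame({
    "CHROM":   ["chr1", "chr1"],
    "POS":     [100, 200],
    "CONTEXT": ["ACA", "TGT"],
    "S1":      [500, 300],
})

dh = pd.merge(region_df[["CHROM", "POS"]],
              depths_df[["CHROM", "POS", "CONTEXT", "S1"]],
              on=["CHROM", "POS"], how="left")

# position 100 is counted twice, position 200 once: 500 + 500 + 300
print(dh["S1"].sum())  # 1300
```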
@@ -88,10 +94,13 @@ def mutation_density(sample_name, depths_file, somatic_mutations_file, mutabilit
except AssertionError:
res.loc[gene, csqn] = None
continue

# observed somatic mutations

n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set)) & (somatic_mutations_df['GENE'] == gene)].shape[0]

# observed somatic mutations
if gene == 'ALL_GENES':
n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set))].shape[0]
else:
n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set)) & (somatic_mutations_df['GENE'] == gene)].shape[0]

res.loc[gene, csqn] = n / effective_length

@@ -149,12 +158,12 @@ def main(sample_name, depths_file, somatic_mutations_file, mutability_file, pane
logfoldchange_plot(sample_name, res, res_flat)

# save results
res["SAMPLE"] = sample_name
res_flat["SAMPLE"] = sample_name
res["SAMPLE_ID"] = sample_name
res_flat["SAMPLE_ID"] = sample_name
res.index.name = 'GENE'
res_flat.index.name = 'GENE'
res[['SAMPLE'] + [col for col in res.columns if col != 'SAMPLE']].to_csv(f'{sample_name}.mutdensities.tsv', sep='\t')
res_flat[['SAMPLE'] + [col for col in res_flat.columns if col != 'SAMPLE']].to_csv(f'{sample_name}.mutdensities_flat.tsv', sep='\t')
res[['SAMPLE_ID'] + [col for col in res.columns if col != 'SAMPLE_ID']].to_csv(f'{sample_name}.mutdensities.tsv', sep='\t')
res_flat[['SAMPLE_ID'] + [col for col in res_flat.columns if col != 'SAMPLE_ID']].to_csv(f'{sample_name}.mutdensities_flat.tsv', sep='\t')



77 changes: 77 additions & 0 deletions bin/mut_density_adjusted_dnds.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python


import click
import pandas as pd
import numpy as np
from read_utils import custom_na_values


def compute_dnds_proxy(mutdensity_file, cohort_syn_mutdensities_file, output_file, mode):
"""
Compute a dN/dS-like proxy per gene and sample by dividing each
non-synonymous mutation density (missense, truncating, nonsynonymous_splice)
by the sample's synonymous mutation density and by the cohort-level
synonymous density of the same gene.

TODO: store a log file alongside the output that can be used to check
some basic statistics.

The `mode` option is not used yet, since only one type of synonymous
mutation density is currently computed.
"""

mutdensity_df_init = pd.read_csv(mutdensity_file, sep = "\t", header = 0, na_values = custom_na_values)
mutdensity_df_init["synonymous"] = mutdensity_df_init["synonymous"].replace(0, np.nan)
all_possible_genes = list(mutdensity_df_init["GENE"].unique())

cohort_syn_mutdensity_df = pd.read_csv(cohort_syn_mutdensities_file, sep = "\t", header = 0, na_values = custom_na_values)
cohort_syn_mutdensity_df.columns = ['GENE', 'cohort_synonymous']
cohort_syn_mutdensity_df = cohort_syn_mutdensity_df.set_index("GENE")

init_cohort_syn_df = pd.DataFrame(index = all_possible_genes)
cohort_syn_df = pd.concat((init_cohort_syn_df, cohort_syn_mutdensity_df), axis = 1)

# filling the null mutation densities with the value of the 1st decile
cohort_syn_df = cohort_syn_df.fillna(cohort_syn_df[~(cohort_syn_df.isna())].quantile(.1)).reset_index()
cohort_syn_df.columns = ['GENE', 'cohort_synonymous']
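A single-column sketch of the back-fill above, with invented cohort densities: genes with no cohort estimate are given the 1st decile (10th percentile) of the observed values as a conservative floor.

```python
import pandas as pd

# hypothetical cohort-level synonymous densities; GENE3 has no estimate
cohort_syn_df = pd.DataFrame(
    {"cohort_synonymous": [2.0, 0.5, None, 1.0, 4.0]},
    index=["GENE1", "GENE2", "GENE3", "GENE4", "GENE5"],
)

fill_value = cohort_syn_df["cohort_synonymous"].quantile(0.1)
filled = cohort_syn_df.fillna(fill_value)

print(fill_value)                                # 0.65
print(filled.loc["GENE3", "cohort_synonymous"])  # 0.65, the back-filled floor
```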

mutdensity_df = mutdensity_df_init.merge(cohort_syn_df, on = "GENE")
for impact in ["missense", "truncating", "nonsynonymous_splice"]:
mutdensity_df[f"d_{impact}/d_synonymous"] = mutdensity_df[impact] / mutdensity_df["synonymous"]
mutdensity_df[f"d_{impact}/d_cohort_synonymous"] = mutdensity_df[impact] / mutdensity_df["cohort_synonymous"]
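Toy numbers to illustrate how the two ratios read, for one gene in one sample (densities in mutations per Mb; all values invented):

```python
missense          = 3.0   # non-synonymous density of the sample
synonymous        = 1.5   # the sample's own synonymous density
cohort_synonymous = 1.0   # cohort-level synonymous density for the gene

print(missense / synonymous)         # 2.0 -> d_missense/d_synonymous
print(missense / cohort_synonymous)  # 3.0 -> d_missense/d_cohort_synonymous
# ratios above 1 mean an excess of missense over synonymous mutations,
# a rough, uncalibrated proxy for positive selection
```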

# summary at all_samples level
subset_mutdensities = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == 'all_samples')]
for impact in ["missense", "truncating"]:
print(subset_mutdensities.sort_values(by=f"d_{impact}/d_synonymous", ascending=False)[
["GENE", "SAMPLE_ID", impact, "synonymous", f"d_{impact}/d_synonymous"]
].head(10))


# # summary at sample-level
# subset_mutdensities = mutdensity_df[(mutdensity_df["SAMPLE_ID"] != 'all_samples')]
# for impact in ["missense", "truncating"]:
# print(subset_mutdensities.sort_values(by=f"d_{impact}/d_synonymous", ascending=False)[
# ["GENE", "SAMPLE_ID", impact, "synonymous", f"d_{impact}/d_synonymous"]
# ].head(10))

# TODO implement these different modes if appropriate
# if mode == 'mutations':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]
# elif mode == 'mutated_reads':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]

mutdensity_df.to_csv(f"{output_file}",
header=True,
index=False,
sep="\t")


@click.command()
@click.option('--mutdensities', type=click.Path(exists=True), help='Input mutation density file')
@click.option('--cohort-syn-mutdensities', type=click.Path(exists=True), help='Input cohort synonymous mutation densities')
@click.option('--output', type=click.Path(), help='Output file')
@click.option('--mode', type=click.Choice(['mutations', 'mutated_reads']), default='mutations')
def main(mutdensities, cohort_syn_mutdensities, output, mode):
click.echo("Selecting the gene synonymous mutation densities...")
compute_dnds_proxy(mutdensities, cohort_syn_mutdensities, output, mode)

if __name__ == '__main__':
main()

43 changes: 15 additions & 28 deletions bin/compute_mutdensity.py → bin/mut_density_simple.py
@@ -19,7 +19,7 @@

MUTDENSITY_IMPACT_GROUPS = [False, ["SNV"] , ["INSERTION", "DELETION"], ["SNV", "INSERTION", "DELETION"]]

def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
def mutdensity_sample(maf_df, depths_df, sample_name):
"""
Computes a sample's global mutation density. Returns the mutation density
per Mb, non-adjusted and adjusted by panel
@@ -29,8 +29,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
impact_group_results = list()

# mutation density depth information
sample_features_depth = {"DEPTH" : depths_df.drop_duplicates(subset = ["CHROM", "POS"])[f"{sample_name}"].sum(),
"DEPTH_ADJUSTED": depths_adj_df[f"{sample_name}"].sum()
sample_features_depth = {"DEPTH" : depths_df.drop_duplicates(subset = ["CHROM", "POS"])[f"{sample_name}"].sum()
}

for type_list in MUTDENSITY_IMPACT_GROUPS:
@@ -55,9 +54,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
sample_features["N_MUTATED"] = n_mutated_reads

sample_features["MUTDENSITY_MB"] = ( sample_features["N_MUTS"] / sample_features["DEPTH"] * 1000000 ).astype(float)
sample_features["MUTDENSITY_MB_ADJUSTED"] = ( sample_features["N_MUTS"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float)
sample_features["MUTREADSDENSITY_MB"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH"] * 1000000 ).astype(float)
sample_features["MUTREADSDENSITY_MB_ADJUSTED"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float)

sample_features["GENE"] = "ALL_GENES"
sample_features["MUTTYPES"] = types_included
@@ -70,7 +67,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
return mutdensity_sample


def mutdensity_gene(maf_df, depths_df, depths_adj_df, sample_name):
def mutdensity_gene(maf_df, depths_df, sample_name):
"""
Computes each gene mutation density. Returns the mutation density
both per Mb and Kb sequenced, both non-adjusted and adjusted by panel
@@ -101,21 +98,16 @@ def mutdensity_gene(maf_df, depths_df, depths_adj_df, sample_name):

depths_gene_df = depths_df.groupby("GENE").agg({f"{sample_name}" : "sum" })
depths_gene_df.columns = ["DEPTH"]
depths_adj_gene_df = depths_adj_df.groupby("GENE").agg({f"{sample_name}" : "sum" })
depths_adj_gene_df.columns = ["DEPTH_ADJUSTED"]

mut_rate_mut_reads_df = n_muts_gene.merge(n_mutated_reads, on = "GENE")
depths_depthsadj_gene_df = depths_gene_df.merge(depths_adj_gene_df, on = "GENE")

## merge so that mutation density is computed although the number of mutations is NA (meaning, zero)
mut_depths_df = depths_depthsadj_gene_df.merge(mut_rate_mut_reads_df, on = "GENE", how = 'left')
mut_depths_df = mut_depths_df.fillna(0) # I think this is not needed
mut_depths_df = depths_gene_df.merge(mut_rate_mut_reads_df, on = "GENE", how = 'left')
mut_depths_df = mut_depths_df.fillna(0)

# mutation density metrics
mut_depths_df["MUTDENSITY_MB"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH"] * 1000000).astype(float)
mut_depths_df["MUTDENSITY_MB_ADJUSTED"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float)

mut_depths_df["MUTREADSDENSITY_MB"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH"] * 1000000).astype(float)
mut_depths_df["MUTREADSDENSITY_MB_ADJUSTED"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float)

mut_depths_df["MUTTYPES"] = types_included
impact_group_results.append(mut_depths_df.reset_index())
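For reference, the per-Mb metrics above are plain ratios of counts to summed depth; a toy calculation with invented numbers:

```python
# 5 mutations (or mutated reads) observed over a summed per-position
# sequencing depth of 2,500,000 for this sample and gene
n_muts = 5
summed_depth = 2_500_000

mutdensity_mb = n_muts / summed_depth * 1_000_000
print(mutdensity_mb)  # 2.0 mutations per megabase sequenced
```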
@@ -137,25 +129,21 @@ def load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name):
## mode 1: each position counts once (once per gene; be careful that it might be duplicated in different genes)
depths_subset_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]].drop_duplicates(),
on = ["CHROM", "POS"], how = "inner")
## mode 2 (adjusted): each position counts as many times it contributes to the panel
depths_df[sample_name] = depths_df[sample_name] / 3 # the depth per position can contribute to three different mutations
depths_subset_adj_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]], on = ["CHROM", "POS"], how = "inner")

## mode 3 (adjusted): each position counts as many times it contributes to the panel, but ONLY ONCE PER SAMPLE
depths_subset_adj_sample_df = depths_df.merge(annot_panel_df.drop_duplicates(subset = ["CHROM", "POS", "REF", "ALT"])[["CHROM", "POS"]],
on = ["CHROM", "POS"], how = "inner")
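A toy check of the retained mode 1 merge above (the two adjusted modes are removed in this PR); the panel rows and sample column `S1` are invented. Because of the `drop_duplicates` over `CHROM`, `POS` and `GENE`, a position listed with several ALTs is counted once per gene, while a position shared by two genes still contributes to both:

```python
import pandas as pd

# toy panel annotation: position 100 is listed with two ALTs in GENE_A
# and also overlaps GENE_B
annot_panel_df = pd.DataFrame({
    "CHROM": ["chr1", "chr1", "chr1"],
    "POS":   [100, 100, 100],
    "REF":   ["C", "C", "C"],
    "ALT":   ["A", "T", "A"],
    "GENE":  ["GENE_A", "GENE_A", "GENE_B"],
})

depths_df = pd.DataFrame({"CHROM": ["chr1"], "POS": [100], "S1": [500]})

depths_subset_df = depths_df.merge(
    annot_panel_df[["CHROM", "POS", "GENE"]].drop_duplicates(),
    on=["CHROM", "POS"], how="inner")

print(depths_subset_df)  # two rows: depth 500 once for GENE_A, once for GENE_B
```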

# Add domains and exons to maf_df
annot_panel_df['CHROM_POS'] = annot_panel_df['CHROM'].astype(str) + ':' + annot_panel_df['POS'].astype(str)
maf_df_raw['CHROM_POS'] = maf_df_raw['MUT_ID'].str.split('_', expand = True)[0]

maf_df = maf_df_raw.merge(annot_panel_df[['CHROM_POS', 'GENE']], on = ['CHROM_POS'], how = 'left', suffixes=['','_subgenic']).reset_index(drop=True)
maf_df = maf_df_raw.merge(annot_panel_df[['CHROM_POS', 'GENE']],
on = ['CHROM_POS'], how = 'left',
suffixes=['','_subgenic']).reset_index(drop=True)

maf_df = maf_df.drop(columns = ['GENE', 'CHROM_POS'])
maf_df = maf_df.rename(columns={ 'GENE_subgenic' : 'GENE'})

maf_df = maf_df.drop_duplicates()
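A sketch of the GENE reassignment above, assuming `MUT_ID` keys look like `chr1:100_C>T` and that the panel's GENE column carries sub-genic labels such as domains or exons (both assumptions; only the join logic is taken from the code):

```python
import pandas as pd

annot_panel_df = pd.DataFrame({"CHROM": ["chr1"], "POS": [100],
                               "GENE": ["GENE_A_exon2"]})
annot_panel_df["CHROM_POS"] = (annot_panel_df["CHROM"].astype(str) + ":" +
                               annot_panel_df["POS"].astype(str))

maf_df_raw = pd.DataFrame({"MUT_ID": ["chr1:100_C>T"], "GENE": ["GENE_A"]})
maf_df_raw["CHROM_POS"] = maf_df_raw["MUT_ID"].str.split("_", expand=True)[0]

# reattach each mutation to the (possibly sub-genic) GENE label of the panel
maf_df = maf_df_raw.merge(annot_panel_df[["CHROM_POS", "GENE"]],
                          on=["CHROM_POS"], how="left",
                          suffixes=["", "_subgenic"])
maf_df = (maf_df.drop(columns=["GENE", "CHROM_POS"])
                .rename(columns={"GENE_subgenic": "GENE"})
                .drop_duplicates())

print(maf_df)  # the mutation is now labelled with the sub-genic "GENE_A_exon2"
```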

return maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df
return maf_df, depths_subset_df


# -- Main function -- #
@@ -166,14 +154,14 @@ def compute_mutdensity(maf_path, depths_path, annot_panel_path, sample_name, pan
the panel composition. It saves the results to a TSV file.
"""

maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df = load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name)
maf_df, depths_subset_df = load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name)

# Compute mutation densities
## sample mutation density
mutdensity_sample_df = mutdensity_sample(maf_df, depths_subset_df, depths_subset_adj_sample_df, sample_name)
mutdensity_sample_df = mutdensity_sample(maf_df, depths_subset_df, sample_name)

## per gene mutation density
mutdensity_genes_df = mutdensity_gene(maf_df, depths_subset_df, depths_subset_adj_df, sample_name)
mutdensity_genes_df = mutdensity_gene(maf_df, depths_subset_df, sample_name)

mutdensity_df = pd.concat([mutdensity_sample_df, mutdensity_genes_df])

@@ -184,8 +172,7 @@
mutdensity_df[["SAMPLE_ID", "GENE", "REGIONS", "MUTTYPES",
"DEPTH",
"N_MUTS", "N_MUTATED",
"MUTDENSITY_MB", "MUTDENSITY_MB_ADJUSTED",
"MUTREADSDENSITY_MB", "MUTREADSDENSITY_MB_ADJUSTED",
"MUTDENSITY_MB", "MUTREADSDENSITY_MB"
]].to_csv(f"{sample_name}.{panel_v}.mutdensities.tsv",
sep = "\t",
header = True,
23 changes: 15 additions & 8 deletions bin/omega_select_mutdensity.py
@@ -8,19 +8,26 @@

def select_syn_mutdensity(mutdensity_file, output_file, mode):
"""
INFO
This function selects the synonymous mutation densities for all genes
from the mutation density file of all samples.

The `mode` option is not used yet, since only one type of synonymous
mutation density is currently computed.
"""

mutdensity_df = pd.read_csv(mutdensity_file, sep = "\t", header = 0, na_values = custom_na_values)

synonymous_mutdensities_all_samples = mutdensity_df[(mutdensity_df["MUTTYPES"] == "SNV") &
(mutdensity_df["GENE"] != "ALL_GENES") &
~(mutdensity_df["GENE"].str.contains("--"))].reset_index(drop = True)
synonymous_mutdensities_all_samples = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == 'all_samples') &
~(mutdensity_df["GENE"].str.contains("--"))
].reset_index(drop = True)
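A toy illustration of the filter above: only cohort-level (`all_samples`) rows are kept, and genes whose names contain `--` (presumably concatenated multi-region labels) are dropped. The table below is invented:

```python
import pandas as pd

mutdensity_df = pd.DataFrame({
    "SAMPLE_ID":  ["S1", "all_samples", "all_samples"],
    "GENE":       ["GENE_A", "GENE_A", "GENE_A--GENE_B"],
    "synonymous": [1.2, 0.8, 0.4],
})

selected = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == "all_samples") &
                         ~(mutdensity_df["GENE"].str.contains("--"))
                         ].reset_index(drop=True)

print(selected)  # only the cohort-level GENE_A row remains
```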

if mode == 'mutations':
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'MUTDENSITY_MB_ADJUSTED']]
elif mode == 'mutated_reads':
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'MUTREADSDENSITY_MB_ADJUSTED']]
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']].copy()

# TODO implement these different modes if appropriate
# if mode == 'mutations':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]
# elif mode == 'mutated_reads':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]

synonymous_mutdensities_genes.columns = ["GENE", "MUTDENSITY"]
synonymous_mutdensities_genes.to_csv(f"{output_file}",
28 changes: 13 additions & 15 deletions bin/plot_explore_variability.py
@@ -84,15 +84,13 @@ def mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = '',

def adj_mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = '',
config_datasets = {
"all" : ({"MUTTYPES": 'all_types', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"all protein-affecting" : ({"MUTTYPES": 'all_types', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"all non-protein-affecting" : ({"MUTTYPES": 'all_types', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB'),
"SNVs" : ({"MUTTYPES": 'SNV', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"SNVs protein-affecting" : ({"MUTTYPES": 'SNV', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"SNVs non-protein-affecting" : ({"MUTTYPES": 'SNV', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB'),
"INDELs" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"INDELs protein-affecting" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"INDELs non-protein-affecting" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB')
"synonymous" : "synonymous",
"missense" : "missense",
"nonsense" : "nonsense",
"essential_splice" : "essential_splice",
"truncating" : "truncating",
"nonsynonymous_splice" : "nonsynonymous_splice",
"all_impacts" : "all_impacts",
}
):
"""
@@ -105,13 +103,13 @@ def adj_mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = ''
print("No data available for the selected samples/groups")
return

pdf_filename = f"{outdir}/{prefix}mut_density_heatmaps.pdf"
pdf_filename = f"{outdir}/{prefix}_adjusted_mut_density_heatmaps.pdf"
with PdfPages(pdf_filename) as pdf:

for title, (config, value) in config_datasets.items():
print("Creating heatmap for:", title, config, value)
filtered_data = filter_data_from_config(data, config)
# print(filtered_data[['GENE', 'SAMPLE_ID', value]].head())
for title, value in config_datasets.items():
print("Creating heatmap for:", title)
filtered_data = data[["GENE", "SAMPLE_ID", value]]

# Create a pivot table for the heatmap
heatmap_data = filtered_data.pivot_table(index='GENE', columns='SAMPLE_ID', values=value)
heatmap_data = heatmap_data.reindex(index=genes_list, columns=samples_list)
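A minimal sketch of the pivot above, with invented long-format data for one impact column (`synonymous`); the fixed `genes_list`/`samples_list` reindex keeps rows and columns aligned across the heatmaps in the PDF, leaving NaN gaps where a combination is missing:

```python
import pandas as pd

filtered_data = pd.DataFrame({
    "GENE":       ["GENE_A", "GENE_A", "GENE_B"],
    "SAMPLE_ID":  ["S1", "S2", "S1"],
    "synonymous": [1.0, 2.0, 0.5],
})

heatmap_data = filtered_data.pivot_table(index="GENE", columns="SAMPLE_ID",
                                         values="synonymous")
heatmap_data = heatmap_data.reindex(index=["GENE_A", "GENE_B"],
                                    columns=["S1", "S2"])

print(heatmap_data)  # GENE_B / S2 is NaN and shows up as a gap in the heatmap
```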
@@ -208,7 +206,7 @@ def main(outdir, panel_regions, samples_json, all_groups_json, mutdensities, adj
plotting_manager(outdir, genes_list, samples_list, "samples.", data_string, data_objects)
except Exception as e:
print("Error in the process", e)

try:
plotting_manager(outdir, genes_list, groups_names, "groups.", data_string, data_objects)
except Exception as e: