29 changes: 19 additions & 10 deletions bin/mut_density.py → bin/mut_density_adjusted.py
@@ -31,6 +31,8 @@ def get_correction_factor(sample_name, trinucleotide_counts_df, mutability_df, f
triplet_counts = np.array(l)

# genome length in Mb
# accounting for the fact that each position contributes 3*depth,
# one count of its depth per possible mutation at that position
genome_length = sum(triplet_counts) / (3 * 1e6)
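A minimal numeric check of the normalisation above, assuming (as the comment suggests) that `triplet_counts` holds depth-weighted 96-channel counts in which every position contributes its depth to three channels, one per possible alternate allele; the channel layout and numbers below are invented:

```python
import numpy as np

# hypothetical 96-channel counts: 2,000,000 positions, each sequenced at
# depth 1 and contributing one count to three different channels
triplet_counts = np.zeros(96)
triplet_counts[:3] = 2_000_000

genome_length = triplet_counts.sum() / (3 * 1e6)
print(genome_length)  # 2.0 -> an effective length of 2 Mb
```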

# vector of relative mutabilities in 96-channel canonical sorting
@@ -61,13 +63,17 @@ def mutation_density(sample_name, depths_file, somatic_mutations_file, mutabilit

for csqn, csqn_set in broadimpact_grouping_dict_with_synonymous.items():

for gene in panel_df['GENE'].unique():
for gene in list(panel_df['GENE'].unique()) + ["ALL_GENES"]:

# compute vector of sum of depths per trinucleotide context
# tailored to the specific gene-impact target
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set)) & (panel_df['GENE'] == gene)].copy()
if gene == 'ALL_GENES':
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set))][['CHROM', 'POS', 'REF', 'ALT']].drop_duplicates()
else:
region_df = panel_df[(panel_df['IMPACT'].isin(csqn_set)) & (panel_df['GENE'] == gene)][['CHROM', 'POS', 'REF', 'ALT']].drop_duplicates()

# counting every position once
# counting every position as many times as the number of possible
# mutations of the selected consequences at that position (1, 2 or 3)
dh = pd.merge(region_df[['CHROM', 'POS']],
depths_df[['CHROM', 'POS', 'CONTEXT', sample_name]],
on=['CHROM', 'POS'], how='left')
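A small sketch of how the merge above weights positions, with toy dataframes and a hypothetical sample column `S1`: a position listed with two qualifying ALTs contributes its depth twice, a position with a single ALT contributes it once.

```python
import pandas as pd

# toy region: position 100 has two qualifying ALTs, position 200 has one
region_df = pd.DataFrame({
    "CHROM": ["chr1", "chr1", "chr1"],
    "POS":   [100, 100, 200],
    "REF":   ["C", "C", "G"],
    "ALT":   ["A", "T", "A"],
})

# toy per-position depths for the hypothetical sample "S1"
depths_df = pd.DataFrame({
    "CHROM":   ["chr1", "chr1"],
    "POS":     [100, 200],
    "CONTEXT": ["ACA", "TGT"],
    "S1":      [500, 300],
})

dh = pd.merge(region_df[["CHROM", "POS"]],
              depths_df[["CHROM", "POS", "CONTEXT", "S1"]],
              on=["CHROM", "POS"], how="left")

# position 100 is counted twice, position 200 once: 500 + 500 + 300
print(dh["S1"].sum())  # 1300
```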
@@ -88,10 +94,13 @@ def mutation_density(sample_name, depths_file, somatic_mutations_file, mutabilit
except AssertionError:
res.loc[gene, csqn] = None
continue

# observed somatic mutations

n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set)) & (somatic_mutations_df['GENE'] == gene)].shape[0]

# observed somatic mutations
if gene == 'ALL_GENES':
n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set))].shape[0]
else:
n = somatic_mutations_df[(somatic_mutations_df['IMPACT'].isin(csqn_set)) & (somatic_mutations_df['GENE'] == gene)].shape[0]

res.loc[gene, csqn] = n / effective_length

@@ -149,12 +158,12 @@ def main(sample_name, depths_file, somatic_mutations_file, mutability_file, pane
logfoldchange_plot(sample_name, res, res_flat)

# save results
res["SAMPLE"] = sample_name
res_flat["SAMPLE"] = sample_name
res["SAMPLE_ID"] = sample_name
res_flat["SAMPLE_ID"] = sample_name
res.index.name = 'GENE'
res_flat.index.name = 'GENE'
res[['SAMPLE'] + [col for col in res.columns if col != 'SAMPLE']].to_csv(f'{sample_name}.mutdensities.tsv', sep='\t')
res_flat[['SAMPLE'] + [col for col in res_flat.columns if col != 'SAMPLE']].to_csv(f'{sample_name}.mutdensities_flat.tsv', sep='\t')
res[['SAMPLE_ID'] + [col for col in res.columns if col != 'SAMPLE_ID']].to_csv(f'{sample_name}.mutdensities.tsv', sep='\t')
res_flat[['SAMPLE_ID'] + [col for col in res_flat.columns if col != 'SAMPLE_ID']].to_csv(f'{sample_name}.mutdensities_flat.tsv', sep='\t')



77 changes: 77 additions & 0 deletions bin/mut_density_adjusted_dnds.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python


import click
import pandas as pd
import numpy as np
from read_utils import custom_na_values


def compute_dnds_proxy(mutdensity_file, cohort_syn_mutdensities_file, output_file, mode):
"""
Compute a dN/dS-like proxy per gene and sample by dividing each
non-synonymous mutation density (missense, truncating, nonsynonymous_splice)
by the sample's synonymous mutation density and by the cohort-level
synonymous density of the same gene.

TODO: store a log file alongside the output that can be used to check
some basic statistics.

The `mode` option is not used yet, since only one type of synonymous
mutation density is currently computed.
"""

mutdensity_df_init = pd.read_csv(mutdensity_file, sep = "\t", header = 0, na_values = custom_na_values)
mutdensity_df_init["synonymous"] = mutdensity_df_init["synonymous"].replace(0, np.nan)
all_possible_genes = list(mutdensity_df_init["GENE"].unique())

cohort_syn_mutdensity_df = pd.read_csv(cohort_syn_mutdensities_file, sep = "\t", header = 0, na_values = custom_na_values)
cohort_syn_mutdensity_df.columns = ['GENE', 'cohort_synonymous']
cohort_syn_mutdensity_df = cohort_syn_mutdensity_df.set_index("GENE")

init_cohort_syn_df = pd.DataFrame(index = all_possible_genes)
cohort_syn_df = pd.concat((init_cohort_syn_df, cohort_syn_mutdensity_df), axis = 1)

# filling the null mutation densities with the value of the 1st decile
cohort_syn_df = cohort_syn_df.fillna(cohort_syn_df[~(cohort_syn_df.isna())].quantile(.1)).reset_index()
cohort_syn_df.columns = ['GENE', 'cohort_synonymous']
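A single-column sketch of the back-fill above, with invented cohort densities: genes with no cohort estimate are given the 1st decile (10th percentile) of the observed values as a conservative floor.

```python
import pandas as pd

# hypothetical cohort-level synonymous densities; GENE3 has no estimate
cohort_syn_df = pd.DataFrame(
    {"cohort_synonymous": [2.0, 0.5, None, 1.0, 4.0]},
    index=["GENE1", "GENE2", "GENE3", "GENE4", "GENE5"],
)

fill_value = cohort_syn_df["cohort_synonymous"].quantile(0.1)
filled = cohort_syn_df.fillna(fill_value)

print(fill_value)                                # 0.65
print(filled.loc["GENE3", "cohort_synonymous"])  # 0.65, the back-filled floor
```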

mutdensity_df = mutdensity_df_init.merge(cohort_syn_df, on = "GENE")
for impact in ["missense", "truncating", "nonsynonymous_splice"]:
mutdensity_df[f"d_{impact}/d_synonymous"] = mutdensity_df[impact] / mutdensity_df["synonymous"]
mutdensity_df[f"d_{impact}/d_cohort_synonymous"] = mutdensity_df[impact] / mutdensity_df["cohort_synonymous"]
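Toy numbers to illustrate how the two ratios read, for one gene in one sample (densities in mutations per Mb; all values invented):

```python
missense          = 3.0   # non-synonymous density of the sample
synonymous        = 1.5   # the sample's own synonymous density
cohort_synonymous = 1.0   # cohort-level synonymous density for the gene

print(missense / synonymous)         # 2.0 -> d_missense/d_synonymous
print(missense / cohort_synonymous)  # 3.0 -> d_missense/d_cohort_synonymous
# ratios above 1 mean an excess of missense over synonymous mutations,
# a rough, uncalibrated proxy for positive selection
```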

# summary at all_samples level
subset_mutdensities = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == 'all_samples')]
for impact in ["missense", "truncating"]:
print(subset_mutdensities.sort_values(by=f"d_{impact}/d_synonymous", ascending=False)[
["GENE", "SAMPLE_ID", impact, "synonymous", f"d_{impact}/d_synonymous"]
].head(10))


# # summary at sample-level
# subset_mutdensities = mutdensity_df[(mutdensity_df["SAMPLE_ID"] != 'all_samples')]
# for impact in ["missense", "truncating"]:
# print(subset_mutdensities.sort_values(by=f"d_{impact}/d_synonymous", ascending=False)[
# ["GENE", "SAMPLE_ID", impact, "synonymous", f"d_{impact}/d_synonymous"]
# ].head(10))

# TODO implement these different modes if appropriate
# if mode == 'mutations':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]
# elif mode == 'mutated_reads':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]

mutdensity_df.to_csv(f"{output_file}",
header=True,
index=False,
sep="\t")


@click.command()
@click.option('--mutdensities', type=click.Path(exists=True), help='Input mutation density file')
@click.option('--cohort-syn-mutdensities', type=click.Path(exists=True), help='Input cohort synonymous mutation densities')
@click.option('--output', type=click.Path(), help='Output file')
@click.option('--mode', type=click.Choice(['mutations', 'mutated_reads']), default='mutations')
def main(mutdensities, cohort_syn_mutdensities, output, mode):
click.echo("Selecting the gene synonymous mutation densities...")
compute_dnds_proxy(mutdensities, cohort_syn_mutdensities, output, mode)

if __name__ == '__main__':
main()

43 changes: 15 additions & 28 deletions bin/compute_mutdensity.py → bin/mut_density_simple.py
@@ -19,7 +19,7 @@

MUTDENSITY_IMPACT_GROUPS = [False, ["SNV"] , ["INSERTION", "DELETION"], ["SNV", "INSERTION", "DELETION"]]

def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
def mutdensity_sample(maf_df, depths_df, sample_name):
"""
Computes a sample's global mutation density. Returns the mutation density
per Mb, non-adjusted and adjusted by panel
@@ -29,8 +29,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
impact_group_results = list()

# mutation density depth information
sample_features_depth = {"DEPTH" : depths_df.drop_duplicates(subset = ["CHROM", "POS"])[f"{sample_name}"].sum(),
"DEPTH_ADJUSTED": depths_adj_df[f"{sample_name}"].sum()
sample_features_depth = {"DEPTH" : depths_df.drop_duplicates(subset = ["CHROM", "POS"])[f"{sample_name}"].sum()
}

for type_list in MUTDENSITY_IMPACT_GROUPS:
@@ -55,9 +54,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
sample_features["N_MUTATED"] = n_mutated_reads

sample_features["MUTDENSITY_MB"] = ( sample_features["N_MUTS"] / sample_features["DEPTH"] * 1000000 ).astype(float)
sample_features["MUTDENSITY_MB_ADJUSTED"] = ( sample_features["N_MUTS"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float)
sample_features["MUTREADSDENSITY_MB"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH"] * 1000000 ).astype(float)
sample_features["MUTREADSDENSITY_MB_ADJUSTED"] = ( sample_features["N_MUTATED"] / sample_features["DEPTH_ADJUSTED"] * 1000000 ).astype(float)

sample_features["GENE"] = "ALL_GENES"
sample_features["MUTTYPES"] = types_included
@@ -70,7 +67,7 @@ def mutdensity_sample(maf_df, depths_df, depths_adj_df, sample_name):
return mutdensity_sample


def mutdensity_gene(maf_df, depths_df, depths_adj_df, sample_name):
def mutdensity_gene(maf_df, depths_df, sample_name):
"""
Computes each gene mutation density. Returns the mutation density
both per Mb and Kb sequenced, both non-adjusted and adjusted by panel
@@ -101,21 +98,16 @@ def mutdensity_gene(maf_df, depths_df, depths_adj_df, sample_name):

depths_gene_df = depths_df.groupby("GENE").agg({f"{sample_name}" : "sum" })
depths_gene_df.columns = ["DEPTH"]
depths_adj_gene_df = depths_adj_df.groupby("GENE").agg({f"{sample_name}" : "sum" })
depths_adj_gene_df.columns = ["DEPTH_ADJUSTED"]

mut_rate_mut_reads_df = n_muts_gene.merge(n_mutated_reads, on = "GENE")
depths_depthsadj_gene_df = depths_gene_df.merge(depths_adj_gene_df, on = "GENE")

## merge so that mutation density is computed although the number of mutations is NA (meaning, zero)
mut_depths_df = depths_depthsadj_gene_df.merge(mut_rate_mut_reads_df, on = "GENE", how = 'left')
mut_depths_df = mut_depths_df.fillna(0) # I think this is not needed
mut_depths_df = depths_gene_df.merge(mut_rate_mut_reads_df, on = "GENE", how = 'left')
mut_depths_df = mut_depths_df.fillna(0)

# mutation density metrics
mut_depths_df["MUTDENSITY_MB"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH"] * 1000000).astype(float)
mut_depths_df["MUTDENSITY_MB_ADJUSTED"] = (mut_depths_df["N_MUTS"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float)

mut_depths_df["MUTREADSDENSITY_MB"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH"] * 1000000).astype(float)
mut_depths_df["MUTREADSDENSITY_MB_ADJUSTED"] = (mut_depths_df["N_MUTATED"] / mut_depths_df["DEPTH_ADJUSTED"] * 1000000).astype(float)

mut_depths_df["MUTTYPES"] = types_included
impact_group_results.append(mut_depths_df.reset_index())
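For reference, the per-Mb metrics above are plain ratios of counts to summed depth; a toy calculation with invented numbers:

```python
# 5 mutations (or mutated reads) observed over a summed per-position
# sequencing depth of 2,500,000 for this sample and gene
n_muts = 5
summed_depth = 2_500_000

mutdensity_mb = n_muts / summed_depth * 1_000_000
print(mutdensity_mb)  # 2.0 mutations per megabase sequenced
```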
@@ -137,25 +129,21 @@ def load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name):
## mode 1: each position counts once (once per gene; be careful that it might be duplicated in different genes)
depths_subset_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]].drop_duplicates(),
on = ["CHROM", "POS"], how = "inner")
## mode 2 (adjusted): each position counts as many times it contributes to the panel
depths_df[sample_name] = depths_df[sample_name] / 3 # the depth per position can contribute to three different mutations
depths_subset_adj_df = depths_df.merge(annot_panel_df[["CHROM", "POS", "GENE"]], on = ["CHROM", "POS"], how = "inner")

## mode 3 (adjusted): each position counts as many times it contributes to the panel, but ONLY ONCE PER SAMPLE
depths_subset_adj_sample_df = depths_df.merge(annot_panel_df.drop_duplicates(subset = ["CHROM", "POS", "REF", "ALT"])[["CHROM", "POS"]],
on = ["CHROM", "POS"], how = "inner")
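A toy check of the retained mode 1 merge above (the two adjusted modes are removed in this PR); the panel rows and sample column `S1` are invented. Because of the `drop_duplicates` over `CHROM`, `POS` and `GENE`, a position listed with several ALTs is counted once per gene, while a position shared by two genes still contributes to both:

```python
import pandas as pd

# toy panel annotation: position 100 is listed with two ALTs in GENE_A
# and also overlaps GENE_B
annot_panel_df = pd.DataFrame({
    "CHROM": ["chr1", "chr1", "chr1"],
    "POS":   [100, 100, 100],
    "REF":   ["C", "C", "C"],
    "ALT":   ["A", "T", "A"],
    "GENE":  ["GENE_A", "GENE_A", "GENE_B"],
})

depths_df = pd.DataFrame({"CHROM": ["chr1"], "POS": [100], "S1": [500]})

depths_subset_df = depths_df.merge(
    annot_panel_df[["CHROM", "POS", "GENE"]].drop_duplicates(),
    on=["CHROM", "POS"], how="inner")

print(depths_subset_df)  # two rows: depth 500 once for GENE_A, once for GENE_B
```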

# Add domains and exons to maf_df
annot_panel_df['CHROM_POS'] = annot_panel_df['CHROM'].astype(str) + ':' + annot_panel_df['POS'].astype(str)
maf_df_raw['CHROM_POS'] = maf_df_raw['MUT_ID'].str.split('_', expand = True)[0]

maf_df = maf_df_raw.merge(annot_panel_df[['CHROM_POS', 'GENE']], on = ['CHROM_POS'], how = 'left', suffixes=['','_subgenic']).reset_index(drop=True)
maf_df = maf_df_raw.merge(annot_panel_df[['CHROM_POS', 'GENE']],
on = ['CHROM_POS'], how = 'left',
suffixes=['','_subgenic']).reset_index(drop=True)

maf_df = maf_df.drop(columns = ['GENE', 'CHROM_POS'])
maf_df = maf_df.rename(columns={ 'GENE_subgenic' : 'GENE'})

maf_df = maf_df.drop_duplicates()
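A sketch of the GENE reassignment above, assuming `MUT_ID` keys look like `chr1:100_C>T` and that the panel's GENE column carries sub-genic labels such as domains or exons (both assumptions; only the join logic is taken from the code):

```python
import pandas as pd

annot_panel_df = pd.DataFrame({"CHROM": ["chr1"], "POS": [100],
                               "GENE": ["GENE_A_exon2"]})
annot_panel_df["CHROM_POS"] = (annot_panel_df["CHROM"].astype(str) + ":" +
                               annot_panel_df["POS"].astype(str))

maf_df_raw = pd.DataFrame({"MUT_ID": ["chr1:100_C>T"], "GENE": ["GENE_A"]})
maf_df_raw["CHROM_POS"] = maf_df_raw["MUT_ID"].str.split("_", expand=True)[0]

# reattach each mutation to the (possibly sub-genic) GENE label of the panel
maf_df = maf_df_raw.merge(annot_panel_df[["CHROM_POS", "GENE"]],
                          on=["CHROM_POS"], how="left",
                          suffixes=["", "_subgenic"])
maf_df = (maf_df.drop(columns=["GENE", "CHROM_POS"])
                .rename(columns={"GENE_subgenic": "GENE"})
                .drop_duplicates())

print(maf_df)  # the mutation is now labelled with the sub-genic "GENE_A_exon2"
```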

return maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df
return maf_df, depths_subset_df


# -- Main function -- #
@@ -166,14 +154,14 @@ def compute_mutdensity(maf_path, depths_path, annot_panel_path, sample_name, pan
the panel composition. It saves the results to a TSV file.
"""

maf_df, depths_subset_df, depths_subset_adj_df, depths_subset_adj_sample_df = load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name)
maf_df, depths_subset_df = load_n_process_inputs(maf_path, depths_path, annot_panel_path, sample_name)

# Compute mutation densities
## sample mutation density
mutdensity_sample_df = mutdensity_sample(maf_df, depths_subset_df, depths_subset_adj_sample_df, sample_name)
mutdensity_sample_df = mutdensity_sample(maf_df, depths_subset_df, sample_name)

## per gene mutation density
mutdensity_genes_df = mutdensity_gene(maf_df, depths_subset_df, depths_subset_adj_df, sample_name)
mutdensity_genes_df = mutdensity_gene(maf_df, depths_subset_df, sample_name)

mutdensity_df = pd.concat([mutdensity_sample_df, mutdensity_genes_df])

@@ -184,8 +172,7 @@
mutdensity_df[["SAMPLE_ID", "GENE", "REGIONS", "MUTTYPES",
"DEPTH",
"N_MUTS", "N_MUTATED",
"MUTDENSITY_MB", "MUTDENSITY_MB_ADJUSTED",
"MUTREADSDENSITY_MB", "MUTREADSDENSITY_MB_ADJUSTED",
"MUTDENSITY_MB", "MUTREADSDENSITY_MB"
]].to_csv(f"{sample_name}.{panel_v}.mutdensities.tsv",
sep = "\t",
header = True,
23 changes: 15 additions & 8 deletions bin/omega_select_mutdensity.py
@@ -8,19 +8,26 @@

def select_syn_mutdensity(mutdensity_file, output_file, mode):
"""
INFO
This function selects the synonymous mutation densities for all genes
from the mutation density file of all samples.

The `mode` option is not used yet, since only one type of synonymous
mutation density is currently computed.
"""

mutdensity_df = pd.read_csv(mutdensity_file, sep = "\t", header = 0, na_values = custom_na_values)

synonymous_mutdensities_all_samples = mutdensity_df[(mutdensity_df["MUTTYPES"] == "SNV") &
(mutdensity_df["GENE"] != "ALL_GENES") &
~(mutdensity_df["GENE"].str.contains("--"))].reset_index(drop = True)
synonymous_mutdensities_all_samples = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == 'all_samples') &
~(mutdensity_df["GENE"].str.contains("--"))
].reset_index(drop = True)
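A toy illustration of the filter above: only cohort-level (`all_samples`) rows are kept, and genes whose names contain `--` (presumably concatenated multi-region labels) are dropped. The table below is invented:

```python
import pandas as pd

mutdensity_df = pd.DataFrame({
    "SAMPLE_ID":  ["S1", "all_samples", "all_samples"],
    "GENE":       ["GENE_A", "GENE_A", "GENE_A--GENE_B"],
    "synonymous": [1.2, 0.8, 0.4],
})

selected = mutdensity_df[(mutdensity_df["SAMPLE_ID"] == "all_samples") &
                         ~(mutdensity_df["GENE"].str.contains("--"))
                         ].reset_index(drop=True)

print(selected)  # only the cohort-level GENE_A row remains
```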

if mode == 'mutations':
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'MUTDENSITY_MB_ADJUSTED']]
elif mode == 'mutated_reads':
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'MUTREADSDENSITY_MB_ADJUSTED']]
synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']].copy()

# TODO implement these different modes if appropriate
# if mode == 'mutations':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]
# elif mode == 'mutated_reads':
# synonymous_mutdensities_genes = synonymous_mutdensities_all_samples[['GENE', 'synonymous']]

synonymous_mutdensities_genes.columns = ["GENE", "MUTDENSITY"]
synonymous_mutdensities_genes.to_csv(f"{output_file}",
28 changes: 13 additions & 15 deletions bin/plot_explore_variability.py
@@ -84,15 +84,13 @@ def mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = '',

def adj_mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = '',
config_datasets = {
"all" : ({"MUTTYPES": 'all_types', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"all protein-affecting" : ({"MUTTYPES": 'all_types', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"all non-protein-affecting" : ({"MUTTYPES": 'all_types', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB'),
"SNVs" : ({"MUTTYPES": 'SNV', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"SNVs protein-affecting" : ({"MUTTYPES": 'SNV', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"SNVs non-protein-affecting" : ({"MUTTYPES": 'SNV', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB'),
"INDELs" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'all'}, 'MUTDENSITY_MB'),
"INDELs protein-affecting" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'protein_affecting'}, 'MUTDENSITY_MB'),
"INDELs non-protein-affecting" : ({"MUTTYPES": 'DELETION-INSERTION', "REGIONS": 'non_protein_affecting'}, 'MUTDENSITY_MB')
"synonymous" : "synonymous",
"missense" : "missense",
"nonsense" : "nonsense",
"essential_splice" : "essential_splice",
"truncating" : "truncating",
"nonsynonymous_splice" : "nonsynonymous_splice",
"all_impacts" : "all_impacts",
}
):
"""
@@ -105,13 +103,13 @@ def adj_mut_density_heatmaps(data, genes_list, samples_list, outdir, prefix = ''
print("No data available for the selected samples/groups")
return

pdf_filename = f"{outdir}/{prefix}mut_density_heatmaps.pdf"
pdf_filename = f"{outdir}/{prefix}_adjusted_mut_density_heatmaps.pdf"
with PdfPages(pdf_filename) as pdf:

for title, (config, value) in config_datasets.items():
print("Creating heatmap for:", title, config, value)
filtered_data = filter_data_from_config(data, config)
# print(filtered_data[['GENE', 'SAMPLE_ID', value]].head())
for title, value in config_datasets.items():
print("Creating heatmap for:", title)
filtered_data = data[["GENE", "SAMPLE_ID", value]]

# Create a pivot table for the heatmap
heatmap_data = filtered_data.pivot_table(index='GENE', columns='SAMPLE_ID', values=value)
heatmap_data = heatmap_data.reindex(index=genes_list, columns=samples_list)
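A minimal sketch of the pivot above, with invented long-format data for one impact column (`synonymous`); the fixed `genes_list`/`samples_list` reindex keeps rows and columns aligned across the heatmaps in the PDF, leaving NaN gaps where a combination is missing:

```python
import pandas as pd

filtered_data = pd.DataFrame({
    "GENE":       ["GENE_A", "GENE_A", "GENE_B"],
    "SAMPLE_ID":  ["S1", "S2", "S1"],
    "synonymous": [1.0, 2.0, 0.5],
})

heatmap_data = filtered_data.pivot_table(index="GENE", columns="SAMPLE_ID",
                                         values="synonymous")
heatmap_data = heatmap_data.reindex(index=["GENE_A", "GENE_B"],
                                    columns=["S1", "S2"])

print(heatmap_data)  # GENE_B / S2 is NaN and shows up as a gap in the heatmap
```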
@@ -208,7 +206,7 @@ def main(outdir, panel_regions, samples_json, all_groups_json, mutdensities, adj
plotting_manager(outdir, genes_list, samples_list, "samples.", data_string, data_objects)
except Exception as e:
print("Error in the process", e)

try:
plotting_manager(outdir, genes_list, groups_names, "groups.", data_string, data_objects)
except Exception as e: