diff --git a/build/conda/meta.yaml b/build/conda/meta.yaml index 40c35d3..defca77 100644 --- a/build/conda/meta.yaml +++ b/build/conda/meta.yaml @@ -35,6 +35,7 @@ requirements: - pysam - pyyaml - i18nice + - pydantic # tools - minimap2 - samtools >=1.20 diff --git a/environments/run.yml b/environments/run.yml index 951751d..1a49e17 100644 --- a/environments/run.yml +++ b/environments/run.yml @@ -3,7 +3,7 @@ channels: - conda-forge - bioconda dependencies: - - python >= 3.10 + - python >= 3.10 - minimap2 - samtools >=1.20 - bcftools >=1.20 @@ -19,5 +19,6 @@ dependencies: - platformdirs - pyyaml - i18nice + - pydantic - pip - rsync diff --git a/scripts/run_realtime.sh b/scripts/run_realtime.sh index 3efd8d5..0642eed 100755 --- a/scripts/run_realtime.sh +++ b/scripts/run_realtime.sh @@ -1,8 +1,7 @@ # Example of how to run nomadic realtime # 2023/07/12, J.Hendry -nomadic realtime \ --e 0000-00-00_example \ +nomadic realtime 0000-00-00_example \ -f example_data/minknow/fastq_pass \ -m example_data/metadata/sample_info.csv \ -b example_data/beds/nomads8.amplicons.bed --call diff --git a/setup.cfg b/setup.cfg index 13b18e4..8b52b71 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,7 @@ install_requires = pyyaml seaborn i18nice + pydantic python_requires = >=3.10 zip_safe = no @@ -31,6 +32,9 @@ zip_safe = no nomadic.realtime.dashboard = assets/* translations/* +nomadic.summarize.dashboard = + assets/* + translations/* nomadic.start = data/** diff --git a/src/nomadic/cli.py b/src/nomadic/cli.py index ab3b724..d55236f 100644 --- a/src/nomadic/cli.py +++ b/src/nomadic/cli.py @@ -8,6 +8,7 @@ from nomadic.process.commands import process from nomadic.realtime.commands import realtime from nomadic.start.commands import start +from nomadic.summarize.commands import summarize # From: https://stackoverflow.com/questions/47972638/how-can-i-define-the-order-of-click-sub-commands-in-help @@ -36,4 +37,5 @@ def cli(): cli.add_command(realtime) cli.add_command(process) cli.add_command(dashboard) +cli.add_command(summarize) cli.add_command(backup) diff --git a/src/nomadic/dashboard/main.py b/src/nomadic/dashboard/main.py index fe14fc1..a14afc9 100644 --- a/src/nomadic/dashboard/main.py +++ b/src/nomadic/dashboard/main.py @@ -1,57 +1,9 @@ import os from nomadic.realtime.dashboard.builders import MappingRTDashboard, CallingRTDashboard -from nomadic.util.metadata import MetadataTableParser -from nomadic.util.experiment import ExperimentDirectories -from nomadic.util.regions import RegionBEDParser +from nomadic.util.experiment import ExperimentDirectories, find_metadata, find_regions from nomadic.util.settings import load_settings -def find_metadata(input_dir: str) -> MetadataTableParser: - """ - Given an experiment directory, search for the metadata CSV file in thee - expected location - - """ - - metadata_dir = os.path.join(input_dir, "metadata") - csvs = [ - f"{metadata_dir}/{file}" - for file in os.listdir(metadata_dir) - if file.endswith(".csv") - and not file.startswith("._") # ignore AppleDouble files - ] # TODO: what about no-suffix files? - - if len(csvs) != 1: # Could alternatively load and LOOK - raise FileNotFoundError( - f"Expected one metadata CSV file (*.csv) at {metadata_dir}, but found {len(csvs)}." 
- ) - - return MetadataTableParser(csvs[0]) - - -def find_regions(input_dir: str) -> RegionBEDParser: - """ - Given an experiment directory, search for the metadata CSV file in thee - expected location - - TODO: Bad duplication from above, can write inner function - """ - - metadata_dir = os.path.join(input_dir, "metadata") - beds = [ - f"{metadata_dir}/{file}" - for file in os.listdir(metadata_dir) - if file.endswith(".bed") and not file.endswith(".lowcomplexity_mask.bed") - ] # TODO: what about no-suffix files? - - if len(beds) != 1: # Could alternatively load and LOOK - raise FileNotFoundError( - f"Expected one region BED file (*.bed) at {metadata_dir}, but found {len(beds)}." - ) - - return RegionBEDParser(beds[0]) - - def variant_calling_performed(expt_dirs: ExperimentDirectories) -> bool: """ Check if the variant calling TSV is present diff --git a/src/nomadic/realtime/dashboard/components.py b/src/nomadic/realtime/dashboard/components.py index cf4190b..1d92eab 100644 --- a/src/nomadic/realtime/dashboard/components.py +++ b/src/nomadic/realtime/dashboard/components.py @@ -394,7 +394,7 @@ def _update(_, selected_categories): if "sample_id" not in df.columns: # for backwards compatibility to be able to show old experiments where this column was not in the data - df = df.join(self.metadata.required_metadata, on="barcode") + df = df.join(self.metadata.sample_ids_df, on="barcode") x = df.apply( sample_string_from_row, @@ -578,7 +578,7 @@ def _update(_, dropdown_stat, shift=0): ) if "sample_id" not in df.columns: # for backwards compatibility to be able to show old experiments where this column was not in the data - df = df.join(self.metadata.required_metadata, on="barcode") + df = df.join(self.metadata.sample_ids_df, on="barcode") # Prepare plotting data # plot_data = [ @@ -1126,9 +1126,7 @@ def _update(_, target_region): target_df = df.query(qry) if "sample_id" not in target_df.columns: # for backwards compatibility to be able to show old experiments where this column was not in the data - target_df = target_df.join( - self.metadata.required_metadata, on="barcode" - ) + target_df = target_df.join(self.metadata.sample_ids_df, on="barcode") # Munge for plot target_df["sample_string"] = pd.Categorical( diff --git a/src/nomadic/realtime/pipelines/experiment.py b/src/nomadic/realtime/pipelines/experiment.py index 608f642..0122787 100644 --- a/src/nomadic/realtime/pipelines/experiment.py +++ b/src/nomadic/realtime/pipelines/experiment.py @@ -74,7 +74,7 @@ def _run_fastq(self): except FileNotFoundError: continue df = pd.DataFrame(fastq_dts) - df = df.join(self.metadata.required_metadata, on="barcode") + df = df.join(self.metadata.sample_ids_df, on="barcode") df_path = self.expt_dirs.get_summary_files().fastqs_processed df.to_csv(df_path, index=False) @@ -97,7 +97,7 @@ def _run_qcbams(self): except FileNotFoundError: continue df = pd.DataFrame(qcbams_dts) - df = df.join(self.metadata.required_metadata, on="barcode") + df = df.join(self.metadata.sample_ids_df, on="barcode") df_path = self.expt_dirs.get_summary_files().read_mapping df.to_csv(df_path, index=False) @@ -117,7 +117,7 @@ def _run_bedcov(self): except FileNotFoundError: continue bedcov_df = pd.concat(bedcov_dfs) - bedcov_df = bedcov_df.join(self.metadata.required_metadata, on="barcode") + bedcov_df = bedcov_df.join(self.metadata.sample_ids_df, on="barcode") df_path = self.expt_dirs.get_summary_files().region_coverage bedcov_df.to_csv(df_path, index=False) @@ -137,7 +137,7 @@ def _run_depth(self): except FileNotFoundError: continue 
depth_df = pd.concat(depth_dfs) - depth_df = depth_df.join(self.metadata.required_metadata, on="barcode") + depth_df = depth_df.join(self.metadata.sample_ids_df, on="barcode") df_path = self.expt_dirs.get_summary_files().depth_profiles depth_df.to_csv(df_path, index=False) @@ -192,7 +192,7 @@ def _run_variant(self, caller: str): annotator.convert_to_csv(temp_path) df = pd.read_csv(temp_path) - df = df.join(self.metadata.required_metadata, on="barcode") + df = df.join(self.metadata.sample_ids_df, on="barcode") df.to_csv(csv_path, index=False) # Clean-up diff --git a/src/nomadic/summarize/__init__.py b/src/nomadic/summarize/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nomadic/summarize/commands.py b/src/nomadic/summarize/commands.py new file mode 100644 index 0000000..ef728ea --- /dev/null +++ b/src/nomadic/summarize/commands.py @@ -0,0 +1,171 @@ +from pathlib import Path + +import click + +from nomadic.util.exceptions import MetadataFormatError +from nomadic.util.workspace import Workspace, check_if_workspace + + +@click.command( + short_help="Summarize a set of experiments.", +) +@click.argument( + "experiment_dirs", + type=click.Path(exists=True), + nargs=-1, # allow multiple arguments; gets passed as tuple +) +@click.option( + "-w", + "--workspace", + "workspace_path", + default="./", + show_default="current directory", + type=click.Path(exists=True, file_okay=False, dir_okay=True), + help="Path of the workspace where all input/output files (beds, metadata, results) are stored. " + "The workspace directory simplifies the use of nomadic in that many arguments don't need to be listed " + "as they are predefined in the workspace config or can be loaded from the workspace", +) +@click.option( + "-m", + "--metadata_csv", + type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path), + help="Path to the master metadata CSV file.", + show_default="/metadata/.csv", +) +@click.option( + "-n", + "--summary_name", + type=str, + help="Name of summary", + show_default="name of the workspace.", +) +@click.option( + "--prevalence-by", + type=str, + help="Column to calculate prevalence by for output files.", + multiple=True, +) +@click.option( + "--dashboard/--no-dashboard", + default=True, + help="Whether to start the web dashboard to look at the summary.", +) +@click.option( + "--only-dashboard", + type=bool, + is_flag=True, + default=False, + help="If set, only open the summary dashboard in the web browser. Can not be combined with --no-dashboard.", +) +@click.option( + "-s", + "--settings-file", + type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path), + show_default="/metadata/.yaml", + help="Path to the summary settings YAML file.", +) +@click.option( + "--no-master-metadata", + is_flag=True, + default=False, + help="If set, no master metadata CSV needs to be provided. This is not recommended, as it's better to be explicit about the samples to be included, but it can be used to quickly get an overview of the data in the workspace.", +) +@click.option( + "--qc-min-coverage", + type=int, + default=50, + show_default=True, + help="Minimum coverage threshold for quality control. Amplicons with less than this coverage will be marked as low coverage.", +) +@click.option( + "--qc-max-contam", + type=float, + default=0.1, + show_default=True, + help="Maximum contamination fraction for quality control. Samples with contamination above this fraction will be marked as contaminated. 
Contamination is defined as the mean coverage of negative controls being more than this fraction of the sample coverage.", +) +@click.option( + "--output-dir", + "-o", + type=click.Path(dir_okay=True, file_okay=False, path_type=Path), + help="Path to the output directory where the summary will be stored. By default, this is /summaries/.", +) +def summarize( + experiment_dirs: tuple[str], + summary_name: str, + workspace_path: str, + output_dir: Path, + metadata_csv: Path, + dashboard: bool, + only_dashboard: bool, + prevalence_by: tuple[str], + settings_file: Path, + no_master_metadata: bool, + qc_min_coverage: int, + qc_max_contam: float, +): + """ + Summarize a set of experiments to evaluate quality control and + mutation prevalence. You can either provide a list of folders of experiments, + or if none are provided, all experiments of this workspace will be used. + + """ + if not check_if_workspace(workspace_path): + raise click.BadParameter( + param_hint="-w/--workspace", + message=f"'{workspace_path}' is not a workspace.", + ) + workspace = Workspace(workspace_path) + + if summary_name is None: + summary_name = workspace.get_name() + + if only_dashboard and not dashboard: + raise click.BadParameter( + param_hint="--dashboard-only", + message="--dashboard-only can not be used together with --no-dashboard.", + ) + + if only_dashboard: + from .main import view + + return view(Path(workspace.get_summary_dir(summary_name)), summary_name) + + if metadata_csv is None and not no_master_metadata: + metadata_csv = Path(workspace.get_master_metadata_csv(summary_name)) + + if settings_file is None: + settings_file = Path(workspace.get_summary_settings_file(summary_name)) + + if not no_master_metadata and not metadata_csv.exists(): + raise click.BadParameter( + param_hint="-m/--metadata_csv", + message=f"Master metadata file '{metadata_csv}' does not exist.", + ) + + if len(experiment_dirs) == 0: + experiment_dirs = workspace.get_experiment_dirs() + + if output_dir is None: + output_dir = Path(workspace.get_summary_dir(summary_name)) + + from .main import main + + try: + main( + workspace=workspace, + output_dir=output_dir, + expt_dirs=experiment_dirs, + summary_name=summary_name, + metadata_path=metadata_csv, + settings_file_path=settings_file, + show_dashboard=dashboard, + prevalence_by=list(prevalence_by), + no_master_metadata=no_master_metadata, + qc_min_coverage=qc_min_coverage, + qc_max_contam=qc_max_contam, + ) + except MetadataFormatError as e: + raise click.BadParameter( + message=f"Metadata format error: {e}", + ) from e diff --git a/src/nomadic/summarize/compute.py b/src/nomadic/summarize/compute.py new file mode 100644 index 0000000..c3bd424 --- /dev/null +++ b/src/nomadic/summarize/compute.py @@ -0,0 +1,296 @@ +import enum +from typing import Optional + +import pandas as pd +from statsmodels.stats.proportion import proportion_confint + + +# These columns are used to define unique variants +VARIANTS_GROUP_COLUMNS = [ + "gene", + "chrom", + "pos", + # "ref", + # The alt might differ for multiallelic sites + # "alt", + "aa_change", + "aa_pos", + "mut_type", + "mutation", +] + + +class Status(enum.Enum): + PASSING = "passing" + FAILING = "failing" + NOT_SEQUENCED = "not_sequenced" + + +def calc_samples_summary( + master_metadata_df: pd.DataFrame, replicates_qc_df: pd.DataFrame +) -> pd.DataFrame: + """ + Calculates a summary of which samples have how many replicates that are passing or failing, + and if it has at least one passing replicate, it is concidered as passing. 
+ + This can be used to get a list of to be resequenced samples. + """ + samples_summary_df = ( + replicates_qc_df.groupby(["sample_id"]) + .agg( + n_replicates=pd.NamedAgg("barcode", "count"), + n_passing=pd.NamedAgg("passing", "sum"), + ) + .reset_index() + ) + samples_summary_df = ( + samples_summary_df.merge( + master_metadata_df[["sample_id"]], how="right", on="sample_id" + ) + .fillna({"n_replicates": 0, "n_passing": 0}) + .astype({"n_replicates": int, "n_passing": int}) + ) + samples_summary_df["status"] = samples_summary_df.apply( + lambda row: Status.PASSING.value + if row["n_passing"] > 0 + else Status.FAILING.value + if row["n_replicates"] > 0 + else Status.NOT_SEQUENCED.value, + axis=1, + ) + samples_summary_df.sort_values( + by=["n_passing", "n_replicates", "sample_id"], + inplace=True, + ascending=[False, False, True], + ) + + return samples_summary_df + + +def calc_amplicons_summary(master_metadata, replicates_amplicon_qc_df): + """ + Calculates a summary of which samples have how many replicates per amplicon that are passing or failing, + and if it has at least one passing replicate over that amplicon, it is concidered as passing. + + This can be used to understand if there are certain amplicons of samples that have no coverage yet + and make decisions on resampling, that are more fine granular than per sample. + """ + samples_by_amplicons_summary_df = ( + replicates_amplicon_qc_df.groupby(["sample_id", "name"]) + .agg( + n_replicates=pd.NamedAgg("barcode", "count"), + n_passing=pd.NamedAgg("passing", "sum"), + ) + .reset_index() + ) + samples_by_amplicons_summary_df = ( + samples_by_amplicons_summary_df.merge( + master_metadata[["sample_id"]], how="right", on="sample_id" + ) + .fillna({"n_replicates": 0, "n_passing": 0}) + .astype({"n_replicates": int, "n_passing": int}) + ) + samples_by_amplicons_summary_df["status"] = samples_by_amplicons_summary_df.apply( + lambda row: Status.PASSING.value + if row["n_passing"] > 0 + else Status.FAILING.value + if row["n_replicates"] > 0 + else Status.NOT_SEQUENCED.value, + axis=1, + ) + samples_by_amplicons_summary_df.sort_values( + by=["n_passing", "n_replicates", "sample_id"], + inplace=True, + ascending=[False, False, True], + ) + + return samples_by_amplicons_summary_df + + +def filter_false_positives( + variants_df: pd.DataFrame, min_obs: int = 1, min_wsaf: float = 0.15 +): + """Filter out likely false positive variant calls. + + Remove variants that have only been observed `min_obs` times across all samples in + the analysis set, and with a WSAF below `min_wsaf` + + This removes likely false-positive mutations, which are usually rare and at low + WSAF. 
+ + """ + + mut = variants_df[variants_df["gt_int"].isin([1, 2])] + df = variants_df.merge( + mut.groupby(VARIANTS_GROUP_COLUMNS).agg( + n_mut=pd.NamedAgg("gt_int", len), wsaf_max=pd.NamedAgg("wsaf", "max") + ), + on=VARIANTS_GROUP_COLUMNS, + ) + df = df[~(df["n_mut"].le(min_obs) & df["wsaf_max"].lt(min_wsaf))].drop( + columns=["n_mut", "wsaf_max"] + ) + return df + + +def compute_variant_prevalence( + variants_df: pd.DataFrame, + master_df: Optional[pd.DataFrame] = None, + additional_groups: Optional[list[str]] = None, +) -> pd.DataFrame: + """ + Compute the prevalence of each mutation in `variants_df` + """ + if additional_groups is None: + additional_groups = [] + + if additional_groups: + assert master_df is not None, ( + "master_df must be provided if additional_groups are used" + ) + assert all(group in master_df.columns for group in additional_groups), ( + "all additional_groups must be columns in master_df" + ) + variants_df = variants_df.merge( + master_df[["sample_id", *additional_groups]], on="sample_id", how="left" + ) + + prev_df = ( + variants_df.groupby( + VARIANTS_GROUP_COLUMNS + additional_groups, + ) + .agg( + n_samples=pd.NamedAgg("gt_int", len), + n_passed=pd.NamedAgg("gt_int", lambda x: sum(x != -1)), + n_wt=pd.NamedAgg("gt_int", lambda x: sum(x == 0)), + n_mixed=pd.NamedAgg("gt_int", lambda x: sum(x == 1)), + n_mut=pd.NamedAgg("gt_int", lambda x: sum(x == 2)), + ) + .reset_index() + ) + + # Compute frequencies + prev_df["per_wt"] = 100 * prev_df["n_wt"] / prev_df["n_passed"] + prev_df["per_mixed"] = 100 * prev_df["n_mixed"] / prev_df["n_passed"] + prev_df["per_mut"] = 100 * prev_df["n_mut"] / prev_df["n_passed"] + + # Compute prevalence + prev_df["prevalence"] = prev_df["per_mixed"] + prev_df["per_mut"] + + # Compute prevalence 95% confidence intervals + low, high = proportion_confint( + prev_df["n_mut"] + prev_df["n_mixed"], + prev_df["n_passed"], + alpha=0.05, + method="beta", + ) + prev_df["prevalence_lowci"] = 100 * low + prev_df["prevalence_highci"] = 100 * high + + return prev_df + + +def gene_deletions(coverage_df: pd.DataFrame, genes: list[str]) -> pd.DataFrame: + """ + Analyze gene deletions based on coverage data. + + Assumes `coverage_df` contains columns: + - 'sample_id' + - 'name' + - 'mean_cov' + + A gene is considered deleted in a sample if its mean coverage is below a threshold. + This is a very simple heuristic and might be improved in the future. + + Returns a DataFrame with deletion prevalence per gene. 
+ """ + # How many of the other amplolicons must be covered to consider the sample for deletion analysis + AMPLICONS_QC_CUTOFF = 0.8 + # Minimum coverage to consider a gene deleted + DELETION_COVERAGE_THRESHOLD = 5 + + coverage_df = coverage_df[coverage_df["sample_type"] == "field"].copy() + coverage_df["gene"] = coverage_df["name"].str.split("-").str[0] + + # QC: only analyze samples with sufficient control amplicon coverage + analysis_samples = ( + coverage_df[~coverage_df["gene"].isin(genes)] + .groupby(["expt_name", "barcode"]) + .agg( + n_ctrl_amplicons=pd.NamedAgg("name", "nunique"), + n_passing_ctrl_amplicons=pd.NamedAgg("passing", "sum"), + ) + ) + + analysis_set = coverage_df.merge( + analysis_samples, on=["expt_name", "barcode"], how="left" + ) + analysis_set = analysis_set[ + analysis_set["n_passing_ctrl_amplicons"] + >= AMPLICONS_QC_CUTOFF * analysis_set["n_ctrl_amplicons"] + ] + + # QC: don't analyze amplicons that did not pass negative control + analysis_set = analysis_set[~analysis_set["fail_contam_abs"]] + + # Determine deletions + analysis_set["is_deleted"] = analysis_set["mean_cov"] < DELETION_COVERAGE_THRESHOLD + + # consider a gene deleted if all amplicons that belong to the gene are deleted + result = ( + analysis_set.groupby(["expt_name", "barcode", "sample_id", "gene"]) + .agg( + is_deleted=pd.NamedAgg("is_deleted", "all"), + ) + .reset_index() + ) + + # consider a gene deleted, if it is deleted for all replicates of a sample + result = ( + result.groupby(["sample_id", "gene"]) + .agg( + is_deleted=pd.NamedAgg("is_deleted", "all"), + n_deleted=pd.NamedAgg("is_deleted", "sum"), + n_replicates=pd.NamedAgg("barcode", "count"), + ) + .reset_index() + ) + + return result[result["gene"].isin(genes)] + + +def gene_deletion_prevalence_by( + gene_deletions_df: pd.DataFrame, master_df: pd.DataFrame, fields: list[str] +) -> pd.DataFrame: + """ + Compute the prevalence of gene deletions in `gene_deletions_df` + stratified by columns in `fields`. 
+ """ + gene_deletions_df = gene_deletions_df.merge( + master_df[["sample_id", *fields]], on="sample_id", how="left" + ) + + prev_df = ( + gene_deletions_df.groupby(["gene", *fields]) + .agg( + n_samples=pd.NamedAgg("is_deleted", len), + n_passed=pd.NamedAgg("is_deleted", lambda x: sum(x.notnull())), + n_deleted=pd.NamedAgg("is_deleted", lambda x: sum(x)), + ) + .reset_index() + ) + + # Compute prevalence + prev_df["prevalence"] = 100 * prev_df["n_deleted"] / prev_df["n_passed"] + + # Compute prevalence 95% confidence intervals + low, high = proportion_confint( + prev_df["n_deleted"], + prev_df["n_passed"], + alpha=0.05, + method="beta", + ) + prev_df["prevalence_lowci"] = 100 * low + prev_df["prevalence_highci"] = 100 * high + + return prev_df diff --git a/src/nomadic/summarize/dashboard/assets/nomadic_logo.png b/src/nomadic/summarize/dashboard/assets/nomadic_logo.png new file mode 100644 index 0000000..3b6878a Binary files /dev/null and b/src/nomadic/summarize/dashboard/assets/nomadic_logo.png differ diff --git a/src/nomadic/summarize/dashboard/assets/summary-style.css b/src/nomadic/summarize/dashboard/assets/summary-style.css new file mode 100644 index 0000000..17a1d40 --- /dev/null +++ b/src/nomadic/summarize/dashboard/assets/summary-style.css @@ -0,0 +1,149 @@ +/* Overall styling ------------------------------------------------------------------ */ + +:root { + --grey: #ecebeb; +} + + +body { + font-family: Arial, Helvetica, sans-serif; + background: var(--grey); +} + +#overall { + margin: 20px; + padding: 20px; + border-radius: 20px; + background: white; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +/* Banner ------------------------------------------------------------------ */ + +.logo-and-summary { + display: flex; + flex-direction: row; + margin-bottom: 20px; +} + +#logo { + margin-left: 0px; + width: 500px; + flex: 0.1; +} + +#throughput-summary { + margin-left: 20px; + flex: 0.9; +} + +/* Samples Section ------------------------------------------------------------------ */ +.samples-row { + padding: 20px; + margin: 20px; + border-color: var(--grey); + border-width: 2px; + border-style: solid; + border-radius: 20px; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +.samples-plots { + display: flex; + flex-direction: row; + gap: 20px; +} + +#samples-pie { + flex: 0.25; +} + +#samples-barplot { + flex: 0.75; +} + + +/* Quality Section ------------------------------------------------------------------ */ + +.quality-row { + padding: 20px; + margin: 20px; + border-color: var(--grey); + border-width: 2px; + border-style: solid; + border-radius: 20px; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +.quality-plots { + display: flex; + flex-direction: row; + gap: 20px; +} + +.quality-dropdowns { + display: flex; + flex-direction: row; + gap: 20px; + margin-bottom: 20px; +} + +#quality-heat { + flex: 1.0; +} + +/* Prevalence Section ------------------------------------------------------------------ */ +.prevalence-row { + padding: 20px; + margin: 20px; + border-color: var(--grey); + border-width: 2px; + border-style: solid; + border-radius: 20px; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +.prevalence-dropdowns { + display: flex; + flex-direction: row; + gap: 20px; + margin-bottom: 20px; +} + +.prevalence-plots { + display: flex; + flex-direction: row; + gap: 20px; +} + +#prevalence-bars { + flex: 1.0; +} + +.prevalence-by-row { + padding: 20px; + margin: 20px; + border-color: var(--grey); + border-width: 2px; + border-style: solid; + border-radius: 20px; + box-shadow: 0 2px 
5px rgba(0, 0, 0, 0.1); +} + +.map-dropdowns { + display: flex; + flex-direction: row; + gap: 20px; + margin-bottom: 20px; +} + +/* Footer ------------------------------------------------------------------ */ + +.footer { + padding: 0px 20px; + margin: 0px 20px; + display: flex; + flex-direction: row; + gap: 20px; + justify-content: right; +} \ No newline at end of file diff --git a/src/nomadic/summarize/dashboard/builders.py b/src/nomadic/summarize/dashboard/builders.py new file mode 100644 index 0000000..7f42b77 --- /dev/null +++ b/src/nomadic/summarize/dashboard/builders.py @@ -0,0 +1,672 @@ +import glob +from importlib.resources import as_file, files +import logging +import os +import threading +from abc import ABC, abstractmethod +import webbrowser +from dash import Dash, Input, Output, callback, html, dcc + +from i18n import t +import i18n +import pandas as pd + +from nomadic.summarize.compute import compute_variant_prevalence + +from nomadic.summarize.dashboard.components import ( + AmpliconsBarplot, + GeneDeletionsBarplot, + PrevalenceHeatmap, + SamplesPie, + ThroughputSummary, + QualityControl, + PrevalenceBarplot, + MapComponent, +) +from nomadic.util.summary import Settings, get_map_settings + + +class SummaryDashboardBuilder(ABC): + """ + Interface for summary dashboards + + """ + + def __init__(self, summary_name, css_style_sheets): + self.summary_name = summary_name + self.css_style_sheets = css_style_sheets + self.components = [] + self.layout = [] + + @abstractmethod + def _gen_layout(self): + """ + Define the layout of the dashboard + + This will link with the stylesheet that is used to produce + the overall dashboard organisation + + """ + pass + + def _gen_app(self): + """ + Generate an instance of a Dash app + + """ + + app = Dash(name=__name__, external_stylesheets=self.css_style_sheets) + app_log = logging.getLogger("werkzeug") + app_log.setLevel(logging.ERROR) + + return app + + def run(self, in_thread: bool = False, auto_open: bool = True, **kwargs): + """ + Run the dashboard + + """ + + setup_translations() + app = self._gen_app() + self._gen_layout() + + for component in self.components: + component.callback(app) + + app.layout = html.Div(id="overall", children=self.layout) + + if auto_open: + threading.Timer( + 1, webbrowser.open, kwargs=dict(url="http://127.0.0.1:8050") + ).start() + + if in_thread: + dashboard_thread = threading.Thread( + target=lambda: app.run(**kwargs), name="dashboard", daemon=True + ) + dashboard_thread.start() + else: + app.run(**kwargs) + + # --------------------------------------------------------------------------- + # Below here, we are putting concrete methods for creating different + # pieces of a dashboard that may be shared across dashboard subclasse + # + # --------------------------------------------------------------------------- + + def _add_throughput_banner(self, throughput_csv: str) -> None: + """ + Add a banner which shows the logo and summarise the number of samples processed. 
+ + """ + + # Create the component + self.expt_summary = ThroughputSummary( + summary_name=self.summary_name, + component_id="thoughput-summary", + throughput_csv=throughput_csv, + ) + + # Define banner layout + banner = html.Div(className="banner", children=self.expt_summary.get_layout()) + + # Add to components and layout + self.components.append(self.expt_summary) + self.layout.append(banner) + + def _add_samples(self, samples_csv: str, samples_amplicons_csv: str) -> None: + """ + Add a panel that shows progress of samples sequenced + + """ + self.samples = SamplesPie( + summary_name=self.summary_name, + component_id="samples-pie", + samples_csv=samples_csv, + ) + self.amplicons = AmpliconsBarplot( + summary_name=self.summary_name, + component_id="samples-barplot", + samples_amplicons_csv=samples_amplicons_csv, + ) + quality_row = html.Div( + className="samples-row", + children=[ + html.H3("Sample Statistics", style=dict(marginTop="0px")), + html.Div( + className="samples-plots", + children=[self.samples.get_layout(), self.amplicons.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.samples) + self.layout.append(quality_row) + + def _add_experiment_qc(self, coverage_csv: str) -> None: + """ + Add a panel that shows quality control results + + """ + dropdown = dcc.Dropdown( + id="quality-dropdown", + options=[ + {"label": t(option), "value": option, "title": t(f"{option}_tooltip")} + for option in QualityControl.STATISTICS + ], + value=QualityControl.STATISTICS[2], + style=dict(width="300px"), + ) + + self.quality_control = QualityControl( + self.summary_name, + component_id="quality-heat", + dropdown_id="quality-dropdown", + coverage_csv=coverage_csv, + ) + + quality_row = html.Div( + className="quality-row", + children=[ + html.H3("Experiment QC Statistics", style=dict(marginTop="0px")), + html.Div( + className="quality-dropdowns", + children=[ + html.Div( + children=[ + html.Label("Select statistic:"), + dropdown, + ] + ), + ], + ), + html.Div( + className="quality-plots", + children=[self.quality_control.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.quality_control) + self.layout.append(quality_row) + + def _add_prevalence_row( + self, + analysis_csv: str, + master_csv: str, + amplicon_names: list[str], + amplicon_sets: dict[str, list[str]], + ) -> None: + """ + Add a panel that shows prevalence calls + + """ + if amplicon_sets: + dropdown_amplicon_set = dcc.Dropdown( + id="prevalence-dropdown-amplicon-set", + options=list(amplicon_sets.keys()), + value=list(amplicon_sets.keys())[0], + style=dict(width="300px"), + clearable=False, + ) + else: + dropdown_amplicon_set = dcc.Dropdown( + id="prevalence-dropdown-amplicon-set", + options=["All"], + value="All", + style=dict(width="300px"), + clearable=False, + ) + + cols = cols_to_group_by(master_csv, analysis_csv, max_cat=10) + + dropdown_by = dcc.Dropdown( + id="prevalence-dropdown-by", + options=["All", *cols], + value="All", + style=dict(width="300px"), + clearable=False, + ) + + self.prevalence_bars = PrevalenceBarplot( + self.summary_name, + component_id="prevalence-bars", + radio_id="amplicons-dropdown", + radio_id_by="prevalence-dropdown-by", + analysis_csv=analysis_csv, + master_csv=master_csv, + amplicon_sets=amplicon_sets, + ) + + @callback( + Output("amplicons-dropdown", "options"), + Input("prevalence-dropdown-amplicon-set", "value"), + ) + def update_amplicons_options(amplicon_set): + if amplicon_set == "All": + return amplicon_names + return 
amplicon_sets[amplicon_set] + + @callback( + Output("amplicons-dropdown", "value"), + Input("prevalence-dropdown-amplicon-set", "value"), + ) + def update_amplicons_value(amplicon_set): + if amplicon_set == "All": + return amplicon_names + return amplicon_sets[amplicon_set] + + prevalence_row = html.Div( + className="prevalence-row", + children=[ + html.H3("Prevalence", style=dict(marginTop="0px")), + html.Div( + className="prevalence-dropdowns", + children=[ + html.Div( + children=[ + html.Label("Select amplicon set:"), + dropdown_amplicon_set, + ] + ), + html.Div( + children=[ + html.Label("Select amplicons:"), + dcc.Dropdown( + id="amplicons-dropdown", + multi=True, + style=dict(width="300px"), + ), + ] + ), + html.Div( + children=[ + html.Label("Group by:"), + dropdown_by, + ] + ), + ], + ), + html.Div( + className="prevalence-plots", + children=[self.prevalence_bars.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.prevalence_bars) + self.layout.append(prevalence_row) + + def _add_prevalence_by_col_row( + self, + analysis_csv: str, + master_csv: str, + amplicon_names: list[str], + amplicon_sets: dict[str, list[str]], + ) -> None: + """ + Add a panel that shows prevalence calls by cols + + """ + + if "Resistance" in amplicon_sets: + amplicon_names = amplicon_sets["Resistance"] + + amplicon_dropdown = dcc.Dropdown( + id="amplicon-dropdown", + options=amplicon_names, + value=amplicon_names[0], + style=dict(width="300px"), + clearable=False, + ) + + cols = cols_to_group_by(master_csv, analysis_csv, max_cat=50) + + if not cols: + # Nothing to show + return + + col_dropdown = dcc.Dropdown( + id="col-dropdown", + options=cols, + value=cols[0], + style=dict(width="300px"), + clearable=False, + ) + + self.prevalence_heatmap = PrevalenceHeatmap( + summary_name=self.summary_name, + analysis_csv=analysis_csv, + master_csv=master_csv, + component_id="prevalence-heatmap", + amplicon_dropdown_id="amplicon-dropdown", + col_dropdown_id="col-dropdown", + ) + prevalence_row = html.Div( + className="prevalence-by-row", + children=[ + html.H3("Prevalence by category", style=dict(marginTop="0px")), + html.Div( + className="prevalence-dropdowns", + children=[ + html.Div( + children=[ + html.Label("Select amplicon:"), + amplicon_dropdown, + ] + ), + html.Div( + children=[ + html.Label("Group by:"), + col_dropdown, + ] + ), + ], + ), + html.Div( + className="prevalence-region-plots", + children=[self.prevalence_heatmap.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.prevalence_heatmap) + self.layout.append(prevalence_row) + + def _add_gene_deletion_row(self, gene_deletions_csv: str, master_csv: str) -> None: + """ + Add a panel that shows prevalence calls + + """ + + cols = cols_to_group_by(master_csv, gene_deletions_csv, max_cat=50) + + dropdown_by = dcc.Dropdown( + id="gene-deletions-dropdown-by", + options=["All", *cols], + value="All", + style=dict(width="300px"), + clearable=False, + ) + + self.prevalence_bars = GeneDeletionsBarplot( + self.summary_name, + component_id="gene-deletions-bars", + radio_id_by="gene-deletions-dropdown-by", + gene_deletions_csv=gene_deletions_csv, + master_csv=master_csv, + ) + + prevalence_row = html.Div( + className="gene-deltions-row", + children=[ + html.H3("Potential Gene Deletions", style=dict(marginTop="0px")), + html.Div( + className="prevalence-dropdowns", + children=[ + html.Div( + children=[ + html.Label("Group by:"), + dropdown_by, + ] + ), + ], + ), + html.Div( + 
className="gene-deletions-plots", + children=[self.prevalence_bars.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.prevalence_bars) + self.layout.append(prevalence_row) + + def _add_map_row( + self, + analysis_csv: str, + master_csv: str, + geojsons: list[str], + location_coords_csv: str, + map_center: tuple[float, float] | None, + map_zoom_level: int | None, + amplicon_sets: dict[str, list[str]], + ) -> None: + """ + Add a panel that shows a choropleth map of drug resistance marker prevalence + + Parameters + ---------- + analysis_csv : str + Path to the analysis CSV file + master_csv : str + Path to the master CSV file + geojsons : list[str] + List of paths to GeoJSON files for different region types + location_coords_csv : str | None, optional + Path to a CSV file containing location to coordinate mappings + """ + # Get mutations and their prevalence for resistance genes + analysis_df = pd.read_csv(analysis_csv) + if "Resistance" in amplicon_sets: + resistance_df = analysis_df[ + analysis_df["amplicon"].isin(amplicon_sets["Resistance"]) + ] + else: + resistance_df = analysis_df + prevalence_df = compute_variant_prevalence(resistance_df) + + # Create a dictionary with mutation info and prevalence + mutations_info = {} + for _, row in prevalence_df.iterrows(): + mutation_id = f"{row['gene']}-{row['aa_change']}" + mutations_info[mutation_id] = { + "prevalence": row["prevalence"], + "label": f"{mutation_id} ({row['prevalence']:.1f}%)", + "value": mutation_id, + } + + # Sort by prevalence and create dropdown options + gene_mutations = [ + info + for info in sorted( + mutations_info.values(), key=lambda x: x["prevalence"], reverse=True + ) + ] + + # Create the dropdown with sorted mutations + mutation_dropdown = dcc.Dropdown( + id="map-mutation-dropdown", + options=[ + {"label": m["label"], "value": m["value"]} for m in gene_mutations + ], + value=gene_mutations[0]["value"] if gene_mutations else None, + style=dict(width="300px"), + clearable=False, + ) + + regions = { + path.split("/")[-1].split(".")[0].split("-")[-1]: path for path in geojsons + } + + region_dropdown = dcc.Dropdown( + id="map-region-dropdown", + options=list(regions.keys()), + value="district", + style=dict(width="300px"), + clearable=False, + ) + + # Create the map component with the prepared dropdowns + self.prevalence_map = MapComponent( + summary_name=self.summary_name, + analysis_csv=analysis_csv, + master_csv=master_csv, + component_id="prevalence-map", + mutation_dropdown_id="map-mutation-dropdown", + region_dropdown_id="map-region-dropdown", + geojsons=regions, + location_coords_csv=location_coords_csv, + map_center=map_center, + map_zoom_level=map_zoom_level, + ) + + map_row = html.Div( + className="map-row", + children=[ + html.H3("Geographic Distribution", style=dict(marginTop="0px")), + html.Div( + className="map-dropdowns", + children=[ + html.Div( + children=[ + html.Label("Select mutation:"), + mutation_dropdown, + ] + ), + html.Div( + children=[ + html.Label("Group by:"), + region_dropdown, + ] + ), + ], + ), + html.Div( + className="map-plot", + children=[self.prevalence_map.get_layout()], + ), + ], + ) + + # Add components and layout + self.components.append(self.prevalence_map) + self.layout.append(map_row) + + +class BasicSummaryDashboard(SummaryDashboardBuilder): + """ + Build a dashboard with a focus on mapping statistics + + """ + + CSS_STYLE = ["assets/summary-style.css"] + + def __init__( + self, + summary_name: str, + throughput_csv: str, + samples_csv: 
str, + samples_amplicons_csv: str, + coverage_csv: str, + analysis_csv: str, + gene_deletions_csv: str, + master_csv: str, + amplicons: list[str], + amplicon_sets: dict[str, list[str]], + deletion_genes: list[str], + geojson_glob: str, + settings: Settings, + location_coords_csv: str, + ): + """ + Initialise all of the dashboard components + + Parameters + ---------- + location_coords_csv : str | None, optional + Path to a CSV file containing location to coordinate mappings. + The file should have columns: location,latitude,longitude + """ + + super().__init__(summary_name, self.CSS_STYLE) + self.throughput_csv = throughput_csv + self.samples_csv = samples_csv + self.samples_amplicons_csv = samples_amplicons_csv + self.coverage_csv = coverage_csv + self.analysis_csv = analysis_csv + self.master_csv = master_csv + self.gene_deletions_csv = gene_deletions_csv + self.geojson_glob = geojson_glob + self.location_coords_csv = location_coords_csv + self.map_center, self.map_zoom_level = get_map_settings(settings) + self.amplicon_names = amplicons + self.amplicon_sets = amplicon_sets + self.deletion_genes = deletion_genes + + def _gen_layout(self): + """ + Generate the layout + + """ + self._add_throughput_banner(self.throughput_csv) + self._add_samples(self.samples_csv, self.samples_amplicons_csv) + self._add_experiment_qc(self.coverage_csv) + self._add_prevalence_row( + self.analysis_csv, self.master_csv, self.amplicon_names, self.amplicon_sets + ) + self._add_prevalence_by_col_row( + self.analysis_csv, self.master_csv, self.amplicon_names, self.amplicon_sets + ) + if self.deletion_genes: + self._add_gene_deletion_row(self.gene_deletions_csv, self.master_csv) + + if glob.glob(self.geojson_glob) or os.path.exists(self.location_coords_csv): + self._add_map_row( + self.analysis_csv, + self.master_csv, + glob.glob(self.geojson_glob), + self.location_coords_csv, + self.map_center, + self.map_zoom_level, + self.amplicon_sets, + ) + + +def setup_translations(): + """ + Set up translations for the dashboard + + This function loads the translation files from the package resources + and appends them to the i18n load path. 
+ + """ + with as_file(files("nomadic.summarize.dashboard").joinpath("translations")) as path: + i18n.load_path.append(str(path)) + i18n.set("filename_format", "{locale}.{format}") + i18n.load_everything() + i18n.set("locale", "en") + + +def cols_to_group_by(master_csv: str, analysis_csv, *, max_cat: int) -> list[str]: + """ + Get columns that can be used to group prevalence by + + """ + + master_df = pd.read_csv(master_csv) + analysis_df = pd.read_csv(analysis_csv) + df = pd.merge( + master_df, + analysis_df[["sample_id"]], + on="sample_id", + how="inner", + ) + cols = df.columns.tolist() + cols.remove("sample_id") + + for col in cols[:]: + if pd.api.types.is_numeric_dtype(df[col]): + cols.remove(col) + continue + n_unique = df[col].nunique() + if n_unique > max_cat: + cols.remove(col) + + return cols diff --git a/src/nomadic/summarize/dashboard/components.py b/src/nomadic/summarize/dashboard/components.py new file mode 100644 index 0000000..32242ad --- /dev/null +++ b/src/nomadic/summarize/dashboard/components.py @@ -0,0 +1,1016 @@ +from abc import ABC, abstractmethod +import json +import os +from typing import Optional +import warnings + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from dash import Dash, dcc, html +from dash.dependencies import Input, Output + +from nomadic.summarize.compute import ( + Status, + compute_variant_prevalence, + gene_deletion_prevalence_by, +) +from i18n import t + +# -------------------------------------------------------------------------------- +# Interface for a single real-time dashboard component +# +# -------------------------------------------------------------------------------- + + +class SummaryDashboardComponent(ABC): + """ + Interface for summary dashboard components + + """ + + def __init__(self, summary_name: str, component_id: str): + """ + Initialise components of the dashboard component + + """ + + # User + self.summary_name = summary_name + self.component_id = component_id + + # Always + self.layout_obj = self._define_layout() + + @abstractmethod + def _define_layout(self): + pass + + def get_layout(self): + """ + Get the object that will be passed to the overall HTML + layout + + Typically, this is something from `html.` or `dcc.`, e.g. 
+ `dcc.Graph` + + """ + + if self.layout_obj is None: + raise ValueError("Must define `self.layout_component`.") + + return self.layout_obj + + @abstractmethod + def callback(self, app: Dash) -> None: + """ + Define the callback for this componenet, which will cause it to update + in response to the timer, as well (potentially) other inputs + + """ + pass + + +# -------------------------------------------------------------------------------- +# Concrete components +# +# -------------------------------------------------------------------------------- + +# -------------------------------------------------------------------------------- +# OVERALL STATISTICS +# -------------------------------------------------------------------------------- + + +class ThroughputSummary(SummaryDashboardComponent): + """ + Make a pie chart that shows read mapping statistics + + """ + + logo_src_path = "assets/nomadic_logo.png" + + def __init__(self, summary_name: str, throughput_csv: str, component_id: str): + self.throughput_csv = throughput_csv + + # Read header only + df_header = pd.read_csv(throughput_csv, nrows=0) + dtypes: dict[str, type[str] | type[int]] = { + col: int for col in df_header.columns + } + + dtypes["sample_type"] = str + + self.throughput_df = pd.read_csv( + throughput_csv, index_col="sample_type", dtype=dtypes + ) + super().__init__(summary_name, component_id) + + def _define_layout(self): + """ + Define the layout to be a dcc.Graph object with the + appropriate ID + + """ + + layout = html.Div( + className="logo-and-summary", + children=[ + html.Img(id="logo", src=self.logo_src_path), + html.Div( + id="throughput-summary", + children=[ + html.H3("Throughput Details"), + html.P( + [ + f"Experiments: {self.throughput_df.columns.shape[0] - 1}", + html.Br(), + f"Field samples (total): {self.throughput_df.loc['field', 'All']}", + html.Br(), + f"Field samples (unique): {self.throughput_df.loc['field_unique', 'All']}", + html.Br(), + ] + ), + ], + ), + ], + ) + + return layout + + def callback(self, app: Dash) -> None: + """ + Define the update callback for the pie chart + + """ + + +SAMPLE_COLORS = { + Status.NOT_SEQUENCED.value: "#636EFA", + Status.FAILING.value: "#EF553B", + Status.PASSING.value: "#00CC96", +} + + +class SamplesPie(SummaryDashboardComponent): + """ + Make a pie chart that shows read mapping statistics + + """ + + def __init__( + self, + summary_name: str, + samples_csv: str, + component_id: str, + ): + self.samples_csv = samples_csv + self.df = pd.read_csv(samples_csv) + self.n = len(self.df) + self.df = self.df.groupby("status").count()["sample_id"] + super().__init__(summary_name, component_id) + + def _define_layout(self): + """ + Define the layout to be a dcc.Graph object with the + appropriate ID + + """ + fig = go.Figure( + data=[ + go.Pie( + values=self.df.values, + labels=[t(label) for label in self.df.index], + sort=False, + hole=0.3, + textinfo="label+percent+value", + marker=dict(colors=[SAMPLE_COLORS[cat] for cat in self.df.index]), + ) + ] + ) + + MAR = 20 + fig.update_layout( + showlegend=True, + margin=dict(t=MAR, l=MAR, r=MAR, b=MAR), + annotations=[ + dict( + text=f"N={self.n}", font_size=20, showarrow=False, xanchor="center" + ) + ], + ) + + return dcc.Graph(id=self.component_id, figure=fig) + + def callback(self, app: Dash) -> None: + """ + Define the update callback for the pie chart + + """ + + +class AmpliconsBarplot(SummaryDashboardComponent): + """ + Make a bar chart that shows the Amplicons Statistics + + """ + + def __init__( + self, + summary_name: 
str, + component_id: str, + samples_amplicons_csv: str, + ): + df = pd.read_csv(samples_amplicons_csv) + # Store inputs + plot_df = pd.crosstab(df["name"], df["status"]) + n_not_sequenced = (df["status"] == Status.NOT_SEQUENCED.value).sum() + not_sequenced = [n_not_sequenced] * len(plot_df.index) + # Generate figure + fig = go.Figure( + data=[ + go.Bar( + x=plot_df.index, + y=not_sequenced + if status == Status.NOT_SEQUENCED + else plot_df[status.value], + texttemplate="%{y}", + name=t(status.value), + marker=dict(color=SAMPLE_COLORS[status.value]), + ) + for status in Status + if status.value in plot_df.columns + ] + ) + + # Format + fig.update_layout( + xaxis_title="Amplicons", + yaxis_title="Number of Samples", + barmode="stack", + xaxis=dict(showline=True, linewidth=1, linecolor="black", mirror=True), + yaxis=dict( + showline=True, + linewidth=1, + linecolor="black", + mirror=True, + showgrid=True, + gridcolor="lightgray", + gridwidth=0.5, + griddash="dot", + ), + plot_bgcolor="rgba(0,0,0,0)", + hovermode="x unified", + legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0), + ) + fig.update_traces(marker=dict(line=dict(color="black", width=1))) + self.fig = fig + super().__init__(summary_name, component_id) + + def _define_layout(self): + """ + Define the layout to be a dcc.Graph object with the + appropriate ID + + """ + + return dcc.Graph(id=self.component_id, figure=self.fig) + + def callback(self, app: Dash) -> None: + pass + + +COV_MAX = 10000 +COV_NEG_MAX = 50 +COLORSCALES = { + "per_field_passing": [ + [0.00, "#FF0033"], + [0.50, "#FF9900"], + [0.70, "#FFEB33"], # set 70% as okay + [0.90, "#80FF7E"], # above 90% is good + [1.00, "#3A9A3E"], + ], + "per_field_contam": [ + [0.00, "#3A9A3E"], # low contamination is good + [0.20, "#FFEB33"], # set 30% as okay + [0.30, "#FF9900"], # contamination is bad + [1.00, "#FF0033"], + ], + "per_field_lowcov": [ + [0.00, "#3A9A3E"], # low lowcov is good + [0.10, "#80FF7E"], # above 90% is good + [0.30, "#FFEB33"], # set 30% as okay + [0.50, "#FF9900"], # lowcov is bad + [1.00, "#FF0033"], + ], + "mean_cov_field": [ + [0, "#FF0033"], # low coverage is bad + [25 / COV_MAX, "#FF9900"], + [50 / COV_MAX, "#FFEB33"], # set threshold as okay + [200 / COV_MAX, "#80FF7E"], # above 200 we can make good calls + [500 / COV_MAX, "#3A9A3E"], # above 500 is excellent also for low freq calls + [1.0, "#7585FE"], # coverage is uncapped + ], + "mean_cov_neg": [ + [0, "#3A9A3E"], # low neg cov is good + [2 / COV_NEG_MAX, "#80FF7E"], + [5 / COV_NEG_MAX, "#FFEB33"], + [10 / COV_NEG_MAX, "#FF9900"], + [50 / COV_NEG_MAX, "#FF0033"], # from 50 cov we fail neg control + [1.0, "#FF0033"], # coverage is uncapped + ], +} + + +class QualityControl(SummaryDashboardComponent): + STATISTICS = [ + "mean_cov_field", + "mean_cov_neg", + "per_field_passing", + "per_field_contam", + "per_field_lowcov", + ] + + def __init__( + self, summary_name: str, coverage_csv: str, component_id: str, dropdown_id: str + ) -> None: + """ + Initialisation loads the coverage data and prepares for plotting; + + """ + + self.coverage_csv = coverage_csv + self.coverage_df = pd.read_csv(coverage_csv) + self.plot_df = pd.pivot_table( + index="expt_name", + columns="name", + values=[*self.STATISTICS, "n_field"], + dropna=False, + observed=False, + data=self.coverage_df, + ) + + self.dropdown_id = dropdown_id + super().__init__(summary_name, component_id) + + def _define_layout(self): + return dcc.Graph(id=self.component_id) + + def callback(self, app: Dash) -> None: + 
@app.callback( + Output(self.component_id, "figure"), Input(self.dropdown_id, "value") + ) + def _update(focus_stat: str): + """Called whenver the input changes""" + legend = ( + "Field samples (%)" + if "per_" in focus_stat + else "Mean Coverage" + if "cov" in focus_stat + else "" + ) + # Create customdata array for hover + customdata = np.stack( + [ + self.plot_df["n_field"], + ], + axis=-1, + ) + + plot_data = [ + go.Heatmap( + x=self.plot_df[focus_stat].columns, + y=self.plot_df[focus_stat].index, + z=self.plot_df[focus_stat], + text=self.plot_df[focus_stat], + xgap=1, + ygap=1, + zmin=0, + zmax=100 + if "per_" in focus_stat + else COV_MAX + if "mean_cov_field" in focus_stat + else COV_NEG_MAX, + colorbar=dict(title=legend, outlinecolor="black", outlinewidth=1), + hoverongaps=False, + colorscale=COLORSCALES[focus_stat], + customdata=customdata, + hovertemplate=( + ( + "%{z:.1f}%
" + if "per_" in focus_stat + else "%{z:.1f}x
" + ) + + "Amplicon: %{x}
" + + "Number of samples: %{customdata[0]}
" + ), + ) + ] + MAR = 40 + fig = go.Figure(plot_data) + fig.update_layout( + xaxis_title="Amplicons", + yaxis_title="Experiments", + hovermode="y unified", + paper_bgcolor="white", # Sets the background color of the paper + plot_bgcolor="white", + title=dict(text=t(focus_stat)), + margin=dict(t=MAR, l=MAR, r=MAR, b=MAR), + xaxis=dict( + showline=True, linecolor="black", linewidth=2, dtick=1, mirror=True + ), + yaxis=dict( + showline=True, linecolor="black", linewidth=2, dtick=1, mirror=True + ), + xaxis_showgrid=False, + yaxis_showgrid=False, + # height=n_mutations*SZ # TOOD: how to adjust dynamically + ) + unit = "%" if "per_" in focus_stat else "x" if "cov" in focus_stat else "" + fig.update_traces( + text=self.plot_df[focus_stat], + texttemplate="%{text:.0f}" + unit, + textfont_size=12, + ) + return fig + + +class PrevalenceBarplot(SummaryDashboardComponent): + def __init__( + self, + summary_name: str, + analysis_csv: str, + master_csv: str, + component_id: str, + radio_id: str, + radio_id_by: str, + amplicon_sets: dict[str, list[str]], + ) -> None: + """ + Initialisation loads the coverage data and prepares for plotting; + + """ + + self.analysis_csv = analysis_csv + self.analysis_df = pd.read_csv(analysis_csv) + + self.master__csv = master_csv + self.master_df = pd.read_csv(master_csv) + + self.radio_id = radio_id + self.radio_id_by = radio_id_by + + self.amplicon_sets = amplicon_sets + super().__init__(summary_name, component_id) + + def _define_layout(self): + return dcc.Graph(id=self.component_id) + + def callback(self, app: Dash) -> None: + @app.callback( + Output(self.component_id, "figure"), + Input(self.radio_id, "value"), + Input(self.radio_id_by, "value"), + ) + def _update(amplicons: list[str], by: str): + """Called whenver the input changes""" + + analysis_df = self.analysis_df.query("amplicon in @amplicons") + + if by == "All": + plot_df = compute_variant_prevalence(analysis_df) + else: + plot_df = compute_variant_prevalence(analysis_df, self.master_df, [by]) + plot_df.sort_values(["gene", "chrom", "pos"], inplace=True) + + data = [] + htemp = "%{y:0.1f}% (%{customdata[2]}/%{customdata[1]})" + + if by == "All": + # Prepare plotting data + customdata = np.stack( + [ + plot_df["n_samples"], + plot_df["n_passed"], + plot_df["n_mixed"] + plot_df["n_mut"], + ], + axis=-1, + ) + data.append( + go.Bar( + x=plot_df["mutation"], + y=plot_df["prevalence"], + customdata=customdata, + hovertemplate=htemp, + name="Prevalence", + error_y=dict( + type="data", + array=plot_df["prevalence_highci"] - plot_df["prevalence"], + arrayminus=plot_df["prevalence"] + - plot_df["prevalence_lowci"], + ), + ) + ) + else: + for group in plot_df[by].unique(): + group_df = plot_df.query(f"{by} == @group") + # Prepare plotting data + customdata = np.stack( + [ + group_df["n_samples"], + group_df["n_passed"], + group_df["n_mixed"] + group_df["n_mut"], + ], + axis=-1, + ) + data.append( + go.Bar( + x=group_df["mutation"], + y=group_df["prevalence"], + customdata=customdata, + hovertemplate=htemp, + name=str(group), + error_y=dict( + type="data", + array=group_df["prevalence_highci"] + - group_df["prevalence"], + arrayminus=group_df["prevalence"] + - group_df["prevalence_lowci"], + ), + ) + ) + + # Plotting + fig = go.Figure(data) + fig.update_layout( + yaxis_title="Prevalence (%)", + xaxis=dict(showline=True, linewidth=1, linecolor="black", mirror=True), + yaxis=dict( + showline=True, + linewidth=1, + linecolor="black", + mirror=True, + showgrid=True, + gridcolor="lightgray", + gridwidth=0.5, + 
griddash="dot", + ), + legend=dict( + orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0 + ), + plot_bgcolor="rgba(0,0,0,0)", + hovermode="x unified", + ) + fig.update_yaxes(range=[0, 100]) + fig.update_traces(marker=dict(line=dict(color="black", width=1))) + + return fig + + +class PrevalenceHeatmap(SummaryDashboardComponent): + """ + Make a heatmap of prevalences + + """ + + def __init__( + self, + summary_name: str, + analysis_csv: str, + master_csv: str, + component_id: str, + amplicon_dropdown_id: str, + col_dropdown_id: str, + ): + self.amplicon_dropdown_id = amplicon_dropdown_id + self.col_dropdown_id = col_dropdown_id + self.analysis_df = pd.read_csv(analysis_csv) + self.master_df = pd.read_csv(master_csv) + super().__init__(summary_name, component_id) + + def _define_layout(self): + """Layout is graph""" + return dcc.Graph(id=self.component_id) + + def callback(self, app: Dash) -> None: + @app.callback( + Output(self.component_id, "figure"), + Input(self.amplicon_dropdown_id, "value"), + Input(self.col_dropdown_id, "value"), + ) + def _update(target_amplicon, col_by): + """Called every time an input changes""" + + df = compute_variant_prevalence( + self.analysis_df.query("amplicon == @target_amplicon"), + self.master_df, + [col_by], + ) + + plot_df = pd.pivot_table( + index="aa_change", + columns=col_by, + values=["prevalence", "n_mixed", "n_mut", "n_passed"], + data=df, + ) + + # Sort, so aa changes are in correct order + pos_order = ( + df.drop_duplicates("aa_change") + .set_index("aa_change")["aa_pos"] + .sort_values(ascending=True) + .index + ) + plot_df = plot_df.reindex(pos_order) + + if len(plot_df) > 0: + # Hover statment + customdata = np.stack( + [plot_df["n_mixed"], plot_df["n_mut"], plot_df["n_passed"]], axis=-1 + ) + htemp = "%{y} (%{x})
" + htemp += "Prevalence: %{z:.0f}%
" + htemp += "Samples: %{customdata[2]}
" + htemp += "Mixed: %{customdata[0]}
" + htemp += "Clonal: %{customdata[1]}
" + + plot_data = [ + go.Heatmap( + x=plot_df["prevalence"].columns, + y=plot_df["prevalence"].index, + z=plot_df["prevalence"], + texttemplate="%{z:.0f}%", + customdata=customdata, + zmin=0, + zmax=100, + xgap=1, + ygap=1, + colorscale="Spectral_r", + colorbar=dict(title="", outlinecolor="black", outlinewidth=1), + hoverongaps=False, + hovertemplate=htemp, + name="", + ) + ] + else: + plot_data = [] + + MAR = 40 + fig = go.Figure(plot_data) + fig.update_layout( + xaxis_title=col_by, + hovermode="y unified", + paper_bgcolor="white", # Sets the background color of the paper + plot_bgcolor="white", + title=dict(text=target_amplicon), + margin=dict(t=MAR, l=MAR, r=MAR, b=MAR), + xaxis=dict( + showline=True, linecolor="black", linewidth=2, dtick=1, mirror=True + ), + yaxis=dict( + showline=True, linecolor="black", linewidth=2, dtick=1, mirror=True + ), + xaxis_showgrid=False, + yaxis_showgrid=False, + # height=n_mutations*SZ # TOOD: how to adjust dynamically + ) + return fig + + +class GeneDeletionsBarplot(SummaryDashboardComponent): + def __init__( + self, + summary_name: str, + gene_deletions_csv: str, + master_csv: str, + component_id: str, + radio_id_by: str, + ) -> None: + """ + Initialisation loads the coverage data and prepares for plotting; + + """ + + self.gene_deletions_csv = gene_deletions_csv + self.gene_deletions_df = pd.read_csv(gene_deletions_csv) + + self.master__csv = master_csv + self.master_df = pd.read_csv(master_csv) + + self.radio_id_by = radio_id_by + super().__init__(summary_name, component_id) + + def _define_layout(self): + return dcc.Graph(id=self.component_id) + + def callback(self, app: Dash) -> None: + @app.callback( + Output(self.component_id, "figure"), + Input(self.radio_id_by, "value"), + ) + def _update(by: str): + """Called whenver the input changes""" + + if by == "All": + plot_df = gene_deletion_prevalence_by( + self.gene_deletions_df, self.master_df, [] + ) + else: + plot_df = gene_deletion_prevalence_by( + self.gene_deletions_df, self.master_df, [by] + ) + data = [] + htemp = "%{y:0.1f}% (%{customdata[2]}/%{customdata[1]})" + + if by == "All": + # Prepare plotting data + customdata = np.stack( + [ + plot_df["n_samples"], + plot_df["n_passed"], + plot_df["n_deleted"], + ], + axis=-1, + ) + data.append( + go.Bar( + x=plot_df["gene"], + y=plot_df["prevalence"], + customdata=customdata, + hovertemplate=htemp, + name="Prevalence", + error_y=dict( + type="data", + array=plot_df["prevalence_highci"] - plot_df["prevalence"], + arrayminus=plot_df["prevalence"] + - plot_df["prevalence_lowci"], + ), + ) + ) + else: + for group in plot_df[by].unique(): + group_df = plot_df.query(f"{by} == @group") + # Prepare plotting data + customdata = np.stack( + [ + group_df["n_samples"], + group_df["n_passed"], + group_df["n_deleted"], + ], + axis=-1, + ) + data.append( + go.Bar( + x=group_df["gene"], + y=group_df["prevalence"], + customdata=customdata, + hovertemplate=htemp, + name=str(group), + error_y=dict( + type="data", + array=plot_df["prevalence_highci"] + - plot_df["prevalence"], + arrayminus=plot_df["prevalence"] + - plot_df["prevalence_lowci"], + ), + ) + ) + + # Plotting + fig = go.Figure(data) + fig.update_layout( + yaxis_title="Prevalence (%)", + xaxis=dict(showline=True, linewidth=1, linecolor="black", mirror=True), + yaxis=dict( + showline=True, + linewidth=1, + linecolor="black", + mirror=True, + showgrid=True, + gridcolor="lightgray", + gridwidth=0.5, + griddash="dot", + ), + legend=dict( + orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0 + ), + 
plot_bgcolor="rgba(0,0,0,0)", + hovermode="x unified", + ) + fig.update_yaxes(range=[0, 100]) + fig.update_traces(marker=dict(line=dict(color="black", width=1))) + + return fig + + +class MapComponent(SummaryDashboardComponent): + """ + Component for displaying a choropleth map of drug resistance marker prevalence with sample site markers + """ + + def __init__( + self, + summary_name: str, + analysis_csv: str, + master_csv: str, + component_id: str, + mutation_dropdown_id: str, + region_dropdown_id: str, + geojsons: dict[str, str], + location_coords_csv: str, + map_zoom_level: Optional[int] = None, + map_center: Optional[tuple[float, float]] = None, + ): + self.mutation_dropdown_id = mutation_dropdown_id + self.region_dropdown_id = region_dropdown_id + self.analysis_df = pd.read_csv(analysis_csv) + self.master_df = pd.read_csv(master_csv) + self.map_zoom_level = map_zoom_level + self.map_center = map_center + + # Load location coordinates if provided + self.location_coords = None + if os.path.exists(location_coords_csv): + self.location_coords = pd.read_csv(location_coords_csv) + + # Load GeoJSON data + self.geojson_data = {} + for region, path in geojsons.items(): + with open(path) as f: + self.geojson_data[region] = json.load(f) + super().__init__(summary_name, component_id) + + def _define_layout(self): + """Layout is graph""" + return dcc.Graph(id=self.component_id) + + def callback(self, app: Dash) -> None: + @app.callback( + Output(self.component_id, "figure"), + Input(self.mutation_dropdown_id, "value"), + Input(self.region_dropdown_id, "value"), + ) + def _update(target_mutation, region_by: Optional[str]): + """Called every time an input changes""" + + fig = go.Figure() + + def normalize_location(loc): + """Normalize location names for consistent matching""" + return loc.lower().replace("-", "").replace(" ", "") + + # Split the gene-mutation value and calculate prevalence by region + gene, aa_change = target_mutation.split("-") + + if region_by is not None: + df = compute_variant_prevalence( + self.analysis_df.query("gene == @gene and aa_change == @aa_change"), + self.master_df, + [region_by], + ) + + # Normalize location names in the data + df[f"{region_by}_normalized"] = df[region_by].apply(normalize_location) + + # Get the appropriate GeoJSON data based on the region selection + if region_by not in self.geojson_data: + raise ValueError( + f"No GeoJSON data available for region type: {region_by}" + ) + + # Create a mapping from normalized names to original GeoJSON names + geojson_name_map = { + normalize_location(feat["properties"]["shapeName"]): feat[ + "properties" + ]["shapeName"] + for feat in self.geojson_data[region_by]["features"] + } + + geojson_regions = set(geojson_name_map.keys()) + metadata_regions = set(df[f"{region_by}_normalized"]) + + not_in_geojson = metadata_regions - geojson_regions + if not_in_geojson: + warnings.warn( + f"Some regions in metadata could not be mapped to a region in the geojson: {not_in_geojson}.\nDo you mean any of: {geojson_regions}" + ) + + # Map the normalized names back to GeoJSON names for display + df[f"{region_by}_display"] = df[f"{region_by}_normalized"].map( + {k: v for k, v in geojson_name_map.items()} + ) + + # Add choropleth layer + fig.add_trace( + go.Choroplethmapbox( + geojson=self.geojson_data[region_by], + locations=df[f"{region_by}_display"], + z=df["prevalence"], + colorscale="Spectral_r", + zmin=0, + zmax=100, + marker_opacity=0.8, + marker_line_width=1.0, + featureidkey="properties.shapeName", + customdata=np.stack( + [ + 
df["n_samples"], + df["n_passed"], + df["n_mut"] + df["n_mixed"], + ], + axis=-1, + ), + hovertemplate=( + "%{location}
" + + "Prevalence: %{z:.1f}%
" + + "Samples: %{customdata[1]}
" + + "Mutations: %{customdata[2]}
" + + "" + ), + ) + ) + + # Add sample site markers if we have location coordinates + if self.location_coords is not None: + # Case-insensitive column matching for the location join + master_location_col = "location" + coords_location_col = "location" + + # Group by location to get sample counts and average prevalence + site_data = compute_variant_prevalence( + self.analysis_df.query("gene == @gene and aa_change == @aa_change"), + self.master_df, + [master_location_col], + ) + + # Create copies with lowercase location names for case-insensitive join + site_data["location_normalized"] = site_data[ + master_location_col + ].str.lower() + + coords_df = self.location_coords.copy() + coords_df["location_normalized"] = coords_df[ + coords_location_col + ].str.lower() + + coords_df.rename( + columns={coords_location_col: "location_display"}, inplace=True + ) + + # Merge using lowercase columns + merged_df = pd.merge( + site_data, + coords_df, + on="location_normalized", + how="inner", + ) + + # --- Scale bubbles by area --- + max_size = 40 # max bubble diameter (px) + min_size = 5 # min bubble diameter (px) + + # Scale radius ~ sqrt(sample_size) + merged_df["scaled_size"] = ( + np.sqrt(merged_df["n_passed"] / merged_df["n_passed"].max()) + * max_size + ) + merged_df["scaled_size"] = merged_df["scaled_size"].clip(lower=min_size) + # Add scatter markers for sample sites + fig.add_trace( + go.Scattermapbox( + lat=merged_df["lat"], + lon=merged_df["lng"], + mode="markers", + marker=dict( + size=merged_df["scaled_size"], + color=merged_df["prevalence"], # Color by prevalence + colorscale="Spectral_r", + cmin=0, + cmax=100, + showscale=False, # Don't show a second colorbar + ), + text=merged_df.apply( + lambda row: f"{row['location_display']}
" + f"Site Samples: {row['n_passed']}
" + f"Site Prevalence: {row['prevalence']:.1f}%
" + f"Site Mutations: {row['n_mut'] + row['n_mixed']}", + axis=1, + ), + hoverinfo="text", + ) + ) + + fig.update_layout( + mapbox_style="carto-positron", + mapbox=dict( + center=dict(lat=self.map_center[0], lon=self.map_center[1]) + if self.map_center is not None + else None, + zoom=self.map_zoom_level + if self.map_zoom_level is not None + else None, + ), + margin={"r": 0, "t": 0, "l": 0, "b": 0}, + height=600, + ) + + return fig diff --git a/src/nomadic/summarize/dashboard/translations/en.yml b/src/nomadic/summarize/dashboard/translations/en.yml new file mode 100644 index 0000000..1b368fc --- /dev/null +++ b/src/nomadic/summarize/dashboard/translations/en.yml @@ -0,0 +1,14 @@ +en: + passing: "Passing QC" + failing: "Failing QC" + not_sequenced: "Not sequenced" + mean_cov_field: "Mean coverage field samples" + mean_cov_neg: "Mean coverage negative controls" + per_field_passing: "Field samples passing QC (%%)" + per_field_contam: "Field samples with contamination (%%)" + per_field_lowcov: "Field samples with low coverage (%%)" + mean_cov_field_tooltip: "The average mean coverage per amplicon across all field samples." + mean_cov_neg_tooltip: "The average mean coverage per amplicon across all negative control samples." + per_field_passing_tooltip: "The percentage of field samples that passed quality control (QC) for this amplicon i.e. has minimum coverage and no (or minimal) contamination" + per_field_contam_tooltip: "The percentage of field samples with contamination for this amplicon. Contamination is where negative controls have 50x(default) mean coverage, or the mean coverage of negative controls is more than 10%%(default) of the sample coverage" + per_field_lowcov_tooltip: "The percentage of field samples that had low coverage for this amplicon. This is defined as having mean coverage less than 50x(default)." 
diff --git a/src/nomadic/summarize/main.py b/src/nomadic/summarize/main.py new file mode 100644 index 0000000..c5c3cb2 --- /dev/null +++ b/src/nomadic/summarize/main.py @@ -0,0 +1,773 @@ +import glob +import os +import shutil +from typing import Iterable, Optional +from warnings import warn +from enum import StrEnum, auto +from collections import Counter +from pathlib import Path + +import pandas as pd +import numpy as np + +from nomadic.summarize.compute import ( + calc_amplicons_summary, + calc_samples_summary, + compute_variant_prevalence, + filter_false_positives, + gene_deletion_prevalence_by, + gene_deletions, +) +from nomadic.summarize.dashboard.builders import BasicSummaryDashboard +from nomadic.util.panel import get_panel_settings +from nomadic.util.workspace import Workspace +from nomadic.util.dirs import produce_dir +from nomadic.util.regions import RegionBEDParser +from nomadic.util.experiment import ( + get_summary_files, + check_experiment_outputs, +) +from nomadic.util.logging_config import LoggingFascade +from nomadic.util.summary import Settings, get_master_columns_mapping, load_settings + + +# -------------------------------------------------------------------------------- +# Check for completion and consistency across experiments +# +# -------------------------------------------------------------------------------- + + +def check_regions_consistent(expt_regions: list[RegionBEDParser]) -> None: + """ + Check that the regions are consistent across all experiment directories + + TODO: + - Might make sense to *extract* the region that was used and save it; + + """ + if len(expt_regions) == 0: + # Nothing to check + return + base = expt_regions[0] + for r in expt_regions: + if not (r.df == base.df).all().all(): + raise ValueError( + "Different regions used across experiments, this is not supported. Check region BED files are the same." + ) + + +def check_calling_consistent(expt_callers: list[str]) -> Optional[str]: + """ + Check that the same variant caller was used across all experiments, + where `expt_callers` is a list of used variant callers + """ + if len(expt_callers) == 0: + return None + caller_counts = Counter([caller for caller in expt_callers]) + if len(caller_counts) > 1: + raise ValueError( + "Found more than one variant caller used across experiments: " + + f"{', '.join([f'{v} experiment(s) used {c}' for c, v in caller_counts.items()])}." + ) + return caller_counts.most_common()[0][0] + + +def get_shared_metadata_columns( + metadata_dfs: list[pd.DataFrame], + fixed_columns: list[str] = ["expt_name", "barcode", "sample_id", "sample_type"], +) -> list[str]: + """Get metadata columns that are shared acrossa all experiments""" + + shared_columns = set(metadata_dfs[0].columns) + for df in metadata_dfs[1:]: + shared_columns.intersection_update(df.columns) + shared_columns.difference_update(fixed_columns) # why am I doing this? 
+ return list(shared_columns) + + +# -------------------------------------------------------------------------------- +# Throughput +# +# -------------------------------------------------------------------------------- + + +def compute_throughput(metadata: pd.DataFrame, add_unique: bool = True) -> pd.DataFrame: + """ + Compute a simple throughput crosstable + + Also add information about uniqueness + + """ + + throughput_df = pd.crosstab( + metadata["sample_type"], metadata["expt_name"], margins=True + ) + + if add_unique: + um = metadata.drop_duplicates("sample_id") + throughput_df.loc["field_unique"] = pd.crosstab( + um["sample_type"], um["expt_name"], margins=True + ).loc["field"] + + throughput_df.fillna(0, inplace=True) + throughput_df = throughput_df.astype(int) + + return throughput_df + + +def get_region_coverage_dataframe( + expt_dirs: Iterable[str], metadata: pd.DataFrame +) -> pd.DataFrame: + """ + Here we load a consolidated region coverage dataframe and include information required + for quality control + """ + + # Load coverage data + bed_dfs = [] + for expt_dir in expt_dirs: + bed_csv = get_summary_files(Path(expt_dir)).region_coverage + + bed_df = pd.read_csv(bed_csv) + bed_df.insert(0, "expt_name", os.path.basename(expt_dir)) + bed_df.query("barcode != 'unclassified'", inplace=True) + if "sample_id" in bed_df.columns: + bed_df.drop(columns=["sample_id"], inplace=True) + + # TODO: Do checks + bed_df = pd.merge( + left=metadata[["expt_name", "barcode", "sample_id", "sample_type"]], + right=bed_df, + on=["expt_name", "barcode"], + how="inner", # ensure we only take samples that are in the master metadata + ) + # TODO: Do checks + bed_dfs.append(bed_df) + concat_df = pd.concat(bed_dfs) + + # Get negative control data + neg_df = ( + concat_df.query("sample_type == 'neg'") + .groupby(["expt_name", "name"]) + .mean_cov.mean() + .reset_index() + .rename({"mean_cov": "mean_cov_neg"}, axis=1) + ) + + # TODO: do checks + coverage_df = pd.merge( + left=concat_df[ + ["expt_name", "barcode", "sample_id", "sample_type", "name", "mean_cov"] + ], # sample ID, will want it at some point + right=neg_df, + on=["expt_name", "name"], + how="left", + ) + # TODO: do checks + + return coverage_df + + +def calc_unknown_samples( + inventory_metadata: pd.DataFrame, master_metadata: pd.DataFrame +): + """ + Returns the sample ids that are not in the masterdata file. + """ + field_samples = inventory_metadata.query("sample_type == 'field'") + unknown_samples = ( + field_samples.loc[ + ~field_samples["sample_id"].isin(master_metadata["sample_id"]), + "sample_id", + ] + .unique() + .tolist() + ) + return unknown_samples + + +def calc_quality_control_columns( + df: pd.DataFrame, *, min_coverage: int = 50, max_contam: float = 0.1 +) -> None: + """ + Calculate columns evaluating whether samples have passed quality control + """ + + # Do we have enough coverage? 
+ df["fail_lowcov"] = df["mean_cov"] < min_coverage + + # Check if coverage of negative control exceeds `max_contam` + df["fail_contam_rel"] = df["mean_cov_neg"] / (df["mean_cov"] + 0.01) >= max_contam + df["fail_contam_abs"] = df["mean_cov_neg"] >= min_coverage + + df["fail_contam"] = ( + (df["fail_contam_rel"] & ~df["fail_lowcov"]) | df["fail_contam_abs"] + ) # If already failed low coverage, don't consider contamination, unless it's absolute threshold is exceeded + + # Finally, define passing + df["passing"] = ~df["fail_contam"] & ~df["fail_lowcov"] + + +# -------------------------------------------------------------------------------- +# Quality Control +# +# -------------------------------------------------------------------------------- + + +class QcStatus(StrEnum): + PASS = auto() + LOWCOV = auto() + CONTAM = auto() + DUPLICATE = auto() + CONTROL = auto() + + +def _add_qc_status_no_duplicates(df: pd.DataFrame) -> list[str]: + """ + Adds a status to each replicate/amplicon to see which ones passed QC + and if they didn't, why. + """ + status_strs = [] + for _, row in df.iterrows(): + status = [] + if row["sample_type"] in ["pos", "neg"]: + status.append(QcStatus.CONTROL) + else: + if row["fail_contam"]: + status.append(QcStatus.CONTAM) + if row["fail_lowcov"]: + status.append(QcStatus.LOWCOV) + if not status: + status.append(QcStatus.PASS) + status_strs.append(";".join(status)) + df["status"] = status_strs + + +def _mark_duplicates(df: pd.DataFrame) -> None: + """ + Mark all field samples as duplicates, if there is a better covered replicate for the same amplicon + Replicates marked as duplicate will not be used for prevalance evaluation. + """ + + def _update_duplicate(status: str, idx: int, keep_idx: int) -> str: + if idx == keep_idx: + return status + return f"{status};{QcStatus.DUPLICATE}" + + for (_, _), data in df.query("sample_type == 'field'").groupby( + ["sample_id", "name"] + ): + # Select an index to keep, i.e. the best sample that should + # marked as duplicate + passing = data["status"] == "pass" + if passing.sum() == 1: + keep_idx = passing.idxmax() + elif passing.sum() > 1: + keep_idx = data[passing]["mean_cov"].idxmax() # keep maximum coverage + else: # none are passing, arbrarily keep first + keep_idx = data.index[0] + + # NB: updating in-place + # must be a view, not a slice hence [,] + df.loc[data.index, "status"] = [ + _update_duplicate(status, idx, keep_idx) + for idx, status in data["status"].items() + ] + + +def add_quality_control_status_column(df: pd.DataFrame) -> None: + """ + Add a QC status column in-place + + Note: + - When we mark duplicates; we do it on an AMPLICON x SAMPLE level; + not on a per-sample level. So we could take amplicons from separate + samples to get the best data for that sample. 
+ + """ + _add_qc_status_no_duplicates(df) + _mark_duplicates(df) + + +# -------------------------------------------------------------------------------- +# Variant analysis +# +# -------------------------------------------------------------------------------- + + +def load_variant_summary_csv( + variants_csv: str, define_gene: bool = True +) -> pd.DataFrame: + """ + Load an clean `summary.variants.csv` data produced by `nomadic` + + """ + + # Settings + NUMERIC_COLUMNS = ["dp", "wsaf"] + UNPHASED_GT_TO_INT = { + "./.": -1, + "0/0": 0, + "0/1": 1, + "1/1": 2, + } # TODO What about multiallelic sites + + # Load + variants_df = pd.read_csv(variants_csv) + + # Reformat numeric columns to be floats + for c in NUMERIC_COLUMNS: + variants_df[c] = [float(v) if v != "." else None for v in variants_df[c]] + + # Reformat unphased genotypes as integers + variants_df.insert( + variants_df.columns.tolist().index("gt") + 1, + "gt_int", + variants_df["gt"].map(UNPHASED_GT_TO_INT), + ) + + # Optionally reformat amplicon name to gene; assuming like gene-... + if define_gene: + variants_df.insert( + variants_df.columns.get_loc("amplicon") + 1, + "gene", + [a.split("-")[0] for a in variants_df["amplicon"]], + ) + variants_df.insert( + variants_df.columns.get_loc("gene") + 1, + "mutation", + [ + f"{gene}-{aa_change}" + for gene, aa_change in zip( + variants_df["gene"], variants_df["aa_change"] + ) + ], + ) + + return variants_df + + +def load_and_concat_variants(expt_dirs: list[str]) -> pd.DataFrame: + """ + Load all of the variant calls for a set of experiment dirs + + Note that because we do note have the unfiltered VCF files, we have to do + some additional work in order to ensure all mutations are represented across + all experiments; + """ + + # Load data + variant_dfs = [] + for expt_dir in expt_dirs: + variant_csv = f"{expt_dir}/summary.variants.csv" + variant_df = load_variant_summary_csv(variant_csv) + variant_df.insert(0, "expt_name", os.path.basename(expt_dir)) + variant_df.query("barcode != 'unclassified'", inplace=True) + variant_dfs.append(variant_df) + variant_df = pd.concat(variant_dfs) + + # Get all unique mutations + MUT_COLUMNS = [ + "chrom", + "pos", + "ref", + "alt", + "strand", + "aa_change", + "aa_pos", + "mut_type", + "mutation", + "amplicon", + "gene", + ] + uniq_mutation_df = variant_df[MUT_COLUMNS].drop_duplicates() + + # Now we merge these back in for each barcode + # - By doing a 'right' merge, we make sure all variants are present for each barcode + # - The 'NaNs' that are present when the variant doesn't exist for that barcode + # get filled with zeros, i.e. we assume homozygous reference + # - In reality, it could have been EITHER 0/0 or ./. (i.e. 
filtered), but we handle + # this afterwards when we merge with QC data; + full_variant_dfs = [] + for (expt_name, barcode), bdf in variant_df.groupby(["expt_name", "barcode"]): + mdf = pd.merge(bdf, uniq_mutation_df, on=MUT_COLUMNS, how="right") + mdf["expt_name"] = expt_name + mdf["barcode"] = barcode + + # Filled by default with hom reference + mdf["gt"] = mdf["gt"].fillna("0/0") + mdf["gt_int"] = mdf["gt_int"].fillna(0.0) + mdf["wsaf"] = mdf["wsaf"].fillna(0.0) + + full_variant_dfs.append(mdf) + + return pd.concat(full_variant_dfs) + + +def replicates_qc( + coverage_df: pd.DataFrame, REPLICATE_PASSING_THRESHOLD: float +) -> pd.DataFrame: + """ + Calculates which of the replicates (repeated runs of a sample) passed QC as a whole + (more than REPLICATE_PASSING_THRESHOLD passed) + """ + replicates_qc_df = ( + coverage_df.query("sample_type == 'field'") + .groupby(["expt_name", "barcode", "sample_id"]) + .agg( + n_amplicons=pd.NamedAgg("name", "count"), + n_passing=pd.NamedAgg("passing", "sum"), + n_fail_contam=pd.NamedAgg("fail_contam", "sum"), + n_fail_lowcov=pd.NamedAgg("fail_lowcov", "sum"), + ) + .reset_index() + ) + replicates_qc_df["passing"] = ( + replicates_qc_df["n_passing"] / replicates_qc_df["n_amplicons"] + >= REPLICATE_PASSING_THRESHOLD + ) + + return replicates_qc_df + + +def replicates_amplicon_qc(coverage_df): + return coverage_df.query("sample_type == 'field'") + + +# -------------------------------------------------------------------------------- +# Main +# +# -------------------------------------------------------------------------------- + + +def main( + *, + workspace: Workspace, + output_dir: Path, + expt_dirs: tuple[str], + summary_name: str, + metadata_path: Optional[Path], + settings_file_path: Path, + show_dashboard: bool = True, + prevalence_by: list[str], + no_master_metadata: bool = False, + qc_min_coverage: int, + qc_max_contam: float, +) -> None: + """ + Define the main function for the summary analysis + + + """ + + assert (metadata_path is not None) or no_master_metadata + + produce_dir(str(output_dir)) + + # PARSE EXPERIMENT DIRECTORIES + log = LoggingFascade(logger_name="nomadic") + log.info("Input parameters:") + log.info(f" Summary Name: {summary_name}") + if not no_master_metadata: + log.info(f" Master metadata: {metadata_path}") + else: + log.info(" No master metadata will be used.") + log.info(f" Setting file: {settings_file_path}") + log.info(f" Found {len(expt_dirs)} experiment directories.") + + # Check experiments are complete + expts = [check_experiment_outputs(expt_dir) for expt_dir in expt_dirs] + log.info(" All experiments are complete.") + + # Check experiments are consistent + check_regions_consistent([expt.regions for expt in expts]) + log.info(" All experiments use the same regions.") + if expts: + panel_name = expts[0].regions.name + else: + panel_name = "Unknown" + log.info(f" Panel used: {panel_name}") + caller = check_calling_consistent([expt.caller for expt in expts]) + log.info(f" All experiments use same variant caller: {caller}") + + panel_settings = get_panel_settings(panel_name) + log.info(f" Loaded panel settings for panel '{panel_settings.name}'.") + + settings: Settings = Settings() + if settings_file_path.exists(): + settings = load_settings(settings_file_path) + log.info(f" Loaded summary settings from {settings_file_path}.") + + # CHECK METADATA IS VALID + FIXED_COLUMNS = ["expt_name", "barcode", "sample_id", "sample_type"] + shared_columns = get_shared_metadata_columns( + [expt.metadata for expt in expts], 
fixed_columns=FIXED_COLUMNS + ) + log.info(" All metadata tables pass completion checks.") + log.info( + f" Found {len(shared_columns)} non-essential shared columns across all metadata files: {', '.join(shared_columns)}" + ) + + # for now we use the master metadata file + inventory_metadata = pd.concat([expt.metadata[FIXED_COLUMNS] for expt in expts]) + if metadata_path is not None and not no_master_metadata: + master_metadata = pd.read_csv(metadata_path).rename( + columns=get_master_columns_mapping(settings) + ) + else: + # create metadata from experiment meta data files + shared_columns = ["sample_id"] + list(shared_columns) + master_metadata = pd.concat( + [ + expt.metadata.query("sample_type == 'field'")[shared_columns] + for expt in expts + ] + ) + + master_metadata = master_metadata.astype( + {"sample_id": "str"} + ) # ensure sample IDs are strings + inventory_metadata = inventory_metadata.astype( + {"sample_id": "str"} + ) # ensure sample IDs are strings + # strip whitespaces from sample IDs + inventory_metadata["sample_id"] = inventory_metadata["sample_id"].str.strip() + master_metadata["sample_id"] = master_metadata["sample_id"].str.strip() + + unknown_samples = calc_unknown_samples(inventory_metadata, master_metadata) + if unknown_samples: + warn( + f"Samples in experiments that are not in master metadata: {unknown_samples}" + ) + + # Delete unknown samples + inventory_metadata["status"] = np.where( + inventory_metadata["sample_id"].isin(unknown_samples), "unknown", "known" + ) + # set all controls + inventory_metadata["status"] = np.where( + inventory_metadata["sample_type"].isin(["pos", "neg"]), + "control", + inventory_metadata["status"], + ) + inventory_metadata.to_csv(f"{output_dir}/inventory.csv", index=False) + inventory_metadata = inventory_metadata.query("status != 'unknown'") + + if len(inventory_metadata.query("sample_type == 'field'")) == 0: + log.info("No known field samples, exiting...") + return + + # Throughput data + log.info("Overall sequencing throughput:") + throughput_df = compute_throughput(inventory_metadata) + log.info(f" Positive controls: {throughput_df.loc['pos', 'All']}") + log.info(f" Negative controls: {throughput_df.loc['neg', 'All']}") + log.info(f" Fields samples sequenced (total): {throughput_df.loc['field', 'All']}") + log.info(f" Field samples (unique): {throughput_df.loc['field_unique', 'All']}") + log.info(f" Unknown samples (excluded): {len(unknown_samples)}") + throughput_df.to_csv(f"{output_dir}/summary.throughput.csv", index=True) + + # Now let's evaluate coverage + coverage_df = get_region_coverage_dataframe(expt_dirs, inventory_metadata) + calc_quality_control_columns( + coverage_df, min_coverage=qc_min_coverage, max_contam=qc_max_contam + ) + + log.info("Amplicon-Sample QC Statistics:") + field_coverage_df = coverage_df.query("sample_type == 'field'") + n = field_coverage_df.shape[0] + n_lowcov = field_coverage_df["fail_lowcov"].sum() + n_contam = field_coverage_df["fail_contam"].sum() + n_pass = field_coverage_df["passing"].sum() + log.info( + f" Coverage below <{qc_min_coverage}x: {n_lowcov} ({100 * n_lowcov / n:.2f}%)" + ) + log.info( + f" Contamination >{qc_max_contam}: {n_contam} ({100 * n_contam / n:.2f}%)" + ) + log.info(f" Passing QC: {n_pass} ({100 * n_pass / n:.2f}%)") + add_quality_control_status_column(coverage_df) + log.info(str(coverage_df["status"].value_counts())) + coverage_df.to_csv(f"{output_dir}/summary.coverage.csv", index=False) + + REPLICATE_PASSING_THRESHOLD = 0.8 + replicates_qc_df = replicates_qc(coverage_df, 
REPLICATE_PASSING_THRESHOLD) + replicates_qc_df.to_csv(f"{output_dir}/summary.replicates_qc.csv", index=False) + + samples_summary_df = calc_samples_summary(master_metadata, replicates_qc_df) + samples_summary_df.to_csv(f"{output_dir}/summary.samples_qc.csv", index=False) + + samples_by_amplicon_summary_df = calc_amplicons_summary( + master_metadata, replicates_amplicon_qc(coverage_df) + ) + samples_by_amplicon_summary_df.to_csv( + f"{output_dir}/summary.samples_amplicons_qc.csv", index=False + ) + + final_df = ( + coverage_df.query("sample_type == 'field'") + .groupby(["expt_name", "name"]) + .agg( + mean_cov_field=pd.NamedAgg("mean_cov", "median"), + mean_cov_neg=pd.NamedAgg("mean_cov_neg", "median"), + n_field=pd.NamedAgg("barcode", len), + n_field_passing=pd.NamedAgg("passing", lambda x: x.sum()), + per_field_contam=pd.NamedAgg("fail_contam", lambda x: 100 * x.mean()), + per_field_lowcov=pd.NamedAgg("fail_lowcov", lambda x: 100 * x.mean()), + per_field_passing=pd.NamedAgg("passing", lambda x: 100 * x.mean()), + ) + .reset_index() + ) + final_df.to_csv(f"{output_dir}/summary.experiments_qc.csv", index=False) + + # -------------------------------------------------------------------------------- + # Let's move onto to variant calling results + # + # -------------------------------------------------------------------------------- + + log.info("Loading variants...") + variant_df = load_and_concat_variants(expt_dirs) + + if "sample_id" in variant_df.columns: + variant_df.drop(columns=["sample_id"], inplace=True) + # Merge with the quality control results, then we can subset to the analysis set + variant_df = pd.merge( + left=coverage_df.rename({"name": "amplicon"}, axis=1)[ + ["expt_name", "barcode", "sample_id", "sample_type", "amplicon", "status"] + ], + right=variant_df, + on=["expt_name", "barcode", "amplicon"], + ) + + log.info("Filtering to analysis set...") + remove_amplicons = panel_settings.excluded_amplicons # noqa: F841 later used in query + remove_mutations = panel_settings.filtered_mutations # noqa: F841 later used in query + analysis_df = ( + variant_df.query("status == 'pass'") + .query("mut_type == 'missense'") + .query("amplicon not in @remove_amplicons") + .query("mutation not in @remove_mutations") + ) + + # Filter out false positives + analysis_df = filter_false_positives(analysis_df) + analysis_df.to_csv(f"{output_dir}/summary.variants.analysis_set.csv", index=False) + + # Then we will compute prevalence + prev_df = compute_variant_prevalence(analysis_df) + prev_df.to_csv(f"{output_dir}/summary.variants.prevalence.csv", index=False) + + for col in prevalence_by: + prev_by_col_df = compute_variant_prevalence(analysis_df, master_metadata, [col]) + prev_by_col_df.to_csv( + f"{output_dir}/summary.variants.prevalence-{col}.csv", index=False + ) + + # -------------------------------------------------------------------------------- + # Gene deletion analysis + # + # -------------------------------------------------------------------------------- + + if panel_settings.deletion_genes: + log.info("Calculate gene deletions...") + gene_deletion_df = gene_deletions(coverage_df, panel_settings.deletion_genes) + gene_deletion_df.to_csv(f"{output_dir}/summary.gene_deletions.csv", index=False) + + prev_gen_deletions_df = gene_deletion_prevalence_by( + gene_deletion_df, master_metadata, [] + ) + prev_gen_deletions_df.to_csv( + f"{output_dir}/summary.gene-deletions.prevalence.csv", index=False + ) + + for col in prevalence_by: + prev_gen_deletion_by_col_df = gene_deletion_prevalence_by( + 
gene_deletion_df, master_metadata, [col] + ) + prev_gen_deletion_by_col_df.to_csv( + f"{output_dir}/summary.gene-deletions.prevalence-{col}.csv", index=False + ) + + master_metadata.to_csv(f"{output_dir}/metadata.csv", index=False) + + log.info("Copy relevant files to summary output directory...") + + # Copy relevant files + if expts: + shutil.copy( + expts[0].regions.path, + os.path.join(output_dir, os.path.basename(expts[0].regions.path)), + ) + for file in glob.glob(f"{workspace.get_metadata_dir()}/{summary_name}*.geojson"): + shutil.copy(file, f"{output_dir}/{file.split('-')[-1]}") + coords_file = f"{workspace.get_metadata_dir()}/{summary_name}.coords.csv" + if os.path.isfile(coords_file): + shutil.copy( + coords_file, + os.path.join(output_dir, "coords.csv"), + ) + summary_settings_file = workspace.get_summary_settings_file(summary_name) + if os.path.isfile(summary_settings_file): + shutil.copy( + summary_settings_file, + os.path.join(output_dir, "settings.yaml"), + ) + + log.info("Summary analysis complete.") + + # -------------------------------------------------------------------------------- + # Dashboard + # + # -------------------------------------------------------------------------------- + + if show_dashboard: + view(output_dir, summary_name) + + +def view(input_dir: Path, summary_name: str) -> None: + """ + View the summary dashboard for a given summary + """ + print(f'View summary dashboard for "{summary_name}".') + settings: Settings = Settings() + settings_file = Path(f"{input_dir}/settings.yaml") + if settings_file.exists(): + print(f"Loading settings from {settings_file}...") + settings = load_settings(settings_file) + + bed_files = glob.glob(f"{input_dir}/*.bed") + if bed_files: + panel_name = os.path.basename(bed_files[0]).split(".")[0] + print(f"Use panel name from regions BED file: {panel_name}") + amplicons = RegionBEDParser(bed_files[0]).names + else: + raise ValueError("No regions BED file found in summary directory.") + + panel_settings = get_panel_settings(panel_name) + amplicon_sets = panel_settings.amplicon_sets + deletion_genes = panel_settings.deletion_genes + + print(f"Load data from {input_dir}...") + + dashboard = BasicSummaryDashboard( + summary_name, + throughput_csv=f"{input_dir}/summary.throughput.csv", + samples_csv=f"{input_dir}/summary.samples_qc.csv", + samples_amplicons_csv=f"{input_dir}/summary.samples_amplicons_qc.csv", + coverage_csv=f"{input_dir}/summary.experiments_qc.csv", + analysis_csv=f"{input_dir}/summary.variants.analysis_set.csv", + gene_deletions_csv=f"{input_dir}/summary.gene_deletions.csv", + master_csv=f"{input_dir}/metadata.csv", + geojson_glob=f"{input_dir}/*.geojson", + location_coords_csv=f"{input_dir}/coords.csv", + settings=settings, + amplicons=amplicons, + amplicon_sets=amplicon_sets, + deletion_genes=deletion_genes, + ) + + print("") + print("Launching dashboard (press CNTRL+C to exit):") + print("") + debug = bool(os.getenv("NOMADIC_DEBUG")) + dashboard.run(debug=debug, auto_open=not debug) diff --git a/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-format.csv b/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-format.csv index 95bbe63..91fbb0b 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-format.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-format.csv @@ -1,6 +1,7 @@ -barcode,sample_id,parasite_per_ul,location,platform -barcode_01,3D7 (NB01),10000,Oxford,flongle -barcode_02,Dd2 (NB02),10000,Oxford,flongle -barcode_03,CamC580Y 
(NB03),10000,Oxford,flongle -barcode_04,GB4 (NB04),10000,Oxford,flongle -barcode_05,HB3 (NB05),10000,Oxford,flongle +barcode,sample_id,sample_type,location,parasitemia,postclean_qubit +barcode_01,3D7,pos,berlin,1000,87 +barcode_02,HB3,pos,berlin,100,34 +barcode_03,NTC,neg,berlin,0,0 +barcode_04,DBS-A01,field,berlin,543,44 +barcode_05,DBS-A02,field,berlin,7583,85 +barcode_06,DBS-A03,field,berlin,349,32 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-int.csv b/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-int.csv index a30c2d5..8cf2967 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-int.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_badbarcode-int.csv @@ -1,6 +1,7 @@ -barcode,sample_id,parasite_per_ul,location,platform -1,3D7 (NB01),10000,Oxford,flongle -2,Dd2 (NB02),10000,Oxford,flongle -3,CamC580Y (NB03),10000,Oxford,flongle -4,GB4 (NB04),10000,Oxford,flongle -5,HB3 (NB05),10000,Oxford,flongle +barcode,sample_id,sample_type,location,parasitemia,postclean_qubit +1,3D7,pos,berlin,1000,87 +2,HB3,pos,berlin,100,34 +3,NTC,neg,berlin,0,0 +4,DBS-A01,field,berlin,543,44 +5,DBS-A02,field,berlin,7583,85 +6,DBS-A03,field,berlin,349,32 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_badheader.csv b/src/nomadic/util/_test_data/metadata/sample_info_badheader.csv index 9b24a43..2fa0660 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_badheader.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_badheader.csv @@ -1,6 +1,7 @@ -barcode;sample_id,parasite_per_ul,location;platform;number -barcode01;3D7 (NB01);10000;Oxford;flongle;8,4 -barcode02;Dd2 (NB02);10000;Oxford;flongle;2,0 -barcode03;CamC580Y (NB03);10000;Oxford;flongle;4,3 -barcode04;GB4 (NB04);10000;Oxford;flongle;5,4 -barcode05;HB3 (NB05);10000;Oxford;flongle;6,1 +barcode;sample_id,sample_type,location;parasitemia;postclean_qubit +barcode01;3D7;pos;berlin;1000;87 +barcode02;HB3;pos;berlin;100;34 +barcode03;NTC;neg;berlin;0;0 +barcode04;DBS-A01;field;berlin;543;44 +barcode05;DBS-A02;field;berlin;7583;85 +barcode06;DBS-A03;field;berlin;349;32 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_case.csv b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_case.csv index ec8542a..40456f0 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_case.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_case.csv @@ -1,6 +1,7 @@ -BARCODE,SAMPLE_ID,parasite_per_ul,location,platform -barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode05,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +BARCODE,SAMPLE_ID,SAMPLE_TYPE,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_other.csv b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_other.csv index b42e42d..83630dc 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_other.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_other.csv @@ -1,6 +1,7 @@ -Barcodes,sample-ids,parasite_per_ul,location,platform 
-barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode05,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +Barcode,sample-ids,sample-types,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_plural.csv b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_plural.csv index ee1b60a..6b93fb9 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_plural.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_plural.csv @@ -1,6 +1,7 @@ -barcodes,SAMPLE_IDS,parasite_per_ul,location,platform -barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode05,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +barcodes,SAMPLE_IDS,sample types,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_sample.csv b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_sample.csv index 69b1a18..11338f5 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_sample.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_sample.csv @@ -1,6 +1,7 @@ -Barcode,Sample,parasite_per_ul,location,platform -barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode05,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +Barcode,Sample,type,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_space.csv b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_space.csv index 2c36468..7e69597 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_space.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_space.csv @@ -1,6 +1,7 @@ -Barcode,Sample ID -Barcode1,3D7 (NB01) -Barcode2,Dd2 (NB02) -Barcode3,CamC580Y (NB03) -Barcode4,GB4 (NB04) -Barcode5,HB3 (NB05) \ No newline at end of file +barcode,sample id,sample type,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_underscore.csv 
b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_underscore.csv index daa1046..c61390a 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_column_correction_underscore.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_column_correction_underscore.csv @@ -1,6 +1,7 @@ -Barcode,sample_ID,parasite_per_ul,location,platform -Barcode1,3D7 (NB01),10000,Oxford,flongle -Barcode2,Dd2 (NB02),10000,Oxford,flongle -Barcode3,CamC580Y (NB03),10000,Oxford,flongle -Barcode4,GB4 (NB04),10000,Oxford,flongle -Barcode5,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +barcode,sample_ID,sample_type,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_dupbarcode.csv b/src/nomadic/util/_test_data/metadata/sample_info_dupbarcode.csv index fb381ed..2da2d0a 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_dupbarcode.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_dupbarcode.csv @@ -1,6 +1,7 @@ -barcode,sample_id,parasite_per_ul,location,platform -barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode04,HB3 (NB05),10000,Oxford,flongle +barcode,sample_id,sample_type,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 +barcode05,DBS-A02,field,berlin,7583,85 +barcode05,DBS-A03,field,berlin,349,32 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_eurosep.csv b/src/nomadic/util/_test_data/metadata/sample_info_eurosep.csv index f65f448..9967cc7 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_eurosep.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_eurosep.csv @@ -1,6 +1,7 @@ -barcode;sample_id;parasite_per_ul;location;platform;number -barcode01;3D7 (NB01);10000;Oxford;flongle;8,4 -barcode02;Dd2 (NB02);10000;Oxford;flongle;2,0 -barcode03;CamC580Y (NB03);10000;Oxford;flongle;4,3 -barcode04;GB4 (NB04);10000;Oxford;flongle;5,4 -barcode05;HB3 (NB05);10000;Oxford;flongle;6,1 +barcode;sample_id;sample_type;location;parasitemia;postclean_qubit;number +barcode01;3D7;pos;berlin;1000;87;8,4 +barcode02;HB3;pos;berlin;100;34;2,0 +barcode03;NTC;neg;berlin;0;0;4,3 +barcode04;DBS-A01;field;berlin;543;44;5,4 +barcode05;DBS-A02;field;berlin;7583;85;6,1 +barcode06;DBS-A03;field;berlin;349;32;7,2 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_good.csv b/src/nomadic/util/_test_data/metadata/sample_info_good.csv index 758e1b3..574a558 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_good.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_good.csv @@ -1,6 +1,7 @@ -barcode,sample_id,parasite_per_ul,location,platform -barcode01,3D7 (NB01),10000,Oxford,flongle -barcode02,Dd2 (NB02),10000,Oxford,flongle -barcode03,CamC580Y (NB03),10000,Oxford,flongle -barcode04,GB4 (NB04),10000,Oxford,flongle -barcode05,HB3 (NB05),10000,Oxford,flongle \ No newline at end of file +barcode,sample_id,sample_type,location,parasitemia,postclean_qubit +barcode01,3D7,pos,berlin,1000,87 +barcode02,HB3,pos,berlin,100,34 +barcode03,NTC,neg,berlin,0,0 +barcode04,DBS-A01,field,berlin,543,44 
+barcode05,DBS-A02,field,berlin,7583,85 +barcode06,DBS-A03,field,berlin,349,32 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_hybridsep-col.csv b/src/nomadic/util/_test_data/metadata/sample_info_hybridsep-col.csv index dcb4aa8..db7eb93 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_hybridsep-col.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_hybridsep-col.csv @@ -1,6 +1,7 @@ -barcode,sample_id,parasite_per_ul,location,platform,extras -barcode01,3D7 (NB01),10000,Oxford,flongle,a;1;3 -barcode02,Dd2 (NB02),10000,Oxford,flongle,b;2;4 -barcode03,CamC580Y (NB03),10000,Oxford,flongle,c;3;3 -barcode04,GB4 (NB04),10000,Oxford,flongle,d;2;2 -barcode05,HB3 (NB05),10000,Oxford,flongle,e;1;1 +barcode,sample_id,sample_type,location,parasitemia,postclean_qubit,extras +barcode01,3D7,pos,berlin,1000,87,a;1;3 +barcode02,HB3,pos,berlin,100,34,b;2;4 +barcode03,NTC,neg,berlin,0,0,c;3;3 +barcode04,DBS-A01,field,berlin,543,44,d;2;2 +barcode05,DBS-A02,field,berlin,7583,85,e;1;1 +barcode06,DBS-A03,field,berlin,349,32,f;2;2 \ No newline at end of file diff --git a/src/nomadic/util/_test_data/metadata/sample_info_nobarcode.csv b/src/nomadic/util/_test_data/metadata/sample_info_nobarcode.csv index 35acba5..835a7ba 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_nobarcode.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_nobarcode.csv @@ -1,6 +1,7 @@ -sample_id,parasite_per_ul,location,platform -3D7 (NB01),10000,Oxford,flongle -Dd2 (NB02),10000,Oxford,flongle -CamC580Y (NB03),10000,Oxford,flongle -GB4 (NB04),10000,Oxford,flongle -HB3 (NB05),10000,Oxford,flongle +sample_id,sample_type,location,parasitemia,postclean_qubit +3D7,pos,berlin,1000,87 +HB3,pos,berlin,100,34 +NTC,neg,berlin,0,0 +DBS-A01,field,berlin,543,44 +DBS-A02,field,berlin,7583,85 +DBS-A03,field,berlin,349,32 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_onecolumn.csv b/src/nomadic/util/_test_data/metadata/sample_info_onecolumn.csv index ee54f69..797acbb 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_onecolumn.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_onecolumn.csv @@ -4,3 +4,4 @@ barcode02 barcode03 barcode04 barcode05 +barcode06 diff --git a/src/nomadic/util/_test_data/metadata/sample_info_semicolon.csv b/src/nomadic/util/_test_data/metadata/sample_info_semicolon.csv index ee61dc7..3388958 100644 --- a/src/nomadic/util/_test_data/metadata/sample_info_semicolon.csv +++ b/src/nomadic/util/_test_data/metadata/sample_info_semicolon.csv @@ -1,6 +1,7 @@ -barcode;sample_id;parasite_per_ul;location;platform -barcode01;3D7 (NB01);10000;Oxford;flongle -barcode02;Dd2 (NB02);10000;Oxford;flongle -barcode03;CamC580Y (NB03);10000;Oxford;flongle -barcode04;GB4 (NB04);10000;Oxford;flongle -barcode05;HB3 (NB05);10000;Oxford;flongle \ No newline at end of file +barcode;sample_id;sample_type;location;parasitemia;postclean_qubit +barcode01;3D7;pos;berlin;1000;87 +barcode02;HB3;pos;berlin;100;34 +barcode03;NTC;neg;berlin;0;0 +barcode04;DBS-A01;field;berlin;543;44 +barcode05;DBS-A02;field;berlin;7583;85 +barcode06;DBS-A03;field;berlin;349;32 \ No newline at end of file diff --git a/src/nomadic/util/dirs.py b/src/nomadic/util/dirs.py index f4286bc..4e9ded4 100644 --- a/src/nomadic/util/dirs.py +++ b/src/nomadic/util/dirs.py @@ -3,7 +3,7 @@ import platformdirs -def produce_dir(*args): +def produce_dir(*args) -> str: """ Produce a new directory by concatenating `args`, if it does not already exist diff --git 
a/src/nomadic/util/exceptions.py b/src/nomadic/util/exceptions.py index 632b242..15b64a7 100644 --- a/src/nomadic/util/exceptions.py +++ b/src/nomadic/util/exceptions.py @@ -1,5 +1,5 @@ class MetadataFormatError(Exception): - """Error in format / contents of a metadata file""" + """Error in format or contents of a metadata file""" pass diff --git a/src/nomadic/util/experiment.py b/src/nomadic/util/experiment.py index dfbc618..a92aca5 100644 --- a/src/nomadic/util/experiment.py +++ b/src/nomadic/util/experiment.py @@ -1,16 +1,26 @@ +import glob +import os +import json +import shutil +import pandas as pd +from pathlib import Path +from typing import NamedTuple, Any +from dataclasses import dataclass + from nomadic.util.dirs import produce_dir -from nomadic.util.metadata import MetadataTableParser +from nomadic.util.exceptions import MetadataFormatError +from nomadic.util.metadata import MetadataTableParser, ExtendedMetadataTableParser from nomadic.util.regions import RegionBEDParser -import os -import shutil -from typing import NamedTuple + +# -------------------------------------------------------------------------------- +# Handle summary file names: legacy and current +# +# -------------------------------------------------------------------------------- class SummaryFiles(NamedTuple): - """ - Named tuple to hold paths to summary files. - """ + """Define summary file names / paths""" fastqs_processed: str read_mapping: str @@ -20,8 +30,8 @@ class SummaryFiles(NamedTuple): # Currently used summary file names -default_config_path = "config/defaults.json" -summary_files = SummaryFiles( +DEFAULT_CONFIG_PATH = "config/defaults.json" +SUMMARY_NAMES = SummaryFiles( fastqs_processed="summary.fastqs_processed.csv", read_mapping="summary.read_mapping.csv", region_coverage="summary.region_coverage.csv", @@ -30,7 +40,7 @@ class SummaryFiles(NamedTuple): ) # Legacy summary file names for backward compatibility -legacy_summary_files = SummaryFiles( +SUMMARY_NAMES_LEGACY = SummaryFiles( fastqs_processed="summary.fastq.csv", read_mapping="summary.bam_flagstats.csv", region_coverage="summary.bedcov.csv", @@ -39,6 +49,30 @@ class SummaryFiles(NamedTuple): ) +def get_summary_files(expt_dir: Path) -> SummaryFiles: + """ + Determine whether the summary files are use the legacy or current names, + and return a SummaryFiles object with the appropriate file names + + """ + + if not expt_dir.exists(): + raise FileNotFoundError(f"Experiment path does not exist: {expt_dir}") + + if (expt_dir / SUMMARY_NAMES_LEGACY.read_mapping).exists(): + # Detect legacy format using *one* of the differentiating file names + format_used = SUMMARY_NAMES_LEGACY + else: + format_used = SUMMARY_NAMES + return SummaryFiles(*[str(expt_dir / field) for field in format_used]) + + +# -------------------------------------------------------------------------------- +# Define experiment directories +# +# -------------------------------------------------------------------------------- + + class ExperimentDirectories: """ Put all the information about experimental @@ -97,38 +131,7 @@ def get_settings_file(self) -> str: return os.path.join(self.metadata_dir, "settings.json") def get_summary_files(self) -> SummaryFiles: - if os.path.exists( - f"{self.approach_dir}/{legacy_summary_files.fastqs_processed}" - ): - # Use legacy summary files if the old format exists - return SummaryFiles( - fastqs_processed=os.path.join( - self.approach_dir, legacy_summary_files.fastqs_processed - ), - read_mapping=os.path.join( - self.approach_dir, 
legacy_summary_files.read_mapping - ), - region_coverage=os.path.join( - self.approach_dir, legacy_summary_files.region_coverage - ), - depth_profiles=os.path.join( - self.approach_dir, legacy_summary_files.depth_profiles - ), - variants=os.path.join(self.approach_dir, legacy_summary_files.variants), - ) - return SummaryFiles( - fastqs_processed=os.path.join( - self.approach_dir, summary_files.fastqs_processed - ), - read_mapping=os.path.join(self.approach_dir, summary_files.read_mapping), - region_coverage=os.path.join( - self.approach_dir, summary_files.region_coverage - ), - depth_profiles=os.path.join( - self.approach_dir, summary_files.depth_profiles - ), - variants=os.path.join(self.approach_dir, summary_files.variants), - ) + return get_summary_files(Path(self.approach_dir)) def _setup_metadata_dir( self, metadata: MetadataTableParser, regions: RegionBEDParser @@ -146,3 +149,126 @@ def _setup_metadata_dir( self.regions_bed = f"{self.metadata_dir}/{os.path.basename(regions.path)}" if not os.path.exists(self.regions_bed): shutil.copy(regions.path, self.regions_bed) + + +# -------------------------------------------------------------------------------- +# Checks on experiment outputs +# +# -------------------------------------------------------------------------------- + + +@dataclass +class ExperimentOutputs: + """Store information about outputs in `expt_dir`""" + + expt_dir: str # TODO: change to Path + metadata: pd.DataFrame + regions: RegionBEDParser + summary_files: SummaryFiles + settings: dict[str, Any] + + # Variant calling outputs + caller: str + has_complete_vcf: bool + has_filtered_vcf: bool + + +def find_metadata( + expt_dir: str, Parser: type[MetadataTableParser] = MetadataTableParser +) -> MetadataTableParser: + """ + Given an experiment directory, search for the metadata CSV file in the + expected location and load it + + """ + + # In most cases, should match experiment name + csv = f"{expt_dir}/metadata/{os.path.basename(expt_dir)}.csv" + if os.path.exists(csv): + return Parser(csv) + + csv = glob.glob(f"{expt_dir}/metadata/*.csv") + if len(csv) == 1: + return Parser(csv[0]) + + raise ValueError( + f"Found {len(csv)} *.csv files in '{expt_dir}/metadata', cannot determine which is metadata." + ) + + +def find_regions(expt_dir: str) -> RegionBEDParser: + """ + Given an experiment directory, search for the region BED file in the + expected location + + """ + + bed = [ + f + for f in glob.glob(f"{expt_dir}/metadata/*.bed") + if f.endswith(".bed") and not f.endswith(".lowcomplexity_mask.bed") + ] + + if len(bed) == 1: + return RegionBEDParser(bed[0]) + + raise FileNotFoundError( + f"Expected one region BED file (*.bed) at '{expt_dir}/metadata', but found {len(bed)}." + ) + + +def check_experiment_outputs(expt_dir: str) -> ExperimentOutputs: + """For a given `expt_dir`, check what experiment outputs exist + + Will raise exceptions if data required for summarising is missing.
+ + """ + + # Existence of directory + if not os.path.isdir(expt_dir): + raise FileNotFoundError(f"Experiment directory {expt_dir} does not exist.") + + # Existence of metadata + try: + parser = find_metadata(expt_dir, Parser=ExtendedMetadataTableParser) + except MetadataFormatError as e: + raise MetadataFormatError(f"Error in metadata for '{expt_dir}': {e}") + metadata = parser.df + metadata.insert(0, "expt_name", os.path.basename(expt_dir)) + + # Existence of regions + regions = find_regions(expt_dir) + + # Existence of summary Files + summary_files = get_summary_files(Path(expt_dir)) + for file in summary_files: + if "depth" in file: + # depth files are optional, TODO: not so robust + continue + if "fastq" in file: + # fastq files are optional + continue + if not os.path.exists(file): + raise FileNotFoundError(f"Missing '{file}' file in {expt_dir}.") + + # Existence of settings / caller + settings_path = f"{expt_dir}/metadata/settings.json" + if not os.path.exists(settings_path): + settings = None + caller = "bcftools" # if no settings, was using bcftools + else: + settings = json.load(open(settings_path, "r")) + caller = settings["caller"] + + return ExperimentOutputs( + expt_dir=expt_dir, + metadata=metadata, + regions=regions, + summary_files=summary_files, + settings=settings, + caller=caller, + has_complete_vcf=os.path.exists(f"{expt_dir}/vcfs/summary.variants.vcf.gz"), + has_filtered_vcf=os.path.exists( + f"{expt_dir}/vcfs/summary.variants.filtered.annotated.vcf.gz" + ), + ) diff --git a/src/nomadic/util/metadata.py b/src/nomadic/util/metadata.py index 260febc..03c142a 100644 --- a/src/nomadic/util/metadata.py +++ b/src/nomadic/util/metadata.py @@ -32,6 +32,12 @@ def get_csv_delimiter(csv_path: str, delimiters: List[str] = [",", ";", "\t"]): return used[0] +# -------------------------------------------------------------------------------- +# Check validity of various columns in the metadata +# +# -------------------------------------------------------------------------------- + + def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str: """ Check that the format of a barcode is as expected, and optionally @@ -79,6 +85,75 @@ def correct_barcode_format(barcode: str, try_to_fix: bool = True) -> str: return barcode +def check_sample_type_format(sample_type: str, try_to_fix: bool = False) -> str: + """ + Check that the format of the `sample_type` column is correct, and optionally + try to fix if it is not + + """ + + # Settings + EXPECTED = ["field", "pos", "neg"] + KNOWN_POS_CONTROLS = [ + "3D7", + "Dd2", + "HB3", + "GB4", + "7G8", + "NF54", + "FCR3", + ] # some smaller ones could be field substrings, e.g. K1, W2 + + if pd.isna(sample_type): + raise MetadataFormatError( + "Missing information in the 'sample_type' column. Please ensure it is complete." + ) + + if not isinstance(sample_type, str): + sample_type = str(sample_type) + + sample_type = sample_type.strip() # safe to do this in all cases + if sample_type in EXPECTED: + return sample_type + + if not try_to_fix: + raise MetadataFormatError( + f"Found a sample with type '{sample_type}' which is invalid. Please assign one of: {', '.join(EXPECTED)}." + ) + + # Raise a warning and proceed if we are fixing. + # warnings.warn( + # f"Found a sample with type '{sample_type}' which is invalid. Trying to fix..." 
+ # ) + + for e in EXPECTED: + if sample_type.lower() == e: # capitalisation issue + return e + if sample_type.lower().startswith(e): # added something to the end + return e + + for k in KNOWN_POS_CONTROLS: + if sample_type.lower() == k.lower(): + return "pos" + + for ( + e + ) in EXPECTED: # this can be dangerous, only do if other attempts haven't worked + if sample_type.lower().startswith(e[0]): + return e + + # Raise if couldn't fix + raise MetadataFormatError( + f"Found a sample with type '{sample_type}' which is invalid. Please assign one of: {', '.join(EXPECTED)}." + ) + + +# -------------------------------------------------------------------------------- +# Class(es) to parse metadata table(s) for various use cases, e.g. realtime +# analysis or summarizing +# -------------------------------------------------------------------------------- + + class MetadataTableParser: """ Parse the `metadata_csv` table, and make sure that it is formatted @@ -86,23 +161,14 @@ class MetadataTableParser: """ - REQUIRED_COLUMNS = ["barcode", "sample_id"] + REQUIRED_COLUMNS = ["barcode", "sample_id", "sample_type"] UNIQUE_COLUMNS = ["barcode"] # If the required columns are not found, try these alternative names, case insensitive ALTERNATIVE_NAMES = { - "barcode": ["barcodes"], - "sample_id": [ - "sample", - "sampleid", - "sample-id", - "sample_id", - "sampleids", - "sample-ids", - "sample_ids", - "sample id", - "sample ids", - ], + "barcode": r"barcode[s]?", + "sample_id": r"sample[s]?[\-\_\s]?(id[s]?)?", + "sample_type": r"sample[s]?[\-\_\s]?(type[s]?)?", } def __init__(self, metadata_csv: str, include_unclassified: bool = True): @@ -119,16 +185,16 @@ def __init__(self, metadata_csv: str, include_unclassified: bool = True): self._correct_all_barcodes() self.barcodes = self.df["barcode"].tolist() - self.required_metadata = self.df[self.REQUIRED_COLUMNS].set_index("barcode") + self.sample_ids_df = self.df[["barcode", "sample_id"]].set_index("barcode") if include_unclassified: - self.required_metadata.loc["unclassified"] = ["unclassified"] + self.sample_ids_df.loc["unclassified"] = ["unclassified"] self.barcodes.append("unclassified") def get_sample_id(self, barcode: str) -> Optional[str]: if barcode == "unclassified": return barcode - metadata = self.required_metadata + metadata = self.sample_ids_df if barcode not in metadata.index: return None return metadata.loc[barcode].get("sample_id", None) @@ -143,13 +209,12 @@ def _correct_columns(self): for required_column in self.REQUIRED_COLUMNS: if required_column not in self.df.columns: - for alt in [ - required_column, - *self.ALTERNATIVE_NAMES.get(required_column, []), - ]: - if alt in normalized_column_names: + for normalized_column in normalized_column_names: + if re.fullmatch( + self.ALTERNATIVE_NAMES[required_column], normalized_column + ): column_name = self.df.columns[ - normalized_column_names.index(alt) + normalized_column_names.index(normalized_column) ] warnings.warn( f"Using column '{column_name}' as '{required_column}' in metadata CSV." 
@@ -181,5 +246,37 @@ def _check_entries_unique(self): ) observed_entries.append(entry) - def _correct_all_barcodes(self) -> List[str]: + def _correct_all_barcodes(self): self.df["barcode"] = [correct_barcode_format(b) for b in self.df["barcode"]] + + +class ExtendedMetadataTableParser(MetadataTableParser): + """ + Add requirement for sample type, and parse it + + # Should also check we have positive and negative controls in each experiment + + """ + + def __init__(self, metadata_csv: str, include_unclassified: bool = True): + super().__init__(metadata_csv, include_unclassified) + + self._check_sample_type() + + def _check_sample_type(self): + if "sample_type" not in self.df.columns: + raise MetadataFormatError( + "Metadata is missing the column 'sample_type'. " + "Please create this column and populate it with" + " 'field' (field sample), 'pos' (positive control) or 'neg' (negative control)" + " for each sample." + ) + self.df["sample_type"] = [ + check_sample_type_format(s, try_to_fix=True) for s in self.df["sample_type"] + ] + + sample_type_counts = self.df.sample_type.value_counts().to_dict() + if "neg" not in sample_type_counts: + raise MetadataFormatError("No negative control found for experiment!") + # if "pos" not in sample_type_counts: + # raise MetadataFormatError("No positive control found for experiment!") diff --git a/src/nomadic/util/metadata_test.py b/src/nomadic/util/metadata_test.py index 70a21e7..66cb491 100644 --- a/src/nomadic/util/metadata_test.py +++ b/src/nomadic/util/metadata_test.py @@ -57,22 +57,23 @@ def test_check_barcode_warning_error(barcode, try_to_fix, expected): @pytest.mark.parametrize( "csv_path,csv_shape", [ - (test_files_folder + "metadata/sample_info_good.csv", (5, 5)), - (test_files_folder + "metadata/sample_info_semicolon.csv", (5, 5)), - (test_files_folder + "metadata/sample_info_eurosep.csv", (5, 6)), - (test_files_folder + "metadata/sample_info_hybridsep-col.csv", (5, 6)), + (test_files_folder + "metadata/sample_info_good.csv", (6, 6)), + (test_files_folder + "metadata/sample_info_semicolon.csv", (6, 6)), + (test_files_folder + "metadata/sample_info_eurosep.csv", (6, 7)), + (test_files_folder + "metadata/sample_info_hybridsep-col.csv", (6, 7)), ], ) def test_metadata_correct(csv_path, csv_shape): metadata = MetadataTableParser(csv_path) + print(metadata) assert metadata.df.shape == csv_shape @pytest.mark.parametrize( "csv_path,csv_shape", [ - (test_files_folder + "metadata/sample_info_badbarcode-format.csv", (5, 5)), - (test_files_folder + "metadata/sample_info_badbarcode-int.csv", (5, 5)), + (test_files_folder + "metadata/sample_info_badbarcode-format.csv", (6, 6)), + (test_files_folder + "metadata/sample_info_badbarcode-int.csv", (6, 6)), ], ) def test_metadata_warns(csv_path, csv_shape): @@ -90,7 +91,7 @@ def test_metadata_warns(csv_path, csv_shape): ), ( test_files_folder + "metadata/sample_info_dupbarcode.csv", - "Column barcode must contain only unique entires, but barcode04 is duplicated.", + "Column barcode must contain only unique entires, but barcode05 is duplicated.", ), ( test_files_folder + "metadata/sample_info_nobarcode.csv", @@ -98,7 +99,7 @@ def test_metadata_warns(csv_path, csv_shape): ), ( test_files_folder + "metadata/sample_info_badheader.csv", - "Found multiple delimiters (, ;) in header: barcode;sample_id,parasite_per_ul,location;platform;number.", + "Found multiple delimiters (, ;) in header: barcode;sample_id,sample_type,location;parasitemia;postclean_qubit.", ), ], ) @@ -119,9 +120,11 @@ def test_metadata_errors(csv_path, 
error_msg): ], ) def test_metadata_column_corrections(csv_path): - meta_table = MetadataTableParser(csv_path) + with pytest.warns(UserWarning): + meta_table = MetadataTableParser(csv_path) assert "barcode" in meta_table.df.columns assert "sample_id" in meta_table.df.columns + assert "sample_type" in meta_table.df.columns assert meta_table.df["barcode"].tolist() == [ "barcode01", @@ -129,11 +132,21 @@ def test_metadata_column_corrections(csv_path): "barcode03", "barcode04", "barcode05", + "barcode06", ] assert meta_table.df["sample_id"].tolist() == [ - "3D7 (NB01)", - "Dd2 (NB02)", - "CamC580Y (NB03)", - "GB4 (NB04)", - "HB3 (NB05)", + "3D7", + "HB3", + "NTC", + "DBS-A01", + "DBS-A02", + "DBS-A03", + ] + assert meta_table.df["sample_type"].tolist() == [ + "pos", + "pos", + "neg", + "field", + "field", + "field", ] diff --git a/src/nomadic/util/panel.py b/src/nomadic/util/panel.py new file mode 100644 index 0000000..b4a277a --- /dev/null +++ b/src/nomadic/util/panel.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass + + +@dataclass +class PanelSettings: + """Settings for a panel in the summary.""" + + name: str + # List of amplicons to exclude from analysis set + excluded_amplicons: list[str] + # List of mutations to exclude from analysis set + filtered_mutations: list[str] + # Amplicon sets + amplicon_sets: dict[str, list[str]] + # List of genes for which to compute deletion prevalence + deletion_genes: list[str] + + +MVP_PANEL_SETTINGS = PanelSettings( + name="nomadsMVP", + excluded_amplicons=[ + "hrp2-p14-306", + "hrp3-p14-276", + ], + filtered_mutations=["crt-N75K"], + amplicon_sets={ + "Resistance": [ + "crt-p14-125", + "dhps-p317-707", + "dhfr-p1-410", + "kelch13-p383-727", + "mdr1-p968-1278", + "mdr1-p46-245", + ], + "Diversity": ["ama1-p74-384", "csp-p19-398"], + }, + # Don't show deletion genes at the moment, because the code is not robust + # deletion_genes=["hrp2", "hrp3"], + deletion_genes=[], +) + + +UNKNOWN_PANEL_SETTINGS = PanelSettings( + name="Unknown", + excluded_amplicons=[], + filtered_mutations=[], + amplicon_sets={}, + deletion_genes=[], +) + + +def get_panel_settings(panel_name: str) -> PanelSettings: + """Get panel settings by panel name.""" + if panel_name == "nomadsMVP": + return MVP_PANEL_SETTINGS + else: + return UNKNOWN_PANEL_SETTINGS diff --git a/src/nomadic/util/regions.py b/src/nomadic/util/regions.py index 12659a0..f730a01 100644 --- a/src/nomadic/util/regions.py +++ b/src/nomadic/util/regions.py @@ -1,3 +1,4 @@ +import os import seaborn as sns from matplotlib.colors import rgb2hex @@ -19,6 +20,7 @@ def __init__(self, bed_path: str): """Load the BED file, assign colors to regions""" self.path = bed_path + self.name = os.path.basename(bed_path).split(".")[0] self.df = load_bed_as_dataframe(bed_path) self.n_regions = self.df.shape[0] diff --git a/src/nomadic/util/summary.py b/src/nomadic/util/summary.py new file mode 100644 index 0000000..60ef25c --- /dev/null +++ b/src/nomadic/util/summary.py @@ -0,0 +1,39 @@ +from pathlib import Path +from typing import Optional + +import yaml +from pydantic import BaseModel + + +class ColumnSettings(BaseModel): + use_as: str + + +class MapSettings(BaseModel): + center: tuple[float, float] + zoom_level: int + + +class Settings(BaseModel): + master_columns: Optional[dict[str, ColumnSettings]] = None + map: Optional[MapSettings] = None + + +def load_settings(settings_file: Path) -> Settings: + """Load settings from a file.""" + data = yaml.safe_load(open(settings_file, "r")) + return Settings(**data) + + +def 
get_master_columns_mapping(settings: Settings) -> dict[str, str]:
+    if settings.master_columns is None:
+        return {}
+    return {col: column.use_as for col, column in settings.master_columns.items()}
+
+
+def get_map_settings(
+    settings: Settings,
+) -> tuple[Optional[tuple[float, float]], Optional[int]]:
+    if settings.map is None:
+        return None, None
+    return (settings.map.center, settings.map.zoom_level)
diff --git a/src/nomadic/util/workspace.py b/src/nomadic/util/workspace.py
index 207fe97..77ee151 100644
--- a/src/nomadic/util/workspace.py
+++ b/src/nomadic/util/workspace.py
@@ -73,6 +73,12 @@ def get_output_dir(self, experiment_name: str):
         """
         return os.path.join(self.get_results_dir(), experiment_name)

+    def get_summary_dir(self, summary_name: str):
+        """
+        Get the directory for a given summary within the workspace.
+        """
+        return os.path.join(self.path, "summaries", summary_name)
+
     def get_beds_dir(self):
         """
         Get the beds directory of the workspace.
@@ -91,6 +97,18 @@ def get_metadata_csv(self, experiment_name: str):
         """
         return os.path.join(self.get_metadata_dir(), f"{experiment_name}.csv")

+    def get_master_metadata_csv(self, summary_name: str):
+        """
+        Get the path to the master metadata CSV file for summaries.
+        """
+        return os.path.join(self.get_metadata_dir(), f"{summary_name}.csv")
+
+    def get_summary_settings_file(self, summary_name: str):
+        """
+        Get the path to the summary settings YAML file.
+        """
+        return os.path.join(self.get_metadata_dir(), f"{summary_name}.yaml")
+
     def get_bed_file(self, panel_name: str):
         """
         Get the path to the BED file for a given panel name.
@@ -119,3 +137,15 @@ def get_experiment_names(self):
             for name in os.listdir(self.get_results_dir())
             if os.path.isdir(os.path.join(self.get_results_dir(), name))
         ]
+
+    def get_experiment_dirs(self):
+        """
+        Get a list of available experiment directories in the workspace.
+        """
+        if not os.path.exists(self.get_results_dir()):
+            return []
+
+        return [
+            os.path.join(self.get_results_dir(), name)
+            for name in self.get_experiment_names()
+        ]
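
For context, a minimal usage sketch (illustrative only, not part of the patch) of the new summary settings helpers in src/nomadic/util/summary.py. The file name, column names, and use_as values below are assumptions for illustration, not values required by the code; in a workspace the file would normally live at the path returned by Workspace.get_summary_settings_file().

from pathlib import Path

from nomadic.util.summary import (
    load_settings,
    get_master_columns_mapping,
    get_map_settings,
)

# Hypothetical settings file with the structure expected by the Settings model
settings_file = Path("example_summary.yaml")
settings_file.write_text(
    "master_columns:\n"
    "  collection_date:\n"
    "    use_as: date\n"
    "  district:\n"
    "    use_as: location\n"
    "map:\n"
    "  center: [-1.95, 30.06]\n"
    "  zoom_level: 7\n"
)

settings = load_settings(settings_file)
print(get_master_columns_mapping(settings))  # {'collection_date': 'date', 'district': 'location'}
print(get_map_settings(settings))            # ((-1.95, 30.06), 7)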