JasonAHendry · berndbohmeier · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025 · Oct 11, 2025
diff --git a/build/conda/meta.yaml b/build/conda/meta.yaml
@@ -35,6 +35,7 @@ requirements:
     - pysam
     - pyyaml
     - i18nice
+    - pydantic
     # tools
     - minimap2
     - samtools >=1.20

diff --git a/environments/run.yml b/environments/run.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - python >= 3.10 
+  - python >= 3.10
   - minimap2
   - samtools >=1.20
   - bcftools >=1.20
@@ -19,5 +19,6 @@ dependencies:
   - platformdirs
   - pyyaml
   - i18nice
+  - pydantic
   - pip
   - rsync
diff --git a/scripts/run_realtime.sh b/scripts/run_realtime.sh
@@ -1,8 +1,7 @@
 # Example of how to run nomadic realtime
 # 2023/07/12, J.Hendry
 
-nomadic realtime \
--e 0000-00-00_example \
+nomadic realtime 0000-00-00_example \
 -f example_data/minknow/fastq_pass \
 -m example_data/metadata/sample_info.csv \
 -b example_data/beds/nomads8.amplicons.bed --call
diff --git a/setup.cfg b/setup.cfg
@@ -23,6 +23,7 @@ install_requires =
     pyyaml
     seaborn
     i18nice
+    pydantic
 
 python_requires = >=3.10
 zip_safe = no
@@ -31,6 +32,9 @@ zip_safe = no
 nomadic.realtime.dashboard =
     assets/*
     translations/*
+nomadic.summarize.dashboard =
+    assets/*
+    translations/*
 nomadic.start =
     data/**
 

diff --git a/src/nomadic/cli.py b/src/nomadic/cli.py
@@ -8,6 +8,7 @@
 from nomadic.process.commands import process
 from nomadic.realtime.commands import realtime
 from nomadic.start.commands import start
+from nomadic.summarize.commands import summarize
 
 
 # From: https://stackoverflow.com/questions/47972638/how-can-i-define-the-order-of-click-sub-commands-in-help
@@ -36,4 +37,5 @@ def cli():
 cli.add_command(realtime)
 cli.add_command(process)
 cli.add_command(dashboard)
+cli.add_command(summarize)
 cli.add_command(backup)
diff --git a/src/nomadic/dashboard/main.py b/src/nomadic/dashboard/main.py
@@ -1,57 +1,9 @@
 import os
 from nomadic.realtime.dashboard.builders import MappingRTDashboard, CallingRTDashboard
-from nomadic.util.metadata import MetadataTableParser
-from nomadic.util.experiment import ExperimentDirectories
-from nomadic.util.regions import RegionBEDParser
+from nomadic.util.experiment import ExperimentDirectories, find_metadata, find_regions
 from nomadic.util.settings import load_settings
 
 
-def find_metadata(input_dir: str) -> MetadataTableParser:
-    """
-    Given an experiment directory, search for the metadata CSV file in thee
-    expected location
-
-    """
-
-    metadata_dir = os.path.join(input_dir, "metadata")
-    csvs = [
-        f"{metadata_dir}/{file}"
-        for file in os.listdir(metadata_dir)
-        if file.endswith(".csv")
-        and not file.startswith("._")  # ignore AppleDouble files
-    ]  # TODO: what about no-suffix files?
-
-    if len(csvs) != 1:  # Could alternatively load and LOOK
-        raise FileNotFoundError(
-            f"Expected one metadata CSV file (*.csv) at {metadata_dir}, but found {len(csvs)}."
-        )
-
-    return MetadataTableParser(csvs[0])
-
-
-def find_regions(input_dir: str) -> RegionBEDParser:
-    """
-    Given an experiment directory, search for the metadata CSV file in thee
-    expected location
-
-    TODO: Bad duplication from above, can write inner function
-    """
-
-    metadata_dir = os.path.join(input_dir, "metadata")
-    beds = [
-        f"{metadata_dir}/{file}"
-        for file in os.listdir(metadata_dir)
-        if file.endswith(".bed") and not file.endswith(".lowcomplexity_mask.bed")
-    ]  # TODO: what about no-suffix files?
-
-    if len(beds) != 1:  # Could alternatively load and LOOK
-        raise FileNotFoundError(
-            f"Expected one region BED file (*.bed) at {metadata_dir}, but found {len(beds)}."
-        )
-
-    return RegionBEDParser(beds[0])
-
-
 def variant_calling_performed(expt_dirs: ExperimentDirectories) -> bool:
     """
     Check if the variant calling TSV is present

diff --git a/src/nomadic/realtime/dashboard/components.py b/src/nomadic/realtime/dashboard/components.py
@@ -394,7 +394,7 @@ def _update(_, selected_categories):
 
             if "sample_id" not in df.columns:
                 # for backwards compatibility to be able to show old experiments where this column was not in the data
-                df = df.join(self.metadata.required_metadata, on="barcode")
+                df = df.join(self.metadata.sample_ids_df, on="barcode")
 
             x = df.apply(
                 sample_string_from_row,
@@ -578,7 +578,7 @@ def _update(_, dropdown_stat, shift=0):
             )
             if "sample_id" not in df.columns:
                 # for backwards compatibility to be able to show old experiments where this column was not in the data
-                df = df.join(self.metadata.required_metadata, on="barcode")
+                df = df.join(self.metadata.sample_ids_df, on="barcode")
 
             # Prepare plotting data
             # plot_data = [
@@ -1126,9 +1126,7 @@ def _update(_, target_region):
             target_df = df.query(qry)
             if "sample_id" not in target_df.columns:
                 # for backwards compatibility to be able to show old experiments where this column was not in the data
-                target_df = target_df.join(
-                    self.metadata.required_metadata, on="barcode"
-                )
+                target_df = target_df.join(self.metadata.sample_ids_df, on="barcode")
 
             # Munge for plot
             target_df["sample_string"] = pd.Categorical(

diff --git a/src/nomadic/realtime/pipelines/experiment.py b/src/nomadic/realtime/pipelines/experiment.py
@@ -74,7 +74,7 @@ def _run_fastq(self):
             except FileNotFoundError:
                 continue
         df = pd.DataFrame(fastq_dts)
-        df = df.join(self.metadata.required_metadata, on="barcode")
+        df = df.join(self.metadata.sample_ids_df, on="barcode")
         df_path = self.expt_dirs.get_summary_files().fastqs_processed
         df.to_csv(df_path, index=False)
 
@@ -97,7 +97,7 @@ def _run_qcbams(self):
             except FileNotFoundError:
                 continue
         df = pd.DataFrame(qcbams_dts)
-        df = df.join(self.metadata.required_metadata, on="barcode")
+        df = df.join(self.metadata.sample_ids_df, on="barcode")
         df_path = self.expt_dirs.get_summary_files().read_mapping
         df.to_csv(df_path, index=False)
 
@@ -117,7 +117,7 @@ def _run_bedcov(self):
             except FileNotFoundError:
                 continue
         bedcov_df = pd.concat(bedcov_dfs)
-        bedcov_df = bedcov_df.join(self.metadata.required_metadata, on="barcode")
+        bedcov_df = bedcov_df.join(self.metadata.sample_ids_df, on="barcode")
         df_path = self.expt_dirs.get_summary_files().region_coverage
         bedcov_df.to_csv(df_path, index=False)
 
@@ -137,7 +137,7 @@ def _run_depth(self):
             except FileNotFoundError:
                 continue
         depth_df = pd.concat(depth_dfs)
-        depth_df = depth_df.join(self.metadata.required_metadata, on="barcode")
+        depth_df = depth_df.join(self.metadata.sample_ids_df, on="barcode")
         df_path = self.expt_dirs.get_summary_files().depth_profiles
         depth_df.to_csv(df_path, index=False)
 
@@ -192,7 +192,7 @@ def _run_variant(self, caller: str):
         annotator.convert_to_csv(temp_path)
 
         df = pd.read_csv(temp_path)
-        df = df.join(self.metadata.required_metadata, on="barcode")
+        df = df.join(self.metadata.sample_ids_df, on="barcode")
         df.to_csv(csv_path, index=False)
 
         # Clean-up

diff --git a/src/nomadic/summarize/__init__.py b/src/nomadic/summarize/__init__.py
diff --git a/src/nomadic/summarize/commands.py b/src/nomadic/summarize/commands.py
@@ -0,0 +1,171 @@
+from pathlib import Path
+
+import click
+
+from nomadic.util.exceptions import MetadataFormatError
+from nomadic.util.workspace import Workspace, check_if_workspace
+
+
+@click.command(
+    short_help="Summarize a set of experiments.",
+)
+@click.argument(
+    "experiment_dirs",
+    type=click.Path(exists=True),
+    nargs=-1,  # allow multiple arguments; gets passed as tuple
+)
+@click.option(
+    "-w",
+    "--workspace",
+    "workspace_path",
+    default="./",
+    show_default="current directory",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    help="Path of the workspace where all input/output files (beds, metadata, results) are stored. "
+    "The workspace directory simplifies the use of nomadic in that many arguments don't need to be listed "
+    "as they are predefined in the workspace config or can be loaded from the workspace",
+)
+@click.option(
+    "-m",
+    "--metadata_csv",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
+    help="Path to the master metadata CSV file.",
+    show_default="<workspace>/metadata/<summary_name>.csv",
+)
+@click.option(
+    "-n",
+    "--summary_name",
+    type=str,
+    help="Name of summary",
+    show_default="name of the workspace.",
+)
+@click.option(
+    "--prevalence-by",
+    type=str,
+    help="Column to calculate prevalence by for output files.",
+    multiple=True,
+)
+@click.option(
+    "--dashboard/--no-dashboard",
+    default=True,
+    help="Whether to start the web dashboard to look at the summary.",
+)
+@click.option(
+    "--only-dashboard",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="If set, only open the summary dashboard in the web browser. Can not be combined with --no-dashboard.",
+)
+@click.option(
+    "-s",
+    "--settings-file",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
+    show_default="<workspace>/metadata/<summary_name>.yaml",
+    help="Path to the summary settings YAML file.",
+)
+@click.option(
+    "--no-master-metadata",
+    is_flag=True,
+    default=False,
+    help="If set, no master metadata CSV needs to be provided. This is not recommended, as it's better to be explicit about the samples to be included, but it can be used to quickly get an overview of the data in the workspace.",
+)
+@click.option(
+    "--qc-min-coverage",
+    type=int,
+    default=50,
+    show_default=True,
+    help="Minimum coverage threshold for quality control. Amplicons with less than this coverage will be marked as low coverage.",
+)
+@click.option(
+    "--qc-max-contam",
+    type=float,
+    default=0.1,
+    show_default=True,
+    help="Maximum contamination fraction for quality control. Samples with contamination above this fraction will be marked as contaminated. Contamination is defined as the mean coverage of negative controls being more than this fraction of the sample coverage.",
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    type=click.Path(dir_okay=True, file_okay=False, path_type=Path),
+    help="Path to the output directory where the summary will be stored. By default, this is <workspace>/summaries/<summary_name>.",
+)
+def summarize(
+    experiment_dirs: tuple[str, ...],
+    summary_name: str | None,
+    workspace_path: str,
+    output_dir: Path | None,
+    metadata_csv: Path | None,
+    dashboard: bool,
+    only_dashboard: bool,
+    prevalence_by: tuple[str, ...],
+    settings_file: Path | None,
+    no_master_metadata: bool,
+    qc_min_coverage: int,
+    qc_max_contam: float,
+):
+    """
+    Summarize a set of experiments to evaluate quality control and
+    mutation prevalence. You can either provide a list of folders of experiments,
+    or if none are provided, all experiments of this workspace will be used.
+
+    """
+    # NOTE: the docstring above is rendered by click as the `--help` text, so
+    # developer notes are kept in comments instead.
+    #
+    # Flow: validate the workspace, fill every option that was left unset with
+    # its workspace-derived default, then either open the dashboard for an
+    # existing summary (--only-dashboard) or run the summary via `main`.
+    # Invalid input is reported as `click.BadParameter`.
+    if not check_if_workspace(workspace_path):
+        raise click.BadParameter(
+            param_hint="-w/--workspace",
+            message=f"'{workspace_path}' is not a workspace.",
+        )
+    workspace = Workspace(workspace_path)
+
+    if summary_name is None:
+        summary_name = workspace.get_name()
+
+    # The two dashboard switches contradict each other; name the option exactly
+    # as it is declared above so the user can find it in `--help`.
+    if only_dashboard and not dashboard:
+        raise click.BadParameter(
+            param_hint="--only-dashboard",
+            message="--only-dashboard can not be used together with --no-dashboard.",
+        )
+
+    # Resolve the summary directory before branching so that an explicit
+    # -o/--output-dir is honoured in dashboard-only mode as well.
+    if output_dir is None:
+        output_dir = Path(workspace.get_summary_dir(summary_name))
+
+    if only_dashboard:
+        # Imported lazily, as in the rest of this command, so that the CLI
+        # module itself stays light to import.
+        from .main import view
+
+        return view(output_dir, summary_name)
+
+    if metadata_csv is None and not no_master_metadata:
+        metadata_csv = Path(workspace.get_master_metadata_csv(summary_name))
+
+    if settings_file is None:
+        settings_file = Path(workspace.get_summary_settings_file(summary_name))
+
+    # Only the workspace-derived default can be missing here; a path passed
+    # via -m was already checked by click (`exists=True`).
+    if not no_master_metadata and not metadata_csv.exists():
+        raise click.BadParameter(
+            param_hint="-m/--metadata_csv",
+            message=f"Master metadata file '{metadata_csv}' does not exist.",
+        )
+
+    # No experiment folders on the command line: use those of the workspace.
+    if not experiment_dirs:
+        experiment_dirs = workspace.get_experiment_dirs()
+
+    from .main import main
+
+    try:
+        main(
+            workspace=workspace,
+            output_dir=output_dir,
+            expt_dirs=experiment_dirs,
+            summary_name=summary_name,
+            metadata_path=metadata_csv,
+            settings_file_path=settings_file,
+            show_dashboard=dashboard,
+            prevalence_by=list(prevalence_by),
+            no_master_metadata=no_master_metadata,
+            qc_min_coverage=qc_min_coverage,
+            qc_max_contam=qc_max_contam,
+        )
+    except MetadataFormatError as e:
+        # Surface metadata problems as a usage error rather than a traceback.
+        raise click.BadParameter(
+            message=f"Metadata format error: {e}",
+        ) from e
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,6 +35,7 @@ requirements: @@
         - pysam
         - pyyaml
         - i18nice
+        - pydantic
         # tools
         - minimap2
         - samtools >=1.20
@@ Expand Down @@