diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 16c65ca..308f3fc 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -10,14 +10,38 @@ metadata_url: "https://data.nextstrain.org/files/workflows/mumps/metadata.tsv.zs strain_id_field: "accession" reference: "reference.gb" -filter: - min_length: 8000 - group_by: country year month MuV_genotype division - exclude: "{build}/exclude.txt" - include: "{build}/include.txt" - specific: - north-america: --subsample-max-sequences 4000 --min-date 2006 --query "region=='North America' & (MuV_genotype=='G')" - global: --subsample-max-sequences 4000 --min-date 1950 +# Override this with your own configuration using `custom_subsample` instead of +# `subsample`. +subsample: + north-america: + samples: + all: + min_length: 8000 + group_by: + - country + - year + - month + - MuV_genotype + - division + exclude: "north-america/exclude.txt" + include: "north-america/include.txt" + max_sequences: 4000 + min_date: 2006 + query: region=='North America' & (MuV_genotype=='G') + global: + samples: + all: + min_length: 8000 + group_by: + - country + - year + - month + - MuV_genotype + - division + exclude: "global/exclude.txt" + include: "global/include.txt" + max_sequences: 4000 + min_date: 1950 refine: north-america: "--clock-filter-iqd 4" diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 14e5f17..4f320bc 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -3,7 +3,7 @@ This part of the workflow creates additonal annotations for the phylogenetic tre REQUIRED INPUTS: - metadata = results/{build}/filtered.tsv + metadata = results/{build}/subsampled.tsv alignment = results/{build}/aligned.fasta tree = results/{build}/tree.nwk @@ -86,7 +86,7 @@ rule traits: """ input: tree = "results/{build}/tree.nwk", - metadata = "results/{build}/filtered.tsv", + metadata = "results/{build}/subsampled.tsv", output: node_data = "results/{build}/traits.json", log: diff --git a/phylogenetic/rules/config.smk b/phylogenetic/rules/config.smk index 1a14810..adc2956 100644 --- a/phylogenetic/rules/config.smk +++ b/phylogenetic/rules/config.smk @@ -6,4 +6,16 @@ OUTPUTS: results/run_config.yaml """ +# NOTE: The order is important. Filepaths must be resolved before config is +# written, otherwise augur subsample will not work. + +resolve_filepaths([ + ("subsample", "*", "samples", "*", "include"), + ("subsample", "*", "samples", "*", "exclude"), + ("subsample", "*", "samples", "*", "group_by_weights"), + ("custom_subsample", "*", "samples", "*", "include"), + ("custom_subsample", "*", "samples", "*", "exclude"), + ("custom_subsample", "*", "samples", "*", "group_by_weights"), +]) + write_config("results/run_config.yaml") diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 4dfbb87..533457b 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -3,7 +3,7 @@ This part of the workflow constructs the phylogenetic tree. REQUIRED INPUTS: - metadata = results/{build}/filtered.tsv + metadata = results/{build}/subsampled.tsv alignment = results/{build}/aligned.fasta OUTPUTS: @@ -49,7 +49,7 @@ rule refine: input: tree = "results/{build}/tree_raw.nwk", alignment = "results/{build}/aligned.fasta", - metadata = "results/{build}/filtered.tsv" + metadata = "results/{build}/subsampled.tsv" output: tree = "results/{build}/tree.nwk", node_data = "results/{build}/branch_lengths.json", diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index d0aeb14..1e41f8a 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -4,7 +4,7 @@ export a Nextstrain dataset. REQUIRED INPUTS: - metadata = results/{build}/filtered.tsv + metadata = results/{build}/subsampled.tsv tree = results/{build}/tree.nwk branch_lengths = results/{build}/branch_lengths.json node_data = results/{build}/*.json @@ -31,7 +31,7 @@ rule colors: input: color_schemes = resolve_config_path(config['colors']['color_schemes']), color_orderings = resolve_config_path(config['colors']['color_orderings']), - metadata = "results/{build}/filtered.tsv", + metadata = "results/{build}/subsampled.tsv", output: colors = "results/{build}/colors.tsv" log: @@ -53,7 +53,7 @@ rule export: """Exporting data files for for auspice""" input: tree = "results/{build}/tree.nwk", - metadata = "results/{build}/filtered.tsv", + metadata = "results/{build}/subsampled.tsv", branch_lengths = "results/{build}/branch_lengths.json", traits = "results/{build}/traits.json", nt_muts = "results/{build}/nt_muts.json", @@ -93,7 +93,7 @@ rule tip_frequencies: """ input: tree = "results/{build}/tree.nwk", - metadata = "results/{build}/filtered.tsv", + metadata = "results/{build}/subsampled.tsv", output: tip_freq = "auspice/mumps_{build}_tip-frequencies.json" log: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index bcaf45e..6501af5 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -9,7 +9,7 @@ REQUIRED INPUTS: OUTPUTS: - metadata = results/{build}/filtered.tsv + metadata = results/{build}/subsampled.tsv alignment = results/{build}/aligned.fasta This part of the workflow usually includes the following steps: @@ -51,9 +51,9 @@ rule decompress: zstd -d -c {input.metadata} > {output.metadata} """ -rule filter: +rule subsample: """ - Filtering to + Subsampling to - various criteria based on the auspice JSON target - from {params.min_date} onwards - excluding strains in {input.exclude} @@ -63,35 +63,31 @@ rule filter: input: sequences = "data/sequences.fasta", metadata = "data/metadata.tsv", - exclude = resolve_config_path(config["filter"]["exclude"]), - include = resolve_config_path(config["filter"]["include"]), + config = "results/run_config.yaml", output: - sequences = "results/{build}/filtered.fasta", - metadata = "results/{build}/filtered.tsv", + sequences = "results/{build}/subsampled.fasta", + metadata = "results/{build}/subsampled.tsv", log: - "logs/{build}/filtered.txt", + "logs/{build}/subsample.txt", benchmark: - "benchmarks/{build}/filtered.txt", + "benchmarks/{build}/subsample.txt", params: - min_length = config['filter']['min_length'], - group_by = config['filter']['group_by'], - filter_params = lambda wildcard: config['filter']['specific'][wildcard.build], + config_section = lambda w: ["custom_subsample" if config.get("custom_subsample") else "subsample", w.build], strain_id = config.get("strain_id_field", "strain"), + threads: workflow.cores shell: r""" exec &> >(tee {log:q}) - augur filter \ + augur subsample \ --sequences {input.sequences:q} \ --metadata {input.metadata:q} \ --metadata-id-columns {params.strain_id:q} \ - --exclude {input.exclude:q} \ - --include {input.include:q} \ + --config {input.config:q} \ + --config-section {params.config_section:q} \ + --nthreads {threads:q} \ --output-sequences {output.sequences:q} \ - --output-metadata {output.metadata:q} \ - --min-length {params.min_length:q} \ - --group-by {params.group_by} \ - {params.filter_params} + --output-metadata {output.metadata:q} """ rule align: @@ -100,7 +96,7 @@ rule align: - filling gaps with N """ input: - sequences = "results/{build}/filtered.fasta", + sequences = "results/{build}/subsampled.fasta", reference = resolve_config_path(config['reference']), output: alignment = "results/{build}/aligned.fasta",