diff --git a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/main.nf b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/main.nf
new file mode 100644
index 00000000000..0a8cc10b3b8
--- /dev/null
+++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/main.nf
@@ -0,0 +1,253 @@
+// statistics
+include { FASTQ_QC_STATS as PRE_STATS        } from '../fastq_qc_stats/main'
+include { FASTQ_QC_STATS as POST_STATS       } from '../fastq_qc_stats/main'
+// preprocessing
+include { FASTQ_PREPROCESS_SEQKIT            } from '../fastq_preprocess_seqkit/main'
+// barcoding
+include { UMITOOLS_EXTRACT                   } from '../../../modules/nf-core/umitools/extract/main'
+// adapter removal and merging
+include { FASTQ_REMOVEADAPTERS_MERGE         } from '../fastq_removeadapters_merge/main'
+// complexity filtering
+include { FASTQ_COMPLEXITY_FILTER            } from '../fastq_complexity_filter/main'
+// deduplication
+include { BBMAP_CLUMPIFY                     } from '../../../modules/nf-core/bbmap/clumpify/main'
+// host decontamination
+include { FASTQ_DECONTAMINATE_DEACON_HOSTILE } from '../fastq_decontaminate_deacon_hostile/main'
+// final concatenation
+include { CAT_FASTQ                          } from '../../../modules/nf-core/cat/fastq/main'
+
+//
+// Quality control and preprocessing of short reads.
+//
+// Stage order: pre-statistics -> seqkit preprocessing -> UMI extraction -> adapter removal and merging
+// -> complexity filtering -> deduplication -> host decontamination -> final concatenation -> post-statistics.
+// PRE_STATS, FASTQ_PREPROCESS_SEQKIT and POST_STATS are always called and receive their skip_* flags;
+// every other stage is called only when its skip_* flag is false, and the log/report/index channels of a
+// skipped stage are emitted empty.
+//
+workflow FASTQ_SHORTREADS_PREPROCESS_QC {
+
+    take:
+    ch_reads                       // channel: [ val(meta), [ fastq ] ]
+    // statistics
+    skip_fastqc                    // boolean
+    skip_seqfu_check               // boolean
+    skip_seqfu_stats               // boolean
+    skip_seqkit_stats              // boolean
+    skip_seqtk_comp                // boolean
+    // preprocessing
+    skip_seqkit_sana_pair          // boolean
+    skip_seqkit_seq                // boolean
+    skip_seqkit_replace            // boolean
+    skip_seqkit_rmdup              // boolean
+    // barcoding
+    skip_umitools_extract          // boolean
+    val_umi_discard_read           // integer: 0, 1 or 2
+    // adapter removal and merging
+    skip_adapterremoval            // boolean
+    val_adapter_tool               // string: [mandatory] tool_name // choose from: ["trimmomatic", "cutadapt", "trimgalore", "bbduk", "leehom", "fastp", "adapterremoval"]
+    ch_custom_adapters_file        // channel: [optional] [ {fasta,txt} ] // fasta, for bbduk or fastp, or txt, for adapterremoval
+    val_save_merged                // boolean: [mandatory] if true, will return the merged reads instead, for fastp and adapterremoval
+    val_fastp_discard_trimmed_pass // boolean: [mandatory] // only for fastp
+    val_fastp_save_trimmed_fail    // boolean: [mandatory] // only for fastp
+    // complexity filtering
+    skip_complexity_filtering      // boolean
+    val_complexity_filter_tool     // string: [mandatory] tool_name // choose from: ["prinseqplusplus", "bbduk", "fastp"]
+    // deduplication
+    skip_deduplication             // boolean
+    // host decontamination
+    skip_decontamination           // boolean
+    ch_decontamination_fasta       // channel: [ val(meta), [ fasta ] ] (optional)
+    ch_decontamination_reference   // channel: [ val(reference_name), path(reference_dir) ] (optional)
+    val_decontamination_index_name // val (optional)
+    val_decontamination_tool       // string (enum): 'hostile' or 'deacon'
+    // final concatenation
+    skip_final_concatenation       // boolean
+
+    main:
+
+    ch_versions                       = channel.empty()
+    ch_multiqc_files                  = channel.empty()
+    ch_umi_log                        = channel.empty()
+    ch_adapterremoval_discarded_reads = channel.empty()
+    ch_adapterremoval_logfile         = channel.empty()
+    ch_adapterremoval_report          = channel.empty()
+    ch_complexity_filter_log          = channel.empty()
+    ch_complexity_filter_report       = channel.empty()
+    ch_clumpify_log                   = channel.empty()
+    ch_hostile_reference              = channel.empty()
+    ch_hostile_json                   = channel.empty()
+    ch_deacon_index                   = channel.empty()
+    ch_deacon_summary                 = channel.empty()
+
+    // pre-statistics
+    PRE_STATS (
+        ch_reads,
+        skip_fastqc,
+        skip_seqfu_check,
+        skip_seqfu_stats,
+        skip_seqkit_stats,
+        skip_seqtk_comp
+    )
+    ch_pre_stats_fastqc_html   = PRE_STATS.out.fastqc_html
+    ch_pre_stats_fastqc_zip    = PRE_STATS.out.fastqc_zip
+    ch_pre_stats_seqfu_check   = PRE_STATS.out.seqfu_check
+    ch_pre_stats_seqfu_stats   = PRE_STATS.out.seqfu_stats
+    ch_pre_stats_seqfu_multiqc = PRE_STATS.out.seqfu_multiqc
+    ch_pre_stats_seqkit_stats  = PRE_STATS.out.seqkit_stats
+    ch_pre_stats_seqtk_stats   = PRE_STATS.out.seqtk_stats
+    ch_multiqc_files           = ch_multiqc_files.mix(ch_pre_stats_seqfu_multiqc)
+    ch_versions                = ch_versions.mix(PRE_STATS.out.versions)
+
+    // preprocessing
+    FASTQ_PREPROCESS_SEQKIT (
+        ch_reads,
+        skip_seqkit_sana_pair,
+        skip_seqkit_seq,
+        skip_seqkit_replace,
+        skip_seqkit_rmdup
+    )
+    ch_reads    = FASTQ_PREPROCESS_SEQKIT.out.reads
+    ch_versions = ch_versions.mix(FASTQ_PREPROCESS_SEQKIT.out.versions)
+
+    // barcoding
+    if (!skip_umitools_extract) {
+        UMITOOLS_EXTRACT( ch_reads )
+        ch_umi_reads = UMITOOLS_EXTRACT.out.reads
+        ch_umi_log   = UMITOOLS_EXTRACT.out.log
+        ch_versions  = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first())
+
+        // Discard R1 / R2 if required: 1 % 2 == 1 keeps reads[1] (R2), 2 % 2 == 0 keeps reads[0] (R1).
+        // Single-end entries pass through unchanged; paired entries are re-flagged as single_end.
+        if (val_umi_discard_read in [1, 2]) {
+            ch_umi_reads = UMITOOLS_EXTRACT.out.reads
+                .map { meta, reads ->
+                    meta.single_end ? [meta, reads] : [meta + ['single_end': true], reads[val_umi_discard_read % 2]]
+                }
+        }
+
+        ch_reads = ch_umi_reads
+    }
+
+    // adapter removal and merging
+    if (!skip_adapterremoval) {
+        FASTQ_REMOVEADAPTERS_MERGE (
+            ch_reads,
+            val_adapter_tool,
+            ch_custom_adapters_file,
+            val_save_merged,
+            val_fastp_discard_trimmed_pass,
+            val_fastp_save_trimmed_fail
+        )
+        ch_adapterremoval_discarded_reads = FASTQ_REMOVEADAPTERS_MERGE.out.discarded_reads
+        ch_adapterremoval_logfile         = FASTQ_REMOVEADAPTERS_MERGE.out.logfile
+        ch_adapterremoval_report          = FASTQ_REMOVEADAPTERS_MERGE.out.report
+        ch_reads                          = FASTQ_REMOVEADAPTERS_MERGE.out.processed_reads
+        ch_multiqc_files                  = ch_multiqc_files.mix(FASTQ_REMOVEADAPTERS_MERGE.out.multiqc_files)
+        ch_versions                       = ch_versions.mix(FASTQ_REMOVEADAPTERS_MERGE.out.versions)
+    }
+
+    // complexity filtering
+    if (!skip_complexity_filtering) {
+        FASTQ_COMPLEXITY_FILTER( ch_reads, val_complexity_filter_tool )
+        ch_reads                    = FASTQ_COMPLEXITY_FILTER.out.filtered_reads
+        ch_complexity_filter_log    = FASTQ_COMPLEXITY_FILTER.out.logfile
+        ch_complexity_filter_report = FASTQ_COMPLEXITY_FILTER.out.report
+        ch_multiqc_files            = ch_multiqc_files.mix(FASTQ_COMPLEXITY_FILTER.out.multiqc_files)
+        ch_versions                 = ch_versions.mix(FASTQ_COMPLEXITY_FILTER.out.versions)
+    }
+
+    // deduplication
+    if (!skip_deduplication) {
+        BBMAP_CLUMPIFY( ch_reads )
+        ch_reads        = BBMAP_CLUMPIFY.out.reads
+        ch_clumpify_log = BBMAP_CLUMPIFY.out.log
+        ch_versions     = ch_versions.mix(BBMAP_CLUMPIFY.out.versions.first())
+    }
+
+    // host decontamination
+    if (!skip_decontamination) {
+        FASTQ_DECONTAMINATE_DEACON_HOSTILE (
+            ch_reads,
+            ch_decontamination_fasta,
+            ch_decontamination_reference,
+            val_decontamination_index_name,
+            val_decontamination_tool
+        )
+        ch_reads             = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.fastq_filtered
+        ch_hostile_reference = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.reference
+        ch_hostile_json      = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.json
+        ch_deacon_index      = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.index
+        ch_deacon_summary    = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.summary
+        ch_versions          = ch_versions.mix(FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.versions)
+    }
+
+
+    // final concatenation
+    if (!skip_final_concatenation) {
+        CAT_FASTQ ( ch_reads )
+        ch_reads = CAT_FASTQ.out.reads
+    }
+
+    // post-statistics
+    POST_STATS (
+        ch_reads,
+        skip_fastqc,
+        skip_seqfu_check,
+        skip_seqfu_stats,
+        skip_seqkit_stats,
+        skip_seqtk_comp
+    )
+    ch_post_stats_fastqc_html   = POST_STATS.out.fastqc_html
+    ch_post_stats_fastqc_zip    = POST_STATS.out.fastqc_zip
+    ch_post_stats_seqfu_check   = POST_STATS.out.seqfu_check
+    ch_post_stats_seqfu_stats   = POST_STATS.out.seqfu_stats
+    ch_post_stats_seqfu_multiqc = POST_STATS.out.seqfu_multiqc
+    ch_post_stats_seqkit_stats  = POST_STATS.out.seqkit_stats
+    ch_post_stats_seqtk_stats   = POST_STATS.out.seqtk_stats
+    ch_multiqc_files            = ch_multiqc_files.mix(ch_post_stats_seqfu_multiqc)
+    ch_versions                 = ch_versions.mix(POST_STATS.out.versions)
+
+    emit:
+    reads                          = ch_reads // channel: [ val(meta), [ fastq ] ]
+
+    // statistics
+    pre_stats_fastqc_html          = ch_pre_stats_fastqc_html
+    pre_stats_fastqc_zip           = ch_pre_stats_fastqc_zip
+    pre_stats_seqfu_check          = ch_pre_stats_seqfu_check
+    pre_stats_seqfu_stats          = ch_pre_stats_seqfu_stats
+    pre_stats_seqfu_multiqc        = ch_pre_stats_seqfu_multiqc  // documented in meta.yml; also mixed into multiqc_files
+    pre_stats_seqkit_stats         = ch_pre_stats_seqkit_stats
+    pre_stats_seqtk_stats          = ch_pre_stats_seqtk_stats
+    post_stats_fastqc_html         = ch_post_stats_fastqc_html
+    post_stats_fastqc_zip          = ch_post_stats_fastqc_zip
+    post_stats_seqfu_check         = ch_post_stats_seqfu_check
+    post_stats_seqfu_stats         = ch_post_stats_seqfu_stats
+    post_stats_seqfu_multiqc       = ch_post_stats_seqfu_multiqc // documented in meta.yml; also mixed into multiqc_files
+    post_stats_seqkit_stats        = ch_post_stats_seqkit_stats
+    post_stats_seqtk_stats         = ch_post_stats_seqtk_stats
+
+    // barcoding
+    umi_log                        = ch_umi_log
+
+    // adapter removal and merging
+    adapterremoval_discarded_reads = ch_adapterremoval_discarded_reads
+    adapterremoval_logfile         = ch_adapterremoval_logfile
+    adapterremoval_report          = ch_adapterremoval_report
+
+    // complexity filtering
+    complexity_filter_log          = ch_complexity_filter_log
+    complexity_filter_report       = ch_complexity_filter_report
+
+    // deduplication
+    clumpify_log                   = ch_clumpify_log
+
+    // host decontamination
+    hostile_reference              = ch_hostile_reference
+    hostile_json                   = ch_hostile_json
+    deacon_index                   = ch_deacon_index
+    deacon_summary                 = ch_deacon_summary
+
+    multiqc_files                  = ch_multiqc_files
+    versions                       = ch_versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/meta.yml b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/meta.yml
new file mode 100644
index 00000000000..7249143ee0c
--- /dev/null
+++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/meta.yml
@@ -0,0 +1,337 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "fastq_shortreads_preprocess_qc"
+description: |
+  Quality check and preprocessing subworkflow of Illumina short reads
+  that can do: quality check of input reads and generate statistics,
+  preprocess and validate reads, barcode removal, remove adapters and merge reads,
+  filter by sequence complexity, deduplicate reads, remove host contamination,
+  concatenate reads and generate statistics for post-processing reads.
+keywords: + - fastq + - illumina + - short + - reads + - qc + - stats + - preprocessing + - barcoding + - adapters + - merge + - complexity + - deduplication + - host + - decontamination +components: + - fastq_qc_stats + - fastqc + - seqfu/check + - seqfu/stats + - seqkit/stats + - seqtk/comp + - fastq_preprocess_seqkit + - fastq_sanitise_seqkit + - seqkit/sana + - seqkit/pair + - seqkit/seq + - seqkit/replace + - seqkit/rmdup + - umitools/extract + - fastq_removeadapters_merge + - trimmomatic + - cutadapt + - trimgalore + - bbmap/bbduk + - leehom + - fastp + - adapterremoval + - cat/fastq + - fastq_complexity_filter + - prinseqplusplus + - bbmap/clumpify + - fastq_decontaminate_deacon_hostile + - fastq_index_filter_deacon + - fastq_fetch_clean_hostile + - hostile/fetch + - hostile/clean + - bowtie2/build + - deacon/filter + - deacon/index + +input: + - ch_reads: + type: file + description: | + List of FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + Structure: [ val(meta), [ path(reads) ] ] + pattern: "*.fastq.gz" + - skip_fastqc: + type: boolean + description: | + Skip FastQC quality control step + - skip_seqfu_check: + type: boolean + description: | + Skip SeqFu check step + - skip_seqfu_stats: + type: boolean + description: | + Skip SeqFu statistics step + - skip_seqkit_stats: + type: boolean + description: | + Skip SeqKit statistics step + - skip_seqtk_comp: + type: boolean + description: | + Skip SeqTk composition analysis step + - skip_seqkit_sana_pair: + type: boolean + description: | + Skip SeqKit sanitize and pair step + - skip_seqkit_seq: + type: boolean + description: | + Skip SeqKit sequence processing step + - skip_seqkit_replace: + type: boolean + description: | + Skip SeqKit replace step + - skip_seqkit_rmdup: + type: boolean + description: | + Skip SeqKit remove duplicates step + - skip_umitools_extract: + type: boolean + description: | + Skip UMI-tools extract barcoding step + - val_umi_discard_read: + type: integer 
+ description: | + Discard R1 or R2 after UMI extraction (0 = keep both, 1 = discard R1, 2 = discard R2) + - skip_adapterremoval: + type: boolean + description: | + Skip the adapter removal and merge subworkflow completely + - val_adapter_tool: + type: string + description: | + Choose one of the available adapter removal and/or merging tools + enum: ["trimmomatic", "cutadapt", "trimgalore", "bbduk", "leehom", "fastp", "adapterremoval"] + - ch_custom_adapters_file: + type: file + description: | + Optional reference files, containing adapter and/or contaminant sequences for removal. + In fasta format for bbmap/bbduk and fastp, or in text format for AdapterRemoval (one adapter per line). + - val_save_merged: + type: boolean + description: | + Specify true to output merged reads instead + Used by fastp and adapterremoval + - val_fastp_discard_trimmed_pass: + type: boolean + description: | + Used only by fastp. + Specify true to not write any reads that pass trimming thresholds from the fastp process. + This can be used to use fastp for the output report only. + - val_fastp_save_trimmed_fail: + type: boolean + description: | + Used only by fastp. + Specify true to save files that failed to pass fastp trimming thresholds + - skip_complexity_filtering: + type: boolean + description: | + Skip the complexity filtering step (tool selected with val_complexity_filter_tool) + - val_complexity_filter_tool: + type: string + description: | + Complexity filtering tool to use. + Must be one of: 'prinseqplusplus', 'bbduk', or 'fastp'. 
+ - skip_deduplication: + type: boolean + description: | + Skip BBMap Clumpify deduplication step + - skip_decontamination: + type: boolean + description: | + Skip host decontamination step + - ch_decontamination_fasta: + type: file + description: | + Reference genome FASTA file for decontamination (optional) + Structure: [ val(meta), [ path(fasta) ] ] + pattern: "*.{fasta,fa,fna}" + - ch_decontamination_reference: + type: directory + description: | + Pre-built reference index directory for decontamination (optional) + Structure: [ val(reference_name), path(reference_dir) ] + - val_decontamination_index_name: + type: string + description: | + Name for the decontamination index (optional) + - val_decontamination_tool: + type: string + description: | + Decontamination tool to use ('hostile' or 'deacon') + - skip_final_concatenation: + type: boolean + description: | + Skip final FASTQ concatenation step + +output: + - reads: + type: file + description: | + Channel containing processed short reads + Structure: [ val(meta), path(reads) ] + pattern: "*.fastq.gz" + - pre_stats_fastqc_html: + type: file + description: | + FastQC HTML reports for pre-processing reads + Structure: [ val(meta), path(html) ] + pattern: "*.html" + - pre_stats_fastqc_zip: + type: file + description: | + FastQC ZIP archives for pre-processing reads + Structure: [ val(meta), path(zip) ] + pattern: "*.zip" + - pre_stats_seqfu_check: + type: file + description: | + SeqFu check results for pre-processing reads + Structure: [ val(meta), path(check) ] + - pre_stats_seqfu_stats: + type: file + description: | + SeqFu statistics for pre-processing reads + Structure: [ val(meta), path(stats) ] + - pre_stats_seqfu_multiqc: + type: file + description: | + SeqFu MultiQC-compatible stats for pre-processing reads + Structure: [ val(meta), path(multiqc) ] + - pre_stats_seqkit_stats: + type: file + description: | + SeqKit statistics for pre-processing reads + Structure: [ val(meta), path(stats) ] + - 
pre_stats_seqtk_stats: + type: file + description: | + SeqTk composition statistics for pre-processing reads + Structure: [ val(meta), path(stats) ] + - post_stats_fastqc_html: + type: file + description: | + FastQC HTML reports for post-processing reads + Structure: [ val(meta), path(html) ] + pattern: "*.html" + - post_stats_fastqc_zip: + type: file + description: | + FastQC ZIP archives for post-processing reads + Structure: [ val(meta), path(zip) ] + pattern: "*.zip" + - post_stats_seqfu_check: + type: file + description: | + SeqFu check results for post-processing reads + Structure: [ val(meta), path(check) ] + - post_stats_seqfu_stats: + type: file + description: | + SeqFu statistics for post-processing reads + Structure: [ val(meta), path(stats) ] + - post_stats_seqfu_multiqc: + type: file + description: | + SeqFu MultiQC-compatible stats for post-processing reads + Structure: [ val(meta), path(multiqc) ] + - post_stats_seqkit_stats: + type: file + description: | + SeqKit statistics for post-processing reads + Structure: [ val(meta), path(stats) ] + - post_stats_seqtk_stats: + type: file + description: | + SeqTk composition statistics for post-processing reads + Structure: [ val(meta), path(stats) ] + - umi_log: + type: file + description: | + UMI-tools extract log file + Structure: [ val(meta), path(log) ] + - adapterremoval_discarded_reads: + type: file + description: | + Reads discarded during adapter removal or merging + Structure: [ val(meta), path(fastq) ] + pattern: "*.fastq.gz" + - adapterremoval_logfile: + type: file + description: | + Adapter removal execution log file + (trimmomatic {log}, trimgalore {txt}, fastp {log}) + Structure: [ val(meta), path({log,txt}) ] + - adapterremoval_report: + type: file + description: | + Adapter removal report + (trimmomatic {summary}, trimgalore {html,zip}, fastp {html}) + Structure: [ val(meta), path({summary,html,zip}) ] + - complexity_filter_log: + type: file + description: | + Log file from complexity 
filtering + Structure: [ val(meta), path(log) ] + - complexity_filter_report: + type: file + description: | + Report generated by complexity filtering + HTML report generated by fastp. Empty for other tools. + Structure: [ val(meta), path(html) ] + - clumpify_log: + type: file + description: | + BBMap Clumpify log file + Structure: [ val(meta), path(log) ] + - hostile_reference: + type: file + description: | + Hostile reference files used for decontamination + Structure: [ val(reference_name), path(reference_dir) ] + - hostile_json: + type: file + description: | + Hostile JSON report + Structure: [ val(meta), path(json) ] + - deacon_index: + type: directory + description: | + Deacon index directory + Structure: [ val(meta), path(index) ] + - deacon_summary: + type: file + description: | + Deacon decontamination summary file + Structure: [ val(meta), path(json) ] + - multiqc_files: + type: file + description: | + MultiQC compatible files for aggregated reporting + Structure: [ path(files) ] + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" + +authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" diff --git a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/nextflow.config b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/nextflow.config new file mode 100644 index 00000000000..b81471fa060 --- /dev/null +++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/nextflow.config @@ -0,0 +1,26 @@ +// IMPORTANT: Add this configuration to your modules.config + +process { + withName: ".*:FASTQ_COMPLEXITY_FILTER:BBMAP_BBDUK" { + ext.args = [ + 'entropy=0.7', // needed to turn on complexity filtering + 'minlength=0' // needed, to not discard reads shorter than this + ].join(' ') + ext.prefix = { "${meta.id}.trim" } + } + + // need FASTP to only do complexity filtering + withName: ".*:FASTQ_COMPLEXITY_FILTER:FASTP" { + ext.args = [ + '--low_complexity_filter', 
'--disable_adapter_trimming', + '--disable_trim_poly_g', + '--disable_quality_filtering', + '--disable_length_filtering' + ].join(' ') + } + + withName: "BBMAP_CLUMPIFY" { + ext.args = 'dedupe=t' + } +} diff --git a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test new file mode 100644 index 00000000000..563ec899cff --- /dev/null +++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test @@ -0,0 +1,417 @@ +nextflow_workflow { + + name "Test Subworkflow FASTQ_SHORTREADS_PREPROCESS_QC" + script "../main.nf" + workflow "FASTQ_SHORTREADS_PREPROCESS_QC" + config './nextflow.config' + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_shortreads_preprocess_qc" + tag "subworkflows/fastq_qc_stats" + tag "fastqc" + tag "seqfu" + tag "seqfu/check" + tag "seqfu/stats" + tag "seqkit" + tag "seqkit/stats" + tag "seqtk" + tag "seqtk/comp" + tag "subworkflows/fastq_preprocess_seqkit" + tag "subworkflows/fastq_sanitise_seqkit" + tag "seqkit" + tag "seqkit/sana" + tag "seqkit/pair" + tag "seqkit/seq" + tag "seqkit/replace" + tag "seqkit/rmdup" + tag "umitools" + tag "umitools/extract" + tag "subworkflows/fastq_removeadapters_merge" + tag "trimmomatic" + tag "cutadapt" + tag "trimgalore" + tag "bbmap" + tag "bbmap/bbduk" + tag "leehom" + tag "fastp" + tag "adapterremoval" + tag "cat" + tag "cat/fastq" + tag "subworkflows/fastq_complexity_filter" + tag "prinseqplusplus" + tag "bbmap/clumpify" + tag "subworkflows/fastq_decontaminate_deacon_hostile" + tag "subworkflows/fastq_index_filter_deacon" + tag "subworkflows/fastq_fetch_clean_hostile" + tag "hostile" + tag "hostile/fetch" + tag "hostile/clean" + tag "bowtie2/build" + tag "deacon" + tag "deacon/filter" + tag "deacon/index" + + test("sarscov2 - fastq - seqfu - seqkit - deacon - single_end") { + + when { + params { + save_merged = false + adapterremoval_args = save_merged ? 
"--collapse" : "" + } + workflow { + """ + input[0] = channel.of([ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + input[1] = true // skip_fastqc + input[2] = false // skip_seqfu_check + input[3] = false // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = false // skip_seqkit_sana_pair + input[7] = false // skip_seqkit_seq + input[8] = false // skip_seqkit_replace + input[9] = false // skip_seqkit_rmdup + input[10] = true // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = true // skip_adapterremoval + input[13] = "" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = false // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = true // skip_complexity_filtering + input[19] = "" // val_complexity_filter_tool + input[20] = true // skip_deduplication + input[21] = false // skip_decontamination + input[22] = channel.of( + [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + ) // ch_decontamination_fasta + input[23] = [] // ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = 'deacon' // val_decontamination_tool + input[26] = true // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot( + workflow.out.reads[0][1], + workflow.out.pre_stats_seqfu_check, + workflow.out.pre_stats_seqfu_stats, + workflow.out.post_stats_seqfu_check, + workflow.out.post_stats_seqfu_stats, + workflow.out.deacon_index, + file(workflow.out.deacon_summary[0][1]).name, + workflow.out.multiqc_files, + workflow.out.versions.collect { path(it).yaml } + ).match() } + ) + } + } + + test("sarscov2 - fastq - umitools - 
prinseq - clumpify - cat - single_end") { + + when { + params { + save_merged = false + adapterremoval_args = save_merged ? "--collapse" : "" + } + workflow { + """ + input[0] = channel.of([ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + input[1] = true // skip_fastqc + input[2] = true // skip_seqfu_check + input[3] = true // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = true // skip_seqkit_sana_pair + input[7] = true // skip_seqkit_seq + input[8] = true // skip_seqkit_replace + input[9] = true // skip_seqkit_rmdup + input[10] = false // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = true // skip_adapterremoval + input[13] = "" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = false // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = false // skip_complexity_filtering + input[19] = 'prinseqplusplus' // val_complexity_filter_tool + input[20] = false // skip_deduplication + input[21] = true // skip_decontamination + input[22] = channel.of( + [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + ) // ch_decontamination_fasta + input[23] = [] // ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = 'deacon' // val_decontamination_tool + input[26] = false // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot( + path(workflow.out.reads[0][1]).linesGzip.size(), + path(workflow.out.umi_log[0][1]).readLines().size(), + path(workflow.out.clumpify_log[0][1]).readLines().size(), + workflow.out.versions.collect { path(it).yaml } + ).match() } + ) + } + } + + test("sarscov2 - fastq - 
fastqc - seqkit - cutadapt - clumpify - cat - paired_end") { + + when { + params { + save_merged = false + adapterremoval_args = save_merged ? "--collapse" : "" + } + workflow { + """ + input[0] = channel.of([ + [ id:'test', single_end:false ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = false // skip_fastqc + input[2] = true // skip_seqfu_check + input[3] = true // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = false // skip_seqkit_sana_pair + input[7] = false // skip_seqkit_seq + input[8] = false // skip_seqkit_replace + input[9] = false // skip_seqkit_rmdup + input[10] = true // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = false // skip_adapterremoval + input[13] = "cutadapt" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = false // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = true // skip_complexity_filtering + input[19] = "" // val_complexity_filter_tool + input[20] = false // skip_deduplication + input[21] = true // skip_decontamination + input[22] = [] // ch_decontamination_fasta + input[23] = [] // ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = '' // val_decontamination_tool + input[26] = false // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot( + path(workflow.out.reads[0][1][0]).linesGzip.size(), + path(workflow.out.reads[0][1][1]).linesGzip.size(), + file(workflow.out.pre_stats_fastqc_html[0][1][0]).name, + file(workflow.out.pre_stats_fastqc_html[0][1][1]).name, + file(workflow.out.post_stats_fastqc_html[0][1][0]).name, + 
file(workflow.out.post_stats_fastqc_html[0][1][1]).name, + file(workflow.out.pre_stats_fastqc_zip[0][1][0]).name, + file(workflow.out.pre_stats_fastqc_zip[0][1][1]).name, + file(workflow.out.post_stats_fastqc_zip[0][1][0]).name, + file(workflow.out.post_stats_fastqc_zip[0][1][1]).name, + path(workflow.out.clumpify_log[0][1]).readLines().size(), + workflow.out.versions.collect { path(it).yaml } + ).match() } + ) + } + } + + test("sarscov2 - fastq - adapterremoval - merge - cat more files - paired_end") { + + when { + params { + save_merged = true + adapterremoval_args = save_merged ? "--collapse" : "" + } + workflow { + """ + input[0] = channel.of( + [ + [ id:'test', single_end:false ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ], + [ + [ id:'test2', single_end:false ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) + ] + ] + ) + input[1] = true // skip_fastqc + input[2] = true // skip_seqfu_check + input[3] = true // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = true // skip_seqkit_sana_pair + input[7] = true // skip_seqkit_seq + input[8] = true // skip_seqkit_replace + input[9] = true // skip_seqkit_rmdup + input[10] = true // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = false // skip_adapterremoval + input[13] = "adapterremoval" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = params.save_merged // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = true // 
skip_complexity_filtering + input[19] = "" // val_complexity_filter_tool + input[20] = true // skip_deduplication + input[21] = true // skip_decontamination + input[22] = [] // ch_decontamination_fasta + input[23] = [] // ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = '' // val_decontamination_tool + input[26] = false // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot( + workflow.out.reads[0][1], + workflow.out.reads[1][1], + workflow.out.adapterremoval_discarded_reads.collect { file(it[1]).name }, + workflow.out.versions.collect { path(it).yaml } + ).match() } + ) + } + } + + test("sarscov2 - fastq - skip all - single_end") { + + when { + params { + save_merged = false + adapterremoval_args = save_merged ? "--collapse" : "" + } + workflow { + """ + input[0] = channel.of([ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + input[1] = true // skip_fastqc + input[2] = true // skip_seqfu_check + input[3] = true // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = true // skip_seqkit_sana_pair + input[7] = true // skip_seqkit_seq + input[8] = true // skip_seqkit_replace + input[9] = true // skip_seqkit_rmdup + input[10] = true // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = true // skip_adapterremoval + input[13] = "" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = false // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = true // skip_complexity_filtering + input[19] = "" // val_complexity_filter_tool + input[20] = true // skip_deduplication + input[21] = true // skip_decontamination + input[22] = [] // ch_decontamination_fasta + input[23] = [] // 
ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = "" // val_decontamination_tool + input[26] = true // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot( + workflow.out.reads[0][1], + workflow.out.versions.collect { path(it).yaml } + ).match() } + ) + } + } + + test("sarscov2 - fastq - skip all - single_end - stub") { + + options "-stub" + + when { + params { + save_merged = false + adapterremoval_args = save_merged ? "--collapse" : "" + } + workflow { + """ + input[0] = channel.of([ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + input[1] = true // skip_fastqc + input[2] = true // skip_seqfu_check + input[3] = true // skip_seqfu_stats + input[4] = true // skip_seqkit_stats + input[5] = true // skip_seqtk_comp + input[6] = true // skip_seqkit_sana_pair + input[7] = true // skip_seqkit_seq + input[8] = true // skip_seqkit_replace + input[9] = true // skip_seqkit_rmdup + input[10] = true // skip_umitools_extract + input[11] = 0 // val_umi_discard_read + input[12] = true // skip_adapterremoval + input[13] = "" // val_adapter_tool + input[14] = [] // ch_custom_adapters_file + input[15] = false // val_save_merged + input[16] = false // val_fastp_discard_trimmed_pass + input[17] = false // val_fastp_save_trimmed_fail + input[18] = true // skip_complexity_filtering + input[19] = "" // val_complexity_filter_tool + input[20] = true // skip_deduplication + input[21] = true // skip_decontamination + input[22] = [] // ch_decontamination_fasta + input[23] = [] // ch_decontamination_reference + input[24] = [] // val_decontamination_index_name + input[25] = "" // val_decontamination_tool + input[26] = true // skip_final_concatenation + """ + } + } + then { + assert workflow.success + assertAll( + { assert snapshot(workflow.out.reads).match() } + ) + } + } +} diff --git 
a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test.snap new file mode 100644 index 00000000000..e0ad50793e0 --- /dev/null +++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/main.nf.test.snap @@ -0,0 +1,255 @@ +{ + "sarscov2 - fastq - seqfu - seqkit - deacon - single_end": { + "content": [ + "test.fq.gz:md5,f3a7626275fad3775b6005fad9c13c27", + [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,4c6409169772005cfb06be9e41f2c1e2" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,7573d0c83cfc9af6e1ced67a45265381" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,24dd7cfbb9ae0034d0bd804f464f11fa" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,35dd18aff6780370b48027fec9c7d900" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idx:md5,84e4985c91800686db9c9dca28fabd1a" + ] + ], + "test.json", + [ + [ + { + "id": "test", + "single_end": true + }, + "test_mqc.txt:md5,1facba42f81058e557e3d85dcff2a6f3" + ], + [ + { + "id": "test", + "single_end": true + }, + "test_mqc.txt:md5,a039b8c1cc923db88d2484d3abbf00fe" + ] + ], + [ + { + "FASTQ_SHORTREADS_PREPROCESS_QC:POST_STATS:SEQFU_CHECK": { + "seqfu": "1.22.3" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_REPLACE": { + "seqkit": "2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_RMDUP": { + "seqkit": "v2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_SEQ": { + "seqkit": "v2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_DECONTAMINATE_DEACON_HOSTILE:FASTQ_INDEX_FILTER_DEACON:DEACON_INDEX": { + "deacon": "0.12.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:POST_STATS:SEQFU_STATS": { + "seqfu": "1.22.3" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:PRE_STATS:SEQFU_CHECK": { + 
"seqfu": "1.22.3" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:FASTQ_SANITISE_SEQKIT:SEQKIT_SANA": { + "seqkit": "2.10.1" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:PRE_STATS:SEQFU_STATS": { + "seqfu": "1.22.3" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:42:12.770307912" + }, + "sarscov2 - fastq - adapterremoval - merge - cat more files - paired_end": { + "content": [ + "test.merged.fastq.gz:md5,369452751050a7f1e31b839702d61417", + "test2.merged.fastq.gz:md5,369452751050a7f1e31b839702d61417", + [ + "test.discarded.fastq.gz", + "test2.discarded.fastq.gz" + ], + [ + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_REMOVEADAPTERS_MERGE:ADAPTERREMOVAL_PE": { + "adapterremoval": "2.3.2" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-15T11:56:56.943674841" + }, + "sarscov2 - fastq - fastqc - seqkit - cutadapt - clumpify - cat - paired_end": { + "content": [ + 400, + 400, + "test_1_fastqc.html", + "test_2_fastqc.html", + "test_1_fastqc.html", + "test_2_fastqc.html", + "test_1_fastqc.zip", + "test_2_fastqc.zip", + "test_1_fastqc.zip", + "test_2_fastqc.zip", + 41, + [ + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_REPLACE": { + "seqkit": "2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:BBMAP_CLUMPIFY": { + "bbmap": 39.18 + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_RMDUP": { + "seqkit": "v2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:SEQKIT_SEQ": { + "seqkit": "v2.9.0" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:FASTQ_SANITISE_SEQKIT:SEQKIT_SANA": { + "seqkit": "2.10.1" + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_PREPROCESS_SEQKIT:FASTQ_SANITISE_SEQKIT:SEQKIT_PAIR": { + "seqkit": "2.9.0" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": 
"2026-01-27T15:36:35.200576192" + }, + "sarscov2 - fastq - skip all - single_end": { + "content": [ + "/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/fastq/test_1.fastq.gz", + [ + + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T11:53:40.26586129" + }, + "sarscov2 - fastq - umitools - prinseq - clumpify - cat - single_end": { + "content": [ + 400, + 51, + 41, + [ + { + "FASTQ_SHORTREADS_PREPROCESS_QC:BBMAP_CLUMPIFY": { + "bbmap": 39.18 + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:FASTQ_COMPLEXITY_FILTER:PRINSEQPLUSPLUS": { + "prinseqplusplus": 1.2 + } + }, + { + "FASTQ_SHORTREADS_PREPROCESS_QC:UMITOOLS_EXTRACT": { + "umitools": "1.1.6" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-27T16:03:31.345098636" + }, + "sarscov2 - fastq - skip all - single_end - stub": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/fastq/test_1.fastq.gz" + ] + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T12:17:19.455574032" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/nextflow.config b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/nextflow.config new file mode 100644 index 00000000000..f30477b3b19 --- /dev/null +++ b/subworkflows/nf-core/fastq_shortreads_preprocess_qc/tests/nextflow.config @@ -0,0 +1,61 @@ +process { + withName: "SEQKIT_SANA" { + ext.prefix = { "${meta.id}_${meta.strandness}" } + } + + withName: "SEQKIT_SEQ" { + ext.args = [ + "--remove-gaps", + "--upper-case", + "--validate-seq", + "--min-len 30", + "--max-len 5000" + ].join(' ').trim() + ext.prefix = { "intermediate_seqkit_seq_${meta.strandness}" } + } + + withName: "SEQKIT_REPLACE" { + ext.args = '-p "/" -r "_"' + ext.suffix = ".fasta" + ext.prefix = { 
"intermediate_seqkit_replace_${meta.strandness}" } + } + + withName: "SEQKIT_RMDUP" { + ext.prefix = { "${meta.id}_${meta.strandness}" } + } + + withName: "UMITOOLS_EXTRACT" { + ext.args = '--bc-pattern="NNNN"' + } + + withName: "CUTADAPT" { + ext.args = '-q 25' + } + + withName: "ADAPTERREMOVAL_PE" { + ext.args = params.adapterremoval_args + } + + withName: ".*:FASTQ_COMPLEXITY_FILTER:BBMAP_BBDUK" { + ext.args = [ + 'entropy=0.7', // needed to turn on complexity filtering + 'minlength=0' // needed, to not discard reads shorter than this + ].join(' ') + ext.prefix = { "${meta.id}.trim" } + } + + // need FASTP to only do complexity filtering + withName: ".*:FASTQ_COMPLEXITY_FILTER:FASTP" { + ext.args = [ + '--low_complexity_filter', + '--disable_adapter_trimming', + '--disable_trim_poly_g', + '--disable_quality_filtering', + '--disable_length_filtering' + ].join(' ') + } + + withName: "BBMAP_CLUMPIFY" { + ext.args = 'dedupe=t' + } +}