-
Notifications
You must be signed in to change notification settings - Fork 956
new subworkflow - fastq shortreads preprocess qc #9665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
fa0162d
c363af7
b3a01df
e1a7bbb
4447e52
4c88066
e37b11e
dd3a374
f8e0ef1
c04e29a
912c6b7
76d0119
afa52dd
5be99dd
77ff8c9
32102cc
b215d3e
a928709
c332931
883d909
cc64bd2
c6c5edc
774df05
d808b21
9a16ec1
de55b83
4c77a3b
463ffcd
a1ff0e2
2658ccf
0d86549
72fa54e
f33e1e0
d3887fd
e3b50c5
38c59f0
b42e760
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,239 @@ | ||||||
| // statistics | ||||||
| include { FASTQ_QC_STATS as PRE_STATS } from '../fastq_qc_stats/main' | ||||||
| include { FASTQ_QC_STATS as POST_STATS } from '../fastq_qc_stats/main' | ||||||
| // preprocessing | ||||||
| include { FASTQ_PREPROCESS_SEQKIT } from '../fastq_preprocess_seqkit/main' | ||||||
| // barcoding | ||||||
| include { UMITOOLS_EXTRACT } from '../../../modules/nf-core/umitools/extract/main' | ||||||
| // adapter removal and merging | ||||||
| include { FASTQ_REMOVEADAPTERS_MERGE } from '../fastq_removeadapters_merge/main' | ||||||
| // complexity filtering | ||||||
| include { FASTQ_COMPLEXITY_FILTER } from '../fastq_complexity_filter/main' | ||||||
| // deduplication | ||||||
| include { BBMAP_CLUMPIFY } from '../../../modules/nf-core/bbmap/clumpify/main' | ||||||
| // host decontamination | ||||||
| include { FASTQ_DECONTAMINATE_DEACON_HOSTILE } from '../fastq_decontaminate_deacon_hostile/main' | ||||||
| // final concatenation | ||||||
| include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' | ||||||
|
|
||||||
| workflow FASTQ_SHORTREADS_PREPROCESS_QC { | ||||||
|
|
||||||
| take: | ||||||
| ch_reads // channel: [ val(meta), [ fastq ] ] | ||||||
| // statistics | ||||||
| skip_fastqc // boolean | ||||||
| skip_seqfu_check // boolean | ||||||
| skip_seqfu_stats // boolean | ||||||
| skip_seqkit_stats // boolean | ||||||
| skip_seqtk_comp // boolean | ||||||
| // preprocessing | ||||||
| skip_seqkit_sana_pair // boolean | ||||||
| skip_seqkit_seq // boolean | ||||||
| skip_seqkit_replace // boolean | ||||||
| skip_seqkit_rmdup // boolean | ||||||
| // barcoding | ||||||
| skip_umitools_extract // boolean | ||||||
| val_umi_discard_read // integer: 0, 1 or 2 | ||||||
| // adapter removal and merging | ||||||
| skip_adapterremoval // boolean | ||||||
| val_adapter_tool // string: [mandatory] tool_name // choose from: ["trimmomatic", "cutadapt", "trimgalore", "bbduk", "leehom", "fastp", "adapterremoval"] | ||||||
| ch_custom_adapters_file // channel: [optional] [ {fasta,txt} ] // fasta, for bbduk or fastp, or txt, for adapterremoval | ||||||
| val_save_merged // boolean: [mandatory] if true, will return the merged reads instead, for fastp and adapterremoval | ||||||
| val_fastp_discard_trimmed_pass // boolean: [mandatory] // only for fastp | ||||||
| val_fastp_save_trimmed_fail // boolean: [mandatory] // only for fastp | ||||||
| // complexity filtering | ||||||
| skip_complexity_filtering // boolean | ||||||
vagkaratzas marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| val_complexity_filter_tool // string: [mandatory] tool_name // choose from: ["prinseqplusplus", "bbduk", "fastp"] | ||||||
| // deduplication | ||||||
| skip_deduplication // boolean | ||||||
| // host decontamination | ||||||
| skip_decontamination // boolean | ||||||
| ch_decontamination_fasta // channel: [ val(meta), [ fasta ] ] (optional) | ||||||
| ch_decontamination_reference // channel: [ val(reference_name), path(reference_dir) ] (optional) | ||||||
| val_decontamination_index_name // val (optional) | ||||||
| val_decontamination_tool // string (enum): 'hostile' or 'deacon' | ||||||
| // final concatenation | ||||||
| skip_final_concatenation // boolean | ||||||
|
|
||||||
| main: | ||||||
|
|
||||||
| ch_versions = channel.empty() | ||||||
| ch_multiqc_files = channel.empty() | ||||||
| ch_umi_log = channel.empty() | ||||||
| ch_adapterremoval_discarded_reads = channel.empty() | ||||||
| ch_adapterremoval_logfile = channel.empty() | ||||||
| ch_adapterremoval_report = channel.empty() | ||||||
| ch_complexity_filter_log = channel.empty() | ||||||
| ch_complexity_filter_report = channel.empty() | ||||||
| ch_clumpify_log = channel.empty() | ||||||
| ch_hostile_reference = channel.empty() | ||||||
| ch_hostile_json = channel.empty() | ||||||
| ch_deacon_index = channel.empty() | ||||||
| ch_deacon_summary = channel.empty() | ||||||
|
|
||||||
| // pre-statistics | ||||||
| PRE_STATS ( | ||||||
| ch_reads, | ||||||
| skip_fastqc, | ||||||
| skip_seqfu_check, | ||||||
| skip_seqfu_stats, | ||||||
| skip_seqkit_stats, | ||||||
| skip_seqtk_comp | ||||||
| ) | ||||||
| ch_pre_stats_fastqc_html = PRE_STATS.out.fastqc_html | ||||||
| ch_pre_stats_fastqc_zip = PRE_STATS.out.fastqc_zip | ||||||
| ch_pre_stats_seqfu_check = PRE_STATS.out.seqfu_check | ||||||
| ch_pre_stats_seqfu_stats = PRE_STATS.out.seqfu_stats | ||||||
| ch_pre_stats_seqkit_stats = PRE_STATS.out.seqkit_stats | ||||||
| ch_pre_stats_seqtk_stats = PRE_STATS.out.seqtk_stats | ||||||
| ch_multiqc_files = ch_multiqc_files.mix(PRE_STATS.out.seqfu_multiqc) | ||||||
| ch_versions = ch_versions.mix(PRE_STATS.out.versions) | ||||||
|
|
||||||
| // preprocessing | ||||||
| FASTQ_PREPROCESS_SEQKIT ( | ||||||
| ch_reads, | ||||||
| skip_seqkit_sana_pair, | ||||||
| skip_seqkit_seq, | ||||||
| skip_seqkit_replace, | ||||||
| skip_seqkit_rmdup | ||||||
| ) | ||||||
| ch_reads = FASTQ_PREPROCESS_SEQKIT.out.reads | ||||||
| ch_versions = ch_versions.mix(FASTQ_PREPROCESS_SEQKIT.out.versions) | ||||||
|
|
||||||
| // barcoding | ||||||
| if (!skip_umitools_extract) { | ||||||
| UMITOOLS_EXTRACT( ch_reads ) | ||||||
| ch_umi_reads = UMITOOLS_EXTRACT.out.reads | ||||||
| ch_umi_log = UMITOOLS_EXTRACT.out.log | ||||||
| ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) | ||||||
|
|
||||||
| // Discard R1 / R2 if required | ||||||
| if (val_umi_discard_read in [1, 2]) { | ||||||
| ch_umi_reads = UMITOOLS_EXTRACT.out.reads | ||||||
| .map { meta, reads -> | ||||||
| meta.single_end ? [meta, reads] : [meta + ['single_end': true], reads[val_umi_discard_read % 2]] | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| ch_reads = ch_umi_reads | ||||||
| } | ||||||
|
|
||||||
| // adapter removal and merging | ||||||
| if (!skip_adapterremoval) { | ||||||
| FASTQ_REMOVEADAPTERS_MERGE ( | ||||||
| ch_reads, | ||||||
| val_adapter_tool, | ||||||
| ch_custom_adapters_file, | ||||||
| val_save_merged, | ||||||
| val_fastp_discard_trimmed_pass, | ||||||
| val_fastp_save_trimmed_fail | ||||||
| ) | ||||||
| ch_adapterremoval_discarded_reads = FASTQ_REMOVEADAPTERS_MERGE.out.discarded_reads | ||||||
| ch_adapterremoval_logfile = FASTQ_REMOVEADAPTERS_MERGE.out.logfile | ||||||
| ch_adapterremoval_report = FASTQ_REMOVEADAPTERS_MERGE.out.report | ||||||
| ch_reads = FASTQ_REMOVEADAPTERS_MERGE.out.processed_reads | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are you sure you want to overwrite `ch_reads`? You should really be passing the output of one step to the next (but having a re-route around this if you are skipping it). Something like:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nextflow is clever enough to know if […]. Double checked with GPT: These channel / dataflow dependencies change whenever […]. The only stuff that will run in parallel is the pre-stats generation and whichever next-in-line process/subworkflow is not skipped (because pre-stats does not alter the ch_reads).
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't trust GPT, but OK ... I find it MUCH harder to read though, as my mental map of Nextflow is that the order in which you define things in the script doesn't necessarily guide how things get executed - it depends how you stick things together... So, for readability of the expected flow of the workflow, I would much rather have separate channels. (I feel more strongly about this than the […])
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My two cents: I think re-using the ch_reads is readable. My understanding is that Nextflow will parse the file, will "discard" the sections based on the if statements, and then the DAG will know which channel to use as it goes along. But I think having the most readable code possible is very important. I think James is suggesting something like this (almost pseudocode): // preprocessing
FASTQ_PREPROCESS_SEQKIT (
ch_reads,
skip_seqkit_sana_pair,
skip_seqkit_seq,
skip_seqkit_replace,
skip_seqkit_rmdup
)
ch_preprocessed_reads = FASTQ_PREPROCESS_SEQKIT.out.reads
// barcoding
ch_umi_reads = channel.empty()
if (!skip_umitools_extract) {
UMITOOLS_EXTRACT( ch_reads )
ch_umi_reads = UMITOOLS_EXTRACT.out.reads
ch_umi_log = UMITOOLS_EXTRACT.out.log
ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first())
// Discard R1 / R2 if required
if (val_umi_discard_read in [1, 2]) {
ch_umi_reads = UMITOOLS_EXTRACT.out.reads
.map { meta, reads ->
meta.single_end ? [meta, reads] : [meta + ['single_end': true], reads[val_umi_discard_read % 2]]
}
}
}
ch_umi_reads = ch_umi_reads.ifEmpty(ch_preprocessed_reads)
// adapter removal and merging
ch_adapter_removed = channel.empty()
if (!skip_adapterremoval) {
FASTQ_REMOVEADAPTERS_MERGE (
ch_umi_reads,
val_adapter_tool,
ch_custom_adapters_file,
val_save_merged,
val_fastp_discard_trimmed_pass,
val_fastp_save_trimmed_fail
)
ch_adapter_removed = FASTQ_REMOVEADAPTERS_MERGE.out.processed_reads
}
ch_adapter_removed = ch_adapter_removed.ifEmpty(ch_umi_reads)
or, instead of […], […] which I think is equivalent to re-using the ch_reads.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm actually not sure here. I myself also usually prefer to name channels explicitly, because I think it improves traceability, but I have also used the same pattern as here (re-use a channel and overwrite) because it looks cleaner. To me, both are acceptable. |
||||||
| ch_multiqc_files = ch_multiqc_files.mix(FASTQ_REMOVEADAPTERS_MERGE.out.multiqc_files) | ||||||
| ch_versions = ch_versions.mix(FASTQ_REMOVEADAPTERS_MERGE.out.versions) | ||||||
| } | ||||||
|
|
||||||
| // complexity filtering | ||||||
| if (!skip_complexity_filtering) { | ||||||
| FASTQ_COMPLEXITY_FILTER( ch_reads, val_complexity_filter_tool ) | ||||||
| ch_reads = FASTQ_COMPLEXITY_FILTER.out.filtered_reads | ||||||
| ch_complexity_filter_log = FASTQ_COMPLEXITY_FILTER.out.logfile | ||||||
| ch_complexity_filter_report = FASTQ_COMPLEXITY_FILTER.out.report | ||||||
| ch_multiqc_files = ch_multiqc_files.mix(FASTQ_COMPLEXITY_FILTER.out.multiqc_files) | ||||||
| ch_versions = ch_versions.mix(FASTQ_COMPLEXITY_FILTER.out.versions) | ||||||
| } | ||||||
|
|
||||||
| // deduplication | ||||||
| if (!skip_deduplication) { | ||||||
| BBMAP_CLUMPIFY( ch_reads ) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Uhh, I didn't realise that clumpify does deduplication? I thought it just reordered the reads that are similar to each other, in order to improve compression?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good point... it doesn't do deduplication by default, but only when the […] |
||||||
| ch_reads = BBMAP_CLUMPIFY.out.reads | ||||||
| ch_clumpify_log = BBMAP_CLUMPIFY.out.log | ||||||
| ch_versions = ch_versions.mix(BBMAP_CLUMPIFY.out.versions.first()) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For now, it's good practice for module versions to do […] that doesn't make much sense. Subworkflows, on the other hand, are different, since they'll contain versions from different modules. Which, with the new versions topics, will be obsolete anyway. |
||||||
| } | ||||||
|
|
||||||
| // host decontamination | ||||||
| if (!skip_decontamination) { | ||||||
| FASTQ_DECONTAMINATE_DEACON_HOSTILE ( | ||||||
| ch_reads, | ||||||
| ch_decontamination_fasta, | ||||||
| ch_decontamination_reference, | ||||||
| val_decontamination_index_name, | ||||||
| val_decontamination_tool | ||||||
| ) | ||||||
| ch_reads = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.fastq_filtered | ||||||
| ch_hostile_reference = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.reference | ||||||
| ch_hostile_json = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.json | ||||||
| ch_deacon_index = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.index | ||||||
| ch_deacon_summary = FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.summary | ||||||
| ch_versions = ch_versions.mix(FASTQ_DECONTAMINATE_DEACON_HOSTILE.out.versions) | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| // final concatenation | ||||||
| if (!skip_final_concatenation) { | ||||||
| CAT_FASTQ ( ch_reads ) | ||||||
| ch_reads = CAT_FASTQ.out.reads | ||||||
| } | ||||||
|
Comment on lines
+176
to
+179
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Presumably this should be skipped if there isn't more than one fastq either.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I'll let the module itself handle these cases since it's been coded to do so. Else what would be the best alternative? |
||||||
|
|
||||||
| // post-statistics | ||||||
| POST_STATS ( | ||||||
| ch_reads, | ||||||
| skip_fastqc, | ||||||
| skip_seqfu_check, | ||||||
| skip_seqfu_stats, | ||||||
| skip_seqkit_stats, | ||||||
| skip_seqtk_comp | ||||||
| ) | ||||||
| ch_post_stats_fastqc_html = POST_STATS.out.fastqc_html | ||||||
| ch_post_stats_fastqc_zip = POST_STATS.out.fastqc_zip | ||||||
| ch_post_stats_seqfu_check = POST_STATS.out.seqfu_check | ||||||
| ch_post_stats_seqfu_stats = POST_STATS.out.seqfu_stats | ||||||
| ch_post_stats_seqkit_stats = POST_STATS.out.seqkit_stats | ||||||
| ch_post_stats_seqtk_stats = POST_STATS.out.seqtk_stats | ||||||
| ch_multiqc_files = ch_multiqc_files.mix(POST_STATS.out.seqfu_multiqc) | ||||||
| ch_versions = ch_versions.mix(POST_STATS.out.versions) | ||||||
|
|
||||||
| emit: | ||||||
| reads = ch_reads // channel: [ val(meta), [ fastq ] ] | ||||||
|
|
||||||
| // statistics | ||||||
| pre_stats_fastqc_html = ch_pre_stats_fastqc_html | ||||||
| pre_stats_fastqc_zip = ch_pre_stats_fastqc_zip | ||||||
| pre_stats_seqfu_check = ch_pre_stats_seqfu_check | ||||||
| pre_stats_seqfu_stats = ch_pre_stats_seqfu_stats | ||||||
| pre_stats_seqkit_stats = ch_pre_stats_seqkit_stats | ||||||
| pre_stats_seqtk_stats = ch_pre_stats_seqtk_stats | ||||||
| post_stats_fastqc_html = ch_post_stats_fastqc_html | ||||||
| post_stats_fastqc_zip = ch_post_stats_fastqc_zip | ||||||
| post_stats_seqfu_check = ch_post_stats_seqfu_check | ||||||
| post_stats_seqfu_stats = ch_post_stats_seqfu_stats | ||||||
| post_stats_seqkit_stats = ch_post_stats_seqkit_stats | ||||||
| post_stats_seqtk_stats = ch_post_stats_seqtk_stats | ||||||
|
Comment on lines
+203
to
+214
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ugh.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Food for thought... |
||||||
|
|
||||||
| // barcoding | ||||||
| umi_log = ch_umi_log | ||||||
|
|
||||||
| // adapter removal and merging | ||||||
| adapterremoval_discarded_reads = ch_adapterremoval_discarded_reads | ||||||
| adapterremoval_logfile = ch_adapterremoval_logfile | ||||||
| adapterremoval_report = ch_adapterremoval_report | ||||||
|
|
||||||
| // complexity filtering | ||||||
| complexity_filter_log = ch_complexity_filter_log | ||||||
| complexity_filter_report = ch_complexity_filter_report | ||||||
|
|
||||||
| // deduplication | ||||||
| clumpify_log = ch_clumpify_log | ||||||
|
|
||||||
| // host decontamination | ||||||
| hostile_reference = ch_hostile_reference | ||||||
| hostile_json = ch_hostile_json | ||||||
| deacon_index = ch_deacon_index | ||||||
| deacon_summary = ch_deacon_summary | ||||||
|
|
||||||
| multiqc_files = ch_multiqc_files | ||||||
| versions = ch_versions // channel: [ versions.yml ] | ||||||
| } | ||||||
Uh oh!
There was an error while loading. Please reload this page.