From 435e8b88b02c7e71ce70bc4c024e4191ae28428c Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 2 Jul 2024 16:36:53 -0400 Subject: [PATCH 1/5] fix: CMake build errors on Mac resolved - closes #176, CMake was including `src/utils/bedtools/gzstream/version` as a C++ source file, when it was a simple text file containing version info - I moved this version info into `src/utils/bedtools/gzstream/README` --- src/utils/bedtools/gzstream/README | 2 ++ src/utils/bedtools/gzstream/version | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) delete mode 100644 src/utils/bedtools/gzstream/version diff --git a/src/utils/bedtools/gzstream/README b/src/utils/bedtools/gzstream/README index 5fb78b2..d2fccb8 100644 --- a/src/utils/bedtools/gzstream/README +++ b/src/utils/bedtools/gzstream/README @@ -4,3 +4,5 @@ =========================================================================== See index.html for documentation and installation instructions. + +Version 1.5 (08 Jan 2003) diff --git a/src/utils/bedtools/gzstream/version b/src/utils/bedtools/gzstream/version deleted file mode 100644 index 511137d..0000000 --- a/src/utils/bedtools/gzstream/version +++ /dev/null @@ -1 +0,0 @@ -1.5 (08 Jan 2003) From 1260bf9f5c42c5b863d68fd1077f188f4df9d9e4 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 2 Jul 2024 19:18:21 +0000 Subject: [PATCH 2/5] fix: min_intron_length_ initialization in JunctionsExtractor - fixes #188, min_intron_length_ was accidentally set from min_anchor_len_ --- src/junctions/junctions_extractor.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index a95d199..0b14404 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -196,8 +196,23 @@ class JunctionsExtractor { region_ = "."; ref_ = "NA"; } - JunctionsExtractor(string bam1, string region1, int strandness1, string strand_tag1, uint32_t min_anchor_length1, uint32_t min_intron_length1, uint32_t max_intron_length1, string ref1) : - bam_(bam1), region_(region1), strandness_(strandness1), strand_tag_(strand_tag1), min_anchor_length_(min_anchor_length1), min_intron_length_(min_anchor_length1), max_intron_length_(max_intron_length1), ref_(ref1){ + JunctionsExtractor( + string bam1, + string region1, + int strandness1, + string strand_tag1, + uint32_t min_anchor_length1, + uint32_t min_intron_length1, + uint32_t max_intron_length1, + string ref1) : + bam_(bam1), + region_(region1), + strandness_(strandness1), + strand_tag_(strand_tag1), + min_anchor_length_(min_anchor_length1), + min_intron_length_(min_intron_length1), + max_intron_length_(max_intron_length1), + ref_(ref1) { junctions_sorted_ = false; output_file_ = "NA"; output_barcodes_file_ = "NA"; From 409a49257f0523233690ef81aa0141ce12526e2e Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 3 Jul 2024 16:29:26 -0400 Subject: [PATCH 3/5] feat: added per-read flag and map quality filtering - closes #183, flag based filtering - closes #189, mapping quality based filtering - option '-F' filters reads containing any of these flags - option '-f' filters reads not containing all these flags - option '-q' filters reads below this mapping quality --- .../cis_splice_effects_identifier.cc | 18 +++++++++-- .../cis_splice_effects_identifier.h | 11 ++++++- src/junctions/junctions_extractor.cc | 32 +++++++++++++++++-- src/junctions/junctions_extractor.h | 19 ++++++++++- 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.cc b/src/cis-splice-effects/cis_splice_effects_identifier.cc index d43bcea..5bead6d 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.cc +++ b/src/cis-splice-effects/cis_splice_effects_identifier.cc @@ -46,6 +46,9 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) { << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; + out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; + out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl; + out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl; out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n" << "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n" << "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl; @@ -113,7 +116,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { optind = 1; //Reset before parsing again. stringstream help_ss; char c; - while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) { + while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) { switch(c) { case 'o': output_file_ = string(optarg); @@ -170,6 +173,15 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { case 'M': max_intron_length_ = atoi(optarg); break; + case 'f': + require_flags_ = atoi(optarg); + break; + case 'F': + filter_flags_ = atoi(optarg); + break; + case 'q': + min_map_qual_ = atoi(optarg); + break; case 'b': output_barcodes_file_ = string(optarg); break; @@ -285,7 +297,9 @@ void CisSpliceEffectsIdentifier::identify() { } else { ref_to_pass = "NA"; } - JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass); + JunctionsExtractor je1(bam_, variant_region, strandness_, + strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, + filter_flags_, require_flags_, min_map_qual_, ref_to_pass); je1.identify_junctions_from_BAM(); vector junctions = je1.get_all_junctions(); //Add all the junctions to the unique set diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.h b/src/cis-splice-effects/cis_splice_effects_identifier.h index b6b1960..adc9a67 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.h +++ b/src/cis-splice-effects/cis_splice_effects_identifier.h @@ -87,13 +87,19 @@ class CisSpliceEffectsIdentifier { //tag used in BAM to denote strand, default "XS" string strand_tag_; //Minimum anchor length for junctions - //Junctions need atleast this many bp overlap + //Junctions need at least this many bp overlap // on both ends. uint32_t min_anchor_length_; //Minimum length of an intron, i.e min junction width uint32_t min_intron_length_; //Maximum length of an intron, i.e max junction width uint32_t max_intron_length_; + //filter reads containing any of these flags + uint16_t filter_flags_; + // filter reads not containing all of these flags + uint16_t require_flags_; + // filter reads below the minimum mapping quality + uint8_t min_map_qual_; //whether to override strand of extracted junctions using intron-motif method bool override_strand_with_canonical_intron_motif_; public: @@ -114,6 +120,9 @@ class CisSpliceEffectsIdentifier { min_anchor_length_(8), min_intron_length_(70), max_intron_length_(500000), + filter_flags_(0), + require_flags_(0), + min_map_qual_(0), override_strand_with_canonical_intron_motif_(false) {} //Destructor ~CisSpliceEffectsIdentifier() { diff --git a/src/junctions/junctions_extractor.cc b/src/junctions/junctions_extractor.cc index 8960513..589eb44 100644 --- a/src/junctions/junctions_extractor.cc +++ b/src/junctions/junctions_extractor.cc @@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { optind = 1; //Reset before parsing again. int c; stringstream help_ss; - while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) { + while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) { switch(c) { case 'h': usage(help_ss); @@ -57,6 +57,15 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { case 'M': max_intron_length_ = atoi(optarg); break; + case 'f': + require_flags_ = atoi(optarg); + break; + case 'F': + filter_flags_ = atoi(optarg); + break; + case 'q': + min_map_qual_ = atoi(optarg); + break; case 'o': output_file_ = string(optarg); break; @@ -108,10 +117,17 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n"); } } + if ( (require_flags_ & filter_flags_) != 0) { + usage(); + throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n"); + } cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl; cerr << "Minimum intron length: " << min_intron_length_ << endl; cerr << "Maximum intron length: " << max_intron_length_ << endl; + cerr << "Require alignment flags: " << require_flags_ << endl; + cerr << "Filter alignment flags: " << filter_flags_ << endl; + cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl; cerr << "Alignment: " << bam_ << endl; cerr << "Output file: " << output_file_ << endl; if (output_barcodes_file_ != "NA"){ @@ -130,6 +146,9 @@ int JunctionsExtractor::usage(ostream& out) { << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; + out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; + out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl; + out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl; out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl; out << "\t\t" << "-r STR\tThe region to identify junctions \n" << "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl; @@ -358,7 +377,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i return; } -//Get the the barcode +//Get the barcode void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) { uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str()); if(p != NULL) { @@ -373,6 +392,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) { } } +//Filters alignments based on filtering flags and mapping quality +bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) { + if ((aln->core.flag & filter_flags_) != 0) return true; + if ((aln->core.flag | require_flags_) != aln->core.flag) return true; + if (aln->core.qual < min_map_qual_) return true; + return false; +} + //Parse junctions from the read and store in junction map int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) { int n_cigar = aln->core.n_cigar; @@ -523,6 +550,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() { //Initiate the alignment record bam1_t *aln = bam_init1(); while(sam_itr_next(in, iter, aln) >= 0) { + if (filter_alignment(header, aln)) continue; parse_alignment_into_junctions(header, aln); } hts_itr_destroy(iter); diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index 0b14404..496bb13 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -180,12 +180,21 @@ class JunctionsExtractor { string strand_tag_; //tag used in BAM to denote single cell barcode string barcode_tag_; + //filter reads containing any of these flags + uint16_t filter_flags_; + // filter reads not containing all of these flags + uint16_t require_flags_; + // filter reads below the minimum mapping quality + uint8_t min_map_qual_; public: //Default constructor JunctionsExtractor() { min_anchor_length_ = 8; min_intron_length_ = 70; max_intron_length_ = 500000; + filter_flags_ = 0; + require_flags_ = 0; + min_map_qual_ = 0; junctions_sorted_ = false; strandness_ = -1; strand_tag_ = "XS"; @@ -204,14 +213,20 @@ class JunctionsExtractor { uint32_t min_anchor_length1, uint32_t min_intron_length1, uint32_t max_intron_length1, + uint16_t filter_flags, + uint16_t require_flags, + uint8_t min_map_qual, string ref1) : bam_(bam1), region_(region1), strandness_(strandness1), strand_tag_(strand_tag1), min_anchor_length_(min_anchor_length1), - min_intron_length_(min_intron_length1), + min_intron_length_(min_anchor_length1), max_intron_length_(max_intron_length1), + filter_flags_(filter_flags), + require_flags_(require_flags), + min_map_qual_(min_map_qual), ref_(ref1) { junctions_sorted_ = false; output_file_ = "NA"; @@ -241,6 +256,8 @@ class JunctionsExtractor { void create_junctions_vector(); //Pull out the cigar string from the read int parse_read(bam_hdr_t *header, bam1_t *aln); + //Returns whether alignment should be filtered from junction analysis + bool filter_alignment(bam_hdr_t *header, bam1_t *aln); //Parse junctions from the read and store in junction map int parse_cigar_into_junctions(string chr, int read_pos, uint32_t *cigar, int n_cigar); From 1e3080c02dfa3493293863dc136a421dc8ea6226 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 2 Jul 2024 19:29:58 +0000 Subject: [PATCH 4/5] feat: added per-read anchor requirement to junction extract - closes #186, reads now only 'support' a junction if they have at least a given minimum anchor length, supplied with the '-A' flag (default 0) --- .../cis_splice_effects_identifier.cc | 10 ++++++++-- .../cis_splice_effects_identifier.h | 6 ++++-- src/junctions/junctions_extractor.cc | 17 +++++++++++++++-- src/junctions/junctions_extractor.h | 10 +++++++--- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.cc b/src/cis-splice-effects/cis_splice_effects_identifier.cc index 5bead6d..d2d2119 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.cc +++ b/src/cis-splice-effects/cis_splice_effects_identifier.cc @@ -44,6 +44,8 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) { out << "\t\t" << "-t STR\tTag used in bam to label strand. [XS]" << endl; out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n" << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; + out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n" + << "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; @@ -116,7 +118,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { optind = 1; //Reset before parsing again. stringstream help_ss; char c; - while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) { + while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:A:m:M:f:F:q:b:C")) != -1) { switch(c) { case 'o': output_file_ = string(optarg); @@ -167,6 +169,9 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) { case 'a': min_anchor_length_ = atoi(optarg); break; + case 'A': + min_read_anchor_length_ = atoi(optarg); + break; case 'm': min_intron_length_ = atoi(optarg); break; @@ -298,7 +303,8 @@ void CisSpliceEffectsIdentifier::identify() { ref_to_pass = "NA"; } JunctionsExtractor je1(bam_, variant_region, strandness_, - strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, + strand_tag_, min_anchor_length_, min_read_anchor_length_, + min_intron_length_, max_intron_length_, filter_flags_, require_flags_, min_map_qual_, ref_to_pass); je1.identify_junctions_from_BAM(); vector junctions = je1.get_all_junctions(); diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.h b/src/cis-splice-effects/cis_splice_effects_identifier.h index adc9a67..fc70131 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.h +++ b/src/cis-splice-effects/cis_splice_effects_identifier.h @@ -87,9 +87,10 @@ class CisSpliceEffectsIdentifier { //tag used in BAM to denote strand, default "XS" string strand_tag_; //Minimum anchor length for junctions - //Junctions need at least this many bp overlap - // on both ends. + //Junctions need at least this many bp overlap on both ends. uint32_t min_anchor_length_; + //Reads need at least this many bp overlap on both ends to support junction. + uint32_t min_read_anchor_length_; //Minimum length of an intron, i.e min junction width uint32_t min_intron_length_; //Maximum length of an intron, i.e max junction width @@ -118,6 +119,7 @@ class CisSpliceEffectsIdentifier { strandness_(-1), strand_tag_("XS"), min_anchor_length_(8), + min_read_anchor_length_(0), min_intron_length_(70), max_intron_length_(500000), filter_flags_(0), diff --git a/src/junctions/junctions_extractor.cc b/src/junctions/junctions_extractor.cc index 589eb44..535dc73 100644 --- a/src/junctions/junctions_extractor.cc +++ b/src/junctions/junctions_extractor.cc @@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { optind = 1; //Reset before parsing again. int c; stringstream help_ss; - while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) { + while((c = getopt(argc, argv, "ha:A:m:M:f:F:q:o:r:t:s:b:")) != -1) { switch(c) { case 'h': usage(help_ss); @@ -51,6 +51,9 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { case 'a': min_anchor_length_ = atoi(optarg); break; + case 'A': + min_read_anchor_length_ = atoi(optarg); + break; case 'm': min_intron_length_ = atoi(optarg); break; @@ -123,6 +126,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) { } cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl; + cerr << "Minimum read anchor length: " << min_read_anchor_length_ << endl; cerr << "Minimum intron length: " << min_intron_length_ << endl; cerr << "Maximum intron length: " << max_intron_length_ << endl; cerr << "Require alignment flags: " << require_flags_ << endl; @@ -144,6 +148,8 @@ int JunctionsExtractor::usage(ostream& out) { out << "Options:" << endl; out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n" << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; + out << "\t\t" << "-A INT\tMinimum read anchor length. Reads which satisfy a minimum \n" + << "\t\t\t " << "anchor length on both sides 'support' a junction. [0]" << endl; out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl; @@ -175,12 +181,19 @@ string JunctionsExtractor::get_new_junction_name() { return name_ss.str(); } -//Do some basic qc on the junction +//Update if junction passes QC based on current read alignment bool JunctionsExtractor::junction_qc(Junction &j1) { + // don't add support for junction if intron is wrong size if(j1.end - j1.start < min_intron_length_ || j1.end - j1.start > max_intron_length_) { return false; } + + // don't add support for junction if read isn't sufficiently anchored + if(j1.start - j1.thick_start < min_read_anchor_length_) return false; + if(j1.thick_end - j1.end < min_read_anchor_length_) return false; + + // add support, update if this junction is sufficiently anchored if(j1.start - j1.thick_start >= min_anchor_length_) j1.has_left_min_anchor = true; if(j1.thick_end - j1.end >= min_anchor_length_) diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index 496bb13..c969033 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -153,9 +153,10 @@ class JunctionsExtractor { //Reference FASTA file string ref_; //Minimum anchor length for junctions - //Junctions need atleast this many bp overlap - // on both ends. + //Junctions need at least this many bp overlap on both ends. uint32_t min_anchor_length_; + //Reads need at least this many bp overlap to support a junction + uint32_t min_read_anchor_length_; //Minimum length of an intron, i.e min junction width uint32_t min_intron_length_; //Maximum length of an intron, i.e max junction width @@ -190,6 +191,7 @@ class JunctionsExtractor { //Default constructor JunctionsExtractor() { min_anchor_length_ = 8; + min_read_anchor_length_ = 0; min_intron_length_ = 70; max_intron_length_ = 500000; filter_flags_ = 0; @@ -211,6 +213,7 @@ class JunctionsExtractor { int strandness1, string strand_tag1, uint32_t min_anchor_length1, + uint32_t min_read_anchor_length1, uint32_t min_intron_length1, uint32_t max_intron_length1, uint16_t filter_flags, @@ -222,7 +225,8 @@ class JunctionsExtractor { strandness_(strandness1), strand_tag_(strand_tag1), min_anchor_length_(min_anchor_length1), - min_intron_length_(min_anchor_length1), + min_read_anchor_length_(min_read_anchor_length1), + min_intron_length_(min_intron_length1), max_intron_length_(max_intron_length1), filter_flags_(filter_flags), require_flags_(require_flags), From 9f5e456ea7176ff39b03546482cb4314459e596f Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Mon, 1 Jul 2024 20:50:31 +0000 Subject: [PATCH 5/5] fix: junctions extractor count overlapping read pairs once - closes #187, implemented by adding set of `reads` to `Junction`, and only incrementing `read_count` if read has not been seen yet - this commit should also increase RegTools speed by removing the barcode updating bottleneck caused by repeatedly copying the barcode map - updated regtools to v1.1.0 --- CMakeLists.txt | 2 +- src/junctions/junctions_extractor.cc | 64 +++++++++++++++------------- src/junctions/junctions_extractor.h | 2 + 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73872bb..dbf29fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ include(TestHelper) #versioning stuff set (regtools_VERSION_MAJOR 1) -set (regtools_VERSION_MINOR 0) +set (regtools_VERSION_MINOR 1) set (regtools_VERSION_PATCH 0) configure_file ( diff --git a/src/junctions/junctions_extractor.cc b/src/junctions/junctions_extractor.cc index 535dc73..b7358c1 100644 --- a/src/junctions/junctions_extractor.cc +++ b/src/junctions/junctions_extractor.cc @@ -202,7 +202,8 @@ bool JunctionsExtractor::junction_qc(Junction &j1) { } //Add a junction to the junctions map -//The read_count field is the number of reads supporting the junction. +//The read_count field is the number of reads supporting the junction, +// and reads is a set of the read names supporting the junction int JunctionsExtractor::add_junction(Junction j1) { //Check junction_qc if(!junction_qc(j1)) { @@ -225,44 +226,46 @@ int JunctionsExtractor::add_junction(Junction j1) { } string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy; - //Check if new junction + //add new junction if(!junctions_.count(key)) { j1.name = get_new_junction_name(); j1.read_count = 1; j1.score = common::num_to_str(j1.read_count); - } else { //existing junction - Junction j0 = junctions_[key]; - + junctions_[key] = j1; + + } else { //update existing junction + if (output_barcodes_file_ != "NA"){ - unordered_map::const_iterator it = j0.barcodes.find(j1.barcodes.begin()->first); - - if (it != j0.barcodes.end()) {// barcode exists already - j1.barcodes = j0.barcodes; - j1.barcodes[it->first]++; + // NOTE: Junction j1 will always have exactly one barcode, + // that of the current alignment; i.e. j1.barcodes = {: 1} + auto it = junctions_[key].barcodes.find(j1.barcodes.begin()->first); + if (it != junctions_[key].barcodes.end()) {// barcode exists already + junctions_[key].barcodes[it->first]++; } else { - // this block is where the slowness happens - not sure if it's the instantiation or the insertion - // well, tried to get around instantion by just inserting into j0 but that made it like another 2x slower so I don't think it's that - pair tmp_barcode = *j1.barcodes.begin(); - j1.barcodes = j0.barcodes; - j1.barcodes.insert(tmp_barcode); + junctions_[key].barcodes[it->first] = 1; } } - //increment read count - j1.read_count = j0.read_count + 1; - j1.score = common::num_to_str(j1.read_count); - //Keep the same name - j1.name = j0.name; + // following barcodes convention, Junction j1 has one read, + // that of the current alignment; i.e. j1.reads = {} + string read_name = *j1.reads.begin(); + //avoid counting the same paired-end read twice + //if both segments overlap junction + if (!junctions_[key].reads.count(read_name)) { + junctions_[key].reads.insert(read_name); + junctions_[key].read_count++; + junctions_[key].score = common::num_to_str(junctions_[key].read_count); + } //Check if thick starts are any better - if(j0.thick_start < j1.thick_start) - j1.thick_start = j0.thick_start; - if(j0.thick_end > j1.thick_end) - j1.thick_end = j0.thick_end; - //preserve min anchor information - j1.has_left_min_anchor = j1.has_left_min_anchor || j0.has_left_min_anchor; - j1.has_right_min_anchor = j1.has_right_min_anchor || j0.has_right_min_anchor; + if(j1.thick_start < junctions_[key].thick_start) + junctions_[key].thick_start = j1.thick_start; + if(j1.thick_end > junctions_[key].thick_end) + junctions_[key].thick_end = j1.thick_end; + //update min anchor information + junctions_[key].has_left_min_anchor = + junctions_[key].has_left_min_anchor || j1.has_left_min_anchor; + junctions_[key].has_right_min_anchor = + junctions_[key].has_right_min_anchor || j1.has_right_min_anchor; } - //Add junction and check anchor while printing. - junctions_[key] = j1; return 0; } @@ -426,8 +429,9 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t Junction j1; j1.chrom = chr; + j1.thick_start = read_pos; // start pos of read j1.start = read_pos; //maintain start pos of junction - j1.thick_start = read_pos; + j1.reads.insert(bam_get_qname(aln)); string intron_motif; if (output_barcodes_file_ != "NA"){ diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index c969033..1243bc8 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -39,6 +39,8 @@ using namespace std; struct Junction : BED { //Number of reads supporting the junction unsigned int read_count; + //Reads supporting the junction + unordered_set reads; //This is the start - max overhang CHRPOS thick_start; //This is the end + max overhang