diff --git a/.gitmodules b/.gitmodules index 2b8d112..39b5ca2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,4 +19,7 @@ [submodule "vendor/GenomeWorks"] path = vendor/GenomeWorks url = https://github.com/clara-parabricks/GenomeWorks.git +[submodule "vendor/intervaltree"] + path = vendor/intervaltree + url = https://github.com/ekg/intervaltree.git branch = master diff --git a/CMakeLists.txt b/CMakeLists.txt index 64c2346..ad1b55c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) project(racon) -set(racon_version 1.4.17) +set(racon_version 1.5.0) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) @@ -27,13 +27,16 @@ if(racon_enable_cuda) endif() include_directories(${PROJECT_SOURCE_DIR}/src) +include_directories(${PROJECT_SOURCE_DIR}/vendor/intervaltree) set(racon_sources src/main.cpp + src/bed.cpp src/logger.cpp src/polisher.cpp src/overlap.cpp src/sequence.cpp + src/util.cpp src/window.cpp) if(racon_enable_cuda) @@ -103,11 +106,15 @@ if (racon_build_tests) include_directories(${PROJECT_SOURCE_DIR}/src) set(racon_test_sources + test/bed_test.cpp test/racon_test.cpp + test/util_test.cpp + src/bed.cpp src/logger.cpp src/polisher.cpp src/overlap.cpp src/sequence.cpp + src/util.cpp src/window.cpp) if (racon_enable_cuda) diff --git a/meson.build b/meson.build index 6d7e10a..a71bd47 100644 --- a/meson.build +++ b/meson.build @@ -1,7 +1,7 @@ project( 'Racon', 'cpp', - version : '1.4.13', + version : '1.5.0', default_options : [ 'buildtype=release', 'warning_level=3', diff --git a/src/bed.cpp b/src/bed.cpp new file mode 100644 index 0000000..3323eb3 --- /dev/null +++ b/src/bed.cpp @@ -0,0 +1,75 @@ +/*! + * @file bed.cpp + * + * @brief BED reader source file + */ + +#include +#include +#include + +#include "bed.hpp" + +namespace racon { + +bool BedFile::Deserialize(const std::string& line, BedRecord& record) { + if (line.empty()) { + return false; + } + char name_buff[1024]; + int64_t chrom_start = 0, chrom_end = 0; + int32_t n = sscanf(line.c_str(), "%s %ld %ld", name_buff, &chrom_start, &chrom_end); + if (n < 3 || chrom_end <= chrom_start) { + throw std::runtime_error("Invalid BED line: '" + line + "'"); + } + record = BedRecord(name_buff, chrom_start, chrom_end); + return true; +} + +void BedFile::Serialize(std::ostream& os, const BedRecord& record) { + os << record.chrom() << " " << record.chrom_start() << " " << record.chrom_end(); +} + +std::string BedFile::Serialize(const BedRecord& record) { + std::ostringstream oss; + Serialize(oss, record); + return oss.str(); +} + +BedReader::BedReader(const std::string& in_fn) + : file_{std::unique_ptr(new std::ifstream(in_fn))} + , in_(*file_.get()) +{ +} + +BedReader::BedReader(std::istream& in) + : in_(in) +{ +} + +bool BedReader::GetNext(BedRecord& record) { + const bool rv1 = !std::getline(in_, line_).fail(); + if (!rv1) + return false; + const bool rv2 = BedFile::Deserialize(line_, record); + return rv2; +} + +std::vector BedReader::ReadAll(const std::string& fn) +{ + std::ifstream in{fn}; + return ReadAll(in); +} + +std::vector BedReader::ReadAll(std::istream& in) +{ + std::vector records; + BedReader reader{in}; + BedRecord record; + while (reader.GetNext(record)) { + records.emplace_back(std::move(record)); + } + return records; +} + +} diff --git a/src/bed.hpp b/src/bed.hpp new file mode 100644 index 0000000..bfc0267 --- /dev/null +++ b/src/bed.hpp @@ -0,0 +1,93 @@ +/*! + * @file bed.hpp + * + * @brief BED file containers and parser. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace racon { + +class BedRecord; + +class BedFile { +public: + static bool Deserialize(const std::string& line, BedRecord& record); + static void Serialize(std::ostream& os, const BedRecord& record); + static std::string Serialize(const BedRecord& record); +}; + +/* + * \brief BedRecord container. + * Note: BED records have 0-based coordinates, and the end coordinate is non-inclusive. +*/ +class BedRecord { +public: + ~BedRecord() = default; + + BedRecord() = default; + + BedRecord(std::string _chrom, int64_t _chrom_start, int64_t _chrom_end) + : chrom_(std::move(_chrom)) + , chrom_start_(_chrom_start) + , chrom_end_(_chrom_end) {} + + const std::string& chrom() const { + return chrom_; + } + int64_t chrom_start() const { + return chrom_start_; + } + int64_t chrom_end() const { + return chrom_end_; + } + + void chrom(const std::string& val) { + chrom_ = val; + } + void chrom_start(int64_t val) { + chrom_start_ = val; + } + void chrom_end(int64_t val) { + chrom_end_ = val; + } + + bool operator==(const BedRecord& rhs) const + { + return chrom_ == rhs.chrom_ && chrom_start_ == rhs.chrom_start_ + && chrom_end_ == rhs.chrom_end_; + } + + std::ostream& operator<<(std::ostream& os) const { + BedFile::Serialize(os, *this); + return os; + } + +private: + std::string chrom_; + int64_t chrom_start_ = 0; + int64_t chrom_end_ = 0; +}; + +class BedReader { +public: + BedReader(const std::string& in_fn); + BedReader(std::istream& in); + + static std::vector ReadAll(const std::string& fn); + static std::vector ReadAll(std::istream& in); + bool GetNext(BedRecord& record); + +private: + std::unique_ptr file_; + std::istream& in_; + std::string line_; +}; + +} diff --git a/src/main.cpp b/src/main.cpp index a6b1b48..6fe7f9d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -31,6 +32,7 @@ static struct option options[] = { {"mismatch", required_argument, 0, 'x'}, {"gap", required_argument, 0, 'g'}, {"threads", required_argument, 0, 't'}, + {"bed", required_argument, 0, 'B'}, {"version", no_argument, 0, 'v'}, {"help", no_argument, 0, 'h'}, #ifdef CUDA_ENABLED @@ -66,7 +68,9 @@ int main(int argc, char** argv) { uint32_t cudaaligner_band_width = 0; bool cuda_banded_alignment = false; - std::string optstring = "ufw:q:e:m:x:g:t:h"; + std::string bed_file; + + std::string optstring = "ufw:q:e:m:x:g:t:B:h"; #ifdef CUDA_ENABLED optstring += "bc::"; #endif @@ -104,6 +108,9 @@ int main(int argc, char** argv) { case 't': num_threads = atoi(optarg); break; + case 'B': + bed_file = std::string(optarg); + break; case 'v': printf("%s\n", version); exit(0); @@ -149,8 +156,10 @@ int main(int argc, char** argv) { exit(1); } + std::cerr << "BED file: '" << bed_file << "'\n"; + auto polisher = racon::createPolisher(input_paths[0], input_paths[1], - input_paths[2], type == 0 ? racon::PolisherType::kC : + input_paths[2], bed_file, type == 0 ? racon::PolisherType::kC : racon::PolisherType::kF, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches, @@ -209,6 +218,9 @@ void help() { " -g, --gap \n" " default: -4\n" " gap penalty (must be negative)\n" + " -B, --bed \n" + " default: ''\n" + " path to a BED file with regions to polish\n" " -t, --threads \n" " default: 1\n" " number of threads\n" diff --git a/src/meson.build b/src/meson.build index 7f0d21c..c523691 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,9 +1,11 @@ racon_cpp_sources = files([ + 'bed.cpp', 'logger.cpp', 'overlap.cpp', 'polisher.cpp', 'sequence.cpp', - 'window.cpp' + 'util.cpp', + 'window.cpp', ]) racon_extra_flags = [] diff --git a/src/overlap.cpp b/src/overlap.cpp index 98cf6f7..8a22881 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -8,6 +8,7 @@ #include "sequence.hpp" #include "overlap.hpp" +#include "util.hpp" #include "edlib.h" namespace racon { @@ -114,18 +115,6 @@ Overlap::Overlap() breaking_points_(), dual_breaking_points_() { } -template -bool transmuteId(const std::unordered_map& t_to_id, const T& t, - uint64_t& id) { - - auto it = t_to_id.find(t); - if (it == t_to_id.end()) { - return false; - } - id = it->second; - return true; -} - void Overlap::transmute(const std::vector>& sequences, const std::unordered_map& name_to_id, const std::unordered_map& id_to_id) { @@ -177,7 +166,7 @@ void Overlap::transmute(const std::vector>& sequences, } void Overlap::find_breaking_points(const std::vector>& sequences, - uint32_t window_length) { + std::vector> windows) { if (!is_transmuted_) { fprintf(stderr, "[racon::Overlap::find_breaking_points] error: " @@ -197,7 +186,7 @@ void Overlap::find_breaking_points(const std::vector>& align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_); } - find_breaking_points_from_cigar(window_length); + find_breaking_points_from_cigar(std::move(windows)); std::string().swap(cigar_); } @@ -223,72 +212,16 @@ void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, ui edlibFreeAlignResult(result); } -void Overlap::find_breaking_points_from_cigar(uint32_t window_length) +void Overlap::find_breaking_points_from_cigar(std::vector> windows) { - // find breaking points from cigar - std::vector window_ends; - for (uint32_t i = 0; i < t_end_; i += window_length) { - if (i > t_begin_) { - window_ends.emplace_back(i - 1); - } - } - window_ends.emplace_back(t_end_ - 1); - - uint32_t w = 0; - bool found_first_match = false; - std::pair first_match = {0, 0}, last_match = {0, 0}; - - int32_t q_ptr = (strand_ ? (q_length_ - q_end_) : q_begin_) - 1; - int32_t t_ptr = t_begin_ - 1; - - for (uint32_t i = 0, j = 0; i < cigar_.size(); ++i) { - if (cigar_[i] == 'M' || cigar_[i] == '=' || cigar_[i] == 'X') { - uint32_t k = 0, num_bases = atoi(&cigar_[j]); - j = i + 1; - while (k < num_bases) { - ++q_ptr; - ++t_ptr; - - if (!found_first_match) { - found_first_match = true; - first_match.first = t_ptr; - first_match.second = q_ptr; - } - last_match.first = t_ptr + 1; - last_match.second = q_ptr + 1; - if (t_ptr == window_ends[w]) { - if (found_first_match) { - breaking_points_.emplace_back(first_match); - breaking_points_.emplace_back(last_match); - } - found_first_match = false; - ++w; - } - - ++k; - } - } else if (cigar_[i] == 'I') { - q_ptr += atoi(&cigar_[j]); - j = i + 1; - } else if (cigar_[i] == 'D' || cigar_[i] == 'N') { - uint32_t k = 0, num_bases = atoi(&cigar_[j]); - j = i + 1; - while (k < num_bases) { - ++t_ptr; - if (t_ptr == window_ends[w]) { - if (found_first_match) { - breaking_points_.emplace_back(first_match); - breaking_points_.emplace_back(last_match); - } - found_first_match = false; - ++w; - } - ++k; - } - } else if (cigar_[i] == 'S' || cigar_[i] == 'H' || cigar_[i] == 'P') { - j = i + 1; - } - } + int64_t q_start = (strand_ ? (q_length_ - q_end_) : q_begin_); + int64_t t_start = t_begin_; + + std::vector result = generate_window_breakpoints( + cigar_, q_start, t_start, + std::move(windows)); + + std::swap(breaking_points_, result); } } diff --git a/src/overlap.hpp b/src/overlap.hpp index f78936f..a7a490a 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -6,11 +6,13 @@ #pragma once -#include -#include +#include "util.hpp" +#include +#include #include #include #include +#include #include #include @@ -41,6 +43,14 @@ class Overlap { return t_id_; } + uint32_t t_begin() const { + return t_begin_; + } + + uint32_t t_end() const { + return t_end_; + } + uint32_t strand() const { return strand_; } @@ -65,17 +75,19 @@ class Overlap { return cigar_; } - const std::vector>& breaking_points() const { + const std::vector& breaking_points() const { return breaking_points_; } void find_breaking_points(const std::vector>& sequences, - uint32_t window_length); + std::vector> windows); friend bioparser::MhapParser; friend bioparser::PafParser; friend bioparser::SamParser; + friend std::ostream& operator<<(::std::ostream& os, const Overlap& a); + #ifdef CUDA_ENABLED friend class CUDABatchAligner; #endif @@ -97,7 +109,7 @@ class Overlap { Overlap(); Overlap(const Overlap&) = delete; const Overlap& operator=(const Overlap&) = delete; - virtual void find_breaking_points_from_cigar(uint32_t window_length); + virtual void find_breaking_points_from_cigar(std::vector> windows); virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len); std::string q_name_; @@ -119,8 +131,19 @@ class Overlap { bool is_valid_; bool is_transmuted_; - std::vector> breaking_points_; + // std::vector> breaking_points_; std::vector> dual_breaking_points_; + + std::vector breaking_points_; }; +inline std::ostream& operator<<(::std::ostream& os, const Overlap& a) { + std::string delim(" "); + os << a.q_id_ << delim << a.q_begin_ << delim << a.q_end_ << delim << a.q_length_ + << delim << (a.strand_ ? "-" : "+") + << delim << a.t_id_ << delim << a.t_begin_ << delim << a.t_end_ << delim << a.t_length_ + << delim << a.cigar_ << delim << a.is_valid_ << delim << a.is_transmuted_; + return os; +} + } diff --git a/src/polisher.cpp b/src/polisher.cpp index 5f57afd..1a08e41 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -10,9 +10,9 @@ #include "overlap.hpp" #include "sequence.hpp" -#include "window.hpp" #include "logger.hpp" #include "polisher.hpp" +#include "util.hpp" #ifdef CUDA_ENABLED #include "cuda/cudapolisher.hpp" #endif @@ -21,6 +21,8 @@ #include "thread_pool/thread_pool.hpp" #include "spoa/spoa.hpp" +// #define BED_FEATURE_TEST + namespace racon { constexpr uint32_t kChunkSize = 1024 * 1024 * 1024; // ~ 1GB @@ -54,6 +56,7 @@ uint64_t shrinkToFit(std::vector>& src, uint64_t begin) { std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment, @@ -132,12 +135,22 @@ std::unique_ptr createPolisher(const std::string& sequences_path, exit(1); } + bool use_bed = false; + std::vector bed_records; + if (bed_path.size() > 0) { + use_bed = true; + bed_records = BedReader::ReadAll(bed_path); + } + std::cerr << "[racon::createPolisher] Use bed: " << (use_bed ? "true" : "false") << ", bed records: " << bed_records.size() << "\n"; + if (cudapoa_batches > 0 || cudaaligner_batches > 0) { #ifdef CUDA_ENABLED // If CUDA is enabled, return an instance of the CUDAPolisher object. return std::unique_ptr(new CUDAPolisher(std::move(sparser), - std::move(oparser), std::move(tparser), type, window_length, + std::move(oparser), std::move(tparser), + std::move(bed_records), use_bed, + type, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches, cudaaligner_band_width)); @@ -153,7 +166,9 @@ std::unique_ptr createPolisher(const std::string& sequences_path, { (void) cuda_banded_alignment; return std::unique_ptr(new Polisher(std::move(sparser), - std::move(oparser), std::move(tparser), type, window_length, + std::move(oparser), std::move(tparser), + std::move(bed_records), use_bed, + type, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads)); } @@ -162,11 +177,13 @@ std::unique_ptr createPolisher(const std::string& sequences_path, Polisher::Polisher(std::unique_ptr> sparser, std::unique_ptr> oparser, std::unique_ptr> tparser, + std::vector bed_records, bool use_bed, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads) : sparser_(std::move(sparser)), oparser_(std::move(oparser)), - tparser_(std::move(tparser)), type_(type), quality_threshold_( + tparser_(std::move(tparser)), bed_records_(std::move(bed_records)), + use_bed_(use_bed), type_(type), quality_threshold_( quality_threshold), error_threshold_(error_threshold), trim_(trim), alignment_engines_(), sequences_(), dummy_quality_(window_length, '!'), window_length_(window_length), windows_(), @@ -279,6 +296,61 @@ void Polisher::initialize() { logger_->log("[racon::Polisher::initialize] loaded sequences"); logger_->log(); + + + ////////////////////////////////// + /// Collect the BED intervals. /// + ////////////////////////////////// + logger_->log("[racon::Polisher::initialize] building the interval trees"); + logger_->log(); + target_bed_intervals_.clear(); + // Collect target intervals of interest. + // If the BED file is not specified, construct windows covering full spans of targets. + if (use_bed_) { + for (size_t i = 0; i < bed_records_.size(); ++i) { + const auto& record = bed_records_[i]; + uint64_t t_id = 0; + if (!transmuteId(name_to_id, record.chrom() + "t", t_id)) { + throw std::runtime_error("Target sequence '" + record.chrom() + + "' specified in the BED file was not found among the target sequences."); + } + target_bed_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + } + } else { + for (uint64_t t_id = 0; t_id < targets_size; ++t_id) { + target_bed_intervals_[t_id].emplace_back(IntervalInt64(0, static_cast(sequences_[t_id]->data().size()) - 1, -1)); + } + } + // Sort target intervals. + for (auto& it: target_bed_intervals_) { + std::stable_sort(it.second.begin(), it.second.end(), + [](const IntervalInt64& a, const IntervalInt64& b) { return a.start < b.start; }); + } + // Construct the trees. + target_bed_trees_.clear(); + for (const auto& it: target_bed_intervals_) { + // Make a copy, because the IntervalTree has only the move constructor, + // and we still need t he intvervals for validation below. + auto intervals = it.second; + target_bed_trees_[it.first] = IntervalTreeInt64(std::move(intervals)); + } + // Validate that there are no overlapping intervals. + for (const auto& it: target_bed_intervals_) { + int64_t t_id = it.first; + for (const auto& interval: it.second) { + auto foundIntervals = target_bed_trees_[t_id].findOverlapping(interval.start, interval.stop); + if (foundIntervals.size() != 1 || + (foundIntervals.size() == 1 && foundIntervals.front().value != interval.value)) { + throw std::runtime_error("Invalid BED record: '" + + BedFile::Serialize(bed_records_[interval.value]) + + "'. It overlaps other BED records, which is not allowed."); + } + } + } + ///////////////////////////////// + + + std::vector> overlaps; auto remove_invalid_overlaps = [&](uint64_t begin, uint64_t end) -> void { @@ -321,6 +393,17 @@ void Polisher::initialize() { continue; } + // Remove overlaps in regions not specified by BED. + if (use_bed_) { + auto foundIntervals = target_bed_trees_[overlaps[i]->t_id()].findOverlapping( + static_cast(overlaps[i]->t_begin()), + static_cast(overlaps[i]->t_end()) - 1); + if (foundIntervals.empty()) { + overlaps[i].reset(); + continue; + } + } + while (overlaps[c] == nullptr) { ++c; } @@ -377,26 +460,106 @@ void Polisher::initialize() { it.wait(); } - find_overlap_breaking_points(overlaps); + logger_->log("[racon::Polisher::initialize] Constructing windows for BED regions.\n"); - logger_->log(); + create_and_populate_windows_with_bed(overlaps, targets_size, window_type); - std::vector id_to_first_window_id(targets_size + 1, 0); - for (uint64_t i = 0; i < targets_size; ++i) { + logger_->log("[racon::Polisher::initialize] transformed data into windows"); + +// #ifdef BED_FEATURE_TEST +// int32_t shift = 0; +// for (int32_t i = 0; i < static_cast(windows_.size()); ++i) { +// if ((i + 1) % 100 != 0) { +// windows_[i].reset(); +// ++shift; +// // std::cerr << "nulling i = " << i << "\n"; +// continue; +// } +// std::swap(windows_[i-shift], windows_[i]); +// } +// windows_.resize(static_cast(windows_.size()) - shift); +// std::cerr << "windows_.size() = " << windows_.size() << "\n"; +// #endif + +} + +void Polisher::create_and_populate_windows_with_bed(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type) { + + // The -1 marks that the target doesn't have any windows. + id_to_first_window_id_.clear(); + id_to_first_window_id_.resize(targets_size + 1, -1); + + std::unordered_map>> windows; + + // Target intervals are sorted ahead of time. + for (const auto& it: target_bed_intervals_) { + int64_t t_id = it.first; + const std::vector& intervals = it.second; + + // Mark the window start. + id_to_first_window_id_[t_id] = static_cast(windows_.size()); + + // Generate windows for each interval separately. uint32_t k = 0; - for (uint32_t j = 0; j < sequences_[i]->data().size(); j += window_length_, ++k) { + for (const auto& interval: intervals) { + for (int64_t win_start = interval.start; win_start < (interval.stop + 1); + win_start += window_length_, ++k) { - uint32_t length = std::min(j + window_length_, - static_cast(sequences_[i]->data().size())) - j; + int64_t length = std::min(win_start + static_cast(window_length_), + (interval.stop + 1)) - win_start; + int64_t win_id = windows_.size(); - windows_.emplace_back(createWindow(i, k, window_type, - &(sequences_[i]->data()[j]), length, - sequences_[i]->quality().empty() ? &(dummy_quality_[0]) : - &(sequences_[i]->quality()[j]), length)); - } + windows_.emplace_back(createWindow(t_id, k, window_type, win_start, + &(sequences_[t_id]->data()[win_start]), length, + sequences_[t_id]->quality().empty() ? &(dummy_quality_[0]) : + &(sequences_[t_id]->quality()[win_start]), length)); - id_to_first_window_id[i + 1] = id_to_first_window_id[i] + k; + target_window_intervals_[t_id].emplace_back(IntervalInt64(win_start, win_start + length - 1, win_id)); + windows[t_id].emplace_back(win_start, win_start + length, win_id); + } + } } + // Construct the trees. The iterator is not const because the IntervalTree only has the + // move constructor. + target_window_trees_.clear(); + for (auto& it: target_window_intervals_) { + target_window_trees_[it.first] = IntervalTreeInt64(std::move(it.second)); + } + +// #ifdef BED_FEATURE_TEST +// for (size_t i = 0; i < windows_.size(); ++i) { +// std::cerr << "[window " << i << "] " << *windows_[i] << "\n"; +// } +// #endif + + find_overlap_breaking_points(overlaps, windows); + +// #ifdef BED_FEATURE_TEST +// for (uint64_t i = 0; i < overlaps.size(); ++i) { +// const auto& sequence = sequences_[overlaps[i]->q_id()]; +// const std::vector& breaking_points = overlaps[i]->breaking_points(); + +// std::cerr << "overlap_id = " << i << "\n"; +// std::cerr << " " << *overlaps[i] << "\n"; +// std::cerr << "All breaking points:\n"; +// for (uint32_t j = 0; j < breaking_points.size(); ++j) { +// const auto& bp = breaking_points[j]; +// std::cerr << "[j = " << j << "] bp = " << bp << ", Window: " << *windows_[bp.window_id] << "\n"; +// if (bp.t_start < windows_[bp.window_id]->start() || bp.t_start >= windows_[bp.window_id]->end() || +// bp.t_end < windows_[bp.window_id]->start() || bp.t_end > windows_[bp.window_id]->end()) { +// std::cerr << "ERROR! Coordiantes out of bounds!\n"; +// exit(1); +// } +// } +// std::cerr << "\n"; +// } +// #endif + + assign_sequences_to_windows(overlaps, targets_size); +} + +void Polisher::assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size) { targets_coverages_.resize(targets_size, 0); @@ -405,10 +568,28 @@ void Polisher::initialize() { ++targets_coverages_[overlaps[i]->t_id()]; const auto& sequence = sequences_[overlaps[i]->q_id()]; - const auto& breaking_points = overlaps[i]->breaking_points(); - - for (uint32_t j = 0; j < breaking_points.size(); j += 2) { - if (breaking_points[j + 1].second - breaking_points[j].second < 0.02 * window_length_) { + const std::vector& breaking_points = overlaps[i]->breaking_points(); + + // std::cerr << "overlap_id = " << i << "\n"; + // std::cerr << " " << *overlaps[i] << "\n"; + // std::cerr << "All breaking points:\n"; + // for (uint32_t j = 0; j < breaking_points.size(); ++j) { + // const auto& bp = breaking_points[j]; + // std::cerr << "[j = " << j << "] bp = " << bp << "\n"; + // } + + for (uint32_t j = 0; j < breaking_points.size(); ++j) { + const auto& bp = breaking_points[j]; + // const uint32_t win_t_start = std::get<0>(breaking_points[j]); + // const uint32_t win_t_end = std::get<0>(breaking_points[j + 1]); + // const uint32_t win_q_start = std::get<1>(breaking_points[j]); + // const uint32_t win_q_end = std::get<1>(breaking_points[j + 1]); + const auto& win_t_start = bp.t_start; + const auto& win_t_end = bp.t_end; + const auto& win_q_start = bp.q_start; + const auto& win_q_end = bp.q_end; + + if ((win_q_end - win_q_start) < 0.02 * window_length_) { continue; } @@ -418,54 +599,58 @@ void Polisher::initialize() { const auto& quality = overlaps[i]->strand() ? sequence->reverse_quality() : sequence->quality(); double average_quality = 0; - for (uint32_t k = breaking_points[j].second; k < breaking_points[j + 1].second; ++k) { + for (uint32_t k = win_q_start; k < win_q_end; ++k) { average_quality += static_cast(quality[k]) - 33; } - average_quality /= breaking_points[j + 1].second - breaking_points[j].second; + average_quality /= (win_q_end - win_q_start); if (average_quality < quality_threshold_) { continue; } } - uint64_t window_id = id_to_first_window_id[overlaps[i]->t_id()] + - breaking_points[j].first / window_length_; - uint32_t window_start = (breaking_points[j].first / window_length_) * - window_length_; + uint64_t window_id = bp.window_id; + uint32_t window_start = windows_[bp.window_id]->start(); const char* data = overlaps[i]->strand() ? - &(sequence->reverse_complement()[breaking_points[j].second]) : - &(sequence->data()[breaking_points[j].second]); - uint32_t data_length = breaking_points[j + 1].second - - breaking_points[j].second; + &(sequence->reverse_complement()[win_q_start]) : + &(sequence->data()[win_q_start]); + uint32_t data_length = win_q_end - win_q_start; const char* quality = overlaps[i]->strand() ? (sequence->reverse_quality().empty() ? - nullptr : &(sequence->reverse_quality()[breaking_points[j].second])) + nullptr : &(sequence->reverse_quality()[win_q_start])) : (sequence->quality().empty() ? - nullptr : &(sequence->quality()[breaking_points[j].second])); + nullptr : &(sequence->quality()[win_q_start])); uint32_t quality_length = quality == nullptr ? 0 : data_length; + // std::cerr << "[j = " << j << "] Adding layer for bp = " << bp << "\n"; + windows_[window_id]->add_layer(data, data_length, quality, quality_length, - breaking_points[j].first - window_start, - breaking_points[j + 1].first - window_start - 1); + win_t_start - window_start, + win_t_end - window_start - 1); } + // std::cerr << "\n"; overlaps[i].reset(); } - logger_->log("[racon::Polisher::initialize] transformed data into windows"); } -void Polisher::find_overlap_breaking_points(std::vector>& overlaps) +void Polisher::find_overlap_breaking_points(std::vector>& overlaps, + const std::unordered_map>>& windows) { std::vector> thread_futures; for (uint64_t i = 0; i < overlaps.size(); ++i) { + thread_futures.emplace_back(thread_pool_->submit( [&](uint64_t j) -> void { - overlaps[j]->find_breaking_points(sequences_, window_length_); + auto it = windows.find(overlaps[j]->t_id()); + if (it != windows.end()) { + overlaps[j]->find_breaking_points(sequences_, it->second); + } }, i)); } @@ -508,13 +693,30 @@ void Polisher::polish(std::vector>& dst, uint64_t logger_step = thread_futures.size() / 20; + uint64_t prev_window_end = 0; + for (uint64_t i = 0; i < thread_futures.size(); ++i) { thread_futures[i].wait(); num_polished_windows += thread_futures[i].get() == true ? 1 : 0; + + // BED region related: Add the sequence in between windows. + if (windows_[i]->start() > prev_window_end) { + uint64_t span = windows_[i]->start() - prev_window_end; + polished_data += sequences_[windows_[i]->id()]->data().substr(prev_window_end, span); + } + + // Add the window consensus. polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { + // BED region related: Append the remaining suffix from the last window to the end of the target. + uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); + if (windows_[i]->end() < tlen) { + uint64_t suffix_start = windows_[i]->end(); + polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); + } + double polished_ratio = num_polished_windows / static_cast(windows_[i]->rank() + 1); @@ -530,6 +732,7 @@ void Polisher::polish(std::vector>& dst, num_polished_windows = 0; polished_data.clear(); } + prev_window_end = windows_[i]->end(); windows_[i].reset(); if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) { @@ -537,6 +740,24 @@ void Polisher::polish(std::vector>& dst, } } + // Write the original sequences if there were no BED windows assigned to them. + // If the BED is used, then some sequences can have 0 windows, and consensus will not + // be generated. + if (use_bed_) { + // There is an extra element because of legacy code. + for (int32_t t_id = 0; t_id < (static_cast(id_to_first_window_id_.size()) - 1); ++t_id) { + if (id_to_first_window_id_[t_id] < 0) { + std::string tags = type_ == PolisherType::kF ? "r" : ""; + tags += " LN:i:" + std::to_string(sequences_[t_id]->data().size()); + tags += " RC:i:" + std::to_string(targets_coverages_[t_id]); + tags += " XC:f:0.0"; + dst.emplace_back(createSequence(sequences_[t_id]->name() + + tags, sequences_[t_id]->data())); + + } + } + } + if (logger_step != 0) { logger_->bar("[racon::Polisher::polish] generating consensus"); } else { diff --git a/src/polisher.hpp b/src/polisher.hpp index 051ef8c..ca3a227 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -6,11 +6,16 @@ #pragma once -#include +#include #include #include #include #include +#include "window.hpp" +#include "bed.hpp" +#include "IntervalTree.h" +#include "util.hpp" +#include namespace bioparser { template @@ -25,7 +30,6 @@ namespace spoa { class AlignmentEngine; } - namespace racon { class Sequence; @@ -41,6 +45,7 @@ enum class PolisherType { class Polisher; std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cuda_batches = 0, @@ -58,6 +63,7 @@ class Polisher { friend std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cuda_batches, bool cuda_banded_alignment, @@ -67,17 +73,26 @@ class Polisher { Polisher(std::unique_ptr> sparser, std::unique_ptr> oparser, std::unique_ptr> tparser, + std::vector bed_records, bool use_bed, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads); Polisher(const Polisher&) = delete; const Polisher& operator=(const Polisher&) = delete; - virtual void find_overlap_breaking_points(std::vector>& overlaps); + virtual void find_overlap_breaking_points(std::vector>& overlaps, + const std::unordered_map>>& windows); + + void create_and_populate_windows_with_bed(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type); + void assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size); std::unique_ptr> sparser_; std::unique_ptr> oparser_; std::unique_ptr> tparser_; + std::vector bed_records_; + bool use_bed_; + PolisherType type_; double quality_threshold_; double error_threshold_; @@ -90,11 +105,18 @@ class Polisher { uint32_t window_length_; std::vector> windows_; + std::vector id_to_first_window_id_; std::unique_ptr thread_pool_; std::unordered_map thread_to_id_; std::unique_ptr logger_; + + std::unordered_map> target_bed_intervals_; + std::unordered_map target_bed_trees_; + + std::unordered_map> target_window_intervals_; + std::unordered_map target_window_trees_; }; } diff --git a/src/util.cpp b/src/util.cpp new file mode 100644 index 0000000..b570611 --- /dev/null +++ b/src/util.cpp @@ -0,0 +1,166 @@ +/*! + * @file util.cpp + * + * @brief Utility source file. + */ + +#include "util.hpp" + +namespace racon { + +// Tuple of: +// using WindowInterval = std::vector>; + +// std::vector> find_breaking_points_from_cigar( +// const std::string& cigar, int32_t q_start, int32_t t_start, +// std::vector windows) +// { +// } + +/* + * \param windows Vector of tuples, where each tuple element is: . +*/ +std::vector generate_window_breakpoints( + const std::string& cigar, int64_t q_start, int64_t t_start, + std::vector> windows) +{ + std::vector ret; + + if (windows.empty()) { + return ret; + } + + std::sort(windows.begin(), windows.end(), + [](const std::tuple& a, + const std::tuple& b) { + return std::get<0>(a) < std::get<0>(b); + }); + + // Avoid using the operators. + const char* cigar_str = cigar.c_str(); + int64_t num_windows = static_cast(windows.size()); + + bool found_first_match = false; + std::tuple first_match(0, 0, 0); + std::tuple last_match(0, 0, 0); + + // The "-1" is because of the while loops below (increment is done at the top). + int64_t q_pos = q_start - 1; + int64_t t_pos = t_start - 1; + int64_t curr_w = 0; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_start) { + ++curr_w; + } + + for (uint32_t i = 0, j = 0; i < cigar.size(); ++i) { + if (cigar_str[i] == 'M' || cigar_str[i] == '=' || cigar_str[i] == 'X') { + uint32_t num_bases = atoi(&cigar_str[j]); + j = i + 1; + for (uint32_t k = 0; k < num_bases; ++k) { + ++q_pos; + ++t_pos; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_pos) { + ++curr_w; + } + if (curr_w >= num_windows) { + break; + } + + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + const auto& win_id = std::get<2>(windows[curr_w]); + + if (t_pos < win_start) { + continue; + } + + if (!found_first_match) { + found_first_match = true; + first_match = std::make_tuple(t_pos, q_pos, win_id); + } + last_match = std::make_tuple(t_pos + 1, q_pos + 1, win_id); + if (t_pos == (win_end - 1)) { + if (found_first_match) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + found_first_match = false; + ++curr_w; + if (curr_w >= num_windows) { + break; + } + } + } + } else if (cigar_str[i] == 'I') { + q_pos += atoi(&cigar_str[j]); + j = i + 1; + } else if (cigar_str[i] == 'D' || cigar_str[i] == 'N') { + uint32_t num_bases = atoi(&cigar_str[j]); + j = i + 1; + for (uint32_t k = 0; k < num_bases; ++k) { + ++t_pos; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_pos) { + ++curr_w; + } + if (curr_w >= num_windows) { + break; + } + + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + + if (t_pos < win_start) { + continue; + } + + if (t_pos == (win_end - 1)) { + if (found_first_match) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + found_first_match = false; + ++curr_w; + if (curr_w >= num_windows) { + break; + } + } + } + } else if (cigar_str[i] == 'S' || cigar_str[i] == 'H' || cigar_str[i] == 'P') { + j = i + 1; + } + } + + // Add the final piece. + if (curr_w < static_cast(windows.size())) { + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + const auto& final_t_start = std::get<0>(first_match); + const auto& final_t_end = std::get<0>(last_match); + if (found_first_match && final_t_start >= win_start && final_t_end <= win_end) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + } + + return ret; +} + +} diff --git a/src/util.hpp b/src/util.hpp new file mode 100644 index 0000000..d67f246 --- /dev/null +++ b/src/util.hpp @@ -0,0 +1,73 @@ +/*! + * @file util.hpp + * + * @brief Utility functions used throughout the code. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "IntervalTree.h" + +using IntervalTreeInt64 = IntervalTree; +using IntervalVectorInt64 = IntervalTreeInt64::interval_vector; +using IntervalInt64 = IntervalTreeInt64::interval; + +namespace racon { + +class WindowInterval { +public: + WindowInterval() = default; + + WindowInterval(int64_t _q_start, int64_t _q_end, int64_t _t_start, int64_t _t_end, int64_t _window_id) + : q_start(_q_start) + , q_end(_q_end) + , t_start(_t_start) + , t_end(_t_end) + , window_id(_window_id) + {} + + bool operator==(const WindowInterval& rhs) const + { + return q_start == rhs.q_start && q_end == rhs.q_end && t_start == rhs.t_start + && t_end == rhs.t_end && window_id == rhs.window_id; + } + + friend std::ostream& operator<<(::std::ostream& os, const WindowInterval& a); + + int64_t q_start = 0; + int64_t q_end = 0; + int64_t t_start = 0; + int64_t t_end = 0; + int64_t window_id = -1; +}; + +inline std::ostream& operator<<(::std::ostream& os, const WindowInterval& a) { + os << "q_start = " << a.q_start << ", q_end = " << a.q_end + << ", t_start = " << a.t_start << ", t_end = " << a.t_end + << ", window_id = " << a.window_id; + return os; +} + +template +bool transmuteId(const std::unordered_map& t_to_id, const T& t, + uint64_t& id) { + + auto it = t_to_id.find(t); + if (it == t_to_id.end()) { + return false; + } + id = it->second; + return true; +} + +std::vector generate_window_breakpoints( + const std::string& cigar, int64_t q_start, int64_t t_start, + std::vector> windows); + +} diff --git a/src/window.cpp b/src/window.cpp index a9591d4..0794331 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -13,8 +13,8 @@ namespace racon { std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - const char* backbone, uint32_t backbone_length, const char* quality, - uint32_t quality_length) { + uint32_t window_start, const char* backbone, uint32_t backbone_length, + const char* quality, uint32_t quality_length) { if (backbone_length == 0 || backbone_length != quality_length) { fprintf(stderr, "[racon::createWindow] error: " @@ -22,13 +22,15 @@ std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type exit(1); } - return std::shared_ptr(new Window(id, rank, type, backbone, + return std::shared_ptr(new Window(id, rank, type, window_start, backbone, backbone_length, quality, quality_length)); } -Window::Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone, - uint32_t backbone_length, const char* quality, uint32_t quality_length) - : id_(id), rank_(rank), type_(type), consensus_(), sequences_(), +Window::Window(uint64_t id, uint32_t rank, WindowType type, uint32_t window_start, + const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t quality_length) + : id_(id), rank_(rank), type_(type), start_(window_start), end_(window_start + backbone_length), + consensus_(), sequences_(), qualities_(), positions_() { sequences_.emplace_back(backbone, backbone_length); @@ -53,7 +55,8 @@ void Window::add_layer(const char* sequence, uint32_t sequence_length, } if (begin >= end || begin > sequences_.front().second || end > sequences_.front().second) { fprintf(stderr, "[racon::Window::add_layer] error: " - "layer begin and end positions are invalid!\n"); + "layer begin and end positions are invalid! begin = %lu, end = %lu, backbone_len = %lu\n", + begin, end, sequences_.front().second); exit(1); } @@ -141,4 +144,18 @@ bool Window::generate_consensus(std::shared_ptr alignment return true; } +std::ostream& operator<<(std::ostream& os, const Window& a) +{ + os << "Window: id = " << a.id_ << ", rank = " << a.rank_ << ", type = " + << ((a.type_ == WindowType::kNGS) ? "NGS" : "TGS") + << ", start = " << a.start_ + << ", end = " << a.end_ + << ", backbone_len = " << a.backbone_length() + << ", consensus_len = " << a.consensus_.size() + << ", seqs_len = " << a.sequences_.size() + << ", quals_len = " << a.qualities_.size() + << ", pos_len = " << a.positions_.size(); + return os; +} + } diff --git a/src/window.hpp b/src/window.hpp index 3ecb3db..d16f9af 100644 --- a/src/window.hpp +++ b/src/window.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ enum class WindowType { class Window; std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); class Window { @@ -44,6 +45,21 @@ class Window { return consensus_; } + uint32_t backbone_length() const { + if (sequences_.empty()) { + return 0; + } + return sequences_.front().second; + } + + uint32_t start() const { + return start_; + } + + uint32_t end() const { + return end_; + } + bool generate_consensus(std::shared_ptr alignment_engine, bool trim); @@ -52,25 +68,33 @@ class Window { uint32_t end); friend std::shared_ptr createWindow(uint64_t id, uint32_t rank, - WindowType type, const char* backbone, uint32_t backbone_length, - const char* quality, uint32_t quality_length); + WindowType type, uint32_t window_start, const char* backbone, + uint32_t backbone_length, const char* quality, uint32_t quality_length); + + friend std::ostream& operator<<(std::ostream& os, const Window& a); #ifdef CUDA_ENABLED friend class CUDABatchProcessor; #endif + private: - Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone, - uint32_t backbone_length, const char* quality, uint32_t quality_length); + Window(uint64_t id, uint32_t rank, WindowType type, uint32_t window_start, + const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t quality_length); Window(const Window&) = delete; const Window& operator=(const Window&) = delete; uint64_t id_; uint32_t rank_; WindowType type_; + uint32_t start_; + uint32_t end_; std::string consensus_; std::vector> sequences_; std::vector> qualities_; std::vector> positions_; }; +std::ostream& operator<<(std::ostream& os, const Window& a); + } diff --git a/test/bed_test.cpp b/test/bed_test.cpp new file mode 100644 index 0000000..016bd25 --- /dev/null +++ b/test/bed_test.cpp @@ -0,0 +1,101 @@ +/*! + * @file bed_test.cpp + * + * @brief Unit tests for the BED parser. + */ + +#include "gtest/gtest.h" +#include "racon_test_config.h" +#include "bed.hpp" + +#include +#include +#include + +TEST(BedFile, DeserializeTests) { + // Tuple: test_name, input line, expected record, expected return value, expected to throw. + using TestTupleType = std::tuple; + std::vector> test_data{ + TestTupleType("Empty input", "", racon::BedRecord(), false,false), + TestTupleType("Three columns", "chr01 0 1000", racon::BedRecord("chr01", 0, 1000), true, false), + TestTupleType("Multiple columns", "chr01 1000 2000 some other columns that are ignored", racon::BedRecord("chr01", 1000, 2000), true, false), + TestTupleType("Invalid BED line", "bad_bed", racon::BedRecord(), true, true), + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string& in_line = std::get<1>(single_test); + const racon::BedRecord& expected_record = std::get<2>(single_test); + const bool expected_rv = std::get<3>(single_test); + const bool expected_throw = std::get<4>(single_test); + + SCOPED_TRACE(test_name); + + if (expected_throw) { + EXPECT_THROW( + { + racon::BedRecord record; + racon::BedFile::Deserialize(in_line, record); + }, + std::runtime_error); + } else { + racon::BedRecord record; + const bool rv = racon::BedFile::Deserialize(in_line, record); + EXPECT_EQ(expected_record, record); + EXPECT_EQ(expected_rv, rv); + } + + } +} + +TEST(BedReader, AllTests) { + // Tuple: test_name, input line, expected record, expected return value. + using TestTupleType = std::tuple, bool>; + std::vector, bool>> test_data { + TestTupleType("Empty input", "", {}, false), + + TestTupleType( + "Normal BED file", + R"(chr01 0 1000 +chr02 1000 2000 +chr03 2000 3000 +)", + std::vector{ + racon::BedRecord("chr01", 0, 1000), + racon::BedRecord("chr02", 1000, 2000), + racon::BedRecord("chr03", 2000, 3000), + }, + false + ), + TestTupleType( + "Normal BED file", + R"(chr01 0 1000 +bad_line +)", + std::vector(), true + ), + + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string in_line = std::get<1>(single_test); + const std::vector& expected_records = std::get<2>(single_test); + const bool expected_throw = std::get<3>(single_test); + + SCOPED_TRACE(test_name); + + std::istringstream iss(in_line); + + if (expected_throw) { + EXPECT_THROW( + { + std::vector records = racon::BedReader::ReadAll(iss); + }, + std::runtime_error); + } else { + std::vector records = racon::BedReader::ReadAll(iss); + EXPECT_EQ(expected_records, records); + } + } +} diff --git a/test/meson.build b/test/meson.build index 86d7c7b..a76ef41 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,5 +1,7 @@ racon_test_cpp_sources = files([ - 'racon_test.cpp' + 'bed_test.cpp', + 'util_test.cpp', + 'racon_test.cpp', ]) racon_test_include_directories = [include_directories('.')] diff --git a/test/racon_test.cpp b/test/racon_test.cpp index cadf658..ca51819 100644 --- a/test/racon_test.cpp +++ b/test/racon_test.cpp @@ -32,8 +32,10 @@ class RaconPolishingTest: public ::testing::Test { int8_t match, int8_t mismatch, int8_t gap, uint32_t cuda_batches = 0, bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0) { + const std::string bed_path; // Intentionally empty. + polisher = racon::createPolisher(sequences_path, overlaps_path, target_path, - type, window_length, quality_threshold, error_threshold, true, match, + bed_path, type, window_length, quality_threshold, error_threshold, true, match, mismatch, gap, 4, cuda_batches, cuda_banded_alignment, cudaaligner_batches); } @@ -53,18 +55,18 @@ class RaconPolishingTest: public ::testing::Test { }; TEST(RaconInitializeTest, PolisherTypeError) { - EXPECT_DEATH((racon::createPolisher("", "", "", static_cast(3), + EXPECT_DEATH((racon::createPolisher("", "", "", "", static_cast(3), 0, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: invalid polisher" " type!"); } TEST(RaconInitializeTest, WindowLengthError) { - EXPECT_DEATH((racon::createPolisher("", "", "", racon::PolisherType::kC, 0, + EXPECT_DEATH((racon::createPolisher("", "", "", "", racon::PolisherType::kC, 0, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: invalid window length!"); } TEST(RaconInitializeTest, SequencesPathExtensionError) { - EXPECT_DEATH((racon::createPolisher("", "", "", racon::PolisherType::kC, 500, + EXPECT_DEATH((racon::createPolisher("", "", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has unsupported " "format extension .valid extensions: .fasta, .fasta.gz, .fna, .fna.gz, " ".fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz.!"); @@ -72,14 +74,14 @@ TEST(RaconInitializeTest, SequencesPathExtensionError) { TEST(RaconInitializeTest, OverlapsPathExtensionError) { EXPECT_DEATH((racon::createPolisher(racon_test_data_path + "sample_reads.fastq.gz", - "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), + "", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has unsupported format extension " ".valid extensions: .mhap, .mhap.gz, .paf, .paf.gz, .sam, .sam.gz.!"); } TEST(RaconInitializeTest, TargetPathExtensionError) { EXPECT_DEATH((racon::createPolisher(racon_test_data_path + "sample_reads.fastq.gz", - racon_test_data_path + "sample_overlaps.paf.gz", "", racon::PolisherType::kC, + racon_test_data_path + "sample_overlaps.paf.gz", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has " "unsupported format extension .valid extensions: .fasta, .fasta.gz, .fna, " ".fna.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz.!"); diff --git a/test/util_test.cpp b/test/util_test.cpp new file mode 100644 index 0000000..33e9178 --- /dev/null +++ b/test/util_test.cpp @@ -0,0 +1,178 @@ +/*! + * @file util.cpp + * + * @brief Unit tests for the utils. + */ + +#include "gtest/gtest.h" +#include "racon_test_config.h" +#include "util.hpp" + +#include +#include +#include +#include + +TEST(Utils, find_breaking_points_from_cigar) { + // Tuple: test_name, CIGAR string, windows as tuples, expected breakpoints split into windows. + using TestTupleType = std::tuple>, + std::vector>; + using TestWindowTuple = std::tuple; + std::vector, std::vector>> test_data { + TestTupleType("Empty input", "", 0, 0, std::vector{}, std::vector{}), + + TestTupleType("Simple windowing", "1000M", 0, 0, + std::vector{ + TestWindowTuple{100, 500, 0}, + TestWindowTuple{700, 800, 1}, + TestWindowTuple{900, 950, 2}, + }, + std::vector{ + racon::WindowInterval(100, 500, 100, 500, 0), + racon::WindowInterval(700, 800, 700, 800, 1), + racon::WindowInterval(900, 950, 900, 950, 2), + } + ), + + TestTupleType("Simple windowing with indels", "200=100I300=50D450=", 0, 0, + std::vector{ + TestWindowTuple{100, 500, 0}, + TestWindowTuple{500, 700, 1}, + TestWindowTuple{700, 800, 2}, + TestWindowTuple{900, 950, 3}, + }, + std::vector{ + racon::WindowInterval(100, 600, 100, 500, 0), + racon::WindowInterval(600, 750, 550, 700, 1), + racon::WindowInterval(750, 850, 700, 800, 2), + racon::WindowInterval(950, 1000, 900, 950, 3), + } + ), + + TestTupleType("Intra-window alignments", "100=", 0, 250, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, + }, + std::vector{ + racon::WindowInterval(0, 100, 250, 350, 1), + } + ), + + TestTupleType("Cross-window alignments", "200=", 0, 250, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, + }, + std::vector{ + racon::WindowInterval(0, 150, 250, 400, 1), + racon::WindowInterval(150, 200, 400, 450, 2), + } + ), + + TestTupleType("Flanking insertions", "10I180=10I", 0, 200, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, + }, + std::vector{ + racon::WindowInterval(10, 190, 200, 380, 1), + } + ), + + TestTupleType("Flanking insertions", "10D180=10D", 0, 200, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, + }, + std::vector{ + racon::WindowInterval(0, 180, 210, 390, 1), + } + ), + + TestTupleType("Alignment outside of the windows, front", "50=", 0, 50, + std::vector{ + TestWindowTuple{200, 400, 0}, + TestWindowTuple{400, 600, 1}, + TestWindowTuple{600, 800, 2}, + TestWindowTuple{800, 1000, 3}, + }, + std::vector{ } + ), + + TestTupleType("Alignment outside of the windows, back", "50=", 0, 1050, + std::vector{ + TestWindowTuple{200, 400, 0}, + TestWindowTuple{400, 600, 1}, + TestWindowTuple{600, 800, 2}, + TestWindowTuple{800, 1000, 3}, + }, + std::vector{ } + ), + + TestTupleType("Alignment in between windows (not overlapping)", "50=", 0, 450, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, + }, + std::vector{ } + ), + + TestTupleType("Out of order windows, should still work because internal sort", "200=100I300=50D450=", 0, 0, + std::vector{ + TestWindowTuple{900, 950, 3}, + TestWindowTuple{500, 700, 1}, + TestWindowTuple{100, 500, 0}, + TestWindowTuple{700, 800, 2}, + }, + std::vector{ + racon::WindowInterval(100, 600, 100, 500, 0), + racon::WindowInterval(600, 750, 550, 700, 1), + racon::WindowInterval(750, 850, 700, 800, 2), + racon::WindowInterval(950, 1000, 900, 950, 3), + } + ), + + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string& cigar = std::get<1>(single_test); + int64_t aln_q_start = std::get<2>(single_test); + int64_t aln_t_start = std::get<3>(single_test); + const std::vector>& windows = std::get<4>(single_test); + const std::vector& expected = std::get<5>(single_test); + + SCOPED_TRACE(test_name); + + auto result = racon::generate_window_breakpoints(cigar, aln_q_start, aln_t_start, windows); + + // std::cerr << test_name << "\n"; + // std::cerr << cigar << "\n"; + // for (const auto& val: result) { + // std::cerr << "q_start = " << val.q_start << ", q_end = " << val.q_end + // << ", t_start = " << val.t_start << ", t_end = " << val.t_end + // << ", window_id = " << val.window_id << "\n"; + // // std::cerr << val << "\n"; + // } + // std::cerr << "\n"; + + EXPECT_EQ(expected, result); + } +} diff --git a/vendor/intervaltree b/vendor/intervaltree new file mode 160000 index 0000000..b90527f --- /dev/null +++ b/vendor/intervaltree @@ -0,0 +1 @@ +Subproject commit b90527f9e6d51cd36ecbb50429e4524d3a418ea5 diff --git a/vendor/meson.build b/vendor/meson.build index d56b22a..2f54dcf 100644 --- a/vendor/meson.build +++ b/vendor/meson.build @@ -7,15 +7,16 @@ vendor_cpp_sources = files([ 'spoa/src/sequence.cpp', 'spoa/src/simd_alignment_engine.cpp', 'spoa/src/sisd_alignment_engine.cpp', - 'thread_pool/src/thread_pool.cpp' + 'thread_pool/src/thread_pool.cpp', ]) vendor_include_directories = [ include_directories('bioparser/include'), include_directories('edlib/edlib/include'), + include_directories('intervaltree/'), include_directories('rampler/src'), include_directories('spoa/include'), - include_directories('thread_pool/include') + include_directories('thread_pool/include'), ] vendor_extra_flags = []