From f7c9d7d9307d8581f33985a4392cb0f534fe669e Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Sun, 21 Jun 2020 17:06:05 +0200 Subject: [PATCH 01/49] Added a CLI option '--bed' to specify a BED file with regions to polish. --- src/main.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index a6b1b48..6775ebc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -31,6 +32,7 @@ static struct option options[] = { {"mismatch", required_argument, 0, 'x'}, {"gap", required_argument, 0, 'g'}, {"threads", required_argument, 0, 't'}, + {"bed", required_argument, 0, 'B'}, {"version", no_argument, 0, 'v'}, {"help", no_argument, 0, 'h'}, #ifdef CUDA_ENABLED @@ -66,7 +68,9 @@ int main(int argc, char** argv) { uint32_t cudaaligner_band_width = 0; bool cuda_banded_alignment = false; - std::string optstring = "ufw:q:e:m:x:g:t:h"; + std::string bed_file; + + std::string optstring = "ufw:q:e:m:x:g:t:B:h"; #ifdef CUDA_ENABLED optstring += "bc::"; #endif @@ -104,6 +108,9 @@ int main(int argc, char** argv) { case 't': num_threads = atoi(optarg); break; + case 'B': + bed_file = std::string(optarg); + break; case 'v': printf("%s\n", version); exit(0); @@ -149,6 +156,8 @@ int main(int argc, char** argv) { exit(1); } + std::cerr << "BED file: '" << bed_file << "'\n"; + auto polisher = racon::createPolisher(input_paths[0], input_paths[1], input_paths[2], type == 0 ? racon::PolisherType::kC : racon::PolisherType::kF, window_length, quality_threshold, @@ -209,6 +218,9 @@ void help() { " -g, --gap \n" " default: -4\n" " gap penalty (must be negative)\n" + " -B, --bed \n" + " default: ''\n" + " path to a BED file with regions to polish\n" " -t, --threads \n" " default: 1\n" " number of threads\n" From abc739dd088f1a851ca5a9a4b1b5d733d278a1e6 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 22 Jun 2020 10:46:05 +0200 Subject: [PATCH 02/49] Now storing the start position of every window in the Window class. --- src/polisher.cpp | 2 +- src/window.cpp | 13 +++++++------ src/window.hpp | 19 ++++++++++++++----- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 5f57afd..130c6a7 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -389,7 +389,7 @@ void Polisher::initialize() { uint32_t length = std::min(j + window_length_, static_cast(sequences_[i]->data().size())) - j; - windows_.emplace_back(createWindow(i, k, window_type, + windows_.emplace_back(createWindow(i, k, window_type, j, &(sequences_[i]->data()[j]), length, sequences_[i]->quality().empty() ? &(dummy_quality_[0]) : &(sequences_[i]->quality()[j]), length)); diff --git a/src/window.cpp b/src/window.cpp index a9591d4..a2a593b 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -13,8 +13,8 @@ namespace racon { std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - const char* backbone, uint32_t backbone_length, const char* quality, - uint32_t quality_length) { + uint32_t backbone_start, const char* backbone, uint32_t backbone_length, + const char* quality, uint32_t quality_length) { if (backbone_length == 0 || backbone_length != quality_length) { fprintf(stderr, "[racon::createWindow] error: " @@ -22,13 +22,14 @@ std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type exit(1); } - return std::shared_ptr(new Window(id, rank, type, backbone, + return std::shared_ptr(new Window(id, rank, type, backbone_start, backbone, backbone_length, quality, quality_length)); } -Window::Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone, - uint32_t backbone_length, const char* quality, uint32_t quality_length) - : id_(id), rank_(rank), type_(type), consensus_(), sequences_(), +Window::Window(uint64_t id, uint32_t rank, WindowType type, uint32_t backbone_start, + const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t quality_length) + : id_(id), rank_(rank), type_(type), start_(backbone_start), consensus_(), sequences_(), qualities_(), positions_() { sequences_.emplace_back(backbone, backbone_length); diff --git a/src/window.hpp b/src/window.hpp index 3ecb3db..44ed8b4 100644 --- a/src/window.hpp +++ b/src/window.hpp @@ -25,7 +25,7 @@ enum class WindowType { class Window; std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t backbone_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); class Window { @@ -44,6 +44,13 @@ class Window { return consensus_; } + uint32_t backbone_length() const { + if (sequences_.empty()) { + return 0; + } + return sequences_.front().second; + } + bool generate_consensus(std::shared_ptr alignment_engine, bool trim); @@ -52,21 +59,23 @@ class Window { uint32_t end); friend std::shared_ptr createWindow(uint64_t id, uint32_t rank, - WindowType type, const char* backbone, uint32_t backbone_length, - const char* quality, uint32_t quality_length); + WindowType type, uint32_t backbone_start, const char* backbone, + uint32_t backbone_length, const char* quality, uint32_t quality_length); #ifdef CUDA_ENABLED friend class CUDABatchProcessor; #endif private: - Window(uint64_t id, uint32_t rank, WindowType type, const char* backbone, - uint32_t backbone_length, const char* quality, uint32_t quality_length); + Window(uint64_t id, uint32_t rank, WindowType type, uint32_t backbone_start, + const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t quality_length); Window(const Window&) = delete; const Window& operator=(const Window&) = delete; uint64_t id_; uint32_t rank_; WindowType type_; + uint32_t start_; std::string consensus_; std::vector> sequences_; std::vector> qualities_; From 011de11e0dc706b37337d1b559b6eaedfcf7700e Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 22 Jun 2020 13:18:44 +0200 Subject: [PATCH 03/49] Added a operator<< to Window for debugging purposes. --- src/window.cpp | 13 +++++++++++++ src/window.hpp | 6 ++++++ 2 files changed, 19 insertions(+) diff --git a/src/window.cpp b/src/window.cpp index a2a593b..0975512 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -142,4 +142,17 @@ bool Window::generate_consensus(std::shared_ptr alignment return true; } +std::ostream& operator<<(std::ostream& os, const Window& a) +{ + os << "Window: id = " << a.id_ << ", rank = " << a.rank_ << ", type = " + << ((a.type_ == WindowType::kNGS) ? "NGS" : "TGS") + << ", start = " << a.start_ + << ", backbone_len = " << a.backbone_length() + << ", consensus_len = " << a.consensus_.size() + << ", seqs_len = " << a.sequences_.size() + << ", quals_len = " << a.qualities_.size() + << ", pos_len = " << a.positions_.size(); + return os; +} + } diff --git a/src/window.hpp b/src/window.hpp index 44ed8b4..373a247 100644 --- a/src/window.hpp +++ b/src/window.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -62,9 +63,12 @@ class Window { WindowType type, uint32_t backbone_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); + friend std::ostream& operator<<(std::ostream& os, const Window& a); + #ifdef CUDA_ENABLED friend class CUDABatchProcessor; #endif + private: Window(uint64_t id, uint32_t rank, WindowType type, uint32_t backbone_start, const char* backbone, uint32_t backbone_length, const char* quality, @@ -82,4 +86,6 @@ class Window { std::vector> positions_; }; +std::ostream& operator<<(std::ostream& os, const Window& a); + } From 4eccb0bb6c39574ca6bd7a9061749bf4c3ad43e0 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 22 Jun 2020 14:35:24 +0200 Subject: [PATCH 04/49] The polisher can now fill in the gap in between windows if they are far apart using the original sequence. --- src/polisher.cpp | 7 +++++++ src/window.hpp | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/polisher.cpp b/src/polisher.cpp index 130c6a7..7113a80 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -508,10 +508,16 @@ void Polisher::polish(std::vector>& dst, uint64_t logger_step = thread_futures.size() / 20; + uint64_t prev_window_end = 0; + for (uint64_t i = 0; i < thread_futures.size(); ++i) { thread_futures[i].wait(); num_polished_windows += thread_futures[i].get() == true ? 1 : 0; + if (windows_[i]->start() > prev_window_end) { + uint64_t span = windows_[i]->start() - prev_window_end; + polished_data += sequences_[windows_[i]->id()]->data().substr(prev_window_end, span); + } polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { @@ -530,6 +536,7 @@ void Polisher::polish(std::vector>& dst, num_polished_windows = 0; polished_data.clear(); } + prev_window_end = windows_[i]->start() + windows_[i]->backbone_length(); windows_[i].reset(); if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) { diff --git a/src/window.hpp b/src/window.hpp index 373a247..ccdb847 100644 --- a/src/window.hpp +++ b/src/window.hpp @@ -52,6 +52,10 @@ class Window { return sequences_.front().second; } + uint32_t start() const { + return start_; + } + bool generate_consensus(std::shared_ptr alignment_engine, bool trim); From ffb68b46fdaeb5916153a81fa1abf876db612812 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 22 Jun 2020 14:42:43 +0200 Subject: [PATCH 05/49] Minor refactor in src/polisher.cpp - extracted the window creation code into a separate function for scoping. --- src/polisher.cpp | 10 +++++++--- src/polisher.hpp | 5 ++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 7113a80..fe0c849 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -10,7 +10,6 @@ #include "overlap.hpp" #include "sequence.hpp" -#include "window.hpp" #include "logger.hpp" #include "polisher.hpp" #ifdef CUDA_ENABLED @@ -377,6 +376,13 @@ void Polisher::initialize() { it.wait(); } + create_and_populate_windows(overlaps, targets_size, window_type); + + logger_->log("[racon::Polisher::initialize] transformed data into windows"); +} + +void Polisher::create_and_populate_windows(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type) { find_overlap_breaking_points(overlaps); logger_->log(); @@ -455,8 +461,6 @@ void Polisher::initialize() { overlaps[i].reset(); } - - logger_->log("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::find_overlap_breaking_points(std::vector>& overlaps) diff --git a/src/polisher.hpp b/src/polisher.hpp index 051ef8c..d8d77d2 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -6,11 +6,12 @@ #pragma once -#include +#include #include #include #include #include +#include "window.hpp" namespace bioparser { template @@ -73,6 +74,8 @@ class Polisher { Polisher(const Polisher&) = delete; const Polisher& operator=(const Polisher&) = delete; virtual void find_overlap_breaking_points(std::vector>& overlaps); + void create_and_populate_windows(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type); std::unique_ptr> sparser_; std::unique_ptr> oparser_; From 5cd83d86d87098668820a6be0d6db19f476e6f0a Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 22 Jun 2020 17:59:40 +0200 Subject: [PATCH 06/49] Added the suffix portion of the target sequence after the last window. --- src/polisher.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index fe0c849..1721bd8 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -377,8 +377,6 @@ void Polisher::initialize() { } create_and_populate_windows(overlaps, targets_size, window_type); - - logger_->log("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::create_and_populate_windows(std::vector>& overlaps, @@ -461,6 +459,8 @@ void Polisher::create_and_populate_windows(std::vector> overlaps[i].reset(); } + + logger_->log("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::find_overlap_breaking_points(std::vector>& overlaps) @@ -525,6 +525,13 @@ void Polisher::polish(std::vector>& dst, polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { + // Append the remaining suffix from the last window to the end of the target. + uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); + if ((windows_[i]->start() + windows_[i]->backbone_length()) < tlen) { + uint64_t suffix_start = windows_[i]->start() + windows_[i]->backbone_length(); + polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); + } + double polished_ratio = num_polished_windows / static_cast(windows_[i]->rank() + 1); From 6efcfc80bb3a0eea37edef14137fc1c6e560aaa1 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 12:56:51 +0200 Subject: [PATCH 07/49] Implemented a simple BED parser and added unit tests. --- src/bed.cpp | 75 ++++++++++++++++++++++++++++++++++++ src/bed.hpp | 92 +++++++++++++++++++++++++++++++++++++++++++++ src/meson.build | 3 +- test/bed_test.cpp | 96 +++++++++++++++++++++++++++++++++++++++++++++++ test/meson.build | 3 +- 5 files changed, 267 insertions(+), 2 deletions(-) create mode 100644 src/bed.cpp create mode 100644 src/bed.hpp create mode 100644 test/bed_test.cpp diff --git a/src/bed.cpp b/src/bed.cpp new file mode 100644 index 0000000..a970287 --- /dev/null +++ b/src/bed.cpp @@ -0,0 +1,75 @@ +/*! + * @file bed.cpp + * + * @brief BED reader source file + */ + +#include +#include +#include + +#include "bed.hpp" + +namespace racon { + +bool BedFile::Deserialize(const std::string& line, BedRecord& record) { + if (line.empty()) { + return false; + } + char name_buff[1024]; + int64_t chrom_start = 0, chrom_end = 0; + int32_t n = sscanf(line.c_str(), "%s %ld %ld", name_buff, &chrom_start, &chrom_end); + if (n < 3) { + throw std::runtime_error("Invalid BED line: '" + line + "'"); + } + record = BedRecord(name_buff, chrom_start, chrom_end); + return true; +} + +void BedFile::Serialize(std::ostream& os, const BedRecord& record) { + os << record.chrom() << " " << record.chrom_start() << " " << record.chrom_end(); +} + +std::string BedFile::Serialize(const BedRecord& record) { + std::ostringstream oss; + Serialize(oss, record); + return oss.str(); +} + +BedReader::BedReader(const std::string& in_fn) + : file_{std::unique_ptr(new std::ifstream(in_fn))} + , in_{*file_.get()} +{ +} + +BedReader::BedReader(std::istream& in) + : in_{in} +{ +} + +bool BedReader::GetNext(BedRecord& record) { + const bool rv1 = !std::getline(in_, line_).fail(); + if (!rv1) + return false; + const bool rv2 = BedFile::Deserialize(line_, record); + return rv2; +} + +std::vector BedReader::ReadAll(const std::string& fn) +{ + std::ifstream in{fn}; + return ReadAll(in); +} + +std::vector BedReader::ReadAll(std::istream& in) +{ + std::vector records; + BedReader reader{in}; + BedRecord record; + while (reader.GetNext(record)) { + records.emplace_back(std::move(record)); + } + return records; +} + +} diff --git a/src/bed.hpp b/src/bed.hpp new file mode 100644 index 0000000..5737602 --- /dev/null +++ b/src/bed.hpp @@ -0,0 +1,92 @@ +/*! + * @file bed.hpp + * + * @brief BED file containers and parser. + */ + +#pragma once + +#include +#include +#include +#include + +namespace racon { + +class BedRecord; + +class BedFile { +public: + static bool Deserialize(const std::string& line, BedRecord& record); + static void Serialize(std::ostream& os, const BedRecord& record); + static std::string Serialize(const BedRecord& record); +}; + +/* + * \brief BedRecord container. + * Note: BED records have 0-based coordinates, and the end coordinate is non-inclusive. +*/ +class BedRecord { +public: + ~BedRecord() = default; + + BedRecord() = default; + + BedRecord(std::string _chrom, int64_t _chrom_start, int64_t _chrom_end) + : chrom_(std::move(_chrom)) + , chrom_start_(_chrom_start) + , chrom_end_(_chrom_end) {} + + const std::string& chrom() const { + return chrom_; + } + int64_t chrom_start() const { + return chrom_start_; + } + int64_t chrom_end() const { + return chrom_end_; + } + + void chrom(const std::string& val) { + chrom_ = val; + } + void chrom_start(int64_t val) { + chrom_start_ = val; + } + void chrom_end(int64_t val) { + chrom_end_ = val; + } + + bool operator==(const BedRecord& rhs) const + { + return chrom_ == rhs.chrom_ && chrom_start_ == rhs.chrom_start_ + && chrom_end_ == rhs.chrom_end_; + } + + std::ostream& operator<<(std::ostream& os) const { + BedFile::Serialize(os, *this); + return os; + } + +private: + std::string chrom_; + int64_t chrom_start_ = 0; + int64_t chrom_end_ = 0; +}; + +class BedReader { +public: + BedReader(const std::string& in_fn); + BedReader(std::istream& in); + + static std::vector ReadAll(const std::string& fn); + static std::vector ReadAll(std::istream& in); + bool GetNext(BedRecord& record); + +private: + std::unique_ptr file_; + std::istream& in_; + std::string line_; +}; + +} diff --git a/src/meson.build b/src/meson.build index 7f0d21c..06e9b87 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,9 +1,10 @@ racon_cpp_sources = files([ + 'bed.cpp', 'logger.cpp', 'overlap.cpp', 'polisher.cpp', 'sequence.cpp', - 'window.cpp' + 'window.cpp', ]) racon_extra_flags = [] diff --git a/test/bed_test.cpp b/test/bed_test.cpp new file mode 100644 index 0000000..a798e9b --- /dev/null +++ b/test/bed_test.cpp @@ -0,0 +1,96 @@ +/*! + * @file bed_test.cpp + * + * @brief Unit tests for the BED parser. + */ + +#include "gtest/gtest.h" +#include "racon_test_config.h" +#include "bed.hpp" + +#include +#include +#include + +TEST(BedFile, DeserializeTests) { + // Tuple: test_name, input line, expected record, expected return value, expected to throw. + std::vector> test_data = { + {"Empty input", "", racon::BedRecord(), false,false}, + {"Three columns", "chr01 0 1000", racon::BedRecord("chr01", 0, 1000), true, false}, + {"Multiple columns", "chr01 1000 2000 some other columns that are ignored", racon::BedRecord("chr01", 1000, 2000), true, false}, + {"Invalid BED line", "bad_bed", racon::BedRecord(), true, true}, + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string& in_line = std::get<1>(single_test); + const racon::BedRecord& expected_record = std::get<2>(single_test); + const bool expected_rv = std::get<3>(single_test); + const bool expected_throw = std::get<4>(single_test); + + SCOPED_TRACE(test_name); + + if (expected_throw) { + EXPECT_THROW( + { + racon::BedRecord record; + racon::BedFile::Deserialize(in_line, record); + }, + std::runtime_error); + } else { + racon::BedRecord record; + const bool rv = racon::BedFile::Deserialize(in_line, record); + EXPECT_EQ(expected_record, record); + EXPECT_EQ(expected_rv, rv); + } + + } +} + +TEST(BedReader, AllTests) { + // Tuple: test_name, input line, expected record, expected return value. + std::vector, bool>> test_data = { + {"Empty input", "", {}, false}, + {"Normal BED file", +R"(chr01 0 1000 +chr02 1000 2000 +chr03 2000 3000 +)", + { + racon::BedRecord("chr01", 0, 1000), + racon::BedRecord("chr02", 1000, 2000), + racon::BedRecord("chr03", 2000, 3000), + }, + false + }, + {"Normal BED file", +R"(chr01 0 1000 +bad_line +)", + {}, true + }, + + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string in_line = std::get<1>(single_test); + const std::vector& expected_records = std::get<2>(single_test); + const bool expected_throw = std::get<3>(single_test); + + SCOPED_TRACE(test_name); + + std::istringstream iss(in_line); + + if (expected_throw) { + EXPECT_THROW( + { + std::vector records = racon::BedReader::ReadAll(iss); + }, + std::runtime_error); + } else { + std::vector records = racon::BedReader::ReadAll(iss); + EXPECT_EQ(expected_records, records); + } + } +} diff --git a/test/meson.build b/test/meson.build index 86d7c7b..2a4d878 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,5 +1,6 @@ racon_test_cpp_sources = files([ - 'racon_test.cpp' + 'bed_test.cpp', + 'racon_test.cpp', ]) racon_test_include_directories = [include_directories('.')] From 09239302df107b5459155a774b3333eacaa541be Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 13:04:59 +0200 Subject: [PATCH 08/49] The createPolisher function now accepts a BED file path and parses it. --- src/main.cpp | 2 +- src/polisher.cpp | 35 +++++++++++++++++++++++++++++++++++ src/polisher.hpp | 2 ++ test/racon_test.cpp | 14 ++++++++------ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 6775ebc..6fe7f9d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -159,7 +159,7 @@ int main(int argc, char** argv) { std::cerr << "BED file: '" << bed_file << "'\n"; auto polisher = racon::createPolisher(input_paths[0], input_paths[1], - input_paths[2], type == 0 ? racon::PolisherType::kC : + input_paths[2], bed_file, type == 0 ? racon::PolisherType::kC : racon::PolisherType::kF, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches, diff --git a/src/polisher.cpp b/src/polisher.cpp index 1721bd8..ae903ae 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -12,6 +12,7 @@ #include "sequence.hpp" #include "logger.hpp" #include "polisher.hpp" +#include "bed.hpp" #ifdef CUDA_ENABLED #include "cuda/cudapolisher.hpp" #endif @@ -20,6 +21,9 @@ #include "thread_pool/thread_pool.hpp" #include "spoa/spoa.hpp" +#define BED_FEATURE +// #define BED_FEATURE_TEST + namespace racon { constexpr uint32_t kChunkSize = 1024 * 1024 * 1024; // ~ 1GB @@ -53,6 +57,7 @@ uint64_t shrinkToFit(std::vector>& src, uint64_t begin) { std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment, @@ -131,6 +136,15 @@ std::unique_ptr createPolisher(const std::string& sequences_path, exit(1); } + // std::unordered_map bed_records; + if (bed_path.size() > 0) { + use_bed = true; + bed_records = BedReader::ReadAll(bed_path); + } + std::cerr << "Use bed: " << (use_bed ? "true" : "false") << ", bed records: " << bed_records.size() << "\n"; + if (cudapoa_batches > 0 || cudaaligner_batches > 0) { #ifdef CUDA_ENABLED @@ -460,6 +474,21 @@ void Polisher::create_and_populate_windows(std::vector> overlaps[i].reset(); } +#ifdef BED_FEATURE_TEST + int32_t shift = 0; + for (int32_t i = 0; i < static_cast(windows_.size()); ++i) { + if ((i + 1) % 100 != 0) { + windows_[i].reset(); + ++shift; + // std::cerr << "nulling i = " << i << "\n"; + continue; + } + std::swap(windows_[i-shift], windows_[i]); + } + windows_.resize(static_cast(windows_.size()) - shift); + std::cerr << "windows_.size() = " << windows_.size() << "\n"; +#endif + logger_->log("[racon::Polisher::initialize] transformed data into windows"); } @@ -518,19 +547,25 @@ void Polisher::polish(std::vector>& dst, thread_futures[i].wait(); num_polished_windows += thread_futures[i].get() == true ? 1 : 0; + +#ifdef BED_FEATURE if (windows_[i]->start() > prev_window_end) { uint64_t span = windows_[i]->start() - prev_window_end; polished_data += sequences_[windows_[i]->id()]->data().substr(prev_window_end, span); } +#endif + polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { +#ifdef BED_FEATURE // Append the remaining suffix from the last window to the end of the target. uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); if ((windows_[i]->start() + windows_[i]->backbone_length()) < tlen) { uint64_t suffix_start = windows_[i]->start() + windows_[i]->backbone_length(); polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); } +#endif double polished_ratio = num_polished_windows / static_cast(windows_[i]->rank() + 1); diff --git a/src/polisher.hpp b/src/polisher.hpp index d8d77d2..d8ec7a5 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -42,6 +42,7 @@ enum class PolisherType { class Polisher; std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cuda_batches = 0, @@ -59,6 +60,7 @@ class Polisher { friend std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, + const std::string& bed_path, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads, uint32_t cuda_batches, bool cuda_banded_alignment, diff --git a/test/racon_test.cpp b/test/racon_test.cpp index cadf658..ca51819 100644 --- a/test/racon_test.cpp +++ b/test/racon_test.cpp @@ -32,8 +32,10 @@ class RaconPolishingTest: public ::testing::Test { int8_t match, int8_t mismatch, int8_t gap, uint32_t cuda_batches = 0, bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0) { + const std::string bed_path; // Intentionally empty. + polisher = racon::createPolisher(sequences_path, overlaps_path, target_path, - type, window_length, quality_threshold, error_threshold, true, match, + bed_path, type, window_length, quality_threshold, error_threshold, true, match, mismatch, gap, 4, cuda_batches, cuda_banded_alignment, cudaaligner_batches); } @@ -53,18 +55,18 @@ class RaconPolishingTest: public ::testing::Test { }; TEST(RaconInitializeTest, PolisherTypeError) { - EXPECT_DEATH((racon::createPolisher("", "", "", static_cast(3), + EXPECT_DEATH((racon::createPolisher("", "", "", "", static_cast(3), 0, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: invalid polisher" " type!"); } TEST(RaconInitializeTest, WindowLengthError) { - EXPECT_DEATH((racon::createPolisher("", "", "", racon::PolisherType::kC, 0, + EXPECT_DEATH((racon::createPolisher("", "", "", "", racon::PolisherType::kC, 0, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: invalid window length!"); } TEST(RaconInitializeTest, SequencesPathExtensionError) { - EXPECT_DEATH((racon::createPolisher("", "", "", racon::PolisherType::kC, 500, + EXPECT_DEATH((racon::createPolisher("", "", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has unsupported " "format extension .valid extensions: .fasta, .fasta.gz, .fna, .fna.gz, " ".fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz.!"); @@ -72,14 +74,14 @@ TEST(RaconInitializeTest, SequencesPathExtensionError) { TEST(RaconInitializeTest, OverlapsPathExtensionError) { EXPECT_DEATH((racon::createPolisher(racon_test_data_path + "sample_reads.fastq.gz", - "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), + "", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has unsupported format extension " ".valid extensions: .mhap, .mhap.gz, .paf, .paf.gz, .sam, .sam.gz.!"); } TEST(RaconInitializeTest, TargetPathExtensionError) { EXPECT_DEATH((racon::createPolisher(racon_test_data_path + "sample_reads.fastq.gz", - racon_test_data_path + "sample_overlaps.paf.gz", "", racon::PolisherType::kC, + racon_test_data_path + "sample_overlaps.paf.gz", "", "", racon::PolisherType::kC, 500, 0, 0, 0, 0, 0, 0, 0)), ".racon::createPolisher. error: file has " "unsupported format extension .valid extensions: .fasta, .fasta.gz, .fna, " ".fna.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz.!"); From a7645c91b366d3a490c778e16b94a860f9643d8a Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 13:13:20 +0200 Subject: [PATCH 09/49] Added the interval tree library. --- .gitmodules | 3 +++ vendor/intervaltree | 1 + vendor/meson.build | 6 ++++-- 3 files changed, 8 insertions(+), 2 deletions(-) create mode 160000 vendor/intervaltree diff --git a/.gitmodules b/.gitmodules index 2b8d112..39b5ca2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,4 +19,7 @@ [submodule "vendor/GenomeWorks"] path = vendor/GenomeWorks url = https://github.com/clara-parabricks/GenomeWorks.git +[submodule "vendor/intervaltree"] + path = vendor/intervaltree + url = https://github.com/ekg/intervaltree.git branch = master diff --git a/vendor/intervaltree b/vendor/intervaltree new file mode 160000 index 0000000..b90527f --- /dev/null +++ b/vendor/intervaltree @@ -0,0 +1 @@ +Subproject commit b90527f9e6d51cd36ecbb50429e4524d3a418ea5 diff --git a/vendor/meson.build b/vendor/meson.build index d56b22a..017255e 100644 --- a/vendor/meson.build +++ b/vendor/meson.build @@ -1,5 +1,6 @@ vendor_cpp_sources = files([ 'edlib/edlib/src/edlib.cpp', + 'intervaltree/interval_tree_test.cpp', 'rampler/src/sampler.cpp', 'rampler/src/sequence.cpp', 'spoa/src/alignment_engine.cpp', @@ -7,15 +8,16 @@ vendor_cpp_sources = files([ 'spoa/src/sequence.cpp', 'spoa/src/simd_alignment_engine.cpp', 'spoa/src/sisd_alignment_engine.cpp', - 'thread_pool/src/thread_pool.cpp' + 'thread_pool/src/thread_pool.cpp', ]) vendor_include_directories = [ include_directories('bioparser/include'), include_directories('edlib/edlib/include'), + include_directories('intervaltree/'), include_directories('rampler/src'), include_directories('spoa/include'), - include_directories('thread_pool/include') + include_directories('thread_pool/include'), ] vendor_extra_flags = [] From 599f56e6e34d736d40e9d8e65014802bbc992e61 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 15:47:50 +0200 Subject: [PATCH 10/49] Minor refactor, extracted the function transmuteId from overlap.cpp to a new file util.hpp. --- src/overlap.cpp | 13 +------------ src/util.hpp | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 src/util.hpp diff --git a/src/overlap.cpp b/src/overlap.cpp index 98cf6f7..e26d8a8 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -8,6 +8,7 @@ #include "sequence.hpp" #include "overlap.hpp" +#include "util.hpp" #include "edlib.h" namespace racon { @@ -114,18 +115,6 @@ Overlap::Overlap() breaking_points_(), dual_breaking_points_() { } -template -bool transmuteId(const std::unordered_map& t_to_id, const T& t, - uint64_t& id) { - - auto it = t_to_id.find(t); - if (it == t_to_id.end()) { - return false; - } - id = it->second; - return true; -} - void Overlap::transmute(const std::vector>& sequences, const std::unordered_map& name_to_id, const std::unordered_map& id_to_id) { diff --git a/src/util.hpp b/src/util.hpp new file mode 100644 index 0000000..36a4f67 --- /dev/null +++ b/src/util.hpp @@ -0,0 +1,26 @@ +/*! + * @file util.hpp + * + * @brief Utility functions used throughout the code. + */ + +#pragma once + +#include +#include + +namespace racon { + +template +bool transmuteId(const std::unordered_map& t_to_id, const T& t, + uint64_t& id) { + + auto it = t_to_id.find(t); + if (it == t_to_id.end()) { + return false; + } + id = it->second; + return true; +} + +} From e252d93e0b10f454d8a581aac7a66b55592c35c1 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 16:04:56 +0200 Subject: [PATCH 11/49] Now constructing the interval trees for the given BED records, and validating that there aren't any regions which overlap. --- src/polisher.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++----- src/polisher.hpp | 12 ++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index ae903ae..ef5bb79 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -12,7 +12,7 @@ #include "sequence.hpp" #include "logger.hpp" #include "polisher.hpp" -#include "bed.hpp" +#include "util.hpp" #ifdef CUDA_ENABLED #include "cuda/cudapolisher.hpp" #endif @@ -136,21 +136,22 @@ std::unique_ptr createPolisher(const std::string& sequences_path, exit(1); } - // std::unordered_map bed_records; if (bed_path.size() > 0) { use_bed = true; bed_records = BedReader::ReadAll(bed_path); } - std::cerr << "Use bed: " << (use_bed ? "true" : "false") << ", bed records: " << bed_records.size() << "\n"; + std::cerr << "[racon::createPolisher] Use bed: " << (use_bed ? "true" : "false") << ", bed records: " << bed_records.size() << "\n"; if (cudapoa_batches > 0 || cudaaligner_batches > 0) { #ifdef CUDA_ENABLED // If CUDA is enabled, return an instance of the CUDAPolisher object. return std::unique_ptr(new CUDAPolisher(std::move(sparser), - std::move(oparser), std::move(tparser), type, window_length, + std::move(oparser), std::move(tparser), + std::move(bed_records), use_bed, + type, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches, cudaaligner_band_width)); @@ -166,7 +167,9 @@ std::unique_ptr createPolisher(const std::string& sequences_path, { (void) cuda_banded_alignment; return std::unique_ptr(new Polisher(std::move(sparser), - std::move(oparser), std::move(tparser), type, window_length, + std::move(oparser), std::move(tparser), + std::move(bed_records), use_bed, + type, window_length, quality_threshold, error_threshold, trim, match, mismatch, gap, num_threads)); } @@ -175,11 +178,13 @@ std::unique_ptr createPolisher(const std::string& sequences_path, Polisher::Polisher(std::unique_ptr> sparser, std::unique_ptr> oparser, std::unique_ptr> tparser, + std::vector bed_records, bool use_bed, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads) : sparser_(std::move(sparser)), oparser_(std::move(oparser)), - tparser_(std::move(tparser)), type_(type), quality_threshold_( + tparser_(std::move(tparser)), bed_records_(std::move(bed_records)), + use_bed_(use_bed), type_(type), quality_threshold_( quality_threshold), error_threshold_(error_threshold), trim_(trim), alignment_engines_(), sequences_(), dummy_quality_(window_length, '!'), window_length_(window_length), windows_(), @@ -292,6 +297,48 @@ void Polisher::initialize() { logger_->log("[racon::Polisher::initialize] loaded sequences"); logger_->log(); + + + ///////////////////////////////// + // Collect the intervals. + logger_->log("[racon::Polisher::initialize] building the interval trees"); + logger_->log(); + IntervalVectorInt64 intervals; + std::unordered_map> target_intervals; + for (size_t i = 0; i < bed_records_.size(); ++i) { + const auto& record = bed_records_[i]; + uint64_t t_id = 0; + if (!transmuteId(name_to_id, record.chrom() + "t", t_id)) { + throw std::runtime_error("Target sequence '" + record.chrom() + + "' specified in the BED file was not found among the target sequences."); + } + target_intervals[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + } + // Construct the trees. + target_trees_.clear(); + for (auto& it: target_intervals) { + // Make a copy, because the IntervalTree has only the move constructor, + // and we still need t he intvervals for validation below. + auto intervals = it.second; + target_trees_[it.first] = IntervalTreeInt64(std::move(intervals)); + } + // Validate that there are no overlapping intervals. + for (const auto& it: target_intervals) { + int64_t t_id = it.first; + for (const auto& interval: it.second) { + auto foundIntervals = target_trees_[t_id].findOverlapping(interval.start, interval.stop); + if (foundIntervals.size() != 1 || + (foundIntervals.size() == 1 && foundIntervals.front().value != interval.value)) { + throw std::runtime_error("Invalid BED record: '" + + BedFile::Serialize(bed_records_[interval.value]) + + "'. It overlaps other BED records, which is not allowed."); + } + } + } + ///////////////////////////////// + + + std::vector> overlaps; auto remove_invalid_overlaps = [&](uint64_t begin, uint64_t end) -> void { diff --git a/src/polisher.hpp b/src/polisher.hpp index d8ec7a5..fbe5293 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -12,6 +12,9 @@ #include #include #include "window.hpp" +#include "bed.hpp" +#include "IntervalTree.h" +#include namespace bioparser { template @@ -39,6 +42,10 @@ enum class PolisherType { kF // Fragment error correction }; +using IntervalTreeInt64 = IntervalTree; +using IntervalVectorInt64 = IntervalTreeInt64::interval_vector; +using IntervalInt64 = IntervalTreeInt64::interval; + class Polisher; std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, @@ -70,6 +77,7 @@ class Polisher { Polisher(std::unique_ptr> sparser, std::unique_ptr> oparser, std::unique_ptr> tparser, + std::vector bed_records, bool use_bed, PolisherType type, uint32_t window_length, double quality_threshold, double error_threshold, bool trim, int8_t match, int8_t mismatch, int8_t gap, uint32_t num_threads); @@ -83,6 +91,9 @@ class Polisher { std::unique_ptr> oparser_; std::unique_ptr> tparser_; + std::vector bed_records_; + bool use_bed_; + PolisherType type_; double quality_threshold_; double error_threshold_; @@ -90,6 +101,7 @@ class Polisher { std::vector> alignment_engines_; std::vector> sequences_; + std::unordered_map target_trees_; std::vector targets_coverages_; std::string dummy_quality_; From 5386bef55865274d111ba632a2e651bf47571b8a Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 16:16:03 +0200 Subject: [PATCH 12/49] Added t_begin() and t_end() to Overlap. --- src/overlap.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/overlap.hpp b/src/overlap.hpp index f78936f..7599e5d 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -41,6 +41,14 @@ class Overlap { return t_id_; } + uint32_t t_begin() const { + return t_begin_; + } + + uint32_t t_end() const { + return t_end_; + } + uint32_t strand() const { return strand_; } From c1fabf786e19860b2c802baef345be0c1d00f0c4 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 16:16:25 +0200 Subject: [PATCH 13/49] Implemented overlap filtering when BED is specified, to remove overlaps outside of regions of interest. --- src/polisher.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/polisher.cpp b/src/polisher.cpp index ef5bb79..9ff1c48 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -381,6 +381,17 @@ void Polisher::initialize() { continue; } + // Remove overlaps in regions not specified by BED. + if (use_bed_) { + auto foundIntervals = target_trees_[overlaps[i]->t_id()].findOverlapping( + static_cast(overlaps[i]->t_begin()), + static_cast(overlaps[i]->t_end()) - 1); + if (foundIntervals.empty()) { + overlaps[i].reset(); + continue; + } + } + while (overlaps[c] == nullptr) { ++c; } From 4537c7e4f8f06b334b7062d4a465fc16bae21e30 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 23 Jun 2020 16:29:20 +0200 Subject: [PATCH 14/49] Now storing the intervals in the Polisher too, and sorting them right before the trees are constructed (will be useful later). --- src/polisher.cpp | 14 +++++++++----- src/polisher.hpp | 4 +++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 9ff1c48..1e8855a 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -303,8 +303,7 @@ void Polisher::initialize() { // Collect the intervals. logger_->log("[racon::Polisher::initialize] building the interval trees"); logger_->log(); - IntervalVectorInt64 intervals; - std::unordered_map> target_intervals; + target_intervals_.clear(); for (size_t i = 0; i < bed_records_.size(); ++i) { const auto& record = bed_records_[i]; uint64_t t_id = 0; @@ -312,18 +311,23 @@ void Polisher::initialize() { throw std::runtime_error("Target sequence '" + record.chrom() + "' specified in the BED file was not found among the target sequences."); } - target_intervals[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + target_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + } + // Sort target intervals. + for (auto& it: target_intervals_) { + std::stable_sort(it.second.begin(), it.second.end(), + [](const IntervalInt64& a, const IntervalInt64& b) { return a.start < b.start; }); } // Construct the trees. target_trees_.clear(); - for (auto& it: target_intervals) { + for (const auto& it: target_intervals_) { // Make a copy, because the IntervalTree has only the move constructor, // and we still need t he intvervals for validation below. auto intervals = it.second; target_trees_[it.first] = IntervalTreeInt64(std::move(intervals)); } // Validate that there are no overlapping intervals. - for (const auto& it: target_intervals) { + for (const auto& it: target_intervals_) { int64_t t_id = it.first; for (const auto& interval: it.second) { auto foundIntervals = target_trees_[t_id].findOverlapping(interval.start, interval.stop); diff --git a/src/polisher.hpp b/src/polisher.hpp index fbe5293..04d1d47 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -101,7 +101,6 @@ class Polisher { std::vector> alignment_engines_; std::vector> sequences_; - std::unordered_map target_trees_; std::vector targets_coverages_; std::string dummy_quality_; @@ -112,6 +111,9 @@ class Polisher { std::unordered_map thread_to_id_; std::unique_ptr logger_; + + std::unordered_map> target_intervals_; + std::unordered_map target_trees_; }; } From 001d3a5ed7bf78c0c847ab879aed64346ee82c02 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 11:20:30 +0200 Subject: [PATCH 15/49] Added the end_ coordinate to the Window class. --- src/polisher.cpp | 6 +++--- src/window.cpp | 9 +++++---- src/window.hpp | 11 ++++++++--- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 1e8855a..e3763b7 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -623,8 +623,8 @@ void Polisher::polish(std::vector>& dst, #ifdef BED_FEATURE // Append the remaining suffix from the last window to the end of the target. uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); - if ((windows_[i]->start() + windows_[i]->backbone_length()) < tlen) { - uint64_t suffix_start = windows_[i]->start() + windows_[i]->backbone_length(); + if (windows_[i]->end() < tlen) { + uint64_t suffix_start = windows_[i]->end; polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); } #endif @@ -644,7 +644,7 @@ void Polisher::polish(std::vector>& dst, num_polished_windows = 0; polished_data.clear(); } - prev_window_end = windows_[i]->start() + windows_[i]->backbone_length(); + prev_window_end = windows_[i]->end(); windows_[i].reset(); if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) { diff --git a/src/window.cpp b/src/window.cpp index 0975512..99d03ca 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -13,7 +13,7 @@ namespace racon { std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - uint32_t backbone_start, const char* backbone, uint32_t backbone_length, + uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length) { if (backbone_length == 0 || backbone_length != quality_length) { @@ -22,14 +22,15 @@ std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type exit(1); } - return std::shared_ptr(new Window(id, rank, type, backbone_start, backbone, + return std::shared_ptr(new Window(id, rank, type, window_start, backbone, backbone_length, quality, quality_length)); } -Window::Window(uint64_t id, uint32_t rank, WindowType type, uint32_t backbone_start, +Window::Window(uint64_t id, uint32_t rank, WindowType type, uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length) - : id_(id), rank_(rank), type_(type), start_(backbone_start), consensus_(), sequences_(), + : id_(id), rank_(rank), type_(type), start_(window_start), end_(window_start + backbone_length), + consensus_(), sequences_(), qualities_(), positions_() { sequences_.emplace_back(backbone, backbone_length); diff --git a/src/window.hpp b/src/window.hpp index ccdb847..d16f9af 100644 --- a/src/window.hpp +++ b/src/window.hpp @@ -26,7 +26,7 @@ enum class WindowType { class Window; std::shared_ptr createWindow(uint64_t id, uint32_t rank, WindowType type, - uint32_t backbone_start, const char* backbone, uint32_t backbone_length, const char* quality, + uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); class Window { @@ -56,6 +56,10 @@ class Window { return start_; } + uint32_t end() const { + return end_; + } + bool generate_consensus(std::shared_ptr alignment_engine, bool trim); @@ -64,7 +68,7 @@ class Window { uint32_t end); friend std::shared_ptr createWindow(uint64_t id, uint32_t rank, - WindowType type, uint32_t backbone_start, const char* backbone, + WindowType type, uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); friend std::ostream& operator<<(std::ostream& os, const Window& a); @@ -74,7 +78,7 @@ class Window { #endif private: - Window(uint64_t id, uint32_t rank, WindowType type, uint32_t backbone_start, + Window(uint64_t id, uint32_t rank, WindowType type, uint32_t window_start, const char* backbone, uint32_t backbone_length, const char* quality, uint32_t quality_length); Window(const Window&) = delete; @@ -84,6 +88,7 @@ class Window { uint32_t rank_; WindowType type_; uint32_t start_; + uint32_t end_; std::string consensus_; std::vector> sequences_; std::vector> qualities_; From a16068e2a277d80cc87bb2051d90690737fa521d Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 11:24:07 +0200 Subject: [PATCH 16/49] Minor fix, should be squashed with previous. --- src/polisher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index e3763b7..de2166a 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -624,7 +624,7 @@ void Polisher::polish(std::vector>& dst, // Append the remaining suffix from the last window to the end of the target. uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); if (windows_[i]->end() < tlen) { - uint64_t suffix_start = windows_[i]->end; + uint64_t suffix_start = windows_[i]->end(); polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); } #endif From 6e556fce5708837763ef5996da7f310514bf768c Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 13:00:27 +0200 Subject: [PATCH 17/49] Added BED coordinate sanity check to src/bed.cpp. --- src/bed.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bed.cpp b/src/bed.cpp index a970287..e46dab0 100644 --- a/src/bed.cpp +++ b/src/bed.cpp @@ -19,7 +19,7 @@ bool BedFile::Deserialize(const std::string& line, BedRecord& record) { char name_buff[1024]; int64_t chrom_start = 0, chrom_end = 0; int32_t n = sscanf(line.c_str(), "%s %ld %ld", name_buff, &chrom_start, &chrom_end); - if (n < 3) { + if (n < 3 || chrom_end <= chrom_start) { throw std::runtime_error("Invalid BED line: '" + line + "'"); } record = BedRecord(name_buff, chrom_start, chrom_end); From f094ef3c865bd9e720e77ef7d74e5f65fcc9bdca Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 13:00:46 +0200 Subject: [PATCH 18/49] Added a verbose of the window end coordinate in src/window.cpp. --- src/window.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/window.cpp b/src/window.cpp index 99d03ca..75079cc 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -148,6 +148,7 @@ std::ostream& operator<<(std::ostream& os, const Window& a) os << "Window: id = " << a.id_ << ", rank = " << a.rank_ << ", type = " << ((a.type_ == WindowType::kNGS) ? "NGS" : "TGS") << ", start = " << a.start_ + << ", end = " << a.end_ << ", backbone_len = " << a.backbone_length() << ", consensus_len = " << a.consensus_.size() << ", seqs_len = " << a.sequences_.size() From 1eeeac42e1465d94e3fb4a21fcea85874b4f069f Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 13:01:36 +0200 Subject: [PATCH 19/49] Working on the new Polisher::create_and_populate_windows_with_bed. Right now only generating empty windows for the BED regions. --- src/polisher.cpp | 44 +++++++++++++++++++++++++++++++++++++++++++- src/polisher.hpp | 2 ++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index de2166a..b950461 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -452,7 +452,49 @@ void Polisher::initialize() { it.wait(); } - create_and_populate_windows(overlaps, targets_size, window_type); + if (use_bed_) { + create_and_populate_windows_with_bed(overlaps, targets_size, window_type); + } else { + create_and_populate_windows(overlaps, targets_size, window_type); + } +} + +void Polisher::create_and_populate_windows_with_bed(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type) { + + logger_->log("Constructing windows for BED regions.\n"); + + // The -1 marks that the target doesn't have any windows. + std::vector id_to_first_window_id(targets_size + 1, -1); + + // Target intervals are sorted ahead of time. + for (const auto& it: target_intervals_) { + int64_t t_id = it.first; + const std::vector& intervals = it.second; + + // Mark the window start. + id_to_first_window_id[t_id] = static_cast(windows_.size()); + + // Generate windows for each interval separately. + uint32_t k = 0; + for (const auto& interval: intervals) { + for (int64_t win_start = interval.start; win_start < (interval.stop + 1); + win_start += window_length_, ++k) { + int64_t length = std::min(win_start + static_cast(window_length_), + (interval.stop + 1)) - win_start; + windows_.emplace_back(createWindow(t_id, k, window_type, win_start, + &(sequences_[t_id]->data()[win_start]), length, + sequences_[t_id]->quality().empty() ? &(dummy_quality_[0]) : + &(sequences_[t_id]->quality()[win_start]), length)); + } + } + } + + for (size_t i = 0; i < windows_.size(); ++i) { + std::cerr << "[window " << i << "] " << *windows_[i] << "\n"; + } + + } void Polisher::create_and_populate_windows(std::vector>& overlaps, diff --git a/src/polisher.hpp b/src/polisher.hpp index 04d1d47..9b1f37e 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -86,6 +86,8 @@ class Polisher { virtual void find_overlap_breaking_points(std::vector>& overlaps); void create_and_populate_windows(std::vector>& overlaps, uint64_t targets_size, WindowType window_type); + void create_and_populate_windows_with_bed(std::vector>& overlaps, + uint64_t targets_size, WindowType window_type); std::unique_ptr> sparser_; std::unique_ptr> oparser_; From 7e6585b89592eadfd415d2475fb9a677935ac53e Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 13:21:45 +0200 Subject: [PATCH 20/49] Renamed the target_interva_s and target_trees_ to target_bed_intervals_ and target_bed_trees_, and added the trees for the windows. --- src/polisher.cpp | 64 ++++++++++++++++++++++++++++-------------------- src/polisher.hpp | 8 ++++-- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index b950461..c60991b 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -303,7 +303,7 @@ void Polisher::initialize() { // Collect the intervals. logger_->log("[racon::Polisher::initialize] building the interval trees"); logger_->log(); - target_intervals_.clear(); + target_bed_intervals_.clear(); for (size_t i = 0; i < bed_records_.size(); ++i) { const auto& record = bed_records_[i]; uint64_t t_id = 0; @@ -311,26 +311,26 @@ void Polisher::initialize() { throw std::runtime_error("Target sequence '" + record.chrom() + "' specified in the BED file was not found among the target sequences."); } - target_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + target_bed_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); } // Sort target intervals. - for (auto& it: target_intervals_) { + for (auto& it: target_bed_intervals_) { std::stable_sort(it.second.begin(), it.second.end(), [](const IntervalInt64& a, const IntervalInt64& b) { return a.start < b.start; }); } // Construct the trees. - target_trees_.clear(); - for (const auto& it: target_intervals_) { + target_bed_trees_.clear(); + for (const auto& it: target_bed_intervals_) { // Make a copy, because the IntervalTree has only the move constructor, // and we still need t he intvervals for validation below. auto intervals = it.second; - target_trees_[it.first] = IntervalTreeInt64(std::move(intervals)); + target_bed_trees_[it.first] = IntervalTreeInt64(std::move(intervals)); } // Validate that there are no overlapping intervals. - for (const auto& it: target_intervals_) { + for (const auto& it: target_bed_intervals_) { int64_t t_id = it.first; for (const auto& interval: it.second) { - auto foundIntervals = target_trees_[t_id].findOverlapping(interval.start, interval.stop); + auto foundIntervals = target_bed_trees_[t_id].findOverlapping(interval.start, interval.stop); if (foundIntervals.size() != 1 || (foundIntervals.size() == 1 && foundIntervals.front().value != interval.value)) { throw std::runtime_error("Invalid BED record: '" + @@ -387,7 +387,7 @@ void Polisher::initialize() { // Remove overlaps in regions not specified by BED. if (use_bed_) { - auto foundIntervals = target_trees_[overlaps[i]->t_id()].findOverlapping( + auto foundIntervals = target_bed_trees_[overlaps[i]->t_id()].findOverlapping( static_cast(overlaps[i]->t_begin()), static_cast(overlaps[i]->t_end()) - 1); if (foundIntervals.empty()) { @@ -457,6 +457,22 @@ void Polisher::initialize() { } else { create_and_populate_windows(overlaps, targets_size, window_type); } + +// #ifdef BED_FEATURE_TEST +// int32_t shift = 0; +// for (int32_t i = 0; i < static_cast(windows_.size()); ++i) { +// if ((i + 1) % 100 != 0) { +// windows_[i].reset(); +// ++shift; +// // std::cerr << "nulling i = " << i << "\n"; +// continue; +// } +// std::swap(windows_[i-shift], windows_[i]); +// } +// windows_.resize(static_cast(windows_.size()) - shift); +// std::cerr << "windows_.size() = " << windows_.size() << "\n"; +// #endif + } void Polisher::create_and_populate_windows_with_bed(std::vector>& overlaps, @@ -468,7 +484,7 @@ void Polisher::create_and_populate_windows_with_bed(std::vector id_to_first_window_id(targets_size + 1, -1); // Target intervals are sorted ahead of time. - for (const auto& it: target_intervals_) { + for (const auto& it: target_bed_intervals_) { int64_t t_id = it.first; const std::vector& intervals = it.second; @@ -479,16 +495,27 @@ void Polisher::create_and_populate_windows_with_bed(std::vector(window_length_), (interval.stop + 1)) - win_start; + int64_t win_id = windows_.size(); + windows_.emplace_back(createWindow(t_id, k, window_type, win_start, &(sequences_[t_id]->data()[win_start]), length, sequences_[t_id]->quality().empty() ? &(dummy_quality_[0]) : &(sequences_[t_id]->quality()[win_start]), length)); + + target_window_intervals_[t_id].emplace_back(IntervalInt64(win_start, win_start + length - 1, win_id)); } } } + // Construct the trees. The iterator is not const because the IntervalTree only has the + // move constructor. + target_window_trees_.clear(); + for (auto& it: target_window_intervals_) { + target_window_trees_[it.first] = IntervalTreeInt64(std::move(it.second)); + } for (size_t i = 0; i < windows_.size(); ++i) { std::cerr << "[window " << i << "] " << *windows_[i] << "\n"; @@ -578,21 +605,6 @@ void Polisher::create_and_populate_windows(std::vector> overlaps[i].reset(); } -#ifdef BED_FEATURE_TEST - int32_t shift = 0; - for (int32_t i = 0; i < static_cast(windows_.size()); ++i) { - if ((i + 1) % 100 != 0) { - windows_[i].reset(); - ++shift; - // std::cerr << "nulling i = " << i << "\n"; - continue; - } - std::swap(windows_[i-shift], windows_[i]); - } - windows_.resize(static_cast(windows_.size()) - shift); - std::cerr << "windows_.size() = " << windows_.size() << "\n"; -#endif - logger_->log("[racon::Polisher::initialize] transformed data into windows"); } diff --git a/src/polisher.hpp b/src/polisher.hpp index 9b1f37e..0f01574 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -114,8 +114,12 @@ class Polisher { std::unique_ptr logger_; - std::unordered_map> target_intervals_; - std::unordered_map target_trees_; + std::unordered_map> target_bed_intervals_; + std::unordered_map target_bed_trees_; + + std::unordered_map> target_window_intervals_; + std::unordered_map target_window_trees_; + }; } From b0aa724d463396526e65c4069675197f79b35bbd Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 14:30:42 +0200 Subject: [PATCH 21/49] Expanded the Overlap::breaking_points_ to be a tuple and store the window ID as well. --- src/overlap.cpp | 11 ++++++----- src/overlap.hpp | 9 +++++---- src/polisher.cpp | 30 +++++++++++++++++------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/overlap.cpp b/src/overlap.cpp index e26d8a8..d10ccc9 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -216,16 +216,19 @@ void Overlap::find_breaking_points_from_cigar(uint32_t window_length) { // find breaking points from cigar std::vector window_ends; + uint32_t w_offset = 0; for (uint32_t i = 0; i < t_end_; i += window_length) { if (i > t_begin_) { window_ends.emplace_back(i - 1); + } else { + ++w_offset; } } window_ends.emplace_back(t_end_ - 1); uint32_t w = 0; bool found_first_match = false; - std::pair first_match = {0, 0}, last_match = {0, 0}; + std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; int32_t q_ptr = (strand_ ? (q_length_ - q_end_) : q_begin_) - 1; int32_t t_ptr = t_begin_ - 1; @@ -240,11 +243,9 @@ void Overlap::find_breaking_points_from_cigar(uint32_t window_length) if (!found_first_match) { found_first_match = true; - first_match.first = t_ptr; - first_match.second = q_ptr; + first_match = std::make_tuple(t_ptr, q_ptr, w + w_offset); } - last_match.first = t_ptr + 1; - last_match.second = q_ptr + 1; + last_match = std::make_tuple(t_ptr + 1, q_ptr + 1, w + w_offset); if (t_ptr == window_ends[w]) { if (found_first_match) { breaking_points_.emplace_back(first_match); diff --git a/src/overlap.hpp b/src/overlap.hpp index 7599e5d..a226b19 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -6,11 +6,12 @@ #pragma once -#include -#include +#include +#include #include #include #include +#include #include #include @@ -73,7 +74,7 @@ class Overlap { return cigar_; } - const std::vector>& breaking_points() const { + const std::vector>& breaking_points() const { return breaking_points_; } @@ -127,7 +128,7 @@ class Overlap { bool is_valid_; bool is_transmuted_; - std::vector> breaking_points_; + std::vector> breaking_points_; std::vector> dual_breaking_points_; }; diff --git a/src/polisher.cpp b/src/polisher.cpp index c60991b..f6ee6ac 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -557,7 +557,12 @@ void Polisher::create_and_populate_windows(std::vector> const auto& breaking_points = overlaps[i]->breaking_points(); for (uint32_t j = 0; j < breaking_points.size(); j += 2) { - if (breaking_points[j + 1].second - breaking_points[j].second < 0.02 * window_length_) { + const uint32_t start_curr = std::get<0>(breaking_points[j]); + const uint32_t start_next = std::get<0>(breaking_points[j + 1]); + const uint32_t end_curr = std::get<1>(breaking_points[j]); + const uint32_t end_next = std::get<1>(breaking_points[j + 1]); + + if ((end_next - end_curr) < 0.02 * window_length_) { continue; } @@ -567,10 +572,10 @@ void Polisher::create_and_populate_windows(std::vector> const auto& quality = overlaps[i]->strand() ? sequence->reverse_quality() : sequence->quality(); double average_quality = 0; - for (uint32_t k = breaking_points[j].second; k < breaking_points[j + 1].second; ++k) { + for (uint32_t k = end_curr; k < end_next; ++k) { average_quality += static_cast(quality[k]) - 33; } - average_quality /= breaking_points[j + 1].second - breaking_points[j].second; + average_quality /= (end_next - end_curr); if (average_quality < quality_threshold_) { continue; @@ -578,28 +583,27 @@ void Polisher::create_and_populate_windows(std::vector> } uint64_t window_id = id_to_first_window_id[overlaps[i]->t_id()] + - breaking_points[j].first / window_length_; - uint32_t window_start = (breaking_points[j].first / window_length_) * + start_curr / window_length_; + uint32_t window_start = (start_curr / window_length_) * window_length_; const char* data = overlaps[i]->strand() ? - &(sequence->reverse_complement()[breaking_points[j].second]) : - &(sequence->data()[breaking_points[j].second]); - uint32_t data_length = breaking_points[j + 1].second - - breaking_points[j].second; + &(sequence->reverse_complement()[end_curr]) : + &(sequence->data()[end_curr]); + uint32_t data_length = end_next - end_curr; const char* quality = overlaps[i]->strand() ? (sequence->reverse_quality().empty() ? - nullptr : &(sequence->reverse_quality()[breaking_points[j].second])) + nullptr : &(sequence->reverse_quality()[end_curr])) : (sequence->quality().empty() ? - nullptr : &(sequence->quality()[breaking_points[j].second])); + nullptr : &(sequence->quality()[end_curr])); uint32_t quality_length = quality == nullptr ? 0 : data_length; windows_[window_id]->add_layer(data, data_length, quality, quality_length, - breaking_points[j].first - window_start, - breaking_points[j + 1].first - window_start - 1); + start_curr - window_start, + start_next - window_start - 1); } overlaps[i].reset(); From a3d8975298cd5b4b3e01957bd1b5388242236b5f Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 14:32:02 +0200 Subject: [PATCH 22/49] Added src/util.cpp and moved the IntervalTree typedefs to util.hpp. Added a placeholder for a generic function to find window breaking points base on supplied windows. --- src/meson.build | 1 + src/polisher.hpp | 6 +--- src/util.cpp | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util.hpp | 11 ++++++ 4 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 src/util.cpp diff --git a/src/meson.build b/src/meson.build index 06e9b87..c523691 100644 --- a/src/meson.build +++ b/src/meson.build @@ -4,6 +4,7 @@ racon_cpp_sources = files([ 'overlap.cpp', 'polisher.cpp', 'sequence.cpp', + 'util.cpp', 'window.cpp', ]) diff --git a/src/polisher.hpp b/src/polisher.hpp index 0f01574..f0bdde3 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -14,6 +14,7 @@ #include "window.hpp" #include "bed.hpp" #include "IntervalTree.h" +#include "util.hpp" #include namespace bioparser { @@ -29,7 +30,6 @@ namespace spoa { class AlignmentEngine; } - namespace racon { class Sequence; @@ -42,10 +42,6 @@ enum class PolisherType { kF // Fragment error correction }; -using IntervalTreeInt64 = IntervalTree; -using IntervalVectorInt64 = IntervalTreeInt64::interval_vector; -using IntervalInt64 = IntervalTreeInt64::interval; - class Polisher; std::unique_ptr createPolisher(const std::string& sequences_path, const std::string& overlaps_path, const std::string& target_path, diff --git a/src/util.cpp b/src/util.cpp new file mode 100644 index 0000000..bd7bb60 --- /dev/null +++ b/src/util.cpp @@ -0,0 +1,87 @@ +/*! + * @file util.cpp + * + * @brief Utility source file. + */ + +#include "util.hpp" + +namespace racon { + +std::vector> find_breaking_points_from_cigar( + const std::string& cigar, std::vector windows) +{ + std::sort(windows.begin(), windows.end(), [](const IntervalInt64& a, const IntervalInt64& b) { + return a.start < b.start; + }); + + std::vector> ret; + + // // find breaking points from cigar + // std::vector window_ends; + // uint32_t w_offset = 0; + // for (uint32_t i = 0; i < t_end_; i += window_length) { + // if (i > t_begin_) { + // window_ends.emplace_back(i - 1); + // } else { + // ++w_offset; + // } + // } + // window_ends.emplace_back(t_end_ - 1); + + // uint32_t w = 0; + // bool found_first_match = false; + // std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; + + // int32_t q_ptr = (strand_ ? (q_length_ - q_end_) : q_begin_) - 1; + // int32_t t_ptr = t_begin_ - 1; + + // for (uint32_t i = 0, j = 0; i < cigar_.size(); ++i) { + // if (cigar_[i] == 'M' || cigar_[i] == '=' || cigar_[i] == 'X') { + // uint32_t k = 0, num_bases = atoi(&cigar_[j]); + // j = i + 1; + // while (k < num_bases) { + // ++q_ptr; + // ++t_ptr; + + // if (!found_first_match) { + // found_first_match = true; + // first_match = std::make_tuple(t_ptr, q_ptr, w + w_offset); + // } + // last_match = std::make_tuple(t_ptr + 1, q_ptr + 1, w + w_offset); + // if (t_ptr == window_ends[w]) { + // if (found_first_match) { + // breaking_points_.emplace_back(first_match); + // breaking_points_.emplace_back(last_match); + // } + // found_first_match = false; + // ++w; + // } + + // ++k; + // } + // } else if (cigar_[i] == 'I') { + // q_ptr += atoi(&cigar_[j]); + // j = i + 1; + // } else if (cigar_[i] == 'D' || cigar_[i] == 'N') { + // uint32_t k = 0, num_bases = atoi(&cigar_[j]); + // j = i + 1; + // while (k < num_bases) { + // ++t_ptr; + // if (t_ptr == window_ends[w]) { + // if (found_first_match) { + // breaking_points_.emplace_back(first_match); + // breaking_points_.emplace_back(last_match); + // } + // found_first_match = false; + // ++w; + // } + // ++k; + // } + // } else if (cigar_[i] == 'S' || cigar_[i] == 'H' || cigar_[i] == 'P') { + // j = i + 1; + // } + // } +} + +} diff --git a/src/util.hpp b/src/util.hpp index 36a4f67..8844571 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -6,8 +6,16 @@ #pragma once +#include #include +#include #include +#include +#include "IntervalTree.h" + +using IntervalTreeInt64 = IntervalTree; +using IntervalVectorInt64 = IntervalTreeInt64::interval_vector; +using IntervalInt64 = IntervalTreeInt64::interval; namespace racon { @@ -23,4 +31,7 @@ bool transmuteId(const std::unordered_map& t_to_id, const T& t, return true; } +std::vector> find_breaking_points_from_cigar( + const std::string& cigar, std::vector windows); + } From 662d3f761b940cc398bd175a6409bd41b6870abb Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 21:38:56 +0200 Subject: [PATCH 23/49] Removed the intervaltree/interval_tree_test.cpp from vendor/meson.build because these tests hijacked my unit tests. --- vendor/meson.build | 1 - 1 file changed, 1 deletion(-) diff --git a/vendor/meson.build b/vendor/meson.build index 017255e..2f54dcf 100644 --- a/vendor/meson.build +++ b/vendor/meson.build @@ -1,6 +1,5 @@ vendor_cpp_sources = files([ 'edlib/edlib/src/edlib.cpp', - 'intervaltree/interval_tree_test.cpp', 'rampler/src/sampler.cpp', 'rampler/src/sequence.cpp', 'spoa/src/alignment_engine.cpp', From 42700b9428a820cd2b91dafa7ab977446c2a5a40 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 19:03:02 +0200 Subject: [PATCH 24/49] Implemented a generic window splitting function generate_window_breakpoints in util.*. It breaks the alignment into supplied windows. --- src/util.cpp | 220 ++++++++++++++++++++++++++++++++++----------------- src/util.hpp | 38 ++++++++- 2 files changed, 185 insertions(+), 73 deletions(-) diff --git a/src/util.cpp b/src/util.cpp index bd7bb60..5b7bed5 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -8,80 +8,158 @@ namespace racon { -std::vector> find_breaking_points_from_cigar( - const std::string& cigar, std::vector windows) +// Tuple of: +// using WindowInterval = std::vector>; + +// std::vector> find_breaking_points_from_cigar( +// const std::string& cigar, int32_t q_start, int32_t t_start, +// std::vector windows) +// { +// } + +/* + * \param windows Vector of tuples, where each tuple element is: . +*/ +std::vector generate_window_breakpoints( + const std::string& cigar, int64_t q_start, int64_t t_start, + std::vector> windows) { - std::sort(windows.begin(), windows.end(), [](const IntervalInt64& a, const IntervalInt64& b) { - return a.start < b.start; + std::vector ret; + + if (windows.empty()) { + return ret; + } + + std::sort(windows.begin(), windows.end(), + [](const std::tuple& a, + const std::tuple& b) { + return std::get<0>(a) < std::get<0>(b); }); - std::vector> ret; - - // // find breaking points from cigar - // std::vector window_ends; - // uint32_t w_offset = 0; - // for (uint32_t i = 0; i < t_end_; i += window_length) { - // if (i > t_begin_) { - // window_ends.emplace_back(i - 1); - // } else { - // ++w_offset; - // } - // } - // window_ends.emplace_back(t_end_ - 1); - - // uint32_t w = 0; - // bool found_first_match = false; - // std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; - - // int32_t q_ptr = (strand_ ? (q_length_ - q_end_) : q_begin_) - 1; - // int32_t t_ptr = t_begin_ - 1; - - // for (uint32_t i = 0, j = 0; i < cigar_.size(); ++i) { - // if (cigar_[i] == 'M' || cigar_[i] == '=' || cigar_[i] == 'X') { - // uint32_t k = 0, num_bases = atoi(&cigar_[j]); - // j = i + 1; - // while (k < num_bases) { - // ++q_ptr; - // ++t_ptr; - - // if (!found_first_match) { - // found_first_match = true; - // first_match = std::make_tuple(t_ptr, q_ptr, w + w_offset); - // } - // last_match = std::make_tuple(t_ptr + 1, q_ptr + 1, w + w_offset); - // if (t_ptr == window_ends[w]) { - // if (found_first_match) { - // breaking_points_.emplace_back(first_match); - // breaking_points_.emplace_back(last_match); - // } - // found_first_match = false; - // ++w; - // } - - // ++k; - // } - // } else if (cigar_[i] == 'I') { - // q_ptr += atoi(&cigar_[j]); - // j = i + 1; - // } else if (cigar_[i] == 'D' || cigar_[i] == 'N') { - // uint32_t k = 0, num_bases = atoi(&cigar_[j]); - // j = i + 1; - // while (k < num_bases) { - // ++t_ptr; - // if (t_ptr == window_ends[w]) { - // if (found_first_match) { - // breaking_points_.emplace_back(first_match); - // breaking_points_.emplace_back(last_match); - // } - // found_first_match = false; - // ++w; - // } - // ++k; - // } - // } else if (cigar_[i] == 'S' || cigar_[i] == 'H' || cigar_[i] == 'P') { - // j = i + 1; - // } - // } + // Avoid using the operators. + const char* cigar_str = cigar.c_str(); + int64_t num_windows = static_cast(windows.size()); + + bool found_first_match = false; + std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; + + // The "-1" is because of the while loops below (increment is done at the top). + int64_t q_pos = q_start - 1; + int64_t t_pos = t_start - 1; + int64_t curr_w = 0; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_start) { + ++curr_w; + } + + for (uint32_t i = 0, j = 0; i < cigar.size(); ++i) { + if (cigar_str[i] == 'M' || cigar_str[i] == '=' || cigar_str[i] == 'X') { + uint32_t num_bases = atoi(&cigar_str[j]); + j = i + 1; + for (uint32_t k = 0; k < num_bases; ++k) { + ++q_pos; + ++t_pos; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_pos) { + ++curr_w; + } + if (curr_w >= num_windows) { + break; + } + + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + const auto& win_id = std::get<2>(windows[curr_w]); + + if (t_pos < win_start) { + continue; + } + + if (!found_first_match) { + found_first_match = true; + first_match = std::make_tuple(t_pos, q_pos, win_id); + } + last_match = std::make_tuple(t_pos + 1, q_pos + 1, win_id); + if (t_pos == (win_end - 1)) { + if (found_first_match) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + found_first_match = false; + ++curr_w; + if (curr_w >= num_windows) { + break; + } + } + } + } else if (cigar_str[i] == 'I') { + q_pos += atoi(&cigar_str[j]); + j = i + 1; + } else if (cigar_str[i] == 'D' || cigar_str[i] == 'N') { + uint32_t num_bases = atoi(&cigar_str[j]); + j = i + 1; + for (uint32_t k = 0; k < num_bases; ++k) { + ++t_pos; + + while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_pos) { + ++curr_w; + } + if (curr_w >= num_windows) { + break; + } + + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + + if (t_pos < win_start) { + continue; + } + + if (t_pos == (win_end - 1)) { + if (found_first_match) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + found_first_match = false; + ++curr_w; + if (curr_w >= num_windows) { + break; + } + } + } + } else if (cigar_str[i] == 'S' || cigar_str[i] == 'H' || cigar_str[i] == 'P') { + j = i + 1; + } + } + + // Add the final piece. + if (curr_w < static_cast(windows.size())) { + const auto& win_start = std::get<0>(windows[curr_w]); + const auto& win_end = std::get<1>(windows[curr_w]); + const auto& final_t_start = std::get<0>(first_match); + const auto& final_t_end = std::get<0>(last_match); + if (found_first_match && final_t_start >= win_start && final_t_end <= win_end) { + WindowInterval wi( + std::get<1>(first_match), + std::get<1>(last_match), + std::get<0>(first_match), + std::get<0>(last_match), + std::get<2>(last_match)); + ret.emplace_back(std::move(wi)); + } + } + + return ret; } } diff --git a/src/util.hpp b/src/util.hpp index 8844571..4a2349e 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -19,6 +20,38 @@ using IntervalInt64 = IntervalTreeInt64::interval; namespace racon { +class WindowInterval { +public: + WindowInterval() = default; + + WindowInterval(int64_t _q_start, int64_t _q_end, int64_t _t_start, int64_t _t_end, int64_t _window_id) + : q_start(_q_start) + , q_end(_q_end) + , t_start(_t_start) + , t_end(_t_end) + , window_id(_window_id) + {} + + bool operator==(const WindowInterval& rhs) const + { + return q_start == rhs.q_start && q_end == rhs.q_end && t_start == rhs.t_start + && t_end == rhs.t_end && window_id == rhs.window_id; + } + + std::ostream& operator<<(std::ostream& os) const { + os << "q_start = " << q_start << ", q_end = " << q_end + << ", t_start = " << t_start << "t_end = " << t_end + << "window_id = " << window_id; + return os; + } + + int64_t q_start = 0; + int64_t q_end = 0; + int64_t t_start = 0; + int64_t t_end = 0; + int64_t window_id = -1; +}; + template bool transmuteId(const std::unordered_map& t_to_id, const T& t, uint64_t& id) { @@ -31,7 +64,8 @@ bool transmuteId(const std::unordered_map& t_to_id, const T& t, return true; } -std::vector> find_breaking_points_from_cigar( - const std::string& cigar, std::vector windows); +std::vector generate_window_breakpoints( + const std::string& cigar, int64_t q_start, int64_t t_start, + std::vector> windows); } From 9653b7343bd2dabfc1362aecaf356ae2f7f24b21 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Thu, 25 Jun 2020 19:03:23 +0200 Subject: [PATCH 25/49] Added unit tests for generate_window_breakpoints from util.* to test/util_test.cpp. --- test/meson.build | 1 + test/util_test.cpp | 174 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 test/util_test.cpp diff --git a/test/meson.build b/test/meson.build index 2a4d878..a76ef41 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,5 +1,6 @@ racon_test_cpp_sources = files([ 'bed_test.cpp', + 'util_test.cpp', 'racon_test.cpp', ]) diff --git a/test/util_test.cpp b/test/util_test.cpp new file mode 100644 index 0000000..6673fad --- /dev/null +++ b/test/util_test.cpp @@ -0,0 +1,174 @@ +/*! + * @file util.cpp + * + * @brief Unit tests for the utils. + */ + +#include "gtest/gtest.h" +#include "racon_test_config.h" +#include "util.hpp" + +#include +#include +#include +#include + +TEST(Utils, find_breaking_points_from_cigar) { + // Tuple: test_name, CIGAR string, windows as tuples, expected breakpoints split into windows. + std::vector>, std::vector>> test_data = { + {"Empty input", "", 0, 0, {}, {}}, + + {"Simple windowing", "1000M", 0, 0, + { + {100, 500, 0}, + {700, 800, 1}, + {900, 950, 2}, + }, + { + racon::WindowInterval(100, 500, 100, 500, 0), + racon::WindowInterval(700, 800, 700, 800, 1), + racon::WindowInterval(900, 950, 900, 950, 2), + } + }, + + {"Simple windowing with indels", "200=100I300=50D450=", 0, 0, + { + {100, 500, 0}, + {500, 700, 1}, + {700, 800, 2}, + {900, 950, 3}, + }, + { + racon::WindowInterval(100, 600, 100, 500, 0), + racon::WindowInterval(600, 750, 550, 700, 1), + racon::WindowInterval(750, 850, 700, 800, 2), + racon::WindowInterval(950, 1000, 900, 950, 3), + } + }, + + {"Intra-window alignments", "100=", 0, 250, + { + {0, 200, 0}, + {200, 400, 1}, + {400, 600, 2}, + {600, 800, 3}, + {800, 1000, 4}, + }, + { + racon::WindowInterval(0, 100, 250, 350, 1), + } + }, + + {"Cross-window alignments", "200=", 0, 250, + { + {0, 200, 0}, + {200, 400, 1}, + {400, 600, 2}, + {600, 800, 3}, + {800, 1000, 4}, + }, + { + racon::WindowInterval(0, 150, 250, 400, 1), + racon::WindowInterval(150, 200, 400, 450, 2), + } + }, + + {"Flanking insertions", "10I180=10I", 0, 200, + { + {0, 200, 0}, + {200, 400, 1}, + {400, 600, 2}, + {600, 800, 3}, + {800, 1000, 4}, + }, + { + racon::WindowInterval(10, 190, 200, 380, 1), + } + }, + + {"Flanking insertions", "10D180=10D", 0, 200, + { + {0, 200, 0}, + {200, 400, 1}, + {400, 600, 2}, + {600, 800, 3}, + {800, 1000, 4}, + }, + { + racon::WindowInterval(0, 180, 210, 390, 1), + } + }, + + {"Alignment outside of the windows, front", "50=", 0, 50, + { + {200, 400, 0}, + {400, 600, 1}, + {600, 800, 2}, + {800, 1000, 3}, + }, + { } + }, + + {"Alignment outside of the windows, back", "50=", 0, 1050, + { + {200, 400, 0}, + {400, 600, 1}, + {600, 800, 2}, + {800, 1000, 3}, + }, + { } + }, + + {"Alignment in between windows (not overlapping)", "50=", 0, 450, + { + {0, 200, 0}, + {200, 400, 1}, + {600, 800, 3}, + {800, 1000, 4}, + }, + { } + }, + + {"Out of order windows, should still work because internal sort", "200=100I300=50D450=", 0, 0, + { + {900, 950, 3}, + {500, 700, 1}, + {100, 500, 0}, + {700, 800, 2}, + }, + { + racon::WindowInterval(100, 600, 100, 500, 0), + racon::WindowInterval(600, 750, 550, 700, 1), + racon::WindowInterval(750, 850, 700, 800, 2), + racon::WindowInterval(950, 1000, 900, 950, 3), + } + }, + + }; + + for (const auto& single_test: test_data) { + const std::string& test_name = std::get<0>(single_test); + const std::string& cigar = std::get<1>(single_test); + int64_t aln_q_start = std::get<2>(single_test); + int64_t aln_t_start = std::get<3>(single_test); + const std::vector>& windows = std::get<4>(single_test); + const std::vector& expected = std::get<5>(single_test); + + SCOPED_TRACE(test_name); + + auto result = racon::generate_window_breakpoints(cigar, aln_q_start, aln_t_start, windows); + + // std::cerr << test_name << "\n"; + // std::cerr << cigar << "\n"; + // for (const auto& val: result) { + // std::cerr << "q_start = " << val.q_start << ", q_end = " << val.q_end + // << ", t_start = " << val.t_start << ", t_end = " << val.t_end + // << ", window_id = " << val.window_id << "\n"; + // // std::cerr << val << "\n"; + // } + // std::cerr << "\n"; + + EXPECT_EQ(expected, result); + } +} From 5d4f542380ce9322096a7a19d84cc9079d87998c Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 00:11:56 +0200 Subject: [PATCH 26/49] The Overlap::find_breaking_points_from_cigar now uses the utility function generate_window_breakpoints. --- src/overlap.cpp | 82 +++++++++++------------------------------------- src/overlap.hpp | 9 ++++-- src/polisher.cpp | 45 ++++++++++++++------------ 3 files changed, 49 insertions(+), 87 deletions(-) diff --git a/src/overlap.cpp b/src/overlap.cpp index d10ccc9..69ed60d 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -212,73 +212,27 @@ void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, ui edlibFreeAlignResult(result); } -void Overlap::find_breaking_points_from_cigar(uint32_t window_length) +void Overlap::find_breaking_points_from_cigar(int64_t window_length) { + std::vector> windows; + // find breaking points from cigar - std::vector window_ends; - uint32_t w_offset = 0; - for (uint32_t i = 0; i < t_end_; i += window_length) { - if (i > t_begin_) { - window_ends.emplace_back(i - 1); - } else { - ++w_offset; - } - } - window_ends.emplace_back(t_end_ - 1); - - uint32_t w = 0; - bool found_first_match = false; - std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; - - int32_t q_ptr = (strand_ ? (q_length_ - q_end_) : q_begin_) - 1; - int32_t t_ptr = t_begin_ - 1; - - for (uint32_t i = 0, j = 0; i < cigar_.size(); ++i) { - if (cigar_[i] == 'M' || cigar_[i] == '=' || cigar_[i] == 'X') { - uint32_t k = 0, num_bases = atoi(&cigar_[j]); - j = i + 1; - while (k < num_bases) { - ++q_ptr; - ++t_ptr; - - if (!found_first_match) { - found_first_match = true; - first_match = std::make_tuple(t_ptr, q_ptr, w + w_offset); - } - last_match = std::make_tuple(t_ptr + 1, q_ptr + 1, w + w_offset); - if (t_ptr == window_ends[w]) { - if (found_first_match) { - breaking_points_.emplace_back(first_match); - breaking_points_.emplace_back(last_match); - } - found_first_match = false; - ++w; - } - - ++k; - } - } else if (cigar_[i] == 'I') { - q_ptr += atoi(&cigar_[j]); - j = i + 1; - } else if (cigar_[i] == 'D' || cigar_[i] == 'N') { - uint32_t k = 0, num_bases = atoi(&cigar_[j]); - j = i + 1; - while (k < num_bases) { - ++t_ptr; - if (t_ptr == window_ends[w]) { - if (found_first_match) { - breaking_points_.emplace_back(first_match); - breaking_points_.emplace_back(last_match); - } - found_first_match = false; - ++w; - } - ++k; - } - } else if (cigar_[i] == 'S' || cigar_[i] == 'H' || cigar_[i] == 'P') { - j = i + 1; - } + int64_t window_id = t_begin_ / window_length; + const int64_t start = window_id * window_length; + const int64_t end = t_end_; + + for (int64_t i = start; i < end; i += window_length, ++window_id) { + windows.emplace_back(std::make_tuple(i, std::min(static_cast(t_length_), i + window_length), window_id)); } + + int64_t q_start = (strand_ ? (q_length_ - q_end_) : q_begin_); + int64_t t_start = t_begin_; + + std::vector result = generate_window_breakpoints( + cigar_, q_start, t_start, + windows); + + std::swap(breaking_points_, result); } } diff --git a/src/overlap.hpp b/src/overlap.hpp index a226b19..a841606 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -6,6 +6,7 @@ #pragma once +#include "util.hpp" #include #include #include @@ -74,7 +75,7 @@ class Overlap { return cigar_; } - const std::vector>& breaking_points() const { + const std::vector& breaking_points() const { return breaking_points_; } @@ -106,7 +107,7 @@ class Overlap { Overlap(); Overlap(const Overlap&) = delete; const Overlap& operator=(const Overlap&) = delete; - virtual void find_breaking_points_from_cigar(uint32_t window_length); + virtual void find_breaking_points_from_cigar(int64_t window_length); virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len); std::string q_name_; @@ -128,8 +129,10 @@ class Overlap { bool is_valid_; bool is_transmuted_; - std::vector> breaking_points_; + // std::vector> breaking_points_; std::vector> dual_breaking_points_; + + std::vector breaking_points_; }; } diff --git a/src/polisher.cpp b/src/polisher.cpp index f6ee6ac..60d32bb 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -554,15 +554,20 @@ void Polisher::create_and_populate_windows(std::vector> ++targets_coverages_[overlaps[i]->t_id()]; const auto& sequence = sequences_[overlaps[i]->q_id()]; - const auto& breaking_points = overlaps[i]->breaking_points(); - - for (uint32_t j = 0; j < breaking_points.size(); j += 2) { - const uint32_t start_curr = std::get<0>(breaking_points[j]); - const uint32_t start_next = std::get<0>(breaking_points[j + 1]); - const uint32_t end_curr = std::get<1>(breaking_points[j]); - const uint32_t end_next = std::get<1>(breaking_points[j + 1]); - - if ((end_next - end_curr) < 0.02 * window_length_) { + const std::vector& breaking_points = overlaps[i]->breaking_points(); + + for (uint32_t j = 0; j < breaking_points.size(); ++j) { + const auto& bp = breaking_points[j]; + // const uint32_t win_t_start = std::get<0>(breaking_points[j]); + // const uint32_t win_t_end = std::get<0>(breaking_points[j + 1]); + // const uint32_t win_q_start = std::get<1>(breaking_points[j]); + // const uint32_t win_q_end = std::get<1>(breaking_points[j + 1]); + const auto& win_t_start = bp.t_start; + const auto& win_t_end = bp.t_end; + const auto& win_q_start = bp.q_start; + const auto& win_q_end = bp.q_end; + + if ((win_q_end - win_q_start) < 0.02 * window_length_) { continue; } @@ -572,10 +577,10 @@ void Polisher::create_and_populate_windows(std::vector> const auto& quality = overlaps[i]->strand() ? sequence->reverse_quality() : sequence->quality(); double average_quality = 0; - for (uint32_t k = end_curr; k < end_next; ++k) { + for (uint32_t k = win_q_start; k < win_q_end; ++k) { average_quality += static_cast(quality[k]) - 33; } - average_quality /= (end_next - end_curr); + average_quality /= (win_q_end - win_q_start); if (average_quality < quality_threshold_) { continue; @@ -583,27 +588,27 @@ void Polisher::create_and_populate_windows(std::vector> } uint64_t window_id = id_to_first_window_id[overlaps[i]->t_id()] + - start_curr / window_length_; - uint32_t window_start = (start_curr / window_length_) * + win_t_start / window_length_; + uint32_t window_start = (win_t_start / window_length_) * window_length_; const char* data = overlaps[i]->strand() ? - &(sequence->reverse_complement()[end_curr]) : - &(sequence->data()[end_curr]); - uint32_t data_length = end_next - end_curr; + &(sequence->reverse_complement()[win_q_start]) : + &(sequence->data()[win_q_start]); + uint32_t data_length = win_q_end - win_q_start; const char* quality = overlaps[i]->strand() ? (sequence->reverse_quality().empty() ? - nullptr : &(sequence->reverse_quality()[end_curr])) + nullptr : &(sequence->reverse_quality()[win_q_start])) : (sequence->quality().empty() ? - nullptr : &(sequence->quality()[end_curr])); + nullptr : &(sequence->quality()[win_q_start])); uint32_t quality_length = quality == nullptr ? 0 : data_length; windows_[window_id]->add_layer(data, data_length, quality, quality_length, - start_curr - window_start, - start_next - window_start - 1); + win_t_start - window_start, + win_t_end - window_start - 1); } overlaps[i].reset(); From 075352377ebe08039bf58a9fd34d9211fc8405ac Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 00:16:18 +0200 Subject: [PATCH 27/49] Added overloaded versions of the Overlap::find_breaking_points_from_cigar and Overlap::find_breaking_points which take a set of windows. --- src/overlap.cpp | 40 +++++++++++++++++++++++++++++++++++++++- src/overlap.hpp | 4 ++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/overlap.cpp b/src/overlap.cpp index 69ed60d..907367c 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -191,6 +191,32 @@ void Overlap::find_breaking_points(const std::vector>& std::string().swap(cigar_); } +void Overlap::find_breaking_points(const std::vector>& sequences, + std::vector> windows) { + + if (!is_transmuted_) { + fprintf(stderr, "[racon::Overlap::find_breaking_points] error: " + "overlap is not transmuted!\n"); + exit(1); + } + + if (!breaking_points_.empty()) { + return; + } + + if (cigar_.empty()) { + const char* q = !strand_ ? &(sequences[q_id_]->data()[q_begin_]) : + &(sequences[q_id_]->reverse_complement()[q_length_ - q_end_]); + const char* t = &(sequences[t_id_]->data()[t_begin_]); + + align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_); + } + + find_breaking_points_from_cigar(std::move(windows)); + + std::string().swap(cigar_); +} + void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, uint32_t t_length) { // align overlaps with edlib @@ -212,6 +238,18 @@ void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, ui edlibFreeAlignResult(result); } +void Overlap::find_breaking_points_from_cigar(std::vector> windows) +{ + int64_t q_start = (strand_ ? (q_length_ - q_end_) : q_begin_); + int64_t t_start = t_begin_; + + std::vector result = generate_window_breakpoints( + cigar_, q_start, t_start, + std::move(windows)); + + std::swap(breaking_points_, result); +} + void Overlap::find_breaking_points_from_cigar(int64_t window_length) { std::vector> windows; @@ -230,7 +268,7 @@ void Overlap::find_breaking_points_from_cigar(int64_t window_length) std::vector result = generate_window_breakpoints( cigar_, q_start, t_start, - windows); + std::move(windows)); std::swap(breaking_points_, result); } diff --git a/src/overlap.hpp b/src/overlap.hpp index a841606..bb0aa99 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -82,6 +82,9 @@ class Overlap { void find_breaking_points(const std::vector>& sequences, uint32_t window_length); + void find_breaking_points(const std::vector>& sequences, + std::vector> windows); + friend bioparser::MhapParser; friend bioparser::PafParser; friend bioparser::SamParser; @@ -108,6 +111,7 @@ class Overlap { Overlap(const Overlap&) = delete; const Overlap& operator=(const Overlap&) = delete; virtual void find_breaking_points_from_cigar(int64_t window_length); + virtual void find_breaking_points_from_cigar(std::vector> windows); virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len); std::string q_name_; From b221d49f5fdf97e41e498623465eb30bcd842068 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 00:45:28 +0200 Subject: [PATCH 28/49] Implemented the BED-selected polishing in polisher.*. --- src/polisher.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++------ src/polisher.hpp | 4 ++++ 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 60d32bb..0338f3d 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -483,6 +483,8 @@ void Polisher::create_and_populate_windows_with_bed(std::vector id_to_first_window_id(targets_size + 1, -1); + std::unordered_map>> windows; + // Target intervals are sorted ahead of time. for (const auto& it: target_bed_intervals_) { int64_t t_id = it.first; @@ -507,6 +509,7 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorquality()[win_start]), length)); target_window_intervals_[t_id].emplace_back(IntervalInt64(win_start, win_start + length - 1, win_id)); + windows[t_id].emplace_back(win_start, win_start + length, win_id); } } } @@ -517,11 +520,15 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorlog("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::create_and_populate_windows(std::vector>& overlaps, @@ -547,6 +554,13 @@ void Polisher::create_and_populate_windows(std::vector> id_to_first_window_id[i + 1] = id_to_first_window_id[i] + k; } + assign_sequences_to_windows(overlaps, targets_size); + + logger_->log("[racon::Polisher::initialize] transformed data into windows"); +} + +void Polisher::assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size) { + targets_coverages_.resize(targets_size, 0); for (uint64_t i = 0; i < overlaps.size(); ++i) { @@ -587,8 +601,7 @@ void Polisher::create_and_populate_windows(std::vector> } } - uint64_t window_id = id_to_first_window_id[overlaps[i]->t_id()] + - win_t_start / window_length_; + uint64_t window_id = bp.window_id; uint32_t window_start = (win_t_start / window_length_) * window_length_; @@ -614,7 +627,6 @@ void Polisher::create_and_populate_windows(std::vector> overlaps[i].reset(); } - logger_->log("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::find_overlap_breaking_points(std::vector>& overlaps) @@ -641,6 +653,35 @@ void Polisher::find_overlap_breaking_points(std::vector } } +void Polisher::find_overlap_breaking_points(std::vector>& overlaps, + const std::unordered_map>>& windows) +{ + std::vector> thread_futures; + for (uint64_t i = 0; i < overlaps.size(); ++i) { + + thread_futures.emplace_back(thread_pool_->submit( + [&](uint64_t j) -> void { + auto it = windows.find(overlaps[j]->t_id()); + if (it != windows.end()) { + overlaps[j]->find_breaking_points(sequences_, it->second); + } + }, i)); + } + + uint32_t logger_step = thread_futures.size() / 20; + for (uint64_t i = 0; i < thread_futures.size(); ++i) { + thread_futures[i].wait(); + if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) { + logger_->bar("[racon::Polisher::initialize] aligning overlaps"); + } + } + if (logger_step != 0) { + logger_->bar("[racon::Polisher::initialize] aligning overlaps"); + } else { + logger_->log("[racon::Polisher::initialize] aligned overlaps"); + } +} + void Polisher::polish(std::vector>& dst, bool drop_unpolished_sequences) { diff --git a/src/polisher.hpp b/src/polisher.hpp index f0bdde3..3eb9889 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -80,10 +80,14 @@ class Polisher { Polisher(const Polisher&) = delete; const Polisher& operator=(const Polisher&) = delete; virtual void find_overlap_breaking_points(std::vector>& overlaps); + virtual void find_overlap_breaking_points(std::vector>& overlaps, + const std::unordered_map>>& windows); + void create_and_populate_windows(std::vector>& overlaps, uint64_t targets_size, WindowType window_type); void create_and_populate_windows_with_bed(std::vector>& overlaps, uint64_t targets_size, WindowType window_type); + void assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size); std::unique_ptr> sparser_; std::unique_ptr> oparser_; From 1b264b3f58dd7b3ff7fecee8b3256ad310e4a284 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 10:28:11 +0200 Subject: [PATCH 29/49] Now writing the unpolished sequences with no BED windows. --- src/polisher.cpp | 31 +++++++++++++++++++++++++++---- src/polisher.hpp | 2 +- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 0338f3d..356fe63 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -481,7 +481,8 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorlog("Constructing windows for BED regions.\n"); // The -1 marks that the target doesn't have any windows. - std::vector id_to_first_window_id(targets_size + 1, -1); + id_to_first_window_id_.clear(); + id_to_first_window_id_.resize(targets_size + 1, -1); std::unordered_map>> windows; @@ -491,7 +492,7 @@ void Polisher::create_and_populate_windows_with_bed(std::vector& intervals = it.second; // Mark the window start. - id_to_first_window_id[t_id] = static_cast(windows_.size()); + id_to_first_window_id_[t_id] = static_cast(windows_.size()); // Generate windows for each interval separately. uint32_t k = 0; @@ -537,7 +538,9 @@ void Polisher::create_and_populate_windows(std::vector> logger_->log(); - std::vector id_to_first_window_id(targets_size + 1, 0); + id_to_first_window_id_.clear(); + id_to_first_window_id_.resize(targets_size + 1, 0); + for (uint64_t i = 0; i < targets_size; ++i) { uint32_t k = 0; for (uint32_t j = 0; j < sequences_[i]->data().size(); j += window_length_, ++k) { @@ -551,7 +554,7 @@ void Polisher::create_and_populate_windows(std::vector> &(sequences_[i]->quality()[j]), length)); } - id_to_first_window_id[i + 1] = id_to_first_window_id[i] + k; + id_to_first_window_id_[i + 1] = id_to_first_window_id_[i] + k; } assign_sequences_to_windows(overlaps, targets_size); @@ -715,12 +718,14 @@ void Polisher::polish(std::vector>& dst, num_polished_windows += thread_futures[i].get() == true ? 1 : 0; #ifdef BED_FEATURE + // Add the sequence in between windows. if (windows_[i]->start() > prev_window_end) { uint64_t span = windows_[i]->start() - prev_window_end; polished_data += sequences_[windows_[i]->id()]->data().substr(prev_window_end, span); } #endif + // Add the window consensus. polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { @@ -756,6 +761,24 @@ void Polisher::polish(std::vector>& dst, } } + // Write the original sequences if there were no BED windows assigned to them. + // If the BED is used, then some sequences can have 0 windows, and consensus will not + // be generated. + if (use_bed_) { + // There is an extra element because of legacy code. + for (int32_t t_id = 0; t_id < (static_cast(id_to_first_window_id_.size()) - 1); ++t_id) { + if (id_to_first_window_id_[t_id] < 0) { + std::string tags = type_ == PolisherType::kF ? "r" : ""; + tags += " LN:i:" + std::to_string(sequences_[t_id]->data().size()); + tags += " RC:i:" + std::to_string(targets_coverages_[t_id]); + tags += " XC:f:0.0"; + dst.emplace_back(createSequence(sequences_[t_id]->name() + + tags, sequences_[t_id]->data())); + + } + } + } + if (logger_step != 0) { logger_->bar("[racon::Polisher::polish] generating consensus"); } else { diff --git a/src/polisher.hpp b/src/polisher.hpp index 3eb9889..3e57449 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -108,6 +108,7 @@ class Polisher { uint32_t window_length_; std::vector> windows_; + std::vector id_to_first_window_id_; std::unique_ptr thread_pool_; std::unordered_map thread_to_id_; @@ -119,7 +120,6 @@ class Polisher { std::unordered_map> target_window_intervals_; std::unordered_map target_window_trees_; - }; } From d49722f737757c3466825f855d8402620845748d Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:00:02 +0200 Subject: [PATCH 30/49] Updated CMakeLists.txt to compile the new code and tests. --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64c2346..83e79ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,10 +30,12 @@ include_directories(${PROJECT_SOURCE_DIR}/src) set(racon_sources src/main.cpp + src/bed.cpp src/logger.cpp src/polisher.cpp src/overlap.cpp src/sequence.cpp + src/util.cpp src/window.cpp) if(racon_enable_cuda) @@ -101,13 +103,18 @@ if (racon_build_tests) "${PROJECT_BINARY_DIR}/config/racon_test_config.h") include_directories(${PROJECT_BINARY_DIR}/config) include_directories(${PROJECT_SOURCE_DIR}/src) + include_directories(${PROJECT_SOURCE_DIR}/vendor/intervaltree) set(racon_test_sources + test/bed_test.cpp test/racon_test.cpp + test/util_test.cpp + src/bed.cpp src/logger.cpp src/polisher.cpp src/overlap.cpp src/sequence.cpp + src/util.cpp src/window.cpp) if (racon_enable_cuda) From 33359e1284010a33d2c9103a30ba966be14a09d0 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:04:24 +0200 Subject: [PATCH 31/49] Added missing include to src/bed.hpp. --- src/bed.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bed.hpp b/src/bed.hpp index 5737602..bfc0267 100644 --- a/src/bed.hpp +++ b/src/bed.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include From 8d69226fcf236036c5f7340b292bc7bf6332106a Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:11:11 +0200 Subject: [PATCH 32/49] Updated the initializer list in test/bed_test.cpp. --- test/bed_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/bed_test.cpp b/test/bed_test.cpp index a798e9b..14d0d5a 100644 --- a/test/bed_test.cpp +++ b/test/bed_test.cpp @@ -14,7 +14,7 @@ TEST(BedFile, DeserializeTests) { // Tuple: test_name, input line, expected record, expected return value, expected to throw. - std::vector> test_data = { + std::vector> test_data{ {"Empty input", "", racon::BedRecord(), false,false}, {"Three columns", "chr01 0 1000", racon::BedRecord("chr01", 0, 1000), true, false}, {"Multiple columns", "chr01 1000 2000 some other columns that are ignored", racon::BedRecord("chr01", 1000, 2000), true, false}, @@ -49,7 +49,7 @@ TEST(BedFile, DeserializeTests) { TEST(BedReader, AllTests) { // Tuple: test_name, input line, expected record, expected return value. - std::vector, bool>> test_data = { + std::vector, bool>> test_data { {"Empty input", "", {}, false}, {"Normal BED file", R"(chr01 0 1000 From bc79bc2c96b10f148f1f2ce3a162d86f6a881e97 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:11:25 +0200 Subject: [PATCH 33/49] Updated the initializer list in test/util_test.cpp. --- test/util_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/util_test.cpp b/test/util_test.cpp index 6673fad..4a518ea 100644 --- a/test/util_test.cpp +++ b/test/util_test.cpp @@ -16,7 +16,7 @@ TEST(Utils, find_breaking_points_from_cigar) { // Tuple: test_name, CIGAR string, windows as tuples, expected breakpoints split into windows. std::vector>, std::vector>> test_data = { + int64_t, std::vector>, std::vector>> test_data { {"Empty input", "", 0, 0, {}, {}}, {"Simple windowing", "1000M", 0, 0, From c8a6bd292a21f65f832aa7d9693c5dd81122e0ba Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:20:58 +0200 Subject: [PATCH 34/49] Updated the test initializer lists in test/bed_test.cpp. --- test/bed_test.cpp | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/test/bed_test.cpp b/test/bed_test.cpp index 14d0d5a..016bd25 100644 --- a/test/bed_test.cpp +++ b/test/bed_test.cpp @@ -14,11 +14,12 @@ TEST(BedFile, DeserializeTests) { // Tuple: test_name, input line, expected record, expected return value, expected to throw. + using TestTupleType = std::tuple; std::vector> test_data{ - {"Empty input", "", racon::BedRecord(), false,false}, - {"Three columns", "chr01 0 1000", racon::BedRecord("chr01", 0, 1000), true, false}, - {"Multiple columns", "chr01 1000 2000 some other columns that are ignored", racon::BedRecord("chr01", 1000, 2000), true, false}, - {"Invalid BED line", "bad_bed", racon::BedRecord(), true, true}, + TestTupleType("Empty input", "", racon::BedRecord(), false,false), + TestTupleType("Three columns", "chr01 0 1000", racon::BedRecord("chr01", 0, 1000), true, false), + TestTupleType("Multiple columns", "chr01 1000 2000 some other columns that are ignored", racon::BedRecord("chr01", 1000, 2000), true, false), + TestTupleType("Invalid BED line", "bad_bed", racon::BedRecord(), true, true), }; for (const auto& single_test: test_data) { @@ -49,26 +50,30 @@ TEST(BedFile, DeserializeTests) { TEST(BedReader, AllTests) { // Tuple: test_name, input line, expected record, expected return value. + using TestTupleType = std::tuple, bool>; std::vector, bool>> test_data { - {"Empty input", "", {}, false}, - {"Normal BED file", -R"(chr01 0 1000 + TestTupleType("Empty input", "", {}, false), + + TestTupleType( + "Normal BED file", + R"(chr01 0 1000 chr02 1000 2000 chr03 2000 3000 )", - { + std::vector{ racon::BedRecord("chr01", 0, 1000), racon::BedRecord("chr02", 1000, 2000), racon::BedRecord("chr03", 2000, 3000), }, false - }, - {"Normal BED file", -R"(chr01 0 1000 + ), + TestTupleType( + "Normal BED file", + R"(chr01 0 1000 bad_line )", - {}, true - }, + std::vector(), true + ), }; From 1c91dd194c011208a2d5d0f025b35025dd178070 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:29:10 +0200 Subject: [PATCH 35/49] Updated the initializer lists in test/util_test.cpp. --- test/util_test.cpp | 194 +++++++++++++++++++++++---------------------- 1 file changed, 99 insertions(+), 95 deletions(-) diff --git a/test/util_test.cpp b/test/util_test.cpp index 4a518ea..33e9178 100644 --- a/test/util_test.cpp +++ b/test/util_test.cpp @@ -15,135 +15,139 @@ TEST(Utils, find_breaking_points_from_cigar) { // Tuple: test_name, CIGAR string, windows as tuples, expected breakpoints split into windows. + using TestTupleType = std::tuple>, + std::vector>; + using TestWindowTuple = std::tuple; std::vector>, std::vector>> test_data { - {"Empty input", "", 0, 0, {}, {}}, - - {"Simple windowing", "1000M", 0, 0, - { - {100, 500, 0}, - {700, 800, 1}, - {900, 950, 2}, + int64_t, std::vector, std::vector>> test_data { + TestTupleType("Empty input", "", 0, 0, std::vector{}, std::vector{}), + + TestTupleType("Simple windowing", "1000M", 0, 0, + std::vector{ + TestWindowTuple{100, 500, 0}, + TestWindowTuple{700, 800, 1}, + TestWindowTuple{900, 950, 2}, }, - { + std::vector{ racon::WindowInterval(100, 500, 100, 500, 0), racon::WindowInterval(700, 800, 700, 800, 1), racon::WindowInterval(900, 950, 900, 950, 2), } - }, - - {"Simple windowing with indels", "200=100I300=50D450=", 0, 0, - { - {100, 500, 0}, - {500, 700, 1}, - {700, 800, 2}, - {900, 950, 3}, + ), + + TestTupleType("Simple windowing with indels", "200=100I300=50D450=", 0, 0, + std::vector{ + TestWindowTuple{100, 500, 0}, + TestWindowTuple{500, 700, 1}, + TestWindowTuple{700, 800, 2}, + TestWindowTuple{900, 950, 3}, }, - { + std::vector{ racon::WindowInterval(100, 600, 100, 500, 0), racon::WindowInterval(600, 750, 550, 700, 1), racon::WindowInterval(750, 850, 700, 800, 2), racon::WindowInterval(950, 1000, 900, 950, 3), } - }, - - {"Intra-window alignments", "100=", 0, 250, - { - {0, 200, 0}, - {200, 400, 1}, - {400, 600, 2}, - {600, 800, 3}, - {800, 1000, 4}, + ), + + TestTupleType("Intra-window alignments", "100=", 0, 250, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, }, - { + std::vector{ racon::WindowInterval(0, 100, 250, 350, 1), } - }, - - {"Cross-window alignments", "200=", 0, 250, - { - {0, 200, 0}, - {200, 400, 1}, - {400, 600, 2}, - {600, 800, 3}, - {800, 1000, 4}, + ), + + TestTupleType("Cross-window alignments", "200=", 0, 250, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, }, - { + std::vector{ racon::WindowInterval(0, 150, 250, 400, 1), racon::WindowInterval(150, 200, 400, 450, 2), } - }, - - {"Flanking insertions", "10I180=10I", 0, 200, - { - {0, 200, 0}, - {200, 400, 1}, - {400, 600, 2}, - {600, 800, 3}, - {800, 1000, 4}, + ), + + TestTupleType("Flanking insertions", "10I180=10I", 0, 200, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, }, - { + std::vector{ racon::WindowInterval(10, 190, 200, 380, 1), } - }, - - {"Flanking insertions", "10D180=10D", 0, 200, - { - {0, 200, 0}, - {200, 400, 1}, - {400, 600, 2}, - {600, 800, 3}, - {800, 1000, 4}, + ), + + TestTupleType("Flanking insertions", "10D180=10D", 0, 200, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{400, 600, 2}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, }, - { + std::vector{ racon::WindowInterval(0, 180, 210, 390, 1), } - }, - - {"Alignment outside of the windows, front", "50=", 0, 50, - { - {200, 400, 0}, - {400, 600, 1}, - {600, 800, 2}, - {800, 1000, 3}, + ), + + TestTupleType("Alignment outside of the windows, front", "50=", 0, 50, + std::vector{ + TestWindowTuple{200, 400, 0}, + TestWindowTuple{400, 600, 1}, + TestWindowTuple{600, 800, 2}, + TestWindowTuple{800, 1000, 3}, }, - { } - }, - - {"Alignment outside of the windows, back", "50=", 0, 1050, - { - {200, 400, 0}, - {400, 600, 1}, - {600, 800, 2}, - {800, 1000, 3}, + std::vector{ } + ), + + TestTupleType("Alignment outside of the windows, back", "50=", 0, 1050, + std::vector{ + TestWindowTuple{200, 400, 0}, + TestWindowTuple{400, 600, 1}, + TestWindowTuple{600, 800, 2}, + TestWindowTuple{800, 1000, 3}, }, - { } - }, - - {"Alignment in between windows (not overlapping)", "50=", 0, 450, - { - {0, 200, 0}, - {200, 400, 1}, - {600, 800, 3}, - {800, 1000, 4}, + std::vector{ } + ), + + TestTupleType("Alignment in between windows (not overlapping)", "50=", 0, 450, + std::vector{ + TestWindowTuple{0, 200, 0}, + TestWindowTuple{200, 400, 1}, + TestWindowTuple{600, 800, 3}, + TestWindowTuple{800, 1000, 4}, }, - { } - }, - - {"Out of order windows, should still work because internal sort", "200=100I300=50D450=", 0, 0, - { - {900, 950, 3}, - {500, 700, 1}, - {100, 500, 0}, - {700, 800, 2}, + std::vector{ } + ), + + TestTupleType("Out of order windows, should still work because internal sort", "200=100I300=50D450=", 0, 0, + std::vector{ + TestWindowTuple{900, 950, 3}, + TestWindowTuple{500, 700, 1}, + TestWindowTuple{100, 500, 0}, + TestWindowTuple{700, 800, 2}, }, - { + std::vector{ racon::WindowInterval(100, 600, 100, 500, 0), racon::WindowInterval(600, 750, 550, 700, 1), racon::WindowInterval(750, 850, 700, 800, 2), racon::WindowInterval(950, 1000, 900, 950, 3), } - }, + ), }; From c499f5da662fa20b79bcee2c8f8cac9f24ba0fe6 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:34:45 +0200 Subject: [PATCH 36/49] Updated the initializer lists in src/bed.cpp. --- src/bed.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bed.cpp b/src/bed.cpp index e46dab0..3323eb3 100644 --- a/src/bed.cpp +++ b/src/bed.cpp @@ -38,12 +38,12 @@ std::string BedFile::Serialize(const BedRecord& record) { BedReader::BedReader(const std::string& in_fn) : file_{std::unique_ptr(new std::ifstream(in_fn))} - , in_{*file_.get()} + , in_(*file_.get()) { } BedReader::BedReader(std::istream& in) - : in_{in} + : in_(in) { } From c6f1507ce1b511e89c4d828c564159d17cf1827f Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 11:45:41 +0200 Subject: [PATCH 37/49] Updated the initializer lists in src/util.cpp --- src/util.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util.cpp b/src/util.cpp index 5b7bed5..3b64e24 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -41,7 +41,8 @@ std::vector generate_window_breakpoints( int64_t num_windows = static_cast(windows.size()); bool found_first_match = false; - std::tuple first_match = {0, 0, 0}, last_match = {0, 0, 0}; + std::tuple first_match(0, 0, 0); + std::tuple last_match(0, 0, 0); // The "-1" is because of the while loops below (increment is done at the top). int64_t q_pos = q_start - 1; From 4778adf216b8b7e46714a2cac8d5c4386cece035 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 12:46:44 +0200 Subject: [PATCH 38/49] Fixed the operator<< for WindowInterval in src/util.hpp. --- src/util.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/util.hpp b/src/util.hpp index 4a2349e..d67f246 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -38,12 +38,7 @@ class WindowInterval { && t_end == rhs.t_end && window_id == rhs.window_id; } - std::ostream& operator<<(std::ostream& os) const { - os << "q_start = " << q_start << ", q_end = " << q_end - << ", t_start = " << t_start << "t_end = " << t_end - << "window_id = " << window_id; - return os; - } + friend std::ostream& operator<<(::std::ostream& os, const WindowInterval& a); int64_t q_start = 0; int64_t q_end = 0; @@ -52,6 +47,13 @@ class WindowInterval { int64_t window_id = -1; }; +inline std::ostream& operator<<(::std::ostream& os, const WindowInterval& a) { + os << "q_start = " << a.q_start << ", q_end = " << a.q_end + << ", t_start = " << a.t_start << ", t_end = " << a.t_end + << ", window_id = " << a.window_id; + return os; +} + template bool transmuteId(const std::unordered_map& t_to_id, const T& t, uint64_t& id) { From bd6fc43245f2252fdb451d9ca81912e284bdeb95 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 12:43:38 +0200 Subject: [PATCH 39/49] Added operator<< to Overlap in src/overlap.hpp. --- src/overlap.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/overlap.hpp b/src/overlap.hpp index bb0aa99..3fee6e5 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -89,6 +89,8 @@ class Overlap { friend bioparser::PafParser; friend bioparser::SamParser; + friend std::ostream& operator<<(::std::ostream& os, const Overlap& a); + #ifdef CUDA_ENABLED friend class CUDABatchAligner; #endif @@ -139,4 +141,13 @@ class Overlap { std::vector breaking_points_; }; +inline std::ostream& operator<<(::std::ostream& os, const Overlap& a) { + std::string delim(" "); + os << a.q_id_ << delim << a.q_begin_ << delim << a.q_end_ << delim << a.q_length_ + << delim << (a.strand_ ? "-" : "+") + << delim << a.t_id_ << delim << a.t_begin_ << delim << a.t_end_ << delim << a.t_length_ + << delim << a.cigar_ << delim << a.is_valid_ << delim << a.is_transmuted_; + return os; +} + } From 4012f3945a8544cc137d2265ea5d609f3cd52450 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 12:48:39 +0200 Subject: [PATCH 40/49] Removed the legacy windowing code in src/polisher.cpp, it was buggy in the new context (the window_id was wrong). Cleanup of unused legacy code in src/polisher.hpp. --- src/polisher.cpp | 98 +++++++++++++++--------------------------------- src/polisher.hpp | 3 -- 2 files changed, 30 insertions(+), 71 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 356fe63..041b8b3 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -299,19 +299,28 @@ void Polisher::initialize() { - ///////////////////////////////// - // Collect the intervals. + ////////////////////////////////// + /// Collect the BED intervals. /// + ////////////////////////////////// logger_->log("[racon::Polisher::initialize] building the interval trees"); logger_->log(); target_bed_intervals_.clear(); - for (size_t i = 0; i < bed_records_.size(); ++i) { - const auto& record = bed_records_[i]; - uint64_t t_id = 0; - if (!transmuteId(name_to_id, record.chrom() + "t", t_id)) { - throw std::runtime_error("Target sequence '" + record.chrom() + - "' specified in the BED file was not found among the target sequences."); + // Collect target intervals of interest. + // If the BED file is not specified, construct windows covering full spans of targets. + if (use_bed_) { + for (size_t i = 0; i < bed_records_.size(); ++i) { + const auto& record = bed_records_[i]; + uint64_t t_id = 0; + if (!transmuteId(name_to_id, record.chrom() + "t", t_id)) { + throw std::runtime_error("Target sequence '" + record.chrom() + + "' specified in the BED file was not found among the target sequences."); + } + target_bed_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); + } + } else { + for (uint64_t t_id = 0; t_id < targets_size; ++t_id) { + target_bed_intervals_[t_id].emplace_back(IntervalInt64(0, sequences_[t_id]->data().size(), -1)); } - target_bed_intervals_[t_id].emplace_back(IntervalInt64(record.chrom_start(), record.chrom_end() - 1, i)); } // Sort target intervals. for (auto& it: target_bed_intervals_) { @@ -452,11 +461,7 @@ void Polisher::initialize() { it.wait(); } - if (use_bed_) { - create_and_populate_windows_with_bed(overlaps, targets_size, window_type); - } else { - create_and_populate_windows(overlaps, targets_size, window_type); - } + create_and_populate_windows_with_bed(overlaps, targets_size, window_type); // #ifdef BED_FEATURE_TEST // int32_t shift = 0; @@ -532,36 +537,6 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorlog("[racon::Polisher::initialize] transformed data into windows"); } -void Polisher::create_and_populate_windows(std::vector>& overlaps, - uint64_t targets_size, WindowType window_type) { - find_overlap_breaking_points(overlaps); - - logger_->log(); - - id_to_first_window_id_.clear(); - id_to_first_window_id_.resize(targets_size + 1, 0); - - for (uint64_t i = 0; i < targets_size; ++i) { - uint32_t k = 0; - for (uint32_t j = 0; j < sequences_[i]->data().size(); j += window_length_, ++k) { - - uint32_t length = std::min(j + window_length_, - static_cast(sequences_[i]->data().size())) - j; - - windows_.emplace_back(createWindow(i, k, window_type, j, - &(sequences_[i]->data()[j]), length, - sequences_[i]->quality().empty() ? &(dummy_quality_[0]) : - &(sequences_[i]->quality()[j]), length)); - } - - id_to_first_window_id_[i + 1] = id_to_first_window_id_[i] + k; - } - - assign_sequences_to_windows(overlaps, targets_size); - - logger_->log("[racon::Polisher::initialize] transformed data into windows"); -} - void Polisher::assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size) { targets_coverages_.resize(targets_size, 0); @@ -573,6 +548,14 @@ void Polisher::assign_sequences_to_windows(std::vector> const auto& sequence = sequences_[overlaps[i]->q_id()]; const std::vector& breaking_points = overlaps[i]->breaking_points(); + // std::cerr << "overlap_id = " << i << "\n"; + // std::cerr << " " << *overlaps[i] << "\n"; + // std::cerr << "All breaking points:\n"; + // for (uint32_t j = 0; j < breaking_points.size(); ++j) { + // const auto& bp = breaking_points[j]; + // std::cerr << "[j = " << j << "] bp = " << bp << "\n"; + // } + for (uint32_t j = 0; j < breaking_points.size(); ++j) { const auto& bp = breaking_points[j]; // const uint32_t win_t_start = std::get<0>(breaking_points[j]); @@ -621,41 +604,20 @@ void Polisher::assign_sequences_to_windows(std::vector> nullptr : &(sequence->quality()[win_q_start])); uint32_t quality_length = quality == nullptr ? 0 : data_length; + // std::cerr << "[j = " << j << "] Adding layer for bp = " << bp << "\n"; + windows_[window_id]->add_layer(data, data_length, quality, quality_length, win_t_start - window_start, win_t_end - window_start - 1); } + // std::cerr << "\n"; overlaps[i].reset(); } } -void Polisher::find_overlap_breaking_points(std::vector>& overlaps) -{ - std::vector> thread_futures; - for (uint64_t i = 0; i < overlaps.size(); ++i) { - thread_futures.emplace_back(thread_pool_->submit( - [&](uint64_t j) -> void { - overlaps[j]->find_breaking_points(sequences_, window_length_); - }, i)); - } - - uint32_t logger_step = thread_futures.size() / 20; - for (uint64_t i = 0; i < thread_futures.size(); ++i) { - thread_futures[i].wait(); - if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) { - logger_->bar("[racon::Polisher::initialize] aligning overlaps"); - } - } - if (logger_step != 0) { - logger_->bar("[racon::Polisher::initialize] aligning overlaps"); - } else { - logger_->log("[racon::Polisher::initialize] aligned overlaps"); - } -} - void Polisher::find_overlap_breaking_points(std::vector>& overlaps, const std::unordered_map>>& windows) { diff --git a/src/polisher.hpp b/src/polisher.hpp index 3e57449..ca3a227 100644 --- a/src/polisher.hpp +++ b/src/polisher.hpp @@ -79,12 +79,9 @@ class Polisher { uint32_t num_threads); Polisher(const Polisher&) = delete; const Polisher& operator=(const Polisher&) = delete; - virtual void find_overlap_breaking_points(std::vector>& overlaps); virtual void find_overlap_breaking_points(std::vector>& overlaps, const std::unordered_map>>& windows); - void create_and_populate_windows(std::vector>& overlaps, - uint64_t targets_size, WindowType window_type); void create_and_populate_windows_with_bed(std::vector>& overlaps, uint64_t targets_size, WindowType window_type); void assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size); From 14eb18a827e00037524a1f80695a0d7ec9f9de3a Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 12:58:58 +0200 Subject: [PATCH 41/49] Removed the deprecated code from ../src/overlap.hpp and ../src/overlap.cpp. --- src/overlap.cpp | 49 ------------------------------------------------- src/overlap.hpp | 4 ---- 2 files changed, 53 deletions(-) diff --git a/src/overlap.cpp b/src/overlap.cpp index 907367c..8a22881 100644 --- a/src/overlap.cpp +++ b/src/overlap.cpp @@ -165,32 +165,6 @@ void Overlap::transmute(const std::vector>& sequences, is_transmuted_ = true; } -void Overlap::find_breaking_points(const std::vector>& sequences, - uint32_t window_length) { - - if (!is_transmuted_) { - fprintf(stderr, "[racon::Overlap::find_breaking_points] error: " - "overlap is not transmuted!\n"); - exit(1); - } - - if (!breaking_points_.empty()) { - return; - } - - if (cigar_.empty()) { - const char* q = !strand_ ? &(sequences[q_id_]->data()[q_begin_]) : - &(sequences[q_id_]->reverse_complement()[q_length_ - q_end_]); - const char* t = &(sequences[t_id_]->data()[t_begin_]); - - align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_); - } - - find_breaking_points_from_cigar(window_length); - - std::string().swap(cigar_); -} - void Overlap::find_breaking_points(const std::vector>& sequences, std::vector> windows) { @@ -250,27 +224,4 @@ void Overlap::find_breaking_points_from_cigar(std::vector> windows; - - // find breaking points from cigar - int64_t window_id = t_begin_ / window_length; - const int64_t start = window_id * window_length; - const int64_t end = t_end_; - - for (int64_t i = start; i < end; i += window_length, ++window_id) { - windows.emplace_back(std::make_tuple(i, std::min(static_cast(t_length_), i + window_length), window_id)); - } - - int64_t q_start = (strand_ ? (q_length_ - q_end_) : q_begin_); - int64_t t_start = t_begin_; - - std::vector result = generate_window_breakpoints( - cigar_, q_start, t_start, - std::move(windows)); - - std::swap(breaking_points_, result); -} - } diff --git a/src/overlap.hpp b/src/overlap.hpp index 3fee6e5..a7a490a 100644 --- a/src/overlap.hpp +++ b/src/overlap.hpp @@ -79,9 +79,6 @@ class Overlap { return breaking_points_; } - void find_breaking_points(const std::vector>& sequences, - uint32_t window_length); - void find_breaking_points(const std::vector>& sequences, std::vector> windows); @@ -112,7 +109,6 @@ class Overlap { Overlap(); Overlap(const Overlap&) = delete; const Overlap& operator=(const Overlap&) = delete; - virtual void find_breaking_points_from_cigar(int64_t window_length); virtual void find_breaking_points_from_cigar(std::vector> windows); virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len); From b52d1bfa44be8fc42b8bbbcf9b80c25f3d876dae Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 13:07:46 +0200 Subject: [PATCH 42/49] Trying to fix an off-by-1 error. --- src/polisher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 041b8b3..0bab65c 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -515,7 +515,7 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorquality()[win_start]), length)); target_window_intervals_[t_id].emplace_back(IntervalInt64(win_start, win_start + length - 1, win_id)); - windows[t_id].emplace_back(win_start, win_start + length, win_id); + windows[t_id].emplace_back(win_start, win_start + length - 1, win_id); } } } From ea576bc33dd7daaea836814f31bf2a9139fab7bc Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 13:31:44 +0200 Subject: [PATCH 43/49] Reverted the off-by-1 in src/polisher.cpp. --- src/polisher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 0bab65c..041b8b3 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -515,7 +515,7 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorquality()[win_start]), length)); target_window_intervals_[t_id].emplace_back(IntervalInt64(win_start, win_start + length - 1, win_id)); - windows[t_id].emplace_back(win_start, win_start + length - 1, win_id); + windows[t_id].emplace_back(win_start, win_start + length, win_id); } } } From 99f35a332f029108897e91268fc8943016253703 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 17:11:48 +0200 Subject: [PATCH 44/49] Fixed a bug in src/polisher.cpp window creation when BED is not used - the last window had an extra base. Interestingly, ASAN didn't complain. Also, fixed an off-by-1 error in src/util.cpp when predecessor windows were being skipped. --- src/polisher.cpp | 2 +- src/util.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 041b8b3..f72455f 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -319,7 +319,7 @@ void Polisher::initialize() { } } else { for (uint64_t t_id = 0; t_id < targets_size; ++t_id) { - target_bed_intervals_[t_id].emplace_back(IntervalInt64(0, sequences_[t_id]->data().size(), -1)); + target_bed_intervals_[t_id].emplace_back(IntervalInt64(0, static_cast(sequences_[t_id]->data().size()) - 1, -1)); } } // Sort target intervals. diff --git a/src/util.cpp b/src/util.cpp index 3b64e24..b570611 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -49,7 +49,7 @@ std::vector generate_window_breakpoints( int64_t t_pos = t_start - 1; int64_t curr_w = 0; - while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_start) { + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_start) { ++curr_w; } @@ -61,7 +61,7 @@ std::vector generate_window_breakpoints( ++q_pos; ++t_pos; - while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_pos) { + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_pos) { ++curr_w; } if (curr_w >= num_windows) { @@ -107,7 +107,7 @@ std::vector generate_window_breakpoints( for (uint32_t k = 0; k < num_bases; ++k) { ++t_pos; - while (curr_w < num_windows && std::get<1>(windows[curr_w]) < t_pos) { + while (curr_w < num_windows && std::get<1>(windows[curr_w]) <= t_pos) { ++curr_w; } if (curr_w >= num_windows) { From 8e5c80262da2fe114ec4dc705908f926c432b2c9 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Fri, 26 Jun 2020 17:16:18 +0200 Subject: [PATCH 45/49] Minor change in logging. --- src/polisher.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index f72455f..0e79b39 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -461,8 +461,12 @@ void Polisher::initialize() { it.wait(); } + logger_->log("[racon::Polisher::initialize] Constructing windows for BED regions.\n"); + create_and_populate_windows_with_bed(overlaps, targets_size, window_type); + logger_->log("[racon::Polisher::initialize] transformed data into windows"); + // #ifdef BED_FEATURE_TEST // int32_t shift = 0; // for (int32_t i = 0; i < static_cast(windows_.size()); ++i) { @@ -483,8 +487,6 @@ void Polisher::initialize() { void Polisher::create_and_populate_windows_with_bed(std::vector>& overlaps, uint64_t targets_size, WindowType window_type) { - logger_->log("Constructing windows for BED regions.\n"); - // The -1 marks that the target doesn't have any windows. id_to_first_window_id_.clear(); id_to_first_window_id_.resize(targets_size + 1, -1); @@ -533,8 +535,6 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorlog("[racon::Polisher::initialize] transformed data into windows"); } void Polisher::assign_sequences_to_windows(std::vector>& overlaps, uint64_t targets_size) { From f633ad0b5fd2d2fdf5d331a3a133669b8735fcf8 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 30 Jun 2020 13:10:01 +0200 Subject: [PATCH 46/49] Fixed the include for the IntervalTree library in CMakeLists.txt. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83e79ea..f1869f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ if(racon_enable_cuda) endif() include_directories(${PROJECT_SOURCE_DIR}/src) +include_directories(${PROJECT_SOURCE_DIR}/vendor/intervaltree) set(racon_sources src/main.cpp @@ -103,7 +104,6 @@ if (racon_build_tests) "${PROJECT_BINARY_DIR}/config/racon_test_config.h") include_directories(${PROJECT_BINARY_DIR}/config) include_directories(${PROJECT_SOURCE_DIR}/src) - include_directories(${PROJECT_SOURCE_DIR}/vendor/intervaltree) set(racon_test_sources test/bed_test.cpp From 766b6fac61fed168c7fdf3d631ab8d800ef7f418 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Mon, 13 Jul 2020 13:51:45 +0200 Subject: [PATCH 47/49] Minor bugfix with the windowing - the window start offset was still being calculated via division, while the windowing may actually begin at an arbitrary coordinate. --- src/polisher.cpp | 32 +++++++++++++++++++++++++++----- src/window.cpp | 3 ++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 0e79b39..7099d2b 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -528,12 +528,35 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorq_id()]; + const std::vector& breaking_points = overlaps[i]->breaking_points(); + + std::cerr << "overlap_id = " << i << "\n"; + std::cerr << " " << *overlaps[i] << "\n"; + std::cerr << "All breaking points:\n"; + for (uint32_t j = 0; j < breaking_points.size(); ++j) { + const auto& bp = breaking_points[j]; + std::cerr << "[j = " << j << "] bp = " << bp << ", Window: " << *windows_[bp.window_id] << "\n"; + if (bp.t_start < windows_[bp.window_id]->start() || bp.t_start >= windows_[bp.window_id]->end() || + bp.t_end < windows_[bp.window_id]->start() || bp.t_end > windows_[bp.window_id]->end()) { + std::cerr << "ERROR! Coordiantes out of bounds!\n"; + exit(1); + } + } + std::cerr << "\n"; + } +#endif + assign_sequences_to_windows(overlaps, targets_size); } @@ -588,8 +611,7 @@ void Polisher::assign_sequences_to_windows(std::vector> } uint64_t window_id = bp.window_id; - uint32_t window_start = (win_t_start / window_length_) * - window_length_; + uint32_t window_start = windows_[bp.window_id]->start(); const char* data = overlaps[i]->strand() ? &(sequence->reverse_complement()[win_q_start]) : diff --git a/src/window.cpp b/src/window.cpp index 75079cc..0794331 100644 --- a/src/window.cpp +++ b/src/window.cpp @@ -55,7 +55,8 @@ void Window::add_layer(const char* sequence, uint32_t sequence_length, } if (begin >= end || begin > sequences_.front().second || end > sequences_.front().second) { fprintf(stderr, "[racon::Window::add_layer] error: " - "layer begin and end positions are invalid!\n"); + "layer begin and end positions are invalid! begin = %lu, end = %lu, backbone_len = %lu\n", + begin, end, sequences_.front().second); exit(1); } From 8a1f6bc64ee304f54856eeea4bf69e8d3ba3d015 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 8 Sep 2020 20:13:22 +0200 Subject: [PATCH 48/49] Minor cleanup in src/polisher.cpp. --- src/polisher.cpp | 59 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/src/polisher.cpp b/src/polisher.cpp index 7099d2b..1a08e41 100644 --- a/src/polisher.cpp +++ b/src/polisher.cpp @@ -21,7 +21,6 @@ #include "thread_pool/thread_pool.hpp" #include "spoa/spoa.hpp" -#define BED_FEATURE // #define BED_FEATURE_TEST namespace racon { @@ -528,34 +527,34 @@ void Polisher::create_and_populate_windows_with_bed(std::vectorq_id()]; - const std::vector& breaking_points = overlaps[i]->breaking_points(); - - std::cerr << "overlap_id = " << i << "\n"; - std::cerr << " " << *overlaps[i] << "\n"; - std::cerr << "All breaking points:\n"; - for (uint32_t j = 0; j < breaking_points.size(); ++j) { - const auto& bp = breaking_points[j]; - std::cerr << "[j = " << j << "] bp = " << bp << ", Window: " << *windows_[bp.window_id] << "\n"; - if (bp.t_start < windows_[bp.window_id]->start() || bp.t_start >= windows_[bp.window_id]->end() || - bp.t_end < windows_[bp.window_id]->start() || bp.t_end > windows_[bp.window_id]->end()) { - std::cerr << "ERROR! Coordiantes out of bounds!\n"; - exit(1); - } - } - std::cerr << "\n"; - } -#endif +// #ifdef BED_FEATURE_TEST +// for (uint64_t i = 0; i < overlaps.size(); ++i) { +// const auto& sequence = sequences_[overlaps[i]->q_id()]; +// const std::vector& breaking_points = overlaps[i]->breaking_points(); + +// std::cerr << "overlap_id = " << i << "\n"; +// std::cerr << " " << *overlaps[i] << "\n"; +// std::cerr << "All breaking points:\n"; +// for (uint32_t j = 0; j < breaking_points.size(); ++j) { +// const auto& bp = breaking_points[j]; +// std::cerr << "[j = " << j << "] bp = " << bp << ", Window: " << *windows_[bp.window_id] << "\n"; +// if (bp.t_start < windows_[bp.window_id]->start() || bp.t_start >= windows_[bp.window_id]->end() || +// bp.t_end < windows_[bp.window_id]->start() || bp.t_end > windows_[bp.window_id]->end()) { +// std::cerr << "ERROR! Coordiantes out of bounds!\n"; +// exit(1); +// } +// } +// std::cerr << "\n"; +// } +// #endif assign_sequences_to_windows(overlaps, targets_size); } @@ -701,26 +700,22 @@ void Polisher::polish(std::vector>& dst, num_polished_windows += thread_futures[i].get() == true ? 1 : 0; -#ifdef BED_FEATURE - // Add the sequence in between windows. + // BED region related: Add the sequence in between windows. if (windows_[i]->start() > prev_window_end) { uint64_t span = windows_[i]->start() - prev_window_end; polished_data += sequences_[windows_[i]->id()]->data().substr(prev_window_end, span); } -#endif // Add the window consensus. polished_data += windows_[i]->consensus(); if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) { -#ifdef BED_FEATURE - // Append the remaining suffix from the last window to the end of the target. + // BED region related: Append the remaining suffix from the last window to the end of the target. uint32_t tlen = sequences_[windows_[i]->id()]->data().size(); if (windows_[i]->end() < tlen) { uint64_t suffix_start = windows_[i]->end(); polished_data += sequences_[windows_[i]->id()]->data().substr(suffix_start); } -#endif double polished_ratio = num_polished_windows / static_cast(windows_[i]->rank() + 1); From aef75a9a6b14a54845cade0395816a6a2eb992bd Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Tue, 8 Sep 2020 19:32:05 +0200 Subject: [PATCH 49/49] Version v1.5.0. --- CMakeLists.txt | 2 +- meson.build | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1869f3..ad1b55c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) project(racon) -set(racon_version 1.4.17) +set(racon_version 1.5.0) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) diff --git a/meson.build b/meson.build index 6d7e10a..a71bd47 100644 --- a/meson.build +++ b/meson.build @@ -1,7 +1,7 @@ project( 'Racon', 'cpp', - version : '1.4.13', + version : '1.5.0', default_options : [ 'buildtype=release', 'warning_level=3',