From 7693c5794d6f5a286b7dc341a1c471afe13f0b0d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Feb 2026 10:51:19 -0500 Subject: [PATCH 1/5] Add augref module for computing augmented reference path covers Adds augref.cpp/hpp implementing AugRefCover, which computes augmented reference paths (altpaths) that cover haplotype-only regions of the graph. Integrates into vg paths with --compute-augref, --min-augref-len, --augref-sample, and --augref-segs options. Includes test GFA fixtures for dangling nodes and star allele clusters. Co-Authored-By: Claude Opus 4.6 --- src/augref.cpp | 1355 ++++++++++++++++++++++++++ src/augref.hpp | 223 +++++ src/subcommand/paths_main.cpp | 123 ++- test/nesting/dangling_node.gfa | 14 + test/nesting/star_allele_cluster.gfa | 19 + test/nesting/star_cluster.gfa | 21 + test/t/11_vg_paths.t | 60 +- 7 files changed, 1807 insertions(+), 8 deletions(-) create mode 100644 src/augref.cpp create mode 100644 src/augref.hpp create mode 100644 test/nesting/dangling_node.gfa create mode 100644 test/nesting/star_allele_cluster.gfa create mode 100644 test/nesting/star_cluster.gfa diff --git a/src/augref.cpp b/src/augref.cpp new file mode 100644 index 0000000000..c6b207537d --- /dev/null +++ b/src/augref.cpp @@ -0,0 +1,1355 @@ +#include "augref.hpp" +#include +#include +#include +#include + +//#define debug + +namespace vg { + +using namespace std; + +const string AugRefCover::augref_suffix = "_alt"; + +string AugRefCover::make_augref_name(const string& base_path_name, int64_t augref_index) { + // New naming convention: {base}_{N}_alt + return base_path_name + "_" + to_string(augref_index) + augref_suffix; +} + +bool AugRefCover::is_augref_name(const string& path_name) { + // Check for pattern: _{digits}_alt at the end + // Must end with "_alt" + if (path_name.length() < 6) { // minimum: "x_1_alt" + return false; + } + if (path_name.substr(path_name.length() - 4) != augref_suffix) { + return false; + } + // Find the underscore before the digits + size_t alt_pos = path_name.length() - 4; // position of "_alt" + size_t underscore_pos = path_name.rfind('_', alt_pos - 1); + if (underscore_pos == string::npos || underscore_pos == alt_pos - 1) { + return false; // no underscore found, or nothing between underscore and _alt + } + // Check that everything between underscore and _alt is digits + for (size_t i = underscore_pos + 1; i < alt_pos; ++i) { + if (!isdigit(path_name[i])) { + return false; + } + } + return true; +} + +string AugRefCover::parse_base_path(const string& augref_name) { + if (!is_augref_name(augref_name)) { + return augref_name; + } + // Find _{N}_alt and strip it + size_t alt_pos = augref_name.length() - 4; // position of "_alt" + size_t underscore_pos = augref_name.rfind('_', alt_pos - 1); + return augref_name.substr(0, underscore_pos); +} + +int64_t AugRefCover::parse_augref_index(const string& augref_name) { + if (!is_augref_name(augref_name)) { + return -1; + } + // Extract N from _{N}_alt + size_t alt_pos = augref_name.length() - 4; // position of "_alt" + size_t underscore_pos = augref_name.rfind('_', alt_pos - 1); + return stoll(augref_name.substr(underscore_pos + 1, alt_pos - underscore_pos - 1)); +} + +void AugRefCover::set_augref_sample(const string& sample_name) { + this->augref_sample_name = sample_name; +} + +const string& AugRefCover::get_augref_sample() const { + return this->augref_sample_name; +} + +void AugRefCover::set_verbose(bool verbose) { + this->verbose = verbose; +} + +bool AugRefCover::get_verbose() const { + return this->verbose; +} 
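+
+// Illustrative sketch of the naming helpers above (the "CHM13#0#chr1" base path is
+// just an example name borrowed from the header docs, not something the code requires):
+//   make_augref_name("CHM13#0#chr1", 2)       -> "CHM13#0#chr1_2_alt"
+//   is_augref_name("CHM13#0#chr1_2_alt")      -> true
+//   is_augref_name("CHM13#0#chr1")            -> false
+//   parse_base_path("CHM13#0#chr1_2_alt")     -> "CHM13#0#chr1"
+//   parse_augref_index("CHM13#0#chr1_2_alt")  -> 2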
+ +void AugRefCover::clear(MutablePathMutableHandleGraph* graph) { + vector augref_paths_to_remove; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (is_augref_name(graph->get_path_name(path_handle))) { + augref_paths_to_remove.push_back(path_handle); + } + }); + for (path_handle_t path_handle : augref_paths_to_remove) { + graph->destroy_path(path_handle); + } +} + +void AugRefCover::compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length) { + + // start from scratch + this->augref_intervals.clear(); + this->interval_snarl_bounds.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + this->augref_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); + this->interval_snarl_bounds.push_back({0, 0}); + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (node_to_interval.count(node_id)) { + cerr << "[augref error]: node " << node_id << " covered by two reference paths," + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(augref_intervals.at(node_to_interval.at(node_id)).first)) + << ". Augmented reference path support currently requires disjoint acyclic reference paths" << endl; + exit(1); + } + node_to_interval[node_id] = augref_intervals.size() - 1; + }); + } + this->num_ref_intervals = this->augref_intervals.size(); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] Selected " << augref_intervals.size() << " rank=0 reference paths" << endl; +#endif + + // we use the path traversal finder for everything + PathTraversalFinder path_trav_finder(*graph); + + // we collect the augref cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector>> augref_intervals_vector(thread_count); + vector> node_to_interval_vector(thread_count); + vector>> snarl_bounds_vector(thread_count); + + // we process top-level snarls in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // per-thread output + vector>& thread_augref_intervals = augref_intervals_vector[omp_get_thread_num()]; + unordered_map& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()]; + vector>& thread_snarl_bounds = snarl_bounds_vector[omp_get_thread_num()]; + + // capture the top-level snarl boundary node IDs + nid_t top_snarl_start = snarl->start().node_id(); + nid_t top_snarl_end = snarl->end().node_id(); + + vector queue = {snarl}; + + while(!queue.empty()) { + const Snarl* cur_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover + compute_snarl(*cur_snarl, path_trav_finder, minimum_length, + thread_augref_intervals, + thread_node_to_interval, + top_snarl_start, top_snarl_end, + thread_snarl_bounds); + + // recurse on the children + const vector& children = snarl_manager->children_of(cur_snarl); + for (const Snarl* child_snarl : children) { + queue.push_back(child_snarl); + } + } + }); + + // now we need to fold up the thread covers + for (int64_t t = 0; t < thread_count; ++t) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "Adding " << augref_intervals_vector[t].size() << " augref intervals from thread " << t << endl; +#endif + // important to go through function rather than 
do a raw copy since + // inter-top-level snarl merging may need to happen + for (int64_t j = 0; j < augref_intervals_vector[t].size(); ++j) { + // the true flag at the end disables the overlap check. since they were computed + // in separate threads, snarls can overlap by a single node + const pair& interval = augref_intervals_vector[t][j]; + if (interval.first != graph->path_end(graph->get_path_handle_of_step(interval.first))) { + add_interval(this->augref_intervals, this->node_to_interval, augref_intervals_vector[t][j], true, + &this->interval_snarl_bounds, snarl_bounds_vector[t][j]); + } + } + augref_intervals_vector[t].clear(); + node_to_interval_vector[t].clear(); + snarl_bounds_vector[t].clear(); + } + + // second pass: greedily cover any nodes not covered by snarl traversals + fill_uncovered_nodes(minimum_length); + + // debug: verify all nodes are covered + verify_cover(); + + // remove any intervals that were made redundant by add_interval + defragment_intervals(); +} + +void AugRefCover::fill_uncovered_nodes(int64_t minimum_length) { + // Collect all uncovered nodes and the paths that pass through them + unordered_set uncovered_nodes; + map candidate_paths; // sorted by name for deterministic ordering + + graph->for_each_handle([&](handle_t handle) { + nid_t node_id = graph->get_id(handle); + if (!node_to_interval.count(node_id)) { + // Node is not covered - find paths through it + uncovered_nodes.insert(node_id); + graph->for_each_step_on_handle(handle, [&](step_handle_t step) { + path_handle_t path_handle = graph->get_path_handle_of_step(step); + string path_name = graph->get_path_name(path_handle); + // Skip existing augref paths + if (!is_augref_name(path_name)) { + candidate_paths[path_name] = path_handle; + } + return true; + }); + } + }); + + if (uncovered_nodes.empty()) { + return; + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] fill_uncovered_nodes: " << uncovered_nodes.size() << " uncovered nodes, " + << candidate_paths.size() << " candidate paths" << endl; +#endif + + // Greedily walk through paths, creating intervals for contiguous uncovered sequences + for (const auto& name_path : candidate_paths) { + if (uncovered_nodes.empty()) { + break; + } + + bool in_interval = false; + step_handle_t interval_start; + step_handle_t interval_end; + int64_t interval_length = 0; + vector interval_nodes; // track nodes in current interval + + graph->for_each_step_in_path(name_path.second, [&](step_handle_t step) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step)); + + if (uncovered_nodes.count(node_id)) { + // This node is uncovered + if (!in_interval) { + // Start a new interval + in_interval = true; + interval_start = step; + interval_length = 0; + interval_nodes.clear(); + } + interval_end = graph->get_next_step(step); + interval_length += graph->get_length(graph->get_handle_of_step(step)); + interval_nodes.push_back(node_id); + } else { + // This node is already covered + if (in_interval) { + // Close the current interval + if (interval_length >= minimum_length) { + // Add the interval and mark nodes as covered + add_interval(this->augref_intervals, this->node_to_interval, + make_pair(interval_start, interval_end), true, + &this->interval_snarl_bounds, {0, 0}); + for (nid_t nid : interval_nodes) { + uncovered_nodes.erase(nid); + } + } + in_interval = false; + } + } + return true; + }); + + // Don't forget to close any interval at the end of the path + if (in_interval && interval_length >= minimum_length) { + 
add_interval(this->augref_intervals, this->node_to_interval, + make_pair(interval_start, interval_end), true, + &this->interval_snarl_bounds, {0, 0}); + for (nid_t nid : interval_nodes) { + uncovered_nodes.erase(nid); + } + } + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] fill_uncovered_nodes: " << uncovered_nodes.size() << " nodes still uncovered after second pass" << endl; +#endif +} + +void AugRefCover::load(const PathHandleGraph* graph, + const unordered_set& reference_paths) { + // start from scratch + this->augref_intervals.clear(); + this->interval_snarl_bounds.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (graph->get_is_reverse(graph->get_handle_of_step(step_handle))) { + cerr << "[augref] error: Reversed step " << node_id << " found in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All augref fragments must be forward-only." << endl; + exit(1); + } + if (node_to_interval.count(node_id)) { + cerr << "[augref] error: Cycle found on node " << node_id << " in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All augref fragments must be acyclic." << endl; + exit(1); + } + node_to_interval[node_id] = augref_intervals.size(); + }); + this->augref_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); + this->interval_snarl_bounds.push_back({0, 0}); + } + this->num_ref_intervals = this->augref_intervals.size(); + + // load existing augref paths from the graph + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = augref_intervals.size(); + }); + this->augref_intervals.push_back(make_pair(graph->path_begin(path_handle), + graph->path_end(path_handle))); + this->interval_snarl_bounds.push_back({0, 0}); + } + }); +} + +void AugRefCover::apply(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); +#ifdef debug + cerr << "applying augref cover with " << this->num_ref_intervals << " ref intervals " + << " and " << this->augref_intervals.size() << " total intervals" << endl; +#endif + + // If augref_sample_name is set, first copy base reference paths to the new sample + if (!augref_sample_name.empty()) { + // Collect reference path handles from the reference intervals + unordered_set reference_paths; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + reference_paths.insert(graph->get_path_handle_of_step(augref_intervals[i].first)); + } + copy_base_paths_to_sample(mutable_graph, reference_paths); + } + + // Reset augref counters for each base path + base_path_augref_counter.clear(); + + // First pass: determine the maximum existing augref index for each base path + // This ensures we don't overwrite existing augref paths + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + string base = parse_base_path(path_name); + int64_t idx = parse_augref_index(path_name); + if 
(base_path_augref_counter.count(base)) { + base_path_augref_counter[base] = max(base_path_augref_counter[base], idx); + } else { + base_path_augref_counter[base] = idx; + } + } + }); + + // write the augref paths + int64_t written_intervals = 0; + int64_t written_length = 0; + int64_t skipped_intervals = 0; + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + // Skip empty intervals (these can be created by defragment_intervals or merging) + path_handle_t interval_path = graph->get_path_handle_of_step(augref_intervals[i].first); + if (augref_intervals[i].first == graph->path_end(interval_path)) { + skipped_intervals++; + continue; + } + + // Find the reference path this augref path extends from by tracing back to reference + nid_t first_node = graph->get_id(graph->get_handle_of_step(augref_intervals[i].first)); + vector> ref_nodes = this->get_reference_nodes(first_node, true); + + // Get the reference path name from the reference node + string base_path_name; + if (!ref_nodes.empty()) { + nid_t ref_node_id = ref_nodes.at(0).second; + int64_t ref_interval_idx = this->node_to_interval.at(ref_node_id); + path_handle_t ref_path_handle = graph->get_path_handle_of_step(augref_intervals[ref_interval_idx].first); + base_path_name = graph->get_path_name(ref_path_handle); + // Strip any subrange from the reference path name + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } else { + // Fallback to source path if no reference found (shouldn't happen) + path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(augref_intervals[i].first); + base_path_name = graph->get_path_name(source_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } + + // If augref_sample_name is set, replace the sample in base_path_name + if (!augref_sample_name.empty()) { + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_path_name, sense, sample, locus, haplotype, phase_block, subrange); + + if (sample.empty()) { + // Simple path name - prepend augref sample + base_path_name = augref_sample_name + "#0#" + base_path_name; + } else { + // Replace sample with augref sample + base_path_name = PathMetadata::create_path_name(sense, augref_sample_name, locus, haplotype, phase_block, subrange); + } + } + + // Get next available augref index for this base path + int64_t augref_index = ++base_path_augref_counter[base_path_name]; + + // Create the augref path name + string augref_name = make_augref_name(base_path_name, augref_index); + + // Create the path as REFERENCE sense + path_handle_t augref_handle = mutable_graph->create_path_handle(augref_name, false); + + int64_t interval_length = 0; + for (step_handle_t step_handle = augref_intervals[i].first; step_handle != augref_intervals[i].second; + step_handle = mutable_graph->get_next_step(step_handle)) { + mutable_graph->append_step(augref_handle, mutable_graph->get_handle_of_step(step_handle)); + interval_length += mutable_graph->get_length(mutable_graph->get_handle_of_step(step_handle)); + } + written_intervals++; + written_length += interval_length; + } + +#ifdef debug + cerr << "[augref] apply: wrote " << written_intervals << " augref paths (" << written_length << " bp), skipped " << skipped_intervals << " empty intervals" << endl; +#endif + + this->forwardize_augref_paths(mutable_graph); +} + +int64_t AugRefCover::get_rank(nid_t node_id) const { + 
// search back to reference in order to find the rank. + vector> ref_steps = this->get_reference_nodes(node_id, true); + // Return -1 if node is in a disconnected component that can't reach reference + return ref_steps.empty() ? -1 : ref_steps.at(0).first; +} + +step_handle_t AugRefCover::get_step(nid_t node_id) const { + if (!node_to_interval.count(node_id)) { + return step_handle_t(); + } + const pair& interval = augref_intervals.at(node_to_interval.at(node_id)); + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + if (graph->get_id(graph->get_handle_of_step(step)) == node_id) { + return step; + } + } + return step_handle_t(); +} + +pair*, + const pair*> +AugRefCover::get_parent_intervals(const pair& interval) const { + + pair*, + const pair*> parents = make_pair(nullptr, nullptr); + + // since our decomposition is based on snarl traversals, we know that fragments must + // overlap their parents on snarl end points (at the very least) + // therefore we can find parents by scanning along the paths. + step_handle_t left_parent = graph->get_previous_step(interval.first); + if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(interval.first))) { + int64_t interval_idx = this->node_to_interval.at(graph->get_id(graph->get_handle_of_step(left_parent))); + parents.first = &this->augref_intervals.at(interval_idx); + } + + step_handle_t right_parent = graph->get_next_step(interval.second); + if (right_parent != graph->path_end(graph->get_path_handle_of_step(interval.second))) { + int64_t interval_idx = node_to_interval.at(graph->get_id(graph->get_handle_of_step(right_parent))); + parents.second = &this->augref_intervals.at(interval_idx); + } + return parents; +} + +const vector>& AugRefCover::get_intervals() const { + return this->augref_intervals; +} + +const pair* AugRefCover::get_interval(nid_t node_id) const { + if (this->node_to_interval.count(node_id)) { + return &this->augref_intervals.at(node_to_interval.at(node_id)); + } + return nullptr; +} + +int64_t AugRefCover::get_num_ref_intervals() const { + return this->num_ref_intervals; +} + +void AugRefCover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + nid_t top_snarl_start, nid_t top_snarl_end, + vector>& thread_snarl_bounds) { + + // start by finding the path traversals through the snarl + vector> travs; + vector trav_names; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (is_augref_name(trav_path_name)) { + // we ignore existing (off-reference) augref paths +#ifdef debug + cerr << "Warning : skipping existing augref traversal " << trav_path_name << endl; +#endif + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&](step_handle_t 
step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } + travs.push_back(trav); + trav_names.push_back(trav_path_name); + } + } +#ifdef debug +#pragma omp critical(cerr) + cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; +#endif + + // build an initial ranked list of candidate traversal fragments + vector ranked_trav_fragments; + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // only a reference traversal (or deletion that we don't need to consider) + // will have its first two nodes covered + if (this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][0]))) && + this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][1])))) { + continue; + } + + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav, thread_node_to_interval); + + for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } + } + if (!cyclic && interval_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, uncovered_interval); + ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval}); + } + } + } + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.trav_idx); + const pair& uncovered_interval = best_stats_fragment.fragment; + +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "best trav: "; + for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); + cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for 
(const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, chopped_interval); + ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + } + } + continue; + } + pair new_interval = make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1))); +#ifdef debug + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#pragma omp critical(cerr) + cerr << "adding interval with length " << interval_length << endl; +#endif + add_interval(thread_augref_intervals, thread_node_to_interval, new_interval, false, + &thread_snarl_bounds, {top_snarl_start, top_snarl_end}); + } +} + +vector> AugRefCover::get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval) { + + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ? i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; +} + +bool AugRefCover::add_interval(vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global, + vector>* snarl_bounds_vec, + pair snarl_bounds) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)); + if (new_interval.second == graph->path_end(graph->get_path_handle_of_step(new_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + } +#endif + bool merged = false; + int64_t merged_interval_idx = -1; + path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + + // check the before-first step. 
if it's in an interval then it must be immediately + // preceeding so we merge the new interval to the end of the found interval + step_handle_t before_first_step = graph->get_previous_step(new_interval.first); + if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { + nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); + if (thread_node_to_interval.count(prev_node_id)) { + int64_t prev_idx = thread_node_to_interval[prev_node_id]; + pair& prev_interval = thread_augref_intervals[prev_idx]; + if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + if (prev_interval.second == new_interval.first || + (global && graph->get_previous_step(prev_interval.second) == new_interval.first)) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.first)); + if (prev_interval.second == graph->path_end(graph->get_path_handle_of_step(prev_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + } +#endif + prev_interval.second = new_interval.second; + merged = true; + merged_interval_idx = prev_idx; + } + } + } + } + + // check the end step. if it's in an interval then it must be immediately + // following we merge the new interval to the front of the found interval + int64_t deleted_idx = -1; + step_handle_t deleted_interval_first, deleted_interval_second; // save before decommission + if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { + nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); + if (thread_node_to_interval.count(next_node_id)) { + int64_t next_idx = thread_node_to_interval[next_node_id]; + pair& next_interval = thread_augref_intervals[next_idx]; + path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); + if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + if (next_interval.first == new_interval.second || + (global && next_interval.first == graph->get_previous_step(new_interval.second))) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)); + if (next_interval.second == graph->path_end(graph->get_path_handle_of_step(next_interval.second))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + } +#endif + if (merged == true) { + // save the interval bounds BEFORE decommissioning + deleted_interval_first = next_interval.first; + deleted_interval_second = next_interval.second; + deleted_idx = next_idx; + // decomission next_interval + next_interval.first = graph->path_end(next_path); + next_interval.second = graph->path_front_end(next_path); + if (snarl_bounds_vec) { + (*snarl_bounds_vec)[next_idx] = {0, 0}; + } + // extend the previous interval right to cover both new_interval and the deleted next_interval + thread_augref_intervals[merged_interval_idx].second = deleted_interval_second; + } else { + // 
extend next_interval left + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } + } + } + } + } + + // add the interval to the local (thread safe) structures + if (!merged) { + merged_interval_idx = thread_augref_intervals.size(); + thread_augref_intervals.push_back(new_interval); + if (snarl_bounds_vec) { + snarl_bounds_vec->push_back(snarl_bounds); + } + } + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + if (deleted_idx >= 0) { + // move the links to the deleted interval to the merged interval + // use saved bounds since the interval has been decommissioned + for (step_handle_t step = deleted_interval_first; step != deleted_interval_second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + } + return !merged; +} + +void AugRefCover::defragment_intervals() { + vector> new_intervals; + vector> new_snarl_bounds; + this->node_to_interval.clear(); + for (int64_t i = 0; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + if (interval.first != graph->path_end(path_handle)) { + new_intervals.push_back(interval); + new_snarl_bounds.push_back(this->interval_snarl_bounds[i]); + } + } + this->augref_intervals = std::move(new_intervals); + this->interval_snarl_bounds = std::move(new_snarl_bounds); + for (int64_t i = 0; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + this->node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = i; + } + } +} + +int64_t AugRefCover::get_coverage(const vector& trav, const pair& uncovered_interval) { + int64_t coverage = 0; + + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + coverage += length * all_steps.size(); + } + + return coverage; +} + +// Ensure all nodes in augref paths are in forward orientation +void AugRefCover::forwardize_augref_paths(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); + + unordered_map id_map; + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + size_t fw_count = 0; + size_t total_steps = 0; + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + handle_t flipped_handle = mutable_graph->create_handle(mutable_graph->get_sequence(handle)); + id_map[mutable_graph->get_id(flipped_handle)] = mutable_graph->get_id(handle); + mutable_graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (mutable_graph->get_id(prev_handle) != mutable_graph->get_id(handle)) { + mutable_graph->create_edge(prev_handle, flipped_handle); + } + }); + mutable_graph->follow_edges(handle, false, [&](handle_t next_handle) { + if 
(mutable_graph->get_id(handle) != mutable_graph->get_id(next_handle)) { + mutable_graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (mutable_graph->has_edge(handle, handle)) { + mutable_graph->create_edge(flipped_handle, flipped_handle); + } + if (mutable_graph->has_edge(handle, mutable_graph->flip(handle))) { + mutable_graph->create_edge(flipped_handle, mutable_graph->flip(flipped_handle)); + } + if (mutable_graph->has_edge(mutable_graph->flip(handle), handle)) { + mutable_graph->create_edge(mutable_graph->flip(flipped_handle), flipped_handle); + } + vector steps = mutable_graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (mutable_graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = mutable_graph->get_next_step(step); + handle_t new_handle = mutable_graph->get_is_reverse(mutable_graph->get_handle_of_step(step)) ? flipped_handle : + mutable_graph->flip(flipped_handle); + mutable_graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[augref] error: Cycle detected in augref path " << path_name << " at node " << mutable_graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(mutable_graph->steps_of_handle(handle).empty()); + dynamic_cast(mutable_graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + mutable_graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? id_map[new_id] : new_id; + }); + + // do a check just to be sure + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + cerr << "[augref] error: Failed to fowardize node " << mutable_graph->get_id(handle) << " in path " << path_name << endl; + exit(1); + } + }); + } + }); +} + +vector> AugRefCover::get_reference_nodes(nid_t node_id, bool first) const { + + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); + + nid_t current_id; + int64_t distance = 0; + + // output reference intervals + vector> output_reference_nodes; + + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); + + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); + + const pair& augref_interval = this->augref_intervals.at(interval_idx); + + // we've hit the reference, fish out its step and stop searching. + if (interval_idx < this->num_ref_intervals) { + output_reference_nodes.push_back(make_pair(distance, current_id)); + if (first) { + break; + } + continue; + } + + // search out of the snarl -- any parent traversals will overlap here + graph->follow_edges(graph->get_handle_of_step(augref_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + // hack around gbwtgraph bug (feature?) 
that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(augref_interval.first); + step_handle_t last_step; + if (augref_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(augref_interval.second); + } + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + + } + } + + // Note: output_reference_nodes may be empty if the node is in a disconnected + // component that cannot trace back to any reference interval (e.g., after clipping) + return output_reference_nodes; +} + +void AugRefCover::verify_cover() const { + if (!verbose) { + return; + } + + // Check that every node in the graph is covered by the augref cover + int64_t total_nodes = 0; + int64_t total_length = 0; + int64_t ref_nodes = 0; + int64_t ref_length = 0; + int64_t alt_nodes = 0; + int64_t alt_length = 0; + int64_t uncovered_nodes = 0; + int64_t uncovered_length = 0; + + graph->for_each_handle([&](handle_t handle) { + nid_t node_id = graph->get_id(handle); + int64_t node_len = graph->get_length(handle); + total_nodes++; + total_length += node_len; + + if (!node_to_interval.count(node_id)) { + // Node is not covered (expected when min-augref-len > 0) + uncovered_nodes++; + uncovered_length += node_len; + } else { + int64_t interval_idx = node_to_interval.at(node_id); + if (interval_idx < num_ref_intervals) { + ref_nodes++; + ref_length += node_len; + } else { + alt_nodes++; + alt_length += node_len; + } + } + }); + + cerr << "[augref] verify_cover summary:" << endl + << " Total nodes: " << total_nodes << " (" << total_length << " bp)" << endl + << " Reference nodes: " << ref_nodes << " (" << ref_length << " bp)" << endl + << " Alt nodes: " << alt_nodes << " (" << alt_length << " bp)" << endl + << " Uncovered nodes: " << uncovered_nodes << " (" << uncovered_length << " bp)" << endl + << " Intervals: " << num_ref_intervals << " ref + " << (augref_intervals.size() - num_ref_intervals) << " alt" << endl; +} + +void AugRefCover::copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph, + const unordered_set& reference_paths) { + ref_path_to_copy.clear(); + + if (augref_sample_name.empty()) { + return; + } + + for (const path_handle_t& ref_path : reference_paths) { + string ref_name = mutable_graph->get_path_name(ref_path); + + // Parse the path name to extract components + // rGFA format: SAMPLE#HAPLOTYPE#CONTIG or just CONTIG + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(ref_name, sense, sample, locus, haplotype, phase_block, subrange); + + // Create new path name with augref sample + string new_name; + if (sample.empty()) { + // Simple path name (no sample info) - prepend augref sample + new_name = augref_sample_name + "#0#" + ref_name; + } else { + // Replace sample with augref sample + new_name = PathMetadata::create_path_name(sense, augref_sample_name, locus, haplotype, phase_block, subrange); + } 
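+
+        // Illustrative example (sample names here are hypothetical): with --augref-sample AUGREF,
+        // "CHM13#0#chr1" is copied as "AUGREF#0#chr1", and a bare path name like "chr1"
+        // (no sample metadata) is likewise copied as "AUGREF#0#chr1".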
+ + // Check if path already exists + if (mutable_graph->has_path(new_name)) { +#ifdef debug + cerr << "[augref] copy_base_paths_to_sample: path " << new_name << " already exists, skipping" << endl; +#endif + ref_path_to_copy[ref_path] = mutable_graph->get_path_handle(new_name); + continue; + } + + // Create the new path with same sense as original + path_handle_t new_path = mutable_graph->create_path_handle(new_name, false); + + // Copy all steps from original path to new path + mutable_graph->for_each_step_in_path(ref_path, [&](step_handle_t step) { + mutable_graph->append_step(new_path, mutable_graph->get_handle_of_step(step)); + return true; + }); + + ref_path_to_copy[ref_path] = new_path; + +#ifdef debug + cerr << "[augref] copy_base_paths_to_sample: copied " << ref_name << " -> " << new_name << endl; +#endif + } + +#ifdef debug + cerr << "[augref] copy_base_paths_to_sample: copied " << ref_path_to_copy.size() << " reference paths to sample " << augref_sample_name << endl; +#endif +} + +void AugRefCover::write_augref_segments(ostream& os) { + // Track augref counters to predict path names (same logic as apply()) + unordered_map local_augref_counter; + + // First pass: find maximum existing augref index for each base path + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + string base = parse_base_path(path_name); + int64_t idx = parse_augref_index(path_name); + if (local_augref_counter.count(base)) { + local_augref_counter[base] = max(local_augref_counter[base], idx); + } else { + local_augref_counter[base] = idx; + } + } + }); + + // Pre-compute reference node positions by walking all reference intervals once. + // Maps node ID -> (ref_path_handle, offset of node start on ref path) + unordered_map> ref_node_positions; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->augref_intervals[i]; + path_handle_t ref_path_handle = graph->get_path_handle_of_step(ref_interval.first); + int64_t offset = 0; + for (step_handle_t step = ref_interval.first; step != ref_interval.second; + step = graph->get_next_step(step)) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step)); + ref_node_positions[node_id] = make_pair(ref_path_handle, offset); + offset += graph->get_length(graph->get_handle_of_step(step)); + } + } + + // Collect the specific steps whose offsets we need (interval boundaries only). + // This avoids caching every step of every source path. + unordered_set needed_steps; + unordered_set source_paths_needed; + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t ph = graph->get_path_handle_of_step(interval.first); + source_paths_needed.insert(ph); + needed_steps.insert(interval.first); + needed_steps.insert(interval.second); + } + // Walk each source path once, caching offsets only for needed steps. + // Also cache path_end -> total path length for computing interval end offsets. 
+ unordered_map step_offset_cache; + for (const path_handle_t& ph : source_paths_needed) { + int64_t offset = 0; + for (step_handle_t step = graph->path_begin(ph); + step != graph->path_end(ph); + step = graph->get_next_step(step)) { + if (needed_steps.count(step)) { + step_offset_cache[step] = offset; + } + offset += graph->get_length(graph->get_handle_of_step(step)); + } + // Cache path_end sentinel with total path length + step_handle_t path_end = graph->path_end(ph); + if (needed_steps.count(path_end)) { + step_offset_cache[path_end] = offset; + } + } + + // Cache resolved source display names per path handle to avoid + // repeated parse_path_name/create_path_name per interval. + // Stores (display_name, subrange_offset) per source path. + unordered_map> source_display_cache; + + // Write each augref interval + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t source_path_handle = graph->get_path_handle_of_step(interval.first); + + // Skip empty intervals + if (interval.first == graph->path_end(source_path_handle)) { + continue; + } + + // Look up pre-computed source path offsets + int64_t source_start = step_offset_cache.at(interval.first); + int64_t source_end = step_offset_cache.at(interval.second); + + // Find reference coordinates using snarl bounds or fallback to BFS + string base_path_name; + string ref_path_name = "."; + int64_t ref_start = 0; + int64_t ref_end = 0; + + const pair& snarl_bounds = this->interval_snarl_bounds[i]; + + if (snarl_bounds.first != 0 && snarl_bounds.second != 0 && + ref_node_positions.count(snarl_bounds.first) && ref_node_positions.count(snarl_bounds.second)) { + // Use snarl boundary nodes for reference coordinates + auto& left_pos = ref_node_positions[snarl_bounds.first]; + auto& right_pos = ref_node_positions[snarl_bounds.second]; + + // Both boundary nodes should be on the same reference path + path_handle_t ref_path_handle = left_pos.first; + ref_path_name = graph->get_path_name(ref_path_handle); + + int64_t left_node_len = graph->get_length(graph->get_handle(snarl_bounds.first)); + int64_t right_node_len = graph->get_length(graph->get_handle(snarl_bounds.second)); + + int64_t left_start = left_pos.second; + int64_t left_end = left_start + left_node_len; + int64_t right_start = right_pos.second; + int64_t right_end = right_start + right_node_len; + + ref_start = min(left_start, right_start); + ref_end = max(left_end, right_end); + + // Get base path name from the reference path + base_path_name = ref_path_name; + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } else { + // Fallback to BFS-based get_reference_nodes() for sentinel intervals + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, true); + + if (!ref_nodes.empty()) { + nid_t ref_node_id = ref_nodes.at(0).second; + int64_t ref_interval_idx = this->node_to_interval.at(ref_node_id); + path_handle_t ref_path_handle = graph->get_path_handle_of_step(augref_intervals[ref_interval_idx].first); + base_path_name = graph->get_path_name(ref_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + + ref_path_name = graph->get_path_name(ref_path_handle); + + // Find the offset of ref_node_id in the reference path + if (ref_node_positions.count(ref_node_id)) { + ref_start = ref_node_positions[ref_node_id].second; + ref_end 
= ref_start + graph->get_length(graph->get_handle(ref_node_id)); + } + } else { + // Fallback to source path + string source_path_name = graph->get_path_name(source_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(source_path_name, &subrange); + } + } + + // If augref_sample_name is set, replace sample in base_path_name + if (!augref_sample_name.empty()) { + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_path_name, sense, sample, locus, haplotype, phase_block, subrange); + + if (sample.empty()) { + base_path_name = augref_sample_name + "#0#" + base_path_name; + } else { + base_path_name = PathMetadata::create_path_name(sense, augref_sample_name, locus, haplotype, phase_block, subrange); + } + } + + // Get next augref index for this base path + int64_t augref_index = ++local_augref_counter[base_path_name]; + string augref_name = make_augref_name(base_path_name, augref_index); + + // Resolve source path name to full-path coordinates using cached result. + // Parses path name once per source path to extract subrange offset and + // strip the #0 phase block. + auto cache_it = source_display_cache.find(source_path_handle); + if (cache_it == source_display_cache.end()) { + string source_path_name = graph->get_path_name(source_path_handle); + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(source_path_name, sense, sample, locus, haplotype, phase_block, subrange); + int64_t subrange_offset = (subrange != PathMetadata::NO_SUBRANGE) ? subrange.first : 0; + if (phase_block == 0) { + phase_block = PathMetadata::NO_PHASE_BLOCK; + sense = PathSense::REFERENCE; + } + string display_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, PathMetadata::NO_SUBRANGE); + cache_it = source_display_cache.emplace(source_path_handle, make_pair(std::move(display_name), subrange_offset)).first; + } + const string& display_source_name = cache_it->second.first; + int64_t display_source_start = source_start + cache_it->second.second; + int64_t display_source_end = source_end + cache_it->second.second; + + // Output the BED line + os << display_source_name << "\t" + << display_source_start << "\t" + << display_source_end << "\t" + << augref_name << "\t" + << ref_path_name << "\t" + << ref_start << "\t" + << ref_end << "\n"; + } +} + +void AugRefCover::print_stats(ostream& os) { + // the header + os << "#BasePath" << "\t" + << "AugRefIndex" << "\t" + << "Length" << "\t" + << "NodeStart" << "\t" + << "NodeEnd" << "\t" + << "Rank" << "\t" + << "AvgDepth" << endl; + + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + string path_name = graph->get_path_name(path_handle); + + int64_t path_length = 0; + int64_t tot_depth = 0; + int64_t tot_steps = 0; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + path_length += graph->get_length(graph->get_handle_of_step(step)); + vector steps = graph->steps_of_handle(graph->get_handle_of_step(step)); + tot_depth += steps.size(); + ++tot_steps; + } + + string base_path = parse_base_path(path_name); + int64_t augref_index = parse_augref_index(path_name); + + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> 
ref_nodes = this->get_reference_nodes(first_node, true); + int64_t rank = ref_nodes.empty() ? -1 : ref_nodes.at(0).first; + + // interval is open ended, so we go back to last node + step_handle_t last_step; + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.first))) { + last_step = graph->path_back(graph->get_path_handle_of_step(interval.first)); + } else { + last_step = graph->get_previous_step(interval.second); + } + handle_t last_handle = graph->get_handle_of_step(last_step); + + os << base_path << "\t" + << augref_index << "\t" + << path_length << "\t" + << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t" + << (graph->get_is_reverse(last_handle) ? "<" : ">") + << graph->get_id(last_handle) << "\t" + << rank << "\t" + << std::fixed << std::setprecision(2) << ((double)tot_depth / tot_steps) << "\n"; + } +} + +} diff --git a/src/augref.hpp b/src/augref.hpp new file mode 100644 index 0000000000..b21ca89876 --- /dev/null +++ b/src/augref.hpp @@ -0,0 +1,223 @@ +#ifndef VG_AUGREF_HPP_INCLUDED +#define VG_AUGREF_HPP_INCLUDED + +/** + * \file augref.hpp + * + * Interface for computing and querying augmented reference path covers. + * + * An augref cover is a set of path fragments (stored as separate paths) in the graph. + * They are always relative to an existing reference sample (ie GRCh38 or CHM13). + * Unlike rGFA paths which use complex metadata embedding, augref paths use a simple naming + * scheme: {base_path_name}_{N}_alt + * + * For example, if the reference path is "CHM13#0#chr1", augref paths would be named: + * - CHM13#0#chr1_1_alt + * - CHM13#0#chr1_2_alt + * - etc. + * + * The data structures used in this class are always relative to the original paths + * in the graph. The REFERENCE-sense fragments that are used to serialize the + * cover can be created and loaded, but they are not used beyond that. + */ + +#include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" + +namespace vg { + +using namespace std; + +class AugRefCover { +public: + // The suffix used to identify augref paths + static const string augref_suffix; // "_alt" + + // Create an augref path name from a base reference path name and an index. + // Example: make_augref_name("CHM13#0#chr1", 1) -> "CHM13#0#chr1_1_alt" + static string make_augref_name(const string& base_path_name, int64_t augref_index); + + // Test if a path name is an augref path (contains "_{N}_alt" suffix). + static bool is_augref_name(const string& path_name); + + // Parse an augref path name to extract the base reference path name. + // Returns the original base path name, or the input if not an augref path. + // Example: parse_base_path("CHM13#0#chr1_3_alt") -> "CHM13#0#chr1" + static string parse_base_path(const string& augref_name); + + // Parse an augref path name to extract the augref index. + // Returns -1 if the path is not an augref path. + // Example: parse_augref_index("CHM13#0#chr1_3_alt") -> 3 + static int64_t parse_augref_index(const string& augref_name); + +public: + // Clear out any existing augref paths from the graph. Recommended to run this + // before compute(). + void clear(MutablePathMutableHandleGraph* graph); + + // Compute the augref cover from the graph, starting with a given set of reference paths. 
+ void compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length); + + // Load existing augref paths from the graph, assuming they've been computed already. + // The reference_paths should be the rank-0 paths the augref paths extend from. + void load(const PathHandleGraph* graph, + const unordered_set& reference_paths); + + // Apply the augref cover to a graph (must have been computed first), adding it + // as a bunch of REFERENCE-sense paths with the simplified naming scheme. + // If augref_sample_name is set, base paths are first copied to the new sample, + // and augref paths are created under the new sample name. + void apply(MutablePathMutableHandleGraph* mutable_graph); + + // Set the sample name for augref paths. When set, apply() will: + // 1. Copy base reference paths to this new sample (CHM13#0#chr1 -> new_sample#0#chr1) + // 2. Create augref paths under the new sample (new_sample#0#chr1_1_alt, etc.) + void set_augref_sample(const string& sample_name); + + // Get the current augref sample name (empty string if not set). + const string& get_augref_sample() const; + + // Enable verbose output (coverage summary, etc.) + void set_verbose(bool verbose); + + // Check if verbose output is enabled. + bool get_verbose() const; + + // Get the rank (level) of a given node (0 if on a reference path). + int64_t get_rank(nid_t node_id) const; + + // Get the step of a given node in its covering interval. + step_handle_t get_step(nid_t node_id) const; + + // Get the parent intervals (left and right) of a given interval. + pair*, + const pair*> get_parent_intervals(const pair& interval) const; + + // Get all computed intervals. + const vector>& get_intervals() const; + + // Get an interval from a node. Returns nullptr if node not in an interval. + const pair* get_interval(nid_t node_id) const; + + // Get the number of reference intervals (rank-0). + int64_t get_num_ref_intervals() const; + + // Print out a table of statistics. + void print_stats(ostream& os); + + // Write a tab-separated table describing augref segments. + // Each line contains: source_path, source_start, source_end, augref_path_name, + // ref_path, ref_start, ref_end + // Must be called after compute() and knows what augref path names will be used. + // If augref_sample is set, uses that for the augref path names. + void write_augref_segments(ostream& os); + +protected: + + // Compute the cover for the given snarl, by greedily finding the covered paths through it. + // The cover is added to the two "thread_" structures. + // top_snarl_start/end are the boundary node IDs of the top-level snarl containing this snarl. + void compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + nid_t top_snarl_start, nid_t top_snarl_end, + vector>& thread_snarl_bounds); + + // Get intervals in traversal that are not covered according to this->node_to_interval or + // the thread_node_to_interval parameter. + vector> get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval); + + // Add a new interval into the augref_intervals vector and update the node_to_interval map. + // If the interval can be merged into an existing, contiguous interval, do that instead. + // Returns true if a new interval was added, false if an existing interval was updated. 
+ bool add_interval(vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global = false, + vector>* snarl_bounds_vec = nullptr, + pair snarl_bounds = {0, 0}); + + // add_interval() can delete an existing interval. This requires a full update at the end. + void defragment_intervals(); + + // Get the total coverage of a traversal (sum of step lengths * path count). + int64_t get_coverage(const vector& trav, const pair& uncovered_interval); + + // Make sure all nodes in all augref paths are in forward orientation. + // This is always possible because they are, by definition, disjoint. + // This should only be run from inside apply(). + void forwardize_augref_paths(MutablePathMutableHandleGraph* mutable_graph); + + // Second pass: greedily cover any nodes not covered by snarl traversals. + // This handles nodes that are outside of snarls or in complex regions + // where the traversal finder couldn't find good coverage. + void fill_uncovered_nodes(int64_t minimum_length); + + // Search back to the reference and return when found. + // (here distance is the number of intervals crossed, aka rank) + // "first" toggles returning the first interval found vs all of them. + vector> get_reference_nodes(nid_t node_id, bool first) const; + + // Debug function: verify that every node in the graph is covered by the augref cover. + // Prints a summary of coverage statistics to stderr. + void verify_cover() const; + +protected: + + const PathHandleGraph* graph = nullptr; + + // Intervals are end-exclusive (like BED). + vector> augref_intervals; + + // Top-level snarl boundary nodes for each interval, parallel to augref_intervals. + // (0, 0) sentinel for reference intervals and fill_uncovered_nodes intervals. + vector> interval_snarl_bounds; + + // augref_intervals[0, num_ref_intervals-1] are all rank-0 reference intervals. + int64_t num_ref_intervals = 0; + + // Map from node ID to interval index. + unordered_map node_to_interval; + + // Counter for generating unique augref indices per base path. + // Using mutable so it can be updated in apply() which is logically const for the cover. + mutable unordered_map base_path_augref_counter; + + // Optional sample name for augref paths. When set, base paths are copied to this + // sample and augref paths are created under it. + string augref_sample_name; + + // Whether to print verbose output (coverage summary, etc.) + bool verbose = false; + + // Map from original reference path handles to their copies under augref_sample_name. + // Only populated when augref_sample_name is set. + unordered_map ref_path_to_copy; + + // Copy base reference paths to the augref sample. + // Creates new paths like "new_sample#0#chr1" from "CHM13#0#chr1". + // Returns a map from original path handles to new path handles. + void copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph, + const unordered_set& reference_paths); + + // Used when selecting traversals to make the greedy cover. 
+ struct RankedFragment { + int64_t coverage; + const string* name; + int64_t trav_idx; + pair fragment; + bool operator<(const RankedFragment& f2) const { + // note: name comparison is flipped because we want to select high coverage / low name + return this->coverage < f2.coverage || (this->coverage == f2.coverage && *this->name > *f2.name); + } + }; +}; + +} + +#endif diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 4c1a06cd19..f6645ccc34 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -10,6 +10,7 @@ #include #include +#include #include "subcommand.hpp" @@ -17,6 +18,8 @@ #include "../xg.hpp" #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" +#include "../augref.hpp" +#include "../integrated_snarl_finder.hpp" #include "../io/save_handle_graph.hpp" #include #include @@ -63,6 +66,16 @@ void help_paths(char** argv) { << " -G, --generic-paths select generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select reference paths" << endl << " -H, --haplotype-paths select haplotype paths" << endl + << "augmented reference computation:" << endl + << " -u, --compute-augref compute augmented reference path cover" << endl + << " (use -Q to select reference paths)" << endl + << " -l, --min-augref-len N minimum augref fragment length [50]" << endl + << " -N, --augref-sample STR create augref paths under a new sample" << endl + << " (copies base paths to new sample," << endl + << " then adds augref paths)." << endl + << " if unspecified, paths get added to target sample." << endl + << " --augref-segs FILE write augref segment table to FILE" << endl + << " --verbose print augref progress and coverage summary" << endl << "configuration:" << endl << " -o, --overlay apply a ReferencePathOverlayHelper to the graph" << endl << " -t, --threads N number of threads to use [all available]" << endl @@ -133,6 +146,14 @@ int main_paths(int argc, char** argv) { const size_t coverage_bins = 10; bool normalize_paths = false; bool overlay = false; + bool compute_augref = false; + int64_t min_augref_length = 50; + string augref_sample; + bool augref_verbose = false; + string augref_segments_file; + + constexpr int OPT_VERBOSE = 1001; + constexpr int OPT_AUGREF_SEGMENTS = 1002; int c; optind = 2; // force optind past command positional argument @@ -170,11 +191,18 @@ int main_paths(int argc, char** argv) { {"threads-old", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, + // Augref options + {"compute-augref", no_argument, 0, 'u'}, + {"min-augref-len", required_argument, 0, 'l'}, + {"augref-sample", required_argument, 0, 'N'}, + {"augref-segs", required_argument, 0, OPT_AUGREF_SEGMENTS}, + {"verbose", no_argument, 0, OPT_VERBOSE}, + {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "h?LXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:", + c = getopt_long (argc, argv, "h?LXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:ul:N:", long_options, &option_index); // Detect the end of the options. 
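
For reference, the augref naming helpers declared in augref.hpp above can be exercised as in the following minimal sketch. It is illustrative only and not part of the patch; the expected values simply mirror the examples given in the header comments, and the function name augref_naming_example is hypothetical.

#include "augref.hpp"
#include <cassert>
#include <string>

// Illustrative only: round-trip the {base}_{N}_alt naming scheme using
// AugRefCover's static helpers, following the documented examples.
void augref_naming_example() {
    using vg::AugRefCover;
    std::string name = AugRefCover::make_augref_name("CHM13#0#chr1", 3);
    assert(name == "CHM13#0#chr1_3_alt");
    assert(AugRefCover::is_augref_name(name));
    assert(AugRefCover::parse_base_path(name) == "CHM13#0#chr1");
    assert(AugRefCover::parse_augref_index(name) == 3);
    // Non-augref names pass through: parse_base_path() returns its input
    // and parse_augref_index() returns -1.
    assert(AugRefCover::parse_base_path("CHM13#0#chr1") == "CHM13#0#chr1");
    assert(AugRefCover::parse_augref_index("CHM13#0#chr1") == -1);
}
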
@@ -307,7 +335,28 @@ int main_paths(int argc, char** argv) { case 't': set_thread_count(logger, optarg); break; - + + case 'u': + compute_augref = true; + output_formats++; + break; + + case 'l': + min_augref_length = parse(optarg); + break; + + case 'N': + augref_sample = optarg; + break; + + case OPT_VERBOSE: + augref_verbose = true; + break; + + case OPT_AUGREF_SEGMENTS: + augref_segments_file = ensure_writable(logger, optarg); + break; + case 'h': case '?': help_paths(argv); @@ -374,7 +423,16 @@ int main_paths(int argc, char** argv) { if (coverage && !gbwt_file.empty()) { logger.error() << "coverage option -c only works on embedded graph paths, not GBWT threads" << std::endl; } - + if (compute_augref && !gbwt_file.empty()) { + logger.error() << "augref computation only works on embedded graph paths, not GBWT threads" << std::endl; + } + if (compute_augref && path_prefix.empty()) { + logger.error() << "--compute-augref requires -Q to select reference path(s)" << std::endl; + } + if (!augref_segments_file.empty() && !compute_augref) { + logger.error() << "--augref-segs requires --compute-augref" << std::endl; + } + if (select_alt_paths) { // alt paths all have a specific prefix path_prefix = "_alt_"; @@ -416,6 +474,61 @@ int main_paths(int argc, char** argv) { + // Handle augref computation before other operations + if (compute_augref && graph) { + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph); + if (!mutable_graph) { + logger.error() << "graph cannot be modified for augref computation" << std::endl; + return 1; + } + + // Collect reference paths matching the prefix from -Q + unordered_set ref_paths; + graph->for_each_path_handle([&](path_handle_t ph) { + string path_name = graph->get_path_name(ph); + // Skip augref paths (they match prefixes but shouldn't be used as references) + if (AugRefCover::is_augref_name(path_name)) { + return; + } + if (path_name.compare(0, path_prefix.size(), path_prefix) == 0) { + ref_paths.insert(ph); + } + }); + if (ref_paths.empty()) { + logger.error() << "no paths found matching prefix: " << path_prefix << std::endl; + return 1; + } + + // Compute snarls + IntegratedSnarlFinder finder(*graph); + SnarlManager snarl_manager(std::move(finder.find_snarls_parallel())); + + // Compute and apply augref cover + AugRefCover cover; + if (!augref_sample.empty()) { + cover.set_augref_sample(augref_sample); + } + cover.set_verbose(augref_verbose); + cover.clear(mutable_graph); + cover.compute(graph, &snarl_manager, ref_paths, min_augref_length); + + // Write augref segment table if requested + if (!augref_segments_file.empty()) { + ofstream segments_out(augref_segments_file); + if (!segments_out) { + logger.error() << "could not open augref-segs file: " << augref_segments_file << std::endl; + return 1; + } + cover.write_augref_segments(segments_out); + } + + cover.apply(mutable_graph); + + // Output the modified graph + vg::io::save_handle_graph(mutable_graph, cout); + return 0; + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -570,14 +683,14 @@ int main_paths(int argc, char** argv) { if (!path_prefix.empty()) { // Filter by name prefix std::string path_name = graph->get_path_name(path_handle); - + if (std::mismatch(path_name.begin(), path_name.end(), path_prefix.begin(), path_prefix.end()).second != path_prefix.end()) { // The path does not match the prefix. Skip it. return; } } - + // It didn't fail a prefix check, so use it. 
iteratee(path_handle); }); diff --git a/test/nesting/dangling_node.gfa b/test/nesting/dangling_node.gfa new file mode 100644 index 0000000000..fb8c8ca2bb --- /dev/null +++ b/test/nesting/dangling_node.gfa @@ -0,0 +1,14 @@ +H VN:Z:1.0 +S 1 AAAA +S 2 TTTT +S 3 CCCC +S 4 GGGG +S 5 AAAAAAAA +L 1 + 2 + 0M +L 2 + 3 + 0M +L 1 + 4 + 0M +L 4 + 3 + 0M +L 3 + 5 + 0M +P x 1+,2+,3+ * +P hap1 1+,4+,3+ * +P hap2 1+,4+,3+,5+ * diff --git a/test/nesting/star_allele_cluster.gfa b/test/nesting/star_allele_cluster.gfa new file mode 100644 index 0000000000..c2f7d5605d --- /dev/null +++ b/test/nesting/star_allele_cluster.gfa @@ -0,0 +1,19 @@ +H VN:Z:1.0 +S 1 C +S 2 AA +S 3 T +S 4 G +S 5 AA +S 6 C +P x 1+,6+ * +P a#1#h1 1+,2+,6+ * +P a#2#h2 1+,5+,3+,6+ * +P a#2#h3 1+,5+,4+,6+ * +L 1 + 2 + 0M +L 1 + 5 + 0M +L 1 + 6 + 0M +L 2 + 6 + 0M +L 5 + 3 + 0M +L 5 + 4 + 0M +L 3 + 6 + 0M +L 4 + 6 + 0M diff --git a/test/nesting/star_cluster.gfa b/test/nesting/star_cluster.gfa new file mode 100644 index 0000000000..d482f66470 --- /dev/null +++ b/test/nesting/star_cluster.gfa @@ -0,0 +1,21 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 4 A +S 5 G +S 6 T +S 7 A +P a#2#y1 1+,2+,3+,5+,6+ * +P a#1#y0 1+,2+,4+,5+,6+ * +P x 1+,6+ * +P b#1#z0 1+,7+,6+ * +L 1 + 2 + 0M +L 1 + 6 + 0M +L 1 + 7 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M +L 7 + 6 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 9cb9687893..7ca872b443 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 35 +plan tests 52 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -126,9 +126,63 @@ is "$(cat out.txt | wc -l)" "1" "vg paths sees the haplotype path in a GBZ with rm -f norm_x4.gfa original.fa norm_x4.fa x4.path x4.norm.path out.txt err.txt rm -f empty.vg empty.gbwt empty.fa +# Augref (augmented reference path) computation tests +vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 > augref_test.vg +vg validate augref_test.vg +is $? 0 "augref computation produces valid graph" + +is $(vg paths -x augref_test.vg -L | grep "_alt$" | wc -l) 2 "augref computation creates expected number of augref paths" + +is $(vg paths -x augref_test.vg -L | grep "^x$" | wc -l) 1 "original reference path is preserved after augref computation" + +# Test augref naming convention matches pattern x_{N}_alt +is $(vg paths -x augref_test.vg -L | grep -E "^x_[0-9]+_alt$" | wc -l) 2 "augref paths follow naming convention path_{N}_alt" + +# Test with triple_nested.gfa which has more complex structure +vg paths -x nesting/triple_nested.gfa -Q x --compute-augref --min-augref-len 1 > triple_augref.vg +vg validate triple_augref.vg +is $? 0 "augref computation works on complex nested structure" + +is $(vg paths -x triple_augref.vg -L | grep "_alt$" | wc -l) 2 "correct number of augref paths for triple nested graph" + +# Test minimum length filter +vg paths -x nesting/triple_nested.gfa -Q x --compute-augref --min-augref-len 100 > triple_augref_long.vg +is $(vg paths -x triple_augref_long.vg -L | grep "_alt$" | wc -l) 0 "min-augref-len filters out short fragments" + +# Test second pass coverage of dangling nodes (nodes outside snarls but on haplotype paths) +vg paths -x nesting/dangling_node.gfa -Q x --compute-augref --min-augref-len 1 > dangling_augref.vg +vg validate dangling_augref.vg +is $? 
0 "augref computation handles dangling nodes outside snarls" + +is $(vg paths -x dangling_augref.vg -L | grep "_alt$" | wc -l) 2 "augref second pass covers dangling nodes" + +# Verify the dangling node (node 5, 8bp) is covered by checking augref path lengths include 8bp +# Note: use -E with grep instead of -Q since augref paths are filtered from prefix matching +is $(vg paths -x dangling_augref.vg -E | grep "_alt" | awk '{sum+=$2} END {print sum}') 12 "augref paths cover both snarl node and dangling node" + +# Test --augref-segs option for writing segment table +vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 --augref-segs augref_test.segs > augref_segs_test.vg +is $? 0 "augref-segs option produces no error" + +is $(wc -l < augref_test.segs) 2 "augref-segs produces correct number of lines" + +is $(cut -f4 augref_test.segs | grep -c "x_.*_alt") 2 "augref-segs contains augref path names" + +is $(cut -f1 augref_test.segs | grep -c "#") 2 "augref-segs contains source path names with metadata" + +is $(cut -f5 augref_test.segs | grep -c "^x$") 2 "augref-segs contains reference path name" + +# Test that augref-segs requires compute-augref +vg paths -x nesting/nested_snp_in_ins.gfa -Q x -L --augref-segs augref_test.segs 2>&1 | grep -q "requires --compute-augref" +is $? 0 "augref-segs requires compute-augref option" + +# Test augref-segs with augref-sample option +vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 --augref-sample TESTSAMPLE --augref-segs augref_sample_test.segs > augref_sample_test.vg +is $(cut -f4 augref_sample_test.segs | grep -c "TESTSAMPLE") 2 "augref-segs uses augref-sample for path names" + +rm -f augref_test.vg triple_augref.vg triple_augref_long.vg dangling_augref.vg x.pg x.gbwt x.gbz +rm -f augref_test.segs augref_segs_test.vg augref_sample_test.segs augref_sample_test.vg - - From 83fe295f010591c31d715c0663167997771e6570 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Feb 2026 10:51:26 -0500 Subject: [PATCH 2/5] Update snarl caller and traversal clusters for nested snarl support Refactors TraversalSupportFinder and TraversalClusters to support nested snarl traversal. Adds augref path filtering to prevent augref paths from anchoring snarl decomposition. 
Co-Authored-By: Claude Opus 4.6 --- src/snarl_caller.cpp | 149 +++++++++++++++++++++---------------- src/snarl_caller.hpp | 5 +- src/traversal_clusters.cpp | 67 ++++++++++++++++- src/traversal_clusters.hpp | 21 +++++- 4 files changed, 176 insertions(+), 66 deletions(-) diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp index a6eebcbe93..9b4256ed4b 100644 --- a/src/snarl_caller.cpp +++ b/src/snarl_caller.cpp @@ -485,6 +485,17 @@ void PoissonSupportSnarlCaller::set_insertion_bias(double insertion_threshold, d } } +int PoissonSupportSnarlCaller::get_ref_trav_size(const vector& traversals, int ref_trav_idx) const { + int ref_trav_size = 0; + if (ref_trav_idx >= 0 && ref_trav_idx < (int)traversals.size()) { + const SnarlTraversal& ref_trav = traversals[ref_trav_idx]; + for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) { + ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); + } + } + return ref_trav_size; +} + pair, unique_ptr> PoissonSupportSnarlCaller::genotype(const Snarl& snarl, const vector& traversals, int ref_trav_idx, @@ -514,13 +525,7 @@ pair, unique_ptr> PoissonSupportSnarlCaller:: int max_trav_size = -1; vector supports = support_finder.get_traversal_set_support(traversals, {}, {}, {}, false, {}, {}, ref_trav_idx, &max_trav_size); - int ref_trav_size = 0; - if (ref_trav_idx >= 0) { - const SnarlTraversal& ref_trav = traversals[ref_trav_idx]; - for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) { - ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); - } - } + int ref_trav_size = get_ref_trav_size(traversals, ref_trav_idx); // sort the traversals by support vector ranked_traversals = rank_by_support(supports); @@ -596,11 +601,12 @@ pair, unique_ptr> PoissonSupportSnarlCaller:: // variance/std-err can be nan when binsize < 2. We just clamp it to 0 double depth_err = depth_info.second ? !isnan(depth_info.second) : 0.; - // genotype (log) likelihoods + // Single pass: find best genotype AND compute GQ/posterior values + vector best_genotype; double best_genotype_likelihood = -numeric_limits::max(); double second_best_genotype_likelihood = -numeric_limits::max(); double total_likelihood = 0; - vector best_genotype; + for (const auto& candidate : candidates) { double gl = genotype_likelihood(candidate, traversals, top_traversals, traversal_sizes, traversal_mapqs, ref_trav_idx, exp_depth, depth_err, max_trav_size, ref_trav_size); @@ -609,22 +615,22 @@ pair, unique_ptr> PoissonSupportSnarlCaller:: best_genotype_likelihood = gl; best_genotype = candidate; } else if (gl > second_best_genotype_likelihood) { - assert(gl <= best_genotype_likelihood); second_best_genotype_likelihood = gl; } total_likelihood = total_likelihood == 0 ? 
gl : add_log(total_likelihood, gl); } + // Compute CallInfo directly from tracked values PoissonCallInfo* call_info = new PoissonCallInfo(); call_info->posterior = 0; if (!candidates.empty()) { // compute the posterior from our likelihoods using a uniform prior - call_info->posterior = best_genotype_likelihood - log(candidates.size()) - total_likelihood; + call_info->posterior = best_genotype_likelihood - log(candidates.size()) - total_likelihood; } - - // GQ computed as here https://gatk.broadinstitute.org/hc/en-us/articles/360035890451?id=11075 - // as difference between best and second best likelihoods + + // GQ computed as difference between best and second best likelihoods + // https://gatk.broadinstitute.org/hc/en-us/articles/360035890451?id=11075 call_info->gq = 0; if (!isnan(best_genotype_likelihood) && !isnan(second_best_genotype_likelihood)) { call_info->gq = logprob_to_phred(second_best_genotype_likelihood) - logprob_to_phred(best_genotype_likelihood); @@ -633,7 +639,6 @@ pair, unique_ptr> PoissonSupportSnarlCaller:: call_info->expected_depth = exp_depth; call_info->depth_err = depth_err; call_info->max_trav_size = max_trav_size; - #ifdef debug cerr << " best genotype: "; for (auto a : best_genotype) {cerr << a <<",";} cerr << " gl=" << best_genotype_likelihood << endl; @@ -649,7 +654,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp int ref_trav_idx, double exp_depth, double depth_err, int max_trav_size, int ref_trav_size) { - + assert(genotype.size() == 1 || genotype.size() == 2); // get the genotype support @@ -786,10 +791,17 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, traversal_mapqs = support_finder.get_traversal_mapqs(traversals); } - // get the maximum size from the info - const SnarlCaller::CallInfo* s_call_info = call_info.get(); + // get the maximum size from the info (or compute from traversals if call_info is null) + int max_trav_size = 0; const PoissonCallInfo* p_call_info = dynamic_cast(call_info.get()); - int max_trav_size = p_call_info->max_trav_size; + if (p_call_info) { + max_trav_size = p_call_info->max_trav_size; + } else { + // No call info (e.g., derived genotype from parent) - compute from traversals + for (int size : traversal_sizes) { + max_trav_size = std::max(max_trav_size, size); + } + } int ref_trav_idx = 0; int ref_trav_size = 0; @@ -798,8 +810,17 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); } + // Filter out special marker values (negative indices like STAR_ALLELE_MARKER, MISSING_ALLELE_MARKER) + // from genotype before computing support + vector filtered_genotype; + for (int g : genotype) { + if (g >= 0 && g < (int)traversals.size()) { + filtered_genotype.push_back(g); + } + } + // get the genotype support - vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0, &max_trav_size); + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, filtered_genotype, {}, 0, &max_trav_size); // Get the depth of the site Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); @@ -829,30 +850,51 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // get the genotype likelihoods + // get the genotype likelihoods (only if we have call_info with expected depth) vector gen_likelihoods; - double gen_likelihood; - variant.format.push_back("GL"); - - 
assert(!isnan(p_call_info->expected_depth)); - // variance/std-err can be nan when binsize < 2. We just clamp it to 0 - double depth_err = !isnan(p_call_info->depth_err) ? p_call_info->depth_err : 0.; - + double gen_likelihood = 0; double total_likelihood = 0.; double ref_likelihood = 1.; double alt_likelihood = 0.; - if (genotype.size() == 2) { - // assume ploidy 2 - for (int i = 0; i < traversals.size(); ++i) { - for (int j = i; j < traversals.size(); ++j) { - double gl = genotype_likelihood({i, j}, traversals, {}, traversal_sizes, traversal_mapqs, + if (p_call_info) { + variant.format.push_back("GL"); + + assert(!isnan(p_call_info->expected_depth)); + // variance/std-err can be nan when binsize < 2. We just clamp it to 0 + double depth_err = !isnan(p_call_info->depth_err) ? p_call_info->depth_err : 0.; + + if (genotype.size() == 2) { + // assume ploidy 2 + for (int i = 0; i < traversals.size(); ++i) { + for (int j = i; j < traversals.size(); ++j) { + double gl = genotype_likelihood({i, j}, traversals, {}, traversal_sizes, traversal_mapqs, + ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size); + gen_likelihoods.push_back(gl); + if (vector({i, j}) == genotype || vector({j,i}) == genotype) { + gen_likelihood = gl; + } + if (i == 0 && j == 0) { + ref_likelihood = gl; + } else { + alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl); + } + total_likelihood = total_likelihood == 0 ? gl : add_log(total_likelihood, gl); + // convert from natural log to log10 by dividing by ln(10) + variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); + } + } + } else if (genotype.size() == 1) { + // assume ploidy 1 + // todo: generalize this iteration (as is, it is copy pased from above) + for (int i = 0; i < traversals.size(); ++i) { + double gl = genotype_likelihood({i}, traversals, {}, traversal_sizes, traversal_mapqs, ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size); gen_likelihoods.push_back(gl); - if (vector({i, j}) == genotype || vector({j,i}) == genotype) { + if (vector({i}) == genotype) { gen_likelihood = gl; } - if (i == 0 && j == 0) { + if (i == 0) { ref_likelihood = gl; } else { alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl); @@ -862,41 +904,22 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); } } - } else if (genotype.size() == 1) { - // assume ploidy 1 - // todo: generalize this iteration (as is, it is copy pased from above) - for (int i = 0; i < traversals.size(); ++i) { - double gl = genotype_likelihood({i}, traversals, {}, traversal_sizes, traversal_mapqs, - ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size); - gen_likelihoods.push_back(gl); - if (vector({i}) == genotype) { - gen_likelihood = gl; - } - if (i == 0) { - ref_likelihood = gl; - } else { - alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl); - } - total_likelihood = total_likelihood == 0 ? 
gl : add_log(total_likelihood, gl); - // convert from natural log to log10 by dividing by ln(10) - variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); - } - } - - variant.format.push_back("GQ"); - variant.samples[sample_name]["GQ"].push_back(std::to_string(min((int)256, max((int)0, (int)p_call_info->gq)))); - variant.format.push_back("GP"); - variant.samples[sample_name]["GP"].push_back(std::to_string(p_call_info->posterior)); + variant.format.push_back("GQ"); + variant.samples[sample_name]["GQ"].push_back(std::to_string(min((int)256, max((int)0, (int)p_call_info->gq)))); - variant.format.push_back("XD"); - variant.samples[sample_name]["XD"].push_back(std::to_string(p_call_info->expected_depth)); + variant.format.push_back("GP"); + variant.samples[sample_name]["GP"].push_back(std::to_string(p_call_info->posterior)); + + variant.format.push_back("XD"); + variant.samples[sample_name]["XD"].push_back(std::to_string(p_call_info->expected_depth)); + } // The QUAL field is the probability that we have variation as a PHRED score (of wrongness) // We derive this from the posterior probability of the reference genotype. // But if it's a reference call, we take the total of all the alts variant.quality = 0; - if (genotype.size() > 0) { + if (p_call_info && genotype.size() > 0) { // our flat prior and p[traversal coverage] double posterior = -log(gen_likelihoods.size()) - total_likelihood; if (!all_of(genotype.begin(), genotype.end(), [&](int a) {return a == 0;})) { diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 5ac2cd506e..ec730daf4d 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -221,7 +221,7 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { int ploidy, const string& ref_path_name, pair ref_range); - + /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, const vector& traversals, @@ -235,6 +235,9 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { protected: + /// Calculate the size of the reference traversal (excluding boundary nodes) + int get_ref_trav_size(const vector& traversals, int ref_trav_idx) const; + /// Compute likelihood of genotype as product of poisson probabilities /// P[allele1] * P[allle2] * P[uncalled alleles] /// Homozygous alleles are split into two, with half support each diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 1d8bbb6308..dc571fa74d 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -673,7 +673,8 @@ void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](path_handle_t path_handle) { - if (graph->get_path_name(path_handle).compare(0, ref_path_prefix.length(), ref_path_prefix) == 0) { + string path_name = graph->get_path_name(path_handle); + if (path_name.compare(0, ref_path_prefix.length(), ref_path_prefix) == 0) { ref_paths[path_handle] = 0; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(path_handle)))] += EXTRA_WEIGHT; @@ -776,5 +777,69 @@ void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const } } +unordered_map> add_star_traversals(vector& travs, + vector& names, + vector>& trav_clusters, + vector>& trav_cluster_info, + const 
unordered_map>& parent_haplotypes, + const set& sample_names) { + // Build a map of what haplotypes are already in the traversals + unordered_map> sample_to_haps; + + assert(names.size() == travs.size()); + for (int64_t i = 0; i < names.size(); ++i) { + string sample_name = PathMetadata::parse_sample_name(names[i]); + // for backward compatibility + if (sample_name.empty()) { + sample_name = names[i]; + } + auto phase = PathMetadata::parse_haplotype(names[i]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + // This probably won't fit in an int. Use 0 instead. + phase = 0; + } + sample_to_haps[sample_name].push_back(phase); + } + + // Find everything that's in parent_haplotypes but not the traversals, + // and add in dummy star-alleles for them + for (const auto& parent_sample_haps : parent_haplotypes) { + string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first); + if (parent_sample_name.empty()) { + parent_sample_name = parent_sample_haps.first; + } + if (!sample_names.count(parent_sample_name)) { + // don't bother for purely reference samples -- we don't need to force an allele for them. + continue; + } + for (int parent_hap : parent_sample_haps.second) { + bool found = false; + if (sample_to_haps.count(parent_sample_haps.first)) { + // note: this is brute-force search, but number of haplotypes usually tiny. + for (int hap : sample_to_haps[parent_sample_haps.first]) { + if (parent_hap == hap) { + found = true; + break; + } + } + } + if (!found) { + travs.push_back(Traversal()); + names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE, + parent_sample_haps.first, + "star", + parent_hap, + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE)); + sample_to_haps[parent_sample_haps.first].push_back(parent_hap); + trav_clusters.push_back({(int)travs.size() - 1}); + trav_cluster_info.push_back(make_pair(0, 0)); + } + } + } + + return sample_to_haps; +} + } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index e1f342ce65..3aba063cdd 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -106,12 +106,31 @@ void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const u /// for every snarl, bottom up, compute all the traversals through it. choose a reference traversal -/// (using the prefix where possible), then if +/// (using the prefix where possible), then if void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const string& ref_path_prefix, int64_t min_snarl_length, double min_jaccard, int64_t max_iterations, int64_t min_fragment_length); +/** + * Add star-allele traversals for haplotypes that appear in the parent snarl but are missing + * from the current snarl's traversals. This is used for nested calling to ensure that + * haplotypes spanning a parent site but not traversing a child site get a "*" allele. 
+ * + * @param travs Vector of traversals (modified in place to add star traversals) + * @param names Vector of path names corresponding to traversals (modified in place) + * @param trav_clusters Vector of cluster assignments (modified in place) + * @param trav_cluster_info Vector of cluster info (Jaccard, length delta) (modified in place) + * @param parent_haplotypes Map from sample name -> vector of haplotype phases in the parent + * @param sample_names Set of sample names to consider (reference-only samples are skipped) + * @return Map from sample name -> vector of haplotype phases in all traversals + */ +unordered_map> add_star_traversals(vector& travs, + vector& names, + vector>& trav_clusters, + vector>& trav_cluster_info, + const unordered_map>& parent_haplotypes, + const set& sample_names); } From 46ff8159d69cc8d1891f4056b2cf05393c8cdd64 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Feb 2026 10:51:34 -0500 Subject: [PATCH 3/5] Refactor deconstructor for nested snarl support Simplifies deconstructor by removing legacy nested snarl logic and adding star allele (-R) option for top-down nested decomposition. Adds RC/RS/RD info tags, improves memory usage for large graphs, and updates tests for nested and cyclic variant structures. Co-Authored-By: Claude Opus 4.6 --- src/deconstructor.cpp | 647 +++--------------- src/deconstructor.hpp | 100 +-- src/subcommand/deconstruct_main.cpp | 46 +- test/nesting/cyclic_ref_multiple_variants.gfa | 19 + test/t/26_deconstruct.t | 226 +++--- 5 files changed, 252 insertions(+), 786 deletions(-) create mode 100644 test/nesting/cyclic_ref_multiple_variants.gfa diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index b5924a1e6d..1aabb29080 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -622,70 +622,16 @@ unordered_map> Deconstructor::add_star_traversals(vector& names, vector>& trav_clusters, vector>& trav_cluster_info, - const unordered_map>& parent_haplotypes) const { - // todo: refactor this into general genotyping code - unordered_map> sample_to_haps; - - // find out what's in the traversals - assert(names.size() == travs.size()); - for (int64_t i = 0; i < names.size(); ++i) { - string sample_name = PathMetadata::parse_sample_name(names[i]); - // for backward compatibility - if (sample_name.empty()) { - sample_name = names[i]; - } - auto phase = PathMetadata::parse_haplotype(names[i]); - if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { - // THis probably won't fit in an int. Use 0 instead. - phase = 0; - } - sample_to_haps[sample_name].push_back(phase); - } - - // find everything that's in parent_haplotyes but not the travefsals, - // and add in dummy start-alleles for them - for (const auto& parent_sample_haps : parent_haplotypes) { - string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first); - if (parent_sample_name.empty()) { - parent_sample_name = parent_sample_haps.first; - } - if (!this->sample_names.count(parent_sample_name)) { - // dont' bother for purely reference samples -- we don't need to force and allele for them. - continue; - } - for (int parent_hap : parent_sample_haps.second) { - bool found = false; - if (sample_to_haps.count(parent_sample_haps.first)) { - // note: this is brute-force search, but number of haplotypes usually tiny. 
- for (int hap : sample_to_haps[parent_sample_haps.first]) { - if (parent_hap == hap) { - found = true; - break; - } - } - } - if (!found) { - travs.push_back(Traversal()); - names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE, - parent_sample_haps.first, - "star", - parent_hap, - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE)); - sample_to_haps[parent_sample_haps.first].push_back(parent_hap); - trav_clusters.push_back({(int)travs.size() - 1}); - trav_cluster_info.push_back(make_pair(0, 0)); - } - } - } - - return sample_to_haps; + const unordered_map>& parent_haplotypes) const { + // Delegate to the shared implementation in traversal_clusters.cpp + return vg::add_star_traversals(travs, names, trav_clusters, trav_cluster_info, + parent_haplotypes, this->sample_names); } bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, - const NestingInfo* in_nesting_info, - vector* out_nesting_infos) const { + const ChildContext* in_context, + vector* out_child_contexts) const { vector travs; @@ -707,16 +653,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // pick out the traversal corresponding to an embedded reference path, breaking ties consistently string ref_trav_name; - string parent_ref_trav_name; - if (in_nesting_info != nullptr && in_nesting_info->has_ref) { - parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); -#ifdef debug -#pragma omp critical (cerr) - cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl; -#endif - // remember it for the vcf header - this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); - } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; #ifdef debug @@ -730,20 +666,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << " trav=" << traversal_to_string(graph, travs[i], 100000) << endl; } #endif - bool ref_path_check; - if (!parent_ref_trav_name.empty()) { - // the reference was specified by the parent - ref_path_check = path_trav_name == parent_ref_trav_name; - } else { - // the reference comes from the global options - ref_path_check = ref_paths.count(path_trav_name); - } + bool ref_path_check = ref_paths.count(path_trav_name); if (ref_path_check && (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) - cerr << "Setting ref_trav_name " << ref_trav_name << (in_nesting_info ? 
" using nesting info" : "") << endl; + cerr << "Setting ref_trav_name " << ref_trav_name << endl; #endif } } @@ -781,14 +710,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; - } - - // remember the reference traversal of top-level snarls for doing second pass of cover computation - if (in_nesting_info != nullptr && !in_nesting_info->has_ref) { -#pragma omp critical (top_level_intervals) - this->top_level_ref_intervals[snarl_start] = trav_steps[ref_travs.at(0)]; } - + // there's not alt path through the snarl, so we can't make an interesting variant if (travs.size() < 2) { #ifdef debug @@ -798,26 +721,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t return false; } - if (ref_travs.size() > 1 && this->nested_decomposition) { -#ifdef debug -#pragma omp critical (cerr) - cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl; -#endif - size_t min_start_pos = numeric_limits::max(); - int64_t first_ref_trav; - for (int64_t i = 0; i < ref_travs.size(); ++i) { - auto& ref_trav_idx = ref_travs[i]; - step_handle_t start_step = trav_steps[ref_trav_idx].first; - step_handle_t end_step = trav_steps[ref_trav_idx].second; - size_t ref_trav_pos = min(graph->get_position_of_step(start_step), graph->get_position_of_step(end_step)); - if (ref_trav_pos < min_start_pos) { - min_start_pos = ref_trav_pos; - first_ref_trav = i; - } - } - ref_travs = {ref_travs[first_ref_trav]}; - } - // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles // we collect windows around the reference traversals @@ -955,13 +858,12 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; - vector child_snarl_to_trav; + vector unused_child_snarl_mapping; vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, - (in_nesting_info ? in_nesting_info->child_snarls : - vector>()), + vector>(), cluster_threshold, trav_cluster_info, - child_snarl_to_trav); + unused_child_snarl_mapping); #ifdef debug cerr << "cluster priority"; @@ -976,18 +878,14 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } cerr << " }" << endl; } - - if (in_nesting_info) { - for (int64_t ct = 0; ct < child_snarl_to_trav.size(); ++ct) { - cerr << "child " << ct << " " << graph_interval_to_string(graph, in_nesting_info->child_snarls[ct].first, - in_nesting_info->child_snarls[ct].second) - << " maps to trav " << child_snarl_to_trav[ct] << endl; - } - } #endif - unordered_map> sample_to_haps; - if (in_nesting_info != nullptr) { + // Track sample haplotypes for passing to children (used by star alleles) + unordered_map> sample_to_haps; + + // Only run nested-specific logic when we have context from parent + // (i.e., when using top-down processing, not flat processing) + if (in_context != nullptr) { // if the reference traversal is also an alt traversal, we pop out an extra copy // todo: this is a hack add add off-reference support while keeping the current // logic where the reference traversal is always distinct from the alts. 
this step @@ -1012,14 +910,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } assert(found_cluster == true); } - + // add in the star alleles -- these are alleles that were genotyped in the parent but not // the current allele, and are treated as *'s in VCF. - if (this->star_allele) { + if (this->star_allele && in_context != nullptr && !in_context->empty()) { sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, - in_nesting_info->sample_to_haplotypes); + *in_context->sample_to_haplotypes); } - } vector trav_to_allele = get_alleles(v, travs, trav_steps, @@ -1027,7 +924,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t trav_clusters, prev_char, use_start); - + #ifdef debug assert(trav_to_allele.size() == travs.size()); cerr << "trav_to_allele ="; @@ -1035,97 +932,45 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << " " << tta; } cerr << endl; -#endif +#endif // Fill in the genotypes get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); - // Fill in some nesting-specific (site-level) tags - NestingInfo ref_info; // since in_nesting_info is const, we put top-level stuff here - if (this->nested_decomposition) { - if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { - // if we're a child, just take what's passed in - ref_info.parent_allele = in_nesting_info->parent_allele; - ref_info.parent_len = in_nesting_info->parent_len; - ref_info.parent_ref_len = in_nesting_info->parent_ref_len; - ref_info.lv0_ref_name = in_nesting_info->lv0_ref_name; - ref_info.lv0_ref_start = in_nesting_info->lv0_ref_start; - ref_info.lv0_ref_len = in_nesting_info->lv0_ref_len; - ref_info.lv0_alt_len = in_nesting_info->lv0_alt_len; - } else { - // if we're a root, compute values from the prsent site - // todo: should they just be left undefined? - ref_info.parent_allele = 0; - ref_info.parent_len = v.alleles[0].length(); - ref_info.parent_ref_len = v.alleles[0].length(); - ref_info.parent_path_interval = trav_steps[ref_trav_idx]; - ref_info.parent_ref_interval = trav_steps[ref_trav_idx]; - ref_info.lv0_ref_name = v.sequenceName; - ref_info.lv0_ref_start = v.position; - ref_info.lv0_ref_len = v.alleles[0].length(); - ref_info.lv0_alt_len = v.alleles[ref_info.parent_allele].length(); - ref_info.lv = 0; - } - v.info["PA"].push_back(std::to_string(ref_info.parent_allele)); - v.info["PL"].push_back(std::to_string(ref_info.parent_len)); - v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len)); - v.info["RC"].push_back(ref_info.lv0_ref_name); - v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start)); - v.info["RD"].push_back(std::to_string(ref_info.lv0_ref_len)); - v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len)); - } - - if (i == 0 && out_nesting_infos != nullptr) { - // we pass some information down to the children - // todo: do/can we consider all the diferent reference intervals? - // right now, the info passed is hopefully coarse-grained enough not to matter? 
- assert(in_nesting_info != nullptr && - in_nesting_info->child_snarls.size() == out_nesting_infos->size()); - - for (int64_t j = 0; j < out_nesting_infos->size(); ++j) { - out_nesting_infos->at(j).child_snarls.clear(); - out_nesting_infos->at(j).has_ref = false; - out_nesting_infos->at(j).lv = in_nesting_info->lv + 1; - if (child_snarl_to_trav[j] >= 0) { - if (child_snarl_to_trav[j] < trav_steps.size()) { - NestingInfo& child_info = out_nesting_infos->at(j); - child_info.has_ref = true; - child_info.parent_path_interval = trav_steps[child_snarl_to_trav[j]]; - child_info.parent_ref_interval = trav_steps[ref_trav_idx]; - child_info.sample_to_haplotypes = sample_to_haps; - child_info.parent_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? - trav_to_allele[child_snarl_to_trav[j]] : 0; - child_info.parent_len = v.alleles[child_info.parent_allele].length(); - child_info.parent_ref_len = v.alleles[0].length(); - child_info.lv0_ref_name = ref_info.lv0_ref_name; - child_info.lv0_ref_start = ref_info.lv0_ref_start; - child_info.lv0_ref_len = ref_info.lv0_ref_len; - if (in_nesting_info == nullptr || in_nesting_info->has_ref == false) { - // we're the parent or root, so we want to set this here - child_info.lv0_alt_len = child_info.parent_len; - } else { - child_info.lv0_alt_len = ref_info.lv0_alt_len; - } + // Note: LV and PS tags are computed by update_nesting_info_tags() in write_variants() + // when include_nested is true (same as vg call) + + // Only populate child contexts when: + // 1. Star alleles are enabled (context only needed for star allele detection) + // 2. We have a place to put them (out_child_contexts != nullptr) + // 3. This is the first reference traversal (i == 0 to avoid duplicate work) + // 4. There's exactly one reference traversal (no cycles) - for cyclic references, + // we don't pass context to children and let them process independently + if (this->star_allele && i == 0 && out_child_contexts != nullptr && ref_travs.size() == 1) { + // Build sample_to_haps from ALL current traversals (for star alleles) + // This includes ALL haplotypes at the parent level, which is needed for + // add_star_traversals to detect missing haplotypes at nested sites + if (sample_to_haps.empty()) { + for (int64_t ti = 0; ti < trav_path_names.size(); ++ti) { + string sample_name = PathMetadata::parse_sample_name(trav_path_names[ti]); + if (sample_name.empty()) { + sample_name = trav_path_names[ti]; } + auto phase = PathMetadata::parse_haplotype(trav_path_names[ti]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + phase = 0; + } + sample_to_haps[sample_name].push_back(phase); } } - // update the path cover -#pragma omp critical (off_ref_info) - { - this->update_path_cover(trav_steps, trav_clusters, in_nesting_info->has_ref ? 
*in_nesting_info : ref_info); - - } - - // remember the reference path of this variant site - // for fasta output, along with information about its parent - int64_t ref_trav = ref_travs[i]; - const PathInterval& ref_path_interval = trav_steps[ref_trav]; - if (!this->ref_paths.count(graph->get_path_name(graph->get_path_handle_of_step(ref_path_interval.first)))) { -#pragma omp critical (off_ref_info) - { - //this->off_ref_sequences[ref_path_interval] = *in_nesting_info; - } + // Create a shared_ptr to the map so all children share the same data + // (avoids copying the map for each child, major memory optimization) + auto shared_haps = std::make_shared>>(std::move(sample_to_haps)); + + // Pass the shared map to ALL children + for (int64_t j = 0; j < out_child_contexts->size(); ++j) { + out_child_contexts->at(j).sample_to_haplotypes = shared_haps; } } @@ -1156,7 +1001,7 @@ string Deconstructor::get_vcf_header() { ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name)); } if (!long_ref_contig) { - long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || nested_decomposition; + long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || include_nested; } sample_names.clear(); unordered_map> sample_to_haps; @@ -1280,15 +1125,9 @@ string Deconstructor::get_vcf_header() { if (include_nested) { stream << "##INFO=" << endl; stream << "##INFO=" << endl; - } - if (this->nested_decomposition) { - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; } if (untangle_allele_traversals) { stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; @@ -1314,14 +1153,7 @@ string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const } set all_ref_paths = this->ref_paths; - - // add in the off-ref paths that nested deconstruction may have found - for (const unordered_set& off_ref_path_set : this->off_ref_paths) { - for (const path_handle_t& off_ref_path : off_ref_path_set) { - all_ref_paths.insert(graph->get_path_name(off_ref_path)); - } - } - + map ref_path_to_length; for(auto& refpath : all_ref_paths) { assert(graph->has_path(refpath)); @@ -1361,11 +1193,13 @@ string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const } void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { + // Simple flat processing of all snarls without context passing. + // More memory-efficient than top-down approach when star alleles aren't needed. 
vector snarls; vector queue; - // read all our snarls into a list + // Read all snarls into a list snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { queue.push_back(snarl); }); @@ -1381,8 +1215,8 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { swap(snarls, queue); } - // process the whole shebang in parallel -#pragma omp parallel for schedule(dynamic,1) + // Process all snarls in parallel (no context passing) +#pragma omp parallel for schedule(dynamic, 1) for (size_t i = 0; i < snarls.size(); i++) { deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()), graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward())); @@ -1391,46 +1225,34 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // logic copied from vg call (graph_caller.cpp) - + size_t thread_count = get_thread_count(); - this->off_ref_paths.clear(); - this->off_ref_paths.resize(get_thread_count()); // Used to recurse on children of parents that can't be called - vector>> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); - // Run the deconstructor on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl, NestingInfo nesting_info) { + // Run the deconstructor on a snarl, and queue up the children + auto process_snarl = [&](const Snarl* snarl, ChildContext context) { if (!snarl_manager->is_trivial(snarl, *graph)) { const vector& children = snarl_manager->children_of(snarl); - assert(nesting_info.child_snarls.empty()); - for (const Snarl* child : children) { - nesting_info.child_snarls.push_back(make_pair(graph->get_handle(child->start().node_id(), child->start().backward()), - graph->get_handle(child->end().node_id(), child->end().backward()))); - - } - vector out_nesting_infos(children.size()); - bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), - graph->get_handle(snarl->end().node_id(), snarl->end().backward()), - include_nested ? &nesting_info : nullptr, - include_nested ? &out_nesting_infos : nullptr); - if (include_nested || !was_deconstructed) { - vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + vector out_child_contexts(children.size()); + deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward()), + include_nested ? &context : nullptr, + include_nested ? 
&out_child_contexts : nullptr); + if (include_nested) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; for (int64_t i = 0; i < children.size(); ++i) { - thread_queue.push_back(make_pair(children[i], out_nesting_infos[i])); + thread_queue.push_back(make_pair(children[i], out_child_contexts[i])); } - - } + } } }; - // Start with the top level snarls - // (note, can't do for_each_top_level_snarl_parallel() because interface wont take nesting info) - vector> top_level_snarls; + // Start with the top level snarls (no parent context) + vector> top_level_snarls; snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - NestingInfo nesting_info; - nesting_info.has_ref = false; - nesting_info.lv = 0; - top_level_snarls.push_back(make_pair(snarl, nesting_info)); + ChildContext empty_context; + top_level_snarls.push_back(make_pair(snarl, empty_context)); }); #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < top_level_snarls.size(); ++i) { @@ -1439,9 +1261,9 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector>& snarl_vec) {return snarl_vec.empty();})) { - vector> cur_queue; - for (vector>& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); @@ -1451,15 +1273,6 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { process_snarl(cur_queue[i].first, cur_queue[i].second); } } - - - // if we are computing a cover, do a pass and scrape up everything we coudln't find in travesals - if (this->nested_decomposition) { - // note: not parallel since we'll be banging away at the same coverage map - for (const auto& top_level_snarl : top_level_snarls) { - fill_cover_second_pass(snarl_manager, top_level_snarl.first); - } - } } /** @@ -1474,12 +1287,11 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool long_ref_contig, double cluster_threshold, gbwt::GBWT* gbwt, - bool nested_decomposition, bool star_allele) { this->graph = graph; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested || nested_decomposition; + this->include_nested = include_nested; this->path_jaccard_window = context_jaccard_window; this->untangle_allele_traversals = untangle_traversals; this->keep_conflicted_genotypes = keep_conflicted; @@ -1490,18 +1302,17 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; - this->nested_decomposition = nested_decomposition; this->star_allele = star_allele; // the need to use nesting is due to a problem with omp tasks and shared state // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 
1) omp_set_nested(1); omp_set_max_active_levels(3); - + // create the traversal finder map reads_by_name; path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); - + if (gbwt != nullptr) { gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } @@ -1509,9 +1320,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand string hstr = this->get_vcf_header(); assert(output_vcf.openForOutput(hstr)); - if (nested_decomposition) { + // Use top-down processing only when star alleles are needed (requires context passing). + // Otherwise use simpler flat processing which is more memory-efficient. + if (star_allele) { + cerr << "[vg deconstruct] Using top-down processing (star alleles enabled)" << endl; deconstruct_graph_top_down(snarl_manager); } else { + cerr << "[vg deconstruct] Using flat processing" << endl; deconstruct_graph(snarl_manager); } @@ -1522,287 +1337,5 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand write_variants(cout, snarl_manager); } -void Deconstructor::update_path_cover(const vector>& trav_steps, - const vector>& traversal_clusters, - const NestingInfo& nesting_info) const { - // for every cluster, add off-reference sequences - // todo: are these in the best order? - for (const vector& trav_cluster : traversal_clusters) { - if (trav_cluster[0] >= trav_steps.size()) { - assert(this->star_allele); - continue; - } - const PathInterval& path_interval = trav_steps.at(trav_cluster[0]); - int64_t start = graph->get_position_of_step(path_interval.first); - int64_t end = graph->get_position_of_step(path_interval.second); - bool reversed = start > end; - - // scan the interval storing any uncovered sub-intervals - vector sub_intervals; - bool open_interval = false; - step_handle_t prev_step; - string prev_name; - for (step_handle_t step = path_interval.first; step != path_interval.second; - step = (reversed ? graph->get_previous_step(step) : graph->get_next_step(step))) { - if (this->node_cover.count(graph->get_id(graph->get_handle_of_step(step)))) { - if (open_interval) { - if (!this->ref_paths.count(prev_name)) { - // expects inclusive interval - step_handle_t end_step = reversed ? graph->get_next_step(step) : graph->get_previous_step(step); - sub_intervals.push_back(make_pair(prev_step, end_step)); - } - open_interval = false; - } - } else { - if (open_interval == false) { - prev_step = step; - prev_name = graph->get_path_name(graph->get_path_handle_of_step(prev_step)); - open_interval = true; - } - this->node_cover.insert(graph->get_id(graph->get_handle_of_step(step))); - } - } - - if (open_interval && !this->ref_paths.count(prev_name)) { - // expects inclusive interval - step_handle_t end_step = reversed ? 
graph->get_next_step(path_interval.second) : - graph->get_previous_step(path_interval.second); - sub_intervals.push_back(make_pair(prev_step, end_step)); - } - - // update the path cover - for (const PathInterval& interval : sub_intervals) { - // todo: store something - this->off_ref_sequences[interval] = nesting_info; - } - } -} - -static string resolve_path_name(const PathPositionHandleGraph* graph, - const PathInterval& path_interval, - int64_t& out_start, - int64_t& out_end, - bool& out_reversed) { - - path_handle_t path_handle = graph->get_path_handle_of_step(path_interval.first); - step_handle_t step1 = path_interval.first; - step_handle_t step2 = path_interval.second; - out_start = graph->get_position_of_step(step1); - out_end = graph->get_position_of_step(step2); - // until now, everything is oriented on the snarl - // but here we flip so that we're oriented on the path - out_reversed = out_start > out_end; - if (out_reversed) { - swap(step1, step2); - swap(out_start, out_end); - } - out_end += graph->get_length(graph->get_handle_of_step(step2)); - - - // apply the offset to the name - string path_name = graph->get_path_name(path_handle); - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); - - if (subrange == PathMetadata::NO_SUBRANGE) { - subrange.first = out_start; - subrange.second = out_end; - } else { - subrange.first += out_start; - subrange.second = subrange.first + (out_end - out_start); - } - if (phase_block == 0) { - phase_block = PathMetadata::NO_PHASE_BLOCK; - sense = PathSense::REFERENCE; - } - path_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); - - return path_name; -} - -void Deconstructor::fill_cover_second_pass(const SnarlManager* snarl_manager, const Snarl* snarl) const { - pair, unordered_set > contents = snarl_manager->deep_contents(snarl, *graph, false); - - // this is a simple brute-force way to fill in nodes that aren't covered by traversals, using - // path-name sort as sole metric - // todo: use someting more clever? 
- - handle_t snarl_start = graph->get_handle(snarl->start().node_id(), snarl->start().backward()); - if (!this->top_level_ref_intervals.count(snarl_start)) { - return; - } - - // collect all the candidate nodes - unordered_set uncovered_nodes; - // collect all the candidate paths - map path_map; - for (id_t node_id : contents.first) { - if (!this->node_cover.count(node_id)) { - vector> step_paths; - graph->for_each_step_on_handle(graph->get_handle(node_id), [&](step_handle_t step) { - path_handle_t path_handle = graph->get_path_handle_of_step(step); - string path_name = graph->get_path_name(path_handle); - if (this->ref_paths.count(path_name)) { - step_paths.clear(); - return false; - } else { - step_paths.push_back(make_pair(path_handle, path_name)); - } - return true; - }); - if (!step_paths.empty()) { - uncovered_nodes.insert(node_id); - this->node_cover.insert(node_id); - } - for (const auto& step_path : step_paths) { - path_map[step_path.second] = step_path.first; - } - } - } - - if (!uncovered_nodes.empty()) { - // fill up the reference metadata we need for the output table - NestingInfo nesting_info; - nesting_info.parent_ref_interval = this->top_level_ref_intervals[snarl_start]; - int64_t ref_start, ref_end; - bool ref_reversed; - string ref_path_name = resolve_path_name(graph, nesting_info.parent_ref_interval, - ref_start, ref_end, ref_reversed); - nesting_info.lv0_ref_start = ref_start; - nesting_info.lv0_ref_len = ref_end - ref_start; // todo : check - nesting_info.lv0_ref_name = Paths::strip_subrange(ref_path_name); - - // greedily add the paths by name, using similar logic to first pass - for (const auto& name_path : path_map) { - bool open_interval = false; - step_handle_t prev_step; - graph->for_each_step_in_path(name_path.second, [&](step_handle_t step) { - if (uncovered_nodes.count(graph->get_id(graph->get_handle_of_step(step)))) { - if (open_interval == false) { - open_interval = true; - prev_step = step; - } - uncovered_nodes.erase(graph->get_id(graph->get_handle_of_step(step))); - } else { - if (open_interval == true) { - this->off_ref_sequences[make_pair(prev_step, graph->get_previous_step(step))] = nesting_info; - open_interval = false; - } - } - }); - if (open_interval) { - this->off_ref_sequences[make_pair(prev_step, graph->path_back(name_path.second))] = nesting_info; - } - if (uncovered_nodes.empty()) { - break; - } - } - } - assert(uncovered_nodes.empty()); -} - -void Deconstructor::save_off_ref_sequences(const string& out_fasta_filename) const { - ofstream out_fasta_file(out_fasta_filename); - if (!out_fasta_file) { - cerr << "[deconstruct] error: Unable to open " << out_fasta_filename << " for writing" << endl; - exit(1); - } - - string metadata_filename = out_fasta_filename + ".nesting.tsv"; - ofstream out_metadata_file(metadata_filename); - if (!out_metadata_file) { - cerr << "[deconstruct] error: Unable to open " << metadata_filename << " for writing" << endl; - exit(1); - } - - // sort the sequences by name / pos - vector::const_iterator> sorted_map; - for (unordered_map::const_iterator i = this->off_ref_sequences.begin(); - i != this->off_ref_sequences.end(); ++i) { - sorted_map.push_back(i); - } - std::sort(sorted_map.begin(), sorted_map.end(), [&](unordered_map::const_iterator i1, - unordered_map::const_iterator i2) { - string n1 = graph->get_path_name(graph->get_path_handle_of_step(i1->first.first)); - string n2 = graph->get_path_name(graph->get_path_handle_of_step(i2->first.first)); - if (n1 != n2) { - return n1 < n2; - } - int64_t pos1 = 
graph->get_position_of_step(i1->first.first); - int64_t pos2 = graph->get_position_of_step(i2->first.first); - return pos1 < pos2; - }); - - // merge the intervals if they overlap - // this can happen, ex, in consecutive snarls along a chain where - // they one snarl starts where the previous ends, and keeping them - // separate would lead to an overlap between the two paths - vector> merged_intervals; - for (const auto& i : sorted_map) { - bool merged = false; - const PathInterval& cur_interval = i->first; - if (!merged_intervals.empty()) { - PathInterval& prev_interval = merged_intervals.back().first; - if (cur_interval.first == prev_interval.second) { - assert(i->second.parent_path_interval == merged_intervals.back().second->parent_path_interval); - prev_interval.second = cur_interval.second; - merged = true; - } else if (cur_interval.second == prev_interval.first) { - assert(i->second.parent_path_interval == merged_intervals.back().second->parent_path_interval); - prev_interval.first = cur_interval.first; - merged = true; - } else { - assert(cur_interval.first != prev_interval.first); - assert(cur_interval.second != prev_interval.second); - } - } - if (!merged) { - merged_intervals.push_back(make_pair(cur_interval, &i->second)); - } - } - - for (const auto i : merged_intervals) { - const PathInterval& path_interval = i.first; - int64_t pos1; - int64_t pos2; - bool is_reversed; - string path_name = resolve_path_name(this->graph, path_interval, pos1, pos2, is_reversed); - string path_sequence; - // write the path as a fasta string - step_handle_t step1 = is_reversed ? path_interval.second : path_interval.first; - step_handle_t step2 = is_reversed ? path_interval.first : path_interval.second; - for (step_handle_t step = step1; step != graph->get_next_step(step2); - step = graph->get_next_step(step)) { - path_sequence += graph->get_sequence(graph->get_handle_of_step(step)); - } - write_fasta_sequence(path_name, path_sequence, out_fasta_file); - - string snarl_name = graph_interval_to_string(graph, graph->get_handle_of_step(path_interval.first), - graph->get_handle_of_step(path_interval.second)); - - // write a corresponding metadata record in the tsv with the nesting info in it - const NestingInfo& nesting_info = *i.second; - int64_t par_pos1; - int64_t par_pos2; - bool par_reversed; - - // note: we subtract 1 below to convert from 1-based vcf to 0-based bed - out_metadata_file << Paths::strip_subrange(path_name) << "\t" - << pos1 << "\t" - << pos2 << "\t" - << snarl_name << "\t" - << nesting_info.lv0_ref_name << "\t" - << (nesting_info.lv0_ref_start -1) << "\t" - << (nesting_info.lv0_ref_start + nesting_info.lv0_ref_len -1) - << endl; - - } -} - } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 8a7c19f82b..66bd44d47a 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "genotypekit.hpp" #include "Variant.h" #include "handle.hpp" @@ -47,13 +48,7 @@ class Deconstructor : public VCFOutputCaller { bool long_ref_contig, double cluster_threshold = 1.0, gbwt::GBWT* gbwt = nullptr, - bool nested_decomposition = false, bool star_allele = false); - - // write the off-reference sequences to a fasta file - // todo: would really like to refactor so that this can somehow - // go into vg paths. - void save_off_ref_sequences(const string& out_fasta_filename) const; private: @@ -65,39 +60,37 @@ class Deconstructor : public VCFOutputCaller { // construction. 
end result: this hacky function to patch them in before printing string add_contigs_to_vcf_header(const string& vcf_header) const; - // deconstruct all snarls in parallel (ie nesting relationship ignored) + // Simple flat processing: collect all snarls then process in parallel. + // More memory-efficient when star alleles aren't needed. void deconstruct_graph(SnarlManager* snarl_manager); - // deconstruct all top-level snarls in parallel - // nested snarls are processed after their parents in the same thread - // (same logic as vg call) + // Top-down processing: process snarls level by level, passing context to children. + // Required for star allele detection (context tracks parent haplotypes). void deconstruct_graph_top_down(SnarlManager* snarl_manager); - // some information we pass from parent to child site when - // doing nested deconstruction - struct NestingInfo { - bool has_ref; // flag essentially determines if struct is initalized - vector> child_snarls; // children of parent - PathInterval parent_path_interval; // parent path - PathInterval parent_ref_interval; // reference path of parent site - unordered_map> sample_to_haplotypes; // parent genotpyes - int parent_allele; // parent allele - int64_t parent_len; // length of parent allele - int64_t parent_ref_len; // length of reference allele at parent site - string lv0_ref_name; // reference path name at top level - int64_t lv0_ref_start; // reference start position at top level - int64_t lv0_ref_len; // reference allele length at top level - int64_t lv0_alt_len; // alt allele length at top level - int64_t lv; // level + // Context passed from parent to child site for nested deconstruction. + // Used for star allele detection: samples in parent but not in child get * allele. + // Note: context is only populated when star_allele is enabled. + // Note: context is NOT passed when parent has multiple reference traversals (cycles). + struct ChildContext { + // Map from sample name to vector of haplotype phases that traversed the parent snarl. + // Child compares against its own traversals to detect star alleles. + // Format matches what add_star_traversals() expects. + // Uses shared_ptr so all children of a snarl share the same map (memory optimization). + std::shared_ptr>> sample_to_haplotypes; + + // Check if context is populated + bool empty() const { return !sample_to_haplotypes || sample_to_haplotypes->empty(); } }; - + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) - // the nesting_info structs are optional and used to pass reference information through nested sites... 
- // the output nesting_info vector writes a record for each child snarl + // in_context: optional context from parent (for star allele detection) + // out_child_contexts: if provided, populated with contexts for each child snarl + // (only populated when site has exactly one reference traversal) bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, - const NestingInfo* in_nesting_info = nullptr, - vector* out_nesting_infos = nullptr) const; + const ChildContext* in_context = nullptr, + vector* out_child_contexts = nullptr) const; // get the traversals for a given site // this returns a combination of embedded path traversals and gbwt traversals @@ -108,15 +101,10 @@ class Deconstructor : public VCFOutputCaller { vector& out_trav_path_names, vector>& out_trav_steps) const; - // this is a hack to add in * alleles -- these are haplotypes that we genotyped in the - // parent but aren't represented in any of the traversals found in the current - // site. *-alleles are represented as empty traversals. - // todo: conflicts arising from alt-cycles will be able to lead to conflicting - // results -- need to overhaul code to pass more detailed traversal information - // from parent to child to have a chance at consistently resolving - // star traversals are appended onto travs and trav_names - // this funtion returns a map containing both parent and child haploty - unordered_map> add_star_traversals(vector& travs, + // Add star (*) alleles for haplotypes that were genotyped in the parent but don't + // have a traversal through the current site. Star alleles are represented as empty + // traversals. Returns a map of sample name to haplotype phases (both parent and child). + unordered_map> add_star_traversals(vector& travs, vector& trav_names, vector>& trav_clusters, vector>& trav_cluster_info, @@ -142,14 +130,6 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // update off-ref path cover - void update_path_cover(const vector>& trav_steps, - const vector>& traversal_clusters, - const NestingInfo& nesting_info) const; - - // second pass to add non-traversal fragments to cover - void fill_cover_second_pass(const SnarlManager* snarl_manager, const Snarl* snarl) const; - // the underlying context-getter vector get_context( step_handle_t start_step, @@ -174,10 +154,6 @@ class Deconstructor : public VCFOutputCaller { // the ref paths set ref_paths; - // the off-ref paths that may be found during nested deconstruction - // (buffered by thread) - mutable vector> off_ref_paths; - // keep track of reference samples set ref_samples; @@ -210,24 +186,10 @@ class Deconstructor : public VCFOutputCaller { // merge if identical (which is what deconstruct has always done) double cluster_threshold = 1.0; - // activate the new nested decomposition mode, which is like the old include_nested - // (which lives in vcfoutputcaller) but with more of an effort to link - // the parent and child snarls, as well as better support for nested insertions - bool nested_decomposition = false; - - // use *-alleles to represent spanning alleles that do not cross site but do go around it - // ex: a big containing deletion - // only works with nested_decomposition + // use *-alleles to represent haplotypes that span the parent site but don't + // traverse the current nested site (e.g., a deletion that spans a nested SNP) + // only works with include_nested bool star_allele = false; - - // keep track of off-reference reference sequences to print to 
fasta at the end - mutable unordered_map off_ref_sequences; - - // keep track of path cover for computing off_ref_sequences - mutable unordered_set node_cover; - - // keep track of top-level reference traversals for second pass of cover - mutable unordered_map top_level_ref_intervals; }; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index fef15ea05e..844dcbf417 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -14,6 +14,7 @@ #include "../vg.hpp" #include "../deconstructor.hpp" +#include "../augref.hpp" #include "../integrated_snarl_finder.hpp" #include "../gbwtgraph_helper.hpp" #include "../gbwt_helper.hpp" @@ -51,6 +52,7 @@ void help_deconstruct(char** argv) { << " snarl names to snarl names and AT fields in output" << endl << " -a, --all-snarls process all snarls, including nested snarls" << endl << " (by default only top-level snarls reported)." << endl + << " Uses hierarchical processing and writes LV/PS tags." << endl << " -c, --context-jaccard N set context mapping size used to disambiguate alleles" << endl << " at sites with multiple reference traversals [10000]" << endl << " -u, --untangle-travs use context mapping for reference-relative positions" << endl @@ -62,10 +64,8 @@ void help_deconstruct(char** argv) { << " for reference if possible (i.e. only one ref sample)" << endl << " -L, --cluster F cluster traversals whose (handle) Jaccard coefficient" << endl << " is >= F together [1.0; experimental]" << endl - << " -n, --nested write a nested VCF, plus special tags [experimental]" << endl - << " -f, --nested-fasta F Write off-reference FASTA to F (and some indexing" << endl - << " information to F.nesting.tsv) [experimental]" << endl - << " -R, --star-allele use *-alleles to denote alleles that span" << endl + << " -R, --star-allele use *-alleles to represent haplotypes that span the" << endl + << " parent but don't traverse nested sites (requires -a)" << endl << " -t, --threads N use N threads" << endl << " -v, --verbose print some status messages" << endl << " -h, --help print this help message to stderr and exit" << endl; @@ -94,10 +94,8 @@ int main_deconstruct(int argc, char** argv) { bool untangle_traversals = false; bool contig_only_ref = false; double cluster_threshold = 1.0; - bool nested = false; - string nested_fasta_file_name; bool star_allele = false; - + int c; optind = 2; // force optind past command positional argument while (true) { @@ -120,8 +118,6 @@ int main_deconstruct(int argc, char** argv) { {"strict-conflicts", no_argument, 0, 'S'}, {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, - {"nested", no_argument, 0, 'n'}, - {"nested-fasta", required_argument, 0, 'f'}, {"star-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, @@ -129,7 +125,7 @@ int main_deconstruct(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "h?p:P:H:r:g:T:OeKSCd:c:uaL:nf:Rt:v", + c = getopt_long (argc, argv, "h?p:P:H:r:g:T:OeKSCd:c:uaL:Rt:v", long_options, &option_index); // Detect the end of the options. 
@@ -186,12 +182,6 @@ int main_deconstruct(int argc, char** argv) { case 'L': cluster_threshold = max(0.0, min(1.0, parse(optarg))); break; - case 'n': - nested = true; - break; - case 'f': - nested_fasta_file_name = ensure_writable(logger, optarg); - break; case 'R': star_allele = true; break; @@ -212,13 +202,13 @@ int main_deconstruct(int argc, char** argv) { } - if (nested == true && contig_only_ref == true) { - logger.error() << "-C cannot be used with -n" << endl; + if (all_snarls == true && contig_only_ref == true) { + logger.error() << "-C cannot be used with -a" << endl; } - if (star_allele == true && nested == false) { - logger.error() << "-R can only be used with -n" << endl; + if (star_allele == true && all_snarls == false) { + logger.error() << "-R can only be used with -a" << endl; } - + // Read the graph unique_ptr path_handle_graph_up; unique_ptr gbz_graph; @@ -302,6 +292,7 @@ int main_deconstruct(int argc, char** argv) { bool found_hap = false; // No paths specified: use all reference and non-alt generic paths as reference to deconstruct against. + // Altpaths are included as reference paths (priority given to non-altpaths in deconstructor.cpp) graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](path_handle_t path_handle) { const string& name = graph->get_path_name(path_handle); if (!Paths::is_alt(name)) { @@ -348,10 +339,11 @@ int main_deconstruct(int argc, char** argv) { } // process the prefixes to find ref paths + // Altpaths are included (priority given to non-altpaths in deconstructor.cpp) if (!refpath_prefixes.empty()) { graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](const path_handle_t& path_handle) { string path_name = graph->get_path_name(path_handle); - for (auto& prefix : refpath_prefixes) { + for (auto& prefix : refpath_prefixes) { if (path_name.compare(0, prefix.size(), prefix) == 0) { refpaths.push_back(path_name); break; @@ -372,6 +364,10 @@ int main_deconstruct(int argc, char** argv) { std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; for (const string& refpath_name : refpaths) { + // Skip altpaths (they shouldn't influence snarl decomposition) + if (AugRefCover::is_augref_name(refpath_name)) { + continue; + } path_handle_t refpath_handle = graph->get_path_handle(refpath_name); extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(refpath_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(refpath_handle)))] += EXTRA_WEIGHT; @@ -398,7 +394,7 @@ int main_deconstruct(int argc, char** argv) { logger.info() << "Deconstructing top-level snarls" << endl; } dd.set_translation(translation.get()); - dd.set_nested(all_snarls || nested); + dd.set_nested(all_snarls); dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, @@ -408,12 +404,8 @@ int main_deconstruct(int argc, char** argv) { !contig_only_ref, cluster_threshold, gbwt_index, - nested, star_allele); - if (!nested_fasta_file_name.empty()) { - dd.save_off_ref_sequences(nested_fasta_file_name); - } return 0; } diff --git a/test/nesting/cyclic_ref_multiple_variants.gfa b/test/nesting/cyclic_ref_multiple_variants.gfa new file mode 100644 index 0000000000..9bc132988b --- /dev/null +++ b/test/nesting/cyclic_ref_multiple_variants.gfa @@ -0,0 +1,19 @@ +H VN:Z:1.0 +S 1 AAATTTTCTGGAGTTCTAT +S 2 A +S 3 T +S 4 G +S 5 ATAT +S 6 T +S 7 CCAACTCTCTG +L 1 + 2 + 0M +L 1 + 3 + 0M +L 1 + 4 + 0M +L 2 + 5 + 0M +L 3 + 5 + 0M +L 4 + 5 + 0M +L 
5 + 1 + 0M +L 5 + 6 + 0M +L 6 + 7 + 0M +P x 1+,2+,5+,1+,3+,5+,6+,7+ * +P a#1#z0 1+,3+,5+,1+,4+,5+,6+,7+ * diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index b2d7fe28ef..190bbceca3 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 74 +plan tests 53 vg msga -f GRCh38_alts/FASTA/HLA/V-352962.fa -t 1 -k 16 | vg mod -U 10 - | vg mod -c - > hla.vg vg index hla.vg -x hla.xg @@ -158,169 +158,129 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf -vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf -grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv -printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv -printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv -diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv -is "$?" 0 "nested deconstruction gets correct star-allele for snp inside deletion" - -is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both sites of neseted deletion" +# Nesting tests now use a two-step process: +# 1. Compute augref cover with vg paths +# 2. Run vg deconstruct with -a to use the pre-computed augref paths -rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv - -vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf +# Test: SNP inside deletion +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_del.gfa -Q x > nested_snp_in_del.augref.pg +vg deconstruct nested_snp_in_del.augref.pg -p x -a > nested_snp_in_del.vcf grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv -is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)" +is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion" -rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv +rm -f nested_snp_in_del.augref.pg nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf +# Test: SNP inside insertion with LV field checks +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins.gfa -Q x > nested_snp_in_ins.augref.pg +vg deconstruct nested_snp_in_ins.augref.pg -P x -a > nested_snp_in_ins.vcf grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv -printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv -printf "C\tCAAG,CATG\t1|2\n" >> nested_snp_in_ins_truth.tsv +# With -P x, nested variants are on augref contigs (x_1_alt), parent on x +# So order is: insertion (on x) then SNP (on x_1_alt) +printf "C\tCAAG,CATG\t1|2\n" > nested_snp_in_ins_truth.tsv +printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv is "$?" 
0 "nested deconstruction gets correct allele for snp inside insert" -is $(grep LV=0 nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion" -is $(grep LV=1 nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion" - -grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv -printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv -printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv -diff nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv -is "$?" 0 "nested deconstruction gets correct contigs in vcf header for snp inside insert" - -rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv - -vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv -printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv -printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv -diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv -is "$?" 0 "nested deconstruction gets correct star allele for snp ins2.ide ins2.ert" +is $(grep LV=0 nested_snp_in_ins.vcf | wc -l) 1 "LV=0 set for base allele of nested insertion" +is $(grep LV=1 nested_snp_in_ins.vcf | wc -l) 1 "LV=1 set for nested allele of nested insertion" -rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv +# With -P x, we get multiple contigs (x, x_1_alt, x_2_alt) +is $(grep -c "^##contig" nested_snp_in_ins.vcf) 3 "nested deconstruction gets all reference contigs in vcf header" -# todo: the integrated snarl finder doesnt anchor to the reference -# probably not an issue on most real graphs from vg construct / minigraph cacuts -# but seems like something that needs reviewing -vg deconstruct nesting/nested_snp_in_nested_ins.gfa -P x -n > nested_snp_in_nested_ins.vcf -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RD=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RD=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" +rm -f nested_snp_in_ins.augref.pg nested_snp_in_ins.vcf nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv -rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf +# Test: Double-nested SNP +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_nested_ins.gfa -Q x > nested_snp_in_nested_ins.augref.pg +vg deconstruct nested_snp_in_nested_ins.augref.pg -P x -a > nested_snp_in_nested_ins.vcf +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | wc -l) 1 "level 0 site found in double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep "LV=" | wc -l) 3 "all nested sites found in double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | wc -l) 1 "level 2 site found in double-nested SNP" -vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > 
nested_snp_in_ins_cycle.vcf -printf "A#1#y0\t3\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv -printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv -grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv -diff nested_snp_in_ins_cycle.tsv nested_snp_in_ins_cycle_truth.tsv -is "$?" 0 "nested deconstruction handles cycle" +rm -f nested_snp_in_nested_ins.augref.pg nested_snp_in_nested_ins.vcf -rm -f nested_snp_in_ins_cycle.vcf nested_snp_in_ins_cycle_truth.tsv nested_snp_in_ins_cycle_truth.tsv +# Test: Nested site with cycle +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins_cycle.gfa -Q x > nested_snp_in_ins_cycle.augref.pg +vg deconstruct nested_snp_in_ins_cycle.augref.pg -P x -a > nested_snp_in_ins_cycle.vcf +is $(grep -v ^# nested_snp_in_ins_cycle.vcf | grep LV=0 | wc -l) 1 "level 0 found in nested cycle" +is $(grep -v ^# nested_snp_in_ins_cycle.vcf | grep LV=1 | wc -l) 1 "level 1 found in nested cycle" +rm -f nested_snp_in_ins_cycle.augref.pg nested_snp_in_ins_cycle.vcf -vg snarls nesting/mnp.gfa --algorithm cactus > mnp.snarls -vg deconstruct nesting/mnp.gfa -r mnp.snarls -p x -n -f mnp.fa > mnp.vcf +# Test: MNP handling +vg paths --compute-augref --min-augref-len 0 -x nesting/mnp.gfa -Q x > mnp.augref.pg +vg deconstruct mnp.augref.pg -p x -a > mnp.vcf printf "x\t3\t>2>7\tTCAT\tATTT\n" > mnp_truth.tsv grep -v ^# mnp.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > mnp.tsv diff mnp_truth.tsv mnp.tsv is "$?" 0 "nested deconstruction handles mnp" -printf "a#1#y0\t2\t6\t>8>11\tx\t2\t6\n" > mnp.nesting.truth.tsv -diff mnp.fa.nesting.tsv mnp.nesting.truth.tsv -is "$?" 0 "nested deconstruction makes correct mnp tsv" -printf ">a#1#y0[2-6]\nATTT\n" > mnp.fa.truth -diff mnp.fa mnp.fa.truth -is "$?" 
0 "nested deconstruction makes correct fasta" -rm -f mnp.snarls mnp.vcf mnp_truth.tsv mnp.tsv mnp.nesting.truth.tsv mnp.fa.truth +rm -f mnp.augref.pg mnp.vcf mnp_truth.tsv mnp.tsv # Test 1: Deep nesting (3+ levels) - triple nested SNP -vg deconstruct nesting/triple_nested.gfa -p x -nR > triple_nested.vcf +vg paths --compute-augref --min-augref-len 0 -x nesting/triple_nested.gfa -Q x > triple_nested.augref.pg +vg deconstruct triple_nested.augref.pg -P x -a > triple_nested.vcf is $(grep -v ^# triple_nested.vcf | grep LV=0 | wc -l) 1 "level 0 site found in triple nested" -is $(grep -v ^# triple_nested.vcf | grep LV=1 | wc -l) 1 "level 1 site found in triple nested" +is $(grep -v ^# triple_nested.vcf | grep "LV=" | wc -l) 5 "all nested sites found in triple nested" is $(grep -v ^# triple_nested.vcf | grep LV=2 | wc -l) 1 "level 2 site found in triple nested" is $(grep -v ^# triple_nested.vcf | grep LV=3 | wc -l) 1 "level 3 site found in triple nested" -is $(grep "LV=3" triple_nested.vcf | grep "RC=x" | wc -l) 1 "top-level reference propagates to level 3" -rm -f triple_nested.snarls triple_nested.vcf +is $(grep -v ^# triple_nested.vcf | grep LV=4 | wc -l) 1 "level 4 site found in triple nested" +rm -f triple_nested.augref.pg triple_nested.vcf # Test 2: Multiple children at same level - insertion with 2 nested SNPs -vg deconstruct nesting/insertion_with_three_snps.gfa -p x -n > multi_child.vcf -is $(grep -v ^# multi_child.vcf | grep LV=0 | wc -l) 1 "parent site with multiple children found" +vg paths --compute-augref --min-augref-len 0 -x nesting/insertion_with_three_snps.gfa -Q x > insertion_with_three_snps.augref.pg +vg deconstruct insertion_with_three_snps.augref.pg -P x -a > multi_child.vcf +is $(grep -v ^# multi_child.vcf | grep "LV=" | wc -l) 3 "expected number of sites with LV field" is $(grep -v ^# multi_child.vcf | grep LV=1 | wc -l) 2 "two child SNPs found at level 1" -rm -f multi_child.vcf insertion_with_three_snps.snarls - -# Test 3: NestingInfo field propagation - verify all INFO tags -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > field_check.vcf -is $(grep "LV=1" field_check.vcf | grep -o "PA=[0-9]*" | cut -d= -f2) 1 "PA field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep -o "PL=[0-9]*" | cut -d= -f2) 4 "PL field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep "RC=x" | wc -l) 1 "RC field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep "RS=" | wc -l) 1 "RS field present for nested site" -is $(grep "LV=1" field_check.vcf | grep "RD=" | wc -l) 1 "RD field present for nested site" -is $(grep "LV=1" field_check.vcf | grep "RL=" | wc -l) 1 "RL field present for nested site" -rm -f field_check.vcf - -# Test 4: FASTA output for off-reference sequences -vg deconstruct nesting/consecutive_nested.gfa -p x -nf consecutive.fa > consecutive.vcf -test -s consecutive.fa && is "$?" 0 "consecutive nested graph generates off-reference FASTA" -test -s consecutive.fa.nesting.tsv && is "$?" 
0 "consecutive nested graph generates nesting TSV" -rm -f consecutive.vcf consecutive.fa consecutive.fa.nesting.tsv - -# Test 5: Multiple reference traversals with nesting (should reduce to one) -vg deconstruct nesting/cyclic_ref_nested.gfa -p x -n > cyclic_ref_nested.vcf +rm -f insertion_with_three_snps.augref.pg multi_child.vcf + +# Test 3: NestingInfo field propagation - verify LV field +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins.gfa -Q x > field_check.augref.pg +vg deconstruct field_check.augref.pg -P x -a > field_check.vcf +is $(grep "LV=0" field_check.vcf | wc -l) 1 "LV=0 field present for top-level site" +rm -f field_check.augref.pg field_check.vcf + +# Test 4: Multiple reference traversals with nesting (should reduce to one) +vg paths --compute-augref --min-augref-len 0 -x nesting/cyclic_ref_nested.gfa -Q x > cyclic_ref_nested.augref.pg +vg deconstruct cyclic_ref_nested.augref.pg -p x -a > cyclic_ref_nested.vcf is $(grep -v ^# cyclic_ref_nested.vcf | wc -l) 1 "cyclic reference with nesting produces single variant" -rm -f cyclic_ref_nested.vcf - -# Tests for -f option (off-reference FASTA output) - -# Test 6: -f with -n produces FASTA output -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf basic_test.fa > basic_test.vcf -test -s basic_test.fa && is "$?" 0 "-f with -n produces FASTA file" -test -s basic_test.fa.nesting.tsv && is "$?" 0 "-f with -n produces TSV file" -rm -f basic_test.fa basic_test.fa.nesting.tsv basic_test.vcf - -# Test 7: Deep nesting with FASTA output - verify nesting info propagates -vg deconstruct nesting/triple_nested.gfa -p x -nRf triple_nested.fa > triple_nested.vcf -test -s triple_nested.fa && is "$?" 0 "triple nested graph generates FASTA with -f" -test -s triple_nested.fa.nesting.tsv && is "$?" 0 "triple nested graph generates nesting TSV with -f" -is $(wc -l < triple_nested.fa.nesting.tsv) 2 "triple nested TSV has 2 entries (merged by shared reference)" -is $(awk '{print NF}' triple_nested.fa.nesting.tsv | uniq) 7 "nesting TSV has 7 columns" -is $(cut -f5 triple_nested.fa.nesting.tsv | uniq) x "all TSV entries reference top-level contig x" -rm -f triple_nested.snarls triple_nested.vcf triple_nested.fa triple_nested.fa.nesting.tsv - -# Test 8: Verify FASTA sequences match graph paths -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf nested_ins_test.fa > nested_ins_test.vcf -is $(grep -c "^>" nested_ins_test.fa) 2 "nested insertion produces 2 FASTA sequences" -# Check that sequences are non-empty -is $(grep -v "^>" nested_ins_test.fa | grep -c ".") 2 "FASTA sequences are non-empty" -# Verify TSV has matching entries -is $(wc -l < nested_ins_test.fa.nesting.tsv) 2 "nesting TSV has matching number of entries" -rm -f nested_ins_test.vcf nested_ins_test.fa nested_ins_test.fa.nesting.tsv - -# Test 9: Multiple children - verify all off-ref paths captured -vg snarls -A cactus nesting/insertion_with_three_snps.gfa > multi_snps.snarls -vg deconstruct nesting/insertion_with_three_snps.gfa -p x -n -r multi_snps.snarls -f multi_snps.fa > multi_snps.vcf -test -s multi_snps.fa && is "$?" 
0 "multi-child graph generates FASTA" -# Should have sequences for the insertion alleles -is $(grep -c "^>" multi_snps.fa) 2 "multi-child graph produces expected FASTA sequences" -rm -f multi_snps.snarls multi_snps.vcf multi_snps.fa multi_snps.fa.nesting.tsv - -# Test 10: Verify TSV columns contain valid data -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf tsv_test.fa > tsv_test.vcf -# Column 1: path name should contain # -is $(cut -f1 tsv_test.fa.nesting.tsv | grep -c "#") 2 "TSV column 1 contains haplotype path names" -# Column 2,3: positions should be integers -is $(cut -f2 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 2 contains valid positions" -is $(cut -f3 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 3 contains valid positions" -# Column 5: should be reference name (x) -is $(cut -f5 tsv_test.fa.nesting.tsv | uniq) x "TSV column 5 is top-level reference name" -# Column 6,7: should be 0-based positions -is $(cut -f6 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 6 contains valid reference positions" -is $(cut -f7 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 7 contains valid reference positions" -rm -f tsv_test.vcf tsv_test.fa tsv_test.fa.nesting.tsv +rm -f cyclic_ref_nested.augref.pg cyclic_ref_nested.vcf + +# Test 5: Cyclic reference outputs multiple variants (one per reference traversal) +# Reference x visits the snarl twice (via node 2 then 3), alt a visits twice (via node 3 then 4) +# With -c 0 to disable context-jaccard (which doesn't work well on tiny graphs), we should get 2 variants +vg deconstruct nesting/cyclic_ref_multiple_variants.gfa -p x -a -c 0 > cyclic_ref_multi.vcf +is $(grep -v ^# cyclic_ref_multi.vcf | wc -l) 2 "cyclic reference with -a outputs variant for each reference traversal" +rm -f cyclic_ref_multi.vcf + +# ============================================================================= +# RC, RS, RD tag tests (reference coordinate tags for nested snarls) +# ============================================================================= + +# Test: RC, RS, RD headers and tags in deconstruct output +vg paths --compute-augref --min-augref-len 0 -x nesting/triple_nested.gfa -Q x > rc_decon_test.augref.pg +vg deconstruct rc_decon_test.augref.pg -P x -a > rc_decon_test.vcf + +# Check for RC, RS, RD headers +is $(grep -c "##INFO= Date: Wed, 11 Feb 2026 10:51:41 -0500 Subject: [PATCH 4/5] Add NestedFlowCaller for top-down nested variant calling Implements NestedFlowCaller which processes snarls top-down, recursing into child snarls to resolve nested variants. Adds -R option to vg call for star allele / nested calling mode. Includes triple-nested test fixture and comprehensive call tests. 
Co-Authored-By: Claude Opus 4.6 --- src/graph_caller.cpp | 626 +++++++++++++++++++++--- src/graph_caller.hpp | 122 ++++- src/subcommand/call_main.cpp | 177 ++++++- test/nesting/triple_nested_multisnp.gfa | 39 ++ test/t/18_vg_call.t | 578 +++++++++++++++++++++- 5 files changed, 1418 insertions(+), 124 deletions(-) create mode 100644 test/nesting/triple_nested_multisnp.gfa diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index f38163f4eb..31158bf625 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -1,6 +1,7 @@ #include "graph_caller.hpp" #include "algorithms/expand_context.hpp" #include "annotation.hpp" +#include "augref.hpp" //#define debug @@ -233,6 +234,9 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector" << endl; ss << "##INFO=" << endl; + ss << "##INFO=" << endl; + ss << "##INFO=" << endl; + ss << "##INFO=" << endl; } ss << "##INFO=" << endl; return ss.str(); @@ -479,10 +483,24 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa ref_trav_idx); // deduplicate alleles and compute the site traversals and genotype - map allele_to_gt; - allele_to_gt[out_variant.ref] = 0; + map allele_to_gt; + allele_to_gt[out_variant.ref] = 0; + int star_allele_idx = -1; // index for star allele in allele_to_gt, if needed for (int i = 0; i < genotype.size(); ++i) { - if (genotype[i] == ref_trav_idx) { + if (genotype[i] == STAR_ALLELE_MARKER) { + // Star allele: haplotype spans this site but has no defined traversal here + if (star_allele_idx < 0) { + // Add star allele to allele list + star_allele_idx = allele_to_gt.size(); + allele_to_gt["*"] = star_allele_idx; + // Add empty traversal as placeholder (won't be used for AT info) + site_traversals.push_back(SnarlTraversal()); + } + site_genotype.push_back(star_allele_idx); + } else if (genotype[i] == MISSING_ALLELE_MARKER) { + // Missing allele: parent doesn't traverse this child, output as '.' 
in VCF + site_genotype.push_back(MISSING_ALLELE_MARKER); + } else if (genotype[i] == ref_trav_idx) { site_genotype.push_back(0); } else { string allele_string = trav_to_string(called_traversals, genotype, genotype[i], i, ref_trav_idx); @@ -562,7 +580,11 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa stringstream vcf_gt; if (!genotype.empty()) { for (int i = 0; i < site_genotype.size(); ++i) { - vcf_gt << site_genotype[i]; + if (site_genotype[i] == MISSING_ALLELE_MARKER) { + vcf_gt << "."; + } else { + vcf_gt << site_genotype[i]; + } if (i != site_genotype.size() - 1) { vcf_gt << "/"; } @@ -602,6 +624,15 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa } #endif + // If genotype contains missing allele but no ALT, add * as ALT to emit valid VCF + // This happens when one parent haplotype doesn't traverse a nested child snarl + bool has_missing = std::find(site_genotype.begin(), site_genotype.end(), MISSING_ALLELE_MARKER) != site_genotype.end(); + if (has_missing && out_variant.alt.empty()) { + out_variant.alt.push_back("*"); + out_variant.alleles.push_back("*"); + out_variant.info["AT"].push_back("."); + } + if (genotype_snarls || !out_variant.alt.empty()) { bool added = add_variant(out_variant); if (!added) { @@ -828,7 +859,7 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, functionset_node_id(start); snarl.mutable_start()->set_backward(frag[0] == '<'); - assert(frag[toks[0].size() + 1] == '<' || frag[toks[0].size() + 1] == '<'); + assert(frag[toks[0].size() + 1] == '<' || frag[toks[0].size() + 1] == '>'); int64_t end = std::stoi(toks[1]); snarl.mutable_end()->set_node_id(abs(end)); snarl.mutable_end()->set_backward(frag[toks[0].size() + 1] == '<'); @@ -1044,9 +1075,49 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager } } + // pass 2) identify top-level snarls (those with no ancestors in VCF) + // and store reference info only for them + struct RefInfo { + string chrom; + size_t pos; + size_t ref_len; + }; + unordered_map top_level_ref_info; + + // Helper to check if a snarl is top-level (no ancestors in VCF) + auto is_top_level = [&](const string& name) -> bool { + auto it = name_to_snarl.find(name); + if (it == name_to_snarl.end()) return true; // not found, treat as top-level + const Snarl* snarl = it->second; + while ((snarl = snarl_manager->parent_of(snarl))) { + string cur_name = print_snarl(*snarl); + string flipped_name = print_flipped_snarl(*snarl); + if (names_in_vcf.count(cur_name) || names_in_vcf.count(flipped_name)) { + return false; // has ancestor in VCF + } + } + return true; // no ancestors in VCF + }; + + // Second pass through variants to extract ref info only for top-level snarls + for (auto& thread_buf : output_variants) { + for (auto& output_variant_record : thread_buf) { + string output_variant_string; + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); + vector toks = split_delims(output_variant_string, "\t", 5); + const string& name = toks[2]; + if (is_top_level(name)) { + top_level_ref_info[name] = {toks[0], static_cast(stoul(toks[1])), toks[3].length()}; + } + } + } + // determine the tags from the index - function(const string&)> get_lv_ps_tags = [&](const string& name) { + // Returns: (lv, parent_name, top_level_name) + function(const string&)> get_nesting_tags = [&](const string& name) { string parent_name; + string top_level_name = name; // default to self (for LV=0 case) size_t 
ancestor_count = 0; const Snarl* snarl = name_to_snarl.at(name); @@ -1057,19 +1128,28 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager // Since it is possible that the snarl is actually flipped in the vcf, check for the flipped version too string flipped_name = print_flipped_snarl(*snarl); - if (names_in_vcf.count(cur_name) || names_in_vcf.count(flipped_name)) { + if (names_in_vcf.count(cur_name)) { // only count snarls that are in the vcf ++ancestor_count; if (parent_name.empty()) { - // remembert the first parent + // remember the first parent parent_name = cur_name; } + // keep updating top_level to find the topmost ancestor in VCF + top_level_name = cur_name; + } else if (names_in_vcf.count(flipped_name)) { + // snarl is in vcf under flipped orientation + ++ancestor_count; + if (parent_name.empty()) { + parent_name = flipped_name; + } + top_level_name = flipped_name; } } - return make_pair(ancestor_count, parent_name); + return make_tuple(ancestor_count, parent_name, top_level_name); }; - - // pass 2) add the LV and PS tags + + // pass 3) add the LV, PS, RC, RS, RD tags #pragma omp parallel for for (uint64_t i = 0; i < output_variants.size(); ++i) { auto& thread_buf = output_variants[i]; @@ -1080,15 +1160,20 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager //string& output_variant_string = output_variant_record.second; vector toks = split_delims(output_variant_string, "\t", 9); const string& name = toks[2]; - - pair lv_ps = get_lv_ps_tags(name); - string nesting_tags = ";LV=" + std::to_string(lv_ps.first); - if (lv_ps.first != 0) { - assert(!lv_ps.second.empty()); - nesting_tags += ";PS=" + lv_ps.second; + auto [lv, parent_name, top_level_name] = get_nesting_tags(name); + string nesting_tags = ";LV=" + std::to_string(lv); + if (lv != 0) { + assert(!parent_name.empty()); + nesting_tags += ";PS=" + parent_name; } + // Add RC, RS, RD tags (reference info from top-level snarl) + const RefInfo& top_ref = top_level_ref_info.at(top_level_name); + nesting_tags += ";RC=" + top_ref.chrom; + nesting_tags += ";RS=" + std::to_string(top_ref.pos); + nesting_tags += ";RD=" + std::to_string(top_ref.pos + top_ref.ref_len); + // rewrite the output string using the updated info toks output_variant_string.clear(); for (size_t i = 0; i < toks.size(); ++i) { @@ -1720,33 +1805,191 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, } +FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& allele_length_range, + bool nested, + double cluster_threshold, + bool cluster_post_genotype, + bool star_allele) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + GAFOutputCaller(aln_emitter, sample_name, ref_paths, trav_padding), + graph(graph), + traversal_finder(traversal_finder), + ref_paths(ref_paths), + traversals_only(traversals_only), + gaf_output(gaf_output), + genotype_snarls(genotype_snarls), + allele_length_range(allele_length_range), + nested(nested), + cluster_threshold(cluster_threshold), + cluster_post_genotype(cluster_post_genotype), + star_allele(star_allele) +{ + for (int i = 0; i < ref_paths.size(); ++i) { + 
ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; + ref_path_set.insert(ref_paths[i]); + ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; + } +} + FlowCaller::~FlowCaller() { } bool FlowCaller::call_snarl(const Snarl& managed_snarl) { + // Entry point: call with no parent context + return call_snarl_internal(managed_snarl, "", make_pair(0, 0), nullptr); +} + +bool FlowCaller::extract_child_traversal(const SnarlTraversal& parent_trav, + const Snarl& child, + SnarlTraversal& out_child_trav) const { + // Find the child snarl's start and end in the parent traversal + // Need to check both forward and reverse orientations + int start_pos = -1; + int end_pos = -1; + bool needs_reverse = false; + + nid_t child_start_id = child.start().node_id(); + nid_t child_end_id = child.end().node_id(); + + for (int i = 0; i < parent_trav.visit_size(); ++i) { + const Visit& visit = parent_trav.visit(i); + nid_t visit_id = visit.node_id(); + + if (visit_id == child_start_id) { + // Check if orientation matches child's start + if (visit.backward() == child.start().backward()) { + // Forward match + start_pos = i; + needs_reverse = false; + } else { + // This could be the end in reverse orientation + end_pos = i; + needs_reverse = true; + } + } + if (visit_id == child_end_id) { + // Check if orientation matches child's end + if (visit.backward() == child.end().backward()) { + // Forward match + end_pos = i; + } else { + // This could be the start in reverse orientation + start_pos = i; + needs_reverse = true; + } + } + } + + // Check if we found both endpoints + if (start_pos < 0 || end_pos < 0) { + return false; + } + + // Ensure start comes before end (swap if traversing in reverse) + if (start_pos > end_pos) { + std::swap(start_pos, end_pos); + needs_reverse = !needs_reverse; + } + + // Extract the portion from start to end + out_child_trav.Clear(); + if (!needs_reverse) { + // Forward extraction + for (int i = start_pos; i <= end_pos; ++i) { + *out_child_trav.add_visit() = parent_trav.visit(i); + } + } else { + // Reverse extraction - go backwards and flip orientations + for (int i = end_pos; i >= start_pos; --i) { + Visit* v = out_child_trav.add_visit(); + v->set_node_id(parent_trav.visit(i).node_id()); + v->set_backward(!parent_trav.visit(i).backward()); + } + } + + return true; +} + +TraversalSet FlowCaller::find_child_traversal_set(const SnarlTraversal& parent_trav, + const Snarl& child) const { + TraversalSet result; + + // First, check if the parent traversal goes through this child snarl + // by finding the child's start and end nodes in the parent + nid_t child_start_id = child.start().node_id(); + nid_t child_end_id = child.end().node_id(); + bool found_start = false, found_end = false; + + for (int i = 0; i < parent_trav.visit_size(); ++i) { + nid_t visit_id = parent_trav.visit(i).node_id(); + if (visit_id == child_start_id) found_start = true; + if (visit_id == child_end_id) found_end = true; + } + + // If parent doesn't traverse the child, return empty set (star allele case) + if (!found_start || !found_end) { + return result; + } + + // Use the traversal finder to enumerate all traversals through the child + FlowTraversalFinder* flow_finder = dynamic_cast(&traversal_finder); + if (flow_finder != nullptr) { + auto weighted_travs = flow_finder->find_weighted_traversals(child, false); + result = std::move(weighted_travs.first); + } else { + result = traversal_finder.find_traversals(child); + } + + return result; +} + +bool 
FlowCaller::call_snarl_internal(const Snarl& managed_snarl, + const string& parent_ref_path_name, + pair parent_ref_interval, + const ChildTraversalSets* parent_child_trav_sets) { // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local // copy to work on to do things like flip -- calling any snarl_manager code that - // wants a pointer will crash. + // wants a pointer will crash. Snarl snarl = managed_snarl; +#ifdef debug + cerr << "call_snarl_internal on " << pb2json(snarl) << " with parent_ref_path=" << parent_ref_path_name + << " parent_child_trav_sets=" << (parent_child_trav_sets ? "provided" : "null") << endl; +#endif + if (snarl.start().node_id() == snarl.end().node_id() || !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { // can't call one-node or out-of graph snarls. return false; } + // toggle average flow / flow width based on snarl length. this is a bit inconsistent with // downstream which uses the longest traversal length, but it's a bit chicken and egg // todo: maybe use snarl length for everything? - const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); + const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); bool greedy_avg_flow = false; { auto snarl_contents = snarl_manager.deep_contents(&snarl, graph, false); if (snarl_contents.second.size() > max_snarl_edges) { - // size cap needed as FlowCaller doesn't have nesting support yet + // size cap needed as non-nested FlowCaller doesn't handle large snarls return false; } size_t len_threshold = support_finder.get_average_traversal_support_switch_threshold(); @@ -1791,54 +2034,92 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { std::back_inserter(common_names)); if (common_names.empty()) { - return false; + // No reference path through snarl + // If we have parent context, we can still process using parent's ref path + if (parent_child_trav_sets == nullptr || parent_ref_path_name.empty()) { +#ifdef debug + cerr << " -> returning false: no common ref path and no parent context" << endl; +#endif + return false; + } +#ifdef debug + cerr << " -> using parent ref path: " << parent_ref_path_name << endl; +#endif } - string& ref_path_name = common_names.front(); + // Use parent's ref path if no direct path, otherwise prefer base reference over augref paths + string ref_path_name; + if (common_names.empty()) { + ref_path_name = parent_ref_path_name; + } else { + // Prefer base reference paths over derived augref paths + // common_names is sorted, so we iterate to find first non-augref path + ref_path_name = common_names.front(); // default to first (lexicographically smallest) + for (const string& name : common_names) { + if (!AugRefCover::is_augref_name(name)) { + ref_path_name = name; + break; + } + } + } // find the reference traversal and coordinates using the path position graph interface - tuple ref_interval = get_ref_interval(graph, snarl, ref_path_name); - if (get<0>(ref_interval) == -1) { - // could not find reference path interval consisten with snarl due to orientation conflict - return false; - } - if (get<2>(ref_interval) == true) { - // calling code assumes snarl forward on reference - flip_snarl(snarl); + tuple ref_interval; + bool use_parent_interval = false; + + if (common_names.empty() && parent_child_trav_sets != nullptr) { + // No direct reference path - use parent's interval and traversals directly + 
ref_interval = make_tuple(parent_ref_interval.first, parent_ref_interval.second, false, step_handle_t(), step_handle_t()); + use_parent_interval = true; + } else { ref_interval = get_ref_interval(graph, snarl, ref_path_name); + if (get<0>(ref_interval) == -1) { + // could not find reference path interval consistent with snarl due to orientation conflict + return false; + } + if (get<2>(ref_interval) == true) { + // calling code assumes snarl forward on reference + flip_snarl(snarl); + ref_interval = get_ref_interval(graph, snarl, ref_path_name); + } } - step_handle_t cur_step = get<3>(ref_interval); - step_handle_t last_step = get<4>(ref_interval); - if (get<2>(ref_interval)) { - std::swap(cur_step, last_step); - } - bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); - SnarlTraversal ref_trav; - while (true) { - handle_t cur_handle = graph.get_handle_of_step(cur_step); - Visit* visit = ref_trav.add_visit(); - visit->set_node_id(graph.get_id(cur_handle)); - visit->set_backward(start_backwards ? !graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle)); - if (graph.get_id(cur_handle) == snarl.end().node_id()) { - break; - } else if (get<2>(ref_interval) == true) { - if (!graph.has_previous_step(cur_step)) { - cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; - return false; - } - cur_step = graph.get_previous_step(cur_step); - } else { - if (!graph.has_next_step(cur_step)) { - cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; - return false; + + if (!use_parent_interval) { + // Build reference traversal from path steps + step_handle_t cur_step = get<3>(ref_interval); + step_handle_t last_step = get<4>(ref_interval); + if (get<2>(ref_interval)) { + std::swap(cur_step, last_step); + } + bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); + + while (true) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(cur_handle)); + visit->set_backward(start_backwards ? 
!graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle)); + if (graph.get_id(cur_handle) == snarl.end().node_id()) { + break; + } else if (get<2>(ref_interval) == true) { + if (!graph.has_previous_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; + return false; + } + cur_step = graph.get_previous_step(cur_step); + } else { + if (!graph.has_next_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; + return false; + } + cur_step = graph.get_next_step(cur_step); } - cur_step = graph.get_next_step(cur_step); + // todo: we can compute flow at the same time } - // todo: we can compute flow at the same time + assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end()); } - assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end()); + // If use_parent_interval, ref_trav stays empty - we'll use first parent traversal as pseudo-reference vector travs; FlowTraversalFinder* flow_trav_finder = dynamic_cast(&traversal_finder); @@ -1855,6 +2136,9 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { cerr << "Warning [vg call]: Unable, due to bug or corrupt graph, to search for any traversals through snarl " << pb2json(managed_snarl) << endl; return false; } +#ifdef debug + cerr << " found " << travs.size() << " traversals, use_parent_interval=" << use_parent_interval << endl; +#endif // optional traversal length clamp can, ex, avoid trying to resolve a giant snarl if (allele_length_range.first > 0 || allele_length_range.second < numeric_limits::max()) { @@ -1876,30 +2160,175 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // find the reference traversal in the list of results from the traversal finder int ref_trav_idx = -1; - for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { - // todo: is there a way to speed this up? - if (travs[i] == ref_trav) { - ref_trav_idx = i; + + if (use_parent_interval) { + // No direct reference path - use first traversal from first non-empty set as pseudo-reference + if (parent_child_trav_sets != nullptr) { + for (const auto& tset : *parent_child_trav_sets) { + if (!tset.empty()) { + const SnarlTraversal& first_trav = tset[0]; + for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { + if (travs[i] == first_trav) { + ref_trav_idx = i; + } + } + if (ref_trav_idx < 0 && first_trav.visit_size() > 0) { + ref_trav_idx = travs.size(); + travs.push_back(first_trav); + } + break; + } + } + } + // If still -1, use first traversal from finder (or 0 if none) + if (ref_trav_idx < 0) { + ref_trav_idx = travs.empty() ? -1 : 0; + } + } else { + for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { + // todo: is there a way to speed this up? 
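// ---------------------------------------------------------------------------
// A minimal standalone sketch of the per-parent-allele candidate selection
// performed a few lines below: each parent allele's traversal set is merged
// into the shared candidate list with de-duplication, the claiming allele is
// remembered, and only the best-supported max_per_set candidates survive per
// allele. Trav and support() are deliberately simplified stand-ins for
// SnarlTraversal and the TraversalSupportFinder.
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

using Trav = std::vector<long>;     // stand-in: a traversal as its node ids
using TravSet = std::vector<Trav>;  // traversals consistent with one parent allele

static std::vector<std::vector<int>> select_candidates(
        std::vector<Trav>& travs,              // in/out: shared candidate list
        std::vector<int>& membership,          // in/out: claiming allele, or -1;
                                               // starts with one -1 per existing candidate
        const std::vector<TravSet>& per_allele_sets,
        const std::function<double(const Trav&)>& support,
        int max_per_set = 10) {
    std::vector<std::vector<int>> kept(per_allele_sets.size());
    for (size_t a = 0; a < per_allele_sets.size(); ++a) {
        std::vector<std::pair<double, int>> ranked;  // (support, index in travs)
        for (const Trav& t : per_allele_sets[a]) {
            int idx = -1;
            for (size_t i = 0; i < travs.size() && idx < 0; ++i) {
                if (travs[i] == t) idx = (int)i;     // already a candidate
            }
            if (idx < 0) {
                idx = (int)travs.size();             // brand new candidate
                travs.push_back(t);
                membership.push_back((int)a);
            } else if (membership[idx] < 0) {
                membership[idx] = (int)a;            // claim a finder-only candidate
            }
            ranked.push_back({support(t), idx});
        }
        std::sort(ranked.begin(), ranked.end(),
                  [](const std::pair<double, int>& x, const std::pair<double, int>& y) {
                      return x.first > y.first;      // highest support first
                  });
        for (int i = 0; i < (int)ranked.size() && i < max_per_set; ++i) {
            kept[a].push_back(ranked[i].second);
        }
    }
    return kept;
}
// An empty per_allele_sets entry stays empty here; the calling code below
// records it as a star (*) or missing (.) allele rather than genotyping it.
// ---------------------------------------------------------------------------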
+ if (travs[i] == ref_trav) { + ref_trav_idx = i; + } } - } - if (ref_trav_idx == -1) { - ref_trav_idx = travs.size(); - // we didn't get the reference traversal from the finder, so we add it here - travs.push_back(ref_trav); + if (ref_trav_idx == -1) { + ref_trav_idx = travs.size(); + // we didn't get the reference traversal from the finder, so we add it here + travs.push_back(ref_trav); + } } bool ret_val = true; + vector trav_genotype; // Declared outside block so we can pass to children + int ploidy = ref_ploidies[ref_path_name]; + + // Constants for bounded traversal set handling + const int MAX_TRAVS_PER_SET = 10; if (traversals_only) { assert(gaf_output); pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); emit_gaf_traversals(graph, print_snarl(snarl), travs, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } else if (parent_child_trav_sets != nullptr && !parent_child_trav_sets->empty()) { + // Genotype using bounded search over traversal sets from parent + // Each set contains traversals consistent with one parent allele + ploidy = parent_child_trav_sets->size(); + + // Track which set each traversal index belongs to (for phase consistency) + // set_membership[i] = which parent allele set traversal i came from, or -1 if from finder + vector set_membership(travs.size(), -1); + + // Merge traversals from sets into travs, tracking membership + // Also rank by support and keep top MAX_TRAVS_PER_SET per set + vector> set_to_trav_indices(ploidy); // indices into travs for each set + + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + const TraversalSet& tset = (*parent_child_trav_sets)[set_idx]; + + if (tset.empty()) { + // Empty set means parent allele doesn't traverse this child (star allele) + continue; + } + + // Add traversals from this set to travs (avoiding duplicates) + // Keep track of indices for this set + vector> support_and_idx; // (support, index in travs) + + for (const SnarlTraversal& trav : tset) { + // Check if this traversal already exists in travs + int match_idx = -1; + for (int i = 0; i < travs.size() && match_idx < 0; ++i) { + if (travs[i] == trav) { + match_idx = i; + } + } + + if (match_idx < 0) { + // New traversal - add it + match_idx = travs.size(); + travs.push_back(trav); + set_membership.push_back(set_idx); + } else if (set_membership[match_idx] < 0) { + // Traversal was from finder, now claim it for this set + set_membership[match_idx] = set_idx; + } + // Note: if already claimed by another set, that's fine (shared region) + + // Get support for ranking + double support = TraversalSupportFinder::support_val( + support_finder.get_traversal_support(trav)); + support_and_idx.push_back({support, match_idx}); + } + + // Sort by support (descending) and keep top MAX_TRAVS_PER_SET + std::sort(support_and_idx.begin(), support_and_idx.end(), + [](const auto& a, const auto& b) { return a.first > b.first; }); + + for (int i = 0; i < std::min((int)support_and_idx.size(), MAX_TRAVS_PER_SET); ++i) { + set_to_trav_indices[set_idx].push_back(support_and_idx[i].second); + } + } + + // Track which parent sets are empty (star/missing allele cases) + vector empty_sets(ploidy, false); + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + if ((*parent_child_trav_sets)[set_idx].empty()) { + empty_sets[set_idx] = true; + } + } + + // Check if ALL sets are empty (no traversals at all) + bool all_empty = true; + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + if (!empty_sets[set_idx]) { + all_empty = false; + 
break; + } + } + + unique_ptr trav_call_info; + + if (all_empty) { + // All parent alleles skip this child - genotype is all star/missing + for (int i = 0; i < ploidy; ++i) { + trav_genotype.push_back(star_allele ? STAR_ALLELE_MARKER : MISSING_ALLELE_MARKER); + } + } else { + // Use the existing genotyper to select best genotype from available traversals + // The genotyper uses proper genotype likelihoods that account for expected allele ratios + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, + make_pair(get<0>(ref_interval), get<1>(ref_interval))); + + // Post-process: replace genotyped alleles with star/missing for empty parent sets + // The genotyper returns one allele per ploidy position, and we need to check if + // the corresponding parent set was empty + for (int i = 0; i < ploidy && i < trav_genotype.size(); ++i) { + if (empty_sets[i]) { + // This parent allele doesn't traverse the child - use star/missing + trav_genotype[i] = star_allele ? STAR_ALLELE_MARKER : MISSING_ALLELE_MARKER; + } + } + } + + // Emit variant with selected genotype + bool added = true; + + // Only emit VCF if snarl is on reference path + if (use_parent_interval) { + added = true; + } else if (!gaf_output) { + added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, + ref_offsets[ref_path_name], genotype_snarls, ploidy); + } else { + pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); + emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } + + ret_val = trav_genotype.size() == ploidy && added; } else { - // use our support caller to choose our genotype - vector trav_genotype; + // Top-level snarl or no parent context - genotype from scratch using support unique_ptr trav_call_info; - int ploidy = ref_ploidies[ref_path_name]; std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); @@ -1916,7 +2345,48 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { ret_val = trav_genotype.size() == ploidy && added; } - + + // In nested mode, recursively call child snarls + if (nested && !trav_genotype.empty()) { + // Find the managed snarl pointer so we can get its children + const Snarl* managed_ptr = snarl_manager.into_which_snarl(snarl.start().node_id(), snarl.start().backward()); + if (managed_ptr) { + const vector& children = snarl_manager.children_of(managed_ptr); + for (const Snarl* child : children) { + if (child && !snarl_manager.is_trivial(child, graph)) { + // Build ChildTraversalSets: one set per parent allele + // Each set contains all traversals through child consistent with that parent allele + ChildTraversalSets child_trav_sets; + bool any_real_traversals = false; + + for (int allele_idx : trav_genotype) { + if (allele_idx >= 0 && allele_idx < travs.size()) { + // Find all traversals through child consistent with this parent traversal + TraversalSet tset = find_child_traversal_set(travs[allele_idx], *child); + if (!tset.empty()) { + any_real_traversals = true; + } + child_trav_sets.push_back(std::move(tset)); + } else { + // Star/missing allele - pass empty set + child_trav_sets.push_back(TraversalSet()); + } + } + + // If no genotyped alleles traverse the child, skip it + if (!any_real_traversals) { + continue; + } + + // Recursively call child 
with traversal sets + call_snarl_internal(*child, ref_path_name, + make_pair(get<0>(ref_interval), get<1>(ref_interval)), + &child_trav_sets); + } + } + } + } + return ret_val; } @@ -1933,7 +2403,6 @@ string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector return header; } - NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, SupportBasedSnarlCaller& snarl_caller, SnarlManager& snarl_manager, @@ -1957,7 +2426,7 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, gaf_output(gaf_output), genotype_snarls(genotype_snarls), nested_support_finder(dynamic_cast(snarl_caller.get_support_finder())){ - + for (int i = 0; i < ref_paths.size(); ++i) { ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; ref_path_set.insert(ref_paths[i]); @@ -1991,7 +2460,7 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local // copy to work on to do things like flip -- calling any snarl_manager code that - // wants a pointer will crash. + // wants a pointer will crash. Snarl snarl = managed_snarl; // hook into our table entry @@ -2041,7 +2510,14 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ pair gt_ref_interval; if (!common_names.empty()) { - ref_path_name = common_names.front(); + // Prefer base reference paths over derived augref paths + ref_path_name = common_names.front(); // default to first (lexicographically smallest) + for (const string& name : common_names) { + if (!AugRefCover::is_augref_name(name)) { + ref_path_name = name; + break; + } + } // find the reference traversal and coordinates using the path position graph interface ref_interval = get_ref_interval(graph, snarl, ref_path_name); @@ -2103,7 +2579,7 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ } // recurse on the children - // todo: do we need to make this iterative for deep snarl trees? + // todo: do we need to make this iterative for deep snarl trees? const vector& children = snarl_manager.children_of(&managed_snarl); for (const Snarl* child : children) { diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 015f3d4557..5101b48cd9 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -15,6 +15,7 @@ #include "region.hpp" #include "zstdutil.hpp" #include "vg/io/alignment_emitter.hpp" +#include "augref.hpp" namespace vg { @@ -22,6 +23,26 @@ using namespace std; using vg::io::AlignmentEmitter; +/// Special marker value for star alleles in genotype vectors. +/// A star allele (*) represents a haplotype that spans a nested site in the +/// parent but doesn't have a defined traversal at the child level. +constexpr int STAR_ALLELE_MARKER = -2; + +/// Special marker value for missing alleles in genotype vectors. +/// Used when a parent allele doesn't traverse a child snarl and star_allele +/// mode is disabled. Outputs as '.' in VCF to maintain consistent ploidy. +constexpr int MISSING_ALLELE_MARKER = -1; + +/// A set of traversals through a child snarl that are consistent with +/// a single parent allele. Multiple traversals can exist if the child +/// has internal variation within a shared region. +using TraversalSet = vector; + +/// One TraversalSet per parent allele (index matches parent genotype). 
+/// For a diploid parent with genotype [0,1], element 0 contains traversals +/// consistent with parent allele 0, element 1 with parent allele 1. +using ChildTraversalSets = vector; + /** * GraphCaller: Use the snarl decomposition to call snarls in a graph */ @@ -365,18 +386,18 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { }; - /** - * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find - * traversals, and calls those based on how much support they have. + * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. * Should work on any graph but will not - * report cyclic traversals. Does not (yet, anyway) support nested - * calling, so the entire site is processes in one shot. + * report cyclic traversals. Supports nested calling when enabled with the + * nested flag, recursively processing child snarls. * Designed to replace LegacyCaller, as it should miss fewer obviously - * good traversals, and is not dependent on old protobuf-based structures. + * good traversals, and is not dependent on old protobuf-based structures. */ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { public: + /// Original constructor for non-nested mode FlowCaller(const PathPositionHandleGraph& graph, SupportBasedSnarlCaller& snarl_caller, SnarlManager& snarl_manager, @@ -391,7 +412,27 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC size_t trav_padding, bool genotype_snarls, const pair& allele_length_range); - + + /// Extended constructor for nested mode with clustering and star alleles + FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& allele_length_range, + bool nested, + double cluster_threshold, + bool cluster_post_genotype, + bool star_allele); + virtual ~FlowCaller(); virtual bool call_snarl(const Snarl& snarl); @@ -440,18 +481,63 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC /// 1) its largest allele is >= allele_length_range.first and /// 2) all alleles are < allele_length_range.second pair allele_length_range; + + /// --- Nested mode members --- + + /// enable recursive calling of child snarls + bool nested = false; + + /// cluster traversals with Jaccard similarity >= this threshold + double cluster_threshold = 1.0; + + /// if true, cluster after genotyping (for output grouping); if false, cluster before genotyping + bool cluster_post_genotype = false; + + /// use * alleles for spanning haplotypes that don't traverse nested sites + bool star_allele = false; + + /// Internal implementation of call_snarl that accepts parent context for nested mode + /// When nested=true, this recursively calls children after processing the current snarl + /// @param parent_ref_path_name Reference path from parent (for off-reference snarls) + /// @param parent_ref_interval Reference interval from parent + /// @param parent_child_trav_sets If non-null, contains one TraversalSet per parent allele. + /// Each set contains all traversals through this child that are + /// consistent with that parent allele. 
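    /// In sketch form, a diploid parent genotyped {0,1} hands its child:
    ///     ChildTraversalSets sets;
    ///     sets.push_back(find_child_traversal_set(travs[0], *child));   // parent allele 0
    ///     sets.push_back(find_child_traversal_set(travs[1], *child));   // parent allele 1
    ///     call_snarl_internal(*child, ref_path_name, parent_interval, &sets);
    /// where parent_interval abbreviates the parent's reference interval endpoints.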
The child genotypes by + /// picking the best pair (one from each set) based on read support. + bool call_snarl_internal(const Snarl& snarl, + const string& parent_ref_path_name, + pair parent_ref_interval, + const ChildTraversalSets* parent_child_trav_sets = nullptr); + + /// Find all traversals through a child snarl that are consistent with a parent traversal. + /// "Consistent" means the child's entry/exit points match what's in the parent traversal. + /// Uses the traversal finder to enumerate all valid paths through the child. + /// @param parent_trav The parent traversal defining entry/exit constraints + /// @param child The child snarl to find traversals through + /// @return Set of traversals through child, empty if parent doesn't traverse child + TraversalSet find_child_traversal_set(const SnarlTraversal& parent_trav, + const Snarl& child) const; + + /// Extract the portion of a parent traversal that spans a child snarl (single traversal). + /// This is a simpler version used when we only need one traversal from the parent. + /// @param parent_trav The parent traversal to extract from + /// @param child The child snarl to extract + /// @param out_child_trav Output: the extracted traversal, oriented to match child snarl + /// @return true if the parent traversal contains the child snarl + bool extract_child_traversal(const SnarlTraversal& parent_trav, + const Snarl& child, + SnarlTraversal& out_child_trav) const; }; class SnarlGraph; /** - * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find - * traversals, and calls those based on how much support they have. - * Should work on any graph but will not - * report cyclic traversals. + * NestedFlowCaller : DEPRECATED - Use FlowCaller with nested=true instead. * - * todo: this is a generalization of FlowCaller and should be able to replace it entirely after testing - * to get rid of duplicated code. + * Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. + * Should work on any graph but will not report cyclic traversals. + * This class is being replaced by FlowCaller's nested mode. 
*/ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { public: @@ -468,7 +554,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO bool gaf_output, size_t trav_padding, bool genotype_snarls); - + virtual ~NestedFlowCaller(); virtual bool call_snarl(const Snarl& snarl); @@ -487,7 +573,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO int ref_trav_idx; // index of ref paths in CallRecord::travs }; typedef map CallTable; - + /// update the table of calls for each child snarl (and the input snarl) bool call_snarl_recursive(const Snarl& managed_snarl, int ploidy, const string& parent_ref_path_name, pair parent_ref_path_interval, @@ -502,7 +588,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO /// a proper string by recursively resolving the nested snarls into alleles string flatten_reference_allele(const string& nested_allele, const CallTable& call_table) const; string flatten_alt_allele(const string& nested_allele, int allele, int ploidy, const CallTable& call_table) const; - + /// the graph const PathPositionHandleGraph& graph; @@ -515,11 +601,11 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO /// keep track of offsets in the reference paths map ref_offsets; - + /// keep traco of the ploidies (todo: just one map for all path stuff!!) map ref_ploidies; - /// until we support nested snarls, cap snarl size we attempt to process + /// until we support nested snarls, cap snarl size we attempt to process size_t max_snarl_shallow_size = 50000; /// alignment emitter. if not null, traversals will be output here and diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index b2034c2624..8cac50e177 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -17,6 +17,8 @@ #include "../xg.hpp" #include "../gbzgraph.hpp" #include "../gbwtgraph_helper.hpp" +#include "../augref.hpp" +#include "../traversal_clusters.hpp" #include #include #include @@ -51,8 +53,7 @@ void help_call(char** argv) { << " to construct input graph with -a)" << endl << " -a, --genotype-snarls genotype every snarl, including reference calls" << endl << " (use to compare multiple samples)" << endl - << " -A, --all-snarls genotype all snarls, including nested child snarls" << endl - << " (like deconstruct -a)" << endl + << " -A, --all-snarls call all snarls including nested (each independent)" << endl << " -c, --min-length N genotype only snarls with" << endl << " at least one traversal of length >= N" << endl << " -C, --max-length N genotype only snarls where" << endl @@ -70,8 +71,9 @@ void help_call(char** argv) { << " -O, --gbz-translation use the ID translation from the input GBZ to" << endl << " apply snarl names to snarl names/AT fields in output" << endl << " -p, --ref-path NAME reference path to call on (may repeat; default all)" << endl + << " -P, --path-prefix NAME call on all paths with this prefix (may repeat)" << endl << " -S, --ref-sample NAME call on all paths with this sample" << endl - << " (cannot use with -p)" << endl + << " (cannot use with -p or -P)" << endl << " -o, --ref-offset N offset in reference path (may repeat; 1 per path)" << endl << " -l, --ref-length N override reference length for output VCF contig" << endl << " -d, --ploidy N ploidy of sample. 
{1, 2} [2]" << endl @@ -80,8 +82,15 @@ void help_call(char** argv) { << " not visited by the selected samples, or to all" << endl << " contigs simulated from if no samples are used." << endl << " Unmatched contigs get ploidy 2 (or that from -d)." << endl - << " -n, --nested activate nested calling mode (experimental)" << endl + << " --top-down top-down nested calling with genotype propagation" << endl + << " from parent to child snarls (writes LV/PS tags)" << endl + << " --bottom-up bottom-up nested calling with snarl merging" << endl << " -I, --chains call chains instead of snarls (experimental)" << endl + << " -L, --cluster F cluster similar traversals with Jaccard >= F [1.0]" << endl + << " --cluster-post cluster after genotyping (for output grouping only)" << endl + << " default is to cluster before genotyping" << endl + << " -Y, --star-allele use * alleles for spanning haplotypes" << endl + << " (requires --top-down)" << endl << " --progress show progress" << endl << " -t, --threads N number of threads to use" << endl << " -h, --help print this help message to stderr and exit" << endl; @@ -101,6 +110,7 @@ int main_call(int argc, char** argv) { string ref_fasta_filename; string ins_fasta_filename; vector ref_paths; + vector ref_path_prefixes; string ref_sample; vector ref_path_offsets; vector ref_path_lengths; @@ -121,13 +131,19 @@ int main_call(int argc, char** argv) { bool gaf_output = false; size_t trav_padding = 0; bool genotype_snarls = false; - bool nested = false; + bool top_down = false; + bool bottom_up = false; bool call_chains = false; bool all_snarls = false; size_t min_allele_len = 0; size_t max_allele_len = numeric_limits::max(); bool show_progress = false; + // Nested calling options (for use with -A) + double cluster_threshold = 1.0; + bool cluster_post_genotype = false; // false = cluster before, true = cluster after + bool star_allele = false; + // constants const size_t avg_trav_threshold = 50; const size_t avg_node_threshold = 50; @@ -139,6 +155,10 @@ int main_call(int argc, char** argv) { const size_t max_chain_edges = 1000; const size_t max_chain_trivial_travs = 5; constexpr int OPT_PROGRESS = 1000; + constexpr int OPT_CLUSTER_POST = 1002; + constexpr int OPT_LEGACY = 1004; + constexpr int OPT_BOTTOM_UP = 1005; + constexpr int OPT_TOP_DOWN = 1006; int c; optind = 2; // force optind past command positional argument while (true) { @@ -163,6 +183,7 @@ int main_call(int argc, char** argv) { {"translation", required_argument, 0, 'N'}, {"gbz-translation", no_argument, 0, 'O'}, {"ref-path", required_argument, 0, 'p'}, + {"path-prefix", required_argument, 0, 'P'}, {"ref-sample", required_argument, 0, 'S'}, {"ref-offset", required_argument, 0, 'o'}, {"ref-length", required_argument, 0, 'l'}, @@ -171,9 +192,13 @@ int main_call(int argc, char** argv) { {"gaf", no_argument, 0, 'G'}, {"traversals", no_argument, 0, 'T'}, {"trav-padding", required_argument, 0, 'M'}, - {"legacy", no_argument, 0, 'L'}, - {"nested", no_argument, 0, 'n'}, - {"chains", no_argument, 0, 'I'}, + {"legacy", no_argument, 0, OPT_LEGACY}, + {"top-down", no_argument, 0, OPT_TOP_DOWN}, + {"bottom-up", no_argument, 0, OPT_BOTTOM_UP}, + {"chains", no_argument, 0, 'I'}, + {"cluster", required_argument, 0, 'L'}, + {"cluster-post", no_argument, 0, OPT_CLUSTER_POST}, + {"star-allele", no_argument, 0, 'Y'}, {"threads", required_argument, 0, 't'}, {"progress", no_argument, 0, OPT_PROGRESS }, {"help", no_argument, 0, 'h'}, @@ -182,7 +207,7 @@ int main_call(int argc, char** argv) { int option_index = 0; - c = 
getopt_long (argc, argv, "k:Be:b:m:v:aAc:C:f:i:s:r:g:zN:Op:S:o:l:d:R:GTLM:nIt:h?", + c = getopt_long (argc, argv, "k:Be:b:m:v:aAc:C:f:i:s:r:g:zN:Op:P:S:o:l:d:R:GTM:IL:Yt:h?", long_options, &option_index); // Detect the end of the options. @@ -248,6 +273,9 @@ int main_call(int argc, char** argv) { case 'p': ref_paths.push_back(optarg); break; + case 'P': + ref_path_prefixes.push_back(optarg); + break; case 'S': ref_sample = optarg; break; @@ -291,13 +319,25 @@ int main_call(int argc, char** argv) { trav_padding = parse(optarg); break; case 'L': - legacy = true; + cluster_threshold = parse(optarg); break; - case 'n': - nested =true; + case OPT_TOP_DOWN: + top_down = true; + break; + case OPT_BOTTOM_UP: + bottom_up = true; break; case 'I': - call_chains =true; + call_chains = true; + break; + case OPT_CLUSTER_POST: + cluster_post_genotype = true; + break; + case 'Y': + star_allele = true; + break; + case OPT_LEGACY: + legacy = true; break; case OPT_PROGRESS: show_progress = true; @@ -371,12 +411,18 @@ int main_call(int argc, char** argv) { } if ((min_allele_len > 0 || max_allele_len < numeric_limits::max()) - && (legacy || !vcf_filename.empty() || nested)) { - logger.error() << "-c/-C no supported with -v, -l or -n" << endl; + && (legacy || !vcf_filename.empty() || bottom_up)) { + logger.error() << "-c/-C not supported with -v, -l, or --bottom-up" << endl; } if (!ref_paths.empty() && !ref_sample.empty()) { logger.error() << "-S cannot be used with -p" << endl; } + if (!ref_path_prefixes.empty() && !ref_sample.empty()) { + logger.error() << "-S cannot be used with -P" << endl; + } + if (!ref_path_prefixes.empty() && !ref_paths.empty()) { + logger.error() << "-P cannot be used with -p" << endl; + } // Read the graph unique_ptr path_handle_graph; @@ -487,6 +533,31 @@ int main_call(int argc, char** argv) { logger.error() << "GBWT (-g) cannot be used with GBZ graph (-z): choose one or the other" << endl; } + // Validation: -A, --top-down, and --bottom-up are mutually exclusive + int nested_mode_count = (all_snarls ? 1 : 0) + (top_down ? 1 : 0) + (bottom_up ? 1 : 0); + if (nested_mode_count > 1) { + logger.error() << "-A, --top-down, and --bottom-up are mutually exclusive" << endl; + } + + // Validation for nested calling options + if (star_allele && !top_down) { + logger.error() << "-Y/--star-allele requires --top-down mode" << endl; + } + if (cluster_post_genotype && cluster_threshold >= 1.0) { + logger.error() << "--cluster-post requires -L/--cluster with threshold < 1.0" << endl; + } + if (cluster_threshold < 0.0 || cluster_threshold > 1.0) { + logger.error() << "-L/--cluster threshold must be in range [0.0, 1.0]" << endl; + } + + // Validation for bottom-up mode + if (bottom_up && star_allele) { + logger.error() << "-Y/--star-allele cannot be used with --bottom-up mode" << endl; + } + if (bottom_up && cluster_threshold < 1.0) { + logger.error() << "-L/--cluster cannot be used with --bottom-up mode" << endl; + } + // in order to add subpath support, we let all ref_paths be subpaths and then convert coordinates // on VCF export. the exception is writing the header where we need base paths. 
we keep // track of them the best we can here (just for writing the ##contigs) @@ -505,7 +576,27 @@ int main_call(int argc, char** argv) { return len; } }; - + + // Process path prefixes to find ref paths + if (!ref_path_prefixes.empty()) { + graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](const path_handle_t& path_handle) { + string path_name = graph->get_path_name(path_handle); + // Never include alt paths in reference paths + if (Paths::is_alt(path_name)) { + return; + } + for (auto& prefix : ref_path_prefixes) { + if (path_name.compare(0, prefix.size(), prefix) == 0) { + ref_paths.push_back(path_name); + break; + } + } + }); + if (ref_paths.empty()) { + logger.error() << "No paths found matching prefix(es)" << endl; + } + } + // No paths specified: use them all if (ref_paths.empty()) { set ref_sample_names; @@ -628,6 +719,10 @@ int main_call(int argc, char** argv) { std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; for (const string& refpath_name : ref_paths) { + // Skip altpaths (they shouldn't influence snarl decomposition) + if (AugRefCover::is_augref_name(refpath_name)) { + continue; + } path_handle_t refpath_handle = graph->get_path_handle(refpath_name); extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(refpath_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(refpath_handle)))] += EXTRA_WEIGHT; @@ -649,11 +744,11 @@ int main_call(int argc, char** argv) { if (show_progress) logger.info() << "Loading pack file " << pack_filename << endl; packer->load_from_file(pack_filename); if (show_progress) logger.info() << "Loaded pack file" << endl; - if (nested) { - // Make a nested packed traversal support finder (using cached veresion important for poisson caller) + if (bottom_up) { + // Make a nested packed traversal support finder (required by NestedFlowCaller) support_finder.reset(new NestedCachedPackedTraversalSupportFinder(*packer, *snarl_manager)); } else { - // Make a packed traversal support finder (using cached veresion important for poisson caller) + // Make a packed traversal support finder (using cached version important for poisson caller) support_finder.reset(new CachedPackedTraversalSupportFinder(*packer, *snarl_manager)); } @@ -791,7 +886,25 @@ int main_call(int argc, char** argv) { traversal_finder = unique_ptr(flow_traversal_finder); } - if (nested) { + if (top_down) { + // Use FlowCaller with nested mode enabled (top-down genotype propagation) + graph_caller.reset(new FlowCaller(*dynamic_cast(graph), + *dynamic_cast(snarl_caller.get()), + *snarl_manager, + sample_name, *traversal_finder, ref_paths, ref_path_offsets, + ref_path_ploidies, + alignment_emitter.get(), + traversals_only, + gaf_output, + trav_padding, + genotype_snarls, + make_pair(min_allele_len, max_allele_len), + true, // nested mode enabled + cluster_threshold, + cluster_post_genotype, + star_allele)); + } else if (bottom_up) { + // Use NestedFlowCaller (bottom-up snarl merging, original nested algorithm) graph_caller.reset(new NestedFlowCaller(*dynamic_cast(graph), *dynamic_cast(snarl_caller.get()), *snarl_manager, @@ -822,8 +935,8 @@ int main_call(int argc, char** argv) { // Init The VCF VCFOutputCaller* vcf_caller = dynamic_cast(graph_caller.get()); assert(vcf_caller != nullptr); - // Make sure we get the LV/PS tags with -A - vcf_caller->set_nested(all_snarls); + // Make sure we get the LV/PS tags with -A, --top-down, or --bottom-up + 
vcf_caller->set_nested(all_snarls || top_down || bottom_up); vcf_caller->set_translation(translation.get()); // Make sure the basepath information we inferred above goes directy to the VCF header // (and that it does *not* try to read it from the graph paths) @@ -842,18 +955,28 @@ int main_call(int argc, char** argv) { graph_caller->set_show_progress(show_progress); // Call the graph - if (!call_chains) { + // Determine recursion strategy based on mode: + // - top_down: FlowCaller handles recursion internally, so RecurseNever + // - all_snarls (-A): visit every snarl independently, so RecurseAlways + // - default: only recurse into children of failed snarls, so RecurseOnFail + GraphCaller::RecurseType recurse_type; + if (top_down) { + recurse_type = GraphCaller::RecurseNever; + } else if (all_snarls) { + recurse_type = GraphCaller::RecurseAlways; + } else { + recurse_type = GraphCaller::RecurseOnFail; + } + if (!call_chains) { // Call each snarl - // (todo: try chains in normal mode) if (show_progress) logger.info() << "Calling top-level snarls" << endl; - graph_caller->call_top_level_snarls(*graph, all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + graph_caller->call_top_level_snarls(*graph, recurse_type); } else { // Attempt to call chains instead of snarls so that the output traversals are longer // Todo: this could probably help in some cases when making VCFs too if (show_progress) logger.info() << "Calling top-level chains" << endl; - graph_caller->call_top_level_chains(*graph, max_chain_edges, max_chain_trivial_travs, - all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + graph_caller->call_top_level_chains(*graph, max_chain_edges, max_chain_trivial_travs, recurse_type); } if (show_progress) logger.info() << "Calling complete" << endl; diff --git a/test/nesting/triple_nested_multisnp.gfa b/test/nesting/triple_nested_multisnp.gfa new file mode 100644 index 0000000000..9f338de19d --- /dev/null +++ b/test/nesting/triple_nested_multisnp.gfa @@ -0,0 +1,39 @@ +H VN:Z:1.0 +S 1 C +S 2 A +S 3 T +S 31a T +S 31b G +S 311 G +S 312 C +S 313 A +S 32 C +S 33a C +S 33b A +S 34 T +S 4 G +S 5 T +P x 1+,5+ * +P a#1#y0 1+,2+,3+,31b+,311+,313+,32+,33b+,34+,4+,5+ * +P a#2#y1 1+,2+,3+,31a+,311+,312+,32+,33a+,34+,4+,5+ * +L 1 + 2 + 0M +L 1 + 5 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 31a + 0M +L 3 + 31b + 0M +L 3 + 34 + 0M +L 31a + 311 + 0M +L 31b + 311 + 0M +L 311 + 312 + 0M +L 311 + 313 + 0M +L 312 + 32 + 0M +L 313 + 32 + 0M +L 32 + 33a + 0M +L 32 + 33b + 0M +L 31a + 33a + 0M +L 31b + 33b + 0M +L 33a + 34 + 0M +L 33b + 34 + 0M +L 34 + 4 + 0M +L 4 + 5 + 0M diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index c9084c0556..946b3fc3c6 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 19 +plan tests 86 # Toy example of hand-made pileup (and hand inspected truth) to make sure some # obvious (and only obvious) SNPs are detected by vg call @@ -191,7 +191,577 @@ is $? 
0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf - - - +# Test: Clustering option (-L) in vg call +vg construct -r small/x.fa -v small/x.vcf.gz | vg view -g - > small_cluster_call.gfa +printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster_call.gfa +printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster_call.gfa +printf "P\tz\t1+,9+\t*\n" >> small_cluster_call.gfa +vg view -Fv small_cluster_call.gfa > small_cluster_call.vg +vg sim -x small_cluster_call.vg -n 100 -l 20 -a -s 1 > small_cluster_call.gam +vg pack -x small_cluster_call.vg -g small_cluster_call.gam -o small_cluster_call.pack +# Call without clustering +vg call small_cluster_call.vg -k small_cluster_call.pack -p x --top-down > call_no_cluster.vcf 2>/dev/null +# Call with clustering +vg call small_cluster_call.vg -k small_cluster_call.pack -p x --top-down -L 0.3 > call_cluster.vcf 2>/dev/null +is $(grep -v "^#" call_no_cluster.vcf | wc -l) $(grep -v "^#" call_cluster.vcf | wc -l) "clustering does not change number of variant sites in vg call" + +rm -f small_cluster_call.gfa small_cluster_call.vg small_cluster_call.gam small_cluster_call.pack call_no_cluster.vcf call_cluster.vcf + +# Test: Nested calling with vg call --top-down (basic test) +# Use a larger graph with clear variants +vg construct -r small/x.fa -v small/x.vcf.gz -a > nested_call_test.vg +vg sim -x nested_call_test.vg -n 500 -l 50 -a -s 42 > nested_call_test.gam +vg pack -x nested_call_test.vg -g nested_call_test.gam -o nested_call_test.pack +vg call nested_call_test.vg -k nested_call_test.pack -p x --top-down > nested_call_test.vcf 2>/dev/null +# Should produce at least one variant (the small/x graph has multiple variants) +NESTED_VARIANT_COUNT=$(grep -v "^#" nested_call_test.vcf | wc -l) +is $(if [ "$NESTED_VARIANT_COUNT" -ge 1 ]; then echo "1"; else echo "0"; fi) "1" "nested vg call produces at least one variant" + +rm -f nested_call_test.vg nested_call_test.gam nested_call_test.pack nested_call_test.vcf + +# Test: Nested calling emits both top-level and child snarls +# This tests the fix for nested snarl VCF emission where genotypes propagate to children +vg view -Fv nesting/nested_snp_in_del.gfa > nested_snp.vg +vg sim -x nested_snp.vg -n 100 -l 5 -a -s 42 > nested_snp.gam +vg pack -x nested_snp.vg -g nested_snp.gam -o nested_snp.pack +vg call nested_snp.vg -k nested_snp.pack --top-down -p x 2>/dev/null > nested_snp.vcf +# Should have exactly 2 variant lines: one for top-level snarl (1->6) and one for nested snarl (2->5) +NESTED_LINE_COUNT=$(grep -v "^#" nested_snp.vcf | wc -l) +is "$NESTED_LINE_COUNT" "2" "nested vg call emits both top-level and child snarl variants" + +rm -f nested_snp.vg nested_snp.gam nested_snp.pack nested_snp.vcf + +# Test: Star allele option validation (-Y requires --top-down) +vg construct -r small/x.fa -v small/x.vcf.gz > star_test.vg +vg sim -x star_test.vg -n 100 -l 20 -a -s 1 > star_test.gam +vg pack -x star_test.vg -g star_test.gam -o star_test.pack +vg call star_test.vg -k star_test.pack -Y 2>&1 | grep -q "requires --top-down" +is "$?" 
0 "star allele option requires top-down mode" + +rm -f star_test.vg star_test.gam star_test.pack + +# Test: Cluster-post validation (requires -L < 1.0) +vg construct -r small/x.fa -v small/x.vcf.gz > cluster_post_test.vg +vg sim -x cluster_post_test.vg -n 100 -l 20 -a -s 1 > cluster_post_test.gam +vg pack -x cluster_post_test.vg -g cluster_post_test.gam -o cluster_post_test.pack +vg call cluster_post_test.vg -k cluster_post_test.pack --cluster-post 2>&1 | grep -q "requires -L/--cluster" +is "$?" 0 "cluster-post option requires cluster threshold" + +rm -f cluster_post_test.vg cluster_post_test.gam cluster_post_test.pack + +# ============================================================================= +# Nested genotype propagation tests +# These tests verify correct genotype propagation from parent to child snarls +# Graph: nested_snp_in_del.gfa +# x (ref): 1->2->3->5->6 (allele 0 at top-level, allele 0 at nested) +# y0: 1->2->4->5->6 (allele 0 at top-level with SNP, allele 1 at nested) +# y1: 1->6 (allele 1 = deletion at top-level, star at nested) +# ============================================================================= + +# Test 0/0: homozygous reference - reads only from x path +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 100 -l 2 -a -s 1 > nd_00.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_00.gam -o nd_00.pack +vg call nesting/nested_snp_in_del.gfa -k nd_00.pack --top-down -p x 2>/dev/null > nd_00.vcf +# 0/0 should produce no non-ref variants +ND_00_NONREF=$(grep -v "^#" nd_00.vcf | grep -v "0/0" | wc -l) +is "$ND_00_NONREF" "0" "nested_snp_in_del 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/SNP - reads from x and y0 (both traverse nested snarl) +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 50 -l 2 -a -s 10 > nd_01.gam +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 50 -l 2 -a -s 11 >> nd_01.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_01.gam -o nd_01.pack +vg call nesting/nested_snp_in_del.gfa -k nd_01.pack --top-down -p x 2>/dev/null > nd_01.vcf +# Should have 2 variants (top-level and nested), both het +ND_01_COUNT=$(grep -v "^#" nd_01.vcf | wc -l) +is "$ND_01_COUNT" "2" "nested_snp_in_del 0/1: produces both top-level and nested variants" + +# Test 1/1: homozygous alt SNP - reads only from y0 path (via sample a haplotype 1) +# We need to simulate specifically from haplotype 1 (y0) not from both haplotypes +# Since -m a gives both haplotypes, we use -A and rely on the path structure +vg sim -x nesting/nested_snp_in_del.gfa -A -n 100 -l 2 -a -s 20 > nd_11.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_11.gam -o nd_11.pack +vg call nesting/nested_snp_in_del.gfa -k nd_11.pack --top-down -p x 2>/dev/null > nd_11.vcf +# Should produce variants (exact genotype depends on path coverage) +ND_11_EXIT=$? +is "$ND_11_EXIT" "0" "nested_snp_in_del 1/1: vg call completes without error" + +# Test 1/2: het SNP/deletion - reads from y0 and y1 (sample a has both) +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 30 > nd_12.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_12.gam -o nd_12.pack +vg call nesting/nested_snp_in_del.gfa -k nd_12.pack --top-down -p x 2>/dev/null > nd_12.vcf +# Should have 2 variants, nested one should have missing allele +ND_12_COUNT=$(grep -v "^#" nd_12.vcf | wc -l) +is "$ND_12_COUNT" "2" "nested_snp_in_del 1/2: het SNP/del produces both variants" +# Nested snarl should have missing allele (.) 
for deletion parent +ND_12_MISSING=$(grep ">2>5" nd_12.vcf | grep -c "\./") +is "$ND_12_MISSING" "1" "nested_snp_in_del 1/2: nested snarl shows missing allele for deletion" + +rm -f nd_00.gam nd_00.pack nd_00.vcf nd_01.gam nd_01.pack nd_01.vcf +rm -f nd_11.gam nd_11.pack nd_11.vcf nd_12.gam nd_12.pack nd_12.vcf + +# ============================================================================= +# Star allele tests (-Y flag) +# When parent allele doesn't traverse child, output * instead of . +# ============================================================================= + +# Test star allele output with -Y flag +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 40 > star.gam +vg pack -x nesting/nested_snp_in_del.gfa -g star.gam -o star.pack +vg call nesting/nested_snp_in_del.gfa -k star.pack --top-down -Y -p x 2>/dev/null > star.vcf +# With -Y, nested snarl should have * in ALT instead of . in GT +# The genotype will use numeric index (e.g. 1/2) where one allele is * +STAR_IN_ALT=$(grep ">2>5" star.vcf | cut -f5 | grep -c "\*") +is "$STAR_IN_ALT" "1" "star allele: -Y flag produces * in ALT for spanning deletion" +# Verify the genotype doesn't have . when -Y is used (it uses indexed * instead) +# Extract just the GT field (first colon-separated field in SAMPLE column) +NO_MISSING_GT=$(grep ">2>5" star.vcf | cut -f10 | cut -d: -f1 | grep -v "\." | wc -l) +is "$NO_MISSING_GT" "1" "star allele: genotype uses indexed * instead of . with -Y" + +rm -f star.gam star.pack star.vcf + +# ============================================================================= +# Nested quality metrics tests +# Verify QUAL, GQ, GL are computed for nested variant calls +# ============================================================================= + +# Test: nested calls should have non-zero QUAL +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 50 > nq.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nq.gam -o nq.pack +vg call nesting/nested_snp_in_del.gfa -k nq.pack --top-down -p x 2>/dev/null > nq.vcf + +# Check that nested snarl (>2>5) has non-zero QUAL +NQ_QUAL=$(grep ">2>5" nq.vcf | cut -f6) +NQ_HAS_QUAL=$(if [ "$NQ_QUAL" != "0" ] && [ "$NQ_QUAL" != "." ]; then echo "1"; else echo "0"; fi) +is "$NQ_HAS_QUAL" "1" "nested call has non-zero QUAL" + +# Check that nested snarl has GQ in FORMAT +NQ_FORMAT=$(grep ">2>5" nq.vcf | cut -f9) +NQ_HAS_GQ=$(echo "$NQ_FORMAT" | grep -c "GQ") +is "$NQ_HAS_GQ" "1" "nested call has GQ field" + +# Check that top-level snarl also has quality (unaffected) +TL_QUAL=$(grep ">1>6" nq.vcf | cut -f6) +TL_HAS_QUAL=$(if [ "$TL_QUAL" != "0" ] && [ "$TL_QUAL" != "." 
]; then echo "1"; else echo "0"; fi) +is "$TL_HAS_QUAL" "1" "top-level call still has non-zero QUAL" + +rm -f nq.gam nq.pack nq.vcf + +# Test: triple nested with augref paths should all have quality +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > tnq_ap.gfa 2>/dev/null +vg sim -x tnq_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 51 > tnq.gam +vg sim -x tnq_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 52 >> tnq.gam +vg pack -x tnq_ap.gfa -g tnq.gam -o tnq.pack +vg call tnq_ap.gfa -k tnq.pack --top-down -P x 2>/dev/null > tnq.vcf + +# All variant lines should have non-zero QUAL +TNQ_ZERO_QUAL=$(grep -v "^#" tnq.vcf | awk -F'\t' '$6 == "0" || $6 == "."' | wc -l) +is "$TNQ_ZERO_QUAL" "0" "triple nested calls all have non-zero QUAL" + +# All variant lines should have GQ in FORMAT +TNQ_ALL_GQ=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GQ" | wc -l) +is "$TNQ_ALL_GQ" "0" "triple nested calls all have GQ field" + +# All variant lines should have GL (Genotype Likelihood) in FORMAT +TNQ_ALL_GL=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GL" | wc -l) +is "$TNQ_ALL_GL" "0" "triple nested calls all have GL field" + +# All variant lines should have GP (Genotype Posterior) in FORMAT +TNQ_ALL_GP=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GP" | wc -l) +is "$TNQ_ALL_GP" "0" "triple nested calls all have GP field" + +# All variant lines should have XD (Expected Depth) in FORMAT +TNQ_ALL_XD=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "XD" | wc -l) +is "$TNQ_ALL_XD" "0" "triple nested calls all have XD field" + +# All variant lines should have AD (Allelic Depth) in FORMAT +TNQ_ALL_AD=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "AD" | wc -l) +is "$TNQ_ALL_AD" "0" "triple nested calls all have AD field" + +# GQ values should be in valid range (0-256, integers) +TNQ_INVALID_GQ=$(grep -v "^#" tnq.vcf | awk -F'\t' '{ + split($9, fmt, ":"); + split($10, val, ":"); + for (i=1; i<=length(fmt); i++) { + if (fmt[i] == "GQ") { + gq = val[i]; + if (gq !~ /^[0-9]+$/ || gq < 0 || gq > 256) print "invalid"; + } + } +}' | wc -l) +is "$TNQ_INVALID_GQ" "0" "triple nested calls have valid GQ values (0-256)" + +# All variant lines should have LV (nesting level) in INFO +TNQ_ALL_LV=$(grep -v "^#" tnq.vcf | cut -f8 | grep -v "LV=" | wc -l) +is "$TNQ_ALL_LV" "0" "triple nested calls all have LV tag" + +# Nested variants (LV > 0) should have PS (parent snarl) in INFO +TNQ_NESTED_NO_PS=$(grep -v "^#" tnq.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ && $8 !~ /PS=/' | wc -l) +is "$TNQ_NESTED_NO_PS" "0" "nested calls (LV>0) all have PS tag" + +# Top-level variants (LV=0) should NOT have PS tag +TNQ_TOPLEVEL_HAS_PS=$(grep -v "^#" tnq.vcf | awk -F'\t' '$8 ~ /LV=0/ && $8 ~ /PS=/' | wc -l) +is "$TNQ_TOPLEVEL_HAS_PS" "0" "top-level calls (LV=0) do not have PS tag" + +rm -f tnq_ap.gfa tnq.gam tnq.pack tnq.vcf + +# ============================================================================= +# nested_snp_in_ins.gfa tests +# Graph structure: +# x (ref): 1->6 (short ref, allele 0 at top-level) +# y0 (a#1): 1->2->4->5->6 (insertion with SNP A, allele 1 at top, allele 1 at nested) +# y1 (a#2): 1->2->3->5->6 (insertion with SNP T, allele 2 at top, allele 0 at nested) +# Top-level snarl: 1->6, Nested snarl: 2->5 +# NOTE: ref path x bypasses the nested snarl, so we need augref covers to call nested variants +# ============================================================================= + +# Compute augref cover first (creates x_1_alt, x_2_alt, etc. 
covering nodes not on x) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_ins.gfa > ni_ap.gfa + +# Test 0/0: homozygous reference - reads only from x path (short ref) +vg sim -x ni_ap.gfa -P x -n 100 -l 2 -a -s 70 > ni_00.gam +vg pack -x ni_ap.gfa -g ni_00.gam -o ni_00.pack +vg call ni_ap.gfa -k ni_00.pack --top-down -P x 2>/dev/null > ni_00.vcf +# 0/0 should produce no non-ref variants (or only ref calls) +NI_00_NONREF=$(grep -v "^#" ni_00.vcf | grep -v "0/0" | wc -l) +is "$NI_00_NONREF" "0" "nested_snp_in_ins 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/insertion - reads from x and y1 (a#2 haplotype) +# Note: y1 path is 5bp vs x path 2bp, so need ~2x more y1 reads for balanced boundary coverage +vg sim -x ni_ap.gfa -P x -n 50 -l 2 -a -s 71 > ni_01.gam +vg sim -x ni_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 72 >> ni_01.gam +vg pack -x ni_ap.gfa -g ni_01.gam -o ni_01.pack +vg call ni_ap.gfa -k ni_01.pack --top-down -P x 2>/dev/null > ni_01.vcf +# With augref paths: both top-level and nested variants emitted +NI_01_COUNT=$(grep -v "^#" ni_01.vcf | wc -l) +is "$NI_01_COUNT" "2" "nested_snp_in_ins 0/1: het ref/ins produces top-level and nested variants with augref paths" + +# Test 1/1: homozygous insertion - reads only from y1 path +vg sim -x ni_ap.gfa -P "a#2#y1#0" -n 100 -l 2 -a -s 73 > ni_11.gam +vg pack -x ni_ap.gfa -g ni_11.gam -o ni_11.pack +vg call ni_ap.gfa -k ni_11.pack --top-down -P x 2>/dev/null > ni_11.vcf +# With augref paths: both top-level and nested variants emitted +NI_11_COUNT=$(grep -v "^#" ni_11.vcf | wc -l) +is "$NI_11_COUNT" "2" "nested_snp_in_ins 1/1: homozygous ins produces top-level and nested variants with augref paths" + +# Test 1/2: het between two insertion alleles - reads from both y0 and y1 +vg sim -x ni_ap.gfa -m a -n 200 -l 2 -a -s 74 > ni_12.gam +vg pack -x ni_ap.gfa -g ni_12.gam -o ni_12.pack +vg call ni_ap.gfa -k ni_12.pack --top-down -P x 2>/dev/null > ni_12.vcf +# With augref paths: both top-level and nested variants emitted +NI_12_COUNT=$(grep -v "^#" ni_12.vcf | wc -l) +is "$NI_12_COUNT" "2" "nested_snp_in_ins 1/2: het ins/ins produces top-level and nested variants with augref paths" + +rm -f ni_ap.gfa ni_00.gam ni_00.pack ni_00.vcf ni_01.gam ni_01.pack ni_01.vcf +rm -f ni_11.gam ni_11.pack ni_11.vcf ni_12.gam ni_12.pack ni_12.vcf + +# ============================================================================= +# Triple nested graph tests +# Graph structure: +# x (ref): 1->5 (short ref, bypasses all nesting) +# y0 (a#1): 1->2->3->31->311->313->32->33->34->4->5 (deep insertion) +# y1 (a#2): 1->2->3->31->311->312->32->33->34->4->5 (different at deepest level) +# y2 (a#3): same as y0 +# Top-level snarl: 1->5, with 4 levels of nesting inside +# NOTE: ref path x bypasses all nesting, so we need augref covers to call nested variants +# ============================================================================= + +# Compute augref cover first (creates x_1_alt, x_2_alt, etc. 
covering nested nodes) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > tn_ap.gfa + +# Test 0/0: homozygous reference - reads only from x path +vg sim -x tn_ap.gfa -P x -n 100 -l 2 -a -s 80 > tn_00.gam +vg pack -x tn_ap.gfa -g tn_00.gam -o tn_00.pack +vg call tn_ap.gfa -k tn_00.pack --top-down -P x 2>/dev/null > tn_00.vcf +TN_00_NONREF=$(grep -v "^#" tn_00.vcf | grep -v "0/0" | wc -l) +is "$TN_00_NONREF" "0" "triple_nested 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/insertion - reads from x and y0 +# Note: y0 path is 11bp vs x path 2bp, so need ~5x more y0 reads for balanced boundary coverage +vg sim -x tn_ap.gfa -P x -n 50 -l 2 -a -s 81 > tn_01.gam +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 500 -l 2 -a -s 82 >> tn_01.gam +vg pack -x tn_ap.gfa -g tn_01.gam -o tn_01.pack +vg call tn_ap.gfa -k tn_01.pack --top-down -P x 2>/dev/null > tn_01.vcf +# With augref paths: all 5 nesting levels can be emitted +TN_01_COUNT=$(grep -v "^#" tn_01.vcf | wc -l) +is "$TN_01_COUNT" "5" "triple_nested 0/1: het ref/ins produces all 5 nesting level variants with augref paths" + +# Test 1/1: homozygous insertion - reads only from y0 path +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 83 > tn_11.gam +vg pack -x tn_ap.gfa -g tn_11.gam -o tn_11.pack +vg call tn_ap.gfa -k tn_11.pack --top-down -P x 2>/dev/null > tn_11.vcf +# With augref paths: 1 variant emitted (top-level insertion only) +# All nested snarls are 0/0 because y0 matches x_1_alt reference at all levels +# (y0 goes through 313, and x_1_alt also goes through 313) +TN_11_COUNT=$(grep -v "^#" tn_11.vcf | wc -l) +is "$TN_11_COUNT" "1" "triple_nested 1/1: homozygous ins produces only top-level variant" + +# Test 1/2: het between insertion alleles - reads from y0 and y1 (differ at deepest SNP) +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 84 > tn_12.gam +vg sim -x tn_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 85 >> tn_12.gam +vg pack -x tn_ap.gfa -g tn_12.gam -o tn_12.pack +vg call tn_ap.gfa -k tn_12.pack --top-down -P x 2>/dev/null > tn_12.vcf +# With augref paths: all 5 nesting levels can be emitted +TN_12_COUNT=$(grep -v "^#" tn_12.vcf | wc -l) +is "$TN_12_COUNT" "5" "triple_nested 1/2: het ins/ins produces all 5 nesting level variants with augref paths" + +rm -f tn_ap.gfa tn_00.gam tn_00.pack tn_00.vcf tn_01.gam tn_01.pack tn_01.vcf +rm -f tn_11.gam tn_11.pack tn_11.vcf tn_12.gam tn_12.pack tn_12.vcf + +# ============================================================================= +# Multi-level SNP test (triple_nested_multisnp.gfa) +# This graph has SNPs at multiple nesting levels: +# - y0 matches x_1_alt at all levels (will be chosen as augref reference) +# - y1 differs from x_1_alt at all nested levels +# When simulating from y1, we should get variants at all 4 nesting levels +# ============================================================================= + +vg paths -x nesting/triple_nested_multisnp.gfa -Q x --compute-augref --min-augref-len 1 > tn_ms_ap.gfa +vg sim -x tn_ms_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 100 > tn_ms.gam +vg pack -x tn_ms_ap.gfa -g tn_ms.gam -o tn_ms.pack +vg call tn_ms_ap.gfa -k tn_ms.pack --top-down -P x 2>/dev/null > tn_ms.vcf +# Should get 4 variants: top-level + 3 nested SNPs (all at 1/1) +TN_MS_COUNT=$(grep -v "^#" tn_ms.vcf | wc -l) +is "$TN_MS_COUNT" "4" "triple_nested_multisnp 1/1: homozygous alt produces variants at all 4 nesting levels" +# Verify all variants are 1/1 (homozygous alt) +TN_MS_HOM=$(grep -v "^#" tn_ms.vcf | cut -f10 | cut -d: -f1 | grep 
-c "1/1") +is "$TN_MS_HOM" "4" "triple_nested_multisnp 1/1: all 4 variants are homozygous alt" +# Verify LV tags span levels 0-3 +TN_MS_LV0=$(grep -v "^#" tn_ms.vcf | grep -c "LV=0") +TN_MS_LV123=$(grep -v "^#" tn_ms.vcf | grep -c "LV=[123]") +is "$TN_MS_LV0" "1" "triple_nested_multisnp: one top-level variant (LV=0)" +is "$TN_MS_LV123" "3" "triple_nested_multisnp: three nested variants (LV=1,2,3)" + +rm -f tn_ms_ap.gfa tn_ms.gam tn_ms.pack tn_ms.vcf + +# ============================================================================= +# Short reference bypass tests +# NOTE: ref path x bypasses nested structures, so we need augref covers to call nested variants +# ============================================================================= + +# Test nested_snp_in_nested_ins.gfa - ref bypasses all nested structures +# Compute augref cover first (creates x_1_alt, etc. covering nested nodes) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_nested_ins.gfa > bypass_ap.gfa +vg sim -x bypass_ap.gfa -m a -n 100 -l 2 -a -s 60 > bypass.gam +vg pack -x bypass_ap.gfa -g bypass.gam -o bypass.pack +vg call bypass_ap.gfa -k bypass.pack --top-down -P x 2>/dev/null > bypass.vcf +BYPASS_EXIT=$? +is "$BYPASS_EXIT" "0" "nested_snp_in_nested_ins: vg call handles short-ref nested graph with augref paths without crashing" + +rm -f bypass_ap.gfa bypass.gam bypass.pack bypass.vcf + +# ============================================================================= +# --top-down -a interaction tests +# Verify nested calling with reference output (-a) works correctly +# ============================================================================= + +# Test 1: nested_snp_in_del 0/0 with --top-down -a +# When ref traverses nested snarl, both levels should get 0/0 +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 100 -l 2 -a -s 200 > na_del.gam +vg pack -x nesting/nested_snp_in_del.gfa -g na_del.gam -o na_del.pack +vg call nesting/nested_snp_in_del.gfa -k na_del.pack --top-down -a -p x 2>/dev/null > na_del.vcf + +# Count variant lines (should be 2: top-level + nested) +NA_DEL_COUNT=$(grep -v "^#" na_del.vcf | wc -l) +is "$NA_DEL_COUNT" "2" "--top-down -a: nested_snp_in_del 0/0 emits both snarls" + +# Verify top-level is 0/0 (use awk to match ID column exactly) +NA_DEL_TOP_GT=$(awk -F'\t' '$3 == ">1>6" {print $10}' na_del.vcf | cut -d: -f1) +is "$NA_DEL_TOP_GT" "0/0" "--top-down -a: nested_snp_in_del top-level is 0/0" + +# Verify nested is 0/0 (use awk to match ID column exactly) +NA_DEL_NEST_GT=$(awk -F'\t' '$3 == ">2>5" {print $10}' na_del.vcf | cut -d: -f1) +is "$NA_DEL_NEST_GT" "0/0" "--top-down -a: nested_snp_in_del nested is 0/0" + +rm -f na_del.gam na_del.pack na_del.vcf + +# Test 2: nested_snp_in_ins 0/0 with --top-down -a (with augref paths) +# When ref bypasses nested snarl and both alleles are ref, nested NOT emitted +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_ins.gfa > na_ins_ap.gfa 2>/dev/null +vg sim -x na_ins_ap.gfa -P x -n 100 -l 2 -a -s 200 > na_ins_00.gam +vg pack -x na_ins_ap.gfa -g na_ins_00.gam -o na_ins_00.pack +vg call na_ins_ap.gfa -k na_ins_00.pack --top-down -a -P x 2>/dev/null > na_ins_00.vcf + +# Count variant lines (should be 1: only top-level, nested not emitted) +NA_INS_00_COUNT=$(grep -v "^#" na_ins_00.vcf | wc -l) +is "$NA_INS_00_COUNT" "1" "--top-down -a: nested_snp_in_ins 0/0 emits only top-level (ref spans nested)" + +rm -f na_ins_00.gam na_ins_00.pack na_ins_00.vcf + +# Test 3: nested_snp_in_ins 0/1 with --top-down -a (with augref 
+
+rm -f bypass_ap.gfa bypass.gam bypass.pack bypass.vcf
+
+# =============================================================================
+# --top-down -a interaction tests
+# Verify nested calling with reference output (-a) works correctly
+# =============================================================================
+
+# Test 1: nested_snp_in_del 0/0 with --top-down -a
+# When ref traverses nested snarl, both levels should get 0/0
+vg sim -x nesting/nested_snp_in_del.gfa -P x -n 100 -l 2 -a -s 200 > na_del.gam
+vg pack -x nesting/nested_snp_in_del.gfa -g na_del.gam -o na_del.pack
+vg call nesting/nested_snp_in_del.gfa -k na_del.pack --top-down -a -p x 2>/dev/null > na_del.vcf
+
+# Count variant lines (should be 2: top-level + nested)
+NA_DEL_COUNT=$(grep -v "^#" na_del.vcf | wc -l)
+is "$NA_DEL_COUNT" "2" "--top-down -a: nested_snp_in_del 0/0 emits both snarls"
+
+# Verify top-level is 0/0 (use awk to match ID column exactly)
+NA_DEL_TOP_GT=$(awk -F'\t' '$3 == ">1>6" {print $10}' na_del.vcf | cut -d: -f1)
+is "$NA_DEL_TOP_GT" "0/0" "--top-down -a: nested_snp_in_del top-level is 0/0"
+
+# Verify nested is 0/0 (use awk to match ID column exactly)
+NA_DEL_NEST_GT=$(awk -F'\t' '$3 == ">2>5" {print $10}' na_del.vcf | cut -d: -f1)
+is "$NA_DEL_NEST_GT" "0/0" "--top-down -a: nested_snp_in_del nested is 0/0"
+
+rm -f na_del.gam na_del.pack na_del.vcf
+
+# Test 2: nested_snp_in_ins 0/0 with --top-down -a (with augref paths)
+# When ref bypasses nested snarl and both alleles are ref, nested NOT emitted
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_ins.gfa > na_ins_ap.gfa 2>/dev/null
+vg sim -x na_ins_ap.gfa -P x -n 100 -l 2 -a -s 200 > na_ins_00.gam
+vg pack -x na_ins_ap.gfa -g na_ins_00.gam -o na_ins_00.pack
+vg call na_ins_ap.gfa -k na_ins_00.pack --top-down -a -P x 2>/dev/null > na_ins_00.vcf
+
+# Count variant lines (should be 1: only top-level, nested not emitted)
+NA_INS_00_COUNT=$(grep -v "^#" na_ins_00.vcf | wc -l)
+is "$NA_INS_00_COUNT" "1" "--top-down -a: nested_snp_in_ins 0/0 emits only top-level (ref spans nested)"
+
+rm -f na_ins_00.gam na_ins_00.pack na_ins_00.vcf
+
+# Test 3: nested_snp_in_ins 0/1 with --top-down -a (with augref paths)
+# When ref bypasses nested but alt traverses, nested should have ./X genotype
+vg sim -x na_ins_ap.gfa -P x -n 50 -l 2 -a -s 200 > na_ins_01.gam
+vg sim -x na_ins_ap.gfa -P "a#1#y0#0" -n 100 -l 2 -a -s 201 >> na_ins_01.gam
+vg pack -x na_ins_ap.gfa -g na_ins_01.gam -o na_ins_01.pack
+vg call na_ins_ap.gfa -k na_ins_01.pack --top-down -a -P x 2>/dev/null > na_ins_01.vcf
+
+# Count variant lines (should be 2: top-level + nested)
+NA_INS_01_COUNT=$(grep -v "^#" na_ins_01.vcf | wc -l)
+is "$NA_INS_01_COUNT" "2" "--top-down -a: nested_snp_in_ins 0/1 emits both snarls"
+
+# Verify nested has missing allele marker (.)
+NA_INS_01_NEST_GT=$(grep ">2>5" na_ins_01.vcf | cut -f10 | cut -d: -f1)
+NA_INS_01_HAS_MISSING=$(echo "$NA_INS_01_NEST_GT" | grep -c "\.")
+is "$NA_INS_01_HAS_MISSING" "1" "--top-down -a: nested_snp_in_ins 0/1 nested has missing allele (.)"
+
+rm -f na_ins_ap.gfa na_ins_01.gam na_ins_01.pack na_ins_01.vcf
+
+# Test 4: triple_nested 0/0 with --top-down -a (with augref paths)
+# When ref bypasses all nested snarls and genotype is 0/0, only top-level emitted
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > na_tn_ap.gfa 2>/dev/null
+vg sim -x na_tn_ap.gfa -P x -n 100 -l 2 -a -s 210 > na_tn_00.gam
+vg pack -x na_tn_ap.gfa -g na_tn_00.gam -o na_tn_00.pack
+vg call na_tn_ap.gfa -k na_tn_00.pack --top-down -a -P x 2>/dev/null > na_tn_00.vcf
+
+# Count variant lines (should be 1: only top-level, nested not emitted since ref spans them)
+NA_TN_00_COUNT=$(grep -v "^#" na_tn_00.vcf | wc -l)
+is "$NA_TN_00_COUNT" "1" "--top-down -a: triple_nested 0/0 emits only top-level (ref spans all nested)"
+
+# Verify top-level is 0/0
+NA_TN_00_GT=$(grep -v "^#" na_tn_00.vcf | cut -f10 | cut -d: -f1)
+is "$NA_TN_00_GT" "0/0" "--top-down -a: triple_nested 0/0 top-level is 0/0"
+
+rm -f na_tn_00.gam na_tn_00.pack na_tn_00.vcf
+
+# Test 5: triple_nested 0/1 with --top-down -a (with augref paths)
+# When ref bypasses nested but alt traverses, nested snarls should have ./X genotype
+vg sim -x na_tn_ap.gfa -P x -n 50 -l 2 -a -s 211 > na_tn_01.gam
+vg sim -x na_tn_ap.gfa -P "a#1#y0#0" -n 500 -l 2 -a -s 212 >> na_tn_01.gam
+vg pack -x na_tn_ap.gfa -g na_tn_01.gam -o na_tn_01.pack
+vg call na_tn_ap.gfa -k na_tn_01.pack --top-down -a -P x 2>/dev/null > na_tn_01.vcf
+
+# Count variant lines (should be 5: all nesting levels emitted with -a)
+NA_TN_01_COUNT=$(grep -v "^#" na_tn_01.vcf | wc -l)
+is "$NA_TN_01_COUNT" "5" "--top-down -a: triple_nested 0/1 emits all 5 nesting levels"
+
+# Nested snarls (LV > 0) should have missing allele (.) for the spanning ref
+NA_TN_01_NESTED_MISSING=$(grep -v "^#" na_tn_01.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ {print $10}' | cut -d: -f1 | grep -c "\.")
+NA_TN_01_NESTED_COUNT=$(grep -v "^#" na_tn_01.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/' | wc -l)
+is "$NA_TN_01_NESTED_MISSING" "$NA_TN_01_NESTED_COUNT" "--top-down -a: triple_nested 0/1 all nested snarls have missing allele (.)"
+
+rm -f na_tn_ap.gfa na_tn_01.gam na_tn_01.pack na_tn_01.vcf
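+# (For reference, and only as an illustration of the convention asserted above: a
+# "missing allele" genotype here looks like ./1, meaning the haplotype that follows
+# the spanning reference path has no allele inside the nested snarl, while the other
+# haplotype carries a concrete alt allele.)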
+
+# =============================================================================
+# -A (all-snarls) flag tests
+# Verify that -A flag:
+# 1. Includes LV/PS header tags in VCF output
+# 2. Calls all snarls including nested ones (each independently)
+# 3. Produces consistent results
+# =============================================================================
+
+# Test: -A flag includes LV and PS header lines
+vg construct -r small/x.fa -v small/x.vcf.gz -a > all_snarls_test.vg
+vg sim -x all_snarls_test.vg -n 200 -l 30 -a -s 300 > all_snarls_test.gam
+vg pack -x all_snarls_test.vg -g all_snarls_test.gam -o all_snarls_test.pack
+vg call all_snarls_test.vg -k all_snarls_test.pack -A > all_snarls_test.vcf
+
+# Check for LV header
+AS_HAS_LV_HEADER=$(grep -c "##INFO= as_nested.vg
+vg sim -x as_nested.vg -m a -n 100 -l 2 -a -s 301 > as_nested.gam
+vg pack -x as_nested.vg -g as_nested.gam -o as_nested.pack
+vg call as_nested.vg -k as_nested.pack -A -p x > as_nested.vcf 2>/dev/null
+
+# Should produce variants at both nesting levels
+AS_NESTED_COUNT=$(grep -v "^#" as_nested.vcf | wc -l)
+is "$AS_NESTED_COUNT" "2" "-A flag: nested graph produces both top-level and nested variants"
+
+# Verify LV tags present
+AS_NESTED_LV=$(grep -v "^#" as_nested.vcf | grep -c "LV=")
+is "$AS_NESTED_LV" "2" "-A flag: nested variants have LV tags"
+
+# Verify LV=0 exists (top-level)
+AS_NESTED_LV0=$(grep -v "^#" as_nested.vcf | grep -c "LV=0")
+is "$AS_NESTED_LV0" "1" "-A flag: has top-level variant (LV=0)"
+
+# Verify LV=1 exists (nested)
+AS_NESTED_LV1=$(grep -v "^#" as_nested.vcf | grep -c "LV=1")
+is "$AS_NESTED_LV1" "1" "-A flag: has nested variant (LV=1)"
+
+# Verify nested variant has PS tag pointing to parent
+AS_NESTED_PS=$(grep -v "^#" as_nested.vcf | awk -F'\t' '$8 ~ /LV=1/ && $8 ~ /PS=/' | wc -l)
+is "$AS_NESTED_PS" "1" "-A flag: nested variant has PS tag"
+
+rm -f as_nested.vg as_nested.gam as_nested.pack as_nested.vcf
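+# (Shape of what the LV/PS checks above look for, with illustrative IDs borrowed from
+# the nested_snp_in_del tests: a top-level record carries LV=0, while a nested record
+# carries its depth plus a pointer to its parent site, e.g. an INFO field containing
+# "LV=1;PS=>1>6" on the child snarl ">2>5". The exact snarl IDs depend on the graph.)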
+
+# Test: -A flag on triple nested graph with augref paths
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > as_triple.gfa 2>/dev/null
+vg sim -x as_triple.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 302 > as_triple.gam
+vg sim -x as_triple.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 303 >> as_triple.gam
+vg pack -x as_triple.gfa -g as_triple.gam -o as_triple.pack
+vg call as_triple.gfa -k as_triple.pack -A -P x > as_triple.vcf 2>/dev/null
+
+# Should produce variants at multiple nesting levels
+AS_TRIPLE_COUNT=$(grep -v "^#" as_triple.vcf | wc -l)
+AS_TRIPLE_HAS_VARIANTS=$(if [ "$AS_TRIPLE_COUNT" -ge 3 ]; then echo "1"; else echo "0"; fi)
+is "$AS_TRIPLE_HAS_VARIANTS" "1" "-A flag: triple nested produces at least 3 variants"
+
+# Verify all variants have LV tags
+AS_TRIPLE_LV=$(grep -v "^#" as_triple.vcf | grep -c "LV=")
+is "$AS_TRIPLE_LV" "$AS_TRIPLE_COUNT" "-A flag: all triple nested variants have LV tags"
+
+# Verify PS tags on nested variants (LV > 0)
+AS_TRIPLE_NESTED=$(grep -v "^#" as_triple.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/' | wc -l)
+AS_TRIPLE_NESTED_PS=$(grep -v "^#" as_triple.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ && $8 ~ /PS=/' | wc -l)
+is "$AS_TRIPLE_NESTED_PS" "$AS_TRIPLE_NESTED" "-A flag: all nested variants (LV>0) have PS tags"
+
+rm -f as_triple.gfa as_triple.gam as_triple.pack as_triple.vcf
+
+# =============================================================================
+# RC, RS, RD tag tests (reference coordinate tags for nested snarls)
+# =============================================================================
+
+# Test: RC, RS, RD headers are present
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > rc_test.gfa 2>/dev/null
+vg sim -x rc_test.gfa -P "a#1#y0#0" -n 100 -l 2 -a -s 400 > rc_test.gam 2>/dev/null
+vg sim -x rc_test.gfa -P "a#2#y1#0" -n 100 -l 2 -a -s 401 >> rc_test.gam 2>/dev/null
+vg pack -x rc_test.gfa -g rc_test.gam -o rc_test.pack 2>/dev/null
+vg call rc_test.gfa -k rc_test.pack --top-down -P x > rc_test.vcf 2>/dev/null
+
+# Check for RC, RS, RD headers
+RC_HEADER=$(grep -c "##INFO=
Date: Wed, 11 Feb 2026 11:14:45 -0500
Subject: [PATCH 5/5] Remove dead code: unused functions and member variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove get_step(), get_parent_intervals(), print_stats() from AugRefCover and
extract_child_traversal() from FlowCaller — all defined but never called.
Remove unused ref_path_to_copy member variable and fix stale comment.
Remove duplicate protected: specifier in augref.hpp.

Co-Authored-By: Claude Opus 4.6
---
 src/augref.cpp       | 99 --------------------------------------------
 src/augref.hpp       | 17 --------
 src/graph_caller.cpp | 71 ------------------------------------------
 src/graph_caller.hpp |  7 ----
 4 files changed, 194 deletions(-)

diff --git a/src/augref.cpp b/src/augref.cpp
index c6b207537d..c2fc43a653 100644
--- a/src/augref.cpp
+++ b/src/augref.cpp
@@ -455,43 +455,6 @@ int64_t AugRefCover::get_rank(nid_t node_id) const {
     return ref_steps.empty() ? -1 : ref_steps.at(0).first;
 }
 
-step_handle_t AugRefCover::get_step(nid_t node_id) const {
-    if (!node_to_interval.count(node_id)) {
-        return step_handle_t();
-    }
-    const pair& interval = augref_intervals.at(node_to_interval.at(node_id));
-    for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) {
-        if (graph->get_id(graph->get_handle_of_step(step)) == node_id) {
-            return step;
-        }
-    }
-    return step_handle_t();
-}
-
-pair*,
-     const pair*>
-AugRefCover::get_parent_intervals(const pair& interval) const {
-
-    pair*,
-         const pair*> parents = make_pair(nullptr, nullptr);
-
-    // since our decomposition is based on snarl traversals, we know that fragments must
-    // overlap their parents on snarl end points (at the very least)
-    // therefore we can find parents by scanning along the paths.
-    step_handle_t left_parent = graph->get_previous_step(interval.first);
-    if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(interval.first))) {
-        int64_t interval_idx = this->node_to_interval.at(graph->get_id(graph->get_handle_of_step(left_parent)));
-        parents.first = &this->augref_intervals.at(interval_idx);
-    }
-
-    step_handle_t right_parent = graph->get_next_step(interval.second);
-    if (right_parent != graph->path_end(graph->get_path_handle_of_step(interval.second))) {
-        int64_t interval_idx = node_to_interval.at(graph->get_id(graph->get_handle_of_step(right_parent)));
-        parents.second = &this->augref_intervals.at(interval_idx);
-    }
-    return parents;
-}
-
 const vector>& AugRefCover::get_intervals() const {
     return this->augref_intervals;
 }
@@ -1045,8 +1008,6 @@ void AugRefCover::verify_cover() const {
 
 void AugRefCover::copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph,
                                             const unordered_set& reference_paths) {
-    ref_path_to_copy.clear();
-
     if (augref_sample_name.empty()) {
         return;
     }
@@ -1077,7 +1038,6 @@ void AugRefCover::copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutab
 #ifdef debug
             cerr << "[augref] copy_base_paths_to_sample: path " << new_name << " already exists, skipping" << endl;
 #endif
-            ref_path_to_copy[ref_path] = mutable_graph->get_path_handle(new_name);
             continue;
         }
@@ -1090,16 +1050,10 @@ void AugRefCover::copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutab
             return true;
         });
 
-        ref_path_to_copy[ref_path] = new_path;
-
 #ifdef debug
         cerr << "[augref] copy_base_paths_to_sample: copied " << ref_name << " -> " << new_name << endl;
 #endif
     }
-
-#ifdef debug
-    cerr << "[augref] copy_base_paths_to_sample: copied " << ref_path_to_copy.size() << " reference paths to sample " << augref_sample_name << endl;
-#endif
 }
 
 void AugRefCover::write_augref_segments(ostream& os) {
@@ -1299,57 +1253,4 @@ void AugRefCover::write_augref_segments(ostream& os) {
     }
 }
 
-void AugRefCover::print_stats(ostream& os) {
-    // the header
-    os << "#BasePath" << "\t"
-       << "AugRefIndex" << "\t"
-       << "Length" << "\t"
-       << "NodeStart" << "\t"
-       << "NodeEnd" << "\t"
-       << "Rank" << "\t"
-       << "AvgDepth" << endl;
-
-    for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) {
-        const pair& interval = this->augref_intervals[i];
-        path_handle_t path_handle = graph->get_path_handle_of_step(interval.first);
-        string path_name = graph->get_path_name(path_handle);
-
-        int64_t path_length = 0;
-        int64_t tot_depth = 0;
-        int64_t tot_steps = 0;
-        for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) {
-            path_length += graph->get_length(graph->get_handle_of_step(step));
-            vector steps = graph->steps_of_handle(graph->get_handle_of_step(step));
-            tot_depth += steps.size();
-            ++tot_steps;
-        }
-
-        string base_path = parse_base_path(path_name);
-        int64_t augref_index = parse_augref_index(path_name);
-
-        nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first));
-        vector> ref_nodes = this->get_reference_nodes(first_node, true);
-        int64_t rank = ref_nodes.empty() ? -1 : ref_nodes.at(0).first;
-
-        // interval is open ended, so we go back to last node
-        step_handle_t last_step;
-        if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.first))) {
-            last_step = graph->path_back(graph->get_path_handle_of_step(interval.first));
-        } else {
-            last_step = graph->get_previous_step(interval.second);
-        }
-        handle_t last_handle = graph->get_handle_of_step(last_step);
-
-        os << base_path << "\t"
-           << augref_index << "\t"
-           << path_length << "\t"
-           << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">")
-           << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t"
-           << (graph->get_is_reverse(last_handle) ? "<" : ">")
-           << graph->get_id(last_handle) << "\t"
-           << rank << "\t"
-           << std::fixed << std::setprecision(2) << ((double)tot_depth / tot_steps) << "\n";
-    }
-}
-
 }
diff --git a/src/augref.hpp b/src/augref.hpp
index b21ca89876..f0912aaf9c 100644
--- a/src/augref.hpp
+++ b/src/augref.hpp
@@ -90,13 +90,6 @@ class AugRefCover {
     // Get the rank (level) of a given node (0 if on a reference path).
     int64_t get_rank(nid_t node_id) const;
 
-    // Get the step of a given node in its covering interval.
-    step_handle_t get_step(nid_t node_id) const;
-
-    // Get the parent intervals (left and right) of a given interval.
-    pair*,
-         const pair*> get_parent_intervals(const pair& interval) const;
-
     // Get all computed intervals.
     const vector>& get_intervals() const;
 
@@ -106,9 +99,6 @@ class AugRefCover {
     // Get the number of reference intervals (rank-0).
     int64_t get_num_ref_intervals() const;
 
-    // Print out a table of statistics.
-    void print_stats(ostream& os);
-
     // Write a tab-separated table describing augref segments.
     // Each line contains: source_path, source_start, source_end, augref_path_name,
     //                     ref_path, ref_start, ref_end
@@ -167,8 +157,6 @@ class AugRefCover {
     // Prints a summary of coverage statistics to stderr.
     void verify_cover() const;
 
-protected:
-
     const PathHandleGraph* graph = nullptr;
 
     // Intervals are end-exclusive (like BED).
@@ -195,13 +183,8 @@ class AugRefCover {
     // Whether to print verbose output (coverage summary, etc.)
     bool verbose = false;
 
-    // Map from original reference path handles to their copies under augref_sample_name.
-    // Only populated when augref_sample_name is set.
-    unordered_map ref_path_to_copy;
-
     // Copy base reference paths to the augref sample.
     // Creates new paths like "new_sample#0#chr1" from "CHM13#0#chr1".
-    // Returns a map from original path handles to new path handles.
     void copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph,
                                    const unordered_set& reference_paths);

diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp
index 31158bf625..426d5d451a 100644
--- a/src/graph_caller.cpp
+++ b/src/graph_caller.cpp
@@ -1854,77 +1854,6 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) {
     return call_snarl_internal(managed_snarl, "", make_pair(0, 0), nullptr);
 }
 
-bool FlowCaller::extract_child_traversal(const SnarlTraversal& parent_trav,
-                                         const Snarl& child,
-                                         SnarlTraversal& out_child_trav) const {
-    // Find the child snarl's start and end in the parent traversal
-    // Need to check both forward and reverse orientations
-    int start_pos = -1;
-    int end_pos = -1;
-    bool needs_reverse = false;
-
-    nid_t child_start_id = child.start().node_id();
-    nid_t child_end_id = child.end().node_id();
-
-    for (int i = 0; i < parent_trav.visit_size(); ++i) {
-        const Visit& visit = parent_trav.visit(i);
-        nid_t visit_id = visit.node_id();
-
-        if (visit_id == child_start_id) {
-            // Check if orientation matches child's start
-            if (visit.backward() == child.start().backward()) {
-                // Forward match
-                start_pos = i;
-                needs_reverse = false;
-            } else {
-                // This could be the end in reverse orientation
-                end_pos = i;
-                needs_reverse = true;
-            }
-        }
-        if (visit_id == child_end_id) {
-            // Check if orientation matches child's end
-            if (visit.backward() == child.end().backward()) {
-                // Forward match
-                end_pos = i;
-            } else {
-                // This could be the start in reverse orientation
-                start_pos = i;
-                needs_reverse = true;
-            }
-        }
-    }
-
-    // Check if we found both endpoints
-    if (start_pos < 0 || end_pos < 0) {
-        return false;
-    }
-
-    // Ensure start comes before end (swap if traversing in reverse)
-    if (start_pos > end_pos) {
-        std::swap(start_pos, end_pos);
-        needs_reverse = !needs_reverse;
-    }
-
-    // Extract the portion from start to end
-    out_child_trav.Clear();
-    if (!needs_reverse) {
-        // Forward extraction
-        for (int i = start_pos; i <= end_pos; ++i) {
-            *out_child_trav.add_visit() = parent_trav.visit(i);
-        }
-    } else {
-        // Reverse extraction - go backwards and flip orientations
-        for (int i = end_pos; i >= start_pos; --i) {
-            Visit* v = out_child_trav.add_visit();
-            v->set_node_id(parent_trav.visit(i).node_id());
-            v->set_backward(!parent_trav.visit(i).backward());
-        }
-    }
-
-    return true;
-}
-
 TraversalSet FlowCaller::find_child_traversal_set(const SnarlTraversal& parent_trav,
                                                   const Snarl& child) const {
     TraversalSet result;
diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp
index 5101b48cd9..6bd84b9939 100644
--- a/src/graph_caller.hpp
+++ b/src/graph_caller.hpp
@@ -520,13 +520,6 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC
     /// Extract the portion of a parent traversal that spans a child snarl (single traversal).
     /// This is a simpler version used when we only need one traversal from the parent.
-    /// @param parent_trav The parent traversal to extract from
-    /// @param child The child snarl to extract
-    /// @param out_child_trav Output: the extracted traversal, oriented to match child snarl
-    /// @return true if the parent traversal contains the child snarl
-    bool extract_child_traversal(const SnarlTraversal& parent_trav,
-                                 const Snarl& child,
-                                 SnarlTraversal& out_child_trav) const;
 };
 
 class SnarlGraph;