diff --git a/src/augref.cpp b/src/augref.cpp
new file mode 100644
index 0000000000..c2fc43a653
--- /dev/null
+++ b/src/augref.cpp
@@ -0,0 +1,1256 @@
+#include "augref.hpp"
+// standard headers needed for make_heap, priority_queue, isdigit, and stoll below
+#include <algorithm>
+#include <queue>
+#include <cctype>
+#include <string>
+
+//#define debug
+
+namespace vg {
+
+using namespace std;
+
+const string AugRefCover::augref_suffix = "_alt";
+
+string AugRefCover::make_augref_name(const string& base_path_name, int64_t augref_index) {
+    // New naming convention: {base}_{N}_alt
+    return base_path_name + "_" + to_string(augref_index) + augref_suffix;
+}
+
+bool AugRefCover::is_augref_name(const string& path_name) {
+    // Check for pattern: _{digits}_alt at the end
+    // Must end with "_alt"
+    if (path_name.length() < 6) { // shortest possible match, "_1_alt", is 6 characters
+        return false;
+    }
+    if (path_name.substr(path_name.length() - 4) != augref_suffix) {
+        return false;
+    }
+    // Find the underscore before the digits
+    size_t alt_pos = path_name.length() - 4; // position of "_alt"
+    size_t underscore_pos = path_name.rfind('_', alt_pos - 1);
+    if (underscore_pos == string::npos || underscore_pos == alt_pos - 1) {
+        return false; // no underscore found, or nothing between underscore and _alt
+    }
+    // Check that everything between underscore and _alt is digits
+    for (size_t i = underscore_pos + 1; i < alt_pos; ++i) {
+        if (!isdigit(path_name[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+string AugRefCover::parse_base_path(const string& augref_name) {
+    if (!is_augref_name(augref_name)) {
+        return augref_name;
+    }
+    // Find _{N}_alt and strip it
+    size_t alt_pos = augref_name.length() - 4; // position of "_alt"
+    size_t underscore_pos = augref_name.rfind('_', alt_pos - 1);
+    return augref_name.substr(0, underscore_pos);
+}
+
+int64_t AugRefCover::parse_augref_index(const string& augref_name) {
+    if (!is_augref_name(augref_name)) {
+        return -1;
+    }
+    // Extract N from _{N}_alt
+    size_t alt_pos = augref_name.length() - 4; // position of "_alt"
+    size_t underscore_pos = augref_name.rfind('_', alt_pos - 1);
+    return stoll(augref_name.substr(underscore_pos + 1, alt_pos - underscore_pos - 1));
+}
+
+void AugRefCover::set_augref_sample(const string& sample_name) {
+    this->augref_sample_name = sample_name;
+}
+
+const string& AugRefCover::get_augref_sample() const {
+    return this->augref_sample_name;
+}
+
+void AugRefCover::set_verbose(bool verbose) {
+    this->verbose = verbose;
+}
+
+bool AugRefCover::get_verbose() const {
+    return this->verbose;
+}
+
+void AugRefCover::clear(MutablePathMutableHandleGraph* graph) {
+    vector<path_handle_t> augref_paths_to_remove;
+    graph->for_each_path_handle([&](path_handle_t path_handle) {
+        if (is_augref_name(graph->get_path_name(path_handle))) {
+            augref_paths_to_remove.push_back(path_handle);
+        }
+    });
+    for (path_handle_t path_handle : augref_paths_to_remove) {
+        graph->destroy_path(path_handle);
+    }
+}
+
+void AugRefCover::compute(const PathHandleGraph* graph,
+                          SnarlManager* snarl_manager,
+                          const unordered_set<path_handle_t>& reference_paths,
+                          int64_t minimum_length) {
+
+    // start from scratch
+    this->augref_intervals.clear();
+    this->interval_snarl_bounds.clear();
+    this->node_to_interval.clear();
+    this->graph = graph;
+
+    // start with the reference paths
+    for (const path_handle_t& ref_path_handle : reference_paths) {
+        this->augref_intervals.push_back(make_pair(graph->path_begin(ref_path_handle),
+                                                   graph->path_end(ref_path_handle)));
+        this->interval_snarl_bounds.push_back({0, 0});
+        graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) {
+            nid_t node_id = 
graph->get_id(graph->get_handle_of_step(step_handle)); + if (node_to_interval.count(node_id)) { + cerr << "[augref error]: node " << node_id << " covered by two reference paths," + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(augref_intervals.at(node_to_interval.at(node_id)).first)) + << ". Augmented reference path support currently requires disjoint acyclic reference paths" << endl; + exit(1); + } + node_to_interval[node_id] = augref_intervals.size() - 1; + }); + } + this->num_ref_intervals = this->augref_intervals.size(); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] Selected " << augref_intervals.size() << " rank=0 reference paths" << endl; +#endif + + // we use the path traversal finder for everything + PathTraversalFinder path_trav_finder(*graph); + + // we collect the augref cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector>> augref_intervals_vector(thread_count); + vector> node_to_interval_vector(thread_count); + vector>> snarl_bounds_vector(thread_count); + + // we process top-level snarls in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // per-thread output + vector>& thread_augref_intervals = augref_intervals_vector[omp_get_thread_num()]; + unordered_map& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()]; + vector>& thread_snarl_bounds = snarl_bounds_vector[omp_get_thread_num()]; + + // capture the top-level snarl boundary node IDs + nid_t top_snarl_start = snarl->start().node_id(); + nid_t top_snarl_end = snarl->end().node_id(); + + vector queue = {snarl}; + + while(!queue.empty()) { + const Snarl* cur_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover + compute_snarl(*cur_snarl, path_trav_finder, minimum_length, + thread_augref_intervals, + thread_node_to_interval, + top_snarl_start, top_snarl_end, + thread_snarl_bounds); + + // recurse on the children + const vector& children = snarl_manager->children_of(cur_snarl); + for (const Snarl* child_snarl : children) { + queue.push_back(child_snarl); + } + } + }); + + // now we need to fold up the thread covers + for (int64_t t = 0; t < thread_count; ++t) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "Adding " << augref_intervals_vector[t].size() << " augref intervals from thread " << t << endl; +#endif + // important to go through function rather than do a raw copy since + // inter-top-level snarl merging may need to happen + for (int64_t j = 0; j < augref_intervals_vector[t].size(); ++j) { + // the true flag at the end disables the overlap check. 
since they were computed + // in separate threads, snarls can overlap by a single node + const pair& interval = augref_intervals_vector[t][j]; + if (interval.first != graph->path_end(graph->get_path_handle_of_step(interval.first))) { + add_interval(this->augref_intervals, this->node_to_interval, augref_intervals_vector[t][j], true, + &this->interval_snarl_bounds, snarl_bounds_vector[t][j]); + } + } + augref_intervals_vector[t].clear(); + node_to_interval_vector[t].clear(); + snarl_bounds_vector[t].clear(); + } + + // second pass: greedily cover any nodes not covered by snarl traversals + fill_uncovered_nodes(minimum_length); + + // debug: verify all nodes are covered + verify_cover(); + + // remove any intervals that were made redundant by add_interval + defragment_intervals(); +} + +void AugRefCover::fill_uncovered_nodes(int64_t minimum_length) { + // Collect all uncovered nodes and the paths that pass through them + unordered_set uncovered_nodes; + map candidate_paths; // sorted by name for deterministic ordering + + graph->for_each_handle([&](handle_t handle) { + nid_t node_id = graph->get_id(handle); + if (!node_to_interval.count(node_id)) { + // Node is not covered - find paths through it + uncovered_nodes.insert(node_id); + graph->for_each_step_on_handle(handle, [&](step_handle_t step) { + path_handle_t path_handle = graph->get_path_handle_of_step(step); + string path_name = graph->get_path_name(path_handle); + // Skip existing augref paths + if (!is_augref_name(path_name)) { + candidate_paths[path_name] = path_handle; + } + return true; + }); + } + }); + + if (uncovered_nodes.empty()) { + return; + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] fill_uncovered_nodes: " << uncovered_nodes.size() << " uncovered nodes, " + << candidate_paths.size() << " candidate paths" << endl; +#endif + + // Greedily walk through paths, creating intervals for contiguous uncovered sequences + for (const auto& name_path : candidate_paths) { + if (uncovered_nodes.empty()) { + break; + } + + bool in_interval = false; + step_handle_t interval_start; + step_handle_t interval_end; + int64_t interval_length = 0; + vector interval_nodes; // track nodes in current interval + + graph->for_each_step_in_path(name_path.second, [&](step_handle_t step) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step)); + + if (uncovered_nodes.count(node_id)) { + // This node is uncovered + if (!in_interval) { + // Start a new interval + in_interval = true; + interval_start = step; + interval_length = 0; + interval_nodes.clear(); + } + interval_end = graph->get_next_step(step); + interval_length += graph->get_length(graph->get_handle_of_step(step)); + interval_nodes.push_back(node_id); + } else { + // This node is already covered + if (in_interval) { + // Close the current interval + if (interval_length >= minimum_length) { + // Add the interval and mark nodes as covered + add_interval(this->augref_intervals, this->node_to_interval, + make_pair(interval_start, interval_end), true, + &this->interval_snarl_bounds, {0, 0}); + for (nid_t nid : interval_nodes) { + uncovered_nodes.erase(nid); + } + } + in_interval = false; + } + } + return true; + }); + + // Don't forget to close any interval at the end of the path + if (in_interval && interval_length >= minimum_length) { + add_interval(this->augref_intervals, this->node_to_interval, + make_pair(interval_start, interval_end), true, + &this->interval_snarl_bounds, {0, 0}); + for (nid_t nid : interval_nodes) { + uncovered_nodes.erase(nid); + } + } + } + 
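+    // Anything left in uncovered_nodes at this point sits only in runs shorter than
+    // minimum_length, or on no usable (non-augref) path at all, so it stays uncovered.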
+#ifdef debug +#pragma omp critical(cerr) + cerr << "[augref] fill_uncovered_nodes: " << uncovered_nodes.size() << " nodes still uncovered after second pass" << endl; +#endif +} + +void AugRefCover::load(const PathHandleGraph* graph, + const unordered_set& reference_paths) { + // start from scratch + this->augref_intervals.clear(); + this->interval_snarl_bounds.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (graph->get_is_reverse(graph->get_handle_of_step(step_handle))) { + cerr << "[augref] error: Reversed step " << node_id << " found in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All augref fragments must be forward-only." << endl; + exit(1); + } + if (node_to_interval.count(node_id)) { + cerr << "[augref] error: Cycle found on node " << node_id << " in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All augref fragments must be acyclic." << endl; + exit(1); + } + node_to_interval[node_id] = augref_intervals.size(); + }); + this->augref_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); + this->interval_snarl_bounds.push_back({0, 0}); + } + this->num_ref_intervals = this->augref_intervals.size(); + + // load existing augref paths from the graph + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = augref_intervals.size(); + }); + this->augref_intervals.push_back(make_pair(graph->path_begin(path_handle), + graph->path_end(path_handle))); + this->interval_snarl_bounds.push_back({0, 0}); + } + }); +} + +void AugRefCover::apply(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); +#ifdef debug + cerr << "applying augref cover with " << this->num_ref_intervals << " ref intervals " + << " and " << this->augref_intervals.size() << " total intervals" << endl; +#endif + + // If augref_sample_name is set, first copy base reference paths to the new sample + if (!augref_sample_name.empty()) { + // Collect reference path handles from the reference intervals + unordered_set reference_paths; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + reference_paths.insert(graph->get_path_handle_of_step(augref_intervals[i].first)); + } + copy_base_paths_to_sample(mutable_graph, reference_paths); + } + + // Reset augref counters for each base path + base_path_augref_counter.clear(); + + // First pass: determine the maximum existing augref index for each base path + // This ensures we don't overwrite existing augref paths + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + string base = parse_base_path(path_name); + int64_t idx = parse_augref_index(path_name); + if (base_path_augref_counter.count(base)) { + base_path_augref_counter[base] = max(base_path_augref_counter[base], idx); + } else { + base_path_augref_counter[base] = idx; + } + } + }); + + // write the augref paths + int64_t written_intervals = 0; + int64_t 
written_length = 0; + int64_t skipped_intervals = 0; + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + // Skip empty intervals (these can be created by defragment_intervals or merging) + path_handle_t interval_path = graph->get_path_handle_of_step(augref_intervals[i].first); + if (augref_intervals[i].first == graph->path_end(interval_path)) { + skipped_intervals++; + continue; + } + + // Find the reference path this augref path extends from by tracing back to reference + nid_t first_node = graph->get_id(graph->get_handle_of_step(augref_intervals[i].first)); + vector> ref_nodes = this->get_reference_nodes(first_node, true); + + // Get the reference path name from the reference node + string base_path_name; + if (!ref_nodes.empty()) { + nid_t ref_node_id = ref_nodes.at(0).second; + int64_t ref_interval_idx = this->node_to_interval.at(ref_node_id); + path_handle_t ref_path_handle = graph->get_path_handle_of_step(augref_intervals[ref_interval_idx].first); + base_path_name = graph->get_path_name(ref_path_handle); + // Strip any subrange from the reference path name + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } else { + // Fallback to source path if no reference found (shouldn't happen) + path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(augref_intervals[i].first); + base_path_name = graph->get_path_name(source_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } + + // If augref_sample_name is set, replace the sample in base_path_name + if (!augref_sample_name.empty()) { + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_path_name, sense, sample, locus, haplotype, phase_block, subrange); + + if (sample.empty()) { + // Simple path name - prepend augref sample + base_path_name = augref_sample_name + "#0#" + base_path_name; + } else { + // Replace sample with augref sample + base_path_name = PathMetadata::create_path_name(sense, augref_sample_name, locus, haplotype, phase_block, subrange); + } + } + + // Get next available augref index for this base path + int64_t augref_index = ++base_path_augref_counter[base_path_name]; + + // Create the augref path name + string augref_name = make_augref_name(base_path_name, augref_index); + + // Create the path as REFERENCE sense + path_handle_t augref_handle = mutable_graph->create_path_handle(augref_name, false); + + int64_t interval_length = 0; + for (step_handle_t step_handle = augref_intervals[i].first; step_handle != augref_intervals[i].second; + step_handle = mutable_graph->get_next_step(step_handle)) { + mutable_graph->append_step(augref_handle, mutable_graph->get_handle_of_step(step_handle)); + interval_length += mutable_graph->get_length(mutable_graph->get_handle_of_step(step_handle)); + } + written_intervals++; + written_length += interval_length; + } + +#ifdef debug + cerr << "[augref] apply: wrote " << written_intervals << " augref paths (" << written_length << " bp), skipped " << skipped_intervals << " empty intervals" << endl; +#endif + + this->forwardize_augref_paths(mutable_graph); +} + +int64_t AugRefCover::get_rank(nid_t node_id) const { + // search back to reference in order to find the rank. + vector> ref_steps = this->get_reference_nodes(node_id, true); + // Return -1 if node is in a disconnected component that can't reach reference + return ref_steps.empty() ? 
-1 : ref_steps.at(0).first; +} + +const vector>& AugRefCover::get_intervals() const { + return this->augref_intervals; +} + +const pair* AugRefCover::get_interval(nid_t node_id) const { + if (this->node_to_interval.count(node_id)) { + return &this->augref_intervals.at(node_to_interval.at(node_id)); + } + return nullptr; +} + +int64_t AugRefCover::get_num_ref_intervals() const { + return this->num_ref_intervals; +} + +void AugRefCover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + nid_t top_snarl_start, nid_t top_snarl_end, + vector>& thread_snarl_bounds) { + + // start by finding the path traversals through the snarl + vector> travs; + vector trav_names; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (is_augref_name(trav_path_name)) { + // we ignore existing (off-reference) augref paths +#ifdef debug + cerr << "Warning : skipping existing augref traversal " << trav_path_name << endl; +#endif + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&](step_handle_t step_handle) { + return reversed ? 
graph->get_previous_step(step_handle) : graph->get_next_step(step_handle);
+            };
+            for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) {
+                trav.push_back(step_handle);
+                if (step_handle == path_travs.second[i].second) {
+                    done = true;
+                }
+            }
+            if (reversed) {
+                std::reverse(trav.begin(), trav.end());
+            }
+            travs.push_back(trav);
+            trav_names.push_back(trav_path_name);
+        }
+    }
+#ifdef debug
+#pragma omp critical(cerr)
+    cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl;
+#endif
+
+    // build an initial ranked list of candidate traversal fragments
+    vector<RankedFragment> ranked_trav_fragments;
+    for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) {
+        // only a reference traversal (or deletion that we don't need to consider)
+        // will have its first two nodes covered
+        if (this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][0]))) &&
+            this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][1])))) {
+            continue;
+        }
+
+        const vector<step_handle_t>& trav = travs.at(trav_idx);
+        vector<pair<int64_t, int64_t>> uncovered_intervals = get_uncovered_intervals(trav, thread_node_to_interval);
+
+        for (const auto& uncovered_interval : uncovered_intervals) {
+            unordered_set<nid_t> cycle_check;
+            bool cyclic = false;
+            int64_t interval_length = 0;
+            for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) {
+                handle_t handle = graph->get_handle_of_step(trav[i]);
+                interval_length += graph->get_length(handle);
+                nid_t node_id = graph->get_id(handle);
+                if (cycle_check.count(node_id)) {
+                    cyclic = true;
+                } else {
+                    cycle_check.insert(node_id);
+                }
+            }
+            if (!cyclic && interval_length >= minimum_length) {
+                int64_t trav_coverage = get_coverage(trav, uncovered_interval);
+                ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval});
+            }
+        }
+    }
+
+    // put the fragments into a max heap
+    std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end());
+
+    // now greedily pull out traversal intervals from the ranked list until none are left
+    while (!ranked_trav_fragments.empty()) {
+
+        // get the best scoring (max) fragment from heap
+        auto best_stats_fragment = ranked_trav_fragments.front();
+        std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end());
+        ranked_trav_fragments.pop_back();
+
+        const vector<step_handle_t>& trav = travs.at(best_stats_fragment.trav_idx);
+        const pair<int64_t, int64_t>& uncovered_interval = best_stats_fragment.fragment;
+
+#ifdef debug
+#pragma omp critical(cerr)
+        {
+            cerr << "best trav: ";
+            for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx));
+            cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" << endl;
+        }
+#endif
+
+        // the fragment may have been partially covered by an earlier iteration while it
+        // sat in the heap, so re-scan it and chop out any now-covered stretches
+        vector<pair<int64_t, int64_t>> chopped_intervals;
+        int64_t cur_start = -1;
+        bool chopped = false;
+        for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) {
+            nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i]));
+            bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id);
+            if (!covered && cur_start == -1) {
+                cur_start = i;
+            } else if (covered) {
+                chopped = true;
+                if (cur_start != -1) {
+                    chopped_intervals.push_back(make_pair(cur_start, i));
+                    cur_start = -1;
+                }
+            }
+        }
+        if (cur_start != -1) {
+            chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second));
+        }
+        if (chopped) {
+            for (const pair<int64_t, int64_t>& chopped_interval : 
chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, chopped_interval); + ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + } + } + continue; + } + pair new_interval = make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1))); +#ifdef debug + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#pragma omp critical(cerr) + cerr << "adding interval with length " << interval_length << endl; +#endif + add_interval(thread_augref_intervals, thread_node_to_interval, new_interval, false, + &thread_snarl_bounds, {top_snarl_start, top_snarl_end}); + } +} + +vector> AugRefCover::get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval) { + + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ? i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; +} + +bool AugRefCover::add_interval(vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global, + vector>* snarl_bounds_vec, + pair snarl_bounds) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)); + if (new_interval.second == graph->path_end(graph->get_path_handle_of_step(new_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + } +#endif + bool merged = false; + int64_t merged_interval_idx = -1; + path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + + // check the before-first step. 
if it's in an interval then it must be immediately
+    // preceding, so we merge the new interval onto the end of the found interval
+    step_handle_t before_first_step = graph->get_previous_step(new_interval.first);
+    if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) {
+        nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step));
+        if (thread_node_to_interval.count(prev_node_id)) {
+            int64_t prev_idx = thread_node_to_interval[prev_node_id];
+            pair<step_handle_t, step_handle_t>& prev_interval = thread_augref_intervals[prev_idx];
+            if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) {
+                if (prev_interval.second == new_interval.first ||
+                    (global && graph->get_previous_step(prev_interval.second) == new_interval.first)) {
+#ifdef debug
+#pragma omp critical(cerr)
+                    cerr << "prev interval found " << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first))
+                         << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? "<" : ">")
+                         << graph->get_id(graph->get_handle_of_step(prev_interval.first));
+                    if (prev_interval.second == graph->path_end(graph->get_path_handle_of_step(prev_interval.first))) {
+                        cerr << "PATH_END" << endl;
+                    } else {
+                        cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">")
+                             << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl;
+                    }
+#endif
+                    prev_interval.second = new_interval.second;
+                    merged = true;
+                    merged_interval_idx = prev_idx;
+                }
+            }
+        }
+    }
+
+    // check the end step. if it's in an interval then it must be immediately
+    // following, so we merge the new interval onto the front of the found interval
+    int64_t deleted_idx = -1;
+    step_handle_t deleted_interval_first, deleted_interval_second; // save before decommission
+    if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) {
+        nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second));
+        if (thread_node_to_interval.count(next_node_id)) {
+            int64_t next_idx = thread_node_to_interval[next_node_id];
+            pair<step_handle_t, step_handle_t>& next_interval = thread_augref_intervals[next_idx];
+            path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first);
+            if (graph->get_path_handle_of_step(next_interval.first) == path_handle) {
+                if (next_interval.first == new_interval.second ||
+                    (global && next_interval.first == graph->get_previous_step(new_interval.second))) {
+#ifdef debug
+#pragma omp critical(cerr)
+                    cerr << "next interval found " << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first))
+                         << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first));
+                    if (next_interval.second == graph->path_end(graph->get_path_handle_of_step(next_interval.second))) {
+                        cerr << "PATH_END" << endl;
+                    } else {
+                        cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl;
+                    }
+#endif
+                    if (merged == true) {
+                        // save the interval bounds BEFORE decommissioning
+                        deleted_interval_first = next_interval.first;
+                        deleted_interval_second = next_interval.second;
+                        deleted_idx = next_idx;
+                        // decommission next_interval
+                        next_interval.first = graph->path_end(next_path);
+                        next_interval.second = graph->path_front_end(next_path);
+                        if (snarl_bounds_vec) {
+                            (*snarl_bounds_vec)[next_idx] = {0, 0};
+                        }
+                        // extend the previous interval right to cover both new_interval and the deleted next_interval
+                        thread_augref_intervals[merged_interval_idx].second = deleted_interval_second;
+                    } else {
+                        // 
extend next_interval left + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } + } + } + } + } + + // add the interval to the local (thread safe) structures + if (!merged) { + merged_interval_idx = thread_augref_intervals.size(); + thread_augref_intervals.push_back(new_interval); + if (snarl_bounds_vec) { + snarl_bounds_vec->push_back(snarl_bounds); + } + } + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + if (deleted_idx >= 0) { + // move the links to the deleted interval to the merged interval + // use saved bounds since the interval has been decommissioned + for (step_handle_t step = deleted_interval_first; step != deleted_interval_second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + } + return !merged; +} + +void AugRefCover::defragment_intervals() { + vector> new_intervals; + vector> new_snarl_bounds; + this->node_to_interval.clear(); + for (int64_t i = 0; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + if (interval.first != graph->path_end(path_handle)) { + new_intervals.push_back(interval); + new_snarl_bounds.push_back(this->interval_snarl_bounds[i]); + } + } + this->augref_intervals = std::move(new_intervals); + this->interval_snarl_bounds = std::move(new_snarl_bounds); + for (int64_t i = 0; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + this->node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = i; + } + } +} + +int64_t AugRefCover::get_coverage(const vector& trav, const pair& uncovered_interval) { + int64_t coverage = 0; + + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + coverage += length * all_steps.size(); + } + + return coverage; +} + +// Ensure all nodes in augref paths are in forward orientation +void AugRefCover::forwardize_augref_paths(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); + + unordered_map id_map; + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + size_t fw_count = 0; + size_t total_steps = 0; + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + handle_t flipped_handle = mutable_graph->create_handle(mutable_graph->get_sequence(handle)); + id_map[mutable_graph->get_id(flipped_handle)] = mutable_graph->get_id(handle); + mutable_graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (mutable_graph->get_id(prev_handle) != mutable_graph->get_id(handle)) { + mutable_graph->create_edge(prev_handle, flipped_handle); + } + }); + mutable_graph->follow_edges(handle, false, [&](handle_t next_handle) { + if 
(mutable_graph->get_id(handle) != mutable_graph->get_id(next_handle)) {
+                        mutable_graph->create_edge(flipped_handle, next_handle);
+                    }
+                });
+                // self-loop cases we punted on above:
+                if (mutable_graph->has_edge(handle, handle)) {
+                    mutable_graph->create_edge(flipped_handle, flipped_handle);
+                }
+                if (mutable_graph->has_edge(handle, mutable_graph->flip(handle))) {
+                    mutable_graph->create_edge(flipped_handle, mutable_graph->flip(flipped_handle));
+                }
+                if (mutable_graph->has_edge(mutable_graph->flip(handle), handle)) {
+                    mutable_graph->create_edge(mutable_graph->flip(flipped_handle), flipped_handle);
+                }
+                vector<step_handle_t> steps = mutable_graph->steps_of_handle(handle);
+                size_t ref_count = 0;
+                for (step_handle_t step : steps) {
+                    if (mutable_graph->get_path_handle_of_step(step) == path_handle) {
+                        ++ref_count;
+                    }
+                    step_handle_t next_step = mutable_graph->get_next_step(step);
+                    handle_t new_handle = mutable_graph->get_is_reverse(mutable_graph->get_handle_of_step(step)) ? flipped_handle :
+                        mutable_graph->flip(flipped_handle);
+                    mutable_graph->rewrite_segment(step, next_step, {new_handle});
+                }
+                if (ref_count > 1) {
+                    cerr << "[augref] error: Cycle detected in augref path " << path_name << " at node " << mutable_graph->get_id(handle) << endl;
+                    exit(1);
+                }
+                ++fw_count;
+                assert(mutable_graph->steps_of_handle(handle).empty());
+                // destroy_handle() requires a deletable graph; the exact cast target was
+                // garbled, so MutablePathDeletableHandleGraph is assumed here
+                dynamic_cast<MutablePathDeletableHandleGraph*>(mutable_graph)->destroy_handle(handle);
+            }
+            ++total_steps;
+        });
+        }
+    });
+
+    // rename all the ids back to what they were (so nodes keep their ids, just get flipped around)
+    mutable_graph->reassign_node_ids([&id_map](nid_t new_id) {
+        return id_map.count(new_id) ? id_map[new_id] : new_id;
+    });
+
+    // do a check just to be sure
+    mutable_graph->for_each_path_handle([&](path_handle_t path_handle) {
+        string path_name = mutable_graph->get_path_name(path_handle);
+        if (is_augref_name(path_name)) {
+            mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) {
+                handle_t handle = mutable_graph->get_handle_of_step(step_handle);
+                if (mutable_graph->get_is_reverse(handle)) {
+                    cerr << "[augref] error: Failed to forwardize node " << mutable_graph->get_id(handle) << " in path " << path_name << endl;
+                    exit(1);
+                }
+            });
+        }
+    });
+}
+
+vector<pair<int64_t, nid_t>> AugRefCover::get_reference_nodes(nid_t node_id, bool first) const {
+
+    // search back to reference in order to find the rank.
+    unordered_set<nid_t> visited;
+    priority_queue<pair<int64_t, nid_t>> queue;
+    queue.push(make_pair(0, node_id));
+
+    nid_t current_id;
+    int64_t distance = 0;
+
+    // output reference intervals
+    vector<pair<int64_t, nid_t>> output_reference_nodes;
+
+    while (!queue.empty()) {
+        std::tie(distance, current_id) = queue.top();
+        queue.pop();
+
+        if (!visited.count(current_id)) {
+
+            visited.insert(current_id);
+
+            if (this->node_to_interval.count(current_id)) {
+                int64_t interval_idx = this->node_to_interval.at(current_id);
+
+                const pair<step_handle_t, step_handle_t>& augref_interval = this->augref_intervals.at(interval_idx);
+
+                // we've hit the reference, fish out its step and stop searching.
+                if (interval_idx < this->num_ref_intervals) {
+                    output_reference_nodes.push_back(make_pair(distance, current_id));
+                    if (first) {
+                        break;
+                    }
+                    continue;
+                }
+
+                // search out of the snarl -- any parent traversals will overlap here
+                graph->follow_edges(graph->get_handle_of_step(augref_interval.first), true, [&](handle_t prev) {
+                    queue.push(make_pair(distance + 1, graph->get_id(prev)));
+                });
+                // hack around gbwtgraph bug (feature?) 
that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(augref_interval.first); + step_handle_t last_step; + if (augref_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(augref_interval.second); + } + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + + } + } + + // Note: output_reference_nodes may be empty if the node is in a disconnected + // component that cannot trace back to any reference interval (e.g., after clipping) + return output_reference_nodes; +} + +void AugRefCover::verify_cover() const { + if (!verbose) { + return; + } + + // Check that every node in the graph is covered by the augref cover + int64_t total_nodes = 0; + int64_t total_length = 0; + int64_t ref_nodes = 0; + int64_t ref_length = 0; + int64_t alt_nodes = 0; + int64_t alt_length = 0; + int64_t uncovered_nodes = 0; + int64_t uncovered_length = 0; + + graph->for_each_handle([&](handle_t handle) { + nid_t node_id = graph->get_id(handle); + int64_t node_len = graph->get_length(handle); + total_nodes++; + total_length += node_len; + + if (!node_to_interval.count(node_id)) { + // Node is not covered (expected when min-augref-len > 0) + uncovered_nodes++; + uncovered_length += node_len; + } else { + int64_t interval_idx = node_to_interval.at(node_id); + if (interval_idx < num_ref_intervals) { + ref_nodes++; + ref_length += node_len; + } else { + alt_nodes++; + alt_length += node_len; + } + } + }); + + cerr << "[augref] verify_cover summary:" << endl + << " Total nodes: " << total_nodes << " (" << total_length << " bp)" << endl + << " Reference nodes: " << ref_nodes << " (" << ref_length << " bp)" << endl + << " Alt nodes: " << alt_nodes << " (" << alt_length << " bp)" << endl + << " Uncovered nodes: " << uncovered_nodes << " (" << uncovered_length << " bp)" << endl + << " Intervals: " << num_ref_intervals << " ref + " << (augref_intervals.size() - num_ref_intervals) << " alt" << endl; +} + +void AugRefCover::copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph, + const unordered_set& reference_paths) { + if (augref_sample_name.empty()) { + return; + } + + for (const path_handle_t& ref_path : reference_paths) { + string ref_name = mutable_graph->get_path_name(ref_path); + + // Parse the path name to extract components + // rGFA format: SAMPLE#HAPLOTYPE#CONTIG or just CONTIG + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(ref_name, sense, sample, locus, haplotype, phase_block, subrange); + + // Create new path name with augref sample + string new_name; + if (sample.empty()) { + // Simple path name (no sample info) - prepend augref sample + new_name = augref_sample_name + "#0#" + ref_name; + } else { + // Replace sample with augref sample + new_name = PathMetadata::create_path_name(sense, augref_sample_name, locus, haplotype, phase_block, subrange); + } + + // Check if path already 
exists + if (mutable_graph->has_path(new_name)) { +#ifdef debug + cerr << "[augref] copy_base_paths_to_sample: path " << new_name << " already exists, skipping" << endl; +#endif + continue; + } + + // Create the new path with same sense as original + path_handle_t new_path = mutable_graph->create_path_handle(new_name, false); + + // Copy all steps from original path to new path + mutable_graph->for_each_step_in_path(ref_path, [&](step_handle_t step) { + mutable_graph->append_step(new_path, mutable_graph->get_handle_of_step(step)); + return true; + }); + +#ifdef debug + cerr << "[augref] copy_base_paths_to_sample: copied " << ref_name << " -> " << new_name << endl; +#endif + } +} + +void AugRefCover::write_augref_segments(ostream& os) { + // Track augref counters to predict path names (same logic as apply()) + unordered_map local_augref_counter; + + // First pass: find maximum existing augref index for each base path + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (is_augref_name(path_name)) { + string base = parse_base_path(path_name); + int64_t idx = parse_augref_index(path_name); + if (local_augref_counter.count(base)) { + local_augref_counter[base] = max(local_augref_counter[base], idx); + } else { + local_augref_counter[base] = idx; + } + } + }); + + // Pre-compute reference node positions by walking all reference intervals once. + // Maps node ID -> (ref_path_handle, offset of node start on ref path) + unordered_map> ref_node_positions; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->augref_intervals[i]; + path_handle_t ref_path_handle = graph->get_path_handle_of_step(ref_interval.first); + int64_t offset = 0; + for (step_handle_t step = ref_interval.first; step != ref_interval.second; + step = graph->get_next_step(step)) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step)); + ref_node_positions[node_id] = make_pair(ref_path_handle, offset); + offset += graph->get_length(graph->get_handle_of_step(step)); + } + } + + // Collect the specific steps whose offsets we need (interval boundaries only). + // This avoids caching every step of every source path. + unordered_set needed_steps; + unordered_set source_paths_needed; + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t ph = graph->get_path_handle_of_step(interval.first); + source_paths_needed.insert(ph); + needed_steps.insert(interval.first); + needed_steps.insert(interval.second); + } + // Walk each source path once, caching offsets only for needed steps. + // Also cache path_end -> total path length for computing interval end offsets. + unordered_map step_offset_cache; + for (const path_handle_t& ph : source_paths_needed) { + int64_t offset = 0; + for (step_handle_t step = graph->path_begin(ph); + step != graph->path_end(ph); + step = graph->get_next_step(step)) { + if (needed_steps.count(step)) { + step_offset_cache[step] = offset; + } + offset += graph->get_length(graph->get_handle_of_step(step)); + } + // Cache path_end sentinel with total path length + step_handle_t path_end = graph->path_end(ph); + if (needed_steps.count(path_end)) { + step_offset_cache[path_end] = offset; + } + } + + // Cache resolved source display names per path handle to avoid + // repeated parse_path_name/create_path_name per interval. + // Stores (display_name, subrange_offset) per source path. 
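+    // (Illustrative: a subrange source path such as "CHM13#0#chr1[5000]" would be cached
+    // as ("CHM13#0#chr1", 5000), so the offsets below come out in full-path coordinates.)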
+ unordered_map> source_display_cache; + + // Write each augref interval + for (int64_t i = this->num_ref_intervals; i < this->augref_intervals.size(); ++i) { + const pair& interval = this->augref_intervals[i]; + path_handle_t source_path_handle = graph->get_path_handle_of_step(interval.first); + + // Skip empty intervals + if (interval.first == graph->path_end(source_path_handle)) { + continue; + } + + // Look up pre-computed source path offsets + int64_t source_start = step_offset_cache.at(interval.first); + int64_t source_end = step_offset_cache.at(interval.second); + + // Find reference coordinates using snarl bounds or fallback to BFS + string base_path_name; + string ref_path_name = "."; + int64_t ref_start = 0; + int64_t ref_end = 0; + + const pair& snarl_bounds = this->interval_snarl_bounds[i]; + + if (snarl_bounds.first != 0 && snarl_bounds.second != 0 && + ref_node_positions.count(snarl_bounds.first) && ref_node_positions.count(snarl_bounds.second)) { + // Use snarl boundary nodes for reference coordinates + auto& left_pos = ref_node_positions[snarl_bounds.first]; + auto& right_pos = ref_node_positions[snarl_bounds.second]; + + // Both boundary nodes should be on the same reference path + path_handle_t ref_path_handle = left_pos.first; + ref_path_name = graph->get_path_name(ref_path_handle); + + int64_t left_node_len = graph->get_length(graph->get_handle(snarl_bounds.first)); + int64_t right_node_len = graph->get_length(graph->get_handle(snarl_bounds.second)); + + int64_t left_start = left_pos.second; + int64_t left_end = left_start + left_node_len; + int64_t right_start = right_pos.second; + int64_t right_end = right_start + right_node_len; + + ref_start = min(left_start, right_start); + ref_end = max(left_end, right_end); + + // Get base path name from the reference path + base_path_name = ref_path_name; + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + } else { + // Fallback to BFS-based get_reference_nodes() for sentinel intervals + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, true); + + if (!ref_nodes.empty()) { + nid_t ref_node_id = ref_nodes.at(0).second; + int64_t ref_interval_idx = this->node_to_interval.at(ref_node_id); + path_handle_t ref_path_handle = graph->get_path_handle_of_step(augref_intervals[ref_interval_idx].first); + base_path_name = graph->get_path_name(ref_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(base_path_name, &subrange); + + ref_path_name = graph->get_path_name(ref_path_handle); + + // Find the offset of ref_node_id in the reference path + if (ref_node_positions.count(ref_node_id)) { + ref_start = ref_node_positions[ref_node_id].second; + ref_end = ref_start + graph->get_length(graph->get_handle(ref_node_id)); + } + } else { + // Fallback to source path + string source_path_name = graph->get_path_name(source_path_handle); + subrange_t subrange; + base_path_name = Paths::strip_subrange(source_path_name, &subrange); + } + } + + // If augref_sample_name is set, replace sample in base_path_name + if (!augref_sample_name.empty()) { + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_path_name, sense, sample, locus, haplotype, phase_block, subrange); + + if (sample.empty()) { + base_path_name = augref_sample_name + "#0#" + base_path_name; + } else { + base_path_name = PathMetadata::create_path_name(sense, 
augref_sample_name, locus, haplotype, phase_block, subrange); + } + } + + // Get next augref index for this base path + int64_t augref_index = ++local_augref_counter[base_path_name]; + string augref_name = make_augref_name(base_path_name, augref_index); + + // Resolve source path name to full-path coordinates using cached result. + // Parses path name once per source path to extract subrange offset and + // strip the #0 phase block. + auto cache_it = source_display_cache.find(source_path_handle); + if (cache_it == source_display_cache.end()) { + string source_path_name = graph->get_path_name(source_path_handle); + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(source_path_name, sense, sample, locus, haplotype, phase_block, subrange); + int64_t subrange_offset = (subrange != PathMetadata::NO_SUBRANGE) ? subrange.first : 0; + if (phase_block == 0) { + phase_block = PathMetadata::NO_PHASE_BLOCK; + sense = PathSense::REFERENCE; + } + string display_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, PathMetadata::NO_SUBRANGE); + cache_it = source_display_cache.emplace(source_path_handle, make_pair(std::move(display_name), subrange_offset)).first; + } + const string& display_source_name = cache_it->second.first; + int64_t display_source_start = source_start + cache_it->second.second; + int64_t display_source_end = source_end + cache_it->second.second; + + // Output the BED line + os << display_source_name << "\t" + << display_source_start << "\t" + << display_source_end << "\t" + << augref_name << "\t" + << ref_path_name << "\t" + << ref_start << "\t" + << ref_end << "\n"; + } +} + +} diff --git a/src/augref.hpp b/src/augref.hpp new file mode 100644 index 0000000000..f0912aaf9c --- /dev/null +++ b/src/augref.hpp @@ -0,0 +1,206 @@ +#ifndef VG_AUGREF_HPP_INCLUDED +#define VG_AUGREF_HPP_INCLUDED + +/** + * \file augref.hpp + * + * Interface for computing and querying augmented reference path covers. + * + * An augref cover is a set of path fragments (stored as separate paths) in the graph. + * They are always relative to an existing reference sample (ie GRCh38 or CHM13). + * Unlike rGFA paths which use complex metadata embedding, augref paths use a simple naming + * scheme: {base_path_name}_{N}_alt + * + * For example, if the reference path is "CHM13#0#chr1", augref paths would be named: + * - CHM13#0#chr1_1_alt + * - CHM13#0#chr1_2_alt + * - etc. + * + * The data structures used in this class are always relative to the original paths + * in the graph. The REFERENCE-sense fragments that are used to serialize the + * cover can be created and loaded, but they are not used beyond that. + */ + +#include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" + +namespace vg { + +using namespace std; + +class AugRefCover { +public: + // The suffix used to identify augref paths + static const string augref_suffix; // "_alt" + + // Create an augref path name from a base reference path name and an index. + // Example: make_augref_name("CHM13#0#chr1", 1) -> "CHM13#0#chr1_1_alt" + static string make_augref_name(const string& base_path_name, int64_t augref_index); + + // Test if a path name is an augref path (contains "_{N}_alt" suffix). + static bool is_augref_name(const string& path_name); + + // Parse an augref path name to extract the base reference path name. + // Returns the original base path name, or the input if not an augref path. 
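+    // Round trip: parse_base_path(make_augref_name(p, n)) == p for any base name p.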
+ // Example: parse_base_path("CHM13#0#chr1_3_alt") -> "CHM13#0#chr1" + static string parse_base_path(const string& augref_name); + + // Parse an augref path name to extract the augref index. + // Returns -1 if the path is not an augref path. + // Example: parse_augref_index("CHM13#0#chr1_3_alt") -> 3 + static int64_t parse_augref_index(const string& augref_name); + +public: + // Clear out any existing augref paths from the graph. Recommended to run this + // before compute(). + void clear(MutablePathMutableHandleGraph* graph); + + // Compute the augref cover from the graph, starting with a given set of reference paths. + void compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length); + + // Load existing augref paths from the graph, assuming they've been computed already. + // The reference_paths should be the rank-0 paths the augref paths extend from. + void load(const PathHandleGraph* graph, + const unordered_set& reference_paths); + + // Apply the augref cover to a graph (must have been computed first), adding it + // as a bunch of REFERENCE-sense paths with the simplified naming scheme. + // If augref_sample_name is set, base paths are first copied to the new sample, + // and augref paths are created under the new sample name. + void apply(MutablePathMutableHandleGraph* mutable_graph); + + // Set the sample name for augref paths. When set, apply() will: + // 1. Copy base reference paths to this new sample (CHM13#0#chr1 -> new_sample#0#chr1) + // 2. Create augref paths under the new sample (new_sample#0#chr1_1_alt, etc.) + void set_augref_sample(const string& sample_name); + + // Get the current augref sample name (empty string if not set). + const string& get_augref_sample() const; + + // Enable verbose output (coverage summary, etc.) + void set_verbose(bool verbose); + + // Check if verbose output is enabled. + bool get_verbose() const; + + // Get the rank (level) of a given node (0 if on a reference path). + int64_t get_rank(nid_t node_id) const; + + // Get all computed intervals. + const vector>& get_intervals() const; + + // Get an interval from a node. Returns nullptr if node not in an interval. + const pair* get_interval(nid_t node_id) const; + + // Get the number of reference intervals (rank-0). + int64_t get_num_ref_intervals() const; + + // Write a tab-separated table describing augref segments. + // Each line contains: source_path, source_start, source_end, augref_path_name, + // ref_path, ref_start, ref_end + // Must be called after compute() and knows what augref path names will be used. + // If augref_sample is set, uses that for the augref path names. + void write_augref_segments(ostream& os); + +protected: + + // Compute the cover for the given snarl, by greedily finding the covered paths through it. + // The cover is added to the two "thread_" structures. + // top_snarl_start/end are the boundary node IDs of the top-level snarl containing this snarl. + void compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + nid_t top_snarl_start, nid_t top_snarl_end, + vector>& thread_snarl_bounds); + + // Get intervals in traversal that are not covered according to this->node_to_interval or + // the thread_node_to_interval parameter. 
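+    // e.g. a traversal whose nodes are covered,uncovered,uncovered,covered,uncovered
+    // yields the end-exclusive index intervals {(1,3), (4,5)}.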
+ vector> get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval); + + // Add a new interval into the augref_intervals vector and update the node_to_interval map. + // If the interval can be merged into an existing, contiguous interval, do that instead. + // Returns true if a new interval was added, false if an existing interval was updated. + bool add_interval(vector>& thread_augref_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global = false, + vector>* snarl_bounds_vec = nullptr, + pair snarl_bounds = {0, 0}); + + // add_interval() can delete an existing interval. This requires a full update at the end. + void defragment_intervals(); + + // Get the total coverage of a traversal (sum of step lengths * path count). + int64_t get_coverage(const vector& trav, const pair& uncovered_interval); + + // Make sure all nodes in all augref paths are in forward orientation. + // This is always possible because they are, by definition, disjoint. + // This should only be run from inside apply(). + void forwardize_augref_paths(MutablePathMutableHandleGraph* mutable_graph); + + // Second pass: greedily cover any nodes not covered by snarl traversals. + // This handles nodes that are outside of snarls or in complex regions + // where the traversal finder couldn't find good coverage. + void fill_uncovered_nodes(int64_t minimum_length); + + // Search back to the reference and return when found. + // (here distance is the number of intervals crossed, aka rank) + // "first" toggles returning the first interval found vs all of them. + vector> get_reference_nodes(nid_t node_id, bool first) const; + + // Debug function: verify that every node in the graph is covered by the augref cover. + // Prints a summary of coverage statistics to stderr. + void verify_cover() const; + + const PathHandleGraph* graph = nullptr; + + // Intervals are end-exclusive (like BED). + vector> augref_intervals; + + // Top-level snarl boundary nodes for each interval, parallel to augref_intervals. + // (0, 0) sentinel for reference intervals and fill_uncovered_nodes intervals. + vector> interval_snarl_bounds; + + // augref_intervals[0, num_ref_intervals-1] are all rank-0 reference intervals. + int64_t num_ref_intervals = 0; + + // Map from node ID to interval index. + unordered_map node_to_interval; + + // Counter for generating unique augref indices per base path. + // Using mutable so it can be updated in apply() which is logically const for the cover. + mutable unordered_map base_path_augref_counter; + + // Optional sample name for augref paths. When set, base paths are copied to this + // sample and augref paths are created under it. + string augref_sample_name; + + // Whether to print verbose output (coverage summary, etc.) + bool verbose = false; + + // Copy base reference paths to the augref sample. + // Creates new paths like "new_sample#0#chr1" from "CHM13#0#chr1". + void copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph, + const unordered_set& reference_paths); + + // Used when selecting traversals to make the greedy cover. 
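+    // operator< is deliberately inverted on the name so that std::make_heap/push_heap
+    // put the highest-coverage fragment on top, breaking ties toward the
+    // lexicographically smallest name.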
+ struct RankedFragment { + int64_t coverage; + const string* name; + int64_t trav_idx; + pair fragment; + bool operator<(const RankedFragment& f2) const { + // note: name comparison is flipped because we want to select high coverage / low name + return this->coverage < f2.coverage || (this->coverage == f2.coverage && *this->name > *f2.name); + } + }; +}; + +} + +#endif diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index b5924a1e6d..1aabb29080 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -622,70 +622,16 @@ unordered_map> Deconstructor::add_star_traversals(vector& names, vector>& trav_clusters, vector>& trav_cluster_info, - const unordered_map>& parent_haplotypes) const { - // todo: refactor this into general genotyping code - unordered_map> sample_to_haps; - - // find out what's in the traversals - assert(names.size() == travs.size()); - for (int64_t i = 0; i < names.size(); ++i) { - string sample_name = PathMetadata::parse_sample_name(names[i]); - // for backward compatibility - if (sample_name.empty()) { - sample_name = names[i]; - } - auto phase = PathMetadata::parse_haplotype(names[i]); - if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { - // THis probably won't fit in an int. Use 0 instead. - phase = 0; - } - sample_to_haps[sample_name].push_back(phase); - } - - // find everything that's in parent_haplotyes but not the travefsals, - // and add in dummy start-alleles for them - for (const auto& parent_sample_haps : parent_haplotypes) { - string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first); - if (parent_sample_name.empty()) { - parent_sample_name = parent_sample_haps.first; - } - if (!this->sample_names.count(parent_sample_name)) { - // dont' bother for purely reference samples -- we don't need to force and allele for them. - continue; - } - for (int parent_hap : parent_sample_haps.second) { - bool found = false; - if (sample_to_haps.count(parent_sample_haps.first)) { - // note: this is brute-force search, but number of haplotypes usually tiny. 
- for (int hap : sample_to_haps[parent_sample_haps.first]) { - if (parent_hap == hap) { - found = true; - break; - } - } - } - if (!found) { - travs.push_back(Traversal()); - names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE, - parent_sample_haps.first, - "star", - parent_hap, - PathMetadata::NO_PHASE_BLOCK, - PathMetadata::NO_SUBRANGE)); - sample_to_haps[parent_sample_haps.first].push_back(parent_hap); - trav_clusters.push_back({(int)travs.size() - 1}); - trav_cluster_info.push_back(make_pair(0, 0)); - } - } - } - - return sample_to_haps; + const unordered_map>& parent_haplotypes) const { + // Delegate to the shared implementation in traversal_clusters.cpp + return vg::add_star_traversals(travs, names, trav_clusters, trav_cluster_info, + parent_haplotypes, this->sample_names); } bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, - const NestingInfo* in_nesting_info, - vector* out_nesting_infos) const { + const ChildContext* in_context, + vector* out_child_contexts) const { vector travs; @@ -707,16 +653,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // pick out the traversal corresponding to an embedded reference path, breaking ties consistently string ref_trav_name; - string parent_ref_trav_name; - if (in_nesting_info != nullptr && in_nesting_info->has_ref) { - parent_ref_trav_name = graph->get_path_name(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); -#ifdef debug -#pragma omp critical (cerr) - cerr << "Using nesting information to set reference to " << parent_ref_trav_name << endl; -#endif - // remember it for the vcf header - this->off_ref_paths[omp_get_thread_num()].insert(graph->get_path_handle_of_step(in_nesting_info->parent_path_interval.first)); - } for (int i = 0; i < travs.size(); ++i) { const string& path_trav_name = trav_path_names[i]; #ifdef debug @@ -730,20 +666,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << " trav=" << traversal_to_string(graph, travs[i], 100000) << endl; } #endif - bool ref_path_check; - if (!parent_ref_trav_name.empty()) { - // the reference was specified by the parent - ref_path_check = path_trav_name == parent_ref_trav_name; - } else { - // the reference comes from the global options - ref_path_check = ref_paths.count(path_trav_name); - } + bool ref_path_check = ref_paths.count(path_trav_name); if (ref_path_check && (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) - cerr << "Setting ref_trav_name " << ref_trav_name << (in_nesting_info ? 
" using nesting info" : "") << endl; + cerr << "Setting ref_trav_name " << ref_trav_name << endl; #endif } } @@ -781,14 +710,8 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << "Skipping site because no reference traversal was found " << graph_interval_to_string(graph, snarl_start, snarl_end) << endl; #endif return false; - } - - // remember the reference traversal of top-level snarls for doing second pass of cover computation - if (in_nesting_info != nullptr && !in_nesting_info->has_ref) { -#pragma omp critical (top_level_intervals) - this->top_level_ref_intervals[snarl_start] = trav_steps[ref_travs.at(0)]; } - + // there's not alt path through the snarl, so we can't make an interesting variant if (travs.size() < 2) { #ifdef debug @@ -798,26 +721,6 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t return false; } - if (ref_travs.size() > 1 && this->nested_decomposition) { -#ifdef debug -#pragma omp critical (cerr) - cerr << "Multiple ref traversals not yet supported with nested decomposition: removing all but first" << endl; -#endif - size_t min_start_pos = numeric_limits::max(); - int64_t first_ref_trav; - for (int64_t i = 0; i < ref_travs.size(); ++i) { - auto& ref_trav_idx = ref_travs[i]; - step_handle_t start_step = trav_steps[ref_trav_idx].first; - step_handle_t end_step = trav_steps[ref_trav_idx].second; - size_t ref_trav_pos = min(graph->get_position_of_step(start_step), graph->get_position_of_step(end_step)); - if (ref_trav_pos < min_start_pos) { - min_start_pos = ref_trav_pos; - first_ref_trav = i; - } - } - ref_travs = {ref_travs[first_ref_trav]}; - } - // XXX CHECKME this assumes there is only one reference path here, and that multiple traversals are due to cycles // we collect windows around the reference traversals @@ -955,13 +858,12 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t // jaccard clustering (using handles for now) on traversals vector> trav_cluster_info; - vector child_snarl_to_trav; + vector unused_child_snarl_mapping; vector> trav_clusters = cluster_traversals(graph, travs, sorted_travs, - (in_nesting_info ? in_nesting_info->child_snarls : - vector>()), + vector>(), cluster_threshold, trav_cluster_info, - child_snarl_to_trav); + unused_child_snarl_mapping); #ifdef debug cerr << "cluster priority"; @@ -976,18 +878,14 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } cerr << " }" << endl; } - - if (in_nesting_info) { - for (int64_t ct = 0; ct < child_snarl_to_trav.size(); ++ct) { - cerr << "child " << ct << " " << graph_interval_to_string(graph, in_nesting_info->child_snarls[ct].first, - in_nesting_info->child_snarls[ct].second) - << " maps to trav " << child_snarl_to_trav[ct] << endl; - } - } #endif - unordered_map> sample_to_haps; - if (in_nesting_info != nullptr) { + // Track sample haplotypes for passing to children (used by star alleles) + unordered_map> sample_to_haps; + + // Only run nested-specific logic when we have context from parent + // (i.e., when using top-down processing, not flat processing) + if (in_context != nullptr) { // if the reference traversal is also an alt traversal, we pop out an extra copy // todo: this is a hack add add off-reference support while keeping the current // logic where the reference traversal is always distinct from the alts. 
this step @@ -1012,14 +910,13 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t } assert(found_cluster == true); } - + // add in the star alleles -- these are alleles that were genotyped in the parent but not // the current allele, and are treated as *'s in VCF. - if (this->star_allele) { + if (this->star_allele && in_context != nullptr && !in_context->empty()) { sample_to_haps = add_star_traversals(travs, trav_path_names, trav_clusters, trav_cluster_info, - in_nesting_info->sample_to_haplotypes); + *in_context->sample_to_haplotypes); } - } vector trav_to_allele = get_alleles(v, travs, trav_steps, @@ -1027,7 +924,7 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t trav_clusters, prev_char, use_start); - + #ifdef debug assert(trav_to_allele.size() == travs.size()); cerr << "trav_to_allele ="; @@ -1035,97 +932,45 @@ bool Deconstructor::deconstruct_site(const handle_t& snarl_start, const handle_t cerr << " " << tta; } cerr << endl; -#endif +#endif // Fill in the genotypes get_genotypes(v, trav_path_names, trav_to_allele, trav_cluster_info); - // Fill in some nesting-specific (site-level) tags - NestingInfo ref_info; // since in_nesting_info is const, we put top-level stuff here - if (this->nested_decomposition) { - if (in_nesting_info != nullptr && in_nesting_info->has_ref == true) { - // if we're a child, just take what's passed in - ref_info.parent_allele = in_nesting_info->parent_allele; - ref_info.parent_len = in_nesting_info->parent_len; - ref_info.parent_ref_len = in_nesting_info->parent_ref_len; - ref_info.lv0_ref_name = in_nesting_info->lv0_ref_name; - ref_info.lv0_ref_start = in_nesting_info->lv0_ref_start; - ref_info.lv0_ref_len = in_nesting_info->lv0_ref_len; - ref_info.lv0_alt_len = in_nesting_info->lv0_alt_len; - } else { - // if we're a root, compute values from the prsent site - // todo: should they just be left undefined? - ref_info.parent_allele = 0; - ref_info.parent_len = v.alleles[0].length(); - ref_info.parent_ref_len = v.alleles[0].length(); - ref_info.parent_path_interval = trav_steps[ref_trav_idx]; - ref_info.parent_ref_interval = trav_steps[ref_trav_idx]; - ref_info.lv0_ref_name = v.sequenceName; - ref_info.lv0_ref_start = v.position; - ref_info.lv0_ref_len = v.alleles[0].length(); - ref_info.lv0_alt_len = v.alleles[ref_info.parent_allele].length(); - ref_info.lv = 0; - } - v.info["PA"].push_back(std::to_string(ref_info.parent_allele)); - v.info["PL"].push_back(std::to_string(ref_info.parent_len)); - v.info["PR"].push_back(std::to_string(ref_info.parent_ref_len)); - v.info["RC"].push_back(ref_info.lv0_ref_name); - v.info["RS"].push_back(std::to_string(ref_info.lv0_ref_start)); - v.info["RD"].push_back(std::to_string(ref_info.lv0_ref_len)); - v.info["RL"].push_back(std::to_string(ref_info.lv0_alt_len)); - } - - if (i == 0 && out_nesting_infos != nullptr) { - // we pass some information down to the children - // todo: do/can we consider all the diferent reference intervals? - // right now, the info passed is hopefully coarse-grained enough not to matter? 
- assert(in_nesting_info != nullptr && - in_nesting_info->child_snarls.size() == out_nesting_infos->size()); - - for (int64_t j = 0; j < out_nesting_infos->size(); ++j) { - out_nesting_infos->at(j).child_snarls.clear(); - out_nesting_infos->at(j).has_ref = false; - out_nesting_infos->at(j).lv = in_nesting_info->lv + 1; - if (child_snarl_to_trav[j] >= 0) { - if (child_snarl_to_trav[j] < trav_steps.size()) { - NestingInfo& child_info = out_nesting_infos->at(j); - child_info.has_ref = true; - child_info.parent_path_interval = trav_steps[child_snarl_to_trav[j]]; - child_info.parent_ref_interval = trav_steps[ref_trav_idx]; - child_info.sample_to_haplotypes = sample_to_haps; - child_info.parent_allele = trav_to_allele[child_snarl_to_trav[j]] >= 0 ? - trav_to_allele[child_snarl_to_trav[j]] : 0; - child_info.parent_len = v.alleles[child_info.parent_allele].length(); - child_info.parent_ref_len = v.alleles[0].length(); - child_info.lv0_ref_name = ref_info.lv0_ref_name; - child_info.lv0_ref_start = ref_info.lv0_ref_start; - child_info.lv0_ref_len = ref_info.lv0_ref_len; - if (in_nesting_info == nullptr || in_nesting_info->has_ref == false) { - // we're the parent or root, so we want to set this here - child_info.lv0_alt_len = child_info.parent_len; - } else { - child_info.lv0_alt_len = ref_info.lv0_alt_len; - } + // Note: LV and PS tags are computed by update_nesting_info_tags() in write_variants() + // when include_nested is true (same as vg call) + + // Only populate child contexts when: + // 1. Star alleles are enabled (context only needed for star allele detection) + // 2. We have a place to put them (out_child_contexts != nullptr) + // 3. This is the first reference traversal (i == 0 to avoid duplicate work) + // 4. There's exactly one reference traversal (no cycles) - for cyclic references, + // we don't pass context to children and let them process independently + if (this->star_allele && i == 0 && out_child_contexts != nullptr && ref_travs.size() == 1) { + // Build sample_to_haps from ALL current traversals (for star alleles) + // This includes ALL haplotypes at the parent level, which is needed for + // add_star_traversals to detect missing haplotypes at nested sites + if (sample_to_haps.empty()) { + for (int64_t ti = 0; ti < trav_path_names.size(); ++ti) { + string sample_name = PathMetadata::parse_sample_name(trav_path_names[ti]); + if (sample_name.empty()) { + sample_name = trav_path_names[ti]; } + auto phase = PathMetadata::parse_haplotype(trav_path_names[ti]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + phase = 0; + } + sample_to_haps[sample_name].push_back(phase); } } - // update the path cover -#pragma omp critical (off_ref_info) - { - this->update_path_cover(trav_steps, trav_clusters, in_nesting_info->has_ref ? 
*in_nesting_info : ref_info); - - } - - // remember the reference path of this variant site - // for fasta output, along with information about its parent - int64_t ref_trav = ref_travs[i]; - const PathInterval& ref_path_interval = trav_steps[ref_trav]; - if (!this->ref_paths.count(graph->get_path_name(graph->get_path_handle_of_step(ref_path_interval.first)))) { -#pragma omp critical (off_ref_info) - { - //this->off_ref_sequences[ref_path_interval] = *in_nesting_info; - } + // Create a shared_ptr to the map so all children share the same data + // (avoids copying the map for each child, major memory optimization) + auto shared_haps = std::make_shared>>(std::move(sample_to_haps)); + + // Pass the shared map to ALL children + for (int64_t j = 0; j < out_child_contexts->size(); ++j) { + out_child_contexts->at(j).sample_to_haplotypes = shared_haps; } } @@ -1156,7 +1001,7 @@ string Deconstructor::get_vcf_header() { ref_haplotypes.insert(PathMetadata::parse_haplotype(ref_path_name)); } if (!long_ref_contig) { - long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || nested_decomposition; + long_ref_contig = ref_samples.size() > 1 || ref_haplotypes.size() > 1 || include_nested; } sample_names.clear(); unordered_map> sample_to_haps; @@ -1280,15 +1125,9 @@ string Deconstructor::get_vcf_header() { if (include_nested) { stream << "##INFO=" << endl; stream << "##INFO=" << endl; - } - if (this->nested_decomposition) { - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; stream << "##INFO=" << endl; - stream << "##INFO=" << endl; - stream << "##INFO=" << endl; + stream << "##INFO=" << endl; } if (untangle_allele_traversals) { stream << "##INFO=|<][id]_[start|.]_[end|.], with '.' indicating non-reference nodes.\">" << endl; @@ -1314,14 +1153,7 @@ string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const } set all_ref_paths = this->ref_paths; - - // add in the off-ref paths that nested deconstruction may have found - for (const unordered_set& off_ref_path_set : this->off_ref_paths) { - for (const path_handle_t& off_ref_path : off_ref_path_set) { - all_ref_paths.insert(graph->get_path_name(off_ref_path)); - } - } - + map ref_path_to_length; for(auto& refpath : all_ref_paths) { assert(graph->has_path(refpath)); @@ -1361,11 +1193,13 @@ string Deconstructor::add_contigs_to_vcf_header(const string& vcf_header) const } void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { + // Simple flat processing of all snarls without context passing. + // More memory-efficient than top-down approach when star alleles aren't needed. 
vector snarls; vector queue; - // read all our snarls into a list + // Read all snarls into a list snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { queue.push_back(snarl); }); @@ -1381,8 +1215,8 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { swap(snarls, queue); } - // process the whole shebang in parallel -#pragma omp parallel for schedule(dynamic,1) + // Process all snarls in parallel (no context passing) +#pragma omp parallel for schedule(dynamic, 1) for (size_t i = 0; i < snarls.size(); i++) { deconstruct_site(graph->get_handle(snarls[i]->start().node_id(), snarls[i]->start().backward()), graph->get_handle(snarls[i]->end().node_id(), snarls[i]->end().backward())); @@ -1391,46 +1225,34 @@ void Deconstructor::deconstruct_graph(SnarlManager* snarl_manager) { void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // logic copied from vg call (graph_caller.cpp) - + size_t thread_count = get_thread_count(); - this->off_ref_paths.clear(); - this->off_ref_paths.resize(get_thread_count()); // Used to recurse on children of parents that can't be called - vector>> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); - // Run the deconstructor on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl, NestingInfo nesting_info) { + // Run the deconstructor on a snarl, and queue up the children + auto process_snarl = [&](const Snarl* snarl, ChildContext context) { if (!snarl_manager->is_trivial(snarl, *graph)) { const vector& children = snarl_manager->children_of(snarl); - assert(nesting_info.child_snarls.empty()); - for (const Snarl* child : children) { - nesting_info.child_snarls.push_back(make_pair(graph->get_handle(child->start().node_id(), child->start().backward()), - graph->get_handle(child->end().node_id(), child->end().backward()))); - - } - vector out_nesting_infos(children.size()); - bool was_deconstructed = deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), - graph->get_handle(snarl->end().node_id(), snarl->end().backward()), - include_nested ? &nesting_info : nullptr, - include_nested ? &out_nesting_infos : nullptr); - if (include_nested || !was_deconstructed) { - vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + vector out_child_contexts(children.size()); + deconstruct_site(graph->get_handle(snarl->start().node_id(), snarl->start().backward()), + graph->get_handle(snarl->end().node_id(), snarl->end().backward()), + include_nested ? &context : nullptr, + include_nested ? 
&out_child_contexts : nullptr); + if (include_nested) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; for (int64_t i = 0; i < children.size(); ++i) { - thread_queue.push_back(make_pair(children[i], out_nesting_infos[i])); + thread_queue.push_back(make_pair(children[i], out_child_contexts[i])); } - - } + } } }; - // Start with the top level snarls - // (note, can't do for_each_top_level_snarl_parallel() because interface wont take nesting info) - vector> top_level_snarls; + // Start with the top level snarls (no parent context) + vector> top_level_snarls; snarl_manager->for_each_top_level_snarl([&](const Snarl* snarl) { - NestingInfo nesting_info; - nesting_info.has_ref = false; - nesting_info.lv = 0; - top_level_snarls.push_back(make_pair(snarl, nesting_info)); + ChildContext empty_context; + top_level_snarls.push_back(make_pair(snarl, empty_context)); }); #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < top_level_snarls.size(); ++i) { @@ -1439,9 +1261,9 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector>& snarl_vec) {return snarl_vec.empty();})) { - vector> cur_queue; - for (vector>& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); @@ -1451,15 +1273,6 @@ void Deconstructor::deconstruct_graph_top_down(SnarlManager* snarl_manager) { process_snarl(cur_queue[i].first, cur_queue[i].second); } } - - - // if we are computing a cover, do a pass and scrape up everything we coudln't find in travesals - if (this->nested_decomposition) { - // note: not parallel since we'll be banging away at the same coverage map - for (const auto& top_level_snarl : top_level_snarls) { - fill_cover_second_pass(snarl_manager, top_level_snarl.first); - } - } } /** @@ -1474,12 +1287,11 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand bool long_ref_contig, double cluster_threshold, gbwt::GBWT* gbwt, - bool nested_decomposition, bool star_allele) { this->graph = graph; this->ref_paths = set(ref_paths.begin(), ref_paths.end()); - this->include_nested = include_nested || nested_decomposition; + this->include_nested = include_nested; this->path_jaccard_window = context_jaccard_window; this->untangle_allele_traversals = untangle_traversals; this->keep_conflicted_genotypes = keep_conflicted; @@ -1490,18 +1302,17 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand } this->cluster_threshold = cluster_threshold; this->gbwt = gbwt; - this->nested_decomposition = nested_decomposition; this->star_allele = star_allele; // the need to use nesting is due to a problem with omp tasks and shared state // which results in extremely high memory costs (ex. ~10x RAM for 2 threads vs. 
1) omp_set_nested(1); omp_set_max_active_levels(3); - + // create the traversal finder map reads_by_name; path_trav_finder = unique_ptr(new PathTraversalFinder(*graph)); - + if (gbwt != nullptr) { gbwt_trav_finder = unique_ptr(new GBWTTraversalFinder(*graph, *gbwt)); } @@ -1509,9 +1320,13 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand string hstr = this->get_vcf_header(); assert(output_vcf.openForOutput(hstr)); - if (nested_decomposition) { + // Use top-down processing only when star alleles are needed (requires context passing). + // Otherwise use simpler flat processing which is more memory-efficient. + if (star_allele) { + cerr << "[vg deconstruct] Using top-down processing (star alleles enabled)" << endl; deconstruct_graph_top_down(snarl_manager); } else { + cerr << "[vg deconstruct] Using flat processing" << endl; deconstruct_graph(snarl_manager); } @@ -1522,287 +1337,5 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand write_variants(cout, snarl_manager); } -void Deconstructor::update_path_cover(const vector>& trav_steps, - const vector>& traversal_clusters, - const NestingInfo& nesting_info) const { - // for every cluster, add off-reference sequences - // todo: are these in the best order? - for (const vector& trav_cluster : traversal_clusters) { - if (trav_cluster[0] >= trav_steps.size()) { - assert(this->star_allele); - continue; - } - const PathInterval& path_interval = trav_steps.at(trav_cluster[0]); - int64_t start = graph->get_position_of_step(path_interval.first); - int64_t end = graph->get_position_of_step(path_interval.second); - bool reversed = start > end; - - // scan the interval storing any uncovered sub-intervals - vector sub_intervals; - bool open_interval = false; - step_handle_t prev_step; - string prev_name; - for (step_handle_t step = path_interval.first; step != path_interval.second; - step = (reversed ? graph->get_previous_step(step) : graph->get_next_step(step))) { - if (this->node_cover.count(graph->get_id(graph->get_handle_of_step(step)))) { - if (open_interval) { - if (!this->ref_paths.count(prev_name)) { - // expects inclusive interval - step_handle_t end_step = reversed ? graph->get_next_step(step) : graph->get_previous_step(step); - sub_intervals.push_back(make_pair(prev_step, end_step)); - } - open_interval = false; - } - } else { - if (open_interval == false) { - prev_step = step; - prev_name = graph->get_path_name(graph->get_path_handle_of_step(prev_step)); - open_interval = true; - } - this->node_cover.insert(graph->get_id(graph->get_handle_of_step(step))); - } - } - - if (open_interval && !this->ref_paths.count(prev_name)) { - // expects inclusive interval - step_handle_t end_step = reversed ? 
graph->get_next_step(path_interval.second) : - graph->get_previous_step(path_interval.second); - sub_intervals.push_back(make_pair(prev_step, end_step)); - } - - // update the path cover - for (const PathInterval& interval : sub_intervals) { - // todo: store something - this->off_ref_sequences[interval] = nesting_info; - } - } -} - -static string resolve_path_name(const PathPositionHandleGraph* graph, - const PathInterval& path_interval, - int64_t& out_start, - int64_t& out_end, - bool& out_reversed) { - - path_handle_t path_handle = graph->get_path_handle_of_step(path_interval.first); - step_handle_t step1 = path_interval.first; - step_handle_t step2 = path_interval.second; - out_start = graph->get_position_of_step(step1); - out_end = graph->get_position_of_step(step2); - // until now, everything is oriented on the snarl - // but here we flip so that we're oriented on the path - out_reversed = out_start > out_end; - if (out_reversed) { - swap(step1, step2); - swap(out_start, out_end); - } - out_end += graph->get_length(graph->get_handle_of_step(step2)); - - - // apply the offset to the name - string path_name = graph->get_path_name(path_handle); - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); - - if (subrange == PathMetadata::NO_SUBRANGE) { - subrange.first = out_start; - subrange.second = out_end; - } else { - subrange.first += out_start; - subrange.second = subrange.first + (out_end - out_start); - } - if (phase_block == 0) { - phase_block = PathMetadata::NO_PHASE_BLOCK; - sense = PathSense::REFERENCE; - } - path_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); - - return path_name; -} - -void Deconstructor::fill_cover_second_pass(const SnarlManager* snarl_manager, const Snarl* snarl) const { - pair, unordered_set > contents = snarl_manager->deep_contents(snarl, *graph, false); - - // this is a simple brute-force way to fill in nodes that aren't covered by traversals, using - // path-name sort as sole metric - // todo: use someting more clever? 
- - handle_t snarl_start = graph->get_handle(snarl->start().node_id(), snarl->start().backward()); - if (!this->top_level_ref_intervals.count(snarl_start)) { - return; - } - - // collect all the candidate nodes - unordered_set uncovered_nodes; - // collect all the candidate paths - map path_map; - for (id_t node_id : contents.first) { - if (!this->node_cover.count(node_id)) { - vector> step_paths; - graph->for_each_step_on_handle(graph->get_handle(node_id), [&](step_handle_t step) { - path_handle_t path_handle = graph->get_path_handle_of_step(step); - string path_name = graph->get_path_name(path_handle); - if (this->ref_paths.count(path_name)) { - step_paths.clear(); - return false; - } else { - step_paths.push_back(make_pair(path_handle, path_name)); - } - return true; - }); - if (!step_paths.empty()) { - uncovered_nodes.insert(node_id); - this->node_cover.insert(node_id); - } - for (const auto& step_path : step_paths) { - path_map[step_path.second] = step_path.first; - } - } - } - - if (!uncovered_nodes.empty()) { - // fill up the reference metadata we need for the output table - NestingInfo nesting_info; - nesting_info.parent_ref_interval = this->top_level_ref_intervals[snarl_start]; - int64_t ref_start, ref_end; - bool ref_reversed; - string ref_path_name = resolve_path_name(graph, nesting_info.parent_ref_interval, - ref_start, ref_end, ref_reversed); - nesting_info.lv0_ref_start = ref_start; - nesting_info.lv0_ref_len = ref_end - ref_start; // todo : check - nesting_info.lv0_ref_name = Paths::strip_subrange(ref_path_name); - - // greedily add the paths by name, using similar logic to first pass - for (const auto& name_path : path_map) { - bool open_interval = false; - step_handle_t prev_step; - graph->for_each_step_in_path(name_path.second, [&](step_handle_t step) { - if (uncovered_nodes.count(graph->get_id(graph->get_handle_of_step(step)))) { - if (open_interval == false) { - open_interval = true; - prev_step = step; - } - uncovered_nodes.erase(graph->get_id(graph->get_handle_of_step(step))); - } else { - if (open_interval == true) { - this->off_ref_sequences[make_pair(prev_step, graph->get_previous_step(step))] = nesting_info; - open_interval = false; - } - } - }); - if (open_interval) { - this->off_ref_sequences[make_pair(prev_step, graph->path_back(name_path.second))] = nesting_info; - } - if (uncovered_nodes.empty()) { - break; - } - } - } - assert(uncovered_nodes.empty()); -} - -void Deconstructor::save_off_ref_sequences(const string& out_fasta_filename) const { - ofstream out_fasta_file(out_fasta_filename); - if (!out_fasta_file) { - cerr << "[deconstruct] error: Unable to open " << out_fasta_filename << " for writing" << endl; - exit(1); - } - - string metadata_filename = out_fasta_filename + ".nesting.tsv"; - ofstream out_metadata_file(metadata_filename); - if (!out_metadata_file) { - cerr << "[deconstruct] error: Unable to open " << metadata_filename << " for writing" << endl; - exit(1); - } - - // sort the sequences by name / pos - vector::const_iterator> sorted_map; - for (unordered_map::const_iterator i = this->off_ref_sequences.begin(); - i != this->off_ref_sequences.end(); ++i) { - sorted_map.push_back(i); - } - std::sort(sorted_map.begin(), sorted_map.end(), [&](unordered_map::const_iterator i1, - unordered_map::const_iterator i2) { - string n1 = graph->get_path_name(graph->get_path_handle_of_step(i1->first.first)); - string n2 = graph->get_path_name(graph->get_path_handle_of_step(i2->first.first)); - if (n1 != n2) { - return n1 < n2; - } - int64_t pos1 = 
graph->get_position_of_step(i1->first.first); - int64_t pos2 = graph->get_position_of_step(i2->first.first); - return pos1 < pos2; - }); - - // merge the intervals if they overlap - // this can happen, ex, in consecutive snarls along a chain where - // they one snarl starts where the previous ends, and keeping them - // separate would lead to an overlap between the two paths - vector> merged_intervals; - for (const auto& i : sorted_map) { - bool merged = false; - const PathInterval& cur_interval = i->first; - if (!merged_intervals.empty()) { - PathInterval& prev_interval = merged_intervals.back().first; - if (cur_interval.first == prev_interval.second) { - assert(i->second.parent_path_interval == merged_intervals.back().second->parent_path_interval); - prev_interval.second = cur_interval.second; - merged = true; - } else if (cur_interval.second == prev_interval.first) { - assert(i->second.parent_path_interval == merged_intervals.back().second->parent_path_interval); - prev_interval.first = cur_interval.first; - merged = true; - } else { - assert(cur_interval.first != prev_interval.first); - assert(cur_interval.second != prev_interval.second); - } - } - if (!merged) { - merged_intervals.push_back(make_pair(cur_interval, &i->second)); - } - } - - for (const auto i : merged_intervals) { - const PathInterval& path_interval = i.first; - int64_t pos1; - int64_t pos2; - bool is_reversed; - string path_name = resolve_path_name(this->graph, path_interval, pos1, pos2, is_reversed); - string path_sequence; - // write the path as a fasta string - step_handle_t step1 = is_reversed ? path_interval.second : path_interval.first; - step_handle_t step2 = is_reversed ? path_interval.first : path_interval.second; - for (step_handle_t step = step1; step != graph->get_next_step(step2); - step = graph->get_next_step(step)) { - path_sequence += graph->get_sequence(graph->get_handle_of_step(step)); - } - write_fasta_sequence(path_name, path_sequence, out_fasta_file); - - string snarl_name = graph_interval_to_string(graph, graph->get_handle_of_step(path_interval.first), - graph->get_handle_of_step(path_interval.second)); - - // write a corresponding metadata record in the tsv with the nesting info in it - const NestingInfo& nesting_info = *i.second; - int64_t par_pos1; - int64_t par_pos2; - bool par_reversed; - - // note: we subtract 1 below to convert from 1-based vcf to 0-based bed - out_metadata_file << Paths::strip_subrange(path_name) << "\t" - << pos1 << "\t" - << pos2 << "\t" - << snarl_name << "\t" - << nesting_info.lv0_ref_name << "\t" - << (nesting_info.lv0_ref_start -1) << "\t" - << (nesting_info.lv0_ref_start + nesting_info.lv0_ref_len -1) - << endl; - - } -} - } diff --git a/src/deconstructor.hpp b/src/deconstructor.hpp index 8a7c19f82b..66bd44d47a 100644 --- a/src/deconstructor.hpp +++ b/src/deconstructor.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "genotypekit.hpp" #include "Variant.h" #include "handle.hpp" @@ -47,13 +48,7 @@ class Deconstructor : public VCFOutputCaller { bool long_ref_contig, double cluster_threshold = 1.0, gbwt::GBWT* gbwt = nullptr, - bool nested_decomposition = false, bool star_allele = false); - - // write the off-reference sequences to a fasta file - // todo: would really like to refactor so that this can somehow - // go into vg paths. - void save_off_ref_sequences(const string& out_fasta_filename) const; private: @@ -65,39 +60,37 @@ class Deconstructor : public VCFOutputCaller { // construction. 
end result: this hacky function to patch them in before printing string add_contigs_to_vcf_header(const string& vcf_header) const; - // deconstruct all snarls in parallel (ie nesting relationship ignored) + // Simple flat processing: collect all snarls then process in parallel. + // More memory-efficient when star alleles aren't needed. void deconstruct_graph(SnarlManager* snarl_manager); - // deconstruct all top-level snarls in parallel - // nested snarls are processed after their parents in the same thread - // (same logic as vg call) + // Top-down processing: process snarls level by level, passing context to children. + // Required for star allele detection (context tracks parent haplotypes). void deconstruct_graph_top_down(SnarlManager* snarl_manager); - // some information we pass from parent to child site when - // doing nested deconstruction - struct NestingInfo { - bool has_ref; // flag essentially determines if struct is initalized - vector> child_snarls; // children of parent - PathInterval parent_path_interval; // parent path - PathInterval parent_ref_interval; // reference path of parent site - unordered_map> sample_to_haplotypes; // parent genotpyes - int parent_allele; // parent allele - int64_t parent_len; // length of parent allele - int64_t parent_ref_len; // length of reference allele at parent site - string lv0_ref_name; // reference path name at top level - int64_t lv0_ref_start; // reference start position at top level - int64_t lv0_ref_len; // reference allele length at top level - int64_t lv0_alt_len; // alt allele length at top level - int64_t lv; // level + // Context passed from parent to child site for nested deconstruction. + // Used for star allele detection: samples in parent but not in child get * allele. + // Note: context is only populated when star_allele is enabled. + // Note: context is NOT passed when parent has multiple reference traversals (cycles). + struct ChildContext { + // Map from sample name to vector of haplotype phases that traversed the parent snarl. + // Child compares against its own traversals to detect star alleles. + // Format matches what add_star_traversals() expects. + // Uses shared_ptr so all children of a snarl share the same map (memory optimization). + std::shared_ptr>> sample_to_haplotypes; + + // Check if context is populated + bool empty() const { return !sample_to_haplotypes || sample_to_haplotypes->empty(); } }; - + // write a vcf record for the given site. returns true if a record was written // (need to have a path going through the site) - // the nesting_info structs are optional and used to pass reference information through nested sites... 
- // the output nesting_info vector writes a record for each child snarl + // in_context: optional context from parent (for star allele detection) + // out_child_contexts: if provided, populated with contexts for each child snarl + // (only populated when site has exactly one reference traversal) bool deconstruct_site(const handle_t& snarl_start, const handle_t& snarl_end, - const NestingInfo* in_nesting_info = nullptr, - vector* out_nesting_infos = nullptr) const; + const ChildContext* in_context = nullptr, + vector* out_child_contexts = nullptr) const; // get the traversals for a given site // this returns a combination of embedded path traversals and gbwt traversals @@ -108,15 +101,10 @@ class Deconstructor : public VCFOutputCaller { vector& out_trav_path_names, vector>& out_trav_steps) const; - // this is a hack to add in * alleles -- these are haplotypes that we genotyped in the - // parent but aren't represented in any of the traversals found in the current - // site. *-alleles are represented as empty traversals. - // todo: conflicts arising from alt-cycles will be able to lead to conflicting - // results -- need to overhaul code to pass more detailed traversal information - // from parent to child to have a chance at consistently resolving - // star traversals are appended onto travs and trav_names - // this funtion returns a map containing both parent and child haploty - unordered_map> add_star_traversals(vector& travs, + // Add star (*) alleles for haplotypes that were genotyped in the parent but don't + // have a traversal through the current site. Star alleles are represented as empty + // traversals. Returns a map of sample name to haplotype phases (both parent and child). + unordered_map> add_star_traversals(vector& travs, vector& trav_names, vector>& trav_clusters, vector>& trav_cluster_info, @@ -142,14 +130,6 @@ class Deconstructor : public VCFOutputCaller { const vector& trav_to_name, const vector& gbwt_phases) const; - // update off-ref path cover - void update_path_cover(const vector>& trav_steps, - const vector>& traversal_clusters, - const NestingInfo& nesting_info) const; - - // second pass to add non-traversal fragments to cover - void fill_cover_second_pass(const SnarlManager* snarl_manager, const Snarl* snarl) const; - // the underlying context-getter vector get_context( step_handle_t start_step, @@ -174,10 +154,6 @@ class Deconstructor : public VCFOutputCaller { // the ref paths set ref_paths; - // the off-ref paths that may be found during nested deconstruction - // (buffered by thread) - mutable vector> off_ref_paths; - // keep track of reference samples set ref_samples; @@ -210,24 +186,10 @@ class Deconstructor : public VCFOutputCaller { // merge if identical (which is what deconstruct has always done) double cluster_threshold = 1.0; - // activate the new nested decomposition mode, which is like the old include_nested - // (which lives in vcfoutputcaller) but with more of an effort to link - // the parent and child snarls, as well as better support for nested insertions - bool nested_decomposition = false; - - // use *-alleles to represent spanning alleles that do not cross site but do go around it - // ex: a big containing deletion - // only works with nested_decomposition + // use *-alleles to represent haplotypes that span the parent site but don't + // traverse the current nested site (e.g., a deletion that spans a nested SNP) + // only works with include_nested bool star_allele = false; - - // keep track of off-reference reference sequences to print to 
fasta at the end - mutable unordered_map off_ref_sequences; - - // keep track of path cover for computing off_ref_sequences - mutable unordered_set node_cover; - - // keep track of top-level reference traversals for second pass of cover - mutable unordered_map top_level_ref_intervals; }; diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index f38163f4eb..426d5d451a 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -1,6 +1,7 @@ #include "graph_caller.hpp" #include "algorithms/expand_context.hpp" #include "annotation.hpp" +#include "augref.hpp" //#define debug @@ -233,6 +234,9 @@ string VCFOutputCaller::vcf_header(const PathHandleGraph& graph, const vector" << endl; ss << "##INFO=" << endl; + ss << "##INFO=" << endl; + ss << "##INFO=" << endl; + ss << "##INFO=" << endl; } ss << "##INFO=" << endl; return ss.str(); @@ -479,10 +483,24 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa ref_trav_idx); // deduplicate alleles and compute the site traversals and genotype - map allele_to_gt; - allele_to_gt[out_variant.ref] = 0; + map allele_to_gt; + allele_to_gt[out_variant.ref] = 0; + int star_allele_idx = -1; // index for star allele in allele_to_gt, if needed for (int i = 0; i < genotype.size(); ++i) { - if (genotype[i] == ref_trav_idx) { + if (genotype[i] == STAR_ALLELE_MARKER) { + // Star allele: haplotype spans this site but has no defined traversal here + if (star_allele_idx < 0) { + // Add star allele to allele list + star_allele_idx = allele_to_gt.size(); + allele_to_gt["*"] = star_allele_idx; + // Add empty traversal as placeholder (won't be used for AT info) + site_traversals.push_back(SnarlTraversal()); + } + site_genotype.push_back(star_allele_idx); + } else if (genotype[i] == MISSING_ALLELE_MARKER) { + // Missing allele: parent doesn't traverse this child, output as '.' 
in VCF + site_genotype.push_back(MISSING_ALLELE_MARKER); + } else if (genotype[i] == ref_trav_idx) { site_genotype.push_back(0); } else { string allele_string = trav_to_string(called_traversals, genotype, genotype[i], i, ref_trav_idx); @@ -562,7 +580,11 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa stringstream vcf_gt; if (!genotype.empty()) { for (int i = 0; i < site_genotype.size(); ++i) { - vcf_gt << site_genotype[i]; + if (site_genotype[i] == MISSING_ALLELE_MARKER) { + vcf_gt << "."; + } else { + vcf_gt << site_genotype[i]; + } if (i != site_genotype.size() - 1) { vcf_gt << "/"; } @@ -602,6 +624,15 @@ bool VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa } #endif + // If genotype contains missing allele but no ALT, add * as ALT to emit valid VCF + // This happens when one parent haplotype doesn't traverse a nested child snarl + bool has_missing = std::find(site_genotype.begin(), site_genotype.end(), MISSING_ALLELE_MARKER) != site_genotype.end(); + if (has_missing && out_variant.alt.empty()) { + out_variant.alt.push_back("*"); + out_variant.alleles.push_back("*"); + out_variant.info["AT"].push_back("."); + } + if (genotype_snarls || !out_variant.alt.empty()) { bool added = add_variant(out_variant); if (!added) { @@ -828,7 +859,7 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, functionset_node_id(start); snarl.mutable_start()->set_backward(frag[0] == '<'); - assert(frag[toks[0].size() + 1] == '<' || frag[toks[0].size() + 1] == '<'); + assert(frag[toks[0].size() + 1] == '<' || frag[toks[0].size() + 1] == '>'); int64_t end = std::stoi(toks[1]); snarl.mutable_end()->set_node_id(abs(end)); snarl.mutable_end()->set_backward(frag[toks[0].size() + 1] == '<'); @@ -1044,9 +1075,49 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager } } + // pass 2) identify top-level snarls (those with no ancestors in VCF) + // and store reference info only for them + struct RefInfo { + string chrom; + size_t pos; + size_t ref_len; + }; + unordered_map top_level_ref_info; + + // Helper to check if a snarl is top-level (no ancestors in VCF) + auto is_top_level = [&](const string& name) -> bool { + auto it = name_to_snarl.find(name); + if (it == name_to_snarl.end()) return true; // not found, treat as top-level + const Snarl* snarl = it->second; + while ((snarl = snarl_manager->parent_of(snarl))) { + string cur_name = print_snarl(*snarl); + string flipped_name = print_flipped_snarl(*snarl); + if (names_in_vcf.count(cur_name) || names_in_vcf.count(flipped_name)) { + return false; // has ancestor in VCF + } + } + return true; // no ancestors in VCF + }; + + // Second pass through variants to extract ref info only for top-level snarls + for (auto& thread_buf : output_variants) { + for (auto& output_variant_record : thread_buf) { + string output_variant_string; + int ret = zstdutil::DecompressString(output_variant_record.second, output_variant_string); + assert(ret == 0); + vector toks = split_delims(output_variant_string, "\t", 5); + const string& name = toks[2]; + if (is_top_level(name)) { + top_level_ref_info[name] = {toks[0], static_cast(stoul(toks[1])), toks[3].length()}; + } + } + } + // determine the tags from the index - function(const string&)> get_lv_ps_tags = [&](const string& name) { + // Returns: (lv, parent_name, top_level_name) + function(const string&)> get_nesting_tags = [&](const string& name) { string parent_name; + string top_level_name = name; // default to self (for LV=0 case) size_t 
ancestor_count = 0; const Snarl* snarl = name_to_snarl.at(name); @@ -1057,19 +1128,28 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager // Since it is possible that the snarl is actually flipped in the vcf, check for the flipped version too string flipped_name = print_flipped_snarl(*snarl); - if (names_in_vcf.count(cur_name) || names_in_vcf.count(flipped_name)) { + if (names_in_vcf.count(cur_name)) { // only count snarls that are in the vcf ++ancestor_count; if (parent_name.empty()) { - // remembert the first parent + // remember the first parent parent_name = cur_name; } + // keep updating top_level to find the topmost ancestor in VCF + top_level_name = cur_name; + } else if (names_in_vcf.count(flipped_name)) { + // snarl is in vcf under flipped orientation + ++ancestor_count; + if (parent_name.empty()) { + parent_name = flipped_name; + } + top_level_name = flipped_name; } } - return make_pair(ancestor_count, parent_name); + return make_tuple(ancestor_count, parent_name, top_level_name); }; - - // pass 2) add the LV and PS tags + + // pass 3) add the LV, PS, RC, RS, RD tags #pragma omp parallel for for (uint64_t i = 0; i < output_variants.size(); ++i) { auto& thread_buf = output_variants[i]; @@ -1080,15 +1160,20 @@ void VCFOutputCaller::update_nesting_info_tags(const SnarlManager* snarl_manager //string& output_variant_string = output_variant_record.second; vector toks = split_delims(output_variant_string, "\t", 9); const string& name = toks[2]; - - pair lv_ps = get_lv_ps_tags(name); - string nesting_tags = ";LV=" + std::to_string(lv_ps.first); - if (lv_ps.first != 0) { - assert(!lv_ps.second.empty()); - nesting_tags += ";PS=" + lv_ps.second; + auto [lv, parent_name, top_level_name] = get_nesting_tags(name); + string nesting_tags = ";LV=" + std::to_string(lv); + if (lv != 0) { + assert(!parent_name.empty()); + nesting_tags += ";PS=" + parent_name; } + // Add RC, RS, RD tags (reference info from top-level snarl) + const RefInfo& top_ref = top_level_ref_info.at(top_level_name); + nesting_tags += ";RC=" + top_ref.chrom; + nesting_tags += ";RS=" + std::to_string(top_ref.pos); + nesting_tags += ";RD=" + std::to_string(top_ref.pos + top_ref.ref_len); + // rewrite the output string using the updated info toks output_variant_string.clear(); for (size_t i = 0; i < toks.size(); ++i) { @@ -1720,33 +1805,120 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, } +FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& allele_length_range, + bool nested, + double cluster_threshold, + bool cluster_post_genotype, + bool star_allele) : + GraphCaller(snarl_caller, snarl_manager), + VCFOutputCaller(sample_name), + GAFOutputCaller(aln_emitter, sample_name, ref_paths, trav_padding), + graph(graph), + traversal_finder(traversal_finder), + ref_paths(ref_paths), + traversals_only(traversals_only), + gaf_output(gaf_output), + genotype_snarls(genotype_snarls), + allele_length_range(allele_length_range), + nested(nested), + cluster_threshold(cluster_threshold), + cluster_post_genotype(cluster_post_genotype), + star_allele(star_allele) +{ + for (int i = 0; i < ref_paths.size(); ++i) { + 
ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? ref_path_offsets[i] : 0; + ref_path_set.insert(ref_paths[i]); + ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; + } +} + FlowCaller::~FlowCaller() { } bool FlowCaller::call_snarl(const Snarl& managed_snarl) { + // Entry point: call with no parent context + return call_snarl_internal(managed_snarl, "", make_pair(0, 0), nullptr); +} + +TraversalSet FlowCaller::find_child_traversal_set(const SnarlTraversal& parent_trav, + const Snarl& child) const { + TraversalSet result; + + // First, check if the parent traversal goes through this child snarl + // by finding the child's start and end nodes in the parent + nid_t child_start_id = child.start().node_id(); + nid_t child_end_id = child.end().node_id(); + bool found_start = false, found_end = false; + + for (int i = 0; i < parent_trav.visit_size(); ++i) { + nid_t visit_id = parent_trav.visit(i).node_id(); + if (visit_id == child_start_id) found_start = true; + if (visit_id == child_end_id) found_end = true; + } + + // If parent doesn't traverse the child, return empty set (star allele case) + if (!found_start || !found_end) { + return result; + } + + // Use the traversal finder to enumerate all traversals through the child + FlowTraversalFinder* flow_finder = dynamic_cast(&traversal_finder); + if (flow_finder != nullptr) { + auto weighted_travs = flow_finder->find_weighted_traversals(child, false); + result = std::move(weighted_travs.first); + } else { + result = traversal_finder.find_traversals(child); + } + + return result; +} + +bool FlowCaller::call_snarl_internal(const Snarl& managed_snarl, + const string& parent_ref_path_name, + pair parent_ref_interval, + const ChildTraversalSets* parent_child_trav_sets) { // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local // copy to work on to do things like flip -- calling any snarl_manager code that - // wants a pointer will crash. + // wants a pointer will crash. Snarl snarl = managed_snarl; +#ifdef debug + cerr << "call_snarl_internal on " << pb2json(snarl) << " with parent_ref_path=" << parent_ref_path_name + << " parent_child_trav_sets=" << (parent_child_trav_sets ? "provided" : "null") << endl; +#endif + if (snarl.start().node_id() == snarl.end().node_id() || !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { // can't call one-node or out-of graph snarls. return false; } + // toggle average flow / flow width based on snarl length. this is a bit inconsistent with // downstream which uses the longest traversal length, but it's a bit chicken and egg // todo: maybe use snarl length for everything? 
- const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); + const auto& support_finder = dynamic_cast(snarl_caller).get_support_finder(); bool greedy_avg_flow = false; { auto snarl_contents = snarl_manager.deep_contents(&snarl, graph, false); if (snarl_contents.second.size() > max_snarl_edges) { - // size cap needed as FlowCaller doesn't have nesting support yet + // size cap needed as non-nested FlowCaller doesn't handle large snarls return false; } size_t len_threshold = support_finder.get_average_traversal_support_switch_threshold(); @@ -1791,54 +1963,92 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { std::back_inserter(common_names)); if (common_names.empty()) { - return false; + // No reference path through snarl + // If we have parent context, we can still process using parent's ref path + if (parent_child_trav_sets == nullptr || parent_ref_path_name.empty()) { +#ifdef debug + cerr << " -> returning false: no common ref path and no parent context" << endl; +#endif + return false; + } +#ifdef debug + cerr << " -> using parent ref path: " << parent_ref_path_name << endl; +#endif } - string& ref_path_name = common_names.front(); + // Use parent's ref path if no direct path, otherwise prefer base reference over augref paths + string ref_path_name; + if (common_names.empty()) { + ref_path_name = parent_ref_path_name; + } else { + // Prefer base reference paths over derived augref paths + // common_names is sorted, so we iterate to find first non-augref path + ref_path_name = common_names.front(); // default to first (lexicographically smallest) + for (const string& name : common_names) { + if (!AugRefCover::is_augref_name(name)) { + ref_path_name = name; + break; + } + } + } // find the reference traversal and coordinates using the path position graph interface - tuple ref_interval = get_ref_interval(graph, snarl, ref_path_name); - if (get<0>(ref_interval) == -1) { - // could not find reference path interval consisten with snarl due to orientation conflict - return false; - } - if (get<2>(ref_interval) == true) { - // calling code assumes snarl forward on reference - flip_snarl(snarl); + tuple ref_interval; + bool use_parent_interval = false; + + if (common_names.empty() && parent_child_trav_sets != nullptr) { + // No direct reference path - use parent's interval and traversals directly + ref_interval = make_tuple(parent_ref_interval.first, parent_ref_interval.second, false, step_handle_t(), step_handle_t()); + use_parent_interval = true; + } else { ref_interval = get_ref_interval(graph, snarl, ref_path_name); + if (get<0>(ref_interval) == -1) { + // could not find reference path interval consistent with snarl due to orientation conflict + return false; + } + if (get<2>(ref_interval) == true) { + // calling code assumes snarl forward on reference + flip_snarl(snarl); + ref_interval = get_ref_interval(graph, snarl, ref_path_name); + } } - step_handle_t cur_step = get<3>(ref_interval); - step_handle_t last_step = get<4>(ref_interval); - if (get<2>(ref_interval)) { - std::swap(cur_step, last_step); - } - bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); - SnarlTraversal ref_trav; - while (true) { - handle_t cur_handle = graph.get_handle_of_step(cur_step); - Visit* visit = ref_trav.add_visit(); - visit->set_node_id(graph.get_id(cur_handle)); - visit->set_backward(start_backwards ? 
!graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle)); - if (graph.get_id(cur_handle) == snarl.end().node_id()) { - break; - } else if (get<2>(ref_interval) == true) { - if (!graph.has_previous_step(cur_step)) { - cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; - return false; - } - cur_step = graph.get_previous_step(cur_step); - } else { - if (!graph.has_next_step(cur_step)) { - cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; - return false; + + if (!use_parent_interval) { + // Build reference traversal from path steps + step_handle_t cur_step = get<3>(ref_interval); + step_handle_t last_step = get<4>(ref_interval); + if (get<2>(ref_interval)) { + std::swap(cur_step, last_step); + } + bool start_backwards = snarl.start().backward() != graph.get_is_reverse(graph.get_handle_of_step(cur_step)); + + while (true) { + handle_t cur_handle = graph.get_handle_of_step(cur_step); + Visit* visit = ref_trav.add_visit(); + visit->set_node_id(graph.get_id(cur_handle)); + visit->set_backward(start_backwards ? !graph.get_is_reverse(cur_handle) : graph.get_is_reverse(cur_handle)); + if (graph.get_id(cur_handle) == snarl.end().node_id()) { + break; + } else if (get<2>(ref_interval) == true) { + if (!graph.has_previous_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; + return false; + } + cur_step = graph.get_previous_step(cur_step); + } else { + if (!graph.has_next_step(cur_step)) { + cerr << "Warning [vg call]: Unable, due to bug or corrupt path information, to trace reference path through snarl " << pb2json(managed_snarl) << endl; + return false; + } + cur_step = graph.get_next_step(cur_step); } - cur_step = graph.get_next_step(cur_step); + // todo: we can compute flow at the same time } - // todo: we can compute flow at the same time + assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end()); } - assert(ref_trav.visit(0) == snarl.start() && ref_trav.visit(ref_trav.visit_size() - 1) == snarl.end()); + // If use_parent_interval, ref_trav stays empty - we'll use first parent traversal as pseudo-reference vector travs; FlowTraversalFinder* flow_trav_finder = dynamic_cast(&traversal_finder); @@ -1855,6 +2065,9 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { cerr << "Warning [vg call]: Unable, due to bug or corrupt graph, to search for any traversals through snarl " << pb2json(managed_snarl) << endl; return false; } +#ifdef debug + cerr << " found " << travs.size() << " traversals, use_parent_interval=" << use_parent_interval << endl; +#endif // optional traversal length clamp can, ex, avoid trying to resolve a giant snarl if (allele_length_range.first > 0 || allele_length_range.second < numeric_limits::max()) { @@ -1876,30 +2089,175 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // find the reference traversal in the list of results from the traversal finder int ref_trav_idx = -1; - for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { - // todo: is there a way to speed this up? 
- if (travs[i] == ref_trav) { - ref_trav_idx = i; + + if (use_parent_interval) { + // No direct reference path - use first traversal from first non-empty set as pseudo-reference + if (parent_child_trav_sets != nullptr) { + for (const auto& tset : *parent_child_trav_sets) { + if (!tset.empty()) { + const SnarlTraversal& first_trav = tset[0]; + for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { + if (travs[i] == first_trav) { + ref_trav_idx = i; + } + } + if (ref_trav_idx < 0 && first_trav.visit_size() > 0) { + ref_trav_idx = travs.size(); + travs.push_back(first_trav); + } + break; + } + } + } + // If still -1, use first traversal from finder (or 0 if none) + if (ref_trav_idx < 0) { + ref_trav_idx = travs.empty() ? -1 : 0; + } + } else { + for (int i = 0; i < travs.size() && ref_trav_idx < 0; ++i) { + // todo: is there a way to speed this up? + if (travs[i] == ref_trav) { + ref_trav_idx = i; + } } - } - if (ref_trav_idx == -1) { - ref_trav_idx = travs.size(); - // we didn't get the reference traversal from the finder, so we add it here - travs.push_back(ref_trav); + if (ref_trav_idx == -1) { + ref_trav_idx = travs.size(); + // we didn't get the reference traversal from the finder, so we add it here + travs.push_back(ref_trav); + } } bool ret_val = true; + vector trav_genotype; // Declared outside block so we can pass to children + int ploidy = ref_ploidies[ref_path_name]; + + // Constants for bounded traversal set handling + const int MAX_TRAVS_PER_SET = 10; if (traversals_only) { assert(gaf_output); pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); emit_gaf_traversals(graph, print_snarl(snarl), travs, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } else if (parent_child_trav_sets != nullptr && !parent_child_trav_sets->empty()) { + // Genotype using bounded search over traversal sets from parent + // Each set contains traversals consistent with one parent allele + ploidy = parent_child_trav_sets->size(); + + // Track which set each traversal index belongs to (for phase consistency) + // set_membership[i] = which parent allele set traversal i came from, or -1 if from finder + vector set_membership(travs.size(), -1); + + // Merge traversals from sets into travs, tracking membership + // Also rank by support and keep top MAX_TRAVS_PER_SET per set + vector> set_to_trav_indices(ploidy); // indices into travs for each set + + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + const TraversalSet& tset = (*parent_child_trav_sets)[set_idx]; + + if (tset.empty()) { + // Empty set means parent allele doesn't traverse this child (star allele) + continue; + } + + // Add traversals from this set to travs (avoiding duplicates) + // Keep track of indices for this set + vector> support_and_idx; // (support, index in travs) + + for (const SnarlTraversal& trav : tset) { + // Check if this traversal already exists in travs + int match_idx = -1; + for (int i = 0; i < travs.size() && match_idx < 0; ++i) { + if (travs[i] == trav) { + match_idx = i; + } + } + + if (match_idx < 0) { + // New traversal - add it + match_idx = travs.size(); + travs.push_back(trav); + set_membership.push_back(set_idx); + } else if (set_membership[match_idx] < 0) { + // Traversal was from finder, now claim it for this set + set_membership[match_idx] = set_idx; + } + // Note: if already claimed by another set, that's fine (shared region) + + // Get support for ranking + double support = TraversalSupportFinder::support_val( + 
support_finder.get_traversal_support(trav)); + support_and_idx.push_back({support, match_idx}); + } + + // Sort by support (descending) and keep top MAX_TRAVS_PER_SET + std::sort(support_and_idx.begin(), support_and_idx.end(), + [](const auto& a, const auto& b) { return a.first > b.first; }); + + for (int i = 0; i < std::min((int)support_and_idx.size(), MAX_TRAVS_PER_SET); ++i) { + set_to_trav_indices[set_idx].push_back(support_and_idx[i].second); + } + } + + // Track which parent sets are empty (star/missing allele cases) + vector empty_sets(ploidy, false); + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + if ((*parent_child_trav_sets)[set_idx].empty()) { + empty_sets[set_idx] = true; + } + } + + // Check if ALL sets are empty (no traversals at all) + bool all_empty = true; + for (int set_idx = 0; set_idx < ploidy; ++set_idx) { + if (!empty_sets[set_idx]) { + all_empty = false; + break; + } + } + + unique_ptr trav_call_info; + + if (all_empty) { + // All parent alleles skip this child - genotype is all star/missing + for (int i = 0; i < ploidy; ++i) { + trav_genotype.push_back(star_allele ? STAR_ALLELE_MARKER : MISSING_ALLELE_MARKER); + } + } else { + // Use the existing genotyper to select best genotype from available traversals + // The genotyper uses proper genotype likelihoods that account for expected allele ratios + std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, + make_pair(get<0>(ref_interval), get<1>(ref_interval))); + + // Post-process: replace genotyped alleles with star/missing for empty parent sets + // The genotyper returns one allele per ploidy position, and we need to check if + // the corresponding parent set was empty + for (int i = 0; i < ploidy && i < trav_genotype.size(); ++i) { + if (empty_sets[i]) { + // This parent allele doesn't traverse the child - use star/missing + trav_genotype[i] = star_allele ? 
STAR_ALLELE_MARKER : MISSING_ALLELE_MARKER; + } + } + } + + // Emit variant with selected genotype + bool added = true; + + // Only emit VCF if snarl is on reference path + if (use_parent_interval) { + added = true; + } else if (!gaf_output) { + added = emit_variant(graph, snarl_caller, snarl, travs, trav_genotype, ref_trav_idx, trav_call_info, ref_path_name, + ref_offsets[ref_path_name], genotype_snarls, ploidy); + } else { + pair pos_info = get_ref_position(graph, snarl, ref_path_name, ref_offsets[ref_path_name]); + emit_gaf_variant(graph, print_snarl(snarl), travs, trav_genotype, ref_trav_idx, pos_info.first, pos_info.second, &support_finder); + } + + ret_val = trav_genotype.size() == ploidy && added; } else { - // use our support caller to choose our genotype - vector trav_genotype; + // Top-level snarl or no parent context - genotype from scratch using support unique_ptr trav_call_info; - int ploidy = ref_ploidies[ref_path_name]; std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); @@ -1916,7 +2274,48 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { ret_val = trav_genotype.size() == ploidy && added; } - + + // In nested mode, recursively call child snarls + if (nested && !trav_genotype.empty()) { + // Find the managed snarl pointer so we can get its children + const Snarl* managed_ptr = snarl_manager.into_which_snarl(snarl.start().node_id(), snarl.start().backward()); + if (managed_ptr) { + const vector& children = snarl_manager.children_of(managed_ptr); + for (const Snarl* child : children) { + if (child && !snarl_manager.is_trivial(child, graph)) { + // Build ChildTraversalSets: one set per parent allele + // Each set contains all traversals through child consistent with that parent allele + ChildTraversalSets child_trav_sets; + bool any_real_traversals = false; + + for (int allele_idx : trav_genotype) { + if (allele_idx >= 0 && allele_idx < travs.size()) { + // Find all traversals through child consistent with this parent traversal + TraversalSet tset = find_child_traversal_set(travs[allele_idx], *child); + if (!tset.empty()) { + any_real_traversals = true; + } + child_trav_sets.push_back(std::move(tset)); + } else { + // Star/missing allele - pass empty set + child_trav_sets.push_back(TraversalSet()); + } + } + + // If no genotyped alleles traverse the child, skip it + if (!any_real_traversals) { + continue; + } + + // Recursively call child with traversal sets + call_snarl_internal(*child, ref_path_name, + make_pair(get<0>(ref_interval), get<1>(ref_interval)), + &child_trav_sets); + } + } + } + } + return ret_val; } @@ -1933,7 +2332,6 @@ string FlowCaller::vcf_header(const PathHandleGraph& graph, const vector return header; } - NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, SupportBasedSnarlCaller& snarl_caller, SnarlManager& snarl_manager, @@ -1957,7 +2355,7 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, gaf_output(gaf_output), genotype_snarls(genotype_snarls), nested_support_finder(dynamic_cast(snarl_caller.get_support_finder())){ - + for (int i = 0; i < ref_paths.size(); ++i) { ref_offsets[ref_paths[i]] = i < ref_path_offsets.size() ? 
ref_path_offsets[i] : 0; ref_path_set.insert(ref_paths[i]); @@ -1991,7 +2389,7 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local // copy to work on to do things like flip -- calling any snarl_manager code that - // wants a pointer will crash. + // wants a pointer will crash. Snarl snarl = managed_snarl; // hook into our table entry @@ -2041,7 +2439,14 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ pair gt_ref_interval; if (!common_names.empty()) { - ref_path_name = common_names.front(); + // Prefer base reference paths over derived augref paths + ref_path_name = common_names.front(); // default to first (lexicographically smallest) + for (const string& name : common_names) { + if (!AugRefCover::is_augref_name(name)) { + ref_path_name = name; + break; + } + } // find the reference traversal and coordinates using the path position graph interface ref_interval = get_ref_interval(graph, snarl, ref_path_name); @@ -2103,7 +2508,7 @@ bool NestedFlowCaller::call_snarl_recursive(const Snarl& managed_snarl, int max_ } // recurse on the children - // todo: do we need to make this iterative for deep snarl trees? + // todo: do we need to make this iterative for deep snarl trees? const vector& children = snarl_manager.children_of(&managed_snarl); for (const Snarl* child : children) { diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 015f3d4557..6bd84b9939 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -15,6 +15,7 @@ #include "region.hpp" #include "zstdutil.hpp" #include "vg/io/alignment_emitter.hpp" +#include "augref.hpp" namespace vg { @@ -22,6 +23,26 @@ using namespace std; using vg::io::AlignmentEmitter; +/// Special marker value for star alleles in genotype vectors. +/// A star allele (*) represents a haplotype that spans a nested site in the +/// parent but doesn't have a defined traversal at the child level. +constexpr int STAR_ALLELE_MARKER = -2; + +/// Special marker value for missing alleles in genotype vectors. +/// Used when a parent allele doesn't traverse a child snarl and star_allele +/// mode is disabled. Outputs as '.' in VCF to maintain consistent ploidy. +constexpr int MISSING_ALLELE_MARKER = -1; + +/// A set of traversals through a child snarl that are consistent with +/// a single parent allele. Multiple traversals can exist if the child +/// has internal variation within a shared region. +using TraversalSet = vector; + +/// One TraversalSet per parent allele (index matches parent genotype). +/// For a diploid parent with genotype [0,1], element 0 contains traversals +/// consistent with parent allele 0, element 1 with parent allele 1. +using ChildTraversalSets = vector; + /** * GraphCaller: Use the snarl decomposition to call snarls in a graph */ @@ -365,18 +386,18 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { }; - /** - * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find - * traversals, and calls those based on how much support they have. + * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. * Should work on any graph but will not - * report cyclic traversals. Does not (yet, anyway) support nested - * calling, so the entire site is processes in one shot. 
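Before the updated class comment, a compact restatement of the traversal-set contract that the marker constants and typedefs above establish. This is a sketch with stand-in types only (vg's real `SnarlTraversal` is a protobuf message): one `TraversalSet` per parent allele, where an empty set means that parent haplotype skips the child site and is reported as `*` or `.` depending on the star-allele setting.

```cpp
#include <vector>

struct SnarlTraversal {};                              // stand-in for vg's protobuf type
using TraversalSet = std::vector<SnarlTraversal>;      // traversals consistent with one parent allele
using ChildTraversalSets = std::vector<TraversalSet>;  // index i matches parent genotype position i

constexpr int STAR_ALLELE_MARKER = -2;                 // rendered as '*' in VCF
constexpr int MISSING_ALLELE_MARKER = -1;              // rendered as '.' in VCF

// Which marker an empty parent-allele set produces at the child site.
int allele_for_empty_set(bool star_allele) {
    return star_allele ? STAR_ALLELE_MARKER : MISSING_ALLELE_MARKER;
}
```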
+ * report cyclic traversals. Supports nested calling when enabled with the + * nested flag, recursively processing child snarls. * Designed to replace LegacyCaller, as it should miss fewer obviously - * good traversals, and is not dependent on old protobuf-based structures. + * good traversals, and is not dependent on old protobuf-based structures. */ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { public: + /// Original constructor for non-nested mode FlowCaller(const PathPositionHandleGraph& graph, SupportBasedSnarlCaller& snarl_caller, SnarlManager& snarl_manager, @@ -391,7 +412,27 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC size_t trav_padding, bool genotype_snarls, const pair& allele_length_range); - + + /// Extended constructor for nested mode with clustering and star alleles + FlowCaller(const PathPositionHandleGraph& graph, + SupportBasedSnarlCaller& snarl_caller, + SnarlManager& snarl_manager, + const string& sample_name, + TraversalFinder& traversal_finder, + const vector& ref_paths, + const vector& ref_path_offsets, + const vector& ref_path_ploidies, + AlignmentEmitter* aln_emitter, + bool traversals_only, + bool gaf_output, + size_t trav_padding, + bool genotype_snarls, + const pair& allele_length_range, + bool nested, + double cluster_threshold, + bool cluster_post_genotype, + bool star_allele); + virtual ~FlowCaller(); virtual bool call_snarl(const Snarl& snarl); @@ -440,18 +481,56 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC /// 1) its largest allele is >= allele_length_range.first and /// 2) all alleles are < allele_length_range.second pair allele_length_range; + + /// --- Nested mode members --- + + /// enable recursive calling of child snarls + bool nested = false; + + /// cluster traversals with Jaccard similarity >= this threshold + double cluster_threshold = 1.0; + + /// if true, cluster after genotyping (for output grouping); if false, cluster before genotyping + bool cluster_post_genotype = false; + + /// use * alleles for spanning haplotypes that don't traverse nested sites + bool star_allele = false; + + /// Internal implementation of call_snarl that accepts parent context for nested mode + /// When nested=true, this recursively calls children after processing the current snarl + /// @param parent_ref_path_name Reference path from parent (for off-reference snarls) + /// @param parent_ref_interval Reference interval from parent + /// @param parent_child_trav_sets If non-null, contains one TraversalSet per parent allele. + /// Each set contains all traversals through this child that are + /// consistent with that parent allele. The child genotypes by + /// picking the best pair (one from each set) based on read support. + bool call_snarl_internal(const Snarl& snarl, + const string& parent_ref_path_name, + pair parent_ref_interval, + const ChildTraversalSets* parent_child_trav_sets = nullptr); + + /// Find all traversals through a child snarl that are consistent with a parent traversal. + /// "Consistent" means the child's entry/exit points match what's in the parent traversal. + /// Uses the traversal finder to enumerate all valid paths through the child. 
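A hedged illustration of the consistency notion described above, as a stand-alone sketch over bare node IDs: a parent traversal can only constrain a child snarl if it actually passes through the child's boundary nodes. The real `find_child_traversal_set` works on `SnarlTraversal` objects and must also respect orientation, which this simplification ignores.

```cpp
#include <vector>

// True when the parent's visit sequence crosses the child's start boundary and
// later reaches its end boundary (orientation and cycles deliberately ignored).
bool parent_enters_child(const std::vector<long>& parent_node_ids,
                         long child_start_id, long child_end_id) {
    bool inside = false;
    for (long nid : parent_node_ids) {
        if (nid == child_start_id) {
            inside = true;                  // crossed the child's start boundary
        } else if (nid == child_end_id && inside) {
            return true;                    // reached the end boundary after the start
        }
    }
    return false;
}
```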
+ /// @param parent_trav The parent traversal defining entry/exit constraints + /// @param child The child snarl to find traversals through + /// @return Set of traversals through child, empty if parent doesn't traverse child + TraversalSet find_child_traversal_set(const SnarlTraversal& parent_trav, + const Snarl& child) const; + + /// Extract the portion of a parent traversal that spans a child snarl (single traversal). + /// This is a simpler version used when we only need one traversal from the parent. }; class SnarlGraph; /** - * FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find - * traversals, and calls those based on how much support they have. - * Should work on any graph but will not - * report cyclic traversals. + * NestedFlowCaller : DEPRECATED - Use FlowCaller with nested=true instead. * - * todo: this is a generalization of FlowCaller and should be able to replace it entirely after testing - * to get rid of duplicated code. + * Uses any traversals finder (ex, FlowTraversalFinder) to find + * traversals, and calls those based on how much support they have. + * Should work on any graph but will not report cyclic traversals. + * This class is being replaced by FlowCaller's nested mode. */ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller { public: @@ -468,7 +547,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO bool gaf_output, size_t trav_padding, bool genotype_snarls); - + virtual ~NestedFlowCaller(); virtual bool call_snarl(const Snarl& snarl); @@ -487,7 +566,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO int ref_trav_idx; // index of ref paths in CallRecord::travs }; typedef map CallTable; - + /// update the table of calls for each child snarl (and the input snarl) bool call_snarl_recursive(const Snarl& managed_snarl, int ploidy, const string& parent_ref_path_name, pair parent_ref_path_interval, @@ -502,7 +581,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO /// a proper string by recursively resolving the nested snarls into alleles string flatten_reference_allele(const string& nested_allele, const CallTable& call_table) const; string flatten_alt_allele(const string& nested_allele, int allele, int ploidy, const CallTable& call_table) const; - + /// the graph const PathPositionHandleGraph& graph; @@ -515,11 +594,11 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO /// keep track of offsets in the reference paths map ref_offsets; - + /// keep traco of the ploidies (todo: just one map for all path stuff!!) map ref_ploidies; - /// until we support nested snarls, cap snarl size we attempt to process + /// until we support nested snarls, cap snarl size we attempt to process size_t max_snarl_shallow_size = 50000; /// alignment emitter. 
if not null, traversals will be output here and

diff --git a/src/snarl_caller.cpp b/src/snarl_caller.cpp
index a6eebcbe93..9b4256ed4b 100644
--- a/src/snarl_caller.cpp
+++ b/src/snarl_caller.cpp
@@ -485,6 +485,17 @@ void PoissonSupportSnarlCaller::set_insertion_bias(double insertion_threshold, d
     }
 }
 
+int PoissonSupportSnarlCaller::get_ref_trav_size(const vector<SnarlTraversal>& traversals, int ref_trav_idx) const {
+    int ref_trav_size = 0;
+    if (ref_trav_idx >= 0 && ref_trav_idx < (int)traversals.size()) {
+        const SnarlTraversal& ref_trav = traversals[ref_trav_idx];
+        for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) {
+            ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id()));
+        }
+    }
+    return ref_trav_size;
+}
+
 pair<vector<int>, unique_ptr<SnarlCaller::CallInfo>> PoissonSupportSnarlCaller::genotype(const Snarl& snarl,
                                                                                          const vector<SnarlTraversal>& traversals,
                                                                                          int ref_trav_idx,
@@ -514,13 +525,7 @@ pair<vector<int>, unique_ptr<SnarlCaller::CallInfo>> PoissonSupportSnarlCaller::
     int max_trav_size = -1;
     vector<Support> supports = support_finder.get_traversal_set_support(traversals, {}, {}, {}, false, {}, {}, ref_trav_idx, &max_trav_size);
 
-    int ref_trav_size = 0;
-    if (ref_trav_idx >= 0) {
-        const SnarlTraversal& ref_trav = traversals[ref_trav_idx];
-        for (int64_t i = 1; i < (int64_t)ref_trav.visit_size() - 1; ++i) {
-            ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id()));
-        }
-    }
+    int ref_trav_size = get_ref_trav_size(traversals, ref_trav_idx);
 
     // sort the traversals by support
     vector<int> ranked_traversals = rank_by_support(supports);
@@ -596,11 +601,12 @@ pair<vector<int>, unique_ptr<SnarlCaller::CallInfo>> PoissonSupportSnarlCaller::
     // variance/std-err can be nan when binsize < 2. We just clamp it to 0
     double depth_err = !isnan(depth_info.second) ? depth_info.second : 0.;
 
-    // genotype (log) likelihoods
+    // Single pass: find best genotype AND compute GQ/posterior values
+    vector<int> best_genotype;
     double best_genotype_likelihood = -numeric_limits<double>::max();
     double second_best_genotype_likelihood = -numeric_limits<double>::max();
     double total_likelihood = 0;
-    vector<int> best_genotype;
+
     for (const auto& candidate : candidates) {
         double gl = genotype_likelihood(candidate, traversals, top_traversals, traversal_sizes, traversal_mapqs,
                                         ref_trav_idx, exp_depth, depth_err, max_trav_size, ref_trav_size);
@@ -609,22 +615,22 @@ pair<vector<int>, unique_ptr<SnarlCaller::CallInfo>> PoissonSupportSnarlCaller::
             best_genotype_likelihood = gl;
             best_genotype = candidate;
         } else if (gl > second_best_genotype_likelihood) {
-            assert(gl <= best_genotype_likelihood);
             second_best_genotype_likelihood = gl;
         }
         total_likelihood = total_likelihood == 0 ?
gl : add_log(total_likelihood, gl); } + // Compute CallInfo directly from tracked values PoissonCallInfo* call_info = new PoissonCallInfo(); call_info->posterior = 0; if (!candidates.empty()) { // compute the posterior from our likelihoods using a uniform prior - call_info->posterior = best_genotype_likelihood - log(candidates.size()) - total_likelihood; + call_info->posterior = best_genotype_likelihood - log(candidates.size()) - total_likelihood; } - - // GQ computed as here https://gatk.broadinstitute.org/hc/en-us/articles/360035890451?id=11075 - // as difference between best and second best likelihoods + + // GQ computed as difference between best and second best likelihoods + // https://gatk.broadinstitute.org/hc/en-us/articles/360035890451?id=11075 call_info->gq = 0; if (!isnan(best_genotype_likelihood) && !isnan(second_best_genotype_likelihood)) { call_info->gq = logprob_to_phred(second_best_genotype_likelihood) - logprob_to_phred(best_genotype_likelihood); @@ -633,7 +639,6 @@ pair, unique_ptr> PoissonSupportSnarlCaller:: call_info->expected_depth = exp_depth; call_info->depth_err = depth_err; call_info->max_trav_size = max_trav_size; - #ifdef debug cerr << " best genotype: "; for (auto a : best_genotype) {cerr << a <<",";} cerr << " gl=" << best_genotype_likelihood << endl; @@ -649,7 +654,7 @@ double PoissonSupportSnarlCaller::genotype_likelihood(const vector& genotyp int ref_trav_idx, double exp_depth, double depth_err, int max_trav_size, int ref_trav_size) { - + assert(genotype.size() == 1 || genotype.size() == 2); // get the genotype support @@ -786,10 +791,17 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, traversal_mapqs = support_finder.get_traversal_mapqs(traversals); } - // get the maximum size from the info - const SnarlCaller::CallInfo* s_call_info = call_info.get(); + // get the maximum size from the info (or compute from traversals if call_info is null) + int max_trav_size = 0; const PoissonCallInfo* p_call_info = dynamic_cast(call_info.get()); - int max_trav_size = p_call_info->max_trav_size; + if (p_call_info) { + max_trav_size = p_call_info->max_trav_size; + } else { + // No call info (e.g., derived genotype from parent) - compute from traversals + for (int size : traversal_sizes) { + max_trav_size = std::max(max_trav_size, size); + } + } int ref_trav_idx = 0; int ref_trav_size = 0; @@ -798,8 +810,17 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, ref_trav_size += graph.get_length(graph.get_handle(ref_trav.visit(i).node_id())); } + // Filter out special marker values (negative indices like STAR_ALLELE_MARKER, MISSING_ALLELE_MARKER) + // from genotype before computing support + vector filtered_genotype; + for (int g : genotype) { + if (g >= 0 && g < (int)traversals.size()) { + filtered_genotype.push_back(g); + } + } + // get the genotype support - vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, genotype, {}, 0, &max_trav_size); + vector genotype_supports = support_finder.get_traversal_genotype_support(traversals, filtered_genotype, {}, 0, &max_trav_size); // Get the depth of the site Support total_site_support = std::accumulate(genotype_supports.begin(), genotype_supports.end(), Support()); @@ -829,30 +850,51 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl, } } - // get the genotype likelihoods + // get the genotype likelihoods (only if we have call_info with expected depth) vector gen_likelihoods; - double gen_likelihood; - variant.format.push_back("GL"); - - 
assert(!isnan(p_call_info->expected_depth));
-    // variance/std-err can be nan when binsize < 2. We just clamp it to 0
-    double depth_err = !isnan(p_call_info->depth_err) ? p_call_info->depth_err : 0.;
-
+    double gen_likelihood = 0;
     double total_likelihood = 0.;
     double ref_likelihood = 1.;
     double alt_likelihood = 0.;
-    if (genotype.size() == 2) {
-        // assume ploidy 2
-        for (int i = 0; i < traversals.size(); ++i) {
-            for (int j = i; j < traversals.size(); ++j) {
-                double gl = genotype_likelihood({i, j}, traversals, {}, traversal_sizes, traversal_mapqs,
+    if (p_call_info) {
+        variant.format.push_back("GL");
+
+        assert(!isnan(p_call_info->expected_depth));
+        // variance/std-err can be nan when binsize < 2. We just clamp it to 0
+        double depth_err = !isnan(p_call_info->depth_err) ? p_call_info->depth_err : 0.;
+
+        if (genotype.size() == 2) {
+            // assume ploidy 2
+            for (int i = 0; i < traversals.size(); ++i) {
+                for (int j = i; j < traversals.size(); ++j) {
+                    double gl = genotype_likelihood({i, j}, traversals, {}, traversal_sizes, traversal_mapqs,
+                                                    ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size);
+                    gen_likelihoods.push_back(gl);
+                    if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
+                        gen_likelihood = gl;
+                    }
+                    if (i == 0 && j == 0) {
+                        ref_likelihood = gl;
+                    } else {
+                        alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl);
+                    }
+                    total_likelihood = total_likelihood == 0 ? gl : add_log(total_likelihood, gl);
+                    // convert from natural log to log10 by dividing by ln(10)
+                    variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258));
+                }
+            }
+        } else if (genotype.size() == 1) {
+            // assume ploidy 1
+            // todo: generalize this iteration (as is, it is copy pasted from above)
+            for (int i = 0; i < traversals.size(); ++i) {
+                double gl = genotype_likelihood({i}, traversals, {}, traversal_sizes, traversal_mapqs,
                                 ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size);
                 gen_likelihoods.push_back(gl);
-                if (vector<int>({i, j}) == genotype || vector<int>({j,i}) == genotype) {
+                if (vector<int>({i}) == genotype) {
                     gen_likelihood = gl;
                 }
-                if (i == 0 && j == 0) {
+                if (i == 0) {
                     ref_likelihood = gl;
                 } else {
                     alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl);
@@ -862,41 +904,22 @@ void PoissonSupportSnarlCaller::update_vcf_info(const Snarl& snarl,
                 variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258));
             }
         }
-    } else if (genotype.size() == 1) {
-        // assume ploidy 1
-        // todo: generalize this iteration (as is, it is copy pased from above)
-        for (int i = 0; i < traversals.size(); ++i) {
-            double gl = genotype_likelihood({i}, traversals, {}, traversal_sizes, traversal_mapqs,
-                                            ref_trav_idx, p_call_info->expected_depth, depth_err, max_trav_size, ref_trav_size);
-            gen_likelihoods.push_back(gl);
-            if (vector<int>({i}) == genotype) {
-                gen_likelihood = gl;
-            }
-            if (i == 0) {
-                ref_likelihood = gl;
-            } else {
-                alt_likelihood = alt_likelihood == 0. ? gl : add_log(alt_likelihood, gl);
-            }
-            total_likelihood = total_likelihood == 0 ?
gl : add_log(total_likelihood, gl); - // convert from natural log to log10 by dividing by ln(10) - variant.samples[sample_name]["GL"].push_back(std::to_string(gl / 2.30258)); - } - } - - variant.format.push_back("GQ"); - variant.samples[sample_name]["GQ"].push_back(std::to_string(min((int)256, max((int)0, (int)p_call_info->gq)))); - variant.format.push_back("GP"); - variant.samples[sample_name]["GP"].push_back(std::to_string(p_call_info->posterior)); + variant.format.push_back("GQ"); + variant.samples[sample_name]["GQ"].push_back(std::to_string(min((int)256, max((int)0, (int)p_call_info->gq)))); - variant.format.push_back("XD"); - variant.samples[sample_name]["XD"].push_back(std::to_string(p_call_info->expected_depth)); + variant.format.push_back("GP"); + variant.samples[sample_name]["GP"].push_back(std::to_string(p_call_info->posterior)); + + variant.format.push_back("XD"); + variant.samples[sample_name]["XD"].push_back(std::to_string(p_call_info->expected_depth)); + } // The QUAL field is the probability that we have variation as a PHRED score (of wrongness) // We derive this from the posterior probability of the reference genotype. // But if it's a reference call, we take the total of all the alts variant.quality = 0; - if (genotype.size() > 0) { + if (p_call_info && genotype.size() > 0) { // our flat prior and p[traversal coverage] double posterior = -log(gen_likelihoods.size()) - total_likelihood; if (!all_of(genotype.begin(), genotype.end(), [&](int a) {return a == 0;})) { diff --git a/src/snarl_caller.hpp b/src/snarl_caller.hpp index 5ac2cd506e..ec730daf4d 100644 --- a/src/snarl_caller.hpp +++ b/src/snarl_caller.hpp @@ -221,7 +221,7 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { int ploidy, const string& ref_path_name, pair ref_range); - + /// Update INFO and FORMAT fields of the called variant virtual void update_vcf_info(const Snarl& snarl, const vector& traversals, @@ -235,6 +235,9 @@ class PoissonSupportSnarlCaller : public SupportBasedSnarlCaller { protected: + /// Calculate the size of the reference traversal (excluding boundary nodes) + int get_ref_trav_size(const vector& traversals, int ref_trav_idx) const; + /// Compute likelihood of genotype as product of poisson probabilities /// P[allele1] * P[allle2] * P[uncalled alleles] /// Homozygous alleles are split into two, with half support each diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index b2034c2624..8cac50e177 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -17,6 +17,8 @@ #include "../xg.hpp" #include "../gbzgraph.hpp" #include "../gbwtgraph_helper.hpp" +#include "../augref.hpp" +#include "../traversal_clusters.hpp" #include #include #include @@ -51,8 +53,7 @@ void help_call(char** argv) { << " to construct input graph with -a)" << endl << " -a, --genotype-snarls genotype every snarl, including reference calls" << endl << " (use to compare multiple samples)" << endl - << " -A, --all-snarls genotype all snarls, including nested child snarls" << endl - << " (like deconstruct -a)" << endl + << " -A, --all-snarls call all snarls including nested (each independent)" << endl << " -c, --min-length N genotype only snarls with" << endl << " at least one traversal of length >= N" << endl << " -C, --max-length N genotype only snarls where" << endl @@ -70,8 +71,9 @@ void help_call(char** argv) { << " -O, --gbz-translation use the ID translation from the input GBZ to" << endl << " apply snarl names to snarl names/AT fields in output" << endl << " -p, 
--ref-path NAME reference path to call on (may repeat; default all)" << endl + << " -P, --path-prefix NAME call on all paths with this prefix (may repeat)" << endl << " -S, --ref-sample NAME call on all paths with this sample" << endl - << " (cannot use with -p)" << endl + << " (cannot use with -p or -P)" << endl << " -o, --ref-offset N offset in reference path (may repeat; 1 per path)" << endl << " -l, --ref-length N override reference length for output VCF contig" << endl << " -d, --ploidy N ploidy of sample. {1, 2} [2]" << endl @@ -80,8 +82,15 @@ void help_call(char** argv) { << " not visited by the selected samples, or to all" << endl << " contigs simulated from if no samples are used." << endl << " Unmatched contigs get ploidy 2 (or that from -d)." << endl - << " -n, --nested activate nested calling mode (experimental)" << endl + << " --top-down top-down nested calling with genotype propagation" << endl + << " from parent to child snarls (writes LV/PS tags)" << endl + << " --bottom-up bottom-up nested calling with snarl merging" << endl << " -I, --chains call chains instead of snarls (experimental)" << endl + << " -L, --cluster F cluster similar traversals with Jaccard >= F [1.0]" << endl + << " --cluster-post cluster after genotyping (for output grouping only)" << endl + << " default is to cluster before genotyping" << endl + << " -Y, --star-allele use * alleles for spanning haplotypes" << endl + << " (requires --top-down)" << endl << " --progress show progress" << endl << " -t, --threads N number of threads to use" << endl << " -h, --help print this help message to stderr and exit" << endl; @@ -101,6 +110,7 @@ int main_call(int argc, char** argv) { string ref_fasta_filename; string ins_fasta_filename; vector ref_paths; + vector ref_path_prefixes; string ref_sample; vector ref_path_offsets; vector ref_path_lengths; @@ -121,13 +131,19 @@ int main_call(int argc, char** argv) { bool gaf_output = false; size_t trav_padding = 0; bool genotype_snarls = false; - bool nested = false; + bool top_down = false; + bool bottom_up = false; bool call_chains = false; bool all_snarls = false; size_t min_allele_len = 0; size_t max_allele_len = numeric_limits::max(); bool show_progress = false; + // Nested calling options (for use with -A) + double cluster_threshold = 1.0; + bool cluster_post_genotype = false; // false = cluster before, true = cluster after + bool star_allele = false; + // constants const size_t avg_trav_threshold = 50; const size_t avg_node_threshold = 50; @@ -139,6 +155,10 @@ int main_call(int argc, char** argv) { const size_t max_chain_edges = 1000; const size_t max_chain_trivial_travs = 5; constexpr int OPT_PROGRESS = 1000; + constexpr int OPT_CLUSTER_POST = 1002; + constexpr int OPT_LEGACY = 1004; + constexpr int OPT_BOTTOM_UP = 1005; + constexpr int OPT_TOP_DOWN = 1006; int c; optind = 2; // force optind past command positional argument while (true) { @@ -163,6 +183,7 @@ int main_call(int argc, char** argv) { {"translation", required_argument, 0, 'N'}, {"gbz-translation", no_argument, 0, 'O'}, {"ref-path", required_argument, 0, 'p'}, + {"path-prefix", required_argument, 0, 'P'}, {"ref-sample", required_argument, 0, 'S'}, {"ref-offset", required_argument, 0, 'o'}, {"ref-length", required_argument, 0, 'l'}, @@ -171,9 +192,13 @@ int main_call(int argc, char** argv) { {"gaf", no_argument, 0, 'G'}, {"traversals", no_argument, 0, 'T'}, {"trav-padding", required_argument, 0, 'M'}, - {"legacy", no_argument, 0, 'L'}, - {"nested", no_argument, 0, 'n'}, - {"chains", no_argument, 0, 'I'}, + 
{"legacy", no_argument, 0, OPT_LEGACY}, + {"top-down", no_argument, 0, OPT_TOP_DOWN}, + {"bottom-up", no_argument, 0, OPT_BOTTOM_UP}, + {"chains", no_argument, 0, 'I'}, + {"cluster", required_argument, 0, 'L'}, + {"cluster-post", no_argument, 0, OPT_CLUSTER_POST}, + {"star-allele", no_argument, 0, 'Y'}, {"threads", required_argument, 0, 't'}, {"progress", no_argument, 0, OPT_PROGRESS }, {"help", no_argument, 0, 'h'}, @@ -182,7 +207,7 @@ int main_call(int argc, char** argv) { int option_index = 0; - c = getopt_long (argc, argv, "k:Be:b:m:v:aAc:C:f:i:s:r:g:zN:Op:S:o:l:d:R:GTLM:nIt:h?", + c = getopt_long (argc, argv, "k:Be:b:m:v:aAc:C:f:i:s:r:g:zN:Op:P:S:o:l:d:R:GTM:IL:Yt:h?", long_options, &option_index); // Detect the end of the options. @@ -248,6 +273,9 @@ int main_call(int argc, char** argv) { case 'p': ref_paths.push_back(optarg); break; + case 'P': + ref_path_prefixes.push_back(optarg); + break; case 'S': ref_sample = optarg; break; @@ -291,13 +319,25 @@ int main_call(int argc, char** argv) { trav_padding = parse(optarg); break; case 'L': - legacy = true; + cluster_threshold = parse(optarg); break; - case 'n': - nested =true; + case OPT_TOP_DOWN: + top_down = true; + break; + case OPT_BOTTOM_UP: + bottom_up = true; break; case 'I': - call_chains =true; + call_chains = true; + break; + case OPT_CLUSTER_POST: + cluster_post_genotype = true; + break; + case 'Y': + star_allele = true; + break; + case OPT_LEGACY: + legacy = true; break; case OPT_PROGRESS: show_progress = true; @@ -371,12 +411,18 @@ int main_call(int argc, char** argv) { } if ((min_allele_len > 0 || max_allele_len < numeric_limits::max()) - && (legacy || !vcf_filename.empty() || nested)) { - logger.error() << "-c/-C no supported with -v, -l or -n" << endl; + && (legacy || !vcf_filename.empty() || bottom_up)) { + logger.error() << "-c/-C not supported with -v, -l, or --bottom-up" << endl; } if (!ref_paths.empty() && !ref_sample.empty()) { logger.error() << "-S cannot be used with -p" << endl; } + if (!ref_path_prefixes.empty() && !ref_sample.empty()) { + logger.error() << "-S cannot be used with -P" << endl; + } + if (!ref_path_prefixes.empty() && !ref_paths.empty()) { + logger.error() << "-P cannot be used with -p" << endl; + } // Read the graph unique_ptr path_handle_graph; @@ -487,6 +533,31 @@ int main_call(int argc, char** argv) { logger.error() << "GBWT (-g) cannot be used with GBZ graph (-z): choose one or the other" << endl; } + // Validation: -A, --top-down, and --bottom-up are mutually exclusive + int nested_mode_count = (all_snarls ? 1 : 0) + (top_down ? 1 : 0) + (bottom_up ? 
1 : 0); + if (nested_mode_count > 1) { + logger.error() << "-A, --top-down, and --bottom-up are mutually exclusive" << endl; + } + + // Validation for nested calling options + if (star_allele && !top_down) { + logger.error() << "-Y/--star-allele requires --top-down mode" << endl; + } + if (cluster_post_genotype && cluster_threshold >= 1.0) { + logger.error() << "--cluster-post requires -L/--cluster with threshold < 1.0" << endl; + } + if (cluster_threshold < 0.0 || cluster_threshold > 1.0) { + logger.error() << "-L/--cluster threshold must be in range [0.0, 1.0]" << endl; + } + + // Validation for bottom-up mode + if (bottom_up && star_allele) { + logger.error() << "-Y/--star-allele cannot be used with --bottom-up mode" << endl; + } + if (bottom_up && cluster_threshold < 1.0) { + logger.error() << "-L/--cluster cannot be used with --bottom-up mode" << endl; + } + // in order to add subpath support, we let all ref_paths be subpaths and then convert coordinates // on VCF export. the exception is writing the header where we need base paths. we keep // track of them the best we can here (just for writing the ##contigs) @@ -505,7 +576,27 @@ int main_call(int argc, char** argv) { return len; } }; - + + // Process path prefixes to find ref paths + if (!ref_path_prefixes.empty()) { + graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](const path_handle_t& path_handle) { + string path_name = graph->get_path_name(path_handle); + // Never include alt paths in reference paths + if (Paths::is_alt(path_name)) { + return; + } + for (auto& prefix : ref_path_prefixes) { + if (path_name.compare(0, prefix.size(), prefix) == 0) { + ref_paths.push_back(path_name); + break; + } + } + }); + if (ref_paths.empty()) { + logger.error() << "No paths found matching prefix(es)" << endl; + } + } + // No paths specified: use them all if (ref_paths.empty()) { set ref_sample_names; @@ -628,6 +719,10 @@ int main_call(int argc, char** argv) { std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; for (const string& refpath_name : ref_paths) { + // Skip altpaths (they shouldn't influence snarl decomposition) + if (AugRefCover::is_augref_name(refpath_name)) { + continue; + } path_handle_t refpath_handle = graph->get_path_handle(refpath_name); extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(refpath_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(refpath_handle)))] += EXTRA_WEIGHT; @@ -649,11 +744,11 @@ int main_call(int argc, char** argv) { if (show_progress) logger.info() << "Loading pack file " << pack_filename << endl; packer->load_from_file(pack_filename); if (show_progress) logger.info() << "Loaded pack file" << endl; - if (nested) { - // Make a nested packed traversal support finder (using cached veresion important for poisson caller) + if (bottom_up) { + // Make a nested packed traversal support finder (required by NestedFlowCaller) support_finder.reset(new NestedCachedPackedTraversalSupportFinder(*packer, *snarl_manager)); } else { - // Make a packed traversal support finder (using cached veresion important for poisson caller) + // Make a packed traversal support finder (using cached version important for poisson caller) support_finder.reset(new CachedPackedTraversalSupportFinder(*packer, *snarl_manager)); } @@ -791,7 +886,25 @@ int main_call(int argc, char** argv) { traversal_finder = unique_ptr(flow_traversal_finder); } - if (nested) { + if (top_down) { + // Use FlowCaller with 
nested mode enabled (top-down genotype propagation) + graph_caller.reset(new FlowCaller(*dynamic_cast(graph), + *dynamic_cast(snarl_caller.get()), + *snarl_manager, + sample_name, *traversal_finder, ref_paths, ref_path_offsets, + ref_path_ploidies, + alignment_emitter.get(), + traversals_only, + gaf_output, + trav_padding, + genotype_snarls, + make_pair(min_allele_len, max_allele_len), + true, // nested mode enabled + cluster_threshold, + cluster_post_genotype, + star_allele)); + } else if (bottom_up) { + // Use NestedFlowCaller (bottom-up snarl merging, original nested algorithm) graph_caller.reset(new NestedFlowCaller(*dynamic_cast(graph), *dynamic_cast(snarl_caller.get()), *snarl_manager, @@ -822,8 +935,8 @@ int main_call(int argc, char** argv) { // Init The VCF VCFOutputCaller* vcf_caller = dynamic_cast(graph_caller.get()); assert(vcf_caller != nullptr); - // Make sure we get the LV/PS tags with -A - vcf_caller->set_nested(all_snarls); + // Make sure we get the LV/PS tags with -A, --top-down, or --bottom-up + vcf_caller->set_nested(all_snarls || top_down || bottom_up); vcf_caller->set_translation(translation.get()); // Make sure the basepath information we inferred above goes directy to the VCF header // (and that it does *not* try to read it from the graph paths) @@ -842,18 +955,28 @@ int main_call(int argc, char** argv) { graph_caller->set_show_progress(show_progress); // Call the graph - if (!call_chains) { + // Determine recursion strategy based on mode: + // - top_down: FlowCaller handles recursion internally, so RecurseNever + // - all_snarls (-A): visit every snarl independently, so RecurseAlways + // - default: only recurse into children of failed snarls, so RecurseOnFail + GraphCaller::RecurseType recurse_type; + if (top_down) { + recurse_type = GraphCaller::RecurseNever; + } else if (all_snarls) { + recurse_type = GraphCaller::RecurseAlways; + } else { + recurse_type = GraphCaller::RecurseOnFail; + } + if (!call_chains) { // Call each snarl - // (todo: try chains in normal mode) if (show_progress) logger.info() << "Calling top-level snarls" << endl; - graph_caller->call_top_level_snarls(*graph, all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + graph_caller->call_top_level_snarls(*graph, recurse_type); } else { // Attempt to call chains instead of snarls so that the output traversals are longer // Todo: this could probably help in some cases when making VCFs too if (show_progress) logger.info() << "Calling top-level chains" << endl; - graph_caller->call_top_level_chains(*graph, max_chain_edges, max_chain_trivial_travs, - all_snarls ? GraphCaller::RecurseAlways : GraphCaller::RecurseOnFail); + graph_caller->call_top_level_chains(*graph, max_chain_edges, max_chain_trivial_travs, recurse_type); } if (show_progress) logger.info() << "Calling complete" << endl; diff --git a/src/subcommand/deconstruct_main.cpp b/src/subcommand/deconstruct_main.cpp index fef15ea05e..844dcbf417 100644 --- a/src/subcommand/deconstruct_main.cpp +++ b/src/subcommand/deconstruct_main.cpp @@ -14,6 +14,7 @@ #include "../vg.hpp" #include "../deconstructor.hpp" +#include "../augref.hpp" #include "../integrated_snarl_finder.hpp" #include "../gbwtgraph_helper.hpp" #include "../gbwt_helper.hpp" @@ -51,6 +52,7 @@ void help_deconstruct(char** argv) { << " snarl names to snarl names and AT fields in output" << endl << " -a, --all-snarls process all snarls, including nested snarls" << endl << " (by default only top-level snarls reported)." 
<< endl + << " Uses hierarchical processing and writes LV/PS tags." << endl << " -c, --context-jaccard N set context mapping size used to disambiguate alleles" << endl << " at sites with multiple reference traversals [10000]" << endl << " -u, --untangle-travs use context mapping for reference-relative positions" << endl @@ -62,10 +64,8 @@ void help_deconstruct(char** argv) { << " for reference if possible (i.e. only one ref sample)" << endl << " -L, --cluster F cluster traversals whose (handle) Jaccard coefficient" << endl << " is >= F together [1.0; experimental]" << endl - << " -n, --nested write a nested VCF, plus special tags [experimental]" << endl - << " -f, --nested-fasta F Write off-reference FASTA to F (and some indexing" << endl - << " information to F.nesting.tsv) [experimental]" << endl - << " -R, --star-allele use *-alleles to denote alleles that span" << endl + << " -R, --star-allele use *-alleles to represent haplotypes that span the" << endl + << " parent but don't traverse nested sites (requires -a)" << endl << " -t, --threads N use N threads" << endl << " -v, --verbose print some status messages" << endl << " -h, --help print this help message to stderr and exit" << endl; @@ -94,10 +94,8 @@ int main_deconstruct(int argc, char** argv) { bool untangle_traversals = false; bool contig_only_ref = false; double cluster_threshold = 1.0; - bool nested = false; - string nested_fasta_file_name; bool star_allele = false; - + int c; optind = 2; // force optind past command positional argument while (true) { @@ -120,8 +118,6 @@ int main_deconstruct(int argc, char** argv) { {"strict-conflicts", no_argument, 0, 'S'}, {"contig-only-ref", no_argument, 0, 'C'}, {"cluster", required_argument, 0, 'L'}, - {"nested", no_argument, 0, 'n'}, - {"nested-fasta", required_argument, 0, 'f'}, {"star-allele", no_argument, 0, 'R'}, {"threads", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, @@ -129,7 +125,7 @@ int main_deconstruct(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "h?p:P:H:r:g:T:OeKSCd:c:uaL:nf:Rt:v", + c = getopt_long (argc, argv, "h?p:P:H:r:g:T:OeKSCd:c:uaL:Rt:v", long_options, &option_index); // Detect the end of the options. @@ -186,12 +182,6 @@ int main_deconstruct(int argc, char** argv) { case 'L': cluster_threshold = max(0.0, min(1.0, parse(optarg))); break; - case 'n': - nested = true; - break; - case 'f': - nested_fasta_file_name = ensure_writable(logger, optarg); - break; case 'R': star_allele = true; break; @@ -212,13 +202,13 @@ int main_deconstruct(int argc, char** argv) { } - if (nested == true && contig_only_ref == true) { - logger.error() << "-C cannot be used with -n" << endl; + if (all_snarls == true && contig_only_ref == true) { + logger.error() << "-C cannot be used with -a" << endl; } - if (star_allele == true && nested == false) { - logger.error() << "-R can only be used with -n" << endl; + if (star_allele == true && all_snarls == false) { + logger.error() << "-R can only be used with -a" << endl; } - + // Read the graph unique_ptr path_handle_graph_up; unique_ptr gbz_graph; @@ -302,6 +292,7 @@ int main_deconstruct(int argc, char** argv) { bool found_hap = false; // No paths specified: use all reference and non-alt generic paths as reference to deconstruct against. 
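The prefix test used to select reference paths just below (and for `-P`/`--path-prefix` in call_main.cpp above) is plain `std::string::compare` over the leading characters of the path name, restated here as a self-contained helper:

```cpp
#include <string>

// True when `path_name` begins with `prefix`; compare(pos, len, str) over the
// first prefix.size() characters returns 0 on a match.
bool has_prefix(const std::string& path_name, const std::string& prefix) {
    return path_name.compare(0, prefix.size(), prefix) == 0;
}
// e.g. has_prefix("GRCh38#0#chr1", "GRCh38") == true
```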
+ // Altpaths are included as reference paths (priority given to non-altpaths in deconstructor.cpp) graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](path_handle_t path_handle) { const string& name = graph->get_path_name(path_handle); if (!Paths::is_alt(name)) { @@ -348,10 +339,11 @@ int main_deconstruct(int argc, char** argv) { } // process the prefixes to find ref paths + // Altpaths are included (priority given to non-altpaths in deconstructor.cpp) if (!refpath_prefixes.empty()) { graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](const path_handle_t& path_handle) { string path_name = graph->get_path_name(path_handle); - for (auto& prefix : refpath_prefixes) { + for (auto& prefix : refpath_prefixes) { if (path_name.compare(0, prefix.size(), prefix) == 0) { refpaths.push_back(path_name); break; @@ -372,6 +364,10 @@ int main_deconstruct(int argc, char** argv) { std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; for (const string& refpath_name : refpaths) { + // Skip altpaths (they shouldn't influence snarl decomposition) + if (AugRefCover::is_augref_name(refpath_name)) { + continue; + } path_handle_t refpath_handle = graph->get_path_handle(refpath_name); extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(refpath_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(refpath_handle)))] += EXTRA_WEIGHT; @@ -398,7 +394,7 @@ int main_deconstruct(int argc, char** argv) { logger.info() << "Deconstructing top-level snarls" << endl; } dd.set_translation(translation.get()); - dd.set_nested(all_snarls || nested); + dd.set_nested(all_snarls); dd.deconstruct(refpaths, graph, snarl_manager.get(), all_snarls, context_jaccard_window, @@ -408,12 +404,8 @@ int main_deconstruct(int argc, char** argv) { !contig_only_ref, cluster_threshold, gbwt_index, - nested, star_allele); - if (!nested_fasta_file_name.empty()) { - dd.save_off_ref_sequences(nested_fasta_file_name); - } return 0; } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 4c1a06cd19..f6645ccc34 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -10,6 +10,7 @@ #include #include +#include #include "subcommand.hpp" @@ -17,6 +18,8 @@ #include "../xg.hpp" #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" +#include "../augref.hpp" +#include "../integrated_snarl_finder.hpp" #include "../io/save_handle_graph.hpp" #include #include @@ -63,6 +66,16 @@ void help_paths(char** argv) { << " -G, --generic-paths select generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select reference paths" << endl << " -H, --haplotype-paths select haplotype paths" << endl + << "augmented reference computation:" << endl + << " -u, --compute-augref compute augmented reference path cover" << endl + << " (use -Q to select reference paths)" << endl + << " -l, --min-augref-len N minimum augref fragment length [50]" << endl + << " -N, --augref-sample STR create augref paths under a new sample" << endl + << " (copies base paths to new sample," << endl + << " then adds augref paths)." << endl + << " if unspecified, paths get added to target sample." 
<< endl + << " --augref-segs FILE write augref segment table to FILE" << endl + << " --verbose print augref progress and coverage summary" << endl << "configuration:" << endl << " -o, --overlay apply a ReferencePathOverlayHelper to the graph" << endl << " -t, --threads N number of threads to use [all available]" << endl @@ -133,6 +146,14 @@ int main_paths(int argc, char** argv) { const size_t coverage_bins = 10; bool normalize_paths = false; bool overlay = false; + bool compute_augref = false; + int64_t min_augref_length = 50; + string augref_sample; + bool augref_verbose = false; + string augref_segments_file; + + constexpr int OPT_VERBOSE = 1001; + constexpr int OPT_AUGREF_SEGMENTS = 1002; int c; optind = 2; // force optind past command positional argument @@ -170,11 +191,18 @@ int main_paths(int argc, char** argv) { {"threads-old", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, + // Augref options + {"compute-augref", no_argument, 0, 'u'}, + {"min-augref-len", required_argument, 0, 'l'}, + {"augref-sample", required_argument, 0, 'N'}, + {"augref-segs", required_argument, 0, OPT_AUGREF_SEGMENTS}, + {"verbose", no_argument, 0, OPT_VERBOSE}, + {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "h?LXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:", + c = getopt_long (argc, argv, "h?LXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:ul:N:", long_options, &option_index); // Detect the end of the options. @@ -307,7 +335,28 @@ int main_paths(int argc, char** argv) { case 't': set_thread_count(logger, optarg); break; - + + case 'u': + compute_augref = true; + output_formats++; + break; + + case 'l': + min_augref_length = parse(optarg); + break; + + case 'N': + augref_sample = optarg; + break; + + case OPT_VERBOSE: + augref_verbose = true; + break; + + case OPT_AUGREF_SEGMENTS: + augref_segments_file = ensure_writable(logger, optarg); + break; + case 'h': case '?': help_paths(argv); @@ -374,7 +423,16 @@ int main_paths(int argc, char** argv) { if (coverage && !gbwt_file.empty()) { logger.error() << "coverage option -c only works on embedded graph paths, not GBWT threads" << std::endl; } - + if (compute_augref && !gbwt_file.empty()) { + logger.error() << "augref computation only works on embedded graph paths, not GBWT threads" << std::endl; + } + if (compute_augref && path_prefix.empty()) { + logger.error() << "--compute-augref requires -Q to select reference path(s)" << std::endl; + } + if (!augref_segments_file.empty() && !compute_augref) { + logger.error() << "--augref-segs requires --compute-augref" << std::endl; + } + if (select_alt_paths) { // alt paths all have a specific prefix path_prefix = "_alt_"; @@ -416,6 +474,61 @@ int main_paths(int argc, char** argv) { + // Handle augref computation before other operations + if (compute_augref && graph) { + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph); + if (!mutable_graph) { + logger.error() << "graph cannot be modified for augref computation" << std::endl; + return 1; + } + + // Collect reference paths matching the prefix from -Q + unordered_set ref_paths; + graph->for_each_path_handle([&](path_handle_t ph) { + string path_name = graph->get_path_name(ph); + // Skip augref paths (they match prefixes but shouldn't be used as references) + if (AugRefCover::is_augref_name(path_name)) { + return; + } + if (path_name.compare(0, path_prefix.size(), path_prefix) == 0) { + ref_paths.insert(ph); + } + }); + if (ref_paths.empty()) { + logger.error() << "no paths found matching prefix: " << path_prefix << std::endl; + return 1; + } + 
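As an aside before the snarl computation: the `_alt` suffix that `is_augref_name()` checks follows a `{base}_{N}_alt` pattern, so the `AugRefCover` naming helpers round-trip. A usage sketch (assumes the helpers declared in augref.hpp):

```cpp
#include "augref.hpp"
#include <cassert>
#include <string>

void augref_naming_roundtrip() {
    // "chr1" + fragment index 2 -> "chr1_2_alt"
    std::string name = vg::AugRefCover::make_augref_name("chr1", 2);
    assert(vg::AugRefCover::is_augref_name(name));
    assert(vg::AugRefCover::parse_base_path(name) == "chr1");
    assert(vg::AugRefCover::parse_augref_index(name) == 2);
}
```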
+ // Compute snarls + IntegratedSnarlFinder finder(*graph); + SnarlManager snarl_manager(std::move(finder.find_snarls_parallel())); + + // Compute and apply augref cover + AugRefCover cover; + if (!augref_sample.empty()) { + cover.set_augref_sample(augref_sample); + } + cover.set_verbose(augref_verbose); + cover.clear(mutable_graph); + cover.compute(graph, &snarl_manager, ref_paths, min_augref_length); + + // Write augref segment table if requested + if (!augref_segments_file.empty()) { + ofstream segments_out(augref_segments_file); + if (!segments_out) { + logger.error() << "could not open augref-segs file: " << augref_segments_file << std::endl; + return 1; + } + cover.write_augref_segments(segments_out); + } + + cover.apply(mutable_graph); + + // Output the modified graph + vg::io::save_handle_graph(mutable_graph, cout); + return 0; + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -570,14 +683,14 @@ int main_paths(int argc, char** argv) { if (!path_prefix.empty()) { // Filter by name prefix std::string path_name = graph->get_path_name(path_handle); - + if (std::mismatch(path_name.begin(), path_name.end(), path_prefix.begin(), path_prefix.end()).second != path_prefix.end()) { // The path does not match the prefix. Skip it. return; } } - + // It didn't fail a prefix check, so use it. iteratee(path_handle); }); diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 1d8bbb6308..dc571fa74d 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -673,7 +673,8 @@ void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const std::unordered_map extra_node_weight; constexpr size_t EXTRA_WEIGHT = 10000000000; graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](path_handle_t path_handle) { - if (graph->get_path_name(path_handle).compare(0, ref_path_prefix.length(), ref_path_prefix) == 0) { + string path_name = graph->get_path_name(path_handle); + if (path_name.compare(0, ref_path_prefix.length(), ref_path_prefix) == 0) { ref_paths[path_handle] = 0; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))] += EXTRA_WEIGHT; extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(path_handle)))] += EXTRA_WEIGHT; @@ -776,5 +777,69 @@ void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const } } +unordered_map> add_star_traversals(vector& travs, + vector& names, + vector>& trav_clusters, + vector>& trav_cluster_info, + const unordered_map>& parent_haplotypes, + const set& sample_names) { + // Build a map of what haplotypes are already in the traversals + unordered_map> sample_to_haps; + + assert(names.size() == travs.size()); + for (int64_t i = 0; i < names.size(); ++i) { + string sample_name = PathMetadata::parse_sample_name(names[i]); + // for backward compatibility + if (sample_name.empty()) { + sample_name = names[i]; + } + auto phase = PathMetadata::parse_haplotype(names[i]); + if (!sample_name.empty() && phase == PathMetadata::NO_HAPLOTYPE) { + // This probably won't fit in an int. Use 0 instead. 
+            phase = 0;
+        }
+        sample_to_haps[sample_name].push_back(phase);
+    }
+
+    // Find everything that's in parent_haplotypes but not the traversals,
+    // and add in dummy star-alleles for them
+    for (const auto& parent_sample_haps : parent_haplotypes) {
+        string parent_sample_name = PathMetadata::parse_sample_name(parent_sample_haps.first);
+        if (parent_sample_name.empty()) {
+            parent_sample_name = parent_sample_haps.first;
+        }
+        if (!sample_names.count(parent_sample_name)) {
+            // don't bother for purely reference samples -- we don't need to force an allele for them.
+            continue;
+        }
+        for (int parent_hap : parent_sample_haps.second) {
+            bool found = false;
+            if (sample_to_haps.count(parent_sample_haps.first)) {
+                // note: this is a brute-force search, but the number of haplotypes is usually tiny.
+                for (int hap : sample_to_haps[parent_sample_haps.first]) {
+                    if (parent_hap == hap) {
+                        found = true;
+                        break;
+                    }
+                }
+            }
+            if (!found) {
+                travs.push_back(Traversal());
+                names.push_back(PathMetadata::create_path_name(PathSense::REFERENCE,
+                                                               parent_sample_haps.first,
+                                                               "star",
+                                                               parent_hap,
+                                                               PathMetadata::NO_PHASE_BLOCK,
+                                                               PathMetadata::NO_SUBRANGE));
+                sample_to_haps[parent_sample_haps.first].push_back(parent_hap);
+                trav_clusters.push_back({(int)travs.size() - 1});
+                trav_cluster_info.push_back(make_pair(0, 0));
+            }
+        }
+    }
+
+    return sample_to_haps;
+}
+
 }
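For readers skimming the diff, a minimal standalone sketch of the bookkeeping this function performs, using simplified std-only types (the real code above keys on full path names and appends placeholder Traversal objects rather than printing):

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    // Emit a star allele for each (sample, haplotype) pair that is present in
    // the parent snarl but absent from the child, so spanning haplotypes are
    // not silently dropped from the child site.
    int main() {
        std::map<std::string, std::set<int>> parent = {{"a", {1, 2}}};
        std::map<std::string, std::set<int>> child  = {{"a", {1}}};
        for (const auto& [sample, haps] : parent) {
            for (int hap : haps) {
                if (!child[sample].count(hap)) {
                    std::cout << "star allele for " << sample << "#" << hap << "\n";
                }
            }
        }
        return 0;
    }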
diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp
index e1f342ce65..3aba063cdd 100644
--- a/src/traversal_clusters.hpp
+++ b/src/traversal_clusters.hpp
@@ -106,12 +106,31 @@ void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const u

 /// for every snarl, bottom up, compute all the traversals through it. choose a reference traversal
-/// (using the prefix where possible), then if
+/// (using the prefix where possible), then if
 void simplify_graph_using_traversals(MutablePathMutableHandleGraph* graph, const string& ref_path_prefix,
                                      int64_t min_snarl_length, double min_jaccard, int64_t max_iterations,
                                      int64_t min_fragment_length);

+/**
+ * Add star-allele traversals for haplotypes that appear in the parent snarl but are missing
+ * from the current snarl's traversals. This is used for nested calling to ensure that
+ * haplotypes spanning a parent site but not traversing a child site get a "*" allele.
+ *
+ * @param travs Vector of traversals (modified in place to add star traversals)
+ * @param names Vector of path names corresponding to traversals (modified in place)
+ * @param trav_clusters Vector of cluster assignments (modified in place)
+ * @param trav_cluster_info Vector of cluster info (Jaccard, length delta) (modified in place)
+ * @param parent_haplotypes Map from sample name -> vector of haplotype phases in the parent
+ * @param sample_names Set of sample names to consider (reference-only samples are skipped)
+ * @return Map from sample name -> vector of haplotype phases in all traversals
+ */
+unordered_map<string, vector<int>> add_star_traversals(vector<Traversal>& travs,
+                                                       vector<string>& names,
+                                                       vector<vector<int>>& trav_clusters,
+                                                       vector<pair<double, int64_t>>& trav_cluster_info,
+                                                       const unordered_map<string, vector<int>>& parent_haplotypes,
+                                                       const set<string>& sample_names);

 }
diff --git a/test/nesting/cyclic_ref_multiple_variants.gfa b/test/nesting/cyclic_ref_multiple_variants.gfa
new file mode 100644
index 0000000000..9bc132988b
--- /dev/null
+++ b/test/nesting/cyclic_ref_multiple_variants.gfa
@@ -0,0 +1,19 @@
+H VN:Z:1.0
+S 1 AAATTTTCTGGAGTTCTAT
+S 2 A
+S 3 T
+S 4 G
+S 5 ATAT
+S 6 T
+S 7 CCAACTCTCTG
+L 1 + 2 + 0M
+L 1 + 3 + 0M
+L 1 + 4 + 0M
+L 2 + 5 + 0M
+L 3 + 5 + 0M
+L 4 + 5 + 0M
+L 5 + 1 + 0M
+L 5 + 6 + 0M
+L 6 + 7 + 0M
+P x 1+,2+,5+,1+,3+,5+,6+,7+ *
+P a#1#z0 1+,3+,5+,1+,4+,5+,6+,7+ *
diff --git a/test/nesting/dangling_node.gfa b/test/nesting/dangling_node.gfa
new file mode 100644
index 0000000000..fb8c8ca2bb
--- /dev/null
+++ b/test/nesting/dangling_node.gfa
@@ -0,0 +1,14 @@
+H VN:Z:1.0
+S 1 AAAA
+S 2 TTTT
+S 3 CCCC
+S 4 GGGG
+S 5 AAAAAAAA
+L 1 + 2 + 0M
+L 2 + 3 + 0M
+L 1 + 4 + 0M
+L 4 + 3 + 0M
+L 3 + 5 + 0M
+P x 1+,2+,3+ *
+P hap1 1+,4+,3+ *
+P hap2 1+,4+,3+,5+ *
diff --git a/test/nesting/star_allele_cluster.gfa b/test/nesting/star_allele_cluster.gfa
new file mode 100644
index 0000000000..c2f7d5605d
--- /dev/null
+++ b/test/nesting/star_allele_cluster.gfa
@@ -0,0 +1,19 @@
+H VN:Z:1.0
+S 1 C
+S 2 AA
+S 3 T
+S 4 G
+S 5 AA
+S 6 C
+P x 1+,6+ *
+P a#1#h1 1+,2+,6+ *
+P a#2#h2 1+,5+,3+,6+ *
+P a#2#h3 1+,5+,4+,6+ *
+L 1 + 2 + 0M
+L 1 + 5 + 0M
+L 1 + 6 + 0M
+L 2 + 6 + 0M
+L 5 + 3 + 0M
+L 5 + 4 + 0M
+L 3 + 6 + 0M
+L 4 + 6 + 0M
diff --git a/test/nesting/star_cluster.gfa b/test/nesting/star_cluster.gfa
new file mode 100644
index 0000000000..d482f66470
--- /dev/null
+++ b/test/nesting/star_cluster.gfa
@@ -0,0 +1,21 @@
+H VN:Z:1.0
+S 1 C
+S 2 A
+S 3 T
+S 4 A
+S 5 G
+S 6 T
+S 7 A
+P a#2#y1 1+,2+,3+,5+,6+ *
+P a#1#y0 1+,2+,4+,5+,6+ *
+P x 1+,6+ *
+P b#1#z0 1+,7+,6+ *
+L 1 + 2 + 0M
+L 1 + 6 + 0M
+L 1 + 7 + 0M
+L 2 + 3 + 0M
+L 2 + 4 + 0M
+L 3 + 5 + 0M
+L 4 + 5 + 0M
+L 5 + 6 + 0M
+L 7 + 6 + 0M
diff --git a/test/nesting/triple_nested_multisnp.gfa b/test/nesting/triple_nested_multisnp.gfa
new file mode 100644
index 0000000000..9f338de19d
--- /dev/null
+++ b/test/nesting/triple_nested_multisnp.gfa
@@ -0,0 +1,39 @@
+H VN:Z:1.0
+S 1 C
+S 2 A
+S 3 T
+S 31a T
+S 31b G
+S 311 G
+S 312 C
+S 313 A
+S 32 C
+S 33a C
+S 33b A
+S 34 T
+S 4 G
+S 5 T
+P x 1+,5+ *
+P a#1#y0 1+,2+,3+,31b+,311+,313+,32+,33b+,34+,4+,5+ *
+P a#2#y1 1+,2+,3+,31a+,311+,312+,32+,33a+,34+,4+,5+ *
+L 1 + 2 + 0M
+L 1 + 5 + 0M
+L 2 + 3 + 0M
+L 2 + 4 + 0M
+L 3 + 31a + 0M
+L 3 + 31b + 0M
+L 3 + 34 + 0M
+L 31a + 311 + 0M
+L 31b + 311 + 0M
+L 311 + 312 + 0M
+L 311 + 313 + 0M
+L 312 + 32 + 0M
+L 313 + 32 + 0M
+L 32 + 33a + 0M
+L 32 + 33b + 0M
+L 31a + 33a + 0M
+L 31b + 33b + 0M
+L 33a + 34 + 0M
+L 33b + 34 + 0M
+L 34 + 4 + 0M
+L 4 + 5 + 0M
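The 11_vg_paths.t tests below pin the augref naming convention {base}_{N}_alt down with grep -E "^x_[0-9]+_alt$". For reference, an equivalent standalone check (a sketch, assuming the base path name contains no regex metacharacters):

    #include <iostream>
    #include <regex>
    #include <string>

    // Matches augref names of the form {base}_{N}_alt, e.g. "x_1_alt".
    bool looks_like_augref(const std::string& name, const std::string& base) {
        const std::regex pat(base + "_[0-9]+_alt");
        return std::regex_match(name, pat);
    }

    int main() {
        std::cout << looks_like_augref("x_1_alt", "x") << "\n"; // 1
        std::cout << looks_like_augref("x_alt", "x") << "\n";   // 0
        return 0;
    }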
diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t
index 9cb9687893..7ca872b443 100644
--- a/test/t/11_vg_paths.t
+++ b/test/t/11_vg_paths.t
@@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg

 export LC_ALL="C" # force a consistent sort order

-plan tests 35
+plan tests 52

 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg
 vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg
@@ -126,9 +126,63 @@ is "$(cat out.txt | wc -l)" "1" "vg paths sees the haplotype path in a GBZ with
 rm -f norm_x4.gfa original.fa norm_x4.fa x4.path x4.norm.path out.txt err.txt
 rm -f empty.vg empty.gbwt empty.fa

+# Augref (augmented reference path) computation tests
+vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 > augref_test.vg
+vg validate augref_test.vg
+is $? 0 "augref computation produces valid graph"
+
+is $(vg paths -x augref_test.vg -L | grep "_alt$" | wc -l) 2 "augref computation creates expected number of augref paths"
+
+is $(vg paths -x augref_test.vg -L | grep "^x$" | wc -l) 1 "original reference path is preserved after augref computation"
+
+# Test augref naming convention matches pattern x_{N}_alt
+is $(vg paths -x augref_test.vg -L | grep -E "^x_[0-9]+_alt$" | wc -l) 2 "augref paths follow naming convention path_{N}_alt"
+
+# Test with triple_nested.gfa which has more complex structure
+vg paths -x nesting/triple_nested.gfa -Q x --compute-augref --min-augref-len 1 > triple_augref.vg
+vg validate triple_augref.vg
+is $? 0 "augref computation works on complex nested structure"
+
+is $(vg paths -x triple_augref.vg -L | grep "_alt$" | wc -l) 2 "correct number of augref paths for triple nested graph"
+
+# Test minimum length filter
+vg paths -x nesting/triple_nested.gfa -Q x --compute-augref --min-augref-len 100 > triple_augref_long.vg
+is $(vg paths -x triple_augref_long.vg -L | grep "_alt$" | wc -l) 0 "min-augref-len filters out short fragments"
+
+# Test second pass coverage of dangling nodes (nodes outside snarls but on haplotype paths)
+vg paths -x nesting/dangling_node.gfa -Q x --compute-augref --min-augref-len 1 > dangling_augref.vg
+vg validate dangling_augref.vg
+is $? 0 "augref computation handles dangling nodes outside snarls"
+
+is $(vg paths -x dangling_augref.vg -L | grep "_alt$" | wc -l) 2 "augref second pass covers dangling nodes"
+
+# Verify the dangling node (node 5, 8bp) is covered: the augref path lengths should
+# sum to 12bp (4bp snarl node + 8bp dangling node)
+# Note: select augref paths by grepping the -E output rather than with -Q, since
+# augref paths are filtered out of prefix matching
+is $(vg paths -x dangling_augref.vg -E | grep "_alt" | awk '{sum+=$2} END {print sum}') 12 "augref paths cover both snarl node and dangling node"
+
+# Test --augref-segs option for writing segment table
+vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 --augref-segs augref_test.segs > augref_segs_test.vg
+is $? 0 "augref-segs option produces no error"
+
+is $(wc -l < augref_test.segs) 2 "augref-segs produces correct number of lines"
+
+is $(cut -f4 augref_test.segs | grep -c "x_.*_alt") 2 "augref-segs contains augref path names"
+
+is $(cut -f1 augref_test.segs | grep -c "#") 2 "augref-segs contains source path names with metadata"
+
+is $(cut -f5 augref_test.segs | grep -c "^x$") 2 "augref-segs contains reference path name"
+
+# Test that augref-segs requires compute-augref
+vg paths -x nesting/nested_snp_in_ins.gfa -Q x -L --augref-segs augref_test.segs 2>&1 | grep -q "requires --compute-augref"
+is $?
0 "augref-segs requires compute-augref option" + +# Test augref-segs with augref-sample option +vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 --augref-sample TESTSAMPLE --augref-segs augref_sample_test.segs > augref_sample_test.vg +is $(cut -f4 augref_sample_test.segs | grep -c "TESTSAMPLE") 2 "augref-segs uses augref-sample for path names" + +rm -f augref_test.vg triple_augref.vg triple_augref_long.vg dangling_augref.vg x.pg x.gbwt x.gbz +rm -f augref_test.segs augref_segs_test.vg augref_sample_test.segs augref_sample_test.vg - - diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index c9084c0556..946b3fc3c6 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 19 +plan tests 86 # Toy example of hand-made pileup (and hand inspected truth) to make sure some # obvious (and only obvious) SNPs are detected by vg call @@ -191,7 +191,577 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf - - - +# Test: Clustering option (-L) in vg call +vg construct -r small/x.fa -v small/x.vcf.gz | vg view -g - > small_cluster_call.gfa +printf "L\t1\t+\t9\t+\t0M\n" >> small_cluster_call.gfa +printf "P\ty\t1+,2+,4+,6+,7+,9+\t*\n" >> small_cluster_call.gfa +printf "P\tz\t1+,9+\t*\n" >> small_cluster_call.gfa +vg view -Fv small_cluster_call.gfa > small_cluster_call.vg +vg sim -x small_cluster_call.vg -n 100 -l 20 -a -s 1 > small_cluster_call.gam +vg pack -x small_cluster_call.vg -g small_cluster_call.gam -o small_cluster_call.pack +# Call without clustering +vg call small_cluster_call.vg -k small_cluster_call.pack -p x --top-down > call_no_cluster.vcf 2>/dev/null +# Call with clustering +vg call small_cluster_call.vg -k small_cluster_call.pack -p x --top-down -L 0.3 > call_cluster.vcf 2>/dev/null +is $(grep -v "^#" call_no_cluster.vcf | wc -l) $(grep -v "^#" call_cluster.vcf | wc -l) "clustering does not change number of variant sites in vg call" + +rm -f small_cluster_call.gfa small_cluster_call.vg small_cluster_call.gam small_cluster_call.pack call_no_cluster.vcf call_cluster.vcf + +# Test: Nested calling with vg call --top-down (basic test) +# Use a larger graph with clear variants +vg construct -r small/x.fa -v small/x.vcf.gz -a > nested_call_test.vg +vg sim -x nested_call_test.vg -n 500 -l 50 -a -s 42 > nested_call_test.gam +vg pack -x nested_call_test.vg -g nested_call_test.gam -o nested_call_test.pack +vg call nested_call_test.vg -k nested_call_test.pack -p x --top-down > nested_call_test.vcf 2>/dev/null +# Should produce at least one variant (the small/x graph has multiple variants) +NESTED_VARIANT_COUNT=$(grep -v "^#" nested_call_test.vcf | wc -l) +is $(if [ "$NESTED_VARIANT_COUNT" -ge 1 ]; then echo "1"; else echo "0"; fi) "1" "nested vg call produces at least one variant" + +rm -f nested_call_test.vg nested_call_test.gam nested_call_test.pack nested_call_test.vcf + +# Test: Nested calling emits both top-level and child snarls +# This tests the fix for nested snarl VCF emission where genotypes propagate to children +vg view -Fv nesting/nested_snp_in_del.gfa > nested_snp.vg +vg sim -x nested_snp.vg -n 100 -l 5 -a -s 42 > nested_snp.gam +vg pack -x nested_snp.vg -g nested_snp.gam -o nested_snp.pack +vg call nested_snp.vg -k nested_snp.pack --top-down -p x 
2>/dev/null > nested_snp.vcf +# Should have exactly 2 variant lines: one for top-level snarl (1->6) and one for nested snarl (2->5) +NESTED_LINE_COUNT=$(grep -v "^#" nested_snp.vcf | wc -l) +is "$NESTED_LINE_COUNT" "2" "nested vg call emits both top-level and child snarl variants" + +rm -f nested_snp.vg nested_snp.gam nested_snp.pack nested_snp.vcf + +# Test: Star allele option validation (-Y requires --top-down) +vg construct -r small/x.fa -v small/x.vcf.gz > star_test.vg +vg sim -x star_test.vg -n 100 -l 20 -a -s 1 > star_test.gam +vg pack -x star_test.vg -g star_test.gam -o star_test.pack +vg call star_test.vg -k star_test.pack -Y 2>&1 | grep -q "requires --top-down" +is "$?" 0 "star allele option requires top-down mode" + +rm -f star_test.vg star_test.gam star_test.pack + +# Test: Cluster-post validation (requires -L < 1.0) +vg construct -r small/x.fa -v small/x.vcf.gz > cluster_post_test.vg +vg sim -x cluster_post_test.vg -n 100 -l 20 -a -s 1 > cluster_post_test.gam +vg pack -x cluster_post_test.vg -g cluster_post_test.gam -o cluster_post_test.pack +vg call cluster_post_test.vg -k cluster_post_test.pack --cluster-post 2>&1 | grep -q "requires -L/--cluster" +is "$?" 0 "cluster-post option requires cluster threshold" + +rm -f cluster_post_test.vg cluster_post_test.gam cluster_post_test.pack + +# ============================================================================= +# Nested genotype propagation tests +# These tests verify correct genotype propagation from parent to child snarls +# Graph: nested_snp_in_del.gfa +# x (ref): 1->2->3->5->6 (allele 0 at top-level, allele 0 at nested) +# y0: 1->2->4->5->6 (allele 0 at top-level with SNP, allele 1 at nested) +# y1: 1->6 (allele 1 = deletion at top-level, star at nested) +# ============================================================================= + +# Test 0/0: homozygous reference - reads only from x path +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 100 -l 2 -a -s 1 > nd_00.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_00.gam -o nd_00.pack +vg call nesting/nested_snp_in_del.gfa -k nd_00.pack --top-down -p x 2>/dev/null > nd_00.vcf +# 0/0 should produce no non-ref variants +ND_00_NONREF=$(grep -v "^#" nd_00.vcf | grep -v "0/0" | wc -l) +is "$ND_00_NONREF" "0" "nested_snp_in_del 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/SNP - reads from x and y0 (both traverse nested snarl) +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 50 -l 2 -a -s 10 > nd_01.gam +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 50 -l 2 -a -s 11 >> nd_01.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_01.gam -o nd_01.pack +vg call nesting/nested_snp_in_del.gfa -k nd_01.pack --top-down -p x 2>/dev/null > nd_01.vcf +# Should have 2 variants (top-level and nested), both het +ND_01_COUNT=$(grep -v "^#" nd_01.vcf | wc -l) +is "$ND_01_COUNT" "2" "nested_snp_in_del 0/1: produces both top-level and nested variants" + +# Test 1/1: homozygous alt SNP - reads only from y0 path (via sample a haplotype 1) +# We need to simulate specifically from haplotype 1 (y0) not from both haplotypes +# Since -m a gives both haplotypes, we use -A and rely on the path structure +vg sim -x nesting/nested_snp_in_del.gfa -A -n 100 -l 2 -a -s 20 > nd_11.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_11.gam -o nd_11.pack +vg call nesting/nested_snp_in_del.gfa -k nd_11.pack --top-down -p x 2>/dev/null > nd_11.vcf +# Should produce variants (exact genotype depends on path coverage) +ND_11_EXIT=$? 
+is "$ND_11_EXIT" "0" "nested_snp_in_del 1/1: vg call completes without error" + +# Test 1/2: het SNP/deletion - reads from y0 and y1 (sample a has both) +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 30 > nd_12.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nd_12.gam -o nd_12.pack +vg call nesting/nested_snp_in_del.gfa -k nd_12.pack --top-down -p x 2>/dev/null > nd_12.vcf +# Should have 2 variants, nested one should have missing allele +ND_12_COUNT=$(grep -v "^#" nd_12.vcf | wc -l) +is "$ND_12_COUNT" "2" "nested_snp_in_del 1/2: het SNP/del produces both variants" +# Nested snarl should have missing allele (.) for deletion parent +ND_12_MISSING=$(grep ">2>5" nd_12.vcf | grep -c "\./") +is "$ND_12_MISSING" "1" "nested_snp_in_del 1/2: nested snarl shows missing allele for deletion" + +rm -f nd_00.gam nd_00.pack nd_00.vcf nd_01.gam nd_01.pack nd_01.vcf +rm -f nd_11.gam nd_11.pack nd_11.vcf nd_12.gam nd_12.pack nd_12.vcf + +# ============================================================================= +# Star allele tests (-Y flag) +# When parent allele doesn't traverse child, output * instead of . +# ============================================================================= + +# Test star allele output with -Y flag +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 40 > star.gam +vg pack -x nesting/nested_snp_in_del.gfa -g star.gam -o star.pack +vg call nesting/nested_snp_in_del.gfa -k star.pack --top-down -Y -p x 2>/dev/null > star.vcf +# With -Y, nested snarl should have * in ALT instead of . in GT +# The genotype will use numeric index (e.g. 1/2) where one allele is * +STAR_IN_ALT=$(grep ">2>5" star.vcf | cut -f5 | grep -c "\*") +is "$STAR_IN_ALT" "1" "star allele: -Y flag produces * in ALT for spanning deletion" +# Verify the genotype doesn't have . when -Y is used (it uses indexed * instead) +# Extract just the GT field (first colon-separated field in SAMPLE column) +NO_MISSING_GT=$(grep ">2>5" star.vcf | cut -f10 | cut -d: -f1 | grep -v "\." | wc -l) +is "$NO_MISSING_GT" "1" "star allele: genotype uses indexed * instead of . with -Y" + +rm -f star.gam star.pack star.vcf + +# ============================================================================= +# Nested quality metrics tests +# Verify QUAL, GQ, GL are computed for nested variant calls +# ============================================================================= + +# Test: nested calls should have non-zero QUAL +vg sim -x nesting/nested_snp_in_del.gfa -m a -n 100 -l 2 -a -s 50 > nq.gam +vg pack -x nesting/nested_snp_in_del.gfa -g nq.gam -o nq.pack +vg call nesting/nested_snp_in_del.gfa -k nq.pack --top-down -p x 2>/dev/null > nq.vcf + +# Check that nested snarl (>2>5) has non-zero QUAL +NQ_QUAL=$(grep ">2>5" nq.vcf | cut -f6) +NQ_HAS_QUAL=$(if [ "$NQ_QUAL" != "0" ] && [ "$NQ_QUAL" != "." ]; then echo "1"; else echo "0"; fi) +is "$NQ_HAS_QUAL" "1" "nested call has non-zero QUAL" + +# Check that nested snarl has GQ in FORMAT +NQ_FORMAT=$(grep ">2>5" nq.vcf | cut -f9) +NQ_HAS_GQ=$(echo "$NQ_FORMAT" | grep -c "GQ") +is "$NQ_HAS_GQ" "1" "nested call has GQ field" + +# Check that top-level snarl also has quality (unaffected) +TL_QUAL=$(grep ">1>6" nq.vcf | cut -f6) +TL_HAS_QUAL=$(if [ "$TL_QUAL" != "0" ] && [ "$TL_QUAL" != "." 
]; then echo "1"; else echo "0"; fi) +is "$TL_HAS_QUAL" "1" "top-level call still has non-zero QUAL" + +rm -f nq.gam nq.pack nq.vcf + +# Test: triple nested with augref paths should all have quality +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > tnq_ap.gfa 2>/dev/null +vg sim -x tnq_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 51 > tnq.gam +vg sim -x tnq_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 52 >> tnq.gam +vg pack -x tnq_ap.gfa -g tnq.gam -o tnq.pack +vg call tnq_ap.gfa -k tnq.pack --top-down -P x 2>/dev/null > tnq.vcf + +# All variant lines should have non-zero QUAL +TNQ_ZERO_QUAL=$(grep -v "^#" tnq.vcf | awk -F'\t' '$6 == "0" || $6 == "."' | wc -l) +is "$TNQ_ZERO_QUAL" "0" "triple nested calls all have non-zero QUAL" + +# All variant lines should have GQ in FORMAT +TNQ_ALL_GQ=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GQ" | wc -l) +is "$TNQ_ALL_GQ" "0" "triple nested calls all have GQ field" + +# All variant lines should have GL (Genotype Likelihood) in FORMAT +TNQ_ALL_GL=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GL" | wc -l) +is "$TNQ_ALL_GL" "0" "triple nested calls all have GL field" + +# All variant lines should have GP (Genotype Posterior) in FORMAT +TNQ_ALL_GP=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "GP" | wc -l) +is "$TNQ_ALL_GP" "0" "triple nested calls all have GP field" + +# All variant lines should have XD (Expected Depth) in FORMAT +TNQ_ALL_XD=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "XD" | wc -l) +is "$TNQ_ALL_XD" "0" "triple nested calls all have XD field" + +# All variant lines should have AD (Allelic Depth) in FORMAT +TNQ_ALL_AD=$(grep -v "^#" tnq.vcf | cut -f9 | grep -v "AD" | wc -l) +is "$TNQ_ALL_AD" "0" "triple nested calls all have AD field" + +# GQ values should be in valid range (0-256, integers) +TNQ_INVALID_GQ=$(grep -v "^#" tnq.vcf | awk -F'\t' '{ + split($9, fmt, ":"); + split($10, val, ":"); + for (i=1; i<=length(fmt); i++) { + if (fmt[i] == "GQ") { + gq = val[i]; + if (gq !~ /^[0-9]+$/ || gq < 0 || gq > 256) print "invalid"; + } + } +}' | wc -l) +is "$TNQ_INVALID_GQ" "0" "triple nested calls have valid GQ values (0-256)" + +# All variant lines should have LV (nesting level) in INFO +TNQ_ALL_LV=$(grep -v "^#" tnq.vcf | cut -f8 | grep -v "LV=" | wc -l) +is "$TNQ_ALL_LV" "0" "triple nested calls all have LV tag" + +# Nested variants (LV > 0) should have PS (parent snarl) in INFO +TNQ_NESTED_NO_PS=$(grep -v "^#" tnq.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ && $8 !~ /PS=/' | wc -l) +is "$TNQ_NESTED_NO_PS" "0" "nested calls (LV>0) all have PS tag" + +# Top-level variants (LV=0) should NOT have PS tag +TNQ_TOPLEVEL_HAS_PS=$(grep -v "^#" tnq.vcf | awk -F'\t' '$8 ~ /LV=0/ && $8 ~ /PS=/' | wc -l) +is "$TNQ_TOPLEVEL_HAS_PS" "0" "top-level calls (LV=0) do not have PS tag" + +rm -f tnq_ap.gfa tnq.gam tnq.pack tnq.vcf + +# ============================================================================= +# nested_snp_in_ins.gfa tests +# Graph structure: +# x (ref): 1->6 (short ref, allele 0 at top-level) +# y0 (a#1): 1->2->4->5->6 (insertion with SNP A, allele 1 at top, allele 1 at nested) +# y1 (a#2): 1->2->3->5->6 (insertion with SNP T, allele 2 at top, allele 0 at nested) +# Top-level snarl: 1->6, Nested snarl: 2->5 +# NOTE: ref path x bypasses the nested snarl, so we need augref covers to call nested variants +# ============================================================================= + +# Compute augref cover first (creates x_1_alt, x_2_alt, etc. 
covering nodes not on x) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_ins.gfa > ni_ap.gfa + +# Test 0/0: homozygous reference - reads only from x path (short ref) +vg sim -x ni_ap.gfa -P x -n 100 -l 2 -a -s 70 > ni_00.gam +vg pack -x ni_ap.gfa -g ni_00.gam -o ni_00.pack +vg call ni_ap.gfa -k ni_00.pack --top-down -P x 2>/dev/null > ni_00.vcf +# 0/0 should produce no non-ref variants (or only ref calls) +NI_00_NONREF=$(grep -v "^#" ni_00.vcf | grep -v "0/0" | wc -l) +is "$NI_00_NONREF" "0" "nested_snp_in_ins 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/insertion - reads from x and y1 (a#2 haplotype) +# Note: y1 path is 5bp vs x path 2bp, so need ~2x more y1 reads for balanced boundary coverage +vg sim -x ni_ap.gfa -P x -n 50 -l 2 -a -s 71 > ni_01.gam +vg sim -x ni_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 72 >> ni_01.gam +vg pack -x ni_ap.gfa -g ni_01.gam -o ni_01.pack +vg call ni_ap.gfa -k ni_01.pack --top-down -P x 2>/dev/null > ni_01.vcf +# With augref paths: both top-level and nested variants emitted +NI_01_COUNT=$(grep -v "^#" ni_01.vcf | wc -l) +is "$NI_01_COUNT" "2" "nested_snp_in_ins 0/1: het ref/ins produces top-level and nested variants with augref paths" + +# Test 1/1: homozygous insertion - reads only from y1 path +vg sim -x ni_ap.gfa -P "a#2#y1#0" -n 100 -l 2 -a -s 73 > ni_11.gam +vg pack -x ni_ap.gfa -g ni_11.gam -o ni_11.pack +vg call ni_ap.gfa -k ni_11.pack --top-down -P x 2>/dev/null > ni_11.vcf +# With augref paths: both top-level and nested variants emitted +NI_11_COUNT=$(grep -v "^#" ni_11.vcf | wc -l) +is "$NI_11_COUNT" "2" "nested_snp_in_ins 1/1: homozygous ins produces top-level and nested variants with augref paths" + +# Test 1/2: het between two insertion alleles - reads from both y0 and y1 +vg sim -x ni_ap.gfa -m a -n 200 -l 2 -a -s 74 > ni_12.gam +vg pack -x ni_ap.gfa -g ni_12.gam -o ni_12.pack +vg call ni_ap.gfa -k ni_12.pack --top-down -P x 2>/dev/null > ni_12.vcf +# With augref paths: both top-level and nested variants emitted +NI_12_COUNT=$(grep -v "^#" ni_12.vcf | wc -l) +is "$NI_12_COUNT" "2" "nested_snp_in_ins 1/2: het ins/ins produces top-level and nested variants with augref paths" + +rm -f ni_ap.gfa ni_00.gam ni_00.pack ni_00.vcf ni_01.gam ni_01.pack ni_01.vcf +rm -f ni_11.gam ni_11.pack ni_11.vcf ni_12.gam ni_12.pack ni_12.vcf + +# ============================================================================= +# Triple nested graph tests +# Graph structure: +# x (ref): 1->5 (short ref, bypasses all nesting) +# y0 (a#1): 1->2->3->31->311->313->32->33->34->4->5 (deep insertion) +# y1 (a#2): 1->2->3->31->311->312->32->33->34->4->5 (different at deepest level) +# y2 (a#3): same as y0 +# Top-level snarl: 1->5, with 4 levels of nesting inside +# NOTE: ref path x bypasses all nesting, so we need augref covers to call nested variants +# ============================================================================= + +# Compute augref cover first (creates x_1_alt, x_2_alt, etc. 
covering nested nodes) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > tn_ap.gfa + +# Test 0/0: homozygous reference - reads only from x path +vg sim -x tn_ap.gfa -P x -n 100 -l 2 -a -s 80 > tn_00.gam +vg pack -x tn_ap.gfa -g tn_00.gam -o tn_00.pack +vg call tn_ap.gfa -k tn_00.pack --top-down -P x 2>/dev/null > tn_00.vcf +TN_00_NONREF=$(grep -v "^#" tn_00.vcf | grep -v "0/0" | wc -l) +is "$TN_00_NONREF" "0" "triple_nested 0/0: homozygous ref produces no non-ref variants" + +# Test 0/1: het ref/insertion - reads from x and y0 +# Note: y0 path is 11bp vs x path 2bp, so need ~5x more y0 reads for balanced boundary coverage +vg sim -x tn_ap.gfa -P x -n 50 -l 2 -a -s 81 > tn_01.gam +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 500 -l 2 -a -s 82 >> tn_01.gam +vg pack -x tn_ap.gfa -g tn_01.gam -o tn_01.pack +vg call tn_ap.gfa -k tn_01.pack --top-down -P x 2>/dev/null > tn_01.vcf +# With augref paths: all 5 nesting levels can be emitted +TN_01_COUNT=$(grep -v "^#" tn_01.vcf | wc -l) +is "$TN_01_COUNT" "5" "triple_nested 0/1: het ref/ins produces all 5 nesting level variants with augref paths" + +# Test 1/1: homozygous insertion - reads only from y0 path +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 83 > tn_11.gam +vg pack -x tn_ap.gfa -g tn_11.gam -o tn_11.pack +vg call tn_ap.gfa -k tn_11.pack --top-down -P x 2>/dev/null > tn_11.vcf +# With augref paths: 1 variant emitted (top-level insertion only) +# All nested snarls are 0/0 because y0 matches x_1_alt reference at all levels +# (y0 goes through 313, and x_1_alt also goes through 313) +TN_11_COUNT=$(grep -v "^#" tn_11.vcf | wc -l) +is "$TN_11_COUNT" "1" "triple_nested 1/1: homozygous ins produces only top-level variant" + +# Test 1/2: het between insertion alleles - reads from y0 and y1 (differ at deepest SNP) +vg sim -x tn_ap.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 84 > tn_12.gam +vg sim -x tn_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 85 >> tn_12.gam +vg pack -x tn_ap.gfa -g tn_12.gam -o tn_12.pack +vg call tn_ap.gfa -k tn_12.pack --top-down -P x 2>/dev/null > tn_12.vcf +# With augref paths: all 5 nesting levels can be emitted +TN_12_COUNT=$(grep -v "^#" tn_12.vcf | wc -l) +is "$TN_12_COUNT" "5" "triple_nested 1/2: het ins/ins produces all 5 nesting level variants with augref paths" + +rm -f tn_ap.gfa tn_00.gam tn_00.pack tn_00.vcf tn_01.gam tn_01.pack tn_01.vcf +rm -f tn_11.gam tn_11.pack tn_11.vcf tn_12.gam tn_12.pack tn_12.vcf + +# ============================================================================= +# Multi-level SNP test (triple_nested_multisnp.gfa) +# This graph has SNPs at multiple nesting levels: +# - y0 matches x_1_alt at all levels (will be chosen as augref reference) +# - y1 differs from x_1_alt at all nested levels +# When simulating from y1, we should get variants at all 4 nesting levels +# ============================================================================= + +vg paths -x nesting/triple_nested_multisnp.gfa -Q x --compute-augref --min-augref-len 1 > tn_ms_ap.gfa +vg sim -x tn_ms_ap.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 100 > tn_ms.gam +vg pack -x tn_ms_ap.gfa -g tn_ms.gam -o tn_ms.pack +vg call tn_ms_ap.gfa -k tn_ms.pack --top-down -P x 2>/dev/null > tn_ms.vcf +# Should get 4 variants: top-level + 3 nested SNPs (all at 1/1) +TN_MS_COUNT=$(grep -v "^#" tn_ms.vcf | wc -l) +is "$TN_MS_COUNT" "4" "triple_nested_multisnp 1/1: homozygous alt produces variants at all 4 nesting levels" +# Verify all variants are 1/1 (homozygous alt) +TN_MS_HOM=$(grep -v "^#" tn_ms.vcf | cut -f10 | cut -d: -f1 | grep 
-c "1/1") +is "$TN_MS_HOM" "4" "triple_nested_multisnp 1/1: all 4 variants are homozygous alt" +# Verify LV tags span levels 0-3 +TN_MS_LV0=$(grep -v "^#" tn_ms.vcf | grep -c "LV=0") +TN_MS_LV123=$(grep -v "^#" tn_ms.vcf | grep -c "LV=[123]") +is "$TN_MS_LV0" "1" "triple_nested_multisnp: one top-level variant (LV=0)" +is "$TN_MS_LV123" "3" "triple_nested_multisnp: three nested variants (LV=1,2,3)" + +rm -f tn_ms_ap.gfa tn_ms.gam tn_ms.pack tn_ms.vcf + +# ============================================================================= +# Short reference bypass tests +# NOTE: ref path x bypasses nested structures, so we need augref covers to call nested variants +# ============================================================================= + +# Test nested_snp_in_nested_ins.gfa - ref bypasses all nested structures +# Compute augref cover first (creates x_1_alt, etc. covering nested nodes) +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_nested_ins.gfa > bypass_ap.gfa +vg sim -x bypass_ap.gfa -m a -n 100 -l 2 -a -s 60 > bypass.gam +vg pack -x bypass_ap.gfa -g bypass.gam -o bypass.pack +vg call bypass_ap.gfa -k bypass.pack --top-down -P x 2>/dev/null > bypass.vcf +BYPASS_EXIT=$? +is "$BYPASS_EXIT" "0" "nested_snp_in_nested_ins: vg call handles short-ref nested graph with augref paths without crashing" + +rm -f bypass_ap.gfa bypass.gam bypass.pack bypass.vcf + +# ============================================================================= +# --top-down -a interaction tests +# Verify nested calling with reference output (-a) works correctly +# ============================================================================= + +# Test 1: nested_snp_in_del 0/0 with --top-down -a +# When ref traverses nested snarl, both levels should get 0/0 +vg sim -x nesting/nested_snp_in_del.gfa -P x -n 100 -l 2 -a -s 200 > na_del.gam +vg pack -x nesting/nested_snp_in_del.gfa -g na_del.gam -o na_del.pack +vg call nesting/nested_snp_in_del.gfa -k na_del.pack --top-down -a -p x 2>/dev/null > na_del.vcf + +# Count variant lines (should be 2: top-level + nested) +NA_DEL_COUNT=$(grep -v "^#" na_del.vcf | wc -l) +is "$NA_DEL_COUNT" "2" "--top-down -a: nested_snp_in_del 0/0 emits both snarls" + +# Verify top-level is 0/0 (use awk to match ID column exactly) +NA_DEL_TOP_GT=$(awk -F'\t' '$3 == ">1>6" {print $10}' na_del.vcf | cut -d: -f1) +is "$NA_DEL_TOP_GT" "0/0" "--top-down -a: nested_snp_in_del top-level is 0/0" + +# Verify nested is 0/0 (use awk to match ID column exactly) +NA_DEL_NEST_GT=$(awk -F'\t' '$3 == ">2>5" {print $10}' na_del.vcf | cut -d: -f1) +is "$NA_DEL_NEST_GT" "0/0" "--top-down -a: nested_snp_in_del nested is 0/0" + +rm -f na_del.gam na_del.pack na_del.vcf + +# Test 2: nested_snp_in_ins 0/0 with --top-down -a (with augref paths) +# When ref bypasses nested snarl and both alleles are ref, nested NOT emitted +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/nested_snp_in_ins.gfa > na_ins_ap.gfa 2>/dev/null +vg sim -x na_ins_ap.gfa -P x -n 100 -l 2 -a -s 200 > na_ins_00.gam +vg pack -x na_ins_ap.gfa -g na_ins_00.gam -o na_ins_00.pack +vg call na_ins_ap.gfa -k na_ins_00.pack --top-down -a -P x 2>/dev/null > na_ins_00.vcf + +# Count variant lines (should be 1: only top-level, nested not emitted) +NA_INS_00_COUNT=$(grep -v "^#" na_ins_00.vcf | wc -l) +is "$NA_INS_00_COUNT" "1" "--top-down -a: nested_snp_in_ins 0/0 emits only top-level (ref spans nested)" + +rm -f na_ins_00.gam na_ins_00.pack na_ins_00.vcf + +# Test 3: nested_snp_in_ins 0/1 with --top-down -a (with augref 
paths) +# When ref bypasses nested but alt traverses, nested should have ./X genotype +vg sim -x na_ins_ap.gfa -P x -n 50 -l 2 -a -s 200 > na_ins_01.gam +vg sim -x na_ins_ap.gfa -P "a#1#y0#0" -n 100 -l 2 -a -s 201 >> na_ins_01.gam +vg pack -x na_ins_ap.gfa -g na_ins_01.gam -o na_ins_01.pack +vg call na_ins_ap.gfa -k na_ins_01.pack --top-down -a -P x 2>/dev/null > na_ins_01.vcf + +# Count variant lines (should be 2: top-level + nested) +NA_INS_01_COUNT=$(grep -v "^#" na_ins_01.vcf | wc -l) +is "$NA_INS_01_COUNT" "2" "--top-down -a: nested_snp_in_ins 0/1 emits both snarls" + +# Verify nested has missing allele marker (.) +NA_INS_01_NEST_GT=$(grep ">2>5" na_ins_01.vcf | cut -f10 | cut -d: -f1) +NA_INS_01_HAS_MISSING=$(echo "$NA_INS_01_NEST_GT" | grep -c "\.") +is "$NA_INS_01_HAS_MISSING" "1" "--top-down -a: nested_snp_in_ins 0/1 nested has missing allele (.)" + +rm -f na_ins_ap.gfa na_ins_01.gam na_ins_01.pack na_ins_01.vcf + +# Test 4: triple_nested 0/0 with --top-down -a (with augref paths) +# When ref bypasses all nested snarls and genotype is 0/0, only top-level emitted +vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > na_tn_ap.gfa 2>/dev/null +vg sim -x na_tn_ap.gfa -P x -n 100 -l 2 -a -s 210 > na_tn_00.gam +vg pack -x na_tn_ap.gfa -g na_tn_00.gam -o na_tn_00.pack +vg call na_tn_ap.gfa -k na_tn_00.pack --top-down -a -P x 2>/dev/null > na_tn_00.vcf + +# Count variant lines (should be 1: only top-level, nested not emitted since ref spans them) +NA_TN_00_COUNT=$(grep -v "^#" na_tn_00.vcf | wc -l) +is "$NA_TN_00_COUNT" "1" "--top-down -a: triple_nested 0/0 emits only top-level (ref spans all nested)" + +# Verify top-level is 0/0 +NA_TN_00_GT=$(grep -v "^#" na_tn_00.vcf | cut -f10 | cut -d: -f1) +is "$NA_TN_00_GT" "0/0" "--top-down -a: triple_nested 0/0 top-level is 0/0" + +rm -f na_tn_00.gam na_tn_00.pack na_tn_00.vcf + +# Test 5: triple_nested 0/1 with --top-down -a (with augref paths) +# When ref bypasses nested but alt traverses, nested snarls should have ./X genotype +vg sim -x na_tn_ap.gfa -P x -n 50 -l 2 -a -s 211 > na_tn_01.gam +vg sim -x na_tn_ap.gfa -P "a#1#y0#0" -n 500 -l 2 -a -s 212 >> na_tn_01.gam +vg pack -x na_tn_ap.gfa -g na_tn_01.gam -o na_tn_01.pack +vg call na_tn_ap.gfa -k na_tn_01.pack --top-down -a -P x 2>/dev/null > na_tn_01.vcf + +# Count variant lines (should be 5: all nesting levels emitted with -a) +NA_TN_01_COUNT=$(grep -v "^#" na_tn_01.vcf | wc -l) +is "$NA_TN_01_COUNT" "5" "--top-down -a: triple_nested 0/1 emits all 5 nesting levels" + +# Nested snarls (LV > 0) should have missing allele (.) for the spanning ref +NA_TN_01_NESTED_MISSING=$(grep -v "^#" na_tn_01.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ {print $10}' | cut -d: -f1 | grep -c "\.") +NA_TN_01_NESTED_COUNT=$(grep -v "^#" na_tn_01.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/' | wc -l) +is "$NA_TN_01_NESTED_MISSING" "$NA_TN_01_NESTED_COUNT" "--top-down -a: triple_nested 0/1 all nested snarls have missing allele (.)" + +rm -f na_tn_ap.gfa na_tn_01.gam na_tn_01.pack na_tn_01.vcf + +# ============================================================================= +# -A (all-snarls) flag tests +# Verify that -A flag: +# 1. Includes LV/PS header tags in VCF output +# 2. Calls all snarls including nested ones (each independently) +# 3. 
Produces consistent results
+# =============================================================================
+
+# Test: -A flag includes LV and PS header lines
+vg construct -r small/x.fa -v small/x.vcf.gz -a > all_snarls_test.vg
+vg sim -x all_snarls_test.vg -n 200 -l 30 -a -s 300 > all_snarls_test.gam
+vg pack -x all_snarls_test.vg -g all_snarls_test.gam -o all_snarls_test.pack
+vg call all_snarls_test.vg -k all_snarls_test.pack -A > all_snarls_test.vcf
+
+# Check for LV header
+AS_HAS_LV_HEADER=$(grep -c "##INFO=<ID=LV" all_snarls_test.vcf)
+is "$AS_HAS_LV_HEADER" "1" "-A flag: VCF header includes LV line"
+
+# Check for PS header
+AS_HAS_PS_HEADER=$(grep -c "##INFO=<ID=PS" all_snarls_test.vcf)
+is "$AS_HAS_PS_HEADER" "1" "-A flag: VCF header includes PS line"
+
+rm -f all_snarls_test.vg all_snarls_test.gam all_snarls_test.pack all_snarls_test.vcf
+
+# Test: -A flag calls nested snarls
+vg view -Fv nesting/nested_snp_in_del.gfa > as_nested.vg
+vg sim -x as_nested.vg -m a -n 100 -l 2 -a -s 301 > as_nested.gam
+vg pack -x as_nested.vg -g as_nested.gam -o as_nested.pack
+vg call as_nested.vg -k as_nested.pack -A -p x > as_nested.vcf 2>/dev/null
+
+# Should produce variants at both nesting levels
+AS_NESTED_COUNT=$(grep -v "^#" as_nested.vcf | wc -l)
+is "$AS_NESTED_COUNT" "2" "-A flag: nested graph produces both top-level and nested variants"
+
+# Verify LV tags present
+AS_NESTED_LV=$(grep -v "^#" as_nested.vcf | grep -c "LV=")
+is "$AS_NESTED_LV" "2" "-A flag: nested variants have LV tags"
+
+# Verify LV=0 exists (top-level)
+AS_NESTED_LV0=$(grep -v "^#" as_nested.vcf | grep -c "LV=0")
+is "$AS_NESTED_LV0" "1" "-A flag: has top-level variant (LV=0)"
+
+# Verify LV=1 exists (nested)
+AS_NESTED_LV1=$(grep -v "^#" as_nested.vcf | grep -c "LV=1")
+is "$AS_NESTED_LV1" "1" "-A flag: has nested variant (LV=1)"
+
+# Verify nested variant has PS tag pointing to parent
+AS_NESTED_PS=$(grep -v "^#" as_nested.vcf | awk -F'\t' '$8 ~ /LV=1/ && $8 ~ /PS=/' | wc -l)
+is "$AS_NESTED_PS" "1" "-A flag: nested variant has PS tag"
+
+rm -f as_nested.vg as_nested.gam as_nested.pack as_nested.vcf
+
+# Test: -A flag on triple nested graph with augref paths
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > as_triple.gfa 2>/dev/null
+vg sim -x as_triple.gfa -P "a#1#y0#0" -n 200 -l 2 -a -s 302 > as_triple.gam
+vg sim -x as_triple.gfa -P "a#2#y1#0" -n 200 -l 2 -a -s 303 >> as_triple.gam
+vg pack -x as_triple.gfa -g as_triple.gam -o as_triple.pack
+vg call as_triple.gfa -k as_triple.pack -A -P x > as_triple.vcf 2>/dev/null
+
+# Should produce variants at multiple nesting levels
+AS_TRIPLE_COUNT=$(grep -v "^#" as_triple.vcf | wc -l)
+AS_TRIPLE_HAS_VARIANTS=$(if [ "$AS_TRIPLE_COUNT" -ge 3 ]; then echo "1"; else echo "0"; fi)
+is "$AS_TRIPLE_HAS_VARIANTS" "1" "-A flag: triple nested produces at least 3 variants"
+
+# Verify all variants have LV tags
+AS_TRIPLE_LV=$(grep -v "^#" as_triple.vcf | grep -c "LV=")
+is "$AS_TRIPLE_LV" "$AS_TRIPLE_COUNT" "-A flag: all triple nested variants have LV tags"
+
+# Verify PS tags on nested variants (LV > 0)
+AS_TRIPLE_NESTED=$(grep -v "^#" as_triple.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/' | wc -l)
+AS_TRIPLE_NESTED_PS=$(grep -v "^#" as_triple.vcf | awk -F'\t' '$8 ~ /LV=[1-9]/ && $8 ~ /PS=/' | wc -l)
+is "$AS_TRIPLE_NESTED_PS" "$AS_TRIPLE_NESTED" "-A flag: all nested variants (LV>0) have PS tags"
+
+rm -f as_triple.gfa as_triple.gam as_triple.pack as_triple.vcf
+
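The assertions above slice LV and PS out of the INFO column with grep and awk. For reference, the same extraction in standalone form (a sketch; per these tests, LV is the snarl nesting level with 0 at top level, and PS names the parent snarl, present only when LV > 0):

    #include <iostream>
    #include <sstream>
    #include <string>

    // Return the value of key=... from a semicolon-delimited VCF INFO string,
    // or "" if the key is absent.
    std::string info_tag(const std::string& info, const std::string& key) {
        std::stringstream ss(info);
        std::string field;
        while (std::getline(ss, field, ';')) {
            if (field.rfind(key + "=", 0) == 0) {
                return field.substr(key.size() + 1);
            }
        }
        return "";
    }

    int main() {
        // Hypothetical INFO column for a level-1 nested site.
        std::string info = "AC=1;AF=0.5;LV=1;PS=>1>6";
        std::cout << info_tag(info, "LV") << " " << info_tag(info, "PS") << "\n"; // prints: 1 >1>6
        return 0;
    }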
+# =============================================================================
+# RC, RS, RD tag tests (reference coordinate tags for nested snarls)
+# =============================================================================
+
+# Test: RC, RS, RD headers are present
+vg paths --compute-augref -Q x --min-augref-len 1 -x nesting/triple_nested.gfa > rc_test.gfa 2>/dev/null
+vg sim -x rc_test.gfa -P "a#1#y0#0" -n 100 -l 2 -a -s 400 > rc_test.gam 2>/dev/null
+vg sim -x rc_test.gfa -P "a#2#y1#0" -n 100 -l 2 -a -s 401 >> rc_test.gam 2>/dev/null
+vg pack -x rc_test.gfa -g rc_test.gam -o rc_test.pack 2>/dev/null
+vg call rc_test.gfa -k rc_test.pack --top-down -P x > rc_test.vcf 2>/dev/null
+
+# Check for RC, RS, RD headers
+RC_HEADER=$(grep -c "##INFO=<ID=RC" rc_test.vcf)
+RS_HEADER=$(grep -c "##INFO=<ID=RS" rc_test.vcf)
+RD_HEADER=$(grep -c "##INFO=<ID=RD" rc_test.vcf)
+is "$RC_HEADER" "1" "nested call output includes RC header"
+is "$RS_HEADER" "1" "nested call output includes RS header"
+is "$RD_HEADER" "1" "nested call output includes RD header"
+
+rm -f rc_test.gfa rc_test.gam rc_test.pack rc_test.vcf
diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t
--- a/test/t/26_deconstruct.t
+++ b/test/t/26_deconstruct.t
@@ -158,169 +158,129 @@ is "$(tail -1 small_cluster_3.vcf | awk '{print $10}')" "0:0.333:0" "clustered d

 rm -f small_cluster.gfa small_cluster_0.vcf small_cluster_3.vcf

-vg deconstruct nesting/nested_snp_in_del.gfa -p x -nR > nested_snp_in_del.vcf
-grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
-printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv
-printf "T\tA,*\t1|2\n" >> nested_snp_in_del_truth.tsv
-diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
-is "$?" 0 "nested deconstruction gets correct star-allele for snp inside deletion"
-
-is $(grep PA=0 nested_snp_in_del.vcf | wc -l) 2 "PA=0 correctly set at both sites of neseted deletion"
+# Nesting tests now use a two-step process:
+# 1. Compute augref cover with vg paths
+# 2. Run vg deconstruct with -a to use the pre-computed augref paths

-rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
-
-vg deconstruct nesting/nested_snp_in_del.gfa -p x -n > nested_snp_in_del.vcf
+# Test: SNP inside deletion
+vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_del.gfa -Q x > nested_snp_in_del.augref.pg
+vg deconstruct nested_snp_in_del.augref.pg -p x -a > nested_snp_in_del.vcf
 grep -v ^# nested_snp_in_del.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_del.tsv
 printf "CATG\tCAAG,C\t1|2\n" > nested_snp_in_del_truth.tsv
 printf "T\tA\t1|.\n" >> nested_snp_in_del_truth.tsv
 diff nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
-is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion (without -R)"
+is "$?" 0 "nested deconstruction gets correct allele for snp inside deletion"

-rm -f nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv
+rm -f nested_snp_in_del.augref.pg nested_snp_in_del.vcf nested_snp_in_del.tsv nested_snp_in_del_truth.tsv

-vg deconstruct nesting/nested_snp_in_ins.gfa -p x -n > nested_snp_in_ins.vcf
+# Test: SNP inside insertion with LV field checks
+vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins.gfa -Q x > nested_snp_in_ins.augref.pg
+vg deconstruct nested_snp_in_ins.augref.pg -P x -a > nested_snp_in_ins.vcf
 grep -v ^# nested_snp_in_ins.vcf | awk '{print $4 "\t" $5 "\t" $10}' > nested_snp_in_ins.tsv
-printf "A\tT\t0|1\n" > nested_snp_in_ins_truth.tsv
-printf "C\tCAAG,CATG\t1|2\n" >> nested_snp_in_ins_truth.tsv
+# With -P x, nested variants are on augref contigs (x_1_alt), parent on x
+# So order is: insertion (on x) then SNP (on x_1_alt)
+printf "C\tCAAG,CATG\t1|2\n" > nested_snp_in_ins_truth.tsv
+printf "A\tT\t0|1\n" >> nested_snp_in_ins_truth.tsv
 diff nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv
 is "$?" 0 "nested deconstruction gets correct allele for snp inside insert"

-is $(grep LV=0 nested_snp_in_ins.vcf | head -1 | grep PA=0 | wc -l) 1 "PA=0 set for base allele of nested insertion"
-is $(grep LV=1 nested_snp_in_ins.vcf | tail -1 | grep PA=1 | wc -l) 1 "PA=1 set for nested allele of nested insertion"
-
-grep ^##contig nested_snp_in_ins.vcf > nested_snp_in_ins_contigs.tsv
-printf "##contig=\n" > nested_snp_in_ins_contigs_truth.tsv
-printf "##contig=\n" >> nested_snp_in_ins_contigs_truth.tsv
-diff nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv
-is "$?"
0 "nested deconstruction gets correct contigs in vcf header for snp inside insert" - -rm -f nested_snp_in_ins.tsv nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv nested_snp_in_ins_contigs_truth.tsv - -vg deconstruct nesting/nested_snp_in_ins2.gfa -p x -nR | grep -v ^# | awk '{print $4 "\t" $5 "\t" $10 "\t" $11}' > nested_snp_in_ins2.tsv -printf "A\tT,*\t0|1\t0|2\n" > nested_snp_in_ins2._truth.tsv -printf "C\tCAAG,CATG\t1|2\t1|0\n" >> nested_snp_in_ins2._truth.tsv -diff nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv -is "$?" 0 "nested deconstruction gets correct star allele for snp ins2.ide ins2.ert" +is $(grep LV=0 nested_snp_in_ins.vcf | wc -l) 1 "LV=0 set for base allele of nested insertion" +is $(grep LV=1 nested_snp_in_ins.vcf | wc -l) 1 "LV=1 set for nested allele of nested insertion" -rm -f nested_snp_in_ins2.tsv nested_snp_in_ins2._truth.tsv +# With -P x, we get multiple contigs (x, x_1_alt, x_2_alt) +is $(grep -c "^##contig" nested_snp_in_ins.vcf) 3 "nested deconstruction gets all reference contigs in vcf header" -# todo: the integrated snarl finder doesnt anchor to the reference -# probably not an issue on most real graphs from vg construct / minigraph cacuts -# but seems like something that needs reviewing -vg deconstruct nesting/nested_snp_in_nested_ins.gfa -P x -n > nested_snp_in_nested_ins.vcf -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | awk '{print $8}') "AC=1,1;AF=0.5,0.5;AN=2;AT=>1>6,>1>2>3>31>33>34>35>5>6,>1>2>3>31>32>34>35>5>6;NS=1;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0" "INFO tags correct for level-0 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=1 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>2>3>31>33>34>35>5,>2>3>31>32>34>35>5;NS=1;PA=1;PL=8;PR=1;RC=x;RD=1;RL=8;RS=1;LV=1;PS=>1>6" "INFO tags correct for level-1 site of double-nested SNP" -is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | awk '{print $8}') "AC=1;AF=0.5;AN=2;AT=>31>33>34,>31>32>34;NS=1;PA=0;PL=5;PR=5;RC=x;RD=1;RL=8;RS=1;LV=2;PS=>2>5" "INFO tags correct for level-2 site of double-nested SNP" +rm -f nested_snp_in_ins.augref.pg nested_snp_in_ins.vcf nested_snp_in_ins.tsv nested_snp_in_ins_truth.tsv nested_snp_in_ins_contigs.tsv -rm -f nested_snp_in_nested_ins.snarls nested_snp_in_nested_ins.vcf +# Test: Double-nested SNP +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_nested_ins.gfa -Q x > nested_snp_in_nested_ins.augref.pg +vg deconstruct nested_snp_in_nested_ins.augref.pg -P x -a > nested_snp_in_nested_ins.vcf +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=0 | wc -l) 1 "level 0 site found in double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep "LV=" | wc -l) 3 "all nested sites found in double-nested SNP" +is $(grep -v ^# nested_snp_in_nested_ins.vcf | grep LV=2 | wc -l) 1 "level 2 site found in double-nested SNP" -vg deconstruct nesting/nested_snp_in_ins_cycle.gfa -P x -n > nested_snp_in_ins_cycle.vcf -printf "A#1#y0\t3\t>2>5\tA\tT\tAC=2;AF=0.5;AN=4;AT=>2>4>5,>2>3>5;NS=2;PA=1;PL=7;PR=1;RC=x;RD=1;RL=7;RS=1;LV=1;PS=>1>6\t0|1\t0|1\n" > nested_snp_in_ins_cycle_truth.tsv -printf "x\t1\t>1>6\tC\tCAAGAAG,CATG,CAAG\tAC=1,2,1;AF=0.25,0.5,0.25;AN=4;AT=>1>6,>1>2>4>5>2>4>5>6,>1>2>3>5>6,>1>2>4>5>6;NS=2;PA=0;PL=1;PR=1;RC=x;RD=1;RL=1;RS=1;LV=0\t1|2\t3|2\n" >> nested_snp_in_ins_cycle_truth.tsv -grep -v ^# nested_snp_in_ins_cycle.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $8 "\t" $10 "\t" $11}' > nested_snp_in_ins_cycle.tsv -diff nested_snp_in_ins_cycle.tsv 
nested_snp_in_ins_cycle_truth.tsv -is "$?" 0 "nested deconstruction handles cycle" +rm -f nested_snp_in_nested_ins.augref.pg nested_snp_in_nested_ins.vcf -rm -f nested_snp_in_ins_cycle.vcf nested_snp_in_ins_cycle_truth.tsv nested_snp_in_ins_cycle_truth.tsv +# Test: Nested site with cycle +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins_cycle.gfa -Q x > nested_snp_in_ins_cycle.augref.pg +vg deconstruct nested_snp_in_ins_cycle.augref.pg -P x -a > nested_snp_in_ins_cycle.vcf +is $(grep -v ^# nested_snp_in_ins_cycle.vcf | grep LV=0 | wc -l) 1 "level 0 found in nested cycle" +is $(grep -v ^# nested_snp_in_ins_cycle.vcf | grep LV=1 | wc -l) 1 "level 1 found in nested cycle" +rm -f nested_snp_in_ins_cycle.augref.pg nested_snp_in_ins_cycle.vcf -vg snarls nesting/mnp.gfa --algorithm cactus > mnp.snarls -vg deconstruct nesting/mnp.gfa -r mnp.snarls -p x -n -f mnp.fa > mnp.vcf +# Test: MNP handling +vg paths --compute-augref --min-augref-len 0 -x nesting/mnp.gfa -Q x > mnp.augref.pg +vg deconstruct mnp.augref.pg -p x -a > mnp.vcf printf "x\t3\t>2>7\tTCAT\tATTT\n" > mnp_truth.tsv grep -v ^# mnp.vcf | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > mnp.tsv diff mnp_truth.tsv mnp.tsv is "$?" 0 "nested deconstruction handles mnp" -printf "a#1#y0\t2\t6\t>8>11\tx\t2\t6\n" > mnp.nesting.truth.tsv -diff mnp.fa.nesting.tsv mnp.nesting.truth.tsv -is "$?" 0 "nested deconstruction makes correct mnp tsv" -printf ">a#1#y0[2-6]\nATTT\n" > mnp.fa.truth -diff mnp.fa mnp.fa.truth -is "$?" 0 "nested deconstruction makes correct fasta" -rm -f mnp.snarls mnp.vcf mnp_truth.tsv mnp.tsv mnp.nesting.truth.tsv mnp.fa.truth +rm -f mnp.augref.pg mnp.vcf mnp_truth.tsv mnp.tsv # Test 1: Deep nesting (3+ levels) - triple nested SNP -vg deconstruct nesting/triple_nested.gfa -p x -nR > triple_nested.vcf +vg paths --compute-augref --min-augref-len 0 -x nesting/triple_nested.gfa -Q x > triple_nested.augref.pg +vg deconstruct triple_nested.augref.pg -P x -a > triple_nested.vcf is $(grep -v ^# triple_nested.vcf | grep LV=0 | wc -l) 1 "level 0 site found in triple nested" -is $(grep -v ^# triple_nested.vcf | grep LV=1 | wc -l) 1 "level 1 site found in triple nested" +is $(grep -v ^# triple_nested.vcf | grep "LV=" | wc -l) 5 "all nested sites found in triple nested" is $(grep -v ^# triple_nested.vcf | grep LV=2 | wc -l) 1 "level 2 site found in triple nested" is $(grep -v ^# triple_nested.vcf | grep LV=3 | wc -l) 1 "level 3 site found in triple nested" -is $(grep "LV=3" triple_nested.vcf | grep "RC=x" | wc -l) 1 "top-level reference propagates to level 3" -rm -f triple_nested.snarls triple_nested.vcf +is $(grep -v ^# triple_nested.vcf | grep LV=4 | wc -l) 1 "level 4 site found in triple nested" +rm -f triple_nested.augref.pg triple_nested.vcf # Test 2: Multiple children at same level - insertion with 2 nested SNPs -vg deconstruct nesting/insertion_with_three_snps.gfa -p x -n > multi_child.vcf -is $(grep -v ^# multi_child.vcf | grep LV=0 | wc -l) 1 "parent site with multiple children found" +vg paths --compute-augref --min-augref-len 0 -x nesting/insertion_with_three_snps.gfa -Q x > insertion_with_three_snps.augref.pg +vg deconstruct insertion_with_three_snps.augref.pg -P x -a > multi_child.vcf +is $(grep -v ^# multi_child.vcf | grep "LV=" | wc -l) 3 "expected number of sites with LV field" is $(grep -v ^# multi_child.vcf | grep LV=1 | wc -l) 2 "two child SNPs found at level 1" -rm -f multi_child.vcf insertion_with_three_snps.snarls - -# Test 3: NestingInfo field propagation - verify all INFO tags -vg 
deconstruct nesting/nested_snp_in_ins.gfa -p x -n > field_check.vcf -is $(grep "LV=1" field_check.vcf | grep -o "PA=[0-9]*" | cut -d= -f2) 1 "PA field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep -o "PL=[0-9]*" | cut -d= -f2) 4 "PL field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep "RC=x" | wc -l) 1 "RC field correct for nested site" -is $(grep "LV=1" field_check.vcf | grep "RS=" | wc -l) 1 "RS field present for nested site" -is $(grep "LV=1" field_check.vcf | grep "RD=" | wc -l) 1 "RD field present for nested site" -is $(grep "LV=1" field_check.vcf | grep "RL=" | wc -l) 1 "RL field present for nested site" -rm -f field_check.vcf - -# Test 4: FASTA output for off-reference sequences -vg deconstruct nesting/consecutive_nested.gfa -p x -nf consecutive.fa > consecutive.vcf -test -s consecutive.fa && is "$?" 0 "consecutive nested graph generates off-reference FASTA" -test -s consecutive.fa.nesting.tsv && is "$?" 0 "consecutive nested graph generates nesting TSV" -rm -f consecutive.vcf consecutive.fa consecutive.fa.nesting.tsv - -# Test 5: Multiple reference traversals with nesting (should reduce to one) -vg deconstruct nesting/cyclic_ref_nested.gfa -p x -n > cyclic_ref_nested.vcf +rm -f insertion_with_three_snps.augref.pg multi_child.vcf + +# Test 3: NestingInfo field propagation - verify LV field +vg paths --compute-augref --min-augref-len 0 -x nesting/nested_snp_in_ins.gfa -Q x > field_check.augref.pg +vg deconstruct field_check.augref.pg -P x -a > field_check.vcf +is $(grep "LV=0" field_check.vcf | wc -l) 1 "LV=0 field present for top-level site" +rm -f field_check.augref.pg field_check.vcf + +# Test 4: Multiple reference traversals with nesting (should reduce to one) +vg paths --compute-augref --min-augref-len 0 -x nesting/cyclic_ref_nested.gfa -Q x > cyclic_ref_nested.augref.pg +vg deconstruct cyclic_ref_nested.augref.pg -p x -a > cyclic_ref_nested.vcf is $(grep -v ^# cyclic_ref_nested.vcf | wc -l) 1 "cyclic reference with nesting produces single variant" -rm -f cyclic_ref_nested.vcf - -# Tests for -f option (off-reference FASTA output) - -# Test 6: -f with -n produces FASTA output -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf basic_test.fa > basic_test.vcf -test -s basic_test.fa && is "$?" 0 "-f with -n produces FASTA file" -test -s basic_test.fa.nesting.tsv && is "$?" 0 "-f with -n produces TSV file" -rm -f basic_test.fa basic_test.fa.nesting.tsv basic_test.vcf - -# Test 7: Deep nesting with FASTA output - verify nesting info propagates -vg deconstruct nesting/triple_nested.gfa -p x -nRf triple_nested.fa > triple_nested.vcf -test -s triple_nested.fa && is "$?" 0 "triple nested graph generates FASTA with -f" -test -s triple_nested.fa.nesting.tsv && is "$?" 
0 "triple nested graph generates nesting TSV with -f" -is $(wc -l < triple_nested.fa.nesting.tsv) 2 "triple nested TSV has 2 entries (merged by shared reference)" -is $(awk '{print NF}' triple_nested.fa.nesting.tsv | uniq) 7 "nesting TSV has 7 columns" -is $(cut -f5 triple_nested.fa.nesting.tsv | uniq) x "all TSV entries reference top-level contig x" -rm -f triple_nested.snarls triple_nested.vcf triple_nested.fa triple_nested.fa.nesting.tsv - -# Test 8: Verify FASTA sequences match graph paths -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf nested_ins_test.fa > nested_ins_test.vcf -is $(grep -c "^>" nested_ins_test.fa) 2 "nested insertion produces 2 FASTA sequences" -# Check that sequences are non-empty -is $(grep -v "^>" nested_ins_test.fa | grep -c ".") 2 "FASTA sequences are non-empty" -# Verify TSV has matching entries -is $(wc -l < nested_ins_test.fa.nesting.tsv) 2 "nesting TSV has matching number of entries" -rm -f nested_ins_test.vcf nested_ins_test.fa nested_ins_test.fa.nesting.tsv - -# Test 9: Multiple children - verify all off-ref paths captured -vg snarls -A cactus nesting/insertion_with_three_snps.gfa > multi_snps.snarls -vg deconstruct nesting/insertion_with_three_snps.gfa -p x -n -r multi_snps.snarls -f multi_snps.fa > multi_snps.vcf -test -s multi_snps.fa && is "$?" 0 "multi-child graph generates FASTA" -# Should have sequences for the insertion alleles -is $(grep -c "^>" multi_snps.fa) 2 "multi-child graph produces expected FASTA sequences" -rm -f multi_snps.snarls multi_snps.vcf multi_snps.fa multi_snps.fa.nesting.tsv - -# Test 10: Verify TSV columns contain valid data -vg deconstruct nesting/nested_snp_in_ins.gfa -p x -nf tsv_test.fa > tsv_test.vcf -# Column 1: path name should contain # -is $(cut -f1 tsv_test.fa.nesting.tsv | grep -c "#") 2 "TSV column 1 contains haplotype path names" -# Column 2,3: positions should be integers -is $(cut -f2 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 2 contains valid positions" -is $(cut -f3 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 3 contains valid positions" -# Column 5: should be reference name (x) -is $(cut -f5 tsv_test.fa.nesting.tsv | uniq) x "TSV column 5 is top-level reference name" -# Column 6,7: should be 0-based positions -is $(cut -f6 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 6 contains valid reference positions" -is $(cut -f7 tsv_test.fa.nesting.tsv | grep -c "^[0-9]*$") 2 "TSV column 7 contains valid reference positions" -rm -f tsv_test.vcf tsv_test.fa tsv_test.fa.nesting.tsv +rm -f cyclic_ref_nested.augref.pg cyclic_ref_nested.vcf + +# Test 5: Cyclic reference outputs multiple variants (one per reference traversal) +# Reference x visits the snarl twice (via node 2 then 3), alt a visits twice (via node 3 then 4) +# With -c 0 to disable context-jaccard (which doesn't work well on tiny graphs), we should get 2 variants +vg deconstruct nesting/cyclic_ref_multiple_variants.gfa -p x -a -c 0 > cyclic_ref_multi.vcf +is $(grep -v ^# cyclic_ref_multi.vcf | wc -l) 2 "cyclic reference with -a outputs variant for each reference traversal" +rm -f cyclic_ref_multi.vcf + +# ============================================================================= +# RC, RS, RD tag tests (reference coordinate tags for nested snarls) +# ============================================================================= + +# Test: RC, RS, RD headers and tags in deconstruct output +vg paths --compute-augref --min-augref-len 0 -x nesting/triple_nested.gfa -Q x > rc_decon_test.augref.pg +vg deconstruct 
rc_decon_test.augref.pg -P x -a > rc_decon_test.vcf + +# Check for RC, RS, RD headers +is $(grep -c "##INFO=