From f4faaff12358bb9fac1ba79d74344c6d94ec8c38 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Feb 2026 21:44:42 -0500 Subject: [PATCH 1/2] Fix augref: path_front_end crash, cross-path merging, deferred length filter Three fixes to augref interval computation: 1. Fix crash in try_extend_backward when backward walk reaches the start of a path. get_next_step(path_front_end) is invalid in bdsg; return path_begin directly instead. 2. Add cross-path interval merging in add_interval. When a new interval is adjacent to an existing interval on a different path, attempt to extend one interval to absorb the other if they share the same node sequence. This reduces augref fragmentation. 3. Defer minimum length filter to after all merging is complete. Previously short intervals were discarded in compute_snarl and fill_uncovered_nodes before add_interval could merge them with adjacent intervals. Now all intervals are kept during computation (min_length=1) and filtered by the new filter_short_intervals() after defragment_intervals(). Also adds rank_by_name option (hardcoded true for testing) to rank traversal fragments by path name instead of coverage, and adds verbose output showing the post-filter interval count. Co-Authored-By: Claude Opus 4.6 --- src/augref.cpp | 177 ++++++++++++++++++++++-- src/augref.hpp | 21 +++ test/nesting/cross_path_merge.gfa | 24 ++++ test/nesting/cross_path_merge_right.gfa | 24 ++++ test/t/11_vg_paths.t | 20 ++- 5 files changed, 252 insertions(+), 14 deletions(-) create mode 100644 test/nesting/cross_path_merge.gfa create mode 100644 test/nesting/cross_path_merge_right.gfa diff --git a/src/augref.cpp b/src/augref.cpp index 3a55202568..74bf24d8e8 100644 --- a/src/augref.cpp +++ b/src/augref.cpp @@ -99,6 +99,7 @@ void AugRefCover::compute(const PathHandleGraph* graph, this->interval_snarl_bounds.clear(); this->node_to_interval.clear(); this->graph = graph; + this->rank_by_name = true; // TODO: testing — rank traversals by name only // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { @@ -151,7 +152,7 @@ void AugRefCover::compute(const PathHandleGraph* graph, queue.pop_back(); // get the snarl cover - compute_snarl(*cur_snarl, path_trav_finder, minimum_length, + compute_snarl(*cur_snarl, path_trav_finder, 1 /*defer length filter to post-merge*/, thread_augref_intervals, thread_node_to_interval, top_snarl_start, top_snarl_end, @@ -188,13 +189,22 @@ void AugRefCover::compute(const PathHandleGraph* graph, } // second pass: greedily cover any nodes not covered by snarl traversals - fill_uncovered_nodes(minimum_length); + fill_uncovered_nodes(1 /*defer length filter to post-merge*/); // debug: verify all nodes are covered verify_cover(); // remove any intervals that were made redundant by add_interval defragment_intervals(); + + // apply length filter after all merging is complete + filter_short_intervals(minimum_length); + + if (verbose) { + int64_t final_alt = augref_intervals.size() - num_ref_intervals; + cerr << "[augref] After length filter (min " << minimum_length << " bp): " + << final_alt << " alt intervals" << endl; + } } void AugRefCover::fill_uncovered_nodes(int64_t minimum_length) { @@ -550,7 +560,7 @@ void AugRefCover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_tr } } if (!cyclic && interval_length >= minimum_length) { - int64_t trav_coverage = get_coverage(trav, uncovered_interval); + int64_t trav_coverage = rank_by_name ? 
0 : get_coverage(trav, uncovered_interval); ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval}); } } @@ -607,7 +617,7 @@ void AugRefCover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_tr chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); } if (chopped_trav_length >= minimum_length) { - int64_t trav_coverage = get_coverage(trav, chopped_interval); + int64_t trav_coverage = rank_by_name ? 0 : get_coverage(trav, chopped_interval); ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); } @@ -655,6 +665,60 @@ vector> AugRefCover::get_uncovered_intervals(const vector return intervals; } +optional AugRefCover::try_extend_forward(step_handle_t start_step, path_handle_t path, + const pair& other_interval) { + step_handle_t path_end = graph->path_end(path); + step_handle_t cur = start_step; + for (step_handle_t other_step = other_interval.first; other_step != other_interval.second; + other_step = graph->get_next_step(other_step)) { + if (cur == path_end) { + return nullopt; + } + handle_t cur_handle = graph->get_handle_of_step(cur); + handle_t other_handle = graph->get_handle_of_step(other_step); + if (graph->get_id(cur_handle) != graph->get_id(other_handle) || + graph->get_is_reverse(cur_handle) != graph->get_is_reverse(other_handle)) { + return nullopt; + } + cur = graph->get_next_step(cur); + } + return cur; // new end step (one past last matching) +} + +optional AugRefCover::try_extend_backward(step_handle_t start_step, path_handle_t path, + const pair& other_interval) { + // Collect other_interval's node IDs + orientations into a vector + vector> other_steps; + for (step_handle_t s = other_interval.first; s != other_interval.second; s = graph->get_next_step(s)) { + handle_t h = graph->get_handle_of_step(s); + other_steps.push_back({graph->get_id(h), graph->get_is_reverse(h)}); + } + if (other_steps.empty()) { + return nullopt; + } + // Walk backward from start_step on path, comparing in reverse order + step_handle_t path_front_end = graph->path_front_end(path); + step_handle_t cur = start_step; + for (int64_t i = (int64_t)other_steps.size() - 1; i >= 0; --i) { + if (cur == path_front_end) { + return nullopt; + } + handle_t cur_handle = graph->get_handle_of_step(cur); + if (graph->get_id(cur_handle) != other_steps[i].first || + graph->get_is_reverse(cur_handle) != other_steps[i].second) { + return nullopt; + } + cur = graph->get_previous_step(cur); + } + // Return the step of the first matching node (one forward from where we stopped) + // If cur is path_front_end (walked back to before the first step), return path_begin + // directly since get_next_step is not valid on the path_front_end sentinel in bdsg. 
+ if (cur == path_front_end) { + return graph->path_begin(path); + } + return graph->get_next_step(cur); +} + bool AugRefCover::add_interval(vector>& thread_augref_intervals, unordered_map& thread_node_to_interval, const pair& new_interval, @@ -677,6 +741,10 @@ bool AugRefCover::add_interval(vector>& threa bool merged = false; int64_t merged_interval_idx = -1; path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + pair effective_interval = new_interval; + bool left_cross_path_merged = false; + int64_t deleted_idx = -1; + step_handle_t deleted_interval_first, deleted_interval_second; // save before decommission // check the before-first step. if it's in an interval then it must be immediately // preceeding so we merge the new interval to the end of the found interval @@ -705,23 +773,54 @@ bool AugRefCover::add_interval(vector>& threa merged = true; merged_interval_idx = prev_idx; } + } else { + // Cross-path left merge: prev_interval is on a different path + path_handle_t prev_path = graph->get_path_handle_of_step(prev_interval.first); + // Boundary check: verify the junction node is the last node of prev_interval + step_handle_t prev_last = graph->get_previous_step(prev_interval.second); + if (graph->get_id(graph->get_handle_of_step(prev_last)) == prev_node_id) { + // Try 1: extend prev_interval forward on its path to cover new_interval's nodes + auto ext_fwd = try_extend_forward(prev_interval.second, prev_path, new_interval); + if (ext_fwd) { + prev_interval.second = *ext_fwd; + merged = true; + merged_interval_idx = prev_idx; + left_cross_path_merged = true; + } else { + // Try 2: extend new_interval backward on its path to cover prev_interval's nodes + auto ext_bwd = try_extend_backward(before_first_step, path_handle, prev_interval); + if (ext_bwd) { + effective_interval.first = *ext_bwd; + // Decommission prev_interval + deleted_interval_first = prev_interval.first; + deleted_interval_second = prev_interval.second; + deleted_idx = prev_idx; + prev_interval.first = graph->path_end(prev_path); + prev_interval.second = graph->path_front_end(prev_path); + if (snarl_bounds_vec) { + (*snarl_bounds_vec)[prev_idx] = {0, 0}; + } + left_cross_path_merged = true; + // merged stays false — effective_interval will be pushed as new + } + } + } } } } // check the end step. 
if it's in an interval then it must be immediately // following we merge the new interval to the front of the found interval - int64_t deleted_idx = -1; - step_handle_t deleted_interval_first, deleted_interval_second; // save before decommission - if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { - nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); + if (!left_cross_path_merged && + effective_interval.second != graph->path_end(graph->get_path_handle_of_step(effective_interval.second))) { + nid_t next_node_id = graph->get_id(graph->get_handle_of_step(effective_interval.second)); if (thread_node_to_interval.count(next_node_id)) { int64_t next_idx = thread_node_to_interval[next_node_id]; pair& next_interval = thread_augref_intervals[next_idx]; path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { - if (next_interval.first == new_interval.second || - (global && next_interval.first == graph->get_previous_step(new_interval.second))) { + if (next_interval.first == effective_interval.second || + (global && next_interval.first == graph->get_previous_step(effective_interval.second))) { #ifdef debug #pragma omp critical(cerr) cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) @@ -747,11 +846,39 @@ bool AugRefCover::add_interval(vector>& threa thread_augref_intervals[merged_interval_idx].second = deleted_interval_second; } else { // extend next_interval left - next_interval.first = new_interval.first; + next_interval.first = effective_interval.first; merged = true; merged_interval_idx = next_idx; } } + } else if (!merged) { + // Cross-path right merge: next_interval is on a different path + // Boundary check: verify next_node_id is the first node of next_interval + if (graph->get_id(graph->get_handle_of_step(next_interval.first)) == next_node_id) { + // Try 1: extend next_interval backward on its path to cover effective_interval's nodes + step_handle_t next_pred = graph->get_previous_step(next_interval.first); + auto ext_bwd = try_extend_backward(next_pred, next_path, effective_interval); + if (ext_bwd) { + next_interval.first = *ext_bwd; + merged = true; + merged_interval_idx = next_idx; + } else { + // Try 2: extend effective_interval forward on its path to cover next_interval's nodes + auto ext_fwd = try_extend_forward(effective_interval.second, path_handle, next_interval); + if (ext_fwd) { + effective_interval.second = *ext_fwd; + // Decommission next_interval + deleted_interval_first = next_interval.first; + deleted_interval_second = next_interval.second; + deleted_idx = next_idx; + next_interval.first = graph->path_end(next_path); + next_interval.second = graph->path_front_end(next_path); + if (snarl_bounds_vec) { + (*snarl_bounds_vec)[next_idx] = {0, 0}; + } + } + } + } } } } @@ -759,12 +886,12 @@ bool AugRefCover::add_interval(vector>& threa // add the interval to the local (thread safe) structures if (!merged) { merged_interval_idx = thread_augref_intervals.size(); - thread_augref_intervals.push_back(new_interval); + thread_augref_intervals.push_back(effective_interval); if (snarl_bounds_vec) { snarl_bounds_vec->push_back(snarl_bounds); } } - for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + for (step_handle_t step = effective_interval.first; step != effective_interval.second; step = 
graph->get_next_step(step)) { thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; } if (deleted_idx >= 0) { @@ -799,6 +926,30 @@ void AugRefCover::defragment_intervals() { } } +void AugRefCover::filter_short_intervals(int64_t minimum_length) { + if (minimum_length <= 1) { + return; + } + for (int64_t i = num_ref_intervals; i < augref_intervals.size(); ++i) { + auto& interval = augref_intervals[i]; + path_handle_t ph = graph->get_path_handle_of_step(interval.first); + if (interval.first == graph->path_end(ph)) { + continue; // already decommissioned + } + int64_t length = 0; + for (step_handle_t s = interval.first; s != interval.second; s = graph->get_next_step(s)) { + length += graph->get_length(graph->get_handle_of_step(s)); + } + if (length < minimum_length) { + // decommission + interval.first = graph->path_end(ph); + interval.second = graph->path_front_end(ph); + } + } + // rebuild node_to_interval without the removed intervals + defragment_intervals(); +} + int64_t AugRefCover::get_coverage(const vector& trav, const pair& uncovered_interval) { int64_t coverage = 0; diff --git a/src/augref.hpp b/src/augref.hpp index f0912aaf9c..939e6b3999 100644 --- a/src/augref.hpp +++ b/src/augref.hpp @@ -21,6 +21,8 @@ * cover can be created and loaded, but they are not used beyond that. */ +#include + #include "handle.hpp" #include "snarls.hpp" #include "traversal_finder.hpp" @@ -135,6 +137,21 @@ class AugRefCover { // add_interval() can delete an existing interval. This requires a full update at the end. void defragment_intervals(); + // Remove non-reference intervals shorter than minimum_length, then defragment. + // Called after all merging is complete so short intervals have had a chance to merge. + void filter_short_intervals(int64_t minimum_length); + + // Walk forward from start_step on path, comparing each node ID + orientation + // against other_interval's steps. Returns the new end step if all match, nullopt otherwise. + optional try_extend_forward(step_handle_t start_step, path_handle_t path, + const pair& other_interval); + + // Collect other_interval's node IDs + orientations into a vector (forward walk). + // Then walk backward from start_step on path, comparing in reverse. + // Returns the first matching step if all match, nullopt otherwise. + optional try_extend_backward(step_handle_t start_step, path_handle_t path, + const pair& other_interval); + // Get the total coverage of a traversal (sum of step lengths * path count). int64_t get_coverage(const vector& trav, const pair& uncovered_interval); @@ -183,6 +200,10 @@ class AugRefCover { // Whether to print verbose output (coverage summary, etc.) bool verbose = false; + // When true, rank traversal fragments by name only (ignore coverage). + // For testing whether consistent path selection reduces fragmentation. + bool rank_by_name = false; + // Copy base reference paths to the augref sample. // Creates new paths like "new_sample#0#chr1" from "CHM13#0#chr1". 
void copy_base_paths_to_sample(MutablePathMutableHandleGraph* mutable_graph, diff --git a/test/nesting/cross_path_merge.gfa b/test/nesting/cross_path_merge.gfa new file mode 100644 index 0000000000..7375487929 --- /dev/null +++ b/test/nesting/cross_path_merge.gfa @@ -0,0 +1,24 @@ +H VN:Z:1.0 +S 1 AAAA +S 2 TTTT +S 3 CCCC +S 4 GGGG +S 5 AAAA +S 6 TTTT +S 7 CCCC +S 8 GGGG +S 9 TTTTTTTT +L 1 + 2 + 0M +L 1 + 5 + 0M +L 1 + 8 + 0M +L 2 + 3 + 0M +L 3 + 4 + 0M +L 4 + 8 + 0M +L 4 + 9 + 0M +L 5 + 6 + 0M +L 6 + 7 + 0M +L 7 + 8 + 0M +P x 1+,8+ * +P a#1#y0 1+,2+,3+,4+,8+ * +P a#2#y1 1+,5+,6+,7+,8+ * +P a#3#y2 1+,2+,3+,4+,9+ * diff --git a/test/nesting/cross_path_merge_right.gfa b/test/nesting/cross_path_merge_right.gfa new file mode 100644 index 0000000000..a1e87ff1a5 --- /dev/null +++ b/test/nesting/cross_path_merge_right.gfa @@ -0,0 +1,24 @@ +H VN:Z:1.0 +S 1 AAAA +S 2 TTTT +S 3 CCCC +S 4 GGGG +S 5 AAAA +S 6 TTTT +S 7 CCCC +S 8 GGGG +S 9 TTTTTTTT +L 1 + 2 + 0M +L 1 + 5 + 0M +L 1 + 8 + 0M +L 2 + 3 + 0M +L 3 + 4 + 0M +L 4 + 8 + 0M +L 5 + 6 + 0M +L 6 + 7 + 0M +L 7 + 8 + 0M +L 9 + 2 + 0M +P x 1+,8+ * +P a#1#y0 1+,2+,3+,4+,8+ * +P a#2#y1 1+,5+,6+,7+,8+ * +P a#3#y2 9+,2+,3+,4+,8+ * diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 7ca872b443..114539d024 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 52 +plan tests 56 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -180,8 +180,26 @@ is $? 0 "augref-segs requires compute-augref option" vg paths -x nesting/nested_snp_in_ins.gfa -Q x --compute-augref --min-augref-len 1 --augref-sample TESTSAMPLE --augref-segs augref_sample_test.segs > augref_sample_test.vg is $(cut -f4 augref_sample_test.segs | grep -c "TESTSAMPLE") 2 "augref-segs uses augref-sample for path names" +# Test cross-path interval merging (left merge: new interval absorbs previous from different path) +vg paths -x nesting/cross_path_merge.gfa -Q x --compute-augref --min-augref-len 1 > cross_merge_test.vg +vg validate cross_merge_test.vg +is $? 0 "cross-path merge: augref computation produces valid graph" + +# Cross-path merge should combine snarl interval [2,3,4] + dangling [9] into one path on hap3 +# Without merging: 3 augref paths. With merging: 2 augref paths. +is $(vg paths -x cross_merge_test.vg -L | grep "_alt$" | wc -l) 2 "cross-path left merge reduces augref path count" + +# Test cross-path interval merging (right merge: new interval absorbs following from different path) +vg paths -x nesting/cross_path_merge_right.gfa -Q x --compute-augref --min-augref-len 1 > cross_merge_right_test.vg +vg validate cross_merge_right_test.vg +is $? 
0 "cross-path right merge: augref computation produces valid graph" + +# Cross-path merge should combine dangling [9] + snarl interval [2,3,4] into one path on hap3 +is $(vg paths -x cross_merge_right_test.vg -L | grep "_alt$" | wc -l) 2 "cross-path right merge reduces augref path count" + rm -f augref_test.vg triple_augref.vg triple_augref_long.vg dangling_augref.vg x.pg x.gbwt x.gbz rm -f augref_test.segs augref_segs_test.vg augref_sample_test.segs augref_sample_test.vg +rm -f cross_merge_test.vg cross_merge_right_test.vg From 396e1133522fe6e987e9011e81ab0f6bd4f617f3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Feb 2026 21:53:02 -0500 Subject: [PATCH 2/2] Document rank_by_name as determinism requirement Co-Authored-By: Claude Opus 4.6 --- src/augref.cpp | 9 ++++++++- src/augref.hpp | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/augref.cpp b/src/augref.cpp index 74bf24d8e8..dfdca1ba46 100644 --- a/src/augref.cpp +++ b/src/augref.cpp @@ -99,7 +99,14 @@ void AugRefCover::compute(const PathHandleGraph* graph, this->interval_snarl_bounds.clear(); this->node_to_interval.clear(); this->graph = graph; - this->rank_by_name = true; // TODO: testing — rank traversals by name only + // Rank traversal fragments by path name only, ignoring coverage. This ensures + // deterministic output: the same lowest-name path wins in every snarl, so adjacent + // snarl intervals land on the same path and only same-path merging is needed, + // eliminating dependence on OpenMP thread scheduling during the global fold. + // TODO: to re-enable coverage-based ranking, the global fold must be made + // deterministic (e.g. by sorting intervals before folding rather than relying + // on per-thread vector order). + this->rank_by_name = true; // start with the reference paths for (const path_handle_t& ref_path_handle : reference_paths) { diff --git a/src/augref.hpp b/src/augref.hpp index 939e6b3999..9cc76b6fc5 100644 --- a/src/augref.hpp +++ b/src/augref.hpp @@ -201,7 +201,7 @@ class AugRefCover { bool verbose = false; // When true, rank traversal fragments by name only (ignore coverage). - // For testing whether consistent path selection reduces fragmentation. + // This ensures deterministic output regardless of thread count. bool rank_by_name = false; // Copy base reference paths to the augref sample.