Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f9f9996
added recombination info to output
dcmonti Jan 26, 2026
a4ff942
scatter.py can now show recombinant anchors
dcmonti Feb 2, 2026
64cadde
Merge remote-tracking branch 'origin/master' into recombination-output
adamnovak Feb 4, 2026
6e03520
Handle type errors in alt text generator
adamnovak Feb 4, 2026
f1f840f
Add more dumping for DP info
adamnovak Feb 5, 2026
09e9d44
Add synthesized code to make interactive visualizations from chain du…
adamnovak Feb 5, 2026
2aaf8d6
Automatically revise visualization generator to appear to work
adamnovak Feb 5, 2026
ae05555
Flip source and destination back to agree with the logs and increase …
adamnovak Feb 5, 2026
cd3dc4f
Add auto-generated code for traceback
adamnovak Feb 5, 2026
9900bb9
Add slightly adjusted synthetic code to let us see the transition to …
adamnovak Feb 5, 2026
c730224
Add synthetic code to report type-able seed numbers
adamnovak Feb 5, 2026
d86fcef
Add synthesized code to show negative-score transitions when they exist
adamnovak Feb 5, 2026
f14d5f5
Add autogenerated code to gzip the giant JSONs in the SVGs so they ar…
adamnovak Feb 5, 2026
e574532
Add seed orientation on path and proper sign values to dumps
adamnovak Feb 6, 2026
bbfdb3c
Merge remote-tracking branch 'origin/master' into recombination-output
adamnovak Feb 6, 2026
7ae78b8
Use strand and fixed negative scores in visualization generator.
adamnovak Feb 6, 2026
7a29910
Synthesize an alleged design improvement
adamnovak Feb 6, 2026
c65edbf
Reduce margin on initial load
adamnovak Feb 6, 2026
6efdce4
Automatically class-ify the visualization code to try and improve coh…
adamnovak Feb 6, 2026
dc2ebd1
Privatize all the methods that can be
adamnovak Feb 6, 2026
cf99e56
Merge remote-tracking branch 'origin/master' into recombination-output
adamnovak Feb 10, 2026
bbf07ba
Turn off chaining debugging
adamnovak Feb 10, 2026
778fc22
Adjust test to allow for new explanation files
adamnovak Feb 10, 2026
3970de9
Merge remote-tracking branch 'origin/recombination-output' into recom…
adamnovak Feb 17, 2026
e76e1bd
Merge remote-tracking branch 'origin/master' into recombination-output
adamnovak Feb 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
837 changes: 837 additions & 0 deletions scripts/build-chain-viz.py

Large diffs are not rendered by default.

37 changes: 34 additions & 3 deletions scripts/scatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import argparse, sys, os, itertools, math, collections, random, re
import matplotlib, matplotlib.ticker, matplotlib.cm, numpy
import copy
import traceback

# Scipy allows curve fitting, but we might not have it
try:
Expand Down Expand Up @@ -99,6 +100,8 @@ def parse_args(args):
help="width of connecting line")
parser.add_argument("--tsv", action="store_true",
help="use only tabs as separators in input file")
parser.add_argument("--show_rec", action="store_true",
help="highlight points whose last TSV column is REC by overplotting them")
parser.add_argument("--no_sort", dest="sort", action="store_false",
help="do not sort categories from input file")
parser.add_argument("--categories", nargs="+", default=None,
Expand Down Expand Up @@ -419,8 +422,9 @@ def main(args):
# Make the figure with the appropriate size.
figure = pyplot.figure(figsize=(options.width, options.height))

# This holds lists of x, y, weight lists for each data series.
series = collections.defaultdict(lambda: [list(), list(), list()])
# This holds lists of x, y, weight lists and rec flags for each data series.
# series[name] -> [x_list, y_list, weight_list, rec_flag_list]
series = collections.defaultdict(lambda: [list(), list(), list(), list()])

# This holds the order in which series were first encountered
initial_series_order = collections.OrderedDict()
Expand All @@ -434,10 +438,19 @@ def main(args):

for line in options.data:
# Unpack the line, splitting on tabs only if requested
parts = line.split("\t" if options.tsv else None)
parts = line.rstrip("\n").split("\t" if options.tsv else None)
# Strip whitespace from each field
parts = [p.strip() for p in parts]

# We can weight samples
weight = 1
rec_flag = False

# If the last column is the REC sentinel, mark and remove it so the
# remaining parsing can proceed normally.
if len(parts) >= 1 and parts[-1] == "REC":
rec_flag = True
parts = parts[:-1]

if options.dotplot:
# We parse a two-column name/sample format
Expand All @@ -464,6 +477,7 @@ def main(args):
if len(parts) >= 4:
# There's also a weight
weight = float(parts[3])
# If there was a REC column originally, rec_flag already captured it

else:
# Use the default series
Expand All @@ -477,6 +491,7 @@ def main(args):
series[series_name][0].append(x_value)
series[series_name][1].append(y_value)
series[series_name][2].append(weight)
series[series_name][3].append(rec_flag)

# Note this series in the ordering if we haven't already
initial_series_order[series_name] = None
Expand Down Expand Up @@ -688,6 +703,18 @@ def main(args):
else:
plotted_items.append(plot_result)

# If requested, highlight points marked REC in the input's last column
if options.show_rec:
rec_flags = series[series_name][3]
if any(rec_flags):
rec_x = [series[series_name][0][i] for i, f in enumerate(rec_flags) if f]
rec_y = [series[series_name][1][i] for i, f in enumerate(rec_flags) if f]
# Choose a sensible marker size if none set
rec_ms = options.marker_size if options.marker_size is not None else 40
# Overplot REC points with red crosses
rec_plot = series_plot.scatter(rec_x, rec_y, marker='x', color='red', s=rec_ms, zorder=10)
plotted_items.append(rec_plot)

if options.fit:
# We need to do a curve fit

Expand Down Expand Up @@ -885,6 +912,10 @@ def main(args):
except ImportError:
sys.stderr.write("Install the matplotalt package to generate figure alt text\n")
alt_text = None
except Exception as e:
print("Could not generate alt text due to internal error")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should also go to stderr.

traceback.print_exc()
alt_text = None

if options.save is not None:
# Save the figure to a file
Expand Down
138 changes: 104 additions & 34 deletions src/algorithms/chain_items.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,26 +48,32 @@ void TracedScore::max_in(const vector<TracedScore>& options, size_t option_numbe
// This is the new winner.
this->score = option.score;
this->source = option_number;
this->paths = option.paths;
this->rec_num = option.rec_num;
}
}

TracedScore TracedScore::score_from(const vector<TracedScore>& options, size_t option_number) {
TracedScore got = options[option_number];
got.source = option_number;
got.paths = options[option_number].paths;
got.rec_num = options[option_number].rec_num;
return got;
}

TracedScore TracedScore::add_points(int adjustment) const {
return {this->score + adjustment, this->source, this->paths};
return {this->score + adjustment, this->source, this->paths, this->rec_num};
}

TracedScore TracedScore::set_shared_paths(const std::pair<size_t,size_t>& new_paths) const {
size_t updated_paths;
size_t rec_num = this->rec_num;
if(new_paths.first == new_paths.second) {
// if the paths are the same, there is no recombination inside the anchor. check if there is a recombination between anchors now
if ((this->paths & new_paths.first) == 0) {
// there is a recombination between anchors, so we "reset" the current paths
updated_paths = new_paths.first;
rec_num++;
} else {
// there is no recombination between anchors, so we update the current paths
updated_paths = this->paths & new_paths.first;
Expand All @@ -79,7 +85,8 @@ TracedScore TracedScore::set_shared_paths(const std::pair<size_t,size_t>& new_pa
return {
this->score,
this->source,
updated_paths
updated_paths,
rec_num
};
}

Expand Down Expand Up @@ -528,20 +535,13 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
size_t max_indel_bases,
int recomb_penalty,
bool show_work) {

#ifdef debug_chaining

DiagramExplainer diagram(show_work);
#else
DiagramExplainer diagram(false);
#endif
TSVExplainer dump(show_work, "chaindump");
if (diagram) {
diagram.add_globals({{"rankdir", "LR"}});
}

#ifdef debug_chaining
show_work = true;
#endif

if (show_work) {
cerr << "Chaining group of " << to_chain.size() << " items" << endl;
}
Expand Down Expand Up @@ -679,6 +679,17 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
});
}
}
if (dump) {
// Dump a TSV of source anchor description, dest anchor description, and score achieved.
dump.line();
std::stringstream source_stream;
source_stream << source;
dump.field(source_stream.str());
std::stringstream here_stream;
here_stream << here;
dump.field(here_stream.str());
dump.field(from_source_score.score);
}
} else {
if (show_work) {
cerr << "\t\tTransition is impossible." << endl;
Expand Down Expand Up @@ -783,7 +794,10 @@ vector<pair<vector<size_t>, int>> chain_items_traceback(const vector<TracedScore
// Track the penalty we are off optimal for this traceback
int penalty = best_past_ending_score_ever - chain_scores[trace_from];
size_t here = trace_from;
//std::cerr << "[REC INFO] Starting traceback at item #" << here << " with recs: " << chain_scores[here].rec_num << " score: " << chain_scores[here].score << std::endl;
while (here != TracedScore::nowhere()) {
//std::cerr << "\trecs: " << chain_scores[here].rec_num << " paths:\t" << std::bitset<64>(chain_scores[here].paths) << std::endl;
//std::cerr << "\t\tanchor #" << here << ": " << to_chain[here] << std::endl;
// Mark here as used. Happens once per item, and so limits runtime.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should probably be #ifdef debug-flagged off instead of commented out.

item_is_used[here] = true;
size_t next = chain_scores[here].source;
Expand All @@ -804,6 +818,7 @@ vector<pair<vector<size_t>, int>> chain_items_traceback(const vector<TracedScore
}
here = next;
}

// Now put the traceback in the output list
tracebacks.emplace_back();
tracebacks.back().second = penalty;
Expand All @@ -826,7 +841,7 @@ vector<pair<vector<size_t>, int>> chain_items_traceback(const vector<TracedScore
return tracebacks;
}

vector<pair<int, vector<size_t>>> find_best_chains(const VectorView<Anchor>& to_chain,
ChainsResult find_best_chains(const VectorView<Anchor>& to_chain,
const SnarlDistanceIndex& distance_index,
const HandleGraph& graph,
int gap_open,
Expand All @@ -840,9 +855,14 @@ vector<pair<int, vector<size_t>>> find_best_chains(const VectorView<Anchor>& to_
double points_per_possible_match,
size_t max_indel_bases,
bool show_work) {


ChainsResult result;
if (to_chain.empty()) {
return {{0, vector<size_t>()}};
ChainWithRec empty_entry;
empty_entry.scored_chain = {0, vector<size_t>()};
empty_entry.rec_positions = {};
result.chains.emplace_back(std::move(empty_entry));
return result;
}

// We actually need to do DP
Expand All @@ -861,24 +881,71 @@ vector<pair<int, vector<size_t>>> find_best_chains(const VectorView<Anchor>& to_
max_indel_bases,
recomb_penalty,
show_work);
//std::cerr << "[REC INFO] Recombination number for chain: " << best_past_ending_score_ever.rec_num << "\tscore: " << best_past_ending_score_ever.score << "\tpaths: " << best_past_ending_score_ever.paths << std::endl;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also could be an #ifdef and not a comment.

// Then do the tracebacks
vector<pair<vector<size_t>, int>> tracebacks = chain_items_traceback(
chain_scores, to_chain, best_past_ending_score_ever, item_bonus, item_scale, max_chains);

if (tracebacks.empty()) {
// Somehow we got nothing
return {{0, vector<size_t>()}};
ChainWithRec empty_entry;
empty_entry.scored_chain = {0, vector<size_t>()};
empty_entry.rec_positions = {};
result.chains.emplace_back(std::move(empty_entry));
return result;
}

// Convert from traceback and penalty to score and traceback.
// Everything is already sorted.
vector<pair<int, vector<size_t>>> to_return;
to_return.reserve(tracebacks.size());
result.chains.reserve(tracebacks.size());
for (auto& traceback : tracebacks) {
// Move over the list of items and convert penalty to score
to_return.emplace_back(best_past_ending_score_ever.score - traceback.second, std::move(traceback.first));
int score = best_past_ending_score_ever.score - traceback.second;
std::vector<size_t> chain_indexes = std::move(traceback.first);

// Compute the anchor indices in this chain that introduce an
// inter-anchor recombination event. We simulate the path-bit
// propagation along the chain using the same logic as
// TracedScore::set_shared_paths (but without modifying the state).
std::vector<size_t> rec_positions;
if (!chain_indexes.empty()) {
// Start with the endpoint paths of the first anchor.
size_t first_idx = chain_indexes.front();
path_flags_t current_paths = to_chain[first_idx].anchor_end_paths();

// Walk the chain from the second anchor onward and apply the
// same recombination-detection rules used in set_shared_paths.
for (size_t k = 1; k < chain_indexes.size(); ++k) {
size_t anchor_idx = chain_indexes[k];
auto new_paths = to_chain[anchor_idx].anchor_paths();
// If the anchor's start and end paths are equal, it's not an
// internally recombinant anchor; check inter-anchor overlap.
if (new_paths.first == new_paths.second) {
if ((current_paths & new_paths.first) == 0) {
// No overlap -> inter-anchor recombination occurred here.
rec_positions.push_back(anchor_idx);
// Reset current paths to the anchor's start paths.
current_paths = new_paths.first;
} else {
// Intersect supported paths and continue.
current_paths &= new_paths.first;
}
} else {
// Recombinant anchor: do not count as inter-anchor
// recombination per original logic; reset paths to the
// anchor's end paths.
// TODO: since no fragmenting, probably unnecessary to track
current_paths = new_paths.second;
}
}
}

ChainWithRec entry;
entry.scored_chain = {score, std::move(chain_indexes)};
entry.rec_positions = std::move(rec_positions);
result.chains.emplace_back(std::move(entry));
}
return to_return;
return result;
}

pair<int, vector<size_t>> find_best_chain(const VectorView<Anchor>& to_chain,
Expand All @@ -894,21 +961,24 @@ pair<int, vector<size_t>> find_best_chain(const VectorView<Anchor>& to_chain,
double points_per_possible_match,
size_t max_indel_bases) {

return find_best_chains(
to_chain,
distance_index,
graph,
gap_open,
gap_extension,
recomb_penalty,
1,
for_each_transition,
item_bonus,
item_scale,
gap_scale,
points_per_possible_match,
max_indel_bases
).front();
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like an extra layer of braces that isn't doing anything.

ChainsResult cr = find_best_chains(
to_chain,
distance_index,
graph,
gap_open,
gap_extension,
recomb_penalty,
1,
for_each_transition,
item_bonus,
item_scale,
gap_scale,
points_per_possible_match,
max_indel_bases
);
return cr.chains.front().scored_chain;
}
}

int score_best_chain(const VectorView<Anchor>& to_chain, const SnarlDistanceIndex& distance_index,
Expand Down
20 changes: 19 additions & 1 deletion src/algorithms/chain_items.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ class TracedScore {
size_t source;
/// Supported paths
path_flags_t paths;
size_t rec_num=0;
};

}
Expand Down Expand Up @@ -396,6 +397,23 @@ struct transition_info {
: from_anchor(from), to_anchor(to), graph_distance(graph_dist), read_distance(read_dist) {}
};

/// A single chain result: scored chain plus the recombination count observed
/// on its endpoint.
struct ChainWithRec {
std::pair<int, std::vector<size_t>> scored_chain;
// Positions (anchor indices) in the chain that introduce a recombination
// event between anchors. These correspond to anchors where we had to
// reset supported paths because the previous path set did not overlap
// with the next anchor's start paths.
std::vector<size_t> rec_positions;
};

/// Result of finding best chains: a list of chains each paired with the
/// recombination count observed at that chain's endpoint.
struct ChainsResult {
std::vector<ChainWithRec> chains;
};

/**
* Iteratee function type which can be called with each transition between
* anchors.
Expand Down Expand Up @@ -538,7 +556,7 @@ vector<pair<vector<size_t>, int>> chain_items_traceback(const vector<TracedScore
* Returns the scores and the list of indexes of items visited to achieve
* that score, in order, with multiple tracebacks in descending score order.
*/
vector<pair<int, vector<size_t>>> find_best_chains(const VectorView<Anchor>& to_chain,
ChainsResult find_best_chains(const VectorView<Anchor>& to_chain,
const SnarlDistanceIndex& distance_index,
const HandleGraph& graph,
int gap_open,
Expand Down
Loading
Loading