diff --git a/scripts/build-chain-viz.py b/scripts/build-chain-viz.py new file mode 100755 index 0000000000..0ce4b98f2d --- /dev/null +++ b/scripts/build-chain-viz.py @@ -0,0 +1,837 @@ +#!/usr/bin/env python3 +""" +Build an interactive SVG visualization of a chaining problem using D3.js. + +Usage: build-chain-viz.py +""" + +import sys +import os +import glob +import json +import gzip +import base64 +from collections import defaultdict, Counter + + +def parse_seeds_file(filepath): + """ + Parse a chain seeds file. + Returns list of dicts with keys: read_pos, ref_name, ref_pos, strand, seed_num, seed_id + """ + seeds = [] + with open(filepath, 'r') as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split('\t') + if len(parts) < 6: + continue + seeds.append({ + 'read_pos': int(parts[0]), + 'ref_name': parts[1], + 'ref_pos': int(parts[2]), + 'strand': parts[3], + 'seed_num': int(parts[4]), + 'seed_id': parts[5] + }) + return seeds + + +def parse_chaindump_file(filepath): + """ + Parse a chaindump file. + Returns list of dicts with keys: source_id, dest_id, score (signed) + """ + transitions = [] + with open(filepath, 'r') as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split('\t') + if len(parts) < 3: + continue + transitions.append({ + 'source_id': parts[0], + 'dest_id': parts[1], + 'score': int(parts[2]) + }) + return transitions + + +def find_seeds_file(data_dir, chain_num): + """Find the seeds file for the given chain number.""" + pattern = os.path.join(data_dir, f'chain{chain_num}-seeds*.tsv') + matches = glob.glob(pattern) + if not matches: + return None + return matches[0] + + +def find_all_chaindump_files(data_dir): + """Find all chaindump files in the directory.""" + pattern = os.path.join(data_dir, 'chaindump*.tsv') + return glob.glob(pattern) + + +def generate_svg(seeds, transitions, output_path): + """Generate an SVG file with embedded D3.js visualization.""" + + # Filter seeds to the most common reference path + ref_name_counts = Counter(s['ref_name'] for s in seeds) + if ref_name_counts: + best_ref_name = ref_name_counts.most_common(1)[0][0] + seeds = [s for s in seeds if s['ref_name'] == best_ref_name] + + # Build seed lookup by ID + seed_by_id = {s['seed_id']: s for s in seeds} + seed_ids = set(seed_by_id.keys()) + + # Index seeds by ID + seed_id_to_index = {s['seed_id']: i for i, s in enumerate(seeds)} + + # Filter transitions to those where both endpoints are in our seeds + relevant_transitions = [t for t in transitions + if t['source_id'] in seed_ids and t['dest_id'] in seed_ids] + + # Compute max positive score per destination + max_score_by_dest = defaultdict(lambda: float('-inf')) + for t in relevant_transitions: + if t['score'] > 0 and t['score'] > max_score_by_dest[t['dest_id']]: + max_score_by_dest[t['dest_id']] = t['score'] + + # Build seeds data with score info + seeds_data = [] + for i, s in enumerate(seeds): + max_score = max_score_by_dest.get(s['seed_id'], 0) + seeds_data.append({ + 'index': i, + 'seed_num': s['seed_num'], + 'read_pos': s['read_pos'], + 'ref_pos': s['ref_pos'], + 'strand': s['strand'], + 'seed_id': s['seed_id'], + 'max_score': max_score if max_score > float('-inf') else 0 + }) + + # Build transitions data with fraction of max + transitions_data = [] + for t in relevant_transitions: + max_score = max_score_by_dest[t['dest_id']] + fraction = max(0, t['score'] / max_score) if max_score > 0 else 0 + transitions_data.append({ + 'source_id': t['source_id'], + 'source_index': seed_id_to_index.get(t['source_id'], -1), + 'dest_id': t['dest_id'], + 'dest_index': seed_id_to_index.get(t['dest_id'], -1), + 'score': t['score'], + 'fraction': fraction, + 'is_max': (t['score'] == max_score) + }) + + # Compress JSON data + seeds_compressed = base64.b64encode(gzip.compress( + json.dumps(seeds_data).encode('utf-8'))).decode('ascii') + transitions_compressed = base64.b64encode(gzip.compress( + json.dumps(transitions_data).encode('utf-8'))).decode('ascii') + + # CSS and JS are plain strings (no f-string brace escaping needed). + # Only the data script uses f-string interpolation. + css = ''' ''' + + data_script = ( + ' ' + ) + + js = ''' ''' + + svg_content = '\n'.join([ + '', + '', + css, + ' ', + data_script, + js, + '', + '' + ]) + + with open(output_path, 'w') as f: + f.write(svg_content) + + +def main(): + if len(sys.argv) != 4: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + + data_dir = sys.argv[1] + chain_num = int(sys.argv[2]) + output_path = sys.argv[3] + + if not os.path.isdir(data_dir): + print(f"Error: {data_dir} is not a directory", file=sys.stderr) + sys.exit(1) + + # Find and parse seeds file + seeds_file = find_seeds_file(data_dir, chain_num) + if not seeds_file: + print(f"Error: No seeds file found for chain {chain_num}", file=sys.stderr) + sys.exit(1) + + seeds = parse_seeds_file(seeds_file) + if not seeds: + print(f"Warning: No seeds found in {seeds_file}", file=sys.stderr) + + print(f"Loaded {len(seeds)} seeds from {seeds_file}") + + # Find and parse all chaindump files + chaindump_files = find_all_chaindump_files(data_dir) + all_transitions = [] + for cf in chaindump_files: + transitions = parse_chaindump_file(cf) + all_transitions.extend(transitions) + print(f"Loaded {len(transitions)} transitions from {cf}") + + print(f"Total transitions: {len(all_transitions)}") + + # Generate SVG + generate_svg(seeds, all_transitions, output_path) + print(f"Generated {output_path}") + + +if __name__ == '__main__': + main() diff --git a/scripts/scatter.py b/scripts/scatter.py index 39764c9500..8f142f35a5 100755 --- a/scripts/scatter.py +++ b/scripts/scatter.py @@ -15,6 +15,7 @@ import argparse, sys, os, itertools, math, collections, random, re import matplotlib, matplotlib.ticker, matplotlib.cm, numpy import copy +import traceback # Scipy allows curve fitting, but we might not have it try: @@ -99,6 +100,8 @@ def parse_args(args): help="width of connecting line") parser.add_argument("--tsv", action="store_true", help="use only tabs as separators in input file") + parser.add_argument("--show_rec", action="store_true", + help="highlight points whose last TSV column is REC by overplotting them") parser.add_argument("--no_sort", dest="sort", action="store_false", help="do not sort categories from input file") parser.add_argument("--categories", nargs="+", default=None, @@ -419,8 +422,9 @@ def main(args): # Make the figure with the appropriate size. figure = pyplot.figure(figsize=(options.width, options.height)) - # This holds lists of x, y, weight lists for each data series. - series = collections.defaultdict(lambda: [list(), list(), list()]) + # This holds lists of x, y, weight lists and rec flags for each data series. + # series[name] -> [x_list, y_list, weight_list, rec_flag_list] + series = collections.defaultdict(lambda: [list(), list(), list(), list()]) # This holds the order in which series were first encountered initial_series_order = collections.OrderedDict() @@ -434,10 +438,19 @@ def main(args): for line in options.data: # Unpack the line, splitting on tabs only if requested - parts = line.split("\t" if options.tsv else None) + parts = line.rstrip("\n").split("\t" if options.tsv else None) + # Strip whitespace from each field + parts = [p.strip() for p in parts] # We can weight samples weight = 1 + rec_flag = False + + # If the last column is the REC sentinel, mark and remove it so the + # remaining parsing can proceed normally. + if len(parts) >= 1 and parts[-1] == "REC": + rec_flag = True + parts = parts[:-1] if options.dotplot: # We parse a two-column name/sample format @@ -464,6 +477,7 @@ def main(args): if len(parts) >= 4: # There's also a weight weight = float(parts[3]) + # If there was a REC column originally, rec_flag already captured it else: # Use the default series @@ -477,6 +491,7 @@ def main(args): series[series_name][0].append(x_value) series[series_name][1].append(y_value) series[series_name][2].append(weight) + series[series_name][3].append(rec_flag) # Note this series in the ordering if we haven't already initial_series_order[series_name] = None @@ -688,6 +703,18 @@ def main(args): else: plotted_items.append(plot_result) + # If requested, highlight points marked REC in the input's last column + if options.show_rec: + rec_flags = series[series_name][3] + if any(rec_flags): + rec_x = [series[series_name][0][i] for i, f in enumerate(rec_flags) if f] + rec_y = [series[series_name][1][i] for i, f in enumerate(rec_flags) if f] + # Choose a sensible marker size if none set + rec_ms = options.marker_size if options.marker_size is not None else 40 + # Overplot REC points with red crosses + rec_plot = series_plot.scatter(rec_x, rec_y, marker='x', color='red', s=rec_ms, zorder=10) + plotted_items.append(rec_plot) + if options.fit: # We need to do a curve fit @@ -885,6 +912,10 @@ def main(args): except ImportError: sys.stderr.write("Install the matplotalt package to generate figure alt text\n") alt_text = None + except Exception as e: + print("Could not generate alt text due to internal error") + traceback.print_exc() + alt_text = None if options.save is not None: # Save the figure to a file diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index c31839910e..63e168e38f 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -48,26 +48,32 @@ void TracedScore::max_in(const vector& options, size_t option_numbe // This is the new winner. this->score = option.score; this->source = option_number; + this->paths = option.paths; + this->rec_num = option.rec_num; } } TracedScore TracedScore::score_from(const vector& options, size_t option_number) { TracedScore got = options[option_number]; got.source = option_number; + got.paths = options[option_number].paths; + got.rec_num = options[option_number].rec_num; return got; } TracedScore TracedScore::add_points(int adjustment) const { - return {this->score + adjustment, this->source, this->paths}; + return {this->score + adjustment, this->source, this->paths, this->rec_num}; } TracedScore TracedScore::set_shared_paths(const std::pair& new_paths) const { size_t updated_paths; + size_t rec_num = this->rec_num; if(new_paths.first == new_paths.second) { // if the paths are the same, there is no recombination inside the anchor. check if there is a recombination between anchors now if ((this->paths & new_paths.first) == 0) { // there is a recombination between anchors, so we "reset" the current paths updated_paths = new_paths.first; + rec_num++; } else { // there is no recombination between anchors, so we update the current paths updated_paths = this->paths & new_paths.first; @@ -79,7 +85,8 @@ TracedScore TracedScore::set_shared_paths(const std::pair& new_pa return { this->score, this->source, - updated_paths + updated_paths, + rec_num }; } @@ -528,20 +535,13 @@ TracedScore chain_items_dp(vector& chain_scores, size_t max_indel_bases, int recomb_penalty, bool show_work) { - -#ifdef debug_chaining + DiagramExplainer diagram(show_work); -#else - DiagramExplainer diagram(false); -#endif + TSVExplainer dump(show_work, "chaindump"); if (diagram) { diagram.add_globals({{"rankdir", "LR"}}); } -#ifdef debug_chaining - show_work = true; -#endif - if (show_work) { cerr << "Chaining group of " << to_chain.size() << " items" << endl; } @@ -679,6 +679,17 @@ TracedScore chain_items_dp(vector& chain_scores, }); } } + if (dump) { + // Dump a TSV of source anchor description, dest anchor description, and score achieved. + dump.line(); + std::stringstream source_stream; + source_stream << source; + dump.field(source_stream.str()); + std::stringstream here_stream; + here_stream << here; + dump.field(here_stream.str()); + dump.field(from_source_score.score); + } } else { if (show_work) { cerr << "\t\tTransition is impossible." << endl; @@ -783,7 +794,10 @@ vector, int>> chain_items_traceback(const vector(chain_scores[here].paths) << std::endl; + //std::cerr << "\t\tanchor #" << here << ": " << to_chain[here] << std::endl; // Mark here as used. Happens once per item, and so limits runtime. item_is_used[here] = true; size_t next = chain_scores[here].source; @@ -804,6 +818,7 @@ vector, int>> chain_items_traceback(const vector, int>> chain_items_traceback(const vector>> find_best_chains(const VectorView& to_chain, +ChainsResult find_best_chains(const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, int gap_open, @@ -840,9 +855,14 @@ vector>> find_best_chains(const VectorView& to_ double points_per_possible_match, size_t max_indel_bases, bool show_work) { - + + ChainsResult result; if (to_chain.empty()) { - return {{0, vector()}}; + ChainWithRec empty_entry; + empty_entry.scored_chain = {0, vector()}; + empty_entry.rec_positions = {}; + result.chains.emplace_back(std::move(empty_entry)); + return result; } // We actually need to do DP @@ -861,24 +881,71 @@ vector>> find_best_chains(const VectorView& to_ max_indel_bases, recomb_penalty, show_work); + //std::cerr << "[REC INFO] Recombination number for chain: " << best_past_ending_score_ever.rec_num << "\tscore: " << best_past_ending_score_ever.score << "\tpaths: " << best_past_ending_score_ever.paths << std::endl; // Then do the tracebacks vector, int>> tracebacks = chain_items_traceback( chain_scores, to_chain, best_past_ending_score_ever, item_bonus, item_scale, max_chains); if (tracebacks.empty()) { // Somehow we got nothing - return {{0, vector()}}; + ChainWithRec empty_entry; + empty_entry.scored_chain = {0, vector()}; + empty_entry.rec_positions = {}; + result.chains.emplace_back(std::move(empty_entry)); + return result; } // Convert from traceback and penalty to score and traceback. // Everything is already sorted. - vector>> to_return; - to_return.reserve(tracebacks.size()); + result.chains.reserve(tracebacks.size()); for (auto& traceback : tracebacks) { // Move over the list of items and convert penalty to score - to_return.emplace_back(best_past_ending_score_ever.score - traceback.second, std::move(traceback.first)); + int score = best_past_ending_score_ever.score - traceback.second; + std::vector chain_indexes = std::move(traceback.first); + + // Compute the anchor indices in this chain that introduce an + // inter-anchor recombination event. We simulate the path-bit + // propagation along the chain using the same logic as + // TracedScore::set_shared_paths (but without modifying the state). + std::vector rec_positions; + if (!chain_indexes.empty()) { + // Start with the endpoint paths of the first anchor. + size_t first_idx = chain_indexes.front(); + path_flags_t current_paths = to_chain[first_idx].anchor_end_paths(); + + // Walk the chain from the second anchor onward and apply the + // same recombination-detection rules used in set_shared_paths. + for (size_t k = 1; k < chain_indexes.size(); ++k) { + size_t anchor_idx = chain_indexes[k]; + auto new_paths = to_chain[anchor_idx].anchor_paths(); + // If the anchor's start and end paths are equal, it's not an + // internally recombinant anchor; check inter-anchor overlap. + if (new_paths.first == new_paths.second) { + if ((current_paths & new_paths.first) == 0) { + // No overlap -> inter-anchor recombination occurred here. + rec_positions.push_back(anchor_idx); + // Reset current paths to the anchor's start paths. + current_paths = new_paths.first; + } else { + // Intersect supported paths and continue. + current_paths &= new_paths.first; + } + } else { + // Recombinant anchor: do not count as inter-anchor + // recombination per original logic; reset paths to the + // anchor's end paths. + // TODO: since no fragmenting, probably unnecessary to track + current_paths = new_paths.second; + } + } + } + + ChainWithRec entry; + entry.scored_chain = {score, std::move(chain_indexes)}; + entry.rec_positions = std::move(rec_positions); + result.chains.emplace_back(std::move(entry)); } - return to_return; + return result; } pair> find_best_chain(const VectorView& to_chain, @@ -894,21 +961,24 @@ pair> find_best_chain(const VectorView& to_chain, double points_per_possible_match, size_t max_indel_bases) { - return find_best_chains( - to_chain, - distance_index, - graph, - gap_open, - gap_extension, - recomb_penalty, - 1, - for_each_transition, - item_bonus, - item_scale, - gap_scale, - points_per_possible_match, - max_indel_bases - ).front(); + { + ChainsResult cr = find_best_chains( + to_chain, + distance_index, + graph, + gap_open, + gap_extension, + recomb_penalty, + 1, + for_each_transition, + item_bonus, + item_scale, + gap_scale, + points_per_possible_match, + max_indel_bases + ); + return cr.chains.front().scored_chain; + } } int score_best_chain(const VectorView& to_chain, const SnarlDistanceIndex& distance_index, diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index be46d3d1a2..9ac7b792e0 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -350,6 +350,7 @@ class TracedScore { size_t source; /// Supported paths path_flags_t paths; + size_t rec_num=0; }; } @@ -396,6 +397,23 @@ struct transition_info { : from_anchor(from), to_anchor(to), graph_distance(graph_dist), read_distance(read_dist) {} }; +/// A single chain result: scored chain plus the recombination count observed +/// on its endpoint. +struct ChainWithRec { + std::pair> scored_chain; + // Positions (anchor indices) in the chain that introduce a recombination + // event between anchors. These correspond to anchors where we had to + // reset supported paths because the previous path set did not overlap + // with the next anchor's start paths. + std::vector rec_positions; +}; + +/// Result of finding best chains: a list of chains each paired with the +/// recombination count observed at that chain's endpoint. +struct ChainsResult { + std::vector chains; +}; + /** * Iteratee function type which can be called with each transition between * anchors. @@ -538,7 +556,7 @@ vector, int>> chain_items_traceback(const vector>> find_best_chains(const VectorView& to_chain, +ChainsResult find_best_chains(const VectorView& to_chain, const SnarlDistanceIndex& distance_index, const HandleGraph& graph, int gap_open, diff --git a/src/explainer.cpp b/src/explainer.cpp index af5035c671..d89eb6ac0f 100644 --- a/src/explainer.cpp +++ b/src/explainer.cpp @@ -114,30 +114,6 @@ void TSVExplainer::line() { need_tab = false; } -void TSVExplainer::field(const std::string& value) { - if (!explaining()) { - return; - } - if (need_tab) { - out << "\t"; - } - out << value; - // Next value on the line needs a leading tab - need_tab = true; -} - -void TSVExplainer::field(size_t value) { - if (!explaining()) { - return; - } - if (need_tab) { - out << "\t"; - } - out << value; - // Next value on the line needs a leading tab - need_tab = true; -} - ProblemDumpExplainer::ProblemDumpExplainer(bool enabled, const std::string& name) : Explainer(enabled) { if (!explaining()) { return; diff --git a/src/explainer.hpp b/src/explainer.hpp index 23bdc585b7..c84ddb4f29 100644 --- a/src/explainer.hpp +++ b/src/explainer.hpp @@ -109,11 +109,9 @@ class TSVExplainer : public Explainer { /// Start a new line. Must call this before field(). void line(); - /// Add a field with a string value - void field(const std::string& value); - - /// Add a field with an integral value - void field(size_t value); + /// Add a field with any value that can be <<'d into a stream. + template + void field(const T& value); protected: /// Stream being written to @@ -124,6 +122,19 @@ class TSVExplainer : public Explainer { bool need_line = false; }; +template +void TSVExplainer::field(const T& value) { + if (!explaining()) { + return; + } + if (need_tab) { + out << "\t"; + } + out << value; + // Next value on the line needs a leading tab + need_tab = true; +} + /** * Widget to serialize somewhat structured logs. */ diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 1e25f406ac..41a376069e 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -867,7 +867,7 @@ class MinimizerMapper : public AlignerClient { */ void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, @@ -1463,7 +1463,9 @@ class MinimizerMapper : public AlignerClient { static void dump_debug_chains(const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, + const vector& seed_anchors, const std::vector>& chains, + const std::vector>& chain_rec_flags, const std::vector& chain_source_tree, const PathPositionHandleGraph* path_graph); diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index b8509e8959..2a852d52d3 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -31,6 +31,7 @@ #include #include #include +#include // Turn on debugging prints //#define debug @@ -189,7 +190,9 @@ static bool chain_ranges_are_equivalent(const MinimizerMapper::Seed& start_seed1 void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, + const vector& seed_anchors, const std::vector>& chains, + const std::vector>& chain_rec_flags, const std::vector& chain_source_tree, const PathPositionHandleGraph* path_graph) { if (!path_graph) { @@ -234,6 +237,9 @@ void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, // The read context should already be set by the caller TSVExplainer exp(true, chain_file_name); + // We make another TSV that's more parseable, with all the seeds. + TSVExplainer seedpos(true, "chain" + std::to_string(chain_num) + "-seeds"); + // Determine the positions of all the involved seeds. std::unordered_map seed_positions; for (auto& kv : seed_sets) { @@ -245,7 +251,22 @@ void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, auto found = seed_positions.find(seed_num); if (found == seed_positions.end()) { // If we don't know the seed's positions yet, get them - seed_positions.emplace_hint(found, seed_num, algorithms::nearest_offsets_in_paths(path_graph, seed.pos, 100)); + found = seed_positions.emplace_hint(found, seed_num, algorithms::nearest_offsets_in_paths(path_graph, seed.pos, 100)); + for (auto& handle_and_positions : found->second) { + std::string path_name = path_graph->get_path_name(handle_and_positions.first); + for (auto& position : handle_and_positions.second) { + // Dump all the seed positions so we can select seeds we want to know about + seedpos.line(); + seedpos.field(seed_anchors.at(seed_num).read_start()); + seedpos.field(path_name); + seedpos.field(position.first); + seedpos.field(position.second ? "-" : "+"); + seedpos.field(seed_num); + std::stringstream ss; + ss << seed_anchors.at(seed_num); + seedpos.field(ss.str()); + } + } } } } @@ -257,10 +278,19 @@ void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, for (size_t run_number = 0; run_number < kv.second.size(); run_number++) { // For each run of seeds in it const std::vector& included_seeds = kv.second[run_number]; - for (auto& seed_num : included_seeds) { - // For each seed in the run + for (size_t idx = 0; idx < included_seeds.size(); ++idx) { + // For each seed in the run (index-based so we can consult chain flags) + size_t seed_num = included_seeds[idx]; auto& seed = seeds.at(seed_num); + // Determine whether this seed (in chain mode) is from an anchor that is recombinant + bool is_recomb = false; + if (!marker.empty() && marker == "chain") { + if (chain_num < chain_rec_flags.size() && idx < chain_rec_flags[chain_num].size()) { + is_recomb = chain_rec_flags[chain_num][idx]; + } + } + // Get its effective path positions auto& offsets = seed_positions.at(seed_num); @@ -280,6 +310,16 @@ void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, exp.field(position.first); // Offset in read *of the pin point* (not of the forward-strand start of the minimizer) exp.field(minimizers[seed.source].pin_offset()); + // Recombination flag column (only meaningful for chain rows) + if (!marker.empty()) { + if (marker == "chain") { + exp.field(is_recomb ? "REC" : ""); + } else { + exp.field(""); + } + } else { + exp.field(""); + } } } if (offsets.empty()) { @@ -296,6 +336,12 @@ void MinimizerMapper::dump_debug_chains(const ZipCodeForest& zip_code_forest, exp.field(0); // Offset in read *of the pin point* (not of the forward-strand start of the minimizer) exp.field(minimizers[seed.source].pin_offset()); + // Recombination flag column + if (marker == "chain") { + exp.field(is_recomb ? "REC" : ""); + } else { + exp.field(""); + } } } @@ -707,6 +753,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // For each chain, we need: // The chain itself, pointing into seeds std::vector> chains; + // For each chain, mark per-seed whether it came from a recombinant anchor + std::vector> chain_rec_flags; // The zip code tree it came from std::vector chain_source_tree; // An estimated alignment score @@ -717,7 +765,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector multiplicity_by_chain; do_chaining_on_trees(aln, zip_code_forest, seeds, minimizers, seed_anchors, - chains, chain_source_tree, chain_score_estimates, + chains, chain_rec_flags, chain_source_tree, chain_score_estimates, minimizer_kept_chain_count, multiplicity_by_chain, alignments, minimizer_explored, multiplicity_by_alignment, rng, funnel); @@ -732,7 +780,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // Dump all chains if requested (do this before alignments, while chains still exist) if (show_work && !chains.empty() && this->path_graph != nullptr) { - dump_debug_chains(zip_code_forest, seeds, minimizers, chains, chain_source_tree, this->path_graph); + dump_debug_chains(zip_code_forest, seeds, minimizers, seed_anchors, chains, chain_rec_flags, chain_source_tree, this->path_graph); } if (alignments.size() == 0) { @@ -1027,13 +1075,13 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { } void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, - const std::vector& seeds, const VectorView& minimizers, - const vector& seed_anchors, - std::vector>& chains, std::vector& chain_source_tree, - std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, - std::vector& multiplicity_by_chain, - std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, - LazyRNG& rng, Funnel& funnel) const { + const std::vector& seeds, const VectorView& minimizers, + const vector& seed_anchors, + std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, + std::vector& multiplicity_by_chain, + std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, + LazyRNG& rng, Funnel& funnel) const { // Keep track of which chain each alignment comes from for the funnel std::vector alignment_source_chain; @@ -1433,7 +1481,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& VectorView anchor_view {anchors_to_chain, anchor_indexes}; // This will hold our chaining results - std::vector>> results; + algorithms::ChainsResult chain_results; if (show_work) { #pragma omp critical (cerr) @@ -1461,7 +1509,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& graph_lookback_limit, read_lookback_limit ); - results = algorithms::find_best_chains( + chain_results = algorithms::find_best_chains( anchor_view, *distance_index, gbwt_graph, @@ -1475,21 +1523,23 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& this->gap_scale, this->points_per_possible_match, indel_limit, - false + show_work ); if (show_work) { #pragma omp critical (cerr) - cerr << log_name() << "Found " << results.size() << " chains in zip code tree " << item_num + cerr << log_name() << "Found " << chain_results.chains.size() << " chains in zip code tree " << item_num << " running " << anchors_to_chain[anchor_indexes.front()] << " to " << anchors_to_chain[anchor_indexes.back()] << std::endl; } - for (size_t result = 0; result < results.size(); result++) { + for (size_t result = 0; result < chain_results.chains.size(); result++) { // For each result - auto& scored_chain = results[result]; - if (show_work) { + auto& entry = chain_results.chains[result]; + auto& scored_chain = entry.scored_chain; + auto& chain_rec_positions = entry.rec_positions; + if (true) { #ifdef debug - if(true) + if(show_work) #else if (result < MANY_LIMIT) #endif @@ -1498,9 +1548,19 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& #pragma omp critical (cerr) { cerr << log_name() << "\tChain with score " << scored_chain.first - << " and length " << scored_chain.second.size() + << " (rec num =" << chain_rec_positions.size() << ") and length " << scored_chain.second.size() << " running " << anchor_view[scored_chain.second.front()] << " to " << anchor_view[scored_chain.second.back()] << std::endl; + if (!chain_rec_positions.empty()) { + { + cerr << log_name() << "\t\tRecombination introduced at anchors: "; + for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { + if (pi) cerr << ", "; + cerr << chain_rec_positions[pi]; + } + cerr << std::endl; + } + } #ifdef debug for (auto& anchor_number : scored_chain.second) { @@ -1512,7 +1572,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } } else if (result == MANY_LIMIT) { #pragma omp critical (cerr) - std::cerr << log_name() << "\t<" << (results.size() - result) << " more chains>" << std::endl; + std::cerr << log_name() << "\t<" << (chain_results.chains.size() - result) << " more chains>" << std::endl; } } @@ -1521,13 +1581,24 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& // Translate chains into seed numbers and not local anchor numbers. chains.emplace_back(); + // Also make a parallel vector that marks whether each seed in the chain + // comes from an anchor that introduced recombination. + chain_rec_flags.emplace_back(); chains.back().reserve(scored_chain.second.size() * 2); + chain_rec_flags.back().reserve(scored_chain.second.size() * 2); + + // Build a set of the anchor indices inside scored_chain that are recombinant + std::unordered_set rec_anchor_set(chain_rec_positions.begin(), chain_rec_positions.end()); + for (auto& selected_number : scored_chain.second) { // For each anchor in the chain, get its number in the whole group of anchors. size_t anchor_number = anchor_indexes.at(selected_number); + bool anchor_is_recomb = rec_anchor_set.count(selected_number) > 0; for (auto& seed_number : anchor_seed_sequences.at(anchor_number)) { // And get all the seeds it actually uses in sequence and put them in the chain. chains.back().push_back(seed_number); + // Mark whether this seed came from a recombinant anchor + chain_rec_flags.back().push_back(anchor_is_recomb); } for (auto& seed_number : anchor_represented_seeds.at(anchor_number)) { // And get all the seeds it represents exploring and mark their minimizers explored. diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 47747b0887..156976000d 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -126,7 +126,7 @@ rm -f read.fq read.gam vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.indel.multi.fq --show-work --track-position -b chaining-sr > /dev/null 2>&1 # Check that at least some TSV files and directories were created -is "$(find explanation_read1 -name '*.tsv' 2>/dev/null | wc -l | tr -d ' ')" "1" "Chain explanation files are created per chain" +is "$(find explanation_read1 -name 'chain*-dotplot*.tsv' 2>/dev/null | wc -l | tr -d ' ')" "1" "Chain explanation files are created per chain" is "$(ls -d explanation_* 2>/dev/null | wc -l | tr -d ' ')" "3" "Explanation directories are created per read" rm -rf explanation_*