diff --git a/deps/libbdsg b/deps/libbdsg index fc7321597d..e74fb663a5 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit fc7321597d2732ff3986e3b5dc97ef3fba0cd9c8 +Subproject commit e74fb663a5f85bc1f76d159b2b3a3691ed85862f diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 5276238381..65f7dec363 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -828,173 +828,210 @@ void populate_snarl_index( return curr_index; }; - //TODO: Copying the list + // TODO: Copying the list vector> all_children = temp_snarl_record.children; + // Identify tips + for (const auto& child : all_children) { + // Check if this node is a tip + if (child.first != SnarlDistanceIndex::TEMP_NODE + || (child.second != temp_snarl_record.start_node_id + && child.second != temp_snarl_record.end_node_id)) { + bool is_node = (child.first == SnarlDistanceIndex::TEMP_NODE); + // Set up to check edges leaving the end of the chain/node + nid_t node_id = is_node ? child.second + : temp_index.temp_chain_records.at(child.second).end_node_id; + size_t rank = is_node ? temp_index.temp_node_records.at(child.second - temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records.at(child.second).rank_in_parent; + bool is_reverse = is_node ? false + : temp_index.temp_chain_records.at(child.second).end_node_rev; + // Convert to an index in all_children + rank -= 2; + + bool has_edges = false; + graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, false); + // It is a tip so this isn't simple snarl + temp_snarl_record.is_simple = false; + } + // Repeat for the other side of the chain/node + node_id = is_node ? child.second + : temp_index.temp_chain_records.at(child.second).start_node_id; + is_reverse = is_node ? true + : !temp_index.temp_chain_records.at(child.second).start_node_rev; + has_edges = false; + graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, true); + // It is a tip so this isn't simple snarl + temp_snarl_record.is_simple = false; + } + } + } + /* * Do a topological sort of the children and re-assign ranks based on the sort - * TODO: Snarls aren't guaranteed to be DAGs, so ideally this will be a sort - * that minimizes back edges and the number of times a node is traversed backwards - * For now though, just do a topological sort and don't take any loops or reversing edges + * TODO: For non-DAGs, this sort will end up arbitrary. + * That doesn't matter right now since the only consumer of ranks + * (ziptrees) expects arbitrary ranks, though. */ if (!temp_snarl_record.is_root_snarl) { - //Always start the topological sort at the start - handle_t topological_sort_start = graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); - + // Always start the topological sort at the start + handle_t topological_sort_start = graph->get_handle(temp_snarl_record.start_node_id, + temp_snarl_record.start_node_rev); - //This will hold the new order of the children. Each value is an index into all_children, which - //matches the ranks(-2) of the children + // New sort order. Each value is an index into all_children, which + // matches the ranks(-2) of the children vector topological_sort_order; topological_sort_order.reserve(all_children.size()); - // This holds everything in the topological order, to check which nodes (and therefore edges) - // have already been added - // Unlike the topological order, this stores the orientation as well. - // Each node is only added once to the topological order, but the reverse orientation - // may still be traversed to ensure that all nodes are found - unordered_set> visited_nodes; - visited_nodes.reserve(all_children.size()); + // Which ranks have already been sorted? + unordered_set visited_ranks; + visited_ranks.reserve(all_children.size()); - //All nodes that have no incoming edges + // All nodes that have no incoming edges vector> source_nodes; - /* Add all sources. This will start out as the start node and any tips or nodes that - are only reachable from the end node - */ + // Add all sources. This will start out as the start node and any tips + for (const auto& tip : temp_snarl_record.tippy_child_ranks) { + source_nodes.emplace_back(tip.first, !tip.second); + } - //Add max() to indicate that we start at the start node, since the start node doesn't actually have a - //rank. This gets added last so it is traversed first + // Start node dummy rank is max(). This is traversed first source_nodes.emplace_back(std::numeric_limits::max(), false); - //We'll be done sorting when everything is in the sorted vector + // We'll be done sorting when everything is in the sorted vector while (!source_nodes.empty()) { - - //Pick a child with no incoming edges + // Pick a child with no incoming edges pair current_child_index = source_nodes.back(); source_nodes.pop_back(); - //Mark it as being visited - assert(visited_nodes.count(current_child_index) == 0); - visited_nodes.emplace(current_child_index); + // Visit it + if (visited_ranks.count(current_child_index.first) != 0) { + // We tried to revisit a source node, so this must be a loop + // (we got turned around somewhere is the only way) + // Thus it is safe to abort and allow random ranks + break; + } + if (current_child_index.first != std::numeric_limits::max()) { + topological_sort_order.emplace_back(current_child_index.first); + } + visited_ranks.emplace(current_child_index.first); - //Get the graph handle for that child, pointing out from the end of the chain + // Get the graph handle for that child, pointing out from the end of the chain handle_t current_graph_handle; if (current_child_index.first == std::numeric_limits::max()) { - //If the current child is the start bound, then get the start node pointing in + // If the current child is the start bound, then get the start node pointing in current_graph_handle = topological_sort_start; } else { pair current_index = all_children[current_child_index.first]; if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { - //If the current child is a node, then get the node pointing in the correct direction + // If the current child is a node, then get the node pointing in the correct direction current_graph_handle = graph->get_handle(current_index.second, current_child_index.second); } else if (current_child_index.second) { - //If the current child is a chain, and we're traversing the chain backwards + // If the current child is a chain, and we're traversing the chain backwards current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, - !temp_index.temp_chain_records[current_index.second].start_node_rev); + !temp_index.temp_chain_records[current_index.second].start_node_rev); } else { - //Otherwise, the current child is a chain and we're traversing the chain forwards + // Otherwise, the current child is a chain and we're traversing the chain forwards current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, - temp_index.temp_chain_records[current_index.second].end_node_rev); + temp_index.temp_chain_records[current_index.second].end_node_rev); } } - //Add everything reachable from the start boundary node that has no other incoming edges + // Try all edges leaving this side graph->follow_edges(current_graph_handle, false, [&](const handle_t next_handle) { #ifdef debug_distance_indexing - cerr << "Following forward edges from " << graph->get_id(current_graph_handle) << " to " << graph->get_id(next_handle) << endl; + cerr << "Following forward edges from " << graph->get_id(current_graph_handle) + << " to " << graph->get_id(next_handle) << endl; #endif if (graph->get_id(next_handle) == temp_snarl_record.start_node_id || graph->get_id(next_handle) == temp_snarl_record.end_node_id) { - //If this is trying to leave the snarl, skip it + // If this is trying to leave the snarl, skip it return true; } - //Check the next_handle going in the other direction, to see if it could be a new source node. - //If it reaches anything unseen, then it can't be a source node - - //Get the index of next_handle + // Is next_handle a new source? Any unvisited predecessors? pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); - size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id).rank_in_parent + bool next_is_node = next_index.first == SnarlDistanceIndex::TEMP_NODE; + size_t next_rank = next_is_node + ? temp_index.temp_node_records.at(next_index.second - temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[next_index.second].rank_in_parent; - assert(all_children[next_rank-2] == next_index); - bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial + // Subtract 2 to get the index from the rank + assert(next_rank >= 2); + next_rank -= 2; + assert(all_children[next_rank] == next_index); + bool next_rev = (next_is_node || temp_index.temp_chain_records[next_index.second].is_trivial) ? graph->get_is_reverse(next_handle) : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; - if (visited_nodes.count(make_pair(next_rank, next_rev)) != 0) { - //If this is a loop, just skip it + if (visited_ranks.count(next_rank) != 0) { + // If this is a loop, abort return true; } - //Get the handle from the child represented by next_handle going the other way + // Get the handle from the child represented by next_handle going the other way handle_t reverse_handle = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_handle(next_index.second, !next_rev) : (next_rev ? graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, temp_index.temp_chain_records[next_index.second].end_node_rev) : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, - !temp_index.temp_chain_records[next_index.second].start_node_rev)); + !temp_index.temp_chain_records[next_index.second].start_node_rev)); - //Does this have no unseen incoming edges? Check as we go through incoming edges + // Does this have no unseen incoming edges? Check as we go through incoming edges bool is_source = true; - //Does this have no unseen incoming edges but including nodes we've seen in the other direction? - //TODO: Actually do this + // Does this have no unseen incoming edges? graph->follow_edges(reverse_handle, false, [&](const handle_t incoming_handle) { #ifdef debug_distance_indexing cerr << "Getting backwards edge to " << graph->get_id(incoming_handle) << endl; #endif if (graph->get_id(incoming_handle) == temp_snarl_record.start_node_id || graph->get_id(incoming_handle) == temp_snarl_record.end_node_id) { - //If this is trying to leave the snarl + // If this is trying to leave the snarl, that is OK return true; } - //The index of the snarl's child that next_handle represents + // The index of the snarl's child that next_handle represents pair incoming_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); - size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(incoming_index.second-temp_index.min_node_id).rank_in_parent + bool incoming_is_node = incoming_index.first == SnarlDistanceIndex::TEMP_NODE; + size_t incoming_rank = incoming_is_node + ? temp_index.temp_node_records.at(incoming_index.second - temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; - bool incoming_rev = incoming_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[incoming_index.second].is_trivial + bool incoming_rev = incoming_is_node || temp_index.temp_chain_records[incoming_index.second].is_trivial ? graph->get_is_reverse(incoming_handle) : graph->get_id(incoming_handle) == temp_index.temp_chain_records[incoming_index.second].end_node_id; - //subtract 2 to get the index from the rank + // Subtract 2 to get the index from the rank assert(incoming_rank >= 2); - incoming_rank-=2; + incoming_rank -= 2; - //If we haven't seen the incoming node before, then this isn't a source so we break out of - //the loop and keep going - if (visited_nodes.count(std::make_pair(incoming_rank, !incoming_rev)) == 0) { + // This predecessor is unvisited + if (visited_ranks.count(incoming_rank) == 0) { is_source = false; } - //Keep going + // Keep going return true; }); if (is_source) { - //If this is a new source node, then add it as a source node - - //subtract 2 to get the index from the rank - assert(next_rank >= 2); - next_rank-=2; source_nodes.emplace_back(next_rank, next_rev); } return true; }); - if (current_child_index.first != std::numeric_limits::max() && - visited_nodes.count(make_pair(current_child_index.first, !current_child_index.second)) == 0) { - //If this node wasn't already added in the other direction, add it to the topological sort - topological_sort_order.emplace_back(current_child_index.first); - } } - //TODO: Do this properly - // For now, we only really want a topological ordering of DAGs, and I'm going to ignore tips - // So if anything is only reachable from the end node, then add it in an arbitrary order + // If we have leftover chains, this is a non-DAG and ranks are arbitrary + // So we will add any leftover ranks to the topological order vector check_ranks (all_children.size(), false); for (size_t x : topological_sort_order) { check_ranks[x] = true; } - //If anything wasn't in the topological order, add it now for (size_t i = 0 ; i < check_ranks.size() ; i++) { if (!check_ranks[i]) { topological_sort_order.emplace_back(i); @@ -1003,7 +1040,9 @@ void populate_snarl_index( assert(topological_sort_order.size() == all_children.size()); - //We've finished doing to topological sort, so update every child's rank to be the new order + // We've finished doing to topological sort, so update every child's rank to be the new order + auto old_tippy_ranks = temp_snarl_record.tippy_child_ranks; + temp_snarl_record.tippy_child_ranks.clear(); for (size_t new_rank = 0 ; new_rank < topological_sort_order.size() ; new_rank++) { size_t old_rank = topological_sort_order[new_rank]; if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { @@ -1011,6 +1050,10 @@ void populate_snarl_index( } else { temp_index.temp_chain_records[all_children[old_rank].second].rank_in_parent = new_rank+2; } + const auto& old_is_tip = old_tippy_ranks.find(old_rank); + if (old_is_tip != old_tippy_ranks.end()) { + temp_snarl_record.tippy_child_ranks.emplace(new_rank, old_is_tip->second); + } } } @@ -1047,38 +1090,15 @@ void populate_snarl_index( bool is_internal_node = false; - //Check if this node is a tip if ((start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second != temp_snarl_record.start_node_id && start_index.second != temp_snarl_record.end_node_id) || (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.temp_chain_records.at(start_index.second).is_trivial)) { - //If this is an internal node + // This is an internal node is_internal_node = true; - nid_t node_id = start_index.first == SnarlDistanceIndex::TEMP_NODE ? start_index.second : temp_index.temp_chain_records.at(start_index.second).start_node_id; - size_t rank = start_index.first == SnarlDistanceIndex::TEMP_NODE ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records.at(start_index.second).rank_in_parent; - - bool has_edges = false; - graph->follow_edges(graph->get_handle(node_id, false), false, [&](const handle_t next_handle) { - has_edges = true; - }); - if (!has_edges) { - temp_index.temp_node_records.at(node_id-temp_index.min_node_id).is_tip = true; - temp_snarl_record.tippy_child_ranks.insert(rank); - temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl - } - has_edges = false; - graph->follow_edges(graph->get_handle(node_id, true), false, [&](const handle_t next_handle) { - has_edges = true; - }); - if (!has_edges) { - temp_index.temp_node_records.at(node_id-temp_index.min_node_id).is_tip = true; - temp_snarl_record.tippy_child_ranks.insert(rank); - temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl - } } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.temp_chain_records.at(start_index.second).is_trivial) { - //If this is an internal chain, then it isn't a simple snarl + // If this is an internal chain, then it isn't a simple snarl temp_snarl_record.is_simple=false; } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index ff9718ee38..1b631a5417 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1645,12 +1645,12 @@ int main_giraffe(int argc, char** argv) { // Grab the distance index if (show_progress) { - logger.info() << "Loading Distance Index v3" << endl; + logger.info() << "Loading Distance Index" << endl; } auto distance_index = vg::io::VPKG::load_one(registry.require("Giraffe Distance Index").at(0)); if (show_progress) { - logger.info() << "Paging in Distance Index v3" << endl; + logger.info() << "Paging in Distance Index" << endl; } std::chrono::time_point preload_start = std::chrono::system_clock::now(); // Make sure the distance index is paged in from disk. @@ -1718,7 +1718,7 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading and initialization: " << init_seconds.count() << " seconds" << endl; - logger.info() << "Of which Distance Index v3 paging: " + logger.info() << "Of which Distance Index paging: " << di2_preload_seconds.count() << " seconds" << endl; } diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index bb4d40f099..36a1b9b74e 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -1956,6 +1956,137 @@ namespace vg { } } + TEST_CASE( "Ranks are given correctly for DAG with a tip", + "[snarl_distance]" ) { + SECTION( "node tip oriented forward" ) { + VG graph; + + Node* n1 = graph.create_node("GCA"); + Node* n2 = graph.create_node("T"); + Node* n3 = graph.create_node("G"); + Node* n4 = graph.create_node("C"); + Node* n5 = graph.create_node("CTGA"); + + Edge* e1 = graph.create_edge(n1, n3); + Edge* e2 = graph.create_edge(n3, n5); + Edge* e3 = graph.create_edge(n4, n3); + Edge* e4 = graph.create_edge(n4, n5); + Edge* e5 = graph.create_edge(n3, n2); + Edge* e6 = graph.create_edge(n2, n5); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + net_handle_t chain2 = distance_index.get_parent(distance_index.get_node_net_handle(n2->id())); + net_handle_t chain3 = distance_index.get_parent(distance_index.get_node_net_handle(n3->id())); + net_handle_t snarl_start = distance_index.get_node_from_sentinel( + distance_index.get_bound(distance_index.get_parent(chain2), false, true)); + if (distance_index.node_id(snarl_start) == 1) { + // Node 2 is strictly after node 3 + REQUIRE(distance_index.get_rank_in_parent(chain2) > distance_index.get_rank_in_parent(chain3)); + } else { + cerr << "Snarl is reversed so this test is no longer valid" << endl; + } + } + SECTION( "node tip oriented backward" ) { + VG graph; + + Node* n1 = graph.create_node("GCA"); + Node* n2 = graph.create_node("T"); + Node* n3 = graph.create_node("G"); + Node* n4 = graph.create_node("C"); + Node* n5 = graph.create_node("CTGA"); + + Edge* e1 = graph.create_edge(n1, n3); + Edge* e2 = graph.create_edge(n3, n5); + Edge* e3 = graph.create_edge(n4, n3, true, false); + Edge* e4 = graph.create_edge(n4, n5, true, false); + Edge* e5 = graph.create_edge(n3, n2); + Edge* e6 = graph.create_edge(n2, n5); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + net_handle_t chain2 = distance_index.get_parent(distance_index.get_node_net_handle(n2->id())); + net_handle_t chain3 = distance_index.get_parent(distance_index.get_node_net_handle(n3->id())); + net_handle_t snarl_start = distance_index.get_node_from_sentinel( + distance_index.get_bound(distance_index.get_parent(chain2), false, true)); + if (distance_index.node_id(snarl_start) == 1) { + // Node 2 is strictly after node 3 + REQUIRE(distance_index.get_rank_in_parent(chain2) > distance_index.get_rank_in_parent(chain3)); + } else { + cerr << "Snarl is reversed so this test is no longer valid" << endl; + } + } + SECTION( "chain tip oriented forward" ) { + VG graph; + + Node* n1 = graph.create_node("GCA"); + Node* n2 = graph.create_node("T"); + Node* n3 = graph.create_node("G"); + Node* n4 = graph.create_node("C"); + Node* n5 = graph.create_node("CTGA"); + Node* n6 = graph.create_node("A"); + + Edge* e1 = graph.create_edge(n1, n3); + Edge* e2 = graph.create_edge(n3, n5); + Edge* e3 = graph.create_edge(n4, n3); + Edge* e4 = graph.create_edge(n4, n5); + Edge* e5 = graph.create_edge(n3, n2); + Edge* e6 = graph.create_edge(n2, n5); + Edge* e7 = graph.create_edge(n6, n4); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + net_handle_t chain2 = distance_index.get_parent(distance_index.get_node_net_handle(n2->id())); + net_handle_t chain3 = distance_index.get_parent(distance_index.get_node_net_handle(n3->id())); + net_handle_t snarl_start = distance_index.get_node_from_sentinel( + distance_index.get_bound(distance_index.get_parent(chain2), false, true)); + if (distance_index.node_id(snarl_start) == 1) { + // Node 2 is strictly after node 3 + REQUIRE(distance_index.get_rank_in_parent(chain2) > distance_index.get_rank_in_parent(chain3)); + } else { + cerr << "Snarl is reversed so this test is no longer valid" << endl; + } + } + SECTION( "chain tip oriented backward" ) { + VG graph; + + Node* n1 = graph.create_node("GCA"); + Node* n2 = graph.create_node("T"); + Node* n3 = graph.create_node("G"); + Node* n4 = graph.create_node("C"); + Node* n5 = graph.create_node("CTGA"); + Node* n6 = graph.create_node("A"); + + Edge* e1 = graph.create_edge(n1, n3); + Edge* e2 = graph.create_edge(n3, n5); + Edge* e3 = graph.create_edge(n4, n3, true, false); + Edge* e4 = graph.create_edge(n4, n5, true, false); + Edge* e5 = graph.create_edge(n3, n2); + Edge* e6 = graph.create_edge(n2, n5); + Edge* e7 = graph.create_edge(n6, n4, false, true); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + net_handle_t chain2 = distance_index.get_parent(distance_index.get_node_net_handle(n2->id())); + net_handle_t chain3 = distance_index.get_parent(distance_index.get_node_net_handle(n3->id())); + net_handle_t snarl_start = distance_index.get_node_from_sentinel( + distance_index.get_bound(distance_index.get_parent(chain2), false, true)); + if (distance_index.node_id(snarl_start) == 1) { + // Node 2 is strictly after node 3 + REQUIRE(distance_index.get_rank_in_parent(chain2) > distance_index.get_rank_in_parent(chain3)); + } else { + cerr << "Snarl is reversed so this test is no longer valid" << endl; + } + } + } TEST_CASE( "Snarl decomposition can handle chains with nodes in different directions", "[snarl_distance]" ) {