From b2df0160c6651400a758db562588bc34e94bf020 Mon Sep 17 00:00:00 2001 From: VassTs Date: Sun, 12 Jan 2025 19:19:40 +0200 Subject: [PATCH] =?UTF-8?q?stitched=20parallel=20distances=20and=20new=20g?= =?UTF-8?q?reedy=201ou=20=CF=80=CE=B1=CF=81=CE=B1=CE=B4=CE=BF=CF=84=CE=AD?= =?UTF-8?q?=CE=BF=CF=85=20=CE=B3=CE=B9=CE=B1=20distances,=20with=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/greedysearch.hpp | 17 +- include/stitchedVamanaParallelDistances.hpp | 8 + include/vamana.hpp | 2 + src/greedysearch.cpp | 63 +++++++ src/stitchedVamanaParallelDistances.cpp | 104 ++++++++++++ src/vamana.cpp | 177 ++++++++++++++++++++ tests/test_greedysearch.cpp | 162 +++++++++++++++++- 7 files changed, 531 insertions(+), 2 deletions(-) create mode 100644 include/stitchedVamanaParallelDistances.hpp create mode 100644 src/stitchedVamanaParallelDistances.cpp diff --git a/include/greedysearch.hpp b/include/greedysearch.hpp index 22f87f6..66139bf 100644 --- a/include/greedysearch.hpp +++ b/include/greedysearch.hpp @@ -1,8 +1,23 @@ #include #include "../include/graph.hpp" +#include using namespace std; // L_set, V_set should be empty when calling the function // L_set, V_set will be full once function call terminated -void GreedySearch(Node *start_node, vector &queryCoords, int k, int L, set &L_set, set &V_set); \ No newline at end of file +void GreedySearch(Node *start_node, vector &queryCoords, int k, int L, set &L_set, set &V_set); + +// Hash function used for: unordered_map, double, PairHash> nodePairMap +struct PairHash { + size_t operator()(const pair& p) const { + auto h1 = hash()(p.first); + auto h2 = hash()(p.second); + // Combine hashes in an order-independent way for symmetry + return h1 ^ h2; // xor + } +}; + +// Δύο είναι οι διαφορές από τον από πάνω greedy search: 1. Το query δίνεται σε Node* και όχι σε vector, 2. 
Λαμβάνει ως παράμετρο τις αποθηκευμένες αποστάσεις (nodePairMap) +// IMPORTANT: Γι' αυτούς του δύο λόγους, καλείται *μόνο* από τον vamana, και *όχι* από την first_main. +void GreedySearchIndex(Node* start_node, Node* query_node, int k, int L, set &L_set, set &V_set, unordered_map, double, PairHash>& nodePairMap); \ No newline at end of file diff --git a/include/stitchedVamanaParallelDistances.hpp b/include/stitchedVamanaParallelDistances.hpp new file mode 100644 index 0000000..c52d426 --- /dev/null +++ b/include/stitchedVamanaParallelDistances.hpp @@ -0,0 +1,8 @@ +#ifndef STITCHEDVAMANAPARALLELDISTANCES_HPP +#define STITCHEDVAMANAPARALLELDISTANCES_HPP + +#include "graph.hpp" + +Graph stitchedVamanaParallelDistances(vector> &coords, set F, double a, int L_small, int R_small, int R_stitched, map &medoids); + +#endif \ No newline at end of file diff --git a/include/vamana.hpp b/include/vamana.hpp index 38342cc..86df124 100644 --- a/include/vamana.hpp +++ b/include/vamana.hpp @@ -3,3 +3,5 @@ int Vamana(Graph &graph, vector &coords, int R, double a, int int_L); // added parameter f = label, to pass to generate_graph // so that it creates a graph with nodes of label f. 
+ +int VamanaParallelDistances(Graph &graph, vector &coords, int R, double a, int int_L); \ No newline at end of file diff --git a/src/greedysearch.cpp b/src/greedysearch.cpp index 1264edc..1acee57 100644 --- a/src/greedysearch.cpp +++ b/src/greedysearch.cpp @@ -107,5 +107,68 @@ void GreedySearch(Node* start_node, vector &queryCoords, int k, int L, s L_set.insert(it->second); // `it->second` is the node } + return; +} + +void GreedySearchIndex(Node* start_node, Node* query_node, int k, int L, set &L_set, set &V_set, unordered_map, double, PairHash>& nodePairMap) { + + // All the elements in a set have unique values + // by default set is sorted in ascending order + assert(L_set.empty() == true); // given L_set should be empty + assert(V_set.empty() == true); // given V_set should be empty + assert(L >= k); + assert(start_node != NULL); + assert(query_node->getCoordinates().size() > 0); + + unordered_map nodeMap; // Main storage + set, Compare> sortedSet; // Secondary sorted view of main storage + set, Compare> LminusV; + + + // Create an order-independent key + pair key = {min(start_node, query_node), max(start_node, query_node)}; // The hash and key use min and max to ensure that {node1, node2} is treated the same as {node2, node1}. This eliminates the need to insert the symmetric pair manually. + assert(nodePairMap.find(key) != nodePairMap.end()); // assert if key doesn't exist + double dist = nodePairMap[key]; // already precomputed! + insert(nodeMap, sortedSet, start_node, dist, L); // Initialization of L_set + // V_set is empty + + // L\V = {start} \ {} = {start} + LminusV.emplace(dist, start_node); // Inserts a new pair in the set, if unique. This new pair is constructed in place using args as the arguments for its construction. + + while(LminusV.empty() == false) { // while LminusV != {} + + auto first_pair = *LminusV.begin(); // LminusV is sorted based on distance in ascending order. 
Therefore, first element of LminusV is the min we are looking for + Node* p_star = first_pair.second; // The 'Node*' + + // Update V_set [V = V U p*] + V_set.insert(p_star); // if p_star is already in V_set, p_star won't be inserted again + + + // Update L_set [ L = L U Nout(p*) ] + list p_star_out = p_star->getEdges(); // out-neighbors of p* + for(auto node : p_star_out) { // Insert all of out-neighbors of p* into L set + + pair key = {min(node, query_node), max(node, query_node)}; // Create an order-independent key + assert(nodePairMap.find(key) != nodePairMap.end()); // assert if key doesn't exist + double dist = nodePairMap[key]; // already precomputed! + insert(nodeMap, sortedSet, node, dist, L); + } + + // Update of L to retain top L elements of vector happens immediately whenever we insert an element in "L set" + + LminusV = set_difference(sortedSet, V_set); + } + + // We update L to retain top k elements of vector + int my_k = k; + if(static_cast(sortedSet.size()) < k) { + my_k = sortedSet.size(); + } + + int i = 0; + for (auto it = sortedSet.begin(); i < my_k; ++it, ++i) { + L_set.insert(it->second); // `it->second` is the node + } + return; } \ No newline at end of file diff --git a/src/stitchedVamanaParallelDistances.cpp b/src/stitchedVamanaParallelDistances.cpp new file mode 100644 index 0000000..1158c84 --- /dev/null +++ b/src/stitchedVamanaParallelDistances.cpp @@ -0,0 +1,104 @@ +/* StitchedVamana parallel implementation file: + Contains the parallel implementation of stitchedVamana with pthreads. + + stitchedVamana returns a stitched graph, that consists of a collection of subgraphs, one for each label. + if there are no nodes for a label, then the graph would be empty. 
+*/ + +#include "../include/stitchedVamana.hpp" +#include "../include/stitchedVamanaParallel.hpp" +#include "../include/vamana.hpp" +#include "../include/filteredrobustprune.hpp" +#include "../include/generate_graph.hpp" +#include + +using namespace std; + +struct Thread_params { + int label; + vector Pf; + double a; + int R_small; + int L_small; + Graph *graph; // pointer to the main graph + map *medoids; // pointer to medoids map + pthread_mutex_t *mutex_union; // mutex for graphUnion - we will need the mutexes for the functions that are accessed by the threads (shared functions) + pthread_mutex_t *mutex_medoid; // mutex for store_medoid +}; + +// thread function -- is done by each thread: the work of processing one label: make Gf, stitch it, store medoid. +static void *processLabel(void *args) +{ + Thread_params *params = static_cast(args); + + // create a subgraph for this label + Graph Gf; + int medoidId = VamanaParallelDistances(Gf, params->Pf, params->R_small, params->a, params->L_small); + + // merge (stitch) the subgraph into the main graph + pthread_mutex_lock(params->mutex_union); + params->graph->graphUnion(std::move(Gf)); + pthread_mutex_unlock(params->mutex_union); + + Gf.clear(); + + // store the medoid node + if (medoidId != -1) + { + pthread_mutex_lock(params->mutex_medoid); + store_medoid(*params->graph, *params->medoids, params->label, medoidId); + pthread_mutex_unlock(params->mutex_medoid); + } + + return nullptr; +} + + +Graph stitchedVamanaParallelDistances(vector> &coords, set F, double a, int L_small, int R_small, int R_stitched, map &medoids) +{ + Graph G; + unordered_map> Fx = compute_Fx(coords); + unordered_map> PfMap = compute_PfMap(coords, F); + + // initialize pthreads + pthread_t threads[F.size()]; + Thread_params params[F.size()]; + pthread_mutex_t mutex_union = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_t mutex_medoid = PTHREAD_MUTEX_INITIALIZER; + + int i = 0; + for (int f : F) + { + vector Pf = PfMap[f]; + + // if Pf is empty, 
then we were given no nodes with this label + if (Pf.empty()) + { + // continue without calling vamana to avoid extra work + continue; + } + + // prepare thread parameters + params[i] = {f, Pf, a, R_small, L_small, &G, &medoids, &mutex_union, &mutex_medoid}; + + // create a thread + if (pthread_create(&threads[i], nullptr, processLabel, &params[i]) != 0) { + cerr << "Error creating thread for label " << f << endl; + } + + i++; + } + + // wait for all threads to complete + for (int j = 0; j < i; j++) + { + pthread_join(threads[j], nullptr); + } + + pthread_mutex_destroy(&mutex_union); + pthread_mutex_destroy(&mutex_medoid); + + // connect_subgraphs(G, PfMap); + + return G; +} \ No newline at end of file diff --git a/src/vamana.cpp b/src/vamana.cpp index a7c8d28..2cdda19 100644 --- a/src/vamana.cpp +++ b/src/vamana.cpp @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include "../include/utility.hpp" // due to use of euclidean_distance_of_nodes and v // coords = coords of all vectors in P dataset (Graph) // randomPermutation = σ @@ -103,5 +106,179 @@ int Vamana(Graph &graph, vector &coords, int R, double a, int int_L) } } + return coords[medoidNodeId]->getId(); +} + +#define THREADS_NO 4 + +// thread parameters for distance calculation +struct arguments { + map *nodes; + map::iterator it_begin; + map::iterator it_end; + unordered_map, double, PairHash> *nodePairMap; +}; + +// Συνάρτηση που υπολογίζει τις παραμέτρους που θα λάβει κάθε thread που τρέχει την συνάρτηση childThread +static void fill_up_args(map &nodes, vector, double, PairHash>> &maps, int threads_no, struct arguments *array) { + + int size = nodes.size(); + int chunk = size / threads_no; // (πόσα nodes έχει κάθε thread) + if(size % threads_no != 0) + chunk++; + + int has_done = 0; + for(int i = 0; i < threads_no; i++) { + int offset = chunk; // 4 + if(has_done + offset > size) + offset = size - has_done; + + + if(i == 0) + array[i].it_begin = nodes.begin(); + else { + array[i].it_begin = 
array[i-1].it_end; + } + + array[i].it_end = array[i].it_begin; + advance(array[i].it_end, offset); + has_done += offset; + + array[i].nodes = &nodes; + array[i].nodePairMap = &(maps[i]); + } + return; +} + +// Συνάρτηση που τρέχει ένα thread που υπολογίζει τις αποστάσεις. Λαμβάνει ως παράμετρο ένα struct arguments* +static void* childThread (void* args) { + struct arguments *args_c = (struct arguments*)args; // args_child + + for (auto it = args_c->it_begin; it != args_c->it_end; ++it) { // [0. size_div_2) + Node* node = it->second; + int label = node->getLabel(); + + for (auto it2 = it; it2 != args_c->nodes->end(); ++it2) { // we start from it2 = it instead of it2 = it + 1, because we want to store that the distance of one node from itself is zero + Node* node2 = it2->second; + int label2 = node2->getLabel(); + + // Skip if nodes don't have the same label + if (label != label2) { + continue; + assert(0); + } + + // Create an order-independent key + pair key = {min(node, node2), max(node, node2)}; // The hash and key use min and max to ensure that {node1, node2} is treated the same as {node2, node1}. This eliminates the need to insert the symmetric pair manually. 
+ double dist = euclidean_distance_of_nodes(node, node2); + + // Insert if the key does not exist + assert(args_c->nodePairMap->find(key) == args_c->nodePairMap->end()); // shouldn't exist already + (*args_c->nodePairMap)[key] = dist; // Insert the key-value pair + } + } + + pthread_exit(NULL); +} + + +int VamanaParallelDistances(Graph &graph, vector &coords, int R, double a, int int_L) +{ + vector> actual_coords; + + for (Node *node : coords) { + actual_coords.push_back(node->getCoordinates()); + } + + generate_graph_parallel(graph, coords, R); + + int medoidNodeId = findMedoid(actual_coords); + // int medoidNodeId = parallel_findMedoid(actual_coords); + // int medoidNodeId = parallel_2_findMedoid(actual_coords); + + Node *medoid = graph.getNode(coords[medoidNodeId]->getId()); + + if (medoid == nullptr) + { + cerr << "Error: Medoid node with ID " << medoidNodeId << " not found in the graph." << endl; + return -1; + } + +// Αποθήκευση αποστάσεων + // Vector to hold THREADS_NO unordered_map objects + vector, double, PairHash>> maps(THREADS_NO); // we calculate & store only the distances between nodes of the same label + map nodes = graph.getAdjList(); + + pthread_t thread_id[THREADS_NO]; // Store the threadIDs in an array + struct arguments array[THREADS_NO]; // Store the struct arguments of each thread in an array + fill_up_args(nodes, maps, THREADS_NO, array); // Prepare arguments for each thread + + // Create a new thread. The new thread will run the childThread function. 
+ for (int threadCount = 0; threadCount < THREADS_NO; threadCount++){ + pthread_create (&thread_id[threadCount], NULL, &childThread, &(array[threadCount])); + } + + for (int threadCount = 0; threadCount< THREADS_NO; threadCount++) { // Join after all thread creation + void *status; + pthread_join(thread_id[threadCount], &status); + } + + // Combine all maps into one + unordered_map, double, PairHash> nodePairMap; + for (const auto& map : maps) { + nodePairMap.insert(map.begin(), map.end()); + } + // Size of nodePairMap should be 11029349 (without the zero distances added) +// Τέλος αποθήκευσης αποστάσεων + + // make a random permutation of 1..n, to traverse the nodes in a random order + vector randomPermutation(coords.size()); // Size of vector randomPermutation = number of points in dataset = number of vectors in coords + iota(randomPermutation.begin(), randomPermutation.end(), 0); // fills with numbers from 0 to coords.size() - 1 + + // obtain a time-based seed: + unsigned seed = chrono::system_clock::now().time_since_epoch().count(); + shuffle(randomPermutation.begin(), randomPermutation.end(), default_random_engine(seed)); + + for (int point_id : randomPermutation) + { + set V_set; + set L_set; + + Node *sigma_i = graph.getNode(coords[point_id]->getId()); + if (sigma_i == nullptr) + { + cerr << "Error: Node with ID " << coords[point_id]->getId() << " not found in the graph." 
<< endl; + continue; // skip this iteration or handle the error appropriately + } + + GreedySearchIndex(medoid, sigma_i, 1, int_L, L_set, V_set, nodePairMap); + RobustPrune(sigma_i, V_set, a, R); + + list sigma_i_out = sigma_i->getEdges(); + for (auto node_j : sigma_i_out) + { + list j_out = node_j->getEdges(); + + set j_out_sigma_i(j_out.begin(), j_out.end()); + j_out_sigma_i.insert(sigma_i); + + if (static_cast(j_out_sigma_i.size()) > R) + { + RobustPrune(node_j, j_out_sigma_i, a, R); + } + else + { + auto it = find(j_out.begin(), j_out.end(), sigma_i); + if (it == j_out.end()) + { + // sigma_i doesn't exist in j_out + node_j->mutex_lock(); // we are going to edit a node's list of neighbors, so we need to lock the mutex + node_j->addEdge(sigma_i); + node_j->mutex_unlock(); + } + } + } + } + return coords[medoidNodeId]->getId(); } \ No newline at end of file diff --git a/tests/test_greedysearch.cpp b/tests/test_greedysearch.cpp index 1f3aee0..866291d 100644 --- a/tests/test_greedysearch.cpp +++ b/tests/test_greedysearch.cpp @@ -223,4 +223,164 @@ TEST_CASE("GreedySearch in fully disconnected big graph") { // for(int i = 0; i < fvecs_d_base_size; i++) { // delete nodes[i]; // } -// } \ No newline at end of file +// } + +TEST_CASE("GreedySearchIndex in small connected graph") { + Graph graph; + + // list initialization syntax available for C++11 and newer + // {1.0, 1.0} initializes the std::vector with two elements. 
+ // {} creates an empty std::list + Node n0 = Node(0, {1.0, 1.0}, {}, {}); + Node n1 = Node(1, {1.0, 3.0}, {}, {}); + Node n2 = Node(2, {2.0, 2.0}, {}, {}); + + graph.addNode(&n0); + graph.addNode(&n1); + graph.addNode(&n2); + + graph.addEdge(0, 1); + graph.addEdge(1, 0); + graph.addEdge(0, 2); + graph.addEdge(2, 0); + graph.addEdge(1, 2); + graph.addEdge(2, 1); + + set L_set; + set V_set; + Node query_node = Node(3, {2.0, 2.1}, {}, {}); + graph.addNode(&query_node); // query_node should be in graph for greedyIndex to work + + set set_test; + set_test.insert(&n0); + set_test.insert(&n1); + set_test.insert(&n2); + + // Fill up nodePairMap + map nodes = graph.getAdjList(); + unordered_map, double, PairHash> nodePairMap; // Create an order-independent key + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + Node* node = it->second; + int label = node->getLabel(); + + for (auto it2 = it; it2 != nodes.end(); ++it2) { // we start from it2 = it instead of it2 = it + 1, because we want to store that the distance of one node from itself is zero + Node* node2 = it2->second; + int label2 = node2->getLabel(); + + // Skip if nodes don't have the same label + if (label != label2) + continue; + + // Create an order-independent key + pair key = {min(node, node2), max(node, node2)}; // The hash and key use min and max to ensure that {node1, node2} is treated the same as {node2, node1}. This eliminates the need to insert the symmetric pair manually. 
+ double dist = euclidean_distance_of_nodes(node, node2); + + // Insert if the key does not exist + assert(nodePairMap.find(key) == nodePairMap.end()); // shouldn't exist already + nodePairMap[key] = dist; // Insert the key-value pair + } + } + +// 1st run k = 1, L = 3 (L = number of nodes in graph): + GreedySearchIndex(&n0, &query_node, 1, 3, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 1); + + auto it = L_set.begin(); // iterator points to a Node* + REQUIRE((*it)->getId() == 2); // node 2 should be closest neighbor + REQUIRE(V_set == set_test); // test of V_set + +// 2nd run k = L = 3 (L = number of nodes in graph): + L_set.clear(); // L_set, V_set should be empty before new call of greedy + V_set.clear(); + GreedySearchIndex(&n0, &query_node, 3, 3, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 3); + REQUIRE(L_set == set_test); + REQUIRE(V_set == set_test); + +// 3rd run k = 1, L = 2 (L < number of nodes in graph): + L_set.clear(); + V_set.clear(); + GreedySearchIndex(&n0, &query_node, 1, 2, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 1); + + it = L_set.begin(); // iterator points to a Node* + REQUIRE((*it)->getId() == 2); // node 2 should be closest neighbor + + REQUIRE(V_set == set_test); + +// 4th run k = 1, L = 5 (L > number of nodes in graph): + L_set.clear(); + V_set.clear(); + GreedySearchIndex(&n0, &query_node, 1, 5, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 1); + + it = L_set.begin(); // iterator points to a Node* + REQUIRE((*it)->getId() == 2); // node 2 should be closest neighbor + + REQUIRE(V_set == set_test); +} + +TEST_CASE("GreedySearchIndex in fully disconnected small graph") { + Graph graph; + + Node n0 = Node(0, {1.0, 1.0}, {}, {}); + Node n1 = Node(1, {1.0, 3.0}, {}, {}); + Node n2 = Node(2, {2.0, 2.0}, {}, {}); + + graph.addNode(&n0); + graph.addNode(&n1); + graph.addNode(&n2); + + set L_set; + set V_set; + Node query_node = Node(3, {2.0, 2.1}, {}, {}); + graph.addNode(&query_node); // query_node should be 
in graph for greedyIndex to work + + // Fill up nodePairMap + map nodes = graph.getAdjList(); + unordered_map, double, PairHash> nodePairMap; // Create an order-independent key + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + Node* node = it->second; + int label = node->getLabel(); + + for (auto it2 = it; it2 != nodes.end(); ++it2) { // we start from it2 = it instead of it2 = it + 1, because we want to store that the distance of one node from itself is zero + Node* node2 = it2->second; + int label2 = node2->getLabel(); + + // Skip if nodes don't have the same label + if (label != label2) + continue; + + // Create an order-independent key + pair key = {min(node, node2), max(node, node2)}; // The hash and key use min and max to ensure that {node1, node2} is treated the same as {node2, node1}. This eliminates the need to insert the symmetric pair manually. + double dist = euclidean_distance_of_nodes(node, node2); + + // Insert if the key does not exist + assert(nodePairMap.find(key) == nodePairMap.end()); // shouldn't exist already + nodePairMap[key] = dist; // Insert the key-value pair + } + } + +// 1st run k = 1, L = 3 + GreedySearchIndex(&n0, &query_node, 1, 3, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 1); + + auto it = L_set.begin(); // iterator points to a Node* + REQUIRE((*it)->getId() == 0); // node 0 should be closest neighbor + + it = V_set.begin(); + REQUIRE((*it)->getId() == 0); + +// 2nd run k = 2, L = 3 (should have the same results as the previous run) + L_set.clear(); + V_set.clear(); + + GreedySearchIndex(&n0, &query_node, 2, 3, L_set, V_set, nodePairMap); + REQUIRE(L_set.size() == 1); // one neighbor! not 2! + + it = L_set.begin(); // iterator points to a Node* + REQUIRE((*it)->getId() == 0); // node 0 should be closest neighbor + + it = V_set.begin(); + REQUIRE((*it)->getId() == 0); +} \ No newline at end of file