From 5088808273428813841a31e41ba564379501db95 Mon Sep 17 00:00:00 2001
From: sbstndb/sbstndbs
Date: Tue, 10 Jun 2025 17:16:41 +0200
Subject: [PATCH 1/4] fix: raise MPI error when min_level too low

- Previously, we could have min_level too low for the MPI comm size.
  This could lead to an unhandled error.
- Here is a workaround to ensure the user avoids the error.
- In the future, we could resolve the primary issue and then remove
  this workaround
---
 include/samurai/mesh.hpp | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/include/samurai/mesh.hpp b/include/samurai/mesh.hpp
index 6c3329d8d..6c5b3a89b 100644
--- a/include/samurai/mesh.hpp
+++ b/include/samurai/mesh.hpp
@@ -21,6 +21,39 @@
 namespace mpi = boost::mpi;
 #endif
 
+// Résolution d'un bug : le découpage est rectangulaire. S'il y a trop de rangs MPI et un min_level trop faible, alors il est parfois
+// impossible de décomposer le problème. Il faut alors restreindre un min_level limite
+bool to_lot_rank(std::size_t min_level)
+{
+    boost::mpi::communicator world;
+    int size = world.size(); // Nombre total de processus
+
+    // à vérifier :
+    // - en 1d ?
+    // - inférieur ou égal ? i.e. si min_level est à 0 alors j'ai une seule case ?
+    // valable sur un domaine [0,1] mais quid sur [1, b] ?
+    if (size <= pow(2, min_level))
+    {
+        return false;
+    }
+    else
+    {
+        return true;
+    }
+    return true;
+}
+
+void error_on_to_lot_rank(std::size_t min_level)
+{
+    auto error = to_lot_rank(min_level);
+    if (error)
+    {
+        std::cout << "ERROR: to lot MPI rank for this value of min_level. Please reduce MPI Size or raise min_level according to the rule size <= 2^min_level. "
+                  << std::endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+}
+
 namespace samurai
 {
 
@@ -246,6 +279,9 @@
 #ifdef SAMURAI_WITH_MPI
             partition_mesh(start_level, b);
             // load_balancing();
+
+            // resolve MPI issue when too lot MPI rank for 2^min_level
+            error_on_to_lot_rank(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif
@@ -277,6 +313,8 @@
 #ifdef SAMURAI_WITH_MPI
             partition_mesh(start_level, b);
             // load_balancing();
+
+            error_on_to_lot_rank(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif

From 08588ccd916bea161cb07ca20b28d6856d77f911 Mon Sep 17 00:00:00 2001
From: sbstndb/sbstndbs
Date: Tue, 10 Jun 2025 17:22:06 +0200
Subject: [PATCH 2/4] fix: rename

---
 include/samurai/mesh.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/samurai/mesh.hpp b/include/samurai/mesh.hpp
index 6c5b3a89b..7de3f427f 100644
--- a/include/samurai/mesh.hpp
+++ b/include/samurai/mesh.hpp
@@ -23,7 +23,7 @@
 
 // Résolution d'un bug : le découpage est rectangulaire. S'il y a trop de rangs MPI et un min_level trop faible, alors il est parfois
 // impossible de décomposer le problème. Il faut alors restreindre un min_level limite
-bool to_lot_rank(std::size_t min_level)
+bool check_size_min_level(std::size_t min_level)
 {
     boost::mpi::communicator world;
     int size = world.size(); // Nombre total de processus
@@ -43,9 +43,9 @@
     return true;
 }
 
-void error_on_to_lot_rank(std::size_t min_level)
+void error_on_min_level(std::size_t min_level)
 {
-    auto error = to_lot_rank(min_level);
+    auto error = check_size_min_level(min_level);
     if (error)
     {
         std::cout << "ERROR: to lot MPI rank for this value of min_level. Please reduce MPI Size or raise min_level according to the rule size <= 2^min_level. "
@@ -281,7 +281,7 @@
             // load_balancing();
 
             // resolve MPI issue when too lot MPI rank for 2^min_level
-            error_on_to_lot_rank(min_level);
+            error_on_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif
@@ -314,7 +314,7 @@
             partition_mesh(start_level, b);
             // load_balancing();
 
-            error_on_to_lot_rank(min_level);
+            error_on_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif

From eca9e85b2f69a7b2b0e1d4974788427b923bc67e Mon Sep 17 00:00:00 2001
From: sbstndb/sbstndbs
Date: Tue, 10 Jun 2025 17:39:50 +0200
Subject: [PATCH 3/4] fix: clean up

---
 include/samurai/mesh.hpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/samurai/mesh.hpp b/include/samurai/mesh.hpp
index 7de3f427f..da9f4c7cb 100644
--- a/include/samurai/mesh.hpp
+++ b/include/samurai/mesh.hpp
@@ -21,12 +21,13 @@
 namespace mpi = boost::mpi;
 #endif
 
-// Résolution d'un bug : le découpage est rectangulaire. S'il y a trop de rangs MPI et un min_level trop faible, alors il est parfois
-// impossible de décomposer le problème. Il faut alors restreindre un min_level limite
+#ifdef SAMURAI_WITH_MPI
+// Résolution d'un bug : S'il y a trop de rangs MPI et un min_level trop faible, alors il est parfois
+// impossible de décomposer le problème. Il faut alors imposer un min_level limite
 bool check_size_min_level(std::size_t min_level)
 {
     boost::mpi::communicator world;
-    int size = world.size(); // Nombre total de processus
+    int size = world.size();
 
     // à vérifier :
     // - en 1d ?
@@ -43,16 +44,16 @@ bool check_size_min_level(std::size_t min_level)
     return true;
 }
 
-void error_on_min_level(std::size_t min_level)
+void error_on_mpi_min_level(std::size_t min_level)
 {
     auto error = check_size_min_level(min_level);
     if (error)
     {
-        std::cout << "ERROR: to lot MPI rank for this value of min_level. Please reduce MPI Size or raise min_level according to the rule size <= 2^min_level. "
-                  << std::endl;
+        std::cout << "ERROR: Please reduce MPI Size or increase min_value according to the rule mpi_size <= 2^min_level." << std::endl;
         MPI_Abort(MPI_COMM_WORLD, 1);
     }
 }
+#endif
 
 namespace samurai
 {
@@ -281,7 +282,7 @@
             // load_balancing();
 
             // resolve MPI issue when too lot MPI rank for 2^min_level
-            error_on_min_level(min_level);
+            error_on_mpi_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif
@@ -314,7 +315,7 @@
             partition_mesh(start_level, b);
             // load_balancing();
 
-            error_on_min_level(min_level);
+            error_on_mpi_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif

From d4983794b8937ae567177dd688415e361c7ead5c Mon Sep 17 00:00:00 2001
From: sbstndb/sbstndbs
Date: Wed, 11 Jun 2025 11:18:56 +0200
Subject: [PATCH 4/4] fix: cleanup

---
 include/samurai/mesh.hpp | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/include/samurai/mesh.hpp b/include/samurai/mesh.hpp
index da9f4c7cb..d9f58485c 100644
--- a/include/samurai/mesh.hpp
+++ b/include/samurai/mesh.hpp
@@ -23,33 +23,23 @@ namespace mpi = boost::mpi;
 
 #ifdef SAMURAI_WITH_MPI
 // Résolution d'un bug : S'il y a trop de rangs MPI et un min_level trop faible, alors il est parfois
-// impossible de décomposer le problème. Il faut alors imposer un min_level limite
-bool check_size_min_level(std::size_t min_level)
+// impossible de décomposer le problème (--> segfault). Il faut alors imposer un min_level limite le temps d'un fix
+bool is_invalid_mpi_size(std::size_t min_level)
 {
     boost::mpi::communicator world;
-    int size = world.size();
+    int mpi_size = world.size();
 
     // à vérifier :
     // - en 1d ?
-    // - inférieur ou égal ? i.e. si min_level est à 0 alors j'ai une seule case ?
-    // valable sur un domaine [0,1] mais quid sur [1, b] ?
-    if (size <= pow(2, min_level))
-    {
-        return false;
-    }
-    else
-    {
-        return true;
-    }
-    return true;
+    return (mpi_size > std::pow(2, min_level));
 }
 
-void error_on_mpi_min_level(std::size_t min_level)
+void validate_mpi_min_level(std::size_t min_level)
 {
-    auto error = check_size_min_level(min_level);
-    if (error)
+    if (is_invalid_mpi_size(min_level))
     {
-        std::cout << "ERROR: Please reduce MPI Size or increase min_value according to the rule mpi_size <= 2^min_level." << std::endl;
+        std::cerr << "ERROR: MPI size (" << boost::mpi::communicator().size() << ") is too large for min_level = " << min_level
+                  << ". Please ensure that mpi_size <= 2^min_level." << std::endl;
         MPI_Abort(MPI_COMM_WORLD, 1);
     }
 }
@@ -280,9 +270,7 @@
 #ifdef SAMURAI_WITH_MPI
             partition_mesh(start_level, b);
             // load_balancing();
-
-            // resolve MPI issue when too lot MPI rank for 2^min_level
-            error_on_mpi_min_level(min_level);
+            validate_mpi_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif
@@ -315,7 +303,7 @@
             partition_mesh(start_level, b);
             // load_balancing();
 
-            error_on_mpi_min_level(min_level);
+            validate_mpi_min_level(min_level);
 #else
             this->m_cells[mesh_id_t::cells][start_level] = {start_level, b, approx_box_tol, scaling_factor_};
 #endif