From 4da30d9096a57560e111a873a57ec40acaf22b55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:24:04 +0200
Subject: [PATCH 1/3] feat(core): add RacksDB GPUs in fhpc_nodes

Add a gpus key in the fhpc_nodes dictionary with the number of GPUs per
model found in the RacksDB node type.
---
 firehpc/cluster.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/firehpc/cluster.py b/firehpc/cluster.py
index 62a7aca..cd963a4 100644
--- a/firehpc/cluster.py
+++ b/firehpc/cluster.py
@@ -175,6 +175,16 @@ def conf(
         # then grouped by node type.
         nodes = {}
 
+        def node_type_gpus(node_type):
+            result = {}
+            if not hasattr(node_type, "gpu"):
+                return result
+            for gpu in node_type.gpu:
+                if gpu.model not in result:
+                    result[gpu.model] = 0
+                result[gpu.model] += 1
+            return result
+
         def insert_in_node_type():
             for node_type in nodes[tag]:
                 if node_type["type"] == node.type.id:
@@ -186,6 +196,7 @@ def insert_in_node_type():
                     "sockets": node.type.cpu.sockets,
                     "cores": node.type.cpu.cores,
                     "memory": node.type.ram.dimm * (node.type.ram.size // 1024**2),
+                    "gpus": node_type_gpus(node.type),
                     "nodes": [node.name],
                 }
             )
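
Note on the data shape: the gpus value added to each fhpc_nodes entry is a
plain mapping from GPU model name, as declared in RacksDB, to the number of
GPUs of that model in the node type. As a minimal illustration with
hypothetical values (the node type id, model names, sizes and node names
below are examples, not taken from the patch), an entry built by this code
would look like:

    # Hypothetical fhpc_nodes entry with the "gpus" key added above.
    node_type_entry = {
        "type": "gpu",                    # RacksDB node type id (example)
        "sockets": 2,
        "cores": 32,
        "memory": 524288,                 # in MB
        "gpus": {"h100": 4, "l40s": 2},   # model -> count, from node_type_gpus()
        "nodes": ["cn001", "cn002"],
    }
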
From 4e28406675b10db8e3b5a2f9013f886142e94438 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:25:48 +0200
Subject: [PATCH 2/3] feat(conf): support gpu GRES in Slurm

Update the Slurm role to declare gpu GRES when defined in fhpc_nodes.

Emulating fake GPUs is possible thanks to an undocumented Slurm feature
and the fake_gpus.conf configuration file. This is not supported in
Slurm emulator mode, so it is disabled in this mode.

fix #39
---
 CHANGELOG.md                                 |  1 +
 conf/group_vars/all.yml                      |  2 ++
 conf/roles/slurm/defaults/main.yml           |  2 ++
 conf/roles/slurm/tasks/compute.yml           | 19 +++++++++++++++++++
 conf/roles/slurm/templates/fake_gpus.conf.j2 |  8 ++++++++
 conf/roles/slurm/templates/slurm.conf.j2     |  5 ++++-
 6 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 conf/roles/slurm/templates/fake_gpus.conf.j2

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 150986c..4b57cce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,6 +68,7 @@ and this project adheres to
   - Add `slurm_restd_port` variable in inventory to control slurmrestd
     TCP/IP listening port.
   - Support all Slurm-web to slurmrestd JWT authentication modes.
+  - Support gpu GRES in Slurm configuration (#39).
 - docs:
   - Add sysctl `fs.inotify.max_user_instances` value increase recommendation
     in README.md to avoid weird issue when launching many containers.
diff --git a/conf/group_vars/all.yml b/conf/group_vars/all.yml
index d094cf1..9c6eb5f 100644
--- a/conf/group_vars/all.yml
+++ b/conf/group_vars/all.yml
@@ -37,6 +37,8 @@ slurm_profiles:
   compute: compute
   server: admin
   login: login
+# gpu GRES is enabled if at least one compute node type has GPUs.
+slurm_with_gres_gpu: "{{ fhpc_nodes['compute'] | map(attribute='gpus') | map('length') | map('bool') | max }}"
 slurm_local_munge_key_file: "{{ fhpc_cluster_state_dir }}/munge/munge.key"
 slurm_local_slurm_key_file: "{{ fhpc_cluster_state_dir }}/slurm/slurm.key"
 slurm_local_mariadb_password_file: "{{ fhpc_cluster_state_dir }}/mariadb/mariadb.password"
diff --git a/conf/roles/slurm/defaults/main.yml b/conf/roles/slurm/defaults/main.yml
index 8d88036..4cb91fd 100644
--- a/conf/roles/slurm/defaults/main.yml
+++ b/conf/roles/slurm/defaults/main.yml
@@ -3,6 +3,8 @@ slurm_profiles: {}
 slurm_emulator: False
 slurm_with_accounting: true
 slurm_with_munge: false
+slurm_with_gres_gpu: false
+slurm_gpus_models_map: {}
 slurm_uid: 432 # picked randomly among 100-499
 slurm_gid: 432 # picked randomly among 100-499
 slurmrestd_uid: 433 # picked randomly among 100-499
diff --git a/conf/roles/slurm/tasks/compute.yml b/conf/roles/slurm/tasks/compute.yml
index 0235f57..601cc5a 100644
--- a/conf/roles/slurm/tasks/compute.yml
+++ b/conf/roles/slurm/tasks/compute.yml
@@ -12,6 +12,25 @@
     name: "{{ slurm_emulator | ternary(slurm_emulator_compute_packages, slurm_compute_packages) }}"
     state: latest
 
+- name: Set slurm current compute node fact
+  ansible.builtin.set_fact:
+    slurm_current_compute_node_type: "{{ slurm_compute_nodes | selectattr('nodes', 'contains', ansible_facts['hostname']) | first }}"
+  when: not slurm_emulator
+
+# Create fake_gpus.conf if the current node has GPUs.
+- name: Deploy slurm fake GPUs configuration file
+  ansible.builtin.template:
+    src: fake_gpus.conf.j2
+    dest: /etc/slurm/fake_gpus.conf
+    owner: slurm
+    group: slurm
+    mode: '0644'
+  notify:
+    - Restart slurmd
+  when:
+    - not slurm_emulator
+    - slurm_current_compute_node_type['gpus'] | length > 0
+
 # This is required on redhat based distributions as the service is not
 # automatically started by the RPM packages, and it does not hurt on Debian.
 - name: Ensure slurmd service is started
diff --git a/conf/roles/slurm/templates/fake_gpus.conf.j2 b/conf/roles/slurm/templates/fake_gpus.conf.j2
new file mode 100644
index 0000000..d4ca2c0
--- /dev/null
+++ b/conf/roles/slurm/templates/fake_gpus.conf.j2
@@ -0,0 +1,8 @@
+{% set total = slurm_current_compute_node_type['gpus'].values() | sum %}
+{% set ns = namespace(index=0) %}
+{% for model in slurm_current_compute_node_type['gpus'] %}
+{% for _ in range(slurm_current_compute_node_type['gpus'][model]) %}
+{{ slurm_gpus_models_map.get(model, 'nvidia') }}|1|~|{% for _index in range(total) %}{% if ns.index == _index %}-1{% else %}0{% endif %}{% if not loop.last %},{% endif %}{% endfor %}|/dev/nvidia{{ ns.index }}
+{% set ns.index = ns.index + 1 %}
+{% endfor %}
+{% endfor %}
diff --git a/conf/roles/slurm/templates/slurm.conf.j2 b/conf/roles/slurm/templates/slurm.conf.j2
index 88b47a9..5c163d6 100644
--- a/conf/roles/slurm/templates/slurm.conf.j2
+++ b/conf/roles/slurm/templates/slurm.conf.j2
@@ -30,9 +30,12 @@ ProctrackType=proctrack/linuxproc
 {% for key, value in slurm_params.items() %}
 {{ key }}={{ value }}
 {% endfor %}
+{% if slurm_with_gres_gpu %}
+GresTypes=gpu
+{% endif %}
 
 {% for node_type in slurm_compute_nodes %}
-NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }} {% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
+NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }}{% if not slurm_emulator and node_type.gpus %} Gres={% for model, nb in node_type.gpus.items() %}gpu:{{ slurm_gpus_models_map.get(model, 'nvidia') }}:{{ nb }}{% if not loop.last %},{% endif %}{% endfor %}{% endif %}{% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
 {% endfor %}
 {% for slurm_partition in slurm_partitions %}
 PartitionName={{ slurm_partition.name }} Nodes={{ slurm_partition.nodes }} Default={{ "YES" if "default" in slurm_partition and slurm_partition.default else "NO" }}{% for param, value in slurm_partition.params.items() %} {{param}}={{value}}{% endfor %}
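
For illustration, assuming a compute node type with two GPUs of a single
model that is absent from slurm_gpus_models_map (so the GRES type name falls
back to 'nvidia'), and hypothetical node names and sizes, the node
declaration rendered by slurm.conf.j2 would look like:

    NodeName=cn[1-2] Sockets=2 CoresPerSocket=32 RealMemory=524288 Gres=gpu:nvidia:2 State=UNKNOWN

and the generated fake_gpus.conf would contain one line per emulated GPU,
with -1 at the GPU's own index in the generated list and a distinct device
file per GPU:

    nvidia|1|~|-1,0|/dev/nvidia0
    nvidia|1|~|0,-1|/dev/nvidia1
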
From cb27665e842e6bfc688878d92c7f23012b0f4a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:27:57 +0200
Subject: [PATCH 3/3] feat(load): request gpu GRES for jobs

Request GPU allocations on partitions with gpu GRES.
---
 CHANGELOG.md    |  1 +
 firehpc/load.py | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4b57cce..1e91805 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,6 +43,7 @@ and this project adheres to
     submitting less jobs when not at work (#29).
   - Add `--time-off-factor` option to control by how much the load is divided
     outside of business hours.
+  - Request GPU allocations on partitions with gpu GRES.
 - conf:
   - Add possibility to define additional QOS and alternative partitions in
     Slurm.
diff --git a/firehpc/load.py b/firehpc/load.py
index b9d9613..26837b9 100644
--- a/firehpc/load.py
+++ b/firehpc/load.py
@@ -31,7 +31,9 @@
 JOBS_TIMELIMITS = (["10", "30", "1:0:0", "6:0:0"], [50, 5, 2, 1])
 JOBS_DURATIONS = ([360, 540, 720, 1200], [50, 5, 2, 1])
 
-ClusterPartition = namedtuple("ClusterPartition", ["name", "nodes", "cpus", "time"])
+ClusterPartition = namedtuple(
+    "ClusterPartition", ["name", "nodes", "cpus", "gpus", "time"]
+)
 
 
 def load_clusters(
@@ -140,6 +142,28 @@ def _get_cluster_config(self) -> None:
         ):
             self.accounting = True
 
+    def _get_partition_gpus(self, partition):
+        """Return the total number of GPU GRES in a partition."""
+        result = 0
+        stdout, stderr = self.ssh.exec(
+            [f"admin.{self.cluster.name}", "scontrol", "show", "nodes", "--json"]
+        )
+        try:
+            for node in json.loads(stdout)["nodes"]:
+                if partition not in node["partitions"]:
+                    continue
+                if not len(node["gres"]):
+                    continue
+                gres = node["gres"].split(":")
+                if gres[0] != "gpu":
+                    continue
+                result += int(gres[2])
+        except json.decoder.JSONDecodeError as err:
+            raise FireHPCRuntimeError(
+                f"Unable to retrieve nodes from cluster {self.cluster.name}: {str(err)}"
+            ) from err
+        return result
+
     def _get_partitions(self) -> list[str]:
         stdout, stderr = self.ssh.exec(
             [f"admin.{self.cluster.name}", "scontrol", "show", "partitions", "--json"]
@@ -150,6 +174,7 @@ def _get_partitions(self) -> list[str]:
                 partition["name"],
                 partition["nodes"]["total"],
                 partition["cpus"]["total"],
+                self._get_partition_gpus(partition["name"]),
                 partition["maximums"]["time"],
             )
             for partition in json.loads(stdout)["partitions"]
@@ -288,6 +313,8 @@ def random_power_two(limit: int) -> int:
         # of tasks.
         if self.select_type == "select/linear":
             cmd.extend(["--nodes", str(random_power_two(partition.nodes))])
+        elif partition.gpus:
+            cmd.extend(["--gpus", str(random_power_two(partition.gpus))])
         else:
            cmd.extend(["--ntasks", str(random_power_two(partition.cpus))])
 
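
Note on the parsing in _get_partition_gpus(): the method splits the gres
string of each node reported by `scontrol show nodes --json` on ':' and only
counts entries whose first field is gpu. A minimal sketch with a hypothetical
node entry (values are illustrative, only the fields actually read by the
method are shown):

    # Hypothetical node entry as returned by `scontrol show nodes --json`.
    node = {"partitions": ["gpu"], "gres": "gpu:nvidia:4"}

    gres = node["gres"].split(":")  # -> ["gpu", "nvidia", "4"]
    if gres[0] == "gpu":
        count = int(gres[2])        # -> 4 GPUs counted for partition "gpu"

On the submission side, a partition with a non-zero gpus count then gets jobs
requesting a random power-of-two number of its GPUs with the --gpus option
(via the existing random_power_two() helper), instead of a --ntasks request
sized on its CPUs.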