2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -43,6 +43,7 @@ and this project adheres to
submitting fewer jobs when not at work (#29).
- Add `--time-off-factor` option to control by how much the load is divided
outside of business hours.
- Request GPU allocations on partitions with gpu GRES.
- conf:
- Add possibility to define additional QOS and alternative partitions in
Slurm.
@@ -68,6 +69,7 @@ and this project adheres to
- Add `slurm_restd_port` variable in inventory to control slurmrestd TCP/IP
listening port.
- Support all Slurm-web to slurmrestd JWT authentication modes.
- Support gpu GRES in Slurm configuration (#39).
- docs:
- Add sysctl `fs.inotify.max_user_instances` value increase recommendation in
README.md to avoid weird issues when launching many containers.
2 changes: 2 additions & 0 deletions conf/group_vars/all.yml
@@ -37,6 +37,8 @@ slurm_profiles:
compute: compute
server: admin
login: login
# GRES GPU is enabled if at least one compute node type has GPUs
slurm_with_gres_gpu: "{{ fhpc_nodes['compute'] | map(attribute='gpus') | map('length') | map('bool') | max }}"
slurm_local_munge_key_file: "{{ fhpc_cluster_state_dir }}/munge/munge.key"
slurm_local_slurm_key_file: "{{ fhpc_cluster_state_dir }}/slurm/slurm.key"
slurm_local_mariadb_password_file: "{{ fhpc_cluster_state_dir }}/mariadb/mariadb.password"
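The Jinja2 filter chain above reduces to an "any node type has GPUs" boolean. A minimal Python sketch of that logic, using a hypothetical, trimmed-down `fhpc_nodes` structure (only the `gpus` key matters here):

```python
# Minimal sketch of the boolean computed by the Jinja2 filter chain above.
# The fhpc_nodes structure below is hypothetical and trimmed to the 'gpus' key.
fhpc_nodes = {
    "compute": [
        {"type": "std", "gpus": {}},           # node type without GPUs
        {"type": "gpu", "gpus": {"h100": 2}},  # node type with GPUs
    ]
}

# map('length') | map('bool') | max  ->  True as soon as one node type has GPUs
slurm_with_gres_gpu = max(bool(len(t["gpus"])) for t in fhpc_nodes["compute"])
print(slurm_with_gres_gpu)  # True
```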
2 changes: 2 additions & 0 deletions conf/roles/slurm/defaults/main.yml
@@ -3,6 +3,8 @@ slurm_profiles: {}
slurm_emulator: False
slurm_with_accounting: true
slurm_with_munge: false
slurm_with_gres_gpu: false
slurm_gpus_models_map: {}
slurm_uid: 432 # picked randomly among 100-499
slurm_gid: 432 # picked randomly among 100-499
slurmrestd_uid: 433 # picked randomly among 100-499
19 changes: 19 additions & 0 deletions conf/roles/slurm/tasks/compute.yml
@@ -12,6 +12,25 @@
name: "{{ slurm_emulator | ternary(slurm_emulator_compute_packages, slurm_compute_packages) }}"
state: latest

- name: Set slurm current compute node fact
ansible.builtin.set_fact:
slurm_current_compute_node_type: "{{ slurm_compute_nodes | selectattr('nodes', 'contains', ansible_facts['hostname']) | first }}"
when: not slurm_emulator

# Create fake_gpus.conf if the current node type has GPUs
- name: Deploy slurm fake GPUs configuration file
ansible.builtin.template:
src: fake_gpus.conf.j2
dest: /etc/slurm/fake_gpus.conf
owner: slurm
group: slurm
mode: '0644'
notify:
- Restart slurmd
when:
- not slurm_emulator
- slurm_current_compute_node_type['gpus'] | length > 0

# This is required on redhat based distributions as the service is not
# automatically started by the RPM packages, and it does not hurt on Debian.
- name: Ensure slurmd service is started
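The `selectattr` lookup above picks the compute node type whose `nodes` list contains the current host. A rough Python equivalent, with a hypothetical, trimmed-down `slurm_compute_nodes` list and hostname:

```python
# Rough Python equivalent of the selectattr() lookup above; the node types and
# hostname below are hypothetical.
slurm_compute_nodes = [
    {"type": "std", "gpus": {}, "nodes": ["cn1", "cn2"]},
    {"type": "gpu", "gpus": {"h100": 2}, "nodes": ["gn1"]},
]
hostname = "gn1"

# selectattr('nodes', 'contains', ansible_facts['hostname']) | first
current = next(t for t in slurm_compute_nodes if hostname in t["nodes"])
print(current["type"])  # gpu

# The fake_gpus.conf template is only deployed when current["gpus"] is non-empty.
print(len(current["gpus"]) > 0)  # True
```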
8 changes: 8 additions & 0 deletions conf/roles/slurm/templates/fake_gpus.conf.j2
@@ -0,0 +1,8 @@
{% set total = slurm_current_compute_node_type['gpus'].values() | sum %}
{% set ns = namespace(index=0) %}
{% for model in slurm_current_compute_node_type['gpus'] %}
{% for _ in range(slurm_current_compute_node_type['gpus'][model]) %}
{{ slurm_gpus_models_map.get(model, 'nvidia') }}|1|~|{% for _index in range(total) %}{% if ns.index == _index %}-1{% else %}0{% endif %}{% if not loop.last %},{% endif %}{% endfor %}|/dev/nvidia{{ ns.index }}
{% set ns.index = ns.index + 1 %}
{% endfor %}
{% endfor %}
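For reference, a minimal Python sketch reproducing what the template above renders, assuming a hypothetical node type with two GPUs of a single "h100" model and an empty `slurm_gpus_models_map` (so the GRES type falls back to `nvidia`):

```python
# Minimal sketch of what fake_gpus.conf.j2 renders; node_type_gpus and
# gpus_models_map below are hypothetical.
node_type_gpus = {"h100": 2}   # model -> count
gpus_models_map = {}           # empty slurm_gpus_models_map

total = sum(node_type_gpus.values())
index = 0
for model, count in node_type_gpus.items():
    for _ in range(count):
        # one "-1" in the links column for the GPU itself, "0" for the others
        links = ",".join("-1" if i == index else "0" for i in range(total))
        print(f"{gpus_models_map.get(model, 'nvidia')}|1|~|{links}|/dev/nvidia{index}")
        index += 1

# Output:
# nvidia|1|~|-1,0|/dev/nvidia0
# nvidia|1|~|0,-1|/dev/nvidia1
```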
5 changes: 4 additions & 1 deletion conf/roles/slurm/templates/slurm.conf.j2
@@ -30,9 +30,12 @@ ProctrackType=proctrack/linuxproc
{% for key, value in slurm_params.items() %}
{{ key }}={{ value }}
{% endfor %}
{% if slurm_with_gres_gpu %}
GresTypes=gpu
{% endif %}

{% for node_type in slurm_compute_nodes %}
NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }} {% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }}{% if not slurm_emulator %}{% for model, nb in node_type.gpus.items() %} Gres=gpu:{{ slurm_gpus_models_map.get(model, 'nvidia') }}:{{ nb }}{% endfor %}{% endif %}{% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
{% endfor %}
{% for slurm_partition in slurm_partitions %}
PartitionName={{ slurm_partition.name }} Nodes={{ slurm_partition.nodes }} Default={{ "YES" if "default" in slurm_partition and slurm_partition.default else "NO" }}{% for param, value in slurm_partition.params.items() %} {{param}}={{value}}{% endfor %}
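Outside emulator mode, the new `NodeName` line gains one `Gres=gpu:<type>:<count>` parameter per GPU model. A short sketch of the rendered line for a hypothetical GPU node type (all values, including the models map entry, are illustrative):

```python
# Sketch of the NodeName line rendered outside emulator mode for a hypothetical
# GPU node type; the nodeset, sizes and models map are illustrative only.
node_type = {"nodes": "gn[1-2]", "sockets": 2, "cores": 16, "memory": 65536,
             "gpus": {"h100": 2}}
gpus_models_map = {"h100": "h100"}  # hypothetical slurm_gpus_models_map entry

gres = "".join(
    f" Gres=gpu:{gpus_models_map.get(model, 'nvidia')}:{nb}"
    for model, nb in node_type["gpus"].items()
)
print(f"NodeName={node_type['nodes']} Sockets={node_type['sockets']} "
      f"CoresPerSocket={node_type['cores']} RealMemory={node_type['memory']}"
      f"{gres} State=UNKNOWN")
# NodeName=gn[1-2] Sockets=2 CoresPerSocket=16 RealMemory=65536 Gres=gpu:h100:2 State=UNKNOWN
```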
11 changes: 11 additions & 0 deletions firehpc/cluster.py
@@ -175,6 +175,16 @@ def conf(
# then grouped by node type.
nodes = {}

def node_type_gpus(node_type):
result = {}
if not hasattr(node_type, "gpu"):
return result
for gpu in node_type.gpu:
if gpu.model not in result:
result[gpu.model] = 0
result[gpu.model] += 1
return result

def insert_in_node_type():
for node_type in nodes[tag]:
if node_type["type"] == node.type.id:
@@ -186,6 +196,7 @@ def insert_in_node_type():
"sockets": node.type.cpu.sockets,
"cores": node.type.cpu.cores,
"memory": node.type.ram.dimm * (node.type.ram.size // 1024**2),
"gpus": node_type_gpus(node.type),
"nodes": [node.name],
}
)
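A quick check of `node_type_gpus()` on a hypothetical node type description; only the `model` attribute of each GPU entry matters for the counting:

```python
# Exercise node_type_gpus() from the diff above with hypothetical GPU entries.
from types import SimpleNamespace

def node_type_gpus(node_type):
    result = {}
    if not hasattr(node_type, "gpu"):
        return result
    for gpu in node_type.gpu:
        if gpu.model not in result:
            result[gpu.model] = 0
        result[gpu.model] += 1
    return result

with_gpus = SimpleNamespace(gpu=[SimpleNamespace(model="h100"),
                                 SimpleNamespace(model="h100"),
                                 SimpleNamespace(model="l40s")])
print(node_type_gpus(with_gpus))          # {'h100': 2, 'l40s': 1}
print(node_type_gpus(SimpleNamespace()))  # {} when the type has no 'gpu' attribute
```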
29 changes: 28 additions & 1 deletion firehpc/load.py
@@ -31,7 +31,9 @@
JOBS_TIMELIMITS = (["10", "30", "1:0:0", "6:0:0"], [50, 5, 2, 1])
JOBS_DURATIONS = ([360, 540, 720, 1200], [50, 5, 2, 1])

ClusterPartition = namedtuple("ClusterPartition", ["name", "nodes", "cpus", "time"])
ClusterPartition = namedtuple(
"ClusterPartition", ["name", "nodes", "cpus", "gpus", "time"]
)


def load_clusters(
@@ -140,6 +142,28 @@ def _get_cluster_config(self) -> None:
):
self.accounting = True

def _get_partition_gpus(self, partition):
"""Return the total number of GPU GRES in a partition."""
result = 0
stdout, stderr = self.ssh.exec(
[f"admin.{self.cluster.name}", "scontrol", "show", "nodes", "--json"]
)
try:
for node in json.loads(stdout)["nodes"]:
if partition not in node["partitions"]:
continue
if not len(node["gres"]):
continue
gres = node["gres"].split(":")
if gres[0] != "gpu":
continue
result += int(gres[2])
except json.decoder.JSONDecodeError as err:
raise FireHPCRuntimeError(
f"Unable to retrieve nodes from cluster {self.cluster.name}: {str(err)}"
) from err
return result

def _get_partitions(self) -> list[str]:
stdout, stderr = self.ssh.exec(
[f"admin.{self.cluster.name}", "scontrol", "show", "partitions", "--json"]
@@ -150,6 +174,7 @@ def _get_partitions(self) -> list[str]:
partition["name"],
partition["nodes"]["total"],
partition["cpus"]["total"],
self._get_partition_gpus(partition["name"]),
partition["maximums"]["time"],
)
for partition in json.loads(stdout)["partitions"]
@@ -288,6 +313,8 @@ def random_power_two(limit: int) -> int:
# of tasks.
if self.select_type == "select/linear":
cmd.extend(["--nodes", str(random_power_two(partition.nodes))])
elif partition.gpus:
cmd.extend(["--gpus", str(random_power_two(partition.gpus))])
else:
cmd.extend(["--ntasks", str(random_power_two(partition.cpus))])

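To illustrate `_get_partition_gpus()` end to end, here is a self-contained sketch fed with a hypothetical `scontrol show nodes --json` payload; it assumes nodes report their GRES as `gpu:<type>:<count>` strings, which is what the parsing above expects:

```python
# Self-contained sketch of the GRES accounting in _get_partition_gpus(), using a
# hypothetical scontrol JSON payload trimmed to the fields the parsing reads.
import json

scontrol_output = json.dumps({
    "nodes": [
        {"partitions": ["gpu"], "gres": "gpu:nvidia:4"},
        {"partitions": ["gpu"], "gres": "gpu:nvidia:2"},
        {"partitions": ["normal"], "gres": ""},
    ]
})

def partition_gpus(stdout: str, partition: str) -> int:
    result = 0
    for node in json.loads(stdout)["nodes"]:
        if partition not in node["partitions"]:
            continue
        if not len(node["gres"]):
            continue
        gres = node["gres"].split(":")
        if gres[0] != "gpu":
            continue
        result += int(gres[2])
    return result

# Jobs submitted to a partition with a non-zero GPU count then get a --gpus
# option instead of --ntasks in the sbatch command.
print(partition_gpus(scontrol_output, "gpu"))     # 6
print(partition_gpus(scontrol_output, "normal"))  # 0
```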