From 4da30d9096a57560e111a873a57ec40acaf22b55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:24:04 +0200
Subject: [PATCH 1/3] feat(core): add RacksDB GPUs in fhpc_nodes

Add a gpus key in the fhpc_nodes dictionary with the number of GPUs per
model found in the RacksDB node type.
---
 firehpc/cluster.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/firehpc/cluster.py b/firehpc/cluster.py
index 62a7aca..cd963a4 100644
--- a/firehpc/cluster.py
+++ b/firehpc/cluster.py
@@ -175,6 +175,16 @@ def conf(
         # then grouped by node type.
         nodes = {}
 
+        def node_type_gpus(node_type):
+            result = {}
+            if not hasattr(node_type, "gpu"):
+                return result
+            for gpu in node_type.gpu:
+                if gpu.model not in result:
+                    result[gpu.model] = 0
+                result[gpu.model] += 1
+            return result
+
         def insert_in_node_type():
             for node_type in nodes[tag]:
                 if node_type["type"] == node.type.id:
@@ -186,6 +196,7 @@ def insert_in_node_type():
                     "sockets": node.type.cpu.sockets,
                     "cores": node.type.cpu.cores,
                     "memory": node.type.ram.dimm * (node.type.ram.size // 1024**2),
+                    "gpus": node_type_gpus(node.type),
                     "nodes": [node.name],
                 }
             )
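
Note on the data shape: the gpus value added to each fhpc_nodes entry is a
plain mapping from GPU model name, as declared in RacksDB, to the number of
GPUs of that model in the node type. As a minimal illustration with
hypothetical values (the node type id, model names, sizes and node names
below are examples, not taken from the patch), an entry built by this code
would look like:

    # Hypothetical fhpc_nodes entry with the "gpus" key added above.
    node_type_entry = {
        "type": "gpu",                    # RacksDB node type id (example)
        "sockets": 2,
        "cores": 32,
        "memory": 524288,                 # in MB
        "gpus": {"h100": 4, "l40s": 2},   # model -> count, from node_type_gpus()
        "nodes": ["cn001", "cn002"],
    }
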
From 4e28406675b10db8e3b5a2f9013f886142e94438 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:25:48 +0200
Subject: [PATCH 2/3] feat(conf): support gpu GRES in Slurm

Update the Slurm role to declare gpu GRES when defined in fhpc_nodes.

Emulating fake GPUs is possible thanks to an undocumented Slurm feature
and the fake_gpus.conf configuration file. This is not supported in
Slurm emulator mode, so it is disabled in this mode.

fix #39
---
 CHANGELOG.md                                 |  1 +
 conf/group_vars/all.yml                      |  2 ++
 conf/roles/slurm/defaults/main.yml           |  2 ++
 conf/roles/slurm/tasks/compute.yml           | 19 +++++++++++++++++++
 conf/roles/slurm/templates/fake_gpus.conf.j2 |  8 ++++++++
 conf/roles/slurm/templates/slurm.conf.j2     |  5 ++++-
 6 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 conf/roles/slurm/templates/fake_gpus.conf.j2

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 150986c..4b57cce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,6 +68,7 @@ and this project adheres to
   - Add `slurm_restd_port` variable in inventory to control slurmrestd
     TCP/IP listening port.
   - Support all Slurm-web to slurmrestd JWT authentication modes.
+  - Support gpu GRES in Slurm configuration (#39).
 - docs:
   - Add sysctl `fs.inotify.max_user_instances` value increase recommendation
     in README.md to avoid weird issue when launching many containers.
diff --git a/conf/group_vars/all.yml b/conf/group_vars/all.yml
index d094cf1..9c6eb5f 100644
--- a/conf/group_vars/all.yml
+++ b/conf/group_vars/all.yml
@@ -37,6 +37,8 @@ slurm_profiles:
   compute: compute
   server: admin
   login: login
+# gpu GRES is enabled if at least one compute node type has GPUs.
+slurm_with_gres_gpu: "{{ fhpc_nodes['compute'] | map(attribute='gpus') | map('length') | map('bool') | max }}"
 slurm_local_munge_key_file: "{{ fhpc_cluster_state_dir }}/munge/munge.key"
 slurm_local_slurm_key_file: "{{ fhpc_cluster_state_dir }}/slurm/slurm.key"
 slurm_local_mariadb_password_file: "{{ fhpc_cluster_state_dir }}/mariadb/mariadb.password"
diff --git a/conf/roles/slurm/defaults/main.yml b/conf/roles/slurm/defaults/main.yml
index 8d88036..4cb91fd 100644
--- a/conf/roles/slurm/defaults/main.yml
+++ b/conf/roles/slurm/defaults/main.yml
@@ -3,6 +3,8 @@ slurm_profiles: {}
 slurm_emulator: False
 slurm_with_accounting: true
 slurm_with_munge: false
+slurm_with_gres_gpu: false
+slurm_gpus_models_map: {}
 slurm_uid: 432 # picked randomly among 100-499
 slurm_gid: 432 # picked randomly among 100-499
 slurmrestd_uid: 433 # picked randomly among 100-499
diff --git a/conf/roles/slurm/tasks/compute.yml b/conf/roles/slurm/tasks/compute.yml
index 0235f57..601cc5a 100644
--- a/conf/roles/slurm/tasks/compute.yml
+++ b/conf/roles/slurm/tasks/compute.yml
@@ -12,6 +12,25 @@
     name: "{{ slurm_emulator | ternary(slurm_emulator_compute_packages, slurm_compute_packages) }}"
     state: latest
 
+- name: Set slurm current compute node fact
+  ansible.builtin.set_fact:
+    slurm_current_compute_node_type: "{{ slurm_compute_nodes | selectattr('nodes', 'contains', ansible_facts['hostname']) | first }}"
+  when: not slurm_emulator
+
+# Create fake_gpus.conf if the current node has GPUs.
+- name: Deploy slurm fake GPUs configuration file
+  ansible.builtin.template:
+    src: fake_gpus.conf.j2
+    dest: /etc/slurm/fake_gpus.conf
+    owner: slurm
+    group: slurm
+    mode: '0644'
+  notify:
+    - Restart slurmd
+  when:
+    - not slurm_emulator
+    - slurm_current_compute_node_type['gpus'] | length > 0
+
 # This is required on redhat based distributions as the service is not
 # automatically started by the RPM packages, and it does not hurt on Debian.
 - name: Ensure slurmd service is started
diff --git a/conf/roles/slurm/templates/fake_gpus.conf.j2 b/conf/roles/slurm/templates/fake_gpus.conf.j2
new file mode 100644
index 0000000..d4ca2c0
--- /dev/null
+++ b/conf/roles/slurm/templates/fake_gpus.conf.j2
@@ -0,0 +1,8 @@
+{% set total = slurm_current_compute_node_type['gpus'].values() | sum %}
+{% set ns = namespace(index=0) %}
+{% for model in slurm_current_compute_node_type['gpus'] %}
+{% for _ in range(slurm_current_compute_node_type['gpus'][model]) %}
+{{ slurm_gpus_models_map.get(model, 'nvidia') }}|1|~|{% for _index in range(total) %}{% if ns.index == _index %}-1{% else %}0{% endif %}{% if not loop.last %},{% endif %}{% endfor %}|/dev/nvidia{{ ns.index }}
+{% set ns.index = ns.index + 1 %}
+{% endfor %}
+{% endfor %}
diff --git a/conf/roles/slurm/templates/slurm.conf.j2 b/conf/roles/slurm/templates/slurm.conf.j2
index 88b47a9..5c163d6 100644
--- a/conf/roles/slurm/templates/slurm.conf.j2
+++ b/conf/roles/slurm/templates/slurm.conf.j2
@@ -30,9 +30,12 @@ ProctrackType=proctrack/linuxproc
 {% for key, value in slurm_params.items() %}
 {{ key }}={{ value }}
 {% endfor %}
+{% if slurm_with_gres_gpu %}
+GresTypes=gpu
+{% endif %}
 
 {% for node_type in slurm_compute_nodes %}
-NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }} {% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
+NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }}{% if not slurm_emulator and node_type.gpus %} Gres={% for model, nb in node_type.gpus.items() %}gpu:{{ slurm_gpus_models_map.get(model, 'nvidia') }}:{{ nb }}{% if not loop.last %},{% endif %}{% endfor %}{% endif %}{% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
 {% endfor %}
 {% for slurm_partition in slurm_partitions %}
 PartitionName={{ slurm_partition.name }} Nodes={{ slurm_partition.nodes }} Default={{ "YES" if "default" in slurm_partition and slurm_partition.default else "NO" }}{% for param, value in slurm_partition.params.items() %} {{param}}={{value}}{% endfor %}
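
For illustration, assuming a compute node type with two GPUs of a single
model that is absent from slurm_gpus_models_map (so the GRES type name falls
back to 'nvidia'), and hypothetical node names and sizes, the node
declaration rendered by slurm.conf.j2 would look like:

    NodeName=cn[1-2] Sockets=2 CoresPerSocket=32 RealMemory=524288 Gres=gpu:nvidia:2 State=UNKNOWN

and the generated fake_gpus.conf would contain one line per emulated GPU,
with -1 at the GPU's own index in the generated list and a distinct device
file per GPU:

    nvidia|1|~|-1,0|/dev/nvidia0
    nvidia|1|~|0,-1|/dev/nvidia1
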
From cb27665e842e6bfc688878d92c7f23012b0f4a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Mon, 5 May 2025 15:27:57 +0200
Subject: [PATCH 3/3] feat(load): request gpu GRES for jobs

Request GPU allocations on partitions with gpu GRES.
---
 CHANGELOG.md    |  1 +
 firehpc/load.py | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4b57cce..1e91805 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,6 +43,7 @@ and this project adheres to
     submitting less jobs when not at work (#29).
   - Add `--time-off-factor` option to control by how much the load is divided
     outside of business hours.
+  - Request GPU allocations on partitions with gpu GRES.
 - conf:
   - Add possibility to define additional QOS and alternative partitions in
     Slurm.
diff --git a/firehpc/load.py b/firehpc/load.py
index b9d9613..26837b9 100644
--- a/firehpc/load.py
+++ b/firehpc/load.py
@@ -31,7 +31,9 @@
 JOBS_TIMELIMITS = (["10", "30", "1:0:0", "6:0:0"], [50, 5, 2, 1])
 JOBS_DURATIONS = ([360, 540, 720, 1200], [50, 5, 2, 1])
 
-ClusterPartition = namedtuple("ClusterPartition", ["name", "nodes", "cpus", "time"])
+ClusterPartition = namedtuple(
+    "ClusterPartition", ["name", "nodes", "cpus", "gpus", "time"]
+)
 
 
 def load_clusters(
@@ -140,6 +142,28 @@ def _get_cluster_config(self) -> None:
         ):
             self.accounting = True
 
+    def _get_partition_gpus(self, partition):
+        """Return the total number of GPU GRES in a partition."""
+        result = 0
+        stdout, stderr = self.ssh.exec(
+            [f"admin.{self.cluster.name}", "scontrol", "show", "nodes", "--json"]
+        )
+        try:
+            for node in json.loads(stdout)["nodes"]:
+                if partition not in node["partitions"]:
+                    continue
+                if not len(node["gres"]):
+                    continue
+                gres = node["gres"].split(":")
+                if gres[0] != "gpu":
+                    continue
+                result += int(gres[2])
+        except json.decoder.JSONDecodeError as err:
+            raise FireHPCRuntimeError(
+                f"Unable to retrieve nodes from cluster {self.cluster.name}: {str(err)}"
+            ) from err
+        return result
+
     def _get_partitions(self) -> list[str]:
         stdout, stderr = self.ssh.exec(
             [f"admin.{self.cluster.name}", "scontrol", "show", "partitions", "--json"]
@@ -150,6 +174,7 @@ def _get_partitions(self) -> list[str]:
                 partition["name"],
                 partition["nodes"]["total"],
                 partition["cpus"]["total"],
+                self._get_partition_gpus(partition["name"]),
                 partition["maximums"]["time"],
             )
             for partition in json.loads(stdout)["partitions"]
@@ -288,6 +313,8 @@ def random_power_two(limit: int) -> int:
         # of tasks.
         if self.select_type == "select/linear":
             cmd.extend(["--nodes", str(random_power_two(partition.nodes))])
+        elif partition.gpus:
+            cmd.extend(["--gpus", str(random_power_two(partition.gpus))])
         else:
            cmd.extend(["--ntasks", str(random_power_two(partition.cpus))])
 
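
Note on the parsing in _get_partition_gpus(): the method splits the gres
string of each node reported by `scontrol show nodes --json` on ':' and only
counts entries whose first field is gpu. A minimal sketch with a hypothetical
node entry (values are illustrative, only the fields actually read by the
method are shown):

    # Hypothetical node entry as returned by `scontrol show nodes --json`.
    node = {"partitions": ["gpu"], "gres": "gpu:nvidia:4"}

    gres = node["gres"].split(":")  # -> ["gpu", "nvidia", "4"]
    if gres[0] == "gpu":
        count = int(gres[2])        # -> 4 GPUs counted for partition "gpu"

On the submission side, a partition with a non-zero gpus count then gets jobs
requesting a random power-of-two number of its GPUs with the --gpus option
(via the existing random_power_two() helper), instead of a --ntasks request
sized on its CPUs.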