2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -43,6 +43,7 @@ and this project adheres to
submitting fewer jobs when not at work (#29).
- Add `--time-off-factor` option to control by how much the load is divided
outside of business hours.
- Request GPU allocations on partitions with gpu GRES.
- conf:
- Add possibility to define additional QOS and alternative partitions in
Slurm.
@@ -68,6 +69,7 @@ and this project adheres to
- Add `slurm_restd_port` variable in inventory to control slurmrestd TCP/IP
listening port.
- Support all Slurm-web to slurmrestd JWT authentication modes.
- Support gpu GRES in Slurm configuration (#39).
- docs:
- Add sysctl `fs.inotify.max_user_instances` value increase recommendation in
README.md to avoid weird issues when launching many containers.
2 changes: 2 additions & 0 deletions conf/group_vars/all.yml
@@ -37,6 +37,8 @@ slurm_profiles:
compute: compute
server: admin
login: login
# GRES GPU is enabled if at least one compute node type has GPUs
slurm_with_gres_gpu: "{{ fhpc_nodes['compute'] | map(attribute='gpus') | map('length') | map('bool') | max }}"
slurm_local_munge_key_file: "{{ fhpc_cluster_state_dir }}/munge/munge.key"
slurm_local_slurm_key_file: "{{ fhpc_cluster_state_dir }}/slurm/slurm.key"
slurm_local_mariadb_password_file: "{{ fhpc_cluster_state_dir }}/mariadb/mariadb.password"
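The Jinja2 filter chain above reduces to an "any node type has GPUs" boolean. A minimal Python sketch of that logic, using a hypothetical, trimmed-down `fhpc_nodes` structure (only the `gpus` key matters here):

```python
# Minimal sketch of the boolean computed by the Jinja2 filter chain above.
# The fhpc_nodes structure below is hypothetical and trimmed to the 'gpus' key.
fhpc_nodes = {
    "compute": [
        {"type": "std", "gpus": {}},           # node type without GPUs
        {"type": "gpu", "gpus": {"h100": 2}},  # node type with GPUs
    ]
}

# map('length') | map('bool') | max  ->  True as soon as one node type has GPUs
slurm_with_gres_gpu = max(bool(len(t["gpus"])) for t in fhpc_nodes["compute"])
print(slurm_with_gres_gpu)  # True
```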
2 changes: 2 additions & 0 deletions conf/roles/slurm/defaults/main.yml
@@ -3,6 +3,8 @@ slurm_profiles: {}
slurm_emulator: False
slurm_with_accounting: true
slurm_with_munge: false
slurm_with_gres_gpu: false
slurm_gpus_models_map: {}
slurm_uid: 432 # picked randomly among 100-499
slurm_gid: 432 # picked randomly among 100-499
slurmrestd_uid: 433 # picked randomly among 100-499
19 changes: 19 additions & 0 deletions conf/roles/slurm/tasks/compute.yml
@@ -12,6 +12,25 @@
name: "{{ slurm_emulator | ternary(slurm_emulator_compute_packages, slurm_compute_packages) }}"
state: latest

- name: Set slurm current compute node fact
ansible.builtin.set_fact:
slurm_current_compute_node_type: "{{ slurm_compute_nodes | selectattr('nodes', 'contains', ansible_facts['hostname']) | first }}"
when: not slurm_emulator

# Create fake_gpus.conf if the current node type has GPUs
- name: Deploy slurm fake GPUs configuration file
ansible.builtin.template:
src: fake_gpus.conf.j2
dest: /etc/slurm/fake_gpus.conf
owner: slurm
group: slurm
mode: '0644'
notify:
- Restart slurmd
when:
- not slurm_emulator
- slurm_current_compute_node_type['gpus'] | length > 0

# This is required on redhat based distributions as the service is not
# automatically started by the RPM packages, and it does not hurt on Debian.
- name: Ensure slurmd service is started
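The `selectattr` lookup above picks the compute node type whose `nodes` list contains the current host. A rough Python equivalent, with a hypothetical, trimmed-down `slurm_compute_nodes` list and hostname:

```python
# Rough Python equivalent of the selectattr() lookup above; the node types and
# hostname below are hypothetical.
slurm_compute_nodes = [
    {"type": "std", "gpus": {}, "nodes": ["cn1", "cn2"]},
    {"type": "gpu", "gpus": {"h100": 2}, "nodes": ["gn1"]},
]
hostname = "gn1"

# selectattr('nodes', 'contains', ansible_facts['hostname']) | first
current = next(t for t in slurm_compute_nodes if hostname in t["nodes"])
print(current["type"])  # gpu

# The fake_gpus.conf template is only deployed when current["gpus"] is non-empty.
print(len(current["gpus"]) > 0)  # True
```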
8 changes: 8 additions & 0 deletions conf/roles/slurm/templates/fake_gpus.conf.j2
@@ -0,0 +1,8 @@
{% set total = slurm_current_compute_node_type['gpus'].values() | sum %}
{% set ns = namespace(index=0) %}
{% for model in slurm_current_compute_node_type['gpus'] %}
{% for _ in range(slurm_current_compute_node_type['gpus'][model]) %}
{{ slurm_gpus_models_map.get(model, 'nvidia') }}|1|~|{% for _index in range(total) %}{% if ns.index == _index %}-1{% else %}0{% endif %}{% if not loop.last %},{% endif %}{% endfor %}|/dev/nvidia{{ ns.index }}
{% set ns.index = ns.index + 1 %}
{% endfor %}
{% endfor %}
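For reference, a minimal Python sketch reproducing what the template above renders, assuming a hypothetical node type with two GPUs of a single "h100" model and an empty `slurm_gpus_models_map` (so the GRES type falls back to `nvidia`):

```python
# Minimal sketch of what fake_gpus.conf.j2 renders; node_type_gpus and
# gpus_models_map below are hypothetical.
node_type_gpus = {"h100": 2}   # model -> count
gpus_models_map = {}           # empty slurm_gpus_models_map

total = sum(node_type_gpus.values())
index = 0
for model, count in node_type_gpus.items():
    for _ in range(count):
        # one "-1" in the links column for the GPU itself, "0" for the others
        links = ",".join("-1" if i == index else "0" for i in range(total))
        print(f"{gpus_models_map.get(model, 'nvidia')}|1|~|{links}|/dev/nvidia{index}")
        index += 1

# Output:
# nvidia|1|~|-1,0|/dev/nvidia0
# nvidia|1|~|0,-1|/dev/nvidia1
```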
5 changes: 4 additions & 1 deletion conf/roles/slurm/templates/slurm.conf.j2
@@ -30,9 +30,12 @@ ProctrackType=proctrack/linuxproc
{% for key, value in slurm_params.items() %}
{{ key }}={{ value }}
{% endfor %}
{% if slurm_with_gres_gpu %}
GresTypes=gpu
{% endif %}

{% for node_type in slurm_compute_nodes %}
NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }} {% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
NodeName={{ node_type.nodes | nodeset_fold }} Sockets={{ node_type.sockets }} CoresPerSocket={{ node_type.cores }} RealMemory={{ node_type.memory }}{% if not slurm_emulator %}{% for model, nb in node_type.gpus.items() %} Gres=gpu:{{ slurm_gpus_models_map.get(model, 'nvidia') }}:{{ nb }}{% endfor %}{% endif %}{% if slurm_emulator %} NodeHostName={{ slurm_server }}{% endif %} State=UNKNOWN
{% endfor %}
{% for slurm_partition in slurm_partitions %}
PartitionName={{ slurm_partition.name }} Nodes={{ slurm_partition.nodes }} Default={{ "YES" if "default" in slurm_partition and slurm_partition.default else "NO" }}{% for param, value in slurm_partition.params.items() %} {{param}}={{value}}{% endfor %}
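Outside emulator mode, the new `NodeName` line gains one `Gres=gpu:<type>:<count>` parameter per GPU model. A short sketch of the rendered line for a hypothetical GPU node type (all values, including the models map entry, are illustrative):

```python
# Sketch of the NodeName line rendered outside emulator mode for a hypothetical
# GPU node type; the nodeset, sizes and models map are illustrative only.
node_type = {"nodes": "gn[1-2]", "sockets": 2, "cores": 16, "memory": 65536,
             "gpus": {"h100": 2}}
gpus_models_map = {"h100": "h100"}  # hypothetical slurm_gpus_models_map entry

gres = "".join(
    f" Gres=gpu:{gpus_models_map.get(model, 'nvidia')}:{nb}"
    for model, nb in node_type["gpus"].items()
)
print(f"NodeName={node_type['nodes']} Sockets={node_type['sockets']} "
      f"CoresPerSocket={node_type['cores']} RealMemory={node_type['memory']}"
      f"{gres} State=UNKNOWN")
# NodeName=gn[1-2] Sockets=2 CoresPerSocket=16 RealMemory=65536 Gres=gpu:h100:2 State=UNKNOWN
```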
11 changes: 11 additions & 0 deletions firehpc/cluster.py
@@ -175,6 +175,16 @@ def conf(
# then grouped by node type.
nodes = {}

def node_type_gpus(node_type):
result = {}
if not hasattr(node_type, "gpu"):
return result
for gpu in node_type.gpu:
if gpu.model not in result:
result[gpu.model] = 0
result[gpu.model] += 1
return result

def insert_in_node_type():
for node_type in nodes[tag]:
if node_type["type"] == node.type.id:
@@ -186,6 +196,7 @@ def insert_in_node_type():
"sockets": node.type.cpu.sockets,
"cores": node.type.cpu.cores,
"memory": node.type.ram.dimm * (node.type.ram.size // 1024**2),
"gpus": node_type_gpus(node.type),
"nodes": [node.name],
}
)
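A quick check of `node_type_gpus()` on a hypothetical node type description; only the `model` attribute of each GPU entry matters for the counting:

```python
# Exercise node_type_gpus() from the diff above with hypothetical GPU entries.
from types import SimpleNamespace

def node_type_gpus(node_type):
    result = {}
    if not hasattr(node_type, "gpu"):
        return result
    for gpu in node_type.gpu:
        if gpu.model not in result:
            result[gpu.model] = 0
        result[gpu.model] += 1
    return result

with_gpus = SimpleNamespace(gpu=[SimpleNamespace(model="h100"),
                                 SimpleNamespace(model="h100"),
                                 SimpleNamespace(model="l40s")])
print(node_type_gpus(with_gpus))          # {'h100': 2, 'l40s': 1}
print(node_type_gpus(SimpleNamespace()))  # {} when the type has no 'gpu' attribute
```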
29 changes: 28 additions & 1 deletion firehpc/load.py
@@ -31,7 +31,9 @@
JOBS_TIMELIMITS = (["10", "30", "1:0:0", "6:0:0"], [50, 5, 2, 1])
JOBS_DURATIONS = ([360, 540, 720, 1200], [50, 5, 2, 1])

ClusterPartition = namedtuple("ClusterPartition", ["name", "nodes", "cpus", "time"])
ClusterPartition = namedtuple(
"ClusterPartition", ["name", "nodes", "cpus", "gpus", "time"]
)


def load_clusters(
@@ -140,6 +142,28 @@ def _get_cluster_config(self) -> None:
):
self.accounting = True

def _get_partition_gpus(self, partition):
"""Return the total number of GPU GRES in a partition."""
result = 0
stdout, stderr = self.ssh.exec(
[f"admin.{self.cluster.name}", "scontrol", "show", "nodes", "--json"]
)
try:
for node in json.loads(stdout)["nodes"]:
if partition not in node["partitions"]:
continue
if not len(node["gres"]):
continue
gres = node["gres"].split(":")
if gres[0] != "gpu":
continue
result += int(gres[2])
except json.decoder.JSONDecodeError as err:
raise FireHPCRuntimeError(
f"Unable to retrieve nodes from cluster {self.cluster.name}: {str(err)}"
) from err
return result

def _get_partitions(self) -> list[str]:
stdout, stderr = self.ssh.exec(
[f"admin.{self.cluster.name}", "scontrol", "show", "partitions", "--json"]
@@ -150,6 +174,7 @@ def _get_partitions(self) -> list[str]:
partition["name"],
partition["nodes"]["total"],
partition["cpus"]["total"],
self._get_partition_gpus(partition["name"]),
partition["maximums"]["time"],
)
for partition in json.loads(stdout)["partitions"]
@@ -288,6 +313,8 @@ def random_power_two(limit: int) -> int:
# of tasks.
if self.select_type == "select/linear":
cmd.extend(["--nodes", str(random_power_two(partition.nodes))])
elif partition.gpus:
cmd.extend(["--gpus", str(random_power_two(partition.gpus))])
else:
cmd.extend(["--ntasks", str(random_power_two(partition.cpus))])

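To illustrate `_get_partition_gpus()` end to end, here is a self-contained sketch fed with a hypothetical `scontrol show nodes --json` payload; it assumes nodes report their GRES as `gpu:<type>:<count>` strings, which is what the parsing above expects:

```python
# Self-contained sketch of the GRES accounting in _get_partition_gpus(), using a
# hypothetical scontrol JSON payload trimmed to the fields the parsing reads.
import json

scontrol_output = json.dumps({
    "nodes": [
        {"partitions": ["gpu"], "gres": "gpu:nvidia:4"},
        {"partitions": ["gpu"], "gres": "gpu:nvidia:2"},
        {"partitions": ["normal"], "gres": ""},
    ]
})

def partition_gpus(stdout: str, partition: str) -> int:
    result = 0
    for node in json.loads(stdout)["nodes"]:
        if partition not in node["partitions"]:
            continue
        if not len(node["gres"]):
            continue
        gres = node["gres"].split(":")
        if gres[0] != "gpu":
            continue
        result += int(gres[2])
    return result

# Jobs submitted to a partition with a non-zero GPU count then get a --gpus
# option instead of --ntasks in the sbatch command.
print(partition_gpus(scontrol_output, "gpu"))     # 6
print(partition_gpus(scontrol_output, "normal"))  # 0
```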