From 54a6c88d1f887619994a9119b4e2a9a2b5ffb2a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 21 Jan 2026 20:21:16 +1100 Subject: [PATCH 1/4] feat: add variables for azure resources and tools Define the directory hierarchy for cloud specific tools, scripts, resources and tests and encode them into variables for common usage. The structure we want to use for static files follows this template: /opt/hpc/<cloud>/bin/ # one-off binaries and scripts /opt/hpc/<cloud>/lib/... # resources and libraries /opt/hpc/<cloud>/tools/... # standalone tools /opt/hpc/<cloud>/tests/ # test scripts for local/CI testing For runtime files (e.g. configuration files set up by boot services), we will store them in: /var/hpc/<cloud>/.... These common directories will be defined by the following set of variables: __hpc_<cloud>_resource_dir # /opt/hpc/<cloud>/ __hpc_<cloud>_tools_dir # /opt/hpc/<cloud>/tools/ __hpc_<cloud>_tests_dir # /opt/hpc/<cloud>/tests/ __hpc_<cloud>_runtime_dir # /var/hpc/<cloud>/ At the moment we only support Azure, so only those variables are defined. Signed-off-by: Dave Chinner --- vars/main.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vars/main.yml b/vars/main.yml index 2773cf3..9e7bde2 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -71,6 +71,14 @@ __hpc_kernel_versionlock_rpms: __hpc_install_prefix: /opt __hpc_moneo_install_dir: /opt/azurehpc/tools +# location for azure specific scripts and resources +__hpc_azure_resource_dir: "{{ __hpc_install_prefix }}/hpc/azure" +__hpc_azure_tools_dir: "{{ __hpc_azure_resource_dir }}/tools" +__hpc_azure_tests_dir: "{{ __hpc_azure_resource_dir }}/tests" + +# location for runtime selected azure resources +__hpc_azure_runtime_dir: /var/hpc/azure + __hpc_module_dir: /usr/share/modulefiles # A variable for testing purposes to force a specific kernel version to be installed From 94059e9842088550368f16ad0a18f8201436d1b4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Jan 2026 10:46:30 +1100 Subject: [PATCH 2/4] feat: create new azure hpc directories if they don't exist Signed-off-by: Dave 
Chinner
---
 tasks/main.yml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tasks/main.yml b/tasks/main.yml
index ba8fd7d..282ef6b 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -704,6 +704,32 @@
     mode: '0644'
   notify: Reload udev
 
+- name: Ensure Azure HPC resource directories exist
+  block:
+    # The file module is idempotent, so no stat pre-check is needed. Gating
+    # on the base directory used to skip bin/, tools/ and tests/ whenever
+    # the base directory already existed.
+    - name: Create resource directories
+      file:
+        path: "{{ item }}"
+        state: directory
+        owner: root
+        group: root
+        mode: '0755'
+      loop:
+        - "{{ __hpc_azure_resource_dir }}"
+        - "{{ __hpc_azure_resource_dir }}/bin"
+        - "{{ __hpc_azure_tools_dir }}"
+        - "{{ __hpc_azure_tests_dir }}"
+
+    - name: Runtime resource directory
+      file:
+        path: "{{ __hpc_azure_runtime_dir }}"
+        state: directory
+        owner: root
+        group: root
+        mode: '0755'
+
 - name: Remove build dependencies
   vars:
     __hpc_dependencies: >-
From 2a716720ef116bce6f045518b6178fba9c68fc7b Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Thu, 22 Jan 2026 12:10:17 +1100
Subject: [PATCH 3/4] hpc: Install azure VM SKU customisation scripts

These scripts tune the hardware and software according to the type of
Azure VM the image is running on. They run at machine startup and source
the relevant information according to the machine type that is detected.
Signed-off-by: Dave Chinner --- README.md | 11 ++++ defaults/main.yml | 1 + files/sku/customisations/hbv4.sh | 5 ++ files/sku/customisations/ncv4.sh | 5 ++ files/sku/customisations/ncv5.sh | 46 +++++++++++++ files/sku/customisations/ndv2.sh | 6 ++ files/sku/customisations/ndv4.sh | 19 ++++++ files/sku/customisations/ndv5.sh | 19 ++++++ files/sku/customisations/ndv6.sh | 6 ++ files/sku/topology/ncv4-graph.xml | 68 +++++++++++++++++++ files/sku/topology/ncv4-topo.xml | 33 ++++++++++ files/sku/topology/ndv2-topo.xml | 15 +++++ files/sku/topology/ndv4-topo.xml | 34 ++++++++++ files/sku/topology/ndv5-topo.xml | 38 +++++++++++ tasks/main.yml | 56 ++++++++++++++++ templates/sku/remove_sku_customisations.sh | 23 +++++++ templates/sku/setup_sku_customisations.sh | 76 ++++++++++++++++++++++ templates/sku/sku_customisation.service | 16 +++++ tests/roles/linux-system-roles.hpc/files | 1 + 19 files changed, 478 insertions(+) create mode 100755 files/sku/customisations/hbv4.sh create mode 100755 files/sku/customisations/ncv4.sh create mode 100755 files/sku/customisations/ncv5.sh create mode 100755 files/sku/customisations/ndv2.sh create mode 100755 files/sku/customisations/ndv4.sh create mode 100755 files/sku/customisations/ndv5.sh create mode 100755 files/sku/customisations/ndv6.sh create mode 100644 files/sku/topology/ncv4-graph.xml create mode 100644 files/sku/topology/ncv4-topo.xml create mode 100644 files/sku/topology/ndv2-topo.xml create mode 100644 files/sku/topology/ndv4-topo.xml create mode 100644 files/sku/topology/ndv5-topo.xml create mode 100644 templates/sku/remove_sku_customisations.sh create mode 100644 templates/sku/setup_sku_customisations.sh create mode 100644 templates/sku/sku_customisation.service create mode 120000 tests/roles/linux-system-roles.hpc/files diff --git a/README.md b/README.md index 9c1bd3d..c4e0001 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,17 @@ Default: `true` Type: `bool` +### hpc_sku_customisation + +Whether to install the 
hardware tuning files for different Azure VM types (SKUs).
+
+This will install definitions for optimal hardware configurations for the different types of high performance VMs that are typically used for HPC workloads in the Azure environment.
+These include InfiniBand and GPU/NVLink and NCCL customisations, as well as any workarounds for specific hardware problems that may be needed.
+
+Default: `true`
+
+Type: `bool`
+
 ## Variables for Configuring How Role Reboots Managed Nodes
 
 ### hpc_reboot_ok
diff --git a/defaults/main.yml b/defaults/main.yml
index 9d235a3..7f7ff21 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -23,6 +23,7 @@ hpc_build_openmpi_w_nvidia_gpu_support: true
 
 hpc_install_moneo: true
 hpc_tuning: true
+hpc_sku_customisation: true
 hpc_update_kernel: true
 hpc_update_all_packages: false
 
diff --git a/files/sku/customisations/hbv4.sh b/files/sku/customisations/hbv4.sh
new file mode 100755
index 0000000..af76aaf
--- /dev/null
+++ b/files/sku/customisations/hbv4.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# No topology optimisations exist for this hardware: clear any stale files.
+rm -f "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
diff --git a/files/sku/customisations/ncv4.sh b/files/sku/customisations/ncv4.sh
new file mode 100755
index 0000000..fe7ecca
--- /dev/null
+++ b/files/sku/customisations/ncv4.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Set up the NCv4 topology and graph files (paths come from the caller's env)
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ncv4-topo.xml" "${TOPOLOGY_FILE}"
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ncv4-graph.xml" "${TOPOLOGY_GRAPH}"
diff --git a/files/sku/customisations/ncv5.sh b/files/sku/customisations/ncv5.sh
new file mode 100755
index 0000000..3177995
--- /dev/null
+++ b/files/sku/customisations/ncv5.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# there are no current topology optimisations for this hardware.
+rm -f "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
+# Attempt to work around NVLink initialisation bug
+if nvidia-smi nvlink --status | grep -qa inActive; then
+    # Ensure the Hyper-V PCI devices are ready
+    retries=0
+    while ! ls /sys/bus/vmbus/drivers/hv_pci/ | grep -q '[0-9a-f-]\{8\}-'; do
+        # "$?" is 0 in here (the loop condition is negated): fail explicitly.
+        if (( retries++ >= 5 )); then
+            echo "Hyper-V PCI devices Inactive!"
+            exit 1
+        fi
+        echo "Waiting for Hyper-V PCI devices..."
+        sleep 1
+    done
+
+    # Ensure the NVIDIA GPU PCI devices are ready
+    retries=0
+    while ! lspci | grep -qi nvidia; do
+        # "$?" is 0 in here (the loop condition is negated): fail explicitly.
+        if (( retries++ >= 5 )); then
+            echo "NVIDIA GPU PCI Inactive!"
+            exit 1
+        fi
+        echo "Waiting for NVIDIA GPU PCI devices..."
+        sleep 1
+    done
+
+    echo "Reloading NVIDIA kernel modules..."
+    sudo systemctl stop nvidia-dcgm.service
+    sudo modprobe -r nvidia_drm nvidia_modeset gdrdrv nvidia_peermem nvidia_uvm nvidia
+    sudo modprobe nvidia nvidia_modeset nvidia_uvm nvidia_peermem gdrdrv nvidia_drm
+    sudo systemctl start nvidia-dcgm.service
+fi
+
+echo "Check NVLink status after reloading NVIDIA kernel modules..."
+if nvidia-smi nvlink --status | grep -qa inActive; then
+    echo "NVLink is still Inactive after reloading NVIDIA kernel modules!"
+    exit 1
+else
+    echo "NVLink is Active."
+fi
diff --git a/files/sku/customisations/ndv2.sh b/files/sku/customisations/ndv2.sh
new file mode 100755
index 0000000..9b4da0d
--- /dev/null
+++ b/files/sku/customisations/ndv2.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Link the NDv2 topology file, no graph for this machine type
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ndv2-topo.xml" "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
diff --git a/files/sku/customisations/ndv4.sh b/files/sku/customisations/ndv4.sh
new file mode 100755
index 0000000..ffad164
--- /dev/null
+++ b/files/sku/customisations/ndv4.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Link the NDv4 topology file, no graph for this machine type
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ndv4-topo.xml" "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
+# Apply NDv4 specific NCCL configurations
+echo "NCCL_IB_PCI_RELAXED_ORDERING=1" >> "${NCCL_CONF}"
+
+## NVIDIA Fabric manager
+systemctl enable nvidia-fabricmanager
+systemctl start nvidia-fabricmanager
+systemctl is-active --quiet nvidia-fabricmanager
+
+error_code=$?
+if [ "${error_code}" -ne 0 ]; then
+    echo "NVIDIA Fabric Manager Inactive!"
+    exit "${error_code}"
+fi
diff --git a/files/sku/customisations/ndv5.sh b/files/sku/customisations/ndv5.sh
new file mode 100755
index 0000000..53473e0
--- /dev/null
+++ b/files/sku/customisations/ndv5.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Link the NDv5 topology file, no graph for this machine type
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ndv5-topo.xml" "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
+# Apply NDv5 specific NCCL configurations
+echo "NCCL_IB_PCI_RELAXED_ORDERING=1" >> "${NCCL_CONF}"
+
+## NVIDIA Fabric manager
+systemctl enable nvidia-fabricmanager
+systemctl start nvidia-fabricmanager
+systemctl is-active --quiet nvidia-fabricmanager
+
+error_code=$?
+if [ "${error_code}" -ne 0 ]; then
+    echo "NVIDIA Fabric Manager Inactive!"
+    exit ${error_code}
+fi
diff --git a/files/sku/customisations/ndv6.sh b/files/sku/customisations/ndv6.sh
new file mode 100755
index 0000000..7eb7416
--- /dev/null
+++ b/files/sku/customisations/ndv6.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# No topology optimisations exist for this hardware: clear any stale files.
+rm -f "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
diff --git a/files/sku/topology/ncv4-graph.xml b/files/sku/topology/ncv4-graph.xml
new file mode 100644
index 0000000..c28cf52
--- /dev/null
+++ b/files/sku/topology/ncv4-graph.xml
@@ -0,0 +1,68 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/files/sku/topology/ncv4-topo.xml b/files/sku/topology/ncv4-topo.xml
new file mode 100644
index 0000000..cf0eeb6
--- /dev/null
+++ b/files/sku/topology/ncv4-topo.xml
@@ -0,0 +1,33 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/files/sku/topology/ndv2-topo.xml b/files/sku/topology/ndv2-topo.xml
new file mode 100644
index 0000000..dd1e0cd
--- /dev/null
+++ b/files/sku/topology/ndv2-topo.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/files/sku/topology/ndv4-topo.xml b/files/sku/topology/ndv4-topo.xml
new file mode 100644
index 0000000..dde6f06
--- /dev/null
+++ b/files/sku/topology/ndv4-topo.xml
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/files/sku/topology/ndv5-topo.xml b/files/sku/topology/ndv5-topo.xml
new file mode 100644
index 0000000..fce6581
--- /dev/null
+++ b/files/sku/topology/ndv5-topo.xml
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tasks/main.yml b/tasks/main.yml
index 282ef6b..713748a 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -760,6 +760,62 @@
         group: root
         mode: '0755'
 
+- name: Install SKU Customisation scripts and services
+  when: hpc_sku_customisation
+  block:
+    - name: Check if already installed
+      stat:
+        path: "{{ __hpc_azure_resource_dir }}/topology"
+      register: __hpc_sku_topology_stat
+
+    - name: Install files
+      when: not __hpc_sku_topology_stat.stat.exists
+      block:
+        - name: Install Topology Definitions
+          copy:
+            src: sku/topology/
+            dest: "{{ __hpc_azure_resource_dir }}/topology"
+            owner: root
+            group: root
+            mode: '0644'
+
+        - name: Install customisation scripts
+          copy:
+            src: sku/customisations/
+            dest: "{{ __hpc_azure_resource_dir }}/customisations"
+            owner: root
+            group: root
+            mode: '0755'
+
+        - name: Install setup script
+          template:
+            src: sku/setup_sku_customisations.sh
+            dest: "{{ __hpc_azure_resource_dir }}/bin/setup_sku_customisations.sh"
+            owner: root
+            group: root
+            mode: '0755'
+
+        - name: Install removal script
+          template:
+            src: sku/remove_sku_customisations.sh
+            dest: "{{ __hpc_azure_resource_dir }}/bin/remove_sku_customisations.sh"
+            owner: root
+            group: root
+            mode: '0755'
+
+        - name: Install systemd service file
+          template:
+            src: sku/sku_customisation.service
+            dest: /etc/systemd/system/sku_customisation.service
+            owner: root
+            group: root
+            mode: '0644'
+
+        - name: Enable systemd service file
+          service:
+            name: sku_customisation.service
+            enabled: true
+
 - name: Remove build dependencies
   vars:
     __hpc_dependencies: >-
diff --git a/templates/sku/remove_sku_customisations.sh b/templates/sku/remove_sku_customisations.sh
new file mode 100644
index 0000000..ca89f9e
--- /dev/null
+++ b/templates/sku/remove_sku_customisations.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+
+export TOPOLOGY_SRC_DIR="{{ __hpc_azure_resource_dir }}"
+export TOPOLOGY_RUNTIME_DIR="{{ __hpc_azure_runtime_dir }}/topology"
+
+# Stop nvidia fabric manager
+if systemctl is-active --quiet nvidia-fabricmanager ; then
+    systemctl stop nvidia-fabricmanager
+    systemctl disable nvidia-fabricmanager
+fi
+
+# Remove NVIDIA peer memory module
+if lsmod | grep nvidia_peermem &> /dev/null ; then
+    rmmod nvidia_peermem
+fi
+
+# Clear
topo and graph files
+rm -rf "${TOPOLOGY_RUNTIME_DIR}"
+
+# Clear contents of nccl.conf
+cat /dev/null > /etc/nccl.conf
diff --git a/templates/sku/setup_sku_customisations.sh b/templates/sku/setup_sku_customisations.sh
new file mode 100644
index 0000000..b3beebe
--- /dev/null
+++ b/templates/sku/setup_sku_customisations.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+
+# Set up the common NCCL config options so we can append the configured
+# customisations to it as needed.
+export NCCL_CONF="/etc/nccl.conf"
+echo "NCCL_IGNORE_CPU_AFFINITY=1" > "${NCCL_CONF}"
+
+# define the location for the topology and graph customisation files
+export TOPOLOGY_SRC_DIR="{{ __hpc_azure_resource_dir }}"
+export TOPOLOGY_RUNTIME_DIR="{{ __hpc_azure_runtime_dir }}/topology"
+export TOPOLOGY_GRAPH="${TOPOLOGY_RUNTIME_DIR}/graph.xml"
+export TOPOLOGY_FILE="${TOPOLOGY_RUNTIME_DIR}/topo.xml"
+mkdir -p "$TOPOLOGY_RUNTIME_DIR"
+
+# Use the internal metadata service API to determine the type of machine and
+# apply the necessary customisations for that machine.
+#
+# Note: for manual testing, we mock the SKU from the test environment rather
+# than doing an API lookup. The API based customisation will be tested fully
+# from the CI system that runs the testing on appropriate hardware.
+if [ -z "$__MOCK_SKU" ]; then
+    metadata_endpoint="http://169.254.169.254/metadata/instance?api-version=2019-06-04"
+    # "// empty" stops jq printing the literal "null" for a missing vmSize.
+    retry_count=0
+    while (( retry_count++ < 5 )); do
+        sku=$(curl -s -H Metadata:true "${metadata_endpoint}" | jq -r '.compute.vmSize // empty')
+        [ -z "$sku" ] || break
+        sleep 30
+    done
+else
+    sku="$__MOCK_SKU"
+fi
+
+if [ -z "$sku" ]; then
+    echo "Error!
Could not retrieve VM Size from IMDS endpoint"
+    exit 1
+fi
+
+sku=$(echo "$sku" | awk '{print tolower($0)}')
+
+## Topo file setup based on SKU
+case $sku in
+    standard_nc96ads_a100_v4)
+        $TOPOLOGY_SRC_DIR/customisations/ncv4.sh;;
+
+    standard_nd*v4)
+        $TOPOLOGY_SRC_DIR/customisations/ndv4.sh;;
+
+    standard_nd40rs_v2)
+        $TOPOLOGY_SRC_DIR/customisations/ndv2.sh;;
+
+    standard_hb176*v4)
+        $TOPOLOGY_SRC_DIR/customisations/hbv4.sh;;
+
+    standard_nc80adis_h100_v5)
+        $TOPOLOGY_SRC_DIR/customisations/ncv5.sh;;
+
+    standard_nd96is*_h[1-2]00_v5)
+        $TOPOLOGY_SRC_DIR/customisations/ndv5.sh;;
+
+    standard_nd128is*_gb[2-3]00_v6)
+        $TOPOLOGY_SRC_DIR/customisations/ndv6.sh;;
+
+    *) echo "No SKU customization for $sku";;
+esac
+
+# Point NCCL at the configured topology and graph files
+if [ -e "${TOPOLOGY_FILE}" ]; then
+    echo "NCCL_TOPO_FILE=${TOPOLOGY_FILE}" >> "${NCCL_CONF}"
+fi
+if [ -e "${TOPOLOGY_GRAPH}" ]; then
+    echo "NCCL_GRAPH_FILE=${TOPOLOGY_GRAPH}" >> "${NCCL_CONF}"
+fi
+
diff --git a/templates/sku/sku_customisation.service b/templates/sku/sku_customisation.service
new file mode 100644
index 0000000..b3a83e9
--- /dev/null
+++ b/templates/sku/sku_customisation.service
@@ -0,0 +1,16 @@
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+[Unit]
+Description=Customisations based on SKU
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart={{ __hpc_azure_resource_dir }}/bin/setup_sku_customisations.sh
+ExecStop={{ __hpc_azure_resource_dir }}/bin/remove_sku_customisations.sh
+RemainAfterExit=true
+StandardOutput=journal
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tests/roles/linux-system-roles.hpc/files b/tests/roles/linux-system-roles.hpc/files
new file mode 120000
index 0000000..aa29175
--- /dev/null
+++ b/tests/roles/linux-system-roles.hpc/files
@@ -0,0 +1 @@
+../../../files
\ No newline at end of file
From 1b78a26cd574f608ed683bc20a4e95e527c311d5 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Thu, 22 Jan 2026
18:25:48 +1100
Subject: [PATCH 4/4] fix: unknown SKUs should not leave stale files around

When testing the setup script with unknown SKUs, it was found that this
fails to clean the configuration files properly:

....
Testing some_unknown_sku_for_testing
No SKU customization for some_unknown_sku_for_testing
Unknown SKU
Failed: some_unknown_sku_for_testing: /etc/nccl.conf not empty
$

Fix this up by removing the various sku specific files on unknown SKU
types.

Signed-off-by: Dave Chinner
---
 templates/sku/setup_sku_customisations.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/templates/sku/setup_sku_customisations.sh b/templates/sku/setup_sku_customisations.sh
index b3beebe..105638a 100644
--- a/templates/sku/setup_sku_customisations.sh
+++ b/templates/sku/setup_sku_customisations.sh
@@ -63,7 +63,11 @@ case $sku in
     standard_nd128is*_gb[2-3]00_v6)
         $TOPOLOGY_SRC_DIR/customisations/ndv6.sh;;
 
-    *) echo "No SKU customization for $sku";;
+    *) echo "No SKU customization for $sku"
+        rm -f "$TOPOLOGY_GRAPH"
+        rm -f "$TOPOLOGY_FILE"
+        rm -f "$NCCL_CONF"
+        ;;
 esac
 
 # Point NCCL at the configured topology and graph files