diff --git a/README.md b/README.md index 9c1bd3d..c4e0001 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,17 @@ Default: `true` Type: `bool` +### hpc_sku_customisation + +Whether to install the hardware tuning files for different Azure VM types (SKUs). + +This will install definitions for optimal hardware configurations for the different types of high performance VMs that are typically used for HPC workloads in the Azure environment. +These include Infiniband and GPU/NVLink and NCCL customisations, as well as any workarounds for specific hardware problems that may be needed. + +Default: `true` + +Type: `bool` + ## Variables for Configuring How Role Reboots Managed Nodes ### hpc_reboot_ok diff --git a/defaults/main.yml b/defaults/main.yml index 9d235a3..7f7ff21 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -23,6 +23,7 @@ hpc_build_openmpi_w_nvidia_gpu_support: true hpc_install_moneo: true hpc_tuning: true +hpc_sku_customisation: true hpc_update_kernel: true hpc_update_all_packages: false diff --git a/files/sku/customisations/hbv4.sh b/files/sku/customisations/hbv4.sh new file mode 100755 index 0000000..af76aaf --- /dev/null +++ b/files/sku/customisations/hbv4.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# there are no current topology optimisations for this hardware. 
+rm -f "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
diff --git a/files/sku/customisations/ncv4.sh b/files/sku/customisations/ncv4.sh
new file mode 100755
index 0000000..fe7ecca
--- /dev/null
+++ b/files/sku/customisations/ncv4.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Set up the NCv4 topology and graph files
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ncv4-topo.xml" "${TOPOLOGY_FILE}"
+ln -sf "${TOPOLOGY_SRC_DIR}/topology/ncv4-graph.xml" "${TOPOLOGY_GRAPH}"
diff --git a/files/sku/customisations/ncv5.sh b/files/sku/customisations/ncv5.sh
new file mode 100755
index 0000000..3177995
--- /dev/null
+++ b/files/sku/customisations/ncv5.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# there are no current topology optimisations for this hardware.
+rm -f "${TOPOLOGY_FILE}"
+rm -f "${TOPOLOGY_GRAPH}"
+
+# Attempt to work around NVLink initialisation bug
+if nvidia-smi nvlink --status | grep -qa inActive; then
+  # Ensure Hyper-V PCI devices are ready
+  retries=0
+  while ! ls /sys/bus/vmbus/drivers/hv_pci/ | grep -q '[0-9a-f-]\{8\}-'; do
+    # NOTE: inside a 'while ! cmd' loop body, $? is the inverted loop-test
+    # status (always 0), so exit with an explicit non-zero code instead.
+    if (( retries++ >= 5 )); then
+      echo "Hyper-V PCI devices Inactive!"
+      exit 1
+    fi
+    echo "Waiting for Hyper-V PCI devices..."
+    sleep 1
+  done
+
+  # Ensure NVIDIA GPU PCI devices are ready
+  retries=0
+  while ! lspci | grep -qi nvidia; do
+    if (( retries++ >= 5 )); then
+      echo "NVIDIA GPU PCI Inactive!"
+      exit 1
+    fi
+    echo "Waiting for NVIDIA GPU PCI devices..."
+    sleep 1
+  done
+
+  echo "Reloading NVIDIA kernel modules..."
+  sudo systemctl stop nvidia-dcgm.service
+  sudo modprobe -r nvidia_drm nvidia_modeset gdrdrv nvidia_peermem nvidia_uvm nvidia
+  sudo modprobe nvidia nvidia_modeset nvidia_uvm nvidia_peermem gdrdrv nvidia_drm
+  sudo systemctl start nvidia-dcgm.service
+fi
+
+echo "Check NVLink status after reloading NVIDIA kernel modules..."
+if nvidia-smi nvlink --status | grep -qa inActive; then
+  echo "NVLink is still Inactive after reloading NVIDIA kernel modules!"
+  exit 1
+else
+  echo "NVLink is Active."
+fi diff --git a/files/sku/customisations/ndv2.sh b/files/sku/customisations/ndv2.sh new file mode 100755 index 0000000..9b4da0d --- /dev/null +++ b/files/sku/customisations/ndv2.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Link the NDv2 topology file, no graph for this machine type +ln -sf ${TOPOLOGY_SRC_DIR}/topology/ndv2-topo.xml ${TOPOLOGY_FILE} +rm -f ${TOPOLOGY_GRAPH} + diff --git a/files/sku/customisations/ndv4.sh b/files/sku/customisations/ndv4.sh new file mode 100755 index 0000000..ffad164 --- /dev/null +++ b/files/sku/customisations/ndv4.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Link the NDv4 topology file, no graph for this machine type +ln -sf ${TOPOLOGY_SRC_DIR}/topology/ndv4-topo.xml ${TOPOLOGY_FILE} +rm -f ${TOPOLOGY_GRAPH} + +# Apply NDv4 specific NCCL configurations +echo "NCCL_IB_PCI_RELAXED_ORDERING=1" >> ${NCCL_CONF} + +## NVIDIA Fabric manager +systemctl enable nvidia-fabricmanager +systemctl start nvidia-fabricmanager +systemctl is-active --quiet nvidia-fabricmanager + +error_code=$? +if [ ${error_code} -ne 0 ]; then + echo "NVIDIA Fabric Manager Inactive!" + exit ${error_code} +fi diff --git a/files/sku/customisations/ndv5.sh b/files/sku/customisations/ndv5.sh new file mode 100755 index 0000000..53473e0 --- /dev/null +++ b/files/sku/customisations/ndv5.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Link the NDv5 topology file, no graph for this machine type +ln -sf ${TOPOLOGY_SRC_DIR}/topology/ndv5-topo.xml ${TOPOLOGY_FILE} +rm -f ${TOPOLOGY_GRAPH} + +# Apply NDv5 specific NCCL configurations +echo "NCCL_IB_PCI_RELAXED_ORDERING=1" >> ${NCCL_CONF} + +## NVIDIA Fabric manager +systemctl enable nvidia-fabricmanager +systemctl start nvidia-fabricmanager +systemctl is-active --quiet nvidia-fabricmanager + +error_code=$? +if [ ${error_code} -ne 0 ]; then + echo "NVIDIA Fabric Manager Inactive!" 
+ exit ${error_code} +fi diff --git a/files/sku/customisations/ndv6.sh b/files/sku/customisations/ndv6.sh new file mode 100755 index 0000000..7eb7416 --- /dev/null +++ b/files/sku/customisations/ndv6.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# there are no current topology optimisations for this hardware. +rm -f ${TOPOLOGY_FILE} +rm -f ${TOPOLOGY_GRAPH} + diff --git a/files/sku/topology/ncv4-graph.xml b/files/sku/topology/ncv4-graph.xml new file mode 100644 index 0000000..c28cf52 --- /dev/null +++ b/files/sku/topology/ncv4-graph.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/files/sku/topology/ncv4-topo.xml b/files/sku/topology/ncv4-topo.xml new file mode 100644 index 0000000..cf0eeb6 --- /dev/null +++ b/files/sku/topology/ncv4-topo.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/files/sku/topology/ndv2-topo.xml b/files/sku/topology/ndv2-topo.xml new file mode 100644 index 0000000..dd1e0cd --- /dev/null +++ b/files/sku/topology/ndv2-topo.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/files/sku/topology/ndv4-topo.xml b/files/sku/topology/ndv4-topo.xml new file mode 100644 index 0000000..dde6f06 --- /dev/null +++ b/files/sku/topology/ndv4-topo.xml @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/files/sku/topology/ndv5-topo.xml b/files/sku/topology/ndv5-topo.xml new file mode 100644 index 0000000..fce6581 --- /dev/null +++ b/files/sku/topology/ndv5-topo.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tasks/main.yml b/tasks/main.yml index ba8fd7d..713748a 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -704,6 +704,118 @@ mode: '0644' notify: Reload udev +- name: Ensure Azure HPC resource directories exists + block: + - name: Check for resource directory existence + stat: + 
path: "{{ __hpc_azure_resource_dir }}" + register: __hpc_azure_resource_dir_stat + + - name: Check for runtime directory existence + stat: + path: "{{ __hpc_azure_runtime_dir }}" + register: __hpc_azure_runtime_dir_stat + + - name: Create resource directories + when: not __hpc_azure_resource_dir_stat.stat.exists + block: + - name: Create base resource directory + file: + path: "{{ __hpc_azure_resource_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: Location for binaries + file: + path: "{{ __hpc_azure_resource_dir }}/bin" + state: directory + owner: root + group: root + mode: '0755' + + - name: Location for tools + file: + path: "{{ __hpc_azure_tools_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: Location for tests + file: + path: "{{ __hpc_azure_tests_dir }}" + state: directory + owner: root + group: root + mode: '0755' + + - name: Runtime resource directory + when: not __hpc_azure_runtime_dir_stat.stat.exists + file: + path: "{{ __hpc_azure_runtime_dir }}" + state: directory + owner: root + group: root + mode: '0755' + +- name: Install SKU Customisation scripts and services + when: hpc_sku_customisation + block: + - name: Check if already installed + stat: + path: "{{ __hpc_azure_resource_dir }}/topology" + register: __hpc_sku_topology_stat + + - name: Install files + when: not __hpc_sku_topology_stat.stat.exists + block: + - name: Install Topology Definitions + copy: + src: sku/topology/ + dest: "{{ __hpc_azure_resource_dir }}/topology" + owner: root + group: root + mode: '0755' + + - name: Install Graph Files + copy: + src: sku/customisations/ + dest: "{{ __hpc_azure_resource_dir }}/customisations" + owner: root + group: root + mode: '0755' + + - name: Install setup script + template: + src: sku/setup_sku_customisations.sh + dest: "{{ __hpc_azure_resource_dir }}/bin/setup_sku_customisations.sh" + owner: root + group: root + mode: '0755' + + - name: Install removal script + template: + src: 
sku/remove_sku_customisations.sh
+            dest: "{{ __hpc_azure_resource_dir }}/bin/remove_sku_customisations.sh"
+            owner: root
+            group: root
+            mode: '0755'
+
+    - name: Install systemd service file
+      template:
+        src: sku/sku_customisation.service
+        dest: /etc/systemd/system/
+        owner: root
+        group: root
+        mode: '0644'
+
+    - name: Enable systemd service file
+      service:
+        name: sku_customisation.service
+        enabled: true
+
 - name: Remove build dependencies
   vars:
     __hpc_dependencies: >-
diff --git a/templates/sku/remove_sku_customisations.sh b/templates/sku/remove_sku_customisations.sh
new file mode 100644
index 0000000..ca89f9e
--- /dev/null
+++ b/templates/sku/remove_sku_customisations.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+
+export TOPOLOGY_SRC_DIR="{{ __hpc_azure_resource_dir }}"
+export TOPOLOGY_RUNTIME_DIR="{{ __hpc_azure_runtime_dir }}/topology"
+
+# Stop nvidia fabric manager
+if systemctl is-active --quiet nvidia-fabricmanager ; then
+  systemctl stop nvidia-fabricmanager
+  systemctl disable nvidia-fabricmanager
+fi
+
+# Remove NVIDIA peer memory module
+if lsmod | grep nvidia_peermem &> /dev/null ; then
+  rmmod nvidia_peermem
+fi
+
+# Clear topo and graph files (:? guards against an empty var wiping /)
+rm -rf -- "${TOPOLOGY_RUNTIME_DIR:?}"
+
+# Clear contents of nccl.conf
+cat /dev/null > /etc/nccl.conf
diff --git a/templates/sku/setup_sku_customisations.sh b/templates/sku/setup_sku_customisations.sh
new file mode 100644
index 0000000..105638a
--- /dev/null
+++ b/templates/sku/setup_sku_customisations.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+
+# Set up the common NCCL config options so we can append the configured
+# customisations to it as needed.
+export NCCL_CONF="/etc/nccl.conf"
+echo "NCCL_IGNORE_CPU_AFFINITY=1" > ${NCCL_CONF}
+
+# define the location for the topology and graph customisation files
+export TOPOLOGY_SRC_DIR="{{ __hpc_azure_resource_dir }}"
+export TOPOLOGY_RUNTIME_DIR="{{ __hpc_azure_runtime_dir }}/topology"
+export TOPOLOGY_GRAPH="${TOPOLOGY_RUNTIME_DIR}/graph.xml"
+export TOPOLOGY_FILE="${TOPOLOGY_RUNTIME_DIR}/topo.xml"
+mkdir -p "${TOPOLOGY_RUNTIME_DIR}"
+
+# Use the internal metadata service API to determine the type of machine and
+# apply the necessary customisations for that machine.
+#
+# Note: for manual testing, we mock the SKU from the test environment rather
+# than doing an API lookup. The API based customisation will be tested fully
+# from the CI system that runs the testing on appropriate hardware.
+if [ -z "$__MOCK_SKU" ]; then
+  metadata_endpoint="http://169.254.169.254/metadata/instance?api-version=2019-06-04"
+
+  retry_count=0
+  while (( retry_count++ < 5 )); do
+    sku=$(curl -s -H Metadata:true "${metadata_endpoint}" | jq -r ".compute.vmSize")
+    [ -n "$sku" ] && [ "$sku" != "null" ] && break
+    sleep 30
+  done
+else
+  sku="$__MOCK_SKU"
+fi
+
+if [ -z "$sku" ]; then
+  echo "Error! 
Could not retrieve VM Size from IMDS endpoint"
+  exit 1
+fi
+
+sku=$(echo "$sku" | awk '{print tolower($0)}')
+
+## Topo file setup based on SKU
+case "$sku" in
+  standard_nc96ads_a100_v4)
+    "${TOPOLOGY_SRC_DIR}/customisations/ncv4.sh";;
+
+  standard_nd*v4)
+    "${TOPOLOGY_SRC_DIR}/customisations/ndv4.sh";;
+
+  standard_nd40rs_v2)
+    "${TOPOLOGY_SRC_DIR}/customisations/ndv2.sh";;
+
+  standard_hb176*v4)
+    "${TOPOLOGY_SRC_DIR}/customisations/hbv4.sh";;
+
+  standard_nc80adis_h100_v5)
+    "${TOPOLOGY_SRC_DIR}/customisations/ncv5.sh";;
+
+  standard_nd96is*_h[1-2]00_v5)
+    "${TOPOLOGY_SRC_DIR}/customisations/ndv5.sh";;
+
+  standard_nd128is*_gb[2-3]00_v6)
+    "${TOPOLOGY_SRC_DIR}/customisations/ndv6.sh";;
+
+  *) echo "No SKU customization for $sku"
+     rm -f "$TOPOLOGY_GRAPH"
+     rm -f "$TOPOLOGY_FILE"
+     rm -f "$NCCL_CONF"
+     ;;
+esac
+
+# Point NCCL at the configured topology and graph files
+if [ -e "${TOPOLOGY_FILE}" ]; then
+  echo "NCCL_TOPO_FILE=${TOPOLOGY_FILE}" >> ${NCCL_CONF}
+fi
+if [ -e "${TOPOLOGY_GRAPH}" ]; then
+  echo "NCCL_GRAPH_FILE=${TOPOLOGY_GRAPH}" >> ${NCCL_CONF}
+fi
+
diff --git a/templates/sku/sku_customisation.service b/templates/sku/sku_customisation.service
new file mode 100644
index 0000000..b3a83e9
--- /dev/null
+++ b/templates/sku/sku_customisation.service
@@ -0,0 +1,17 @@
+{{ ansible_managed | comment }}
+{{ "system_role:hpc" | comment(prefix="", postfix="") }}
+[Unit]
+Description=Customisations based on SKU
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart={{ __hpc_azure_resource_dir }}/bin/setup_sku_customisations.sh
+ExecStop={{ __hpc_azure_resource_dir }}/bin/remove_sku_customisations.sh
+RemainAfterExit=true
+StandardOutput=journal
+
+[Install]
+WantedBy=multi-user.target
+
diff --git a/tests/roles/linux-system-roles.hpc/files b/tests/roles/linux-system-roles.hpc/files
new file mode 120000
index 0000000..aa29175
--- /dev/null
+++ b/tests/roles/linux-system-roles.hpc/files
@@ -0,0 +1 @@
+../../../files
\ No newline at end of file
diff --git a/vars/main.yml b/vars/main.yml
index 
2773cf3..9e7bde2 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -71,6 +71,14 @@ __hpc_kernel_versionlock_rpms: __hpc_install_prefix: /opt __hpc_moneo_install_dir: /opt/azurehpc/tools +# location for azure specific scripts and resources +__hpc_azure_resource_dir: "{{ __hpc_install_prefix }}/hpc/azure" +__hpc_azure_tools_dir: "{{ __hpc_azure_resource_dir }}/tools" +__hpc_azure_tests_dir: "{{ __hpc_azure_resource_dir }}/tests" + +# location for runtime selected azure resources +__hpc_azure_runtime_dir: /var/hpc/azure + __hpc_module_dir: /usr/share/modulefiles # A variable for testing purposes to force a specific kernel version to be installed