From 8211200dbdb24461e57a4a0764d5c1f4b5bcc662 Mon Sep 17 00:00:00 2001 From: Aviv Zecharia Date: Mon, 19 May 2025 15:32:52 +0300 Subject: [PATCH 1/2] Remove redundant fmpm list commands and for loops --- qemu | 90 +++++++++++++++++++++++------------------------------------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/qemu b/qemu index 94aa237..718d6c5 100755 --- a/qemu +++ b/qemu @@ -9,19 +9,19 @@ import json import logging from logging.handlers import SysLogHandler -import os import subprocess import sys import xml.etree.ElementTree as etree LOG = logging.getLogger(__name__) LOG.setLevel(logging.DEBUG) -handler = SysLogHandler(facility=SysLogHandler.LOG_DAEMON,address='/dev/log') +handler = SysLogHandler(facility=SysLogHandler.LOG_DAEMON, address='/dev/log') handler.setLevel(logging.DEBUG) fmt = logging.Formatter('libvirt-hook: %(levelname)s: %(message)s') handler.setFormatter(fmt) LOG.addHandler(handler) + def call(args): LOG.debug("About to execute %s " % ' '.join(args)) subprocess.check_call(args) @@ -31,10 +31,11 @@ def gpusExist(xmlHostdevs): num_gpus = 0 gpu_bdfs = set() - output = subprocess.run(r'nvidia-smi -q | grep -i "GPU 00000000"', shell=True, universal_newlines=True, stdout=subprocess.PIPE).stdout + output = subprocess.run(r'nvidia-smi -q | grep -i "GPU 00000000"', shell=True, universal_newlines=True, + stdout=subprocess.PIPE).stdout for line in output.split("\n"): if line: - gpu_bdfs.add(line.split(":",1)[1].lower()) + gpu_bdfs.add(line.split(":", 1)[1].lower()) for pci_hostdev_addr in xmlHostdevs: if pci_hostdev_addr is None: @@ -42,10 +43,12 @@ def gpusExist(xmlHostdevs): break # Form BDF string from
in VM xml - pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'),0) - pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'),0) - pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'),0) - pci_hostdev_bdf_from_xml = format(pci_hostdev_addr_bus, '02X') + ':' + format(pci_hostdev_addr_device, '02X') + '.' + format(pci_hostdev_addr_function, '01X') + pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'), 0) + pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'), 0) + pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'), 0) + pci_hostdev_bdf_from_xml = format(pci_hostdev_addr_bus, '02X') + ':' + format(pci_hostdev_addr_device, + '02X') + '.' + format( + pci_hostdev_addr_function, '01X') if pci_hostdev_bdf_from_xml.lower() in gpu_bdfs: num_gpus += 1 @@ -55,6 +58,7 @@ def gpusExist(xmlHostdevs): return False + def main(): LOG.debug("Arguments: %s", sys.argv) @@ -81,89 +85,69 @@ def main(): # Extract current GPU partitions' state and info in json format try: - output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell = True) + output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell=True) except subprocess.CalledProcessError as e: ret = sys.stderr.write('/usr/bin/fmpm -l --hostname ' + FM_IP + ' failed: ' + str(e.returncode)) sys.exit(0) - gpu_partition_json = output.decode("utf-8") gpu_partition_json_data = json.loads(gpu_partition_json) - platform = subprocess.check_output("dmidecode -t 1 | grep Product | awk -F \": \" '{print $2}'", shell = True) + platform = subprocess.check_output("dmidecode -t 1 | grep Product | awk -F \": \" '{print $2}'", shell=True) platform = platform.decode("utf-8").strip() LOG.debug("DEBUG: Platform is %s", platform) - # Build GPU Module ID to GPU BDF dictionary + # Build GPU Module ID to GPU BDF dictionary and GPU BDF to GPU Module ID dictionary gpus_mod_to_bdf = {} + gpus_bdf_to_mod = {} prior_line = "" module_id_string = "Module ID" if (("DGX-2" in platform) or ("V100" in platform) or ("A100" in platform) or ("A800" in platform)): for i in range(int(gpu_partition_json_data["partitionInfo"][0]["numGpus"])): mod_id = str(gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["physicalId"]) bdf = gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["pciBusId"] - gpus_mod_to_bdf[mod_id] = bdf.split(":",1)[1].strip() + gpus_mod_to_bdf[mod_id] = bdf.split(":", 1)[1].strip() else: - output = subprocess.run(r'nvidia-smi -q | grep -i "Module ID\|GPU 00000000"', shell=True, universal_newlines=True, stdout=subprocess.PIPE).stdout + output = subprocess.run(r'nvidia-smi -q | grep -i "Module ID\|GPU 00000000"', shell=True, + universal_newlines=True, stdout=subprocess.PIPE).stdout for line in output.split("\n"): if module_id_string.casefold() in line.casefold(): mod_id = line.split(":")[1].strip() - gpus_mod_to_bdf[mod_id] = prior_line.split(":",1)[1] + gpus_mod_to_bdf[mod_id] = prior_line.split(":", 1)[1] + gpus_bdf_to_mod[prior_line.split(":", 1)[1]] = mod_id else: prior_line = line LOG.debug("GPU Module ID to GPU BDF mapping: %s", gpus_mod_to_bdf) - - # Build GPU BDF to GPU Module ID dictionary - gpus_bdf_to_mod = {} - prior_line = "" - if (("DGX-2" in platform) or ("V100" in platform) or ("A100" in platform) or ("A800" in platform)): - for i in range(int(gpu_partition_json_data["partitionInfo"][0]["numGpus"])): - mod_id = str(gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["physicalId"]) - bdf = gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["pciBusId"] - gpus_bdf_to_mod[bdf.split(":",1)[1].strip()] = mod_id - else: - output = subprocess.run(r'nvidia-smi -q | grep -i "Module ID\|GPU 00000000"', shell=True, universal_newlines=True, stdout=subprocess.PIPE).stdout - for line in output.split("\n"): - if module_id_string.casefold() in line.casefold(): - mod_id = line.split(":")[1].strip() - gpus_bdf_to_mod[prior_line.split(":",1)[1]] = mod_id - else: - prior_line = line - LOG.debug("GPU BDF to GPU Module ID mapping: %s", gpus_bdf_to_mod) - num_pci_bdf_from_xml = len(root.findall("./devices/hostdev[@type='pci']/source/address")) - # Build Nvidia GPU Module ID list from GPUs in the VM XML num_gpu_bdf_from_xml = 0 - gpu_mod_list_from_xml =[] + gpu_mod_list_from_xml = [] for pci_hostdev_addr in root.findall("./devices/hostdev[@type='pci']/source/address"): if pci_hostdev_addr is None: LOG.debug("No PCI hostdev devices passed through to the VM") break # Form BDF string from
in VM xml - pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'),0) - pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'),0) - pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'),0) - pci_hostdev_bdf_from_xml = format(pci_hostdev_addr_bus, '02X') + ':' + format(pci_hostdev_addr_device, '02X') + '.' + format(pci_hostdev_addr_function, '01X') - + pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'), 0) + pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'), 0) + pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'), 0) + pci_hostdev_bdf_from_xml = (format(pci_hostdev_addr_bus, '02X') + ':' + + format(pci_hostdev_addr_device, '02X') + '.' + + format(pci_hostdev_addr_function, '01X')) + if pci_hostdev_bdf_from_xml in list(gpus_bdf_to_mod.keys()): # pci hostdev BDF from xml is a Nvidia GPU LOG.debug("%s is a Nvidia GPU", pci_hostdev_bdf_from_xml) - num_gpu_bdf_from_xml +=1 + num_gpu_bdf_from_xml += 1 # Add this GPU BDF's Module ID to a list gpu_mod_list_from_xml.append(int(gpus_bdf_to_mod[pci_hostdev_bdf_from_xml])) LOG.debug("Number of GPUs passed through in the VM XML is %s", num_gpu_bdf_from_xml) LOG.debug("GPU Module IDs %s for GPUs passed through in the VM XML", gpu_mod_list_from_xml) - # Extract current GPU partitions' state and info in json format - output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell = True) - gpu_partition_json = output.decode("utf-8") - gpu_partition_json_data = json.loads(gpu_partition_json) - - gpu_partitions = list(filter(lambda x:x["numGpus"] == num_gpu_bdf_from_xml, gpu_partition_json_data["partitionInfo"])) + gpu_partitions = list( + filter(lambda x: x["numGpus"] == num_gpu_bdf_from_xml, gpu_partition_json_data["partitionInfo"])) if not gpu_partitions: LOG.debug("No supported GPU partition with %s GPUs as passed in the VM XML", num_gpu_bdf_from_xml) sys.exit(0) @@ -192,12 +176,8 @@ def main(): else: LOG.debug("GPU partition %s contains GPUs as passed throught in the VM XML", gpu_partition_id) - # Extract current GPU partitions' state and info in json format - output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell = True) - gpu_partition_json = output.decode("utf-8") - gpu_partition_json_data = json.loads(gpu_partition_json) - - partition = list(filter(lambda x:x["partitionId"] == int(gpu_partition_id), gpu_partition_json_data["partitionInfo"])) + partition = list( + filter(lambda x: x["partitionId"] == int(gpu_partition_id), gpu_partition_json_data["partitionInfo"])) if not partition: LOG.debug("Get Partition state: No partitionInfo matching partition ID %s", gpu_partition_id) @@ -211,7 +191,7 @@ def main(): for gpu in gpu_infos: gpus_bdf_list.append(gpus_mod_to_bdf[str(gpu["physicalId"])]) # action is prepare during VM create - if (action == 'prepare'): + if (action == 'prepare'): if partition_isActive != 0: LOG.debug("GPU Partition %s is already active during action = prepare", gpu_partition_id) sys.exit(0) From 43b6db4447eb84bd3f34c0bb9c76fd955cbb5686 Mon Sep 17 00:00:00 2001 From: Aviv Zecharia Date: Mon, 19 May 2025 15:33:27 +0300 Subject: [PATCH 2/2] Exit code 1 if failed to list partitions --- qemu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu b/qemu index 718d6c5..d243fa9 100755 --- a/qemu +++ b/qemu @@ -87,8 +87,8 @@ def main(): try: output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell=True) except subprocess.CalledProcessError as e: - ret = sys.stderr.write('/usr/bin/fmpm -l --hostname ' + FM_IP + ' failed: ' + str(e.returncode)) - sys.exit(0) + sys.stderr.write('/usr/bin/fmpm -l --hostname ' + FM_IP + ' failed: ' + str(e.returncode)) + sys.exit(1) gpu_partition_json = output.decode("utf-8") gpu_partition_json_data = json.loads(gpu_partition_json)