From 25d0c83e078706cc0a558277afcebfeb80c516c7 Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Tue, 28 Oct 2025 23:18:30 +0100 Subject: [PATCH 1/2] Drop hardcoded Hot Aisle VM specs Use the spec object from gpuhunt offers instead. This allows newly added instance types with different CPU, RAM, disk, and GPU count configurations to automatically become available in dstack. However, limit the supported GPUs to MI300X, since other GPUs and CPU-only VMs might need to be tested by the dstack team before they become available to users. --- pyproject.toml | 6 +- .../core/backends/hotaisle/compute.py | 108 ++++-------------- 2 files changed, 30 insertions(+), 84 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3d4f6f1cb3..ea0a5e1c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.11", + # TODO: release and pin new version + "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", @@ -67,6 +68,9 @@ artifacts = [ "src/dstack/_internal/server/statics/**", ] +[tool.hatch.metadata] +allow-direct-references = true # TODO: unset + [tool.hatch.metadata.hooks.fancy-pypi-readme] content-type = "text/markdown" diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py index 200173b1f9..10013b22a2 100644 --- a/src/dstack/_internal/core/backends/hotaisle/compute.py +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -2,7 +2,7 @@ import subprocess import tempfile from threading import Thread -from typing import List, Optional +from typing import Any, List, Optional import gpuhunt from gpuhunt.providers.hotaisle import HotAisleProvider @@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, + InstanceOffer, InstanceOfferWithAvailability, ) from dstack._internal.core.models.placement import PlacementGroup @@ -31,48 +32,7 @@ logger = get_logger(__name__) -INSTANCE_TYPE_SPECS = { - "1x MI300X 8x Xeon Platinum 8462Y+": { - "cpu_model": "Xeon Platinum 8462Y+", - "cpu_frequency": 2800000000, - "cpu_manufacturer": "Intel", - }, - "1x MI300X 13x Xeon Platinum 8470": { - "cpu_model": "Xeon Platinum 8470", - "cpu_frequency": 2000000000, - "cpu_manufacturer": "Intel", - }, - "2x MI300X 26x Xeon Platinum 8470": { - "cpu_model": "Xeon Platinum 8470", - "cpu_frequency": 2000000000, - "cpu_manufacturer": "Intel", - }, - "2x MI300X 26x Xeon Platinum 8462Y+": { - "cpu_model": "Xeon Platinum 8462Y+", - "cpu_frequency": 2800000000, - "cpu_manufacturer": "Intel", - }, - "4x MI300X 52x Xeon Platinum 8470": { - "cpu_model": "Xeon Platinum 8470", - "cpu_frequency": 2000000000, - "cpu_manufacturer": "Intel", - }, - "4x MI300X 52x Xeon Platinum 8462Y+": { - "cpu_model": "Xeon Platinum 8462Y+", - "cpu_frequency": 2800000000, - "cpu_manufacturer": "Intel", - }, - "8x MI300X 104x Xeon Platinum 8470": { - "cpu_model": "Xeon Platinum 8470", - "cpu_frequency": 2000000000, - "cpu_manufacturer": "Intel", - }, - "8x MI300X 104x Xeon Platinum 8462Y+": { - "cpu_model": "Xeon Platinum 8462Y+", - "cpu_frequency": 2800000000, - "cpu_manufacturer": "Intel", - }, -} +SUPPORTED_GPUS = ["MI300X"] class HotAisleCompute( @@ -95,45 +55,15 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability backend=BackendType.HOTAISLE, locations=self.config.regions or None, catalog=self.catalog, + extra_filter=_supported_instances, ) - supported_offers = [] - for offer in offers: - if offer.instance.name in INSTANCE_TYPE_SPECS: - supported_offers.append( - InstanceOfferWithAvailability( - **offer.dict(), availability=InstanceAvailability.AVAILABLE - ) - ) - else: - logger.warning( - f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}" - ) - return supported_offers - - def get_payload_from_offer(self, instance_type) -> dict: - instance_type_name = instance_type.name - cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name] - cpu_cores = instance_type.resources.cpus - - return { - "cpu_cores": cpu_cores, - "cpus": { - "count": 1, - "manufacturer": cpu_specs["cpu_manufacturer"], - "model": cpu_specs["cpu_model"], - "cores": cpu_cores, - "frequency": cpu_specs["cpu_frequency"], - }, - "disk_capacity": instance_type.resources.disk.size_mib * 1024**2, - "ram_capacity": instance_type.resources.memory_mib * 1024**2, - "gpus": [ - { - "count": len(instance_type.resources.gpus), - "manufacturer": instance_type.resources.gpus[0].vendor, - "model": instance_type.resources.gpus[0].name, - } - ], - } + return [ + InstanceOfferWithAvailability( + **offer.dict(), + availability=InstanceAvailability.AVAILABLE, + ) + for offer in offers + ] def create_instance( self, @@ -143,8 +73,10 @@ def create_instance( ) -> JobProvisioningData: project_ssh_key = instance_config.ssh_keys[0] self.api_client.upload_ssh_key(project_ssh_key.public) - vm_payload = self.get_payload_from_offer(instance_offer.instance) - vm_data = self.api_client.create_virtual_machine(vm_payload) + offer_backend_data: HotAisleOfferBackendData = ( + HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data) + ) + vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs) return JobProvisioningData( backend=instance_offer.backend, instance_type=instance_offer.instance, @@ -240,6 +172,12 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): ) +def _supported_instances(offer: InstanceOffer) -> bool: + return len(offer.instance.resources.gpus) > 0 and all( + gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus + ) + + class HotAisleInstanceBackendData(CoreModel): ip_address: str @@ -247,3 +185,7 @@ class HotAisleInstanceBackendData(CoreModel): def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData": assert raw is not None return cls.__response__.parse_raw(raw) + + +class HotAisleOfferBackendData(CoreModel): + vm_specs: dict[str, Any] From 505e0318af00f8112de5fd85a54548a8aa10473f Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Fri, 5 Dec 2025 14:16:35 +0100 Subject: [PATCH 2/2] Pin new `gpuhunt` version --- pyproject.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ea0a5e1c6f..0e7d56c748 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,8 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - # TODO: release and pin new version - "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip", + "gpuhunt==0.1.14", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", @@ -68,9 +67,6 @@ artifacts = [ "src/dstack/_internal/server/statics/**", ] -[tool.hatch.metadata] -allow-direct-references = true # TODO: unset - [tool.hatch.metadata.hooks.fancy-pypi-readme] content-type = "text/markdown"