diff --git a/contributing/BACKENDS.md b/contributing/BACKENDS.md index 7a7a742316..f53750791a 100644 --- a/contributing/BACKENDS.md +++ b/contributing/BACKENDS.md @@ -42,7 +42,7 @@ base class. See its docstrings for descriptions of the methods that your class s Refer to examples: - Offline providers: - [datacrunch.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/datacrunch.py), + [verda.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/verda.py), [aws.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/aws.py), [azure.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/azure.py), [lambdalabs.py](https://github.com/dstackai/gpuhunt/blob/main/src/gpuhunt/providers/lambdalabs.py). @@ -64,7 +64,7 @@ Add your provider in the following places: For offline providers, you can add data quality tests under `src/integrity_tests/`. Data quality tests are run after collecting offline catalogs to ensure their integrity. -Refer to examples: [test_datacrunch.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_datacrunch.py), +Refer to examples: [test_verda.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_verda.py), [test_gcp.py](https://github.com/dstackai/gpuhunt/blob/main/src/integrity_tests/test_gcp.py). ### 1.6. Submit a pull request @@ -125,7 +125,7 @@ Then add these models to `AnyBackendConfig*` unions in [`src/dstack/_internal/co The script also generates `*BackendStoredConfig` that extends `*BackendConfig` to be able to store extra parameters in the DB. By the same logic, it generates `*Config` that extends `*BackendStoredConfig` with creds and uses it as the main `Backend` and `Compute` config instead of using `*BackendConfigWithCreds` directly. 
Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/models.py), +[verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/models.py), [aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/models.py), [gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/models.py), [azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/models.py), etc. @@ -136,7 +136,7 @@ Go to `compute.py` and implement `Compute` methods. Optionally, extend and implement `ComputeWith*` classes to support additional features such as fleets, volumes, gateways, placement groups, etc. For example, extend `ComputeWithCreateInstanceSupport` to support fleets. Refer to examples: -[datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/compute.py), +[verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/compute.py), [aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/compute.py), [gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/compute.py), [azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/compute.py), etc. @@ -146,7 +146,7 @@ Refer to examples: Go to `configurator.py` and implement custom `Configurator` logic. At minimum, you should implement creds validation. You may also need to validate other config parameters if there are any. 
-Refer to examples: [datacrunch](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/datacrunch/configurator.py), +Refer to examples: [verda](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/verda/configurator.py), [aws](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/aws/configurator.py), [gcp](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/gcp/configurator.py), [azure](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/core/backends/azure/configurator.py), etc. @@ -195,7 +195,7 @@ For some VM-based backends, the `dstack` team also maintains [custom VM images](../scripts/packer/README.md) with the required dependencies and `dstack`-specific optimizations. -Examples of VM-based backends include: `aws`, `azure`, `gcp`, `lambda`, `datacrunch`, `tensordock`, etc. +Examples of VM-based backends include: `aws`, `azure`, `gcp`, `lambda`, `verda`, etc. #### 3.1.2. 
Container-based backend compute type diff --git a/contributing/GPUHUNT.md b/contributing/GPUHUNT.md index c30ccc807a..33e3150fa1 100644 --- a/contributing/GPUHUNT.md +++ b/contributing/GPUHUNT.md @@ -65,7 +65,7 @@ Some providers offer extreme flexibility in possible configurations, but not all - Filters out if: outdated family, not supported family - Queries configuration details to fill CPU, RAM, and GPU information -### DataCrunch +### Verda - Just queries all offers via API diff --git a/docs/assets/images/verda-logo.svg b/docs/assets/images/verda-logo.svg new file mode 100644 index 0000000000..3f688a7d44 --- /dev/null +++ b/docs/assets/images/verda-logo.svg @@ -0,0 +1,13 @@ + + + + + + + + + diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 7c1eb0b791..574342318f 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -854,9 +854,11 @@ There are two ways to configure OCI: using client credentials or using the defau compartment_id: ocid1.compartment.oc1..aaaaaaaa ``` -### DataCrunch + -Log into your [DataCrunch](https://cloud.datacrunch.io/) account, click Keys in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. +### Verda (formerly DataCrunch) { #verda } + +Log into your [Verda](https://console.verda.com/signin) account, click Keys in the sidebar, find `REST API Credentials` area and then click the `Generate Credentials` button. 
Then, go ahead and configure the backend: diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index cab6c3d29b..90283eb4de 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -258,18 +258,18 @@ to configure [backends](../../concepts/backends.md) and other [server-level sett type: required: true -##### `projects[n].backends[type=datacrunch]` { #datacrunch data-toc-label="datacrunch" } +##### `projects[n].backends[type=verda]` { #verda data-toc-label="verda" } -#SCHEMA# dstack._internal.core.backends.datacrunch.models.DataCrunchBackendConfigWithCreds +#SCHEMA# dstack._internal.core.backends.verda.models.VerdaBackendConfigWithCreds overrides: show_root_heading: false type: required: true - item_id_prefix: datacrunch- + item_id_prefix: verda- -###### `projects[n].backends[type=datacrunch].creds` { #datacrunch-creds data-toc-label="creds" } +###### `projects[n].backends[type=verda].creds` { #verda-creds data-toc-label="creds" } -#SCHEMA# dstack._internal.core.backends.datacrunch.models.DataCrunchAPIKeyCreds +#SCHEMA# dstack._internal.core.backends.verda.models.VerdaAPIKeyCreds overrides: show_root_heading: false type: diff --git a/docs/partners.md b/docs/partners.md index 86011c09b7..a863fd6a8e 100644 --- a/docs/partners.md +++ b/docs/partners.md @@ -95,9 +95,9 @@ hide: --> - +

- DataCrunch + Verda

diff --git a/pyproject.toml b/pyproject.toml index 5ad026de64..0d149339d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.14", + "gpuhunt==0.1.15", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", @@ -186,7 +186,11 @@ gcp = [ "dstack[server]", ] datacrunch = [ - "datacrunch", + "verda; python_version >= '3.10'", + "dstack[server]", +] +verda = [ + "verda; python_version >= '3.10'", "dstack[server]", ] kubernetes = [ @@ -211,5 +215,5 @@ nebius = [ "dstack[server]", ] all = [ - "dstack[gateway,server,aws,azure,gcp,datacrunch,kubernetes,lambda,nebius,oci]", + "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci]", ] diff --git a/src/dstack/_internal/core/backends/base/offers.py b/src/dstack/_internal/core/backends/base/offers.py index de1d7c4875..d4fae91af5 100644 --- a/src/dstack/_internal/core/backends/base/offers.py +++ b/src/dstack/_internal/core/backends/base/offers.py @@ -39,6 +39,8 @@ def get_catalog_offers( catalog: Optional[gpuhunt.Catalog] = None, ) -> List[InstanceOffer]: provider = backend.value + if backend == BackendType.DATACRUNCH: + provider = BackendType.VERDA.value # Backward compatibility if backend == BackendType.LAMBDA: provider = "lambdalabs" if backend == BackendType.AMDDEVCLOUD: diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index eeb91d0bac..ec7f976c53 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -127,6 +127,15 @@ except ImportError: pass +try: + from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(VerdaConfigurator) +except ImportError: + pass + try: from dstack._internal.core.backends.vultr.configurator import VultrConfigurator diff --git a/src/dstack/_internal/core/backends/datacrunch/__init__.py 
b/src/dstack/_internal/core/backends/datacrunch/__init__.py index e69de29bb2..ca4773c861 100644 --- a/src/dstack/_internal/core/backends/datacrunch/__init__.py +++ b/src/dstack/_internal/core/backends/datacrunch/__init__.py @@ -0,0 +1 @@ +# DataCrunch backend for backward compatibility diff --git a/src/dstack/_internal/core/backends/datacrunch/backend.py b/src/dstack/_internal/core/backends/datacrunch/backend.py index 7df591c289..1ce1c97c42 100644 --- a/src/dstack/_internal/core/backends/datacrunch/backend.py +++ b/src/dstack/_internal/core/backends/datacrunch/backend.py @@ -1,16 +1,18 @@ -from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.backends.datacrunch.compute import DataCrunchCompute -from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig +from dstack._internal.core.backends.verda.backend import VerdaBackend +from dstack._internal.core.backends.verda.models import VerdaConfig from dstack._internal.core.models.backends.base import BackendType -class DataCrunchBackend(Backend): +# Deprecated +# TODO: Remove in 0.21 +class DataCrunchBackend(VerdaBackend): TYPE = BackendType.DATACRUNCH COMPUTE_CLASS = DataCrunchCompute - def __init__(self, config: DataCrunchConfig): + def __init__(self, config: VerdaConfig): self.config = config - self._compute = DataCrunchCompute(self.config) + self._compute = DataCrunchCompute(self.config, self.TYPE) def compute(self) -> DataCrunchCompute: return self._compute diff --git a/src/dstack/_internal/core/backends/datacrunch/compute.py b/src/dstack/_internal/core/backends/datacrunch/compute.py index 6543567c39..906c9ea2e5 100644 --- a/src/dstack/_internal/core/backends/datacrunch/compute.py +++ b/src/dstack/_internal/core/backends/datacrunch/compute.py @@ -1,265 +1,8 @@ -from collections.abc import Iterable -from typing import Dict, List, Optional - -from datacrunch import DataCrunchClient -from datacrunch.exceptions import APIException -from 
datacrunch.instances.instances import Instance - -from dstack._internal.core.backends.base.backend import Compute -from dstack._internal.core.backends.base.compute import ( - ComputeWithAllOffersCached, - ComputeWithCreateInstanceSupport, - ComputeWithPrivilegedSupport, - generate_unique_instance_name, - get_shim_commands, -) -from dstack._internal.core.backends.base.offers import ( - OfferModifier, - get_catalog_offers, - get_offers_disk_modifier, -) -from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig -from dstack._internal.core.errors import NoCapacityError +from dstack._internal.core.backends.verda.compute import VerdaCompute +from dstack._internal.core.backends.verda.models import VerdaConfig from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOffer, - InstanceOfferWithAvailability, -) -from dstack._internal.core.models.placement import PlacementGroup -from dstack._internal.core.models.resources import Memory, Range -from dstack._internal.core.models.runs import JobProvisioningData, Requirements -from dstack._internal.utils.logging import get_logger -from dstack._internal.utils.ssh import get_public_key_fingerprint - -logger = get_logger("datacrunch.compute") - -MAX_INSTANCE_NAME_LEN = 60 - -IMAGE_SIZE = Memory.parse("50GB") - -CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None) - - -class DataCrunchCompute( - ComputeWithAllOffersCached, - ComputeWithCreateInstanceSupport, - ComputeWithPrivilegedSupport, - Compute, -): - def __init__(self, config: DataCrunchConfig): - super().__init__() - self.config = config - self.client = DataCrunchClient( - client_id=self.config.creds.client_id, - client_secret=self.config.creds.client_secret, - ) - - def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: - offers = get_catalog_offers( - backend=BackendType.DATACRUNCH, - 
locations=self.config.regions, - ) - offers_with_availability = self._get_offers_with_availability(offers) - return offers_with_availability - - def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: - return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] - - def _get_offers_with_availability( - self, offers: List[InstanceOffer] - ) -> List[InstanceOfferWithAvailability]: - raw_availabilities: List[Dict] = self.client.instances.get_availabilities() - - region_availabilities = {} - for location in raw_availabilities: - location_code = location["location_code"] - availabilities = location["availabilities"] - for name in availabilities: - key = (name, location_code) - region_availabilities[key] = InstanceAvailability.AVAILABLE - - availability_offers = [] - for offer in offers: - key = (offer.instance.name, offer.region) - availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE) - availability_offers.append( - InstanceOfferWithAvailability(**offer.dict(), availability=availability) - ) - - return availability_offers - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - placement_group: Optional[PlacementGroup], - ) -> JobProvisioningData: - instance_name = generate_unique_instance_name( - instance_config, max_length=MAX_INSTANCE_NAME_LEN - ) - public_keys = instance_config.get_public_keys() - ssh_ids = [] - for ssh_public_key in public_keys: - ssh_ids.append( - # datacrunch allows you to use the same name - _get_or_create_ssh_key( - client=self.client, - name=f"dstack-{instance_config.instance_name}.key", - public_key=ssh_public_key, - ) - ) - - commands = get_shim_commands(authorized_keys=public_keys) - startup_script = " ".join([" && ".join(commands)]) - script_name = f"dstack-{instance_config.instance_name}.sh" - startup_script_ids = _get_or_create_startup_scrpit( - client=self.client, - name=script_name, - 
script=startup_script, - ) - - disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) - image_id = _get_vm_image_id(instance_offer) - - logger.debug( - "Deploying datacrunch instance", - { - "instance_type": instance_offer.instance.name, - "ssh_key_ids": ssh_ids, - "startup_script_id": startup_script_ids, - "hostname": instance_name, - "description": instance_name, - "image": image_id, - "disk_size": disk_size, - "location": instance_offer.region, - }, - ) - instance = _deploy_instance( - client=self.client, - instance_type=instance_offer.instance.name, - ssh_key_ids=ssh_ids, - startup_script_id=startup_script_ids, - hostname=instance_name, - description=instance_name, - image=image_id, - disk_size=disk_size, - is_spot=instance_offer.instance.resources.spot, - location=instance_offer.region, - ) - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=instance.id, - hostname=None, - internal_ip=None, - region=instance.location, - price=instance_offer.price, - username="root", - ssh_port=22, - dockerized=True, - ssh_proxy=None, - backend_data=None, - ) - - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ): - try: - self.client.instances.action(id_list=[instance_id], action="delete") - except APIException as e: - if e.message in [ - "Invalid instance id", - "Can't discontinue a discontinued instance", - ]: - logger.debug("Skipping instance %s termination. 
Instance not found.", instance_id) - return - raise - - def update_provisioning_data( - self, - provisioning_data: JobProvisioningData, - project_ssh_public_key: str, - project_ssh_private_key: str, - ): - instance = _get_instance_by_id(self.client, provisioning_data.instance_id) - if instance is not None and instance.status == "running": - provisioning_data.hostname = instance.ip - - -def _get_vm_image_id(instance_offer: InstanceOfferWithAvailability) -> str: - # https://api.datacrunch.io/v1/images - if len(instance_offer.instance.resources.gpus) > 0 and instance_offer.instance.resources.gpus[ - 0 - ].name in ["V100", "A6000"]: - # Ubuntu 22.04 + CUDA 12.0 + Docker - return "2088da25-bb0d-41cc-a191-dccae45d96fd" - # Ubuntu 24.04 + CUDA 12.8 Open + Docker - return "77777777-4f48-4249-82b3-f199fb9b701b" - - -def _get_or_create_ssh_key(client: DataCrunchClient, name: str, public_key: str) -> str: - fingerprint = get_public_key_fingerprint(public_key) - keys = client.ssh_keys.get() - found_keys = [key for key in keys if fingerprint == get_public_key_fingerprint(key.public_key)] - if found_keys: - key = found_keys[0] - return key.id - key = client.ssh_keys.create(name, public_key) - return key.id - - -def _get_or_create_startup_scrpit(client: DataCrunchClient, name: str, script: str) -> str: - scripts = client.startup_scripts.get() - found_scripts = [startup_script for startup_script in scripts if script == startup_script] - if found_scripts: - startup_script = found_scripts[0] - return startup_script.id - - startup_script = client.startup_scripts.create(name, script) - return startup_script.id - - -def _get_instance_by_id( - client: DataCrunchClient, - instance_id: str, -) -> Optional[Instance]: - try: - return client.instances.get_by_id(instance_id) - except APIException as e: - if e.message == "Invalid instance id": - return None - raise - -def _deploy_instance( - client: DataCrunchClient, - instance_type: str, - image: str, - ssh_key_ids: List[str], - hostname: 
str, - description: str, - startup_script_id: str, - disk_size: int, - is_spot: bool, - location: str, -) -> Instance: - try: - instance = client.instances.create( - instance_type=instance_type, - image=image, - ssh_key_ids=ssh_key_ids, - hostname=hostname, - description=description, - startup_script_id=startup_script_id, - pricing="FIXED_PRICE", - is_spot=is_spot, - location=location, - os_volume={"name": "OS volume", "size": disk_size}, - ) - except APIException as e: - # FIXME: Catch only no capacity errors - raise NoCapacityError(f"DataCrunch API error: {e.message}") - return instance +class DataCrunchCompute(VerdaCompute): + def __init__(self, config: VerdaConfig, backend_type: BackendType): + super().__init__(config, backend_type) diff --git a/src/dstack/_internal/core/backends/datacrunch/configurator.py b/src/dstack/_internal/core/backends/datacrunch/configurator.py index f31d5a69a1..944f8657d3 100644 --- a/src/dstack/_internal/core/backends/datacrunch/configurator.py +++ b/src/dstack/_internal/core/backends/datacrunch/configurator.py @@ -1,77 +1,17 @@ -import json - -from datacrunch import DataCrunchClient -from datacrunch.exceptions import APIException - -from dstack._internal.core.backends.base.configurator import ( - BackendRecord, - Configurator, - raise_invalid_credentials_error, -) +from dstack._internal.core.backends.base.configurator import BackendRecord from dstack._internal.core.backends.datacrunch.backend import DataCrunchBackend -from dstack._internal.core.backends.datacrunch.models import ( - DataCrunchBackendConfig, - DataCrunchBackendConfigWithCreds, - DataCrunchConfig, - DataCrunchCreds, - DataCrunchStoredConfig, +from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, ) from dstack._internal.core.models.backends.base import ( BackendType, ) -class DataCrunchConfigurator( - Configurator[ - DataCrunchBackendConfig, - DataCrunchBackendConfigWithCreds, - ] -): +class DataCrunchConfigurator(VerdaConfigurator): TYPE = 
BackendType.DATACRUNCH BACKEND_CLASS = DataCrunchBackend - def validate_config( - self, config: DataCrunchBackendConfigWithCreds, default_creds_enabled: bool - ): - self._validate_creds(config.creds) - - def create_backend( - self, project_name: str, config: DataCrunchBackendConfigWithCreds - ) -> BackendRecord: - return BackendRecord( - config=DataCrunchStoredConfig( - **DataCrunchBackendConfig.__response__.parse_obj(config).dict() - ).json(), - auth=DataCrunchCreds.parse_obj(config.creds).json(), - ) - - def get_backend_config_with_creds( - self, record: BackendRecord - ) -> DataCrunchBackendConfigWithCreds: - config = self._get_config(record) - return DataCrunchBackendConfigWithCreds.__response__.parse_obj(config) - - def get_backend_config_without_creds(self, record: BackendRecord) -> DataCrunchBackendConfig: - config = self._get_config(record) - return DataCrunchBackendConfig.__response__.parse_obj(config) - def get_backend(self, record: BackendRecord) -> DataCrunchBackend: config = self._get_config(record) return DataCrunchBackend(config=config) - - def _get_config(self, record: BackendRecord) -> DataCrunchConfig: - return DataCrunchConfig.__response__( - **json.loads(record.config), - creds=DataCrunchCreds.parse_raw(record.auth), - ) - - def _validate_creds(self, creds: DataCrunchCreds): - try: - DataCrunchClient( - client_id=creds.client_id, - client_secret=creds.client_secret, - ) - except APIException as e: - if e.code == "unauthorized_request": - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) - raise diff --git a/src/dstack/_internal/core/backends/datacrunch/models.py b/src/dstack/_internal/core/backends/datacrunch/models.py deleted file mode 100644 index 5a98e4cc82..0000000000 --- a/src/dstack/_internal/core/backends/datacrunch/models.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Annotated, List, Literal, Optional, Union - -from pydantic import Field - -from dstack._internal.core.models.common import CoreModel - - -class 
DataCrunchAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - client_id: Annotated[str, Field(description="The client ID")] - client_secret: Annotated[str, Field(description="The client secret")] - - -AnyDataCrunchCreds = DataCrunchAPIKeyCreds -DataCrunchCreds = AnyDataCrunchCreds - - -class DataCrunchBackendConfig(CoreModel): - type: Annotated[Literal["datacrunch"], Field(description="The type of backend")] = "datacrunch" - regions: Annotated[ - Optional[List[str]], - Field(description="The list of DataCrunch regions. Omit to use all regions"), - ] = None - - -class DataCrunchBackendConfigWithCreds(DataCrunchBackendConfig): - creds: Annotated[AnyDataCrunchCreds, Field(description="The credentials")] - - -AnyDataCrunchBackendConfig = Union[DataCrunchBackendConfig, DataCrunchBackendConfigWithCreds] - - -class DataCrunchStoredConfig(DataCrunchBackendConfig): - pass - - -class DataCrunchConfig(DataCrunchStoredConfig): - creds: AnyDataCrunchCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index 1715080f83..f1c59e2f44 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -16,10 +16,6 @@ CudoBackendConfig, CudoBackendConfigWithCreds, ) -from dstack._internal.core.backends.datacrunch.models import ( - DataCrunchBackendConfig, - DataCrunchBackendConfigWithCreds, -) from dstack._internal.core.backends.digitalocean_base.models import ( BaseDigitalOceanBackendConfig, BaseDigitalOceanBackendConfigWithCreds, @@ -68,6 +64,10 @@ VastAIBackendConfig, VastAIBackendConfigWithCreds, ) +from dstack._internal.core.backends.verda.models import ( + VerdaBackendConfig, + VerdaBackendConfigWithCreds, +) from dstack._internal.core.backends.vultr.models import ( VultrBackendConfig, VultrBackendConfigWithCreds, @@ -80,7 +80,6 @@ AzureBackendConfig, CloudRiftBackendConfig, CudoBackendConfig, - 
DataCrunchBackendConfig, BaseDigitalOceanBackendConfig, GCPBackendConfig, HotAisleBackendConfig, @@ -91,6 +90,7 @@ RunpodBackendConfig, TensorDockBackendConfig, VastAIBackendConfig, + VerdaBackendConfig, VultrBackendConfig, DstackBackendConfig, DstackBaseBackendConfig, @@ -104,7 +104,7 @@ AzureBackendConfigWithCreds, CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, - DataCrunchBackendConfigWithCreds, + VerdaBackendConfigWithCreds, BaseDigitalOceanBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, @@ -127,7 +127,7 @@ AzureBackendConfigWithCreds, CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, - DataCrunchBackendConfigWithCreds, + VerdaBackendConfigWithCreds, BaseDigitalOceanBackendConfigWithCreds, GCPBackendFileConfigWithCreds, HotAisleBackendFileConfigWithCreds, diff --git a/src/tests/_internal/core/backends/datacrunch/__init__.py b/src/dstack/_internal/core/backends/verda/__init__.py similarity index 100% rename from src/tests/_internal/core/backends/datacrunch/__init__.py rename to src/dstack/_internal/core/backends/verda/__init__.py diff --git a/src/dstack/_internal/core/backends/verda/backend.py b/src/dstack/_internal/core/backends/verda/backend.py new file mode 100644 index 0000000000..eed2beb34d --- /dev/null +++ b/src/dstack/_internal/core/backends/verda/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.verda.compute import VerdaCompute +from dstack._internal.core.backends.verda.models import VerdaConfig +from dstack._internal.core.models.backends.base import BackendType + + +class VerdaBackend(Backend): + TYPE = BackendType.VERDA + COMPUTE_CLASS = VerdaCompute + + def __init__(self, config: VerdaConfig): + self.config = config + self._compute = VerdaCompute(self.config, self.TYPE) + + def compute(self) -> VerdaCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/verda/compute.py 
b/src/dstack/_internal/core/backends/verda/compute.py new file mode 100644 index 0000000000..d6dbdd6ae0 --- /dev/null +++ b/src/dstack/_internal/core/backends/verda/compute.py @@ -0,0 +1,266 @@ +from collections.abc import Iterable +from typing import Dict, List, Optional + +from verda import VerdaClient +from verda.exceptions import APIException +from verda.instances import Instance + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.verda.models import VerdaConfig +from dstack._internal.core.errors import NoCapacityError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger +from dstack._internal.utils.ssh import get_public_key_fingerprint + +logger = get_logger("verda.compute") + +MAX_INSTANCE_NAME_LEN = 60 + +IMAGE_SIZE = Memory.parse("50GB") + +CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None) + + +class VerdaCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + Compute, +): + def __init__(self, config: VerdaConfig, backend_type: BackendType): + super().__init__() + self.config = config + self.client = VerdaClient( + client_id=self.config.creds.client_id, + 
client_secret=self.config.creds.client_secret, + ) + self.backend_type = backend_type + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=self.backend_type, + locations=self.config.regions, + ) + offers_with_availability = self._get_offers_with_availability(offers) + return offers_with_availability + + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)] + + def _get_offers_with_availability( + self, offers: List[InstanceOffer] + ) -> List[InstanceOfferWithAvailability]: + raw_availabilities: List[Dict] = self.client.instances.get_availabilities() + + region_availabilities = {} + for location in raw_availabilities: + location_code = location["location_code"] + availabilities = location["availabilities"] + for name in availabilities: + key = (name, location_code) + region_availabilities[key] = InstanceAvailability.AVAILABLE + + availability_offers = [] + for offer in offers: + key = (offer.instance.name, offer.region) + availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE) + availability_offers.append( + InstanceOfferWithAvailability(**offer.dict(), availability=availability) + ) + + return availability_offers + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + public_keys = instance_config.get_public_keys() + ssh_ids = [] + for ssh_public_key in public_keys: + ssh_ids.append( + # verda allows you to use the same name + _get_or_create_ssh_key( + client=self.client, + name=f"dstack-{instance_config.instance_name}.key", + public_key=ssh_public_key, + ) + ) + + commands = 
get_shim_commands(authorized_keys=public_keys) + startup_script = " ".join([" && ".join(commands)]) + script_name = f"dstack-{instance_config.instance_name}.sh" + startup_script_ids = _get_or_create_startup_scrpit( + client=self.client, + name=script_name, + script=startup_script, + ) + + disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024) + image_id = _get_vm_image_id(instance_offer) + + logger.debug( + "Deploying Verda instance", + { + "instance_type": instance_offer.instance.name, + "ssh_key_ids": ssh_ids, + "startup_script_id": startup_script_ids, + "hostname": instance_name, + "description": instance_name, + "image": image_id, + "disk_size": disk_size, + "location": instance_offer.region, + }, + ) + instance = _deploy_instance( + client=self.client, + instance_type=instance_offer.instance.name, + ssh_key_ids=ssh_ids, + startup_script_id=startup_script_ids, + hostname=instance_name, + description=instance_name, + image=image_id, + disk_size=disk_size, + is_spot=instance_offer.instance.resources.spot, + location=instance_offer.region, + ) + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=instance.id, + hostname=None, + internal_ip=None, + region=instance.location, + price=instance_offer.price, + username="root", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=None, + ) + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + try: + self.client.instances.action(id_list=[instance_id], action="delete") + except APIException as e: + if e.message in [ + "Invalid instance id", + "Can't discontinue a discontinued instance", + ]: + logger.debug("Skipping instance %s termination. 
Instance not found.", instance_id) + return + raise + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + instance = _get_instance_by_id(self.client, provisioning_data.instance_id) + if instance is not None and instance.status == "running": + provisioning_data.hostname = instance.ip + + +def _get_vm_image_id(instance_offer: InstanceOfferWithAvailability) -> str: + # https://api.verda.com/v1/images + if len(instance_offer.instance.resources.gpus) > 0 and instance_offer.instance.resources.gpus[ + 0 + ].name in ["V100", "A6000"]: + # Ubuntu 22.04 + CUDA 12.0 + Docker + return "2088da25-bb0d-41cc-a191-dccae45d96fd" + # Ubuntu 24.04 + CUDA 12.8 Open + Docker + return "77777777-4f48-4249-82b3-f199fb9b701b" + + +def _get_or_create_ssh_key(client: VerdaClient, name: str, public_key: str) -> str: + fingerprint = get_public_key_fingerprint(public_key) + keys = client.ssh_keys.get() + found_keys = [key for key in keys if fingerprint == get_public_key_fingerprint(key.public_key)] + if found_keys: + key = found_keys[0] + return key.id + key = client.ssh_keys.create(name, public_key) + return key.id + + +def _get_or_create_startup_scrpit(client: VerdaClient, name: str, script: str) -> str: + scripts = client.startup_scripts.get() + found_scripts = [s for s in scripts if script == s.script] + if found_scripts: + startup_script = found_scripts[0] + return startup_script.id + + startup_script = client.startup_scripts.create(name, script) + return startup_script.id + + +def _get_instance_by_id( + client: VerdaClient, + instance_id: str, +) -> Optional[Instance]: + try: + return client.instances.get_by_id(instance_id) + except APIException as e: + if e.message == "Invalid instance id": + return None + raise + + +def _deploy_instance( + client: VerdaClient, + instance_type: str, + image: str, + ssh_key_ids: List[str], + hostname: str, + description: str, 
+ startup_script_id: str, + disk_size: int, + is_spot: bool, + location: str, +) -> Instance: + try: + instance = client.instances.create( + instance_type=instance_type, + image=image, + ssh_key_ids=ssh_key_ids, + hostname=hostname, + description=description, + startup_script_id=startup_script_id, + pricing="FIXED_PRICE", + is_spot=is_spot, + location=location, + os_volume={"name": "OS volume", "size": disk_size}, + ) + except APIException as e: + # FIXME: Catch only no capacity errors + raise NoCapacityError(f"Verda API error: {e.message}") + + return instance diff --git a/src/dstack/_internal/core/backends/verda/configurator.py b/src/dstack/_internal/core/backends/verda/configurator.py new file mode 100644 index 0000000000..64b0dec034 --- /dev/null +++ b/src/dstack/_internal/core/backends/verda/configurator.py @@ -0,0 +1,73 @@ +import json + +from verda import VerdaClient +from verda.exceptions import APIException + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.verda.backend import VerdaBackend +from dstack._internal.core.backends.verda.models import ( + VerdaBackendConfig, + VerdaBackendConfigWithCreds, + VerdaConfig, + VerdaCreds, + VerdaStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class VerdaConfigurator( + Configurator[ + VerdaBackendConfig, + VerdaBackendConfigWithCreds, + ] +): + TYPE = BackendType.VERDA + BACKEND_CLASS = VerdaBackend + + def validate_config(self, config: VerdaBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_creds(config.creds) + + def create_backend( + self, project_name: str, config: VerdaBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=VerdaStoredConfig( + **VerdaBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=VerdaCreds.parse_obj(config.creds).json(), + ) + + def 
get_backend_config_with_creds(self, record: BackendRecord) -> VerdaBackendConfigWithCreds: + config = self._get_config(record) + return VerdaBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> VerdaBackendConfig: + config = self._get_config(record) + return VerdaBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> VerdaBackend: + config = self._get_config(record) + return VerdaBackend(config=config) + + def _get_config(self, record: BackendRecord) -> VerdaConfig: + return VerdaConfig.__response__( + **json.loads(record.config), + creds=VerdaCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: VerdaCreds): + try: + VerdaClient( + client_id=creds.client_id, + client_secret=creds.client_secret, + ) + except APIException as e: + if e.code == "unauthorized_request": + raise_invalid_credentials_error( + fields=[["creds", "client_id"], ["creds", "client_secret"]] + ) + raise diff --git a/src/dstack/_internal/core/backends/verda/models.py b/src/dstack/_internal/core/backends/verda/models.py new file mode 100644 index 0000000000..1e0b896b5d --- /dev/null +++ b/src/dstack/_internal/core/backends/verda/models.py @@ -0,0 +1,38 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class VerdaAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + client_id: Annotated[str, Field(description="The client ID")] + client_secret: Annotated[str, Field(description="The client secret")] + + +AnyVerdaCreds = VerdaAPIKeyCreds +VerdaCreds = AnyVerdaCreds + + +class VerdaBackendConfig(CoreModel): + type: Annotated[Literal["verda", "datacrunch"], Field(description="The type of backend")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Verda regions. 
Omit to use all regions"), + ] = None + + +class VerdaBackendConfigWithCreds(VerdaBackendConfig): + creds: Annotated[AnyVerdaCreds, Field(description="The credentials")] + + +AnyVerdaBackendConfig = Union[VerdaBackendConfig, VerdaBackendConfigWithCreds] + + +class VerdaStoredConfig(VerdaBackendConfig): + pass + + +class VerdaConfig(VerdaStoredConfig): + creds: AnyVerdaCreds diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 3d33e75b62..82efe09efa 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -9,7 +9,7 @@ class BackendType(str, enum.Enum): AZURE (BackendType): Microsoft Azure CLOUDRIFT (BackendType): CloudRift CUDO (BackendType): Cudo - DATACRUNCH (BackendType): DataCrunch + DATACRUNCH (BackendType): DataCrunch (for backward compatibility) DIGITALOCEAN (BackendType): DigitalOcean DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform @@ -21,6 +21,7 @@ class BackendType(str, enum.Enum): RUNPOD (BackendType): Runpod Cloud TENSORDOCK (BackendType): TensorDock Marketplace VASTAI (BackendType): Vast.ai Marketplace + VERDA (BackendType): Verda Cloud VULTR (BackendType): Vultr """ @@ -29,7 +30,7 @@ class BackendType(str, enum.Enum): AZURE = "azure" CLOUDRIFT = "cloudrift" CUDO = "cudo" - DATACRUNCH = "datacrunch" + DATACRUNCH = "datacrunch" # BackendType for backward compatibility DIGITALOCEAN = "digitalocean" DSTACK = "dstack" GCP = "gcp" @@ -43,4 +44,5 @@ class BackendType(str, enum.Enum): RUNPOD = "runpod" TENSORDOCK = "tensordock" VASTAI = "vastai" + VERDA = "verda" VULTR = "vultr" diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index 8f9459a766..0c4a1c5e7f 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -689,7 +689,7 @@ async def create_instance( instance_id: Optional[UUID] = None, 
job: Optional[JobModel] = None, instance_num: int = 0, - backend: BackendType = BackendType.DATACRUNCH, + backend: BackendType = BackendType.VERDA, termination_policy: Optional[TerminationPolicy] = None, termination_idle_time: int = DEFAULT_FLEET_TERMINATION_IDLE_TIME, region: str = "eu-west", diff --git a/src/tests/_internal/core/backends/cloudrift/test_configurator.py b/src/tests/_internal/core/backends/cloudrift/test_configurator.py index f12499d890..89145279fd 100644 --- a/src/tests/_internal/core/backends/cloudrift/test_configurator.py +++ b/src/tests/_internal/core/backends/cloudrift/test_configurator.py @@ -12,7 +12,7 @@ from dstack._internal.core.errors import BackendInvalidCredentialsError -class TestDataCrunchConfigurator: +class TestCloudRiftConfigurator: def test_validate_config_valid(self): config = CloudRiftBackendConfigWithCreds(creds=CloudRiftCreds(api_key="valid")) with patch( diff --git a/src/tests/_internal/core/backends/datacrunch/test_configurator.py b/src/tests/_internal/core/backends/datacrunch/test_configurator.py deleted file mode 100644 index 721cebe1a2..0000000000 --- a/src/tests/_internal/core/backends/datacrunch/test_configurator.py +++ /dev/null @@ -1,21 +0,0 @@ -from unittest.mock import patch - -from dstack._internal.core.backends.datacrunch.configurator import ( - DataCrunchConfigurator, -) -from dstack._internal.core.backends.datacrunch.models import ( - DataCrunchBackendConfigWithCreds, - DataCrunchCreds, -) - - -class TestDataCrunchConfigurator: - def test_validate_config_valid(self): - config = DataCrunchBackendConfigWithCreds( - creds=DataCrunchCreds(client_id="valid", client_secret="valid"), - regions=["FIN-01"], - ) - with patch( - "dstack._internal.core.backends.datacrunch.configurator.DataCrunchConfigurator._validate_creds" - ): - DataCrunchConfigurator().validate_config(config, default_creds_enabled=True) diff --git a/src/tests/_internal/core/backends/verda/__init__.py b/src/tests/_internal/core/backends/verda/__init__.py 
new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/core/backends/verda/test_configurator.py b/src/tests/_internal/core/backends/verda/test_configurator.py new file mode 100644 index 0000000000..a17fd1182c --- /dev/null +++ b/src/tests/_internal/core/backends/verda/test_configurator.py @@ -0,0 +1,28 @@ +import sys +from unittest.mock import patch + +import pytest + +if sys.version_info < (3, 10): + pytest.skip("Verda requires Python 3.10", allow_module_level=True) + +from dstack._internal.core.backends.verda.configurator import ( + VerdaConfigurator, +) +from dstack._internal.core.backends.verda.models import ( + VerdaBackendConfigWithCreds, + VerdaCreds, +) + + +class TestVerdaConfigurator: + def test_validate_config_valid(self): + config = VerdaBackendConfigWithCreds( + type="verda", + creds=VerdaCreds(client_id="valid", client_secret="valid"), + regions=["FIN-01"], + ) + with patch( + "dstack._internal.core.backends.verda.configurator.VerdaConfigurator._validate_creds" + ): + VerdaConfigurator().validate_config(config, default_creds_enabled=True) diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index 8661b81ee0..e7c44ab434 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -553,7 +553,7 @@ class TestTerminate: @contextmanager def mock_terminate_in_backend(error: Optional[Exception] = None): backend = Mock() - backend.TYPE = BackendType.DATACRUNCH + backend.TYPE = BackendType.VERDA terminate_instance = backend.compute.return_value.terminate_instance if error is not None: terminate_instance.side_effect = error diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index febe7d17e4..433c12de30 100644 --- a/src/tests/_internal/server/routers/test_backends.py 
+++ b/src/tests/_internal/server/routers/test_backends.py @@ -88,7 +88,7 @@ async def test_returns_backend_types(self, client: AsyncClient): "azure", "cloudrift", "cudo", - "datacrunch", + *(["datacrunch"] if sys.version_info >= (3, 10) else []), "digitalocean", "gcp", "hotaisle", @@ -98,6 +98,7 @@ async def test_returns_backend_types(self, client: AsyncClient): "oci", "runpod", "vastai", + *(["verda"] if sys.version_info >= (3, 10) else []), "vultr", ]