From e978169e5e282ccb37c36e5b873bf51611adc5d8 Mon Sep 17 00:00:00 2001 From: james Date: Tue, 18 Nov 2025 11:22:16 +0800 Subject: [PATCH] feat: support hami ascend device Signed-off-by: james --- docs/user-guide/how_to_use_vnpu.md | 293 ++++++++ .../api/devices/ascend/hami/device_info.go | 650 ++++++++++++++++++ .../devices/ascend/hami/device_info_test.go | 334 +++++++++ .../ascend310p/vnpu/device_info.go | 2 +- .../ascend310p/vnpu/device_manage.go | 0 .../{ => mindcluster}/ascend310p/vnpu/type.go | 6 +- .../ascend310p/vnpu/utils.go | 0 pkg/scheduler/api/devices/config/config.go | 51 ++ pkg/scheduler/api/devices/config/vnpu.go | 36 + pkg/scheduler/api/devices/device_info.go | 192 ++++++ pkg/scheduler/api/devices/nvidia/vgpu/type.go | 9 - .../api/devices/nvidia/vgpu/utils.go | 81 +-- pkg/scheduler/api/devices/util.go | 157 +++++ pkg/scheduler/api/node_info.go | 23 +- pkg/scheduler/api/node_info_test.go | 2 +- pkg/scheduler/api/shared_device_pool.go | 13 +- .../devices/ascend/310p/vnpu/init_policy.go | 2 +- .../devices/ascend/310p/vnpu/node.go | 2 +- .../devices/ascend/310p/vnpu/score_policy.go | 2 +- .../plugins/deviceshare/deviceshare.go | 55 +- 20 files changed, 1802 insertions(+), 108 deletions(-) create mode 100644 docs/user-guide/how_to_use_vnpu.md create mode 100644 pkg/scheduler/api/devices/ascend/hami/device_info.go create mode 100644 pkg/scheduler/api/devices/ascend/hami/device_info_test.go rename pkg/scheduler/api/devices/ascend/{ => mindcluster}/ascend310p/vnpu/device_info.go (99%) rename pkg/scheduler/api/devices/ascend/{ => mindcluster}/ascend310p/vnpu/device_manage.go (100%) rename pkg/scheduler/api/devices/ascend/{ => mindcluster}/ascend310p/vnpu/type.go (99%) rename pkg/scheduler/api/devices/ascend/{ => mindcluster}/ascend310p/vnpu/utils.go (100%) create mode 100644 pkg/scheduler/api/devices/config/vnpu.go create mode 100644 pkg/scheduler/api/devices/device_info.go diff --git a/docs/user-guide/how_to_use_vnpu.md b/docs/user-guide/how_to_use_vnpu.md new file mode 100644 index 0000000000..06a6cf0494 --- /dev/null +++ b/docs/user-guide/how_to_use_vnpu.md @@ -0,0 +1,293 @@ +# Ascend vNPU User Guide + +## Introduction + +Volcano supports **two vNPU modes** for sharing Ascend devices: + +--- + +### 1. MindCluster mode + +**Description**: + +The initial version of [MindCluster](https://gitcode.com/Ascend/mind-cluster)—the official Ascend cluster scheduling add-on—required custom modifications and recompilation of Volcano. Furthermore, it was limited to Volcano release1.7 and release1.9, which complicated its use and restricted access to newer Volcano features. + +To address this, we have integrated its core scheduling logic for Ascend vNPU into Volcano's native device-share plugin, which is designed specifically for scheduling and sharing heterogeneous resources like GPUs and NPUs. This integration provides seamless access to vNPU capabilities through the procedure below, while maintaining full compatibility with the latest Volcano features. + +**Use case**: + +vNPU cluster for Ascend 310 series +with support for more chip types to come + +--- + +### 2. HAMi mode + +**Description**: + +This mode is developed by a third-party community 'HAMi', which is the developer of [volcano-vgpu](./how_to_use_volcano_vgpu.md) feature. It supports vNPU feature for both Ascend 310 and Ascend 910. It also supports managing heterogeneous Ascend cluster(Cluster with multiple Ascend types, i.e. 
910A, 910B2, 910B3, 310P)

**Use case**:

NPU and vNPU cluster for Ascend 910 series
NPU and vNPU cluster for Ascend 310 series
Heterogeneous Ascend cluster

---

## Installation

To enable vNPU scheduling, the following components must be set up based on the selected mode:

**Prerequisites**:

Kubernetes >= 1.16
Volcano >= 1.14
[ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime) (for HAMi mode)

### Install Volcano

Follow the instructions in the [Volcano Installer Guide](https://github.com/volcano-sh/volcano?tab=readme-ov-file#quick-start-guide).

### Install ascend-device-plugin and third-party components

In this step, install the ascend-device-plugin that matches the vNPU mode you selected. MindCluster mode additionally requires several components from Ascend to be installed.

---

#### MindCluster mode

##### Install Third-Party Components

Follow the official [Ascend documentation](https://www.hiascend.com/document/detail/zh/mindcluster/72rc1/clustersched/dlug/mxdlug_start_006.html#ZH-CN_TOPIC_0000002470358262__section1837511531098) to install the following components:
- NodeD
- Ascend Device Plugin
- Ascend Docker Runtime
- ClusterD
- Ascend Operator

> **Note:** Skip the installation of `ascend-volcano` mentioned in the document above, because the native Volcano from the Volcano community has already been installed in the **Install Volcano** step.

**Configuration Adjustment for Ascend Device Plugin:**

When installing `ascend-device-plugin`, you must set the `presetVirtualDevice` parameter to `"false"` in the `device-plugin-310P-volcano-v{version}.yaml` file to enable dynamic virtualization of the 310P:

```yaml
...
args: [
  "device-plugin",
  "-useAscendDocker=true",
  "-volcanoType=true",
  "-presetVirtualDevice=false",
  "-logFile=/var/log/mindx-dl/devicePlugin/devicePlugin.log",
  "-logLevel=0"
]
...
```
For detailed information, please consult the official [Ascend MindCluster documentation](https://www.hiascend.com/document/detail/zh/mindcluster/72rc1/clustersched/dlug/cpaug_0020.html).

##### Scheduler Config Update
```yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: volcano-scheduler-configmap
  namespace: volcano-system
data:
  volcano-scheduler.conf: |
    actions: "enqueue, allocate, backfill"
    tiers:
    - plugins:
      - name: predicates
      - name: deviceshare
        arguments:
          deviceshare.AscendMindClusterVNPUEnable: true # enable ascend vnpu
    configurations:
    ...
    - name: init-params
      arguments: {"grace-over-time":"900","presetVirtualDevice":"false"} # to enable dynamic virtualization, presetVirtualDevice must be set to "false"
```

---

#### HAMi mode

##### Label the Node with `ascend=on`

```
kubectl label node {ascend-node} ascend=on
```

##### Deploy `hami-scheduler-device` ConfigMap

```
kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/refs/heads/main/ascend-device-configmap.yaml
```

##### Deploy ascend-device-plugin

```
kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/refs/heads/main/ascend-device-plugin.yaml
```

For more information, refer to the [ascend-device-plugin documentation](https://github.com/Project-HAMi/ascend-device-plugin).
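
##### Verify the Installation (optional)

The commands below are a quick sanity check, not an official verification procedure: the pod names and namespace depend on the manifests you applied, and the resource names shown come from the table at the end of this guide, so adjust them to your environment.

```
# Confirm the ascend-device-plugin pods are running (namespace may vary by manifest)
kubectl get pods -A | grep -i ascend

# Confirm the node now advertises Ascend resources such as
# huawei.com/Ascend910B3 and huawei.com/Ascend910B3-memory
kubectl describe node {ascend-node} | grep -i "huawei.com/Ascend"
```

If the resources do not appear, check the device plugin logs and make sure the node carries the `ascend=on` label.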
+ +##### Scheduler Config Update +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: predicates + - name: deviceshare + arguments: + deviceshare.AscendHAMiVNPUEnable: true # enable ascend vnpu + deviceshare.SchedulePolicy: binpack # scheduling policy. binpack / spread + deviceshare.KnownGeometriesCMNamespace: kube-system + deviceshare.KnownGeometriesCMName: hami-scheduler-device +``` + + **Note:** You may notice that, 'volcano-vgpu' has its own GeometriesCMName and GeometriesCMNamespace, which means if you want to use both vNPU and vGPU in a same volcano cluster, you need to merge the configMap from both sides and set it here. + +## Usage + +Usage is different depending on the mode you selected + +--- + +### MindCluster mode + +```yaml +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + name: mindx-dls + namespace: vnpu + labels: + ring-controller.atlas: ascend-310P +spec: + minAvailable: 1 + schedulerName: volcano + policies: + - event: PodEvicted + action: RestartJob + plugins: + ssh: [] + env: [] + svc: [] + maxRetry: 3 + queue: default + tasks: + - name: "default-test" + replicas: 1 + template: + metadata: + labels: + app: infers + ring-controller.atlas: ascend-310P + vnpu-dvpp: "null" + vnpu-level: low + spec: + schedulerName: volcano + containers: + - name: resnet50infer + image: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.1.RC1-300I-Duo-py311-openeuler24.03-lts + imagePullPolicy: IfNotPresent + securityContext: + privileged: false + command: ["/bin/bash", "-c", "tail -f /dev/null"] + resources: + requests: + huawei.com/npu-core: 8 + limits: + huawei.com/npu-core: 8 + nodeSelector: + host-arch: huawei-arm + +``` + +The supported Ascend chips and their `ResourceNames` are shown in the following table: + +| ChipName | JobLabel and TaskLabel | ResourceName | +|-------|------------------------------------|-------| +| 310P3 | ring-controller.atlas: ascend-310P | huawei.com/npu-core | + +**Description of Labels in the Virtualization Task YAML** + +| **Key** | **Value** | **Description** | +| ------------------------- | --------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **vnpu-level** | **low** | Low configuration (default). Selects the lowest-configuration "virtualized instance template." | +| | **high** | Performance-first. When cluster resources are sufficient, the scheduler will choose the highest-configured virtualized instance template possible. When most physical NPUs in the cluster are already in use and only a few AI Cores remain on each device, the scheduler will allocate templates that match the remaining AI Core count rather than forcing high-profile templates. For details, refer to the table below. | +| **vnpu-dvpp** | **yes** | The Pod uses DVPP. | +| | **no** | The Pod does not use DVPP. | +| | **null** | Default value. DVPP usage is not considered. | +| **ring-controller.atlas** | **ascend-310P** | Indicates that the task uses products from the Atlas inference series. 
| + +**Effect of DVPP and Level Configurations** + +| **Product Model** | **Requested AI Core Count** | **vnpu-dvpp** | **vnpu-level** | **Downgrade** | **Selected Template** | +| --------------------------------------- | --------------------------- |---------------| -------------------- | ------------- | --------------------- | +| **Atlas Inference Series (8 AI Cores)** | **1** | `null` | Any value | – | `vir01` | +| | **2** | `null` | `low` / other values | – | `vir02_1c` | +| | **2** | `null` | `high` | No | `vir02` | +| | **2** | `null` | `high` | Yes | `vir02_1c` | +| | **4** | `yes` | `low` / other values | – | `vir04_4c_dvpp` | +| | **4** | `no` | `low` / other values | – | `vir04_3c_ndvpp` | +| | **4** | `null` | `low` / other values | – | `vir04_3c` | +| | **4** | `yes` | `high` | – | `vir04_4c_dvpp` | +| | **4** | `no` | `high` | – | `vir04_3c_ndvpp` | +| | **4** | `null` | `high` | No | `vir04` | +| | **4** | `null` | `high` | Yes | `vir04_3c` | +| | **8 or multiples of 8** | Any value | Any value | – | – | + + +**Notice** + +For **chip virtualization (non-full card usage)**, the value of `vnpu-dvpp` must strictly match the corresponding value listed in the above table. +Any other values will cause the task to fail to be dispatched. + + +For detailed information, please consult the official [Ascend MindCluster documentation.](https://www.hiascend.com/document/detail/zh/mindcluster/72rc1/clustersched/dlug/cpaug_0020.html) + +--- + +### HAMi mode + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: ascend-pod +spec: + schedulerName: volcano + containers: + - name: ubuntu-container + image: swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-pytorch:24.0.RC1-A2-1.11.0-ubuntu20.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + huawei.com/Ascend310P: "1" + huawei.com/Ascend310P-memory: "4096" + +``` + +The supported Ascend chips and their `ResourceNames` are shown in the following table: + +| ChipName | ResourceName | ResourceMemoryName | +|-------|-------|-------| +| 910A | huawei.com/Ascend910A | huawei.com/Ascend910A-memory | +| 910B2 | huawei.com/Ascend910B2 | huawei.com/Ascend910B2-memory | +| 910B3 | huawei.com/Ascend910B3 | huawei.com/Ascend910B3-memory | +| 910B4 | huawei.com/Ascend910B4 | huawei.com/Ascend910B4-memory | +| 910B4-1 | huawei.com/Ascend910B4-1 | huawei.com/Ascend910B4-1-memory | +| 310P3 | huawei.com/Ascend310P | huawei.com/Ascend310P-memory | \ No newline at end of file diff --git a/pkg/scheduler/api/devices/ascend/hami/device_info.go b/pkg/scheduler/api/devices/ascend/hami/device_info.go new file mode 100644 index 0000000000..75e8a3c44d --- /dev/null +++ b/pkg/scheduler/api/devices/ascend/hami/device_info.go @@ -0,0 +1,650 @@ +/* +Copyright 2025 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package hami + +import ( + "encoding/json" + "flag" + "fmt" + "sort" + "strconv" + "strings" + "time" + + "github.com/pkg/errors" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "volcano.sh/volcano/pkg/scheduler/api/devices" + "volcano.sh/volcano/pkg/scheduler/api/devices/config" + "volcano.sh/volcano/pkg/scheduler/plugins/util/nodelock" +) + +const ( + HAMiAnnotationsPrefix = "hami.io" + AssignedNodeAnnotations = "hami.io/vgpu-node" + AssignedTimeAnnotations = "hami.io/vgpu-time" + BindTimeAnnotations = "hami.io/bind-time" + DeviceBindPhase = "hami.io/bind-phase" + DeviceBindAllocating = "allocating" + DeviceBindFailed = "failed" + DeviceBindSuccess = "success" + + Ascend910Prefix = "Ascend910" + Ascend910NetworkWeight = 10 + // binpack means the lower device memory remained after this allocation, the better + binpackPolicy = "binpack" + // spread means better put this task into an idle GPU card than a shared GPU card + spreadPolicy = "spread" + binpackMultiplier = 100 + spreadMultiplier = 100 +) + +type AscendDevice struct { + config config.VNPUConfig + nodeRegisterAnno string + useUUIDAnno string + noUseUUIDAnno string + handshakeAnno string + DeviceInfo *devices.DeviceInfo + DeviceUsage *devices.DeviceUsage + Score float64 +} + +type AscendDevices struct { + NodeName string + Type string + Devices map[string]*AscendDevice + Policy string +} + +type RuntimeInfo struct { + UUID string `json:"UUID,omitempty"` + Temp string `json:"temp,omitempty"` +} + +var ( + AscendHAMiVNPUEnable bool + NodeLockEnable bool +) + +func NewAscendDevices(name string, node *v1.Node) map[string]*AscendDevices { + ascendDevices := make(map[string]*AscendDevices) + if node == nil { + klog.Warningf("Node is nil for node %s, returning empty AscendDevices", name) + return ascendDevices + } + curConfig := config.GetConfig() + if curConfig == nil { + klog.V(5).InfoS("cur config is null. call GetDefaultDevicesConfig") + curConfig = config.GetDefaultDevicesConfig() + } + devs := InitDevices(curConfig.VNPUs) + for _, dev := range devs { + nodeDevices, err := dev.GetNodeDevices(*node) + if err != nil { + klog.Warningf("Failed to get node devices. nodeName %s, deviceType %s, error %s", node.Name, dev.CommonWord(), err) + continue + } + asDevices := &AscendDevices{ + NodeName: name, + Type: dev.CommonWord(), + Devices: make(map[string]*AscendDevice), + } + for _, nd := range nodeDevices { + cur_dev := &AscendDevice{ + config: dev.config, + nodeRegisterAnno: dev.nodeRegisterAnno, + useUUIDAnno: dev.useUUIDAnno, + noUseUUIDAnno: dev.noUseUUIDAnno, + handshakeAnno: dev.handshakeAnno, + DeviceInfo: nd, + DeviceUsage: &devices.DeviceUsage{ + Used: 0, + Usedmem: 0, + Usedcores: 0, + }, + } + asDevices.Devices[nd.ID] = cur_dev + klog.V(5).Infof("add device. ID %s dev_info %+v", cur_dev.DeviceInfo.ID, cur_dev.DeviceInfo) + } + ascendDevices[dev.CommonWord()] = asDevices + } + return ascendDevices +} + +func GetAscendDeviceNames() []string { + curConfig := config.GetConfig() + if curConfig == nil { + klog.V(5).InfoS("cur config is null. 
call GetDefaultDevicesConfig") + curConfig = config.GetDefaultDevicesConfig() + } + deviceNames := make([]string, 0, len(curConfig.VNPUs)) + for _, vnpu := range curConfig.VNPUs { + deviceNames = append(deviceNames, vnpu.CommonWord) + } + return deviceNames +} + +func (ads *AscendDevices) AddResourceUsage(id string, cores int32, mem int32) error { + dev, ok := ads.Devices[id] + if !ok { + return fmt.Errorf("ascend device %s not found", id) + } + dev.DeviceUsage.Used++ + dev.DeviceUsage.Usedcores += cores + dev.DeviceUsage.Usedmem += mem + return nil +} + +func (ads *AscendDevices) SubResourceUsage(id string, cores int32, mem int32) error { + dev, ok := ads.Devices[id] + if !ok { + return fmt.Errorf("ascend device %s not found", id) + } + dev.DeviceUsage.Used-- + dev.DeviceUsage.Usedcores -= cores + dev.DeviceUsage.Usedmem -= mem + return nil +} + +func (ads *AscendDevices) AddResource(pod *v1.Pod) { + if ads == nil { + return + } + ads.addResource(pod.Annotations, pod) +} + +func (ads *AscendDevices) SubResource(pod *v1.Pod) { + if ads == nil { + return + } + ano_key := devices.InRequestDevices[ads.Type] + ano, ok := pod.Annotations[ano_key] + if !ok { + return + } + con_devs, err := devices.DecodeContainerDevices(ano) + if err != nil { + klog.ErrorS(err, "failed to decode container devices", "pod", pod.Name, "annotation", ano) + return + } + for _, cono_dev := range con_devs { + ads.SubResourceUsage(cono_dev.UUID, cono_dev.Usedcores, cono_dev.Usedmem) + } +} + +func (ads *AscendDevices) addResource(annotations map[string]string, pod *v1.Pod) { + ano_key := devices.InRequestDevices[ads.Type] + ano, ok := annotations[ano_key] + if !ok { + return + } + con_devs, err := devices.DecodeContainerDevices(ano) + if err != nil { + klog.ErrorS(err, "failed to decode container devices", "pod", pod.Name, "annotation", ano) + return + } + for _, cono_dev := range con_devs { + ads.AddResourceUsage(cono_dev.UUID, cono_dev.Usedcores, cono_dev.Usedmem) + } +} + +func (ads *AscendDevices) AddQueueResource(pod *v1.Pod) map[string]float64 { + return map[string]float64{} +} + +func (ads *AscendDevices) HasDeviceRequest(pod *v1.Pod) bool { + if !AscendHAMiVNPUEnable { + return false + } + randDev, err := ads.getFirstDevice() + if randDev == nil || err != nil { + return false + } + var vnpu_config = randDev.config + for _, container := range pod.Spec.Containers { + _, ok := container.Resources.Limits[v1.ResourceName(vnpu_config.ResourceName)] + if ok { + klog.V(5).Infof("%s check HasDeviceRequest ok. %s", ads.Type, vnpu_config.ResourceName) + return true + } + _, ok = container.Resources.Limits[v1.ResourceName(vnpu_config.ResourceMemoryName)] + if ok { + klog.V(5).Infof("%s check HasDeviceRequest ok. %s", ads.Type, vnpu_config.ResourceMemoryName) + return true + } + } + klog.V(5).Infof("%s check HasDeviceRequest false", ads.Type) + return false +} + +func (ads *AscendDevices) FilterNode(pod *v1.Pod, policy string) (int, string, error) { + _, err := ads.selectDevices(pod, policy) + if err != nil { + return devices.Error, "no ascend device available", err + } + klog.V(4).Infoln("ascend DeviceSharing successfully filters pods. 
device_type:", ads.Type) + return devices.Success, "", nil +} + +func (ads *AscendDevices) ScoreNode(pod *v1.Pod, policy string) float64 { + ads.Policy = policy + podDevs, err := ads.selectDevices(pod, policy) + if err != nil { + return 0 + } + score := 0.0 + var usedDevs []*AscendDevice + for _, dev := range podDevs { + dev, ok := ads.Devices[dev[0].UUID] + if !ok { + return 0 + } + usedDevs = append(usedDevs, dev) + score += CalScore(policy, dev.DeviceUsage, dev.DeviceInfo) + } + + if strings.HasPrefix(ads.Type, Ascend910Prefix) && hasNetworkID(usedDevs) { + klog.V(4).Infof("all devices have NetworkID. device CommonWord %s", ads.Type) + cntMap := make(map[int]int) + for _, dev := range usedDevs { + if dev.DeviceInfo.CustomInfo == nil { + return 0 + } + if networkID, ok := dev.DeviceInfo.CustomInfo["NetworkID"]; ok { + if id, ok := networkID.(float64); ok { + cntMap[int(id)]++ + } + } else { + return 0 + } + } + maxCnt, totalCnt := 0, 0 + for _, cnt := range cntMap { + if cnt > maxCnt { + maxCnt = cnt + } + totalCnt += cnt + } + if totalCnt == 0 { + return 0 + } + score += (float64(maxCnt) / float64(totalCnt)) * Ascend910NetworkWeight + } + return score +} + +func (ads *AscendDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error { + klog.V(4).Infof("Allocate device %s to Pod %s", ads.Type, pod.Name) + if NodeLockEnable { + nodelock.UseClient(kubeClient) + err := nodelock.LockNode(ads.NodeName, ads.Type) + if err != nil { + return errors.Errorf("node %s locked for %s hami vnpu. lockname %s", ads.NodeName, pod.Name, err.Error()) + } + } + podDevs, err := ads.selectDevices(pod, ads.Policy) + if err != nil { + return errors.Errorf("failed to select ascend devices for pod %s: %v", pod.Name, err) + } + annotations := make(map[string]string) + ads.PatchAnnotations(pod, &annotations, podDevs) + + ads.addResource(annotations, pod) + annotations[AssignedNodeAnnotations] = ads.NodeName + annotations[AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + annotations[DeviceBindPhase] = "allocating" + annotations[BindTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + err = devices.PatchPodAnnotations(kubeClient, pod, annotations) + if err != nil { + return err + } + if NodeLockEnable { + nodelock.ReleaseNodeLock(ads.NodeName, ads.Type) + } + klog.V(4).Infof("Allocate Success. device %s Pod %s", ads.Type, pod.Name) + return nil +} + +func (ads *AscendDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) error { + return nil +} + +func (ads *AscendDevices) GetIgnoredDevices() []string { + randDev, err := ads.getFirstDevice() + if randDev == nil || err != nil { + return []string{""} + } + vnpuConfig := randDev.config + return []string{vnpuConfig.ResourceMemoryName} +} + +func (ads *AscendDevices) GetStatus() string { + return "" +} + +func (ads *AscendDevices) selectDevices(pod *v1.Pod, schedulePolicy string) (devices.PodSingleDevice, error) { + dupDevs := getDeviceSnapshot(ads) + if len(dupDevs) == 0 { + return nil, errors.Errorf("no ascend device available") + } + for _, dev := range dupDevs { + dev.Score = CalScore(schedulePolicy, dev.DeviceUsage, dev.DeviceInfo) + } + sort.Slice(dupDevs, func(i, j int) bool { + return dupDevs[i].Score > dupDevs[j].Score + }) + needTopology := false + if strings.HasPrefix(ads.Type, Ascend910Prefix) && hasNetworkID(dupDevs) { + klog.V(4).Infof("all devices have NetworkID. 
device CommonWord %s", ads.Type) + needTopology = true + } + reqs := dupDevs[0].ResourceReqs(pod) + var podDevs devices.PodSingleDevice + usedDevs := make([]*AscendDevice, 0) + for _, req := range reqs { + klog.V(5).Infof("req %+v", req) + availableDevs := make([]*AscendDevice, 0) + for _, dev := range dupDevs { + selected := false + for _, usedDev := range usedDevs { + if usedDev.DeviceInfo.ID == dev.DeviceInfo.ID { + selected = true + break + } + } + if !selected { + availableDevs = append(availableDevs, dev) + } + } + req_nums := req.Nums + selectedDevs := make([]*AscendDevice, 0) + for _, dev := range availableDevs { + klog.V(5).Infof("check fit. req %+v dev_info %+v dev_usage %+v", req, dev.DeviceInfo, dev.DeviceUsage) + if !fit(&req, dev) { + klog.V(5).Infof("fit false. dev ID %s", dev.DeviceInfo.ID) + continue + } + selectedDevs = append(selectedDevs, dev) + req_nums -= 1 + if req_nums <= 0 && !needTopology { + break + } + } + if req_nums > 0 { + klog.V(5).Infof("no enough ascend device available! raw req_nums %d cur req_nums %d", req.Nums, req_nums) + return nil, errors.Errorf("no enough ascend device available") + } + if needTopology { + selectedDevs = selectDevicesWithTopology(int(req.Nums), selectedDevs) + } + usedDevs = append(usedDevs, selectedDevs...) + var conDevs devices.ContainerDevices + for _, dev := range selectedDevs { + conDevs = append(conDevs, devices.ContainerDevice{ + UUID: dev.DeviceInfo.ID, + Type: ads.Type, + Usedmem: req.Memreq, + Usedcores: req.Coresreq, + CustomInfo: dev.DeviceInfo.CustomInfo, + }) + } + podDevs = append(podDevs, conDevs) + } + return podDevs, nil +} + +func hasNetworkID(devices []*AscendDevice) bool { + for _, dev := range devices { + if dev.DeviceInfo.CustomInfo == nil { + return false + } + if _, ok := dev.DeviceInfo.CustomInfo["NetworkID"]; !ok { + return false + } + } + return true +} + +func fit(req *devices.ContainerDeviceRequest, dev *AscendDevice) bool { + if req.Type != dev.config.CommonWord { + return false + } + deviceUsage := dev.DeviceUsage + deviceInfo := dev.DeviceInfo + if deviceInfo.Count <= deviceUsage.Used { + return false + } + if deviceInfo.Devmem-deviceUsage.Usedmem < req.Memreq { + return false + } + if deviceInfo.Devcore-deviceUsage.Usedcores < req.Coresreq { + return false + } + if deviceInfo.Devcore == 100 && req.Coresreq == 100 && deviceUsage.Used > 0 { + return false + } + if deviceInfo.Devcore != 0 && deviceUsage.Usedcores == deviceInfo.Devcore && req.Coresreq == 0 { + return false + } + return true +} + +func getDeviceSnapshot(ads *AscendDevices) []*AscendDevice { + dupDevs := make([]*AscendDevice, 0, len(ads.Devices)) + for _, dev := range ads.Devices { + dup_dev := &AscendDevice{ + config: dev.config, + nodeRegisterAnno: dev.nodeRegisterAnno, + useUUIDAnno: dev.useUUIDAnno, + noUseUUIDAnno: dev.noUseUUIDAnno, + handshakeAnno: dev.handshakeAnno, + DeviceInfo: dev.DeviceInfo, + DeviceUsage: &devices.DeviceUsage{ + Used: dev.DeviceUsage.Used, + Usedmem: dev.DeviceUsage.Usedmem, + Usedcores: dev.DeviceUsage.Usedcores, + }, + } + dupDevs = append(dupDevs, dup_dev) + } + return dupDevs +} + +func selectDevicesWithTopology(req_nums int, selected_devs []*AscendDevice) []*AscendDevice { + networkMap := make(map[int][]*AscendDevice) + + for _, dev := range selected_devs { + if dev.DeviceInfo.CustomInfo != nil { + if networkID, ok := dev.DeviceInfo.CustomInfo["NetworkID"]; ok { + if id, ok := networkID.(float64); ok { + networkMap[int(id)] = append(networkMap[int(id)], dev) + } + } + } + } + type NetworkDeviceCount 
struct { + NetworkID int + Count int + } + var sortedNetworks []NetworkDeviceCount + for networkID, devices := range networkMap { + sortedNetworks = append(sortedNetworks, NetworkDeviceCount{ + NetworkID: networkID, + Count: len(devices), + }) + } + sort.Slice(sortedNetworks, func(i, j int) bool { + return sortedNetworks[i].Count > sortedNetworks[j].Count + }) + devs := make([]*AscendDevice, 0) + for _, item := range sortedNetworks { + for _, dev := range networkMap[item.NetworkID] { + devs = append(devs, dev) + if len(devs) == req_nums { + return devs + } + } + } + return devs +} + +func (ads *AscendDevices) getFirstDevice() (*AscendDevice, error) { + if len(ads.Devices) == 0 { + return nil, errors.New("no ascend device available") + } + for _, dev := range ads.Devices { + return dev, nil + } + return nil, errors.New("no ascend device available") +} + +func (dev *AscendDevice) trimMemory(m int64) (int64, string) { + for i := range dev.config.Templates { + if m <= dev.config.Templates[i].Memory { + return dev.config.Templates[i].Memory, dev.config.Templates[i].Name + } + } + if m <= dev.config.MemoryCapacity { + return dev.config.MemoryAllocatable, "" + } + return 0, "" +} + +func InitDevices(config []config.VNPUConfig) []*AscendDevice { + devs := make([]*AscendDevice, 0) + for _, vnpu := range config { + commonWord := vnpu.CommonWord + dev := &AscendDevice{ + config: vnpu, + nodeRegisterAnno: fmt.Sprintf("%s/node-register-%s", HAMiAnnotationsPrefix, commonWord), + useUUIDAnno: fmt.Sprintf("%s/use-%s-uuid", HAMiAnnotationsPrefix, commonWord), + noUseUUIDAnno: fmt.Sprintf("%s/no-use-%s-uuid", HAMiAnnotationsPrefix, commonWord), + handshakeAnno: fmt.Sprintf("%s/node-handshake-%s", HAMiAnnotationsPrefix, commonWord), + } + sort.Slice(dev.config.Templates, func(i, j int) bool { + return dev.config.Templates[i].Memory < dev.config.Templates[j].Memory + }) + _, ok := devices.InRequestDevices[commonWord] + if !ok { + devices.InRequestDevices[commonWord] = fmt.Sprintf("%s/%s-devices-to-allocate", HAMiAnnotationsPrefix, commonWord) + devices.SupportDevices[commonWord] = fmt.Sprintf("%s/%s-devices-allocated", HAMiAnnotationsPrefix, commonWord) + // util.HandshakeAnnos[commonWord] = dev.handshakeAnno + } + devs = append(devs, dev) + klog.Infof("load ascend vnpu config %s: %v", commonWord, dev.config) + } + return devs +} + +func ParseConfig(fs *flag.FlagSet) { + fs.BoolVar(&AscendHAMiVNPUEnable, "AscendHAMiVNPUEnable", false, "enable ascend device") +} + +func (dev *AscendDevice) CommonWord() string { + return dev.config.CommonWord +} + +func (dev *AscendDevice) GetNodeDevices(n v1.Node) ([]*devices.DeviceInfo, error) { + anno, ok := n.Annotations[dev.nodeRegisterAnno] + if !ok { + return []*devices.DeviceInfo{}, fmt.Errorf("annos not found %s", dev.nodeRegisterAnno) + } + nodeDevices, err := devices.UnMarshalNodeDevices(anno) + if err != nil { + klog.ErrorS(err, "failed to unmarshal node devices", "node", n.Name, "device annotation", anno) + return []*devices.DeviceInfo{}, err + } + if len(nodeDevices) == 0 { + klog.InfoS("no ascend device found", "node", n.Name, "device annotation", anno) + return []*devices.DeviceInfo{}, errors.New("no device found on node") + } + return nodeDevices, nil +} + +func (dev *AscendDevice) ResourceReqs(pod *v1.Pod) []devices.ContainerDeviceRequest { + reqs := devices.ExtractResourceRequest(pod, dev.CommonWord(), dev.config.ResourceName, dev.config.ResourceMemoryName, "", "") + for i := range reqs { + req := &reqs[i] + if req.Memreq == 0 && req.MemPercentagereq != 0 { + 
req.Memreq = int32(dev.DeviceInfo.Devmem * req.MemPercentagereq / 100) + klog.V(5).Infof("new memreq %d totalmem %d mempercentage %d", req.Memreq, dev.DeviceInfo.Devmem, req.MemPercentagereq) + } + if req.Memreq > 0 { + m, _ := dev.trimMemory(int64(req.Memreq)) + klog.V(5).Infof("raw mem %d, trimed mem %d", req.Memreq, m) + req.Memreq = int32(m) + } + } + return reqs +} + +func (ads *AscendDevices) PatchAnnotations(pod *v1.Pod, annoInput *map[string]string, devList devices.PodSingleDevice) map[string]string { + dev, err := ads.getFirstDevice() + if err != nil { + return *annoInput + } + commonWord := dev.CommonWord() + + (*annoInput)[devices.InRequestDevices[commonWord]] = devices.EncodePodSingleDevice(devList) + (*annoInput)[devices.SupportDevices[commonWord]] = devices.EncodePodSingleDevice(devList) + (*annoInput)["predicate-time"] = strconv.FormatInt(time.Now().Unix(), 10) + allocateStr := fmt.Sprintf("huawei.com/%s", dev.CommonWord()) + var rtInfo []RuntimeInfo + for _, dp := range devList { + for _, val := range dp { + _, temp := dev.trimMemory(int64(val.Usedmem)) + rtInfo = append(rtInfo, RuntimeInfo{ + UUID: val.UUID, + Temp: temp, + }) + } + } + s, err := json.Marshal(rtInfo) + if err != nil { + klog.ErrorS(err, "failed to marshal runtime info", "runtime info", rtInfo) + } + (*annoInput)[allocateStr] = string(s) + + return *annoInput +} + +func (dev *AscendDevice) GetResourceNames() devices.ResourceNames { + return devices.ResourceNames{ + ResourceCountName: dev.config.ResourceName, + ResourceMemoryName: dev.config.ResourceMemoryName, + ResourceCoreName: "", + } +} + +func CalScore(schedulePolicy string, dev_usage *devices.DeviceUsage, dev_info *devices.DeviceInfo) float64 { + var score float64 + switch schedulePolicy { + case binpackPolicy: + score = binpackMultiplier * (float64(dev_usage.Usedmem) / float64(dev_info.Devmem)) + case spreadPolicy: + if dev_usage.Used == 1 { + score = spreadMultiplier + } + default: + score = float64(0) + } + return score +} diff --git a/pkg/scheduler/api/devices/ascend/hami/device_info_test.go b/pkg/scheduler/api/devices/ascend/hami/device_info_test.go new file mode 100644 index 0000000000..6c8034af9b --- /dev/null +++ b/pkg/scheduler/api/devices/ascend/hami/device_info_test.go @@ -0,0 +1,334 @@ +/* +Copyright 2025 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package hami + +import ( + "fmt" + "testing" + + "volcano.sh/volcano/pkg/scheduler/api/devices" + "volcano.sh/volcano/pkg/scheduler/api/devices/config" + + "github.com/stretchr/testify/assert" + "gopkg.in/yaml.v2" +) + +var config_yaml = ` +vnpus: +- chipName: 910A + commonWord: Ascend910A + resourceName: huawei.com/Ascend910A + resourceMemoryName: huawei.com/Ascend910A-memory + memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 30 + templates: + - name: vir02 + memory: 2184 + aiCore: 2 + - name: vir04 + memory: 4369 + aiCore: 4 + - name: vir08 + memory: 8738 + aiCore: 8 + - name: vir16 + memory: 17476 + aiCore: 16 +- chipName: 910B2 + commonWord: Ascend910B2 + resourceName: huawei.com/Ascend910B2 + resourceMemoryName: huawei.com/Ascend910B2-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 24 + aiCPU: 6 + templates: + - name: vir03_1c_8g + memory: 8192 + aiCore: 3 + aiCPU: 1 + - name: vir06_1c_16g + memory: 16384 + aiCore: 6 + aiCPU: 1 + - name: vir12_3c_32g + memory: 32768 + aiCore: 12 + aiCPU: 3 +- chipName: 910B3 + commonWord: Ascend910B3 + resourceName: huawei.com/Ascend910B3 + resourceMemoryName: huawei.com/Ascend910B3-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_16g + memory: 16384 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_32g + memory: 32768 + aiCore: 10 + aiCPU: 3 +- chipName: 910B4-1 + commonWord: Ascend910B4-1 + resourceName: huawei.com/Ascend910B4-1 + resourceMemoryName: huawei.com/Ascend910B4-1-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_8g + memory: 8192 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_16g + memory: 16384 + aiCore: 10 + aiCPU: 3 +- chipName: 910B4 + commonWord: Ascend910B4 + resourceName: huawei.com/Ascend910B4 + resourceMemoryName: huawei.com/Ascend910B4-memory + memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_8g + memory: 8192 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_16g + memory: 16384 + aiCore: 10 + aiCPU: 3 +- chipName: 310P3 + commonWord: Ascend310P + resourceName: huawei.com/Ascend310P + resourceMemoryName: huawei.com/Ascend310P-memory + memoryAllocatable: 21527 + memoryCapacity: 24576 + aiCore: 8 + aiCPU: 7 + templates: + - name: vir01 + memory: 3072 + aiCore: 1 + aiCPU: 1 + - name: vir02 + memory: 6144 + aiCore: 2 + aiCPU: 2 + - name: vir04 + memory: 12288 + aiCore: 4 + aiCPU: 4 +nvidia: + resourceCountName: volcano.sh/vgpu-number + resourceMemoryName: volcano.sh/vgpu-memory + resourceMemoryPercentageName: volcano.sh/vgpu-memory-percentage + resourceCoreName: volcano.sh/vgpu-cores + overwriteEnv: false + defaultMemory: 0 + defaultCores: 0 + defaultGPUNum: 1 + deviceSplitCount: 10 + deviceMemoryScaling: 1 + deviceCoreScaling: 1 + gpuMemoryFactor: 1 + knownMigGeometries: + - models: [ "A30" ] + allowedGeometries: + - group: group1 + geometries: + - name: 1g.6gb + memory: 6144 + count: 4 + - group: group2 + geometries: + - name: 2g.12gb + memory: 12288 + count: 2 + - group: group3 + geometries: + - name: 4g.24gb + memory: 24576 + count: 1 +` + +func yamlStringToConfig(yamlStr string) (*config.Config, error) { + var config config.Config + err := yaml.Unmarshal([]byte(yamlStr), &config) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal YAML: %v", err) + } + return &config, nil +} + +func Test_trimMemory(t *testing.T) { + conf, err := yamlStringToConfig(config_yaml) + assert.Nil(t, err) + dev := AscendDevice{ + config: 
conf.VNPUs[len(conf.VNPUs)-1], + } + tests := []struct { + name string + inputMem int64 + wantMem int64 + }{ + {"test1", 0, 3072}, + {"test2", 1, 3072}, + {"test3", 100, 3072}, + {"test4", 3071, 3072}, + {"test5", 3072, 3072}, + {"test6", 3073, 6144}, + {"test7", 6143, 6144}, + {"test8", 6144, 6144}, + {"test9", 6144, 6144}, + {"test10", 6145, 12288}, + {"test11", 12288, 12288}, + {"test12", 12289, 21527}, + {"test13", 21527, 21527}, + {"test14", 21528, 21527}, + {"test15", 24576, 21527}, + {"test16", 24577, 0}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, _ := dev.trimMemory(tt.inputMem) + assert.Equal(t, tt.wantMem, got) + }) + } +} + +func Test_fit(t *testing.T) { + conf, err := yamlStringToConfig(config_yaml) + assert.Nil(t, err) + ascend310PConfig := conf.VNPUs[len(conf.VNPUs)-1] + device_info := &devices.DeviceInfo{ + ID: "68496E64-20E05477-92C31323-6E78030A-BD003019", + Index: 0, + Count: 7, + Devcore: 8, + Devmem: 21527, + } + tests := []struct { + name string + req *devices.ContainerDeviceRequest + dev *AscendDevice + result bool + }{ + { + "test1", + &devices.ContainerDeviceRequest{ + Nums: 1, + Type: "Ascend310P", + Memreq: 1024, + }, + &AscendDevice{ + config: ascend310PConfig, + DeviceInfo: device_info, + DeviceUsage: &devices.DeviceUsage{ + Used: 1, + Usedmem: 3072, + }, + }, + true, + }, + { + "test2", + &devices.ContainerDeviceRequest{ + Nums: 1, + Type: "Ascend310P", + Memreq: 21527, + }, + &AscendDevice{ + config: ascend310PConfig, + DeviceInfo: device_info, + DeviceUsage: &devices.DeviceUsage{ + Used: 1, + Usedmem: 3072, + }, + }, + false, + }, + { + "test3", + &devices.ContainerDeviceRequest{ + Nums: 1, + Type: "Ascend310P", + Memreq: 6144, + }, + &AscendDevice{ + config: ascend310PConfig, + DeviceInfo: device_info, + DeviceUsage: &devices.DeviceUsage{ + Used: 1, + Usedmem: 12288, + }, + }, + true, + }, + { + "test4", + &devices.ContainerDeviceRequest{ + Nums: 1, + Type: "Ascend310P", + Memreq: 24576, + }, + &AscendDevice{ + config: ascend310PConfig, + DeviceInfo: device_info, + DeviceUsage: &devices.DeviceUsage{ + Used: 0, + Usedmem: 0, + }, + }, + false, + }, + { + "test5_core", + &devices.ContainerDeviceRequest{ + Nums: 1, + Type: "Ascend310P", + Memreq: 6144, + Coresreq: 4, + }, + &AscendDevice{ + config: ascend310PConfig, + DeviceInfo: device_info, + DeviceUsage: &devices.DeviceUsage{ + Used: 1, + Usedmem: 12288, + Usedcores: 6, + }, + }, + false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ret := fit(tt.req, tt.dev) + assert.Equal(t, tt.result, ret) + }) + } +} diff --git a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/device_info.go b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/device_info.go similarity index 99% rename from pkg/scheduler/api/devices/ascend/ascend310p/vnpu/device_info.go rename to pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/device_info.go index ad6c0eb336..3876fa1bf5 100644 --- a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/device_info.go +++ b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/device_info.go @@ -155,7 +155,7 @@ func (ns *NPUDevices) AddQueueResource(pod *v1.Pod) map[string]float64 { } func (ns *NPUDevices) HasDeviceRequest(pod *v1.Pod) bool { - if Ascend310pvNPUEnable && checkVNPUResourcesInPod(pod) { + if AscendMindClusterVNPUEnable && checkVNPUResourcesInPod(pod) { return true } return false diff --git a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/device_manage.go 
b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/device_manage.go similarity index 100% rename from pkg/scheduler/api/devices/ascend/ascend310p/vnpu/device_manage.go rename to pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/device_manage.go diff --git a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/type.go b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/type.go similarity index 99% rename from pkg/scheduler/api/devices/ascend/ascend310p/vnpu/type.go rename to pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/type.go index bda9ba3925..0e314ec93b 100644 --- a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/type.go +++ b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/type.go @@ -9,13 +9,13 @@ import ( "k8s.io/client-go/kubernetes" ) -var Ascend310pvNPUEnable bool +var AscendMindClusterVNPUEnable bool const ( DeviceName = "ascend310p-vNPU" ) -type NPUDevice struct { // One VNode/VChip? +type NPUDevice struct { VT VTemplate // Chips map chipID to VChip class Chips map[int]*VChip @@ -42,7 +42,7 @@ type NPUDevice struct { // One VNode/VChip? // for Concurrent task. not same core request task only has one on a node in same time. // nodeName: templateName:taskUID - ConCache map[string]map[types.UID]struct{} //types.UID对应pod.UID + ConCache map[string]map[types.UID]struct{} //types.UID equals to pod.UID } type NodeInf struct { diff --git a/pkg/scheduler/api/devices/ascend/ascend310p/vnpu/utils.go b/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/utils.go similarity index 100% rename from pkg/scheduler/api/devices/ascend/ascend310p/vnpu/utils.go rename to pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu/utils.go diff --git a/pkg/scheduler/api/devices/config/config.go b/pkg/scheduler/api/devices/config/config.go index ad61bacbf2..0878e7d6d8 100644 --- a/pkg/scheduler/api/devices/config/config.go +++ b/pkg/scheduler/api/devices/config/config.go @@ -44,11 +44,30 @@ const ( hygon: resourceCountName: "volcano.sh/vdcu" ... + vnpus: + - chipName: 910B3 + commonWord: Ascend910B3 + resourceName: huawei.com/Ascend910B3 + resourceMemoryName: huawei.com/Ascend910B3-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_16g + memory: 16384 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_32g + memory: 32768 + aiCore: 10 + aiCPU: 3 */ type Config struct { //NvidiaConfig is used for vGPU feature for nvidia, gpushare is not using this config NvidiaConfig NvidiaConfig `yaml:"nvidia"` + VNPUs []VNPUConfig `yaml:"vnpus"` } var ( @@ -97,6 +116,38 @@ func GetDefaultDevicesConfig() *Config { DeviceCoreScaling: 1, DisableCoreLimit: false, }, + VNPUs: []VNPUConfig{ + { + CommonWord: "Ascend310P", + ChipName: "310P3", + ResourceName: "huawei.com/Ascend310P", + ResourceMemoryName: "huawei.com/Ascend310P-memory", + MemoryAllocatable: 21527, + MemoryCapacity: 24576, + AICore: 8, + AICPU: 7, + Templates: []Template{ + { + Name: "vir01", + Memory: 3072, + AICore: 1, + AICPU: 1, + }, + { + Name: "vir02", + Memory: 6144, + AICore: 2, + AICPU: 2, + }, + { + Name: "vir04", + Memory: 12288, + AICore: 4, + AICPU: 4, + }, + }, + }, + }, } } diff --git a/pkg/scheduler/api/devices/config/vnpu.go b/pkg/scheduler/api/devices/config/vnpu.go new file mode 100644 index 0000000000..7b986dbe4f --- /dev/null +++ b/pkg/scheduler/api/devices/config/vnpu.go @@ -0,0 +1,36 @@ +/* +Copyright 2025 The Volcano Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +type Template struct { + Name string `yaml:"name"` + Memory int64 `yaml:"memory"` + AICore int32 `yaml:"aiCore,omitempty"` + AICPU int32 `yaml:"aiCPU,omitempty"` +} + +type VNPUConfig struct { + CommonWord string `yaml:"commonWord"` + ChipName string `yaml:"chipName"` + ResourceName string `yaml:"resourceName"` + ResourceMemoryName string `yaml:"resourceMemoryName"` + MemoryAllocatable int64 `yaml:"memoryAllocatable"` + MemoryCapacity int64 `yaml:"memoryCapacity"` + AICore int32 `yaml:"aiCore"` + AICPU int32 `yaml:"aiCPU"` + Templates []Template `yaml:"templates"` +} diff --git a/pkg/scheduler/api/devices/device_info.go b/pkg/scheduler/api/devices/device_info.go new file mode 100644 index 0000000000..54f2696cc9 --- /dev/null +++ b/pkg/scheduler/api/devices/device_info.go @@ -0,0 +1,192 @@ +/* +Copyright 2023 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package devices + +import ( + "encoding/json" + "fmt" + "strconv" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" +) + +type DeviceInfo struct { + ID string `json:"id,omitempty"` + Index uint `json:"index,omitempty"` + Count int32 `json:"count,omitempty"` + Devmem int32 `json:"devmem,omitempty"` + Devcore int32 `json:"devcore,omitempty"` + Type string `json:"type,omitempty"` + Numa int `json:"numa,omitempty"` + Mode string `json:"mode,omitempty"` + Health bool `json:"health,omitempty"` + DeviceVendor string `json:"devicevendor,omitempty"` + CustomInfo map[string]any `json:"custominfo,omitempty"` + DevicePairScore DevicePairScore `json:"devicepairscore,omitempty"` +} + +type DevicePairScores []DevicePairScore +type DevicePairScore struct { + ID string `json:"uuid,omitempty"` + Scores map[string]int `json:"score,omitempty"` +} + +type DeviceUsage struct { + Used int32 + Usedmem int32 + Usedcores int32 +} + +type ResourceNames struct { + ResourceCountName string + ResourceMemoryName string + ResourceCoreName string +} + +type ContainerDevice struct { + // TODO current Idx cannot use, because EncodeContainerDevices method not encode this filed. + Idx int + UUID string + Type string + Usedmem int32 + Usedcores int32 + CustomInfo map[string]any +} + +type ContainerDeviceRequest struct { + Nums int32 + Type string + Memreq int32 + MemPercentagereq int32 + Coresreq int32 +} + +const ( + // OneContainerMultiDeviceSplitSymbol this is when one container use multi device, use : symbol to join device info. 
+ OneContainerMultiDeviceSplitSymbol = ":" + + // OnePodMultiContainerSplitSymbol this is when one pod having multi container and more than one container use device, use ; symbol to join device info. + OnePodMultiContainerSplitSymbol = ";" +) + +var ( + HandshakeAnnos = map[string]string{} + InRequestDevices = map[string]string{} + SupportDevices = map[string]string{} +) + +type ContainerDevices []ContainerDevice +type ContainerDeviceRequests map[string]ContainerDeviceRequest +type PodDeviceRequests []ContainerDeviceRequests +type PodSingleDevice []ContainerDevices +type PodDevices map[string]PodSingleDevice + +func CheckHealth(devType string, n *corev1.Node) (bool, bool) { + handshake := n.Annotations[HandshakeAnnos[devType]] + if strings.Contains(handshake, "Requesting") { + formertime, _ := time.Parse(time.DateTime, strings.Split(handshake, "_")[1]) + return time.Now().Before(formertime.Add(time.Second * 60)), false + } else if strings.Contains(handshake, "Deleted") { + return true, false + } else { + return true, true + } +} + +func UnMarshalNodeDevices(str string) ([]*DeviceInfo, error) { + var dlist []*DeviceInfo + err := json.Unmarshal([]byte(str), &dlist) + return dlist, err +} + +func EncodeContainerDevices(cd ContainerDevices) string { + tmp := "" + for _, val := range cd { + tmp += val.UUID + "," + val.Type + "," + strconv.Itoa(int(val.Usedmem)) + "," + strconv.Itoa(int(val.Usedcores)) + OneContainerMultiDeviceSplitSymbol + } + klog.Infof("Encoded container Devices: %s", tmp) + return tmp + //return strings.Join(cd, ",") +} + +func EncodePodSingleDevice(pd PodSingleDevice) string { + res := "" + for _, ctrdevs := range pd { + res = res + EncodeContainerDevices(ctrdevs) + res = res + OnePodMultiContainerSplitSymbol + } + klog.Infof("Encoded pod single devices %s", res) + return res +} + +func DecodeContainerDevices(str string) (ContainerDevices, error) { + if len(str) == 0 { + return ContainerDevices{}, nil + } + cd := strings.Split(str, OneContainerMultiDeviceSplitSymbol) + contdev := ContainerDevices{} + tmpdev := ContainerDevice{} + klog.V(5).Infof("Start to decode container device %s", str) + for _, val := range cd { + if strings.Contains(val, ",") { + //fmt.Println("cd is ", val) + tmpstr := strings.Split(val, ",") + if len(tmpstr) < 4 { + return ContainerDevices{}, fmt.Errorf("pod annotation format error; information missing, please do not use nodeName field in task") + } + tmpdev.UUID = tmpstr[0] + tmpdev.Type = tmpstr[1] + devmem, _ := strconv.ParseInt(tmpstr[2], 10, 32) + tmpdev.Usedmem = int32(devmem) + devcores, _ := strconv.ParseInt(tmpstr[3], 10, 32) + tmpdev.Usedcores = int32(devcores) + contdev = append(contdev, tmpdev) + } + } + klog.V(5).Infof("Finished decoding container devices. 
Total devices: %d", len(contdev)) + return contdev, nil +} + +func DecodePodDevices(checklist map[string]string, annos map[string]string) (PodDevices, error) { + klog.V(5).Infof("checklist is [%+v], annos is [%+v]", checklist, annos) + if len(annos) == 0 { + return PodDevices{}, nil + } + pd := make(PodDevices) + for devID, devs := range checklist { + str, ok := annos[devs] + if !ok { + continue + } + pd[devID] = make(PodSingleDevice, 0) + for _, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) { + cd, err := DecodeContainerDevices(s) + if err != nil { + return PodDevices{}, err + } + if len(cd) == 0 { + continue + } + pd[devID] = append(pd[devID], cd) + } + } + klog.V(5).InfoS("Decoded pod annos", "poddevices", pd) + return pd, nil +} diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/type.go b/pkg/scheduler/api/devices/nvidia/vgpu/type.go index 68c62bb3fe..ef202572ff 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/type.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/type.go @@ -60,15 +60,6 @@ var ( NodeLockEnable bool ) -type ContainerDeviceRequest struct { - Nums int32 - // device type, like NVIDIA, MLU - Type string - Memreq uint - MemPercentagereq int32 - Coresreq uint -} - type ContainerDevice struct { UUID string // device type, like NVIDIA, MLU diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go index 96f0218076..d09f87ee61 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go @@ -170,73 +170,12 @@ func checkVGPUResourcesInPod(pod *v1.Pod) bool { return false } -func resourcereqs(pod *v1.Pod) []ContainerDeviceRequest { - resourceName := v1.ResourceName(getConfig().ResourceCountName) - resourceMem := v1.ResourceName(getConfig().ResourceMemoryName) - resourceMemPercentage := v1.ResourceName(getConfig().ResourceMemoryPercentageName) - resourceCores := v1.ResourceName(getConfig().ResourceCoreName) - counts := []ContainerDeviceRequest{} - - //Count Nvidia GPU - for i := 0; i < len(pod.Spec.Containers); i++ { - singledevice := false - v, ok := pod.Spec.Containers[i].Resources.Limits[resourceName] - if !ok { - v, ok = pod.Spec.Containers[i].Resources.Limits[resourceMem] - singledevice = true - } - if ok { - n := int64(1) - if !singledevice { - n, _ = v.AsInt64() - } - memnum := uint(0) - mem, ok := pod.Spec.Containers[i].Resources.Limits[resourceMem] - if !ok { - mem, ok = pod.Spec.Containers[i].Resources.Requests[resourceMem] - } - if ok { - memnums, ok := mem.AsInt64() - if ok { - memnum = uint(memnums) - } - } - mempnum := int32(101) - mem, ok = pod.Spec.Containers[i].Resources.Limits[resourceMemPercentage] - if !ok { - mem, ok = pod.Spec.Containers[i].Resources.Requests[resourceMemPercentage] - } - if ok { - mempnums, ok := mem.AsInt64() - if ok { - mempnum = int32(mempnums) - } - } - if mempnum == 101 && memnum == 0 { - mempnum = 100 - } - corenum := uint(0) - core, ok := pod.Spec.Containers[i].Resources.Limits[resourceCores] - if !ok { - core, ok = pod.Spec.Containers[i].Resources.Requests[resourceCores] - } - if ok { - corenums, ok := core.AsInt64() - if ok { - corenum = uint(corenums) - } - } - counts = append(counts, ContainerDeviceRequest{ - Nums: int32(n), - Type: "NVIDIA", - Memreq: memnum, - MemPercentagereq: int32(mempnum), - Coresreq: corenum, - }) - } - } - klog.V(3).Infoln("counts=", counts) - return counts +func resourcereqs(pod *v1.Pod) []devices.ContainerDeviceRequest { + countName := getConfig().ResourceCountName + memoryName := 
getConfig().ResourceMemoryName + percentageName := getConfig().ResourceMemoryPercentageName + coreName := getConfig().ResourceCoreName + return devices.ExtractResourceRequest(pod, "NVIDIA", countName, memoryName, percentageName, coreName) } func checkGPUtype(annos map[string]string, cardtype string) bool { @@ -273,7 +212,7 @@ func checkGPUtype(annos map[string]string, cardtype string) bool { return true } -func checkType(annos map[string]string, d GPUDevice, n ContainerDeviceRequest) bool { +func checkType(annos map[string]string, d GPUDevice, n devices.ContainerDeviceRequest) bool { //General type check, NVIDIA->NVIDIA MLU->MLU if !strings.Contains(d.Type, n.Type) { return false @@ -391,12 +330,12 @@ func checkNodeGPUSharingPredicateAndScore(pod *v1.Pod, gssnap *GPUDevices, repli if val.MemPercentagereq != 101 { memreqForCard = uint(float64(gs.Device[i].Memory) * float64(val.MemPercentagereq) / 100.0) } else { - memreqForCard = val.Memreq + memreqForCard = uint(val.Memreq) } if int(gs.Device[i].Memory)-int(gs.Device[i].UsedMem) < int(memreqForCard) { continue } - if gs.Device[i].UsedCore+val.Coresreq > 100 { + if gs.Device[i].UsedCore+uint(val.Coresreq) > 100 { continue } // Coresreq=100 indicates it want this card exclusively @@ -425,7 +364,7 @@ func checkNodeGPUSharingPredicateAndScore(pod *v1.Pod, gssnap *GPUDevices, repli UUID: uuid, Type: val.Type, Usedmem: memreqForCard, - Usedcores: val.Coresreq, + Usedcores: uint(val.Coresreq), }) score += GPUScore(schedulePolicy, gs.Device[i]) } diff --git a/pkg/scheduler/api/devices/util.go b/pkg/scheduler/api/devices/util.go index e23829f663..3fe9c039f6 100644 --- a/pkg/scheduler/api/devices/util.go +++ b/pkg/scheduler/api/devices/util.go @@ -31,6 +31,14 @@ limitations under the License. package devices import ( + "context" + "encoding/json" + "fmt" + + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8stypes "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/klog/v2" @@ -81,3 +89,152 @@ func NewClient() (*kubernetes.Clientset, error) { client, err := kubernetes.NewForConfig(config) return client, err } + +func GetNode(nodename string) (*v1.Node, error) { + if nodename == "" { + klog.ErrorS(nil, "Node name is empty") + return nil, fmt.Errorf("nodename is empty") + } + + klog.V(5).InfoS("Fetching node", "nodeName", nodename) + n, err := GetClient().CoreV1().Nodes().Get(context.Background(), nodename, metav1.GetOptions{}) + if err != nil { + switch { + case apierrors.IsNotFound(err): + klog.ErrorS(err, "Node not found", "nodeName", nodename) + return nil, fmt.Errorf("node %s not found", nodename) + case apierrors.IsUnauthorized(err): + klog.ErrorS(err, "Unauthorized to access node", "nodeName", nodename) + return nil, fmt.Errorf("unauthorized to access node %s", nodename) + default: + klog.ErrorS(err, "Failed to get node", "nodeName", nodename) + return nil, fmt.Errorf("failed to get node %s: %v", nodename, err) + } + } + + klog.V(5).InfoS("Successfully fetched node", "nodeName", nodename) + return n, nil +} + +func PatchPodAnnotations(kubeClient kubernetes.Interface, pod *v1.Pod, annotations map[string]string) error { + type patchMetadata struct { + Annotations map[string]string `json:"annotations,omitempty"` + } + type patchPod struct { + Metadata patchMetadata `json:"metadata"` + //Spec patchSpec `json:"spec,omitempty"` + } + + p := patchPod{} + p.Metadata.Annotations = annotations + + bytes, err := json.Marshal(p) + if err != 
nil { + return err + } + _, err = kubeClient.CoreV1().Pods(pod.Namespace). + Patch(context.Background(), pod.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + klog.Errorf("patch pod %v failed, %v", pod.Name, err) + } + + return err +} + +func PatchNodeAnnotations(node *v1.Node, annotations map[string]string) error { + type patchMetadata struct { + Annotations map[string]string `json:"annotations,omitempty"` + } + type patchPod struct { + Metadata patchMetadata `json:"metadata"` + //Spec patchSpec `json:"spec,omitempty"` + } + + p := patchPod{} + p.Metadata.Annotations = annotations + + bytes, err := json.Marshal(p) + if err != nil { + return err + } + _, err = GetClient().CoreV1().Nodes(). + Patch(context.Background(), node.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + klog.Infoln("annotations=", annotations) + klog.Infof("patch node %v failed, %v", node.Name, err) + } + return err +} + +func ExtractResourceRequest(pod *v1.Pod, resourceType, countName, memoryName, percentageName, coreName string) []ContainerDeviceRequest { + resourceName := v1.ResourceName(countName) + resourceMem := v1.ResourceName(memoryName) + counts := []ContainerDeviceRequest{} + + //Count Nvidia GPU + for i := 0; i < len(pod.Spec.Containers); i++ { + singledevice := false + v, ok := pod.Spec.Containers[i].Resources.Limits[resourceName] + if !ok { + v, ok = pod.Spec.Containers[i].Resources.Limits[resourceMem] + singledevice = true + } + if ok { + n := int64(1) + if !singledevice { + n, _ = v.AsInt64() + } + memnum := int32(0) + mem, ok := pod.Spec.Containers[i].Resources.Limits[resourceMem] + if !ok { + mem, ok = pod.Spec.Containers[i].Resources.Requests[resourceMem] + } + if ok { + memnums, ok := mem.AsInt64() + if ok { + memnum = int32(memnums) + } + } + mempnum := int32(101) + if percentageName != "" { + resourceMemPercentage := v1.ResourceName(percentageName) + mem, ok = pod.Spec.Containers[i].Resources.Limits[resourceMemPercentage] + if !ok { + mem, ok = pod.Spec.Containers[i].Resources.Requests[resourceMemPercentage] + } + if ok { + mempnums, ok := mem.AsInt64() + if ok { + mempnum = int32(mempnums) + } + } + if mempnum == 101 && memnum == 0 { + mempnum = 100 + } + } + corenum := int32(0) + if coreName != "" { + resourceCores := v1.ResourceName(coreName) + core, ok := pod.Spec.Containers[i].Resources.Limits[resourceCores] + if !ok { + core, ok = pod.Spec.Containers[i].Resources.Requests[resourceCores] + } + if ok { + corenums, ok := core.AsInt64() + if ok { + corenum = int32(corenums) + } + } + } + counts = append(counts, ContainerDeviceRequest{ + Nums: int32(n), + Type: resourceType, + Memreq: memnum, + MemPercentagereq: int32(mempnum), + Coresreq: corenum, + }) + } + } + klog.V(3).Infoln("counts=", counts) + return counts +} diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index 12306aeae7..71bfb6a496 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -27,7 +27,8 @@ import ( "volcano.sh/apis/pkg/apis/scheduling" "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/hami" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" ) @@ -350,10 +351,16 @@ func (ni *NodeInfo) setNodeOthersResource(node 
*v1.Node) { ni.Others[gpushare.DeviceName] = gpushare.NewGPUDevices(ni.Name, node) ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node) ni.Others[vnpu.DeviceName] = vnpu.NewNPUDevices(ni.Name, node) + ascend_ignored_list := []string{} + for device_name, devices := range hami.NewAscendDevices(ni.Name, node) { + ni.Others[device_name] = devices + ascend_ignored_list = append(ascend_ignored_list, devices.GetIgnoredDevices()...) + } IgnoredDevicesList.Set( ni.Others[gpushare.DeviceName].(Devices).GetIgnoredDevices(), ni.Others[vgpu.DeviceName].(Devices).GetIgnoredDevices(), ni.Others[vnpu.DeviceName].(Devices).GetIgnoredDevices(), + ascend_ignored_list, ) } @@ -507,6 +514,13 @@ func (ni *NodeInfo) addResource(pod *v1.Pod) { } ni.Others[vgpu.DeviceName].(Devices).AddResource(pod) ni.Others[vnpu.DeviceName].(Devices).AddResource(pod) + for _, name := range hami.GetAscendDeviceNames() { + if other, exists := ni.Others[name]; exists { + if devices, ok := other.(Devices); ok { + devices.AddResource(pod) + } + } + } } // subResource is used to subtract sharable devices @@ -516,6 +530,13 @@ func (ni *NodeInfo) subResource(pod *v1.Pod) { } ni.Others[vgpu.DeviceName].(Devices).SubResource(pod) ni.Others[vnpu.DeviceName].(Devices).SubResource(pod) + for _, name := range hami.GetAscendDeviceNames() { + if other, exists := ni.Others[name]; exists { + if devices, ok := other.(Devices); ok { + devices.SubResource(pod) + } + } + } } // UpdateTask is used to update a task in nodeInfo object. diff --git a/pkg/scheduler/api/node_info_test.go b/pkg/scheduler/api/node_info_test.go index 01994a089c..31ae101b6e 100644 --- a/pkg/scheduler/api/node_info_test.go +++ b/pkg/scheduler/api/node_info_test.go @@ -24,7 +24,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" fwk "k8s.io/kube-scheduler/framework" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" ) diff --git a/pkg/scheduler/api/shared_device_pool.go b/pkg/scheduler/api/shared_device_pool.go index c6e524140a..27a6a39458 100644 --- a/pkg/scheduler/api/shared_device_pool.go +++ b/pkg/scheduler/api/shared_device_pool.go @@ -22,7 +22,8 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/hami" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" ) @@ -78,6 +79,7 @@ type Devices interface { var _ Devices = new(gpushare.GPUDevices) var _ Devices = new(vgpu.GPUDevices) var _ Devices = new(vnpu.NPUDevices) +var _ Devices = new(hami.AscendDevices) var RegisteredDevices = []string{ gpushare.DeviceName, @@ -85,6 +87,15 @@ var RegisteredDevices = []string{ vnpu.DeviceName, } +func RegisterDevice(deviceName string) { + for _, name := range RegisteredDevices { + if name == deviceName { + return + } + } + RegisteredDevices = append(RegisteredDevices, deviceName) +} + var IgnoredDevicesList = ignoredDevicesList{} type ignoredDevicesList struct { diff --git a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/init_policy.go b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/init_policy.go index 9ca69f264e..8a6c133adb 100644 --- 
a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/init_policy.go +++ b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/init_policy.go @@ -28,7 +28,7 @@ import ( "k8s.io/klog/v2" "volcano.sh/volcano/pkg/scheduler/api" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/pkg/scheduler/conf" "volcano.sh/volcano/pkg/scheduler/framework" "volcano.sh/volcano/third_party/ascend-for-volcano/common/k8s" diff --git a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/node.go b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/node.go index 07b8f9376b..5322d91b94 100644 --- a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/node.go +++ b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/node.go @@ -19,7 +19,7 @@ package vnpu310p import ( "sort" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/third_party/ascend-for-volcano/common/util" ) diff --git a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/score_policy.go b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/score_policy.go index a4b048449b..f9bdc3f357 100644 --- a/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/score_policy.go +++ b/pkg/scheduler/plugins/deviceshare/devices/ascend/310p/vnpu/score_policy.go @@ -23,7 +23,7 @@ import ( "k8s.io/klog/v2" "volcano.sh/volcano/pkg/scheduler/api" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/third_party/ascend-for-volcano/common/util" ) diff --git a/pkg/scheduler/plugins/deviceshare/deviceshare.go b/pkg/scheduler/plugins/deviceshare/deviceshare.go index fc59e36d41..7325ebe3fd 100644 --- a/pkg/scheduler/plugins/deviceshare/deviceshare.go +++ b/pkg/scheduler/plugins/deviceshare/deviceshare.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "reflect" + "sync" v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" @@ -28,7 +29,8 @@ import ( "volcano.sh/volcano/pkg/scheduler/api" "volcano.sh/volcano/pkg/scheduler/api/devices" - "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/ascend310p/vnpu" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/hami" + "volcano.sh/volcano/pkg/scheduler/api/devices/ascend/mindcluster/ascend310p/vnpu" "volcano.sh/volcano/pkg/scheduler/api/devices/config" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" @@ -46,7 +48,8 @@ const ( VGPUEnable = "deviceshare.VGPUEnable" - ASCEND310PvGPU = "deviceshare.ASCEND310PVNPUEnable" + AscendMindClusterVNPU = "deviceshare.AscendMindClusterVNPUEnable" + AscendHAMiVNPUEnable = "deviceshare.AscendHAMiVNPUEnable" SchedulePolicyArgument = "deviceshare.SchedulePolicy" ScheduleWeight = "deviceshare.ScheduleWeight" @@ -55,6 +58,10 @@ const ( KnownGeometriesCMNamespace = "deviceshare.KnownGeometriesCMNamespace" ) +var ( + once sync.Once +) + type deviceSharePlugin struct { // Arguments given for the plugin pluginArguments framework.Arguments @@ -81,7 +88,8 @@ func enablePredicate(dsp *deviceSharePlugin) { args.GetBool(&gpushare.GpuNumberEnable, GPUNumberPredicate) args.GetBool(&nodeLockEnable, NodeLockEnable) args.GetBool(&vgpu.VGPUEnable, VGPUEnable) - args.GetBool(&vnpu.Ascend310pvNPUEnable, ASCEND310PvGPU) + 
args.GetBool(&vnpu.AscendMindClusterVNPUEnable, AscendMindClusterVNPU) + args.GetBool(&hami.AscendHAMiVNPUEnable, AscendHAMiVNPUEnable) gpushare.NodeLockEnable = nodeLockEnable vgpu.NodeLockEnable = nodeLockEnable @@ -96,14 +104,23 @@ func enablePredicate(dsp *deviceSharePlugin) { klog.Fatal("gpu-share and vgpu can't be used together") } - if !vgpu.VGPUEnable { - return - } knownGeometriesCMName := "volcano-vgpu-device-config" args.GetString(&knownGeometriesCMName, KnownGeometriesCMName) knownGeometriesCMNamespace := "kube-system" args.GetString(&knownGeometriesCMNamespace, KnownGeometriesCMNamespace) config.InitDevicesConfig(knownGeometriesCMName, knownGeometriesCMNamespace) + registerDevices() +} + +func registerDevices() { + once.Do(func() { + if hami.AscendHAMiVNPUEnable { + for _, vnpu := range config.GetConfig().VNPUs { + klog.V(3).Infof("register device %s", vnpu.CommonWord) + api.RegisterDevice(vnpu.CommonWord) + } + } + }) } func createStatus(code int, reason string) *api.Status { @@ -118,16 +135,12 @@ func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedu s := float64(0) for deviceType, device := range node.Others { if device.(api.Devices).HasDeviceRequest(pod) { - var ns float64 // Only process device types that use NodeOrderFn (vgpu and gpushare) // vnpu devices use BatchNodeOrderFn, skip them here - if deviceType == vgpu.DeviceName || deviceType == gpushare.DeviceName { - ns = device.(api.Devices).ScoreNode(pod, schedulePolicy) - } else { - // Other device types (like vnpu) use BatchNodeOrderFn, skip scoring here - continue + if deviceType != vnpu.DeviceName { + ns := device.(api.Devices).ScoreNode(pod, schedulePolicy) + s += ns } - s += ns } } klog.V(4).Infof("deviceScore for task %s/%s is: %v", pod.Namespace, pod.Name, s) @@ -172,11 +185,12 @@ func initializeDevicesWithSession(ssn *framework.Session) { func initializeDevice(device api.Devices, ssn *framework.Session, nodeInfo *api.NodeInfo) error { switch d := device.(type) { case *vnpu.NPUDevices: - klog.V(3).Infof("initialize ascend310p device.") - return vnpu310p.InitVNPUDevice(d, ssn, nodeInfo) - default: - return nil + if vnpu.AscendMindClusterVNPUEnable { + klog.V(3).Infof("initialize ascend310p device.") + return vnpu310p.InitVNPUDevice(d, ssn, nodeInfo) + } } + return nil } func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { @@ -191,7 +205,7 @@ func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { if dev, ok := node.Others[val].(api.Devices); ok { if reflect.ValueOf(dev).IsNil() { // TODO When a pod requests a device of the current type, but the current node does not have such a device, an error is thrown - if dev == nil || dev.HasDeviceRequest(task.Pod) { + if dev == nil { predicateStatus = append(predicateStatus, &api.Status{ Code: devices.Unschedulable, Reason: "node not initialized with device" + val, @@ -199,11 +213,16 @@ func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { }) return api.NewFitErrWithStatus(task, node, predicateStatus...) } + klog.V(4).Infof("device %s is null, skipping it", val) + continue + } + if !dev.HasDeviceRequest(task.Pod) { klog.V(4).Infof("pod %s/%s did not request device %s on %s, skipping it", task.Pod.Namespace, task.Pod.Name, val, node.Name) continue } code, msg, err := dev.FilterNode(task.Pod, dp.schedulePolicy) if err != nil { + klog.V(4).Infof("pod %s/%s fit failed. 
device %s node %s err %v", task.Pod.Namespace, task.Pod.Name, val, node.Name, err) predicateStatus = append(predicateStatus, createStatus(code, msg)) return api.NewFitErrWithStatus(task, node, predicateStatus...) }