From 274534ede93750cd8cfbc46d385faa1ab997f515 Mon Sep 17 00:00:00 2001
From: luomingmeng
Date: Fri, 20 Jun 2025 21:53:42 +0800
Subject: [PATCH 01/52] feat(gpu): add static policy implementation for GPU
 resource management

This commit introduces a new static policy implementation for GPU resource
management, including:
- GPU topology provider and state management
- Static policy implementation with allocation and deallocation logic
- Integration with existing QRM framework
- Metrics and health checks for GPU resource management
---
 .../app/agent/qrm/gpu_plugin.go               |  67 ++
 cmd/katalyst-agent/app/enableagents.go        |   2 +
 .../app/options/qrm/gpu_plugin.go             |  63 ++
 .../app/options/qrm/qrm_base.go               |   6 +
 go.mod                                        |   1 +
 go.sum                                        |   2 +
 pkg/agent/qrm-plugins/gpu/consts/consts.go    |  38 ++
 pkg/agent/qrm-plugins/gpu/gpu.go              |  27 +
 pkg/agent/qrm-plugins/gpu/state/checkpoint.go |  62 ++
 pkg/agent/qrm-plugins/gpu/state/state.go      | 248 +++++++
 .../qrm-plugins/gpu/state/state_checkpoint.go | 259 ++++++++
 pkg/agent/qrm-plugins/gpu/state/state_mem.go  | 138 ++++
 pkg/agent/qrm-plugins/gpu/state/util.go       |  88 +++
 .../qrm-plugins/gpu/staticpolicy/device.go    | 108 +++
 .../qrm-plugins/gpu/staticpolicy/policy.go    | 627 ++++++++++++++++++
 .../state/{state_net.go => state_mem.go}      |   0
 pkg/agent/qrm-plugins/util/util.go            |   2 +-
 pkg/config/agent/qrm/gpu_plugin.go            |  34 +
 pkg/config/agent/qrm/qrm_base.go              |   2 +
 pkg/util/machine/gpu.go                       |  73 ++
 pkg/util/native/kubelet.go                    |  25 +
 21 files changed, 1871 insertions(+), 1 deletion(-)
 create mode 100644 cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go
 create mode 100644 cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/consts/consts.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/gpu.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/checkpoint.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/state.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/state_mem.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/util.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/staticpolicy/device.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
 rename pkg/agent/qrm-plugins/network/state/{state_net.go => state_mem.go} (100%)
 create mode 100644 pkg/config/agent/qrm/gpu_plugin.go
 create mode 100644 pkg/util/machine/gpu.go

diff --git a/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go
new file mode 100644
index 0000000000..e98d9a6c7e
--- /dev/null
+++ b/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go
@@ -0,0 +1,67 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package qrm

import (
    "fmt"
    "strings"
    "sync"

    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    phconsts "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler/consts"
    "github.com/kubewharf/katalyst-core/pkg/config"
)

const (
    QRMPluginNameGPU = "qrm_gpu_plugin"
)

var QRMGPUPluginPeriodicalHandlerGroupName = strings.Join([]string{
    QRMPluginNameGPU,
    phconsts.PeriodicalHandlersGroupNameSuffix,
}, phconsts.GroupNameSeparator)

// gpuPolicyInitializers stores the initialization functions for gpu resource plugin policies
var gpuPolicyInitializers sync.Map

// RegisterGPUPolicyInitializer is used to register user-defined resource plugin init functions
func RegisterGPUPolicyInitializer(name string, initFunc agent.InitFunc) {
    gpuPolicyInitializers.Store(name, initFunc)
}

// getGPUPolicyInitializers returns the registered gpu policy initializers
func getGPUPolicyInitializers() map[string]agent.InitFunc {
    agents := make(map[string]agent.InitFunc)
    gpuPolicyInitializers.Range(func(key, value interface{}) bool {
        agents[key.(string)] = value.(agent.InitFunc)
        return true
    })
    return agents
}

// InitQRMGPUPlugins initializes the gpu QRM plugins
func InitQRMGPUPlugins(agentCtx *agent.GenericContext, conf *config.Configuration, extraConf interface{}, agentName string) (bool, agent.Component, error) {
    initializers := getGPUPolicyInitializers()
    policyName := conf.GPUQRMPluginConfig.PolicyName

    initFunc, ok := initializers[policyName]
    if !ok {
        return false, agent.ComponentStub{}, fmt.Errorf("invalid policy name %v for gpu resource plugin", policyName)
    }

    return initFunc(agentCtx, conf, extraConf, agentName)
}
diff --git a/cmd/katalyst-agent/app/enableagents.go b/cmd/katalyst-agent/app/enableagents.go
index 615ef00c93..713ea6925a 100644
--- a/cmd/katalyst-agent/app/enableagents.go
+++ b/cmd/katalyst-agent/app/enableagents.go
@@ -24,6 +24,7 @@ import (
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
 	_ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu"
+	_ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu"
 	_ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/io"
 	_ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory"
 	_ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/network"
@@ -57,6 +58,7 @@ func init() {
 	agentInitializers.Store(qrm.QRMPluginNameMemory, AgentStarter{Init: qrm.InitQRMMemoryPlugins})
 	agentInitializers.Store(qrm.QRMPluginNameNetwork, AgentStarter{Init: qrm.InitQRMNetworkPlugins})
 	agentInitializers.Store(qrm.QRMPluginNameIO, AgentStarter{Init: qrm.InitQRMIOPlugins})
+	agentInitializers.Store(qrm.QRMPluginNameGPU, AgentStarter{Init: qrm.InitQRMGPUPlugins})
 }
 
 // RegisterAgentInitializer is used to register user-defined agents
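
RegisterGPUPolicyInitializer above is the extension point for out-of-tree policies. A minimal sketch of how a downstream module might use it; the policy name "my-gpu-policy" and the no-op init function are hypothetical, only RegisterGPUPolicyInitializer and the agent.InitFunc signature come from this patch:

package policyext

import (
    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/config"
)

func init() {
    // hypothetical policy name, selected at runtime via
    // --gpu-resource-plugin-policy=my-gpu-policy
    qrm.RegisterGPUPolicyInitializer("my-gpu-policy",
        func(agentCtx *agent.GenericContext, conf *config.Configuration,
            extraConf interface{}, agentName string,
        ) (bool, agent.Component, error) {
            // build and return the custom policy component here
            return true, agent.ComponentStub{}, nil
        })
}

This mirrors the registration pattern the in-tree static policy uses in pkg/agent/qrm-plugins/gpu/gpu.go further down in this patch.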
diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
new file mode 100644
index 0000000000..5ac43aa1cb
--- /dev/null
+++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
@@ -0,0 +1,63 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package qrm

import (
    "k8s.io/apimachinery/pkg/api/resource"
    cliflag "k8s.io/component-base/cli/flag"

    qrmconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
)

type GPUOptions struct {
    PolicyName                 string
    GPUResourceNames           []string
    GPUMemoryAllocatablePerGPU string
    SkipGPUStateCorruption     bool
}

func NewGPUOptions() *GPUOptions {
    return &GPUOptions{
        PolicyName:                 "static",
        GPUResourceNames:           []string{"nvidia.com/gpu"},
        GPUMemoryAllocatablePerGPU: "100",
    }
}

func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) {
    fs := fss.FlagSet("gpu_resource_plugin")

    fs.StringVar(&o.PolicyName, "gpu-resource-plugin-policy",
        o.PolicyName, "The policy the gpu resource plugin should use")
    fs.StringSliceVar(&o.GPUResourceNames, "gpu-resource-names", o.GPUResourceNames,
        "The names of the GPU resources managed by this plugin")
    fs.StringVar(&o.GPUMemoryAllocatablePerGPU, "gpu-memory-allocatable-per-gpu",
        o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, parsed as a Kubernetes resource quantity, e.g. 100 or 16Gi")
    fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption",
        o.SkipGPUStateCorruption, "If set, a corrupted gpu state checkpoint is skipped and rebuilt instead of failing startup; useful after state properties have been updated")
}

func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error {
    conf.PolicyName = o.PolicyName
    conf.GPUResourceNames = o.GPUResourceNames
    gpuMemory, err := resource.ParseQuantity(o.GPUMemoryAllocatablePerGPU)
    if err != nil {
        return err
    }
    conf.GPUMemoryAllocatablePerGPU = gpuMemory
    conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption
    return nil
}
diff --git a/cmd/katalyst-agent/app/options/qrm/qrm_base.go b/cmd/katalyst-agent/app/options/qrm/qrm_base.go
index c02a6ddd9d..139c26ae77 100644
--- a/cmd/katalyst-agent/app/options/qrm/qrm_base.go
+++ b/cmd/katalyst-agent/app/options/qrm/qrm_base.go
@@ -88,6 +88,7 @@ type QRMPluginsOptions struct {
 	MemoryOptions  *MemoryOptions
 	NetworkOptions *NetworkOptions
 	IOOptions      *IOOptions
+	GPUOptions     *GPUOptions
 }
 
 func NewQRMPluginsOptions() *QRMPluginsOptions {
@@ -96,6 +97,7 @@ func NewQRMPluginsOptions() *QRMPluginsOptions {
 		MemoryOptions:  NewMemoryOptions(),
 		NetworkOptions: NewNetworkOptions(),
 		IOOptions:      NewIOOptions(),
+		GPUOptions:     NewGPUOptions(),
 	}
 }
 
@@ -104,6 +106,7 @@ func (o *QRMPluginsOptions) AddFlags(fss *cliflag.NamedFlagSets) {
 	o.MemoryOptions.AddFlags(fss)
 	o.NetworkOptions.AddFlags(fss)
 	o.IOOptions.AddFlags(fss)
+	o.GPUOptions.AddFlags(fss)
 }
 
 func (o *QRMPluginsOptions) ApplyTo(conf *qrmconfig.QRMPluginsConfiguration) error {
@@ -119,5 +122,8 @@ func (o *QRMPluginsOptions) ApplyTo(conf *qrmconfig.QRMPluginsConfiguration) err
 	if err := o.IOOptions.ApplyTo(conf.IOQRMPluginConfig); err != nil {
 		return err
 	}
+	if err := o.GPUOptions.ApplyTo(conf.GPUQRMPluginConfig); err != nil {
+		return err
+	}
 	return nil
 }
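
Because the flag value goes through resource.ParseQuantity, any Kubernetes quantity syntax is accepted. A standalone sketch of the options round-trip, not part of the patch; the "16Gi" value is an arbitrary example, and qrmopts aliases the options package added above:

package main

import (
    "fmt"
    "log"

    qrmopts "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options/qrm"
    qrmconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
)

func main() {
    opts := qrmopts.NewGPUOptions()
    opts.GPUMemoryAllocatablePerGPU = "16Gi" // arbitrary example value

    conf := &qrmconfig.GPUQRMPluginConfig{}
    if err := opts.ApplyTo(conf); err != nil {
        log.Fatal(err) // invalid quantity strings surface here
    }
    fmt.Println(conf.GPUMemoryAllocatablePerGPU.Value()) // 17179869184
}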
	k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6
 	k8s.io/apimachinery => k8s.io/apimachinery v0.24.6
diff --git a/go.sum b/go.sum
index 816e098731..1dd9b13203 100644
--- a/go.sum
+++ b/go.sum
@@ -587,6 +587,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U
 github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc=
 github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4=
 github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA=
+github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5 h1:q7cwZoXMJw0Zr+b5WuN0yF4ceoSJp4XVd8fOSScQEZg=
+github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k=
 github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ=
 github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
 github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
diff --git a/pkg/agent/qrm-plugins/gpu/consts/consts.go b/pkg/agent/qrm-plugins/gpu/consts/consts.go
new file mode 100644
index 0000000000..369b603517
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/consts/consts.go
@@ -0,0 +1,38 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package consts

import (
    "time"

    "github.com/kubewharf/katalyst-api/pkg/consts"

    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
)

const (
    // GPUResourcePluginPolicyNameStatic is the policy name of the static gpu resource plugin
    GPUResourcePluginPolicyNameStatic = string(consts.ResourcePluginPolicyNameStatic)

    GPUPluginStaticPolicyName = qrm.QRMPluginNameGPU + "_" + GPUResourcePluginPolicyNameStatic
    ClearResidualState        = GPUPluginStaticPolicyName + "_clear_residual_state"

    StateCheckPeriod          = 30 * time.Second
    StateCheckTolerationTimes = 3
    MaxResidualTime           = 5 * time.Minute
)
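
These constants drive the residual-state cleanup that the static policy registers at start-up: a pod is purged from local state only after it has been observed as residual for MaxResidualTime across successive checks. A toy calculation of the implied hit count, a standalone sketch rather than part of the patch:

package main

import (
    "fmt"
    "time"
)

func main() {
    const stateCheckPeriod = 30 * time.Second // gpuconsts.StateCheckPeriod
    const maxResidualTime = 5 * time.Minute   // gpuconsts.MaxResidualTime

    // clearResidualState (later in this patch) deletes a pod once
    // hitCount*period >= maxResidualTime, i.e. after ten consecutive
    // 30-second sightings of a pod that the pod watcher no longer reports.
    hits := int64(0)
    for time.Duration(hits)*stateCheckPeriod < maxResidualTime {
        hits++
    }
    fmt.Println(hits) // 10
}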
*/

package gpu

import (
    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
    gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/staticpolicy"
)

func init() {
    qrm.RegisterGPUPolicyInitializer(gpuconsts.GPUResourcePluginPolicyNameStatic, staticpolicy.NewStaticPolicy)
}
diff --git a/pkg/agent/qrm-plugins/gpu/state/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go
new file mode 100644
index 0000000000..0995851134
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go
@@ -0,0 +1,62 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package state

import (
    "encoding/json"

    "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)

var _ checkpointmanager.Checkpoint = &GPUPluginCheckpoint{}

type GPUPluginCheckpoint struct {
    PolicyName   string            `json:"policyName"`
    MachineState GPUMap            `json:"machineState"`
    PodEntries   PodEntries        `json:"pod_entries"`
    Checksum     checksum.Checksum `json:"checksum"`
}

func NewGPUPluginCheckpoint() *GPUPluginCheckpoint {
    return &GPUPluginCheckpoint{
        PodEntries:   make(PodEntries),
        MachineState: make(GPUMap),
    }
}

// MarshalCheckpoint returns marshaled checkpoint
func (cp *GPUPluginCheckpoint) MarshalCheckpoint() ([]byte, error) {
    // make sure checksum wasn't set before, so it doesn't affect output checksum
    cp.Checksum = 0
    cp.Checksum = checksum.New(cp)
    return json.Marshal(*cp)
}

// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint
func (cp *GPUPluginCheckpoint) UnmarshalCheckpoint(blob []byte) error {
    return json.Unmarshal(blob, cp)
}

// VerifyChecksum verifies that current checksum of checkpoint is valid
func (cp *GPUPluginCheckpoint) VerifyChecksum() error {
    ck := cp.Checksum
    cp.Checksum = 0
    err := ck.Verify(cp)
    cp.Checksum = ck
    return err
}
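
A quick round-trip of the checkpoint type above, showing how the checksum is zeroed and embedded on marshal and validated on restore; this is a standalone sketch, not part of the patch:

package main

import (
    "fmt"

    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
)

func main() {
    cp := state.NewGPUPluginCheckpoint()
    cp.PolicyName = "static"

    blob, err := cp.MarshalCheckpoint() // computes and embeds the checksum
    if err != nil {
        panic(err)
    }

    restored := state.NewGPUPluginCheckpoint()
    if err := restored.UnmarshalCheckpoint(blob); err != nil {
        panic(err)
    }
    // VerifyChecksum recomputes the checksum over the payload and compares;
    // a mismatch is what restoreState treats as checkpoint corruption.
    fmt.Println(restored.VerifyChecksum() == nil) // true
}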
*/

package state

import (
    "encoding/json"

    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
    "github.com/kubewharf/katalyst-core/pkg/util/general"
)

type AllocationInfo struct {
    commonstate.AllocationMeta `json:",inline"`

    AllocatedAllocation      GPUAllocation            `json:"allocated_allocation"`
    TopologyAwareAllocations map[string]GPUAllocation `json:"topology_aware_allocations"`
}

type GPUAllocation struct {
    GPUMemoryQuantity uint64 `json:"gpu_memory_quantity"`
}

func (a *GPUAllocation) Clone() GPUAllocation {
    return GPUAllocation{
        GPUMemoryQuantity: a.GPUMemoryQuantity,
    }
}

type (
    ContainerEntries map[string]*AllocationInfo  // keyed by container name
    PodEntries       map[string]ContainerEntries // keyed by pod UID
)

type GPUState struct {
    GPUMemoryAllocatable uint64     `json:"gpu_memory_allocatable"`
    GPUMemoryAllocated   uint64     `json:"gpu_memory_allocated"`
    PodEntries           PodEntries `json:"pod_entries"`
}

type GPUMap map[string]*GPUState // keyed by gpu device name, e.g. GPU-fef8089b-4820-abfc-e83e-94318197576e

func (i *AllocationInfo) String() string {
    if i == nil {
        return ""
    }

    contentBytes, err := json.Marshal(i)
    if err != nil {
        general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err)
        return ""
    }
    return string(contentBytes)
}

func (i *AllocationInfo) Clone() *AllocationInfo {
    if i == nil {
        return nil
    }

    clone := &AllocationInfo{
        AllocationMeta:      *i.AllocationMeta.Clone(),
        AllocatedAllocation: i.AllocatedAllocation.Clone(),
    }

    if i.TopologyAwareAllocations != nil {
        clone.TopologyAwareAllocations = make(map[string]GPUAllocation)
        for k, v := range i.TopologyAwareAllocations {
            clone.TopologyAwareAllocations[k] = v.Clone()
        }
    }

    return clone
}

func (e PodEntries) String() string {
    if e == nil {
        return ""
    }

    contentBytes, err := json.Marshal(e)
    if err != nil {
        general.LoggerWithPrefix("PodEntries.String", general.LoggingPKGFull).Errorf("marshal PodEntries failed with error: %v", err)
        return ""
    }
    return string(contentBytes)
}

func (e PodEntries) Clone() PodEntries {
    clone := make(PodEntries)
    for podUID, containerEntries := range e {
        clone[podUID] = make(ContainerEntries)
        for containerName, allocationInfo := range containerEntries {
            clone[podUID][containerName] = allocationInfo.Clone()
        }
    }
    return clone
}

func (e PodEntries) GetAllocationInfo(uid string, name string) *AllocationInfo {
    if e == nil {
        return nil
    }

    if containerEntries, ok := e[uid]; ok {
        if allocationInfo, ok := containerEntries[name]; ok {
            return allocationInfo.Clone()
        }
    }
    return nil
}

func (e PodEntries) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
    if e == nil {
        return
    }

    if _, ok := e[podUID]; !ok {
        e[podUID] = make(ContainerEntries)
    }

    e[podUID][containerName] = allocationInfo.Clone()
}

func (ns *GPUState) String() string {
    if ns == nil {
        return ""
    }

    contentBytes, err := json.Marshal(ns)
    if err != nil {
        general.LoggerWithPrefix("GPUState.String", general.LoggingPKGFull).Errorf("marshal GPUState failed with error: %v", err)
        return ""
    }
    return string(contentBytes)
}

func (ns *GPUState) Clone() *GPUState {
    if ns == nil {
        return nil
    }

    return &GPUState{
        GPUMemoryAllocatable: ns.GPUMemoryAllocatable,
        GPUMemoryAllocated:   ns.GPUMemoryAllocated,
        PodEntries:           ns.PodEntries.Clone(),
    }
}

func (ns *GPUState)
GetGPUMemoryAllocatable() uint64 {
    if ns == nil {
        return 0
    }

    return ns.GPUMemoryAllocatable
}

// SetAllocationInfo adds a new AllocationInfo (for pod/container pairs) into the given GPUState
func (ns *GPUState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
    if ns == nil {
        return
    }

    if allocationInfo == nil {
        general.LoggerWithPrefix("GPUState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil")
        return
    }

    if ns.PodEntries == nil {
        ns.PodEntries = make(PodEntries)
    }

    if _, ok := ns.PodEntries[podUID]; !ok {
        ns.PodEntries[podUID] = make(ContainerEntries)
    }

    ns.PodEntries[podUID][containerName] = allocationInfo.Clone()
}

func (m GPUMap) Clone() GPUMap {
    clone := make(GPUMap)
    for id, ns := range m {
        clone[id] = ns.Clone()
    }
    return clone
}

func (m GPUMap) String() string {
    if m == nil {
        return ""
    }

    contentBytes, err := json.Marshal(m)
    if err != nil {
        general.LoggerWithPrefix("GPUMap.String", general.LoggingPKGFull).Errorf("marshal GPUMap failed with error: %v", err)
        return ""
    }
    return string(contentBytes)
}

func (m GPUMap) GetGPUMemoryAllocatable(id string) uint64 {
    if m == nil {
        return 0
    }
    return m[id].GetGPUMemoryAllocatable()
}

// reader is used to get information from local states
type reader interface {
    GetMachineState() GPUMap
    GetPodEntries() PodEntries
    GetAllocationInfo(podUID, containerName string) *AllocationInfo
}

// writer is used to store information into local states,
// and it also provides functionality to maintain the local files
type writer interface {
    SetMachineState(gpuMap GPUMap, persist bool)
    SetPodEntries(podEntries PodEntries, persist bool)
    SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, persist bool)

    Delete(podUID, containerName string, persist bool)
    ClearState()
    StoreState() error
}

// ReadonlyState interface only provides methods for tracking pod assignments
type ReadonlyState interface {
    reader
}

// State interface provides methods for tracking and setting pod assignments
type State interface {
    writer
    ReadonlyState
}
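
The accessors above consistently deep-copy, so callers can mutate returned values without corrupting the stored state. A standalone sketch illustrating that property; the pod UID and container name are made-up values, and it assumes a zero-value commonstate.AllocationMeta is safe to clone:

package main

import (
    "fmt"

    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
)

func main() {
    entries := make(state.PodEntries)

    // "pod-uid-1" and "main" are made-up identifiers
    entries.SetAllocationInfo("pod-uid-1", "main", &state.AllocationInfo{
        AllocatedAllocation: state.GPUAllocation{GPUMemoryQuantity: 8},
    })

    // GetAllocationInfo returns a deep copy, so mutating the result
    // leaves the stored entry untouched.
    info := entries.GetAllocationInfo("pod-uid-1", "main")
    info.AllocatedAllocation.GPUMemoryQuantity = 0

    fmt.Println(entries.GetAllocationInfo("pod-uid-1", "main").AllocatedAllocation.GPUMemoryQuantity) // 8
}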
*/

package state

import (
    "errors"
    "fmt"
    "path"
    "reflect"
    "sync"
    "time"

    "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"

    "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/metrics"
    "github.com/kubewharf/katalyst-core/pkg/util/general"
    "github.com/kubewharf/katalyst-core/pkg/util/machine"
)

const (
    metricMetaCacheStoreStateDuration = "metacache_store_state_duration"
)

var (
    _          State = &stateCheckpoint{}
    generalLog       = general.LoggerWithPrefix("gpu_plugin", general.LoggingPKGFull)
)

// stateCheckpoint is a checkpoint-backed implementation of State operating in
// write-back mode: every read or write goes to the in-memory State first, and
// writes are then persisted to the checkpoint file on disk
type stateCheckpoint struct {
    sync.RWMutex
    cache             State
    policyName        string
    checkpointManager checkpointmanager.CheckpointManager
    checkpointName    string
    // when new properties are added to the checkpoint, the old file no longer
    // passes checksum verification and looks corrupted; set this to skip over
    // such corruption instead of failing
    skipStateCorruption bool
    emitter             metrics.MetricEmitter
}

func (s *stateCheckpoint) SetMachineState(gpuMap GPUMap, persist bool) {
    s.Lock()
    defer s.Unlock()

    s.cache.SetMachineState(gpuMap, persist)
    if persist {
        err := s.storeState()
        if err != nil {
            generalLog.ErrorS(err, "store machineState to checkpoint error")
        }
    }
}

func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) {
    s.Lock()
    defer s.Unlock()

    s.cache.SetPodEntries(podEntries, persist)
    if persist {
        err := s.storeState()
        if err != nil {
            generalLog.ErrorS(err, "store pod entries to checkpoint error")
        }
    }
}

func (s *stateCheckpoint) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, persist bool) {
    s.Lock()
    defer s.Unlock()

    s.cache.SetAllocationInfo(podUID, containerName, allocationInfo, persist)
    if persist {
        err := s.storeState()
        if err != nil {
            generalLog.ErrorS(err, "store allocationInfo to checkpoint error")
        }
    }
}

func (s *stateCheckpoint) Delete(podUID, containerName string, persist bool) {
    s.Lock()
    defer s.Unlock()

    s.cache.Delete(podUID, containerName, persist)
    if persist {
        err := s.storeState()
        if err != nil {
            generalLog.ErrorS(err, "store state after delete operation to checkpoint error")
        }
    }
}

func (s *stateCheckpoint) ClearState() {
    s.Lock()
    defer s.Unlock()

    s.cache.ClearState()
    err := s.storeState()
    if err != nil {
        generalLog.ErrorS(err, "store state after clear operation to checkpoint error")
    }
}

func (s *stateCheckpoint) StoreState() error {
    s.Lock()
    defer s.Unlock()
    return s.storeState()
}

func (s *stateCheckpoint) GetMachineState() GPUMap {
    s.RLock()
    defer s.RUnlock()

    return s.cache.GetMachineState()
}

func (s *stateCheckpoint) GetPodEntries() PodEntries {
    s.RLock()
    defer s.RUnlock()

    return s.cache.GetPodEntries()
}

func (s *stateCheckpoint) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
    s.RLock()
    defer s.RUnlock()

    return s.cache.GetAllocationInfo(podUID, containerName)
}
func (s *stateCheckpoint) storeState() error {
    startTime := time.Now()
    general.InfoS("called")
    defer func() {
        elapsed := time.Since(startTime)
        general.InfoS("finished", "duration", elapsed)
        _ = s.emitter.StoreFloat64(metricMetaCacheStoreStateDuration, float64(elapsed/time.Millisecond), metrics.MetricTypeNameRaw)
    }()
    checkpoint := NewGPUPluginCheckpoint()
    checkpoint.PolicyName = s.policyName
    checkpoint.MachineState = s.cache.GetMachineState()
    checkpoint.PodEntries = s.cache.GetPodEntries()

    err := s.checkpointManager.CreateCheckpoint(s.checkpointName, checkpoint)
    if err != nil {
        generalLog.ErrorS(err, "could not save checkpoint")
        return err
    }
    return nil
}

func (s *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider machine.GPUTopologyProvider) error {
    s.Lock()
    defer s.Unlock()
    var err error
    var foundAndSkippedStateCorruption bool

    checkpoint := NewGPUPluginCheckpoint()
    if err = s.checkpointManager.GetCheckpoint(s.checkpointName, checkpoint); err != nil {
        if errors.Is(err, cmerrors.ErrCheckpointNotFound) {
            return s.storeState()
        } else if errors.Is(err, cmerrors.ErrCorruptCheckpoint) {
            if !s.skipStateCorruption {
                return err
            }

            foundAndSkippedStateCorruption = true
            generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err)
        } else {
            return err
        }
    }

    if s.policyName != checkpoint.PolicyName && !s.skipStateCorruption {
        return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName)
    }

    machineState, err := GenerateMachineStateFromPodEntries(conf, checkpoint.PodEntries, gpuTopologyProvider)
    if err != nil {
        return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
    }

    s.cache.SetMachineState(machineState, false)
    s.cache.SetPodEntries(checkpoint.PodEntries, false)

    if !reflect.DeepEqual(machineState, checkpoint.MachineState) {
        generalLog.Warningf("machine state changed: "+
            "machineState: %s; checkpointMachineState: %s",
            machineState.String(), checkpoint.MachineState.String())

        err = s.storeState()
        if err != nil {
            return fmt.Errorf("storeState when machine state changed failed with error: %v", err)
        }
    }

    if foundAndSkippedStateCorruption {
        generalLog.Infof("found and skipped state corruption, we should store the state again to rectify the checksum")

        err = s.storeState()
        if err != nil {
            return fmt.Errorf("storeState failed with error: %v", err)
        }
    }

    generalLog.InfoS("state checkpoint: restored state from checkpoint")

    return nil
}

func NewCheckpointState(conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string,
    gpuTopologyProvider machine.GPUTopologyProvider, skipStateCorruption bool, emitter metrics.MetricEmitter,
) (State, error) {
    checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
    if err != nil {
        return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
    }

    defaultCache, err := NewGPUPluginState(conf, gpuTopologyProvider)
    if err != nil {
        return nil, fmt.Errorf("NewGPUPluginState failed with error: %v", err)
    }

    sc := &stateCheckpoint{
        cache:               defaultCache,
        policyName:          policyName,
        checkpointManager:   checkpointManager,
        checkpointName:      checkpointName,
        skipStateCorruption: skipStateCorruption,
        emitter:             emitter,
    }

    if err := sc.restoreState(conf, gpuTopologyProvider); err != nil {
        return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+
            "the gpu plugin checkpoint file %q before restarting the katalyst agent",
            err, path.Join(stateDir, checkpointName))
    }

    return sc, nil
}
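
Note the write-back contract: every setter takes a persist flag, so callers can batch several in-memory mutations and pay for a single checkpoint write. A sketch of that pattern, mirroring how removePod and clearResidualState later in this patch use it; this helper is illustrative, not part of the patch:

package statedemo

import (
    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
)

// applyUpdate batches two in-memory mutations and persists them with a
// single checkpoint write at the end.
func applyUpdate(s state.State, podEntries state.PodEntries, machineState state.GPUMap) error {
    s.SetPodEntries(podEntries, false)     // update the cache only
    s.SetMachineState(machineState, false) // update the cache only
    return s.StoreState()                  // one write to the checkpoint file
}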
diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/state_mem.go
new file mode 100644
index 0000000000..de42ad8aa2
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go
@@ -0,0 +1,138 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package state

import (
    "fmt"
    "sync"

    "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/util/machine"
)

// gpuPluginState is a purely in-memory implementation of State; it holds the
// machine state and pod entries in memory and performs no persistence itself.
// stateCheckpoint wraps it as the cache layer of its write-back design
type gpuPluginState struct {
    sync.RWMutex

    qrmConf             *qrm.QRMPluginsConfiguration
    gpuTopologyProvider machine.GPUTopologyProvider

    machineState GPUMap
    podEntries   PodEntries
}

func NewGPUPluginState(
    conf *qrm.QRMPluginsConfiguration,
    gpuTopologyProvider machine.GPUTopologyProvider,
) (State, error) {
    generalLog.InfoS("initializing new gpu plugin in-memory state store")

    defaultMachineState, err := GenerateMachineState(conf, gpuTopologyProvider)
    if err != nil {
        return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err)
    }

    return &gpuPluginState{
        qrmConf:             conf,
        machineState:        defaultMachineState,
        gpuTopologyProvider: gpuTopologyProvider,
        podEntries:          make(PodEntries),
    }, nil
}

func (s *gpuPluginState) SetMachineState(gpuMap GPUMap, _ bool) {
    s.Lock()
    defer s.Unlock()
    s.machineState = gpuMap.Clone()
    generalLog.InfoS("updated gpu plugin machine state",
        "GPUMap", gpuMap.String())
}

func (s *gpuPluginState) SetPodEntries(podEntries PodEntries, _ bool) {
    s.Lock()
    defer s.Unlock()
    s.podEntries = podEntries.Clone()
}

func (s *gpuPluginState) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, _ bool) {
    s.Lock()
    defer s.Unlock()

    s.podEntries.SetAllocationInfo(podUID, containerName, allocationInfo)
    generalLog.InfoS("updated gpu plugin pod resource entries",
        "podUID", podUID,
        "containerName", containerName,
        "allocationInfo", allocationInfo.String())
}

func (s *gpuPluginState) Delete(podUID, containerName string, _ bool) {
    s.Lock()
    defer s.Unlock()

    if _, ok := s.podEntries[podUID]; !ok {
        return
    }

    delete(s.podEntries[podUID], containerName)
    if len(s.podEntries[podUID]) == 0 {
        delete(s.podEntries, podUID)
    }
    generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName)
}

func (s *gpuPluginState) ClearState() {
    s.Lock()
    defer s.Unlock()

    machineState, err := GenerateMachineState(s.qrmConf, s.gpuTopologyProvider)
    if err != nil {
        generalLog.ErrorS(err, "failed to generate machine state")
    }
    s.machineState = machineState
    s.podEntries = make(PodEntries)

    generalLog.InfoS("cleared state")
}

func (s *gpuPluginState) StoreState() error {
    // nothing to do for the in-memory implementation
    return nil
}

func (s *gpuPluginState) GetMachineState() GPUMap {
    s.RLock()
    defer s.RUnlock()

    return s.machineState.Clone()
}
func (s *gpuPluginState) GetPodEntries() PodEntries {
    s.RLock()
    defer s.RUnlock()

    return s.podEntries.Clone()
}

func (s *gpuPluginState) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
    s.RLock()
    defer s.RUnlock()

    return s.podEntries.GetAllocationInfo(podUID, containerName)
}
diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go
new file mode 100644
index 0000000000..b84ef2de5c
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/util.go
@@ -0,0 +1,88 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package state

import (
    "fmt"

    "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/util/general"
    "github.com/kubewharf/katalyst-core/pkg/util/machine"
)

func GenerateMachineState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider machine.GPUTopologyProvider) (GPUMap, error) {
    if gpuTopologyProvider == nil {
        return nil, fmt.Errorf("gpu topology provider must not be nil")
    }

    gpuTopology, err := gpuTopologyProvider.GetGPUTopology()
    if err != nil {
        return nil, fmt.Errorf("gpu topology provider failed with error: %v", err)
    }

    gpuMap := make(GPUMap)
    for deviceID := range gpuTopology.GPUs {
        gpuMap[deviceID] = &GPUState{
            GPUMemoryAllocatable: uint64(conf.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()),
            GPUMemoryAllocated:   0,
        }
    }

    return gpuMap, nil
}

// GenerateMachineStateFromPodEntries returns GPUMap for gpu memory based on
// machine info along with existing pod entries
func GenerateMachineStateFromPodEntries(
    conf *qrm.QRMPluginsConfiguration,
    podEntries PodEntries,
    gpuTopologyProvider machine.GPUTopologyProvider,
) (GPUMap, error) {
    machineState, err := GenerateMachineState(conf, gpuTopologyProvider)
    if err != nil {
        return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err)
    }

    for gpuID, gpuState := range machineState {
        var gpuMemoryAllocated uint64
        for podUID, containerEntries := range podEntries {
            for containerName, allocationInfo := range containerEntries {
                if containerName != "" && allocationInfo != nil {
                    gpuAllocation, ok := allocationInfo.TopologyAwareAllocations[gpuID]
                    if !ok {
                        continue
                    }
                    alloc := allocationInfo.Clone()
                    alloc.AllocatedAllocation = gpuAllocation.Clone()
                    alloc.TopologyAwareAllocations = map[string]GPUAllocation{gpuID: gpuAllocation}
                    gpuMemoryAllocated += gpuAllocation.GPUMemoryQuantity
                    gpuState.SetAllocationInfo(podUID, containerName, alloc)
                }
            }
        }
        gpuState.GPUMemoryAllocated = gpuMemoryAllocated

        generalLog := general.LoggerWithPrefix("GenerateMachineStateFromPodEntries", general.LoggingPKGFull)
        if gpuState.GPUMemoryAllocatable < gpuState.GPUMemoryAllocated {
            generalLog.Warningf("invalid allocated GPU memory: %d on GPU: %s"+
                " with allocatable GPU memory size: %d", gpuState.GPUMemoryAllocated, gpuID, gpuState.GPUMemoryAllocatable)
        }
        machineState[gpuID] = gpuState
    }

    return machineState, nil
}
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go
new file mode 100644
index 0000000000..cc5cf501b2
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go
@@ -0,0 +1,108 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package staticpolicy

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"

    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
    "github.com/kubewharf/katalyst-core/pkg/util/general"
    "github.com/kubewharf/katalyst-core/pkg/util/native"
)

func (p *StaticPolicy) updateAllocatedDevice() error {
    podDevicesMap, err := p.getPodDevicesMap()
    if err != nil {
        return fmt.Errorf("error getting pod devices map: %v", err)
    }

    p.Lock()
    defer p.Unlock()

    podEntries := p.state.GetPodEntries()
    for pod, devices := range podDevicesMap {
        if len(devices) == 0 {
            general.ErrorS(nil, "pod has no devices", "podUID", pod.PodUID, "containerName", pod.ContainerName)
            continue
        }

        allocationInfo := podEntries.GetAllocationInfo(pod.PodUID, pod.ContainerName)
        if allocationInfo == nil {
            continue
        }

        gpuMemory := allocationInfo.AllocatedAllocation.GPUMemoryQuantity / uint64(devices.Len())
        topologyAwareAllocations := make(map[string]state.GPUAllocation)
        for _, deviceID := range devices.UnsortedList() {
            topologyAwareAllocations[deviceID] = state.GPUAllocation{
                GPUMemoryQuantity: gpuMemory,
            }
        }
        allocationInfo.TopologyAwareAllocations = topologyAwareAllocations

        podEntries.SetAllocationInfo(pod.PodUID, pod.ContainerName, allocationInfo)
    }

    machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider)
    if err != nil {
        return err
    }

    p.state.SetPodEntries(podEntries, false)
    p.state.SetMachineState(machineState, true)

    return nil
}

type podUIDContainerName struct {
    PodUID        string
    ContainerName string
}

func (p *StaticPolicy) getPodDevicesMap() (map[podUIDContainerName]sets.String, error) {
    kubeletCheckpoint, err := native.GetKubeletCheckpoint()
    if err != nil {
        return nil, fmt.Errorf("could not get kubelet checkpoint: %v", err)
    }

    podDevices, _ := kubeletCheckpoint.GetDataInLatestFormat()
    podDevicesMap := make(map[podUIDContainerName]sets.String)
    for _, podDevice := range podDevices {
        if !p.resourceNames.Has(podDevice.ResourceName) {
            continue
        }

        key := podUIDContainerName{
            PodUID:        podDevice.PodUID,
            ContainerName: podDevice.ContainerName,
        }

        _, ok := podDevicesMap[key]
        if !ok {
            podDevicesMap[key] = sets.NewString()
        }
        for _, deviceList := range podDevice.DeviceIDs {
            for _, deviceID := range deviceList {
                podDevicesMap[key].Insert(deviceID)
            }
        }
    }

    return podDevicesMap, nil
}
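
updateAllocatedDevice splits a container's aggregate GPU-memory allocation evenly across the devices the kubelet checkpoint reports for it; integer division truncates any remainder. A toy illustration of that split with made-up device IDs, standalone and not part of the patch:

package main

import "fmt"

func main() {
    const allocated uint64 = 96 // aggregate gpu memory for the container
    devices := []string{"GPU-aaa", "GPU-bbb", "GPU-ccc"} // hypothetical IDs

    // same arithmetic as updateAllocatedDevice: allocated / device count,
    // one equal share per assigned device
    perDevice := allocated / uint64(len(devices))
    split := make(map[string]uint64, len(devices))
    for _, id := range devices {
        split[id] = perDevice
    }
    fmt.Println(split) // map[GPU-aaa:32 GPU-bbb:32 GPU-ccc:32]
}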
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
new file mode 100644
index 0000000000..b75455eaf1
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
@@ -0,0 +1,627 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package staticpolicy

import (
    "context"
    "fmt"
    "sync"
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/sets"
    "k8s.io/apimachinery/pkg/util/wait"
    pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"

    "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    "github.com/kubewharf/katalyst-api/pkg/consts"
    "github.com/kubewharf/katalyst-api/pkg/plugins/skeleton"
    "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    appqrm "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
    gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
    "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
    "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler"
    "github.com/kubewharf/katalyst-core/pkg/config"
    dynamicconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
    "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
    "github.com/kubewharf/katalyst-core/pkg/config/generic"
    "github.com/kubewharf/katalyst-core/pkg/metaserver"
    "github.com/kubewharf/katalyst-core/pkg/metrics"
    "github.com/kubewharf/katalyst-core/pkg/util/general"
    "github.com/kubewharf/katalyst-core/pkg/util/machine"
    "github.com/kubewharf/katalyst-core/pkg/util/metric"
)

const (
    GPUPluginStateFileName = "gpu_plugin_state"
)

// StaticPolicy is the static gpu policy
type StaticPolicy struct {
    sync.Mutex

    name    string
    stopCh  chan struct{}
    started bool

    qosConfig           *generic.QoSConfiguration
    qrmConfig           *qrm.QRMPluginsConfiguration
    gpuTopologyProvider machine.GPUTopologyProvider

    emitter    metrics.MetricEmitter
    metaServer *metaserver.MetaServer
    agentCtx   *agent.GenericContext
    state      state.State

    podAnnotationKeptKeys []string
    podLabelKeptKeys      []string
    resourceNames         sets.String

    residualHitMap map[string]int64
}

// NewStaticPolicy returns a static gpu policy
func NewStaticPolicy(agentCtx *agent.GenericContext, conf *config.Configuration,
    _ interface{}, agentName string,
) (bool, agent.Component, error) {
    wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{
        Key: util.QRMPluginPolicyTagName,
        Val: gpuconsts.GPUResourcePluginPolicyNameStatic,
    })

    gpuTopologyProvider := machine.NewKubeletCheckpointGPUTopologyProvider(conf.GPUResourceNames)
    stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName,
        gpuconsts.GPUResourcePluginPolicyNameStatic, gpuTopologyProvider, conf.SkipGPUStateCorruption, wrappedEmitter)
    if err != nil {
        return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err)
    }
    policyImplement := &StaticPolicy{
        emitter:               wrappedEmitter,
        metaServer:            agentCtx.MetaServer,
        agentCtx:              agentCtx,
        stopCh:                make(chan struct{}),
        name:                  fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic),
        qosConfig:             conf.QoSConfiguration,
        qrmConfig:             conf.QRMPluginsConfiguration,
        gpuTopologyProvider:   gpuTopologyProvider,
        state:                 stateImpl,
        podAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
        podLabelKeptKeys:      conf.PodLabelKeptKeys,
        residualHitMap:        make(map[string]int64),
        resourceNames:         sets.NewString(conf.GPUResourceNames...),
    }

    pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs,
        func(key string, value int64) {
            _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw)
        })
    if err != nil {
        return false, agent.ComponentStub{}, fmt.Errorf("static policy new plugin wrapper failed with error: %v", err)
    }

    return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil
}

// Start starts this plugin
func (p *StaticPolicy) Start() (err error) {
    general.Infof("called")

    p.Lock()
    defer func() {
        if !p.started {
            if err == nil {
                p.started = true
            } else {
                close(p.stopCh)
            }
        }
        p.Unlock()
    }()

    if p.started {
        general.Infof("already started")
        return nil
    }

    p.stopCh = make(chan struct{})

    go wait.Until(func() {
        _ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw)
    }, time.Second*30, p.stopCh)

    err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(gpuconsts.ClearResidualState, general.HealthzCheckStateNotReady,
        appqrm.QRMGPUPluginPeriodicalHandlerGroupName, p.clearResidualState, gpuconsts.StateCheckPeriod, gpuconsts.StateCheckTolerationTimes)
    if err != nil {
        general.Errorf("start %v failed, err: %v", gpuconsts.ClearResidualState, err)
    }

    go wait.Until(func() {
        periodicalhandler.ReadyToStartHandlersByGroup(appqrm.QRMGPUPluginPeriodicalHandlerGroupName)
    }, 5*time.Second, p.stopCh)

    return nil
}

// Stop stops this plugin
func (p *StaticPolicy) Stop() error {
    p.Lock()
    defer func() {
        p.started = false
        p.Unlock()
        general.Infof("stopped")
    }()

    if !p.started {
        general.Warningf("already stopped")
        return nil
    }

    close(p.stopCh)

    return nil
}

// Name returns the name of this plugin
func (p *StaticPolicy) Name() string {
    return p.name
}

// ResourceName returns the resource name managed by this plugin
func (p *StaticPolicy) ResourceName() string {
    return string(consts.ResourceGPUMemory)
}

// GetTopologyHints returns hints of corresponding resources
func (p *StaticPolicy) GetTopologyHints(_ context.Context,
    req *pluginapi.ResourceRequest,
) (resp *pluginapi.ResourceHintsResponse, err error) {
    general.InfofV(4, "called")
    if req == nil {
        return nil, fmt.Errorf("GetTopologyHints got nil req")
    }

    return util.PackResourceHintsResponse(req, p.ResourceName(), map[string]*pluginapi.ListOfTopologyHints{
        p.ResourceName(): nil, // indicates that there is no numa preference
    })
}

// GetPodTopologyHints returns hints of corresponding resources
func (p *StaticPolicy) GetPodTopologyHints(_ context.Context,
    req *pluginapi.PodResourceRequest,
) (resp *pluginapi.PodResourceHintsResponse, err error) {
    return nil, util.ErrNotImplemented
}

func (p *StaticPolicy) RemovePod(_ context.Context,
    req *pluginapi.RemovePodRequest,
) (*pluginapi.RemovePodResponse, error) {
    if req == nil {
        return nil, fmt.Errorf("RemovePod got nil req")
    }

    p.Lock()
    defer p.Unlock()

    if err
:= p.removePod(req.PodUid); err != nil { + general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid) + return nil, err + } + + return &pluginapi.RemovePodResponse{}, nil +} + +// GetResourcesAllocation returns allocation results of corresponding resources +func (p *StaticPolicy) GetResourcesAllocation(_ context.Context, + _ *pluginapi.GetResourcesAllocationRequest, +) (*pluginapi.GetResourcesAllocationResponse, error) { + general.InfofV(4, "called") + err := p.updateAllocatedDevice() + if err != nil { + general.Errorf("update allocated device failed with error: %v", err) + return nil, err + } + + return &pluginapi.GetResourcesAllocationResponse{}, nil +} + +// GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format +func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, + req *pluginapi.GetTopologyAwareResourcesRequest, +) (*pluginapi.GetTopologyAwareResourcesResponse, error) { + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("GetTopologyAwareResources got nil req") + } + + allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) + if allocationInfo == nil { + return &pluginapi.GetTopologyAwareResourcesResponse{}, nil + } + + topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) + for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { + topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: float64(alloc.GPUMemoryQuantity), + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + TopologyLevel: -1, + }) + } + + resp := &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: allocationInfo.PodUid, + PodName: allocationInfo.PodName, + PodNamespace: allocationInfo.PodNamespace, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: allocationInfo.ContainerName, + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + p.ResourceName(): { + IsNodeResource: true, + IsScalarResource: true, + AggregatedQuantity: float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity), + OriginalAggregatedQuantity: float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity), + TopologyAwareQuantityList: topologyAwareQuantityList, + OriginalTopologyAwareQuantityList: topologyAwareQuantityList, + }, + }, + }, + } + + return resp, nil +} + +// GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format +func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, + req *pluginapi.GetTopologyAwareAllocatableResourcesRequest, +) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("GetTopologyAwareAllocatableResources got nil req") + } + + p.Lock() + defer p.Unlock() + + machineState := p.state.GetMachineState() + + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64 + for deviceID, gpuState := range machineState { + aggregatedAllocatableQuantity += gpuState.GetGPUMemoryAllocatable() + aggregatedCapacityQuantity += gpuState.GetGPUMemoryAllocatable() + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, 
&pluginapi.TopologyAwareQuantity{
            ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()),
            Name:          deviceID,
            Type:          string(v1alpha1.TopologyTypeGPU),
            // use -1 to indicate neither numa nor socket topology level
            TopologyLevel: -1,
        })
        topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{
            ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()),
            Name:          deviceID,
            Type:          string(v1alpha1.TopologyTypeGPU),
            // use -1 to indicate neither numa nor socket topology level
            TopologyLevel: -1,
        })
    }

    return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{
        AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{
            p.ResourceName(): {
                IsNodeResource:                       true,
                IsScalarResource:                     true,
                AggregatedAllocatableQuantity:        float64(aggregatedAllocatableQuantity),
                TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList,
                AggregatedCapacityQuantity:           float64(aggregatedCapacityQuantity),
                TopologyAwareCapacityQuantityList:    topologyAwareCapacityQuantityList,
            },
        },
    }, nil
}

// GetResourcePluginOptions returns options to be communicated with Resource Manager
func (p *StaticPolicy) GetResourcePluginOptions(context.Context,
    *pluginapi.Empty,
) (*pluginapi.ResourcePluginOptions, error) {
    return &pluginapi.ResourcePluginOptions{
        PreStartRequired: false,
        // TODO: add topology aware
        WithTopologyAlignment: true,
        NeedReconcile:         true,
    }, nil
}

// Allocate is called during pod admit so that the resource
// plugin can allocate corresponding resource for the container
// according to resource request
func (p *StaticPolicy) Allocate(_ context.Context,
    req *pluginapi.ResourceRequest,
) (resp *pluginapi.ResourceAllocationResponse, err error) {
    if req == nil {
        return nil, fmt.Errorf("Allocate got nil req")
    }

    existReallocAnno, isReallocation := util.IsReallocation(req.Annotations)

    qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys)
    if err != nil {
        err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
            req.PodNamespace, req.PodName, req.ContainerName, err)
        general.Errorf("%s", err.Error())
        return nil, err
    }

    reqInt, _, err := util.GetQuantityFromResourceReq(req)
    if err != nil {
        return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
    }

    general.InfoS("called",
        "podNamespace", req.PodNamespace,
        "podName", req.PodName,
        "containerName", req.ContainerName,
        "qosLevel", qosLevel,
        "reqAnnotations", req.Annotations,
        "gpuMemory", reqInt)

    p.Lock()
    defer func() {
        if storeErr := p.state.StoreState(); storeErr != nil {
            general.ErrorS(storeErr, "store state failed", "podName", req.PodName, "containerName", req.ContainerName)
        }
        p.Unlock()
        if err != nil {
            metricTags := []metrics.MetricTag{
                {Key: "error_message", Val: metric.MetricTagValueFormat(err)},
            }
            if existReallocAnno {
                metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation})
            }
            _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, metricTags...)
        }
    }()

    emptyResponse := &pluginapi.ResourceAllocationResponse{
        PodUid:         req.PodUid,
        PodNamespace:   req.PodNamespace,
        PodName:        req.PodName,
        ContainerName:  req.ContainerName,
        ContainerType:  req.ContainerType,
        ContainerIndex: req.ContainerIndex,
        PodRole:        req.PodRole,
        PodType:        req.PodType,
        ResourceName:   p.ResourceName(),
        Labels:         general.DeepCopyMap(req.Labels),
        Annotations:    general.DeepCopyMap(req.Annotations),
    }

    // init containers are currently not handled
    if req.ContainerType == pluginapi.ContainerType_INIT {
        return emptyResponse, nil
    } else if req.ContainerType == pluginapi.ContainerType_SIDECAR {
        // sidecars are not handled either; return a trivial allocationResult to avoid re-allocation
        return p.packAllocationResponse(req, &state.AllocationInfo{}, nil)
    }

    allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
    if allocationInfo != nil {
        resp, packErr := p.packAllocationResponse(req, allocationInfo, nil)
        if packErr != nil {
            general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
                req.PodNamespace, req.PodName, req.ContainerName, packErr)
            return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr)
        }
        return resp, nil
    }

    newAllocation := &state.AllocationInfo{
        AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel),
        AllocatedAllocation: state.GPUAllocation{
            GPUMemoryQuantity: uint64(reqInt),
        },
    }

    p.state.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false)

    machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider)
    if stateErr != nil {
        general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed",
            "podNamespace", req.PodNamespace,
            "podName", req.PodName,
            "containerName", req.ContainerName,
            "gpuMemory", reqInt)
        return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr)
    }

    // update state cache
    p.state.SetMachineState(machineState, true)

    return p.packAllocationResponse(req, newAllocation, nil)
}

// AllocateForPod is called during pod admit so that the resource
// plugin can allocate corresponding resource for the pod
// according to resource request
func (p *StaticPolicy) AllocateForPod(_ context.Context,
    req *pluginapi.PodResourceRequest,
) (resp *pluginapi.PodResourceAllocationResponse, err error) {
    return nil, util.ErrNotImplemented
}

// PreStartContainer is called, if indicated by resource plugin during registration phase,
Resource plugin can run resource specific operations +// such as resetting the resource before making resources available to the container +func (p *StaticPolicy) PreStartContainer(context.Context, + *pluginapi.PreStartContainerRequest, +) (*pluginapi.PreStartContainerResponse, error) { + return &pluginapi.PreStartContainerResponse{}, nil +} + +func (p *StaticPolicy) removePod(podUID string) error { + // update state cache + podEntries := p.state.GetPodEntries() + delete(podEntries, podUID) + + machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider) + if err != nil { + general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) + return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) + } + + p.state.SetPodEntries(podEntries, false) + p.state.SetMachineState(machineState, false) + + err = p.state.StoreState() + if err != nil { + general.Errorf("store state failed with error: %v", err) + return err + } + return nil +} + +// clearResidualState is used to clean residual pods in local state +func (p *StaticPolicy) clearResidualState(_ *config.Configuration, + _ interface{}, + _ *dynamicconfig.DynamicAgentConfiguration, + _ metrics.MetricEmitter, + _ *metaserver.MetaServer, +) { + general.Infof("exec") + var ( + err error + podList []*v1.Pod + ) + residualSet := make(map[string]bool) + + defer func() { + _ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err) + }() + + if p.metaServer == nil { + general.Errorf("nil metaServer") + return + } + + ctx := context.Background() + podList, err = p.metaServer.GetPodList(ctx, nil) + if err != nil { + general.Errorf("get pod list failed: %v", err) + return + } + + podSet := sets.NewString() + for _, pod := range podList { + podSet.Insert(fmt.Sprintf("%v", pod.UID)) + } + + p.Lock() + defer p.Unlock() + + podEntries := p.state.GetPodEntries() + for podUID := range podEntries { + if !podSet.Has(podUID) { + residualSet[podUID] = true + p.residualHitMap[podUID] += 1 + general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) + } + } + + podsToDelete := sets.NewString() + for podUID, hitCount := range p.residualHitMap { + if !residualSet[podUID] { + general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID) + delete(p.residualHitMap, podUID) + continue + } + + if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime { + podsToDelete.Insert(podUID) + } + } + + if podsToDelete.Len() > 0 { + for { + podUID, found := podsToDelete.PopAny() + if !found { + break + } + + general.Infof("clear residual pod: %s in state", podUID) + delete(podEntries, podUID) + } + + machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider) + if err != nil { + general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) + return + } + + p.state.SetPodEntries(podEntries, false) + p.state.SetMachineState(machineState, false) + + err = p.state.StoreState() + if err != nil { + general.Errorf("store state failed: %v", err) + return + } + } +} + +// packAllocationResponse fills pluginapi.ResourceAllocationResponse with information from AllocationInfo and pluginapi.ResourceRequest +func (p *StaticPolicy) packAllocationResponse( + req *pluginapi.ResourceRequest, + allocationInfo *state.AllocationInfo, + resourceAllocationAnnotations 
map[string]string, +) (*pluginapi.ResourceAllocationResponse, error) { + if allocationInfo == nil { + return nil, fmt.Errorf("packAllocationResponse got nil allocationInfo") + } else if req == nil { + return nil, fmt.Errorf("packAllocationResponse got nil request") + } + + return &pluginapi.ResourceAllocationResponse{ + PodUid: req.PodUid, + PodNamespace: req.PodNamespace, + PodName: req.PodName, + ContainerName: req.ContainerName, + ContainerType: req.ContainerType, + ContainerIndex: req.ContainerIndex, + PodRole: req.PodRole, + PodType: req.PodType, + ResourceName: req.ResourceName, + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + p.ResourceName(): { + IsNodeResource: true, + IsScalarResource: true, // to avoid re-allocating + AllocatedQuantity: float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity), + Annotations: resourceAllocationAnnotations, + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + req.Hint, + }, + }, + }, + }, + }, + Labels: general.DeepCopyMap(req.Labels), + Annotations: general.DeepCopyMap(req.Annotations), + }, nil +} diff --git a/pkg/agent/qrm-plugins/network/state/state_net.go b/pkg/agent/qrm-plugins/network/state/state_mem.go similarity index 100% rename from pkg/agent/qrm-plugins/network/state/state_net.go rename to pkg/agent/qrm-plugins/network/state/state_mem.go diff --git a/pkg/agent/qrm-plugins/util/util.go b/pkg/agent/qrm-plugins/util/util.go index 5eaca3dfd7..e21c4b2166 100644 --- a/pkg/agent/qrm-plugins/util/util.go +++ b/pkg/agent/qrm-plugins/util/util.go @@ -49,7 +49,7 @@ func GetQuantityFromResourceReq(req *pluginapi.ResourceRequest) (int, float64, e return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil case string(apiconsts.ReclaimedResourceMilliCPU): return general.Max(int(math.Ceil(req.ResourceRequests[key]/1000.0)), 0), req.ResourceRequests[key] / 1000.0, nil - case string(v1.ResourceMemory), string(apiconsts.ReclaimedResourceMemory): + case string(v1.ResourceMemory), string(apiconsts.ReclaimedResourceMemory), string(apiconsts.ResourceGPUMemory): return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil case string(apiconsts.ResourceNetBandwidth): if req.Annotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue { diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go new file mode 100644 index 0000000000..9c5547d24d --- /dev/null +++ b/pkg/config/agent/qrm/gpu_plugin.go @@ -0,0 +1,34 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package qrm
+
+import "k8s.io/apimachinery/pkg/api/resource"
+
+type GPUQRMPluginConfig struct {
+	// PolicyName is used to switch between several strategies
+	PolicyName string
+	// GPUResourceNames is the list of GPU resource names
+	GPUResourceNames []string
+	// GPUMemoryAllocatablePerGPU is the total memory allocatable for each GPU
+	GPUMemoryAllocatablePerGPU resource.Quantity
+	// SkipGPUStateCorruption skips gpu state corruption checks; it is used after updating state properties
+	SkipGPUStateCorruption bool
+}
+
+func NewGPUQRMPluginConfig() *GPUQRMPluginConfig {
+	return &GPUQRMPluginConfig{}
+}
diff --git a/pkg/config/agent/qrm/qrm_base.go b/pkg/config/agent/qrm/qrm_base.go
index 18136bb490..ffa851047e 100644
--- a/pkg/config/agent/qrm/qrm_base.go
+++ b/pkg/config/agent/qrm/qrm_base.go
@@ -50,6 +50,7 @@ type QRMPluginsConfiguration struct {
 	*MemoryQRMPluginConfig
 	*NetworkQRMPluginConfig
 	*IOQRMPluginConfig
+	*GPUQRMPluginConfig
 }
 
 func NewGenericQRMPluginConfiguration() *GenericQRMPluginConfiguration {
@@ -69,5 +70,6 @@ func NewQRMPluginsConfiguration() *QRMPluginsConfiguration {
 		MemoryQRMPluginConfig:  NewMemoryQRMPluginConfig(),
 		NetworkQRMPluginConfig: NewNetworkQRMPluginConfig(),
 		IOQRMPluginConfig:      NewIOQRMPluginConfig(),
+		GPUQRMPluginConfig:     NewGPUQRMPluginConfig(),
 	}
 }
diff --git a/pkg/util/machine/gpu.go b/pkg/util/machine/gpu.go
new file mode 100644
index 0000000000..1d40338da9
--- /dev/null
+++ b/pkg/util/machine/gpu.go
@@ -0,0 +1,73 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package machine + +import ( + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/native" +) + +type GPUTopologyProvider interface { + GetGPUTopology() (*GPUTopology, error) +} + +type GPUTopology struct { + GPUs map[string]GPUInfo +} + +type GPUInfo struct { + NUMANode int +} + +type kubeletCheckpointGPUTopologyProvider struct { + resourceNames []string +} + +func NewKubeletCheckpointGPUTopologyProvider(resourceNames []string) GPUTopologyProvider { + return &kubeletCheckpointGPUTopologyProvider{ + resourceNames: resourceNames, + } +} + +func (p *kubeletCheckpointGPUTopologyProvider) GetGPUTopology() (*GPUTopology, error) { + gpuTopology := &GPUTopology{ + GPUs: make(map[string]GPUInfo), + } + + kubeletCheckpoint, err := native.GetKubeletCheckpoint() + if err != nil { + general.Errorf("Failed to get kubelet checkpoint: %v", err) + return gpuTopology, nil + } + + _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() + for _, resourceName := range p.resourceNames { + gpuDevice, ok := registeredDevs[resourceName] + if !ok { + continue + } + + for _, id := range gpuDevice { + gpuTopology.GPUs[id] = GPUInfo{ + // TODO: get NUMA node from Kubelet + NUMANode: 0, + } + } + } + + return gpuTopology, nil +} diff --git a/pkg/util/native/kubelet.go b/pkg/util/native/kubelet.go index 91eafcbbd3..8c7dc22028 100644 --- a/pkg/util/native/kubelet.go +++ b/pkg/util/native/kubelet.go @@ -26,13 +26,20 @@ import ( "strconv" "time" + "github.com/pkg/errors" v1 "k8s.io/api/core/v1" "k8s.io/client-go/discovery" "k8s.io/client-go/rest" + "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint" ) const ( defaultTimeout = time.Second * 10 + + // kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint + kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint" ) // MemoryReservation specifies the memory reservation of different types for each NUMA node @@ -198,3 +205,21 @@ func insecureConfig(host, tokenFile string) (*rest.Config, error) { BearerTokenFile: tokenFile, }, nil } + +func GetKubeletCheckpoint() (checkpoint.DeviceManagerCheckpoint, error) { + checkpointManager, err := checkpointmanager.NewCheckpointManager(v1beta1.DevicePluginPath) + if err != nil { + return nil, errors.Wrap(err, "new checkpoint manager failed") + } + + registeredDevs := make(map[string][]string) + devEntries := make([]checkpoint.PodDevicesEntry, 0) + cp := checkpoint.New(devEntries, registeredDevs) + + err = checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp) + if err != nil { + return nil, errors.Wrap(err, "get checkpoint failed") + } + + return cp, nil +} From bd12c5b8e49bd5a1d4e5d4bc97377a37a55003ee Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 20 Jun 2025 22:14:16 +0800 Subject: [PATCH 02/52] refactor(topology): skip add zone node which is not a child of socket or numa zone node --- .../qrm-plugins/gpu/staticpolicy/policy.go | 14 +++++++++----- .../kubelet/topology/topology_adapter.go | 19 ++++++++++++++++--- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index b75455eaf1..cf0c16710f 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -260,7 +260,9 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, ResourceValue: 
float64(alloc.GPUMemoryQuantity),
 			Name:          deviceID,
 			Type:          string(v1alpha1.TopologyTypeGPU),
-			TopologyLevel: -1,
+			Annotations: map[string]string{
+				consts.ResourceAnnotationKeyResourceIdentifier: "",
+			},
 		})
 	}
 
@@ -310,15 +312,17 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context,
 			ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()),
 			Name:          deviceID,
 			Type:          string(v1alpha1.TopologyTypeGPU),
-			// use -1 to indicate neither numa nor socket topology level
-			TopologyLevel: -1,
+			Annotations: map[string]string{
+				consts.ResourceAnnotationKeyResourceIdentifier: "",
+			},
 		})
 		topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{
 			ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()),
 			Name:          deviceID,
 			Type:          string(v1alpha1.TopologyTypeGPU),
-			// use -1 to indicate neither numa nor socket topology level
-			TopologyLevel: -1,
+			Annotations: map[string]string{
+				consts.ResourceAnnotationKeyResourceIdentifier: "",
+			},
 		})
 	}
 
diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go
index 48b47588f8..15d3b4d870 100644
--- a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go
+++ b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go
@@ -349,8 +349,8 @@ func (p *topologyAdapterImpl) Run(ctx context.Context, handler func()) error {
 
 // validatePodResourcesServerResponse validate pod resources server response, if the resource is empty,
 // maybe the kubelet or qrm plugin is restarting
-func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.
-	AllocatableResourcesResponse, listPodResourcesResponse *podresv1.ListPodResourcesResponse,
+func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.AllocatableResourcesResponse,
+	listPodResourcesResponse *podresv1.ListPodResourcesResponse,
 ) error {
 	if len(p.needValidationResources) > 0 {
 		if allocatableResourcesResponse == nil {
@@ -396,6 +396,11 @@ func (p *topologyAdapterImpl) addNumaSocketChildrenZoneNodes(generator *util.Top
 			continue
 		}
 
+		if parentZoneNode == nil {
+			// skip the resource which doesn't have a parent zone node
+			continue
+		}
+
 		err = generator.AddNode(parentZoneNode, zoneNode)
 		if err != nil {
 			errList = append(errList, err)
@@ -1133,6 +1138,12 @@ func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQu
 		},
 	}
 
+	if identifier, ok := quantity.Annotations[apiconsts.ResourceAnnotationKeyResourceIdentifier]; ok && len(identifier) == 0 {
+		// if the quantity has a resource identifier annotation but it is empty, the zone is unique and its parent
+		// zone node already exists, so we can just return the zone node and a nil parent zone node
+		return zoneNode, nil, nil
+	}
+
 	switch quantity.TopologyLevel {
 	case podresv1.TopologyLevel_NUMA:
 		parentZoneNode := util.GenerateNumaZoneNode(nodeID)
@@ -1141,7 +1152,9 @@ func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQu
 		parentZoneNode := util.GenerateSocketZoneNode(nodeID)
 		return zoneNode, &parentZoneNode, nil
 	default:
-		return zoneNode, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel)
+		// if the quantity topology level is not numa or socket, it means that the zone is a child of socket or numa,
+		// and the zone node is determined by the quantity name or its resource identifier if it exists.
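The default branch concludes below by returning the zone with no parent. A simplified, runnable sketch of the full parent-resolution order described above, using hypothetical types in place of podresv1.TopologyAwareQuantity:

// sketch_zone_parent.go — simplified stand-ins; the real adapter works on podresv1 types.
package main

import "fmt"

type topologyLevel int

const (
	levelNUMA topologyLevel = iota
	levelSocket
	levelOther
)

type quantity struct {
	level              topologyLevel
	resourceIdentifier *string // nil when the annotation is absent
}

// resolveParent mirrors the decision order above: an empty resource-identifier
// annotation means the zone is unique and already parented, so no parent is
// returned; NUMA and socket levels map to their parent zones; any other level
// is treated as a child of an existing socket/NUMA zone and gets no parent.
func resolveParent(q quantity, nodeID int) (parent string, ok bool) {
	if q.resourceIdentifier != nil && *q.resourceIdentifier == "" {
		return "", false
	}
	switch q.level {
	case levelNUMA:
		return fmt.Sprintf("numa-%d", nodeID), true
	case levelSocket:
		return fmt.Sprintf("socket-%d", nodeID), true
	default:
		return "", false
	}
}

func main() {
	empty := ""
	fmt.Println(resolveParent(quantity{resourceIdentifier: &empty}, 0)) // "" false
	fmt.Println(resolveParent(quantity{level: levelNUMA}, 1))           // numa-1 true
	fmt.Println(resolveParent(quantity{level: levelOther}, 1))          // "" false
}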
+ return zoneNode, nil, nil } } } From ce727bf2ac4b504413f5f2aa15dcba723d438d30 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 25 Jul 2025 17:57:56 +0800 Subject: [PATCH 03/52] feat(gpu): enhance GPU memory allocation with NUMA awareness - Update GPU memory type from uint64 to float64 for precise allocation - Implement NUMA-aware GPU topology management and allocation - Add support for associated device allocation and topology hints - Introduce new GPU topology provider with NUMA node tracking - Extend GPU state management with NUMA node information - Add utility functions for GPU memory hint generation and NUMA calculations --- go.mod | 4 +- go.sum | 8 +- pkg/agent/qrm-plugins/gpu/state/state.go | 46 +- pkg/agent/qrm-plugins/gpu/state/state_mem.go | 1 - pkg/agent/qrm-plugins/gpu/state/util.go | 10 +- .../qrm-plugins/gpu/staticpolicy/device.go | 108 ---- .../qrm-plugins/gpu/staticpolicy/policy.go | 512 +++++++++++++++++- pkg/agent/qrm-plugins/gpu/util/util.go | 97 ++++ pkg/agent/qrm-plugins/util/util.go | 43 +- pkg/agent/qrm-plugins/util/util_test.go | 12 +- pkg/util/machine/gpu.go | 83 ++- 11 files changed, 741 insertions(+), 183 deletions(-) delete mode 100644 pkg/agent/qrm-plugins/gpu/staticpolicy/device.go create mode 100644 pkg/agent/qrm-plugins/gpu/util/util.go diff --git a/go.mod b/go.mod index 7ce2433559..c576c897b9 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5 + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 @@ -197,7 +197,7 @@ replace ( k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6 k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6 k8s.io/kubectl => k8s.io/kubectl v0.24.6 - k8s.io/kubelet => github.com/kubewharf/kubelet v1.24.6-kubewharf.9 + k8s.io/kubelet => github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99 k8s.io/kubernetes => k8s.io/kubernetes v1.24.6 k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6 k8s.io/metrics => k8s.io/metrics v0.24.6 diff --git a/go.sum b/go.sum index 1dd9b13203..d7eee09def 100644 --- a/go.sum +++ b/go.sum @@ -574,8 +574,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/katalyst-api v0.5.8-0.20251212030746-894fa2521a86 h1:GCqe9PcoTQ7akNDyAmavhnSrPV7sMAoYJ5jKEaJg4Ac= -github.com/kubewharf/katalyst-api v0.5.8-0.20251212030746-894fa2521a86/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc= github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= @@ -587,8 +585,10 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod 
h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= -github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5 h1:q7cwZoXMJw0Zr+b5WuN0yF4ceoSJp4XVd8fOSScQEZg= -github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= +github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6 h1:4AOW5GQJ9NxsN9hVMrQYCxMdjgbF/rqDgvCeHvnmo8Q= +github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= +github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99 h1:9QEucppxPGGOKoJShUBGlSIDT/w49zZU6gjit546N1I= +github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go index c60ac95fb8..5c37a6faad 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -31,12 +31,20 @@ type AllocationInfo struct { } type GPUAllocation struct { - GPUMemoryQuantity uint64 `json:"gpu_memory_quantity"` + GPUMemoryQuantity float64 `json:"gpu_memory_quantity"` + NUMANodes []int `json:"numa_nodes"` } func (a *GPUAllocation) Clone() GPUAllocation { + if a == nil { + return GPUAllocation{} + } + + numaNodes := make([]int, len(a.NUMANodes)) + copy(numaNodes, a.NUMANodes) return GPUAllocation{ GPUMemoryQuantity: a.GPUMemoryQuantity, + NUMANodes: numaNodes, } } @@ -44,9 +52,10 @@ type ( ContainerEntries map[string]*AllocationInfo // Keyed by container name PodEntries map[string]ContainerEntries // Keyed by pod UID ) + type GPUState struct { - GPUMemoryAllocatable uint64 `json:"gpu_memory_allocatable"` - GPUMemoryAllocated uint64 `json:"gpu_memory_allocated"` + GPUMemoryAllocatable float64 `json:"gpu_memory_allocatable"` + GPUMemoryAllocated float64 `json:"gpu_memory_allocated"` PodEntries PodEntries `json:"pod_entries"` } @@ -154,11 +163,12 @@ func (ns *GPUState) Clone() *GPUState { return &GPUState{ GPUMemoryAllocatable: ns.GPUMemoryAllocatable, + GPUMemoryAllocated: ns.GPUMemoryAllocated, PodEntries: ns.PodEntries.Clone(), } } -func (ns *GPUState) GetGPUMemoryAllocatable() uint64 { +func (ns *GPUState) GetGPUMemoryAllocatable() float64 { if ns == nil { return 0 } @@ -166,6 +176,14 @@ func (ns *GPUState) GetGPUMemoryAllocatable() uint64 { return ns.GPUMemoryAllocatable } +func (ns *GPUState) GetGPUMemoryAllocated() float64 { + if ns == nil { + return 0 + } + + return ns.GPUMemoryAllocated +} + // SetAllocationInfo adds a new AllocationInfo (for pod/container pairs) into the given NICState func (ns *GPUState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { if ns == nil { @@ -209,13 +227,31 @@ func (m GPUMap) String() string { return string(contentBytes) } -func (m GPUMap) GetGPUMemoryAllocatable(id string) uint64 { +func (m GPUMap) GetGPUMemoryAllocatable(id string) float64 { if m == nil { return 0 } return m[id].GetGPUMemoryAllocatable() } +func (m GPUMap) GPUMemoryAllocated(id string) float64 { + if m == nil { + return 0 + } + + return 
m[id].GetGPUMemoryAllocated() +} + +func (m GPUMap) GPUMemorySatisfiedRequest(id string, request float64) bool { + if m == nil { + return false + } + + gpuMemoryAllocatable := m.GetGPUMemoryAllocatable(id) + gpuMemoryAllocated := m.GPUMemoryAllocated(id) + return gpuMemoryAllocatable-gpuMemoryAllocated >= request +} + // reader is used to get information from local states type reader interface { GetMachineState() GPUMap diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/state_mem.go index de42ad8aa2..e0d45dd209 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_mem.go +++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go @@ -94,7 +94,6 @@ func (s *gpuPluginState) Delete(podUID, containerName string, _ bool) { delete(s.podEntries, podUID) } generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName) - } func (s *gpuPluginState) ClearState() { diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go index b84ef2de5c..5c1a3dcfca 100644 --- a/pkg/agent/qrm-plugins/gpu/state/util.go +++ b/pkg/agent/qrm-plugins/gpu/state/util.go @@ -29,7 +29,7 @@ func GenerateMachineState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider return nil, fmt.Errorf("gpu topology provider must not be nil") } - gpuTopology, err := gpuTopologyProvider.GetGPUTopology() + gpuTopology, _, err := gpuTopologyProvider.GetGPUTopology() if err != nil { return nil, fmt.Errorf("gpu topology provider failed with error: %v", err) } @@ -37,7 +37,7 @@ func GenerateMachineState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider gpuMap := make(GPUMap) for deviceID := range gpuTopology.GPUs { gpuMap[deviceID] = &GPUState{ - GPUMemoryAllocatable: uint64(conf.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()), + GPUMemoryAllocatable: float64(conf.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()), GPUMemoryAllocated: 0, } } @@ -58,7 +58,7 @@ func GenerateMachineStateFromPodEntries( } for gpuID, gpuState := range machineState { - var gpuMemoryAllocated uint64 + var gpuMemoryAllocated float64 for podUID, containerEntries := range podEntries { for containerName, allocationInfo := range containerEntries { if containerName != "" && allocationInfo != nil { @@ -78,8 +78,8 @@ func GenerateMachineStateFromPodEntries( generalLog := general.LoggerWithPrefix("GenerateMachineStateFromPodEntries", general.LoggingPKGFull) if gpuState.GPUMemoryAllocatable < gpuState.GPUMemoryAllocated { - generalLog.Warningf("invalid allocated GPU memory: %d on GPU: %s"+ - " with allocatable GPU memory size: %d", gpuState.GPUMemoryAllocated, gpuID, gpuState.GPUMemoryAllocatable) + generalLog.Warningf("invalid allocated GPU memory: %f on GPU: %s"+ + " with allocatable GPU memory size: %f", gpuState.GPUMemoryAllocated, gpuID, gpuState.GPUMemoryAllocatable) } machineState[gpuID] = gpuState } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go deleted file mode 100644 index cc5cf501b2..0000000000 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/device.go +++ /dev/null @@ -1,108 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package staticpolicy - -import ( - "fmt" - - "k8s.io/apimachinery/pkg/util/sets" - - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - "github.com/kubewharf/katalyst-core/pkg/util/general" - "github.com/kubewharf/katalyst-core/pkg/util/native" -) - -func (p *StaticPolicy) updateAllocatedDevice() error { - podDevicesMap, err := p.getPodDevicesMap() - if err != nil { - return fmt.Errorf("error getting pod devices map: %v", err) - } - - p.Lock() - defer p.Unlock() - - podEntries := p.state.GetPodEntries() - for pod, devices := range podDevicesMap { - if len(devices) == 0 { - general.ErrorS(err, "pod has no devices", "podUID", pod.PodUID, "containerName", pod.ContainerName) - continue - } - - allocationInfo := podEntries.GetAllocationInfo(pod.PodUID, pod.ContainerName) - if allocationInfo == nil { - continue - } - - gpuMemory := allocationInfo.AllocatedAllocation.GPUMemoryQuantity / uint64(devices.Len()) - topologyAwareAllocations := make(map[string]state.GPUAllocation) - for _, deviceID := range devices.UnsortedList() { - topologyAwareAllocations[deviceID] = state.GPUAllocation{ - GPUMemoryQuantity: gpuMemory, - } - } - allocationInfo.TopologyAwareAllocations = topologyAwareAllocations - - podEntries.SetAllocationInfo(pod.PodUID, pod.ContainerName, allocationInfo) - } - - machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider) - if err != nil { - return err - } - - p.state.SetPodEntries(podEntries, false) - p.state.SetMachineState(machineState, true) - - return nil -} - -type podUIDContainerName struct { - PodUID string - ContainerName string -} - -func (p *StaticPolicy) getPodDevicesMap() (map[podUIDContainerName]sets.String, error) { - kubeletCheckpoint, err := native.GetKubeletCheckpoint() - if err != nil { - return nil, fmt.Errorf("could not get kubelet checkpoint: %v", err) - } - - podDevices, _ := kubeletCheckpoint.GetDataInLatestFormat() - podDevicesMap := make(map[podUIDContainerName]sets.String) - for _, podDevice := range podDevices { - if !p.resourceNames.Has(podDevice.ResourceName) { - continue - } - - key := podUIDContainerName{ - PodUID: podDevice.PodUID, - ContainerName: podDevice.ContainerName, - } - - _, ok := podDevicesMap[key] - if !ok { - podDevicesMap[key] = sets.NewString() - } - for _, deviceList := range podDevice.DeviceIDs { - for _, deviceID := range deviceList { - podDevicesMap[key].Insert(deviceID) - } - } - } - - return podDevicesMap, nil -} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index cf0c16710f..8a3b871c3d 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -19,10 +19,12 @@ package staticpolicy import ( "context" "fmt" + "sort" "sync" "time" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -35,6 +37,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" gpuconsts 
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" "github.com/kubewharf/katalyst-core/pkg/config" @@ -84,7 +87,7 @@ func NewStaticPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, Val: gpuconsts.GPUResourcePluginPolicyNameStatic, }) - gpuTopologyProvider := machine.NewKubeletCheckpointGPUTopologyProvider(conf.GPUResourceNames) + gpuTopologyProvider := machine.NewGPUTopologyProvider(conf.GPUResourceNames) stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, gpuconsts.GPUResourcePluginPolicyNameStatic, gpuTopologyProvider, conf.SkipGPUStateCorruption, wrappedEmitter) if err != nil { @@ -196,9 +199,92 @@ func (p *StaticPolicy) GetTopologyHints(_ context.Context, return nil, fmt.Errorf("GetTopologyHints got nil req") } - return util.PackResourceHintsResponse(req, p.ResourceName(), map[string]*pluginapi.ListOfTopologyHints{ - p.ResourceName(): nil, // indicates that there is no numa preference - }) + existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) + + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + } + + gpuCount, gpuNames, err := p.getGPUCount(req) + if err != nil { + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + + if gpuCount == 0 { + return nil, fmt.Errorf("no GPU resources found") + } + + general.InfoS("called", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.Annotations, + "gpuMemory", gpuMemory, + "gpuNames", gpuNames.List(), + "gpuCount", gpuCount) + + p.Lock() + defer func() { + if err := p.state.StoreState(); err != nil { + general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) + } + p.Unlock() + if err != nil { + metricTags := []metrics.MetricTag{ + {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, + } + if existReallocAnno { + metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) + } + _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, metricTags...) + } + }() + + var hints map[string]*pluginapi.ListOfTopologyHints + machineState := p.state.GetMachineState() + allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) + if allocationInfo != nil { + hints = gpuutil.RegenerateGPUMemoryHints(allocationInfo, false) + + // regenerateHints failed. need to clear container record and re-calculate. 
+		if hints == nil {
+			podEntries := p.state.GetPodEntries()
+			delete(podEntries[req.PodUid], req.ContainerName)
+			if len(podEntries[req.PodUid]) == 0 {
+				delete(podEntries, req.PodUid)
+			}
+
+			var err error
+			machineState, err = state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider)
+			if err != nil {
+				general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
+					req.PodNamespace, req.PodName, req.ContainerName, err)
+				return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
+			}
+		}
+	}
+
+	// otherwise, calculate hint for container without allocated gpu memory
+	if hints == nil {
+		var calculateErr error
+		// calculate hint for container without allocated gpu memory
+		hints, calculateErr = p.calculateHints(gpuMemory, gpuCount, machineState, req)
+		if calculateErr != nil {
+			return nil, fmt.Errorf("calculateHints failed with error: %v", calculateErr)
+		}
+	}
+
+	return util.PackResourceHintsResponse(req, p.ResourceName(), hints)
 }
 
 // GetPodTopologyHints returns hints of corresponding resources
@@ -231,12 +317,6 @@ func (p *StaticPolicy) GetResourcesAllocation(_ context.Context,
 	_ *pluginapi.GetResourcesAllocationRequest,
 ) (*pluginapi.GetResourcesAllocationResponse, error) {
 	general.InfofV(4, "called")
-	err := p.updateAllocatedDevice()
-	if err != nil {
-		general.Errorf("update allocated device failed with error: %v", err)
-		return nil, err
-	}
-
 	return &pluginapi.GetResourcesAllocationResponse{}, nil
 }
 
@@ -257,7 +337,7 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context,
 	topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations))
 	for deviceID, alloc := range allocationInfo.TopologyAwareAllocations {
 		topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{
-			ResourceValue: float64(alloc.GPUMemoryQuantity),
+			ResourceValue: alloc.GPUMemoryQuantity,
 			Name:          deviceID,
 			Type:          string(v1alpha1.TopologyTypeGPU),
 			Annotations: map[string]string{
@@ -276,8 +356,8 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context,
 			p.ResourceName(): {
 				IsNodeResource:             true,
 				IsScalarResource:           true,
-				AggregatedQuantity:         float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity),
-				OriginalAggregatedQuantity: float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity),
+				AggregatedQuantity:         allocationInfo.AllocatedAllocation.GPUMemoryQuantity,
+				OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity,
 				TopologyAwareQuantityList:         topologyAwareQuantityList,
 				OriginalTopologyAwareQuantityList: topologyAwareQuantityList,
 			},
@@ -304,12 +384,12 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context,
 	topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState))
 	topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState))
 
-	var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64
+	var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64
 	for deviceID, gpuState := range machineState {
 		aggregatedAllocatableQuantity += gpuState.GetGPUMemoryAllocatable()
 		aggregatedCapacityQuantity += gpuState.GetGPUMemoryAllocatable()
 		topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{
-			ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()),
+			ResourceValue:
gpuState.GetGPUMemoryAllocatable(), Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -317,7 +397,7 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: float64(gpuState.GetGPUMemoryAllocatable()), + ResourceValue: gpuState.GetGPUMemoryAllocatable(), Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -331,9 +411,9 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, p.ResourceName(): { IsNodeResource: true, IsScalarResource: true, - AggregatedAllocatableQuantity: float64(aggregatedAllocatableQuantity), + AggregatedAllocatableQuantity: aggregatedAllocatableQuantity, TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, - AggregatedCapacityQuantity: float64(aggregatedCapacityQuantity), + AggregatedCapacityQuantity: aggregatedCapacityQuantity, TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, }, }, @@ -345,10 +425,10 @@ func (p *StaticPolicy) GetResourcePluginOptions(context.Context, *pluginapi.Empty, ) (*pluginapi.ResourcePluginOptions, error) { return &pluginapi.ResourcePluginOptions{ - PreStartRequired: false, - // TODO: add topology aware + PreStartRequired: false, WithTopologyAlignment: true, NeedReconcile: true, + AssociatedDevices: p.resourceNames.List(), }, nil } @@ -372,18 +452,25 @@ func (p *StaticPolicy) Allocate(_ context.Context, return nil, err } - reqInt, _, err := util.GetQuantityFromResourceReq(req) + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } + gpuCount, gpuNames, err := p.getGPUCount(req) + if err != nil { + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + general.InfoS("called", "podNamespace", req.PodNamespace, "podName", req.PodName, "containerName", req.ContainerName, "qosLevel", qosLevel, "reqAnnotations", req.Annotations, - "gpuMemory", reqInt) + "gpuMemory", gpuMemory, + "gpuNames", gpuNames.List(), + "gpuCount", gpuCount) p.Lock() defer func() { @@ -435,10 +522,18 @@ func (p *StaticPolicy) Allocate(_ context.Context, return resp, nil } + // get hint nodes from request + hintNodes, err := machine.NewCPUSetUint64(req.GetHint().GetNodes()...) 
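The error check below guards the conversion just made; once the hint nodes are parsed, Allocate records them next to the requested GPU memory as the container's allocation. A small sketch of that record construction, assuming hint nodes arrive as []uint64 as in the QRM hint API; gpuAllocationSketch is a hypothetical stand-in for the plugin's state type:

// sketch_allocation_record.go — gpuAllocationSketch is a hypothetical stand-in.
package main

import "fmt"

type gpuAllocationSketch struct {
	GPUMemoryQuantity float64
	NUMANodes         []int
}

// newAllocationFromHint records the admitted hint's NUMA ids next to the
// requested GPU memory, mirroring the record built at this point in Allocate.
func newAllocationFromHint(gpuMemory float64, hintNodes []uint64) gpuAllocationSketch {
	nodes := make([]int, 0, len(hintNodes))
	for _, n := range hintNodes {
		nodes = append(nodes, int(n))
	}
	return gpuAllocationSketch{GPUMemoryQuantity: gpuMemory, NUMANodes: nodes}
}

func main() {
	fmt.Printf("%+v\n", newAllocationFromHint(16e9, []uint64{0, 1}))
	// {GPUMemoryQuantity:1.6e+10 NUMANodes:[0 1]}
}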
+	if err != nil {
+		general.Warningf("failed to get hint nodes: %v", err)
+		return nil, fmt.Errorf("failed to get hint nodes: %v", err)
+	}
+
 	newAllocation := &state.AllocationInfo{
 		AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel),
 		AllocatedAllocation: state.GPUAllocation{
-			GPUMemoryQuantity: uint64(reqInt),
+			GPUMemoryQuantity: gpuMemory,
+			NUMANodes:         hintNodes.ToSliceInt(),
 		},
 	}
@@ -450,7 +545,7 @@ func (p *StaticPolicy) Allocate(_ context.Context,
 			"podNamespace", req.PodNamespace,
 			"podName", req.PodName,
 			"containerName", req.ContainerName,
-			"gpuMemory", reqInt)
+			"gpuMemory", gpuMemory)
 		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr)
 	}
@@ -615,7 +710,7 @@ func (p *StaticPolicy) packAllocationResponse(
 			p.ResourceName(): {
 				IsNodeResource:    true,
 				IsScalarResource:  true, // to avoid re-allocating
-				AllocatedQuantity: float64(allocationInfo.AllocatedAllocation.GPUMemoryQuantity),
+				AllocatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity,
 				Annotations:       resourceAllocationAnnotations,
 				ResourceHints: &pluginapi.ListOfTopologyHints{
 					Hints: []*pluginapi.TopologyHint{
@@ -629,3 +724,370 @@ func (p *StaticPolicy) packAllocationResponse(
 		Annotations: general.DeepCopyMap(req.Annotations),
 	}, nil
 }
+
+func (p *StaticPolicy) getGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) {
+	gpuCount := float64(0)
+	gpuNames := sets.NewString()
+	for resourceName := range p.resourceNames {
+		_, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false)
+		if err != nil && !errors.IsNotFound(err) {
+			return 0, nil, err
+		}
+		gpuCount += request
+		gpuNames.Insert(resourceName)
+	}
+	return gpuCount, gpuNames, nil
+}
+
+func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machineState state.GPUMap, req *pluginapi.ResourceRequest) (map[string]*pluginapi.ListOfTopologyHints, error) {
+	gpuTopology, numaTopologyReady, err := p.gpuTopologyProvider.GetGPUTopology()
+	if err != nil {
+		return nil, err
+	}
+
+	if !numaTopologyReady {
+		return nil, fmt.Errorf("numa topology is not ready")
+	}
+
+	perGPUMemory := gpuMemory / gpuReq
+	general.Infof("gpuMemory: %f, gpuReq: %f, perGPUMemory: %f", gpuMemory, gpuReq, perGPUMemory)
+
+	numaToAvailableGPUCount := make(map[int]float64)
+	for gpuID, s := range machineState {
+		if s == nil {
+			continue
+		}
+
+		if s.GetGPUMemoryAllocated()+perGPUMemory <= s.GetGPUMemoryAllocatable() {
+			info, ok := gpuTopology.GPUs[gpuID]
+			if !ok {
+				return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID)
+			}
+
+			for _, numaNode := range info.GetNUMANode() {
+				numaToAvailableGPUCount[numaNode] += 1
+			}
+		}
+	}
+
+	numaNodes := make([]int, 0, p.metaServer.NumNUMANodes)
+	for numaNode := range p.metaServer.NUMAToCPUs {
+		numaNodes = append(numaNodes, numaNode)
+	}
+	sort.Ints(numaNodes)
+
+	minNUMAsCountNeeded, _, err := gpuutil.GetNUMANodesCountToFitGPUReq(gpuReq, p.metaServer.CPUTopology, gpuTopology)
+	if err != nil {
+		return nil, err
+	}
+
+	numaCountPerSocket, err := p.metaServer.NUMAsPerSocket()
+	if err != nil {
+		return nil, fmt.Errorf("NUMAsPerSocket failed with error: %v", err)
+	}
+
+	numaBound := len(numaNodes)
+	if numaBound > machine.LargeNUMAsPoint {
+		// [TODO]: to discuss refine minNUMAsCountNeeded+1
+		numaBound = minNUMAsCountNeeded + 1
+	}
+
+	var availableNumaHints []*pluginapi.TopologyHint
+	machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) {
+		maskCount := mask.Count()
+		if maskCount < minNUMAsCountNeeded {
+			return
+		}
+
+		maskBits := mask.GetBits()
+		numaCountNeeded := mask.Count()
+
+		allAvailableGPUsCountInMask := float64(0)
+		for _, nodeID := range maskBits {
+			allAvailableGPUsCountInMask += numaToAvailableGPUCount[nodeID]
+		}
+
+		if allAvailableGPUsCountInMask < gpuReq {
+			return
+		}
+
+		crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.metaServer.CPUTopology)
+		if err != nil {
+			return
+		} else if numaCountNeeded <= numaCountPerSocket && crossSockets {
+			return
+		}
+
+		preferred := maskCount == minNUMAsCountNeeded
+		availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{
+			Nodes:     machine.MaskToUInt64Array(mask),
+			Preferred: preferred,
+		})
+	})
+
+	// NOTE: because grpc is unable to distinguish between an empty array and nil,
+	// we return an error instead of an empty array.
+	// we should resolve this issue if we need to manage multi-resource in one plugin.
+	if len(availableNumaHints) == 0 {
+		general.Warningf("got no available gpu memory hints for pod: %s/%s, container: %s",
+			req.PodNamespace, req.PodName, req.ContainerName)
+		return nil, gpuutil.ErrNoAvailableGPUMemoryHints
+	}
+
+	return map[string]*pluginapi.ListOfTopologyHints{
+		p.ResourceName(): {
+			Hints: availableNumaHints,
+		},
+	}, nil
+}
+
+func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(_ context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	if request == nil {
+		return nil, fmt.Errorf("request is nil")
+	}
+
+	gpuTopology := &machine.GPUTopology{
+		GPUs: make(map[string]machine.GPUInfo, len(request.Devices)),
+	}
+
+	for _, device := range request.Devices {
+		var numaNode []int
+		if device.Topology != nil {
+			numaNode = make([]int, 0, len(device.Topology.Nodes))
+
+			for _, node := range device.Topology.Nodes {
+				if node == nil {
+					continue
+				}
+				numaNode = append(numaNode, int(node.ID))
+			}
+		}
+
+		gpuTopology.GPUs[device.ID] = machine.GPUInfo{
+			Health:   device.Health,
+			NUMANode: numaNode,
+		}
+	}
+
+	err := p.gpuTopologyProvider.SetGPUTopology(gpuTopology)
+	if err != nil {
+		general.Errorf("set gpu topology failed with error: %v", err)
+		return nil, fmt.Errorf("set gpu topology failed with error: %v", err)
+	}
+
+	general.Infof("got device %s gpuTopology success: %v", request.DeviceName, gpuTopology)
+
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
+
+func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
+	if req == nil || req.ResourceRequest == nil || req.DeviceRequest == nil {
+		return nil, fmt.Errorf("req is nil")
+	}
+
+	if req.ResourceRequest.Hint == nil || req.ResourceRequest.Hint.Nodes == nil {
+		general.Warningf("got nil resource hint")
+		return &pluginapi.AssociatedDeviceAllocationResponse{
+			AllocationResult: nil,
+		}, nil
+	}
+
+	if req.DeviceRequest.Hint == nil || req.DeviceRequest.Hint.Nodes == nil {
+		general.Warningf("got nil device hint")
+		return &pluginapi.AssociatedDeviceAllocationResponse{
+			AllocationResult: nil,
+		}, nil
+	}
+
+	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req.ResourceRequest, p.podAnnotationKeptKeys, p.podLabelKeptKeys)
+	if err != nil {
+		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
+			req.ResourceRequest.PodNamespace, req.ResourceRequest.PodName,
req.ResourceRequest.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + allocationInfo := p.state.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) + if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { + allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) + for gpuID := range allocationInfo.TopologyAwareAllocations { + allocatedDevices = append(allocatedDevices, gpuID) + } + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil + } + + _, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(req.ResourceRequest.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, err + } + + // get hint nodes from request + hintNodes, err := machine.NewCPUSetUint64(req.DeviceRequest.GetHint().GetNodes()...) + if err != nil { + general.Warningf("failed to get hint nodes: %v", err) + return nil, err + } + + gpuTopology, numaTopologyReady, err := p.gpuTopologyProvider.GetGPUTopology() + if err != nil { + general.Warningf("failed to get gpu topology: %v", err) + return nil, err + } + + if !numaTopologyReady { + general.Warningf("numa topology is not ready") + return nil, fmt.Errorf("numa topology is not ready") + } + + allocatedDevices, allocatedGPUMemory, err := p.calculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req) + if err != nil { + general.Warningf("failed to allocate associated devices: %v", err) + return nil, err + } + + topologyAwareAllocations := make(map[string]state.GPUAllocation) + for _, device := range allocatedDevices { + info, ok := gpuTopology.GPUs[device] + if !ok { + return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) + } + + topologyAwareAllocations[device] = state.GPUAllocation{ + GPUMemoryQuantity: allocatedGPUMemory[device], + NUMANodes: info.GetNUMANode(), + } + } + + if allocationInfo == nil { + allocationInfo = &state.AllocationInfo{ + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), + AllocatedAllocation: state.GPUAllocation{ + GPUMemoryQuantity: gpuMemoryRequest, + NUMANodes: hintNodes.ToSliceInt(), + }, + } + } + + allocationInfo.TopologyAwareAllocations = topologyAwareAllocations + p.state.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) + machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider) + if err != nil { + return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) + } + + p.state.SetMachineState(machineState, true) + + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil +} + +func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, request *pluginapi.AssociatedDeviceRequest) ([]string, map[string]float64, error) { + gpuRequest := request.DeviceRequest.GetDeviceRequest() + gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) + + machineState := p.state.GetMachineState() + + allocatedDevices := sets.NewString() + needed := gpuRequest + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + 
needed--
+			if needed == 0 {
+				return true
+			}
+		}
+		return false
+	}
+
+	allocatedGPUMemory := func(devices ...string) map[string]float64 {
+		memory := make(map[string]float64)
+		for _, device := range devices {
+			memory[device] = gpuMemoryPerGPU
+		}
+		return memory
+	}
+
+	availableDevices := request.DeviceRequest.GetAvailableDevices()
+	mustIncludeDevices := request.DeviceRequest.GetMustIncludeDevices()
+
+	// allocate must include devices first
+	for _, device := range mustIncludeDevices {
+		if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) {
+			general.Warningf("must include gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU)
+		}
+		allocateDevices(device)
+	}
+
+	// if allocated devices are enough, return immediately
+	if allocatedDevices.Len() >= int(gpuRequest) {
+		return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+	}
+
+	isNUMAAffinityDevice := func(device string) bool {
+		info, ok := gpuTopology.GPUs[device]
+		if !ok {
+			general.Errorf("failed to find hint node for device %s", device)
+			return false
+		}
+
+		// check if gpu's numa node is the subset of hint nodes
+		// todo support
+		if machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) {
+			return true
+		}
+		return false
+	}
+
+	// then allocate available numa-affinity gpus
+	for _, device := range availableDevices {
+		if allocatedDevices.Has(device) {
+			continue
+		}
+
+		if !isNUMAAffinityDevice(device) {
+			continue
+		}
+
+		if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) {
+			general.Infof("available numa affinity gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU)
+			continue
+		}
+
+		if allocateDevices(device) {
+			return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+		}
+	}
+
+	// finally allocate available non-numa-affinity gpus
+	for _, device := range availableDevices {
+		if allocatedDevices.Has(device) {
+			continue
+		}
+
+		if isNUMAAffinityDevice(device) {
+			continue
+		}
+
+		if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) {
+			general.Infof("available non-numa-affinity gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU)
+			continue
+		}
+
+		if allocateDevices(device) {
+			return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+		}
+	}
+
+	return nil, nil, fmt.Errorf("not enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices))
+}
diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go
new file mode 100644
index 0000000000..3957b09a3e
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/util/util.go
@@ -0,0 +1,97 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package util
+
+import (
+	"fmt"
+	"math"
+
+	pkgerrors "github.com/pkg/errors"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+)
+
+var ErrNoAvailableGPUMemoryHints = pkgerrors.New("no available gpu memory hints")
+
+// RegenerateGPUMemoryHints regenerates hints for a container that has already been allocated gpu memory;
+// it assembles hints based on the existing AllocationInfo,
+// without any calculation logic at all
+func RegenerateGPUMemoryHints(allocationInfo *state.AllocationInfo, regenerate bool) map[string]*pluginapi.ListOfTopologyHints {
+	if allocationInfo == nil {
+		general.Errorf("RegenerateGPUMemoryHints got nil allocationInfo")
+		return nil
+	}
+
+	hints := map[string]*pluginapi.ListOfTopologyHints{}
+	if regenerate {
+		general.ErrorS(nil, "need to regenerate hints",
+			"podNamespace", allocationInfo.PodNamespace,
+			"podName", allocationInfo.PodName,
+			"podUID", allocationInfo.PodUid,
+			"containerName", allocationInfo.ContainerName)
+
+		return nil
+	}
+
+	allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...)
+
+	general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod",
+		"podNamespace", allocationInfo.PodNamespace,
+		"podName", allocationInfo.PodName,
+		"containerName", allocationInfo.ContainerName,
+		"hint", allocatedNumaNodes)
+	hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{
+		Hints: []*pluginapi.TopologyHint{
+			{
+				Nodes:     allocatedNumaNodes.ToSliceUInt64(),
+				Preferred: true,
+			},
+		},
+	}
+	return hints
+}
+
+func GetNUMANodesCountToFitGPUReq(gpuReq float64, cpuTopology *machine.CPUTopology, gpuTopology *machine.GPUTopology) (int, int, error) {
+	if gpuTopology == nil {
+		return 0, 0, fmt.Errorf("GetNUMANodesCountToFitGPUReq got nil gpuTopology")
+	}
+
+	numaCount := cpuTopology.NumNUMANodes
+	if numaCount == 0 {
+		return 0, 0, fmt.Errorf("there is no NUMA in cpuTopology")
+	}
+
+	if len(gpuTopology.GPUs)%numaCount != 0 {
+		general.Warningf("GPUs count %d is not evenly divisible by NUMA count %d", len(gpuTopology.GPUs), numaCount)
+	}
+
+	gpusPerNUMA := (len(gpuTopology.GPUs) + numaCount - 1) / numaCount
+	numaCountNeeded := int(math.Ceil(gpuReq / float64(gpusPerNUMA)))
+	if numaCountNeeded == 0 {
+		numaCountNeeded = 1
+	}
+	if numaCountNeeded > numaCount {
+		return 0, 0, fmt.Errorf("invalid gpu req: %.3f in topology with NUMAs count: %d and GPUs count: %d", gpuReq, numaCount, len(gpuTopology.GPUs))
+	}
+
+	gpusCountNeededPerNUMA := int(math.Ceil(gpuReq / float64(numaCountNeeded)))
+	return numaCountNeeded, gpusCountNeededPerNUMA, nil
+}
diff --git a/pkg/agent/qrm-plugins/util/util.go b/pkg/agent/qrm-plugins/util/util.go
index e21c4b2166..ecc1cc034b 100644
--- a/pkg/agent/qrm-plugins/util/util.go
+++ b/pkg/agent/qrm-plugins/util/util.go
@@ -25,6 +25,8 @@ import (
 	"strings"
 
 	v1 "k8s.io/api/core/v1"
+
"k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -43,27 +45,32 @@ func GetQuantityFromResourceReq(req *pluginapi.ResourceRequest) (int, float64, e return 0, 0, fmt.Errorf("invalid req.ResourceRequests length: %d", len(req.ResourceRequests)) } - for key := range req.ResourceRequests { - switch key { - case string(v1.ResourceCPU): - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - case string(apiconsts.ReclaimedResourceMilliCPU): - return general.Max(int(math.Ceil(req.ResourceRequests[key]/1000.0)), 0), req.ResourceRequests[key] / 1000.0, nil - case string(v1.ResourceMemory), string(apiconsts.ReclaimedResourceMemory), string(apiconsts.ResourceGPUMemory): - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - case string(apiconsts.ResourceNetBandwidth): - if req.Annotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue { - general.Infof("detect %s: %s, return %s: 0 instead of %s: %.2f", - PodAnnotationQuantityFromQRMDeclarationKey, PodAnnotationQuantityFromQRMDeclarationTrue, key, key, req.ResourceRequests[key]) - return 0, 0, nil - } - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - default: - return 0, 0, fmt.Errorf("invalid request resource name: %s", key) + return GetQuantityFromResourceRequests(req.ResourceRequests, req.ResourceName, IsQuantityFromQRMDeclaration(req.Annotations)) +} + +func GetQuantityFromResourceRequests(resourceRequests map[string]float64, resourceName string, isQuantityFromQRMDeclaration bool) (int, float64, error) { + quantity, ok := resourceRequests[resourceName] + if !ok { + return 0, 0, errors.NewNotFound(schema.GroupResource{}, resourceName) + } + + switch resourceName { + case string(apiconsts.ReclaimedResourceMilliCPU): + return general.Max(int(math.Ceil(quantity/1000.0)), 0), quantity / 1000.0, nil + case string(apiconsts.ResourceNetBandwidth): + if isQuantityFromQRMDeclaration { + general.Infof("detect %s: %s, return %s: 0 instead of %s: %.2f", + PodAnnotationQuantityFromQRMDeclarationKey, PodAnnotationQuantityFromQRMDeclarationTrue, resourceName, resourceName, quantity) + return 0, 0, nil } + return general.Max(int(math.Ceil(quantity)), 0), quantity, nil + default: + return general.Max(int(math.Ceil(quantity)), 0), quantity, nil } +} - return 0, 0, fmt.Errorf("unexpected end") +func IsQuantityFromQRMDeclaration(podAnnotations map[string]string) bool { + return podAnnotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue } // IsDebugPod returns true if the pod annotations show up any configurable debug key diff --git a/pkg/agent/qrm-plugins/util/util_test.go b/pkg/agent/qrm-plugins/util/util_test.go index ddcd12377c..fe8f85a0f5 100644 --- a/pkg/agent/qrm-plugins/util/util_test.go +++ b/pkg/agent/qrm-plugins/util/util_test.go @@ -17,12 +17,13 @@ limitations under the License. 
package util import ( - "fmt" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" @@ -41,6 +42,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }{ { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceCPU), ResourceRequests: map[string]float64{ string(v1.ResourceCPU): 123, }, @@ -49,6 +51,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ReclaimedResourceMilliCPU), ResourceRequests: map[string]float64{ string(consts.ReclaimedResourceMilliCPU): 234001, }, @@ -57,6 +60,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceMemory), ResourceRequests: map[string]float64{ string(v1.ResourceMemory): 256, }, @@ -65,6 +69,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ReclaimedResourceMemory), ResourceRequests: map[string]float64{ string(consts.ReclaimedResourceMemory): 1345, }, @@ -73,18 +78,19 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceCPU), ResourceRequests: map[string]float64{ "test": 1345, }, }, - err: fmt.Errorf("invalid request resource name: %s", "test"), + err: errors.NewNotFound(schema.GroupResource{}, string(v1.ResourceCPU)), }, } for _, tc := range testCases { res, _, err := GetQuantityFromResourceReq(tc.req) if tc.err != nil { - as.NotNil(err) + as.Equal(tc.err, err) } else { as.EqualValues(tc.result, res) } diff --git a/pkg/util/machine/gpu.go b/pkg/util/machine/gpu.go index 1d40338da9..a5317af647 100644 --- a/pkg/util/machine/gpu.go +++ b/pkg/util/machine/gpu.go @@ -17,12 +17,16 @@ limitations under the License. 
package machine import ( + "fmt" + "sync" + "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/native" ) type GPUTopologyProvider interface { - GetGPUTopology() (*GPUTopology, error) + GetGPUTopology() (*GPUTopology, bool, error) + SetGPUTopology(*GPUTopology) error } type GPUTopology struct { @@ -30,23 +34,67 @@ type GPUTopology struct { } type GPUInfo struct { - NUMANode int + Health string + NUMANode []int +} + +func (i GPUInfo) GetNUMANode() []int { + if i.NUMANode == nil { + return []int{} + } + return i.NUMANode } -type kubeletCheckpointGPUTopologyProvider struct { +type gpuTopologyProviderImpl struct { + mutex sync.RWMutex resourceNames []string + + gpuTopology *GPUTopology + numaTopologyReady bool } -func NewKubeletCheckpointGPUTopologyProvider(resourceNames []string) GPUTopologyProvider { - return &kubeletCheckpointGPUTopologyProvider{ +func NewGPUTopologyProvider(resourceNames []string) GPUTopologyProvider { + gpuTopology, err := initGPUTopology(resourceNames) + if err != nil { + gpuTopology = getEmptyGPUTopology() + general.Warningf("initGPUTopology failed with error: %v", err) + } else { + general.Infof("initGPUTopology success: %v", gpuTopology) + } + + return &gpuTopologyProviderImpl{ resourceNames: resourceNames, + gpuTopology: gpuTopology, } } -func (p *kubeletCheckpointGPUTopologyProvider) GetGPUTopology() (*GPUTopology, error) { - gpuTopology := &GPUTopology{ +func getEmptyGPUTopology() *GPUTopology { + return &GPUTopology{ GPUs: make(map[string]GPUInfo), } +} + +func (p *gpuTopologyProviderImpl) SetGPUTopology(gpuTopology *GPUTopology) error { + p.mutex.Lock() + defer p.mutex.Unlock() + if gpuTopology == nil { + return fmt.Errorf("gpuTopology is nil") + } + + p.gpuTopology = gpuTopology + p.numaTopologyReady = checkNUMATopologyReady(gpuTopology) + return nil +} + +func (p *gpuTopologyProviderImpl) GetGPUTopology() (*GPUTopology, bool, error) { + p.mutex.RLock() + defer p.mutex.RUnlock() + + return p.gpuTopology, p.numaTopologyReady, nil +} + +func initGPUTopology(resourceNames []string) (*GPUTopology, error) { + gpuTopology := getEmptyGPUTopology() kubeletCheckpoint, err := native.GetKubeletCheckpoint() if err != nil { @@ -55,19 +103,30 @@ func (p *kubeletCheckpointGPUTopologyProvider) GetGPUTopology() (*GPUTopology, e } _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() - for _, resourceName := range p.resourceNames { + for _, resourceName := range resourceNames { gpuDevice, ok := registeredDevs[resourceName] if !ok { continue } for _, id := range gpuDevice { - gpuTopology.GPUs[id] = GPUInfo{ - // TODO: get NUMA node from Kubelet - NUMANode: 0, - } + // TODO: get NUMA node from + gpuTopology.GPUs[id] = GPUInfo{} } } return gpuTopology, nil } + +func checkNUMATopologyReady(topology *GPUTopology) bool { + if topology == nil { + return false + } + + for _, gpu := range topology.GPUs { + if gpu.NUMANode == nil { + return false + } + } + return true +} From c76e134d333d2273a5f5d852435e1ac3fb5da1d3 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 25 Jul 2025 17:58:10 +0800 Subject: [PATCH 04/52] refactor(cpu): remove unused preferredHintIndexes variable The preferredHintIndexes variable was declared but never used in the code. Removing it improves code clarity and maintainability. 
--- .../qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go index 471f737075..48ad1a609a 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go @@ -259,7 +259,6 @@ func (p *DynamicPolicy) calculateHints( numaBound = minNUMAsCountNeeded + 1 } - preferredHintIndexes := []int{} var availableNumaHints []*pluginapi.TopologyHint machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) { maskCount := mask.Count() @@ -295,10 +294,6 @@ func (p *DynamicPolicy) calculateHints( Nodes: machine.MaskToUInt64Array(mask), Preferred: preferred, }) - - if preferred { - preferredHintIndexes = append(preferredHintIndexes, len(availableNumaHints)-1) - } }) // todo support numa_binding without numa_exclusive in the future From 1e8bfb2bbf0c62a65b286112f9e7fc098e841041 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 25 Jul 2025 23:15:10 +0800 Subject: [PATCH 05/52] feat(resource-plugin): add associated device allocation support Add new functionality to handle associated device allocation requests in the resource plugin stub. This includes: - Adding new stub function type and default implementation - Extending the Stub struct with new field - Adding new methods for associated device operations --- .../orm/endpoint/resource_plugin_stub.go | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/pkg/agent/orm/endpoint/resource_plugin_stub.go b/pkg/agent/orm/endpoint/resource_plugin_stub.go index 990118f7ed..51b3945d3f 100644 --- a/pkg/agent/orm/endpoint/resource_plugin_stub.go +++ b/pkg/agent/orm/endpoint/resource_plugin_stub.go @@ -52,6 +52,9 @@ type Stub struct { getTopologyAwareAllocatableResourcesFunc stubGetTopologyAwareAllocatableResourcesFunc getTopologyAwareResourcesFunc stubGetTopologyAwareResourcesFunc + // allocAssociateDeviceFunc is used for handling associated device allocation requests. + allocAssociateDeviceFunc stubAllocAssociatedDeviceFunc + registrationStatus chan watcherapi.RegistrationStatus // for testing endpoint string // for testing } @@ -69,6 +72,8 @@ type stubGetTopologyAwareAllocatableResourcesFunc func(r *pluginapi.GetTopologyA type stubGetTopologyAwareResourcesFunc func(r *pluginapi.GetTopologyAwareResourcesRequest) (*pluginapi.GetTopologyAwareResourcesResponse, error) +type stubAllocAssociatedDeviceFunc func(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) + func defaultAllocFunc(r *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) { var response pluginapi.ResourceAllocationResponse @@ -80,6 +85,11 @@ func defaultGetAllocFunc(r *pluginapi.GetResourcesAllocationRequest) (*pluginapi return &response, nil } +func defaultAllocateAssociatedDeviceFunc(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + var response pluginapi.AssociatedDeviceAllocationResponse + return &response, nil +} + // NewResourcePluginStub returns an initialized ResourcePlugin Stub. 
func NewResourcePluginStub(socket string, name string, preStartContainerFlag bool) *Stub {
 	return &Stub{
@@ -89,8 +99,9 @@ func NewResourcePluginStub(socket string, name string, preStartContainerFlag boo
 
 		stop: make(chan interface{}),
 
-		allocFunc1: defaultAllocFunc,
-		allocFunc2: defaultGetAllocFunc,
+		allocFunc1:               defaultAllocFunc,
+		allocFunc2:               defaultGetAllocFunc,
+		allocAssociateDeviceFunc: defaultAllocateAssociatedDeviceFunc,
 	}
 }
 
@@ -103,6 +114,11 @@ func (m *Stub) SetGetAllocFunc(f stubAllocFunc2) {
 	m.allocFunc2 = f
 }
 
+// SetAssociatedDeviceFunc sets the allocation function for associated devices.
+func (m *Stub) SetAssociatedDeviceFunc(f stubAllocAssociatedDeviceFunc) {
+	m.allocAssociateDeviceFunc = f
+}
+
 func (m *Stub) SetGetTopologyAwareAllocatableResourcesFunc(f stubGetTopologyAwareAllocatableResourcesFunc) {
 	m.getTopologyAwareAllocatableResourcesFunc = f
 }
@@ -289,9 +305,21 @@ func (m *Stub) GetPodTopologyHints(ctx context.Context, r *pluginapi.PodResource
 	return &pluginapi.PodResourceHintsResponse{}, nil
 }
 
-// Notify the resource plugin that the pod has beed deleted,
+// RemovePod notifies the resource plugin that the pod has been deleted,
 // and the plugin should do some clear-up work.
 func (m *Stub) RemovePod(ctx context.Context, r *pluginapi.RemovePodRequest) (*pluginapi.RemovePodResponse, error) {
 	log.Printf("RemovePod, %+v", r)
 	return &pluginapi.RemovePodResponse{}, nil
 }
+
+func (m *Stub) UpdateAllocatableAssociatedDevices(ctx context.Context, r *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	log.Printf("UpdateAllocatableAssociatedDevices, %+v", r)
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
+
+// AllocateAssociatedDevice is the gRPC implementation of the allocation function for associated devices.
+// It calls the registered allocation function. 
+func (m *Stub) AllocateAssociatedDevice(ctx context.Context, r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + log.Printf("AllocateAssociatedDevice, %+v", r) + return m.allocAssociateDeviceFunc(r) +} From 7ef74f59f6999316b7bb1f4dce6281f1b8211f2f Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 25 Jul 2025 23:50:07 +0800 Subject: [PATCH 06/52] refactor(qrm-plugins): embed UnimplementedResourcePluginServer in policy structs Add pluginapi.UnimplementedResourcePluginServer to all policy structs to ensure forward compatibility with gRPC interface changes --- pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go | 2 ++ pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go | 2 ++ pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go | 1 + pkg/agent/qrm-plugins/io/staticpolicy/policy.go | 1 + pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go | 1 + pkg/agent/qrm-plugins/network/staticpolicy/policy.go | 1 + 6 files changed, 8 insertions(+) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go index d1de701337..8f1d17d1f4 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go @@ -85,6 +85,8 @@ const ( // and adjust resource requirements and configurations type DynamicPolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer + name string stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go b/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go index e220770356..3417f17615 100644 --- a/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go @@ -58,6 +58,8 @@ const ( // NativePolicy is a policy compatible with Kubernetes native semantics and is used in topology-aware scheduling scenarios. 
type NativePolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer + name string stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 8a3b871c3d..6a54a7dc1a 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -58,6 +58,7 @@ const ( // StaticPolicy is the static gpu policy type StaticPolicy struct { sync.Mutex + pluginapi.UnimplementedResourcePluginServer name string stopCh chan struct{} diff --git a/pkg/agent/qrm-plugins/io/staticpolicy/policy.go b/pkg/agent/qrm-plugins/io/staticpolicy/policy.go index f0e11f9d91..39c943b217 100644 --- a/pkg/agent/qrm-plugins/io/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/io/staticpolicy/policy.go @@ -48,6 +48,7 @@ const ( // StaticPolicy is the static io policy type StaticPolicy struct { sync.Mutex + pluginapi.UnimplementedResourcePluginServer name string stopCh chan struct{} diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go index 6db20be45e..9b5a83f67f 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go @@ -96,6 +96,7 @@ const ( type DynamicPolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go index 328984c592..ece3b452a9 100644 --- a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go @@ -78,6 +78,7 @@ const ( // StaticPolicy is the static network policy type StaticPolicy struct { sync.Mutex + pluginapi.UnimplementedResourcePluginServer name string stopCh chan struct{} From 4470d0221d00fc6bda6a1917cd5f94021dc1e725 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 1 Aug 2025 02:48:12 +0800 Subject: [PATCH 07/52] feat(gpu): add associated device topology hints support Implement GetAssociatedDeviceTopologyHints method in StaticPolicy and update stub to handle topology hints requests. Also update kubelet dependency version and rename mustIncludeDevices to reusableDevices for clarity. 
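
The intended ordering behind the rename can be summarized with a small
self-contained sketch (simplified types; pickDevices and hasMemory are
illustrative names, not the plugin's actual API): reusable devices, i.e.
devices the container already holds, are taken unconditionally first, and
the remaining demand is filled from available devices that still pass a
per-device memory check.

    package main

    import "fmt"

    // pickDevices takes reusable devices first and unconditionally, then fills
    // the remaining demand from available devices that still have memory left.
    func pickDevices(reusable, available []string, need int, hasMemory func(string) bool) ([]string, error) {
    	picked := make([]string, 0, need)
    	seen := map[string]bool{}
    	for _, dev := range reusable {
    		if !seen[dev] {
    			seen[dev] = true
    			picked = append(picked, dev)
    		}
    		if len(picked) == need {
    			return picked, nil
    		}
    	}
    	for _, dev := range available {
    		if seen[dev] || !hasMemory(dev) {
    			continue
    		}
    		seen[dev] = true
    		picked = append(picked, dev)
    		if len(picked) == need {
    			return picked, nil
    		}
    	}
    	return nil, fmt.Errorf("not enough devices: picked %d, need %d", len(picked), need)
    }

    func main() {
    	free := map[string]bool{"gpu-0": true, "gpu-2": true}
    	devs, _ := pickDevices([]string{"gpu-1"}, []string{"gpu-0", "gpu-2"}, 2,
    		func(d string) bool { return free[d] })
    	fmt.Println(devs) // [gpu-1 gpu-0]
    }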
--- go.mod | 4 +-- go.sum | 3 -- .../orm/endpoint/resource_plugin_stub.go | 29 ++++++++++++++----- .../qrm-plugins/gpu/staticpolicy/policy.go | 8 +++-- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index c576c897b9..2fa319fe2c 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6 + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 @@ -197,7 +197,7 @@ replace ( k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6 k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6 k8s.io/kubectl => k8s.io/kubectl v0.24.6 - k8s.io/kubelet => github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99 + k8s.io/kubelet => github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15 k8s.io/kubernetes => k8s.io/kubernetes v1.24.6 k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6 k8s.io/metrics => k8s.io/metrics v0.24.6 diff --git a/go.sum b/go.sum index d7eee09def..9e9533201d 100644 --- a/go.sum +++ b/go.sum @@ -574,8 +574,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc= -github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/libopenstorage/openstorage v1.0.0/go.mod h1:Sp1sIObHjat1BeXhfMqLZ14wnOzEhNx2YQedreMcUyc= @@ -589,7 +587,6 @@ github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6 h1:4AOW5G github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99 h1:9QEucppxPGGOKoJShUBGlSIDT/w49zZU6gjit546N1I= github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= -github.com/luomingmeng/katalyst-api v0.0.0-20250725113014-7d4e01c67cf5/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/orm/endpoint/resource_plugin_stub.go b/pkg/agent/orm/endpoint/resource_plugin_stub.go index 51b3945d3f..425189b2fe 100644 --- a/pkg/agent/orm/endpoint/resource_plugin_stub.go +++ b/pkg/agent/orm/endpoint/resource_plugin_stub.go @@ -52,8 +52,10 @@ type Stub struct { getTopologyAwareAllocatableResourcesFunc stubGetTopologyAwareAllocatableResourcesFunc getTopologyAwareResourcesFunc stubGetTopologyAwareResourcesFunc - // allocAssociateDeviceFunc is used for handling associated device allocation requests. 
-	allocAssociateDeviceFunc stubAllocAssociatedDeviceFunc
+	// allocAssociatedDeviceFunc is used for handling associated device allocation requests.
+	allocAssociatedDeviceFunc stubAllocAssociatedDeviceFunc
+	// getAssociatedTopologyHintsFunc is used for handling associated device topology hints requests.
+	getAssociatedTopologyHintsFunc stubGetAssociatedDeviceTopologyHintsFunc
 
 	registrationStatus chan watcherapi.RegistrationStatus // for testing
 	endpoint           string                             // for testing
@@ -74,6 +76,8 @@ type stubGetTopologyAwareResourcesFunc func(r *pluginapi.GetTopologyAwareResourc
 
 type stubAllocAssociatedDeviceFunc func(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error)
 
+type stubGetAssociatedDeviceTopologyHintsFunc func(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error)
+
 func defaultAllocFunc(r *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) {
 	var response pluginapi.ResourceAllocationResponse
 
@@ -90,6 +94,11 @@ func defaultAllocateAssociatedDeviceFunc(r *pluginapi.AssociatedDeviceRequest) (
 	return &response, nil
 }
 
+func defaultGetAssociatedDeviceTopologyHintsFunc(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
+	var response pluginapi.AssociatedDeviceHintsResponse
+	return &response, nil
+}
+
 // NewResourcePluginStub returns an initialized ResourcePlugin Stub.
 func NewResourcePluginStub(socket string, name string, preStartContainerFlag bool) *Stub {
 	return &Stub{
@@ -99,9 +108,10 @@ func NewResourcePluginStub(socket string, name string, preStartContainerFlag boo
 
 		stop: make(chan interface{}),
 
-		allocFunc1:               defaultAllocFunc,
-		allocFunc2:               defaultGetAllocFunc,
-		allocAssociateDeviceFunc: defaultAllocateAssociatedDeviceFunc,
+		allocFunc1:                     defaultAllocFunc,
+		allocFunc2:                     defaultGetAllocFunc,
+		allocAssociatedDeviceFunc:      defaultAllocateAssociatedDeviceFunc,
+		getAssociatedTopologyHintsFunc: defaultGetAssociatedDeviceTopologyHintsFunc,
 	}
 }
 
@@ -116,7 +126,7 @@ func (m *Stub) SetGetAllocFunc(f stubAllocFunc2) {
 
 // SetAssociatedDeviceFunc sets the allocation function for associated devices.
 func (m *Stub) SetAssociatedDeviceFunc(f stubAllocAssociatedDeviceFunc) {
-	m.allocAssociateDeviceFunc = f
+	m.allocAssociatedDeviceFunc = f
 }
 
 func (m *Stub) SetGetTopologyAwareAllocatableResourcesFunc(f stubGetTopologyAwareAllocatableResourcesFunc) {
@@ -321,5 +331,10 @@ func (m *Stub) UpdateAllocatableAssociatedDevices(ctx context.Context, r *plugin
 // It calls the registered allocation function. 
func (m *Stub) AllocateAssociatedDevice(ctx context.Context, r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { log.Printf("AllocateAssociatedDevice, %+v", r) - return m.allocAssociateDeviceFunc(r) + return m.allocAssociatedDeviceFunc(r) +} + +func (m *Stub) GetAssociatedDeviceTopologyHints(ctx context.Context, request *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + log.Printf("GetAssociatedDeviceTopologyHints, %+v", request) + return m.getAssociatedTopologyHintsFunc(request) } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 6a54a7dc1a..4f24ee77be 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -881,6 +881,10 @@ func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(_ context.Context, req return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil } +func (*StaticPolicy) GetAssociatedDeviceTopologyHints(_ context.Context, _ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { if req == nil || req.ResourceRequest == nil || req.DeviceRequest == nil { return nil, fmt.Errorf("req is nil") @@ -1017,10 +1021,10 @@ func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopolo } availableDevices := request.DeviceRequest.GetAvailableDevices() - mustIncludeDevices := request.DeviceRequest.GetMustIncludeDevices() + reusableDevices := request.DeviceRequest.GetReusableDevices() // allocate must include devices first - for _, device := range mustIncludeDevices { + for _, device := range reusableDevices { if machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) From fe70af9582c6f2cff83a3c62d83f6f347544fcc8 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 4 Aug 2025 15:19:06 +0800 Subject: [PATCH 08/52] fix typo and add logs --- go.mod | 2 +- go.sum | 8 ++--- .../qrm-plugins/gpu/staticpolicy/policy.go | 29 +++++++++++++++++-- pkg/util/machine/gpu.go | 2 +- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 2fa319fe2c..f5da2b106f 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4 + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index 9e9533201d..0e6016b3c1 100644 --- a/go.sum +++ b/go.sum @@ -583,10 +583,10 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc 
v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= -github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6 h1:4AOW5GQJ9NxsN9hVMrQYCxMdjgbF/rqDgvCeHvnmo8Q= -github.com/luomingmeng/katalyst-api v0.0.0-20250725114827-eaad320cf0d6/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= -github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99 h1:9QEucppxPGGOKoJShUBGlSIDT/w49zZU6gjit546N1I= -github.com/luomingmeng/kubelet v0.0.0-20250715133526-b1ac20d8dc99/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= +github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4 h1:x3xNQhK02NrLrxz+g5Rvs3Su1wl+bYtP+JwPmPjRUvE= +github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= +github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15 h1:QFm2/NiJscpQyVm8ejkuGiWVd7dUVL+WooTN5OxuG6A= +github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 4f24ee77be..c184e66038 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -217,10 +217,12 @@ func (p *StaticPolicy) GetTopologyHints(_ context.Context, gpuCount, gpuNames, err := p.getGPUCount(req) if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) } if gpuCount == 0 { + general.Errorf("getGPUCount failed from req %v with error: no GPU resources found", req) return nil, fmt.Errorf("no GPU resources found") } @@ -460,6 +462,7 @@ func (p *StaticPolicy) Allocate(_ context.Context, gpuCount, gpuNames, err := p.getGPUCount(req) if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) } @@ -843,7 +846,7 @@ func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machine } func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(_ context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { - if request == nil { + if request == nil || len(request.Devices) == 0 { return nil, fmt.Errorf("request is nil") } @@ -912,6 +915,21 @@ func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginap return nil, err } + general.InfoS("called", + "podNamespace", req.ResourceRequest.PodNamespace, + "podName", req.ResourceRequest.PodName, + "containerName", req.ResourceRequest.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.ResourceRequest.Annotations, + "resourceRequests", req.ResourceRequest.ResourceRequests, + "deviceName", req.DeviceRequest.DeviceName, + "resourceHint", req.ResourceRequest.Hint, + "deviceHint", req.DeviceRequest.Hint, + "availableDevices", req.DeviceRequest.AvailableDevices, + "reusableDevices", req.DeviceRequest.ReusableDevices, + "deviceRequest", req.DeviceRequest.DeviceRequest, + ) + allocationInfo := p.state.GetAllocationInfo(req.ResourceRequest.PodUid, 
req.ResourceRequest.ContainerName) if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) @@ -986,6 +1004,13 @@ func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginap p.state.SetMachineState(machineState, true) + general.InfoS("allocated devices", + "podNamespace", req.ResourceRequest.PodNamespace, + "podName", req.ResourceRequest.PodName, + "containerName", req.ResourceRequest.ContainerName, + "qosLevel", qosLevel, + "allocatedDevices", allocatedDevices) + return &pluginapi.AssociatedDeviceAllocationResponse{ AllocationResult: &pluginapi.AssociatedDeviceAllocation{ AllocatedDevices: allocatedDevices, @@ -1045,7 +1070,7 @@ func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopolo } // check if gpu's numa node is the subset of hint nodes - // todo support + // todo support multi numa node if machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) { return true } diff --git a/pkg/util/machine/gpu.go b/pkg/util/machine/gpu.go index a5317af647..ad232e51ca 100644 --- a/pkg/util/machine/gpu.go +++ b/pkg/util/machine/gpu.go @@ -110,7 +110,7 @@ func initGPUTopology(resourceNames []string) (*GPUTopology, error) { } for _, id := range gpuDevice { - // TODO: get NUMA node from + // get NUMA node from UpdateAllocatableAssociatedDevices gpuTopology.GPUs[id] = GPUInfo{} } } From 11f153ef67d52fea754bb5bd94bcd6e32615edb6 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 4 Aug 2025 23:57:44 +0800 Subject: [PATCH 09/52] refactor(gpu): remove redundant non-numa-affinity gpu allocation logic --- .../qrm-plugins/gpu/staticpolicy/policy.go | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index c184e66038..de1a699cf9 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -1098,26 +1098,5 @@ func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopolo } } - // final allocate available and non-numa-affinity gpus - for _, device := range availableDevices { - if allocatedDevices.Has(device) { - continue - } - - if isNUMAAffinityDevice(device) { - continue - } - - if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { - general.Infof("available non-numa-affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) - continue - } - - if allocateDevices(device) { - return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil - } - } - return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices)) } From ea554e669301afec0cabd76322d9a86e579fa5dd Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 22 Sep 2025 19:05:17 +0800 Subject: [PATCH 10/52] feat(gpu): optimize GPU allocation by preferring NUMA nodes with most allocated memory Add tracking of allocated GPU memory per NUMA node and modify hint calculation to prefer nodes with most allocated memory. This helps balance GPU memory usage across NUMA nodes. 
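
The rule can be sketched as follows (simplified hint type; keepMostAllocated
is an illustrative name, not the plugin's API): among all preferred hints,
only the hint whose NUMA nodes already carry the highest allocated GPU memory
stays preferred, so new containers are packed onto partially used GPUs and
less-used GPUs stay free for future large requests.

    package main

    import "fmt"

    type hint struct {
    	nodes     []int
    	preferred bool
    }

    // keepMostAllocated demotes every preferred hint except the one whose NUMA
    // nodes carry the most allocated GPU memory. allocated maps a NUMA node id
    // to the largest amount of memory already allocated on any of its GPUs.
    func keepMostAllocated(hints []*hint, allocated map[int]float64) {
    	best, bestMem := -1, 0.0
    	for i, h := range hints {
    		if !h.preferred {
    			continue
    		}
    		mem := 0.0
    		for _, n := range h.nodes {
    			if allocated[n] > mem {
    				mem = allocated[n]
    			}
    		}
    		if best == -1 || mem > bestMem {
    			best, bestMem = i, mem
    		}
    	}
    	if best < 0 {
    		return
    	}
    	for i, h := range hints {
    		if h.preferred && i != best {
    			h.preferred = false
    		}
    	}
    }

    func main() {
    	hints := []*hint{{nodes: []int{0}, preferred: true}, {nodes: []int{1}, preferred: true}}
    	keepMostAllocated(hints, map[int]float64{0: 10, 1: 40})
    	fmt.Println(hints[0].preferred, hints[1].preferred) // false true
    }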
--- .../qrm-plugins/gpu/staticpolicy/policy.go | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index de1a699cf9..136cf28559 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -19,6 +19,7 @@ package staticpolicy import ( "context" "fmt" + "math" "sort" "sync" "time" @@ -430,7 +431,7 @@ func (p *StaticPolicy) GetResourcePluginOptions(context.Context, return &pluginapi.ResourcePluginOptions{ PreStartRequired: false, WithTopologyAlignment: true, - NeedReconcile: true, + NeedReconcile: false, AssociatedDevices: p.resourceNames.List(), }, nil } @@ -757,6 +758,7 @@ func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machine general.Infof("gpuMemory: %f, gpuReq: %f, perGPUMemory: %f", gpuMemory, gpuReq, perGPUMemory) numaToAvailableGPUCount := make(map[int]float64) + numaToMostAllocatedGPUMemory := make(map[int]float64) for gpuID, s := range machineState { if s == nil { continue @@ -770,6 +772,7 @@ func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machine for _, numaNode := range info.GetNUMANode() { numaToAvailableGPUCount[numaNode] += 1 + numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetGPUMemoryAllocated(), numaToMostAllocatedGPUMemory[numaNode]) } } } @@ -829,6 +832,9 @@ func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machine }) }) + // prefer numa nodes with most allocated gpu memory + p.preferGPUMemoryMostAllocatedHints(availableNumaHints, numaToMostAllocatedGPUMemory) + // NOTE: because grpc is inability to distinguish between an empty array and nil, // we return an error instead of an empty array. // we should resolve this issue if we need to manage multi-resource in one plugin. 
@@ -1077,6 +1083,10 @@ func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopolo return false } + sort.SliceStable(availableDevices, func(i, j int) bool { + return machineState.GPUMemoryAllocated(availableDevices[i]) > machineState.GPUMemoryAllocated(availableDevices[j]) + }) + // second allocate available and numa-affinity gpus for _, device := range availableDevices { if allocatedDevices.Has(device) { @@ -1100,3 +1110,40 @@ func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopolo return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices)) } + +func (p *StaticPolicy) preferGPUMemoryMostAllocatedHints(hints []*pluginapi.TopologyHint, numaToMostAllocatedGPUMemory map[int]float64) { + hintGPUMemoryMostAllocated := make(map[int]float64) + for index, hint := range hints { + if !hint.Preferred { + continue + } + + gpuMemoryMostAllocated := float64(0) + for _, nodeID := range hint.Nodes { + gpuMemoryMostAllocated = math.Max(gpuMemoryMostAllocated, numaToMostAllocatedGPUMemory[int(nodeID)]) + } + hintGPUMemoryMostAllocated[index] = gpuMemoryMostAllocated + } + + mostAllocatedHintIndex := -1 + for index, hint := range hints { + if !hint.Preferred { + continue + } + + if mostAllocatedHintIndex == -1 || hintGPUMemoryMostAllocated[index] > hintGPUMemoryMostAllocated[mostAllocatedHintIndex] { + mostAllocatedHintIndex = index + } + } + + if mostAllocatedHintIndex < 0 { + return + } + + for index, hint := range hints { + if !hint.Preferred || mostAllocatedHintIndex == index { + continue + } + hint.Preferred = false + } +} From 7e1e3b3aa53b1505d702353a4956837a030bb844 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Wed, 1 Oct 2025 15:50:48 +0800 Subject: [PATCH 11/52] feat: refactor code into resource plugins and custom device plugins --- .../app/options/qrm/gpu_plugin.go | 3 + pkg/agent/qrm-plugins/gpu/baseplugin/base.go | 243 +++++ pkg/agent/qrm-plugins/gpu/consts/consts.go | 13 +- .../qrm-plugins/gpu/customdeviceplugin/gpu.go | 196 ++++ .../gpu/customdeviceplugin/interface.go | 31 + .../gpu/customdeviceplugin/registry.go | 31 + .../qrm-plugins/gpu/resourceplugin/gpu_mem.go | 484 ++++++++++ .../gpu/resourceplugin/interface.go | 36 + .../gpu/resourceplugin/registry.go | 33 + .../qrm-plugins/gpu/staticpolicy/policy.go | 855 +++--------------- pkg/config/agent/qrm/gpu_plugin.go | 2 + 11 files changed, 1188 insertions(+), 739 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/base.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go create mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go create mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go create mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go index 5ac43aa1cb..9d677f0914 100644 --- a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go @@ -28,6 +28,7 @@ type GPUOptions struct { GPUResourceNames []string GPUMemoryAllocatablePerGPU string SkipGPUStateCorruption bool + ResourcePluginsNames []string } func NewGPUOptions() *GPUOptions { @@ -48,6 +49,7 @@ func (o *GPUOptions) AddFlags(fss 
*cliflag.NamedFlagSets) { o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, e.g. 100") fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption", o.SkipGPUStateCorruption, "skip gpu state corruption, and it will be used after updating state properties") + fs.StringSliceVar(&o.ResourcePluginsNames, "gpu-resource-plugins-names", o.ResourcePluginsNames, "The names of the enabled gpu resource plugins") } func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { @@ -59,5 +61,6 @@ func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { } conf.GPUMemoryAllocatablePerGPU = gpuMemory conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption + conf.ResourcePluginsNames = o.ResourcePluginsNames return nil } diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go new file mode 100644 index 0000000000..33d27d7149 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -0,0 +1,243 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baseplugin + +import ( + "context" + "fmt" + "sort" + "sync" + + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/config/generic" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" +) + +const ( + GPUPluginStateFileName = "gpu_plugin_state" +) + +// BasePlugin is a shared plugin that provides common functionalities and fields for GPU resource plugins and custom device plugins. 
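+// A single BasePlugin instance is intended to be shared by both plugin kinds,
+// so that they operate on the same checkpoint state and GPU topology view.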
+type BasePlugin struct { + sync.Mutex + + QosConfig *generic.QoSConfiguration + QrmConfig *qrm.QRMPluginsConfiguration + GpuTopologyProvider machine.GPUTopologyProvider + + Emitter metrics.MetricEmitter + MetaServer *metaserver.MetaServer + AgentCtx *agent.GenericContext + State state.State + + PodAnnotationKeptKeys []string + PodLabelKeptKeys []string + AssociatedDevicesName sets.String +} + +func NewBasePlugin( + agentCtx *agent.GenericContext, conf *config.Configuration, wrappedEmitter metrics.MetricEmitter, +) (*BasePlugin, error) { + gpuTopologyProvider := machine.NewGPUTopologyProvider(conf.GPUResourceNames) + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, + gpuconsts.GPUResourcePluginPolicyNameStatic, gpuTopologyProvider, conf.SkipGPUStateCorruption, wrappedEmitter) + + if err != nil { + return nil, err + } + + return &BasePlugin{ + QosConfig: conf.QoSConfiguration, + QrmConfig: conf.QRMPluginsConfiguration, + GpuTopologyProvider: gpuTopologyProvider, + + Emitter: wrappedEmitter, + MetaServer: agentCtx.MetaServer, + AgentCtx: agentCtx, + State: stateImpl, + + PodAnnotationKeptKeys: conf.PodAnnotationKeptKeys, + PodLabelKeptKeys: conf.PodLabelKeptKeys, + AssociatedDevicesName: sets.NewString(conf.GPUResourceNames...), + }, nil +} + +func (p *BasePlugin) GetGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { + gpuCount := float64(0) + gpuNames := sets.NewString() + for resourceName := range p.AssociatedDevicesName { + _, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) + if err != nil && !errors.IsNotFound(err) { + return 0, nil, err + } + gpuCount += request + gpuNames.Insert(resourceName) + } + return gpuCount, gpuNames, nil +} + +func (p *BasePlugin) GetResourcePluginOptions( + context.Context, *pluginapi.Empty, +) (*pluginapi.ResourcePluginOptions, error) { + return &pluginapi.ResourcePluginOptions{ + PreStartRequired: false, + WithTopologyAlignment: true, + NeedReconcile: false, + AssociatedDevices: p.AssociatedDevicesName.List(), + }, nil +} + +func (p *BasePlugin) PackAllocationResponse( + req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, + resourceAllocationAnnotations map[string]string, resourceName string, +) (*pluginapi.ResourceAllocationResponse, error) { + if allocationInfo == nil { + return nil, fmt.Errorf("packAllocationResponse got nil allocationInfo") + } else if req == nil { + return nil, fmt.Errorf("packAllocationResponse got nil request") + } + + return &pluginapi.ResourceAllocationResponse{ + PodUid: req.PodUid, + PodNamespace: req.PodNamespace, + PodName: req.PodName, + ContainerName: req.ContainerName, + ContainerType: req.ContainerType, + ContainerIndex: req.ContainerIndex, + PodRole: req.PodRole, + PodType: req.PodType, + ResourceName: req.ResourceName, + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + resourceName: { + IsNodeResource: true, + IsScalarResource: true, // to avoid re-allocating + AllocatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, + Annotations: resourceAllocationAnnotations, + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + req.Hint, + }, + }, + }, + }, + }, + Labels: general.DeepCopyMap(req.Labels), + Annotations: general.DeepCopyMap(req.Annotations), + }, nil +} + +func (p *BasePlugin) CalculateAssociatedDevices( + 
gpuTopology *machine.GPUTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet,
+	request *pluginapi.AssociatedDeviceRequest,
+) ([]string, map[string]float64, error) {
+	gpuRequest := request.DeviceRequest.GetDeviceRequest()
+	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
+
+	machineState := p.State.GetMachineState()
+
+	allocatedDevices := sets.NewString()
+	needed := gpuRequest
+	allocateDevices := func(devices ...string) bool {
+		for _, device := range devices {
+			allocatedDevices.Insert(device)
+			needed--
+			if needed == 0 {
+				return true
+			}
+		}
+		return false
+	}
+
+	allocatedGPUMemory := func(devices ...string) map[string]float64 {
+		memory := make(map[string]float64)
+		for _, device := range devices {
+			memory[device] = gpuMemoryPerGPU
+		}
+		return memory
+	}
+
+	availableDevices := request.DeviceRequest.GetAvailableDevices()
+	reusableDevices := request.DeviceRequest.GetReusableDevices()
+
+	// allocate reusable devices first, even if they cannot fully satisfy the per-GPU memory request
+	for _, device := range reusableDevices {
+		if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) {
+			general.Warningf("reusable gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU)
+		}
+		allocateDevices(device)
+	}
+
+	// if the allocated devices are enough, return immediately
+	if allocatedDevices.Len() >= int(gpuRequest) {
+		return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+	}
+
+	isNUMAAffinityDevice := func(device string) bool {
+		info, ok := gpuTopology.GPUs[device]
+		if !ok {
+			general.Errorf("failed to find hint node for device %s", device)
+			return false
+		}
+
+		// check if gpu's numa node is the subset of hint nodes
+		// todo support multi numa node
+		if machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) {
+			return true
+		}
+		return false
+	}
+
+	sort.SliceStable(availableDevices, func(i, j int) bool {
+		return machineState.GPUMemoryAllocated(availableDevices[i]) > machineState.GPUMemoryAllocated(availableDevices[j])
+	})
+
+	// then allocate available numa-affinity gpus
+	for _, device := range availableDevices {
+		if allocatedDevices.Has(device) {
+			continue
+		}
+
+		if !isNUMAAffinityDevice(device) {
+			continue
+		}
+
+		if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) {
+			general.Infof("available numa affinity gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU)
+			continue
+		}
+
+		if allocateDevices(device) {
+			return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+		}
+	}
+
+	return nil, nil, fmt.Errorf("not enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices))
+}
diff --git a/pkg/agent/qrm-plugins/gpu/consts/consts.go b/pkg/agent/qrm-plugins/gpu/consts/consts.go
index 369b603517..4b5aa3b040 100644
--- a/pkg/agent/qrm-plugins/gpu/consts/consts.go
+++ b/pkg/agent/qrm-plugins/gpu/consts/consts.go
@@ -19,11 +19,22 @@ package consts
 import (
 	"time"
 
-	"github.com/kubewharf/katalyst-api/pkg/consts"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
+	"github.com/kubewharf/katalyst-api/pkg/consts"
	
"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm" ) +type AllocatedResource struct { + ResourceName string + *pluginapi.TopologyAwareResource +} + +type AllocatableResource struct { + ResourceName string + *pluginapi.AllocatableTopologyAwareResource +} + const ( // GPUResourcePluginPolicyNameStatic is the policy name of static gpu resource plugin diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go new file mode 100644 index 0000000000..aeb827a873 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go @@ -0,0 +1,196 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package customdeviceplugin + +import ( + "fmt" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" +) + +const GPUCustomDevicePluginName = "gpu-custom-device-plugin" + +type GPUDevicePlugin struct { + *baseplugin.BasePlugin +} + +func NewGPUDevicePlugin(base *baseplugin.BasePlugin) CustomDevicePlugin { + return &GPUDevicePlugin{ + BasePlugin: base, + } +} + +func (p *GPUDevicePlugin) DeviceName() string { + return "nvidia.com/gpu" +} + +func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + gpuTopology := &machine.GPUTopology{ + GPUs: make(map[string]machine.GPUInfo, len(request.Devices)), + } + + for _, device := range request.Devices { + var numaNode []int + if device.Topology != nil { + numaNode = make([]int, 0, len(device.Topology.Nodes)) + + for _, node := range device.Topology.Nodes { + if node == nil { + continue + } + numaNode = append(numaNode, int(node.ID)) + } + } + + gpuTopology.GPUs[device.ID] = machine.GPUInfo{ + Health: device.Health, + NUMANode: numaNode, + } + } + + err := p.GpuTopologyProvider.SetGPUTopology(gpuTopology) + if err != nil { + general.Errorf("set gpu topology failed with error: %v", err) + return nil, fmt.Errorf("set gpu topology failed with error: %v", err) + } + + general.Infof("got device %s gpuTopology success: %v", request.DeviceName, gpuTopology) + + return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil +} + +func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + qosLevel, err := 
util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req.ResourceRequest, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.ResourceRequest.PodNamespace, req.ResourceRequest.PodName, req.ResourceRequest.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + general.InfoS("called", + "podNamespace", req.ResourceRequest.PodNamespace, + "podName", req.ResourceRequest.PodName, + "containerName", req.ResourceRequest.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.ResourceRequest.Annotations, + "resourceRequests", req.ResourceRequest.ResourceRequests, + "deviceName", req.DeviceRequest.DeviceName, + "resourceHint", req.ResourceRequest.Hint, + "deviceHint", req.DeviceRequest.Hint, + "availableDevices", req.DeviceRequest.AvailableDevices, + "reusableDevices", req.DeviceRequest.ReusableDevices, + "deviceRequest", req.DeviceRequest.DeviceRequest, + ) + + allocationInfo := p.State.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) + if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { + allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) + for gpuID := range allocationInfo.TopologyAwareAllocations { + allocatedDevices = append(allocatedDevices, gpuID) + } + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil + } + + _, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(req.ResourceRequest.ResourceRequests, req.ResourceRequest.ResourceName, false) + if err != nil { + return nil, err + } + + // get hint nodes from request + hintNodes, err := machine.NewCPUSetUint64(req.DeviceRequest.GetHint().GetNodes()...) 
+ if err != nil { + general.Warningf("failed to get hint nodes: %v", err) + return nil, err + } + + gpuTopology, numaTopologyReady, err := p.GpuTopologyProvider.GetGPUTopology() + if err != nil { + general.Warningf("failed to get gpu topology: %v", err) + return nil, err + } + + if !numaTopologyReady { + general.Warningf("numa topology is not ready") + return nil, fmt.Errorf("numa topology is not ready") + } + + allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req) + if err != nil { + general.Warningf("failed to allocate associated devices: %v", err) + return nil, err + } + + topologyAwareAllocations := make(map[string]state.GPUAllocation) + for _, device := range allocatedDevices { + info, ok := gpuTopology.GPUs[device] + if !ok { + return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) + } + + topologyAwareAllocations[device] = state.GPUAllocation{ + GPUMemoryQuantity: allocatedGPUMemory[device], + NUMANodes: info.GetNUMANode(), + } + } + + if allocationInfo == nil { + allocationInfo = &state.AllocationInfo{ + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), + AllocatedAllocation: state.GPUAllocation{ + GPUMemoryQuantity: gpuMemoryRequest, + NUMANodes: hintNodes.ToSliceInt(), + }, + } + } + + allocationInfo.TopologyAwareAllocations = topologyAwareAllocations + p.State.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) + machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider) + if err != nil { + return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) + } + + p.State.SetMachineState(machineState, true) + + general.InfoS("allocated devices", + "podNamespace", req.ResourceRequest.PodNamespace, + "podName", req.ResourceRequest.PodName, + "containerName", req.ResourceRequest.ContainerName, + "qosLevel", qosLevel, + "allocatedDevices", allocatedDevices) + + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go new file mode 100644 index 0000000000..da4ebce558 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -0,0 +1,31 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package customdeviceplugin
+
+import (
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+)
+
+type CustomDevicePlugin interface {
+	DeviceName() string
+
+	GetAssociatedDeviceTopologyHints(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error)
+
+	UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error)
+
+	AllocateAssociatedDevice(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error)
+}
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go
new file mode 100644
index 0000000000..a6507d0ebc
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go
@@ -0,0 +1,31 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package customdeviceplugin
+
+import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
+
+type initFunc func(plugin *baseplugin.BasePlugin) CustomDevicePlugin
+
+// CustomDevicePluginsMap is initialized eagerly so that registration in
+// init() does not write to a nil map, which would panic.
+var CustomDevicePluginsMap = make(map[string]initFunc)
+
+func registerCustomDevicePlugin(pluginName string, initFunc initFunc) {
+	CustomDevicePluginsMap[pluginName] = initFunc
+}
+
+func init() {
+	registerCustomDevicePlugin(GPUCustomDevicePluginName, NewGPUDevicePlugin)
+}
diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go
new file mode 100644
index 0000000000..1c7d60d8de
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go
@@ -0,0 +1,484 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. 
+*/ + +package resourceplugin + +import ( + "fmt" + "math" + "sort" + "sync" + + "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" + "github.com/kubewharf/katalyst-core/pkg/util/metric" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" +) + +const ( + GPUMemPluginName = "gpu_mem_resource_plugin" +) + +type GPUMemPlugin struct { + sync.Mutex + *baseplugin.BasePlugin +} + +func NewGPUMemPlugin(base *baseplugin.BasePlugin) ResourcePlugin { + return &GPUMemPlugin{ + BasePlugin: base, + } +} + +func (p *GPUMemPlugin) ResourceName() string { + return string(consts.ResourceGPUMemory) +} + +func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { + existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) + + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + } + + gpuCount, gpuNames, err := p.GetGPUCount(req) + if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + + if gpuCount == 0 { + general.Errorf("getGPUCount failed from req %v with error: no GPU resources found", req) + return nil, fmt.Errorf("no GPU resources found") + } + + general.InfoS("called", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.Annotations, + "gpuMemory", gpuMemory, + "gpuNames", gpuNames.List(), + "gpuCount", gpuCount) + + p.Lock() + defer func() { + if err := p.State.StoreState(); err != nil { + general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) + } + p.Unlock() + if err != nil { + metricTags := []metrics.MetricTag{ + {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, + } + if existReallocAnno { + metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) + } + _ = p.Emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, metricTags...) 
+		}
+	}()
+
+	var hints map[string]*pluginapi.ListOfTopologyHints
+	machineState := p.State.GetMachineState()
+	allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName)
+	if allocationInfo != nil {
+		hints = gpuutil.RegenerateGPUMemoryHints(allocationInfo, false)
+
+		// regenerateHints failed. need to clear container record and re-calculate.
+		if hints == nil {
+			podEntries := p.State.GetPodEntries()
+			delete(podEntries[req.PodUid], req.ContainerName)
+			if len(podEntries[req.PodUid]) == 0 {
+				delete(podEntries, req.PodUid)
+			}
+
+			var err error
+			machineState, err = state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider)
+			if err != nil {
+				general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
+					req.PodNamespace, req.PodName, req.ContainerName, err)
+				return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
+			}
+		}
+	}
+
+	// otherwise, calculate hint for container without allocated memory
+	if hints == nil {
+		var calculateErr error
+		// calculate hint for container without allocated gpu memory
+		hints, calculateErr = p.calculateHints(gpuMemory, gpuCount, machineState, req)
+		if calculateErr != nil {
+			return nil, fmt.Errorf("calculateHints failed with error: %v", calculateErr)
+		}
+	}
+
+	return util.PackResourceHintsResponse(req, p.ResourceName(), hints)
+}
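+
+// NOTE (editorial): calculateHints below splits the aggregate gpu-memory
+// request evenly across the requested GPU count and treats a GPU as
+// available on a NUMA node only if that per-GPU share still fits within its
+// allocatable memory. With illustrative numbers (not taken from this patch):
+// a request for 80GiB over 2 GPUs gives perGPUMemory = 40GiB, so a GPU with
+// 80GiB allocatable and 50GiB already allocated is skipped (50+40 > 80),
+// while one with 30GiB allocated still counts toward its NUMA nodes.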
+func (p *GPUMemPlugin) calculateHints(
+	gpuMemory float64, gpuReq float64, machineState state.GPUMap, req *pluginapi.ResourceRequest,
+) (map[string]*pluginapi.ListOfTopologyHints, error) {
+	gpuTopology, numaTopologyReady, err := p.GpuTopologyProvider.GetGPUTopology()
+	if err != nil {
+		return nil, err
+	}
+
+	if !numaTopologyReady {
+		return nil, fmt.Errorf("numa topology is not ready")
+	}
+
+	perGPUMemory := gpuMemory / gpuReq
+	general.Infof("gpuMemory: %f, gpuReq: %f, perGPUMemory: %f", gpuMemory, gpuReq, perGPUMemory)
+
+	numaToAvailableGPUCount := make(map[int]float64)
+	numaToMostAllocatedGPUMemory := make(map[int]float64)
+	for gpuID, s := range machineState {
+		if s == nil {
+			continue
+		}
+
+		if s.GetGPUMemoryAllocated()+perGPUMemory <= s.GetGPUMemoryAllocatable() {
+			info, ok := gpuTopology.GPUs[gpuID]
+			if !ok {
+				return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID)
+			}
+
+			for _, numaNode := range info.GetNUMANode() {
+				numaToAvailableGPUCount[numaNode] += 1
+				numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetGPUMemoryAllocated(), numaToMostAllocatedGPUMemory[numaNode])
+			}
+		}
+	}
+
+	numaNodes := make([]int, 0, p.MetaServer.NumNUMANodes)
+	for numaNode := range p.MetaServer.NUMAToCPUs {
+		numaNodes = append(numaNodes, numaNode)
+	}
+	sort.Ints(numaNodes)
+
+	minNUMAsCountNeeded, _, err := gpuutil.GetNUMANodesCountToFitGPUReq(gpuReq, p.MetaServer.CPUTopology, gpuTopology)
+	if err != nil {
+		return nil, err
+	}
+
+	numaCountPerSocket, err := p.MetaServer.NUMAsPerSocket()
+	if err != nil {
+		return nil, fmt.Errorf("NUMAsPerSocket failed with error: %v", err)
+	}
+
+	numaBound := len(numaNodes)
+	if numaBound > machine.LargeNUMAsPoint {
+		// [TODO]: to discuss refine minNUMAsCountNeeded+1
+		numaBound = minNUMAsCountNeeded + 1
+	}
+
+	var availableNumaHints []*pluginapi.TopologyHint
+	machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) {
+		maskCount := mask.Count()
+		if maskCount < minNUMAsCountNeeded {
+			return
+		}
+
+		maskBits := mask.GetBits()
+		numaCountNeeded := mask.Count()
+
+		allAvailableGPUsCountInMask := float64(0)
+		for _, nodeID := range maskBits {
+			allAvailableGPUsCountInMask += numaToAvailableGPUCount[nodeID]
+		}
+
+		if allAvailableGPUsCountInMask < gpuReq {
+			return
+		}
+
+		crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.MetaServer.CPUTopology)
+		if err != nil {
+			return
+		} else if numaCountNeeded <= numaCountPerSocket && crossSockets {
+			return
+		}
+
+		preferred := maskCount == minNUMAsCountNeeded
+		availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{
+			Nodes:     machine.MaskToUInt64Array(mask),
+			Preferred: preferred,
+		})
+	})
+
+	// prefer numa nodes with most allocated gpu memory
+	p.preferGPUMemoryMostAllocatedHints(availableNumaHints, numaToMostAllocatedGPUMemory)
+
+	// NOTE: because grpc is unable to distinguish between an empty array and nil,
+	// we return an error instead of an empty array.
+	// we should resolve this issue if we need to manage multi-resource in one plugin.
+	if len(availableNumaHints) == 0 {
+		general.Warningf("got no available gpu memory hints for pod: %s/%s, container: %s",
+			req.PodNamespace, req.PodName, req.ContainerName)
+		return nil, gpuutil.ErrNoAvailableGPUMemoryHints
+	}
+
+	return map[string]*pluginapi.ListOfTopologyHints{
+		p.ResourceName(): {
+			Hints: availableNumaHints,
+		},
+	}, nil
+}
+
+func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints(
+	hints []*pluginapi.TopologyHint, numaToMostAllocatedGPUMemory map[int]float64,
+) {
+	hintGPUMemoryMostAllocated := make(map[int]float64)
+	for index, hint := range hints {
+		if !hint.Preferred {
+			continue
+		}
+
+		gpuMemoryMostAllocated := float64(0)
+		for _, nodeID := range hint.Nodes {
+			gpuMemoryMostAllocated = math.Max(gpuMemoryMostAllocated, numaToMostAllocatedGPUMemory[int(nodeID)])
+		}
+		hintGPUMemoryMostAllocated[index] = gpuMemoryMostAllocated
+	}
+
+	mostAllocatedHintIndex := -1
+	for index, hint := range hints {
+		if !hint.Preferred {
+			continue
+		}
+
+		if mostAllocatedHintIndex == -1 || hintGPUMemoryMostAllocated[index] > hintGPUMemoryMostAllocated[mostAllocatedHintIndex] {
+			mostAllocatedHintIndex = index
+		}
+	}
+
+	if mostAllocatedHintIndex < 0 {
+		return
+	}
+
+	for index, hint := range hints {
+		if !hint.Preferred || mostAllocatedHintIndex == index {
+			continue
+		}
+		hint.Preferred = false
+	}
+}
+
+func (p *GPUMemPlugin) GetTopologyAwareResources(allocationInfo *state.AllocationInfo) *gpuconsts.AllocatedResource {
+	general.InfofV(4, "called")
+
+	topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations))
+	for deviceID, alloc := range allocationInfo.TopologyAwareAllocations {
+		topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{
+			ResourceValue: alloc.GPUMemoryQuantity,
+			Name:          deviceID,
+			Type:          string(v1alpha1.TopologyTypeGPU),
+			Annotations: map[string]string{
+				consts.ResourceAnnotationKeyResourceIdentifier: "",
+			},
+		})
+	}
+
+	resp := &gpuconsts.AllocatedResource{
+		ResourceName: p.ResourceName(),
+		TopologyAwareResource: &pluginapi.TopologyAwareResource{
+			IsNodeResource:                    true,
+			IsScalarResource:                  true,
+			AggregatedQuantity:                allocationInfo.AllocatedAllocation.GPUMemoryQuantity,
+			OriginalAggregatedQuantity:        allocationInfo.AllocatedAllocation.GPUMemoryQuantity,
+			TopologyAwareQuantityList:         topologyAwareQuantityList,
+			OriginalTopologyAwareQuantityList: topologyAwareQuantityList,
+		},
+	}
+
+	return resp
+}
+
+func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(machineState state.GPUMap) *gpuconsts.AllocatableResource {
+	general.InfofV(4, "called")
+
+	p.Lock()
+ defer p.Unlock() + + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 + for deviceID, gpuState := range machineState { + aggregatedAllocatableQuantity += gpuState.GetGPUMemoryAllocatable() + aggregatedCapacityQuantity += gpuState.GetGPUMemoryAllocatable() + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuState.GetGPUMemoryAllocatable(), + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuState.GetGPUMemoryAllocatable(), + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + + return &gpuconsts.AllocatableResource{ + ResourceName: p.ResourceName(), + AllocatableTopologyAwareResource: &pluginapi.AllocatableTopologyAwareResource{ + IsNodeResource: true, + IsScalarResource: true, + AggregatedAllocatableQuantity: aggregatedAllocatableQuantity, + TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, + AggregatedCapacityQuantity: aggregatedCapacityQuantity, + TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, + }, + } +} + +func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) { + existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) + + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + } + + gpuCount, gpuNames, err := p.GetGPUCount(req) + if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + + general.InfoS("called", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.Annotations, + "gpuMemory", gpuMemory, + "gpuNames", gpuNames.List(), + "gpuCount", gpuCount) + + p.Lock() + defer func() { + if err := p.State.StoreState(); err != nil { + general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) + } + p.Unlock() + if err != nil { + metricTags := []metrics.MetricTag{ + {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, + } + if existReallocAnno { + metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) + } + _ = p.Emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, metricTags...) 
+ } + }() + + emptyResponse := &pluginapi.ResourceAllocationResponse{ + PodUid: req.PodUid, + PodNamespace: req.PodNamespace, + PodName: req.PodName, + ContainerName: req.ContainerName, + ContainerType: req.ContainerType, + ContainerIndex: req.ContainerIndex, + PodRole: req.PodRole, + PodType: req.PodType, + ResourceName: p.ResourceName(), + Labels: general.DeepCopyMap(req.Labels), + Annotations: general.DeepCopyMap(req.Annotations), + } + + // currently, not to deal with init containers + if req.ContainerType == pluginapi.ContainerType_INIT { + return emptyResponse, nil + } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { + // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating + return p.PackAllocationResponse(req, &state.AllocationInfo{}, nil, p.ResourceName()) + } + + allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName) + if allocationInfo != nil { + resp, packErr := p.PackAllocationResponse(req, allocationInfo, nil, p.ResourceName()) + if packErr != nil { + general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, packErr) + return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr) + } + return resp, nil + } + + // get hint nodes from request + hintNodes, err := machine.NewCPUSetUint64(req.GetHint().GetNodes()...) + if err != nil { + general.Warningf("failed to get hint nodes: %v", err) + return nil, fmt.Errorf("failed to get hint nodes: %v", err) + } + + newAllocation := &state.AllocationInfo{ + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel), + AllocatedAllocation: state.GPUAllocation{ + GPUMemoryQuantity: gpuMemory, + NUMANodes: hintNodes.ToSliceInt(), + }, + } + + p.State.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false) + + machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider) + if stateErr != nil { + general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "gpuMemory", gpuMemory) + return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr) + } + + // update state cache + p.State.SetMachineState(machineState, true) + + return p.PackAllocationResponse(req, newAllocation, nil, p.ResourceName()) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go new file mode 100644 index 0000000000..f4170f3125 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -0,0 +1,36 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package resourceplugin
+
+import (
+	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+)
+
+// ResourcePlugin knows how to handle resource requests for a specific resource type.
+type ResourcePlugin interface {
+	ResourceName() string
+
+	GetTopologyHints(*pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error)
+
+	GetTopologyAwareResources(*state.AllocationInfo) *gpuconsts.AllocatedResource
+
+	GetTopologyAwareAllocatableResources(state.GPUMap) *gpuconsts.AllocatableResource
+
+	Allocate(*pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error)
+}
diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go
new file mode 100644
index 0000000000..768725cc56
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package resourceplugin
+
+import (
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
+)
+
+type initFunc func(plugin *baseplugin.BasePlugin) ResourcePlugin
+
+// ResourcePluginsMap must be initialized at declaration: registerResourcePlugin
+// is called from init(), and assigning into a nil map would panic at agent startup.
+var ResourcePluginsMap = make(map[string]initFunc)
+
+func registerResourcePlugin(pluginName string, initFunc initFunc) {
+	ResourcePluginsMap[pluginName] = initFunc
+}
+
+func init() {
+	registerResourcePlugin(GPUMemPluginName, NewGPUMemPlugin)
+}
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
index 136cf28559..7e967c08bd 100644
--- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
@@ -19,69 +19,55 @@ package staticpolicy
 import (
 	"context"
 	"fmt"
-	"math"
-	"sort"
 	"sync"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/wait"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
-	"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
 	"github.com/kubewharf/katalyst-api/pkg/consts"
 	"github.com/kubewharf/katalyst-api/pkg/plugins/skeleton"
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
 	appqrm "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
-	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
-	gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
"github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" "github.com/kubewharf/katalyst-core/pkg/config" dynamicconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic" - "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" - "github.com/kubewharf/katalyst-core/pkg/config/generic" "github.com/kubewharf/katalyst-core/pkg/metaserver" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" - "github.com/kubewharf/katalyst-core/pkg/util/machine" - "github.com/kubewharf/katalyst-core/pkg/util/metric" -) - -const ( - GPUPluginStateFileName = "gpu_plugin_state" ) // StaticPolicy is the static gpu policy type StaticPolicy struct { sync.Mutex pluginapi.UnimplementedResourcePluginServer + *baseplugin.BasePlugin - name string - stopCh chan struct{} - started bool - qosConfig *generic.QoSConfiguration - qrmConfig *qrm.QRMPluginsConfiguration - gpuTopologyProvider machine.GPUTopologyProvider + name string + stopCh chan struct{} + started bool - emitter metrics.MetricEmitter - metaServer *metaserver.MetaServer - agentCtx *agent.GenericContext - state state.State - - podAnnotationKeptKeys []string - podLabelKeptKeys []string - resourceNames sets.String + emitter metrics.MetricEmitter + associatedDeviceNames sets.String + resourcePluginsNames sets.String residualHitMap map[string]int64 + + resourcePlugins map[string]resourceplugin.ResourcePlugin + customDevicePlugins map[string]customdeviceplugin.CustomDevicePlugin } // NewStaticPolicy returns a static gpu policy -func NewStaticPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, +func NewStaticPolicy( + agentCtx *agent.GenericContext, conf *config.Configuration, _ interface{}, agentName string, ) (bool, agent.Component, error) { wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{ @@ -89,29 +75,26 @@ func NewStaticPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, Val: gpuconsts.GPUResourcePluginPolicyNameStatic, }) - gpuTopologyProvider := machine.NewGPUTopologyProvider(conf.GPUResourceNames) - stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, - gpuconsts.GPUResourcePluginPolicyNameStatic, gpuTopologyProvider, conf.SkipGPUStateCorruption, wrappedEmitter) + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, wrappedEmitter) if err != nil { - return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err) + return false, agent.ComponentStub{}, fmt.Errorf("failed to create base plugin: %w", err) } policyImplement := &StaticPolicy{ emitter: wrappedEmitter, - metaServer: agentCtx.MetaServer, - agentCtx: agentCtx, stopCh: make(chan struct{}), name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), - qosConfig: conf.QoSConfiguration, - qrmConfig: conf.QRMPluginsConfiguration, - gpuTopologyProvider: gpuTopologyProvider, - state: stateImpl, - podAnnotationKeptKeys: conf.PodAnnotationKeptKeys, - podLabelKeptKeys: conf.PodLabelKeptKeys, residualHitMap: make(map[string]int64), - resourceNames: sets.NewString(conf.GPUResourceNames...), + associatedDeviceNames: sets.NewString(conf.GPUResourceNames...), + resourcePluginsNames: sets.NewString(conf.ResourcePluginsNames...), + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + customDevicePlugins: 
make(map[string]customdeviceplugin.CustomDevicePlugin), } + policyImplement.registerResourcePlugins() + policyImplement.registerCustomDevicePlugins() + pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, func(key string, value int64) { _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw) @@ -193,7 +176,8 @@ func (p *StaticPolicy) ResourceName() string { } // GetTopologyHints returns hints of corresponding resources -func (p *StaticPolicy) GetTopologyHints(_ context.Context, +func (p *StaticPolicy) GetTopologyHints( + _ context.Context, req *pluginapi.ResourceRequest, ) (resp *pluginapi.ResourceHintsResponse, err error) { general.InfofV(4, "called") @@ -201,104 +185,23 @@ func (p *StaticPolicy) GetTopologyHints(_ context.Context, return nil, fmt.Errorf("GetTopologyHints got nil req") } - existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) - - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys) - if err != nil { - err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - general.Errorf("%s", err.Error()) - return nil, err - } - - _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) - if err != nil { - return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) - } - - gpuCount, gpuNames, err := p.getGPUCount(req) - if err != nil { - general.Errorf("getGPUCount failed from req %v with error: %v", req, err) - return nil, fmt.Errorf("getGPUCount failed with error: %v", err) - } - - if gpuCount == 0 { - general.Errorf("getGPUCount failed from req %v with error: no GPU resources found", req) - return nil, fmt.Errorf("no GPU resources found") - } - - general.InfoS("called", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, - "qosLevel", qosLevel, - "reqAnnotations", req.Annotations, - "gpuMemory", gpuMemory, - "gpuNames", gpuNames.List(), - "gpuCount", gpuCount) - - p.Lock() - defer func() { - if err := p.state.StoreState(); err != nil { - general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) - } - p.Unlock() - if err != nil { - metricTags := []metrics.MetricTag{ - {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, - } - if existReallocAnno { - metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) - } - _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, metricTags...) - } - }() - - var hints map[string]*pluginapi.ListOfTopologyHints - machineState := p.state.GetMachineState() - allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) - if allocationInfo != nil { - hints = gpuutil.RegenerateGPUMemoryHints(allocationInfo, false) - - // regenerateHints failed. need to clear container record and re-calculate. 
- if hints == nil { - podEntries := p.state.GetPodEntries() - delete(podEntries[req.PodUid], req.ContainerName) - if len(podEntries[req.PodUid]) == 0 { - delete(podEntries, req.PodUid) - } - - var err error - machineState, err = state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider) - if err != nil { - general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) - } - } - } - - // otherwise, calculate hint for container without allocated memory - if hints == nil { - var calculateErr error - // calculate hint for container without allocated cpus - hints, calculateErr = p.calculateHints(gpuMemory, gpuCount, machineState, req) - if calculateErr != nil { - return nil, fmt.Errorf("calculateHints failed with error: %v", calculateErr) - } + resourcePlugin := p.getResourcePlugin(req.ResourceName) + if resourcePlugin == nil { + return nil, fmt.Errorf("failed to find resource plugin by name %s", req.ResourceName) } - - return util.PackResourceHintsResponse(req, p.ResourceName(), hints) + return resourcePlugin.GetTopologyHints(req) } // GetPodTopologyHints returns hints of corresponding resources -func (p *StaticPolicy) GetPodTopologyHints(_ context.Context, +func (p *StaticPolicy) GetPodTopologyHints( + _ context.Context, req *pluginapi.PodResourceRequest, ) (resp *pluginapi.PodResourceHintsResponse, err error) { return nil, util.ErrNotImplemented } -func (p *StaticPolicy) RemovePod(_ context.Context, +func (p *StaticPolicy) RemovePod( + _ context.Context, req *pluginapi.RemovePodRequest, ) (*pluginapi.RemovePodResponse, error) { if req == nil { @@ -317,7 +220,8 @@ func (p *StaticPolicy) RemovePod(_ context.Context, } // GetResourcesAllocation returns allocation results of corresponding resources -func (p *StaticPolicy) GetResourcesAllocation(_ context.Context, +func (p *StaticPolicy) GetResourcesAllocation( + _ context.Context, _ *pluginapi.GetResourcesAllocationRequest, ) (*pluginapi.GetResourcesAllocationResponse, error) { general.InfofV(4, "called") @@ -325,7 +229,8 @@ func (p *StaticPolicy) GetResourcesAllocation(_ context.Context, } // GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format -func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, +func (p *StaticPolicy) GetTopologyAwareResources( + _ context.Context, req *pluginapi.GetTopologyAwareResourcesRequest, ) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") @@ -333,21 +238,16 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, return nil, fmt.Errorf("GetTopologyAwareResources got nil req") } - allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) + allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName) if allocationInfo == nil { return &pluginapi.GetTopologyAwareResourcesResponse{}, nil } - topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) - for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { - topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: alloc.GPUMemoryQuantity, - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - 
consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) + // Get topology aware resources for all resource plugins + allocatedResources := make(map[string]*pluginapi.TopologyAwareResource) + for _, resourcePlugin := range p.resourcePlugins { + allocatedResource := resourcePlugin.GetTopologyAwareResources(allocationInfo) + allocatedResources[allocatedResource.ResourceName] = allocatedResource.TopologyAwareResource } resp := &pluginapi.GetTopologyAwareResourcesResponse{ @@ -355,17 +255,8 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, PodName: allocationInfo.PodName, PodNamespace: allocationInfo.PodNamespace, ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ - ContainerName: allocationInfo.ContainerName, - AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ - p.ResourceName(): { - IsNodeResource: true, - IsScalarResource: true, - AggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - TopologyAwareQuantityList: topologyAwareQuantityList, - OriginalTopologyAwareQuantityList: topologyAwareQuantityList, - }, - }, + ContainerName: allocationInfo.ContainerName, + AllocatedResources: allocatedResources, }, } @@ -373,7 +264,8 @@ func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, } // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format -func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, +func (p *StaticPolicy) GetTopologyAwareAllocatableResources( + _ context.Context, req *pluginapi.GetTopologyAwareAllocatableResourcesRequest, ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { general.InfofV(4, "called") @@ -384,186 +276,57 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, p.Lock() defer p.Unlock() - machineState := p.state.GetMachineState() - - topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) - topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) - var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 - for deviceID, gpuState := range machineState { - aggregatedAllocatableQuantity += gpuState.GetGPUMemoryAllocatable() - aggregatedCapacityQuantity += gpuState.GetGPUMemoryAllocatable() - topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuState.GetGPUMemoryAllocatable(), - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) - topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuState.GetGPUMemoryAllocatable(), - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) + machineState := p.State.GetMachineState() + + // Get topology aware allocatable resources for all resource plugins + allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource) + for _, resourcePlugin := range p.resourcePlugins { + allocatableResource := resourcePlugin.GetTopologyAwareAllocatableResources(machineState) + allocatableResources[allocatableResource.ResourceName] = 
allocatableResource.AllocatableTopologyAwareResource } return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{ - AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{ - p.ResourceName(): { - IsNodeResource: true, - IsScalarResource: true, - AggregatedAllocatableQuantity: aggregatedAllocatableQuantity, - TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, - AggregatedCapacityQuantity: aggregatedCapacityQuantity, - TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, - }, - }, + AllocatableResources: allocatableResources, }, nil } // GetResourcePluginOptions returns options to be communicated with Resource Manager -func (p *StaticPolicy) GetResourcePluginOptions(context.Context, +func (p *StaticPolicy) GetResourcePluginOptions( + context.Context, *pluginapi.Empty, ) (*pluginapi.ResourcePluginOptions, error) { return &pluginapi.ResourcePluginOptions{ PreStartRequired: false, WithTopologyAlignment: true, NeedReconcile: false, - AssociatedDevices: p.resourceNames.List(), + AssociatedDevices: p.associatedDeviceNames.List(), }, nil } // Allocate is called during pod admit so that the resource // plugin can allocate corresponding resource for the container // according to resource request -func (p *StaticPolicy) Allocate(_ context.Context, +func (p *StaticPolicy) Allocate( + _ context.Context, req *pluginapi.ResourceRequest, ) (resp *pluginapi.ResourceAllocationResponse, err error) { if req == nil { return nil, fmt.Errorf("GetTopologyHints got nil req") } - existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) - - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req, p.podAnnotationKeptKeys, p.podLabelKeptKeys) - if err != nil { - err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) - general.Errorf("%s", err.Error()) - return nil, err - } - - _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) - if err != nil { - return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) - } - - gpuCount, gpuNames, err := p.getGPUCount(req) - if err != nil { - general.Errorf("getGPUCount failed from req %v with error: %v", req, err) - return nil, fmt.Errorf("getGPUCount failed with error: %v", err) - } - - general.InfoS("called", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, - "qosLevel", qosLevel, - "reqAnnotations", req.Annotations, - "gpuMemory", gpuMemory, - "gpuNames", gpuNames.List(), - "gpuCount", gpuCount) - - p.Lock() - defer func() { - if err := p.state.StoreState(); err != nil { - general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) - } - p.Unlock() - if err != nil { - metricTags := []metrics.MetricTag{ - {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, - } - if existReallocAnno { - metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) - } - _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, metricTags...) 
- } - }() - - emptyResponse := &pluginapi.ResourceAllocationResponse{ - PodUid: req.PodUid, - PodNamespace: req.PodNamespace, - PodName: req.PodName, - ContainerName: req.ContainerName, - ContainerType: req.ContainerType, - ContainerIndex: req.ContainerIndex, - PodRole: req.PodRole, - PodType: req.PodType, - ResourceName: p.ResourceName(), - Labels: general.DeepCopyMap(req.Labels), - Annotations: general.DeepCopyMap(req.Annotations), - } - - // currently, not to deal with init containers - if req.ContainerType == pluginapi.ContainerType_INIT { - return emptyResponse, nil - } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { - // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating - return p.packAllocationResponse(req, &state.AllocationInfo{}, nil) - } - - allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) - if allocationInfo != nil { - resp, packErr := p.packAllocationResponse(req, allocationInfo, nil) - if packErr != nil { - general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, packErr) - return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr) - } - return resp, nil - } - - // get hint nodes from request - hintNodes, err := machine.NewCPUSetUint64(req.GetHint().GetNodes()...) - if err != nil { - general.Warningf("failed to get hint nodes: %v", err) - return nil, fmt.Errorf("failed to get hint nodes: %v", err) - } - - newAllocation := &state.AllocationInfo{ - AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: state.GPUAllocation{ - GPUMemoryQuantity: gpuMemory, - NUMANodes: hintNodes.ToSliceInt(), - }, - } - - p.state.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false) - - machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider) - if stateErr != nil { - general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, - "gpuMemory", gpuMemory) - return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr) + resourcePlugin := p.getResourcePlugin(req.ResourceName) + if resourcePlugin == nil { + return nil, fmt.Errorf("failed to get resource plugin by name %s", req.ResourceName) } - // update state cache - p.state.SetMachineState(machineState, true) - - return p.packAllocationResponse(req, newAllocation, nil) + return resourcePlugin.Allocate(req) } // AllocateForPod is called during pod admit so that the resource // plugin can allocate corresponding resource for the pod // according to resource request -func (p *StaticPolicy) AllocateForPod(_ context.Context, +func (p *StaticPolicy) AllocateForPod( + _ context.Context, req *pluginapi.PodResourceRequest, ) (resp *pluginapi.PodResourceAllocationResponse, err error) { return nil, util.ErrNotImplemented @@ -572,7 +335,8 @@ func (p *StaticPolicy) AllocateForPod(_ context.Context, // PreStartContainer is called, if indicated by resource plugin during registration phase, // before each container start. 
Resource plugin can run resource specific operations // such as resetting the resource before making resources available to the container -func (p *StaticPolicy) PreStartContainer(context.Context, +func (p *StaticPolicy) PreStartContainer( + context.Context, *pluginapi.PreStartContainerRequest, ) (*pluginapi.PreStartContainerResponse, error) { return &pluginapi.PreStartContainerResponse{}, nil @@ -580,19 +344,19 @@ func (p *StaticPolicy) PreStartContainer(context.Context, func (p *StaticPolicy) removePod(podUID string) error { // update state cache - podEntries := p.state.GetPodEntries() + podEntries := p.State.GetPodEntries() delete(podEntries, podUID) - machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider) + machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.GpuTopologyProvider) if err != nil { general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } - p.state.SetPodEntries(podEntries, false) - p.state.SetMachineState(machineState, false) + p.State.SetPodEntries(podEntries, false) + p.State.SetMachineState(machineState, false) - err = p.state.StoreState() + err = p.State.StoreState() if err != nil { general.Errorf("store state failed with error: %v", err) return err @@ -601,7 +365,8 @@ func (p *StaticPolicy) removePod(podUID string) error { } // clearResidualState is used to clean residual pods in local state -func (p *StaticPolicy) clearResidualState(_ *config.Configuration, +func (p *StaticPolicy) clearResidualState( + _ *config.Configuration, _ interface{}, _ *dynamicconfig.DynamicAgentConfiguration, _ metrics.MetricEmitter, @@ -618,13 +383,13 @@ func (p *StaticPolicy) clearResidualState(_ *config.Configuration, _ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err) }() - if p.metaServer == nil { + if p.MetaServer == nil { general.Errorf("nil metaServer") return } ctx := context.Background() - podList, err = p.metaServer.GetPodList(ctx, nil) + podList, err = p.MetaServer.GetPodList(ctx, nil) if err != nil { general.Errorf("get pod list failed: %v", err) return @@ -638,7 +403,7 @@ func (p *StaticPolicy) clearResidualState(_ *config.Configuration, p.Lock() defer p.Unlock() - podEntries := p.state.GetPodEntries() + podEntries := p.State.GetPodEntries() for podUID := range podEntries { if !podSet.Has(podUID) { residualSet[podUID] = true @@ -671,16 +436,16 @@ func (p *StaticPolicy) clearResidualState(_ *config.Configuration, delete(podEntries, podUID) } - machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, podEntries, p.gpuTopologyProvider) + machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.GpuTopologyProvider) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return } - p.state.SetPodEntries(podEntries, false) - p.state.SetMachineState(machineState, false) + p.State.SetPodEntries(podEntries, false) + p.State.SetMachineState(machineState, false) - err = p.state.StoreState() + err = p.State.StoreState() if err != nil { general.Errorf("store state failed: %v", err) return @@ -688,213 +453,28 @@ func (p *StaticPolicy) clearResidualState(_ *config.Configuration, } } -// packAllocationResponse fills pluginapi.ResourceAllocationResponse with information from AllocationInfo and pluginapi.ResourceRequest -func (p *StaticPolicy) 
packAllocationResponse( - req *pluginapi.ResourceRequest, - allocationInfo *state.AllocationInfo, - resourceAllocationAnnotations map[string]string, -) (*pluginapi.ResourceAllocationResponse, error) { - if allocationInfo == nil { - return nil, fmt.Errorf("packAllocationResponse got nil allocationInfo") - } else if req == nil { - return nil, fmt.Errorf("packAllocationResponse got nil request") - } - - return &pluginapi.ResourceAllocationResponse{ - PodUid: req.PodUid, - PodNamespace: req.PodNamespace, - PodName: req.PodName, - ContainerName: req.ContainerName, - ContainerType: req.ContainerType, - ContainerIndex: req.ContainerIndex, - PodRole: req.PodRole, - PodType: req.PodType, - ResourceName: req.ResourceName, - AllocationResult: &pluginapi.ResourceAllocation{ - ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ - p.ResourceName(): { - IsNodeResource: true, - IsScalarResource: true, // to avoid re-allocating - AllocatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - Annotations: resourceAllocationAnnotations, - ResourceHints: &pluginapi.ListOfTopologyHints{ - Hints: []*pluginapi.TopologyHint{ - req.Hint, - }, - }, - }, - }, - }, - Labels: general.DeepCopyMap(req.Labels), - Annotations: general.DeepCopyMap(req.Annotations), - }, nil -} - -func (p *StaticPolicy) getGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { - gpuCount := float64(0) - gpuNames := sets.NewString() - for resourceName := range p.resourceNames { - _, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) - if err != nil && !errors.IsNotFound(err) { - return 0, nil, err - } - gpuCount += request - gpuNames.Insert(resourceName) - } - return gpuCount, gpuNames, nil -} - -func (p *StaticPolicy) calculateHints(gpuMemory float64, gpuReq float64, machineState state.GPUMap, req *pluginapi.ResourceRequest) (map[string]*pluginapi.ListOfTopologyHints, error) { - gpuTopology, numaTopologyReady, err := p.gpuTopologyProvider.GetGPUTopology() - if err != nil { - return nil, err - } - - if !numaTopologyReady { - return nil, fmt.Errorf("numa topology is ready") - } - - perGPUMemory := gpuMemory / gpuReq - general.Infof("gpuMemory: %f, gpuReq: %f, perGPUMemory: %f", gpuMemory, gpuReq, perGPUMemory) - - numaToAvailableGPUCount := make(map[int]float64) - numaToMostAllocatedGPUMemory := make(map[int]float64) - for gpuID, s := range machineState { - if s == nil { - continue - } - - if s.GetGPUMemoryAllocated()+perGPUMemory <= s.GetGPUMemoryAllocatable() { - info, ok := gpuTopology.GPUs[gpuID] - if !ok { - return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) - } - - for _, numaNode := range info.GetNUMANode() { - numaToAvailableGPUCount[numaNode] += 1 - numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetGPUMemoryAllocated(), numaToMostAllocatedGPUMemory[numaNode]) - } - } - } - - numaNodes := make([]int, 0, p.metaServer.NumNUMANodes) - for numaNode := range p.metaServer.NUMAToCPUs { - numaNodes = append(numaNodes, numaNode) - } - sort.Ints(numaNodes) - - minNUMAsCountNeeded, _, err := gpuutil.GetNUMANodesCountToFitGPUReq(gpuReq, p.metaServer.CPUTopology, gpuTopology) - if err != nil { - return nil, err - } - - numaCountPerSocket, err := p.metaServer.NUMAsPerSocket() - if err != nil { - return nil, fmt.Errorf("NUMAsPerSocket failed with error: %v", err) - } - - numaBound := len(numaNodes) - if numaBound > machine.LargeNUMAsPoint { - // [TODO]: to discuss refine minNUMAsCountNeeded+1 - numaBound = minNUMAsCountNeeded + 1 - } - - var 
availableNumaHints []*pluginapi.TopologyHint - machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) { - maskCount := mask.Count() - if maskCount < minNUMAsCountNeeded { - return - } - - maskBits := mask.GetBits() - numaCountNeeded := mask.Count() - - allAvailableGPUsCountInMask := float64(0) - for _, nodeID := range maskBits { - allAvailableGPUsCountInMask += numaToAvailableGPUCount[nodeID] - } - - if allAvailableGPUsCountInMask < gpuReq { - return - } - - crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.metaServer.CPUTopology) - if err != nil { - return - } else if numaCountNeeded <= numaCountPerSocket && crossSockets { - return - } - - preferred := maskCount == minNUMAsCountNeeded - availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{ - Nodes: machine.MaskToUInt64Array(mask), - Preferred: preferred, - }) - }) - - // prefer numa nodes with most allocated gpu memory - p.preferGPUMemoryMostAllocatedHints(availableNumaHints, numaToMostAllocatedGPUMemory) - - // NOTE: because grpc is inability to distinguish between an empty array and nil, - // we return an error instead of an empty array. - // we should resolve this issue if we need to manage multi-resource in one plugin. - if len(availableNumaHints) == 0 { - general.Warningf("got no available gpu memory hints for pod: %s/%s, container: %s", - req.PodNamespace, req.PodName, req.ContainerName) - return nil, gpuutil.ErrNoAvailableGPUMemoryHints - } - - return map[string]*pluginapi.ListOfTopologyHints{ - p.ResourceName(): { - Hints: availableNumaHints, - }, - }, nil -} - -func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(_ context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { if request == nil || len(request.Devices) == 0 { return nil, fmt.Errorf("request is nil") } - gpuTopology := &machine.GPUTopology{ - GPUs: make(map[string]machine.GPUInfo, len(request.Devices)), + customDevicePlugin := p.getCustomDevicePlugin(request.DeviceName) + if customDevicePlugin == nil { + return nil, fmt.Errorf("no custom device plugin found for device %s", request.DeviceName) } - for _, device := range request.Devices { - var numaNode []int - if device.Topology != nil { - numaNode = make([]int, 0, len(device.Topology.Nodes)) - - for _, node := range device.Topology.Nodes { - if node == nil { - continue - } - numaNode = append(numaNode, int(node.ID)) - } - } - - gpuTopology.GPUs[device.ID] = machine.GPUInfo{ - Health: device.Health, - NUMANode: numaNode, - } - } - - err := p.gpuTopologyProvider.SetGPUTopology(gpuTopology) - if err != nil { - general.Errorf("set gpu topology failed with error: %v", err) - return nil, fmt.Errorf("set gpu topology failed with error: %v", err) - } - - general.Infof("got device %s gpuTopology success: %v", request.DeviceName, gpuTopology) - - return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil + return customDevicePlugin.UpdateAllocatableAssociatedDevices(request) } -func (*StaticPolicy) GetAssociatedDeviceTopologyHints(_ context.Context, _ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { +func (*StaticPolicy) GetAssociatedDeviceTopologyHints( + _ context.Context, _ *pluginapi.AssociatedDeviceRequest, +) (*pluginapi.AssociatedDeviceHintsResponse, 
error) { return &pluginapi.AssociatedDeviceHintsResponse{}, nil } -func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { +func (p *StaticPolicy) AllocateAssociatedDevice( + _ context.Context, req *pluginapi.AssociatedDeviceRequest, +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { if req == nil || req.ResourceRequest == nil || req.DeviceRequest == nil { return nil, fmt.Errorf("req is nil") } @@ -913,237 +493,36 @@ func (p *StaticPolicy) AllocateAssociatedDevice(_ context.Context, req *pluginap }, nil } - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req.ResourceRequest, p.podAnnotationKeptKeys, p.podLabelKeptKeys) - if err != nil { - err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", - req.ResourceRequest.PodNamespace, req.ResourceRequest.PodName, req.ResourceRequest.ContainerName, err) - general.Errorf("%s", err.Error()) - return nil, err - } - - general.InfoS("called", - "podNamespace", req.ResourceRequest.PodNamespace, - "podName", req.ResourceRequest.PodName, - "containerName", req.ResourceRequest.ContainerName, - "qosLevel", qosLevel, - "reqAnnotations", req.ResourceRequest.Annotations, - "resourceRequests", req.ResourceRequest.ResourceRequests, - "deviceName", req.DeviceRequest.DeviceName, - "resourceHint", req.ResourceRequest.Hint, - "deviceHint", req.DeviceRequest.Hint, - "availableDevices", req.DeviceRequest.AvailableDevices, - "reusableDevices", req.DeviceRequest.ReusableDevices, - "deviceRequest", req.DeviceRequest.DeviceRequest, - ) - - allocationInfo := p.state.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) - if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { - allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) - for gpuID := range allocationInfo.TopologyAwareAllocations { - allocatedDevices = append(allocatedDevices, gpuID) - } - return &pluginapi.AssociatedDeviceAllocationResponse{ - AllocationResult: &pluginapi.AssociatedDeviceAllocation{ - AllocatedDevices: allocatedDevices, - }, - }, nil - } - - _, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(req.ResourceRequest.ResourceRequests, p.ResourceName(), false) - if err != nil { - return nil, err - } - - // get hint nodes from request - hintNodes, err := machine.NewCPUSetUint64(req.DeviceRequest.GetHint().GetNodes()...) 
- if err != nil { - general.Warningf("failed to get hint nodes: %v", err) - return nil, err - } - - gpuTopology, numaTopologyReady, err := p.gpuTopologyProvider.GetGPUTopology() - if err != nil { - general.Warningf("failed to get gpu topology: %v", err) - return nil, err - } - - if !numaTopologyReady { - general.Warningf("numa topology is not ready") - return nil, fmt.Errorf("numa topology is not ready") - } - - allocatedDevices, allocatedGPUMemory, err := p.calculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req) - if err != nil { - general.Warningf("failed to allocate associated devices: %v", err) - return nil, err - } - - topologyAwareAllocations := make(map[string]state.GPUAllocation) - for _, device := range allocatedDevices { - info, ok := gpuTopology.GPUs[device] - if !ok { - return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) - } - - topologyAwareAllocations[device] = state.GPUAllocation{ - GPUMemoryQuantity: allocatedGPUMemory[device], - NUMANodes: info.GetNUMANode(), - } + customDevicePlugin := p.getCustomDevicePlugin(req.DeviceRequest.DeviceName) + if customDevicePlugin == nil { + return nil, fmt.Errorf("no custom device plugin found for device %s", req.DeviceRequest.DeviceName) } - if allocationInfo == nil { - allocationInfo = &state.AllocationInfo{ - AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: state.GPUAllocation{ - GPUMemoryQuantity: gpuMemoryRequest, - NUMANodes: hintNodes.ToSliceInt(), - }, - } - } - - allocationInfo.TopologyAwareAllocations = topologyAwareAllocations - p.state.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) - machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.state.GetPodEntries(), p.gpuTopologyProvider) - if err != nil { - return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) - } - - p.state.SetMachineState(machineState, true) - - general.InfoS("allocated devices", - "podNamespace", req.ResourceRequest.PodNamespace, - "podName", req.ResourceRequest.PodName, - "containerName", req.ResourceRequest.ContainerName, - "qosLevel", qosLevel, - "allocatedDevices", allocatedDevices) - - return &pluginapi.AssociatedDeviceAllocationResponse{ - AllocationResult: &pluginapi.AssociatedDeviceAllocation{ - AllocatedDevices: allocatedDevices, - }, - }, nil + return customDevicePlugin.AllocateAssociatedDevice(req) } -func (p *StaticPolicy) calculateAssociatedDevices(gpuTopology *machine.GPUTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, request *pluginapi.AssociatedDeviceRequest) ([]string, map[string]float64, error) { - gpuRequest := request.DeviceRequest.GetDeviceRequest() - gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) - - machineState := p.state.GetMachineState() - - allocatedDevices := sets.NewString() - needed := gpuRequest - allocateDevices := func(devices ...string) bool { - for _, device := range devices { - allocatedDevices.Insert(device) - needed-- - if needed == 0 { - return true - } - } - return false +func (p *StaticPolicy) registerResourcePlugins() { + for name := range p.resourcePluginsNames { + initFunc := resourceplugin.ResourcePluginsMap[name] + resourcePlugin := initFunc(p.BasePlugin) + p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin } - - allocatedGPUMemory := func(devices ...string) map[string]float64 { - memory := make(map[string]float64) - for _, 
device := range devices { - memory[device] = gpuMemoryPerGPU - } - return memory - } - - availableDevices := request.DeviceRequest.GetAvailableDevices() - reusableDevices := request.DeviceRequest.GetReusableDevices() - - // allocate must include devices first - for _, device := range reusableDevices { - if machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { - general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) - } - allocateDevices(device) - } - - // if allocated devices is enough, return immediately - if allocatedDevices.Len() >= int(gpuRequest) { - return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil - } - - isNUMAAffinityDevice := func(device string) bool { - info, ok := gpuTopology.GPUs[device] - if !ok { - general.Errorf("failed to find hint node for device %s", device) - return false - } - - // check if gpu's numa node is the subset of hint nodes - // todo support multi numa node - if machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) { - return true - } - return false - } - - sort.SliceStable(availableDevices, func(i, j int) bool { - return machineState.GPUMemoryAllocated(availableDevices[i]) > machineState.GPUMemoryAllocated(availableDevices[j]) - }) - - // second allocate available and numa-affinity gpus - for _, device := range availableDevices { - if allocatedDevices.Has(device) { - continue - } - - if !isNUMAAffinityDevice(device) { - continue - } - - if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { - general.Infof("available numa affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) - continue - } - - if allocateDevices(device) { - return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil - } - } - - return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices)) } -func (p *StaticPolicy) preferGPUMemoryMostAllocatedHints(hints []*pluginapi.TopologyHint, numaToMostAllocatedGPUMemory map[int]float64) { - hintGPUMemoryMostAllocated := make(map[int]float64) - for index, hint := range hints { - if !hint.Preferred { - continue - } - - gpuMemoryMostAllocated := float64(0) - for _, nodeID := range hint.Nodes { - gpuMemoryMostAllocated = math.Max(gpuMemoryMostAllocated, numaToMostAllocatedGPUMemory[int(nodeID)]) - } - hintGPUMemoryMostAllocated[index] = gpuMemoryMostAllocated - } - - mostAllocatedHintIndex := -1 - for index, hint := range hints { - if !hint.Preferred { - continue - } - - if mostAllocatedHintIndex == -1 || hintGPUMemoryMostAllocated[index] > hintGPUMemoryMostAllocated[mostAllocatedHintIndex] { - mostAllocatedHintIndex = index - } +func (p *StaticPolicy) registerCustomDevicePlugins() { + for name := range p.associatedDeviceNames { + initFunc := customdeviceplugin.CustomDevicePluginsMap[name] + customDevicePlugin := initFunc(p.BasePlugin) + p.customDevicePlugins[customDevicePlugin.DeviceName()] = customDevicePlugin } +} - if mostAllocatedHintIndex < 0 { - return - } +func (p *StaticPolicy) getResourcePlugin(resourceName string) 
resourceplugin.ResourcePlugin {
+	resourcePlugin := p.resourcePlugins[resourceName]
+	return resourcePlugin
+}
-	for index, hint := range hints {
-		if !hint.Preferred || mostAllocatedHintIndex == index {
-			continue
-		}
-		hint.Preferred = false
-	}
+func (p *StaticPolicy) getCustomDevicePlugin(deviceName string) customdeviceplugin.CustomDevicePlugin {
+	customDevicePlugin := p.customDevicePlugins[deviceName]
+	return customDevicePlugin
 }
diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go
index 9c5547d24d..32da62bcef 100644
--- a/pkg/config/agent/qrm/gpu_plugin.go
+++ b/pkg/config/agent/qrm/gpu_plugin.go
@@ -27,6 +27,8 @@ type GPUQRMPluginConfig struct {
 	GPUMemoryAllocatablePerGPU resource.Quantity
 	// SkipGPUStateCorruption skip gpu state corruption, and it will be used after updating state properties
 	SkipGPUStateCorruption bool
+	// ResourcePluginsNames holds the names of the enabled GPU resource plugins
+	ResourcePluginsNames []string
 }
 
 func NewGPUQRMPluginConfig() *GPUQRMPluginConfig {

From e3e69ee1ad17b647742132724ca3738a1cff7e35 Mon Sep 17 00:00:00 2001
From: "justin.cheng"
Date: Thu, 2 Oct 2025 14:53:35 +0800
Subject: [PATCH 12/52] chore: add unit tests

---
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  |   9 +-
 .../qrm-plugins/gpu/baseplugin/test_util.go   |  73 ++++
 .../qrm-plugins/gpu/customdeviceplugin/gpu.go |   3 +-
 .../gpu/customdeviceplugin/gpu_test.go        | 379 ++++++++++++++++++
 .../gpu/customdeviceplugin/registry.go        |   2 +-
 .../qrm-plugins/gpu/resourceplugin/gpu_mem.go |   3 +-
 .../gpu/resourceplugin/gpu_mem_test.go        | 351 ++++++++++++++++
 .../gpu/resourceplugin/interface.go           |   3 +-
 .../gpu/resourceplugin/registry.go            |   2 +-
 pkg/util/machine/gpu.go                       |  19 +
 pkg/util/machine/topology.go                  |  23 ++
 11 files changed, 858 insertions(+), 9 deletions(-)
 create mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go

diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index 33d27d7149..d0d8305fd8 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -22,6 +22,10 @@ import (
 	"sort"
 	"sync"
 
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/util/sets"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
@@ -33,9 +37,6 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/metrics"
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 	"github.com/kubewharf/katalyst-core/pkg/util/machine"
-	"k8s.io/apimachinery/pkg/api/errors"
-	"k8s.io/apimachinery/pkg/util/sets"
-	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 )
 
 const (
@@ -239,5 +240,5 @@ func (p *BasePlugin) CalculateAssociatedDevices(
 		}
 	}
 
-	return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, availableDevices len: %d, allocatedDevices len: %d", len(availableDevices), len(allocatedDevices))
+	return nil, nil, fmt.Errorf("not enough available GPUs found in gpuTopology, number of needed GPUs: %d, availableDevices len: %d, allocatedDevices len: %d", gpuRequest, len(availableDevices), len(allocatedDevices))
 }
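The test helpers added below wire a dummy GPU topology and fake machine info into a ready-to-use BasePlugin. A minimal sketch of how they are meant to compose in a sub-plugin test, assuming only the GenerateDummyGPUTopology, NewGPUTopologyProviderStub, and GenerateTestBasePlugin signatures introduced in this patch (the test itself is hypothetical, not part of the change):

	package baseplugin_test

	import (
		"testing"

		"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
		"github.com/kubewharf/katalyst-core/pkg/util/machine"
	)

	func TestBasePluginWiring(t *testing.T) {
		// 4 GPUs spread evenly across 2 NUMA nodes.
		gpuTopology, err := machine.GenerateDummyGPUTopology(4, 2)
		if err != nil {
			t.Fatalf("GenerateDummyGPUTopology failed: %v", err)
		}
		provider := machine.NewGPUTopologyProviderStub(gpuTopology)

		// Dummy machine info: 8 CPUs, 2 NUMA nodes, 2 sockets.
		base, err := baseplugin.GenerateTestBasePlugin(t.TempDir(), provider, 8, 2, 2)
		if err != nil {
			t.Fatalf("GenerateTestBasePlugin failed: %v", err)
		}
		if base.State == nil {
			t.Fatal("expected an initialized checkpoint state")
		}
	}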
diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go b/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go
new file mode 100644
index 0000000000..71d174de6f
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go
@@ -0,0 +1,73 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package baseplugin
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/sets"
+
+	katalyst_base "github.com/kubewharf/katalyst-core/cmd/base"
+	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
+	"github.com/kubewharf/katalyst-core/pkg/config/generic"
+	"github.com/kubewharf/katalyst-core/pkg/metaserver"
+	metaserveragent "github.com/kubewharf/katalyst-core/pkg/metaserver/agent"
+	"github.com/kubewharf/katalyst-core/pkg/metrics"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+)
+
+func makeMetaServer(numCPUs, numNUMAs, socketNum int) *metaserver.MetaServer {
+	cpuTopology, _ := machine.GenerateDummyCPUTopology(numCPUs, socketNum, numNUMAs)
+
+	return &metaserver.MetaServer{
+		MetaAgent: &metaserveragent.MetaAgent{
+			KatalystMachineInfo: &machine.KatalystMachineInfo{
+				CPUTopology: cpuTopology,
+			},
+		},
+	}
+}
+
+func GenerateTestBasePlugin(
+	stateDirectory string, provider machine.GPUTopologyProvider, numCPUs, numNUMAs, socketNum int,
+) (*BasePlugin, error) {
+	qrmConfig := qrm.NewQRMPluginsConfiguration()
+	qosConfig := generic.NewQoSConfiguration()
+	stateImpl, err := state.NewCheckpointState(qrmConfig, stateDirectory, GPUPluginStateFileName, "test_gpu_policy", provider, false, metrics.DummyMetrics{})
+	if err != nil {
+		return nil, fmt.Errorf("NewCheckpointState failed: %v", err)
+	}
+	genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{})
+	if err != nil {
+		return nil, fmt.Errorf("GenerateFakeGenericContext failed: %v", err)
+	}
+	metaServer := makeMetaServer(numCPUs, numNUMAs, socketNum)
+	basePlugin := &BasePlugin{
+		QrmConfig:             qrmConfig,
+		QosConfig:             qosConfig,
+		State:                 stateImpl,
+		GpuTopologyProvider:   provider,
+		Emitter:               metrics.DummyMetrics{},
+		MetaServer:            metaServer,
+		AgentCtx:              &agent.GenericContext{GenericContext: genericCtx, MetaServer: metaServer},
+		AssociatedDevicesName: sets.NewString("nvidia.com/gpu"),
+	}
+	return basePlugin, nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go
index aeb827a873..bb477aaf93 100644
--- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go
@@ -19,13 +19,14 @@ package customdeviceplugin
 import (
 	"fmt"
 
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
"github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) const GPUCustomDevicePluginName = "gpu-custom-device-plugin" diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go new file mode 100644 index 0000000000..540faf719a --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go @@ -0,0 +1,379 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package customdeviceplugin + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestGPUCustomDevicePlugin( + stateDirectory string, numGPUs, numNUMAs, numCPUs, socketNum int, +) (*GPUDevicePlugin, error) { + gpuTopology, err := machine.GenerateDummyGPUTopology(numGPUs, numNUMAs) + if err != nil { + return nil, fmt.Errorf("GenerateDummyGPUTopology failed: %v", err) + } + providerStub := machine.NewGPUTopologyProviderStub(gpuTopology) + + basePlugin, err := baseplugin.GenerateTestBasePlugin(stateDirectory, providerStub, numCPUs, numNUMAs, socketNum) + if err != nil { + return nil, fmt.Errorf("generateTestBasePlugin failed: %v", err) + } + + p := NewGPUDevicePlugin(basePlugin) + + machineState := make(state.GPUMap) + // Initialize machine state to not have any gpu memory allocated + for i := 0; i < numGPUs; i++ { + machineState[fmt.Sprintf("%d", i)] = &state.GPUState{ + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 0, + } + } + + gpuDevicePlugin := p.(*GPUDevicePlugin) + gpuDevicePlugin.State.SetMachineState(machineState, true) + + return gpuDevicePlugin, nil +} + +func TestGPUDevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + req1 := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "nvidia.com/gpu", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "0", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 0, + }, + }, + }, + Health: pluginapi.Healthy, + }, + { + ID: "1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + Health: pluginapi.Healthy, + }, + }, + } + + req2 := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "nvidia.com/gpu", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "0", + Health: pluginapi.Healthy, + }, + { + ID: "1", + Health: pluginapi.Healthy, + }, + }, + } + + p1, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 2, 4, 2) + assert.NoError(t, err) + // For this test case, we override the dummy gpu topology to nil + err = p1.GpuTopologyProvider.SetGPUTopology(nil) + 
assert.NoError(t, err) + + p2, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 2, 4, 2) + assert.NoError(t, err) + // For this test case, we override the dummy gpu topology to nil + err = p2.GpuTopologyProvider.SetGPUTopology(nil) + assert.NoError(t, err) + + testCases := []struct { + name string + plugin *GPUDevicePlugin + req *pluginapi.UpdateAllocatableAssociatedDevicesRequest + expectedGPUs map[string]machine.GPUInfo + }{ + { + name: "Normal case", + plugin: p1, + req: req1, + expectedGPUs: map[string]machine.GPUInfo{ + "0": { + Health: pluginapi.Healthy, + NUMANode: []int{0}, + }, + "1": { + Health: pluginapi.Healthy, + NUMANode: []int{1}, + }, + }, + }, + { + name: "Empty topology", + plugin: p2, + req: req2, + expectedGPUs: map[string]machine.GPUInfo{ + "0": { + Health: pluginapi.Healthy, + NUMANode: nil, + }, + "1": { + Health: pluginapi.Healthy, + NUMANode: nil, + }, + }, + }, + } + for _, tt := range testCases { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + _, err := tt.plugin.UpdateAllocatableAssociatedDevices(tt.req) + assert.NoError(t, err) + + gpuTopology, _, err := tt.plugin.GpuTopologyProvider.GetGPUTopology() + assert.NoError(t, err) + assert.Equal(t, tt.expectedGPUs, gpuTopology.GPUs) + }) + } +} + +func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { + t.Parallel() + + p1, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) + assert.NoError(t, err) + + p2, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) + assert.NoError(t, err) + + p3, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 1, 4, 2) + assert.NoError(t, err) + // Fill up one of the GPUs memory completely + p3.State.SetMachineState(map[string]*state.GPUState{ + "0": { + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 16 * 1024 * 1024 * 1024, + }, + "1": { + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 0 * 1024 * 1024 * 1024, + }, + }, false) + + p4, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) + assert.NoError(t, err) + + p5, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 1, 4, 2) + assert.NoError(t, err) + // Pod is already allocated + p5.State.SetAllocationInfo("pod5", "container5", &state.AllocationInfo{ + AllocatedAllocation: state.GPUAllocation{ + GPUMemoryQuantity: 8 * 1024 * 1024 * 1024, + NUMANodes: []int{0}, + }, + TopologyAwareAllocations: map[string]state.GPUAllocation{ + "1": { + GPUMemoryQuantity: 8 * 1024 * 1024 * 1024, + NUMANodes: []int{0}, + }, + }, + }, false) + + req1 := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ResourceGPUMemory), + PodUid: "pod1", + ContainerName: "container1", + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + }, + }, + DeviceRequest: &pluginapi.DeviceRequest{ + DeviceName: p1.DeviceName(), + AvailableDevices: []string{"2", "3"}, + ReusableDevices: []string{"0", "1"}, + DeviceRequest: 2, + }, + } + + req2 := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ResourceGPUMemory), + PodUid: "pod2", + ContainerName: "container2", + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 32 * 1024 * 1024 * 1024, + }, + }, + DeviceRequest: &pluginapi.DeviceRequest{ + DeviceName: p2.DeviceName(), + ReusableDevices: []string{"0"}, + AvailableDevices: []string{"1", "2"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + 
Nodes: []uint64{0}, + Preferred: true, + }, + }, + } + + req3 := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ResourceGPUMemory), + PodUid: "pod3", + ContainerName: "container3", + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + }, + }, + DeviceRequest: &pluginapi.DeviceRequest{ + DeviceName: p3.DeviceName(), + AvailableDevices: []string{"0", "1"}, + DeviceRequest: 1, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + }, + } + + req4 := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ResourceGPUMemory), + PodUid: "pod4", + ContainerName: "container4", + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + }, + }, + DeviceRequest: &pluginapi.DeviceRequest{ + DeviceName: p4.DeviceName(), + AvailableDevices: []string{"0", "1"}, + DeviceRequest: 1, + }, + } + + req5 := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ResourceGPUMemory), + PodUid: "pod5", + ContainerName: "container5", + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 16 * 1024 * 1024 * 1024, + }, + }, + DeviceRequest: &pluginapi.DeviceRequest{ + DeviceName: p5.DeviceName(), + AvailableDevices: []string{"0", "1"}, + DeviceRequest: 1, + }, + } + + testCases := []struct { + name string + plugin *GPUDevicePlugin + req *pluginapi.AssociatedDeviceRequest + expectedErr bool + checkResp func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) + }{ + { + name: "Use all reusable devices first", + plugin: p1, + req: req1, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.Equal(t, []string{"0", "1"}, resp.AllocationResult.GetAllocatedDevices()) + }, + }, + { + name: "Use available devices after reusable devices are used up", + plugin: p2, + req: req2, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.Equal(t, []string{"0", "1"}, resp.AllocationResult.GetAllocatedDevices()) + }, + }, + { + name: "A GPU device is completely used up", + plugin: p3, + req: req3, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.Equal(t, []string{"1"}, resp.AllocationResult.GetAllocatedDevices()) + }, + }, + { + name: "Request has no hints", + plugin: p4, + req: req4, + expectedErr: true, + }, + { + name: "Pod is already allocated in state", + plugin: p5, + req: req5, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.Equal(t, []string{"1"}, resp.AllocationResult.GetAllocatedDevices()) + }, + }, + } + + for _, tt := range testCases { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + resp, err := tt.plugin.AllocateAssociatedDevice(tt.req) + + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + + if tt.checkResp != nil { + tt.checkResp(t, resp) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go 
b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go index a6507d0ebc..0975ff5fa0 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go @@ -20,7 +20,7 @@ import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" type initFunc func(plugin *baseplugin.BasePlugin) CustomDevicePlugin -var CustomDevicePluginsMap map[string]initFunc +var CustomDevicePluginsMap = make(map[string]initFunc) func registerCustomDevicePlugin(pluginName string, initFunc initFunc) { CustomDevicePluginsMap[pluginName] = initFunc diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go index 1c7d60d8de..16a8ecc33a 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go @@ -22,6 +22,8 @@ import ( "sort" "sync" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" @@ -34,7 +36,6 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" "github.com/kubewharf/katalyst-core/pkg/util/metric" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) const ( diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go new file mode 100644 index 0000000000..6233639d07 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go @@ -0,0 +1,351 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package resourceplugin + +import ( + "fmt" + "testing" + + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestGPUMemPlugin(stateDirectory string, numGPUs, numNUMAs, numCPUs, socketNum int) (*GPUMemPlugin, error) { + gpuTopology, err := machine.GenerateDummyGPUTopology(numGPUs, numNUMAs) + if err != nil { + return nil, fmt.Errorf("GenerateDummyGPUTopology failed: %v", err) + } + providerStub := machine.NewGPUTopologyProviderStub(gpuTopology) + + basePlugin, err := baseplugin.GenerateTestBasePlugin(stateDirectory, providerStub, numCPUs, numNUMAs, socketNum) + if err != nil { + return nil, fmt.Errorf("generateTestBasePlugin failed: %v", err) + } + + p := NewGPUMemPlugin(basePlugin) + + machineState := make(state.GPUMap) + // Initialize machine state to not have any gpu memory allocated + for i := 0; i < numGPUs; i++ { + machineState[fmt.Sprintf("%d", i)] = &state.GPUState{ + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 0, + } + } + + gpuMemPlugin := p.(*GPUMemPlugin) + gpuMemPlugin.State.SetMachineState(machineState, true) + + return gpuMemPlugin, nil +} + +func TestGPUMemPlugin_GetTopologyHints(t *testing.T) { + t.Parallel() + + req1 := &pluginapi.ResourceRequest{ + PodUid: "pod1", + PodNamespace: "default", + PodName: "pod-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + "nvidia.com/gpu": 2, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + } + + req2 := &pluginapi.ResourceRequest{ + PodUid: "pod2", + PodNamespace: "default", + PodName: "pod-2", + ContainerName: "container-2", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + "nvidia.com/gpu": 2, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + } + + reqWithInvalidQoS := &pluginapi.ResourceRequest{ + PodUid: "pod3", + PodNamespace: "default", + PodName: "pod-3", + ContainerName: "container-3", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + Annotations: map[string]string{}, + } + + p1, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) + assert.NoError(t, err) + + p2, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) + assert.NoError(t, err) + + // pod1 is already allocated to numa node 1 + p2.State.SetAllocationInfo("pod1", "container-1", &state.AllocationInfo{ + AllocationMeta: commonstate.AllocationMeta{ + PodUid: "pod1", + PodNamespace: "default", + PodName: "pod-1", + ContainerName: "container-1", + }, + AllocatedAllocation: state.GPUAllocation{ + NUMANodes: []int{1}, + }, + }, false) + + p3, err := generateTestGPUMemPlugin(t.TempDir(), 2, 1, 4, 1) + assert.NoError(t, err) + + // All GPU memory have been used up + p3.State.SetMachineState(map[string]*state.GPUState{ + "0": { + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 16 * 1024 * 1024 * 
1024, + }, + "1": { + GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, + GPUMemoryAllocated: 16 * 1024 * 1024 * 1024, + }, + }, false) + + testCases := []struct { + name string + plugin *GPUMemPlugin + req *pluginapi.ResourceRequest + expectedErr bool + checkResp func(t *testing.T, resp *pluginapi.ResourceHintsResponse) + }{ + { + name: "normal case", + plugin: p1, + req: req1, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.ResourceHints) + assert.Contains(t, resp.ResourceHints, p1.ResourceName()) + hints := resp.ResourceHints[p1.ResourceName()] + assert.NotNil(t, hints) + assert.Equal(t, 3, len(hints.Hints)) + }, + }, + { + name: "get hints for another pod", + plugin: p1, + req: req2, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.ResourceHints) + assert.Contains(t, resp.ResourceHints, p1.ResourceName()) + hints := resp.ResourceHints[p1.ResourceName()] + assert.NotNil(t, hints) + assert.Equal(t, 3, len(hints.Hints)) + }, + }, + { + name: "regenerate hints for existing pod", + plugin: p2, + req: req1, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.ResourceHints) + assert.Contains(t, resp.ResourceHints, p2.ResourceName()) + hints := resp.ResourceHints[p2.ResourceName()] + assert.NotNil(t, hints) + assert.Equal(t, 1, len(hints.Hints)) + assert.Equal(t, hints.Hints[0].Nodes, []uint64{1}) + assert.Equal(t, hints.Hints[0].Preferred, true) + }, + }, + { + name: "insufficient gpu memory", + plugin: p3, + req: req1, + expectedErr: true, + checkResp: nil, + }, + { + name: "request with invalid QoS", + plugin: p1, + req: reqWithInvalidQoS, + expectedErr: true, + checkResp: nil, + }, + } + + for _, tt := range testCases { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + resp, err := tt.plugin.GetTopologyHints(tt.req) + fmt.Println(resp) + + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + + if tt.checkResp != nil { + tt.checkResp(t, resp) + } + }) + } +} + +func TestGPUMemPlugin_Allocate(t *testing.T) { + t.Parallel() + + req1 := &pluginapi.ResourceRequest{ + PodUid: "pod1", + PodNamespace: "default", + PodName: "pod-1", + ContainerName: "container-1", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + "nvidia.com/gpu": 1, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + Preferred: true, + }, + } + + reqForInitContainer := &pluginapi.ResourceRequest{ + PodUid: "pod2", + PodNamespace: "default", + PodName: "pod-2", + ContainerName: "container-2", + ContainerType: pluginapi.ContainerType_INIT, + ContainerIndex: 0, + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + "nvidia.com/gpu": 1, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + } + + reqForSidecarContainer := &pluginapi.ResourceRequest{ + PodUid: "pod3", + PodNamespace: "default", + PodName: "pod-3", + ContainerName: "container-3", + ContainerType: pluginapi.ContainerType_SIDECAR, + ContainerIndex: 0, + ResourceRequests: map[string]float64{ + 
string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, + "nvidia.com/gpu": 1, + }, + Annotations: map[string]string{ + consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, + }, + } + + p1, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) + assert.NoError(t, err) + + p2, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) + assert.NoError(t, err) + + p3, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) + assert.NoError(t, err) + + testCases := []struct { + name string + plugin *GPUMemPlugin + req *pluginapi.ResourceRequest + expectedErr bool + checkResp func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) + }{ + { + name: "normal case", + plugin: p1, + req: req1, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.NotNil(t, resp.AllocationResult.ResourceAllocation) + assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].AllocatedQuantity, float64(8*1024*1024*1024)) + assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].ResourceHints.Hints[0], req1.Hint) + }, + }, + { + name: "allocate for sidecar container", + plugin: p2, + req: reqForSidecarContainer, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) { + assert.NotNil(t, resp) + assert.NotNil(t, resp.AllocationResult) + assert.NotNil(t, resp.AllocationResult.ResourceAllocation) + assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].AllocatedQuantity, float64(0)) + }, + }, + { + name: "allocate for init containers", + plugin: p3, + req: reqForInitContainer, + expectedErr: false, + checkResp: func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) { + assert.NotNil(t, resp) + assert.Nil(t, resp.AllocationResult) + }, + }, + } + for _, tt := range testCases { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + resp, err := tt.plugin.Allocate(tt.req) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + if tt.checkResp != nil { + tt.checkResp(t, resp) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go index f4170f3125..2757f23fbd 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -17,9 +17,10 @@ limitations under the License. package resourceplugin import ( + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) // ResourcePlugin knows how to handle resource requests for a specific resource type. 
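A note on the registry change below (and the matching CustomDevicePluginsMap change earlier in this patch): sub-plugins register themselves from package init functions, so the registry map must be created with make before any init runs, because a write to a nil map panics at process start. A self-contained sketch of the pattern, with hypothetical names not taken from this code base:

	package main

	import "fmt"

	type plugin interface{ Name() string }

	type initFunc func() plugin

	// The map must be initialized with make: the package init below runs
	// before main, and a write to a nil map panics at startup.
	var pluginsMap = make(map[string]initFunc)

	func register(name string, f initFunc) { pluginsMap[name] = f }

	func init() {
		register("example_plugin", func() plugin { return nil })
	}

	func main() {
		fmt.Println(len(pluginsMap)) // prints 1
	}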
diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go index 768725cc56..e8475bfbcb 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go @@ -22,7 +22,7 @@ import ( type initFunc func(plugin *baseplugin.BasePlugin) ResourcePlugin -var ResourcePluginsMap map[string]initFunc +var ResourcePluginsMap = make(map[string]initFunc) func registerResourcePlugin(pluginName string, initFunc initFunc) { ResourcePluginsMap[pluginName] = initFunc diff --git a/pkg/util/machine/gpu.go b/pkg/util/machine/gpu.go index ad232e51ca..3b0a400e8c 100644 --- a/pkg/util/machine/gpu.go +++ b/pkg/util/machine/gpu.go @@ -130,3 +130,22 @@ func checkNUMATopologyReady(topology *GPUTopology) bool { } return true } + +type gpuTopologyProviderStub struct { + gpuTopology *GPUTopology +} + +func NewGPUTopologyProviderStub(gpuTopology *GPUTopology) GPUTopologyProvider { + return &gpuTopologyProviderStub{ + gpuTopology: gpuTopology, + } +} + +func (g *gpuTopologyProviderStub) GetGPUTopology() (*GPUTopology, bool, error) { + return g.gpuTopology, true, nil +} + +func (g *gpuTopologyProviderStub) SetGPUTopology(gpuTopology *GPUTopology) error { + g.gpuTopology = gpuTopology + return nil +} diff --git a/pkg/util/machine/topology.go b/pkg/util/machine/topology.go index cfe568bb78..4529b74aaa 100644 --- a/pkg/util/machine/topology.go +++ b/pkg/util/machine/topology.go @@ -296,6 +296,29 @@ func GenerateDummyExtraTopology(numaNum int) (*ExtraTopologyInfo, error) { return extraTopology, nil } +// GenerateDummyGPUTopology generates a dummy GPU topology such that a gpu has affinity to a single NUMA node +// and the gpus are evenly distributed across all NUMA nodes. +func GenerateDummyGPUTopology(gpuNum, numaNum int) (*GPUTopology, error) { + if gpuNum%numaNum != 0 { + return nil, fmt.Errorf("invalid gpu number: %d and NUMA number: %d", gpuNum, numaNum) + } else if gpuNum%2 != 0 { + return nil, fmt.Errorf("invalid gpu number as it should be even: %d", gpuNum) + } + gpuTopology := new(GPUTopology) + gpuTopology.GPUs = make(map[string]GPUInfo, gpuNum) + perNumaGPU := gpuNum / numaNum + for i := 0; i < numaNum; i++ { + for j := 0; j < perNumaGPU; j++ { + gpuID := fmt.Sprintf("%d", i*perNumaGPU+j) + gpuTopology.GPUs[gpuID] = GPUInfo{ + Health: "healthy", + NUMANode: []int{i}, + } + } + } + return gpuTopology, nil +} + // CPUTopoInfo contains the NUMA, socket, and core IDs associated with a CPU. 
type CPUTopoInfo struct {
 	NUMANodeID int

From 2ebc7099e1727c4369e4414bc9b8de811fcb2750 Mon Sep 17 00:00:00 2001
From: "justin.cheng"
Date: Fri, 10 Oct 2025 18:58:36 +0800
Subject: [PATCH 13/52] feat: introduce rdma state and allow states to share within gpu sub-plugins

---
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  |  72 ++++--
 pkg/agent/qrm-plugins/gpu/consts/consts.go    |   6 +
 .../qrm-plugins/gpu/customdeviceplugin/gpu.go |  56 ++--
 .../gpu/customdeviceplugin/interface.go       |   4 +
 .../{ => registry}/registry.go                |  14 +-
 .../resourceplugin/{ => gpumemory}/gpu_mem.go | 212 ++++++++++++---
 .../{ => gpumemory}/gpu_mem_test.go           |   2 +-
 .../gpu/resourceplugin/interface.go           |   9 +-
 .../resourceplugin/{ => registry}/registry.go |  12 +-
 .../gpumemory}/checkpoint.go                  |   2 +-
 .../{ => resourceplugin/gpumemory}/state.go   |  56 ++--
 .../gpumemory}/state_checkpoint.go            |  23 +-
 .../gpumemory}/state_mem.go                   |  22 +-
 .../{ => resourceplugin/gpumemory}/util.go    |  63 ++++-
 .../state/resourceplugin/rdma/checkpoint.go   |  46 ++++
 .../gpu/state/resourceplugin/rdma/state.go    | 206 +++++++++++++++
 .../resourceplugin/rdma/state_checkpoint.go   | 241 ++++++++++++++++++
 .../state/resourceplugin/rdma/state_mem.go    | 112 ++++++++
 .../gpu/state/resourceplugin/rdma/util.go     |  56 ++++
 .../qrm-plugins/gpu/staticpolicy/policy.go    | 225 ++++++++--------
 pkg/agent/qrm-plugins/gpu/util/util.go        |  53 +---
 pkg/util/machine/device.go                    | 153 +++++++++++
 pkg/util/machine/rdma.go                      |  36 +++
 23 files changed, 1370 insertions(+), 311 deletions(-)
 rename pkg/agent/qrm-plugins/gpu/customdeviceplugin/{ => registry}/registry.go (61%)
 rename pkg/agent/qrm-plugins/gpu/resourceplugin/{ => gpumemory}/gpu_mem.go (70%)
 rename pkg/agent/qrm-plugins/gpu/resourceplugin/{ => gpumemory}/gpu_mem_test.go (99%)
 rename pkg/agent/qrm-plugins/gpu/resourceplugin/{ => registry}/registry.go (61%)
 rename pkg/agent/qrm-plugins/gpu/state/{ => resourceplugin/gpumemory}/checkpoint.go (99%)
 rename pkg/agent/qrm-plugins/gpu/state/{ => resourceplugin/gpumemory}/state.go (80%)
 rename pkg/agent/qrm-plugins/gpu/state/{ => resourceplugin/gpumemory}/state_checkpoint.go (89%)
 rename pkg/agent/qrm-plugins/gpu/state/{ => resourceplugin/gpumemory}/state_mem.go (86%)
 rename pkg/agent/qrm-plugins/gpu/state/{ => resourceplugin/gpumemory}/util.go (55%)
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go
 create mode 100644 pkg/util/machine/device.go
 create mode 100644 pkg/util/machine/rdma.go

diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index d0d8305fd8..f0c31f65d8 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -22,13 +22,13 @@ import (
 	"sort"
 	"sync"
 
+	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
-	gpuconsts
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" @@ -45,49 +45,69 @@ const ( // BasePlugin is a shared plugin that provides common functionalities and fields for GPU resource plugins and custom device plugins. type BasePlugin struct { - sync.Mutex + mu sync.RWMutex + *config.Configuration - QosConfig *generic.QoSConfiguration - QrmConfig *qrm.QRMPluginsConfiguration - GpuTopologyProvider machine.GPUTopologyProvider + QosConfig *generic.QoSConfiguration + QrmConfig *qrm.QRMPluginsConfiguration Emitter metrics.MetricEmitter MetaServer *metaserver.MetaServer AgentCtx *agent.GenericContext - State state.State PodAnnotationKeptKeys []string PodLabelKeptKeys []string AssociatedDevicesName sets.String + + // Map of checkpoints for each sub-plugin + StateCheckpointsMap map[string]interface{} + // Registry of device topology providers + DeviceTopologyRegistry *machine.DeviceTopologyRegistry } func NewBasePlugin( agentCtx *agent.GenericContext, conf *config.Configuration, wrappedEmitter metrics.MetricEmitter, ) (*BasePlugin, error) { - gpuTopologyProvider := machine.NewGPUTopologyProvider(conf.GPUResourceNames) - stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, - gpuconsts.GPUResourcePluginPolicyNameStatic, gpuTopologyProvider, conf.SkipGPUStateCorruption, wrappedEmitter) - - if err != nil { - return nil, err - } + deviceTopologyRegistry := machine.NewDeviceTopologyRegistry() return &BasePlugin{ - QosConfig: conf.QoSConfiguration, - QrmConfig: conf.QRMPluginsConfiguration, - GpuTopologyProvider: gpuTopologyProvider, + QosConfig: conf.QoSConfiguration, + QrmConfig: conf.QRMPluginsConfiguration, Emitter: wrappedEmitter, MetaServer: agentCtx.MetaServer, AgentCtx: agentCtx, - State: stateImpl, PodAnnotationKeptKeys: conf.PodAnnotationKeptKeys, PodLabelKeptKeys: conf.PodLabelKeptKeys, AssociatedDevicesName: sets.NewString(conf.GPUResourceNames...), + + StateCheckpointsMap: make(map[string]interface{}), + DeviceTopologyRegistry: deviceTopologyRegistry, }, nil } +// RegisterCheckpoint is a hook to register a state for a sub-plugin so all other plugins can access it +func (p *BasePlugin) RegisterCheckpoint(pluginName string, checkpoint interface{}) { + p.mu.Lock() + defer p.mu.Unlock() + + p.StateCheckpointsMap[pluginName] = checkpoint +} + +// GetCheckpoint gets the checkpoint using the sub-plugin name +func (p *BasePlugin) GetCheckpoint(pluginName string) (interface{}, error) { + p.mu.RLock() + defer p.mu.RUnlock() + + checkpoint, ok := p.StateCheckpointsMap[pluginName] + if !ok { + return nil, fmt.Errorf("no checkpoint found for plugin %s", pluginName) + } + + return checkpoint, nil +} + func (p *BasePlugin) GetGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { gpuCount := float64(0) gpuNames := sets.NewString() @@ -114,7 +134,7 @@ func (p *BasePlugin) GetResourcePluginOptions( } func (p *BasePlugin) PackAllocationResponse( - req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, + req *pluginapi.ResourceRequest, allocationInfo *gpumemorystate.AllocationInfo, resourceAllocationAnnotations 
map[string]string, resourceName string, ) (*pluginapi.ResourceAllocationResponse, error) { if allocationInfo == nil { @@ -154,13 +174,21 @@ func (p *BasePlugin) PackAllocationResponse( } func (p *BasePlugin) CalculateAssociatedDevices( - gpuTopology *machine.GPUTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, + gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, request *pluginapi.AssociatedDeviceRequest, ) ([]string, map[string]float64, error) { gpuRequest := request.DeviceRequest.GetDeviceRequest() gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) - machineState := p.State.GetMachineState() + state, err := p.GetCheckpoint(gpuconsts.GPUMemPluginName) + if err != nil { + return nil, nil, err + } + gpuMemoryState, ok := state.(gpumemorystate.State) + if !ok { + return nil, nil, fmt.Errorf("failed to convert state checkpoint to gpumemorystate.State") + } + machineState := gpuMemoryState.GetMachineState() allocatedDevices := sets.NewString() needed := gpuRequest @@ -201,7 +229,7 @@ func (p *BasePlugin) CalculateAssociatedDevices( } isNUMAAffinityDevice := func(device string) bool { - info, ok := gpuTopology.GPUs[device] + info, ok := gpuTopology.Devices[device] if !ok { general.Errorf("failed to find hint node for device %s", device) return false diff --git a/pkg/agent/qrm-plugins/gpu/consts/consts.go b/pkg/agent/qrm-plugins/gpu/consts/consts.go index 4b5aa3b040..d712ed74b4 100644 --- a/pkg/agent/qrm-plugins/gpu/consts/consts.go +++ b/pkg/agent/qrm-plugins/gpu/consts/consts.go @@ -27,6 +27,8 @@ import ( type AllocatedResource struct { ResourceName string + PodName string + PodNamespace string *pluginapi.TopologyAwareResource } @@ -36,6 +38,8 @@ type AllocatableResource struct { } const ( + GPUDeviceName = "gpu_device" + RDMADeviceName = "rdma_device" // GPUResourcePluginPolicyNameStatic is the policy name of static gpu resource plugin GPUResourcePluginPolicyNameStatic = string(consts.ResourcePluginPolicyNameStatic) @@ -43,6 +47,8 @@ const ( GPUPluginDynamicPolicyName = qrm.QRMPluginNameGPU + "_" + GPUResourcePluginPolicyNameStatic ClearResidualState = GPUPluginDynamicPolicyName + "_clear_residual_state" + GPUMemPluginName = "gpu_mem_resource_plugin" + StateCheckPeriod = 30 * time.Second StateCheckTolerationTimes = 3 MaxResidualTime = 5 * time.Minute diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go index bb477aaf93..8251471b0e 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go @@ -19,11 +19,13 @@ package customdeviceplugin import ( "fmt" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/metrics" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" @@ -35,10 +37,13 @@ type GPUDevicePlugin struct { *baseplugin.BasePlugin } -func NewGPUDevicePlugin(base *baseplugin.BasePlugin) CustomDevicePlugin { +func NewGPUDevicePlugin(base 
*baseplugin.BasePlugin, _ metrics.MetricEmitter) (CustomDevicePlugin, error) { + gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUResourceNames) + base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceName, gpuTopologyProvider) + return &GPUDevicePlugin{ BasePlugin: base, - } + }, nil } func (p *GPUDevicePlugin) DeviceName() string { @@ -46,8 +51,8 @@ func (p *GPUDevicePlugin) DeviceName() string { } func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { - gpuTopology := &machine.GPUTopology{ - GPUs: make(map[string]machine.GPUInfo, len(request.Devices)), + gpuTopology := &machine.DeviceTopology{ + Devices: make(map[string]machine.DeviceInfo, len(request.Devices)), } for _, device := range request.Devices { @@ -63,13 +68,13 @@ func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi. } } - gpuTopology.GPUs[device.ID] = machine.GPUInfo{ - Health: device.Health, - NUMANode: numaNode, + gpuTopology.Devices[device.ID] = machine.DeviceInfo{ + Health: device.Health, + NumaNodes: numaNode, } } - err := p.GpuTopologyProvider.SetGPUTopology(gpuTopology) + err := p.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.GPUDeviceName, gpuTopology) if err != nil { general.Errorf("set gpu topology failed with error: %v", err) return nil, fmt.Errorf("set gpu topology failed with error: %v", err) @@ -108,7 +113,13 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi "deviceRequest", req.DeviceRequest.DeviceRequest, ) - allocationInfo := p.State.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) + gpuMemoryState, ok := p.StateCheckpointsMap[gpuconsts.GPUMemPluginName].(gpumemorystate.State) + if !ok { + return nil, fmt.Errorf("failed to convert state checkpoint to gpumemorystate.State") + } + machineState := gpuMemoryState.GetMachineState() + + allocationInfo := gpuMemoryState.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) for gpuID := range allocationInfo.TopologyAwareAllocations { @@ -133,7 +144,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi return nil, err } - gpuTopology, numaTopologyReady, err := p.GpuTopologyProvider.GetGPUTopology() + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) if err != nil { general.Warningf("failed to get gpu topology: %v", err) return nil, err @@ -150,23 +161,23 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi return nil, err } - topologyAwareAllocations := make(map[string]state.GPUAllocation) + topologyAwareAllocations := make(map[string]gpumemorystate.GPUAllocation) for _, device := range allocatedDevices { - info, ok := gpuTopology.GPUs[device] + info, ok := gpuTopology.Devices[device] if !ok { return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) } - topologyAwareAllocations[device] = state.GPUAllocation{ + topologyAwareAllocations[device] = gpumemorystate.GPUAllocation{ GPUMemoryQuantity: allocatedGPUMemory[device], NUMANodes: info.GetNUMANode(), } } if allocationInfo == nil { - allocationInfo = &state.AllocationInfo{ + allocationInfo = &gpumemorystate.AllocationInfo{ 
AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: state.GPUAllocation{ + AllocatedAllocation: gpumemorystate.GPUAllocation{ GPUMemoryQuantity: gpuMemoryRequest, NUMANodes: hintNodes.ToSliceInt(), }, @@ -174,13 +185,13 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi } allocationInfo.TopologyAwareAllocations = topologyAwareAllocations - p.State.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) - machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider) + gpuMemoryState.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) + machineState, err = gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, gpuMemoryState.GetPodEntries(), p.DeviceTopologyRegistry) if err != nil { return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) } - p.State.SetMachineState(machineState, true) + gpuMemoryState.SetMachineState(machineState, true) general.InfoS("allocated devices", "podNamespace", req.ResourceRequest.PodNamespace, @@ -195,3 +206,10 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi }, }, nil } + +func (p *GPUDevicePlugin) RemovePod(_ string) error { + // nothing to do + return nil +} + +func (p *GPUDevicePlugin) ClearResidualState() {} // Nothing to do diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go index da4ebce558..99f1f7f9a8 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -28,4 +28,8 @@ type CustomDevicePlugin interface { UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) AllocateAssociatedDevice(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) + + RemovePod(podUID string) error + + ClearResidualState() } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go similarity index 61% rename from pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go rename to pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go index 0975ff5fa0..c0f144d11e 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go @@ -14,11 +14,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package customdeviceplugin +package registry -import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) -type initFunc func(plugin *baseplugin.BasePlugin) CustomDevicePlugin +type initFunc func( + plugin *baseplugin.BasePlugin, emitter metrics.MetricEmitter, +) (customdeviceplugin.CustomDevicePlugin, error) var CustomDevicePluginsMap = make(map[string]initFunc) @@ -27,5 +33,5 @@ func registerCustomDevicePlugin(pluginName string, initFunc initFunc) { } func init() { - registerCustomDevicePlugin(GPUCustomDevicePluginName, NewGPUDevicePlugin) + registerCustomDevicePlugin(customdeviceplugin.GPUCustomDevicePluginName, customdeviceplugin.NewGPUDevicePlugin) } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go similarity index 70% rename from pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go rename to pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 16a8ecc33a..af23b8c8a7 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -14,14 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. */ -package resourceplugin +package gpumemory import ( + "context" "fmt" "math" "sort" "sync" + "time" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" @@ -29,7 +34,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory" gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/metrics" @@ -39,18 +44,34 @@ import ( ) const ( - GPUMemPluginName = "gpu_mem_resource_plugin" + GPUMemPluginStateFileName = "gpu_mem_plugin_state" ) type GPUMemPlugin struct { sync.Mutex *baseplugin.BasePlugin + state gpumemorystate.State + + residualHitMap map[string]int64 } -func NewGPUMemPlugin(base *baseplugin.BasePlugin) ResourcePlugin { - return &GPUMemPlugin{ - BasePlugin: base, +func NewGPUMemPlugin( + base *baseplugin.BasePlugin, wrappedEmitter metrics.MetricEmitter, +) (resourceplugin.ResourcePlugin, error) { + stateImpl, err := gpumemorystate.NewCheckpointState(base.QRMPluginsConfiguration, base.GenericQRMPluginConfiguration.StateFileDirectory, GPUMemPluginStateFileName, + gpuconsts.GPUResourcePluginPolicyNameStatic, base.DeviceTopologyRegistry, base.SkipGPUStateCorruption, wrappedEmitter) + + if err != nil { + return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err) } + + base.RegisterCheckpoint(gpuconsts.GPUMemPluginName, stateImpl) + + return &GPUMemPlugin{ + BasePlugin: base, + state: stateImpl, + residualHitMap: make(map[string]int64), + }, nil } 
func (p *GPUMemPlugin) ResourceName() string { @@ -96,7 +117,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p p.Lock() defer func() { - if err := p.State.StoreState(); err != nil { + if err := p.state.StoreState(); err != nil { general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) } p.Unlock() @@ -112,21 +133,21 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p }() var hints map[string]*pluginapi.ListOfTopologyHints - machineState := p.State.GetMachineState() - allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName) + machineState := p.state.GetMachineState() + allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) if allocationInfo != nil { - hints = gpuutil.RegenerateGPUMemoryHints(allocationInfo, false) + hints = gpumemorystate.RegenerateGPUMemoryHints(allocationInfo, false) // regenerateHints failed. need to clear container record and re-calculate. if hints == nil { - podEntries := p.State.GetPodEntries() + podEntries := p.state.GetPodEntries() delete(podEntries[req.PodUid], req.ContainerName) if len(podEntries[req.PodUid]) == 0 { delete(podEntries, req.PodUid) } var err error - machineState, err = state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider) + machineState, err = gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, p.state.GetPodEntries(), p.DeviceTopologyRegistry) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -149,15 +170,15 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p } func (p *GPUMemPlugin) calculateHints( - gpuMemory float64, gpuReq float64, machineState state.GPUMap, req *pluginapi.ResourceRequest, + gpuMemory float64, gpuReq float64, machineState gpumemorystate.GPUMap, req *pluginapi.ResourceRequest, ) (map[string]*pluginapi.ListOfTopologyHints, error) { - gpuTopology, numaTopologyReady, err := p.GpuTopologyProvider.GetGPUTopology() + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) if err != nil { return nil, err } if !numaTopologyReady { - return nil, fmt.Errorf("numa topology is ready") + return nil, fmt.Errorf("numa topology is not ready") } perGPUMemory := gpuMemory / gpuReq @@ -171,7 +192,7 @@ func (p *GPUMemPlugin) calculateHints( } if s.GetGPUMemoryAllocated()+perGPUMemory <= s.GetGPUMemoryAllocatable() { - info, ok := gpuTopology.GPUs[gpuID] + info, ok := gpuTopology.Devices[gpuID] if !ok { return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) } @@ -296,9 +317,14 @@ func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints( } } -func (p *GPUMemPlugin) GetTopologyAwareResources(allocationInfo *state.AllocationInfo) *gpuconsts.AllocatedResource { +func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") + allocationInfo := p.state.GetAllocationInfo(podUID, containerName) + if allocationInfo == nil { + return nil, nil + } + topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ @@ -311,27 
+337,36 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(allocationInfo *state.Allocatio }) } - resp := &gpuconsts.AllocatedResource{ - ResourceName: p.ResourceName(), - TopologyAwareResource: &pluginapi.TopologyAwareResource{ - IsNodeResource: true, - IsScalarResource: true, - AggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - TopologyAwareQuantityList: topologyAwareQuantityList, - OriginalTopologyAwareQuantityList: topologyAwareQuantityList, + resp := &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: podUID, + PodName: allocationInfo.PodName, + PodNamespace: allocationInfo.PodNamespace, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: containerName, + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + p.ResourceName(): { + IsNodeResource: true, + IsScalarResource: true, + AggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, + OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, + TopologyAwareQuantityList: topologyAwareQuantityList, + OriginalTopologyAwareQuantityList: topologyAwareQuantityList, + }, + }, }, } - return resp + return resp, nil } -func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(machineState state.GPUMap) *gpuconsts.AllocatableResource { +func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) { general.InfofV(4, "called") p.Lock() defer p.Unlock() + machineState := p.state.GetMachineState() + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 @@ -366,7 +401,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(machineState state.G AggregatedCapacityQuantity: aggregatedCapacityQuantity, TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, }, - } + }, nil } func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) { @@ -403,7 +438,7 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso p.Lock() defer func() { - if err := p.State.StoreState(); err != nil { + if err := p.state.StoreState(); err != nil { general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) } p.Unlock() @@ -437,10 +472,10 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return emptyResponse, nil } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating - return p.PackAllocationResponse(req, &state.AllocationInfo{}, nil, p.ResourceName()) + return p.PackAllocationResponse(req, &gpumemorystate.AllocationInfo{}, nil, p.ResourceName()) } - allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName) + allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) if allocationInfo != nil { resp, packErr := p.PackAllocationResponse(req, allocationInfo, nil, p.ResourceName()) if packErr != nil { @@ -458,17 +493,17 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return nil, fmt.Errorf("failed to get hint nodes: %v", err) } - newAllocation := &state.AllocationInfo{ + 
newAllocation := &gpumemorystate.AllocationInfo{ AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: state.GPUAllocation{ + AllocatedAllocation: gpumemorystate.GPUAllocation{ GPUMemoryQuantity: gpuMemory, NUMANodes: hintNodes.ToSliceInt(), }, } - p.State.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false) + p.state.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false) - machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.QrmConfig, p.State.GetPodEntries(), p.GpuTopologyProvider) + machineState, stateErr := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, p.state.GetPodEntries(), p.DeviceTopologyRegistry) if stateErr != nil { general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", "podNamespace", req.PodNamespace, @@ -479,7 +514,110 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso } // update state cache - p.State.SetMachineState(machineState, true) + p.state.SetMachineState(machineState, true) return p.PackAllocationResponse(req, newAllocation, nil, p.ResourceName()) } + +func (p *GPUMemPlugin) RemovePod(podUID string) error { + podEntries := p.state.GetPodEntries() + delete(podEntries, podUID) + + machineState, err := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.DeviceTopologyRegistry) + if err != nil { + general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) + return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) + } + + p.state.SetPodEntries(podEntries, false) + p.state.SetMachineState(machineState, false) + + err = p.state.StoreState() + if err != nil { + general.Errorf("store state failed with error: %v", err) + return err + } + return nil +} + +func (p *GPUMemPlugin) ClearResidualState() { + general.Infof("exec") + var ( + err error + podList []*v1.Pod + ) + residualSet := make(map[string]bool) + + defer func() { + _ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err) + }() + + if p.MetaServer == nil { + general.Errorf("nil metaServer") + return + } + + ctx := context.Background() + podList, err = p.MetaServer.GetPodList(ctx, nil) + if err != nil { + general.Errorf("get pod list failed: %v", err) + return + } + + podSet := sets.NewString() + for _, pod := range podList { + podSet.Insert(fmt.Sprintf("%v", pod.UID)) + } + + p.Lock() + defer p.Unlock() + + podEntries := p.state.GetPodEntries() + for podUID := range podEntries { + if !podSet.Has(podUID) { + residualSet[podUID] = true + p.residualHitMap[podUID] += 1 + general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) + } + } + + podsToDelete := sets.NewString() + for podUID, hitCount := range p.residualHitMap { + if !residualSet[podUID] { + general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID) + delete(p.residualHitMap, podUID) + continue + } + + if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime { + podsToDelete.Insert(podUID) + } + } + + if podsToDelete.Len() > 0 { + for { + podUID, found := podsToDelete.PopAny() + if !found { + break + } + + general.Infof("clear residual pod: %s in state", podUID) + delete(podEntries, podUID) + } + + machineState, err := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, 
p.DeviceTopologyRegistry) + if err != nil { + general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) + return + } + + p.state.SetPodEntries(podEntries, false) + p.state.SetMachineState(machineState, false) + + err = p.state.StoreState() + if err != nil { + general.Errorf("store state failed: %v", err) + return + } + } +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go similarity index 99% rename from pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go rename to pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go index 6233639d07..868cf60d63 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpu_mem_test.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package resourceplugin +package gpumemory import ( "fmt" diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go index 2757f23fbd..516a5bed8a 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -20,7 +20,6 @@ import ( pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" ) // ResourcePlugin knows how to handle resource requests for a specific resource type. @@ -29,9 +28,13 @@ type ResourcePlugin interface { GetTopologyHints(*pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) - GetTopologyAwareResources(*state.AllocationInfo) *gpuconsts.AllocatedResource + GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) - GetTopologyAwareAllocatableResources(state.GPUMap) *gpuconsts.AllocatableResource + GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) Allocate(*pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) + + RemovePod(podUID string) error + + ClearResidualState() } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go similarity index 61% rename from pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go rename to pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go index e8475bfbcb..a20d6a2af5 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go @@ -14,13 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package resourceplugin +package registry import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory" + "github.com/kubewharf/katalyst-core/pkg/metrics" ) -type initFunc func(plugin *baseplugin.BasePlugin) ResourcePlugin +type initFunc func( + plugin *baseplugin.BasePlugin, wrappedEmitter metrics.MetricEmitter, +) (resourceplugin.ResourcePlugin, error) var ResourcePluginsMap = make(map[string]initFunc) @@ -29,5 +35,5 @@ func registerResourcePlugin(pluginName string, initFunc initFunc) { } func init() { - registerResourcePlugin(GPUMemPluginName, NewGPUMemPlugin) + registerResourcePlugin(gpuconsts.GPUMemPluginName, gpumemory.NewGPUMemPlugin) } diff --git a/pkg/agent/qrm-plugins/gpu/state/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go similarity index 99% rename from pkg/agent/qrm-plugins/gpu/state/checkpoint.go rename to pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go index 0995851134..0f674baf6b 100644 --- a/pkg/agent/qrm-plugins/gpu/state/checkpoint.go +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package state +package gpumemory import ( "encoding/json" diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go similarity index 80% rename from pkg/agent/qrm-plugins/gpu/state/state.go rename to pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go index 5c37a6faad..61cf273f1e 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package state +package gpumemory import ( "encoding/json" @@ -53,13 +53,13 @@ type ( PodEntries map[string]ContainerEntries // Keyed by pod UID ) -type GPUState struct { +type GPUMemoryState struct { GPUMemoryAllocatable float64 `json:"gpu_memory_allocatable"` GPUMemoryAllocated float64 `json:"gpu_memory_allocated"` PodEntries PodEntries `json:"pod_entries"` } -type GPUMap map[string]*GPUState // GPUMap keyed by gpu device name i.e. GPU-fef8089b-4820-abfc-e83e-94318197576e +type GPUMap map[string]*GPUMemoryState // GPUMap keyed by gpu device name i.e. 
GPU-fef8089b-4820-abfc-e83e-94318197576e func (i *AllocationInfo) String() string { if i == nil { @@ -143,67 +143,67 @@ func (e PodEntries) SetAllocationInfo(podUID string, containerName string, alloc e[podUID][containerName] = allocationInfo.Clone() } -func (ns *GPUState) String() string { - if ns == nil { +func (ms *GPUMemoryState) String() string { + if ms == nil { return "" } - contentBytes, err := json.Marshal(ns) + contentBytes, err := json.Marshal(ms) if err != nil { - general.LoggerWithPrefix("GPUState.String", general.LoggingPKGFull).Errorf("marshal GPUState failed with error: %v", err) + general.LoggerWithPrefix("GPUMemoryState.String", general.LoggingPKGFull).Errorf("marshal GPUMemoryState failed with error: %v", err) return "" } return string(contentBytes) } -func (ns *GPUState) Clone() *GPUState { - if ns == nil { +func (ms *GPUMemoryState) Clone() *GPUMemoryState { + if ms == nil { return nil } - return &GPUState{ - GPUMemoryAllocatable: ns.GPUMemoryAllocatable, - GPUMemoryAllocated: ns.GPUMemoryAllocated, - PodEntries: ns.PodEntries.Clone(), + return &GPUMemoryState{ + GPUMemoryAllocatable: ms.GPUMemoryAllocatable, + GPUMemoryAllocated: ms.GPUMemoryAllocated, + PodEntries: ms.PodEntries.Clone(), } } -func (ns *GPUState) GetGPUMemoryAllocatable() float64 { - if ns == nil { +func (ms *GPUMemoryState) GetGPUMemoryAllocatable() float64 { + if ms == nil { return 0 } - return ns.GPUMemoryAllocatable + return ms.GPUMemoryAllocatable } -func (ns *GPUState) GetGPUMemoryAllocated() float64 { - if ns == nil { +func (ms *GPUMemoryState) GetGPUMemoryAllocated() float64 { + if ms == nil { return 0 } - return ns.GPUMemoryAllocated + return ms.GPUMemoryAllocated } -// SetAllocationInfo adds a new AllocationInfo (for pod/container pairs) into the given NICState -func (ns *GPUState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { - if ns == nil { +// SetAllocationInfo adds a new AllocationInfo (for pod/container pairs) into the given GPUMemoryState +func (ms *GPUMemoryState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { + if ms == nil { return } if allocationInfo == nil { - general.LoggerWithPrefix("GPUState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil") + general.LoggerWithPrefix("GPUMemoryState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil") return } - if ns.PodEntries == nil { - ns.PodEntries = make(PodEntries) + if ms.PodEntries == nil { + ms.PodEntries = make(PodEntries) } - if _, ok := ns.PodEntries[podUID]; !ok { - ns.PodEntries[podUID] = make(ContainerEntries) + if _, ok := ms.PodEntries[podUID]; !ok { + ms.PodEntries[podUID] = make(ContainerEntries) } - ns.PodEntries[podUID][containerName] = allocationInfo.Clone() + ms.PodEntries[podUID][containerName] = allocationInfo.Clone() } func (m GPUMap) Clone() GPUMap { diff --git a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go similarity index 89% rename from pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go rename to pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go index a1b132cbea..e36b96ddea 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package state +package gpumemory import ( "errors" @@ -39,7 +39,7 @@ const ( var ( _ State = &stateCheckpoint{} - generalLog = general.LoggerWithPrefix("gpu_plugin", general.LoggingPKGFull) + generalLog = general.LoggerWithPrefix("gpu_mem_plugin", general.LoggingPKGFull) ) // stateCheckpoint is an in-memory implementation of State; @@ -83,7 +83,9 @@ func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) { } } -func (s *stateCheckpoint) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, persist bool) { +func (s *stateCheckpoint) SetAllocationInfo( + podUID, containerName string, allocationInfo *AllocationInfo, persist bool, +) { s.Lock() defer s.Unlock() @@ -168,7 +170,9 @@ func (s *stateCheckpoint) storeState() error { return nil } -func (s *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider machine.GPUTopologyProvider) error { +func (s *stateCheckpoint) restoreState( + conf *qrm.QRMPluginsConfiguration, topologyRegistry *machine.DeviceTopologyRegistry, +) error { s.Lock() defer s.Unlock() var err error @@ -194,7 +198,7 @@ func (s *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, gpuTop return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName) } - machineState, err := GenerateMachineStateFromPodEntries(conf, checkpoint.PodEntries, gpuTopologyProvider) + machineState, err := GenerateMachineStateFromPodEntries(conf, checkpoint.PodEntries, topologyRegistry) if err != nil { return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) } @@ -227,15 +231,16 @@ func (s *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, gpuTop return nil } -func NewCheckpointState(conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string, - gpuTopologyProvider machine.GPUTopologyProvider, skipStateCorruption bool, emitter metrics.MetricEmitter, +func NewCheckpointState( + conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string, + topologyRegistry *machine.DeviceTopologyRegistry, skipStateCorruption bool, emitter metrics.MetricEmitter, ) (State, error) { checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) if err != nil { return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) } - defaultCache, err := NewGPUPluginState(conf, gpuTopologyProvider) + defaultCache, err := NewGPUPluginState(conf, topologyRegistry) if err != nil { return nil, fmt.Errorf("NewGPUPluginState failed with error: %v", err) } @@ -249,7 +254,7 @@ func NewCheckpointState(conf *qrm.QRMPluginsConfiguration, stateDir, checkpointN emitter: emitter, } - if err := sc.restoreState(conf, gpuTopologyProvider); err != nil { + if err := sc.restoreState(conf, topologyRegistry); err != nil { return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+ "the gpu plugin checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName)) diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go similarity index 86% rename from pkg/agent/qrm-plugins/gpu/state/state_mem.go rename to pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go index e0d45dd209..4ec37d9bcd 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_mem.go +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go @@ -14,7 +14,7 @@ See the License for 
the specific language governing permissions and limitations under the License. */ -package state +package gpumemory import ( "fmt" @@ -30,8 +30,8 @@ import ( type gpuPluginState struct { sync.RWMutex - qrmConf *qrm.QRMPluginsConfiguration - gpuTopologyProvider machine.GPUTopologyProvider + qrmConf *qrm.QRMPluginsConfiguration + topologyRegistry *machine.DeviceTopologyRegistry machineState GPUMap podEntries PodEntries @@ -39,20 +39,20 @@ type gpuPluginState struct { func NewGPUPluginState( conf *qrm.QRMPluginsConfiguration, - gpuTopologyProvider machine.GPUTopologyProvider, + topologyRegistry *machine.DeviceTopologyRegistry, ) (State, error) { generalLog.InfoS("initializing new gpu plugin in-memory state store") - defaultMachineState, err := GenerateMachineState(conf, gpuTopologyProvider) + defaultMachineState, err := GenerateMachineState(conf, topologyRegistry) if err != nil { - return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) + return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err) } return &gpuPluginState{ - qrmConf: conf, - machineState: defaultMachineState, - gpuTopologyProvider: gpuTopologyProvider, - podEntries: make(PodEntries), + qrmConf: conf, + machineState: defaultMachineState, + topologyRegistry: topologyRegistry, + podEntries: make(PodEntries), }, nil } @@ -100,7 +100,7 @@ func (s *gpuPluginState) ClearState() { s.Lock() defer s.Unlock() - machineState, err := GenerateMachineState(s.qrmConf, s.gpuTopologyProvider) + machineState, err := GenerateMachineState(s.qrmConf, s.topologyRegistry) if err != nil { generalLog.ErrorS(err, "failed to generate machine state") } diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go similarity index 55% rename from pkg/agent/qrm-plugins/gpu/state/util.go rename to pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go index 5c1a3dcfca..a72f109ad9 100644 --- a/pkg/agent/qrm-plugins/gpu/state/util.go +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go @@ -14,29 +14,34 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package state +package gpumemory import ( "fmt" + "github.com/kubewharf/katalyst-api/pkg/consts" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) -func GenerateMachineState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider machine.GPUTopologyProvider) (GPUMap, error) { - if gpuTopologyProvider == nil { - return nil, fmt.Errorf("gpu topology provider must not be nil") +func GenerateMachineState( + conf *qrm.QRMPluginsConfiguration, topologyRegistry *machine.DeviceTopologyRegistry, +) (GPUMap, error) { + if topologyRegistry == nil { + return nil, fmt.Errorf("topology provider registry must not be nil") } - gpuTopology, _, err := gpuTopologyProvider.GetGPUTopology() + gpuTopology, _, err := topologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) if err != nil { return nil, fmt.Errorf("gpu topology provider failed with error: %v", err) } gpuMap := make(GPUMap) - for deviceID := range gpuTopology.GPUs { - gpuMap[deviceID] = &GPUState{ + for deviceID := range gpuTopology.Devices { + gpuMap[deviceID] = &GPUMemoryState{ GPUMemoryAllocatable: float64(conf.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()), GPUMemoryAllocated: 0, } @@ -50,9 +55,9 @@ func GenerateMachineState(conf *qrm.QRMPluginsConfiguration, gpuTopologyProvider func GenerateMachineStateFromPodEntries( conf *qrm.QRMPluginsConfiguration, podEntries PodEntries, - gpuTopologyProvider machine.GPUTopologyProvider, + topologyRegistry *machine.DeviceTopologyRegistry, ) (GPUMap, error) { - machineState, err := GenerateMachineState(conf, gpuTopologyProvider) + machineState, err := GenerateMachineState(conf, topologyRegistry) if err != nil { return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) } @@ -86,3 +91,43 @@ func GenerateMachineStateFromPodEntries( return machineState, nil } + +// RegenerateGPUMemoryHints regenerates hints for container that'd already been allocated gpu memory, +// and regenerateHints will assemble hints based on already-existed AllocationInfo, +// without any calculation logics at all +func RegenerateGPUMemoryHints( + allocationInfo *AllocationInfo, regenerate bool, +) map[string]*pluginapi.ListOfTopologyHints { + if allocationInfo == nil { + general.Errorf("RegenerateHints got nil allocationInfo") + return nil + } + + hints := map[string]*pluginapi.ListOfTopologyHints{} + if regenerate { + general.ErrorS(nil, "need to regenerate hints", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "podUID", allocationInfo.PodUid, + "containerName", allocationInfo.ContainerName) + + return nil + } + + allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...) 
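+	// Note: regeneration deliberately skips the hint calculation path; the NUMA
+	// affinity recorded in the existing AllocationInfo is replayed as the single
+	// preferred hint, so an already-admitted container keeps a stable placement.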
+ + general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "containerName", allocationInfo.ContainerName, + "hint", allocatedNumaNodes) + hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: allocatedNumaNodes.ToSliceUInt64(), + Preferred: true, + }, + }, + } + return hints +} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go new file mode 100644 index 0000000000..77faff321a --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go @@ -0,0 +1,46 @@ +package rdma + +import ( + "encoding/json" + + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" +) + +var _ checkpointmanager.Checkpoint = &RDMAPluginCheckpoint{} + +type RDMAPluginCheckpoint struct { + PolicyName string `json:"policyName"` + MachineState RDMAMap `json:"machineState"` + PodEntries PodEntries `json:"pod_entries"` + Checksum checksum.Checksum `json:"checksum"` +} + +func NewRDMAPluginCheckpoint() *RDMAPluginCheckpoint { + return &RDMAPluginCheckpoint{ + PodEntries: make(PodEntries), + MachineState: make(RDMAMap), + } +} + +// MarshalCheckpoint returns a marshaled checkpoint +func (cp *RDMAPluginCheckpoint) MarshalCheckpoint() ([]byte, error) { + // make sure checksum wasn't set before, so it doesn't affect output checksum + cp.Checksum = 0 + cp.Checksum = checksum.New(cp) + return json.Marshal(*cp) +} + +// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint +func (cp *RDMAPluginCheckpoint) UnmarshalCheckpoint(blob []byte) error { + return json.Unmarshal(blob, cp) +} + +// VerifyChecksum verifies that current checksum of checkpoint is valid +func (cp *RDMAPluginCheckpoint) VerifyChecksum() error { + ck := cp.Checksum + cp.Checksum = 0 + err := ck.Verify(cp) + cp.Checksum = ck + return err +} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go new file mode 100644 index 0000000000..7f0674e0d8 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go @@ -0,0 +1,206 @@ +package rdma + +import ( + "encoding/json" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +type AllocationInfo struct { + commonstate.AllocationMeta `json:",inline"` + + AllocatedAllocation RDMAAllocation `json:"allocatedAllocation"` + TopologyAwareAllocations map[string]RDMAAllocation `json:"topologyAwareAllocations"` +} + +type RDMAAllocation struct { + NUMANodes []int `json:"numa_nodes"` +} + +func (a *RDMAAllocation) Clone() RDMAAllocation { + if a == nil { + return RDMAAllocation{} + } + + numaNodes := make([]int, len(a.NUMANodes)) + copy(numaNodes, a.NUMANodes) + return RDMAAllocation{ + NUMANodes: numaNodes, + } +} + +type ( + ContainerEntries map[string]*AllocationInfo // Keyed by container name + PodEntries map[string]ContainerEntries // Keyed by pod name +) + +type RDMAState struct { + PodEntries PodEntries `json:"pod_entries"` +} + +type RDMAMap map[string]*RDMAState // RDMAMap keyed by RDMA NIC card ID + +func (i *AllocationInfo) String() string { + if i == nil { + return "" + } + + contentBytes, err := json.Marshal(i) + + if err != nil { + 
general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (i *AllocationInfo) Clone() *AllocationInfo { + if i == nil { + return nil + } + + clone := &AllocationInfo{ + AllocationMeta: *i.AllocationMeta.Clone(), + AllocatedAllocation: i.AllocatedAllocation.Clone(), + } + + if i.TopologyAwareAllocations != nil { + clone.TopologyAwareAllocations = make(map[string]RDMAAllocation) + for k, v := range i.TopologyAwareAllocations { + clone.TopologyAwareAllocations[k] = v.Clone() + } + } + + return clone +} + +func (e PodEntries) Clone() PodEntries { + clone := make(PodEntries) + for podUID, containerEntries := range e { + clone[podUID] = make(ContainerEntries) + for containerID, allocationInfo := range containerEntries { + clone[podUID][containerID] = allocationInfo.Clone() + } + } + return clone +} + +func (e PodEntries) GetAllocationInfo(podUID string, containerName string) *AllocationInfo { + if e == nil { + return nil + } + + if containerEntries, ok := e[podUID]; ok { + if allocationInfo, ok := containerEntries[containerName]; ok { + return allocationInfo.Clone() + } + } + return nil +} + +func (e PodEntries) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { + if e == nil { + return + } + + if _, ok := e[podUID]; !ok { + e[podUID] = make(ContainerEntries) + } + e[podUID][containerName] = allocationInfo.Clone() +} + +func (rs *RDMAState) String() string { + if rs == nil { + return "" + } + + contentBytes, err := json.Marshal(rs) + if err != nil { + general.LoggerWithPrefix("RDMAState.String", general.LoggingPKGFull).Errorf("marshal RDMAState failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (rs *RDMAState) Clone() *RDMAState { + if rs == nil { + return nil + } + + return &RDMAState{ + PodEntries: rs.PodEntries.Clone(), + } +} + +func (rs *RDMAState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { + if rs == nil { + return + } + + if allocationInfo == nil { + general.LoggerWithPrefix("RDMAState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil") + return + } + + if rs.PodEntries == nil { + rs.PodEntries = make(PodEntries) + } + + if _, ok := rs.PodEntries[podUID]; !ok { + rs.PodEntries[podUID] = make(ContainerEntries) + } + + rs.PodEntries[podUID][containerName] = allocationInfo.Clone() +} + +func (m RDMAMap) Clone() RDMAMap { + clone := make(RDMAMap) + for id, rs := range m { + clone[id] = rs.Clone() + } + return clone +} + +func (m RDMAMap) String() string { + if m == nil { + return "" + } + + contentBytes, err := json.Marshal(m) + if err != nil { + general.LoggerWithPrefix("RDMAMap.String", general.LoggingPKGFull).Errorf("marshal RDMAMap failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +// reader is used to get information from local states +type reader interface { + GetMachineState() RDMAMap + GetPodEntries() PodEntries + GetAllocationInfo(podUID, containerName string) *AllocationInfo +} + +// writer is used to store information from local states, +// and it also provides functionality to maintain the local files +type writer interface { + SetMachineState(rdmaMap RDMAMap, persist bool) + SetPodEntries(rdmaMap PodEntries, persist bool) + SetAllocationInfo(podUid, containerName string, allocationInfo *AllocationInfo, persist bool) + + Delete(podUID, containerName string, persist bool) + 
ClearState()
+	StoreState() error
+}
+
+// ReadonlyState interface only provides methods for tracking pod assignments
+type ReadonlyState interface {
+	reader
+}
+
+// State interface provides methods for tracking and setting pod assignments
+type State interface {
+	writer
+	ReadonlyState
+}
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
new file mode 100644
index 0000000000..e791073567
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
@@ -0,0 +1,241 @@
+package rdma
+
+import (
+	"fmt"
+	"path"
+	"reflect"
+	"sync"
+	"time"
+
+	"github.com/kubewharf/katalyst-core/pkg/metrics"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+	"github.com/pkg/errors"
+	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
+	cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
+)
+
+const (
+	metricMetaCacheStoreStateDuration = "metacache_store_state_duration"
+)
+
+var (
+	_          State = &stateCheckpoint{}
+	generalLog       = general.LoggerWithPrefix("rdma_plugin", general.LoggingPKGFull)
+)
+
+type stateCheckpoint struct {
+	sync.RWMutex
+	cache             State
+	policyName        string
+	checkpointManager checkpointmanager.CheckpointManager
+	checkpointName    string
+	// when we add new properties to checkpoint,
+	// it will cause checkpoint corruption and we should skip it
+	skipStateCorruption bool
+	emitter             metrics.MetricEmitter
+}
+
+func NewCheckpointState(
+	stateDir, checkpointName, policyName string,
+	topologyRegistry *machine.DeviceTopologyRegistry, skipStateCorruption bool, emitter metrics.MetricEmitter,
+) (State, error) {
+	checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
+	}
+
+	defaultCache, err := NewRDMAPluginState(topologyRegistry)
+	if err != nil {
+		return nil, fmt.Errorf("NewRDMAPluginState failed with error: %v", err)
+	}
+
+	sc := &stateCheckpoint{
+		cache:               defaultCache,
+		policyName:          policyName,
+		checkpointName:      checkpointName,
+		checkpointManager:   checkpointManager,
+		skipStateCorruption: skipStateCorruption,
+		emitter:             emitter,
+	}
+
+	if err := sc.restoreState(topologyRegistry); err != nil {
+		return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+
+			"the rdma plugin checkpoint file %q before restarting Kubelet",
+			err, path.Join(stateDir, checkpointName))
+	}
+
+	return sc, nil
+}
+
+func (s *stateCheckpoint) SetMachineState(rdmaMap RDMAMap, persist bool) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.cache.SetMachineState(rdmaMap, persist)
+	if persist {
+		err := s.storeState()
+		if err != nil {
+			generalLog.ErrorS(err, "store machineState to checkpoint error")
+		}
+	}
+}
+
+func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.cache.SetPodEntries(podEntries, persist)
+	if persist {
+		err := s.storeState()
+		if err != nil {
+			generalLog.ErrorS(err, "store pod entries to checkpoint error")
+		}
+	}
+}
+
+func (s *stateCheckpoint) SetAllocationInfo(
+	podUID, containerName string, allocationInfo *AllocationInfo, persist bool,
+) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.cache.SetAllocationInfo(podUID, containerName, allocationInfo, persist)
+	if persist {
+		err := s.storeState()
+		if err != nil {
+			generalLog.ErrorS(err, "store allocationInfo to checkpoint error")
+		}
+	}
+}
+
+func (s *stateCheckpoint) storeState() error {
+	startTime := time.Now()
+	general.InfoS("called")
+	defer func() {
+		elapsed := time.Since(startTime)
+		general.InfoS("finished", "duration", elapsed)
+		_ = s.emitter.StoreFloat64(metricMetaCacheStoreStateDuration, float64(elapsed/time.Millisecond), metrics.MetricTypeNameRaw)
+	}()
+	checkpoint := NewRDMAPluginCheckpoint()
+	checkpoint.PolicyName = s.policyName
+	checkpoint.MachineState = s.cache.GetMachineState()
+	checkpoint.PodEntries = s.cache.GetPodEntries()
+
+	err := s.checkpointManager.CreateCheckpoint(s.checkpointName, checkpoint)
+	if err != nil {
+		generalLog.ErrorS(err, "could not save checkpoint")
+		return err
+	}
+	return nil
+}
+
+func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyRegistry) error {
+	s.Lock()
+	defer s.Unlock()
+	var err error
+	var foundAndSkippedStateCorruption bool
+
+	checkpoint := NewRDMAPluginCheckpoint()
+	if err = s.checkpointManager.GetCheckpoint(s.checkpointName, checkpoint); err != nil {
+		if errors.Is(err, cmerrors.ErrCheckpointNotFound) {
+			return s.storeState()
+		} else if errors.Is(err, cmerrors.ErrCorruptCheckpoint) {
+			if !s.skipStateCorruption {
+				return err
+			}
+
+			foundAndSkippedStateCorruption = true
+			generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err)
+		} else {
+			return err
+		}
+	}
+
+	if s.policyName != checkpoint.PolicyName && !s.skipStateCorruption {
+		return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName)
+	}
+
+	machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodEntries, topologyRegistry)
+	if err != nil {
+		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
+	}
+
+	s.cache.SetMachineState(machineState, false)
+	s.cache.SetPodEntries(checkpoint.PodEntries, false)
+
+	if !reflect.DeepEqual(machineState, checkpoint.MachineState) {
+		generalLog.Warningf("machine state changed: "+
+			"machineState: %s; checkpointMachineState: %s",
+			machineState.String(), checkpoint.MachineState.String())
+
+		err = s.storeState()
+		if err != nil {
+			return fmt.Errorf("storeState when machine state changed failed with error: %v", err)
+		}
+	}
+
+	if foundAndSkippedStateCorruption {
+		generalLog.Infof("found and skipped state corruption, we should store to rectify the checksum")
+
+		err = s.storeState()
+		if err != nil {
+			return fmt.Errorf("storeState failed with error: %v", err)
+		}
+	}
+
+	generalLog.InfoS("state checkpoint: restored state from checkpoint")
+
+	return nil
+}
+
+func (s *stateCheckpoint) Delete(podUID, containerName string, persist bool) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.cache.Delete(podUID, containerName, persist)
+	if persist {
+		err := s.storeState()
+		if err != nil {
+			generalLog.ErrorS(err, "store state after delete operation to checkpoint error")
+		}
+	}
+}
+
+func (s *stateCheckpoint) ClearState() {
+	s.Lock()
+	defer s.Unlock()
+
+	s.cache.ClearState()
+	err := s.storeState()
+	if err != nil {
+		generalLog.ErrorS(err, "store state after clear operation to checkpoint error")
+	}
+}
+
+func (s *stateCheckpoint) StoreState() error {
+	s.Lock()
+	defer s.Unlock()
+	return s.storeState()
+}
+
+func (s *stateCheckpoint) GetMachineState() RDMAMap {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.cache.GetMachineState()
+}
+
+func (s *stateCheckpoint) GetPodEntries() PodEntries {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.cache.GetPodEntries()
+}
+
+func (s *stateCheckpoint) 
GetAllocationInfo(podUID, containerName string) *AllocationInfo { + s.RLock() + defer s.RUnlock() + + return s.cache.GetAllocationInfo(podUID, containerName) +} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go new file mode 100644 index 0000000000..9f226db201 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go @@ -0,0 +1,112 @@ +package rdma + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +type rdmaPluginState struct { + sync.RWMutex + + topologyRegistry *machine.DeviceTopologyRegistry + + machineState RDMAMap + podEntries PodEntries +} + +func NewRDMAPluginState(topologyRegistry *machine.DeviceTopologyRegistry) (State, error) { + generalLog.InfoS("initializing new rdma plugin in-memory state store") + + defaultMachineState, err := GenerateMachineState(topologyRegistry) + if err != nil { + return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err) + } + + return &rdmaPluginState{ + machineState: defaultMachineState, + topologyRegistry: topologyRegistry, + podEntries: make(PodEntries), + }, nil +} + +func (s *rdmaPluginState) SetMachineState(rdmaMap RDMAMap, _ bool) { + s.Lock() + defer s.Unlock() + s.machineState = rdmaMap.Clone() + generalLog.InfoS("updated rdma plugin machine state", + "rdmaMap", rdmaMap) +} + +func (s *rdmaPluginState) SetPodEntries(podEntries PodEntries, _ bool) { + s.Lock() + defer s.Unlock() + s.podEntries = podEntries +} + +func (s *rdmaPluginState) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, _ bool) { + s.Lock() + defer s.Unlock() + + s.podEntries.SetAllocationInfo(podUID, containerName, allocationInfo) + generalLog.InfoS("updated rdma plugin pod entries", + "podUID", podUID, + "containerName", containerName, + "allocationInfo", allocationInfo.String()) +} + +func (s *rdmaPluginState) Delete(podUID, containerName string, _ bool) { + s.Lock() + defer s.Unlock() + + if _, ok := s.podEntries[podUID]; !ok { + return + } + + delete(s.podEntries[podUID], containerName) + if len(s.podEntries[podUID]) == 0 { + delete(s.podEntries, podUID) + } + generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName) +} + +func (s *rdmaPluginState) ClearState() { + s.Lock() + defer s.Unlock() + + machineState, err := GenerateMachineState(s.topologyRegistry) + if err != nil { + generalLog.ErrorS(err, "failed to generate machine state") + } + s.machineState = machineState + s.podEntries = make(PodEntries) + + generalLog.InfoS("cleared state") +} + +func (s *rdmaPluginState) StoreState() error { + // nothing to do + return nil +} + +func (s *rdmaPluginState) GetMachineState() RDMAMap { + s.RLock() + defer s.RUnlock() + + return s.machineState.Clone() +} + +func (s *rdmaPluginState) GetPodEntries() PodEntries { + s.RLock() + defer s.RUnlock() + + return s.podEntries.Clone() +} + +func (s *rdmaPluginState) GetAllocationInfo(podUID, containerName string) *AllocationInfo { + s.RLock() + defer s.RUnlock() + + return s.podEntries.GetAllocationInfo(podUID, containerName) +} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go new file mode 100644 index 0000000000..a9bbd97e33 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go @@ -0,0 +1,56 @@ +package rdma + +import ( + "fmt" + + gpuconsts 
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func GenerateMachineState(topologyRegistry *machine.DeviceTopologyRegistry) (RDMAMap, error) { + if topologyRegistry == nil { + return nil, fmt.Errorf("topology provider registry must not be nil") + } + + rdmaTopology, _, err := topologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceName) + if err != nil { + return nil, fmt.Errorf("rdma topology provider failed with error: %v", err) + } + + rdmaMap := make(RDMAMap) + for deviceID := range rdmaTopology.Devices { + rdmaMap[deviceID] = &RDMAState{} + } + + return rdmaMap, nil +} + +func GenerateMachineStateFromPodEntries( + podEntries PodEntries, topologyRegistry *machine.DeviceTopologyRegistry, +) (RDMAMap, error) { + machineState, err := GenerateMachineState(topologyRegistry) + if err != nil { + return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) + } + + for rdmaID, rdmaState := range machineState { + for podUID, containerEntries := range podEntries { + for containerName, allocationInfo := range containerEntries { + if containerName != "" && allocationInfo != nil { + rdmaAllocation, ok := allocationInfo.TopologyAwareAllocations[rdmaID] + if !ok { + continue + } + alloc := allocationInfo.Clone() + alloc.AllocatedAllocation = rdmaAllocation.Clone() + alloc.TopologyAwareAllocations = map[string]RDMAAllocation{rdmaID: rdmaAllocation} + rdmaState.SetAllocationInfo(podUID, containerName, alloc) + } + } + } + + machineState[rdmaID] = rdmaState + } + + return machineState, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 7e967c08bd..48ab1b0cbe 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -22,7 +22,8 @@ import ( "sync" "time" - v1 "k8s.io/api/core/v1" + devicepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry" + resourcepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/registry" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -35,7 +36,6 @@ import ( gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" "github.com/kubewharf/katalyst-core/pkg/config" @@ -92,8 +92,12 @@ func NewStaticPolicy( customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), } - policyImplement.registerResourcePlugins() - policyImplement.registerCustomDevicePlugins() + if err = policyImplement.registerResourcePlugins(wrappedEmitter); err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to register resource plugins: %w", err) + } + if err = policyImplement.registerCustomDevicePlugins(wrappedEmitter); err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to register custom device plugins: %w", err) + } pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, func(key string, value int64) { @@ -211,6 +215,7 @@ 
func (p *StaticPolicy) RemovePod(
 	p.Lock()
 	defer p.Unlock()
 
+	// For every resource plugin and custom device plugin, remove the pod from its state
 	if err := p.removePod(req.PodUid); err != nil {
 		general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid)
 		return nil, err
@@ -238,29 +243,71 @@ func (p *StaticPolicy) GetTopologyAwareResources(
 		return nil, fmt.Errorf("GetTopologyAwareResources got nil req")
 	}
 
-	allocationInfo := p.State.GetAllocationInfo(req.PodUid, req.ContainerName)
-	if allocationInfo == nil {
-		return &pluginapi.GetTopologyAwareResourcesResponse{}, nil
-	}
-
 	// Get topology aware resources for all resource plugins
-	allocatedResources := make(map[string]*pluginapi.TopologyAwareResource)
+	allocatedResourcesList := make([]*pluginapi.GetTopologyAwareResourcesResponse, 0)
 	for _, resourcePlugin := range p.resourcePlugins {
-		allocatedResource := resourcePlugin.GetTopologyAwareResources(allocationInfo)
-		allocatedResources[allocatedResource.ResourceName] = allocatedResource.TopologyAwareResource
+		allocatedResource, err := resourcePlugin.GetTopologyAwareResources(req.PodUid, req.ContainerName)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get topology aware resources for plugin %s: %w", resourcePlugin.ResourceName(), err)
+		}
+
+		if allocatedResource == nil {
+			continue
+		}
+		allocatedResourcesList = append(allocatedResourcesList, allocatedResource)
 	}
 
-	resp := &pluginapi.GetTopologyAwareResourcesResponse{
-		PodUid:       allocationInfo.PodUid,
-		PodName:      allocationInfo.PodName,
-		PodNamespace: allocationInfo.PodNamespace,
+	// Merge the per-plugin responses into one response
+	resp, err := p.mergeTopologyAwareResourcesResponse(req.PodUid, req.ContainerName, allocatedResourcesList)
+	if err != nil {
+		return nil, fmt.Errorf("failed to merge topology aware resources: %w", err)
+	}
+
+	return resp, nil
+}
+
+// mergeTopologyAwareResourcesResponse takes the separate topology aware resources responses from the different sub-plugins and
+// merges them into one response.
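+// Duplicate resource names across sub-plugin responses are merged last-writer-wins,
+// and all responses must agree on pod name and pod namespace, otherwise an error is returned.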
+func (p *StaticPolicy) mergeTopologyAwareResourcesResponse(
+	podUID, containerName string, respList []*pluginapi.GetTopologyAwareResourcesResponse,
+) (*pluginapi.GetTopologyAwareResourcesResponse, error) {
+	result := &pluginapi.GetTopologyAwareResourcesResponse{
+		PodUid: podUID,
 		ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{
-			ContainerName:      allocationInfo.ContainerName,
-			AllocatedResources: allocatedResources,
+			ContainerName: containerName,
 		},
 	}
 
-	return resp, nil
+	allocatedResources := make(map[string]*pluginapi.TopologyAwareResource)
+	for _, resp := range respList {
+		if resp == nil {
+			continue
+		}
+
+		if result.PodName != "" && result.PodName != resp.PodName {
+			general.Errorf("pod name %s does not match expected %s", resp.PodName, result.PodName)
+			return nil, fmt.Errorf("pod name %s does not match expected %s", resp.PodName, result.PodName)
+		}
+
+		if result.PodNamespace != "" && result.PodNamespace != resp.PodNamespace {
+			general.Errorf("pod namespace %s does not match expected %s", resp.PodNamespace, result.PodNamespace)
+			return nil, fmt.Errorf("pod namespace %s does not match expected %s", resp.PodNamespace, result.PodNamespace)
+		}
+
+		if result.PodName == "" {
+			result.PodName = resp.PodName
+		}
+		if result.PodNamespace == "" {
+			result.PodNamespace = resp.PodNamespace
+		}
+
+		for resourceName, resource := range resp.ContainerTopologyAwareResources.AllocatedResources {
+			allocatedResources[resourceName] = resource
+		}
+	}
+
+	result.ContainerTopologyAwareResources.AllocatedResources = allocatedResources
+	return result, nil
+}
 
 // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format
@@ -276,12 +323,13 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(
 	p.Lock()
 	defer p.Unlock()
 
-	machineState := p.State.GetMachineState()
-
 	// Get topology aware allocatable resources for all resource plugins
 	allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource)
 	for _, resourcePlugin := range p.resourcePlugins {
-		allocatableResource := resourcePlugin.GetTopologyAwareAllocatableResources(machineState)
+		allocatableResource, err := resourcePlugin.GetTopologyAwareAllocatableResources()
+		if err != nil {
+			return nil, fmt.Errorf("failed to get topology aware allocatable resources for plugin %s: %w", resourcePlugin.ResourceName(), err)
+		}
 		allocatableResources[allocatableResource.ResourceName] = allocatableResource.AllocatableTopologyAwareResource
 	}
 
@@ -343,24 +391,18 @@ func (p *StaticPolicy) PreStartContainer(
 }
 
 func (p *StaticPolicy) removePod(podUID string) error {
-	// update state cache
-	podEntries := p.State.GetPodEntries()
-	delete(podEntries, podUID)
-
-	machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.GpuTopologyProvider)
-	if err != nil {
-		general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err)
-		return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err)
+	for _, resourcePlugin := range p.resourcePlugins {
+		if err := resourcePlugin.RemovePod(podUID); err != nil {
+			return err
+		}
 	}
 
-	p.State.SetPodEntries(podEntries, false)
-	p.State.SetMachineState(machineState, false)
-
-	err = p.State.StoreState()
-	if err != nil {
-		general.Errorf("store state failed with error: %v", err)
-		return err
+	for _, customDevicePlugin := range p.customDevicePlugins {
+		if err := customDevicePlugin.RemovePod(podUID); err != nil {
+			return err
+		}
 	}
+
 	return nil
 }
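A minimal standalone sketch of the merge semantics above, for reviewers (not part of the patch; `topoResource` and the quantities are illustrative stand-ins for `pluginapi.TopologyAwareResource`): per-plugin allocated-resource maps are folded into a single map, so a resource name emitted by a later sub-plugin overwrites an earlier one.

```go
package main

import "fmt"

// topoResource is a stand-in for pluginapi.TopologyAwareResource,
// keeping only the aggregated quantity for illustration.
type topoResource struct{ AggregatedQuantity float64 }

// foldAllocated mirrors the merge loop in mergeTopologyAwareResourcesResponse:
// per-plugin AllocatedResources maps are folded into one map, and a resource
// name emitted by a later plugin overwrites an earlier one.
func foldAllocated(perPlugin []map[string]*topoResource) map[string]*topoResource {
	merged := make(map[string]*topoResource)
	for _, resources := range perPlugin {
		for name, r := range resources {
			merged[name] = r
		}
	}
	return merged
}

func main() {
	gpuMem := map[string]*topoResource{"gpu-memory": {AggregatedQuantity: 32}}
	rdma := map[string]*topoResource{"rdma": {AggregatedQuantity: 2}}

	merged := foldAllocated([]map[string]*topoResource{gpuMem, rdma})
	fmt.Println(len(merged), merged["gpu-memory"].AggregatedQuantity) // 2 32
}
```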
@@ -373,83 +415,12 @@ func (p *StaticPolicy) clearResidualState(
 	_ *metaserver.MetaServer,
 ) {
 	general.Infof("exec")
-	var (
-		err     error
-		podList []*v1.Pod
-	)
-	residualSet := make(map[string]bool)
-
-	defer func() {
-		_ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err)
-	}()
-
-	if p.MetaServer == nil {
-		general.Errorf("nil metaServer")
-		return
-	}
-
-	ctx := context.Background()
-	podList, err = p.MetaServer.GetPodList(ctx, nil)
-	if err != nil {
-		general.Errorf("get pod list failed: %v", err)
-		return
-	}
-
-	podSet := sets.NewString()
-	for _, pod := range podList {
-		podSet.Insert(fmt.Sprintf("%v", pod.UID))
-	}
-
-	p.Lock()
-	defer p.Unlock()
-
-	podEntries := p.State.GetPodEntries()
-	for podUID := range podEntries {
-		if !podSet.Has(podUID) {
-			residualSet[podUID] = true
-			p.residualHitMap[podUID] += 1
-			general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID])
-		}
-	}
-
-	podsToDelete := sets.NewString()
-	for podUID, hitCount := range p.residualHitMap {
-		if !residualSet[podUID] {
-			general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID)
-			delete(p.residualHitMap, podUID)
-			continue
-		}
-
-		if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime {
-			podsToDelete.Insert(podUID)
-		}
+	for _, resourcePlugin := range p.resourcePlugins {
+		resourcePlugin.ClearResidualState()
 	}
 
-	if podsToDelete.Len() > 0 {
-		for {
-			podUID, found := podsToDelete.PopAny()
-			if !found {
-				break
-			}
-
-			general.Infof("clear residual pod: %s in state", podUID)
-			delete(podEntries, podUID)
-		}
-
-		machineState, err := state.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.GpuTopologyProvider)
-		if err != nil {
-			general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
-			return
-		}
-
-		p.State.SetPodEntries(podEntries, false)
-		p.State.SetMachineState(machineState, false)
-
-		err = p.State.StoreState()
-		if err != nil {
-			general.Errorf("store state failed: %v", err)
-			return
-		}
+	for _, customDevicePlugin := range p.customDevicePlugins {
+		customDevicePlugin.ClearResidualState()
 	}
 }
 
@@ -501,28 +472,46 @@ func (p *StaticPolicy) AllocateAssociatedDevice(
 	return customDevicePlugin.AllocateAssociatedDevice(req)
 }
 
-func (p *StaticPolicy) registerResourcePlugins() {
+func (p *StaticPolicy) registerResourcePlugins(wrappedEmitter metrics.MetricEmitter) error {
+	p.Lock()
+	defer p.Unlock()
 	for name := range p.resourcePluginsNames {
-		initFunc := resourceplugin.ResourcePluginsMap[name]
-		resourcePlugin := initFunc(p.BasePlugin)
+		initFunc := resourcepluginregistry.ResourcePluginsMap[name]
+		resourcePlugin, err := initFunc(p.BasePlugin, wrappedEmitter)
+		if err != nil {
+			general.Errorf("register resource plugin %s failed with error: %v", name, err)
+			return fmt.Errorf("register resource plugin %s failed with error: %w", name, err)
+		}
 		p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin
 	}
+	return nil
 }
 
-func (p *StaticPolicy) registerCustomDevicePlugins() {
+func (p *StaticPolicy) registerCustomDevicePlugins(wrappedEmitter metrics.MetricEmitter) error {
+	p.Lock()
+	defer p.Unlock()
 	for name := range p.associatedDeviceNames {
-		initFunc := customdeviceplugin.CustomDevicePluginsMap[name]
-		customDevicePlugin := initFunc(p.BasePlugin)
+		initFunc := devicepluginregistry.CustomDevicePluginsMap[name]
+		customDevicePlugin, err := initFunc(p.BasePlugin, wrappedEmitter)
+		if err != nil {
+			general.Errorf("register custom device plugin %s failed with error: %v", name, err)
+			return fmt.Errorf("register custom device plugin %s failed with error: %w", name, err)
+		}
 		p.customDevicePlugins[customDevicePlugin.DeviceName()] = customDevicePlugin
 	}
+	return nil
 }
 
 func (p *StaticPolicy) getResourcePlugin(resourceName string) resourceplugin.ResourcePlugin {
+	p.Lock()
+	defer p.Unlock()
 	resourcePlugin := p.resourcePlugins[resourceName]
 	return resourcePlugin
 }
 
 func (p *StaticPolicy) getCustomDevicePlugin(deviceName string) customdeviceplugin.CustomDevicePlugin {
+	p.Lock()
+	defer p.Unlock()
 	customDevicePlugin := p.customDevicePlugins[deviceName]
 	return customDevicePlugin
 }
diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go
index 3957b09a3e..acd1e99772 100644
--- a/pkg/agent/qrm-plugins/gpu/util/util.go
+++ b/pkg/agent/qrm-plugins/gpu/util/util.go
@@ -21,55 +21,16 @@ import (
 	"math"
 
 	pkgerrors "github.com/pkg/errors"
-	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
-	"github.com/kubewharf/katalyst-api/pkg/consts"
-	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 	"github.com/kubewharf/katalyst-core/pkg/util/machine"
 )
 
 var ErrNoAvailableGPUMemoryHints = pkgerrors.New("no available gpu memory hints")
 
-// RegenerateGPUMemoryHints regenerates hints for container that'd already been allocated gpu memory,
-// and regenerateHints will assemble hints based on already-existed AllocationInfo,
-// without any calculation logics at all
-func RegenerateGPUMemoryHints(allocationInfo *state.AllocationInfo, regenerate bool) map[string]*pluginapi.ListOfTopologyHints {
-	if allocationInfo == nil {
-		general.Errorf("RegenerateHints got nil allocationInfo")
-		return nil
-	}
-
-	hints := map[string]*pluginapi.ListOfTopologyHints{}
-	if regenerate {
-		general.ErrorS(nil, "need to regenerate hints",
-			"podNamespace", allocationInfo.PodNamespace,
-			"podName", allocationInfo.PodName,
-			"podUID", allocationInfo.PodUid,
-			"containerName", allocationInfo.ContainerName)
-
-		return nil
-	}
-
-	allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...)
- - general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod", - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "containerName", allocationInfo.ContainerName, - "hint", allocatedNumaNodes) - hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ - Hints: []*pluginapi.TopologyHint{ - { - Nodes: allocatedNumaNodes.ToSliceUInt64(), - Preferred: true, - }, - }, - } - return hints -} - -func GetNUMANodesCountToFitGPUReq(gpuReq float64, cpuTopology *machine.CPUTopology, gpuTopology *machine.GPUTopology) (int, int, error) { +func GetNUMANodesCountToFitGPUReq( + gpuReq float64, cpuTopology *machine.CPUTopology, gpuTopology *machine.DeviceTopology, +) (int, int, error) { if gpuTopology == nil { return 0, 0, fmt.Errorf("GetNUMANodesCountToFitGPUReq got nil gpuTopology") } @@ -79,17 +40,17 @@ func GetNUMANodesCountToFitGPUReq(gpuReq float64, cpuTopology *machine.CPUTopolo return 0, 0, fmt.Errorf("there is no NUMA in cpuTopology") } - if len(gpuTopology.GPUs)%numaCount != 0 { - general.Warningf("GPUs count %d cannot be evenly divisible by NUMA count %d", len(gpuTopology.GPUs), numaCount) + if len(gpuTopology.Devices)%numaCount != 0 { + general.Warningf("GPUs count %d cannot be evenly divisible by NUMA count %d", len(gpuTopology.Devices), numaCount) } - gpusPerNUMA := (len(gpuTopology.GPUs) + numaCount - 1) / numaCount + gpusPerNUMA := (len(gpuTopology.Devices) + numaCount - 1) / numaCount numaCountNeeded := int(math.Ceil(gpuReq / float64(gpusPerNUMA))) if numaCountNeeded == 0 { numaCountNeeded = 1 } if numaCountNeeded > numaCount { - return 0, 0, fmt.Errorf("invalid gpu req: %.3f in topology with NUMAs count: %d and GPUs count: %d", gpuReq, numaCount, len(gpuTopology.GPUs)) + return 0, 0, fmt.Errorf("invalid gpu req: %.3f in topology with NUMAs count: %d and GPUs count: %d", gpuReq, numaCount, len(gpuTopology.Devices)) } gpusCountNeededPerNUMA := int(math.Ceil(gpuReq / float64(numaCountNeeded))) diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go new file mode 100644 index 0000000000..948a6df919 --- /dev/null +++ b/pkg/util/machine/device.go @@ -0,0 +1,153 @@ +package machine + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/native" +) + +type DeviceTopologyProvider interface { + getDeviceTopology() (*DeviceTopology, bool, error) + setDeviceTopology(*DeviceTopology) error +} + +type DeviceTopologyRegistry struct { + // DeviceNameToProvider is a mapping of device name to their respective topology provider + DeviceNameToProvider map[string]DeviceTopologyProvider +} + +func NewDeviceTopologyRegistry() *DeviceTopologyRegistry { + return &DeviceTopologyRegistry{ + DeviceNameToProvider: make(map[string]DeviceTopologyProvider), + } +} + +// RegisterDeviceTopologyProvider registers a device topology provider for the specified device name. +func (r *DeviceTopologyRegistry) RegisterDeviceTopologyProvider( + deviceName string, deviceTopologyProvider DeviceTopologyProvider, +) { + r.DeviceNameToProvider[deviceName] = deviceTopologyProvider +} + +// SetDeviceTopology sets the device topology for the specified device name. 
+func (r *DeviceTopologyRegistry) SetDeviceTopology(deviceName string, deviceTopology *DeviceTopology) error { + provider, ok := r.DeviceNameToProvider[deviceName] + if !ok { + return fmt.Errorf("no device topology provider found for device %s", deviceName) + } + + return provider.setDeviceTopology(deviceTopology) +} + +// GetDeviceTopology gets the device topology for the specified device name. +func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTopology, bool, error) { + provider, ok := r.DeviceNameToProvider[deviceName] + if !ok { + return nil, false, fmt.Errorf("no device topology provider found for device %s", deviceName) + } + return provider.getDeviceTopology() +} + +type DeviceTopology struct { + Devices map[string]DeviceInfo +} + +type DeviceInfo struct { + Health string + NumaNodes []int +} + +func (i DeviceInfo) GetNUMANode() []int { + if i.NumaNodes == nil { + return []int{} + } + return i.NumaNodes +} + +type deviceTopologyProviderImpl struct { + mutex sync.RWMutex + resourceNames []string + + deviceTopology *DeviceTopology + numaTopologyReady bool +} + +func NewDeviceTopologyProvider(resourceNames []string) DeviceTopologyProvider { + deviceTopology, err := initDeviceTopology(resourceNames) + if err != nil { + deviceTopology = getEmptyDeviceTopology() + general.Warningf("initDeviceTopology failed with error: %v", err) + } else { + general.Infof("initDeviceTopology success: %v", deviceTopology) + } + + return &deviceTopologyProviderImpl{ + resourceNames: resourceNames, + deviceTopology: deviceTopology, + } +} + +func (p *deviceTopologyProviderImpl) setDeviceTopology(deviceTopology *DeviceTopology) error { + p.mutex.Lock() + defer p.mutex.Unlock() + if deviceTopology == nil { + return fmt.Errorf("deviceTopology is nil") + } + + p.deviceTopology = deviceTopology + p.numaTopologyReady = checkDeviceNUMATopologyReady(deviceTopology) + return nil +} + +func (p *deviceTopologyProviderImpl) getDeviceTopology() (*DeviceTopology, bool, error) { + p.mutex.RLock() + defer p.mutex.RUnlock() + + return p.deviceTopology, p.numaTopologyReady, nil +} + +func getEmptyDeviceTopology() *DeviceTopology { + return &DeviceTopology{ + Devices: make(map[string]DeviceInfo), + } +} + +func initDeviceTopology(resourceNames []string) (*DeviceTopology, error) { + deviceTopology := getEmptyDeviceTopology() + + kubeletCheckpoint, err := native.GetKubeletCheckpoint() + if err != nil { + general.Errorf("Failed to get kubelet checkpoint: %v", err) + return deviceTopology, nil + } + + _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() + for _, resourceName := range resourceNames { + gpuDevice, ok := registeredDevs[resourceName] + if !ok { + continue + } + + for _, id := range gpuDevice { + // get NUMA node from UpdateAllocatableAssociatedDevices + deviceTopology.Devices[id] = DeviceInfo{} + } + } + + return deviceTopology, nil +} + +func checkDeviceNUMATopologyReady(topology *DeviceTopology) bool { + if topology == nil { + return false + } + + for _, device := range topology.Devices { + if device.NumaNodes == nil { + return false + } + } + return true +} diff --git a/pkg/util/machine/rdma.go b/pkg/util/machine/rdma.go new file mode 100644 index 0000000000..46a0d79185 --- /dev/null +++ b/pkg/util/machine/rdma.go @@ -0,0 +1,36 @@ +package machine + +import "sync" + +type RDMATopologyProvider interface { + GetRDMATopology() (*RDMATopology, bool, error) + SetRDMATopology(*RDMATopology) error +} + +type RDMATopology struct { + RDMAs map[string]RDMAInfo +} + +type RDMAInfo struct 
{
+	Health    string
+	NUMANodes []int
+}
+
+func (i RDMAInfo) GetNUMANode() []int {
+	if i.NUMANodes == nil {
+		return []int{}
+	}
+	return i.NUMANodes
+}
+
+type rdmaTopologyProviderImpl struct {
+	mutex         sync.RWMutex
+	resourceNames []string
+
+	rdmaTopology      *RDMATopology
+	numaTopologyReady bool
+}
+
+// NewRDMATopologyProvider returns an RDMA topology provider. Minimal sketch
+// mirroring deviceTopologyProviderImpl in device.go: it starts from an empty
+// topology that SetRDMATopology fills in later.
+func NewRDMATopologyProvider(resourceNames []string) RDMATopologyProvider {
+	return &rdmaTopologyProviderImpl{
+		resourceNames: resourceNames,
+		rdmaTopology:  &RDMATopology{RDMAs: make(map[string]RDMAInfo)},
+	}
+}
+
+func (p *rdmaTopologyProviderImpl) GetRDMATopology() (*RDMATopology, bool, error) {
+	p.mutex.RLock()
+	defer p.mutex.RUnlock()
+	return p.rdmaTopology, p.numaTopologyReady, nil
+}
+
+func (p *rdmaTopologyProviderImpl) SetRDMATopology(rdmaTopology *RDMATopology) error {
+	p.mutex.Lock()
+	defer p.mutex.Unlock()
+	p.rdmaTopology = rdmaTopology
+	p.numaTopologyReady = rdmaTopology != nil
+	return nil
+}

From f77d28e3ad5cb8714887a1dfc733a685897e2a5f Mon Sep 17 00:00:00 2001
From: "justin.cheng" 
Date: Mon, 13 Oct 2025 19:38:53 +0800
Subject: [PATCH 14/52] feat: refactor state to only be in one file

---
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  |  60 ++--
 .../qrm-plugins/gpu/customdeviceplugin/gpu.go |  40 +--
 .../customdeviceplugin/registry/registry.go   |   5 +-
 .../gpu/resourceplugin/gpumemory/gpu_mem.go   | 108 +++---
 .../gpu/resourceplugin/registry/registry.go   |   5 +-
 .../gpumemory => }/checkpoint.go              |  14 +-
 .../state/resourceplugin/gpumemory/state.go   | 283 ----------------
 .../state/resourceplugin/gpumemory/util.go    | 133 --------
 .../state/resourceplugin/rdma/checkpoint.go   |  46 ---
 .../gpu/state/resourceplugin/rdma/state.go    | 206 -----------
 .../resourceplugin/rdma/state_checkpoint.go   | 241 -------------
 .../state/resourceplugin/rdma/state_mem.go    | 112 ------
 .../gpu/state/resourceplugin/rdma/util.go     |  56 ---
 pkg/agent/qrm-plugins/gpu/state/state.go      | 319 ++++++++++++++++++
 .../gpumemory => }/state_checkpoint.go        |  50 +--
 .../gpumemory => }/state_mem.go               |  81 +++--
 pkg/agent/qrm-plugins/gpu/state/util.go       | 161 +++++++++
 .../qrm-plugins/gpu/staticpolicy/policy.go    | 103 +++++-
 pkg/util/machine/device.go                    |  12 +-
 19 files changed, 748 insertions(+), 1287 deletions(-)
 rename pkg/agent/qrm-plugins/gpu/state/{resourceplugin/gpumemory => }/checkpoint.go (81%)
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/state.go
 rename pkg/agent/qrm-plugins/gpu/state/{resourceplugin/gpumemory => }/state_checkpoint.go (78%)
 rename pkg/agent/qrm-plugins/gpu/state/{resourceplugin/gpumemory => }/state_mem.go (50%)
 create mode 100644 pkg/agent/qrm-plugins/gpu/state/util.go

diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index f0c31f65d8..484e198adf 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -23,12 +23,13 @@ import (
 	"sync"
 
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
-	gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
 	"github.com/kubewharf/katalyst-core/pkg/config"
"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" @@ -60,7 +61,7 @@ type BasePlugin struct { AssociatedDevicesName sets.String // Map of checkpoints for each sub-plugin - StateCheckpointsMap map[string]interface{} + State state.State // Registry of device topology providers DeviceTopologyRegistry *machine.DeviceTopologyRegistry } @@ -70,6 +71,13 @@ func NewBasePlugin( ) (*BasePlugin, error) { deviceTopologyRegistry := machine.NewDeviceTopologyRegistry() + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, + gpuconsts.GPUResourcePluginPolicyNameStatic, deviceTopologyRegistry, conf.SkipGPUStateCorruption, wrappedEmitter) + + if err != nil { + return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err) + } + return &BasePlugin{ QosConfig: conf.QoSConfiguration, QrmConfig: conf.QRMPluginsConfiguration, @@ -82,32 +90,11 @@ func NewBasePlugin( PodLabelKeptKeys: conf.PodLabelKeptKeys, AssociatedDevicesName: sets.NewString(conf.GPUResourceNames...), - StateCheckpointsMap: make(map[string]interface{}), + State: stateImpl, DeviceTopologyRegistry: deviceTopologyRegistry, }, nil } -// RegisterCheckpoint is a hook to register a state for a sub-plugin so all other plugins can access it -func (p *BasePlugin) RegisterCheckpoint(pluginName string, checkpoint interface{}) { - p.mu.Lock() - defer p.mu.Unlock() - - p.StateCheckpointsMap[pluginName] = checkpoint -} - -// GetCheckpoint gets the checkpoint using the sub-plugin name -func (p *BasePlugin) GetCheckpoint(pluginName string) (interface{}, error) { - p.mu.RLock() - defer p.mu.RUnlock() - - checkpoint, ok := p.StateCheckpointsMap[pluginName] - if !ok { - return nil, fmt.Errorf("no checkpoint found for plugin %s", pluginName) - } - - return checkpoint, nil -} - func (p *BasePlugin) GetGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { gpuCount := float64(0) gpuNames := sets.NewString() @@ -134,7 +121,7 @@ func (p *BasePlugin) GetResourcePluginOptions( } func (p *BasePlugin) PackAllocationResponse( - req *pluginapi.ResourceRequest, allocationInfo *gpumemorystate.AllocationInfo, + req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, resourceAllocationAnnotations map[string]string, resourceName string, ) (*pluginapi.ResourceAllocationResponse, error) { if allocationInfo == nil { @@ -158,7 +145,7 @@ func (p *BasePlugin) PackAllocationResponse( resourceName: { IsNodeResource: true, IsScalarResource: true, // to avoid re-allocating - AllocatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, + AllocatedQuantity: allocationInfo.AllocatedAllocation.Quantity, Annotations: resourceAllocationAnnotations, ResourceHints: &pluginapi.ListOfTopologyHints{ Hints: []*pluginapi.TopologyHint{ @@ -175,20 +162,15 @@ func (p *BasePlugin) PackAllocationResponse( func (p *BasePlugin) CalculateAssociatedDevices( gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, - request *pluginapi.AssociatedDeviceRequest, + request *pluginapi.AssociatedDeviceRequest, resourceName v1.ResourceName, ) ([]string, map[string]float64, error) { gpuRequest := request.DeviceRequest.GetDeviceRequest() gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) - state, err := p.GetCheckpoint(gpuconsts.GPUMemPluginName) - if err != nil { - return nil, nil, err - } - gpuMemoryState, ok := state.(gpumemorystate.State) + machineState, ok := p.State.GetMachineState()[resourceName] if !ok { - return nil, 
nil, fmt.Errorf("failed to convert state checkpoint to gpumemorystate.State") + return nil, nil, fmt.Errorf("no machine state for resource %s", resourceName) } - machineState := gpuMemoryState.GetMachineState() allocatedDevices := sets.NewString() needed := gpuRequest @@ -216,9 +198,9 @@ func (p *BasePlugin) CalculateAssociatedDevices( // allocate must include devices first for _, device := range reusableDevices { - if machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { + if machineState.IsGPURequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) { general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) + device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) } allocateDevices(device) } @@ -244,7 +226,7 @@ func (p *BasePlugin) CalculateAssociatedDevices( } sort.SliceStable(availableDevices, func(i, j int) bool { - return machineState.GPUMemoryAllocated(availableDevices[i]) > machineState.GPUMemoryAllocated(availableDevices[j]) + return machineState.GetQuantityAllocated(availableDevices[i]) > machineState.GetQuantityAllocated(availableDevices[j]) }) // second allocate available and numa-affinity gpus @@ -257,9 +239,9 @@ func (p *BasePlugin) CalculateAssociatedDevices( continue } - if !machineState.GPUMemorySatisfiedRequest(device, gpuMemoryPerGPU) { + if !machineState.IsGPURequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) { general.Infof("available numa affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, machineState.GetGPUMemoryAllocatable(device), machineState.GPUMemoryAllocated(device), gpuMemoryPerGPU) + device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) continue } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go index 8251471b0e..aadea553ea 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go @@ -19,13 +19,13 @@ package customdeviceplugin import ( "fmt" + "github.com/kubewharf/katalyst-api/pkg/consts" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" - "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" - gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" @@ -37,13 +37,13 @@ type GPUDevicePlugin struct { *baseplugin.BasePlugin } -func NewGPUDevicePlugin(base *baseplugin.BasePlugin, _ metrics.MetricEmitter) (CustomDevicePlugin, error) { +func NewGPUDevicePlugin(base *baseplugin.BasePlugin) CustomDevicePlugin { gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUResourceNames) 
base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceName, gpuTopologyProvider) return &GPUDevicePlugin{ BasePlugin: base, - }, nil + } } func (p *GPUDevicePlugin) DeviceName() string { @@ -113,13 +113,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi "deviceRequest", req.DeviceRequest.DeviceRequest, ) - gpuMemoryState, ok := p.StateCheckpointsMap[gpuconsts.GPUMemPluginName].(gpumemorystate.State) - if !ok { - return nil, fmt.Errorf("failed to convert state checkpoint to gpumemorystate.State") - } - machineState := gpuMemoryState.GetMachineState() - - allocationInfo := gpuMemoryState.GetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName) if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) for gpuID := range allocationInfo.TopologyAwareAllocations { @@ -155,43 +149,43 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi return nil, fmt.Errorf("numa topology is not ready") } - allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req) + allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req, consts.ResourceGPUMemory) if err != nil { general.Warningf("failed to allocate associated devices: %v", err) return nil, err } - topologyAwareAllocations := make(map[string]gpumemorystate.GPUAllocation) + topologyAwareAllocations := make(map[string]state.Allocation) for _, device := range allocatedDevices { info, ok := gpuTopology.Devices[device] if !ok { return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) } - topologyAwareAllocations[device] = gpumemorystate.GPUAllocation{ - GPUMemoryQuantity: allocatedGPUMemory[device], - NUMANodes: info.GetNUMANode(), + topologyAwareAllocations[device] = state.Allocation{ + Quantity: allocatedGPUMemory[device], + NUMANodes: info.GetNUMANode(), } } if allocationInfo == nil { - allocationInfo = &gpumemorystate.AllocationInfo{ + allocationInfo = &state.AllocationInfo{ AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: gpumemorystate.GPUAllocation{ - GPUMemoryQuantity: gpuMemoryRequest, - NUMANodes: hintNodes.ToSliceInt(), + AllocatedAllocation: state.Allocation{ + Quantity: gpuMemoryRequest, + NUMANodes: hintNodes.ToSliceInt(), }, } } allocationInfo.TopologyAwareAllocations = topologyAwareAllocations - gpuMemoryState.SetAllocationInfo(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) - machineState, err = gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, gpuMemoryState.GetPodEntries(), p.DeviceTopologyRegistry) + p.State.SetAllocationInfo(consts.ResourceGPUMemory, req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) + machineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) if err != nil { return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) } - gpuMemoryState.SetMachineState(machineState, true) + p.State.SetMachineState(machineState, true) general.InfoS("allocated devices", "podNamespace", 
req.ResourceRequest.PodNamespace,
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go
index c0f144d11e..dc3faebc25 100644
--- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go
@@ -19,12 +19,9 @@ package registry
 import (
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
-	"github.com/kubewharf/katalyst-core/pkg/metrics"
 )
 
-type initFunc func(
-	plugin *baseplugin.BasePlugin, emitter metrics.MetricEmitter,
-) (customdeviceplugin.CustomDevicePlugin, error)
+type initFunc func(plugin *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin
 
 var CustomDevicePluginsMap = make(map[string]initFunc)
 
diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
index af23b8c8a7..5ed486df28 100644
--- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
+++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
@@ -34,7 +34,7 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
-	gpumemorystate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
 	gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
 	"github.com/kubewharf/katalyst-core/pkg/metrics"
@@ -43,35 +43,18 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/util/metric"
 )
 
-const (
-	GPUMemPluginStateFileName = "gpu_mem_plugin_state"
-)
-
 type GPUMemPlugin struct {
 	sync.Mutex
 	*baseplugin.BasePlugin
 
-	state gpumemorystate.State
+	state state.State
 
 	residualHitMap map[string]int64
 }
 
-func NewGPUMemPlugin(
-	base *baseplugin.BasePlugin, wrappedEmitter metrics.MetricEmitter,
-) (resourceplugin.ResourcePlugin, error) {
-	stateImpl, err := gpumemorystate.NewCheckpointState(base.QRMPluginsConfiguration, base.GenericQRMPluginConfiguration.StateFileDirectory, GPUMemPluginStateFileName,
-		gpuconsts.GPUResourcePluginPolicyNameStatic, base.DeviceTopologyRegistry, base.SkipGPUStateCorruption, wrappedEmitter)
-
-	if err != nil {
-		return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err)
-	}
-
-	base.RegisterCheckpoint(gpuconsts.GPUMemPluginName, stateImpl)
-
+func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin {
+	// reuse the consolidated state owned by the base plugin, and keep
+	// residualHitMap initialized since ClearResidualState still writes to it
 	return &GPUMemPlugin{
-		BasePlugin:     base,
-		state:          stateImpl,
-		residualHitMap: make(map[string]int64),
-	}, nil
+		BasePlugin:     base,
+		state:          base.State,
+		residualHitMap: make(map[string]int64),
+	}
 }
 
 func (p *GPUMemPlugin) ResourceName() string {
@@ -133,21 +116,22 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p
 	}()
 
 	var hints map[string]*pluginapi.ListOfTopologyHints
-	machineState := p.state.GetMachineState()
-	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
+	machineState := p.state.GetMachineState()[consts.ResourceGPUMemory]
+	allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName)
+
 	if allocationInfo != nil {
-		hints = gpumemorystate.RegenerateGPUMemoryHints(allocationInfo, false)
+		hints = 
state.RegenerateGPUMemoryHints(allocationInfo, false) // regenerateHints failed. need to clear container record and re-calculate. if hints == nil { - podEntries := p.state.GetPodEntries() + podEntries := p.state.GetPodEntries(consts.ResourceGPUMemory) delete(podEntries[req.PodUid], req.ContainerName) if len(podEntries[req.PodUid]) == 0 { delete(podEntries, req.PodUid) } var err error - machineState, err = gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, p.state.GetPodEntries(), p.DeviceTopologyRegistry) + machineState, err = state.GenerateResourceStateFromPodEntries(podEntries, p.DeviceTopologyRegistry, consts.ResourceGPUMemory) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -170,7 +154,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p } func (p *GPUMemPlugin) calculateHints( - gpuMemory float64, gpuReq float64, machineState gpumemorystate.GPUMap, req *pluginapi.ResourceRequest, + gpuMemory float64, gpuReq float64, machineState state.AllocationMap, req *pluginapi.ResourceRequest, ) (map[string]*pluginapi.ListOfTopologyHints, error) { gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) if err != nil { @@ -191,7 +175,7 @@ func (p *GPUMemPlugin) calculateHints( continue } - if s.GetGPUMemoryAllocated()+perGPUMemory <= s.GetGPUMemoryAllocatable() { + if s.GetQuantityAllocated()+perGPUMemory <= float64(p.GPUMemoryAllocatablePerGPU.Value()) { info, ok := gpuTopology.Devices[gpuID] if !ok { return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) @@ -199,7 +183,7 @@ func (p *GPUMemPlugin) calculateHints( for _, numaNode := range info.GetNUMANode() { numaToAvailableGPUCount[numaNode] += 1 - numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetGPUMemoryAllocated(), numaToMostAllocatedGPUMemory[numaNode]) + numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetQuantityAllocated(), numaToMostAllocatedGPUMemory[numaNode]) } } } @@ -320,7 +304,7 @@ func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints( func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") - allocationInfo := p.state.GetAllocationInfo(podUID, containerName) + allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName) if allocationInfo == nil { return nil, nil } @@ -328,7 +312,7 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) ( topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: alloc.GPUMemoryQuantity, + ResourceValue: alloc.Quantity, Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -347,8 +331,8 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) ( p.ResourceName(): { IsNodeResource: true, IsScalarResource: true, - AggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, - OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.GPUMemoryQuantity, + AggregatedQuantity: allocationInfo.AllocatedAllocation.Quantity, + OriginalAggregatedQuantity: 
allocationInfo.AllocatedAllocation.Quantity, TopologyAwareQuantityList: topologyAwareQuantityList, OriginalTopologyAwareQuantityList: topologyAwareQuantityList, }, @@ -365,16 +349,16 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca p.Lock() defer p.Unlock() - machineState := p.state.GetMachineState() + machineState := p.state.GetMachineState()[consts.ResourceGPUMemory] topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 - for deviceID, gpuState := range machineState { - aggregatedAllocatableQuantity += gpuState.GetGPUMemoryAllocatable() - aggregatedCapacityQuantity += gpuState.GetGPUMemoryAllocatable() + for deviceID := range machineState { + aggregatedAllocatableQuantity += float64(p.GPUMemoryAllocatablePerGPU.Value()) + aggregatedCapacityQuantity += float64(p.GPUMemoryAllocatablePerGPU.Value()) topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuState.GetGPUMemoryAllocatable(), + ResourceValue: float64(p.GPUMemoryAllocatablePerGPU.Value()), Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -382,7 +366,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuState.GetGPUMemoryAllocatable(), + ResourceValue: float64(p.GPUMemoryAllocatablePerGPU.Value()), Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -472,10 +456,10 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return emptyResponse, nil } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating - return p.PackAllocationResponse(req, &gpumemorystate.AllocationInfo{}, nil, p.ResourceName()) + return p.PackAllocationResponse(req, &state.AllocationInfo{}, nil, p.ResourceName()) } - allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) + allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) if allocationInfo != nil { resp, packErr := p.PackAllocationResponse(req, allocationInfo, nil, p.ResourceName()) if packErr != nil { @@ -493,17 +477,17 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return nil, fmt.Errorf("failed to get hint nodes: %v", err) } - newAllocation := &gpumemorystate.AllocationInfo{ + newAllocation := &state.AllocationInfo{ AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel), - AllocatedAllocation: gpumemorystate.GPUAllocation{ - GPUMemoryQuantity: gpuMemory, - NUMANodes: hintNodes.ToSliceInt(), + AllocatedAllocation: state.Allocation{ + Quantity: gpuMemory, + NUMANodes: hintNodes.ToSliceInt(), }, } - p.state.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation, false) + p.state.SetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName, newAllocation, false) - machineState, stateErr := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, p.state.GetPodEntries(), p.DeviceTopologyRegistry) + machineState, stateErr := 
state.GenerateMachineStateFromPodEntries(p.state.GetPodResourceEntries(), p.DeviceTopologyRegistry) if stateErr != nil { general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", "podNamespace", req.PodNamespace, @@ -520,16 +504,16 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso } func (p *GPUMemPlugin) RemovePod(podUID string) error { - podEntries := p.state.GetPodEntries() - delete(podEntries, podUID) + podResourceEntries := p.state.GetPodResourceEntries() + delete(podResourceEntries[consts.ResourceGPUMemory], podUID) - machineState, err := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.DeviceTopologyRegistry) + machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) if err != nil { general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } - p.state.SetPodEntries(podEntries, false) + p.state.SetPodResourceEntries(podResourceEntries, false) p.state.SetMachineState(machineState, false) err = p.state.StoreState() @@ -572,12 +556,14 @@ func (p *GPUMemPlugin) ClearResidualState() { p.Lock() defer p.Unlock() - podEntries := p.state.GetPodEntries() - for podUID := range podEntries { - if !podSet.Has(podUID) { - residualSet[podUID] = true - p.residualHitMap[podUID] += 1 - general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) + podResourceEntries := p.state.GetPodResourceEntries() + for _, podEntries := range podResourceEntries { + for podUID := range podEntries { + if !podSet.Has(podUID) { + residualSet[podUID] = true + p.residualHitMap[podUID] += 1 + general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) + } } } @@ -602,16 +588,16 @@ func (p *GPUMemPlugin) ClearResidualState() { } general.Infof("clear residual pod: %s in state", podUID) - delete(podEntries, podUID) + podResourceEntries.RemovePod(podUID) } - machineState, err := gpumemorystate.GenerateMachineStateFromPodEntries(p.QrmConfig, podEntries, p.DeviceTopologyRegistry) + machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return } - p.state.SetPodEntries(podEntries, false) + p.state.SetPodResourceEntries(podResourceEntries, false) p.state.SetMachineState(machineState, false) err = p.state.StoreState() diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go index a20d6a2af5..1a852c902a 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go @@ -21,12 +21,9 @@ import ( gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory" - "github.com/kubewharf/katalyst-core/pkg/metrics" ) -type initFunc func( - plugin *baseplugin.BasePlugin, wrappedEmitter metrics.MetricEmitter, -) (resourceplugin.ResourcePlugin, error) +type initFunc func(plugin *baseplugin.BasePlugin) resourceplugin.ResourcePlugin var ResourcePluginsMap = make(map[string]initFunc) 
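[Reviewer note, illustrative only, not part of the patch] With the per-plugin checkpoints removed, every sub-plugin shares one state.State keyed by resource name first, then pod UID and container name. The sketch below shows the common write path under that model; the recordAllocation helper and its package are hypothetical, while the SetAllocationInfo, GetPodResourceEntries, GenerateMachineStateFromPodEntries, and SetMachineState calls follow the signatures introduced in this patch:

package example

import (
	"fmt"

	"github.com/kubewharf/katalyst-api/pkg/consts"

	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
	"github.com/kubewharf/katalyst-core/pkg/util/machine"
)

// recordAllocation sketches how both the gpu-memory resource plugin and the
// gpu custom device plugin persist an allocation after this refactor.
func recordAllocation(
	s state.State, registry *machine.DeviceTopologyRegistry,
	podUID, containerName string, alloc *state.AllocationInfo,
) error {
	// Entries are keyed by resource name first, then pod UID and container.
	s.SetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName, alloc, false)

	// Machine state is regenerated from the full pod-resource entries map,
	// so every sub-plugin observes the same view.
	machineState, err := state.GenerateMachineStateFromPodEntries(
		s.GetPodResourceEntries(), registry)
	if err != nil {
		return fmt.Errorf("failed to regenerate machine state: %v", err)
	}

	// persist=true writes both maps into the single checkpoint file; RemovePod
	// above instead passes false and calls StoreState() once at the end.
	s.SetMachineState(machineState, true)
	return nil
}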
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go similarity index 81% rename from pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go rename to pkg/agent/qrm-plugins/gpu/state/checkpoint.go index 0f674baf6b..fcd518ff97 100644 --- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/checkpoint.go +++ b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package gpumemory +package state import ( "encoding/json" @@ -26,16 +26,16 @@ import ( var _ checkpointmanager.Checkpoint = &GPUPluginCheckpoint{} type GPUPluginCheckpoint struct { - PolicyName string `json:"policyName"` - MachineState GPUMap `json:"machineState"` - PodEntries PodEntries `json:"pod_entries"` - Checksum checksum.Checksum `json:"checksum"` + PolicyName string `json:"policyName"` + MachineState AllocationResourcesMap `json:"machineState"` + PodResourceEntries PodResourceEntries `json:"pod_entries"` + Checksum checksum.Checksum `json:"checksum"` } func NewGPUPluginCheckpoint() *GPUPluginCheckpoint { return &GPUPluginCheckpoint{ - PodEntries: make(PodEntries), - MachineState: make(GPUMap), + PodResourceEntries: make(PodResourceEntries), + MachineState: make(AllocationResourcesMap), } } diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go deleted file mode 100644 index 61cf273f1e..0000000000 --- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state.go +++ /dev/null @@ -1,283 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package gpumemory - -import ( - "encoding/json" - - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" - "github.com/kubewharf/katalyst-core/pkg/util/general" -) - -type AllocationInfo struct { - commonstate.AllocationMeta `json:",inline"` - - AllocatedAllocation GPUAllocation `json:"allocated_allocation"` - TopologyAwareAllocations map[string]GPUAllocation `json:"topology_aware_allocations"` -} - -type GPUAllocation struct { - GPUMemoryQuantity float64 `json:"gpu_memory_quantity"` - NUMANodes []int `json:"numa_nodes"` -} - -func (a *GPUAllocation) Clone() GPUAllocation { - if a == nil { - return GPUAllocation{} - } - - numaNodes := make([]int, len(a.NUMANodes)) - copy(numaNodes, a.NUMANodes) - return GPUAllocation{ - GPUMemoryQuantity: a.GPUMemoryQuantity, - NUMANodes: numaNodes, - } -} - -type ( - ContainerEntries map[string]*AllocationInfo // Keyed by container name - PodEntries map[string]ContainerEntries // Keyed by pod UID -) - -type GPUMemoryState struct { - GPUMemoryAllocatable float64 `json:"gpu_memory_allocatable"` - GPUMemoryAllocated float64 `json:"gpu_memory_allocated"` - PodEntries PodEntries `json:"pod_entries"` -} - -type GPUMap map[string]*GPUMemoryState // GPUMap keyed by gpu device name i.e. 
GPU-fef8089b-4820-abfc-e83e-94318197576e - -func (i *AllocationInfo) String() string { - if i == nil { - return "" - } - - contentBytes, err := json.Marshal(i) - if err != nil { - general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err) - return "" - } - return string(contentBytes) -} - -func (i *AllocationInfo) Clone() *AllocationInfo { - if i == nil { - return nil - } - - clone := &AllocationInfo{ - AllocationMeta: *i.AllocationMeta.Clone(), - AllocatedAllocation: i.AllocatedAllocation.Clone(), - } - - if i.TopologyAwareAllocations != nil { - clone.TopologyAwareAllocations = make(map[string]GPUAllocation) - for k, v := range i.TopologyAwareAllocations { - clone.TopologyAwareAllocations[k] = v.Clone() - } - } - - return clone -} - -func (e PodEntries) String() string { - if e == nil { - return "" - } - - contentBytes, err := json.Marshal(e) - if err != nil { - general.LoggerWithPrefix("PodEntries.String", general.LoggingPKGFull).Errorf("marshal PodEntries failed with error: %v", err) - return "" - } - return string(contentBytes) -} - -func (e PodEntries) Clone() PodEntries { - clone := make(PodEntries) - for podUID, containerEntries := range e { - clone[podUID] = make(ContainerEntries) - for containerName, allocationInfo := range containerEntries { - clone[podUID][containerName] = allocationInfo.Clone() - } - } - return clone -} - -func (e PodEntries) GetAllocationInfo(uid string, name string) *AllocationInfo { - if e == nil { - return nil - } - - if containerEntries, ok := e[uid]; ok { - if allocationInfo, ok := containerEntries[name]; ok { - return allocationInfo.Clone() - } - } - return nil -} - -func (e PodEntries) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { - if e == nil { - return - } - - if _, ok := e[podUID]; !ok { - e[podUID] = make(ContainerEntries) - } - - e[podUID][containerName] = allocationInfo.Clone() -} - -func (ms *GPUMemoryState) String() string { - if ms == nil { - return "" - } - - contentBytes, err := json.Marshal(ms) - if err != nil { - general.LoggerWithPrefix("GPUMemoryState.String", general.LoggingPKGFull).Errorf("marshal GPUMemoryState failed with error: %v", err) - return "" - } - return string(contentBytes) -} - -func (ms *GPUMemoryState) Clone() *GPUMemoryState { - if ms == nil { - return nil - } - - return &GPUMemoryState{ - GPUMemoryAllocatable: ms.GPUMemoryAllocatable, - GPUMemoryAllocated: ms.GPUMemoryAllocated, - PodEntries: ms.PodEntries.Clone(), - } -} - -func (ms *GPUMemoryState) GetGPUMemoryAllocatable() float64 { - if ms == nil { - return 0 - } - - return ms.GPUMemoryAllocatable -} - -func (ms *GPUMemoryState) GetGPUMemoryAllocated() float64 { - if ms == nil { - return 0 - } - - return ms.GPUMemoryAllocated -} - -// SetAllocationInfo adds a new AllocationInfo (for pod/container pairs) into the given GPUMemoryState -func (ms *GPUMemoryState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { - if ms == nil { - return - } - - if allocationInfo == nil { - general.LoggerWithPrefix("GPUMemoryState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil") - return - } - - if ms.PodEntries == nil { - ms.PodEntries = make(PodEntries) - } - - if _, ok := ms.PodEntries[podUID]; !ok { - ms.PodEntries[podUID] = make(ContainerEntries) - } - - ms.PodEntries[podUID][containerName] = allocationInfo.Clone() -} - -func (m GPUMap) Clone() GPUMap { - clone := make(GPUMap) - for id, ns := 
range m { - clone[id] = ns.Clone() - } - return clone -} - -func (m GPUMap) String() string { - if m == nil { - return "" - } - - contentBytes, err := json.Marshal(m) - if err != nil { - general.LoggerWithPrefix("GPUMap.String", general.LoggingPKGFull).Errorf("marshal GPUMap failed with error: %v", err) - return "" - } - return string(contentBytes) -} - -func (m GPUMap) GetGPUMemoryAllocatable(id string) float64 { - if m == nil { - return 0 - } - return m[id].GetGPUMemoryAllocatable() -} - -func (m GPUMap) GPUMemoryAllocated(id string) float64 { - if m == nil { - return 0 - } - - return m[id].GetGPUMemoryAllocated() -} - -func (m GPUMap) GPUMemorySatisfiedRequest(id string, request float64) bool { - if m == nil { - return false - } - - gpuMemoryAllocatable := m.GetGPUMemoryAllocatable(id) - gpuMemoryAllocated := m.GPUMemoryAllocated(id) - return gpuMemoryAllocatable-gpuMemoryAllocated >= request -} - -// reader is used to get information from local states -type reader interface { - GetMachineState() GPUMap - GetPodEntries() PodEntries - GetAllocationInfo(podUID, containerName string) *AllocationInfo -} - -// writer is used to store information into local states, -// and it also provides functionality to maintain the local files -type writer interface { - SetMachineState(gpuMap GPUMap, persist bool) - SetPodEntries(podEntries PodEntries, persist bool) - SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, persist bool) - - Delete(podUID, containerName string, persist bool) - ClearState() - StoreState() error -} - -// ReadonlyState interface only provides methods for tracking pod assignments -type ReadonlyState interface { - reader -} - -// State interface provides methods for tracking and setting pod assignments -type State interface { - writer - ReadonlyState -} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go deleted file mode 100644 index a72f109ad9..0000000000 --- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/util.go +++ /dev/null @@ -1,133 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package gpumemory - -import ( - "fmt" - - "github.com/kubewharf/katalyst-api/pkg/consts" - gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" - "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" - "github.com/kubewharf/katalyst-core/pkg/util/general" - "github.com/kubewharf/katalyst-core/pkg/util/machine" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" -) - -func GenerateMachineState( - conf *qrm.QRMPluginsConfiguration, topologyRegistry *machine.DeviceTopologyRegistry, -) (GPUMap, error) { - if topologyRegistry == nil { - return nil, fmt.Errorf("topology provider registry must not be nil") - } - - gpuTopology, _, err := topologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) - if err != nil { - return nil, fmt.Errorf("gpu topology provider failed with error: %v", err) - } - - gpuMap := make(GPUMap) - for deviceID := range gpuTopology.Devices { - gpuMap[deviceID] = &GPUMemoryState{ - GPUMemoryAllocatable: float64(conf.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()), - GPUMemoryAllocated: 0, - } - } - - return gpuMap, nil -} - -// GenerateMachineStateFromPodEntries returns GPUMap for gpu memory based on -// machine info along with existed pod entries -func GenerateMachineStateFromPodEntries( - conf *qrm.QRMPluginsConfiguration, - podEntries PodEntries, - topologyRegistry *machine.DeviceTopologyRegistry, -) (GPUMap, error) { - machineState, err := GenerateMachineState(conf, topologyRegistry) - if err != nil { - return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) - } - - for gpuID, gpuState := range machineState { - var gpuMemoryAllocated float64 - for podUID, containerEntries := range podEntries { - for containerName, allocationInfo := range containerEntries { - if containerName != "" && allocationInfo != nil { - gpuAllocation, ok := allocationInfo.TopologyAwareAllocations[gpuID] - if !ok { - continue - } - alloc := allocationInfo.Clone() - alloc.AllocatedAllocation = gpuAllocation.Clone() - alloc.TopologyAwareAllocations = map[string]GPUAllocation{gpuID: gpuAllocation} - gpuMemoryAllocated += gpuAllocation.GPUMemoryQuantity - gpuState.SetAllocationInfo(podUID, containerName, alloc) - } - } - } - gpuState.GPUMemoryAllocated = gpuMemoryAllocated - - generalLog := general.LoggerWithPrefix("GenerateMachineStateFromPodEntries", general.LoggingPKGFull) - if gpuState.GPUMemoryAllocatable < gpuState.GPUMemoryAllocated { - generalLog.Warningf("invalid allocated GPU memory: %f on GPU: %s"+ - " with allocatable GPU memory size: %f", gpuState.GPUMemoryAllocated, gpuID, gpuState.GPUMemoryAllocatable) - } - machineState[gpuID] = gpuState - } - - return machineState, nil -} - -// RegenerateGPUMemoryHints regenerates hints for container that'd already been allocated gpu memory, -// and regenerateHints will assemble hints based on already-existed AllocationInfo, -// without any calculation logics at all -func RegenerateGPUMemoryHints( - allocationInfo *AllocationInfo, regenerate bool, -) map[string]*pluginapi.ListOfTopologyHints { - if allocationInfo == nil { - general.Errorf("RegenerateHints got nil allocationInfo") - return nil - } - - hints := map[string]*pluginapi.ListOfTopologyHints{} - if regenerate { - general.ErrorS(nil, "need to regenerate hints", - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "podUID", allocationInfo.PodUid, - "containerName", allocationInfo.ContainerName) - - return nil - } - - allocatedNumaNodes := 
machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...) - - general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod", - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "containerName", allocationInfo.ContainerName, - "hint", allocatedNumaNodes) - hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ - Hints: []*pluginapi.TopologyHint{ - { - Nodes: allocatedNumaNodes.ToSliceUInt64(), - Preferred: true, - }, - }, - } - return hints -} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go deleted file mode 100644 index 77faff321a..0000000000 --- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/checkpoint.go +++ /dev/null @@ -1,46 +0,0 @@ -package rdma - -import ( - "encoding/json" - - "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" - "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" -) - -var _ checkpointmanager.Checkpoint = &RDMAPluginCheckpoint{} - -type RDMAPluginCheckpoint struct { - PolicyName string `json:"policyName"` - MachineState RDMAMap `json:"machineState"` - PodEntries PodEntries `json:"pod_entries"` - Checksum checksum.Checksum `json:"checksum"` -} - -func NewRDMAPluginCheckpoint() *RDMAPluginCheckpoint { - return &RDMAPluginCheckpoint{ - PodEntries: make(PodEntries), - MachineState: make(RDMAMap), - } -} - -// MarshalCheckpoint returns a marshaled checkpoint -func (cp *RDMAPluginCheckpoint) MarshalCheckpoint() ([]byte, error) { - // make sure checksum wasn't set before, so it doesn't affect output checksum - cp.Checksum = 0 - cp.Checksum = checksum.New(cp) - return json.Marshal(*cp) -} - -// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint -func (cp *RDMAPluginCheckpoint) UnmarshalCheckpoint(blob []byte) error { - return json.Unmarshal(blob, cp) -} - -// VerifyChecksum verifies that current checksum of checkpoint is valid -func (cp *RDMAPluginCheckpoint) VerifyChecksum() error { - ck := cp.Checksum - cp.Checksum = 0 - err := ck.Verify(cp) - cp.Checksum = ck - return err -} diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go deleted file mode 100644 index 7f0674e0d8..0000000000 --- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state.go +++ /dev/null @@ -1,206 +0,0 @@ -package rdma - -import ( - "encoding/json" - - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" - "github.com/kubewharf/katalyst-core/pkg/util/general" -) - -type AllocationInfo struct { - commonstate.AllocationMeta `json:",inline"` - - AllocatedAllocation RDMAAllocation `json:"allocatedAllocation"` - TopologyAwareAllocations map[string]RDMAAllocation `json:"topologyAwareAllocations"` -} - -type RDMAAllocation struct { - NUMANodes []int `json:"numa_nodes"` -} - -func (a *RDMAAllocation) Clone() RDMAAllocation { - if a == nil { - return RDMAAllocation{} - } - - numaNodes := make([]int, len(a.NUMANodes)) - copy(numaNodes, a.NUMANodes) - return RDMAAllocation{ - NUMANodes: numaNodes, - } -} - -type ( - ContainerEntries map[string]*AllocationInfo // Keyed by container name - PodEntries map[string]ContainerEntries // Keyed by pod name -) - -type RDMAState struct { - PodEntries PodEntries `json:"pod_entries"` -} - -type RDMAMap map[string]*RDMAState // RDMAMap keyed by RDMA NIC card ID - -func (i *AllocationInfo) String() string { - if i == nil { - return "" - } - - contentBytes, err := 
json.Marshal(i)
-
-	if err != nil {
-		general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err)
-		return ""
-	}
-	return string(contentBytes)
-}
-
-func (i *AllocationInfo) Clone() *AllocationInfo {
-	if i == nil {
-		return nil
-	}
-
-	clone := &AllocationInfo{
-		AllocationMeta:      *i.AllocationMeta.Clone(),
-		AllocatedAllocation: i.AllocatedAllocation.Clone(),
-	}
-
-	if i.TopologyAwareAllocations != nil {
-		clone.TopologyAwareAllocations = make(map[string]RDMAAllocation)
-		for k, v := range i.TopologyAwareAllocations {
-			clone.TopologyAwareAllocations[k] = v.Clone()
-		}
-	}
-
-	return clone
-}
-
-func (e PodEntries) Clone() PodEntries {
-	clone := make(PodEntries)
-	for podUID, containerEntries := range e {
-		clone[podUID] = make(ContainerEntries)
-		for containerID, allocationInfo := range containerEntries {
-			clone[podUID][containerID] = allocationInfo.Clone()
-		}
-	}
-	return clone
-}
-
-func (e PodEntries) GetAllocationInfo(podUID string, containerName string) *AllocationInfo {
-	if e == nil {
-		return nil
-	}
-
-	if containerEntries, ok := e[podUID]; ok {
-		if allocationInfo, ok := containerEntries[containerName]; ok {
-			return allocationInfo.Clone()
-		}
-	}
-	return nil
-}
-
-func (e PodEntries) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
-	if e == nil {
-		return
-	}
-
-	if _, ok := e[podUID]; !ok {
-		e[podUID] = make(ContainerEntries)
-	}
-	e[podUID][containerName] = allocationInfo.Clone()
-}
-
-func (rs *RDMAState) String() string {
-	if rs == nil {
-		return ""
-	}
-
-	contentBytes, err := json.Marshal(rs)
-	if err != nil {
-		general.LoggerWithPrefix("RDMAState.String", general.LoggingPKGFull).Errorf("marshal RDMAState failed with error: %v", err)
-		return ""
-	}
-	return string(contentBytes)
-}
-
-func (rs *RDMAState) Clone() *RDMAState {
-	if rs == nil {
-		return nil
-	}
-
-	return &RDMAState{
-		PodEntries: rs.PodEntries.Clone(),
-	}
-}
-
-func (rs *RDMAState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
-	if rs == nil {
-		return
-	}
-
-	if allocationInfo == nil {
-		general.LoggerWithPrefix("RDMAState.SetAllocationInfo", general.LoggingPKGFull).Errorf("passed allocationInfo is nil")
-		return
-	}
-
-	if rs.PodEntries == nil {
-		rs.PodEntries = make(PodEntries)
-	}
-
-	if _, ok := rs.PodEntries[podUID]; !ok {
-		rs.PodEntries[podUID] = make(ContainerEntries)
-	}
-
-	rs.PodEntries[podUID][containerName] = allocationInfo.Clone()
-}
-
-func (m RDMAMap) Clone() RDMAMap {
-	clone := make(RDMAMap)
-	for id, rs := range m {
-		clone[id] = rs.Clone()
-	}
-	return clone
-}
-
-func (m RDMAMap) String() string {
-	if m == nil {
-		return ""
-	}
-
-	contentBytes, err := json.Marshal(m)
-	if err != nil {
-		general.LoggerWithPrefix("RDMAMap.String", general.LoggingPKGFull).Errorf("marshal RDMAMap failed with error: %v", err)
-		return ""
-	}
-	return string(contentBytes)
-}
-
-// reader is used to get information from local states
-type reader interface {
-	GetMachineState() RDMAMap
-	GetPodEntries() PodEntries
-	GetAllocationInfo(podUID, containerName string) *AllocationInfo
-}
-
-// writer is used to store information from local states,
-// and it also provides functionality to maintain the local files
-type writer interface {
-	SetMachineState(rdmaMap RDMAMap, persist bool)
-	SetPodEntries(rdmaMap PodEntries, persist bool)
-	SetAllocationInfo(podUid, containerName string, allocationInfo *AllocationInfo, persist bool)
-
-	Delete(podUID, containerName string, persist bool)
-	ClearState()
-	StoreState() error
-}
-
-// ReadonlyState interface only provides methods for tracking pod assignments
-type ReadonlyState interface {
-	reader
-}
-
-// State interface provides methods for tracking and setting pod assignments
-type State interface {
-	writer
-	ReadonlyState
-}
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
deleted file mode 100644
index e791073567..0000000000
--- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_checkpoint.go
+++ /dev/null
@@ -1,241 +0,0 @@
-package rdma
-
-import (
-	"fmt"
-	"path"
-	"reflect"
-	"sync"
-	"time"
-
-	"github.com/kubewharf/katalyst-core/pkg/metrics"
-	"github.com/kubewharf/katalyst-core/pkg/util/general"
-	"github.com/kubewharf/katalyst-core/pkg/util/machine"
-	"github.com/pkg/errors"
-	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
-	cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
-)
-
-const (
-	metricMetaCacheStoreStateDuration = "metacache_store_state_duration"
-)
-
-var (
-	_          State = &stateCheckpoint{}
-	generalLog       = general.LoggerWithPrefix("rdma_plugin", general.LoggingPKGFull)
-)
-
-type stateCheckpoint struct {
-	sync.RWMutex
-	cache             State
-	policyName        string
-	checkpointManager checkpointmanager.CheckpointManager
-	checkpointName    string
-	// when we add new properties to checkpoint,
-	// it will cause checkpoint corruption and we should skip it
-	skipStateCorruption bool
-	emitter             metrics.MetricEmitter
-}
-
-func NewCheckpointState(
-	stateDir, checkpointName, policyName string,
-	topologyRegistry *machine.DeviceTopologyRegistry, skipStateCorruption bool, emitter metrics.MetricEmitter,
-) (State, error) {
-	checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
-	if err != nil {
-		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
-	}
-
-	defaultCache, err := NewRDMAPluginState(topologyRegistry)
-	if err != nil {
-		return nil, fmt.Errorf("NewRDMAPluginState failed with error: %v", err)
-	}
-
-	sc := &stateCheckpoint{
-		cache:               defaultCache,
-		policyName:          policyName,
-		checkpointName:      checkpointName,
-		checkpointManager:   checkpointManager,
-		skipStateCorruption: skipStateCorruption,
-		emitter:             emitter,
-	}
-
-	if err := sc.restoreState(topologyRegistry); err != nil {
-		return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+
-			"the rdma plugin checkpoint file %q before restarting Kubelet",
-			err, path.Join(stateDir, checkpointName))
-	}
-
-	return sc, nil
-}
-
-func (s *stateCheckpoint) SetMachineState(rdmaMap RDMAMap, persist bool) {
-	s.Lock()
-	defer s.Unlock()
-
-	s.cache.SetMachineState(rdmaMap, persist)
-	if persist {
-		err := s.storeState()
-		if err != nil {
-			generalLog.ErrorS(err, "store machineState to checkpoint error")
-		}
-	}
-}
-
-func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) {
-	s.Lock()
-	defer s.Unlock()
-
-	s.cache.SetPodEntries(podEntries, persist)
-	if persist {
-		err := s.storeState()
-		if err != nil {
-			generalLog.ErrorS(err, "store pod entries to checkpoint error", "err")
-		}
-	}
-}
-
-func (s *stateCheckpoint) SetAllocationInfo(
-	podUID, containerName string, allocationInfo *AllocationInfo, persist bool,
-) {
-	s.Lock()
-	defer s.Unlock()
-
-	s.cache.SetAllocationInfo(podUID, containerName, allocationInfo, persist)
-	if persist {
-		err := s.storeState()
-		if err != nil {
-			generalLog.ErrorS(err, "store allocationInfo to checkpoint error")
-		}
-	}
-}
-
-func (s *stateCheckpoint) storeState() error {
-	startTime := time.Now()
-	general.InfoS("called")
-	defer func() {
-		elapsed := time.Since(startTime)
-		general.InfoS("finished", "duration", elapsed)
-		_ = s.emitter.StoreFloat64(metricMetaCacheStoreStateDuration, float64(elapsed/time.Millisecond), metrics.MetricTypeNameRaw)
-	}()
-	checkpoint := NewRDMAPluginCheckpoint()
-	checkpoint.PolicyName = s.policyName
-	checkpoint.MachineState = s.cache.GetMachineState()
-	checkpoint.PodEntries = s.cache.GetPodEntries()
-
-	err := s.checkpointManager.CreateCheckpoint(s.checkpointName, checkpoint)
-	if err != nil {
-		generalLog.ErrorS(err, "could not save checkpoint")
-		return err
-	}
-	return nil
-}
-
-func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyRegistry) error {
-	s.Lock()
-	defer s.Unlock()
-	var err error
-	var foundAndSkippedStateCorruption bool
-
-	checkpoint := NewRDMAPluginCheckpoint()
-	if err = s.checkpointManager.GetCheckpoint(s.checkpointName, checkpoint); err != nil {
-		if errors.Is(err, cmerrors.ErrCheckpointNotFound) {
-			return s.storeState()
-		} else if errors.Is(err, cmerrors.ErrCorruptCheckpoint) {
-			if !s.skipStateCorruption {
-				return err
-			}
-
-			foundAndSkippedStateCorruption = true
-			generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err)
-		} else {
-			return err
-		}
-	}
-
-	if s.policyName != checkpoint.PolicyName && !s.skipStateCorruption {
-		return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName)
-	}
-
-	machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodEntries, topologyRegistry)
-	if err != nil {
-		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
-	}
-
-	s.cache.SetMachineState(machineState, false)
-	s.cache.SetPodEntries(checkpoint.PodEntries, false)
-
-	if !reflect.DeepEqual(machineState, checkpoint.MachineState) {
-		generalLog.Warningf("machine state changed: "+
-			"machineState: %s; checkpointMachineState: %s",
-			machineState.String(), checkpoint.MachineState.String())
-
-		err = s.storeState()
-		if err != nil {
-			return fmt.Errorf("storeState when machine state changed failed with error: %v", err)
-		}
-	}
-
-	if foundAndSkippedStateCorruption {
-		generalLog.Infof("found and skipped state corruption, we shoud store to rectify the checksum")
-
-		err = s.storeState()
-		if err != nil {
-			return fmt.Errorf("storeState failed with error: %v", err)
-		}
-	}
-
-	generalLog.InfoS("state checkpoint: restored state from checkpoint")
-
-	return nil
-}
-
-func (s *stateCheckpoint) Delete(podUID, containerName string, persist bool) {
-	s.Lock()
-	defer s.Unlock()
-
-	s.cache.Delete(podUID, containerName, persist)
-	if persist {
-		err := s.storeState()
-		if err != nil {
-			generalLog.ErrorS(err, "store state after delete operation to checkpoint error")
-		}
-	}
-}
-
-func (s *stateCheckpoint) ClearState() {
-	s.Lock()
-	defer s.Unlock()
-
-	s.cache.ClearState()
-	err := s.storeState()
-	if err != nil {
-		generalLog.ErrorS(err, "store state after clear operation to checkpoint error")
-	}
-}
-
-func (s *stateCheckpoint) StoreState() error {
-	s.Lock()
-	defer s.Unlock()
-	return s.storeState()
-}
-
-func (s *stateCheckpoint) GetMachineState() RDMAMap {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.cache.GetMachineState()
-}
-
-func (s *stateCheckpoint) GetPodEntries() PodEntries {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.cache.GetPodEntries()
-}
-
-func (s *stateCheckpoint) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.cache.GetAllocationInfo(podUID, containerName)
-}
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go
deleted file mode 100644
index 9f226db201..0000000000
--- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/state_mem.go
+++ /dev/null
@@ -1,112 +0,0 @@
-package rdma
-
-import (
-	"fmt"
-	"sync"
-
-	"github.com/kubewharf/katalyst-core/pkg/util/machine"
-)
-
-type rdmaPluginState struct {
-	sync.RWMutex
-
-	topologyRegistry *machine.DeviceTopologyRegistry
-
-	machineState RDMAMap
-	podEntries   PodEntries
-}
-
-func NewRDMAPluginState(topologyRegistry *machine.DeviceTopologyRegistry) (State, error) {
-	generalLog.InfoS("initializing new rdma plugin in-memory state store")
-
-	defaultMachineState, err := GenerateMachineState(topologyRegistry)
-	if err != nil {
-		return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err)
-	}
-
-	return &rdmaPluginState{
-		machineState:     defaultMachineState,
-		topologyRegistry: topologyRegistry,
-		podEntries:       make(PodEntries),
-	}, nil
-}
-
-func (s *rdmaPluginState) SetMachineState(rdmaMap RDMAMap, _ bool) {
-	s.Lock()
-	defer s.Unlock()
-	s.machineState = rdmaMap.Clone()
-	generalLog.InfoS("updated rdma plugin machine state",
-		"rdmaMap", rdmaMap)
-}
-
-func (s *rdmaPluginState) SetPodEntries(podEntries PodEntries, _ bool) {
-	s.Lock()
-	defer s.Unlock()
-	s.podEntries = podEntries
-}
-
-func (s *rdmaPluginState) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, _ bool) {
-	s.Lock()
-	defer s.Unlock()
-
-	s.podEntries.SetAllocationInfo(podUID, containerName, allocationInfo)
-	generalLog.InfoS("updated rdma plugin pod entries",
-		"podUID", podUID,
-		"containerName", containerName,
-		"allocationInfo", allocationInfo.String())
-}
-
-func (s *rdmaPluginState) Delete(podUID, containerName string, _ bool) {
-	s.Lock()
-	defer s.Unlock()
-
-	if _, ok := s.podEntries[podUID]; !ok {
-		return
-	}
-
-	delete(s.podEntries[podUID], containerName)
-	if len(s.podEntries[podUID]) == 0 {
-		delete(s.podEntries, podUID)
-	}
-	generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName)
-}
-
-func (s *rdmaPluginState) ClearState() {
-	s.Lock()
-	defer s.Unlock()
-
-	machineState, err := GenerateMachineState(s.topologyRegistry)
-	if err != nil {
-		generalLog.ErrorS(err, "failed to generate machine state")
-	}
-	s.machineState = machineState
-	s.podEntries = make(PodEntries)
-
-	generalLog.InfoS("cleared state")
-}
-
-func (s *rdmaPluginState) StoreState() error {
-	// nothing to do
-	return nil
-}
-
-func (s *rdmaPluginState) GetMachineState() RDMAMap {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.machineState.Clone()
-}
-
-func (s *rdmaPluginState) GetPodEntries() PodEntries {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.podEntries.Clone()
-}
-
-func (s *rdmaPluginState) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
-	s.RLock()
-	defer s.RUnlock()
-
-	return s.podEntries.GetAllocationInfo(podUID, containerName)
-}
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go b/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go
deleted file mode 100644
index a9bbd97e33..0000000000
--- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/rdma/util.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package rdma
-
-import (
-	"fmt"
-
-	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
-	"github.com/kubewharf/katalyst-core/pkg/util/machine"
-)
-
-func GenerateMachineState(topologyRegistry *machine.DeviceTopologyRegistry) (RDMAMap, error) {
-	if topologyRegistry == nil {
-		return nil, fmt.Errorf("topology provider registry must not be nil")
-	}
-
-	rdmaTopology, _, err := topologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceName)
-	if err != nil {
-		return nil, fmt.Errorf("rdma topology provider failed with error: %v", err)
-	}
-
-	rdmaMap := make(RDMAMap)
-	for deviceID := range rdmaTopology.Devices {
-		rdmaMap[deviceID] = &RDMAState{}
-	}
-
-	return rdmaMap, nil
-}
-
-func GenerateMachineStateFromPodEntries(
-	podEntries PodEntries, topologyRegistry *machine.DeviceTopologyRegistry,
-) (RDMAMap, error) {
-	machineState, err := GenerateMachineState(topologyRegistry)
-	if err != nil {
-		return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err)
-	}
-
-	for rdmaID, rdmaState := range machineState {
-		for podUID, containerEntries := range podEntries {
-			for containerName, allocationInfo := range containerEntries {
-				if containerName != "" && allocationInfo != nil {
-					rdmaAllocation, ok := allocationInfo.TopologyAwareAllocations[rdmaID]
-					if !ok {
-						continue
-					}
-					alloc := allocationInfo.Clone()
-					alloc.AllocatedAllocation = rdmaAllocation.Clone()
-					alloc.TopologyAwareAllocations = map[string]RDMAAllocation{rdmaID: rdmaAllocation}
-					rdmaState.SetAllocationInfo(podUID, containerName, alloc)
-				}
-			}
-		}
-
-		machineState[rdmaID] = rdmaState
-	}
-
-	return machineState, nil
-}
diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go
new file mode 100644
index 0000000000..d36f82df8f
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/state.go
@@ -0,0 +1,319 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package state
+
+import (
+	"encoding/json"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+)
+
+type AllocationInfo struct {
+	commonstate.AllocationMeta `json:",inline"`
+
+	AllocatedAllocation      Allocation            `json:"allocated_allocation"`
+	TopologyAwareAllocations map[string]Allocation `json:"topology_aware_allocations"`
+}
+
+type Allocation struct {
+	// Quantity refers to the amount of device allocated
+	Quantity  float64 `json:"quantity"`
+	NUMANodes []int   `json:"numa_nodes"`
+}
+
+func (a *Allocation) Clone() Allocation {
+	if a == nil {
+		return Allocation{}
+	}
+
+	numaNodes := make([]int, len(a.NUMANodes))
+	copy(numaNodes, a.NUMANodes)
+	return Allocation{
+		Quantity:  a.Quantity,
+		NUMANodes: numaNodes,
+	}
+}
+
+type (
+	ContainerEntries   map[string]*AllocationInfo     // Keyed by container name
+	PodEntries         map[string]ContainerEntries    // Keyed by pod UID
+	PodResourceEntries map[v1.ResourceName]PodEntries // Keyed by resource name
+)
+
+type AllocationState struct {
+	PodEntries PodEntries `json:"pod_entries"`
+}
+
+type AllocationMap map[string]*AllocationState // AllocationMap keyed by device name i.e. GPU-fef8089b-4820-abfc-e83e-94318197576e
+
+type AllocationResourcesMap map[v1.ResourceName]AllocationMap // AllocationResourcesMap keyed by resource name i.e. v1.ResourceName("nvidia.com/gpu")
+
+func (i *AllocationInfo) String() string {
+	if i == nil {
+		return ""
+	}
+
+	contentBytes, err := json.Marshal(i)
+	if err != nil {
+		general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err)
+		return ""
+	}
+	return string(contentBytes)
+}
+
+func (i *AllocationInfo) Clone() *AllocationInfo {
+	if i == nil {
+		return nil
+	}
+
+	clone := &AllocationInfo{
+		AllocationMeta:      *i.AllocationMeta.Clone(),
+		AllocatedAllocation: i.AllocatedAllocation.Clone(),
+	}
+
+	if i.TopologyAwareAllocations != nil {
+		clone.TopologyAwareAllocations = make(map[string]Allocation)
+		for k, v := range i.TopologyAwareAllocations {
+			clone.TopologyAwareAllocations[k] = v.Clone()
+		}
+	}
+
+	return clone
+}
+
+func (e PodEntries) String() string {
+	if e == nil {
+		return ""
+	}
+
+	contentBytes, err := json.Marshal(e)
+	if err != nil {
+		general.LoggerWithPrefix("PodEntries.String", general.LoggingPKGFull).Errorf("marshal PodEntries failed with error: %v", err)
+		return ""
+	}
+	return string(contentBytes)
+}
+
+func (e PodEntries) Clone() PodEntries {
+	clone := make(PodEntries)
+	for podUID, containerEntries := range e {
+		clone[podUID] = make(ContainerEntries)
+		for containerName, allocationInfo := range containerEntries {
+			clone[podUID][containerName] = allocationInfo.Clone()
+		}
+	}
+	return clone
+}
+
+func (e PodEntries) GetAllocationInfo(uid string, name string) *AllocationInfo {
+	if e == nil {
+		return nil
+	}
+
+	if containerEntries, ok := e[uid]; ok {
+		if allocationInfo, ok := containerEntries[name]; ok {
+			return allocationInfo.Clone()
+		}
+	}
+	return nil
+}
+
+func (e PodEntries) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
+	if e == nil {
+		return
+	}
+
+	if _, ok := e[podUID]; !ok {
+		e[podUID] = make(ContainerEntries)
+	}
+
+	e[podUID][containerName] = allocationInfo.Clone()
+}
+
+func (pre PodResourceEntries) String() string {
+	if pre == nil {
+		return ""
+	}
+
+	contentBytes, err := json.Marshal(pre)
+	if err != nil {
+		general.LoggerWithPrefix("PodResourceEntries.String", general.LoggingPKGFull).Errorf("marshal PodResourceEntries failed with error: %v", err)
+		return ""
+	}
+	return string(contentBytes)
+}
+
+func (pre PodResourceEntries) Clone() PodResourceEntries {
+	if pre == nil {
+		return nil
+	}
+
+	clone := make(PodResourceEntries)
+	for resourceName, podEntries := range pre {
+		clone[resourceName] = podEntries.Clone()
+	}
+	return clone
+}
+
+func (pre PodResourceEntries) RemovePod(podUID string) {
+	if pre == nil {
+		return
+	}
+
+	for _, podEntries := range pre {
+		delete(podEntries, podUID)
+	}
+}
+
+func (as *AllocationState) String() string {
+	if as == nil {
+		return ""
+	}
+
+	contentBytes, err := json.Marshal(as)
+	if err != nil {
+		general.LoggerWithPrefix("AllocationState.String", general.LoggingPKGFull).Errorf("marshal AllocationState failed with error: %v", err)
+		return ""
+	}
+	return string(contentBytes)
+}
+
+func (as *AllocationState) Clone() *AllocationState {
+	if as == nil {
+		return nil
+	}
+
+	return &AllocationState{
+		PodEntries: as.PodEntries.Clone(),
+	}
+}
+
+func (as *AllocationState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) {
+	if as == nil {
+		return
+	}
+
+	if as.PodEntries == nil {
+		as.PodEntries = make(PodEntries)
+	}
+
+	if _, ok := as.PodEntries[podUID]; !ok {
+		as.PodEntries[podUID] = make(ContainerEntries)
+	}
+
+	as.PodEntries[podUID][containerName] = allocationInfo.Clone()
+}
+
+func (am AllocationMap) Clone() AllocationMap {
+	if am == nil {
+		return nil
+	}
+
+	clone := make(AllocationMap)
+	for id, ns := range am {
+		clone[id] = ns.Clone()
+	}
+	return clone
+}
+
+func (arm AllocationResourcesMap) String() string {
+	if arm == nil {
+		return ""
+	}
+
+	contentBytes, err := json.Marshal(arm)
+	if err != nil {
+		klog.Errorf("[AllocationResourcesMap.String] marshal AllocationResourcesMap failed with error: %v", err)
+		return ""
+	}
+	return string(contentBytes)
+}
+
+func (arm AllocationResourcesMap) Clone() AllocationResourcesMap {
+	clone := make(AllocationResourcesMap)
+	for resourceName, am := range arm {
+		clone[resourceName] = am.Clone()
+	}
+	return clone
+}
+
+func (as *AllocationState) GetQuantityAllocated() float64 {
+	if as == nil {
+		return 0
+	}
+
+	quantityAllocated := float64(0)
+	for _, podEntries := range as.PodEntries {
+		for _, allocationInfo := range podEntries {
+			quantityAllocated += allocationInfo.AllocatedAllocation.Quantity
+		}
+	}
+	return quantityAllocated
+}
+
+func (am AllocationMap) GetQuantityAllocated(id string) float64 {
+	if am == nil {
+		return 0
+	}
+
+	return am[id].GetQuantityAllocated()
+}
+
+func (am AllocationMap) IsGPURequestSatisfied(id string, request float64, allocatable float64) bool {
+	if am == nil {
+		return false
+	}
+
+	allocated := am.GetQuantityAllocated(id)
+	return allocatable-allocated >= request
+}
+
+// reader is used to get information from local states
+type reader interface {
+	GetMachineState() AllocationResourcesMap
+	GetPodResourceEntries() PodResourceEntries
+	GetPodEntries(resourceName v1.ResourceName) PodEntries
+	GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo
+}
+
+// writer is used to store information into local states,
+// and it also provides functionality to maintain the local files
+type writer interface {
+	SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool)
+	SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool)
+	SetAllocationInfo(
+		resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool,
+	)
+
+	Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool)
+	ClearState()
+	StoreState() error
+}
+
+// ReadonlyState interface only provides methods for tracking pod assignments
+type ReadonlyState interface {
+	reader
+}
+
+// State interface provides methods for tracking and setting pod assignments
+type State interface {
+	writer
+	ReadonlyState
+}
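The accounting model behind these state types is additive: a device's AllocationState sums the Quantity of every container entry it holds, and IsGPURequestSatisfied admits a request only if the remaining headroom covers it. A small sketch of that check, using a hypothetical free-standing helper rather than the real methods:

```go
package main

import "fmt"

// satisfied mirrors AllocationMap.IsGPURequestSatisfied above: a request
// fits on a device iff allocatable minus already-allocated still covers it.
// (Illustrative helper only; the real method derives `allocated` by summing
// AllocatedAllocation.Quantity over the device's pod entries.)
func satisfied(allocated, request, allocatable float64) bool {
	return allocatable-allocated >= request
}

func main() {
	// A device with 100 units of capacity and 70 already allocated can
	// still satisfy a request for 30, but not for 31.
	fmt.Println(satisfied(70, 30, 100)) // true
	fmt.Println(satisfied(70, 31, 100)) // false
}
```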
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go
similarity index 78%
rename from pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go
rename to pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go
index e36b96ddea..b984ae35f6 100644
--- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_checkpoint.go
+++ b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package gpumemory
+package state
 
 import (
 	"errors"
@@ -24,6 +24,7 @@ import (
 	"sync"
 	"time"
 
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 	cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
 
@@ -57,11 +58,11 @@ type stateCheckpoint struct {
 	emitter metrics.MetricEmitter
 }
 
-func (s *stateCheckpoint) SetMachineState(gpuMap GPUMap, persist bool) {
+func (s *stateCheckpoint) SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.cache.SetMachineState(gpuMap, persist)
+	s.cache.SetMachineState(allocationResourcesMap, persist)
 	if persist {
 		err := s.storeState()
 		if err != nil {
@@ -70,11 +71,11 @@ func (s *stateCheckpoint) SetMachineState(gpuMap GPUMap, persist bool) {
 	}
 }
 
-func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) {
+func (s *stateCheckpoint) SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.cache.SetPodEntries(podEntries, persist)
+	s.cache.SetPodResourceEntries(podResourceEntries, persist)
 	if persist {
 		err := s.storeState()
 		if err != nil {
@@ -84,12 +85,12 @@ func (s *stateCheckpoint) SetPodEntries(podEntries PodEntries, persist bool) {
 }
 
 func (s *stateCheckpoint) SetAllocationInfo(
-	podUID, containerName string, allocationInfo *AllocationInfo, persist bool,
+	resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool,
 ) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.cache.SetAllocationInfo(podUID, containerName, allocationInfo, persist)
+	s.cache.SetAllocationInfo(resourceName, podUID, containerName, allocationInfo, persist)
 	if persist {
 		err := s.storeState()
 		if err != nil {
@@ -98,11 +99,11 @@ func (s *stateCheckpoint) SetAllocationInfo(
 	}
 }
 
-func (s *stateCheckpoint) Delete(podUID, containerName string, persist bool) {
+func (s *stateCheckpoint) Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.cache.Delete(podUID, containerName, persist)
+	s.cache.Delete(resourceName, podUID, containerName, persist)
 	if persist {
 		err := s.storeState()
 		if err != nil {
@@ -128,25 +129,34 @@ func (s *stateCheckpoint) StoreState() error {
 	return s.storeState()
 }
 
-func (s *stateCheckpoint) GetMachineState() GPUMap {
+func (s *stateCheckpoint) GetMachineState() AllocationResourcesMap {
 	s.RLock()
 	defer s.RUnlock()
 
 	return s.cache.GetMachineState()
 }
 
-func (s *stateCheckpoint) GetPodEntries() PodEntries {
+func (s *stateCheckpoint) GetPodResourceEntries() PodResourceEntries {
 	s.RLock()
 	defer s.RUnlock()
 
-	return s.cache.GetPodEntries()
+	return s.cache.GetPodResourceEntries()
 }
 
-func (s *stateCheckpoint) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
+func (s *stateCheckpoint) GetPodEntries(resourceName v1.ResourceName) PodEntries {
 	s.RLock()
 	defer s.RUnlock()
 
-	return s.cache.GetAllocationInfo(podUID, containerName)
+	return s.cache.GetPodEntries(resourceName)
+}
+
+func (s *stateCheckpoint) GetAllocationInfo(
+	resourceName v1.ResourceName, podUID, containerName string,
+) *AllocationInfo {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.cache.GetAllocationInfo(resourceName, podUID, containerName)
 }
 
 func (s *stateCheckpoint) storeState() error {
@@ -160,7 +170,7 @@ func (s *stateCheckpoint) storeState() error {
 	checkpoint := NewGPUPluginCheckpoint()
 	checkpoint.PolicyName = s.policyName
 	checkpoint.MachineState = s.cache.GetMachineState()
-	checkpoint.PodEntries = s.cache.GetPodEntries()
+	checkpoint.PodResourceEntries = s.cache.GetPodResourceEntries()
 
 	err := s.checkpointManager.CreateCheckpoint(s.checkpointName, checkpoint)
 	if err != nil {
@@ -170,9 +180,7 @@ func (s *stateCheckpoint) storeState() error {
 	return nil
 }
 
-func (s *stateCheckpoint) restoreState(
-	conf *qrm.QRMPluginsConfiguration, topologyRegistry *machine.DeviceTopologyRegistry,
-) error {
+func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyRegistry) error {
 	s.Lock()
 	defer s.Unlock()
 	var err error
@@ -198,13 +206,13 @@ func (s *stateCheckpoint) restoreState(
 		return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName)
 	}
 
-	machineState, err := GenerateMachineStateFromPodEntries(conf, checkpoint.PodEntries, topologyRegistry)
+	machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodResourceEntries, topologyRegistry)
 	if err != nil {
 		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
 	}
 
 	s.cache.SetMachineState(machineState, false)
-	s.cache.SetPodEntries(checkpoint.PodEntries, false)
+	s.cache.SetPodResourceEntries(checkpoint.PodResourceEntries, false)
 
 	if !reflect.DeepEqual(machineState, checkpoint.MachineState) {
 		generalLog.Warningf("machine state changed: "+
@@ -254,7 +262,7 @@ func NewCheckpointState(
 		emitter: emitter,
 	}
 
-	if err := sc.restoreState(conf, topologyRegistry); err != nil {
+	if err := sc.restoreState(topologyRegistry); err != nil {
 		return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+
 			"the gpu plugin checkpoint file %q before restarting Kubelet",
 			err, path.Join(stateDir, checkpointName))
diff --git a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/state_mem.go
similarity index 50%
rename from pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go
rename to pkg/agent/qrm-plugins/gpu/state/state_mem.go
index 4ec37d9bcd..8d35ec1e08 100644
--- a/pkg/agent/qrm-plugins/gpu/state/resourceplugin/gpumemory/state_mem.go
+++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package gpumemory
+package state
 
 import (
 	"fmt"
@@ -22,6 +22,7 @@ import (
 
 	"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
 	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+	v1 "k8s.io/api/core/v1"
 )
 
 // gpuPluginState is an in-memory implementation of State;
@@ -33,8 +34,8 @@ type gpuPluginState struct {
 	qrmConf          *qrm.QRMPluginsConfiguration
 	topologyRegistry *machine.DeviceTopologyRegistry
 
-	machineState GPUMap
-	podEntries   PodEntries
+	machineState       AllocationResourcesMap
+	podResourceEntries PodResourceEntries
 }
 
 func NewGPUPluginState(
@@ -43,56 +44,71 @@ func NewGPUPluginState(
 ) (State, error) {
 	generalLog.InfoS("initializing new gpu plugin in-memory state store")
 
-	defaultMachineState, err := GenerateMachineState(conf, topologyRegistry)
+	defaultMachineState, err := GenerateMachineState(topologyRegistry)
 	if err != nil {
 		return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err)
 	}
 
 	return &gpuPluginState{
-		qrmConf:          conf,
-		machineState:     defaultMachineState,
-		topologyRegistry: topologyRegistry,
-		podEntries:       make(PodEntries),
+		qrmConf:            conf,
+		machineState:       defaultMachineState,
+		topologyRegistry:   topologyRegistry,
+		podResourceEntries: make(PodResourceEntries),
 	}, nil
 }
 
-func (s *gpuPluginState) SetMachineState(gpuMap GPUMap, _ bool) {
+func (s *gpuPluginState) SetMachineState(allocationResourcesMap AllocationResourcesMap, _ bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.machineState = gpuMap.Clone()
+	s.machineState = allocationResourcesMap.Clone()
 	generalLog.InfoS("updated gpu plugin machine state",
-		"GPUMap", gpuMap.String())
+		"GPUMap", allocationResourcesMap.String())
 }
 
-func (s *gpuPluginState) SetPodEntries(podEntries PodEntries, _ bool) {
+func (s *gpuPluginState) SetPodResourceEntries(podResourceEntries PodResourceEntries, _ bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.podEntries = podEntries.Clone()
+	s.podResourceEntries = podResourceEntries.Clone()
 }
 
-func (s *gpuPluginState) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo, _ bool) {
+func (s *gpuPluginState) SetAllocationInfo(
+	resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, _ bool,
+) {
 	s.Lock()
 	defer s.Unlock()
 
-	s.podEntries.SetAllocationInfo(podUID, containerName, allocationInfo)
+	if _, ok := s.podResourceEntries[resourceName]; !ok {
+		s.podResourceEntries[resourceName] = make(PodEntries)
+	}
+
+	if _, ok := s.podResourceEntries[resourceName][podUID]; !ok {
+		s.podResourceEntries[resourceName][podUID] = make(ContainerEntries)
+	}
+
+	s.podResourceEntries[resourceName][podUID][containerName] = allocationInfo.Clone()
 	generalLog.InfoS("updated gpu plugin pod resource entries",
 		"podUID", podUID,
 		"containerName", containerName,
 		"allocationInfo", allocationInfo.String())
 }
 
-func (s *gpuPluginState) Delete(podUID, containerName string, _ bool) {
+func (s *gpuPluginState) Delete(resourceName v1.ResourceName, podUID, containerName string, _ bool) {
 	s.Lock()
 	defer s.Unlock()
 
-	if _, ok := s.podEntries[podUID]; !ok {
+	if _, ok := s.podResourceEntries[resourceName]; !ok {
 		return
 	}
 
-	delete(s.podEntries[podUID], containerName)
-	if len(s.podEntries[podUID]) == 0 {
-		delete(s.podEntries, podUID)
+	if _, ok := s.podResourceEntries[resourceName][podUID]; !ok {
+		return
+	}
+
+	delete(s.podResourceEntries[resourceName][podUID], containerName)
+	if len(s.podResourceEntries[resourceName][podUID]) == 0 {
+		delete(s.podResourceEntries[resourceName], podUID)
 	}
+
 	generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName)
 }
 
@@ -100,12 +116,12 @@ func (s *gpuPluginState) ClearState() {
 	s.Lock()
 	defer s.Unlock()
 
-	machineState, err := GenerateMachineState(s.qrmConf, s.topologyRegistry)
+	machineState, err := GenerateMachineState(s.topologyRegistry)
 	if err != nil {
 		generalLog.ErrorS(err, "failed to generate machine state")
 	}
 	s.machineState = machineState
-	s.podEntries = make(PodEntries)
+	s.podResourceEntries = make(PodResourceEntries)
 
 	generalLog.InfoS("cleared state")
 }
@@ -115,23 +131,34 @@ func (s *gpuPluginState) StoreState() error {
 	return nil
 }
 
-func (s *gpuPluginState) GetMachineState() GPUMap {
+func (s *gpuPluginState) GetMachineState() AllocationResourcesMap {
 	s.RLock()
 	defer s.RUnlock()
 
 	return s.machineState.Clone()
 }
 
-func (s *gpuPluginState) GetPodEntries() PodEntries {
+func (s *gpuPluginState) GetPodResourceEntries() PodResourceEntries {
 	s.RLock()
 	defer s.RUnlock()
 
-	return s.podEntries.Clone()
+	return s.podResourceEntries.Clone()
}
 
-func (s *gpuPluginState) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
+func (s *gpuPluginState) GetPodEntries(resourceName v1.ResourceName) PodEntries {
 	s.RLock()
 	defer s.RUnlock()
 
-	return s.podEntries.GetAllocationInfo(podUID, containerName)
+	return s.podResourceEntries[resourceName].Clone()
+}
+
+func (s *gpuPluginState) GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo {
+	s.RLock()
+	defer s.RUnlock()
+
+	if res, ok := s.podResourceEntries[resourceName][podUID][containerName]; ok {
+		return res.Clone()
+	}
+
+	return nil
 }
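Every getter on the in-memory store above returns a Clone() rather than the internal map, so nothing a caller does after the lock is released can corrupt plugin state. A minimal illustration of why that copy matters, using a plain map instead of the real state types (names are illustrative):

```go
package main

import "fmt"

// store stands in for gpuPluginState; entries stands in for machineState.
type store struct{ entries map[string]int }

// get returns a copy, mirroring the Clone()-on-read convention above.
func (s *store) get() map[string]int {
	clone := make(map[string]int, len(s.entries))
	for k, v := range s.entries {
		clone[k] = v
	}
	return clone
}

func main() {
	s := &store{entries: map[string]int{"GPU-0": 1}}
	view := s.get()
	view["GPU-0"] = 99     // mutates the clone only
	fmt.Println(s.entries) // map[GPU-0:1] — internal state is untouched
}
```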
diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go
new file mode 100644
index 0000000000..330239997d
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/state/util.go
@@ -0,0 +1,161 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package state
+
+import (
+	"fmt"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+	v1 "k8s.io/api/core/v1"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+)
+
+// GenerateMachineState returns an empty AllocationResourcesMap for all resource names.
+func GenerateMachineState(topologyRegistry *machine.DeviceTopologyRegistry) (AllocationResourcesMap, error) {
+	if topologyRegistry == nil {
+		return nil, fmt.Errorf("topology provider registry must not be nil")
+	}
+
+	deviceNameToProvider := topologyRegistry.DeviceNameToProvider
+	allocationResourcesMap := make(AllocationResourcesMap)
+
+	for resourceName := range deviceNameToProvider {
+		allocationMap, err := GenerateResourceState(topologyRegistry, v1.ResourceName(resourceName))
+		if err != nil {
+			return nil, fmt.Errorf("GenerateResourceState for resource %s failed with error: %v", resourceName, err)
+		}
+		allocationResourcesMap[v1.ResourceName(resourceName)] = allocationMap
+	}
+
+	return allocationResourcesMap, nil
+}
+
+// GenerateMachineStateFromPodEntries returns AllocationResourcesMap for allocated resources based on
+// resource name along with existing pod resource entries.
+func GenerateMachineStateFromPodEntries(
+	podResourceEntries PodResourceEntries,
+	topologyRegistry *machine.DeviceTopologyRegistry,
+) (AllocationResourcesMap, error) {
+	machineState, err := GenerateMachineState(topologyRegistry)
+	if err != nil {
+		return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err)
+	}
+
+	for resourceName, podEntries := range podResourceEntries {
+		allocationMap, err := GenerateResourceStateFromPodEntries(podEntries, topologyRegistry, resourceName)
+		if err != nil {
+			return nil, fmt.Errorf("GenerateResourceStateFromPodEntries for resource %s failed with error: %v", resourceName, err)
+		}
+		machineState[resourceName] = allocationMap
+	}
+
+	return machineState, nil
+}
+
+// GenerateResourceStateFromPodEntries returns an AllocationMap of a certain resource based on pod entries
+func GenerateResourceStateFromPodEntries(
+	podEntries PodEntries, topologyRegistry *machine.DeviceTopologyRegistry, resourceName v1.ResourceName,
+) (AllocationMap, error) {
+	machineState, err := GenerateResourceState(topologyRegistry, resourceName)
+	if err != nil {
+		return nil, fmt.Errorf("GenerateResourceState failed with error: %v", err)
+	}
+
+	for deviceID, allocationState := range machineState {
+		for podUID, containerEntries := range podEntries {
+			for containerName, allocationInfo := range containerEntries {
+				if containerName != "" && allocationInfo != nil {
+					allocation, ok := allocationInfo.TopologyAwareAllocations[deviceID]
+					if !ok {
+						continue
+					}
+					alloc := allocationInfo.Clone()
+					alloc.AllocatedAllocation = allocation.Clone()
+					alloc.TopologyAwareAllocations = map[string]Allocation{deviceID: allocation}
+					allocationState.SetAllocationInfo(podUID, containerName, alloc)
+				}
+			}
+		}
+		machineState[deviceID] = allocationState
+	}
+
+	return machineState, nil
+}
+
+// GenerateResourceState returns an empty AllocationMap for a certain resource.
+func GenerateResourceState(
+	topologyProviderRegistry *machine.DeviceTopologyRegistry, resourceName v1.ResourceName,
+) (AllocationMap, error) {
+	if topologyProviderRegistry == nil {
+		return nil, fmt.Errorf("topology provider registry must not be nil")
+	}
+
+	topology, _, err := topologyProviderRegistry.GetDeviceTopology(string(resourceName))
+	if err != nil {
+		return nil, fmt.Errorf("topology provider registry failed with error: %v", err)
+	}
+
+	resourceState := make(AllocationMap)
+	for deviceID := range topology.Devices {
+		resourceState[deviceID] = &AllocationState{
+			PodEntries: make(PodEntries),
+		}
+	}
+	return resourceState, nil
+}
+
+// RegenerateGPUMemoryHints regenerates hints for containers that have already been allocated GPU memory;
+// it assembles hints from the already-existing AllocationInfo without any calculation logic.
+func RegenerateGPUMemoryHints(
+	allocationInfo *AllocationInfo, regenerate bool,
+) map[string]*pluginapi.ListOfTopologyHints {
+	if allocationInfo == nil {
+		general.Errorf("RegenerateHints got nil allocationInfo")
+		return nil
+	}
+
+	hints := map[string]*pluginapi.ListOfTopologyHints{}
+	if regenerate {
+		general.ErrorS(nil, "need to regenerate hints",
+			"podNamespace", allocationInfo.PodNamespace,
+			"podName", allocationInfo.PodName,
+			"podUID", allocationInfo.PodUid,
+			"containerName", allocationInfo.ContainerName)
+
+		return nil
+	}
+
+	allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...)
+
+	general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod",
+		"podNamespace", allocationInfo.PodNamespace,
+		"podName", allocationInfo.PodName,
+		"containerName", allocationInfo.ContainerName,
+		"hint", allocatedNumaNodes)
+	hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{
+		Hints: []*pluginapi.TopologyHint{
+			{
+				Nodes:     allocatedNumaNodes.ToSliceUInt64(),
+				Preferred: true,
+			},
+		},
+	}
+	return hints
+}
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
index 48ab1b0cbe..b917f0e2f0 100644
--- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
@@ -24,6 +24,8 @@ import (
 	devicepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry"
 	resourcepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/registry"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/wait"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
@@ -92,10 +94,10 @@ func NewStaticPolicy(
 		customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin),
 	}
 
-	if err = policyImplement.registerResourcePlugins(wrappedEmitter); err != nil {
+	if err = policyImplement.registerResourcePlugins(); err != nil {
 		return false, agent.ComponentStub{}, fmt.Errorf("failed to register resource plugins: %w", err)
 	}
 
-	if err = policyImplement.registerCustomDevicePlugins(wrappedEmitter); err != nil {
+	if err = policyImplement.registerCustomDevicePlugins(); err != nil {
 		return false, agent.ComponentStub{}, fmt.Errorf("failed to register custom device plugins: %w", err)
 	}
 
@@ -415,12 +417,85 @@ func (p *StaticPolicy) clearResidualState(
 	_ *metaserver.MetaServer,
 ) {
 	general.Infof("exec")
-	for _, resourcePlugin := range p.resourcePlugins {
-		resourcePlugin.ClearResidualState()
+	var (
+		err     error
+		podList []*v1.Pod
+	)
+	residualSet := make(map[string]bool)
+
+	defer func() {
+		_ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err)
+	}()
+
+	if p.MetaServer == nil {
+		general.Errorf("nil metaServer")
+		return
 	}
 
-	for _, customDevicePlugin := range p.customDevicePlugins {
-		customDevicePlugin.ClearResidualState()
+	ctx := context.Background()
+	podList, err = p.MetaServer.GetPodList(ctx, nil)
+	if err != nil {
+		general.Errorf("get pod list failed: %v", err)
+		return
+	}
+
+	podSet := sets.NewString()
+	for _, pod := range podList {
+		podSet.Insert(fmt.Sprintf("%v", pod.UID))
+	}
+
+	p.Lock()
+	defer p.Unlock()
+
+	podResourceEntries := p.State.GetPodResourceEntries()
+	for _, podEntries := range podResourceEntries {
+		for podUID := range podEntries {
+			if !podSet.Has(podUID) {
+				residualSet[podUID] = true
+				p.residualHitMap[podUID] += 1
+				general.Infof("found pod: %s in state but it doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID])
+			}
+		}
+	}
+
+	podsToDelete := sets.NewString()
+	for podUID, hitCount := range p.residualHitMap {
+		if !residualSet[podUID] {
+			general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID)
+			delete(p.residualHitMap, podUID)
+			continue
+		}
+
+		if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime {
+			podsToDelete.Insert(podUID)
+		}
+	}
+
+	if podsToDelete.Len() > 0 {
+		for {
+			podUID, found := podsToDelete.PopAny()
+			if !found {
+				break
+			}
+
+			general.Infof("clear residual pod: %s in state", podUID)
+			podResourceEntries.RemovePod(podUID)
+		}
+
+		machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry)
+		if err != nil {
+			general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
+			return
+		}
+
+		p.State.SetPodResourceEntries(podResourceEntries, false)
+		p.State.SetMachineState(machineState, false)
+
+		err = p.State.StoreState()
+		if err != nil {
+			general.Errorf("store state failed: %v", err)
+			return
+		}
 	}
 }
@@ -472,31 +547,23 @@ func (p *StaticPolicy) AllocateAssociatedDevice(
 	return customDevicePlugin.AllocateAssociatedDevice(req)
 }
 
-func (p *StaticPolicy) registerResourcePlugins(wrappedEmitter metrics.MetricEmitter) error {
+func (p *StaticPolicy) registerResourcePlugins() error {
 	p.Lock()
 	defer p.Unlock()
 
 	for name := range p.resourcePluginsNames {
 		initFunc := resourcepluginregistry.ResourcePluginsMap[name]
-		resourcePlugin, err := initFunc(p.BasePlugin, wrappedEmitter)
-		if err != nil {
-			general.Errorf("register resource plugin %s failed with error: %w", name, err)
-			return fmt.Errorf("register resource plugin %s failed with error: %w", name, err)
-		}
+		resourcePlugin := initFunc(p.BasePlugin)
 		p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin
 	}
 
 	return nil
 }
 
-func (p *StaticPolicy) registerCustomDevicePlugins(wrappedEmitter metrics.MetricEmitter) error {
+func (p *StaticPolicy) registerCustomDevicePlugins() error {
 	p.Lock()
 	defer p.Unlock()
 
 	for name := range p.associatedDeviceNames {
 		initFunc := devicepluginregistry.CustomDevicePluginsMap[name]
-		customDevicePlugin, err := initFunc(p.BasePlugin, wrappedEmitter)
-		if err != nil {
-			general.Errorf("register custom device plugin %s failed with error: %w", name, err)
-			return fmt.Errorf("register custom device plugin %s failed with error: %w", name, err)
-		}
+		customDevicePlugin := initFunc(p.BasePlugin)
 		p.customDevicePlugins[customDevicePlugin.DeviceName()] = customDevicePlugin
 	}
 
 	return nil
diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go
index 948a6df919..f674930dd2 100644
--- a/pkg/util/machine/device.go
+++ b/pkg/util/machine/device.go
@@ -9,8 +9,8 @@ import (
 )
 
 type DeviceTopologyProvider interface {
-	getDeviceTopology() (*DeviceTopology, bool, error)
-	setDeviceTopology(*DeviceTopology) error
+	GetDeviceTopology() (*DeviceTopology, bool, error)
+	SetDeviceTopology(*DeviceTopology) error
 }
 
 type DeviceTopologyRegistry struct {
@@ -38,7 +38,7 @@ func (r *DeviceTopologyRegistry) SetDeviceTopology(deviceName string, deviceTopo
 		return fmt.Errorf("no device topology provider found for device %s", deviceName)
 	}
 
-	return provider.setDeviceTopology(deviceTopology)
+	return provider.SetDeviceTopology(deviceTopology)
 }
 
 // GetDeviceTopology gets the device topology for the specified device name.
@@ -47,7 +47,7 @@ func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTo
 	if !ok {
 		return nil, false, fmt.Errorf("no device topology provider found for device %s", deviceName)
 	}
-	return provider.getDeviceTopology()
+	return provider.GetDeviceTopology()
 }
 
 type DeviceTopology struct {
@@ -89,7 +89,7 @@ func NewDeviceTopologyProvider(resourceNames []string) DeviceTopologyProvider {
 	}
 }
 
-func (p *deviceTopologyProviderImpl) setDeviceTopology(deviceTopology *DeviceTopology) error {
+func (p *deviceTopologyProviderImpl) SetDeviceTopology(deviceTopology *DeviceTopology) error {
 	p.mutex.Lock()
 	defer p.mutex.Unlock()
 	if deviceTopology == nil {
@@ -101,7 +101,7 @@ func (p *deviceTopologyProviderImpl) setDeviceTop
 	return nil
 }
 
-func (p *deviceTopologyProviderImpl) getDeviceTopology() (*DeviceTopology, bool, error) {
+func (p *deviceTopologyProviderImpl) GetDeviceTopology() (*DeviceTopology, bool, error) {
 	p.mutex.RLock()
 	defer p.mutex.RUnlock()
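The clearResidualState handler introduced in the patch above ages stale pods out instead of deleting them on first sight: a pod UID that is present in plugin state but absent from the live pod list accumulates hits, and only when its age (hit count × check period) reaches the residual limit is it purged. A compact, runnable sketch of that aging loop; the period and limit constants here are illustrative stand-ins, not the real gpuconsts values:

```go
package main

import (
	"fmt"
	"time"
)

const (
	stateCheckPeriod = 30 * time.Second // illustrative stand-in for gpuconsts.StateCheckPeriod
	maxResidualTime  = 5 * time.Minute  // illustrative stand-in for gpuconsts.MaxResidualTime
)

// agedOut bumps the hit count of every pod that is in state but not alive,
// forgets pods that reappeared, and reports the UIDs old enough to purge.
func agedOut(hitMap map[string]int, statePods, livePods map[string]bool) []string {
	var evict []string
	for uid := range statePods {
		if livePods[uid] {
			delete(hitMap, uid) // pod showed up again; reset its history
			continue
		}
		hitMap[uid]++
		if time.Duration(hitMap[uid])*stateCheckPeriod >= maxResidualTime {
			evict = append(evict, uid)
		}
	}
	return evict
}

func main() {
	hits := map[string]int{"pod-a": 9} // pod-a already missed 9 checks
	state := map[string]bool{"pod-a": true, "pod-b": true}
	live := map[string]bool{"pod-b": true}
	fmt.Println(agedOut(hits, state, live)) // [pod-a]: 10 × 30s reaches the 5m limit
}
```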
From 318f98b4b15c5bd150f91876b8cf6171df0cde51 Mon Sep 17 00:00:00 2001
From: "justin.cheng"
Date: Wed, 15 Oct 2025 14:25:24 +0800
Subject: [PATCH 15/52] feat: implement rdma custom device plugin and
 implement logic for accompanying resource allocation

---
 .../app/options/qrm/gpu_plugin.go             |  11 +-
 go.mod                                        |   2 +-
 go.sum                                        |   4 +-
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  |  81 +++----
 pkg/agent/qrm-plugins/gpu/consts/consts.go    |   4 +-
 .../gpu/customdeviceplugin/{ => gpu}/gpu.go   | 114 ++++------
 .../customdeviceplugin/{ => gpu}/gpu_test.go  |   2 +-
 .../gpu/customdeviceplugin/interface.go       |  11 +-
 .../gpu/customdeviceplugin/rdma/rdma.go       | 199 ++++++++++++++++++
 .../customdeviceplugin/registry/registry.go   |   9 +-
 .../gpu/resourceplugin/gpumemory/gpu_mem.go   | 151 +++----------
 .../gpu/resourceplugin/interface.go           |   4 -
 pkg/agent/qrm-plugins/gpu/state/state.go      |   5 +-
 .../qrm-plugins/gpu/state/state_checkpoint.go |   2 +-
 pkg/agent/qrm-plugins/gpu/state/state_mem.go  |   3 +-
 pkg/agent/qrm-plugins/gpu/state/util.go       |   5 +-
 .../qrm-plugins/gpu/staticpolicy/policy.go    |  93 ++++++--
 pkg/config/agent/qrm/gpu_plugin.go            |   6 +-
 pkg/util/machine/device.go                    |  73 ++++++-
 pkg/util/machine/gpu.go                       | 151 -------------
 pkg/util/machine/rdma.go                      |  36 ----
 21 files changed, 492 insertions(+), 474 deletions(-)
 rename pkg/agent/qrm-plugins/gpu/customdeviceplugin/{ => gpu}/gpu.go (59%)
 rename pkg/agent/qrm-plugins/gpu/customdeviceplugin/{ => gpu}/gpu_test.go (99%)
 create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go
 delete mode 100644 pkg/util/machine/gpu.go
 delete mode 100644 pkg/util/machine/rdma.go

diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
index 9d677f0914..90ed07f688 100644
--- a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
+++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go
@@ -25,17 +25,19 @@ import (
 
 type GPUOptions struct {
 	PolicyName                 string
-	GPUResourceNames           []string
+	GPUDeviceNames             []string
 	GPUMemoryAllocatablePerGPU string
 	SkipGPUStateCorruption     bool
+	RDMADeviceNames            []string
 
 	ResourcePluginsNames []string
 }
 
 func NewGPUOptions() *GPUOptions {
 	return &GPUOptions{
 		PolicyName:                 "static",
-		GPUResourceNames:           []string{"nvidia.com/gpu"},
+		GPUDeviceNames:             []string{"nvidia.com/gpu"},
 		GPUMemoryAllocatablePerGPU: "100",
+		RDMADeviceNames:            []string{"bytedance.com/rdma"},
 	}
 }
 
@@ -44,7 +46,7 @@ func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) {
 
 	fs.StringVar(&o.PolicyName, "gpu-resource-plugin-policy",
 		o.PolicyName, "The policy gpu resource plugin should use")
-	fs.StringSliceVar(&o.GPUResourceNames, "gpu-resource-names", o.GPUResourceNames, "The name of the GPU resource")
+	fs.StringSliceVar(&o.GPUDeviceNames, "gpu-resource-names", o.GPUDeviceNames, "The name of the GPU resource")
 	fs.StringVar(&o.GPUMemoryAllocatablePerGPU, "gpu-memory-allocatable-per-gpu",
 		o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, e.g. 100")
 	fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption",
@@ -54,13 +56,14 @@ func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) {
 
 func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error {
 	conf.PolicyName = o.PolicyName
-	conf.GPUResourceNames = o.GPUResourceNames
+	conf.GPUDeviceNames = o.GPUDeviceNames
 	gpuMemory, err := resource.ParseQuantity(o.GPUMemoryAllocatablePerGPU)
 	if err != nil {
 		return err
 	}
 	conf.GPUMemoryAllocatablePerGPU = gpuMemory
 	conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption
+	conf.RDMADeviceNames = o.RDMADeviceNames
 	conf.ResourcePluginsNames = o.ResourcePluginsNames
 	return nil
 }
diff --git a/go.mod b/go.mod
index f5da2b106f..e85139eb2b 100644
--- a/go.mod
+++ b/go.mod
@@ -197,7 +197,7 @@ replace (
 	k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6
 	k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6
 	k8s.io/kubectl => k8s.io/kubectl v0.24.6
-	k8s.io/kubelet => github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15
+	k8s.io/kubelet => github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2
 	k8s.io/kubernetes => k8s.io/kubernetes v1.24.6
 	k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6
 	k8s.io/metrics => k8s.io/metrics v0.24.6
diff --git a/go.sum b/go.sum
index 0e6016b3c1..479db12f40 100644
--- a/go.sum
+++ b/go.sum
@@ -583,8 +583,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U
 github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc=
 github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4=
 github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA=
-github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4 h1:x3xNQhK02NrLrxz+g5Rvs3Su1wl+bYtP+JwPmPjRUvE=
-github.com/luomingmeng/katalyst-api v0.0.0-20250828035512-fee878e49ce4/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64=
+github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1 h1:GZ9+zAV1u5pP5xTqcshvNpr0qRIkl/Gu6/YFd2RFT9A=
+github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64=
 github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15 h1:QFm2/NiJscpQyVm8ejkuGiWVd7dUVL+WooTN5OxuG6A=
 github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
 github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ=
diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index 484e198adf..5cc177226e 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -17,20 +17,17 @@ limitations under the License.
 package baseplugin
 
 import (
-	"context"
 	"fmt"
 	"sort"
 	"sync"
 
-	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
-	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
-	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
+	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
 	"github.com/kubewharf/katalyst-core/pkg/config"
 	"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
 	"github.com/kubewharf/katalyst-core/pkg/config/generic"
@@ -58,7 +55,6 @@ type BasePlugin struct {
 	PodAnnotationKeptKeys []string
 	PodLabelKeptKeys      []string
 
-	AssociatedDevicesName sets.String
 	// Map of checkpoints for each sub-plugin
 	State state.State
 
@@ -88,38 +84,12 @@ func NewBasePlugin(
 		PodAnnotationKeptKeys: conf.PodAnnotationKeptKeys,
 		PodLabelKeptKeys:      conf.PodLabelKeptKeys,
 
-		AssociatedDevicesName: sets.NewString(conf.GPUResourceNames...),
 		State: stateImpl,
 
 		DeviceTopologyRegistry: deviceTopologyRegistry,
 	}, nil
 }
 
-func (p *BasePlugin) GetGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) {
-	gpuCount := float64(0)
-	gpuNames := sets.NewString()
-	for resourceName := range p.AssociatedDevicesName {
-		_, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false)
-		if err != nil && !errors.IsNotFound(err) {
-			return 0, nil, err
-		}
-		gpuCount += request
-		gpuNames.Insert(resourceName)
-	}
-	return gpuCount, gpuNames, nil
-}
-
-func (p *BasePlugin) GetResourcePluginOptions(
-	context.Context, *pluginapi.Empty,
-) (*pluginapi.ResourcePluginOptions, error) {
-	return &pluginapi.ResourcePluginOptions{
-		PreStartRequired:      false,
-		WithTopologyAlignment: true,
-		NeedReconcile:         false,
-		AssociatedDevices:     p.AssociatedDevicesName.List(),
-	}, nil
-}
-
 func (p *BasePlugin) PackAllocationResponse(
 	req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo,
 	resourceAllocationAnnotations map[string]string, resourceName string,
@@ -162,9 +132,9 @@ func (p *BasePlugin) PackAllocationResponse(
 
 func (p *BasePlugin) CalculateAssociatedDevices(
 	gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet,
-	request *pluginapi.AssociatedDeviceRequest, resourceName v1.ResourceName,
+	request *pluginapi.DeviceRequest, resourceName v1.ResourceName,
 ) ([]string, map[string]float64, error) {
-	gpuRequest := request.DeviceRequest.GetDeviceRequest()
+	gpuRequest := request.GetDeviceRequest()
 	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
 
 	machineState, ok := p.State.GetMachineState()[resourceName]
@@ -193,8 +163,8 @@ func (p *BasePlugin) CalculateAssociatedDevices(
 		return memory
 	}
 
-	availableDevices := request.DeviceRequest.GetAvailableDevices()
-	reusableDevices := request.DeviceRequest.GetReusableDevices()
+	availableDevices := request.GetAvailableDevices()
+	reusableDevices := request.GetReusableDevices()
 
 	// allocate must include devices first
 	for _, device := range reusableDevices {
@@ -252,3 +222,42 @@ func (p *BasePlugin) CalculateAssociatedDevices(
 	return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, number of needed GPUs: %d, availableDevices len: %d, allocatedDevices len: %d",
 		gpuRequest, len(availableDevices), len(allocatedDevices))
 }
+
+// UpdateAllocatableAssociatedDevicesByDeviceType updates the topology provider with topology information of the
+// given device type.
+func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType(
+	request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, deviceType string,
+) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	deviceTopology := &machine.DeviceTopology{
+		Devices: make(map[string]machine.DeviceInfo, len(request.Devices)),
+	}
+
+	for _, device := range request.Devices {
+		var numaNode []int
+		if device.Topology != nil {
+			numaNode = make([]int, 0, len(device.Topology.Nodes))
+
+			for _, node := range device.Topology.Nodes {
+				if node == nil {
+					continue
+				}
+				numaNode = append(numaNode, int(node.ID))
+			}
+		}
+
+		deviceTopology.Devices[device.ID] = machine.DeviceInfo{
+			Health:    device.Health,
+			NumaNodes: numaNode,
+		}
+	}
+
+	err := p.DeviceTopologyRegistry.SetDeviceTopology(deviceType, deviceTopology)
+	if err != nil {
+		general.Errorf("set device topology failed with error: %v", err)
+		return nil, fmt.Errorf("set device topology failed with error: %v", err)
+	}
+
+	general.Infof("got device %s topology success: %v", request.DeviceName, deviceTopology)
+
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/consts/consts.go b/pkg/agent/qrm-plugins/gpu/consts/consts.go
index d712ed74b4..e8ca95567b 100644
--- a/pkg/agent/qrm-plugins/gpu/consts/consts.go
+++ b/pkg/agent/qrm-plugins/gpu/consts/consts.go
@@ -38,8 +38,8 @@ type AllocatableResource struct {
 }
 
 const (
-	GPUDeviceName  = "gpu_device"
-	RDMADeviceName = "rdma_device"
+	GPUDeviceType  = "gpu_device"
+	RDMADeviceType = "rdma_device"
 
 	// GPUResourcePluginPolicyNameStatic is the policy name of static gpu resource plugin
 	GPUResourcePluginPolicyNameStatic = string(consts.ResourcePluginPolicyNameStatic)
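UpdateAllocatableAssociatedDevicesByDeviceType, added to the base plugin above, reduces each reported device to its health plus a nil-safe list of NUMA node IDs before handing the topology to the registry. A sketch of just that extraction, with stand-in types shaped like (but not identical to) the pluginapi ones:

```go
package main

import "fmt"

// Stand-ins for the pluginapi topology types; shapes are illustrative.
type NUMANode struct{ ID int64 }
type Topology struct{ Nodes []*NUMANode }
type Device struct {
	ID       string
	Health   string
	Topology *Topology
}

// numaNodesOf mirrors the nil-safe walk in the patch: a device may report
// no topology at all, and individual node pointers may themselves be nil.
func numaNodesOf(d Device) []int {
	if d.Topology == nil {
		return nil
	}
	nodes := make([]int, 0, len(d.Topology.Nodes))
	for _, n := range d.Topology.Nodes {
		if n == nil {
			continue
		}
		nodes = append(nodes, int(n.ID))
	}
	return nodes
}

func main() {
	d := Device{ID: "GPU-0", Health: "Healthy", Topology: &Topology{Nodes: []*NUMANode{{ID: 1}, nil}}}
	fmt.Println(d.ID, numaNodesOf(d)) // GPU-0 [1]
}
```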
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
similarity index 59%
rename from pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go
rename to pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
index aadea553ea..360e8add14 100644
--- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu.go
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
@@ -14,15 +14,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package customdeviceplugin
+package gpu
 
 import (
 	"fmt"
 
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
 	"github.com/kubewharf/katalyst-api/pkg/consts"
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
-	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
@@ -35,85 +37,58 @@ const GPUCustomDevicePluginName = "gpu-custom-device-plugin"
 
 type GPUDevicePlugin struct {
 	*baseplugin.BasePlugin
+	deviceNames []string
 }
 
-func NewGPUDevicePlugin(base *baseplugin.BasePlugin) CustomDevicePlugin {
-	gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUResourceNames)
-	base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceName, gpuTopologyProvider)
+func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin {
+	gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUDeviceNames)
+	base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider)
 	return &GPUDevicePlugin{
-		BasePlugin: base,
+		BasePlugin:  base,
+		deviceNames: base.GPUDeviceNames,
 	}
 }
 
-func (p *GPUDevicePlugin) DeviceName() string {
-	return "nvidia.com/gpu"
+func (p *GPUDevicePlugin) DeviceNames() []string {
+	return p.deviceNames
 }
 
 func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
-	gpuTopology := &machine.DeviceTopology{
-		Devices: make(map[string]machine.DeviceInfo, len(request.Devices)),
-	}
-
-	for _, device := range request.Devices {
-		var numaNode []int
-		if device.Topology != nil {
-			numaNode = make([]int, 0, len(device.Topology.Nodes))
-
-			for _, node := range device.Topology.Nodes {
-				if node == nil {
-					continue
-				}
-				numaNode = append(numaNode, int(node.ID))
-			}
-		}
-
-		gpuTopology.Devices[device.ID] = machine.DeviceInfo{
-			Health:    device.Health,
-			NumaNodes: numaNode,
-		}
-	}
-
-	err := p.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.GPUDeviceName, gpuTopology)
-	if err != nil {
-		general.Errorf("set gpu topology failed with error: %v", err)
-		return nil, fmt.Errorf("set gpu topology failed with error: %v", err)
-	}
-
-	general.Infof("got device %s gpuTopology success: %v", request.DeviceName, gpuTopology)
-
-	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+	return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.GPUDeviceType)
 }
 
 func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
 	return &pluginapi.AssociatedDeviceHintsResponse{}, nil
 }
 
-func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
-	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req.ResourceRequest, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys)
+func (p *GPUDevicePlugin) AllocateAssociatedDevice(
+	resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest,
+) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
+	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys)
 	if err != nil {
 		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
-			req.ResourceRequest.PodNamespace, req.ResourceRequest.PodName, req.ResourceRequest.ContainerName, err)
+			resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err)
 		general.Errorf("%s", err.Error())
 		return nil, err
 	}
 
 	general.InfoS("called",
-		"podNamespace", req.ResourceRequest.PodNamespace,
-		"podName", req.ResourceRequest.PodName,
-		"containerName", req.ResourceRequest.ContainerName,
+		"podNamespace", resReq.PodNamespace,
+		"podName", resReq.PodName,
+		"containerName", resReq.ContainerName,
 		"qosLevel", qosLevel,
-		"reqAnnotations", req.ResourceRequest.Annotations,
-		"resourceRequests", req.ResourceRequest.ResourceRequests,
-		"deviceName", req.DeviceRequest.DeviceName,
-		"resourceHint", req.ResourceRequest.Hint,
-		"deviceHint", req.DeviceRequest.Hint,
-		"availableDevices", req.DeviceRequest.AvailableDevices,
-		"reusableDevices", req.DeviceRequest.ReusableDevices,
-		"deviceRequest", req.DeviceRequest.DeviceRequest,
+		"reqAnnotations", resReq.Annotations,
+		"resourceRequests", resReq.ResourceRequests,
+		"deviceName", deviceReq.DeviceName,
+		"resourceHint", resReq.Hint,
+		"deviceHint", deviceReq.Hint,
+		"availableDevices", deviceReq.AvailableDevices,
+		"reusableDevices", deviceReq.ReusableDevices,
+		"deviceRequest", deviceReq.DeviceRequest,
 	)
 
-	allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName)
+	allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName)
 	if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil {
 		allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations))
 		for gpuID := range allocationInfo.TopologyAwareAllocations {
@@ -126,19 +101,19 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevi
 		}, nil
 	}
 
-	_, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(req.ResourceRequest.ResourceRequests, req.ResourceRequest.ResourceName, false)
+	_, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(resReq.ResourceRequests, resReq.ResourceName, false)
 	if err != nil {
 		return nil, err
 	}
 
 	// get hint nodes from request
-	hintNodes, err := machine.NewCPUSetUint64(req.DeviceRequest.GetHint().GetNodes()...)
+	hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...)
if err != nil { general.Warningf("failed to get hint nodes: %v", err) return nil, err } - gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) if err != nil { general.Warningf("failed to get gpu topology: %v", err) return nil, err @@ -149,7 +124,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi return nil, fmt.Errorf("numa topology is not ready") } - allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, req, consts.ResourceGPUMemory) + allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, deviceReq, consts.ResourceGPUMemory) if err != nil { general.Warningf("failed to allocate associated devices: %v", err) return nil, err @@ -159,7 +134,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi for _, device := range allocatedDevices { info, ok := gpuTopology.Devices[device] if !ok { - return nil, fmt.Errorf("failed to get gpu topology for device: %s", device) + return nil, fmt.Errorf("failed to get gpu info for device: %s", device) } topologyAwareAllocations[device] = state.Allocation{ @@ -170,7 +145,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi if allocationInfo == nil { allocationInfo = &state.AllocationInfo{ - AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req.ResourceRequest, commonstate.EmptyOwnerPoolName, qosLevel), + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel), AllocatedAllocation: state.Allocation{ Quantity: gpuMemoryRequest, NUMANodes: hintNodes.ToSliceInt(), @@ -179,7 +154,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi } allocationInfo.TopologyAwareAllocations = topologyAwareAllocations - p.State.SetAllocationInfo(consts.ResourceGPUMemory, req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, allocationInfo, false) + p.State.SetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName, allocationInfo, false) machineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) if err != nil { return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) @@ -187,10 +162,10 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi p.State.SetMachineState(machineState, true) - general.InfoS("allocated devices", - "podNamespace", req.ResourceRequest.PodNamespace, - "podName", req.ResourceRequest.PodName, - "containerName", req.ResourceRequest.ContainerName, + general.InfoS("allocated gpu devices", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, "qosLevel", qosLevel, "allocatedDevices", allocatedDevices) @@ -200,10 +175,3 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(req *pluginapi.AssociatedDevi }, }, nil } - -func (p *GPUDevicePlugin) RemovePod(_ string) error { - // nothing to do - return nil -} - -func (p *GPUDevicePlugin) ClearResidualState() {} // Nothing to do diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go similarity index 99% rename from 
pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go rename to pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go index 540faf719a..f9860d435a 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu_test.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package customdeviceplugin +package gpu import ( "fmt" diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go index 99f1f7f9a8..ddbd8faa0a 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -21,15 +21,14 @@ import ( ) type CustomDevicePlugin interface { - DeviceName() string + // DeviceNames returns a list of all possible names for this device + DeviceNames() []string GetAssociatedDeviceTopologyHints(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) - AllocateAssociatedDevice(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) - - RemovePod(podUID string) error - - ClearResidualState() + AllocateAssociatedDevice( + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + ) (*pluginapi.AssociatedDeviceAllocationResponse, error) } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go new file mode 100644 index 0000000000..cf2ba4a497 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -0,0 +1,199 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package rdma + +import ( + "fmt" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +const RDMACustomDevicePluginName = "rdma-custom-device-plugin" + +type RDMADevicePlugin struct { + *baseplugin.BasePlugin + deviceNames []string +} + +func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { + rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.RDMADeviceNames) + base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider) + + return &RDMADevicePlugin{ + BasePlugin: base, + deviceNames: base.RDMADeviceNames, + } +} + +func (p *RDMADevicePlugin) DeviceNames() []string { + return p.deviceNames +} + +func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType) +} + +func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +func (p *RDMADevicePlugin) AllocateAssociatedDevice( + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + general.InfoS("called", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", resReq.Annotations, + "resourceRequests", resReq.ResourceRequests, + "deviceName", deviceReq.DeviceName, + "resourceHint", resReq.Hint, + "deviceHint", deviceReq.Hint, + "availableDevices", deviceReq.AvailableDevices, + "reusableDevices", deviceReq.ReusableDevices, + "deviceRequest", deviceReq.DeviceRequest, + ) + + // Check if there is state for the device name + rdmaAllocationInfo := p.State.GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName) + if rdmaAllocationInfo != nil && rdmaAllocationInfo.TopologyAwareAllocations != nil { + allocatedDevices := make([]string, 0, len(rdmaAllocationInfo.TopologyAwareAllocations)) + for rdmaID := range rdmaAllocationInfo.TopologyAwareAllocations { + allocatedDevices = append(allocatedDevices, rdmaID) + } + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil + } + + 
// Find out the GPU devices that are allocated to the container and allocate the RDMA devices that share NUMA nodes with those GPUs
+	gpuAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(resReq.ResourceName), resReq.PodUid, resReq.ContainerName)
+	if gpuAllocationInfo == nil || gpuAllocationInfo.TopologyAwareAllocations == nil {
+		err = fmt.Errorf("nil gpu allocation info of the resource %s for pod: %s/%s, container: %s",
+			resReq.ResourceName, resReq.PodNamespace, resReq.PodName, resReq.ContainerName)
+		general.Errorf("%s", err.Error())
+		return nil, err
+	}
+
+	rdmaTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get rdma device topology: %v", err)
+	}
+	if !numaTopologyReady {
+		return nil, fmt.Errorf("rdma device topology is not ready")
+	}
+
+	// For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same
+	// numa nodes as the gpu and allocate them
+	allocatedRdmaDevices := sets.NewString()
+	gpuToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceAffinity(gpuconsts.GPUDeviceType, gpuconsts.RDMADeviceType)
+	if err != nil {
+		general.Warningf("failed to get gpu to rdma affinity map: %v", err)
+		return nil, err
+	}
+
+	hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...)
+	if err != nil {
+		general.Warningf("failed to get hint nodes: %v", err)
+		return nil, err
+	}
+
+	for gpuID := range gpuAllocationInfo.TopologyAwareAllocations {
+		rdmaDevices, ok := gpuToRdmaAffinityMap[gpuID]
+		if !ok {
+			return nil, fmt.Errorf("failed to find rdma devices with gpu id %s", gpuID)
+		}
+
+		for _, rdmaID := range rdmaDevices {
+			// Only allocate when the rdma device's numa nodes are a subset of the hint nodes
+			isNumaNodeAffinity, err := p.DeviceTopologyRegistry.IsNumaNodeAffinity(gpuconsts.RDMADeviceType, rdmaID, hintNodes)
+			if err != nil {
+				return nil, fmt.Errorf("failed to check numa node affinity of rdma device %s with error: %v", rdmaID, err)
+			}
+			if isNumaNodeAffinity {
+				allocatedRdmaDevices.Insert(rdmaID)
+			}
+		}
+	}
+
+	// Modify rdma state
+	topologyAwareAllocations := make(map[string]state.Allocation)
+	allocatedRdmaDevicesRes := allocatedRdmaDevices.List()
+	for _, deviceID := range allocatedRdmaDevicesRes {
+		info, ok := rdmaTopology.Devices[deviceID]
+		if !ok {
+			return nil, fmt.Errorf("failed to get rdma info for device %s", deviceID)
+		}
+
+		topologyAwareAllocations[deviceID] = state.Allocation{
+			Quantity:  1,
+			NUMANodes: info.GetNUMANode(),
+		}
+	}
+
+	allocationInfo := &state.AllocationInfo{
+		AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel),
+		AllocatedAllocation: state.Allocation{
+			Quantity:  1,
+			NUMANodes: hintNodes.ToSliceInt(),
+		},
+	}
+
+	allocationInfo.TopologyAwareAllocations = topologyAwareAllocations
+	p.State.SetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName, allocationInfo, false)
+	machineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err)
+	}
+
+	p.State.SetMachineState(machineState, true)
+
+	general.InfoS("allocated rdma devices",
+		"podNamespace", resReq.PodNamespace,
+		"podName", resReq.PodName,
+		"containerName", resReq.ContainerName,
+		"qosLevel", qosLevel,
+		"allocatedDevices",
allocatedRdmaDevicesRes) + + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedRdmaDevicesRes, + }, + }, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go index dc3faebc25..07def8b1f0 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go @@ -19,16 +19,19 @@ package registry import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma" ) type initFunc func(plugin *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin var CustomDevicePluginsMap = make(map[string]initFunc) -func registerCustomDevicePlugin(pluginName string, initFunc initFunc) { - CustomDevicePluginsMap[pluginName] = initFunc +func registerCustomDevicePlugin(deviceName string, initFunc initFunc) { + CustomDevicePluginsMap[deviceName] = initFunc } func init() { - registerCustomDevicePlugin(customdeviceplugin.GPUCustomDevicePluginName, customdeviceplugin.NewGPUDevicePlugin) + registerCustomDevicePlugin(gpu.GPUCustomDevicePluginName, gpu.NewGPUDevicePlugin) + registerCustomDevicePlugin(rdma.RDMACustomDevicePluginName, rdma.NewRDMADevicePlugin) } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 5ed486df28..45cf3a7a91 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -17,15 +17,12 @@ limitations under the License. 
package gpumemory import ( - "context" "fmt" "math" "sort" "sync" - "time" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" - v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/sets" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -34,6 +31,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" @@ -46,14 +44,14 @@ import ( type GPUMemPlugin struct { sync.Mutex *baseplugin.BasePlugin - state state.State - residualHitMap map[string]int64 + associatedDevicesName sets.String } func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin { return &GPUMemPlugin{ - BasePlugin: base, + BasePlugin: base, + associatedDevicesName: sets.NewString(base.GPUDeviceNames...), } } @@ -77,7 +75,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := p.GetGPUCount(req) + gpuCount, gpuNames, err := p.getGPUCount(req) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) @@ -100,7 +98,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p p.Lock() defer func() { - if err := p.state.StoreState(); err != nil { + if err := p.State.StoreState(); err != nil { general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) } p.Unlock() @@ -116,15 +114,15 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p }() var hints map[string]*pluginapi.ListOfTopologyHints - machineState := p.state.GetMachineState()[consts.ResourceGPUMemory] - allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) + machineState := p.State.GetMachineState()[consts.ResourceGPUMemory] + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) if allocationInfo != nil { hints = state.RegenerateGPUMemoryHints(allocationInfo, false) // regenerateHints failed. need to clear container record and re-calculate. 
if hints == nil { - podEntries := p.state.GetPodEntries(consts.ResourceGPUMemory) + podEntries := p.State.GetPodEntries(consts.ResourceGPUMemory) delete(podEntries[req.PodUid], req.ContainerName) if len(podEntries[req.PodUid]) == 0 { delete(podEntries, req.PodUid) @@ -156,7 +154,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p func (p *GPUMemPlugin) calculateHints( gpuMemory float64, gpuReq float64, machineState state.AllocationMap, req *pluginapi.ResourceRequest, ) (map[string]*pluginapi.ListOfTopologyHints, error) { - gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceName) + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) if err != nil { return nil, err } @@ -304,7 +302,7 @@ func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints( func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") - allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName) + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName) if allocationInfo == nil { return nil, nil } @@ -349,7 +347,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca p.Lock() defer p.Unlock() - machineState := p.state.GetMachineState()[consts.ResourceGPUMemory] + machineState := p.State.GetMachineState()[consts.ResourceGPUMemory] topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) @@ -404,7 +402,7 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := p.GetGPUCount(req) + gpuCount, gpuNames, err := p.getGPUCount(req) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) @@ -422,7 +420,7 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso p.Lock() defer func() { - if err := p.state.StoreState(); err != nil { + if err := p.State.StoreState(); err != nil { general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) } p.Unlock() @@ -459,7 +457,7 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso return p.PackAllocationResponse(req, &state.AllocationInfo{}, nil, p.ResourceName()) } - allocationInfo := p.state.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) if allocationInfo != nil { resp, packErr := p.PackAllocationResponse(req, allocationInfo, nil, p.ResourceName()) if packErr != nil { @@ -485,9 +483,9 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso }, } - p.state.SetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName, newAllocation, false) + p.State.SetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName, newAllocation, false) - machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.state.GetPodResourceEntries(), p.DeviceTopologyRegistry) + machineState, stateErr := 
state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) if stateErr != nil { general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", "podNamespace", req.PodNamespace, @@ -498,112 +496,21 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso } // update state cache - p.state.SetMachineState(machineState, true) + p.State.SetMachineState(machineState, true) return p.PackAllocationResponse(req, newAllocation, nil, p.ResourceName()) } -func (p *GPUMemPlugin) RemovePod(podUID string) error { - podResourceEntries := p.state.GetPodResourceEntries() - delete(podResourceEntries[consts.ResourceGPUMemory], podUID) - - machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) - if err != nil { - general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) - return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) - } - - p.state.SetPodResourceEntries(podResourceEntries, false) - p.state.SetMachineState(machineState, false) - - err = p.state.StoreState() - if err != nil { - general.Errorf("store state failed with error: %v", err) - return err - } - return nil -} - -func (p *GPUMemPlugin) ClearResidualState() { - general.Infof("exec") - var ( - err error - podList []*v1.Pod - ) - residualSet := make(map[string]bool) - - defer func() { - _ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err) - }() - - if p.MetaServer == nil { - general.Errorf("nil metaServer") - return - } - - ctx := context.Background() - podList, err = p.MetaServer.GetPodList(ctx, nil) - if err != nil { - general.Errorf("get pod list failed: %v", err) - return - } - - podSet := sets.NewString() - for _, pod := range podList { - podSet.Insert(fmt.Sprintf("%v", pod.UID)) - } - - p.Lock() - defer p.Unlock() - - podResourceEntries := p.state.GetPodResourceEntries() - for _, podEntries := range podResourceEntries { - for podUID := range podEntries { - if !podSet.Has(podUID) { - residualSet[podUID] = true - p.residualHitMap[podUID] += 1 - general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) - } - } - } - - podsToDelete := sets.NewString() - for podUID, hitCount := range p.residualHitMap { - if !residualSet[podUID] { - general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID) - delete(p.residualHitMap, podUID) - continue - } - - if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime { - podsToDelete.Insert(podUID) - } - } - - if podsToDelete.Len() > 0 { - for { - podUID, found := podsToDelete.PopAny() - if !found { - break - } - - general.Infof("clear residual pod: %s in state", podUID) - podResourceEntries.RemovePod(podUID) - } - - machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) - if err != nil { - general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) - return - } - - p.state.SetPodResourceEntries(podResourceEntries, false) - p.state.SetMachineState(machineState, false) - - err = p.state.StoreState() - if err != nil { - general.Errorf("store state failed: %v", err) - return +func (p *GPUMemPlugin) getGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { + gpuCount := float64(0) + gpuNames := sets.NewString() + for resourceName := range 
p.associatedDevicesName { + _, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) + if err != nil && !errors.IsNotFound(err) { + return 0, nil, err } + gpuCount += request + gpuNames.Insert(resourceName) } + return gpuCount, gpuNames, nil } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go index 516a5bed8a..694359337c 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -33,8 +33,4 @@ type ResourcePlugin interface { GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) Allocate(*pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) - - RemovePod(podUID string) error - - ClearResidualState() } diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go index d36f82df8f..6e1d033d2a 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -19,10 +19,11 @@ package state import ( "encoding/json" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" - "github.com/kubewharf/katalyst-core/pkg/util/general" v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/util/general" ) type AllocationInfo struct { diff --git a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go index b984ae35f6..1293abbeff 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go +++ b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go @@ -40,7 +40,7 @@ const ( var ( _ State = &stateCheckpoint{} - generalLog = general.LoggerWithPrefix("gpu_mem_plugin", general.LoggingPKGFull) + generalLog = general.LoggerWithPrefix("gpu_plugin", general.LoggingPKGFull) ) // stateCheckpoint is an in-memory implementation of State; diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/state_mem.go index 8d35ec1e08..809ae014b2 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_mem.go +++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go @@ -20,9 +20,10 @@ import ( "fmt" "sync" + v1 "k8s.io/api/core/v1" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" "github.com/kubewharf/katalyst-core/pkg/util/machine" - v1 "k8s.io/api/core/v1" ) // gpuPluginState is an in-memory implementation of State; diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go index 330239997d..fc081c9d16 100644 --- a/pkg/agent/qrm-plugins/gpu/state/util.go +++ b/pkg/agent/qrm-plugins/gpu/state/util.go @@ -19,11 +19,12 @@ package state import ( "fmt" + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" - v1 "k8s.io/api/core/v1" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) // GenerateMachineState returns an empty AllocationResourcesMap for all resource names. 
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index b917f0e2f0..f7726509f0 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -22,9 +22,6 @@ import ( "sync" "time" - devicepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry" - resourcepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/registry" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" @@ -37,7 +34,10 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + devicepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + resourcepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/registry" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" "github.com/kubewharf/katalyst-core/pkg/config" @@ -87,7 +87,7 @@ func NewStaticPolicy( stopCh: make(chan struct{}), name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), residualHitMap: make(map[string]int64), - associatedDeviceNames: sets.NewString(conf.GPUResourceNames...), + associatedDeviceNames: sets.NewString(conf.GPUDeviceNames...), resourcePluginsNames: sets.NewString(conf.ResourcePluginsNames...), BasePlugin: basePlugin, resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), @@ -393,18 +393,23 @@ func (p *StaticPolicy) PreStartContainer( } func (p *StaticPolicy) removePod(podUID string) error { - for _, resourcePlugin := range p.resourcePlugins { - if err := resourcePlugin.RemovePod(podUID); err != nil { - return err - } - } + podResourceEntries := p.State.GetPodResourceEntries() + delete(podResourceEntries[consts.ResourceGPUMemory], podUID) - for _, customDevicePlugin := range p.customDevicePlugins { - if err := customDevicePlugin.RemovePod(podUID); err != nil { - return err - } + machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) + if err != nil { + general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) + return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) } + p.State.SetPodResourceEntries(podResourceEntries, false) + p.State.SetMachineState(machineState, false) + + err = p.State.StoreState() + if err != nil { + general.Errorf("store state failed with error: %v", err) + return err + } return nil } @@ -518,6 +523,9 @@ func (*StaticPolicy) GetAssociatedDeviceTopologyHints( return &pluginapi.AssociatedDeviceHintsResponse{}, nil } +// AllocateAssociatedDevice allocates a device in this sequence: +// 1. Find the resource plugin that corresponds to the accompanyResourceName and allocate +// 2. 
Find the custom device plugin that corresponds to the deviceName and allocate
 func (p *StaticPolicy) AllocateAssociatedDevice(
 	_ context.Context, req *pluginapi.AssociatedDeviceRequest,
 ) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
@@ -532,19 +540,55 @@ func (p *StaticPolicy) AllocateAssociatedDevice(
 		}, nil
 	}
 
-	if req.DeviceRequest.Hint == nil || req.DeviceRequest.Hint.Nodes == nil {
-		general.Warningf("got nil device hint")
-		return &pluginapi.AssociatedDeviceAllocationResponse{
-			AllocationResult: nil,
-		}, nil
+	// Allocate the accompany resource first.
+	// If a resource plugin exists for the accompany resource, it takes precedence.
+	accompanyResourcePlugin := p.getResourcePlugin(req.AccompanyResourceName)
+	if accompanyResourcePlugin != nil {
+		_, err := accompanyResourcePlugin.Allocate(req.ResourceRequest)
+		if err != nil {
+			return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", req.AccompanyResourceName, err)
+		}
+	} else {
+		// Otherwise, fall back to the custom device plugin of the accompany resource
+		accompanyCustomDevicePlugin := p.getCustomDevicePlugin(req.AccompanyResourceName)
+		if accompanyCustomDevicePlugin != nil {
+			// Get device request for accompany device
+			var accompanyDeviceReq *pluginapi.DeviceRequest
+			for _, deviceRequest := range req.DeviceRequest {
+				if deviceRequest.DeviceName == req.AccompanyResourceName {
+					accompanyDeviceReq = deviceRequest
+					break
+				}
+			}
+
+			if accompanyDeviceReq == nil {
+				return nil, fmt.Errorf("no device request found for accompany resource %s", req.AccompanyResourceName)
+			}
+
+			_, err := accompanyCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, accompanyDeviceReq)
+			if err != nil {
+				return nil, fmt.Errorf("AllocateAssociatedDevice accompany resource %s failed with error: %v", req.AccompanyResourceName, err)
+			}
+		}
 	}
 
-	customDevicePlugin := p.getCustomDevicePlugin(req.DeviceRequest.DeviceName)
-	if customDevicePlugin == nil {
-		return nil, fmt.Errorf("no custom device plugin found for device %s", req.DeviceRequest.DeviceName)
+	// Allocate target custom device
+	targetCustomDevicePlugin := p.getCustomDevicePlugin(req.DeviceName)
+	if targetCustomDevicePlugin == nil {
+		return nil, fmt.Errorf("no custom device plugin found for target device %s", req.DeviceName)
+	}
+
+	var targetDeviceReq *pluginapi.DeviceRequest
+	for _, deviceRequest := range req.DeviceRequest {
+		if deviceRequest.DeviceName == req.DeviceName {
+			targetDeviceReq = deviceRequest
+			break
+		}
+	}
+
+	if targetDeviceReq == nil {
+		return nil, fmt.Errorf("no device request found for target device %s", req.DeviceName)
 	}
 
-	return customDevicePlugin.AllocateAssociatedDevice(req)
+	return targetCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, targetDeviceReq)
 }
 
 func (p *StaticPolicy) registerResourcePlugins() error {
@@ -564,7 +608,10 @@ func (p *StaticPolicy) registerCustomDevicePlugins() error {
 	for name := range p.associatedDeviceNames {
 		initFunc := devicepluginregistry.CustomDevicePluginsMap[name]
 		customDevicePlugin := initFunc(p.BasePlugin)
-		p.customDevicePlugins[customDevicePlugin.DeviceName()] = customDevicePlugin
+		deviceNames := customDevicePlugin.DeviceNames()
+		for _, deviceName := range deviceNames {
+			p.customDevicePlugins[deviceName] = customDevicePlugin
+		}
 	}
 	return nil
 }
diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go
index 32da62bcef..6a4d3a8452 100644
--- a/pkg/config/agent/qrm/gpu_plugin.go
+++ b/pkg/config/agent/qrm/gpu_plugin.go
@@ -21,12 +21,14 @@ import "k8s.io/apimachinery/pkg/api/resource"
 
 type GPUQRMPluginConfig struct {
 	// PolicyName is used to switch between several strategies
 	PolicyName string
-	// GPUResourceName is the name of the GPU resource
-	GPUResourceNames []string
+	// GPUDeviceNames lists the possible resource names of the GPU device
+	GPUDeviceNames []string
 	// GPUMemoryAllocatablePerGPU is the total memory allocatable for each GPU
 	GPUMemoryAllocatablePerGPU resource.Quantity
 	// SkipGPUStateCorruption skip gpu state corruption, and it will be used after updating state properties
 	SkipGPUStateCorruption bool
+	// RDMADeviceNames lists the possible resource names of the RDMA device
+	RDMADeviceNames []string
 	// ResourcePluginsNames holds the names of the enabled GPU resource plugins
 	ResourcePluginsNames []string
 }
diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go
index f674930dd2..b062703ad9 100644
--- a/pkg/util/machine/device.go
+++ b/pkg/util/machine/device.go
@@ -1,9 +1,27 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package machine
 
 import (
 	"fmt"
 	"sync"
 
+	"k8s.io/apimachinery/pkg/util/sets"
+
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 	"github.com/kubewharf/katalyst-core/pkg/util/native"
 )
@@ -50,6 +68,57 @@ func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTo
 	return provider.GetDeviceTopology()
 }
 
+// GetDeviceAffinity retrieves a map of a certain device to the list of devices that it has an affinity with.
+// A device is considered to have an affinity with another device if they are on the exact same NUMA node(s).
+func (r *DeviceTopologyRegistry) GetDeviceAffinity(key, value string) (map[string][]string, error) {
+	deviceTopologyKey, numaReady, err := r.GetDeviceTopology(key)
+	if err != nil {
+		return nil, fmt.Errorf("error getting device topology for device %s: %v", key, err)
+	}
+	if !numaReady {
+		return nil, fmt.Errorf("device topology for device %s is not ready", key)
+	}
+
+	deviceTopologyValue, numaReady, err := r.GetDeviceTopology(value)
+	if err != nil {
+		return nil, fmt.Errorf("error getting device topology for device %s: %v", value, err)
+	}
+	if !numaReady {
+		return nil, fmt.Errorf("device topology for device %s is not ready", value)
+	}
+
+	deviceAffinity := make(map[string][]string)
+	for keyName, keyInfo := range deviceTopologyKey.Devices {
+		// build the key device's NUMA set once instead of rebuilding it for every candidate
+		keyNUMASet := sets.NewInt(keyInfo.GetNUMANode()...)
+		devicesWithAffinity := make([]string, 0)
+		for valueName, valueInfo := range deviceTopologyValue.Devices {
+			if keyNUMASet.Equal(sets.NewInt(valueInfo.GetNUMANode()...)) {
+				devicesWithAffinity = append(devicesWithAffinity, valueName)
+			}
+		}
+		deviceAffinity[keyName] = devicesWithAffinity
+	}
+
+	return deviceAffinity, nil
+}
+
+// IsNumaNodeAffinity checks if the device with the specified device ID resides in numa nodes that are a subset of the hint nodes.
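+// For example, a device whose NUMA nodes are {0} matches hint nodes {0} or {0, 1},
+// while a device spanning NUMA nodes {0, 2} does not match hint nodes {0, 1}.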
+func (r *DeviceTopologyRegistry) IsNumaNodeAffinity(deviceName, deviceID string, hintNodes CPUSet) (bool, error) { + deviceTopologyKey, numaReady, err := r.GetDeviceTopology(deviceName) + if err != nil { + return false, fmt.Errorf("error getting device topology for device %s: %v", deviceName, err) + } + if !numaReady { + return false, fmt.Errorf("device topology for device %s is not ready", deviceName) + } + + info, ok := deviceTopologyKey.Devices[deviceID] + if !ok { + return false, fmt.Errorf("failed to find device %s in device topology", deviceID) + } + + return NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes), nil +} + type DeviceTopology struct { Devices map[string]DeviceInfo } @@ -125,12 +194,12 @@ func initDeviceTopology(resourceNames []string) (*DeviceTopology, error) { _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() for _, resourceName := range resourceNames { - gpuDevice, ok := registeredDevs[resourceName] + devices, ok := registeredDevs[resourceName] if !ok { continue } - for _, id := range gpuDevice { + for _, id := range devices { // get NUMA node from UpdateAllocatableAssociatedDevices deviceTopology.Devices[id] = DeviceInfo{} } diff --git a/pkg/util/machine/gpu.go b/pkg/util/machine/gpu.go deleted file mode 100644 index 3b0a400e8c..0000000000 --- a/pkg/util/machine/gpu.go +++ /dev/null @@ -1,151 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package machine - -import ( - "fmt" - "sync" - - "github.com/kubewharf/katalyst-core/pkg/util/general" - "github.com/kubewharf/katalyst-core/pkg/util/native" -) - -type GPUTopologyProvider interface { - GetGPUTopology() (*GPUTopology, bool, error) - SetGPUTopology(*GPUTopology) error -} - -type GPUTopology struct { - GPUs map[string]GPUInfo -} - -type GPUInfo struct { - Health string - NUMANode []int -} - -func (i GPUInfo) GetNUMANode() []int { - if i.NUMANode == nil { - return []int{} - } - return i.NUMANode -} - -type gpuTopologyProviderImpl struct { - mutex sync.RWMutex - resourceNames []string - - gpuTopology *GPUTopology - numaTopologyReady bool -} - -func NewGPUTopologyProvider(resourceNames []string) GPUTopologyProvider { - gpuTopology, err := initGPUTopology(resourceNames) - if err != nil { - gpuTopology = getEmptyGPUTopology() - general.Warningf("initGPUTopology failed with error: %v", err) - } else { - general.Infof("initGPUTopology success: %v", gpuTopology) - } - - return &gpuTopologyProviderImpl{ - resourceNames: resourceNames, - gpuTopology: gpuTopology, - } -} - -func getEmptyGPUTopology() *GPUTopology { - return &GPUTopology{ - GPUs: make(map[string]GPUInfo), - } -} - -func (p *gpuTopologyProviderImpl) SetGPUTopology(gpuTopology *GPUTopology) error { - p.mutex.Lock() - defer p.mutex.Unlock() - if gpuTopology == nil { - return fmt.Errorf("gpuTopology is nil") - } - - p.gpuTopology = gpuTopology - p.numaTopologyReady = checkNUMATopologyReady(gpuTopology) - return nil -} - -func (p *gpuTopologyProviderImpl) GetGPUTopology() (*GPUTopology, bool, error) { - p.mutex.RLock() - defer p.mutex.RUnlock() - - return p.gpuTopology, p.numaTopologyReady, nil -} - -func initGPUTopology(resourceNames []string) (*GPUTopology, error) { - gpuTopology := getEmptyGPUTopology() - - kubeletCheckpoint, err := native.GetKubeletCheckpoint() - if err != nil { - general.Errorf("Failed to get kubelet checkpoint: %v", err) - return gpuTopology, nil - } - - _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() - for _, resourceName := range resourceNames { - gpuDevice, ok := registeredDevs[resourceName] - if !ok { - continue - } - - for _, id := range gpuDevice { - // get NUMA node from UpdateAllocatableAssociatedDevices - gpuTopology.GPUs[id] = GPUInfo{} - } - } - - return gpuTopology, nil -} - -func checkNUMATopologyReady(topology *GPUTopology) bool { - if topology == nil { - return false - } - - for _, gpu := range topology.GPUs { - if gpu.NUMANode == nil { - return false - } - } - return true -} - -type gpuTopologyProviderStub struct { - gpuTopology *GPUTopology -} - -func NewGPUTopologyProviderStub(gpuTopology *GPUTopology) GPUTopologyProvider { - return &gpuTopologyProviderStub{ - gpuTopology: gpuTopology, - } -} - -func (g *gpuTopologyProviderStub) GetGPUTopology() (*GPUTopology, bool, error) { - return g.gpuTopology, true, nil -} - -func (g *gpuTopologyProviderStub) SetGPUTopology(gpuTopology *GPUTopology) error { - g.gpuTopology = gpuTopology - return nil -} diff --git a/pkg/util/machine/rdma.go b/pkg/util/machine/rdma.go deleted file mode 100644 index 46a0d79185..0000000000 --- a/pkg/util/machine/rdma.go +++ /dev/null @@ -1,36 +0,0 @@ -package machine - -import "sync" - -type RDMATopologyProvider interface { - GetRDMATopology() (*RDMATopology, bool, error) - SetRDMATopology(*RDMATopology) error -} - -type RDMATopology struct { - RDMAs map[string]RDMAInfo -} - -type RDMAInfo struct { - Health string - NUMANodes []int -} - -func (i RDMAInfo) GetNUMANode() []int { - if i.NUMANodes 
== nil {
-		return []int{}
-	}
-	return i.NUMANodes
-}
-
-type rdmaTopologyProviderImpl struct {
-	mutex         sync.RWMutex
-	resourceNames []string
-
-	rdmaTopology      *RDMATopology
-	numaTopologyReady bool
-}
-
-func NewRDMATopologyProvider(resourceNames []string) RDMATopologyProvider {
-
-}

From 3d2f4418cf8591717c0725e9e40eba8eb213e401 Mon Sep 17 00:00:00 2001
From: "justin.cheng" 
Date: Sat, 18 Oct 2025 23:21:18 +0800
Subject: [PATCH 16/52] feat: implement allocation of accompany resource first
 before device

---
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  | 118 +-----
 .../qrm-plugins/gpu/baseplugin/test_util.go   |  70 ----
 .../gpu/customdeviceplugin/gpu/gpu.go         |  87 ++--
 .../gpu/customdeviceplugin/gpu/gpu_test.go    | 379 ------------------
 .../gpu/customdeviceplugin/interface.go       |   2 +-
 .../gpu/customdeviceplugin/rdma/rdma.go       | 182 +++++--
 .../gpu/resourceplugin/gpumemory/gpu_mem.go   | 206 ++++++--
 .../resourceplugin/gpumemory/gpu_mem_test.go  | 351 ----------------
 .../gpu/resourceplugin/interface.go           |   4 +-
 pkg/agent/qrm-plugins/gpu/state/state.go      |  47 ++-
 .../qrm-plugins/gpu/staticpolicy/policy.go    | 135 +++++--
 pkg/agent/qrm-plugins/gpu/util/util.go        |  12 +
 pkg/util/machine/device.go                    |   4 +-
 13 files changed, 536 insertions(+), 1061 deletions(-)
 delete mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go
 delete mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go

diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index 5cc177226e..50ac1be7c2 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -18,11 +18,8 @@ package baseplugin
 
 import (
 	"fmt"
-	"sort"
 	"sync"
 
-	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/util/sets"
 	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
 
 	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
@@ -60,6 +57,9 @@ type BasePlugin struct {
 	State state.State
 	// Registry of device topology providers
 	DeviceTopologyRegistry *machine.DeviceTopologyRegistry
+
+	// Map of specific device name to device type
+	DeviceNameToTypeMap map[string]string
 }
 
 func NewBasePlugin(
@@ -69,7 +69,6 @@ func NewBasePlugin(
 	stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName,
 		gpuconsts.GPUResourcePluginPolicyNameStatic, deviceTopologyRegistry, conf.SkipGPUStateCorruption, wrappedEmitter)
-
 	if err != nil {
 		return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err)
 	}
@@ -87,6 +86,7 @@ func NewBasePlugin(
 
 		State:                  stateImpl,
 		DeviceTopologyRegistry: deviceTopologyRegistry,
+		DeviceNameToTypeMap:    make(map[string]string),
 	}, nil
 }
@@ -130,99 +130,6 @@ func (p *BasePlugin) PackAllocationResponse(
 	}, nil
 }
 
-func (p *BasePlugin) CalculateAssociatedDevices(
-	gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet,
-	request *pluginapi.DeviceRequest, resourceName v1.ResourceName,
-) ([]string, map[string]float64, error) {
-	gpuRequest := request.GetDeviceRequest()
-	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
-
-	machineState, ok := p.State.GetMachineState()[resourceName]
-	if !ok {
-		return nil, nil, fmt.Errorf("no machine state for resource %s", resourceName)
-	}
-
-	allocatedDevices := sets.NewString()
-	needed := gpuRequest
-
-	allocateDevices := func(devices ...string) bool {
-		for _, device := range devices {
-			allocatedDevices.Insert(device)
-			needed--
-			if needed == 0 {
-				return true
-			}
-		}
-		return false
-	}
-
-	allocatedGPUMemory := func(devices ...string) map[string]float64 {
-		memory := make(map[string]float64)
-		for _, device := range devices {
-			memory[device] = gpuMemoryPerGPU
-		}
-		return memory
-	}
-
-	availableDevices := request.GetAvailableDevices()
-	reusableDevices := request.GetReusableDevices()
-
-	// allocate must include devices first
-	for _, device := range reusableDevices {
-		if machineState.IsGPURequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) {
-			general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
-				device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU)
-		}
-		allocateDevices(device)
-	}
-
-	// if allocated devices is enough, return immediately
-	if allocatedDevices.Len() >= int(gpuRequest) {
-		return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
-	}
-
-	isNUMAAffinityDevice := func(device string) bool {
-		info, ok := gpuTopology.Devices[device]
-		if !ok {
-			general.Errorf("failed to find hint node for device %s", device)
-			return false
-		}
-
-		// check if gpu's numa node is the subset of hint nodes
-		// todo support multi numa node
-		if machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) {
-			return true
-		}
-		return false
-	}
-
-	sort.SliceStable(availableDevices, func(i, j int) bool {
-		return machineState.GetQuantityAllocated(availableDevices[i]) > machineState.GetQuantityAllocated(availableDevices[j])
-	})
-
-	// second allocate available and numa-affinity gpus
-	for _, device := range availableDevices {
-		if allocatedDevices.Has(device) {
-			continue
-		}
-
-		if !isNUMAAffinityDevice(device) {
-			continue
-		}
-
-		if !machineState.IsGPURequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) {
-			general.Infof("available numa affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
-				device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU)
-			continue
-		}
-
-		if allocateDevices(device) {
-			return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
-		}
-	}
-
-	return nil, nil, fmt.Errorf("not enough available GPUs found in gpuTopology, number of needed GPUs: %d, availableDevices len: %d, allocatedDevices len: %d", gpuRequest, len(availableDevices), len(allocatedDevices))
-}
-
 // UpdateAllocatableAssociatedDevicesByDeviceType updates the topology provider with topology information of the
 // given device type.
 func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType(
@@ -261,3 +168,20 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType(
 
 	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
 }
+
+// RegisterDeviceNameToType is used to map device name to device type.
+// For example, we may have multiple device names for the same device type, e.g. "nvidia.com/gpu" and "nvidia.com/be-gpu",
+// so we map them to the same device type, which allows us to allocate them interchangeably.
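+// After registration, GetResourceTypeFromDeviceName resolves any of the registered names back to
+// the shared device type, so state and topology lookups can be keyed by type rather than by raw name.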
+func (p *BasePlugin) RegisterDeviceNameToType(resourceNames []string, deviceType string) { + for _, resourceName := range resourceNames { + p.DeviceNameToTypeMap[resourceName] = deviceType + } +} + +func (p *BasePlugin) GetResourceTypeFromDeviceName(deviceName string) (string, error) { + deviceType, ok := p.DeviceNameToTypeMap[deviceName] + if !ok { + return "", fmt.Errorf("no device type found for device name %s", deviceName) + } + return deviceType, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go b/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go deleted file mode 100644 index 71d174de6f..0000000000 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/test_util.go +++ /dev/null @@ -1,70 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package baseplugin - -import ( - "fmt" - - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/sets" - - katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" - "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" - "github.com/kubewharf/katalyst-core/pkg/config/generic" - "github.com/kubewharf/katalyst-core/pkg/metaserver" - metaserveragent "github.com/kubewharf/katalyst-core/pkg/metaserver/agent" - "github.com/kubewharf/katalyst-core/pkg/metrics" - "github.com/kubewharf/katalyst-core/pkg/util/machine" -) - -func makeMetaServer(numCPUs, numNUMAs, socketNum int) *metaserver.MetaServer { - cpuTopology, _ := machine.GenerateDummyCPUTopology(numCPUs, socketNum, numNUMAs) - - return &metaserver.MetaServer{ - MetaAgent: &metaserveragent.MetaAgent{ - KatalystMachineInfo: &machine.KatalystMachineInfo{ - CPUTopology: cpuTopology, - }, - }, - } -} - -func GenerateTestBasePlugin( - stateDirectory string, provider machine.GPUTopologyProvider, numCPUs, numNUMAs, socketNum int, -) (*BasePlugin, error) { - qrmConfig := qrm.NewQRMPluginsConfiguration() - qosConfig := generic.NewQoSConfiguration() - stateImpl, err := state.NewCheckpointState(qrmConfig, stateDirectory, GPUPluginStateFileName, "test_gpu_policy", provider, false, metrics.DummyMetrics{}) - genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) - if err != nil { - return nil, fmt.Errorf("GenerateFakeGenericContext failed: %v", err) - } - metaServer := makeMetaServer(numCPUs, numNUMAs, socketNum) - basePlugin := &BasePlugin{ - QrmConfig: qrmConfig, - QosConfig: qosConfig, - State: stateImpl, - GpuTopologyProvider: provider, - Emitter: metrics.DummyMetrics{}, - MetaServer: metaServer, - AgentCtx: &agent.GenericContext{GenericContext: genericCtx, MetaServer: metaServer}, - AssociatedDevicesName: sets.NewString("nvidia.com/gpu"), - } - return basePlugin, nil -} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index 360e8add14..a973041872 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go 
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -22,12 +22,11 @@ import ( pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" @@ -43,6 +42,7 @@ type GPUDevicePlugin struct { func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUDeviceNames) base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider) + base.RegisterDeviceNameToType(base.GPUDeviceNames, gpuconsts.GPUDeviceType) return &GPUDevicePlugin{ BasePlugin: base, @@ -63,7 +63,7 @@ func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.Associat } func (p *GPUDevicePlugin) AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { @@ -88,35 +88,22 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( "deviceRequest", deviceReq.DeviceRequest, ) - allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName) - if allocationInfo != nil && allocationInfo.TopologyAwareAllocations != nil { - allocatedDevices := make([]string, 0, len(allocationInfo.TopologyAwareAllocations)) - for gpuID := range allocationInfo.TopologyAwareAllocations { - allocatedDevices = append(allocatedDevices, gpuID) - } - return &pluginapi.AssociatedDeviceAllocationResponse{ - AllocationResult: &pluginapi.AssociatedDeviceAllocation{ - AllocatedDevices: allocatedDevices, - }, - }, nil - } - - _, gpuMemoryRequest, err := util.GetQuantityFromResourceRequests(resReq.ResourceRequests, resReq.ResourceName, false) - if err != nil { - return nil, err + memoryAllocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName) + // GPU memory should have been allocated at this stage. + // We anticipate that gpu devices have also been allocated, so we can directly use the allocated devices from the gpu memory state. + if memoryAllocationInfo == nil || memoryAllocationInfo.TopologyAwareAllocations == nil { + return nil, fmt.Errorf("gpu memory allocation info is nil, which is unexpected") } - // get hint nodes from request - hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) 
-	if err != nil {
-		general.Warningf("failed to get hint nodes: %v", err)
-		return nil, err
+	allocatedDevices := make([]string, 0, len(memoryAllocationInfo.TopologyAwareAllocations))
+	for gpuID := range memoryAllocationInfo.TopologyAwareAllocations {
+		allocatedDevices = append(allocatedDevices, gpuID)
 	}
 
 	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
 	if err != nil {
-		general.Warningf("failed to get gpu topology: %v", err)
-		return nil, err
+		general.Warningf("failed to get gpu topology: %v", err)
+		return nil, fmt.Errorf("failed to get gpu topology: %w", err)
 	}
 
 	if !numaTopologyReady {
@@ -124,43 +111,35 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
 		return nil, fmt.Errorf("numa topology is not ready")
 	}
 
-	allocatedDevices, allocatedGPUMemory, err := p.CalculateAssociatedDevices(gpuTopology, gpuMemoryRequest, hintNodes, deviceReq, consts.ResourceGPUMemory)
-	if err != nil {
-		general.Warningf("failed to allocate associated devices: %v", err)
-		return nil, err
-	}
-
-	topologyAwareAllocations := make(map[string]state.Allocation)
-	for _, device := range allocatedDevices {
-		info, ok := gpuTopology.Devices[device]
+	// Save gpu device allocations in state
+	gpuDeviceTopologyAwareAllocations := make(map[string]state.Allocation)
+	for _, deviceID := range allocatedDevices {
+		info, ok := gpuTopology.Devices[deviceID]
 		if !ok {
-			return nil, fmt.Errorf("failed to get gpu info for device: %s", device)
+			return nil, fmt.Errorf("failed to get gpu info for device: %s", deviceID)
 		}
 
-		topologyAwareAllocations[device] = state.Allocation{
-			Quantity:  allocatedGPUMemory[device],
-			NUMANodes: info.GetNUMANode(),
+		gpuDeviceTopologyAwareAllocations[deviceID] = state.Allocation{
+			Quantity:  1,
+			NUMANodes: info.NumaNodes,
 		}
 	}
 
-	if allocationInfo == nil {
-		allocationInfo = &state.AllocationInfo{
-			AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel),
-			AllocatedAllocation: state.Allocation{
-				Quantity:  gpuMemoryRequest,
-				NUMANodes: hintNodes.ToSliceInt(),
-			},
-		}
+	gpuDeviceAllocationInfo := &state.AllocationInfo{
+		AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel),
+		AllocatedAllocation: state.Allocation{
+			Quantity:  float64(len(allocatedDevices)),
+			NUMANodes: memoryAllocationInfo.AllocatedAllocation.NUMANodes,
+		},
 	}
+	gpuDeviceAllocationInfo.TopologyAwareAllocations = gpuDeviceTopologyAwareAllocations
 
-	allocationInfo.TopologyAwareAllocations = topologyAwareAllocations
-	p.State.SetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName, allocationInfo, false)
-	machineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry)
+	p.State.SetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName, gpuDeviceAllocationInfo, false)
+	gpuDeviceMachineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry)
 	if err != nil {
-		return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err)
+		return nil, fmt.Errorf("failed to generate gpu device machine state from pod entries: %v", err)
 	}
-
-	p.State.SetMachineState(machineState, true)
+	p.State.SetMachineState(gpuDeviceMachineState, true)
 
 	general.InfoS("allocated gpu devices",
 		"podNamespace", resReq.PodNamespace,
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go 
b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go deleted file mode 100644 index f9860d435a..0000000000 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go +++ /dev/null @@ -1,379 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package gpu - -import ( - "fmt" - "testing" - - "github.com/stretchr/testify/assert" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" - - "github.com/kubewharf/katalyst-api/pkg/consts" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - "github.com/kubewharf/katalyst-core/pkg/util/machine" -) - -func generateTestGPUCustomDevicePlugin( - stateDirectory string, numGPUs, numNUMAs, numCPUs, socketNum int, -) (*GPUDevicePlugin, error) { - gpuTopology, err := machine.GenerateDummyGPUTopology(numGPUs, numNUMAs) - if err != nil { - return nil, fmt.Errorf("GenerateDummyGPUTopology failed: %v", err) - } - providerStub := machine.NewGPUTopologyProviderStub(gpuTopology) - - basePlugin, err := baseplugin.GenerateTestBasePlugin(stateDirectory, providerStub, numCPUs, numNUMAs, socketNum) - if err != nil { - return nil, fmt.Errorf("generateTestBasePlugin failed: %v", err) - } - - p := NewGPUDevicePlugin(basePlugin) - - machineState := make(state.GPUMap) - // Initialize machine state to not have any gpu memory allocated - for i := 0; i < numGPUs; i++ { - machineState[fmt.Sprintf("%d", i)] = &state.GPUState{ - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 0, - } - } - - gpuDevicePlugin := p.(*GPUDevicePlugin) - gpuDevicePlugin.State.SetMachineState(machineState, true) - - return gpuDevicePlugin, nil -} - -func TestGPUDevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { - t.Parallel() - - req1 := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ - DeviceName: "nvidia.com/gpu", - Devices: []*pluginapi.AssociatedDevice{ - { - ID: "0", - Topology: &pluginapi.TopologyInfo{ - Nodes: []*pluginapi.NUMANode{ - { - ID: 0, - }, - }, - }, - Health: pluginapi.Healthy, - }, - { - ID: "1", - Topology: &pluginapi.TopologyInfo{ - Nodes: []*pluginapi.NUMANode{ - { - ID: 1, - }, - }, - }, - Health: pluginapi.Healthy, - }, - }, - } - - req2 := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ - DeviceName: "nvidia.com/gpu", - Devices: []*pluginapi.AssociatedDevice{ - { - ID: "0", - Health: pluginapi.Healthy, - }, - { - ID: "1", - Health: pluginapi.Healthy, - }, - }, - } - - p1, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 2, 4, 2) - assert.NoError(t, err) - // For this test case, we override the dummy gpu topology to nil - err = p1.GpuTopologyProvider.SetGPUTopology(nil) - assert.NoError(t, err) - - p2, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 2, 4, 2) - assert.NoError(t, err) - // For this test case, we override the dummy gpu topology to nil - err = p2.GpuTopologyProvider.SetGPUTopology(nil) - assert.NoError(t, err) - - testCases := []struct { - name string - plugin 
*GPUDevicePlugin - req *pluginapi.UpdateAllocatableAssociatedDevicesRequest - expectedGPUs map[string]machine.GPUInfo - }{ - { - name: "Normal case", - plugin: p1, - req: req1, - expectedGPUs: map[string]machine.GPUInfo{ - "0": { - Health: pluginapi.Healthy, - NUMANode: []int{0}, - }, - "1": { - Health: pluginapi.Healthy, - NUMANode: []int{1}, - }, - }, - }, - { - name: "Empty topology", - plugin: p2, - req: req2, - expectedGPUs: map[string]machine.GPUInfo{ - "0": { - Health: pluginapi.Healthy, - NUMANode: nil, - }, - "1": { - Health: pluginapi.Healthy, - NUMANode: nil, - }, - }, - }, - } - for _, tt := range testCases { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - _, err := tt.plugin.UpdateAllocatableAssociatedDevices(tt.req) - assert.NoError(t, err) - - gpuTopology, _, err := tt.plugin.GpuTopologyProvider.GetGPUTopology() - assert.NoError(t, err) - assert.Equal(t, tt.expectedGPUs, gpuTopology.GPUs) - }) - } -} - -func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { - t.Parallel() - - p1, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) - assert.NoError(t, err) - - p2, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) - assert.NoError(t, err) - - p3, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 1, 4, 2) - assert.NoError(t, err) - // Fill up one of the GPUs memory completely - p3.State.SetMachineState(map[string]*state.GPUState{ - "0": { - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 16 * 1024 * 1024 * 1024, - }, - "1": { - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 0 * 1024 * 1024 * 1024, - }, - }, false) - - p4, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 4, 2, 4, 2) - assert.NoError(t, err) - - p5, err := generateTestGPUCustomDevicePlugin(t.TempDir(), 2, 1, 4, 2) - assert.NoError(t, err) - // Pod is already allocated - p5.State.SetAllocationInfo("pod5", "container5", &state.AllocationInfo{ - AllocatedAllocation: state.GPUAllocation{ - GPUMemoryQuantity: 8 * 1024 * 1024 * 1024, - NUMANodes: []int{0}, - }, - TopologyAwareAllocations: map[string]state.GPUAllocation{ - "1": { - GPUMemoryQuantity: 8 * 1024 * 1024 * 1024, - NUMANodes: []int{0}, - }, - }, - }, false) - - req1 := &pluginapi.AssociatedDeviceRequest{ - ResourceRequest: &pluginapi.ResourceRequest{ - ResourceName: string(consts.ResourceGPUMemory), - PodUid: "pod1", - ContainerName: "container1", - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - }, - }, - DeviceRequest: &pluginapi.DeviceRequest{ - DeviceName: p1.DeviceName(), - AvailableDevices: []string{"2", "3"}, - ReusableDevices: []string{"0", "1"}, - DeviceRequest: 2, - }, - } - - req2 := &pluginapi.AssociatedDeviceRequest{ - ResourceRequest: &pluginapi.ResourceRequest{ - ResourceName: string(consts.ResourceGPUMemory), - PodUid: "pod2", - ContainerName: "container2", - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 32 * 1024 * 1024 * 1024, - }, - }, - DeviceRequest: &pluginapi.DeviceRequest{ - DeviceName: p2.DeviceName(), - ReusableDevices: []string{"0"}, - AvailableDevices: []string{"1", "2"}, - DeviceRequest: 2, - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - }, - } - - req3 := &pluginapi.AssociatedDeviceRequest{ - ResourceRequest: &pluginapi.ResourceRequest{ - ResourceName: string(consts.ResourceGPUMemory), - PodUid: "pod3", - ContainerName: "container3", - ResourceRequests: map[string]float64{ - 
string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - }, - }, - DeviceRequest: &pluginapi.DeviceRequest{ - DeviceName: p3.DeviceName(), - AvailableDevices: []string{"0", "1"}, - DeviceRequest: 1, - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - }, - } - - req4 := &pluginapi.AssociatedDeviceRequest{ - ResourceRequest: &pluginapi.ResourceRequest{ - ResourceName: string(consts.ResourceGPUMemory), - PodUid: "pod4", - ContainerName: "container4", - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - }, - }, - DeviceRequest: &pluginapi.DeviceRequest{ - DeviceName: p4.DeviceName(), - AvailableDevices: []string{"0", "1"}, - DeviceRequest: 1, - }, - } - - req5 := &pluginapi.AssociatedDeviceRequest{ - ResourceRequest: &pluginapi.ResourceRequest{ - ResourceName: string(consts.ResourceGPUMemory), - PodUid: "pod5", - ContainerName: "container5", - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 16 * 1024 * 1024 * 1024, - }, - }, - DeviceRequest: &pluginapi.DeviceRequest{ - DeviceName: p5.DeviceName(), - AvailableDevices: []string{"0", "1"}, - DeviceRequest: 1, - }, - } - - testCases := []struct { - name string - plugin *GPUDevicePlugin - req *pluginapi.AssociatedDeviceRequest - expectedErr bool - checkResp func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) - }{ - { - name: "Use all reusable devices first", - plugin: p1, - req: req1, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.Equal(t, []string{"0", "1"}, resp.AllocationResult.GetAllocatedDevices()) - }, - }, - { - name: "Use available devices after reusable devices are used up", - plugin: p2, - req: req2, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.Equal(t, []string{"0", "1"}, resp.AllocationResult.GetAllocatedDevices()) - }, - }, - { - name: "A GPU device is completely used up", - plugin: p3, - req: req3, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.Equal(t, []string{"1"}, resp.AllocationResult.GetAllocatedDevices()) - }, - }, - { - name: "Request has no hints", - plugin: p4, - req: req4, - expectedErr: true, - }, - { - name: "Pod is already allocated in state", - plugin: p5, - req: req5, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.AssociatedDeviceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.Equal(t, []string{"1"}, resp.AllocationResult.GetAllocatedDevices()) - }, - }, - } - - for _, tt := range testCases { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - resp, err := tt.plugin.AllocateAssociatedDevice(tt.req) - - if tt.expectedErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - - if tt.checkResp != nil { - tt.checkResp(t, resp) - } - }) - } -} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go index ddbd8faa0a..7d06363610 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -29,6 +29,6 @@ type CustomDevicePlugin interface { 
UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index cf2ba4a497..1ada043a36 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -18,6 +18,7 @@ package rdma import ( "fmt" + "math" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -28,6 +29,7 @@ import ( gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" @@ -43,6 +45,7 @@ type RDMADevicePlugin struct { func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.RDMADeviceNames) base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider) + base.RegisterDeviceNameToType(base.RDMADeviceNames, gpuconsts.RDMADeviceType) return &RDMADevicePlugin{ BasePlugin: base, @@ -62,8 +65,10 @@ func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.Associa return &pluginapi.AssociatedDeviceHintsResponse{}, nil } +// TODO:accompany resource name == "" +// check if rdma is allocated to other containers, make sure they do not share rdma func (p *RDMADevicePlugin) AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { @@ -87,6 +92,8 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice( "reusableDevices", deviceReq.ReusableDevices, "deviceRequest", deviceReq.DeviceRequest, ) + p.Lock() + defer p.Unlock() // Check if there is state for the device name rdmaAllocationInfo := p.State.GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName) @@ -102,15 +109,6 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice( }, nil } - // Find out the GPU devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of GPU device - gpuAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(resReq.ResourceName), resReq.PodUid, resReq.ContainerName) - if gpuAllocationInfo == nil || gpuAllocationInfo.TopologyAwareAllocations == nil { - err = fmt.Errorf("get allocation info of the resource %s for pod %s/%s, container: %s failed with error: %v", - resReq.ResourceName, resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) - general.Errorf("%s", err.Error()) - return nil, err - } - rdmaTopology, 
numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get gpu device topology: %v", err)
 	}
@@ -119,43 +117,30 @@
 		return nil, fmt.Errorf("gpu device topology is not ready")
 	}
 
-	// For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same
-	// numa nodes as the gpu and allocate them
-	allocatedRdmaDevices := sets.NewString()
-	gpuToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceAffinity(gpuconsts.GPUDeviceType, gpuconsts.RDMADeviceType)
-	if err != nil {
-		general.Warningf("failed to get gpu to rdma affinity map: %v", err)
-		return nil, err
-	}
-
 	hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...)
 	if err != nil {
 		general.Warningf("failed to get hint nodes: %v", err)
 		return nil, err
 	}
 
-	for gpuID := range gpuAllocationInfo.TopologyAwareAllocations {
-		rdmaDevices, ok := gpuToRdmaAffinityMap[gpuID]
-		if !ok {
-			return nil, fmt.Errorf("failed to find rdma devices with gpu id %s", gpuID)
-		}
+	var allocatedRdmaDevices []string
 
-		for _, rdmaID := range rdmaDevices {
-			// Only allocate when the rdma device is a subset of hint nodes
-			isNumaNodeAffinity, err := p.DeviceTopologyRegistry.IsNumaNodeAffinity(gpuconsts.RDMADeviceType, rdmaID, hintNodes)
-			if err != nil {
-				return nil, fmt.Errorf("failed to check numa node affinity of rdma device %s with error: %v", rdmaID, err)
-			}
-			if isNumaNodeAffinity {
-				allocatedRdmaDevices.Insert(rdmaID)
-			}
+	// No accompany resource name
+	if accompanyResourceName == "" {
+		allocatedRdmaDevices, err = p.allocateWithNoAccompanyResource(deviceReq, rdmaTopology, hintNodes)
+		if err != nil {
+			return nil, fmt.Errorf("failed to allocate with no accompany resource: %v", err)
+		}
+	} else {
+		allocatedRdmaDevices, err = p.allocateWithAccompanyResource(deviceReq, resReq, accompanyResourceName)
+		if err != nil {
+			return nil, fmt.Errorf("failed to allocate with accompany resource: %v", err)
 		}
 	}
 
 	// Modify rdma state
 	topologyAwareAllocations := make(map[string]state.Allocation)
-	allocatedRdmaDevicesRes := allocatedRdmaDevices.List()
-	for _, deviceID := range allocatedRdmaDevicesRes {
+	for _, deviceID := range allocatedRdmaDevices {
 		info, ok := rdmaTopology.Devices[deviceID]
 		if !ok {
 			return nil, fmt.Errorf("failed to get rdma info for device %s", deviceID)
@@ -189,11 +174,134 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
 		"podName", resReq.PodName,
 		"containerName", resReq.ContainerName,
 		"qosLevel", qosLevel,
-		"allocatedDevices", allocatedRdmaDevicesRes)
+		"allocatedRdmaDevices", allocatedRdmaDevices)
 
 	return &pluginapi.AssociatedDeviceAllocationResponse{
 		AllocationResult: &pluginapi.AssociatedDeviceAllocation{
-			AllocatedDevices: allocatedRdmaDevicesRes,
+			AllocatedDevices: allocatedRdmaDevices,
 		},
 	}, nil
 }
+
+// allocateWithNoAccompanyResource allocates the rdma devices on a best-effort basis, making sure that
+// they fit the hint nodes.
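+//
+// Illustrative sketch (not part of the original change; device names and
+// topology are assumed): with hint nodes {0}, no reusable devices, and an
+// RDMA topology where "rdma0" sits on NUMA node 0 and "rdma1" on NUMA node 1,
+// a request for one device picks "rdma0", the only candidate whose NUMA nodes
+// are a subset of the hint nodes and whose machine-state entry still has
+// capacity:
+//
+//	devices, err := p.allocateWithNoAccompanyResource(
+//		&pluginapi.DeviceRequest{DeviceRequest: 1, AvailableDevices: []string{"rdma0", "rdma1"}},
+//		rdmaTopology, machine.NewCPUSet(0),
+//	)
+//	// devices == []string{"rdma0"} on success.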
+func (p *RDMADevicePlugin) allocateWithNoAccompanyResource( + deviceReq *pluginapi.DeviceRequest, rdmaTopology *machine.DeviceTopology, hintNodes machine.CPUSet, +) ([]string, error) { + reqQuantity := deviceReq.GetDeviceRequest() + + machineState, ok := p.State.GetMachineState()[gpuconsts.RDMADeviceType] + if !ok { + return nil, fmt.Errorf("no machine state for resource %s", gpuconsts.RDMADeviceType) + } + + allocatedDevices := sets.NewString() + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + if allocatedDevices.Len() >= int(reqQuantity) { + return true + } + } + return false + } + + availableDevices := deviceReq.GetAvailableDevices() + reusableDevices := deviceReq.GetReusableDevices() + + // allocate reusable devices first + allocated := allocateDevices(reusableDevices...) + if allocated { + return allocatedDevices.UnsortedList(), nil + } + + for _, device := range availableDevices { + if !gpuutil.IsNUMAAffinityDevice(device, rdmaTopology, hintNodes) { + continue + } + + if !machineState.IsRequestSatisfied(device, 1, 1) { + general.Infof("available numa affinity rdma %s is already allocated", device) + continue + } + + if allocateDevices(device) { + return allocatedDevices.UnsortedList(), nil + } + } + + return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices)) +} + +// TODO: first allocate to reusable devices, then allocate to available devices proportionally based on numa affinity +func (p *RDMADevicePlugin) allocateWithAccompanyResource( + deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string, +) ([]string, error) { + var err error + + // Find out the accompany devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of accompany device + accompanyDeviceType, err := p.GetResourceTypeFromDeviceName(accompanyResourceName) + if err != nil { + return nil, fmt.Errorf("failed to get device type for accompany resource %s: %v", accompanyResourceName, err) + } + + accompanyAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName) + if accompanyAllocationInfo == nil || accompanyAllocationInfo.TopologyAwareAllocations == nil { + err = fmt.Errorf("get allocation info of the resource %s for pod %s/%s, container: %s failed with error: %v", + resReq.ResourceName, resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + // Get ratio of accompany resource to target device + accompanyResourceToTargetDeviceRatio := p.State.GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyResourceName, deviceReq.DeviceName) + + // Allocate target device according to ratio of accompany resource to target device + podResourceEntries := p.State.GetPodResourceEntries() + totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName) + + rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) * accompanyResourceToTargetDeviceRatio)) + + // For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same + // numa nodes as the gpu and allocate them + accompanyResourceToRdmaAffinityMap, err := 
p.DeviceTopologyRegistry.GetDeviceAffinity(accompanyResourceName, deviceReq.DeviceName) + if err != nil { + general.Warningf("failed to get gpu to rdma affinity map: %v", err) + return nil, err + } + + machineState := p.State.GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)] + + allocatedDevices := sets.NewString() + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + if allocatedDevices.Len() >= rdmaToBeAllocated { + return true + } + } + return false + } + + for accompanyResourceId := range accompanyResourceIds { + rdmaDevices, ok := accompanyResourceToRdmaAffinityMap[accompanyResourceId] + if !ok { + general.Warningf("failed to get rdma device with accompany device id: %s", accompanyResourceId) + continue + } + + // Iterate through the rdma devices and check if they are already allocated + for _, rdmaDevice := range rdmaDevices { + if !machineState.IsRequestSatisfied(rdmaDevice, 1, 1) { + continue + } + + if allocateDevices(rdmaDevice) { + return allocatedDevices.UnsortedList(), nil + } + } + } + + // Did not find enough available rdma devices to allocate, return the devices that are already allocated + return allocatedDevices.UnsortedList(), nil +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 45cf3a7a91..d2bdbae590 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -386,34 +386,36 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca }, nil } -func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) { - existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) +func (p *GPUMemPlugin) Allocate( + resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, +) (*pluginapi.ResourceAllocationResponse, error) { + existReallocAnno, isReallocation := util.IsReallocation(resourceReq.Annotations) - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, err) + resourceReq.PodNamespace, resourceReq.PodName, resourceReq.ContainerName, err) general.Errorf("%s", err.Error()) return nil, err } - _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + _, gpuMemory, err := util.GetQuantityFromResourceRequests(resourceReq.ResourceRequests, p.ResourceName(), false) if err != nil { return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := p.getGPUCount(req) + gpuCount, gpuNames, err := p.getGPUCount(resourceReq) if err != nil { - general.Errorf("getGPUCount failed from req %v with error: %v", req, err) + general.Errorf("getGPUCount failed from req %v with error: %v", resourceReq, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) } general.InfoS("called", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + 
"containerName", resourceReq.ContainerName, "qosLevel", qosLevel, - "reqAnnotations", req.Annotations, + "reqAnnotations", resourceReq.Annotations, "gpuMemory", gpuMemory, "gpuNames", gpuNames.List(), "gpuCount", gpuCount) @@ -421,7 +423,7 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso p.Lock() defer func() { if err := p.State.StoreState(); err != nil { - general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) + general.ErrorS(err, "store state failed", "podName", resourceReq.PodName, "containerName", resourceReq.ContainerName) } p.Unlock() if err != nil { @@ -436,61 +438,68 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso }() emptyResponse := &pluginapi.ResourceAllocationResponse{ - PodUid: req.PodUid, - PodNamespace: req.PodNamespace, - PodName: req.PodName, - ContainerName: req.ContainerName, - ContainerType: req.ContainerType, - ContainerIndex: req.ContainerIndex, - PodRole: req.PodRole, - PodType: req.PodType, + PodUid: resourceReq.PodUid, + PodNamespace: resourceReq.PodNamespace, + PodName: resourceReq.PodName, + ContainerName: resourceReq.ContainerName, + ContainerType: resourceReq.ContainerType, + ContainerIndex: resourceReq.ContainerIndex, + PodRole: resourceReq.PodRole, + PodType: resourceReq.PodType, ResourceName: p.ResourceName(), - Labels: general.DeepCopyMap(req.Labels), - Annotations: general.DeepCopyMap(req.Annotations), + Labels: general.DeepCopyMap(resourceReq.Labels), + Annotations: general.DeepCopyMap(resourceReq.Annotations), } // currently, not to deal with init containers - if req.ContainerType == pluginapi.ContainerType_INIT { + if resourceReq.ContainerType == pluginapi.ContainerType_INIT { return emptyResponse, nil - } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { + } else if resourceReq.ContainerType == pluginapi.ContainerType_SIDECAR { // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating - return p.PackAllocationResponse(req, &state.AllocationInfo{}, nil, p.ResourceName()) + return p.PackAllocationResponse(resourceReq, &state.AllocationInfo{}, nil, p.ResourceName()) } - allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName) if allocationInfo != nil { - resp, packErr := p.PackAllocationResponse(req, allocationInfo, nil, p.ResourceName()) + resp, packErr := p.PackAllocationResponse(resourceReq, allocationInfo, nil, p.ResourceName()) if packErr != nil { general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", - req.PodNamespace, req.PodName, req.ContainerName, packErr) - return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr) + resourceReq.PodNamespace, resourceReq.PodName, resourceReq.ContainerName, packErr) + return nil, fmt.Errorf("packAllocationResponse failed with error: %w", packErr) } return resp, nil } // get hint nodes from request - hintNodes, err := machine.NewCPUSetUint64(req.GetHint().GetNodes()...) + hintNodes, err := machine.NewCPUSetUint64(resourceReq.GetHint().GetNodes()...) 
if err != nil { general.Warningf("failed to get hint nodes: %v", err) - return nil, fmt.Errorf("failed to get hint nodes: %v", err) + return nil, fmt.Errorf("failed to get hint nodes: %w", err) } newAllocation := &state.AllocationInfo{ - AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(req, commonstate.EmptyOwnerPoolName, qosLevel), + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resourceReq, commonstate.EmptyOwnerPoolName, qosLevel), AllocatedAllocation: state.Allocation{ Quantity: gpuMemory, NUMANodes: hintNodes.ToSliceInt(), }, } - p.State.SetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName, newAllocation, false) + // Allocate for gpu devices + if deviceReq != nil { + if err = p.allocateForGPUDevice(newAllocation, deviceReq, gpuMemory); err != nil { + return nil, fmt.Errorf("allocateForGPUDevice failed with error: %w", err) + } + } + + p.State.SetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName, newAllocation, false) machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) if stateErr != nil { general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", - "podNamespace", req.PodNamespace, - "podName", req.PodName, - "containerName", req.ContainerName, + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName, "gpuMemory", gpuMemory) return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr) } @@ -498,7 +507,49 @@ func (p *GPUMemPlugin) Allocate(req *pluginapi.ResourceRequest) (*pluginapi.Reso // update state cache p.State.SetMachineState(machineState, true) - return p.PackAllocationResponse(req, newAllocation, nil, p.ResourceName()) + return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) +} + +// allocateForGPUDevice is called when there is a GPU device request, which means we need to allocate for GPUs as well, +// so we modify the topologyAwareAllocations to include the GPU devices. +func (p *GPUMemPlugin) allocateForGPUDevice( + allocationInfo *state.AllocationInfo, deviceReq *pluginapi.DeviceRequest, memoryRequest float64, +) error { + hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) 
+	if err != nil {
+		general.Warningf("failed to get hint nodes: %v", err)
+		return fmt.Errorf("failed to get hint nodes: %v", err)
+	}
+
+	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
+	if err != nil {
+		return fmt.Errorf("failed to get gpu topology: %v", err)
+	}
+
+	if !numaTopologyReady {
+		return fmt.Errorf("numa topology is not ready")
+	}
+
+	allocatedDevices, allocatedGPUMemory, err := p.calculateAssociatedGPUDevices(gpuTopology, memoryRequest, hintNodes, deviceReq)
+	if err != nil {
+		general.Warningf("failed to allocate associated devices: %v", err)
+		return err
+	}
+
+	memoryTopologyAwareAllocations := make(map[string]state.Allocation)
+	for _, device := range allocatedDevices {
+		info, ok := gpuTopology.Devices[device]
+		if !ok {
+			return fmt.Errorf("failed to get gpu info for device: %s", device)
+		}
+
+		memoryTopologyAwareAllocations[device] = state.Allocation{
+			Quantity:  allocatedGPUMemory[device],
+			NUMANodes: info.GetNUMANode(),
+		}
+	}
+	allocationInfo.TopologyAwareAllocations = memoryTopologyAwareAllocations
+	return nil
+}
 
 func (p *GPUMemPlugin) getGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) {
@@ -514,3 +565,84 @@ func (p *GPUMemPlugin) getGPUCount(req *pluginapi.ResourceRequest) (float64, set
 	}
 	return gpuCount, gpuNames, nil
 }
+
+func (p *GPUMemPlugin) calculateAssociatedGPUDevices(
+	gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet,
+	request *pluginapi.DeviceRequest,
+) ([]string, map[string]float64, error) {
+	gpuRequest := request.GetDeviceRequest()
+	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
+
+	machineState, ok := p.State.GetMachineState()[consts.ResourceGPUMemory]
+	if !ok {
+		return nil, nil, fmt.Errorf("no machine state for resource %s", consts.ResourceGPUMemory)
+	}
+
+	allocatedDevices := sets.NewString()
+	needed := gpuRequest
+	allocateDevices := func(devices ...string) bool {
+		for _, device := range devices {
+			allocatedDevices.Insert(device)
+			needed--
+			if needed == 0 {
+				return true
+			}
+		}
+		return false
+	}
+
+	allocatedGPUMemory := func(devices ...string) map[string]float64 {
+		memory := make(map[string]float64)
+		for _, device := range devices {
+			memory[device] = gpuMemoryPerGPU
+		}
+		return memory
+	}
+
+	availableDevices := request.GetAvailableDevices()
+	reusableDevices := request.GetReusableDevices()
+
+	// allocate must-include (reusable) devices first
+	for _, device := range reusableDevices {
+		if allocatedDevices.Has(device) {
+			continue
+		}
+		if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) {
+			general.Warningf("must include gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU)
+		}
+		allocateDevices(device)
+	}
+
+	// if allocated devices is enough, return immediately
+	if allocatedDevices.Len() >= int(gpuRequest) {
+		return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil
+	}
+
+	sort.SliceStable(availableDevices, func(i, j int) bool {
+		return machineState.GetQuantityAllocated(availableDevices[i]) > machineState.GetQuantityAllocated(availableDevices[j])
+	})
+
+	// second allocate available and numa-affinity gpus
+	for _, device := range availableDevices {
+		if allocatedDevices.Has(device) {
+			continue
+		}
+
+		if 
!gpuutil.IsNUMAAffinityDevice(device, gpuTopology, hintNodes) { + continue + } + + if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) { + general.Infof("available numa affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", + device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) + continue + } + + if allocateDevices(device) { + return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil + } + } + + return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, number of needed GPUs: %d, availableDevices len: %d, allocatedDevices len: %d", gpuRequest, len(availableDevices), len(allocatedDevices)) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go deleted file mode 100644 index 868cf60d63..0000000000 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem_test.go +++ /dev/null @@ -1,351 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package gpumemory - -import ( - "fmt" - "testing" - - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" - - "github.com/stretchr/testify/assert" - - "github.com/kubewharf/katalyst-api/pkg/consts" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - "github.com/kubewharf/katalyst-core/pkg/util/machine" -) - -func generateTestGPUMemPlugin(stateDirectory string, numGPUs, numNUMAs, numCPUs, socketNum int) (*GPUMemPlugin, error) { - gpuTopology, err := machine.GenerateDummyGPUTopology(numGPUs, numNUMAs) - if err != nil { - return nil, fmt.Errorf("GenerateDummyGPUTopology failed: %v", err) - } - providerStub := machine.NewGPUTopologyProviderStub(gpuTopology) - - basePlugin, err := baseplugin.GenerateTestBasePlugin(stateDirectory, providerStub, numCPUs, numNUMAs, socketNum) - if err != nil { - return nil, fmt.Errorf("generateTestBasePlugin failed: %v", err) - } - - p := NewGPUMemPlugin(basePlugin) - - machineState := make(state.GPUMap) - // Initialize machine state to not have any gpu memory allocated - for i := 0; i < numGPUs; i++ { - machineState[fmt.Sprintf("%d", i)] = &state.GPUState{ - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 0, - } - } - - gpuMemPlugin := p.(*GPUMemPlugin) - gpuMemPlugin.State.SetMachineState(machineState, true) - - return gpuMemPlugin, nil -} - -func TestGPUMemPlugin_GetTopologyHints(t *testing.T) { - t.Parallel() - - req1 := &pluginapi.ResourceRequest{ - PodUid: "pod1", - PodNamespace: "default", - PodName: "pod-1", - ContainerName: "container-1", - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceRequests: map[string]float64{ - 
string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - "nvidia.com/gpu": 2, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - }, - } - - req2 := &pluginapi.ResourceRequest{ - PodUid: "pod2", - PodNamespace: "default", - PodName: "pod-2", - ContainerName: "container-2", - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - "nvidia.com/gpu": 2, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - }, - } - - reqWithInvalidQoS := &pluginapi.ResourceRequest{ - PodUid: "pod3", - PodNamespace: "default", - PodName: "pod-3", - ContainerName: "container-3", - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - Annotations: map[string]string{}, - } - - p1, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) - assert.NoError(t, err) - - p2, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) - assert.NoError(t, err) - - // pod1 is already allocated to numa node 1 - p2.State.SetAllocationInfo("pod1", "container-1", &state.AllocationInfo{ - AllocationMeta: commonstate.AllocationMeta{ - PodUid: "pod1", - PodNamespace: "default", - PodName: "pod-1", - ContainerName: "container-1", - }, - AllocatedAllocation: state.GPUAllocation{ - NUMANodes: []int{1}, - }, - }, false) - - p3, err := generateTestGPUMemPlugin(t.TempDir(), 2, 1, 4, 1) - assert.NoError(t, err) - - // All GPU memory have been used up - p3.State.SetMachineState(map[string]*state.GPUState{ - "0": { - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 16 * 1024 * 1024 * 1024, - }, - "1": { - GPUMemoryAllocatable: 16 * 1024 * 1024 * 1024, - GPUMemoryAllocated: 16 * 1024 * 1024 * 1024, - }, - }, false) - - testCases := []struct { - name string - plugin *GPUMemPlugin - req *pluginapi.ResourceRequest - expectedErr bool - checkResp func(t *testing.T, resp *pluginapi.ResourceHintsResponse) - }{ - { - name: "normal case", - plugin: p1, - req: req1, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.ResourceHints) - assert.Contains(t, resp.ResourceHints, p1.ResourceName()) - hints := resp.ResourceHints[p1.ResourceName()] - assert.NotNil(t, hints) - assert.Equal(t, 3, len(hints.Hints)) - }, - }, - { - name: "get hints for another pod", - plugin: p1, - req: req2, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.ResourceHints) - assert.Contains(t, resp.ResourceHints, p1.ResourceName()) - hints := resp.ResourceHints[p1.ResourceName()] - assert.NotNil(t, hints) - assert.Equal(t, 3, len(hints.Hints)) - }, - }, - { - name: "regenerate hints for existing pod", - plugin: p2, - req: req1, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.ResourceHintsResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.ResourceHints) - assert.Contains(t, resp.ResourceHints, p2.ResourceName()) - hints := resp.ResourceHints[p2.ResourceName()] - assert.NotNil(t, hints) - assert.Equal(t, 1, len(hints.Hints)) - assert.Equal(t, hints.Hints[0].Nodes, []uint64{1}) - assert.Equal(t, hints.Hints[0].Preferred, true) - }, - }, - { - name: "insufficient gpu memory", - plugin: p3, - req: req1, - expectedErr: true, - checkResp: nil, - }, - { - name: "request with invalid QoS", - plugin: p1, - 
req: reqWithInvalidQoS, - expectedErr: true, - checkResp: nil, - }, - } - - for _, tt := range testCases { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - resp, err := tt.plugin.GetTopologyHints(tt.req) - fmt.Println(resp) - - if tt.expectedErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - - if tt.checkResp != nil { - tt.checkResp(t, resp) - } - }) - } -} - -func TestGPUMemPlugin_Allocate(t *testing.T) { - t.Parallel() - - req1 := &pluginapi.ResourceRequest{ - PodUid: "pod1", - PodNamespace: "default", - PodName: "pod-1", - ContainerName: "container-1", - ContainerType: pluginapi.ContainerType_MAIN, - ContainerIndex: 0, - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - "nvidia.com/gpu": 1, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - }, - Hint: &pluginapi.TopologyHint{ - Nodes: []uint64{0}, - Preferred: true, - }, - } - - reqForInitContainer := &pluginapi.ResourceRequest{ - PodUid: "pod2", - PodNamespace: "default", - PodName: "pod-2", - ContainerName: "container-2", - ContainerType: pluginapi.ContainerType_INIT, - ContainerIndex: 0, - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - "nvidia.com/gpu": 1, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - }, - } - - reqForSidecarContainer := &pluginapi.ResourceRequest{ - PodUid: "pod3", - PodNamespace: "default", - PodName: "pod-3", - ContainerName: "container-3", - ContainerType: pluginapi.ContainerType_SIDECAR, - ContainerIndex: 0, - ResourceRequests: map[string]float64{ - string(consts.ResourceGPUMemory): 8 * 1024 * 1024 * 1024, - "nvidia.com/gpu": 1, - }, - Annotations: map[string]string{ - consts.PodAnnotationQoSLevelKey: consts.PodAnnotationQoSLevelSharedCores, - }, - } - - p1, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) - assert.NoError(t, err) - - p2, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) - assert.NoError(t, err) - - p3, err := generateTestGPUMemPlugin(t.TempDir(), 4, 2, 8, 2) - assert.NoError(t, err) - - testCases := []struct { - name string - plugin *GPUMemPlugin - req *pluginapi.ResourceRequest - expectedErr bool - checkResp func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) - }{ - { - name: "normal case", - plugin: p1, - req: req1, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.NotNil(t, resp.AllocationResult.ResourceAllocation) - assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].AllocatedQuantity, float64(8*1024*1024*1024)) - assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].ResourceHints.Hints[0], req1.Hint) - }, - }, - { - name: "allocate for sidecar container", - plugin: p2, - req: reqForSidecarContainer, - expectedErr: false, - checkResp: func(t *testing.T, resp *pluginapi.ResourceAllocationResponse) { - assert.NotNil(t, resp) - assert.NotNil(t, resp.AllocationResult) - assert.NotNil(t, resp.AllocationResult.ResourceAllocation) - assert.Equal(t, resp.AllocationResult.ResourceAllocation[string(consts.ResourceGPUMemory)].AllocatedQuantity, float64(0)) - }, - }, - { - name: "allocate for init containers", - plugin: p3, - req: reqForInitContainer, - expectedErr: false, - checkResp: func(t *testing.T, 
resp *pluginapi.ResourceAllocationResponse) { - assert.NotNil(t, resp) - assert.Nil(t, resp.AllocationResult) - }, - }, - } - for _, tt := range testCases { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - resp, err := tt.plugin.Allocate(tt.req) - if tt.expectedErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - } - if tt.checkResp != nil { - tt.checkResp(t, resp) - } - }) - } -} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go index 694359337c..ba76ac6351 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -32,5 +32,7 @@ type ResourcePlugin interface { GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) - Allocate(*pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) + Allocate( + resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + ) (*pluginapi.ResourceAllocationResponse, error) } diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go index 6e1d033d2a..2fbb00f1ce 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -20,6 +20,7 @@ import ( "encoding/json" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" @@ -183,6 +184,24 @@ func (pre PodResourceEntries) RemovePod(podUID string) { } } +// GetTotalAllocatedResourceOfContainer returns the total allocated resource quantity of a container together with +// the specific resource IDs that are allocated. +func (pre PodResourceEntries) GetTotalAllocatedResourceOfContainer( + resourceName v1.ResourceName, podUID, containerName string, +) (int, sets.String) { + if podEntries, ok := pre[resourceName]; ok { + if allocationInfo := podEntries.GetAllocationInfo(podUID, containerName); allocationInfo != nil { + totalAllocationQuantity := int(allocationInfo.AllocatedAllocation.Quantity) + allocationIDs := sets.NewString() + for id := range allocationInfo.TopologyAwareAllocations { + allocationIDs.Insert(id) + } + return totalAllocationQuantity, allocationIDs + } + } + return 0, nil +} + func (as *AllocationState) String() string { if as == nil { return "" @@ -255,6 +274,24 @@ func (arm AllocationResourcesMap) Clone() AllocationResourcesMap { return clone } +// GetRatioOfAccompanyResourceToTargetResource returns the ratio of total accompany resource to total target resource. +// For example, if the total number of accompany resource is 4 and the total number of target resource is 2, +// the ratio is 2. 
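+//
+// Illustrative sketch (resource names are assumed, not part of this change):
+// with 4 "nvidia.com/gpu" entries and 2 "bytedance.com/rdma" entries recorded
+// in the machine state,
+//
+//	ratio := machineState.GetRatioOfAccompanyResourceToTargetResource("nvidia.com/gpu", "bytedance.com/rdma")
+//	// ratio == 2.0; the RDMA plugin then sizes a container's target allocation
+//	// as int(math.Ceil(float64(allocatedAccompanyCount) * ratio)).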
+func (arm AllocationResourcesMap) GetRatioOfAccompanyResourceToTargetResource(accompanyResourceName, targetResourceName string) float64 { + // Find the ratio of the total number of accompany resource to the total number of target resource + accompanyResourceMap := arm[v1.ResourceName(accompanyResourceName)].Clone() + accompanyResourceNumber := accompanyResourceMap.getNumberDevices() + + targetResourceMap := arm[v1.ResourceName(targetResourceName)].Clone() + targetResourceNumber := targetResourceMap.getNumberDevices() + + if targetResourceNumber == 0 { + return 0 + } + + return float64(accompanyResourceNumber) / float64(targetResourceNumber) +} + func (as *AllocationState) GetQuantityAllocated() float64 { if as == nil { return 0 @@ -277,7 +314,7 @@ func (am AllocationMap) GetQuantityAllocated(id string) float64 { return am[id].GetQuantityAllocated() } -func (am AllocationMap) IsGPURequestSatisfied(id string, request float64, allocatable float64) bool { +func (am AllocationMap) IsRequestSatisfied(id string, request float64, allocatable float64) bool { if am == nil { return false } @@ -286,6 +323,14 @@ func (am AllocationMap) IsGPURequestSatisfied(id string, request float64, alloca return allocatable-allocated >= request } +func (am AllocationMap) getNumberDevices() int { + if am == nil { + return 0 + } + + return len(am) +} + // reader is used to get information from local states type reader interface { GetMachineState() AllocationResourcesMap diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index f7726509f0..55cabba419 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -49,7 +49,7 @@ import ( // StaticPolicy is the static gpu policy type StaticPolicy struct { - sync.Mutex + sync.RWMutex pluginapi.UnimplementedResourcePluginServer *baseplugin.BasePlugin @@ -191,6 +191,9 @@ func (p *StaticPolicy) GetTopologyHints( return nil, fmt.Errorf("GetTopologyHints got nil req") } + p.RLock() + defer p.RUnlock() + resourcePlugin := p.getResourcePlugin(req.ResourceName) if resourcePlugin == nil { return nil, fmt.Errorf("failed to find resource plugin by name %s", req.ResourceName) @@ -245,6 +248,9 @@ func (p *StaticPolicy) GetTopologyAwareResources( return nil, fmt.Errorf("GetTopologyAwareResources got nil req") } + p.RLock() + defer p.RUnlock() + // Get topology aware resources for all resource plugins allocatedResourcesList := make([]*pluginapi.GetTopologyAwareResourcesResponse, 0) for _, resourcePlugin := range p.resourcePlugins { @@ -322,8 +328,8 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources( return nil, fmt.Errorf("GetTopologyAwareAllocatableResources got nil req") } - p.Lock() - defer p.Unlock() + p.RLock() + defer p.RUnlock() // Get topology aware allocatable resources for all resource plugins allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource) @@ -356,6 +362,7 @@ func (p *StaticPolicy) GetResourcePluginOptions( // Allocate is called during pod admit so that the resource // plugin can allocate corresponding resource for the container // according to resource request +// TODO: Clean state if fail func (p *StaticPolicy) Allocate( _ context.Context, req *pluginapi.ResourceRequest, @@ -364,12 +371,21 @@ func (p *StaticPolicy) Allocate( return nil, fmt.Errorf("GetTopologyHints got nil req") } + p.Lock() + defer func() { + if err != nil { + _ = p.removeContainer(req.PodUid, req.ContainerName, 
v1.ResourceName(req.ResourceName))
+		}
+		p.Unlock()
+	}()
+
 	resourcePlugin := p.getResourcePlugin(req.ResourceName)
 	if resourcePlugin == nil {
 		return nil, fmt.Errorf("failed to get resource plugin by name %s", req.ResourceName)
 	}
 
-	return resourcePlugin.Allocate(req)
+	resp, err = resourcePlugin.Allocate(req, nil)
+	return resp, err
 }
 
 // AllocateForPod is called during pod admit so that the resource
@@ -393,8 +409,44 @@ func (p *StaticPolicy) PreStartContainer(
 }
 
 func (p *StaticPolicy) removePod(podUID string) error {
+	return p.removeWithUpdate(podUID, func(podResourceEntries state.PodResourceEntries) bool {
+		found := false
+		for _, podEntries := range podResourceEntries {
+			if podEntries[podUID] != nil {
+				found = true
+			}
+			delete(podEntries, podUID)
+		}
+		return found
+	})
+}
+
+// removeContainer removes the container entry for the given podUID, container name and resource name.
+func (p *StaticPolicy) removeContainer(podUID, containerName string, resourceName v1.ResourceName) error {
+	return p.removeWithUpdate(podUID, func(podResourceEntries state.PodResourceEntries) bool {
+		found := false
+		for resource, podEntries := range podResourceEntries {
+			if resourceName != resource {
+				continue
+			}
+			if podEntries[podUID][containerName] != nil {
+				found = true
+			}
+			delete(podEntries[podUID], containerName)
+		}
+		return found
+	})
+}
+
+func (p *StaticPolicy) removeWithUpdate(
+	podUID string, removeFn func(podResourceEntries state.PodResourceEntries) bool,
+) error {
 	podResourceEntries := p.State.GetPodResourceEntries()
-	delete(podResourceEntries[consts.ResourceGPUMemory], podUID)
+
+	found := removeFn(podResourceEntries)
+	if !found {
+		return nil
+	}
 
 	machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry)
 	if err != nil {
@@ -405,11 +457,11 @@ func (p *StaticPolicy) removePod(podUID string) error {
 	p.State.SetPodResourceEntries(podResourceEntries, false)
 	p.State.SetMachineState(machineState, false)
 
-	err = p.State.StoreState()
-	if err != nil {
+	if err := p.State.StoreState(); err != nil {
 		general.Errorf("store state failed with error: %v", err)
 		return err
 	}
+
 	return nil
 }
 
@@ -528,7 +580,9 @@ func (*StaticPolicy) GetAssociatedDeviceTopologyHints(
 // 2. 
Find the custom device plugin that corresponds to the deviceName and allocate func (p *StaticPolicy) AllocateAssociatedDevice( _ context.Context, req *pluginapi.AssociatedDeviceRequest, -) (*pluginapi.AssociatedDeviceAllocationResponse, error) { +) (resp *pluginapi.AssociatedDeviceAllocationResponse, respErr error) { + var isAccompanyResourcePlugin bool + var isAccompanyCustomDevicePlugin bool if req == nil || req.ResourceRequest == nil || req.DeviceRequest == nil { return nil, fmt.Errorf("req is nil") } @@ -540,16 +594,50 @@ func (p *StaticPolicy) AllocateAssociatedDevice( }, nil } + p.Lock() + defer func() { + // Reset state for accompany resource and target resource if there is an error + if respErr != nil { + if isAccompanyResourcePlugin { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(req.AccompanyResourceName)) + } + if isAccompanyCustomDevicePlugin { + accompanyDeviceType, _ := p.GetResourceTypeFromDeviceName(req.AccompanyResourceName) + if accompanyDeviceType != "" { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(accompanyDeviceType)) + } + } + deviceType, _ := p.GetResourceTypeFromDeviceName(req.DeviceName) + if deviceType != "" { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(deviceType)) + } + } + p.Unlock() + }() + + // Find the target device that we want to allocate for + var targetDeviceReq *pluginapi.DeviceRequest + for _, deviceRequest := range req.DeviceRequest { + if deviceRequest.DeviceName == req.DeviceName { + targetDeviceReq = deviceRequest + } + } + + if targetDeviceReq == nil { + return nil, fmt.Errorf("no target device plugin found for target device %s", req.DeviceName) + } + // Allocate accompany resource - // Check if resource plugin for the resource exists; if it does, allocate it first + // Check if accompany resource maps to a resource plugin; if it does, allocate it first accompanyResourcePlugin := p.getResourcePlugin(req.AccompanyResourceName) if accompanyResourcePlugin != nil { - _, err := accompanyResourcePlugin.Allocate(req.ResourceRequest) + _, err := accompanyResourcePlugin.Allocate(req.ResourceRequest, targetDeviceReq) if err != nil { return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", req.AccompanyResourceName, err) } + isAccompanyResourcePlugin = true } else { - // Allocate for custom device plugin + // Accompany resource maps to a custom device plugin; allocate for it accompanyCustomDevicePlugin := p.getCustomDevicePlugin(req.AccompanyResourceName) if accompanyCustomDevicePlugin != nil { // Get device request for accompany device @@ -564,10 +652,11 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("nil accompany device request") } - _, err := accompanyCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, accompanyDeviceReq) + _, err := accompanyCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, accompanyDeviceReq, "") if err != nil { return nil, fmt.Errorf("AllocateAssociatedDevice accompany resource %s failed with error: %v", req.AccompanyResourceName, err) } + isAccompanyCustomDevicePlugin = true } } @@ -577,23 +666,11 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("no custom device plugin found for target device %s", req.DeviceName) } - var targetDeviceReq *pluginapi.DeviceRequest - for _, deviceRequest := range req.DeviceRequest { - if deviceRequest.DeviceName == req.DeviceName 
{ - targetDeviceReq = deviceRequest - } - } - - if targetDeviceReq == nil { - return nil, fmt.Errorf("no target device plugin found for target device %s", req.DeviceName) - } - - return targetCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, targetDeviceReq) + resp, respErr = targetCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) + return resp, respErr } func (p *StaticPolicy) registerResourcePlugins() error { - p.Lock() - defer p.Unlock() for name := range p.resourcePluginsNames { initFunc := resourcepluginregistry.ResourcePluginsMap[name] resourcePlugin := initFunc(p.BasePlugin) @@ -603,8 +680,6 @@ func (p *StaticPolicy) registerResourcePlugins() error { } func (p *StaticPolicy) registerCustomDevicePlugins() error { - p.Lock() - defer p.Unlock() for name := range p.associatedDeviceNames { initFunc := devicepluginregistry.CustomDevicePluginsMap[name] customDevicePlugin := initFunc(p.BasePlugin) @@ -617,15 +692,11 @@ func (p *StaticPolicy) registerCustomDevicePlugins() error { } func (p *StaticPolicy) getResourcePlugin(resourceName string) resourceplugin.ResourcePlugin { - p.Lock() - defer p.Unlock() resourcePlugin := p.resourcePlugins[resourceName] return resourcePlugin } func (p *StaticPolicy) getCustomDevicePlugin(deviceName string) customdeviceplugin.CustomDevicePlugin { - p.Lock() - defer p.Unlock() customDevicePlugin := p.customDevicePlugins[deviceName] return customDevicePlugin } diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go index acd1e99772..cfde820216 100644 --- a/pkg/agent/qrm-plugins/gpu/util/util.go +++ b/pkg/agent/qrm-plugins/gpu/util/util.go @@ -56,3 +56,15 @@ func GetNUMANodesCountToFitGPUReq( gpusCountNeededPerNUMA := int(math.Ceil(gpuReq / float64(numaCountNeeded))) return numaCountNeeded, gpusCountNeededPerNUMA, nil } + +func IsNUMAAffinityDevice( + device string, deviceTopology *machine.DeviceTopology, hintNodes machine.CPUSet, +) bool { + info, ok := deviceTopology.Devices[device] + if !ok { + general.Errorf("failed to find device info for device %s", device) + return false + } + + return machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) +} diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index b062703ad9..b3f643f80a 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -31,6 +31,7 @@ type DeviceTopologyProvider interface { SetDeviceTopology(*DeviceTopology) error } +// DeviceTopologyRegistry is a registry of all topology providers that knows how to provide topology information of machine devices type DeviceTopologyRegistry struct { // DeviceNameToProvider is a mapping of device name to their respective topology provider DeviceNameToProvider map[string]DeviceTopologyProvider @@ -68,7 +69,7 @@ func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTo return provider.GetDeviceTopology() } -// GetDeviceAffinity retrieves a map of a certain device to the list of devices that it has an affinity with +// GetDeviceAffinity retrieves a map of a certain device to the list of devices that it has an affinity with. 
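+// The key and value arguments are device names registered in this registry (for example, a GPU device name and an RDMA device name).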
// A device is considered to have an affinity with another device if they are on the exact same NUMA node(s) func (r *DeviceTopologyRegistry) GetDeviceAffinity(key, value string) (map[string][]string, error) { deviceTopologyKey, numaReady, err := r.GetDeviceTopology(key) @@ -183,6 +184,7 @@ func getEmptyDeviceTopology() *DeviceTopology { } } +// TODO: change to return map[resourceName]DeviceTopology func initDeviceTopology(resourceNames []string) (*DeviceTopology, error) { deviceTopology := getEmptyDeviceTopology() From e6e947441009c1684ef8152578acc17bace949ad Mon Sep 17 00:00:00 2001 From: JustinChengLZ Date: Tue, 21 Oct 2025 11:20:31 +0800 Subject: [PATCH 17/52] Update gpu_plugin.go --- cmd/katalyst-agent/app/options/qrm/gpu_plugin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go index 90ed07f688..8e5dc4aa77 100644 --- a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go @@ -37,7 +37,7 @@ func NewGPUOptions() *GPUOptions { PolicyName: "static", GPUDeviceNames: []string{"nvidia.com/gpu"}, GPUMemoryAllocatablePerGPU: "100", - RDMADeviceNames: []string{"bytedance.com/rdma"}, + RDMADeviceNames: []string{}, } } From 4cc1a8112fcd2c8730758c91f7baa5968f7c51fa Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 21 Oct 2025 01:59:49 +0800 Subject: [PATCH 18/52] refactor(gpu): restructure device plugin and resource management - Remove unused ResourcePluginsNames field and related configurations - Add DefaultAccompanyResourceName method to CustomDevicePlugin interface - Make registry maps private and add getter functions - Improve error handling and cleanup in StaticPolicy allocation - Simplify device topology initialization and allocation logic --- .../app/options/qrm/gpu_plugin.go | 3 - pkg/agent/qrm-plugins/gpu/baseplugin/base.go | 8 +-- .../gpu/customdeviceplugin/gpu/gpu.go | 11 +++- .../gpu/customdeviceplugin/interface.go | 2 + .../gpu/customdeviceplugin/rdma/rdma.go | 4 ++ .../customdeviceplugin/registry/registry.go | 14 +++-- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 20 +++--- .../gpu/resourceplugin/registry/registry.go | 14 +++-- .../qrm-plugins/gpu/staticpolicy/policy.go | 62 ++++++++++++------- pkg/config/agent/qrm/gpu_plugin.go | 2 - pkg/util/machine/device.go | 1 - 11 files changed, 86 insertions(+), 55 deletions(-) diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go index 8e5dc4aa77..0d31a52633 100644 --- a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go @@ -29,7 +29,6 @@ type GPUOptions struct { GPUMemoryAllocatablePerGPU string SkipGPUStateCorruption bool RDMADeviceNames []string - ResourcePluginsNames []string } func NewGPUOptions() *GPUOptions { @@ -51,7 +50,6 @@ func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, e.g. 
100") fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption", o.SkipGPUStateCorruption, "skip gpu state corruption, and it will be used after updating state properties") - fs.StringSliceVar(&o.ResourcePluginsNames, "gpu-resource-plugins-names", o.ResourcePluginsNames, "The names of the enabled gpu resource plugins") } func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { @@ -64,6 +62,5 @@ func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { conf.GPUMemoryAllocatablePerGPU = gpuMemory conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption conf.RDMADeviceNames = o.RDMADeviceNames - conf.ResourcePluginsNames = o.ResourcePluginsNames return nil } diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go index 50ac1be7c2..6814f3a25e 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -59,7 +59,7 @@ type BasePlugin struct { DeviceTopologyRegistry *machine.DeviceTopologyRegistry // Map of specific device name to device type - DeviceNameToTypeMap map[string]string + deviceNameToTypeMap map[string]string } func NewBasePlugin( @@ -86,7 +86,7 @@ func NewBasePlugin( State: stateImpl, DeviceTopologyRegistry: deviceTopologyRegistry, - DeviceNameToTypeMap: make(map[string]string), + deviceNameToTypeMap: make(map[string]string), }, nil } @@ -174,12 +174,12 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType( // so we map them to the same device type, which allows us to allocate them interchangeably. func (p *BasePlugin) RegisterDeviceNameToType(resourceNames []string, deviceType string) { for _, resourceName := range resourceNames { - p.DeviceNameToTypeMap[resourceName] = deviceType + p.deviceNameToTypeMap[resourceName] = deviceType } } func (p *BasePlugin) GetResourceTypeFromDeviceName(deviceName string) (string, error) { - deviceType, ok := p.DeviceNameToTypeMap[deviceName] + deviceType, ok := p.deviceNameToTypeMap[deviceName] if !ok { return "", fmt.Errorf("no device type found for device name %s", deviceName) } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index a973041872..a2e959ef73 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -19,6 +19,7 @@ package gpu import ( "fmt" + v1 "k8s.io/api/core/v1" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" @@ -34,6 +35,10 @@ import ( const GPUCustomDevicePluginName = "gpu-custom-device-plugin" +const ( + defaultAccompanyResourceName = string(consts.ResourceGPUMemory) +) + type GPUDevicePlugin struct { *baseplugin.BasePlugin deviceNames []string @@ -50,6 +55,10 @@ func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDe } } +func (p *GPUDevicePlugin) DefaultAccompanyResourceName() string { + return defaultAccompanyResourceName +} + func (p *GPUDevicePlugin) DeviceNames() []string { return p.deviceNames } @@ -88,7 +97,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( "deviceRequest", deviceReq.DeviceRequest, ) - memoryAllocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resReq.PodUid, resReq.ContainerName) + memoryAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(defaultAccompanyResourceName), resReq.PodUid, resReq.ContainerName) // GPU memory should have been allocated at this stage. 
// We anticipate that gpu devices have also been allocated, so we can directly use the allocated devices from the gpu memory state. if memoryAllocationInfo == nil || memoryAllocationInfo.TopologyAwareAllocations == nil { diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go index 7d06363610..c2f221e49a 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -28,6 +28,8 @@ type CustomDevicePlugin interface { UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) + DefaultAccompanyResourceName() string + AllocateAssociatedDevice( resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index 1ada043a36..6e8db852cd 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -53,6 +53,10 @@ func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomD } } +func (p *RDMADevicePlugin) DefaultAccompanyResourceName() string { + return "" +} + func (p *RDMADevicePlugin) DeviceNames() []string { return p.deviceNames } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go index 07def8b1f0..5a52bf7552 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go @@ -25,13 +25,17 @@ import ( type initFunc func(plugin *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin -var CustomDevicePluginsMap = make(map[string]initFunc) +var customDevicePluginsMap = make(map[string]initFunc) -func registerCustomDevicePlugin(deviceName string, initFunc initFunc) { - CustomDevicePluginsMap[deviceName] = initFunc +func RegisterCustomDevicePlugin(deviceName string, initFunc initFunc) { + customDevicePluginsMap[deviceName] = initFunc +} + +func GetRegisteredCustomDevicePlugin() map[string]initFunc { + return customDevicePluginsMap } func init() { - registerCustomDevicePlugin(gpu.GPUCustomDevicePluginName, gpu.NewGPUDevicePlugin) - registerCustomDevicePlugin(rdma.RDMACustomDevicePluginName, rdma.NewRDMADevicePlugin) + RegisterCustomDevicePlugin(gpu.GPUCustomDevicePluginName, gpu.NewGPUDevicePlugin) + RegisterCustomDevicePlugin(rdma.RDMACustomDevicePluginName, rdma.NewRDMADevicePlugin) } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index d2bdbae590..e051615f9a 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -60,8 +60,6 @@ func (p *GPUMemPlugin) ResourceName() string { } func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { - existReallocAnno, isReallocation := util.IsReallocation(req.Annotations) - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, 
container: %s failed with error: %v", @@ -106,9 +104,6 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p metricTags := []metrics.MetricTag{ {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, } - if existReallocAnno { - metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) - } _ = p.Emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, metricTags...) } }() @@ -173,7 +168,8 @@ func (p *GPUMemPlugin) calculateHints( continue } - if s.GetQuantityAllocated()+perGPUMemory <= float64(p.GPUMemoryAllocatablePerGPU.Value()) { + allocated := s.GetQuantityAllocated() + if allocated+perGPUMemory <= float64(p.GPUMemoryAllocatablePerGPU.Value()) { info, ok := gpuTopology.Devices[gpuID] if !ok { return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) @@ -181,7 +177,7 @@ func (p *GPUMemPlugin) calculateHints( for _, numaNode := range info.GetNUMANode() { numaToAvailableGPUCount[numaNode] += 1 - numaToMostAllocatedGPUMemory[numaNode] = math.Max(s.GetQuantityAllocated(), numaToMostAllocatedGPUMemory[numaNode]) + numaToMostAllocatedGPUMemory[numaNode] = math.Max(allocated, numaToMostAllocatedGPUMemory[numaNode]) } } } @@ -389,8 +385,6 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca func (p *GPUMemPlugin) Allocate( resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, ) (*pluginapi.ResourceAllocationResponse, error) { - existReallocAnno, isReallocation := util.IsReallocation(resourceReq.Annotations) - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", @@ -430,9 +424,6 @@ func (p *GPUMemPlugin) Allocate( metricTags := []metrics.MetricTag{ {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, } - if existReallocAnno { - metricTags = append(metricTags, metrics.MetricTag{Key: "reallocation", Val: isReallocation}) - } _ = p.Emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, metricTags...) 
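 			// Note: this deferred closure re-reads Allocate's local err at return time, so only
 			// failures that are assigned to err get tagged with their message and counted by this metric.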
} }() @@ -486,10 +477,13 @@ func (p *GPUMemPlugin) Allocate( } // Allocate for gpu devices - if deviceReq != nil { + if deviceReq != nil && p.associatedDevicesName.Has(deviceReq.DeviceName) { if err = p.allocateForGPUDevice(newAllocation, deviceReq, gpuMemory); err != nil { return nil, fmt.Errorf("allocateForGPUDevice failed with error: %w", err) } + } else { + // return a trivial allocationResult if no device request is provided + return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) } p.State.SetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName, newAllocation, false) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go index 1a852c902a..a7cefbaf7e 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go @@ -23,14 +23,18 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory" ) -type initFunc func(plugin *baseplugin.BasePlugin) resourceplugin.ResourcePlugin +type InitFunc func(plugin *baseplugin.BasePlugin) resourceplugin.ResourcePlugin -var ResourcePluginsMap = make(map[string]initFunc) +var resourcePluginsMap = make(map[string]InitFunc) -func registerResourcePlugin(pluginName string, initFunc initFunc) { - ResourcePluginsMap[pluginName] = initFunc +func RegisterResourcePlugin(pluginName string, initFunc InitFunc) { + resourcePluginsMap[pluginName] = initFunc +} + +func GetRegisteredResourcePlugin() map[string]InitFunc { + return resourcePluginsMap } func init() { - registerResourcePlugin(gpuconsts.GPUMemPluginName, gpumemory.NewGPUMemPlugin) + RegisterResourcePlugin(gpuconsts.GPUMemPluginName, gpumemory.NewGPUMemPlugin) } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 55cabba419..c888499ddb 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -59,7 +59,6 @@ type StaticPolicy struct { emitter metrics.MetricEmitter associatedDeviceNames sets.String - resourcePluginsNames sets.String residualHitMap map[string]int64 @@ -83,15 +82,13 @@ func NewStaticPolicy( } policyImplement := &StaticPolicy{ - emitter: wrappedEmitter, - stopCh: make(chan struct{}), - name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), - residualHitMap: make(map[string]int64), - associatedDeviceNames: sets.NewString(conf.GPUDeviceNames...), - resourcePluginsNames: sets.NewString(conf.ResourcePluginsNames...), - BasePlugin: basePlugin, - resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), - customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), + emitter: wrappedEmitter, + stopCh: make(chan struct{}), + name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), + residualHitMap: make(map[string]int64), + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), } if err = policyImplement.registerResourcePlugins(); err != nil { @@ -362,7 +359,6 @@ func (p *StaticPolicy) GetResourcePluginOptions( // Allocate is called during pod admit so that the resource // plugin can allocate corresponding resource for the container // according to resource request -// TODO: Clean state if fail func (p *StaticPolicy) Allocate( _ 
context.Context, req *pluginapi.ResourceRequest, @@ -375,8 +371,8 @@ func (p *StaticPolicy) Allocate( defer func() { if err != nil { _ = p.removeContainer(req.PodUid, req.ContainerName, v1.ResourceName(req.ResourceName)) - p.Unlock() } + p.Unlock() }() resourcePlugin := p.getResourcePlugin(req.ResourceName) @@ -426,10 +422,15 @@ func (p *StaticPolicy) removeContainer(podUID, containerName string, resourceNam return p.removeWithUpdate(podUID, func(podResourceEntries state.PodResourceEntries) bool { found := false for resource, podEntries := range podResourceEntries { - if resourceName == resource { + if resourceName != resource { + continue + } + + if _, ok := podEntries[podUID]; !ok { continue } - if podEntries[podUID][containerName] != nil { + + if _, ok := podEntries[podUID][containerName]; ok { found = true } delete(podEntries[podUID], containerName) @@ -652,7 +653,7 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("nil accompany device request") } - _, err := accompanyCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, accompanyDeviceReq, "") + _, err := p.allocateAssociatedDevice(accompanyCustomDevicePlugin, req.ResourceRequest, accompanyDeviceReq, "") if err != nil { return nil, fmt.Errorf("AllocateAssociatedDevice accompany resource %s failed with error: %v", req.AccompanyResourceName, err) } @@ -666,26 +667,45 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("no custom device plugin found for target device %s", req.DeviceName) } - resp, respErr = targetCustomDevicePlugin.AllocateAssociatedDevice(req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) - return resp, respErr + return p.allocateAssociatedDevice(targetCustomDevicePlugin, req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) +} + +func (p *StaticPolicy) allocateAssociatedDevice(devicePlugin customdeviceplugin.CustomDevicePlugin, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + defaultAccompanyResourceName := devicePlugin.DefaultAccompanyResourceName() + if defaultAccompanyResourceName != "" && accompanyResourceName != defaultAccompanyResourceName { + accompanyResourcePlugin := p.getResourcePlugin(defaultAccompanyResourceName) + if accompanyResourcePlugin != nil { + _, err := accompanyResourcePlugin.Allocate(resReq, deviceReq) + if err != nil { + _ = p.removeContainer(resReq.PodUid, resReq.ContainerName, v1.ResourceName(defaultAccompanyResourceName)) + return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", defaultAccompanyResourceName, err) + } + } + } + + return devicePlugin.AllocateAssociatedDevice(resReq, deviceReq, accompanyResourceName) } func (p *StaticPolicy) registerResourcePlugins() error { - for name := range p.resourcePluginsNames { - initFunc := resourcepluginregistry.ResourcePluginsMap[name] + allInitFuncs := resourcepluginregistry.GetRegisteredResourcePlugin() + for _, initFunc := range allInitFuncs { resourcePlugin := initFunc(p.BasePlugin) p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin + general.Infof("Registered resource plugin: %s", resourcePlugin.ResourceName()) } return nil } func (p *StaticPolicy) registerCustomDevicePlugins() error { - for name := range p.associatedDeviceNames { - initFunc := devicepluginregistry.CustomDevicePluginsMap[name] + allInitFuncs := devicepluginregistry.GetRegisteredCustomDevicePlugin() + for name := range allInitFuncs { + initFunc := 
allInitFuncs[name] customDevicePlugin := initFunc(p.BasePlugin) deviceNames := customDevicePlugin.DeviceNames() for _, deviceName := range deviceNames { p.customDevicePlugins[deviceName] = customDevicePlugin + p.associatedDeviceNames.Insert(deviceName) } } return nil diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go index 6a4d3a8452..62d223a10c 100644 --- a/pkg/config/agent/qrm/gpu_plugin.go +++ b/pkg/config/agent/qrm/gpu_plugin.go @@ -29,8 +29,6 @@ type GPUQRMPluginConfig struct { SkipGPUStateCorruption bool // RDMADeviceNames is the names of the RDMA device RDMADeviceNames []string - // ResourcePluginsNames holds the names of the enabled GPU resource plugins - ResourcePluginsNames []string } func NewGPUQRMPluginConfig() *GPUQRMPluginConfig { diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index b3f643f80a..cb244be6be 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -184,7 +184,6 @@ func getEmptyDeviceTopology() *DeviceTopology { } } -// TODO: change to return map[resourceName]DeviceTopology func initDeviceTopology(resourceNames []string) (*DeviceTopology, error) { deviceTopology := getEmptyDeviceTopology() From 93ce81e22ffc57cb624b262344f872599a7b1a50 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 21 Oct 2025 21:24:34 +0800 Subject: [PATCH 19/52] refactor: remove unused GenerateDummyGPUTopology function --- pkg/util/machine/topology.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/pkg/util/machine/topology.go b/pkg/util/machine/topology.go index 4529b74aaa..cfe568bb78 100644 --- a/pkg/util/machine/topology.go +++ b/pkg/util/machine/topology.go @@ -296,29 +296,6 @@ func GenerateDummyExtraTopology(numaNum int) (*ExtraTopologyInfo, error) { return extraTopology, nil } -// GenerateDummyGPUTopology generates a dummy GPU topology such that a gpu has affinity to a single NUMA node -// and the gpus are evenly distributed across all NUMA nodes. -func GenerateDummyGPUTopology(gpuNum, numaNum int) (*GPUTopology, error) { - if gpuNum%numaNum != 0 { - return nil, fmt.Errorf("invalid gpu number: %d and NUMA number: %d", gpuNum, numaNum) - } else if gpuNum%2 != 0 { - return nil, fmt.Errorf("invalid gpu number as it should be even: %d", gpuNum) - } - gpuTopology := new(GPUTopology) - gpuTopology.GPUs = make(map[string]GPUInfo, gpuNum) - perNumaGPU := gpuNum / numaNum - for i := 0; i < numaNum; i++ { - for j := 0; j < perNumaGPU; j++ { - gpuID := fmt.Sprintf("%d", i*perNumaGPU+j) - gpuTopology.GPUs[gpuID] = GPUInfo{ - Health: "healthy", - NUMANode: []int{i}, - } - } - } - return gpuTopology, nil -} - // CPUTopoInfo contains the NUMA, socket, and core IDs associated with a CPU. 
type CPUTopoInfo struct { NUMANodeID int From b90ed387282ac5d1f6f8f10ebe62b1cc87310ea7 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 21 Oct 2025 22:01:16 +0800 Subject: [PATCH 20/52] feat(gpu): implement strategy-based GPU allocation framework introduce a new strategy framework for GPU allocation with filtering, sorting and binding components add helper functions for GPU memory and device allocation remove redundant checks and simplify allocation logic --- .../gpu/customdeviceplugin/gpu/gpu.go | 74 +++++- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 230 +++++------------- pkg/agent/qrm-plugins/gpu/state/state.go | 4 - .../qrm-plugins/gpu/staticpolicy/policy.go | 7 - .../strategy/allocate/allocation/generic.go | 81 ++++++ .../gpu/strategy/allocate/bind/default.go | 108 ++++++++ .../strategy/allocate/filter/gpu_memory.go | 94 +++++++ .../gpu/strategy/allocate/manager/helper.go | 68 ++++++ .../gpu/strategy/allocate/manager/manager.go | 175 +++++++++++++ .../strategy/allocate/manager/manager_test.go | 108 ++++++++ .../gpu/strategy/allocate/manager/registry.go | 186 ++++++++++++++ .../gpu/strategy/allocate/sort/gpu_memory.go | 100 ++++++++ .../gpu/strategy/allocate/types.go | 85 +++++++ pkg/agent/qrm-plugins/gpu/util/util.go | 25 ++ pkg/agent/qrm-plugins/util/util.go | 17 ++ 15 files changed, 1171 insertions(+), 191 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index a2e959ef73..a12f34136c 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -28,7 +28,8 @@ import ( gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager" + qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" ) @@ -74,7 +75,7 @@ func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.Associat func (p *GPUDevicePlugin) AllocateAssociatedDevice( resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = 
fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) @@ -97,16 +98,71 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( "deviceRequest", deviceReq.DeviceRequest, ) + gpuAllocationInfo := p.State.GetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName) + if gpuAllocationInfo != nil { + if gpuAllocationInfo.TopologyAwareAllocations == nil { + return nil, fmt.Errorf("GPU topology aware allocation info is nil") + } + allocatedDevices := make([]string, 0, len(gpuAllocationInfo.TopologyAwareAllocations)) + for gpuID := range gpuAllocationInfo.TopologyAwareAllocations { + allocatedDevices = append(allocatedDevices, gpuID) + } + return &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: allocatedDevices, + }, + }, nil + } + + var allocatedDevices []string memoryAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(defaultAccompanyResourceName), resReq.PodUid, resReq.ContainerName) // GPU memory should have been allocated at this stage. // We anticipate that gpu devices have also been allocated, so we can directly use the allocated devices from the gpu memory state. if memoryAllocationInfo == nil || memoryAllocationInfo.TopologyAwareAllocations == nil { - return nil, fmt.Errorf("gpu memory allocation info is nil, which is unexpected") - } + // When GPU memory allocation info is nil, invoke the GPU allocate strategy to perform GPU allocation + general.InfoS("GPU memory allocation info is nil, invoking GPU allocate strategy", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName) + + // Get GPU topology + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + general.Warningf("failed to get gpu topology: %w", err) + return nil, fmt.Errorf("failed to get gpu topology: %w", err) + } - allocatedDevices := make([]string, 0, len(memoryAllocationInfo.TopologyAwareAllocations)) - for gpuID := range memoryAllocationInfo.TopologyAwareAllocations { - allocatedDevices = append(allocatedDevices, gpuID) + if !numaTopologyReady { + general.Warningf("numa topology is not ready") + return nil, fmt.Errorf("numa topology is not ready") + } + + // Use the strategy framework to allocate GPU devices + result, err := manager.AllocateGPUUsingStrategy( + resReq, + deviceReq, + gpuTopology, + p.GPUQRMPluginConfig, + p.Emitter, + p.MetaServer, + p.State.GetMachineState(), + qosLevel, + ) + + if err != nil { + return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err) + } + + if !result.Success { + return nil, fmt.Errorf("GPU allocation failed: %s", result.ErrorMessage) + } + + allocatedDevices = result.AllocatedDevices + } else { + // when GPU memory allocation info exists + for gpuID := range memoryAllocationInfo.TopologyAwareAllocations { + allocatedDevices = append(allocatedDevices, gpuID) + } } gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) @@ -121,6 +177,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( } // Save gpu device allocations in state + numaNodes := machine.NewCPUSet() gpuDeviceTopologyAwareAllocations := make(map[string]state.Allocation) for _, deviceID := range allocatedDevices { info, ok := gpuTopology.Devices[deviceID] @@ -132,13 +189,14 @@ func (p *GPUDevicePlugin) 
AllocateAssociatedDevice( Quantity: 1, NUMANodes: info.NumaNodes, } + numaNodes.Add(info.NumaNodes...) } gpuDeviceAllocationInfo := &state.AllocationInfo{ AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel), AllocatedAllocation: state.Allocation{ Quantity: float64(len(allocatedDevices)), - NUMANodes: memoryAllocationInfo.AllocatedAllocation.NUMANodes, + NUMANodes: numaNodes.ToSliceInt(), }, } gpuDeviceAllocationInfo.TopologyAwareAllocations = gpuDeviceTopologyAwareAllocations diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index e051615f9a..5bc1f9763e 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -22,8 +22,6 @@ import ( "sort" "sync" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/sets" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" @@ -33,6 +31,7 @@ import ( gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + manager "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager" gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/metrics" @@ -44,14 +43,11 @@ import ( type GPUMemPlugin struct { sync.Mutex *baseplugin.BasePlugin - - associatedDevicesName sets.String } func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin { return &GPUMemPlugin{ - BasePlugin: base, - associatedDevicesName: sets.NewString(base.GPUDeviceNames...), + BasePlugin: base, } } @@ -73,17 +69,12 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := p.getGPUCount(req) + gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.GPUDeviceNames) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) } - if gpuCount == 0 { - general.Errorf("getGPUCount failed from req %v with error: no GPU resources found", req) - return nil, fmt.Errorf("no GPU resources found") - } - general.InfoS("called", "podNamespace", req.PodNamespace, "podName", req.PodName, @@ -385,6 +376,15 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca func (p *GPUMemPlugin) Allocate( resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, ) (*pluginapi.ResourceAllocationResponse, error) { + _, exists := resourceReq.Annotations[p.ResourceName()] + if !exists { + general.InfoS("No GPU memory annotation detected and no GPU memory requested, returning empty response", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName) + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil + } + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: 
%s/%s, container: %s failed with error: %v", @@ -398,7 +398,7 @@ func (p *GPUMemPlugin) Allocate( return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := p.getGPUCount(resourceReq) + gpuCount, gpuNames, err := gpuutil.GetGPUCount(resourceReq, p.GPUDeviceNames) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", resourceReq, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) @@ -428,23 +428,9 @@ func (p *GPUMemPlugin) Allocate( } }() - emptyResponse := &pluginapi.ResourceAllocationResponse{ - PodUid: resourceReq.PodUid, - PodNamespace: resourceReq.PodNamespace, - PodName: resourceReq.PodName, - ContainerName: resourceReq.ContainerName, - ContainerType: resourceReq.ContainerType, - ContainerIndex: resourceReq.ContainerIndex, - PodRole: resourceReq.PodRole, - PodType: resourceReq.PodType, - ResourceName: p.ResourceName(), - Labels: general.DeepCopyMap(resourceReq.Labels), - Annotations: general.DeepCopyMap(resourceReq.Annotations), - } - // currently, not to deal with init containers if resourceReq.ContainerType == pluginapi.ContainerType_INIT { - return emptyResponse, nil + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil } else if resourceReq.ContainerType == pluginapi.ContainerType_SIDECAR { // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating return p.PackAllocationResponse(resourceReq, &state.AllocationInfo{}, nil, p.ResourceName()) @@ -461,6 +447,38 @@ func (p *GPUMemPlugin) Allocate( return resp, nil } + // Get GPU topology + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + general.Warningf("failed to get gpu topology: %v", err) + return nil, fmt.Errorf("failed to get gpu topology: %v", err) + } + + if !numaTopologyReady { + general.Warningf("numa topology is not ready") + return nil, fmt.Errorf("numa topology is not ready") + } + + // Use the strategy framework to allocate GPU memory + result, err := manager.AllocateGPUUsingStrategy( + resourceReq, + deviceReq, + gpuTopology, + p.GPUQRMPluginConfig, + p.Emitter, + p.MetaServer, + p.State.GetMachineState(), + qosLevel, + ) + + if err != nil { + return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err) + } + + if !result.Success { + return nil, fmt.Errorf("GPU allocation failed: %s", result.ErrorMessage) + } + // get hint nodes from request hintNodes, err := machine.NewCPUSetUint64(resourceReq.GetHint().GetNodes()...) 
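 	// (hintNodes may be empty; the gpu-memory filtering strategy only enforces NUMA affinity when
 	// the hint set is non-empty)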
if err != nil { @@ -474,18 +492,23 @@ func (p *GPUMemPlugin) Allocate( Quantity: gpuMemory, NUMANodes: hintNodes.ToSliceInt(), }, + TopologyAwareAllocations: make(map[string]state.Allocation), } - // Allocate for gpu devices - if deviceReq != nil && p.associatedDevicesName.Has(deviceReq.DeviceName) { - if err = p.allocateForGPUDevice(newAllocation, deviceReq, gpuMemory); err != nil { - return nil, fmt.Errorf("allocateForGPUDevice failed with error: %w", err) + gpuMemoryPerGPU := gpuMemory / gpuCount + for _, deviceID := range result.AllocatedDevices { + info, ok := gpuTopology.Devices[deviceID] + if !ok { + return nil, fmt.Errorf("failed to get gpu info for device: %s", deviceID) + } + + newAllocation.TopologyAwareAllocations[deviceID] = state.Allocation{ + Quantity: gpuMemoryPerGPU, + NUMANodes: info.NumaNodes, } - } else { - // return a trivial allocationResult if no device request is provided - return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) } + // Set allocation info in state p.State.SetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName, newAllocation, false) machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) @@ -503,140 +526,3 @@ func (p *GPUMemPlugin) Allocate( return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) } - -// allocateForGPUDevice is called when there is a GPU device request, which means we need to allocate for GPUs as well, -// so we modify the topologyAwareAllocations to include the GPU devices. -func (p *GPUMemPlugin) allocateForGPUDevice( - allocationInfo *state.AllocationInfo, deviceReq *pluginapi.DeviceRequest, memoryRequest float64, -) error { - hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) 
- if err != nil { - general.Warningf("failed to get hint nodes: %v", err) - return fmt.Errorf("failed to get hint nodes: %v", err) - } - - gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) - if err != nil { - return fmt.Errorf("failed to get gpu topology: %v", err) - } - - if !numaTopologyReady { - return fmt.Errorf("numa topology is not ready") - } - - allocatedDevices, allocatedGPUMemory, err := p.calculateAssociatedGPUDevices(gpuTopology, memoryRequest, hintNodes, deviceReq) - if err != nil { - general.Warningf("failed to allocate associated devices: %v", err) - return err - } - - memoryTopologyAwareAllocations := make(map[string]state.Allocation) - for _, device := range allocatedDevices { - info, ok := gpuTopology.Devices[device] - if !ok { - return fmt.Errorf("failed to get gpu info for device: %s", device) - } - - memoryTopologyAwareAllocations[device] = state.Allocation{ - Quantity: allocatedGPUMemory[device], - NUMANodes: info.GetNUMANode(), - } - } - allocationInfo.TopologyAwareAllocations = memoryTopologyAwareAllocations - return nil -} - -func (p *GPUMemPlugin) getGPUCount(req *pluginapi.ResourceRequest) (float64, sets.String, error) { - gpuCount := float64(0) - gpuNames := sets.NewString() - for resourceName := range p.associatedDevicesName { - _, request, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) - if err != nil && !errors.IsNotFound(err) { - return 0, nil, err - } - gpuCount += request - gpuNames.Insert(resourceName) - } - return gpuCount, gpuNames, nil -} - -func (p *GPUMemPlugin) calculateAssociatedGPUDevices( - gpuTopology *machine.DeviceTopology, gpuMemoryRequest float64, hintNodes machine.CPUSet, - request *pluginapi.DeviceRequest, -) ([]string, map[string]float64, error) { - gpuRequest := request.GetDeviceRequest() - gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) - - machineState, ok := p.State.GetMachineState()[consts.ResourceGPUMemory] - if !ok { - return nil, nil, fmt.Errorf("no machine state for resource %s", consts.ResourceGPUMemory) - } - - allocatedDevices := sets.NewString() - needed := gpuRequest - allocateDevices := func(devices ...string) bool { - for _, device := range devices { - allocatedDevices.Insert(device) - needed-- - if needed == 0 { - return true - } - } - return false - } - - allocatedGPUMemory := func(devices ...string) map[string]float64 { - memory := make(map[string]float64) - for _, device := range devices { - memory[device] = gpuMemoryPerGPU - } - return memory - } - - availableDevices := request.GetAvailableDevices() - reusableDevices := request.GetReusableDevices() - - // allocate must include devices first - for _, device := range reusableDevices { - if allocatedDevices.Has(device) { - continue - } - if machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) { - general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) - } - allocateDevices(device) - } - - // if allocated devices is enough, return immediately - if allocatedDevices.Len() >= int(gpuRequest) { - return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil - } - - sort.SliceStable(availableDevices, func(i, j int) bool { - return machineState.GetQuantityAllocated(availableDevices[i]) > 
machineState.GetQuantityAllocated(availableDevices[j]) - }) - - // second allocate available and numa-affinity gpus - for _, device := range availableDevices { - if allocatedDevices.Has(device) { - continue - } - - if !gpuutil.IsNUMAAffinityDevice(device, gpuTopology, hintNodes) { - continue - } - - if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, float64(p.GPUMemoryAllocatablePerGPU.Value())) { - general.Infof("available numa affinity gpu %s has not enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", - device, float64(p.GPUMemoryAllocatablePerGPU.Value()), machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) - continue - } - - if allocateDevices(device) { - return allocatedDevices.UnsortedList(), allocatedGPUMemory(allocatedDevices.UnsortedList()...), nil - } - } - - return nil, nil, fmt.Errorf("no enough available GPUs found in gpuTopology, number of needed GPUs: %d, availableDevices len: %d, allocatedDevices len: %d", gpuRequest, len(availableDevices), len(allocatedDevices)) -} diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go index 2fbb00f1ce..fdea45b882 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -315,10 +315,6 @@ func (am AllocationMap) GetQuantityAllocated(id string) float64 { } func (am AllocationMap) IsRequestSatisfied(id string, request float64, allocatable float64) bool { - if am == nil { - return false - } - allocated := am.GetQuantityAllocated(id) return allocatable-allocated >= request } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index c888499ddb..dd2f0eafa9 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -588,13 +588,6 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("req is nil") } - if req.ResourceRequest.Hint == nil || req.ResourceRequest.Hint.Nodes == nil { - general.Warningf("got nil resource hint") - return &pluginapi.AssociatedDeviceAllocationResponse{ - AllocationResult: nil, - }, nil - } - p.Lock() defer func() { // Reset state for accompany resource and target resource if there is an error diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go new file mode 100644 index 0000000000..00533e1546 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go @@ -0,0 +1,81 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package allocation + +import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + +// GenericAllocationStrategy combines filtering, sorting, and binding strategies +type GenericAllocationStrategy struct { + name string + FilteringStrategy []allocate.FilteringStrategy + SortingStrategy allocate.SortingStrategy + BindingStrategy allocate.BindingStrategy +} + +// NewGenericAllocationStrategy creates a new allocation strategy with the given components +func NewGenericAllocationStrategy(name string, + filtering []allocate.FilteringStrategy, + sorting allocate.SortingStrategy, + binding allocate.BindingStrategy, +) allocate.AllocationStrategy { + return &GenericAllocationStrategy{ + name: name, + FilteringStrategy: filtering, + SortingStrategy: sorting, + BindingStrategy: binding, + } +} + +func (s *GenericAllocationStrategy) Name() string { + return s.name +} + +// Allocate performs the allocation using the combined strategies +func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { + var err error + allAvailableDevices := append(ctx.DeviceReq.ReusableDevices, ctx.DeviceReq.AvailableDevices...) + // Apply filtering strategy + for _, fs := range s.FilteringStrategy { + allAvailableDevices, err = fs.Filter(ctx, allAvailableDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + } + + // Apply sorting strategy + sortedDevices, err := s.SortingStrategy.Sort(ctx, allAvailableDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + + // Apply binding strategy + result, err := s.BindingStrategy.Bind(ctx, sortedDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + + return result, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go new file mode 100644 index 0000000000..d921ad7ba6 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go @@ -0,0 +1,108 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package bind + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +const ( + BindingStrategyNameDefault = "default" +) + +// DefaultBindingStrategy binds GPU devices to the allocation context +type DefaultBindingStrategy struct{} + +// NewDefaultBindingStrategy creates a new default binding strategy +func NewDefaultBindingStrategy() *DefaultBindingStrategy { + return &DefaultBindingStrategy{} +} + +// Name returns the name of the binding strategy +func (s *DefaultBindingStrategy) Name() string { + return BindingStrategyNameDefault +} + +// Bind binds the sorted GPU devices to the allocation context +// It creates allocation info for the selected devices +func (s *DefaultBindingStrategy) Bind(ctx *allocate.AllocationContext, sortedDevices []string) (*allocate.AllocationResult, error) { + if ctx.GPUTopology == nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: "GPU topology is nil", + }, fmt.Errorf("GPU topology is nil") + } + + if len(sortedDevices) == 0 && ctx.DeviceReq.DeviceRequest > 0 { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: "no devices to bind", + }, fmt.Errorf("no devices to bind") + } + + // Determine how many devices to allocate + devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) + if devicesToAllocate > len(sortedDevices) { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), + }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) + } + + allocatedDevices := sets.NewString() + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + if devicesToAllocate == allocatedDevices.Len() { + return true + } + } + return false + } + + // First try to bind reusable devices + if allocateDevices(ctx.DeviceReq.ReusableDevices...) { + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + // Then try to bind devices from sorted list + if allocateDevices(sortedDevices...) { + general.InfoS("Successfully bound devices", + "podNamespace", ctx.ResourceReq.PodNamespace, + "podName", ctx.ResourceReq.PodName, + "containerName", ctx.ResourceReq.ContainerName, + "allocatedDevices", allocatedDevices.List()) + + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), + }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go new file mode 100644 index 0000000000..be7f1406d8 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go @@ -0,0 +1,94 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/util/sets"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+	gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+)
+
+const (
+	FilteringStrategyNameGPUMemory = "gpu-memory"
+)
+
+// GPUMemoryFilteringStrategy filters GPU devices based on available GPU memory
+type GPUMemoryFilteringStrategy struct{}
+
+// NewGPUMemoryFilteringStrategy creates a new GPU memory filtering strategy
+func NewGPUMemoryFilteringStrategy() allocate.FilteringStrategy {
+	return &GPUMemoryFilteringStrategy{}
+}
+
+// Name returns the name of the filtering strategy
+func (s *GPUMemoryFilteringStrategy) Name() string {
+	return FilteringStrategyNameGPUMemory
+}
+
+// Filter filters the available GPU devices based on available GPU memory
+// It returns devices that have enough available memory for the request
+func (s *GPUMemoryFilteringStrategy) Filter(ctx *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) {
+	if ctx.GPUTopology == nil {
+		return nil, fmt.Errorf("GPU topology is nil")
+	}
+
+	_, gpuMemory, err := util.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false)
+	if err != nil {
+		general.Warningf("GetQuantityFromResourceRequests failed with error: %v, using unfiltered available devices", err)
+		return allAvailableDevices, nil
+	}
+
+	filteredDevices, err := s.filterGPUDevices(ctx, gpuMemory, allAvailableDevices)
+	if err != nil {
+		return nil, err
+	}
+
+	return filteredDevices, nil
+}
+func (s *GPUMemoryFilteringStrategy) filterGPUDevices(
+	ctx *allocate.AllocationContext,
+	gpuMemoryRequest float64,
+	allAvailableDevices []string,
+) ([]string, error) {
+	gpuRequest := ctx.DeviceReq.GetDeviceRequest()
+	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
+	gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value())
+
+	machineState := ctx.MachineState[consts.ResourceGPUMemory]
+	filteredDevices := sets.NewString()
+	for _, device := range allAvailableDevices {
+		if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.GPUTopology, ctx.HintNodes) {
+			continue
+		}
+
+		if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, gpuMemoryAllocatablePerGPU) {
+			general.Warningf("gpu %s does not have enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f",
+				device, gpuMemoryAllocatablePerGPU, machineState.GetQuantityAllocated(device), gpuMemoryPerGPU)
+			continue
+		}
+
+		filteredDevices.Insert(device)
+	}
+
+	return filteredDevices.UnsortedList(), nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go
new file mode 100644
index 0000000000..fda1a5d0f9
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go
@@ -0,0 +1,68 @@
+/*
+Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "fmt" + + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +// AllocateGPUUsingStrategy performs GPU allocation using the strategy framework +func AllocateGPUUsingStrategy( + resourceReq *pluginapi.ResourceRequest, + deviceReq *pluginapi.DeviceRequest, + gpuTopology *machine.DeviceTopology, + gpuConfig *qrm.GPUQRMPluginConfig, + emitter metrics.MetricEmitter, + metaServer *metaserver.MetaServer, + machineState state.AllocationResourcesMap, + qosLevel string, +) (*allocate.AllocationResult, error) { + // Get hint nodes + hintNodes, err := machine.NewCPUSetUint64(resourceReq.GetHint().GetNodes()...) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to get hint nodes: %v", err), + }, err + } + + // Create allocation context + ctx := &allocate.AllocationContext{ + ResourceReq: resourceReq, + DeviceReq: deviceReq, + GPUTopology: gpuTopology, + GPUQRMPluginConfig: gpuConfig, + Emitter: emitter, + MetaServer: metaServer, + MachineState: machineState, + QoSLevel: qosLevel, + HintNodes: hintNodes, + } + + // Get the global strategy manager and perform allocation + manager := GetGlobalStrategyManager() + return manager.AllocateUsingStrategy(ctx) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go new file mode 100644 index 0000000000..a4afd8fc97 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go @@ -0,0 +1,175 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package manager + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +const ( + AllocationStrategyNameDefault = "default" +) + +// StrategyManager manages the selection of allocation strategies based on resource names +type StrategyManager struct { + *StrategyRegistry + + // Mapping from resource name to strategy name + resourceToStrategy map[string]string + + // Default strategy to use when no specific strategy is configured + defaultStrategy string + + // Mutex for thread-safe access + mutex sync.RWMutex +} + +// NewStrategyManager creates a new strategy manager +func NewStrategyManager() *StrategyManager { + return &StrategyManager{ + StrategyRegistry: NewStrategyRegistry(), + resourceToStrategy: make(map[string]string), + defaultStrategy: AllocationStrategyNameDefault, + } +} + +// RegisterStrategyForResource registers a strategy for a specific resource name +func (m *StrategyManager) RegisterStrategyForResource(resourceName, strategyName string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + // Check if the strategy exists + _, err := m.GetAllocationStrategy(strategyName) + if err != nil { + return fmt.Errorf("strategy %s not found: %v", strategyName, err) + } + + m.resourceToStrategy[resourceName] = strategyName + general.Infof("Registered strategy %s for resource %s", strategyName, resourceName) + return nil +} + +// SetDefaultStrategy sets the default strategy to use when no specific strategy is configured +func (m *StrategyManager) SetDefaultStrategy(strategyName string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + // Check if the strategy exists + _, err := m.GetAllocationStrategy(strategyName) + if err != nil { + return fmt.Errorf("strategy %s not found: %v", strategyName, err) + } + + m.defaultStrategy = strategyName + general.Infof("Set default strategy to %s", strategyName) + return nil +} + +// GetStrategyForResource returns the strategy name for a given resource +func (m *StrategyManager) GetStrategyForResource(resourceName string) string { + m.mutex.RLock() + defer m.mutex.RUnlock() + + if strategyName, exists := m.resourceToStrategy[resourceName]; exists { + return strategyName + } + + return m.defaultStrategy +} + +// GetAllocationStrategyForResource returns the allocation strategy for a given resource +func (m *StrategyManager) GetAllocationStrategyForResource(resourceName string) (allocate.AllocationStrategy, error) { + strategyName := m.GetStrategyForResource(resourceName) + return m.GetAllocationStrategy(strategyName) +} + +// AllocateUsingStrategy performs allocation using the appropriate strategy for the resource +func (m *StrategyManager) AllocateUsingStrategy(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { + // Determine the device name + resourceName := ctx.DeviceReq.DeviceName + + // Get the strategy for this resource + strategy, err := m.GetAllocationStrategyForResource(resourceName) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to get strategy for resource %s: %v", resourceName, err), + }, fmt.Errorf("failed to get strategy for resource %s: %v", resourceName, err) + } + + 
general.InfoS("Using strategy for allocation", + "resourceName", resourceName, + "strategyName", strategy.Name(), + "podNamespace", ctx.ResourceReq.PodNamespace, + "podName", ctx.ResourceReq.PodName, + "containerName", ctx.ResourceReq.ContainerName) + + // Perform allocation using the strategy + return strategy.Allocate(ctx) +} + +// Global strategy manager instance +var globalStrategyManager *StrategyManager +var once sync.Once + +// GetGlobalStrategyManager returns the global strategy manager instance +func GetGlobalStrategyManager() *StrategyManager { + once.Do(func() { + globalStrategyManager = NewStrategyManager() + + // Register default strategies + registerDefaultStrategies(globalStrategyManager) + }) + return globalStrategyManager +} + +// registerDefaultStrategies registers the default strategies +func registerDefaultStrategies(manager *StrategyManager) { + // Register filtering strategies + if err := manager.RegisterFilteringStrategy(filter.NewGPUMemoryFilteringStrategy()); err != nil { + general.Errorf("Failed to register filtering strategy: %v", err) + } + + // Register sorting strategies + if err := manager.RegisterSortingStrategy(sort.NewGPUMemorySortingStrategy()); err != nil { + general.Errorf("Failed to register sorting strategy: %v", err) + } + + // Register binding strategies + if err := manager.RegisterBindingStrategy(bind.NewDefaultBindingStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", err) + } + + // Register allocation strategies + if err := manager.RegisterGenericAllocationStrategy(AllocationStrategyNameDefault, []string{filter.FilteringStrategyNameGPUMemory}, + sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault); err != nil { + general.Errorf("Failed to register gpu-memory-default strategy: %v", err) + } + + // Register resource to strategy mappings + err := manager.SetDefaultStrategy(AllocationStrategyNameDefault) + if err != nil { + general.Errorf("Failed to set default strategy: %v", err) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go new file mode 100644 index 0000000000..391854acaf --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go @@ -0,0 +1,108 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package manager + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort" +) + +func TestStrategyRegistry(t *testing.T) { + t.Parallel() + + registry := NewStrategyRegistry() + // Test filtering strategy registration + filteringStrategy := filter.NewGPUMemoryFilteringStrategy() + err := registry.RegisterFilteringStrategy(filteringStrategy) + assert.NoError(t, err) + + // Test duplicate registration + err = registry.RegisterFilteringStrategy(filteringStrategy) + assert.Error(t, err) + + // Test strategy retrieval + retrievedStrategy, err := registry.GetFilteringStrategy(filter.FilteringStrategyNameGPUMemory) + assert.NoError(t, err) + assert.Equal(t, filter.FilteringStrategyNameGPUMemory, retrievedStrategy.Name()) + + // Test non-existent strategy + _, err = registry.GetFilteringStrategy("non-existent") + assert.Error(t, err) + + // Test sorting strategy registration + sortingStrategy := sort.NewGPUMemorySortingStrategy() + err = registry.RegisterSortingStrategy(sortingStrategy) + assert.NoError(t, err) + + // Test binding strategy registration + bindingStrategy := bind.NewDefaultBindingStrategy() + err = registry.RegisterBindingStrategy(bindingStrategy) + assert.NoError(t, err) + + // Test allocation strategy registration + err = registry.RegisterGenericAllocationStrategy("test-allocation", []string{filter.FilteringStrategyNameGPUMemory}, + sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault) + assert.NoError(t, err) + + // Test allocation strategy retrieval + allocationStrategy, err := registry.GetAllocationStrategy("test-allocation") + assert.NoError(t, err) + assert.Equal(t, filter.FilteringStrategyNameGPUMemory, allocationStrategy.(*allocation.GenericAllocationStrategy).FilteringStrategy[0].Name()) + assert.Equal(t, sort.SortingStrategyNameGPUMemory, allocationStrategy.(*allocation.GenericAllocationStrategy).SortingStrategy.Name()) + assert.Equal(t, bind.BindingStrategyNameDefault, allocationStrategy.(*allocation.GenericAllocationStrategy).BindingStrategy.Name()) +} + +func TestStrategyManager(t *testing.T) { + t.Parallel() + + manager := NewStrategyManager() + + registerDefaultStrategies(manager) + + // Test default strategy + assert.Equal(t, "default", manager.defaultStrategy) + + // Test setting default strategy + err := manager.SetDefaultStrategy("test-allocation") + assert.Error(t, err) // Should fail because test-allocation is not registered yet + + // Test registering strategy for resource + err = manager.RegisterStrategyForResource("test-resource", "test-allocation") + assert.Error(t, err) // Should fail because test-allocation is not registered yet + + err = manager.RegisterGenericAllocationStrategy("test-allocation", []string{filter.FilteringStrategyNameGPUMemory}, + sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault) + assert.NoError(t, err) + + // Now test registering strategy for resource + err = manager.RegisterStrategyForResource("test-resource", "test-allocation") + assert.NoError(t, err) + + // Test getting strategy for resource + strategyName := manager.GetStrategyForResource("test-resource") + assert.Equal(t, "test-allocation", strategyName) + + // Test getting 
default strategy for non-existent resource + strategyName = manager.GetStrategyForResource("non-existent-resource") + assert.Equal(t, "default", strategyName) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go new file mode 100644 index 0000000000..9f94f01196 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go @@ -0,0 +1,186 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +type StrategyRegistry struct { + filteringStrategies map[string]allocate.FilteringStrategy + sortingStrategies map[string]allocate.SortingStrategy + bindingStrategies map[string]allocate.BindingStrategy + allocationStrategies map[string]allocate.AllocationStrategy + // Mutex for thread-safe access to registries + registryMutex sync.RWMutex +} + +// NewStrategyRegistry creates a new instance of StrategyRegistry +func NewStrategyRegistry() *StrategyRegistry { + return &StrategyRegistry{ + filteringStrategies: make(map[string]allocate.FilteringStrategy), + sortingStrategies: make(map[string]allocate.SortingStrategy), + bindingStrategies: make(map[string]allocate.BindingStrategy), + allocationStrategies: make(map[string]allocate.AllocationStrategy), + } +} + +// RegisterFilteringStrategy registers a filtering strategy with the given name +func (r *StrategyRegistry) RegisterFilteringStrategy(strategy allocate.FilteringStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.filteringStrategies[strategy.Name()]; exists { + return fmt.Errorf("filtering strategy with name %s already registered", strategy.Name()) + } + + r.filteringStrategies[strategy.Name()] = strategy + general.Infof("Registered filtering strategy: %s", strategy.Name()) + return nil +} + +// RegisterSortingStrategy registers a sorting strategy with the given name +func (r *StrategyRegistry) RegisterSortingStrategy(strategy allocate.SortingStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.sortingStrategies[strategy.Name()]; exists { + return fmt.Errorf("sorting strategy with name %s already registered", strategy.Name()) + } + + r.sortingStrategies[strategy.Name()] = strategy + general.Infof("Registered sorting strategy: %s", strategy.Name()) + return nil +} + +// RegisterBindingStrategy registers a binding strategy with the given name +func (r *StrategyRegistry) RegisterBindingStrategy(strategy allocate.BindingStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.bindingStrategies[strategy.Name()]; exists { + return fmt.Errorf("binding strategy with name %s already registered", strategy.Name()) + } + 
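+	// the registry is keyed by the strategy's self-reported Name(), and
+	// GetBindingStrategy resolves lookups with the same key, so names must
+	// be unique across binding strategies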
+ r.bindingStrategies[strategy.Name()] = strategy + general.Infof("Registered binding strategy: %s", strategy.Name()) + return nil +} + +func (r *StrategyRegistry) RegisterAllocationStrategy(strategy allocate.AllocationStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.allocationStrategies[strategy.Name()]; exists { + return fmt.Errorf("allocation strategy with name %s already registered", strategy.Name()) + } + + r.allocationStrategies[strategy.Name()] = strategy + general.Infof("Registered allocation strategy: %s", strategy.Name()) + return nil +} + +// RegisterGenericAllocationStrategy registers a complete generic allocation strategy with the given name +func (r *StrategyRegistry) RegisterGenericAllocationStrategy(name string, filteringNames []string, sortingName, bindingName string) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.allocationStrategies[name]; exists { + return fmt.Errorf("allocation strategy with name %s already registered", name) + } + + var filteringList []allocate.FilteringStrategy + for _, fn := range filteringNames { + filtering, exists := r.filteringStrategies[fn] + if !exists { + return fmt.Errorf("filtering strategy %s not found", fn) + } + filteringList = append(filteringList, filtering) + } + + sorting, exists := r.sortingStrategies[sortingName] + if !exists { + return fmt.Errorf("sorting strategy %s not found", sortingName) + } + + binding, exists := r.bindingStrategies[bindingName] + if !exists { + return fmt.Errorf("binding strategy %s not found", bindingName) + } + + r.allocationStrategies[name] = allocation.NewGenericAllocationStrategy(name, filteringList, sorting, binding) + general.Infof("Registered allocation strategy: %s (filtering: %s, sorting: %s, binding: %s)", + name, filteringNames, sortingName, bindingName) + return nil +} + +// GetFilteringStrategy returns the filtering strategy with the given name +func (r *StrategyRegistry) GetFilteringStrategy(name string) (allocate.FilteringStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.filteringStrategies[name] + if !exists { + return nil, fmt.Errorf("filtering strategy %s not found", name) + } + + return strategy, nil +} + +// GetSortingStrategy returns the sorting strategy with the given name +func (r *StrategyRegistry) GetSortingStrategy(name string) (allocate.SortingStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.sortingStrategies[name] + if !exists { + return nil, fmt.Errorf("sorting strategy %s not found", name) + } + + return strategy, nil +} + +// GetBindingStrategy returns the binding strategy with the given name +func (r *StrategyRegistry) GetBindingStrategy(name string) (allocate.BindingStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.bindingStrategies[name] + if !exists { + return nil, fmt.Errorf("binding strategy %s not found", name) + } + + return strategy, nil +} + +// GetAllocationStrategy returns the allocation strategy with the given name +func (r *StrategyRegistry) GetAllocationStrategy(name string) (allocate.AllocationStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.allocationStrategies[name] + if !exists { + return nil, fmt.Errorf("allocation strategy %s not found", name) + } + + return strategy, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go 
b/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go
new file mode 100644
index 0000000000..afadfbeaa0
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go
@@ -0,0 +1,100 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package sort
+
+import (
+	"fmt"
+	"sort"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
+	qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+)
+
+const (
+	SortingStrategyNameGPUMemory = "gpu-memory"
+)
+
+// GPUMemorySortingStrategy sorts GPU devices based on available GPU memory
+type GPUMemorySortingStrategy struct{}
+
+// NewGPUMemorySortingStrategy creates a new GPU memory sorting strategy
+func NewGPUMemorySortingStrategy() *GPUMemorySortingStrategy {
+	return &GPUMemorySortingStrategy{}
+}
+
+// Name returns the name of the sorting strategy
+func (s *GPUMemorySortingStrategy) Name() string {
+	return SortingStrategyNameGPUMemory
+}
+
+// Sort sorts the filtered GPU devices based on available GPU memory
+// It prefers NUMA-affinity devices first, then devices with less available memory (bin packing)
+func (s *GPUMemorySortingStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) {
+	if ctx.GPUTopology == nil {
+		return nil, fmt.Errorf("GPU topology is nil")
+	}
+
+	_, _, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false)
+	if err != nil {
+		general.Warningf("GetQuantityFromResourceRequests failed with error: %v, falling back to the filtered devices as-is", err)
+		return filteredDevices, nil
+	}
+
+	gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value())
+	machineState := ctx.MachineState[consts.ResourceGPUMemory]
+
+	// Create a slice of device info with available memory
+	type deviceInfo struct {
+		ID              string
+		AvailableMemory float64
+		NUMAAffinity    bool
+	}
+
+	devices := make([]deviceInfo, 0, len(filteredDevices))
+
+	for _, deviceID := range filteredDevices {
+		availableMemory := gpuMemoryAllocatablePerGPU - machineState.GetQuantityAllocated(deviceID)
+		devices = append(devices, deviceInfo{
+			ID:              deviceID,
+			AvailableMemory: availableMemory,
+			NUMAAffinity:    util.IsNUMAAffinityDevice(deviceID, ctx.GPUTopology, ctx.HintNodes),
+		})
+	}
+
+	// Sort devices: first by NUMA affinity (preferred), then by available memory (ascending)
+	sort.Slice(devices, func(i, j int) bool {
+		// If both devices have NUMA affinity or both don't, sort by available memory
+		if devices[i].NUMAAffinity == devices[j].NUMAAffinity {
+			return devices[i].AvailableMemory < devices[j].AvailableMemory
+		}

+		// Prefer devices with NUMA affinity
+		return devices[i].NUMAAffinity && !devices[j].NUMAAffinity
+	})
+
+	// Extract sorted device IDs
+	sortedDevices := make([]string,
len(devices)) + for i, device := range devices { + sortedDevices[i] = device.ID + } + + general.InfoS("Sorted devices", "count", len(sortedDevices), "devices", sortedDevices) + return sortedDevices, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go new file mode 100644 index 0000000000..f21ba7610f --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package allocate + +import ( + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +// AllocationContext contains all the information needed for GPU allocation +type AllocationContext struct { + ResourceReq *pluginapi.ResourceRequest + DeviceReq *pluginapi.DeviceRequest + GPUTopology *machine.DeviceTopology + GPUQRMPluginConfig *qrm.GPUQRMPluginConfig + Emitter metrics.MetricEmitter + MetaServer *metaserver.MetaServer + MachineState state.AllocationResourcesMap + QoSLevel string + HintNodes machine.CPUSet +} + +// AllocationResult contains the result of GPU allocation +type AllocationResult struct { + AllocatedDevices []string + Success bool + ErrorMessage string +} + +// FilteringStrategy defines the interface for filtering GPU devices +type FilteringStrategy interface { + // Name returns the name of the filtering strategy + Name() string + + // Filter filters the available GPU devices based on the allocation context + // Returns a list of filtered device IDs + Filter(ctx *AllocationContext, allAvailableDevices []string) ([]string, error) +} + +// SortingStrategy defines the interface for sorting GPU devices +type SortingStrategy interface { + // Name returns the name of the sorting strategy + Name() string + + // Sort sorts the filtered GPU devices based on the allocation context + // Returns a prioritized list of device IDs + Sort(ctx *AllocationContext, filteredDevices []string) ([]string, error) +} + +// BindingStrategy defines the interface for binding GPU devices +type BindingStrategy interface { + // Name returns the name of the binding strategy + Name() string + + // Bind binds the sorted GPU devices to the allocation context + // Returns the final allocation result + Bind(ctx *AllocationContext, sortedDevices []string) (*AllocationResult, error) +} + +type AllocationStrategy interface { + // Name returns the name of the allocation strategy + Name() string + + // Allocate performs the allocation using the combined strategies + Allocate(ctx *AllocationContext) (*AllocationResult, error) +} diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go index cfde820216..3168bf6e92 100644 --- 
a/pkg/agent/qrm-plugins/gpu/util/util.go +++ b/pkg/agent/qrm-plugins/gpu/util/util.go @@ -21,7 +21,11 @@ import ( "math" pkgerrors "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" ) @@ -68,3 +72,24 @@ func IsNUMAAffinityDevice( return machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes) } + +// GetGPUCount extracts GPU count from resource request +func GetGPUCount(req *pluginapi.ResourceRequest, deviceNames []string) (float64, sets.String, error) { + gpuCount := float64(0) + gpuNames := sets.NewString() + + for _, resourceName := range deviceNames { + _, request, err := qrmutil.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) + if err != nil && !errors.IsNotFound(err) { + return 0, nil, err + } + gpuCount += request + gpuNames.Insert(resourceName) + } + + if gpuCount == 0 { + return 0, gpuNames, fmt.Errorf("no available GPU count") + } + + return gpuCount, gpuNames, nil +} diff --git a/pkg/agent/qrm-plugins/util/util.go b/pkg/agent/qrm-plugins/util/util.go index ecc1cc034b..99158b1f16 100644 --- a/pkg/agent/qrm-plugins/util/util.go +++ b/pkg/agent/qrm-plugins/util/util.go @@ -378,3 +378,20 @@ func GetPodAggregatedRequestResource(req *pluginapi.ResourceRequest) (int, float return 0, 0, fmt.Errorf("not support resource name: %s", req.ResourceName) } } + +// CreateEmptyAllocationResponse creates an empty allocation response +func CreateEmptyAllocationResponse(resourceReq *pluginapi.ResourceRequest, resourceName string) *pluginapi.ResourceAllocationResponse { + return &pluginapi.ResourceAllocationResponse{ + PodUid: resourceReq.PodUid, + PodNamespace: resourceReq.PodNamespace, + PodName: resourceReq.PodName, + ContainerName: resourceReq.ContainerName, + ContainerType: resourceReq.ContainerType, + ContainerIndex: resourceReq.ContainerIndex, + PodRole: resourceReq.PodRole, + PodType: resourceReq.PodType, + ResourceName: resourceName, + Labels: general.DeepCopyMap(resourceReq.Labels), + Annotations: general.DeepCopyMap(resourceReq.Annotations), + } +} From 095e54637d4ffc84a8ed31742d0c20c3ee5f66d1 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 22 Oct 2025 11:51:20 +0800 Subject: [PATCH 21/52] refactor(gpu-strategy): reorganize gpu allocation strategy components restructure gpu allocation strategy into separate packages for better maintainability. move filtering, sorting and binding strategies to dedicated directories and implement unified generic allocation strategy. 
update manager to use new strategy structure and rename default strategy constant --- .../gpu/strategy/allocate/manager/manager.go | 32 +++++------ .../strategy/allocate/manager/manager_test.go | 56 +++++++++++++------ .../gpu/strategy/allocate/manager/registry.go | 2 +- .../{ => strategies}/allocation/generic.go | 4 +- .../canonical/canonical.go} | 22 ++++---- .../gpu_memory/filter.go} | 24 ++------ .../strategies/gpu_memory/gpu_memory.go | 39 +++++++++++++ .../gpu_memory/sort.go} | 21 +------ 8 files changed, 117 insertions(+), 83 deletions(-) rename pkg/agent/qrm-plugins/gpu/strategy/allocate/{ => strategies}/allocation/generic.go (96%) rename pkg/agent/qrm-plugins/gpu/strategy/allocate/{bind/default.go => strategies/canonical/canonical.go} (84%) rename pkg/agent/qrm-plugins/gpu/strategy/allocate/{filter/gpu_memory.go => strategies/gpu_memory/filter.go} (78%) create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go rename pkg/agent/qrm-plugins/gpu/strategy/allocate/{sort/gpu_memory.go => strategies/gpu_memory/sort.go} (82%) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go index a4afd8fc97..9de5336792 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go @@ -21,14 +21,13 @@ import ( "sync" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" "github.com/kubewharf/katalyst-core/pkg/util/general" ) const ( - AllocationStrategyNameDefault = "default" + allocationStrategyNameDefault = "default" ) // StrategyManager manages the selection of allocation strategies based on resource names @@ -50,7 +49,7 @@ func NewStrategyManager() *StrategyManager { return &StrategyManager{ StrategyRegistry: NewStrategyRegistry(), resourceToStrategy: make(map[string]string), - defaultStrategy: AllocationStrategyNameDefault, + defaultStrategy: allocationStrategyNameDefault, } } @@ -70,6 +69,13 @@ func (m *StrategyManager) RegisterStrategyForResource(resourceName, strategyName return nil } +func (m *StrategyManager) GetDefaultStrategy() (allocate.AllocationStrategy, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + + return m.GetAllocationStrategy(m.defaultStrategy) +} + // SetDefaultStrategy sets the default strategy to use when no specific strategy is configured func (m *StrategyManager) SetDefaultStrategy(strategyName string) error { m.mutex.Lock() @@ -147,29 +153,23 @@ func GetGlobalStrategyManager() *StrategyManager { // registerDefaultStrategies registers the default strategies func registerDefaultStrategies(manager *StrategyManager) { // Register filtering strategies - if err := manager.RegisterFilteringStrategy(filter.NewGPUMemoryFilteringStrategy()); err != nil { + if err := manager.RegisterFilteringStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { general.Errorf("Failed to register filtering strategy: %v", err) } // Register sorting strategies - if err := 
manager.RegisterSortingStrategy(sort.NewGPUMemorySortingStrategy()); err != nil { + if err := manager.RegisterSortingStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { general.Errorf("Failed to register sorting strategy: %v", err) } // Register binding strategies - if err := manager.RegisterBindingStrategy(bind.NewDefaultBindingStrategy()); err != nil { + if err := manager.RegisterBindingStrategy(canonical.NewCanonicalStrategy()); err != nil { general.Errorf("Failed to register binding strategy: %v", err) } // Register allocation strategies - if err := manager.RegisterGenericAllocationStrategy(AllocationStrategyNameDefault, []string{filter.FilteringStrategyNameGPUMemory}, - sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault); err != nil { + if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, []string{gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical); err != nil { general.Errorf("Failed to register gpu-memory-default strategy: %v", err) } - - // Register resource to strategy mappings - err := manager.SetDefaultStrategy(AllocationStrategyNameDefault) - if err != nil { - general.Errorf("Failed to set default strategy: %v", err) - } } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go index 391854acaf..7b485d9c23 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go @@ -21,18 +21,42 @@ import ( "github.com/stretchr/testify/assert" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" ) +type dummyStrategy struct { + name string +} + +func (s *dummyStrategy) Name() string { + return s.name +} + +func (s *dummyStrategy) Filter(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Sort(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Bind(_ *allocate.AllocationContext, _ []string) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + +func (s *dummyStrategy) Allocate(_ *allocate.AllocationContext) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + func TestStrategyRegistry(t *testing.T) { t.Parallel() registry := NewStrategyRegistry() // Test filtering strategy registration - filteringStrategy := filter.NewGPUMemoryFilteringStrategy() + filteringStrategy := &dummyStrategy{name: "test-filtering"} err := registry.RegisterFilteringStrategy(filteringStrategy) assert.NoError(t, err) @@ -41,35 +65,35 @@ func TestStrategyRegistry(t *testing.T) { assert.Error(t, 
err) // Test strategy retrieval - retrievedStrategy, err := registry.GetFilteringStrategy(filter.FilteringStrategyNameGPUMemory) + retrievedStrategy, err := registry.GetFilteringStrategy("test-filtering") assert.NoError(t, err) - assert.Equal(t, filter.FilteringStrategyNameGPUMemory, retrievedStrategy.Name()) + assert.Equal(t, "test-filtering", retrievedStrategy.Name()) // Test non-existent strategy _, err = registry.GetFilteringStrategy("non-existent") assert.Error(t, err) // Test sorting strategy registration - sortingStrategy := sort.NewGPUMemorySortingStrategy() + sortingStrategy := &dummyStrategy{name: "test-sorting"} err = registry.RegisterSortingStrategy(sortingStrategy) assert.NoError(t, err) // Test binding strategy registration - bindingStrategy := bind.NewDefaultBindingStrategy() + bindingStrategy := &dummyStrategy{name: "test-binding"} err = registry.RegisterBindingStrategy(bindingStrategy) assert.NoError(t, err) // Test allocation strategy registration - err = registry.RegisterGenericAllocationStrategy("test-allocation", []string{filter.FilteringStrategyNameGPUMemory}, - sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault) + err = registry.RegisterGenericAllocationStrategy("test-allocation", []string{"test-filtering"}, + "test-sorting", "test-binding") assert.NoError(t, err) // Test allocation strategy retrieval allocationStrategy, err := registry.GetAllocationStrategy("test-allocation") assert.NoError(t, err) - assert.Equal(t, filter.FilteringStrategyNameGPUMemory, allocationStrategy.(*allocation.GenericAllocationStrategy).FilteringStrategy[0].Name()) - assert.Equal(t, sort.SortingStrategyNameGPUMemory, allocationStrategy.(*allocation.GenericAllocationStrategy).SortingStrategy.Name()) - assert.Equal(t, bind.BindingStrategyNameDefault, allocationStrategy.(*allocation.GenericAllocationStrategy).BindingStrategy.Name()) + assert.Equal(t, "test-filtering", allocationStrategy.(*allocation.GenericAllocationStrategy).FilteringStrategy[0].Name()) + assert.Equal(t, "test-sorting", allocationStrategy.(*allocation.GenericAllocationStrategy).SortingStrategy.Name()) + assert.Equal(t, "test-binding", allocationStrategy.(*allocation.GenericAllocationStrategy).BindingStrategy.Name()) } func TestStrategyManager(t *testing.T) { @@ -90,8 +114,8 @@ func TestStrategyManager(t *testing.T) { err = manager.RegisterStrategyForResource("test-resource", "test-allocation") assert.Error(t, err) // Should fail because test-allocation is not registered yet - err = manager.RegisterGenericAllocationStrategy("test-allocation", []string{filter.FilteringStrategyNameGPUMemory}, - sort.SortingStrategyNameGPUMemory, bind.BindingStrategyNameDefault) + err = manager.RegisterGenericAllocationStrategy("test-allocation", []string{gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical) assert.NoError(t, err) // Now test registering strategy for resource diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go index 9f94f01196..321be54f3d 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go @@ -21,7 +21,7 @@ import ( "sync" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation" + 
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" "github.com/kubewharf/katalyst-core/pkg/util/general" ) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go similarity index 96% rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go rename to pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go index 00533e1546..a9bddfe616 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/allocation/generic.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go @@ -31,7 +31,7 @@ func NewGenericAllocationStrategy(name string, filtering []allocate.FilteringStrategy, sorting allocate.SortingStrategy, binding allocate.BindingStrategy, -) allocate.AllocationStrategy { +) *GenericAllocationStrategy { return &GenericAllocationStrategy{ name: name, FilteringStrategy: filtering, @@ -40,6 +40,8 @@ func NewGenericAllocationStrategy(name string, } } +var _ allocate.AllocationStrategy = &GenericAllocationStrategy{} + func (s *GenericAllocationStrategy) Name() string { return s.name } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go similarity index 84% rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go rename to pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go index d921ad7ba6..c3c6a22f4a 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/bind/default.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package bind +package canonical import ( "fmt" @@ -26,25 +26,27 @@ import ( ) const ( - BindingStrategyNameDefault = "default" + StrategyNameCanonical = "canonical" ) -// DefaultBindingStrategy binds GPU devices to the allocation context -type DefaultBindingStrategy struct{} +// CanonicalStrategy binds GPU devices to the allocation context +type CanonicalStrategy struct{} -// NewDefaultBindingStrategy creates a new default binding strategy -func NewDefaultBindingStrategy() *DefaultBindingStrategy { - return &DefaultBindingStrategy{} +// NewCanonicalStrategy creates a new default binding strategy +func NewCanonicalStrategy() *CanonicalStrategy { + return &CanonicalStrategy{} } +var _ allocate.BindingStrategy = &CanonicalStrategy{} + // Name returns the name of the binding strategy -func (s *DefaultBindingStrategy) Name() string { - return BindingStrategyNameDefault +func (s *CanonicalStrategy) Name() string { + return StrategyNameCanonical } // Bind binds the sorted GPU devices to the allocation context // It creates allocation info for the selected devices -func (s *DefaultBindingStrategy) Bind(ctx *allocate.AllocationContext, sortedDevices []string) (*allocate.AllocationResult, error) { +func (s *CanonicalStrategy) Bind(ctx *allocate.AllocationContext, sortedDevices []string) (*allocate.AllocationResult, error) { if ctx.GPUTopology == nil { return &allocate.AllocationResult{ Success: false, diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go similarity index 78% rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go rename to pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go index be7f1406d8..2a9a1eaa00 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/filter/gpu_memory.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package filter +package gpu_memory import ( "fmt" @@ -28,26 +28,9 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/general" ) -const ( - FilteringStrategyNameGPUMemory = "gpu-memory" -) - -// GPUMemoryFilteringStrategy filters GPU devices based on available GPU memory -type GPUMemoryFilteringStrategy struct{} - -// NewGPUMemoryFilteringStrategy creates a new GPU memory filtering strategy -func NewGPUMemoryFilteringStrategy() allocate.FilteringStrategy { - return &GPUMemoryFilteringStrategy{} -} - -// Name returns the name of the filtering strategy -func (s *GPUMemoryFilteringStrategy) Name() string { - return FilteringStrategyNameGPUMemory -} - // Filter filters the available GPU devices based on available GPU memory // It returns devices that have enough available memory for the request -func (s *GPUMemoryFilteringStrategy) Filter(ctx *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { +func (s *GPUMemoryStrategy) Filter(ctx *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { if ctx.GPUTopology == nil { return nil, fmt.Errorf("GPU topology is nil") } @@ -65,7 +48,8 @@ func (s *GPUMemoryFilteringStrategy) Filter(ctx *allocate.AllocationContext, all return filteredDevices, nil } -func (s *GPUMemoryFilteringStrategy) filterGPUDevices( + +func (s *GPUMemoryStrategy) filterGPUDevices( ctx *allocate.AllocationContext, gpuMemoryRequest float64, allAvailableDevices []string, diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go new file mode 100644 index 0000000000..08cc11a9ed --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go @@ -0,0 +1,39 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package gpu_memory
+
+import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+
+const (
+	StrategyNameGPUMemory = "gpu-memory"
+)
+
+// GPUMemoryStrategy filters and sorts GPU devices based on available GPU memory
+type GPUMemoryStrategy struct{}
+
+var _ allocate.FilteringStrategy = &GPUMemoryStrategy{}
+var _ allocate.SortingStrategy = &GPUMemoryStrategy{}
+
+// NewGPUMemoryStrategy creates a new GPU memory strategy
+func NewGPUMemoryStrategy() *GPUMemoryStrategy {
+	return &GPUMemoryStrategy{}
+}
+
+// Name returns the name of the strategy
+func (s *GPUMemoryStrategy) Name() string {
+	return StrategyNameGPUMemory
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
similarity index 82%
rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go
rename to pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
index afadfbeaa0..5f3aa14a63 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/sort/gpu_memory.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package sort
+package gpu_memory
 
 import (
 	"fmt"
@@ -27,26 +27,9 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 )
 
-const (
-	SortingStrategyNameGPUMemory = "gpu-memory"
-)
-
-// GPUMemorySortingStrategy sorts GPU devices based on available GPU memory
-type GPUMemorySortingStrategy struct{}
-
-// NewGPUMemorySortingStrategy creates a new GPU memory sorting strategy
-func NewGPUMemorySortingStrategy() *GPUMemorySortingStrategy {
-	return &GPUMemorySortingStrategy{}
-}
-
-// Name returns the name of the sorting strategy
-func (s *GPUMemorySortingStrategy) Name() string {
-	return SortingStrategyNameGPUMemory
-}
-
 // Sort sorts the filtered GPU devices based on available GPU memory
 // It prefers NUMA-affinity devices first, then devices with less available memory (bin packing)
-func (s *GPUMemorySortingStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) {
+func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) {
 	if ctx.GPUTopology == nil {
 		return nil, fmt.Errorf("GPU topology is nil")
 	}
From e9cf4eb34d76c6c4b8abbc55615f0c8e1c798b66 Mon Sep 17 00:00:00 2001
From: luomingmeng
Date: Wed, 22 Oct 2025 12:02:38 +0800
Subject: [PATCH 22/52] refactor(gpu-strategy): make strategy fields private and add accessors

Convert public strategy fields to private and provide getter/setter
methods to maintain encapsulation while allowing controlled access to
the strategies
---
 .../allocate/strategies/allocation/generic.go | 48 +++++++++++++++----
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go
index a9bddfe616..1bcd098fb7 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go
@@ -21,9 +21,9 @@ import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/al
 // GenericAllocationStrategy combines filtering, sorting, and binding strategies
 type GenericAllocationStrategy struct {
 	name              string
-	FilteringStrategy
[]allocate.FilteringStrategy - SortingStrategy allocate.SortingStrategy - BindingStrategy allocate.BindingStrategy + filteringStrategy []allocate.FilteringStrategy + sortingStrategy allocate.SortingStrategy + bindingStrategy allocate.BindingStrategy } // NewGenericAllocationStrategy creates a new allocation strategy with the given components @@ -34,9 +34,9 @@ func NewGenericAllocationStrategy(name string, ) *GenericAllocationStrategy { return &GenericAllocationStrategy{ name: name, - FilteringStrategy: filtering, - SortingStrategy: sorting, - BindingStrategy: binding, + filteringStrategy: filtering, + sortingStrategy: sorting, + bindingStrategy: binding, } } @@ -51,7 +51,7 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* var err error allAvailableDevices := append(ctx.DeviceReq.ReusableDevices, ctx.DeviceReq.AvailableDevices...) // Apply filtering strategy - for _, fs := range s.FilteringStrategy { + for _, fs := range s.filteringStrategy { allAvailableDevices, err = fs.Filter(ctx, allAvailableDevices) if err != nil { return &allocate.AllocationResult{ @@ -62,7 +62,7 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* } // Apply sorting strategy - sortedDevices, err := s.SortingStrategy.Sort(ctx, allAvailableDevices) + sortedDevices, err := s.sortingStrategy.Sort(ctx, allAvailableDevices) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -71,7 +71,7 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* } // Apply binding strategy - result, err := s.BindingStrategy.Bind(ctx, sortedDevices) + result, err := s.bindingStrategy.Bind(ctx, sortedDevices) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -81,3 +81,33 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* return result, nil } + +// GetFilteringStrategy returns the filtering strategy +func (s *GenericAllocationStrategy) GetFilteringStrategy() []allocate.FilteringStrategy { + return s.filteringStrategy +} + +// SetFilteringStrategy sets the filtering strategy +func (s *GenericAllocationStrategy) SetFilteringStrategy(filteringStrategy []allocate.FilteringStrategy) { + s.filteringStrategy = filteringStrategy +} + +// GetSortingStrategy returns the sorting strategy +func (s *GenericAllocationStrategy) GetSortingStrategy() allocate.SortingStrategy { + return s.sortingStrategy +} + +// SetSortingStrategy sets the sorting strategy +func (s *GenericAllocationStrategy) SetSortingStrategy(sortingStrategy allocate.SortingStrategy) { + s.sortingStrategy = sortingStrategy +} + +// GetBindingStrategy returns the binding strategy +func (s *GenericAllocationStrategy) GetBindingStrategy() allocate.BindingStrategy { + return s.bindingStrategy +} + +// SetBindingStrategy sets the binding strategy +func (s *GenericAllocationStrategy) SetBindingStrategy(bindingStrategy allocate.BindingStrategy) { + s.bindingStrategy = bindingStrategy +} From 856b6851aa67c8b15b73ad5cb662bbb734835262 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 22 Oct 2025 16:04:34 +0800 Subject: [PATCH 23/52] feat(device): add device affinity group support Introduce DeviceAffinityGroup field to DeviceInfo struct to support device affinity grouping with priority levels. 
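The intended consumption pattern is for an allocator to walk the priority
levels and prefer candidates that share an affinity group with devices it
has already picked. A minimal sketch of that lookup, assuming the
DeviceInfo, AffinityPriority, and AffinityGroupInfo types from this patch
are in scope; the helper name and the ascending, strongest-first scan
order are illustrative assumptions rather than part of this change:

	// pickByAffinity returns the first candidate that shares an affinity
	// group with an already-allocated device, scanning priority levels in
	// ascending order and falling back to the first candidate otherwise.
	func pickByAffinity(devices map[string]DeviceInfo, allocated, candidates []string, maxPriority int) string {
		for prio := AffinityPriority(0); prio <= AffinityPriority(maxPriority); prio++ {
			// collect the group IDs the allocated devices belong to at this priority
			seen := map[int]bool{}
			for _, id := range allocated {
				if g, ok := devices[id].DeviceAffinityGroup[prio]; ok {
					seen[g.GroupID] = true
				}
			}
			// prefer a candidate that lands in one of those groups
			for _, id := range candidates {
				if g, ok := devices[id].DeviceAffinityGroup[prio]; ok && seen[g.GroupID] {
					return id
				}
			}
		}
		if len(candidates) > 0 {
			return candidates[0]
		}
		return ""
	}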
--- pkg/util/machine/device.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index cb244be6be..b4c5181f71 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -125,8 +125,15 @@ type DeviceTopology struct { } type DeviceInfo struct { - Health string - NumaNodes []int + Health string + NumaNodes []int + DeviceAffinityGroup map[AffinityPriority]AffinityGroupInfo +} + +type AffinityPriority int + +type AffinityGroupInfo struct { + GroupID int } func (i DeviceInfo) GetNUMANode() []int { From b8e223921228a98c3df18579a0d44ed7f445910b Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Thu, 23 Oct 2025 15:29:39 +0800 Subject: [PATCH 24/52] feat: develop device affinity binding and filtering strategies feat: implement device affinity strategy --- pkg/agent/qrm-plugins/gpu/baseplugin/base.go | 10 +- .../gpu/customdeviceplugin/gpu/gpu.go | 2 +- .../gpu/customdeviceplugin/rdma/rdma.go | 2 +- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 2 +- .../qrm-plugins/gpu/staticpolicy/policy.go | 12 +- .../gpu/strategy/allocate/manager/helper.go | 2 +- .../gpu/strategy/allocate/manager/manager.go | 27 +++- .../gpu/strategy/allocate/manager/registry.go | 4 +- .../strategies/canonical/canonical.go | 55 +++++--- .../strategies/deviceaffinity/bind.go | 120 ++++++++++++++++++ .../deviceaffinity/device_affinity.go | 43 +++++++ .../strategies/deviceaffinity/sort.go | 24 ++++ .../allocate/strategies/gpu_memory/filter.go | 9 +- .../strategies/gpu_memory/gpu_memory.go | 6 +- .../allocate/strategies/gpu_memory/sort.go | 4 +- .../gpu/strategy/allocate/strategies/util.go | 42 ++++++ .../gpu/strategy/allocate/types.go | 2 +- pkg/util/machine/device.go | 76 ++++++----- pkg/util/machine/device_affinity.go | 69 ++++++++++ 19 files changed, 438 insertions(+), 73 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go create mode 100644 pkg/util/machine/device_affinity.go diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go index 6814f3a25e..13ec854444 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -90,6 +90,11 @@ func NewBasePlugin( }, nil } +// SetDeviceAffinity is a hook to set device affinity for a certain device type +func (p *BasePlugin) SetDeviceAffinity(deviceType string, deviceAffinityProvider machine.DeviceAffinityProvider) error { + return p.DeviceTopologyRegistry.SetDeviceAffinity(deviceType, deviceAffinityProvider) +} + func (p *BasePlugin) PackAllocationResponse( req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, resourceAllocationAnnotations map[string]string, resourceName string, @@ -153,8 +158,9 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType( } deviceTopology.Devices[device.ID] = machine.DeviceInfo{ - Health: device.Health, - NumaNodes: numaNode, + Health: device.Health, + NumaNodes: numaNode, + DeviceAffinityMap: make(map[machine.AffinityPriority]machine.DeviceIDs), } } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index 
a12f34136c..88791ea151 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -148,7 +148,6 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( p.State.GetMachineState(), qosLevel, ) - if err != nil { return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err) } @@ -201,6 +200,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( } gpuDeviceAllocationInfo.TopologyAwareAllocations = gpuDeviceTopologyAwareAllocations + // TODO:State can be updated using the actual resource name p.State.SetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName, gpuDeviceAllocationInfo, false) gpuDeviceMachineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) if err != nil { diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index 6e8db852cd..a40d86f637 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -268,7 +268,7 @@ func (p *RDMADevicePlugin) allocateWithAccompanyResource( // For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same // numa nodes as the gpu and allocate them - accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceAffinity(accompanyResourceName, deviceReq.DeviceName) + accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyResourceName, deviceReq.DeviceName) if err != nil { general.Warningf("failed to get gpu to rdma affinity map: %v", err) return nil, err diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 5bc1f9763e..4645a02cc6 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -460,6 +460,7 @@ func (p *GPUMemPlugin) Allocate( } // Use the strategy framework to allocate GPU memory + // TODO: What happens when deviceReq is nil? 
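+	// As written, a nil deviceReq is passed straight through and would panic
+	// when the strategy manager reads ctx.DeviceReq.DeviceName, so a
+	// fail-fast guard here gives a clearer error. Illustrative sketch only,
+	// not part of this patch:
+	//
+	//	if deviceReq == nil {
+	//		return nil, fmt.Errorf("nil device request for pod %s/%s", resourceReq.PodNamespace, resourceReq.PodName)
+	//	}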
result, err := manager.AllocateGPUUsingStrategy( resourceReq, deviceReq, @@ -470,7 +471,6 @@ func (p *GPUMemPlugin) Allocate( p.State.GetMachineState(), qosLevel, ) - if err != nil { return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err) } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index dd2f0eafa9..68787dd26d 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -109,6 +109,8 @@ func NewStaticPolicy( return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil } +var _ skeleton.QRMPlugin = (*StaticPolicy)(nil) + // Start starts this plugin func (p *StaticPolicy) Start() (err error) { general.Infof("called") @@ -557,7 +559,9 @@ func (p *StaticPolicy) clearResidualState( } } -func (p *StaticPolicy) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (p *StaticPolicy) UpdateAllocatableAssociatedDevices( + _ context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, +) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { if request == nil || len(request.Devices) == 0 { return nil, fmt.Errorf("request is nil") } @@ -663,8 +667,10 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return p.allocateAssociatedDevice(targetCustomDevicePlugin, req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) } -func (p *StaticPolicy) allocateAssociatedDevice(devicePlugin customdeviceplugin.CustomDevicePlugin, - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { +func (p *StaticPolicy) allocateAssociatedDevice( + devicePlugin customdeviceplugin.CustomDevicePlugin, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { defaultAccompanyResourceName := devicePlugin.DefaultAccompanyResourceName() if defaultAccompanyResourceName != "" && accompanyResourceName != defaultAccompanyResourceName { accompanyResourcePlugin := p.getResourcePlugin(defaultAccompanyResourceName) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go index fda1a5d0f9..3720ce7ad8 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go @@ -53,7 +53,7 @@ func AllocateGPUUsingStrategy( ctx := &allocate.AllocationContext{ ResourceReq: resourceReq, DeviceReq: deviceReq, - GPUTopology: gpuTopology, + DeviceTopology: gpuTopology, GPUQRMPluginConfig: gpuConfig, Emitter: emitter, MetaServer: metaServer, diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go index 9de5336792..bd76997cea 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go @@ -22,12 +22,14 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity" 
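+	// gpu_memory doubles as both the default filtering and sorting strategy,
+	// while deviceaffinity supplies the affinity-aware sort/bind variants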
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" "github.com/kubewharf/katalyst-core/pkg/util/general" ) const ( - allocationStrategyNameDefault = "default" + allocationStrategyNameDefault = "default" + allocationStrategyNameDeviceAffinity = "deviceAffinity" ) // StrategyManager manages the selection of allocation strategies based on resource names @@ -136,8 +138,10 @@ func (m *StrategyManager) AllocateUsingStrategy(ctx *allocate.AllocationContext) } // Global strategy manager instance -var globalStrategyManager *StrategyManager -var once sync.Once +var ( + globalStrategyManager *StrategyManager + once sync.Once +) // GetGlobalStrategyManager returns the global strategy manager instance func GetGlobalStrategyManager() *StrategyManager { @@ -157,19 +161,36 @@ func registerDefaultStrategies(manager *StrategyManager) { general.Errorf("Failed to register filtering strategy: %v", err) } + if err := manager.RegisterFilteringStrategy(canonical.NewCanonicalStrategy()); err != nil { + general.Errorf("Failed to register sorting strategy: %v", err) + } + // Register sorting strategies if err := manager.RegisterSortingStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { general.Errorf("Failed to register sorting strategy: %v", err) } + if err := manager.RegisterSortingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { + general.Errorf("Failed to register sorting strategy: %v", err) + } + // Register binding strategies if err := manager.RegisterBindingStrategy(canonical.NewCanonicalStrategy()); err != nil { general.Errorf("Failed to register binding strategy: %v", err) } + if err := manager.RegisterBindingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", err) + } + // Register allocation strategies if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, []string{gpu_memory.StrategyNameGPUMemory}, gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical); err != nil { general.Errorf("Failed to register gpu-memory-default strategy: %v", err) } + + if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDeviceAffinity, []string{deviceaffinity.StrategyNameDeviceAffinity}, + deviceaffinity.StrategyNameDeviceAffinity, canonical.StrategyNameCanonical); err != nil { + general.Errorf("Failed to register deviceAffinity strategy: %v", err) + } } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go index 321be54f3d..8e218e32f5 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go @@ -100,7 +100,9 @@ func (r *StrategyRegistry) RegisterAllocationStrategy(strategy allocate.Allocati } // RegisterGenericAllocationStrategy registers a complete generic allocation strategy with the given name -func (r *StrategyRegistry) RegisterGenericAllocationStrategy(name string, filteringNames []string, sortingName, bindingName string) error { +func (r *StrategyRegistry) RegisterGenericAllocationStrategy( + name string, filteringNames []string, sortingName, bindingName string, +) error { r.registryMutex.Lock() defer r.registryMutex.Unlock() diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go index c3c6a22f4a..342bce4d1f 100644 --- 
a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go @@ -19,8 +19,11 @@ package canonical import ( "fmt" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" "github.com/kubewharf/katalyst-core/pkg/util/general" ) @@ -37,7 +40,10 @@ func NewCanonicalStrategy() *CanonicalStrategy { return &CanonicalStrategy{} } -var _ allocate.BindingStrategy = &CanonicalStrategy{} +var ( + _ allocate.BindingStrategy = &CanonicalStrategy{} + _ allocate.FilteringStrategy = &CanonicalStrategy{} +) // Name returns the name of the binding strategy func (s *CanonicalStrategy) Name() string { @@ -46,30 +52,18 @@ func (s *CanonicalStrategy) Name() string { // Bind binds the sorted GPU devices to the allocation context // It creates allocation info for the selected devices -func (s *CanonicalStrategy) Bind(ctx *allocate.AllocationContext, sortedDevices []string) (*allocate.AllocationResult, error) { - if ctx.GPUTopology == nil { +func (s *CanonicalStrategy) Bind( + ctx *allocate.AllocationContext, sortedDevices []string, +) (*allocate.AllocationResult, error) { + valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) + if !valid { return &allocate.AllocationResult{ Success: false, - ErrorMessage: "GPU topology is nil", - }, fmt.Errorf("GPU topology is nil") + ErrorMessage: errMsg, + }, fmt.Errorf(errMsg) } - if len(sortedDevices) == 0 && ctx.DeviceReq.DeviceRequest > 0 { - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: "no devices to bind", - }, fmt.Errorf("no devices to bind") - } - - // Determine how many devices to allocate devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) - if devicesToAllocate > len(sortedDevices) { - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), - }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) - } - allocatedDevices := sets.NewString() allocateDevices := func(devices ...string) bool { for _, device := range devices { @@ -108,3 +102,24 @@ func (s *CanonicalStrategy) Bind(ctx *allocate.AllocationContext, sortedDevices ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) } + +// Filter filters the available devices based on whether they are already occupied. +// The assumption is that each device can only be allocated to one container at most. +// It only returns devices that are not occupied yet. 
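+//
+// A minimal sketch of the intended behaviour (the device IDs here are hypothetical),
+// assuming IsRequestSatisfied(device, 1, 1) only holds while a device is unoccupied:
+//
+//	// machine state: "gpu-1" already occupied, "gpu-2" still free
+//	filtered, _ := s.Filter(ctx, []string{"gpu-1", "gpu-2"})
+//	// filtered contains only "gpu-2"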
+func (s *CanonicalStrategy) Filter(
+	ctx *allocate.AllocationContext, allAvailableDevices []string,
+) ([]string, error) {
+	machineState, ok := ctx.MachineState[v1.ResourceName(ctx.DeviceReq.DeviceName)]
+	if !ok {
+		return nil, fmt.Errorf("machine state for %s is not available", ctx.DeviceReq.DeviceName)
+	}
+
+	filteredDevices := sets.NewString()
+	for _, device := range allAvailableDevices {
+		if machineState.IsRequestSatisfied(device, 1, 1) {
+			filteredDevices.Insert(device)
+		}
+	}
+
+	return filteredDevices.UnsortedList(), nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go
new file mode 100644
index 0000000000..06a28d92ce
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go
@@ -0,0 +1,120 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package deviceaffinity
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/utils/strings/slices"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+)
+
+// Bind binds the sorted devices to the allocation context by searching for the devices that have affinity to each other.
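+//
+// For illustration (hypothetical IDs): a device whose affinity map is
+// {0: {"gpu-2"}, 1: {"gpu-3", "gpu-4"}} has its priority-0 neighbour "gpu-2"
+// considered before the weaker priority-1 neighbours.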
+func (s *DeviceAffinityStrategy) Bind(
+	ctx *allocate.AllocationContext, sortedDevices []string,
+) (*allocate.AllocationResult, error) {
+	valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices)
+	if !valid {
+		return &allocate.AllocationResult{
+			Success:      false,
+			ErrorMessage: errMsg,
+		}, fmt.Errorf("%s", errMsg)
+	}
+
+	devicesToAllocate := int(ctx.DeviceReq.DeviceRequest)
+
+	// When we bind devices, we need to make sure that the devices have affinity to each other
+	allocatedDevices := sets.NewString()
+	allocateDevices := func(devices ...string) (bool, error) {
+		for _, firstDevice := range devices {
+			if devicesToAllocate == allocatedDevices.Len() {
+				return true, nil
+			}
+			if allocatedDevices.Has(firstDevice) {
+				continue
+			}
+			// Get all the devices that have affinity to the device
+			affinityMap, err := ctx.DeviceTopology.GetDeviceAffinityMap(firstDevice)
+			if err != nil {
+				return false, err
+			}
+
+			// Loop from the highest priority to the lowest priority
+			for priority := 0; priority < len(affinityMap); priority++ {
+				// Check if the affinity device is in the list of devices to allocate
+				affinityPriorityDevices, ok := affinityMap[machine.AffinityPriority(priority)]
+				if !ok {
+					return false, fmt.Errorf("affinity priority %d not found", priority)
+				}
+				for _, device := range affinityPriorityDevices {
+					if slices.Contains(devices, device) {
+						allocatedDevices.Insert(device)
+					}
+				}
+			}
+		}
+		return false, nil
+	}
+
+	// Allocate reusable devices first
+	finishedAllocation, err := allocateDevices(ctx.DeviceReq.ReusableDevices...)
+	if err != nil {
+		return &allocate.AllocationResult{
+			Success:      false,
+			ErrorMessage: fmt.Sprintf("failed to allocate reusable devices: %v", err),
+		}, fmt.Errorf("failed to allocate reusable devices: %v", err)
+	}
+
+	if finishedAllocation {
+		return &allocate.AllocationResult{
+			Success:          true,
+			AllocatedDevices: allocatedDevices.UnsortedList(),
+		}, nil
+	}
+
+	// Then try to bind devices from sorted list
+	finishedAllocation, err = allocateDevices(sortedDevices...)
+	if err != nil {
+		return &allocate.AllocationResult{
+			Success:      false,
+			ErrorMessage: fmt.Sprintf("failed to allocate sorted devices: %v", err),
+		}, fmt.Errorf("failed to allocate sorted devices: %v", err)
+	}
+
+	if finishedAllocation {
+		general.InfoS("Successfully bound devices",
+			"podNamespace", ctx.ResourceReq.PodNamespace,
+			"podName", ctx.ResourceReq.PodName,
+			"containerName", ctx.ResourceReq.ContainerName,
+			"allocatedDevices", allocatedDevices.List())
+
+		return &allocate.AllocationResult{
+			AllocatedDevices: allocatedDevices.UnsortedList(),
+			Success:          true,
+		}, nil
+	}
+
+	return &allocate.AllocationResult{
+		Success:      false,
+		ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)),
+	}, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices))
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go
new file mode 100644
index 0000000000..fc183f4efc
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go
@@ -0,0 +1,43 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package deviceaffinity
+
+import (
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+)
+
+const (
+	StrategyNameDeviceAffinity = "deviceAffinity"
+)
+
+// DeviceAffinityStrategy knows how to bind devices that have affinity to each other
+type DeviceAffinityStrategy struct{}
+
+// NewDeviceAffinityStrategy creates a new device affinity strategy
+func NewDeviceAffinityStrategy() *DeviceAffinityStrategy {
+	return &DeviceAffinityStrategy{}
+}
+
+var (
+	_ allocate.BindingStrategy = &DeviceAffinityStrategy{}
+	_ allocate.SortingStrategy = &DeviceAffinityStrategy{}
+)
+
+// Name returns the name of the binding strategy
+func (s *DeviceAffinityStrategy) Name() string {
+	return StrategyNameDeviceAffinity
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go
new file mode 100644
index 0000000000..473f25b67b
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go
@@ -0,0 +1,24 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package deviceaffinity
+
+import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+
+// Sort is a no-op for the device affinity strategy; it returns the filtered devices unchanged
+func (s *DeviceAffinityStrategy) Sort(_ *allocate.AllocationContext, filteredDevices []string) ([]string, error) {
+	return filteredDevices, nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go
index 2a9a1eaa00..15c95ccea0 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go
@@ -31,7 +31,7 @@ import (
 // Filter filters the available GPU devices based on available GPU memory
 // It returns devices that have enough available memory for the request
 func (s *GPUMemoryStrategy) Filter(ctx *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) {
-	if ctx.GPUTopology == nil {
+	if ctx.DeviceTopology == nil {
 		return nil, fmt.Errorf("GPU topology is nil")
 	}
 
@@ -58,10 +58,13 @@ func (s *GPUMemoryStrategy) filterGPUDevices(
 	gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest)
 	gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value())
 
-	machineState := ctx.MachineState[consts.ResourceGPUMemory]
+	machineState, ok := ctx.MachineState[consts.ResourceGPUMemory]
+	if !ok {
+		return nil, fmt.Errorf("machine state for %s is not available", consts.ResourceGPUMemory)
+	}
 	filteredDevices := sets.NewString()
 	for _, device := range allAvailableDevices {
-		if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.GPUTopology, ctx.HintNodes) {
+		if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.DeviceTopology, ctx.HintNodes) {
 			continue
 		}
 
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go
index 08cc11a9ed..06d0881eab 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go
@@ -25,8 +25,10 @@ const (
 // GPUMemoryStrategy filters GPU devices based on available GPU memory
 type GPUMemoryStrategy struct{}
 
-var _ allocate.FilteringStrategy = &GPUMemoryStrategy{}
-var _ allocate.SortingStrategy = &GPUMemoryStrategy{}
+var (
+	_ allocate.FilteringStrategy = &GPUMemoryStrategy{}
+	_ allocate.SortingStrategy   = &GPUMemoryStrategy{}
+)
 
 // NewGPUMemoryStrategy creates a new GPU memory filtering strategy
 func NewGPUMemoryStrategy() *GPUMemoryStrategy {
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
index 5f3aa14a63..7c1c0d2057 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go
@@ -30,7 +30,7 @@ import (
 // Sort sorts the filtered GPU devices based on available GPU memory
 // It prioritizes devices with more available memory and considers NUMA affinity
 func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) {
-	if ctx.GPUTopology == nil {
+	if ctx.DeviceTopology == nil {
 		return nil, fmt.Errorf("GPU topology is nil")
 	}
 
@@ -57,7 +57,7 @@ func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevice
 		devices = append(devices, deviceInfo{
 			ID:              deviceID,
 			AvailableMemory: availableMemory,
-			NUMAAffinity:    util.IsNUMAAffinityDevice(deviceID, ctx.GPUTopology, ctx.HintNodes),
+			NUMAAffinity:    util.IsNUMAAffinityDevice(deviceID, ctx.DeviceTopology, ctx.HintNodes),
 		})
 	}
 
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go
new file mode 100644
index 0000000000..105825e7f6
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go
@@ -0,0 +1,42 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package strategies
+
+import (
+	"fmt"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+)
+
+// IsBindingContextValid checks if the context given for binding is valid
+func IsBindingContextValid(ctx *allocate.AllocationContext, sortedDevices []string) (bool, string) {
+	if ctx.DeviceTopology == nil {
+		return false, "GPU topology is nil"
+	}
+
+	if len(sortedDevices) == 0 && ctx.DeviceReq.DeviceRequest > 0 {
+		return false, "no devices to bind"
+	}
+
+	// Determine how many devices to allocate
+	devicesToAllocate := int(ctx.DeviceReq.DeviceRequest)
+	if devicesToAllocate > len(sortedDevices) {
+		return false, fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices))
+	}
+
+	return true, ""
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go
index f21ba7610f..09d5c9b574 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go
@@ -30,7 +30,7 @@ import (
 type AllocationContext struct {
 	ResourceReq        *pluginapi.ResourceRequest
 	DeviceReq          *pluginapi.DeviceRequest
-	GPUTopology        *machine.DeviceTopology
+	DeviceTopology     *machine.DeviceTopology
 	GPUQRMPluginConfig *qrm.GPUQRMPluginConfig
 	Emitter            metrics.MetricEmitter
 	MetaServer         *metaserver.MetaServer
diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go
index b4c5181f71..3d80faa9a8 100644
--- a/pkg/util/machine/device.go
+++ b/pkg/util/machine/device.go
@@ -69,23 +69,42 @@ func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTo
 	return provider.GetDeviceTopology()
 }
 
-// GetDeviceAffinity retrieves a map of a certain device to the list of devices that it has an affinity with.
+// SetDeviceAffinity establishes customised logic of affinity between devices
+func (r *DeviceTopologyRegistry) SetDeviceAffinity(
+	deviceName string, deviceAffinityProvider DeviceAffinityProvider,
+) error {
+	deviceTopologyProvider, ok := r.DeviceNameToProvider[deviceName]
+	if !ok {
+		return fmt.Errorf("no device topology provider found for device %s", deviceName)
+	}
+	deviceTopology, numaReady, err := deviceTopologyProvider.GetDeviceTopology()
+	if err != nil {
+		return fmt.Errorf("could not get device topology provider for device %s, err: %v", deviceName, err)
+	}
+	if !numaReady || deviceTopology == nil {
+		return fmt.Errorf("device topology provider for device %s is not ready", deviceName)
+	}
+	deviceAffinityProvider.SetDeviceAffinity(deviceTopology)
+	return deviceTopologyProvider.SetDeviceTopology(deviceTopology)
+}
+
+// GetDeviceNUMAAffinity retrieves a map of a certain device to the list of devices that it has an affinity with.
 // A device is considered to have an affinity with another device if they are on the exact same NUMA node(s)
-func (r *DeviceTopologyRegistry) GetDeviceAffinity(key, value string) (map[string][]string, error) {
-	deviceTopologyKey, numaReady, err := r.GetDeviceTopology(key)
+func (r *DeviceTopologyRegistry) GetDeviceNUMAAffinity(deviceA, deviceB string) (map[string][]string, error) {
+	deviceTopologyKey, numaReady, err := r.GetDeviceTopology(deviceA)
 	if err != nil {
-		return nil, fmt.Errorf("error getting device topology for device %s: %v", key, err)
+		return nil, fmt.Errorf("error getting device topology for device %s: %v", deviceA, err)
 	}
 	if !numaReady {
-		return nil, fmt.Errorf("device topology for device %s is not ready", key)
+		return nil, fmt.Errorf("device topology for device %s is not ready", deviceA)
 	}
 
-	deviceTopologyValue, numaReady, err := r.GetDeviceTopology(value)
+	deviceTopologyValue, numaReady, err := r.GetDeviceTopology(deviceB)
 	if err != nil {
-		return nil, fmt.Errorf("error getting device topology for device %s: %v", value, err)
+		return nil, fmt.Errorf("error getting device topology for device %s: %v", deviceB, err)
 	}
 	if !numaReady {
-		return nil, fmt.Errorf("device topology for device %s is not ready", value)
+		return nil, fmt.Errorf("device topology for device %s is not ready", deviceB)
 	}
 
 	deviceAffinity := make(map[string][]string)
@@ -102,39 +121,33 @@ func (r *DeviceTopologyRegistry) GetDeviceAffinity(key, value string) (map[strin
 	return deviceAffinity, nil
 }
 
-// IsNumaNodeAffinity checks if the device with the specified device ID resides in numa nodes that are a subset of the hint nodes.
-func (r *DeviceTopologyRegistry) IsNumaNodeAffinity(deviceName, deviceID string, hintNodes CPUSet) (bool, error) {
-	deviceTopologyKey, numaReady, err := r.GetDeviceTopology(deviceName)
-	if err != nil {
-		return false, fmt.Errorf("error getting device topology for device %s: %v", deviceName, err)
-	}
-	if !numaReady {
-		return false, fmt.Errorf("device topology for device %s is not ready", deviceName)
-	}
+type DeviceTopology struct {
+	Devices map[string]DeviceInfo
+}
 
-	info, ok := deviceTopologyKey.Devices[deviceID]
+func (t *DeviceTopology) GetDeviceAffinityMap(deviceId string) (map[AffinityPriority]DeviceIDs, error) {
+	info, ok := t.Devices[deviceId]
 	if !ok {
-		return false, fmt.Errorf("failed to find device %s in device topology", deviceID)
+		return nil, fmt.Errorf("failed to find device %s in device topology", deviceId)
 	}
-
-	return NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes), nil
-}
-
-type DeviceTopology struct {
-	Devices map[string]DeviceInfo
+	return info.DeviceAffinityMap, nil
 }
 
 type DeviceInfo struct {
-	Health              string
-	NumaNodes           []int
-	DeviceAffinityGroup map[AffinityPriority]AffinityGroupInfo
+	Health    string
+	NumaNodes []int
+	// DeviceAffinityMap is the map of priority level to the other deviceIds that a particular deviceId has an affinity with
+	DeviceAffinityMap map[AffinityPriority]DeviceIDs
 }
 
+// AffinityPriority is the level of affinity between two devices.
+// Priority 0 is the strongest level: devices at this level have the most affinity
+// with one another, so allocating them together has the highest priority.
+// As the value increases, the affinity between devices weakens, and allocating
+// them together carries correspondingly lower priority.
 type AffinityPriority int
 
-type AffinityGroupInfo struct {
-	GroupID int
-}
+type DeviceIDs []string
 
 func (i DeviceInfo) GetNUMANode() []int {
 	if i.NumaNodes == nil {
@@ -212,7 +225,6 @@ func initDeviceTopology(resourceNames []string) (*DeviceTopology, error) {
 			deviceTopology.Devices[id] = DeviceInfo{}
 		}
 	}
-
 	return deviceTopology, nil
 }
diff --git a/pkg/util/machine/device_affinity.go b/pkg/util/machine/device_affinity.go
new file mode 100644
index 0000000000..27dfeb93e8
--- /dev/null
+++ b/pkg/util/machine/device_affinity.go
@@ -0,0 +1,63 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package machine
+
+// DeviceAffinityProvider knows how to form affinity between devices
+type DeviceAffinityProvider interface {
+	// SetDeviceAffinity modifies DeviceTopology by retrieving each device's affinity to other devices
+	SetDeviceAffinity(*DeviceTopology)
+}
+
+type DefaultDeviceAffinityProvider struct{}
+
+var _ DeviceAffinityProvider = (*DefaultDeviceAffinityProvider)(nil)
+
+func (p *DefaultDeviceAffinityProvider) SetDeviceAffinity(*DeviceTopology) {}
+
+type GPUDeviceAffinityProvider struct {
+	// isAffinityFunc is a function to check if 2 devices have affinity to each other
+	isAffinityFunc func(deviceId1, deviceId2 string) bool
+}
+
+func NewGPUDeviceAffinityProvider(isAffinityFunc func(deviceId1, deviceId2 string) bool) DeviceAffinityProvider {
+	return &GPUDeviceAffinityProvider{isAffinityFunc: isAffinityFunc}
+}
+
+// SetDeviceAffinity forms device affinity for the first layer only.
+func (p *GPUDeviceAffinityProvider) SetDeviceAffinity(deviceTopology *DeviceTopology) {
+	for device1, deviceInfo1 := range deviceTopology.Devices {
+		if deviceInfo1.DeviceAffinityMap == nil {
+			deviceInfo1.DeviceAffinityMap = make(map[AffinityPriority]DeviceIDs)
+		}
+
+		for device2 := range deviceTopology.Devices {
+			if device1 == device2 {
+				continue
+			}
+
+			// Both orders of each pair are visited by the nested loops, so a single
+			// directional append per visit keeps the map symmetric without duplicates
+			if p.isAffinityFunc(device1, device2) {
+				deviceInfo1.DeviceAffinityMap[0] = append(deviceInfo1.DeviceAffinityMap[0], device2)
+			}
+		}
+
+		// DeviceInfo is stored in the map by value, so write the updated copy back
+		deviceTopology.Devices[device1] = deviceInfo1
+	}
+}

From 8fb65b33537c3cd69295fe9486ce964ee1b3b712 Mon Sep 17 00:00:00 2001
From: "justin.cheng"
Date: Mon, 27 Oct 2025 10:35:53 +0800
Subject: [PATCH 25/52] feat: implement binding strategy to prioritise device
 affinity during allocation

feat: when device affinity of first priority is unable to decide allocation,
go to next priority to allocate

fix: simplify logic of unallocated devices and change name of field
---
 pkg/agent/qrm-plugins/gpu/baseplugin/base.go  |    6 +-
 .../gpu/resourceplugin/gpumemory/gpu_mem.go   |    2 +-
 .../strategies/deviceaffinity/bind.go         |  421 +++++-
 .../strategies/deviceaffinity/bind_test.go    | 1259 +++++++++++++++++
 .../allocate/strategies/gpu_memory/sort.go    |    2 +-
 pkg/util/machine/device.go                    |   39 +-
 pkg/util/machine/device_affinity.go           |   46 -
 7 files changed, 1673 insertions(+), 102 deletions(-)
 create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go

diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
index 13ec854444..1a751d1119 100644
--- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
+++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go
@@ -158,9 +158,9 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType(
 		}
 
 		deviceTopology.Devices[device.ID] = machine.DeviceInfo{
-			Health:            device.Health,
-			NumaNodes:         numaNode,
-			DeviceAffinityMap: make(map[machine.AffinityPriority]machine.DeviceIDs),
+			Health:         device.Health,
+			NumaNodes:      numaNode,
+			DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs),
 		}
 	}
 
diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
index 4645a02cc6..143f415067 100644
--- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
+++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
@@ -31,7 +31,7 @@ import (
 	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
-	manager "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager"
 	gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
 	"github.com/kubewharf/katalyst-core/pkg/metrics"
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go
index 06a28d92ce..659488c399 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go
@@ -18,16 +18,39 @@ package deviceaffinity
 
 import (
 	"fmt"
+	"sort"
 
+	"github.com/google/uuid"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/utils/strings/slices"
 
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
 	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies"
-	"github.com/kubewharf/katalyst-core/pkg/util/general"
 	"github.com/kubewharf/katalyst-core/pkg/util/machine"
 )
 
+// affinityGroup is a group of devices that have affinity to each other.
+// It is uniquely identified by an id.
+type affinityGroup struct {
+	id                 string
+	unallocatedDevices sets.String
+}
+
+// possibleAllocation describes a potential allocation within an affinity group:
+// unallocatedSize is the number of devices in the group that are still unallocated,
+// and candidateDevices is the set of the group's devices that can potentially be allocated.
+type possibleAllocation struct {
+	unallocatedSize  int
+	candidateDevices sets.String
+}
+
+// allocationByIntersectionResult is the result of allocating devices by maximising intersection size of possible allocations
+// with an affinity group.
+type allocationByIntersectionResult struct {
+	allocatedDevices sets.String
+	availableDevices sets.String
+	finished         bool
+	err              error
+}
+
 // Bind binds the sorted devices to the allocation context by searching for the devices that have affinity to each other.
 func (s *DeviceAffinityStrategy) Bind(
 	ctx *allocate.AllocationContext, sortedDevices []string,
@@ -41,42 +64,19 @@ func (s *DeviceAffinityStrategy) Bind(
 		return &allocate.AllocationResult{
 			Success:      false,
 			ErrorMessage: errMsg,
 		}, fmt.Errorf("%s", errMsg)
 	}
 
 	devicesToAllocate := int(ctx.DeviceReq.DeviceRequest)
+	reusableDevicesSet := sets.NewString(ctx.DeviceReq.ReusableDevices...)
 
-	// When we bind devices, we need to make sure that the devices have affinity to each other
-	allocatedDevices := sets.NewString()
-	allocateDevices := func(devices ...string) (bool, error) {
-		for _, firstDevice := range devices {
-			if devicesToAllocate == allocatedDevices.Len() {
-				return true, nil
-			}
-			if allocatedDevices.Has(firstDevice) {
-				continue
-			}
-			// Get all the devices that have affinity to the device
-			affinityMap, err := ctx.DeviceTopology.GetDeviceAffinityMap(firstDevice)
-			if err != nil {
-				return false, err
-			}
+	// All devices that are passed into the strategy are unallocated devices
+	unallocatedDevicesSet := sets.NewString(sortedDevices...)
 
-			// Loop from the highest priority to the lowest priority
-			for priority := 0; priority < len(affinityMap); priority++ {
-				// Check if the affinity device is in the list of devices to allocate
-				affinityPriorityDevices, ok := affinityMap[machine.AffinityPriority(priority)]
-				if !ok {
-					return false, fmt.Errorf("affinity priority %d not found", priority)
-				}
-				for _, device := range affinityPriorityDevices {
-					if slices.Contains(devices, device) {
-						allocatedDevices.Insert(device)
-					}
-				}
-			}
-		}
-		return false, nil
-	}
+	// Get a map of affinity groups that is grouped by priority
+	affinityMap := ctx.DeviceTopology.GroupDeviceAffinity()
+	affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet)
+
+	idToAffinityGroupMap := s.getAffinityGroupById(affinityGroupByPriority)
 
 	// Allocate reusable devices first
-	finishedAllocation, err := allocateDevices(ctx.DeviceReq.ReusableDevices...)
+	allocatedDevices, err := s.allocateCandidateDevices(reusableDevicesSet, devicesToAllocate, unallocatedDevicesSet, affinityGroupByPriority, sets.NewString())
 	if err != nil {
 		return &allocate.AllocationResult{
 			Success:      false,
@@ -84,37 +84,362 @@ func (s *DeviceAffinityStrategy) Bind(
 			ErrorMessage: fmt.Sprintf("failed to allocate reusable devices: %v", err),
 		}, fmt.Errorf("failed to allocate reusable devices: %v", err)
 	}
 
-	if finishedAllocation {
+	if len(allocatedDevices) == devicesToAllocate {
 		return &allocate.AllocationResult{
 			Success:          true,
 			AllocatedDevices: allocatedDevices.UnsortedList(),
 		}, nil
 	}
 
-	// Then try to bind devices from sorted list
-	finishedAllocation, err = allocateDevices(sortedDevices...)
+	// Find all the affinity group ids that the allocated devices belong to and their respective priorities
+	allocatedAffinityGroupIds := s.findAllAffinityGroupIdsByPriority(allocatedDevices.UnsortedList(), affinityGroupByPriority)
+
+	availableDevicesSet := sets.NewString(sortedDevices...)
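+
+	// From here, allocation proceeds in two further passes: first from available
+	// devices that share an affinity group with what has already been allocated,
+	// then from any remaining available devices.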
+	// Next, allocate from available devices, but try to allocate from same affinity group as the allocated reusable devices
+	if allocatedDevices, err = s.allocateAvailableDevicesWithAffinity(allocatedDevices, availableDevicesSet, unallocatedDevicesSet, devicesToAllocate, allocatedAffinityGroupIds, idToAffinityGroupMap); err != nil {
 		return &allocate.AllocationResult{
 			Success:      false,
-			ErrorMessage: fmt.Sprintf("failed to allocate sorted devices: %v", err),
-		}, fmt.Errorf("failed to allocate sorted devices: %v", err)
+			ErrorMessage: fmt.Sprintf("failed to allocate available devices with affinity: %v", err),
+		}, fmt.Errorf("failed to allocate available devices with affinity: %v", err)
 	}
 
-	if finishedAllocation {
-		general.InfoS("Successfully bound devices",
-			"podNamespace", ctx.ResourceReq.PodNamespace,
-			"podName", ctx.ResourceReq.PodName,
-			"containerName", ctx.ResourceReq.ContainerName,
-			"allocatedDevices", allocatedDevices.List())
-
+	// Return result once we have allocated all the devices
+	if len(allocatedDevices) == devicesToAllocate {
 		return &allocate.AllocationResult{
+			Success:          true,
 			AllocatedDevices: allocatedDevices.UnsortedList(),
-			Success:          true,
 		}, nil
 	}
 
+	// Lastly, allocate the rest of the devices from available devices
+	if allocatedDevices, err = s.allocateCandidateDevices(availableDevicesSet, devicesToAllocate, unallocatedDevicesSet, affinityGroupByPriority, allocatedDevices); err != nil {
+		return &allocate.AllocationResult{
+			Success:      false,
+			ErrorMessage: fmt.Sprintf("failed to allocate available devices: %v", err),
+		}, fmt.Errorf("failed to allocate available devices: %v", err)
+	}
+
+	// Return result once we have allocated all the devices
+	if len(allocatedDevices) == devicesToAllocate {
+		return &allocate.AllocationResult{
+			Success:          true,
+			AllocatedDevices: allocatedDevices.UnsortedList(),
+		}, nil
+	}
+
 	return &allocate.AllocationResult{
 		Success:      false,
-		ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)),
-	}, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices))
+		ErrorMessage: fmt.Sprintf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)),
+	}, fmt.Errorf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices))
+}
+
+// getAffinityGroupsByPriority forms a map of affinityGroup by priority.
+func (s *DeviceAffinityStrategy) getAffinityGroupsByPriority(
+	affinityMap map[machine.AffinityPriority][]machine.DeviceIDs, unallocatedDevicesSet sets.String,
+) map[machine.AffinityPriority][]affinityGroup {
+	affinityGroupsMap := make(map[machine.AffinityPriority][]affinityGroup)
+	for priority, affinityDevices := range affinityMap {
+		affinityGroupsMap[priority] = s.getAffinityGroups(affinityDevices, unallocatedDevicesSet)
+	}
+	return affinityGroupsMap
+}
+
+// getAffinityGroups forms a list of affinityGroup with unallocated devices.
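+//
+// For example (hypothetical IDs): given affinityDevices [["gpu-1", "gpu-2"], ["gpu-3", "gpu-4"]]
+// and an unallocated set of {gpu-1, gpu-2, gpu-3}, the resulting groups hold
+// {gpu-1, gpu-2} and {gpu-3}, each tagged with a freshly generated id.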
+func (s *DeviceAffinityStrategy) getAffinityGroups(
+	affinityDevices []machine.DeviceIDs, unallocatedDevicesSet sets.String,
+) []affinityGroup {
+	affinityGroups := make([]affinityGroup, 0, len(affinityDevices))
+
+	// Calculate the number of unallocated devices for each affinity group
+	for _, devices := range affinityDevices {
+		unallocatedDevices := make(machine.DeviceIDs, 0)
+		for _, device := range devices {
+			if unallocatedDevicesSet.Has(device) {
+				unallocatedDevices = append(unallocatedDevices, device)
+			}
+		}
+		affinityGroups = append(affinityGroups, affinityGroup{
+			unallocatedDevices: sets.NewString(unallocatedDevices...),
+			id:                 uuid.NewString(),
+		})
+	}
+
+	return affinityGroups
+}
+
+func (s *DeviceAffinityStrategy) getAffinityGroupById(affinityGroupByPriority map[machine.AffinityPriority][]affinityGroup) map[string]affinityGroup {
+	idToAffinityGroupMap := make(map[string]affinityGroup)
+	for _, groups := range affinityGroupByPriority {
+		for _, group := range groups {
+			idToAffinityGroupMap[group.id] = group
+		}
+	}
+	return idToAffinityGroupMap
+}
+
+// allocateCandidateDevices finds the best allocation given some candidate devices by finding those devices with the best affinity to each other.
+func (s *DeviceAffinityStrategy) allocateCandidateDevices(
+	candidateDevicesSet sets.String, devicesToAllocate int, unallocatedDevices sets.String,
+	affinityMap map[machine.AffinityPriority][]affinityGroup, allocatedDevices sets.String,
+) (sets.String, error) {
+	// Retrieve the available candidate devices by getting the intersection of candidate devices and unallocated devices.
+	// Devices that are already allocated should be excluded.
+	availableDevicesSet := unallocatedDevices.Intersection(candidateDevicesSet).Difference(allocatedDevices)
+
+	// If the number of available candidate devices is less than or equal to the remaining request, we need to allocate all of them
+	remainingQuantity := devicesToAllocate - len(allocatedDevices)
+	if availableDevicesSet.Len() <= remainingQuantity {
+		return allocatedDevices.Union(availableDevicesSet), nil
+	}
+
+	// Otherwise, we need to allocate these devices by their affinity
+	for priority := 0; priority < len(affinityMap); priority++ {
+		groups, ok := affinityMap[machine.AffinityPriority(priority)]
+		if !ok {
+			return nil, fmt.Errorf("affinity priority %v not found", priority)
+		}
+
+		intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices)
+
+		allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, availableDevicesSet, devicesToAllocate, priority == len(affinityMap)-1)
+		if allocateByIntersectionRes.err != nil {
+			return nil, allocateByIntersectionRes.err
+		}
+
+		if allocateByIntersectionRes.finished {
+			return allocateByIntersectionRes.allocatedDevices, nil
+		}
+
+		allocatedDevices = allocateByIntersectionRes.allocatedDevices
+		availableDevicesSet = allocateByIntersectionRes.availableDevices
+	}
+
+	return allocatedDevices, nil
+}
+
+// allocateAvailableDevicesWithAffinity allocates devices from a set of available devices by trying to find the best device affinity
+// to some already allocated reusable devices.
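+//
+// Sketch of the intent (hypothetical IDs): if "gpu-1" was just allocated out of the
+// priority-0 group {gpu-1, gpu-2}, this pass prefers "gpu-2" over devices from
+// groups that contain no allocated members.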
+func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity(
+	allocatedDevices, availableDevices, unallocatedDevices sets.String, devicesToAllocate int,
+	allocatedAffinityGroupIds map[machine.AffinityPriority]sets.String, idToAffinityGroupMap map[string]affinityGroup,
+) (sets.String, error) {
+	// Unallocated available devices are retrieved by getting an intersection of total unallocated devices and the available devices to be allocated.
+	unallocatedAvailableDevices := unallocatedDevices.Intersection(availableDevices)
+
+	// From the highest priority to the lowest priority, get the group IDs of the devices that are already allocated
+	// and try to allocate from those groups.
+	for priority := 0; priority < len(allocatedAffinityGroupIds); priority++ {
+		groupIDs, ok := allocatedAffinityGroupIds[machine.AffinityPriority(priority)]
+		if !ok {
+			return nil, fmt.Errorf("allocated affinity group ids in priority level %v not found", priority)
+		}
+
+		intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation)
+
+		for groupID := range groupIDs {
+			// Get affinity group
+			group, ok := idToAffinityGroupMap[groupID]
+			if !ok {
+				return nil, fmt.Errorf("affinity group %v not found", groupID)
+			}
+
+			deviceIntersection := group.unallocatedDevices.Intersection(unallocatedAvailableDevices)
+			if _, ok = intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok {
+				intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0)
+			}
+
+			intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{
+				// The number of unallocated devices in the group is retrieved by taking a difference between
+				// the unallocated devices in the group and the already allocated devices
+				unallocatedSize:  group.unallocatedDevices.Difference(allocatedDevices).Len(),
+				candidateDevices: deviceIntersection,
+			})
+		}
+
+		allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, unallocatedAvailableDevices, devicesToAllocate, priority == len(allocatedAffinityGroupIds)-1)
+		if allocateByIntersectionRes.err != nil {
+			return nil, allocateByIntersectionRes.err
+		}
+
+		if allocateByIntersectionRes.finished {
+			return allocateByIntersectionRes.allocatedDevices, nil
+		}
+
+		allocatedDevices = allocateByIntersectionRes.allocatedDevices
+		unallocatedAvailableDevices = allocateByIntersectionRes.availableDevices
+	}
+
+	return allocatedDevices, nil
+}
+
+// mergePossibleAllocationsAndSort merges the possible allocations by their unallocated size and sorts them in ascending order of their unallocated size.
+func (s *DeviceAffinityStrategy) mergePossibleAllocationsAndSort(possibleAllocations []possibleAllocation) []possibleAllocation {
+	merged := make(map[int]sets.String)
+	for _, alloc := range possibleAllocations {
+		if _, ok := merged[alloc.unallocatedSize]; !ok {
+			merged[alloc.unallocatedSize] = sets.NewString()
+		}
+		merged[alloc.unallocatedSize].Insert(alloc.candidateDevices.UnsortedList()...)
+	}
+
+	mergedAllocations := make([]possibleAllocation, 0, len(merged))
+	for unallocatedSize, intersected := range merged {
+		mergedAllocations = append(mergedAllocations, possibleAllocation{
+			unallocatedSize:  unallocatedSize,
+			candidateDevices: intersected,
+		})
+	}
+
+	// Sort possible allocations by their unallocated size in ascending order
+	// To support bin-packing, we prioritize allocation of devices in groups that have other allocated devices.
+	sort.Slice(mergedAllocations, func(i, j int) bool {
+		return mergedAllocations[i].unallocatedSize < mergedAllocations[j].unallocatedSize
+	})
+
+	return mergedAllocations
+}
+
+func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap(
+	groups []affinityGroup, availableDevicesSet, allocatedDevices sets.String,
+) map[int][]possibleAllocation {
+	intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation)
+	for _, group := range groups {
+		// Find the intersection of the affinity group and the available candidate devices
+		deviceIntersection := group.unallocatedDevices.Intersection(availableDevicesSet)
+		if _, ok := intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok {
+			intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0)
+		}
+		intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{
+			// The number of unallocated devices in the group is retrieved by taking a difference between
+			// the unallocated devices in the group and the already allocated devices
+			unallocatedSize:  group.unallocatedDevices.Difference(allocatedDevices).Len(),
+			candidateDevices: deviceIntersection,
+		})
+	}
+
+	return intersectionToPossibleAllocationsMap
+}
+
+// allocateByIntersection allocates devices by the following algorithm
+// 1. Sort the intersection sizes of possible allocations in descending order; we want to allocate devices with a larger intersection size with an affinity group.
+// 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order; this maximises
+// bin-packing (try to fill up an affinity group that already has allocated devices).
+// 3. For each intersection size, allocate devices in the order of the sorted possible allocations.
+// 4. If a possible allocation has more intersected devices than the devices needed for allocation, we go to the next priority and try to find an allocation from there.
+// 5. If we are currently at the last affinity priority level, we go through the other possible allocations (that are in sorted ascending order of number of unallocated devices)
+// to fill up the remaining devices.
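+//
+// A worked example under assumed inputs: with 2 devices still needed and two
+// priority-0 groups whose candidate intersections are {gpu-2, gpu-3} (size 2) and
+// {gpu-6} (size 1), the size-2 bucket is visited first; it fits the demand exactly,
+// so gpu-2 and gpu-3 are taken and the size-1 bucket is never reached.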
+func (s *DeviceAffinityStrategy) allocateByIntersection(
+	intersectionToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String,
+	unallocatedAvailableDevices sets.String, devicesToAllocate int, isLastPriority bool,
+) allocationByIntersectionResult {
+	// Sort the intersection sizes of possible allocations in descending order
+	intersectionSizes := make([]int, 0, len(intersectionToPossibleAllocationsMap))
+	for intersectionSize := range intersectionToPossibleAllocationsMap {
+		intersectionSizes = append(intersectionSizes, intersectionSize)
+	}
+
+	sort.Slice(intersectionSizes, func(i, j int) bool {
+		return intersectionSizes[i] > intersectionSizes[j]
+	})
+
+	if len(intersectionToPossibleAllocationsMap) > 0 {
+		maxIntersection := intersectionSizes[0]
+		possibleAllocations, ok := intersectionToPossibleAllocationsMap[maxIntersection]
+		if !ok {
+			return allocationByIntersectionResult{
+				finished: false,
+				err:      fmt.Errorf("possible allocations of intersection size %v not found", maxIntersection),
+			}
+		}
+
+		mergedPossibleAllocations := s.mergePossibleAllocationsAndSort(possibleAllocations)
+
+		for _, possibleAlloc := range mergedPossibleAllocations {
+			// If the possible allocation holds more devices than are still needed, and this is not
+			// the last priority level, go to the next priority level and try to allocate there
+			if !isLastPriority && possibleAlloc.candidateDevices.Len() > devicesToAllocate-allocatedDevices.Len() {
+				return allocationByIntersectionResult{
+					finished:         false,
+					err:              nil,
+					allocatedDevices: allocatedDevices,
+					availableDevices: unallocatedAvailableDevices,
+				}
+			}
+
+			for device := range possibleAlloc.candidateDevices {
+				allocatedDevices.Insert(device)
+				unallocatedAvailableDevices.Delete(device)
+				if allocatedDevices.Len() == devicesToAllocate {
+					return allocationByIntersectionResult{
+						finished:         true,
+						err:              nil,
+						allocatedDevices: allocatedDevices,
+						availableDevices: unallocatedAvailableDevices,
+					}
+				}
+			}
+		}
+
+		// At the last priority level, if we have not finished allocating the candidate devices,
+		// we go through the possible allocations of the remaining intersection sizes
+		if isLastPriority {
+			for _, intersectionSize := range intersectionSizes[1:] {
+				possibleAllocations, ok = intersectionToPossibleAllocationsMap[intersectionSize]
+				if !ok {
+					return allocationByIntersectionResult{
+						finished: false,
+						err:      fmt.Errorf("possible device allocation of intersection size %v not found", intersectionSize),
+					}
+				}
+
+				// Sort possible allocations by their unallocated size in ascending order
+				sort.Slice(possibleAllocations, func(i, j int) bool {
+					return possibleAllocations[i].unallocatedSize < possibleAllocations[j].unallocatedSize
+				})
+
+				for _, possibleAlloc := range possibleAllocations {
+					for device := range possibleAlloc.candidateDevices {
+						allocatedDevices.Insert(device)
+						unallocatedAvailableDevices.Delete(device)
+						if allocatedDevices.Len() == devicesToAllocate {
+							return allocationByIntersectionResult{
+								finished:         true,
+								err:              nil,
+								allocatedDevices: allocatedDevices,
+								availableDevices: unallocatedAvailableDevices,
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	return allocationByIntersectionResult{
+		finished:         false,
+		err:              nil,
+		allocatedDevices: allocatedDevices,
+		availableDevices: unallocatedAvailableDevices,
+	}
+}
+
+// findAllAffinityGroupIdsByPriority finds the affinity group ids of the allocated devices by affinity priority level.
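+//
+// For instance (hypothetical IDs): if allocated device "gpu-2" sits in the
+// priority-0 group {gpu-1, gpu-2} and in the priority-1 group {gpu-1, gpu-2, gpu-3, gpu-4},
+// the result maps priority 0 and priority 1 to those groups' ids respectively.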
+func (s *DeviceAffinityStrategy) findAllAffinityGroupIdsByPriority(
+	allocatedDevices []string, affinityMap map[machine.AffinityPriority][]affinityGroup,
+) map[machine.AffinityPriority]sets.String {
+	affinityGroupIds := make(map[machine.AffinityPriority]sets.String)
+	for _, device := range allocatedDevices {
+		for priority, groups := range affinityMap {
+			for _, group := range groups {
+				if group.unallocatedDevices.Has(device) {
+					if _, ok := affinityGroupIds[priority]; !ok {
+						affinityGroupIds[priority] = sets.NewString()
+					}
+					affinityGroupIds[priority].Insert(group.id)
+				}
+			}
+		}
+	}
+	return affinityGroupIds
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go
new file mode 100644
index 0000000000..e86bbc8268
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go
@@ -0,0 +1,1259 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package deviceaffinity
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	v1 "k8s.io/api/core/v1"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+)
+
+func TestBind(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name           string
+		ctx            *allocate.AllocationContext
+		sortedDevices  []string
+		expectedResult *allocate.AllocationResult
+		expectedErr    bool
+	}{
+		{
+			name: "allocate all reusable devices first",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-2"},
+					DeviceRequest:   2,
+				},
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-3"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"},
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-1", "gpu-2"},
+				Success:          true,
+			},
+		},
+		{
+			name: "allocate the reusable devices with the best affinity to each other first",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-4"},
+					DeviceRequest:   2, // should allocate gpu-3 and gpu-4
+				},
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-3"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4", "gpu-5"},
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-3", "gpu-4"},
+				Success:          true,
+			},
+		},
+		{
+			name: "supports bin-packing of 1 allocated device",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-3"}, // gpu-4 is already allocated, so we should allocate gpu-3 to support bin-packing
+					DeviceRequest:   1,
+				},
+				// Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4]
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-3"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3"},
+			// gpu-4 is already allocated, so we allocate gpu-3 for bin-packing
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-3"},
+				Success:          true,
+			},
+		},
+		{
+			name: "supports bin-packing of 2 allocated devices",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-5", "gpu-6"}, // gpu-5 and gpu-6 should be allocated because gpu-7 is already allocated and this supports bin-packing
+					DeviceRequest:   2,
+				},
+				// Level 0: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8]
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-2", "gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-2", "gpu-3"},
+							},
+						},
+						"gpu-5": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-6", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-6": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-7": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-6", "gpu-8"},
+							},
+						},
+						"gpu-8": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-6", "gpu-7"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"},
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-5", "gpu-6"},
+				Success:          true,
+			},
+		},
+		{
+			name: "bin-packing of groups with more allocated devices is preferred over groups with fewer",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7", "gpu-9", "gpu-10"},
+					DeviceRequest:   4,
+				},
+				// Level 0: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12]
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-2", "gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1", "gpu-2", "gpu-3"},
+							},
+						},
+						"gpu-5": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-6", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-6": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-7": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-6", "gpu-8"},
+							},
+						},
+						"gpu-8": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5", "gpu-6", "gpu-7"},
+							},
+						},
+						"gpu-9": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-10", "gpu-11", "gpu-12"},
+							},
+						},
+						"gpu-10": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-9", "gpu-11", "gpu-12"},
+							},
+						},
+						"gpu-11": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-9", "gpu-10", "gpu-12"},
+							},
+						},
+						"gpu-12": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-9", "gpu-10", "gpu-11"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"},
+			// [gpu-1, gpu-2, gpu-3, gpu-4] already has 2 allocated devices, [gpu-5, gpu-6, gpu-7, gpu-8] has 1 allocated device, [gpu-9, gpu-10, gpu-11, gpu-12] has no allocated devices
+			// To support bin-packing, we will allocate gpu-2 and gpu-3 first, then allocate gpu-6 and gpu-7
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7"},
+				Success:          true,
+			},
+		},
+		{
+			name: "finds first level of device affinity, then finds second level of device affinity",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7"}, // gpu-1 and gpu-2 have affinity in 1st level, gpu-5 and gpu-7 have affinity in 2nd level
+					DeviceRequest:   4,
+				},
+				// Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8]
+				// Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8]
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2"},
+								1: {"gpu-2", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1"},
+								1: {"gpu-1", "gpu-3", "gpu-4"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-4"},
+								1: {"gpu-1", "gpu-2", "gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-3"},
+								1: {"gpu-1", "gpu-2", "gpu-3"},
+							},
+						},
+						"gpu-5": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-6"},
+								1: {"gpu-6", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-6": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5"},
+								1: {"gpu-5", "gpu-7", "gpu-8"},
+							},
+						},
+						"gpu-7": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-8"},
+								1: {"gpu-5", "gpu-6", "gpu-8"},
+							},
+						},
+						"gpu-8": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-7"},
+								1: {"gpu-5", "gpu-6", "gpu-7"},
+							},
+						},
+					},
+				},
+			},
+			sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"},
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-7"},
+				Success:          true,
+			},
+		},
+		{
+			name: "allocate reusable devices first, then allocate available devices with affinity to the allocated reusable devices",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
+					PodUid:        "pod-1",
+					ContainerName: "container-1",
+				},
+				DeviceReq: &pluginapi.DeviceRequest{
+					DeviceName:      "gpu",
+					ReusableDevices: []string{"gpu-1", "gpu-5"},
+					DeviceRequest:   4,
+				},
+				DeviceTopology: &machine.DeviceTopology{
+					Devices: map[string]machine.DeviceInfo{
+						"gpu-1": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-2"},
+							},
+						},
+						"gpu-2": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-1"},
+							},
+						},
+						"gpu-3": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-4"},
+							},
+						},
+						"gpu-4": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-3"},
+							},
+						},
+						"gpu-5": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-6"},
+							},
+						},
+						"gpu-6": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-5"},
+							},
+						},
+						"gpu-7": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-8"},
+							},
+						},
+						"gpu-8": {
+							DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+								0: {"gpu-7"},
+							},
+						},
+					},
+				},
+				MachineState: map[v1.ResourceName]state.AllocationMap{
+					"gpu": {
+						"gpu-1": {},
+						"gpu-2": {},
+						"gpu-3": {},
+						"gpu-4": {},
+						"gpu-5": {},
+						"gpu-6": {},
+						"gpu-7": {},
+						"gpu-8": {},
+					},
+				},
+			},
+			// gpu-1 and gpu-5 are already allocated as they are reusable
+			// gpu-2 and gpu-6 should be allocated as they have affinity to the already allocated gpu-1 and gpu-5
+			sortedDevices: []string{"gpu-1", "gpu-5", "gpu-2", "gpu-6", "gpu-7", "gpu-8", "gpu-3", "gpu-4"},
+			expectedResult: &allocate.AllocationResult{
+				AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6"},
+				Success:          true,
+			},
+		},
+		{
+			name: "after allocating available devices with affinity to reusable devices, still not enough devices, allocate more available devices",
+			ctx: &allocate.AllocationContext{
+				ResourceReq: &pluginapi.ResourceRequest{
PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5"}, + DeviceRequest: 6, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + // gpu-1 and gpu-5 are allocated because they are reusable devices + // gpu-2 and gpu-6 should be allocated as they have affinity to the already allocated gpu-1 and gpu-5 + // gpu-3 and gpu-4 should be allocated as they have affinity to each other + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "allocation of reusable devices in descending order of intersection size", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-9"}, + DeviceRequest: 6, + }, + // 1 level: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", 
"gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-9"}, + // Should allocate gpu-1, gpu-2, gpu-3 and gpu-4 first because they have the most intersection size with an affinity group + // Followed by gpu-5 and gpu-6 as they have the second most intersection size with an affinity group + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "allocation of available devices in descending order of intersection size after allocating all reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5", "gpu-9", "gpu-10"}, + DeviceRequest: 8, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11"}, + // gpu-6 and gpu-12 is allocated + // Allocate gpu-1, gpu-5, gpu-9 and gpu-10 first because they are reusable devices + // Allocate gpu-2, gpu-3 and gpu-4 next because they have the most intersection with an affinity group + // Between (gpu-7, gpu-8) and (gpu-10, gpu-11), allocate gpu-10 and gpu-11 because the affinity group has fewer unallocated devices (bin-packing). 
+ expectedResult: &allocate.AllocationResult{
+ AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-9", "gpu-10", "gpu-11"},
+ Success: true,
+ },
+ },
+ {
+ name: "allocate available devices that have affinity with reusable devices from highest to lowest priority",
+ ctx: &allocate.AllocationContext{
+ ResourceReq: &pluginapi.ResourceRequest{
+ PodUid: "pod-1",
+ ContainerName: "container-1",
+ },
+ DeviceReq: &pluginapi.DeviceRequest{
+ DeviceName: "gpu",
+ ReusableDevices: []string{"gpu-1", "gpu-5"},
+ DeviceRequest: 6,
+ },
+ // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8], [gpu-9, gpu-10], [gpu-11, gpu-12]
+ // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12]
+ DeviceTopology: &machine.DeviceTopology{
+ Devices: map[string]machine.DeviceInfo{
+ "gpu-1": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-2"},
+ 1: {"gpu-2", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-2": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-1"},
+ 1: {"gpu-1", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-3": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-4"},
+ 1: {"gpu-1", "gpu-2", "gpu-4"},
+ },
+ },
+ "gpu-4": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-3"},
+ 1: {"gpu-1", "gpu-2", "gpu-3"},
+ },
+ },
+ "gpu-5": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-6"},
+ 1: {"gpu-6", "gpu-7", "gpu-8"},
+ },
+ },
+ "gpu-6": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-5"},
+ 1: {"gpu-5", "gpu-7", "gpu-8"},
+ },
+ },
+ "gpu-7": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-8"},
+ 1: {"gpu-5", "gpu-6", "gpu-8"},
+ },
+ },
+ "gpu-8": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-7"},
+ 1: {"gpu-5", "gpu-6", "gpu-7"},
+ },
+ },
+ "gpu-9": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-10"},
+ 1: {"gpu-10", "gpu-11", "gpu-12"},
+ },
+ },
+ "gpu-10": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-9"},
+ 1: {"gpu-9", "gpu-11", "gpu-12"},
+ },
+ },
+ "gpu-11": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-12"},
+ 1: {"gpu-9", "gpu-10", "gpu-12"},
+ },
+ },
+ "gpu-12": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-11"},
+ 1: {"gpu-9", "gpu-10", "gpu-11"},
+ },
+ },
+ },
+ },
+ MachineState: map[v1.ResourceName]state.AllocationMap{
+ "gpu": {
+ "gpu-1": {},
+ "gpu-2": {},
+ "gpu-3": {},
+ "gpu-4": {},
+ "gpu-5": {},
+ "gpu-6": {},
+ "gpu-7": {},
+ "gpu-8": {},
+ },
+ },
+ },
+ sortedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"},
+ // Allocate gpu-1 and gpu-5 first because they are reusable devices
+ // Allocate gpu-2 and gpu-6 next because they have affinity with gpu-1 and gpu-5 at the highest affinity priority (level 0)
+ // Allocate gpu-7 and gpu-8 next because they have affinity with gpu-5 at the next highest affinity priority (level 1)
+ expectedResult: &allocate.AllocationResult{
+ AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"},
+ Success: true,
+ },
+ },
+ {
+ name: "allocation of odd number of devices",
+ ctx: &allocate.AllocationContext{
+ ResourceReq: &pluginapi.ResourceRequest{
+ PodUid: "pod-1",
+ ContainerName: "container-1",
+ },
+ DeviceReq:
&pluginapi.DeviceRequest{
+ DeviceName: "gpu",
+ ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-5"},
+ DeviceRequest: 5,
+ },
+ // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8]
+ // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8]
+ DeviceTopology: &machine.DeviceTopology{
+ Devices: map[string]machine.DeviceInfo{
+ "gpu-1": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-2"},
+ 1: {"gpu-2", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-2": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-1"},
+ 1: {"gpu-1", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-3": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-4"},
+ 1: {"gpu-1", "gpu-2", "gpu-4"},
+ },
+ },
+ "gpu-4": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-3"},
+ 1: {"gpu-1", "gpu-2", "gpu-3"},
+ },
+ },
+ "gpu-5": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-6"},
+ 1: {"gpu-6", "gpu-7", "gpu-8"},
+ },
+ },
+ "gpu-6": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-5"},
+ 1: {"gpu-5", "gpu-7", "gpu-8"},
+ },
+ },
+ "gpu-7": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-8"},
+ 1: {"gpu-5", "gpu-6", "gpu-8"},
+ },
+ },
+ "gpu-8": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-7"},
+ 1: {"gpu-5", "gpu-6", "gpu-7"},
+ },
+ },
+ },
+ },
+ },
+ sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"},
+ // Allocate gpu-1, gpu-3 and gpu-5 first because they are reusable devices
+ // Allocate gpu-2 and gpu-4 next because they have affinity with gpu-1 and gpu-3 at the highest affinity priority (level 0)
+ expectedResult: &allocate.AllocationResult{
+ AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"},
+ Success: true,
+ },
+ },
+ {
+ name: "allocation of available devices for 1st level of affinity priority if there are no reusable devices",
+ ctx: &allocate.AllocationContext{
+ ResourceReq: &pluginapi.ResourceRequest{
+ PodUid: "pod-1",
+ ContainerName: "container-1",
+ },
+ DeviceReq: &pluginapi.DeviceRequest{
+ DeviceName: "gpu",
+ ReusableDevices: []string{},
+ DeviceRequest: 2,
+ },
+ DeviceTopology: &machine.DeviceTopology{
+ Devices: map[string]machine.DeviceInfo{
+ "gpu-1": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-2"},
+ 1: {"gpu-2", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-2": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-1"},
+ 1: {"gpu-1", "gpu-3", "gpu-4"},
+ },
+ },
+ "gpu-3": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-4"},
+ 1: {"gpu-1", "gpu-2", "gpu-4"},
+ },
+ },
+ "gpu-4": {
+ DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{
+ 0: {"gpu-3"},
+ 1: {"gpu-1", "gpu-2", "gpu-3"},
+ },
+ },
+ },
+ },
+ },
+ sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4"},
+ // No reusable devices to allocate.
+ // Allocate gpu-3 and gpu-4 because they have level-0 affinity to each other and are both available (gpu-2 is not in the sorted devices).
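
With no reusable devices, the selection walks the level-0 affinity groups and takes the first group whose still-available members cover the request, which is why the expected result below is gpu-3/gpu-4 rather than the gpu-1/gpu-2 pair that lost gpu-2. A minimal sketch of that walk (the allocateFromGroups helper is hypothetical, not code from this patch):

    package main

    import "fmt"

    // allocateFromGroups returns the free members of the first affinity group
    // that can satisfy the request on its own; a fully intact group beats a
    // group that has already lost a member to another allocation.
    func allocateFromGroups(groups [][]string, available map[string]bool, want int) []string {
        for _, g := range groups {
            var free []string
            for _, id := range g {
                if available[id] {
                    free = append(free, id)
                }
            }
            if len(free) >= want {
                return free[:want]
            }
        }
        return nil
    }

    func main() {
        groups := [][]string{{"gpu-1", "gpu-2"}, {"gpu-3", "gpu-4"}}
        available := map[string]bool{"gpu-1": true, "gpu-3": true, "gpu-4": true}
        fmt.Println(allocateFromGroups(groups, available, 2)) // [gpu-3 gpu-4]
    }
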
+ expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3", "gpu-4"}, + Success: true, + }, + }, + { + name: "when first priority level is not able to determine an allocation, go to the next priority level to allocate", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + // All of the above devices are reusable devices + // At first priority level, they all give an intersection size of 2, but there are 6 of them, and we only need 4 of them + // Since there is another priority level, we go to that priority level to get the best device affinity + // (gpu-1 and gpu-2), (gpu-5 and gpu-6), (gpu-7, gpu-8) are affinity groups at priority level 0 + // But (gpu-5, gpu-6, gpu-7, gpu-8) are affinity groups at priority level 1, so we allocate gpu-5, gpu-6, gpu-7, gpu-8 + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + Success: true, + }, + }, + { + name: "reusable devices are bin-packed, so we allocate the remaining available devices without considering affinity with the reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-4"}, + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + // gpu-1 and gpu-4 are allocated because they are reusable devices. + // gpu-2 and gpu-3 are already allocated so we cannot find any affinity to the reusable devices. + // allocate gpu-7 and gpu-8 because they have affinity to one another. + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-4", "gpu-7", "gpu-8"}, + Success: true, + }, + }, + { + name: "insufficient devices to allocate, causing an error", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2"}, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + // gpu-3 and gpu-4 are already allocated + // gpu-1 and gpu-2 are not enough to satisfy the request + expectedResult: &allocate.AllocationResult{ + Success: false, + }, + expectedErr: true, + }, + { + name: "if there is another priority level, allocate only the max intersection and then go to the next priority level and allocate", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5", "gpu-6", "gpu-7"}, + DeviceRequest: 3, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + 
"gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + "gpu": { + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + "gpu-5": {}, + "gpu-6": {}, + "gpu-7": {}, + "gpu-8": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-5", "gpu-6", "gpu-7"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-5", "gpu-6", "gpu-7"}, + Success: true, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + deviceBindingStrategy := NewDeviceAffinityStrategy() + result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) + if (err != nil) != tt.expectedErr { + t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) + } + verifyAllocationResult(t, result, tt.expectedResult) + }) + } +} + +func verifyAllocationResult( + t *testing.T, result *allocate.AllocationResult, expectedResult *allocate.AllocationResult, +) { + if (result == nil) != (expectedResult == nil) { + t.Errorf("result = %v, expectedResult = %v", result, expectedResult) + return + } + if result.Success != expectedResult.Success { + t.Errorf("result.Success = %v, expectedResult.Success = %v", result.Success, expectedResult.Success) + return + } + if len(result.AllocatedDevices) != len(expectedResult.AllocatedDevices) { + t.Errorf("result.AllocatedDevices = %v, expectedResult.AllocatedDevices = %v", result.AllocatedDevices, expectedResult.AllocatedDevices) + return + } + if diff := cmp.Diff(result.AllocatedDevices, expectedResult.AllocatedDevices, + cmpopts.SortSlices(func(a, b string) bool { return a < b }), + ); diff != "" { + t.Errorf("Bind() mismatch (-got +want):\n%s", diff) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go index 7c1c0d2057..9a7dea90df 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go @@ -28,7 +28,7 @@ import ( ) // Sort sorts the filtered GPU devices based on available GPU memory -// It prioritizes devices with more available memory and considers NUMA affinity +// It prioritizes devices with less available memory and considers NUMA affinity func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) { if ctx.DeviceTopology == nil { return nil, fmt.Errorf("GPU topology is nil") diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index 3d80faa9a8..8a0c31c37c 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -18,9 +18,11 @@ package machine import ( "fmt" + "sort" "sync" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/strings/slices" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/native" @@ -125,19 +127,50 @@ type DeviceTopology struct { Devices map[string]DeviceInfo } +// GroupDeviceAffinity forms a topology graph such that all groups of DeviceIDs within a certain affinity priority level +func (t *DeviceTopology) GroupDeviceAffinity() map[AffinityPriority][]DeviceIDs { + deviceAffinityGroup := make(map[AffinityPriority][]DeviceIDs) + for deviceId, 
deviceInfo := range t.Devices {
+ for priority, affinityDeviceIDs := range deviceInfo.DeviceAffinity {
+ // Copy before appending so the slice stored in DeviceAffinity is never mutated
+ group := append(append(DeviceIDs{}, affinityDeviceIDs...), deviceId)
+ // Sort the strings for easier deduplication
+ sort.Strings(group)
+ if _, ok := deviceAffinityGroup[priority]; !ok {
+ deviceAffinityGroup[priority] = make([]DeviceIDs, 0)
+ }
+
+ // Add the group to the priority level if it is not already there
+ if !containsGroup(deviceAffinityGroup[priority], group) {
+ deviceAffinityGroup[priority] = append(deviceAffinityGroup[priority], group)
+ }
+ }
+ }
+ return deviceAffinityGroup
+}
+
+func containsGroup(groups []DeviceIDs, candidate DeviceIDs) bool {
+ for _, g := range groups {
+ if slices.Equal(g, candidate) {
+ return true
+ }
+ }
+ return false
+}
+
 func (t *DeviceTopology) GetDeviceAffinityMap(deviceId string) (map[AffinityPriority]DeviceIDs, error) {
 info, ok := t.Devices[deviceId]
 if !ok {
 return nil, fmt.Errorf("failed to find device %s in device topology", deviceId)
 }
- return info.DeviceAffinityMap, nil
+ return info.DeviceAffinity, nil
 }
 type DeviceInfo struct {
 Health string
 NumaNodes []int
- // DeviceAffinityMap is the map of priority level to the other deviceIds that a particular deviceId has an affinity with
- DeviceAffinityMap map[AffinityPriority]DeviceIDs
+ // DeviceAffinity is the map of priority level to the other deviceIds that a particular deviceId has an affinity with
+ DeviceAffinity map[AffinityPriority]DeviceIDs
 }
 // AffinityPriority is the level of affinity that a deviceId has with another deviceId.
diff --git a/pkg/util/machine/device_affinity.go b/pkg/util/machine/device_affinity.go
index 27dfeb93e8..5a5fc0f222 100644
--- a/pkg/util/machine/device_affinity.go
+++ b/pkg/util/machine/device_affinity.go
@@ -21,49 +21,3 @@ type DeviceAffinityProvider interface {
 // SetDeviceAffinity modifies DeviceTopology by retrieving each device's affinity to other devices
 SetDeviceAffinity(*DeviceTopology)
 }
-
-type DefaultDeviceAffinityProvider struct{}
-
-var _ DeviceAffinityProvider = (*DefaultDeviceAffinityProvider)(nil)
-
-func (p *DefaultDeviceAffinityProvider) SetDeviceAffinity(*DeviceTopology) {}
-
-type GPUDeviceAffinityProvider struct {
- // isAffinityFunc is a function to check if 2 devices have affinity to each other
- isAffinityFunc func(deviceId1, deviceId2 string) bool
-}
-
-func NewGPUDeviceAffinityProvider(isAffinityFunc func(deviceId1, deviceId2 string) bool) DeviceAffinityProvider {
- return &GPUDeviceAffinityProvider{isAffinityFunc: isAffinityFunc}
-}
-
-// SetDeviceAffinity forms device affinity for the first layer only.
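
A usage sketch for the GroupDeviceAffinity helper added in the device.go hunk above (assuming the machine package types compile exactly as shown in this patch; the printed form is Go's default map formatting): symmetric per-device affinity entries collapse into one sorted, deduplicated group per priority level.

    package main

    import (
        "fmt"

        "github.com/kubewharf/katalyst-core/pkg/util/machine"
    )

    func main() {
        // Two devices that each list the other at priority 0.
        topo := &machine.DeviceTopology{
            Devices: map[string]machine.DeviceInfo{
                "gpu-1": {DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{0: {"gpu-2"}}},
                "gpu-2": {DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{0: {"gpu-1"}}},
            },
        }
        // Both entries describe the same pair, so priority 0 holds a single
        // deduplicated group: map[0:[[gpu-1 gpu-2]]]
        fmt.Println(topo.GroupDeviceAffinity())
    }

The diff below removes the old pairwise GPUDeviceAffinityProvider, whose remaining body follows.
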
-func (p *GPUDeviceAffinityProvider) SetDeviceAffinity(deviceTopology *DeviceTopology) { - for device1, deviceInfo1 := range deviceTopology.Devices { - if deviceInfo1.DeviceAffinityMap == nil { - deviceInfo1.DeviceAffinityMap = make(map[AffinityPriority]DeviceIDs) - } - - for device2, deviceInfo2 := range deviceTopology.Devices { - if device1 == device2 { - continue - } - - if deviceInfo2.DeviceAffinityMap == nil { - deviceInfo2.DeviceAffinityMap = make(map[AffinityPriority]DeviceIDs) - } - - if p.isAffinityFunc(device1, device2) { - if deviceInfo2.DeviceAffinityMap[0] == nil { - deviceInfo2.DeviceAffinityMap[0] = make(DeviceIDs, 0) - } - deviceInfo1.DeviceAffinityMap[0] = append(deviceInfo1.DeviceAffinityMap[0], device2) - - if deviceInfo1.DeviceAffinityMap[0] == nil { - deviceInfo1.DeviceAffinityMap[0] = make(DeviceIDs, 0) - } - deviceInfo2.DeviceAffinityMap[0] = append(deviceInfo2.DeviceAffinityMap[0], device1) - } - } - } -} From 74173a59a405f8dc8a5a7e3b6d2039b82b002d85 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 29 Oct 2025 11:14:17 +0800 Subject: [PATCH 26/52] refactor(gpu): restructure GPU strategy and state management - introduce DefaultResourceStateGeneratorRegistry for resource state generation - add SetResourceState method to state interface - move strategy registry to separate package - enhance GenericAllocationStrategy with dynamic strategy selection - update device topology registry with thread-safe operations - consolidate GPU and RDMA device plugin initialization - improve state checkpoint handling with resource state generators - add custom strategy configuration options --- .../app/options/qrm/gpu_plugin.go | 9 ++ .../app/options/qrm/gpustrategy/allocate.go | 59 +++++++ .../options/qrm/gpustrategy/strategy_base.go | 44 ++++++ pkg/agent/qrm-plugins/gpu/baseplugin/base.go | 74 ++++++--- .../gpu/customdeviceplugin/gpu/gpu.go | 18 ++- .../gpu/customdeviceplugin/rdma/rdma.go | 21 ++- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 149 +++++++++++++----- pkg/agent/qrm-plugins/gpu/state/interface.go | 60 +++++++ pkg/agent/qrm-plugins/gpu/state/state.go | 64 +++++--- .../qrm-plugins/gpu/state/state_checkpoint.go | 25 ++- pkg/agent/qrm-plugins/gpu/state/state_mem.go | 28 ++-- pkg/agent/qrm-plugins/gpu/state/util.go | 111 ++++++------- .../qrm-plugins/gpu/staticpolicy/policy.go | 35 ++-- .../gpu/strategy/allocate/manager/defaults.go | 69 ++++++++ .../gpu/strategy/allocate/manager/manager.go | 101 ++++++------ .../strategy/allocate/manager/manager_test.go | 71 --------- .../{manager => registry}/registry.go | 39 +---- .../allocate/registry/registry_test.go | 97 ++++++++++++ .../allocate/strategies/allocation/generic.go | 93 +++++++++-- .../strategies/canonical/canonical.go | 3 +- .../deviceaffinity/device_affinity.go | 1 - .../allocate/strategies/gpu_memory/filter.go | 5 + .../allocate/strategies/gpu_memory/sort.go | 7 +- pkg/config/agent/qrm/gpu_plugin.go | 16 +- pkg/config/agent/qrm/gpustrategy/allocate.go | 28 ++++ .../agent/qrm/gpustrategy/strategy_base.go} | 13 +- pkg/util/machine/device.go | 71 ++++++--- 27 files changed, 890 insertions(+), 421 deletions(-) create mode 100644 cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go create mode 100644 cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go create mode 100644 pkg/agent/qrm-plugins/gpu/state/interface.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go rename pkg/agent/qrm-plugins/gpu/strategy/allocate/{manager => registry}/registry.go (78%) create mode 100644 
pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go create mode 100644 pkg/config/agent/qrm/gpustrategy/allocate.go rename pkg/{agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go => config/agent/qrm/gpustrategy/strategy_base.go} (65%) diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go index 0d31a52633..28b09fdec1 100644 --- a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go +++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go @@ -20,6 +20,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" cliflag "k8s.io/component-base/cli/flag" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options/qrm/gpustrategy" qrmconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" ) @@ -29,6 +30,8 @@ type GPUOptions struct { GPUMemoryAllocatablePerGPU string SkipGPUStateCorruption bool RDMADeviceNames []string + + GPUStrategyOptions *gpustrategy.GPUStrategyOptions } func NewGPUOptions() *GPUOptions { @@ -37,6 +40,7 @@ func NewGPUOptions() *GPUOptions { GPUDeviceNames: []string{"nvidia.com/gpu"}, GPUMemoryAllocatablePerGPU: "100", RDMADeviceNames: []string{}, + GPUStrategyOptions: gpustrategy.NewGPUStrategyOptions(), } } @@ -50,6 +54,8 @@ func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, e.g. 100") fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption", o.SkipGPUStateCorruption, "skip gpu state corruption, and it will be used after updating state properties") + fs.StringSliceVar(&o.RDMADeviceNames, "rdma-resource-names", o.RDMADeviceNames, "The name of the RDMA resource") + o.GPUStrategyOptions.AddFlags(fss) } func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { @@ -62,5 +68,8 @@ func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { conf.GPUMemoryAllocatablePerGPU = gpuMemory conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption conf.RDMADeviceNames = o.RDMADeviceNames + if err := o.GPUStrategyOptions.ApplyTo(conf.GPUStrategyConfig); err != nil { + return err + } return nil } diff --git a/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go b/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go new file mode 100644 index 0000000000..470ec40012 --- /dev/null +++ b/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go @@ -0,0 +1,59 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpustrategy + +import ( + "strings" + + cliflag "k8s.io/component-base/cli/flag" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) + +type AllocateStrategyOptions struct { + CustomFilteringStrategies map[string]string + CustomSortingStrategy map[string]string + CustomBindingStrategy map[string]string + CustomAllocationStrategy map[string]string +} + +func NewGPUAllocateStrategyOptions() *AllocateStrategyOptions { + return &AllocateStrategyOptions{} +} + +func (o *AllocateStrategyOptions) AddFlags(fss *cliflag.NamedFlagSets) { + fs := fss.FlagSet("allocate_strategy") + fs.StringToStringVar(&o.CustomFilteringStrategies, "gpu-allocate-custom-filtering-strategies", + o.CustomFilteringStrategies, "The filtering strategies for each resource, e.g. gpu:filtering1/filtering2") + fs.StringToStringVar(&o.CustomSortingStrategy, "gpu-allocate-custom-sorting-strategy", o.CustomSortingStrategy, "The sorting strategy for each resource") + fs.StringToStringVar(&o.CustomBindingStrategy, "gpu-allocate-custom-binding-strategy", o.CustomBindingStrategy, "The binding strategy for each resource") + fs.StringToStringVar(&o.CustomAllocationStrategy, "gpu-allocate-custom-allocation-strategy", o.CustomAllocationStrategy, "The allocation strategy for each resource") +} + +func (o *AllocateStrategyOptions) ApplyTo(c *gpustrategy.AllocateStrategyConfig) error { + for resourceName, strategies := range o.CustomFilteringStrategies { + filteringStrategies := strings.Split(strategies, "/") + for _, strategyName := range filteringStrategies { + c.CustomFilteringStrategies[resourceName] = append(c.CustomFilteringStrategies[resourceName], strategyName) + } + } + + c.CustomSortingStrategy = o.CustomSortingStrategy + c.CustomBindingStrategy = o.CustomBindingStrategy + c.CustomAllocationStrategy = o.CustomAllocationStrategy + return nil +} diff --git a/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go b/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go new file mode 100644 index 0000000000..ed60713090 --- /dev/null +++ b/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go @@ -0,0 +1,44 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpustrategy + +import ( + cliflag "k8s.io/component-base/cli/flag" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) + +type GPUStrategyOptions struct { + *AllocateStrategyOptions +} + +func NewGPUStrategyOptions() *GPUStrategyOptions { + return &GPUStrategyOptions{ + AllocateStrategyOptions: NewGPUAllocateStrategyOptions(), + } +} + +func (o *GPUStrategyOptions) AddFlags(fss *cliflag.NamedFlagSets) { + o.AllocateStrategyOptions.AddFlags(fss) +} + +func (o *GPUStrategyOptions) ApplyTo(conf *gpustrategy.GPUStrategyConfig) error { + if err := o.AllocateStrategyOptions.ApplyTo(conf.AllocateStrategyConfig); err != nil { + return err + } + return nil +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go index 1a751d1119..f4c9101759 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -20,14 +20,13 @@ import ( "fmt" "sync" + v1 "k8s.io/api/core/v1" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" "github.com/kubewharf/katalyst-core/pkg/config" - "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" - "github.com/kubewharf/katalyst-core/pkg/config/generic" "github.com/kubewharf/katalyst-core/pkg/metaserver" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" @@ -40,11 +39,8 @@ const ( // BasePlugin is a shared plugin that provides common functionalities and fields for GPU resource plugins and custom device plugins. type BasePlugin struct { - mu sync.RWMutex - *config.Configuration - - QosConfig *generic.QoSConfiguration - QrmConfig *qrm.QRMPluginsConfiguration + mu sync.RWMutex + Conf *config.Configuration Emitter metrics.MetricEmitter MetaServer *metaserver.MetaServer @@ -58,6 +54,9 @@ type BasePlugin struct { // Registry of device topology providers DeviceTopologyRegistry *machine.DeviceTopologyRegistry + // Registry of default resource state generators + DefaultResourceStateGeneratorRegistry *state.DefaultResourceStateGeneratorRegistry + // Map of specific device name to device type deviceNameToTypeMap map[string]string } @@ -65,17 +64,8 @@ type BasePlugin struct { func NewBasePlugin( agentCtx *agent.GenericContext, conf *config.Configuration, wrappedEmitter metrics.MetricEmitter, ) (*BasePlugin, error) { - deviceTopologyRegistry := machine.NewDeviceTopologyRegistry() - - stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, - gpuconsts.GPUResourcePluginPolicyNameStatic, deviceTopologyRegistry, conf.SkipGPUStateCorruption, wrappedEmitter) - if err != nil { - return nil, fmt.Errorf("NewCheckpointState failed with error: %v", err) - } - return &BasePlugin{ - QosConfig: conf.QoSConfiguration, - QrmConfig: conf.QRMPluginsConfiguration, + Conf: conf, Emitter: wrappedEmitter, MetaServer: agentCtx.MetaServer, @@ -84,15 +74,23 @@ func NewBasePlugin( PodAnnotationKeptKeys: conf.PodAnnotationKeptKeys, PodLabelKeptKeys: conf.PodLabelKeptKeys, - State: stateImpl, - DeviceTopologyRegistry: deviceTopologyRegistry, - deviceNameToTypeMap: make(map[string]string), + DeviceTopologyRegistry: machine.NewDeviceTopologyRegistry(), + DefaultResourceStateGeneratorRegistry: 
state.NewDefaultResourceStateGeneratorRegistry(), + + deviceNameToTypeMap: make(map[string]string), }, nil } -// SetDeviceAffinity is a hook to set device affinity for a certain device type -func (p *BasePlugin) SetDeviceAffinity(deviceType string, deviceAffinityProvider machine.DeviceAffinityProvider) error { - return p.DeviceTopologyRegistry.SetDeviceAffinity(deviceType, deviceAffinityProvider) +// InitState initializes the state of the plugin. +func (p *BasePlugin) InitState() error { + stateImpl, err := state.NewCheckpointState(p.Conf.QRMPluginsConfiguration, p.Conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, + gpuconsts.GPUResourcePluginPolicyNameStatic, p.DefaultResourceStateGeneratorRegistry, p.Conf.SkipGPUStateCorruption, p.Emitter) + if err != nil { + return fmt.Errorf("NewCheckpointState failed with error: %v", err) + } + + p.State = stateImpl + return nil } func (p *BasePlugin) PackAllocationResponse( @@ -175,6 +173,31 @@ func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType( return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil } +// GenerateResourceStateFromPodEntries returns an AllocationMap of a certain resource based on pod entries +// 1. If podEntries is nil, it will get pod entries from state +// 2. If the generator is not found, it will return an error +func (p *BasePlugin) GenerateResourceStateFromPodEntries( + resourceName string, + podEntries state.PodEntries, +) (state.AllocationMap, error) { + if podEntries == nil { + podEntries = p.State.GetPodEntries(v1.ResourceName(resourceName)) + } + + generator, ok := p.DefaultResourceStateGeneratorRegistry.GetGenerator(resourceName) + if !ok { + return nil, fmt.Errorf("could not find generator for resource %s", resourceName) + } + + return state.GenerateResourceStateFromPodEntries(podEntries, generator) +} + +func (p *BasePlugin) GenerateMachineStateFromPodEntries( + podResourceEntries state.PodResourceEntries, +) (state.AllocationResourcesMap, error) { + return state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DefaultResourceStateGeneratorRegistry) +} + // RegisterDeviceNameToType is used to map device name to device type. // For example, we may have multiple device names for a same device type, e.g. "nvidia.com/gpu" and "nvidia.com/be-gpu", // so we map them to the same device type, which allows us to allocate them interchangeably. 
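
The mapping described in the comment above is just a many-to-one lookup from resource name to device type. A standalone mirror of the idea (a hypothetical registry type, not the BasePlugin implementation from this patch):

    package main

    import "fmt"

    type registry struct{ nameToType map[string]string }

    // register points several resource names at one schedulable device type,
    // so e.g. "nvidia.com/gpu" and "nvidia.com/be-gpu" allocate interchangeably.
    func (r *registry) register(names []string, deviceType string) {
        for _, n := range names {
            r.nameToType[n] = deviceType
        }
    }

    func (r *registry) typeOf(name string) (string, error) {
        t, ok := r.nameToType[name]
        if !ok {
            return "", fmt.Errorf("device name %s is not registered", name)
        }
        return t, nil
    }

    func main() {
        r := &registry{nameToType: map[string]string{}}
        r.register([]string{"nvidia.com/gpu", "nvidia.com/be-gpu"}, "gpu")
        fmt.Println(r.typeOf("nvidia.com/be-gpu")) // gpu <nil>
    }
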
@@ -191,3 +214,8 @@ func (p *BasePlugin) GetResourceTypeFromDeviceName(deviceName string) (string, e } return deviceType, nil } + +// RegisterTopologyAffinityProvider is a hook to set device affinity for a certain device type +func (p *BasePlugin) RegisterTopologyAffinityProvider(deviceType string, deviceAffinityProvider machine.DeviceAffinityProvider) { + p.DeviceTopologyRegistry.RegisterTopologyAffinityProvider(deviceType, deviceAffinityProvider) +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index 88791ea151..b26dd8943a 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -46,13 +46,15 @@ type GPUDevicePlugin struct { } func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { - gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.GPUDeviceNames) + gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.GPUDeviceNames) base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider) - base.RegisterDeviceNameToType(base.GPUDeviceNames, gpuconsts.GPUDeviceType) + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.GPUDeviceType, + state.NewGenericDefaultResourceStateGenerator(gpuconsts.GPUDeviceType, base.DeviceTopologyRegistry)) + base.RegisterDeviceNameToType(base.Conf.GPUDeviceNames, gpuconsts.GPUDeviceType) return &GPUDevicePlugin{ BasePlugin: base, - deviceNames: base.GPUDeviceNames, + deviceNames: base.Conf.GPUDeviceNames, } } @@ -75,7 +77,7 @@ func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.Associat func (p *GPUDevicePlugin) AllocateAssociatedDevice( resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { - qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) @@ -142,7 +144,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( resReq, deviceReq, gpuTopology, - p.GPUQRMPluginConfig, + p.Conf.GPUQRMPluginConfig, p.Emitter, p.MetaServer, p.State.GetMachineState(), @@ -202,11 +204,11 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( // TODO:State can be updated using the actual resource name p.State.SetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName, gpuDeviceAllocationInfo, false) - gpuDeviceMachineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) + resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.GPUDeviceType, nil) if err != nil { - return nil, fmt.Errorf("failed to generate gpu device machine state from pod entries: %v", err) + return nil, fmt.Errorf("failed to generate gpu device state from pod entries: %v", err) } - p.State.SetMachineState(gpuDeviceMachineState, true) + p.State.SetResourceState(gpuconsts.GPUDeviceType, resourceState, true) general.InfoS("allocated gpu devices", "podNamespace", resReq.PodNamespace, diff --git 
a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index a40d86f637..89dad61899 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -43,13 +43,15 @@ type RDMADevicePlugin struct { } func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { - rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.RDMADeviceNames) + rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.RDMADeviceNames) base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider) - base.RegisterDeviceNameToType(base.RDMADeviceNames, gpuconsts.RDMADeviceType) + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType, + state.NewGenericDefaultResourceStateGenerator(gpuconsts.RDMADeviceType, base.DeviceTopologyRegistry)) + base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType) return &RDMADevicePlugin{ BasePlugin: base, - deviceNames: base.RDMADeviceNames, + deviceNames: base.Conf.RDMADeviceNames, } } @@ -69,12 +71,11 @@ func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.Associa return &pluginapi.AssociatedDeviceHintsResponse{}, nil } -// TODO:accompany resource name == "" -// check if rdma is allocated to other containers, make sure they do not share rdma +// AllocateAssociatedDevice check if rdma is allocated to other containers, make sure they do not share rdma func (p *RDMADevicePlugin) AllocateAssociatedDevice( resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) @@ -96,8 +97,6 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice( "reusableDevices", deviceReq.ReusableDevices, "deviceRequest", deviceReq.DeviceRequest, ) - p.Lock() - defer p.Unlock() // Check if there is state for the device name rdmaAllocationInfo := p.State.GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName) @@ -166,12 +165,12 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice( allocationInfo.TopologyAwareAllocations = topologyAwareAllocations p.State.SetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName, allocationInfo, false) - machineState, err := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) + resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.RDMADeviceType, nil) if err != nil { - return nil, fmt.Errorf("failed to generate machine state from pod entries: %v", err) + return nil, fmt.Errorf("failed to generate rdma device state from pod entries: %v", err) } - p.State.SetMachineState(machineState, true) + p.State.SetResourceState(gpuconsts.RDMADeviceType, resourceState, true) general.InfoS("allocated rdma devices", "podNamespace", resReq.PodNamespace, diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go 
b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 143f415067..e8113a2c7f 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -46,6 +46,8 @@ type GPUMemPlugin struct { } func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin { + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(string(consts.ResourceGPUMemory), + state.NewGenericDefaultResourceStateGenerator(gpuconsts.GPUDeviceType, base.DeviceTopologyRegistry)) return &GPUMemPlugin{ BasePlugin: base, } @@ -56,7 +58,7 @@ func (p *GPUMemPlugin) ResourceName() string { } func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -69,7 +71,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.GPUDeviceNames) + gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.Conf.GPUDeviceNames) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", req, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) @@ -104,7 +106,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) if allocationInfo != nil { - hints = state.RegenerateGPUMemoryHints(allocationInfo, false) + hints = regenerateGPUMemoryHints(allocationInfo, false) // regenerateHints failed. need to clear container record and re-calculate. 
if hints == nil { @@ -115,7 +117,7 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p } var err error - machineState, err = state.GenerateResourceStateFromPodEntries(podEntries, p.DeviceTopologyRegistry, consts.ResourceGPUMemory) + machineState, err = p.GenerateResourceStateFromPodEntries(string(consts.ResourceGPUMemory), podEntries) if err != nil { general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", req.PodNamespace, req.PodName, req.ContainerName, err) @@ -160,7 +162,7 @@ func (p *GPUMemPlugin) calculateHints( } allocated := s.GetQuantityAllocated() - if allocated+perGPUMemory <= float64(p.GPUMemoryAllocatablePerGPU.Value()) { + if allocated+perGPUMemory <= float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) { info, ok := gpuTopology.Devices[gpuID] if !ok { return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) @@ -296,14 +298,22 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) ( topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { - topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: alloc.Quantity, - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) + perNUMAAllocated := alloc.Quantity + if len(alloc.NUMANodes) > 0 { + perNUMAAllocated = alloc.Quantity / float64(len(alloc.NUMANodes)) + } + + for _, nodeID := range alloc.NUMANodes { + topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ + Node: uint64(nodeID), + ResourceValue: perNUMAAllocated, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } } resp := &pluginapi.GetTopologyAwareResourcesResponse{ @@ -334,30 +344,45 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca p.Lock() defer p.Unlock() - machineState := p.State.GetMachineState()[consts.ResourceGPUMemory] + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + return nil, err + } - topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) - topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) + if !numaTopologyReady { + return nil, fmt.Errorf("numa topology is not ready") + } + + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(gpuTopology.Devices)) + topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(gpuTopology.Devices)) var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 - for deviceID := range machineState { - aggregatedAllocatableQuantity += float64(p.GPUMemoryAllocatablePerGPU.Value()) - aggregatedCapacityQuantity += float64(p.GPUMemoryAllocatablePerGPU.Value()) - topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: float64(p.GPUMemoryAllocatablePerGPU.Value()), - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) - 
topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: float64(p.GPUMemoryAllocatablePerGPU.Value()), - Name: deviceID, - Type: string(v1alpha1.TopologyTypeGPU), - Annotations: map[string]string{ - consts.ResourceAnnotationKeyResourceIdentifier: "", - }, - }) + for deviceID, deviceInfo := range gpuTopology.Devices { + aggregatedAllocatableQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + aggregatedCapacityQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + gpuMemoryAllocatablePerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + if len(deviceInfo.NumaNodes) > 1 { + gpuMemoryAllocatablePerGPUNUMA = gpuMemoryAllocatablePerGPUNUMA / float64(len(deviceInfo.NumaNodes)) + } + for _, numaID := range deviceInfo.NumaNodes { + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } } return &gpuconsts.AllocatableResource{ @@ -385,7 +410,7 @@ func (p *GPUMemPlugin) Allocate( return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil } - qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.QosConfig, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", resourceReq.PodNamespace, resourceReq.PodName, resourceReq.ContainerName, err) @@ -398,7 +423,7 @@ func (p *GPUMemPlugin) Allocate( return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := gpuutil.GetGPUCount(resourceReq, p.GPUDeviceNames) + gpuCount, gpuNames, err := gpuutil.GetGPUCount(resourceReq, p.Conf.GPUDeviceNames) if err != nil { general.Errorf("getGPUCount failed from req %v with error: %v", resourceReq, err) return nil, fmt.Errorf("getGPUCount failed with error: %v", err) @@ -465,7 +490,7 @@ func (p *GPUMemPlugin) Allocate( resourceReq, deviceReq, gpuTopology, - p.GPUQRMPluginConfig, + p.Conf.GPUQRMPluginConfig, p.Emitter, p.MetaServer, p.State.GetMachineState(), @@ -511,18 +536,58 @@ func (p *GPUMemPlugin) Allocate( // Set allocation info in state p.State.SetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName, newAllocation, false) - machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.State.GetPodResourceEntries(), p.DeviceTopologyRegistry) + machineState, stateErr := p.GenerateResourceStateFromPodEntries(string(consts.ResourceGPUMemory), nil) if stateErr != nil { - general.ErrorS(stateErr, "GenerateMachineStateFromPodEntries failed", + general.ErrorS(stateErr, "GenerateResourceStateFromPodEntries failed", "podNamespace", resourceReq.PodNamespace, "podName", resourceReq.PodName, "containerName", 
resourceReq.ContainerName, "gpuMemory", gpuMemory) - return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", stateErr) + return nil, fmt.Errorf("GenerateResourceStateFromPodEntries failed with error: %v", stateErr) } // update state cache - p.State.SetMachineState(machineState, true) + p.State.SetResourceState(consts.ResourceGPUMemory, machineState, true) return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) } + +// regenerateGPUMemoryHints regenerates hints for a container that has already been allocated gpu memory, +// assembling hints based on the existing AllocationInfo +// without any calculation logic at all +func regenerateGPUMemoryHints( + allocationInfo *state.AllocationInfo, regenerate bool, +) map[string]*pluginapi.ListOfTopologyHints { + if allocationInfo == nil { + general.Errorf("regenerateGPUMemoryHints got nil allocationInfo") + return nil + } + + hints := map[string]*pluginapi.ListOfTopologyHints{} + if regenerate { + general.ErrorS(nil, "need to regenerate hints", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "podUID", allocationInfo.PodUid, + "containerName", allocationInfo.ContainerName) + + return nil + } + + allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...) + + general.InfoS("regenerating gpu memory hints, gpu memory was already allocated to pod", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "containerName", allocationInfo.ContainerName, + "hint", allocatedNumaNodes) + hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: allocatedNumaNodes.ToSliceUInt64(), + Preferred: true, + }, + }, + } + return hints +} diff --git a/pkg/agent/qrm-plugins/gpu/state/interface.go b/pkg/agent/qrm-plugins/gpu/state/interface.go new file mode 100644 index 0000000000..753a3efa6a --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/interface.go @@ -0,0 +1,60 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package state + +import ( + v1 "k8s.io/api/core/v1" +) + +// reader is used to get information from local states +type reader interface { + GetMachineState() AllocationResourcesMap + GetPodResourceEntries() PodResourceEntries + GetPodEntries(resourceName v1.ResourceName) PodEntries + GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo +} + +// writer is used to store information into local states, +// and it also provides functionality to maintain the local files +type writer interface { + SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool) + SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, persist bool) + SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) + SetAllocationInfo( + resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool, + ) + + Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool) + ClearState() + StoreState() error +} + +// DefaultResourceStateGenerator interface is used to generate default resource state for each resource +type DefaultResourceStateGenerator interface { + GenerateDefaultResourceState() (AllocationMap, error) +} + +// ReadonlyState interface only provides methods for tracking pod assignments +type ReadonlyState interface { + reader +} + +// State interface provides methods for tracking and setting pod assignments +type State interface { + writer + ReadonlyState +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go b/pkg/agent/qrm-plugins/gpu/state/state.go index fdea45b882..7833025b15 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state.go +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -18,6 +18,7 @@ package state import ( "encoding/json" + "sync" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -241,6 +242,19 @@ func (as *AllocationState) SetAllocationInfo(podUID string, containerName string as.PodEntries[podUID][containerName] = allocationInfo.Clone() } +func (am AllocationMap) String() string { + if am == nil { + return "" + } + + contentBytes, err := json.Marshal(am) + if err != nil { + general.LoggerWithPrefix("AllocationMap.String", general.LoggingPKGFull).Errorf("[AllocationMap.String]marshal AllocationMap failed with error: %v", err) + return "" + } + return string(contentBytes) +} + func (am AllocationMap) Clone() AllocationMap { if am == nil { return nil @@ -327,35 +341,35 @@ func (am AllocationMap) getNumberDevices() int { return len(am) } -// reader is used to get information from local states -type reader interface { - GetMachineState() AllocationResourcesMap - GetPodResourceEntries() PodResourceEntries - GetPodEntries(resourceName v1.ResourceName) PodEntries - GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo +type DefaultResourceStateGeneratorRegistry struct { + mutex sync.RWMutex + generators map[string]DefaultResourceStateGenerator +} + +func NewDefaultResourceStateGeneratorRegistry() *DefaultResourceStateGeneratorRegistry { + return &DefaultResourceStateGeneratorRegistry{ + generators: make(map[string]DefaultResourceStateGenerator), + } } -// writer is used to store information into local states, -// and it also provides functionality to maintain the local files -type writer interface { - SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool) - SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) - SetAllocationInfo( - resourceName 
v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool, - ) - - Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool) - ClearState() - StoreState() error +func (r *DefaultResourceStateGeneratorRegistry) RegisterResourceStateGenerator(resourceName string, generator DefaultResourceStateGenerator) { + r.mutex.Lock() + defer r.mutex.Unlock() + + r.generators[resourceName] = generator } -// ReadonlyState interface only provides methods for tracking pod assignments -type ReadonlyState interface { - reader +func (r *DefaultResourceStateGeneratorRegistry) GetGenerators() map[string]DefaultResourceStateGenerator { + r.mutex.RLock() + defer r.mutex.RUnlock() + + return r.generators } -// State interface provides methods for tracking and setting pod assignments -type State interface { - writer - ReadonlyState +func (r *DefaultResourceStateGeneratorRegistry) GetGenerator(resourceName string) (DefaultResourceStateGenerator, bool) { + r.mutex.RLock() + defer r.mutex.RUnlock() + + generator, ok := r.generators[resourceName] + return generator, ok } diff --git a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go index 1293abbeff..11b9207285 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go +++ b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go @@ -31,7 +31,6 @@ import ( "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" - "github.com/kubewharf/katalyst-core/pkg/util/machine" ) const ( @@ -71,6 +70,19 @@ func (s *stateCheckpoint) SetMachineState(allocationResourcesMap AllocationResou } } +func (s *stateCheckpoint) SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, persist bool) { + s.Lock() + defer s.Unlock() + + s.cache.SetResourceState(resourceName, allocationMap, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store resource state to checkpoint error") + } + } +} + func (s *stateCheckpoint) SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) { s.Lock() defer s.Unlock() @@ -180,7 +192,7 @@ func (s *stateCheckpoint) storeState() error { return nil } -func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyRegistry) error { +func (s *stateCheckpoint) restoreState(defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry) error { s.Lock() defer s.Unlock() var err error @@ -206,7 +218,7 @@ func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyR return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName) } - machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodResourceEntries, topologyRegistry) + machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodResourceEntries, defaultResourceStateGenerators) if err != nil { return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) } @@ -241,14 +253,15 @@ func (s *stateCheckpoint) restoreState(topologyRegistry *machine.DeviceTopologyR func NewCheckpointState( conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string, - topologyRegistry *machine.DeviceTopologyRegistry, skipStateCorruption bool, emitter metrics.MetricEmitter, + defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry, + skipStateCorruption bool, emitter 
metrics.MetricEmitter, ) (State, error) { checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) if err != nil { return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) } - defaultCache, err := NewGPUPluginState(conf, topologyRegistry) + defaultCache, err := NewGPUPluginState(conf, defaultResourceStateGenerators) if err != nil { return nil, fmt.Errorf("NewGPUPluginState failed with error: %v", err) } @@ -262,7 +275,7 @@ func NewCheckpointState( emitter: emitter, } - if err := sc.restoreState(topologyRegistry); err != nil { + if err := sc.restoreState(defaultResourceStateGenerators); err != nil { return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+ "the gpu plugin checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName)) diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go b/pkg/agent/qrm-plugins/gpu/state/state_mem.go index 809ae014b2..89793c5a6b 100644 --- a/pkg/agent/qrm-plugins/gpu/state/state_mem.go +++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go @@ -23,7 +23,6 @@ import ( v1 "k8s.io/api/core/v1" "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" - "github.com/kubewharf/katalyst-core/pkg/util/machine" ) // gpuPluginState is an in-memory implementation of State; @@ -32,8 +31,8 @@ import ( type gpuPluginState struct { sync.RWMutex - qrmConf *qrm.QRMPluginsConfiguration - topologyRegistry *machine.DeviceTopologyRegistry + qrmConf *qrm.QRMPluginsConfiguration + defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry machineState AllocationResourcesMap podResourceEntries PodResourceEntries @@ -41,20 +40,20 @@ type gpuPluginState struct { func NewGPUPluginState( conf *qrm.QRMPluginsConfiguration, - topologyRegistry *machine.DeviceTopologyRegistry, + resourceStateGeneratorRegistry *DefaultResourceStateGeneratorRegistry, ) (State, error) { generalLog.InfoS("initializing new gpu plugin in-memory state store") - defaultMachineState, err := GenerateMachineState(topologyRegistry) + defaultMachineState, err := GenerateMachineState(resourceStateGeneratorRegistry) if err != nil { return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err) } return &gpuPluginState{ - qrmConf: conf, - machineState: defaultMachineState, - topologyRegistry: topologyRegistry, - podResourceEntries: make(PodResourceEntries), + qrmConf: conf, + machineState: defaultMachineState, + defaultResourceStateGenerators: resourceStateGeneratorRegistry, + podResourceEntries: make(PodResourceEntries), }, nil } @@ -66,6 +65,15 @@ func (s *gpuPluginState) SetMachineState(allocationResourcesMap AllocationResour "GPUMap", allocationResourcesMap.String()) } +func (s *gpuPluginState) SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, _ bool) { + s.Lock() + defer s.Unlock() + s.machineState[resourceName] = allocationMap.Clone() + generalLog.InfoS("updated gpu plugin resource state", + "resourceName", resourceName, + "allocationMap", allocationMap.String()) +} + func (s *gpuPluginState) SetPodResourceEntries(podResourceEntries PodResourceEntries, _ bool) { s.Lock() defer s.Unlock() @@ -117,7 +125,7 @@ func (s *gpuPluginState) ClearState() { s.Lock() defer s.Unlock() - machineState, err := GenerateMachineState(s.topologyRegistry) + machineState, err := GenerateMachineState(s.defaultResourceStateGenerators) if err != nil { generalLog.ErrorS(err, "failed to generate machine state") } diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go 
b/pkg/agent/qrm-plugins/gpu/state/util.go index fc081c9d16..6a24bc2a0f 100644 --- a/pkg/agent/qrm-plugins/gpu/state/util.go +++ b/pkg/agent/qrm-plugins/gpu/state/util.go @@ -19,27 +19,23 @@ package state import ( "fmt" - v1 "k8s.io/api/core/v1" - pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" - - "github.com/kubewharf/katalyst-api/pkg/consts" - "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" + v1 "k8s.io/api/core/v1" ) // GenerateMachineState returns an empty AllocationResourcesMap for all resource names. -func GenerateMachineState(topologyRegistry *machine.DeviceTopologyRegistry) (AllocationResourcesMap, error) { - if topologyRegistry == nil { - return nil, fmt.Errorf("topology provider registry must not be nil") +func GenerateMachineState( + defaultMachineStateGenerators *DefaultResourceStateGeneratorRegistry, +) (AllocationResourcesMap, error) { + if defaultMachineStateGenerators == nil { + return nil, fmt.Errorf("cannot generate machine state from nil defaultMachineStateGenerators") } - deviceNameToProvider := topologyRegistry.DeviceNameToProvider allocationResourcesMap := make(AllocationResourcesMap) - - for resourceName := range deviceNameToProvider { - allocationMap, err := GenerateResourceState(topologyRegistry, v1.ResourceName(resourceName)) + for resourceName, generator := range defaultMachineStateGenerators.GetGenerators() { + allocationMap, err := generator.GenerateDefaultResourceState() if err != nil { - return nil, fmt.Errorf("GenerateResourceState for resource %s failed with error: %v", resourceName, err) + return nil, fmt.Errorf("GenerateDefaultResourceState for resource %s failed with error: %v", resourceName, err) } allocationResourcesMap[v1.ResourceName(resourceName)] = allocationMap } @@ -51,15 +47,20 @@ func GenerateMachineState(topologyRegistry *machine.DeviceTopologyRegistry) (All // resource name along with existed pod resource entries. 
func GenerateMachineStateFromPodEntries( podResourceEntries PodResourceEntries, - topologyRegistry *machine.DeviceTopologyRegistry, + defaultMachineStateGenerators *DefaultResourceStateGeneratorRegistry, ) (AllocationResourcesMap, error) { - machineState, err := GenerateMachineState(topologyRegistry) - if err != nil { - return nil, fmt.Errorf("GenerateMachineState failed with error: %v", err) + if defaultMachineStateGenerators == nil { + return nil, fmt.Errorf("cannot generate machine state from nil resourceStateGeneratorRegistry") } + machineState := make(AllocationResourcesMap) for resourceName, podEntries := range podResourceEntries { - allocationMap, err := GenerateResourceStateFromPodEntries(podEntries, topologyRegistry, resourceName) + generator, ok := defaultMachineStateGenerators.GetGenerator(string(resourceName)) + if !ok { + return nil, fmt.Errorf("GetGenerator for resource %s failed", resourceName) + } + + allocationMap, err := GenerateResourceStateFromPodEntries(podEntries, generator) if err != nil { return nil, fmt.Errorf("GenerateResourceStateFromPodEntries for resource %s failed with error: %v", resourceName, err) } @@ -71,11 +72,12 @@ func GenerateMachineStateFromPodEntries( // GenerateResourceStateFromPodEntries returns an AllocationMap of a certain resource based on pod entries func GenerateResourceStateFromPodEntries( - podEntries PodEntries, topologyRegistry *machine.DeviceTopologyRegistry, resourceName v1.ResourceName, + podEntries PodEntries, + generator DefaultResourceStateGenerator, ) (AllocationMap, error) { - machineState, err := GenerateResourceState(topologyRegistry, resourceName) + machineState, err := generator.GenerateDefaultResourceState() if err != nil { - return nil, fmt.Errorf("GenerateResourceState failed with error: %v", err) + return nil, fmt.Errorf("GenerateDefaultResourceState failed with error: %v", err) } for deviceID, allocationState := range machineState { @@ -99,15 +101,29 @@ func GenerateResourceStateFromPodEntries( return machineState, nil } -// GenerateResourceState returns an empty AllocationMap for a certain resource. 
-func GenerateResourceState( - topologyProviderRegistry *machine.DeviceTopologyRegistry, resourceName v1.ResourceName, -) (AllocationMap, error) { - if topologyProviderRegistry == nil { +type genericDefaultResourceStateGenerator struct { + resourceName string + topologyRegistry *machine.DeviceTopologyRegistry +} + +func NewGenericDefaultResourceStateGenerator( + resourceName string, + topologyRegistry *machine.DeviceTopologyRegistry, +) DefaultResourceStateGenerator { + return &genericDefaultResourceStateGenerator{resourceName: resourceName, topologyRegistry: topologyRegistry} +} + +// GenerateDefaultResourceState returns a default resource state based on the device topology +func (g *genericDefaultResourceStateGenerator) GenerateDefaultResourceState() (AllocationMap, error) { + if g == nil { + return nil, fmt.Errorf("nil DefaultResourceStateGenerator") + } + + if g.topologyRegistry == nil { return nil, fmt.Errorf("topology provider registry must not be nil") } - topology, _, err := topologyProviderRegistry.GetDeviceTopology(string(resourceName)) + topology, _, err := g.topologyRegistry.GetDeviceTopology(g.resourceName) if err != nil { return nil, fmt.Errorf("topology provider registry failed with error: %v", err) } @@ -118,45 +134,6 @@ func GenerateResourceState( PodEntries: make(PodEntries), } } - return resourceState, nil -} -// RegenerateGPUMemoryHints regenerates hints for container that'd already been allocated gpu memory, -// and regenerateHints will assemble hints based on already-existed AllocationInfo, -// without any calculation logics at all -func RegenerateGPUMemoryHints( - allocationInfo *AllocationInfo, regenerate bool, -) map[string]*pluginapi.ListOfTopologyHints { - if allocationInfo == nil { - general.Errorf("RegenerateHints got nil allocationInfo") - return nil - } - - hints := map[string]*pluginapi.ListOfTopologyHints{} - if regenerate { - general.ErrorS(nil, "need to regenerate hints", - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "podUID", allocationInfo.PodUid, - "containerName", allocationInfo.ContainerName) - - return nil - } - - allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...)
- - general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod", - "podNamespace", allocationInfo.PodNamespace, - "podName", allocationInfo.PodName, - "containerName", allocationInfo.ContainerName, - "hint", allocatedNumaNodes) - hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ - Hints: []*pluginapi.TopologyHint{ - { - Nodes: allocatedNumaNodes.ToSliceUInt64(), - Preferred: true, - }, - }, - } - return hints + return resourceState, nil } diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 68787dd26d..384b825733 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -57,13 +57,13 @@ type StaticPolicy struct { stopCh chan struct{} started bool - emitter metrics.MetricEmitter - associatedDeviceNames sets.String + emitter metrics.MetricEmitter residualHitMap map[string]int64 - resourcePlugins map[string]resourceplugin.ResourcePlugin - customDevicePlugins map[string]customdeviceplugin.CustomDevicePlugin + associatedDeviceNames sets.String + resourcePlugins map[string]resourceplugin.ResourcePlugin + customDevicePlugins map[string]customdeviceplugin.CustomDevicePlugin } // NewStaticPolicy returns a static gpu policy @@ -82,13 +82,14 @@ func NewStaticPolicy( } policyImplement := &StaticPolicy{ - emitter: wrappedEmitter, - stopCh: make(chan struct{}), - name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), - residualHitMap: make(map[string]int64), - BasePlugin: basePlugin, - resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), - customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), + emitter: wrappedEmitter, + stopCh: make(chan struct{}), + name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), + residualHitMap: make(map[string]int64), + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + associatedDeviceNames: sets.NewString(), + customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), } if err = policyImplement.registerResourcePlugins(); err != nil { @@ -98,6 +99,12 @@ func NewStaticPolicy( return false, agent.ComponentStub{}, fmt.Errorf("failed to register custom device plugins: %w", err) } + // init state must be done after resource plugins and custom device plugins are registered + err = policyImplement.InitState() + if err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to init state: %w", err) + } + pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, func(key string, value int64) { _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw) @@ -139,7 +146,7 @@ func (p *StaticPolicy) Start() (err error) { }, time.Second*30, p.stopCh) err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(gpuconsts.ClearResidualState, general.HealthzCheckStateNotReady, - appqrm.QRMNetworkPluginPeriodicalHandlerGroupName, p.clearResidualState, gpuconsts.StateCheckPeriod, gpuconsts.StateCheckTolerationTimes) + appqrm.QRMGPUPluginPeriodicalHandlerGroupName, p.clearResidualState, gpuconsts.StateCheckPeriod, gpuconsts.StateCheckTolerationTimes) if err != nil { general.Errorf("start %v failed, err: %v", gpuconsts.ClearResidualState, err) } @@ -451,7 +458,7 @@ func (p *StaticPolicy) removeWithUpdate( return nil } - machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, 
p.DeviceTopologyRegistry) + machineState, err := p.GenerateMachineStateFromPodEntries(podResourceEntries) if err != nil { general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) @@ -542,7 +549,7 @@ func (p *StaticPolicy) clearResidualState( podResourceEntries.RemovePod(podUID) } - machineState, err := state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DeviceTopologyRegistry) + machineState, err := p.GenerateMachineStateFromPodEntries(podResourceEntries) if err != nil { general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) return diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go new file mode 100644 index 0000000000..29b59f1e22 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go @@ -0,0 +1,69 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// registerDefaultFilterStrategies registers the default filtering strategies +func registerDefaultFilterStrategies(manager *StrategyManager) { + if err := manager.RegisterFilteringStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { + general.Errorf("Failed to register filtering strategy: %v", err) + } + + if err := manager.RegisterFilteringStrategy(canonical.NewCanonicalStrategy()); err != nil { + general.Errorf("Failed to register filtering strategy: %v", err) + } +} + +// registerDefaultSortingStrategies registers the default sorting strategies +func registerDefaultSortingStrategies(manager *StrategyManager) { + if err := manager.RegisterSortingStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { + general.Errorf("Failed to register sorting strategy: %v", err) + } +} + +// registerDefaultBindingStrategies registers the default binding strategies +func registerDefaultBindingStrategies(manager *StrategyManager) { + if err := manager.RegisterBindingStrategy(canonical.NewCanonicalStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", err) + } + + if err := manager.RegisterBindingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", err) + } +} + +// registerDefaultAllocationStrategies registers the default allocation strategies +func registerDefaultAllocationStrategies(manager *StrategyManager) { + if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, []string{gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory,
canonical.StrategyNameCanonical); err != nil { + general.Errorf("Failed to register gpu-memory-default strategy: %v", err) + } +} + +// registerDefaultStrategies registers the default strategies +func registerDefaultStrategies(manager *StrategyManager) { + registerDefaultFilterStrategies(manager) + registerDefaultSortingStrategies(manager) + registerDefaultBindingStrategies(manager) + registerDefaultAllocationStrategies(manager) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go index bd76997cea..f77b4d1fa6 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go @@ -21,20 +21,18 @@ import ( "sync" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" "github.com/kubewharf/katalyst-core/pkg/util/general" ) const ( - allocationStrategyNameDefault = "default" - allocationStrategyNameDeviceAffinity = "deviceAffinity" + allocationStrategyNameDefault = "default" ) // StrategyManager manages the selection of allocation strategies based on resource names type StrategyManager struct { - *StrategyRegistry + *registry.StrategyRegistry // Mapping from resource name to strategy name resourceToStrategy map[string]string @@ -49,7 +47,7 @@ type StrategyManager struct { // NewStrategyManager creates a new strategy manager func NewStrategyManager() *StrategyManager { return &StrategyManager{ - StrategyRegistry: NewStrategyRegistry(), + StrategyRegistry: registry.NewStrategyRegistry(), resourceToStrategy: make(map[string]string), defaultStrategy: allocationStrategyNameDefault, } @@ -106,9 +104,15 @@ func (m *StrategyManager) GetStrategyForResource(resourceName string) string { return m.defaultStrategy } -// GetAllocationStrategyForResource returns the allocation strategy for a given resource -func (m *StrategyManager) GetAllocationStrategyForResource(resourceName string) (allocate.AllocationStrategy, error) { - strategyName := m.GetStrategyForResource(resourceName) +// getAllocationStrategyForResource returns the allocation strategy for a given resource +func (m *StrategyManager) getAllocationStrategyForResource(customAllocationStrategy map[string]string, resourceName string) (allocate.AllocationStrategy, error) { + var strategyName string + if customStrategy, exists := customAllocationStrategy[resourceName]; exists { + strategyName = customStrategy + } else { + strategyName = m.GetStrategyForResource(resourceName) + } + return m.GetAllocationStrategy(strategyName) } @@ -116,9 +120,10 @@ func (m *StrategyManager) GetAllocationStrategyForResource(resourceName string) func (m *StrategyManager) AllocateUsingStrategy(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { // Determine the device name resourceName := ctx.DeviceReq.DeviceName + customAllocationStrategy := ctx.GPUQRMPluginConfig.CustomAllocationStrategy // Get the strategy for this resource - strategy, err := 
m.GetAllocationStrategyForResource(resourceName) + strategy, err := m.getAllocationStrategyForResource(customAllocationStrategy, resourceName) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -137,6 +142,39 @@ func (m *StrategyManager) AllocateUsingStrategy(ctx *allocate.AllocationContext) return strategy.Allocate(ctx) } +// RegisterGenericAllocationStrategy registers a complete generic allocation strategy with the given name +func (m *StrategyManager) RegisterGenericAllocationStrategy( + name string, filteringNames []string, sortingName, bindingName string, +) error { + var filteringList []allocate.FilteringStrategy + for _, fn := range filteringNames { + filtering, err := m.StrategyRegistry.GetFilteringStrategy(fn) + if err != nil { + return fmt.Errorf("filtering strategy %s not found: %v", fn, err) + } + filteringList = append(filteringList, filtering) + } + + sorting, err := m.StrategyRegistry.GetSortingStrategy(sortingName) + if err != nil { + return fmt.Errorf("sorting strategy %s not found: %v", sortingName, err) + } + + binding, err := m.StrategyRegistry.GetBindingStrategy(bindingName) + if err != nil { + return fmt.Errorf("binding strategy %s not found: %v", bindingName, err) + } + + err = m.StrategyRegistry.RegisterAllocationStrategy(allocation.NewGenericAllocationStrategy(name, m.StrategyRegistry, filteringList, sorting, binding)) + if err != nil { + return fmt.Errorf("register allocation strategy %s failed: %v", name, err) + } + + general.Infof("Registered allocation strategy: %s (filtering: %s, sorting: %s, binding: %s)", + name, filteringNames, sortingName, bindingName) + return nil +} + // Global strategy manager instance var ( globalStrategyManager *StrategyManager @@ -153,44 +191,3 @@ func GetGlobalStrategyManager() *StrategyManager { }) return globalStrategyManager } - -// registerDefaultStrategies registers the default strategies -func registerDefaultStrategies(manager *StrategyManager) { - // Register filtering strategies - if err := manager.RegisterFilteringStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { - general.Errorf("Failed to register filtering strategy: %v", err) - } - - if err := manager.RegisterFilteringStrategy(canonical.NewCanonicalStrategy()); err != nil { - general.Errorf("Failed to register sorting strategy: %v", err) - } - - // Register sorting strategies - if err := manager.RegisterSortingStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { - general.Errorf("Failed to register sorting strategy: %v", err) - } - - if err := manager.RegisterSortingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { - general.Errorf("Failed to register sorting strategy: %v", err) - } - - // Register binding strategies - if err := manager.RegisterBindingStrategy(canonical.NewCanonicalStrategy()); err != nil { - general.Errorf("Failed to register binding strategy: %v", err) - } - - if err := manager.RegisterBindingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { - general.Errorf("Failed to register binding strategy: %v", err) - } - - // Register allocation strategies - if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, []string{gpu_memory.StrategyNameGPUMemory}, - gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical); err != nil { - general.Errorf("Failed to register gpu-memory-default strategy: %v", err) - } - - if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDeviceAffinity, []string{deviceaffinity.StrategyNameDeviceAffinity}, - 
deviceaffinity.StrategyNameDeviceAffinity, canonical.StrategyNameCanonical); err != nil { - general.Errorf("Failed to register deviceAffinity strategy: %v", err) - } -} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go index 7b485d9c23..5a9a075249 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go @@ -21,81 +21,10 @@ import ( "github.com/stretchr/testify/assert" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" ) -type dummyStrategy struct { - name string -} - -func (s *dummyStrategy) Name() string { - return s.name -} - -func (s *dummyStrategy) Filter(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { - return allAvailableDevices, nil -} - -func (s *dummyStrategy) Sort(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { - return allAvailableDevices, nil -} - -func (s *dummyStrategy) Bind(_ *allocate.AllocationContext, _ []string) (*allocate.AllocationResult, error) { - return &allocate.AllocationResult{}, nil -} - -func (s *dummyStrategy) Allocate(_ *allocate.AllocationContext) (*allocate.AllocationResult, error) { - return &allocate.AllocationResult{}, nil -} - -func TestStrategyRegistry(t *testing.T) { - t.Parallel() - - registry := NewStrategyRegistry() - // Test filtering strategy registration - filteringStrategy := &dummyStrategy{name: "test-filtering"} - err := registry.RegisterFilteringStrategy(filteringStrategy) - assert.NoError(t, err) - - // Test duplicate registration - err = registry.RegisterFilteringStrategy(filteringStrategy) - assert.Error(t, err) - - // Test strategy retrieval - retrievedStrategy, err := registry.GetFilteringStrategy("test-filtering") - assert.NoError(t, err) - assert.Equal(t, "test-filtering", retrievedStrategy.Name()) - - // Test non-existent strategy - _, err = registry.GetFilteringStrategy("non-existent") - assert.Error(t, err) - - // Test sorting strategy registration - sortingStrategy := &dummyStrategy{name: "test-sorting"} - err = registry.RegisterSortingStrategy(sortingStrategy) - assert.NoError(t, err) - - // Test binding strategy registration - bindingStrategy := &dummyStrategy{name: "test-binding"} - err = registry.RegisterBindingStrategy(bindingStrategy) - assert.NoError(t, err) - - // Test allocation strategy registration - err = registry.RegisterGenericAllocationStrategy("test-allocation", []string{"test-filtering"}, - "test-sorting", "test-binding") - assert.NoError(t, err) - - // Test allocation strategy retrieval - allocationStrategy, err := registry.GetAllocationStrategy("test-allocation") - assert.NoError(t, err) - assert.Equal(t, "test-filtering", allocationStrategy.(*allocation.GenericAllocationStrategy).FilteringStrategy[0].Name()) - assert.Equal(t, "test-sorting", allocationStrategy.(*allocation.GenericAllocationStrategy).SortingStrategy.Name()) - assert.Equal(t, "test-binding", allocationStrategy.(*allocation.GenericAllocationStrategy).BindingStrategy.Name()) -} - func TestStrategyManager(t *testing.T) { t.Parallel() diff --git 
a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go similarity index 78% rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go rename to pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go index 8e218e32f5..48b58014f9 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/registry.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go @@ -14,14 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package manager +package registry import ( "fmt" "sync" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" "github.com/kubewharf/katalyst-core/pkg/util/general" ) @@ -99,42 +98,6 @@ func (r *StrategyRegistry) RegisterAllocationStrategy(strategy allocate.Allocati return nil } -// RegisterGenericAllocationStrategy registers a complete generic allocation strategy with the given name -func (r *StrategyRegistry) RegisterGenericAllocationStrategy( - name string, filteringNames []string, sortingName, bindingName string, -) error { - r.registryMutex.Lock() - defer r.registryMutex.Unlock() - - if _, exists := r.allocationStrategies[name]; exists { - return fmt.Errorf("allocation strategy with name %s already registered", name) - } - - var filteringList []allocate.FilteringStrategy - for _, fn := range filteringNames { - filtering, exists := r.filteringStrategies[fn] - if !exists { - return fmt.Errorf("filtering strategy %s not found", fn) - } - filteringList = append(filteringList, filtering) - } - - sorting, exists := r.sortingStrategies[sortingName] - if !exists { - return fmt.Errorf("sorting strategy %s not found", sortingName) - } - - binding, exists := r.bindingStrategies[bindingName] - if !exists { - return fmt.Errorf("binding strategy %s not found", bindingName) - } - - r.allocationStrategies[name] = allocation.NewGenericAllocationStrategy(name, filteringList, sorting, binding) - general.Infof("Registered allocation strategy: %s (filtering: %s, sorting: %s, binding: %s)", - name, filteringNames, sortingName, bindingName) - return nil -} - // GetFilteringStrategy returns the filtering strategy with the given name func (r *StrategyRegistry) GetFilteringStrategy(name string) (allocate.FilteringStrategy, error) { r.registryMutex.RLock() diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go new file mode 100644 index 0000000000..d013c6a985 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go @@ -0,0 +1,97 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package registry + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +) + +type dummyStrategy struct { + name string +} + +func (s *dummyStrategy) Name() string { + return s.name +} + +func (s *dummyStrategy) Filter(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Sort(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Bind(_ *allocate.AllocationContext, _ []string) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + +func (s *dummyStrategy) Allocate(_ *allocate.AllocationContext) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + +func TestStrategyRegistry(t *testing.T) { + t.Parallel() + + registry := NewStrategyRegistry() + // Test filtering strategy registration + filteringStrategy := &dummyStrategy{name: "test-filtering"} + err := registry.RegisterFilteringStrategy(filteringStrategy) + assert.NoError(t, err) + + // Test duplicate registration + err = registry.RegisterFilteringStrategy(filteringStrategy) + assert.Error(t, err) + + // Test filtering strategy retrieval + retrievedStrategy, err := registry.GetFilteringStrategy("test-filtering") + assert.NoError(t, err) + assert.Equal(t, "test-filtering", retrievedStrategy.Name()) + + // Test non-existent strategy + _, err = registry.GetFilteringStrategy("non-existent") + assert.Error(t, err) + + // Test sorting strategy registration + sortingStrategy := &dummyStrategy{name: "test-sorting"} + err = registry.RegisterSortingStrategy(sortingStrategy) + assert.NoError(t, err) + + // Test strategy retrieval + retrievedSortingStrategy, err := registry.GetSortingStrategy("test-sorting") + assert.NoError(t, err) + assert.Equal(t, "test-sorting", retrievedSortingStrategy.Name()) + + // Test binding strategy registration + bindingStrategy := &dummyStrategy{name: "test-binding"} + err = registry.RegisterBindingStrategy(bindingStrategy) + assert.NoError(t, err) + + // Test allocation strategy registration + allocatingStrategy := &dummyStrategy{name: "test-allocation"} + err = registry.RegisterAllocationStrategy(allocatingStrategy) + assert.NoError(t, err) + + // Test allocation strategy retrieval + allocationStrategy, err := registry.GetAllocationStrategy("test-allocation") + assert.NoError(t, err) + assert.Equal(t, "test-allocation", allocationStrategy.Name()) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go index 1bcd098fb7..f5c55eaf2d 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go @@ -16,27 +16,34 @@ limitations under the License. 
package allocation -import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) // GenericAllocationStrategy combines filtering, sorting, and binding strategies type GenericAllocationStrategy struct { - name string - filteringStrategy []allocate.FilteringStrategy - sortingStrategy allocate.SortingStrategy - bindingStrategy allocate.BindingStrategy + name string + registry *registry.StrategyRegistry + filteringStrategies []allocate.FilteringStrategy + sortingStrategy allocate.SortingStrategy + bindingStrategy allocate.BindingStrategy } // NewGenericAllocationStrategy creates a new allocation strategy with the given components func NewGenericAllocationStrategy(name string, + registry *registry.StrategyRegistry, filtering []allocate.FilteringStrategy, sorting allocate.SortingStrategy, binding allocate.BindingStrategy, ) *GenericAllocationStrategy { return &GenericAllocationStrategy{ - name: name, - filteringStrategy: filtering, - sortingStrategy: sorting, - bindingStrategy: binding, + name: name, + registry: registry, + filteringStrategies: filtering, + sortingStrategy: sorting, + bindingStrategy: binding, } } @@ -46,12 +53,25 @@ func (s *GenericAllocationStrategy) Name() string { return s.name } +func (s *GenericAllocationStrategy) Clone(name string) *GenericAllocationStrategy { + filteringStrategies := make([]allocate.FilteringStrategy, len(s.filteringStrategies)) + copy(filteringStrategies, s.filteringStrategies) + return &GenericAllocationStrategy{ + name: name, + registry: s.registry, + filteringStrategies: filteringStrategies, + sortingStrategy: s.sortingStrategy, + bindingStrategy: s.bindingStrategy, + } +} + // Allocate performs the allocation using the combined strategies func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { var err error + resourceName := ctx.DeviceReq.DeviceName allAvailableDevices := append(ctx.DeviceReq.ReusableDevices, ctx.DeviceReq.AvailableDevices...) 
// Apply filtering strategy - for _, fs := range s.filteringStrategy { + for _, fs := range s.getFilteringStrategies(ctx, resourceName) { allAvailableDevices, err = fs.Filter(ctx, allAvailableDevices) if err != nil { return &allocate.AllocationResult{ @@ -62,7 +82,7 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* } // Apply sorting strategy - sortedDevices, err := s.sortingStrategy.Sort(ctx, allAvailableDevices) + sortedDevices, err := s.getSortingStrategy(ctx, resourceName).Sort(ctx, allAvailableDevices) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -71,7 +91,7 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* } // Apply binding strategy - result, err := s.bindingStrategy.Bind(ctx, sortedDevices) + result, err := s.getBindingStrategy(ctx, resourceName).Bind(ctx, sortedDevices) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -84,12 +104,12 @@ func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (* // GetFilteringStrategy returns the filtering strategy func (s *GenericAllocationStrategy) GetFilteringStrategy() []allocate.FilteringStrategy { - return s.filteringStrategy + return s.filteringStrategies } // SetFilteringStrategy sets the filtering strategy -func (s *GenericAllocationStrategy) SetFilteringStrategy(filteringStrategy []allocate.FilteringStrategy) { - s.filteringStrategy = filteringStrategy +func (s *GenericAllocationStrategy) SetFilteringStrategy(filteringStrategies []allocate.FilteringStrategy) { + s.filteringStrategies = filteringStrategies } // GetSortingStrategy returns the sorting strategy @@ -111,3 +131,46 @@ func (s *GenericAllocationStrategy) GetBindingStrategy() allocate.BindingStrateg func (s *GenericAllocationStrategy) SetBindingStrategy(bindingStrategy allocate.BindingStrategy) { s.bindingStrategy = bindingStrategy } + +func (s *GenericAllocationStrategy) getFilteringStrategies(ctx *allocate.AllocationContext, resourceName string) []allocate.FilteringStrategy { + if strategyNames, ok := ctx.GPUQRMPluginConfig.CustomFilteringStrategies[resourceName]; ok { + filteringStrategies := make([]allocate.FilteringStrategy, 0, len(strategyNames)) + for _, name := range strategyNames { + fs, err := s.registry.GetFilteringStrategy(name) + if err != nil { + general.Errorf("failed to get filtering strategy %s: %v", name, err) + continue + } + filteringStrategies = append(filteringStrategies, fs) + } + return filteringStrategies + } else { + return s.filteringStrategies + } +} + +func (s *GenericAllocationStrategy) getSortingStrategy(ctx *allocate.AllocationContext, resourceName string) allocate.SortingStrategy { + if strategyName, ok := ctx.GPUQRMPluginConfig.CustomSortingStrategy[resourceName]; ok { + sortingStrategy, err := s.registry.GetSortingStrategy(strategyName) + if err != nil { + general.Errorf("failed to get sorting strategy %s: %v", strategyName, err) + sortingStrategy = s.sortingStrategy + } + return sortingStrategy + } else { + return s.sortingStrategy + } +} + +func (s *GenericAllocationStrategy) getBindingStrategy(ctx *allocate.AllocationContext, resourceName string) allocate.BindingStrategy { + if strategyName, ok := ctx.GPUQRMPluginConfig.CustomBindingStrategy[resourceName]; ok { + bindingStrategy, err := s.registry.GetBindingStrategy(strategyName) + if err != nil { + general.Errorf("failed to get binding strategy %s: %v", strategyName, err) + bindingStrategy = s.bindingStrategy + } + return bindingStrategy + } else { + return
s.bindingStrategy + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go index 342bce4d1f..fecb92a717 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go @@ -22,9 +22,8 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" "github.com/kubewharf/katalyst-core/pkg/util/general" ) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go index fc183f4efc..6ec9307521 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go @@ -34,7 +34,6 @@ func NewDeviceAffinityStrategy() *DeviceAffinityStrategy { var ( _ allocate.BindingStrategy = &DeviceAffinityStrategy{} - _ allocate.SortingStrategy = &DeviceAffinityStrategy{} ) // Name returns the name of the binding strategy diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go index 15c95ccea0..08cc5fd62b 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go @@ -41,6 +41,11 @@ func (s *GPUMemoryStrategy) Filter(ctx *allocate.AllocationContext, allAvailable return allAvailableDevices, nil } + if gpuMemory == 0 { + general.Infof("GPU Memory is 0, use default available devices") + return allAvailableDevices, nil + } + filteredDevices, err := s.filterGPUDevices(ctx, gpuMemory, allAvailableDevices) if err != nil { return nil, err diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go index 9a7dea90df..e4cfaef234 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort.go @@ -34,12 +34,17 @@ func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevice return nil, fmt.Errorf("GPU topology is nil") } - _, _, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) + _, gpuMemory, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) if err != nil { general.Warningf("getReqQuantityFromResourceReq failed with error: %v, use default filtered devices", err) return filteredDevices, nil } + if gpuMemory == 0 { + general.Infof("GPU Memory is 0, use default available devices") + return filteredDevices, nil + } + gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()) machineState := ctx.MachineState[consts.ResourceGPUMemory] diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go index 62d223a10c..f610910083 100644 --- 
a/pkg/config/agent/qrm/gpu_plugin.go +++ b/pkg/config/agent/qrm/gpu_plugin.go @@ -16,21 +16,29 @@ limitations under the License. package qrm -import "k8s.io/apimachinery/pkg/api/resource" +import ( + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) type GPUQRMPluginConfig struct { // PolicyName is used to switch between several strategies PolicyName string // GPUDeviceNames is the names of the GPU device GPUDeviceNames []string + // RDMADeviceNames is the names of the RDMA device + RDMADeviceNames []string // GPUMemoryAllocatablePerGPU is the total memory allocatable for each GPU GPUMemoryAllocatablePerGPU resource.Quantity // SkipGPUStateCorruption skip gpu state corruption, and it will be used after updating state properties SkipGPUStateCorruption bool - // RDMADeviceNames is the names of the RDMA device - RDMADeviceNames []string + + *gpustrategy.GPUStrategyConfig } func NewGPUQRMPluginConfig() *GPUQRMPluginConfig { - return &GPUQRMPluginConfig{} + return &GPUQRMPluginConfig{ + GPUStrategyConfig: gpustrategy.NewGPUStrategyConfig(), + } } diff --git a/pkg/config/agent/qrm/gpustrategy/allocate.go b/pkg/config/agent/qrm/gpustrategy/allocate.go new file mode 100644 index 0000000000..2653b1ecb8 --- /dev/null +++ b/pkg/config/agent/qrm/gpustrategy/allocate.go @@ -0,0 +1,28 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpustrategy + +type AllocateStrategyConfig struct { + CustomFilteringStrategies map[string][]string + CustomSortingStrategy map[string]string + CustomBindingStrategy map[string]string + CustomAllocationStrategy map[string]string +} + +func NewGPUAllocateStrategyConfig() *AllocateStrategyConfig { + return &AllocateStrategyConfig{} +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go b/pkg/config/agent/qrm/gpustrategy/strategy_base.go similarity index 65% rename from pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go rename to pkg/config/agent/qrm/gpustrategy/strategy_base.go index 473f25b67b..a901f80bdc 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/sort.go +++ b/pkg/config/agent/qrm/gpustrategy/strategy_base.go @@ -14,11 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package deviceaffinity +package gpustrategy -import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +type GPUStrategyConfig struct { + *AllocateStrategyConfig +} -// Sort does nothing for device affinity strategy -func (s *DeviceAffinityStrategy) Sort(_ *allocate.AllocationContext, filteredDevices []string) ([]string, error) { - return filteredDevices, nil +func NewGPUStrategyConfig() *GPUStrategyConfig { + return &GPUStrategyConfig{ + AllocateStrategyConfig: NewGPUAllocateStrategyConfig(), + } } diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index 8a0c31c37c..bca3e3440d 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -35,13 +35,19 @@ type DeviceTopologyProvider interface { // DeviceTopologyRegistry is a registry of all topology providers that knows how to provide topology information of machine devices type DeviceTopologyRegistry struct { - // DeviceNameToProvider is a mapping of device name to their respective topology provider - DeviceNameToProvider map[string]DeviceTopologyProvider + mux sync.RWMutex + + // deviceTopologyProviders is a mapping of device name to their respective topology provider + deviceTopologyProviders map[string]DeviceTopologyProvider + + // deviceTopologyAffinityProviders is a mapping of device name to their respective affinity provider + deviceTopologyAffinityProviders map[string]DeviceAffinityProvider } func NewDeviceTopologyRegistry() *DeviceTopologyRegistry { return &DeviceTopologyRegistry{ - DeviceNameToProvider: make(map[string]DeviceTopologyProvider), + deviceTopologyProviders: make(map[string]DeviceTopologyProvider), + deviceTopologyAffinityProviders: make(map[string]DeviceAffinityProvider), } } @@ -49,47 +55,60 @@ func NewDeviceTopologyRegistry() *DeviceTopologyRegistry { func (r *DeviceTopologyRegistry) RegisterDeviceTopologyProvider( deviceName string, deviceTopologyProvider DeviceTopologyProvider, ) { - r.DeviceNameToProvider[deviceName] = deviceTopologyProvider + r.mux.Lock() + defer r.mux.Unlock() + + r.deviceTopologyProviders[deviceName] = deviceTopologyProvider +} + +func (r *DeviceTopologyRegistry) RegisterTopologyAffinityProvider( + deviceName string, deviceAffinityProvider DeviceAffinityProvider, +) { + r.mux.Lock() + defer r.mux.Unlock() + + r.deviceTopologyAffinityProviders[deviceName] = deviceAffinityProvider } // SetDeviceTopology sets the device topology for the specified device name. func (r *DeviceTopologyRegistry) SetDeviceTopology(deviceName string, deviceTopology *DeviceTopology) error { - provider, ok := r.DeviceNameToProvider[deviceName] + r.mux.Lock() + defer r.mux.Unlock() + + topologyProvider, ok := r.deviceTopologyProviders[deviceName] if !ok { return fmt.Errorf("no device topology provider found for device %s", deviceName) } - return provider.SetDeviceTopology(deviceTopology) + topologyAffinityProvider, ok := r.deviceTopologyAffinityProviders[deviceName] + if ok { + topologyAffinityProvider.SetDeviceAffinity(deviceTopology) + general.Infof("set device affinity provider for device %s, %v", deviceName, deviceTopology) + } + + return topologyProvider.SetDeviceTopology(deviceTopology) +} + +// GetAllDeviceTopologyProviders returns all registered device topology providers. +func (r *DeviceTopologyRegistry) GetAllDeviceTopologyProviders() map[string]DeviceTopologyProvider { + r.mux.RLock() + defer r.mux.RUnlock() + + return r.deviceTopologyProviders } // GetDeviceTopology gets the device topology for the specified device name. 
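
The registry refactor above guards both provider maps with an RWMutex and makes SetDeviceTopology notify an optional affinity provider before delegating to the required topology provider. A minimal self-contained sketch of that write path, using simplified stand-in types rather than the real katalyst-core interfaces:

package main

import (
	"fmt"
	"sync"
)

type DeviceTopology struct{ Devices []string }

type TopologyProvider interface {
	SetDeviceTopology(t *DeviceTopology) error
}

type AffinityProvider interface {
	SetDeviceAffinity(t *DeviceTopology)
}

type registry struct {
	mux      sync.RWMutex
	topology map[string]TopologyProvider
	affinity map[string]AffinityProvider
}

// setDeviceTopology mirrors the pattern above: a missing affinity provider is
// not an error because affinity enrichment is optional, while a missing
// topology provider is fatal.
func (r *registry) setDeviceTopology(name string, t *DeviceTopology) error {
	r.mux.Lock()
	defer r.mux.Unlock()

	tp, ok := r.topology[name]
	if !ok {
		return fmt.Errorf("no topology provider for device %s", name)
	}
	if ap, ok := r.affinity[name]; ok {
		// Enrich the topology with affinity information before it is stored.
		ap.SetDeviceAffinity(t)
	}
	return tp.SetDeviceTopology(t)
}

type fakeTopo struct{ last *DeviceTopology }

func (f *fakeTopo) SetDeviceTopology(t *DeviceTopology) error { f.last = t; return nil }

type fakeAffinity struct{ notified bool }

func (f *fakeAffinity) SetDeviceAffinity(*DeviceTopology) { f.notified = true }

func main() {
	r := &registry{
		topology: map[string]TopologyProvider{"gpu": &fakeTopo{}},
		affinity: map[string]AffinityProvider{"gpu": &fakeAffinity{}},
	}
	if err := r.setDeviceTopology("gpu", &DeviceTopology{Devices: []string{"gpu-1"}}); err != nil {
		panic(err)
	}
	fmt.Println("topology stored and affinity provider notified")
}

Holding the write lock across both calls keeps the topology and its derived affinity view consistent for readers taking the RLock in GetDeviceTopology below.
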
func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTopology, bool, error) { - provider, ok := r.DeviceNameToProvider[deviceName] + r.mux.RLock() + defer r.mux.RUnlock() + + provider, ok := r.deviceTopologyProviders[deviceName] if !ok { return nil, false, fmt.Errorf("no device topology provider found for device %s", deviceName) } return provider.GetDeviceTopology() } -// SetDeviceAffinity establishes customised logic of affinity between devices -func (r *DeviceTopologyRegistry) SetDeviceAffinity( - deviceName string, deviceAffinityProvider DeviceAffinityProvider, -) error { - deviceTopologyProvider, ok := r.DeviceNameToProvider[deviceName] - if !ok { - return fmt.Errorf("no device topology provider found for device %s", deviceName) - } - deviceTopology, numaReady, err := deviceTopologyProvider.GetDeviceTopology() - if err != nil { - return fmt.Errorf("could not get device topology provider for device %s, err: %v", deviceName, err) - } - if !numaReady || deviceTopology == nil { - return fmt.Errorf("device topology provider for device %s is not ready", deviceName) - } - deviceAffinityProvider.SetDeviceAffinity(deviceTopology) - return deviceTopologyProvider.SetDeviceTopology(deviceTopology) -} - // GetDeviceNUMAAffinity retrieves a map of a certain device to the list of devices that it has an affinity with. // A device is considered to have an affinity with another device if they are on the exact same NUMA node(s) func (r *DeviceTopologyRegistry) GetDeviceNUMAAffinity(deviceA, deviceB string) (map[string][]string, error) { From baceedeaa29ee07e47cee9ce45742336546137f2 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Wed, 29 Oct 2025 12:05:37 +0800 Subject: [PATCH 27/52] chore: rebase katalyst-api --- go.mod | 4 ++-- go.sum | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index e85139eb2b..2ef5e3a055 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/golang/mock v1.6.0 github.com/golang/protobuf v1.5.3 github.com/google/cadvisor v0.44.2 + github.com/google/go-cmp v0.5.9 github.com/google/uuid v1.3.0 github.com/h2non/gock v1.2.0 github.com/klauspost/cpuid/v2 v2.2.6 @@ -100,7 +101,6 @@ require ( github.com/godbus/dbus/v5 v5.0.6 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/gnostic v0.6.9 // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1 + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251029040321-5a97019a9b2c k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index 479db12f40..938e286d15 100644 --- a/go.sum +++ b/go.sum @@ -585,8 +585,6 @@ github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQ github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1 h1:GZ9+zAV1u5pP5xTqcshvNpr0qRIkl/Gu6/YFd2RFT9A= github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= 
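
The go.mod bump and the paired go.sum line swaps around this hunk come from replace directives that pin forked modules while the corresponding upstream changes are in review. A hypothetical minimal example of the mechanism (the fork path and pseudo-version here are made up for illustration):

replace github.com/kubewharf/katalyst-api => github.com/example/katalyst-api v0.0.0-20250101000000-0123456789ab

After editing the directive, running go mod tidy rewrites go.sum so that the fork's paired h1: (module tree) and /go.mod (manifest) hashes replace the old entries, which is exactly the remove/add pattern visible in this hunk.
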
-github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15 h1:QFm2/NiJscpQyVm8ejkuGiWVd7dUVL+WooTN5OxuG6A=
-github.com/luomingmeng/kubelet v0.0.0-20250731182916-1e7f011d1c15/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
 github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ=
 github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
 github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
@@ -937,6 +935,8 @@ github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5
 github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
 github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca/go.mod h1:ce1O1j6UtZfjr22oyGxGLbauSBp2YVXpARAosm7dHBg=
 github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
+github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2 h1:km3N0XyOxD5yh/xdKwLdXnMx01wFRKeDz4/CT8ui8a0=
+github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
 github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

From 679626510661e29db6c313448e2d1b75518bf5ac Mon Sep 17 00:00:00 2001
From: "justin.cheng"
Date: Wed, 29 Oct 2025 13:39:49 +0800
Subject: [PATCH 28/52] chore: fix unit test, format and lint issues

---
 .../gpu/customdeviceplugin/gpu/gpu.go         |  4 +-
 pkg/agent/qrm-plugins/gpu/state/util.go       |  3 +-
 .../strategies/deviceaffinity/bind.go         | 92 +++++++++----------
 .../strategies/deviceaffinity/bind_test.go    | 72 ++++++++++++++-
 .../deviceaffinity/device_affinity.go         |  4 +-
 5 files changed, 118 insertions(+), 57 deletions(-)

diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
index b26dd8943a..0ebdfe13cc 100644
--- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
@@ -130,7 +130,7 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
 	// Get GPU topology
 	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
 	if err != nil {
-		general.Warningf("failed to get gpu topology: %w", err)
+		general.Warningf("failed to get gpu topology: %v", err)
 		return nil, fmt.Errorf("failed to get gpu topology: %w", err)
 	}
 
@@ -168,7 +168,7 @@
 
 	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
 	if err != nil {
-		general.Warningf("failed to get gpu topology: %w", err)
+		general.Warningf("failed to get gpu topology: %v", err)
 		return nil, fmt.Errorf("failed to get gpu topology: %w", err)
 	}
 
diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go
index 6a24bc2a0f..8863220cdb 100644
--- a/pkg/agent/qrm-plugins/gpu/state/util.go
+++ 
b/pkg/agent/qrm-plugins/gpu/state/util.go @@ -19,8 +19,9 @@ package state import ( "fmt" - "github.com/kubewharf/katalyst-core/pkg/util/machine" v1 "k8s.io/api/core/v1" + + "github.com/kubewharf/katalyst-core/pkg/util/machine" ) // GenerateMachineState returns an empty AllocationResourcesMap for all resource names. diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index 659488c399..c7fcaae22f 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -22,9 +22,11 @@ import ( "github.com/google/uuid" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" + "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" ) @@ -39,22 +41,23 @@ type affinityGroup struct { // and candidateDevices refer to a set of devices that we can potentially allocate in an affinity group. type possibleAllocation struct { unallocatedSize int - candidateDevices sets.String + candidateDevices []string } -// allocationByIntersectionResult is the result of allocating devices by maximising intersection size of possible allocations +// allocationByIntersectionResult is the result of allocating devices by maximizing intersection size of possible allocations // with an affinity group. type allocationByIntersectionResult struct { - allocatedDevices sets.String - availableDevices sets.String - finished bool - err error + finished bool + err error } // Bind binds the sorted devices to the allocation context by searching for the devices that have affinity to each other. 
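
The slimmed-down allocationByIntersectionResult above works because sets.String is a map type: inserts performed by a callee are visible to the caller through the same reference, so only the control-flow fields (finished, err) need to be returned. A small sketch of that aliasing behavior, with hypothetical names:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// allocateUpTo inserts candidates into the caller's set in place and reports
// whether the target count was reached; returning the set would be redundant.
func allocateUpTo(allocated sets.String, candidates []string, target int) bool {
	for _, d := range candidates {
		allocated.Insert(d)
		if allocated.Len() == target {
			return true
		}
	}
	return false
}

func main() {
	allocated := sets.NewString()
	finished := allocateUpTo(allocated, []string{"gpu-3", "gpu-4"}, 2)
	// The caller observes the callee's inserts without them being returned.
	fmt.Println(finished, allocated.List()) // true [gpu-3 gpu-4]
}
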
func (s *DeviceAffinityStrategy) Bind( ctx *allocate.AllocationContext, sortedDevices []string, ) (*allocate.AllocationResult, error) { + general.InfoS("device affinity strategy binding called", + "available devices", sortedDevices) + valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) if !valid { return &allocate.AllocationResult{ @@ -103,8 +106,6 @@ func (s *DeviceAffinityStrategy) Bind( }, fmt.Errorf("failed to allocate available devices with affinity: %v", err) } - fmt.Println("allocated devices:", allocatedDevices.UnsortedList()) - // Return result once we have allocated all the devices if len(allocatedDevices) == devicesToAllocate { return &allocate.AllocationResult{ @@ -204,17 +205,14 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices) - allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, availableDevicesSet, devicesToAllocate, priority == len(affinityMap)-1) + allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1) if allocateByIntersectionRes.err != nil { return nil, allocateByIntersectionRes.err } if allocateByIntersectionRes.finished { - return allocateByIntersectionRes.allocatedDevices, nil + return allocatedDevices, nil } - - allocatedDevices = allocateByIntersectionRes.allocatedDevices - availableDevicesSet = allocateByIntersectionRes.availableDevices } return allocatedDevices, nil @@ -255,21 +253,18 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( // The number of unallocated devices in the group is retrieved by taking a difference between // the unallocated devices in the group and the already allocated devices unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(), - candidateDevices: deviceIntersection, + candidateDevices: deviceIntersection.UnsortedList(), }) } - allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, unallocatedAvailableDevices, devicesToAllocate, priority == len(allocatedAffinityGroupIds)-1) + allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(allocatedAffinityGroupIds)-1) if allocateByIntersectionRes.err != nil { return nil, allocateByIntersectionRes.err } if allocateByIntersectionRes.finished { - return allocateByIntersectionRes.allocatedDevices, nil + return allocatedDevices, nil } - - allocatedDevices = allocateByIntersectionRes.allocatedDevices - unallocatedAvailableDevices = allocateByIntersectionRes.availableDevices } return allocatedDevices, nil @@ -277,12 +272,12 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( // mergePossibleAllocationsAndSort merges the possible allocations by their unallocated size and sorts them in ascending order of their unallocated size. 
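
The merge-and-sort below is what implements the bin-packing preference: among groups that can serve the request, the one with the fewest still-unallocated devices is filled first, leaving larger groups intact for bigger requests. A toy run with made-up group sizes:

package main

import (
	"fmt"
	"sort"
)

type possibleAllocation struct {
	unallocatedSize  int
	candidateDevices []string
}

func main() {
	// Three affinity groups offering one candidate device each; the group
	// with only 1 device still unallocated is the best bin-packing target.
	allocs := []possibleAllocation{
		{unallocatedSize: 4, candidateDevices: []string{"gpu-9"}},
		{unallocatedSize: 1, candidateDevices: []string{"gpu-2"}},
		{unallocatedSize: 2, candidateDevices: []string{"gpu-6"}},
	}
	sort.Slice(allocs, func(i, j int) bool {
		return allocs[i].unallocatedSize < allocs[j].unallocatedSize
	})
	for _, a := range allocs {
		fmt.Println(a.unallocatedSize, a.candidateDevices)
	}
	// Output:
	// 1 [gpu-2]
	// 2 [gpu-6]
	// 4 [gpu-9]
}
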
func (s *DeviceAffinityStrategy) mergePossibleAllocationsAndSort(possibleAllocations []possibleAllocation) []possibleAllocation { - merged := make(map[int]sets.String) + merged := make(map[int][]string) for _, alloc := range possibleAllocations { if _, ok := merged[alloc.unallocatedSize]; !ok { - merged[alloc.unallocatedSize] = sets.NewString() + merged[alloc.unallocatedSize] = make([]string, 0) } - merged[alloc.unallocatedSize].Insert(alloc.candidateDevices.UnsortedList()...) + merged[alloc.unallocatedSize] = append(merged[alloc.unallocatedSize], alloc.candidateDevices...) } mergedAllocations := make([]possibleAllocation, 0, len(merged)) @@ -316,7 +311,7 @@ func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap( // The number of unallocated devices in the group is retrieved by taking a difference between // the unallocated devices in the group and the already allocated devices unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(), - candidateDevices: deviceIntersection, + candidateDevices: deviceIntersection.UnsortedList(), }) } @@ -325,7 +320,7 @@ func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap( // allocateByIntersection allocates devices by the following algorithm // 1. Sort the intersection sizes of possible allocations in descending order, we want to allocate devices with larger intersection size with an affinity group. -// 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximise +// 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize // bin-packing (try to fill up an affinity group that is already allocated with other devices. // 3. For each intersection size, allocate devices in the order of the sorted possible allocations. // 4. If a possible allocation has a number of intersected devices larger than the devices needed for allocation, we go to the next priority and try to find an allocation from there. @@ -333,7 +328,7 @@ func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap( // to fill up the remaining devices. 
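
Steps 1-3 of the comment above boil down to: intersect each affinity group with the available devices, bucket groups by intersection size, then visit buckets from largest to smallest. A compressed sketch with stand-in data (the real code also tracks unallocatedSize per group):

package main

import (
	"fmt"
	"sort"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	available := sets.NewString("gpu-1", "gpu-2", "gpu-5")
	groups := [][]string{
		{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, // intersection size 2
		{"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, // intersection size 1
	}

	// Bucket each group's candidate devices by its intersection size with
	// the available set, mirroring makeIntersectionToPossibleAllocationsMap.
	bySize := map[int][]string{}
	for _, g := range groups {
		inter := sets.NewString(g...).Intersection(available)
		bySize[inter.Len()] = append(bySize[inter.Len()], inter.List()...)
	}

	// Visit larger intersections first, as in step 1 of the comment above.
	sizes := make([]int, 0, len(bySize))
	for s := range bySize {
		sizes = append(sizes, s)
	}
	sort.Sort(sort.Reverse(sort.IntSlice(sizes)))
	for _, s := range sizes {
		fmt.Println(s, bySize[s]) // prints 2 [gpu-1 gpu-2], then 1 [gpu-5]
	}
}
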
func (s *DeviceAffinityStrategy) allocateByIntersection( intersectionToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String, - unallocatedAvailableDevices sets.String, devicesToAllocate int, isLastPriority bool, + devicesToAllocate int, isLastPriority bool, ) allocationByIntersectionResult { // Sort the intersection sizes of possible allocations in descending order intersectionSizes := make([]int, 0, len(intersectionToPossibleAllocationsMap)) @@ -345,8 +340,16 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( return intersectionSizes[i] > intersectionSizes[j] }) - if len(intersectionToPossibleAllocationsMap) > 0 { + if len(intersectionSizes) > 0 { + // Find the first intersection size that is larger than or equal to the devices needed for allocation maxIntersection := intersectionSizes[0] + for _, intersectionSize := range intersectionSizes { + if intersectionSize <= devicesToAllocate { + maxIntersection = intersectionSize + break + } + } + possibleAllocations, ok := intersectionToPossibleAllocationsMap[maxIntersection] if !ok { return allocationByIntersectionResult{ @@ -355,29 +358,27 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( } } + // TODO: Dont need to merge if last priority mergedPossibleAllocations := s.mergePossibleAllocationsAndSort(possibleAllocations) + klog.Infof("possible allocations: %v", mergedPossibleAllocations) + for _, possibleAlloc := range mergedPossibleAllocations { // If devices of possible allocation size is larger than the devices needed, and it is not the last priority level, // go to the next priority and try to allocate - if !isLastPriority && possibleAlloc.candidateDevices.Len() > devicesToAllocate-allocatedDevices.Len() { + if !isLastPriority && len(possibleAlloc.candidateDevices) > devicesToAllocate-allocatedDevices.Len() { return allocationByIntersectionResult{ - finished: false, - err: nil, - allocatedDevices: allocatedDevices, - availableDevices: unallocatedAvailableDevices, + finished: false, + err: nil, } } - for device := range possibleAlloc.candidateDevices { + for _, device := range possibleAlloc.candidateDevices { allocatedDevices.Insert(device) - unallocatedAvailableDevices.Delete(device) if allocatedDevices.Len() == devicesToAllocate { return allocationByIntersectionResult{ - finished: true, - err: nil, - allocatedDevices: allocatedDevices, - availableDevices: unallocatedAvailableDevices, + finished: true, + err: nil, } } } @@ -385,7 +386,7 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( // At the last priority, we just go through the other possible allocations of the other intersection sizes if we have not allocated finish the candidate devices if isLastPriority { - for _, intersectionSize := range intersectionSizes[1:] { + for _, intersectionSize := range intersectionSizes { possibleAllocations, ok = intersectionToPossibleAllocationsMap[intersectionSize] if !ok { return allocationByIntersectionResult{ @@ -400,15 +401,12 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( }) for _, possibleAlloc := range possibleAllocations { - for device := range possibleAlloc.candidateDevices { + for _, device := range possibleAlloc.candidateDevices { allocatedDevices.Insert(device) - unallocatedAvailableDevices.Delete(device) if allocatedDevices.Len() == devicesToAllocate { return allocationByIntersectionResult{ - finished: true, - err: nil, - allocatedDevices: allocatedDevices, - availableDevices: unallocatedAvailableDevices, + finished: true, + err: nil, } } } @@ -417,10 +415,8 @@ func 
(s *DeviceAffinityStrategy) allocateByIntersection( } } return allocationByIntersectionResult{ - finished: false, - err: nil, - allocatedDevices: allocatedDevices, - availableDevices: unallocatedAvailableDevices, + finished: false, + err: nil, } } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go index e86bbc8268..51aafa905e 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go @@ -119,7 +119,7 @@ func TestBind(t *testing.T) { }, }, }, - sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4", "gpu-5"}, + sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4"}, expectedResult: &allocate.AllocationResult{ AllocatedDevices: []string{"gpu-3", "gpu-4"}, Success: true, @@ -170,6 +170,70 @@ func TestBind(t *testing.T) { Success: true, }, }, + { + name: "supports bin-packing of 1 request with only available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 1, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu1", "gpu2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3"}, + Success: true, + }, + }, { name: "supports of bin-packing of 2 allocated devices", ctx: &allocate.AllocationContext{ @@ -387,9 +451,11 @@ func TestBind(t *testing.T) { }, }, }, - sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + // Allocate gpu-1, gpu-2 in first level, then allocate gpu-3 as it has affinity with gpu-1 and gpu-2 + // Then allocate gpu-5 as gpu-6 is already allocated + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7", "gpu-8"}, expectedResult: &allocate.AllocationResult{ - AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-7"}, + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5"}, Success: true, }, }, diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go index 6ec9307521..d637483ea9 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go +++ 
b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go @@ -32,9 +32,7 @@ func NewDeviceAffinityStrategy() *DeviceAffinityStrategy { return &DeviceAffinityStrategy{} } -var ( - _ allocate.BindingStrategy = &DeviceAffinityStrategy{} -) +var _ allocate.BindingStrategy = &DeviceAffinityStrategy{} // Name returns the name of the binding strategy func (s *DeviceAffinityStrategy) Name() string { From a5fc440dbf3f62fbd275ab216347e8429b272d2a Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Mon, 3 Nov 2025 11:02:03 +0800 Subject: [PATCH 29/52] fix: maintain affinity subgroup sequence in larger affinity groups fix: maintain affinity subgroup sequence in larger affinity groups --- .../strategies/deviceaffinity/bind.go | 132 ++- .../strategies/deviceaffinity/bind_test.go | 853 +++++++++++++++++- pkg/util/machine/device.go | 100 +- 3 files changed, 963 insertions(+), 122 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index c7fcaae22f..e313e96fa1 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -34,7 +34,7 @@ import ( // It is uniquely identified by an id. type affinityGroup struct { id string - unallocatedDevices sets.String + unallocatedDevices []string } // possibleAllocation refers to information about a certain affinity group, which includes the number of unallocated devices in the group, @@ -74,6 +74,7 @@ func (s *DeviceAffinityStrategy) Bind( // Get a map of affinity groups that is grouped by priority affinityMap := ctx.DeviceTopology.GroupDeviceAffinity() + klog.Infof("affinity map: %v", affinityMap) affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) idToAffinityGroupMap := s.getAffinityGroupById(affinityGroupByPriority) @@ -144,6 +145,7 @@ func (s *DeviceAffinityStrategy) getAffinityGroupsByPriority( for priority, affinityDevices := range affinityMap { affinityGroupsMap[priority] = s.getAffinityGroups(affinityDevices, unallocatedDevicesSet) } + return affinityGroupsMap } @@ -162,7 +164,7 @@ func (s *DeviceAffinityStrategy) getAffinityGroups( } } affinityGroups = append(affinityGroups, affinityGroup{ - unallocatedDevices: sets.NewString(unallocatedDevices...), + unallocatedDevices: unallocatedDevices, id: uuid.NewString(), }) } @@ -196,22 +198,24 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( return allocatedDevices, nil } - // Otherwise, we need to allocate these devices by their affinity - for priority := 0; priority < len(affinityMap); priority++ { - groups, ok := affinityMap[machine.AffinityPriority(priority)] - if !ok { - return nil, fmt.Errorf("affinity priority %v not found", priority) - } + for allocatedDevices.Len() < devicesToAllocate { + // Otherwise, we need to allocate these devices by their affinity + for priority := 0; priority < len(affinityMap); priority++ { + groups, ok := affinityMap[machine.AffinityPriority(priority)] + if !ok { + return nil, fmt.Errorf("affinity priority %v not found", priority) + } - intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices) + intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices) - allocateByIntersectionRes := 
s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1) - if allocateByIntersectionRes.err != nil { - return nil, allocateByIntersectionRes.err - } + allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1) + if allocateByIntersectionRes.err != nil { + return nil, allocateByIntersectionRes.err + } - if allocateByIntersectionRes.finished { - return allocatedDevices, nil + if allocateByIntersectionRes.finished { + return allocatedDevices, nil + } } } @@ -244,7 +248,7 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( return nil, fmt.Errorf("affinity group %v not found", groupID) } - deviceIntersection := group.unallocatedDevices.Intersection(unallocatedAvailableDevices) + deviceIntersection := sets.NewString(group.unallocatedDevices...).Intersection(unallocatedAvailableDevices) if _, ok = intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok { intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0) } @@ -252,7 +256,7 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{ // The number of unallocated devices in the group is retrieved by taking a difference between // the unallocated devices in the group and the already allocated devices - unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(), + unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(), candidateDevices: deviceIntersection.UnsortedList(), }) } @@ -303,25 +307,38 @@ func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap( intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation) for _, group := range groups { // Find intersection of affinity group and the available reusable devices - deviceIntersection := group.unallocatedDevices.Intersection(availableDevicesSet) - if _, ok := intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok { - intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0) + deviceIntersection := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet) + if len(deviceIntersection) == 0 { + continue } - intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{ + if _, ok := intersectionToPossibleAllocationsMap[len(deviceIntersection)]; !ok { + intersectionToPossibleAllocationsMap[len(deviceIntersection)] = make([]possibleAllocation, 0) + } + intersectionToPossibleAllocationsMap[len(deviceIntersection)] = append(intersectionToPossibleAllocationsMap[len(deviceIntersection)], possibleAllocation{ // The number of unallocated devices in the group is retrieved by taking a difference between // the unallocated devices in the group and the already allocated devices - unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(), - candidateDevices: deviceIntersection.UnsortedList(), + unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(), + candidateDevices: deviceIntersection, }) } return intersectionToPossibleAllocationsMap } +func getDeviceIntersection(unallocatedDevices []string, 
availableDevices sets.String) []string { + deviceIntersection := make([]string, 0) + for _, device := range unallocatedDevices { + if availableDevices.Has(device) { + deviceIntersection = append(deviceIntersection, device) + } + } + return deviceIntersection +} + // allocateByIntersection allocates devices by the following algorithm // 1. Sort the intersection sizes of possible allocations in descending order, we want to allocate devices with larger intersection size with an affinity group. // 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize -// bin-packing (try to fill up an affinity group that is already allocated with other devices. +// bin-packing (try to fill up an affinity group that is already allocated with other devices). // 3. For each intersection size, allocate devices in the order of the sorted possible allocations. // 4. If a possible allocation has a number of intersected devices larger than the devices needed for allocation, we go to the next priority and try to find an allocation from there. // 5. If we are currently at the last affinity priority level, we go through the other possible allocations (that are in sorted ascending order of number of unallocated devices) @@ -330,7 +347,8 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( intersectionToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String, devicesToAllocate int, isLastPriority bool, ) allocationByIntersectionResult { - // Sort the intersection sizes of possible allocations in descending order + // Sort the intersection sizes of possible allocations in descending order because we want to process the larger intersections first. + // A larger intersection means that we are able to find more devices that have an affinity with an affinity group. intersectionSizes := make([]int, 0, len(intersectionToPossibleAllocationsMap)) for intersectionSize := range intersectionToPossibleAllocationsMap { intersectionSizes = append(intersectionSizes, intersectionSize) @@ -340,25 +358,31 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( return intersectionSizes[i] > intersectionSizes[j] }) - if len(intersectionSizes) > 0 { - // Find the first intersection size that is larger than or equal to the devices needed for allocation - maxIntersection := intersectionSizes[0] - for _, intersectionSize := range intersectionSizes { - if intersectionSize <= devicesToAllocate { - maxIntersection = intersectionSize - break - } + // If there is an intersection size that is larger than or equal to the number of devices needed for allocation, + // find the smallest intersection size. This is so that we try to reduce fragmentation as much as possible. 
+ // For example, if we have 1 device to allocate, and we have intersectionSizes of 2 and 1, we want to allocate to the group with + // intersection of size 1, as this means that we are able to successfully do bin-packing (fill up an affinity group that + // that is already allocated with other devices) + start := 0 + for i, intersectionSize := range intersectionSizes { + if intersectionSize <= devicesToAllocate { + start = i + break } + } - possibleAllocations, ok := intersectionToPossibleAllocationsMap[maxIntersection] + klog.Infof("intersection to possible allocations map: %v", intersectionToPossibleAllocationsMap) + + for i := start; i < len(intersectionSizes); i++ { + intersectionSize := intersectionSizes[i] + possibleAllocations, ok := intersectionToPossibleAllocationsMap[intersectionSize] if !ok { return allocationByIntersectionResult{ finished: false, - err: fmt.Errorf("possible reusable devices of intersection size %v not found", maxIntersection), + err: fmt.Errorf("possible reusable devices of intersection size %v not found", intersectionSize), } } - // TODO: Dont need to merge if last priority mergedPossibleAllocations := s.mergePossibleAllocationsAndSort(possibleAllocations) klog.Infof("possible allocations: %v", mergedPossibleAllocations) @@ -366,7 +390,8 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( for _, possibleAlloc := range mergedPossibleAllocations { // If devices of possible allocation size is larger than the devices needed, and it is not the last priority level, // go to the next priority and try to allocate - if !isLastPriority && len(possibleAlloc.candidateDevices) > devicesToAllocate-allocatedDevices.Len() { + remainingToAllocate := devicesToAllocate - allocatedDevices.Len() + if !isLastPriority && len(possibleAlloc.candidateDevices) > remainingToAllocate { return allocationByIntersectionResult{ finished: false, err: nil, @@ -383,37 +408,8 @@ func (s *DeviceAffinityStrategy) allocateByIntersection( } } } - - // At the last priority, we just go through the other possible allocations of the other intersection sizes if we have not allocated finish the candidate devices - if isLastPriority { - for _, intersectionSize := range intersectionSizes { - possibleAllocations, ok = intersectionToPossibleAllocationsMap[intersectionSize] - if !ok { - return allocationByIntersectionResult{ - finished: false, - err: fmt.Errorf("possible device allocation of intersection size %v not found", intersectionSize), - } - } - - // Sort possible allocations by their unallocated size in ascending order - sort.Slice(possibleAllocations, func(i, j int) bool { - return possibleAllocations[i].unallocatedSize < possibleAllocations[j].unallocatedSize - }) - - for _, possibleAlloc := range possibleAllocations { - for _, device := range possibleAlloc.candidateDevices { - allocatedDevices.Insert(device) - if allocatedDevices.Len() == devicesToAllocate { - return allocationByIntersectionResult{ - finished: true, - err: nil, - } - } - } - } - } - } } + return allocationByIntersectionResult{ finished: false, err: nil, @@ -428,7 +424,7 @@ func (s *DeviceAffinityStrategy) findAllAffinityGroupIdsByPriority( for _, device := range allocatedDevices { for priority, groups := range affinityMap { for _, group := range groups { - if group.unallocatedDevices.Has(device) { + if sets.NewString(group.unallocatedDevices...).Has(device) { if _, ok := affinityGroupIds[priority]; !ok { affinityGroupIds[priority] = sets.NewString() } diff --git 
a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go index 51aafa905e..283504e0ac 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go @@ -17,6 +17,8 @@ limitations under the License. package deviceaffinity import ( + "reflect" + "sort" "testing" "github.com/google/go-cmp/cmp" @@ -29,16 +31,265 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/machine" ) -func TestBind(t *testing.T) { +func TestBind_NumberOfDevicesAllocated(t *testing.T) { t.Parallel() tests := []struct { - name string - ctx *allocate.AllocationContext - sortedDevices []string - expectedResult *allocate.AllocationResult - expectedErr bool + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedResult *allocate.AllocationResult + expectedErr bool + isRandom bool + expectedResultSize int }{ + { + name: "able to allocate 1 device in affinity group of size 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 1, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + isRandom: true, + expectedResultSize: 1, + }, + { + name: "able to allocate 2 devices in affinity group of size 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + isRandom: true, + expectedResultSize: 2, + }, + { + name: "able to allocate 3 devices in affinity size of group 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 3, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + isRandom: true, + expectedResultSize: 3, + }, + { + name: "able to allocate 4 devices in affinity size of group 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + Success: true, + }, + }, + { + name: "able to allocate 2 devices in affinity size of group 4", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + isRandom: true, + expectedResultSize: 2, + }, { name: "allocate all reusable devices first", ctx: &allocate.AllocationContext{ @@ -1286,35 +1537,546 @@ func TestBind(t *testing.T) { Success: true, }, }, - } - - for _, tt := range tests { - tt := tt - 
t.Run(tt.name, func(t *testing.T) { - t.Parallel() - deviceBindingStrategy := NewDeviceAffinityStrategy() - result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) - if (err != nil) != tt.expectedErr { - t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) - } - verifyAllocationResult(t, result, tt.expectedResult) - }) - } -} - -func verifyAllocationResult( - t *testing.T, result *allocate.AllocationResult, expectedResult *allocate.AllocationResult, -) { - if (result == nil) != (expectedResult == nil) { - t.Errorf("result = %v, expectedResult = %v", result, expectedResult) - return - } - if result.Success != expectedResult.Success { - t.Errorf("result.Success = %v, expectedResult.Success = %v", result.Success, expectedResult.Success) - return - } - if len(result.AllocatedDevices) != len(expectedResult.AllocatedDevices) { - t.Errorf("result.AllocatedDevices = %v, expectedResult.AllocatedDevices = %v", result.AllocatedDevices, expectedResult.AllocatedDevices) + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 8 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "0": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + 
}, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + isRandom: true, + expectedResultSize: 2, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + deviceBindingStrategy := NewDeviceAffinityStrategy() + result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) + if (err != nil) != tt.expectedErr { + t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) + } + verifyAllocationResult(t, result, tt.expectedResult, tt.isRandom, tt.expectedResultSize) + }) + } +} + +func TestBind_DeviceAffinity(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedErr bool + expectedAffinityPriorityLevel int + }{ + { + name: "1 level of device affinity, 2 devices in a group", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedAffinityPriorityLevel: 0, + }, + { 
+ name: "2 devices in affinity priority 0, 4 devices in affinity priority 1", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedAffinityPriorityLevel: 1, + }, + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 4 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + 1: {"gpu-1", 
"gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + expectedAffinityPriorityLevel: 0, + }, + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 8 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + DeviceRequest: 8, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 
0: {"gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + expectedAffinityPriorityLevel: 1, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + deviceBindingStrategy := NewDeviceAffinityStrategy() + result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) + if (err != nil) != tt.expectedErr { + t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) + } + + verifyResultIsAffinity(t, result, tt.ctx.DeviceTopology, tt.expectedAffinityPriorityLevel) + }) + } +} + +func verifyAllocationResult( + t *testing.T, result *allocate.AllocationResult, expectedResult *allocate.AllocationResult, isRandom bool, + expectedResultSize int, +) { + if isRandom { + if len(result.AllocatedDevices) != expectedResultSize { + t.Errorf("result.AllocatedDevices = %v, expectedResultSize = %v", result.AllocatedDevices, expectedResultSize) + } + return + } + if (result == nil) != (expectedResult == nil) { + t.Errorf("result = %v, expectedResult = %v", result, expectedResult) + return + } + if result.Success != expectedResult.Success { + 
t.Errorf("result.Success = %v, expectedResult.Success = %v", result.Success, expectedResult.Success) + return + } + if len(result.AllocatedDevices) != len(expectedResult.AllocatedDevices) { + t.Errorf("result.AllocatedDevices = %v, expectedResult.AllocatedDevices = %v", result.AllocatedDevices, expectedResult.AllocatedDevices) return } if diff := cmp.Diff(result.AllocatedDevices, expectedResult.AllocatedDevices, @@ -1323,3 +2085,24 @@ func verifyAllocationResult( t.Errorf("Bind() mismatch (-got +want):\n%s", diff) } } + +func verifyResultIsAffinity( + t *testing.T, result *allocate.AllocationResult, topology *machine.DeviceTopology, + expectedAffinityPriorityLevel int, +) { + affinityMap := topology.GroupDeviceAffinity() + priorityLevelDevices := affinityMap[machine.AffinityPriority(expectedAffinityPriorityLevel)] + + sort.Slice(result.AllocatedDevices, func(i, j int) bool { + return result.AllocatedDevices[i] < result.AllocatedDevices[j] + }) + + for _, deviceIDs := range priorityLevelDevices { + sort.Slice(deviceIDs, func(i, j int) bool { return deviceIDs[i] < deviceIDs[j] }) + if reflect.DeepEqual(deviceIDs, machine.DeviceIDs(result.AllocatedDevices)) { + return + } + } + + t.Errorf("result = %v, did not find it within an affinity group", result.AllocatedDevices) +} diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index bca3e3440d..445d50c19c 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -19,6 +19,7 @@ package machine import ( "fmt" "sort" + "strings" "sync" "k8s.io/apimachinery/pkg/util/sets" @@ -146,35 +147,96 @@ type DeviceTopology struct { Devices map[string]DeviceInfo } -// GroupDeviceAffinity forms a topology graph such that all groups of DeviceIDs within a certain affinity priority level +// GroupDeviceAffinity forms a topology graph such that all groups of DeviceIDs are within a certain affinity priority level +// It preserves sub-group boundaries and eliminates duplicates +// E.g. if priority 0 has groups [1, 3, 5, 6] and [0, 2, 4, 7], priority 1 will be either [1, 3, 5, 6, 0, 2, 4, 7] or [0, 2, 4, 7, 1, 3, 5, 6] and not any other permutation +// This is to ensure that the higher priority affinity groups keep its permutation when it is in lower priority affinity group. 
func (t *DeviceTopology) GroupDeviceAffinity() map[AffinityPriority][]DeviceIDs { deviceAffinityGroup := make(map[AffinityPriority][]DeviceIDs) - for deviceId, deviceInfo := range t.Devices { - for priority, affinityDeviceIDs := range deviceInfo.DeviceAffinity { - affinityDeviceIDs = append(affinityDeviceIDs, deviceId) - // Sort the strings for easier deduplication - sort.Strings(affinityDeviceIDs) - if _, ok := deviceAffinityGroup[priority]; !ok { - deviceAffinityGroup[priority] = make([]DeviceIDs, 0) - } - // Add the affinityDeviceIDs to the priority level if it is not already there - if !containsGroup(deviceAffinityGroup[priority], affinityDeviceIDs) { - deviceAffinityGroup[priority] = append(deviceAffinityGroup[priority], affinityDeviceIDs) + // Collect unique groups per priority + uniqueGroups := make(map[AffinityPriority]map[string]DeviceIDs) + + for id, deviceInfo := range t.Devices { + for priority, group := range deviceInfo.DeviceAffinity { + // Ensure the device itself is included + if !slices.Contains(group, id) { + group = append(group, id) } + // Sort for consistent deduplication key + sortedGroup := make([]string, len(group)) + copy(sortedGroup, group) + sort.Strings(sortedGroup) + + key := strings.Join(sortedGroup, ",") + if _, ok := uniqueGroups[priority]; !ok { + uniqueGroups[priority] = make(map[string]DeviceIDs) + } + uniqueGroups[priority][key] = group } } - return deviceAffinityGroup -} -func containsGroup(groups []DeviceIDs, candidate DeviceIDs) bool { - for _, g := range groups { - if slices.Equal(g, candidate) { - return true + // Iterate priorities in order + for priority := 0; ; priority++ { + groupsMap, ok := uniqueGroups[AffinityPriority(priority)] + if !ok || len(groupsMap) == 0 { + break // no more groups at this priority + } + + // Build lower-group map for merging (priority > 0) + lowerGroupMap := make(map[string]int) + if priority > 0 { + for idx, g := range deviceAffinityGroup[AffinityPriority(priority-1)] { + for _, d := range g { + lowerGroupMap[d] = idx + } + } + } + + for _, group := range groupsMap { + if priority > 0 { + // Merge according to lower-priority group boundaries + lowerGroups := make(map[int][]string) + for _, d := range group { + if idx, ok := lowerGroupMap[d]; ok { + lowerGroups[idx] = append(lowerGroups[idx], d) + } else { + lowerGroups[-1] = append(lowerGroups[-1], d) + } + } + + merged := []string{} + for idx := 0; idx < len(deviceAffinityGroup[AffinityPriority(priority-1)]); idx++ { + if devs, ok := lowerGroups[idx]; ok { + merged = append(merged, devs...) + } + } + if devs, ok := lowerGroups[-1]; ok { + merged = append(merged, devs...) 
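+					// Devices outside every lower-priority group (bucket -1) are appended last. For example,
+					// with priority 0 groups [gpu-1 gpu-2] (index 0) and [gpu-3 gpu-4] (index 1), a priority 1
+					// group {gpu-4, gpu-2, gpu-3, gpu-1} is rebuilt by bucketing members by their priority 0
+					// group and emitting the buckets in group-index order, so devices that share a priority 0
+					// group always stay adjacent in the merged result.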
+ } + group = merged + } + + // Deduplicate final groups + key := strings.Join(group, ",") + if _, ok := deviceAffinityGroup[AffinityPriority(priority)]; !ok { + deviceAffinityGroup[AffinityPriority(priority)] = []DeviceIDs{} + } + alreadyExists := false + for _, g := range deviceAffinityGroup[AffinityPriority(priority)] { + if strings.Join(g, ",") == key { + alreadyExists = true + break + } + } + if !alreadyExists { + deviceAffinityGroup[AffinityPriority(priority)] = append(deviceAffinityGroup[AffinityPriority(priority)], group) + } } } - return false + + return deviceAffinityGroup } func (t *DeviceTopology) GetDeviceAffinityMap(deviceId string) (map[AffinityPriority]DeviceIDs, error) { From b82919e0fc45189961229b8228b882b64ffeff46 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Mon, 3 Nov 2025 15:56:56 +0800 Subject: [PATCH 30/52] refactor: simplify code by deleting redundant parameters and refactor function for allocating devices refactor: simplify code by deleting redundant parameters and refactor function for allocating devices --- .../strategies/deviceaffinity/bind.go | 64 ++++++++----------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index e313e96fa1..8469139672 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -74,13 +74,12 @@ func (s *DeviceAffinityStrategy) Bind( // Get a map of affinity groups that is grouped by priority affinityMap := ctx.DeviceTopology.GroupDeviceAffinity() - klog.Infof("affinity map: %v", affinityMap) affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) idToAffinityGroupMap := s.getAffinityGroupById(affinityGroupByPriority) // Allocate reusable devices first - allocatedDevices, err := s.allocateCandidateDevices(reusableDevicesSet, devicesToAllocate, unallocatedDevicesSet, affinityGroupByPriority, sets.NewString()) + allocatedDevices, err := s.allocateAvailableDevicesWithAffinity(sets.NewString(), reusableDevicesSet, devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -95,12 +94,8 @@ func (s *DeviceAffinityStrategy) Bind( }, nil } - // Find all the affinity group ids that the allocated devices belong to and their respective priorities - allocatedAffinityGroupIds := s.findAllAffinityGroupIdsByPriority(allocatedDevices.UnsortedList(), affinityGroupByPriority) - - availableDevicesSet := sets.NewString(sortedDevices...) 
- // Next, allocate from available devices, but try to allocate from same affinity group as the allocated reusable devices - if allocatedDevices, err = s.allocateAvailableDevicesWithAffinity(allocatedDevices, availableDevicesSet, unallocatedDevicesSet, devicesToAllocate, allocatedAffinityGroupIds, idToAffinityGroupMap); err != nil { + // Next, allocate available devices + if allocatedDevices, err = s.allocateAvailableDevicesWithAffinity(allocatedDevices, unallocatedDevicesSet, devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority); err != nil { return &allocate.AllocationResult{ Success: false, ErrorMessage: fmt.Sprintf("failed to allocate available devices with affinity: %v", err), @@ -115,22 +110,6 @@ func (s *DeviceAffinityStrategy) Bind( }, nil } - // Lastly, allocate the rest of the devices from available devices - if allocatedDevices, err = s.allocateCandidateDevices(availableDevicesSet, devicesToAllocate, unallocatedDevicesSet, affinityGroupByPriority, allocatedDevices); err != nil { - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: fmt.Sprintf("failed to allocate available devices: %v", err), - }, fmt.Errorf("failed to allocate available devices: %v", err) - } - - // Return result once we have allocated all the devices - if len(allocatedDevices) == devicesToAllocate { - return &allocate.AllocationResult{ - Success: true, - AllocatedDevices: allocatedDevices.UnsortedList(), - }, nil - } - return &allocate.AllocationResult{ Success: false, ErrorMessage: fmt.Sprintf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)), @@ -184,12 +163,12 @@ func (s *DeviceAffinityStrategy) getAffinityGroupById(affinityGroupByPriority ma // allocateCandidateDevices finds the best allocation given some candidate devices by finding those devices with the best affinity to each other. func (s *DeviceAffinityStrategy) allocateCandidateDevices( - candidateDevicesSet sets.String, devicesToAllocate int, unallocatedDevices sets.String, - affinityMap map[machine.AffinityPriority][]affinityGroup, allocatedDevices sets.String, + candidateDevicesSet sets.String, devicesToAllocate int, + affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, allocatedDevices sets.String, ) (sets.String, error) { // Retrieve all unallocated devices by getting the intersection of reusable devices and unallocated devices. // Devices that are already allocated should be excluded. 
- availableDevicesSet := unallocatedDevices.Intersection(candidateDevicesSet).Difference(allocatedDevices) + availableDevicesSet := candidateDevicesSet.Difference(allocatedDevices) // If the available reusable devices is less than or equal to request, we need to allocate all of them remainingQuantity := devicesToAllocate - len(allocatedDevices) @@ -200,15 +179,15 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( for allocatedDevices.Len() < devicesToAllocate { // Otherwise, we need to allocate these devices by their affinity - for priority := 0; priority < len(affinityMap); priority++ { - groups, ok := affinityMap[machine.AffinityPriority(priority)] + for priority := 0; priority < len(affinityGroupsByPriority); priority++ { + groups, ok := affinityGroupsByPriority[machine.AffinityPriority(priority)] if !ok { return nil, fmt.Errorf("affinity priority %v not found", priority) } - intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices) + intersectionToPossibleAllocationsMap := s.getIntersectionToPossibleAllocations(groups, availableDevicesSet, allocatedDevices) - allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1) + allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityGroupsByPriority)-1) if allocateByIntersectionRes.err != nil { return nil, allocateByIntersectionRes.err } @@ -223,13 +202,15 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( } // allocateAvailableDevicesWithAffinity allocates devices from a set of available devices by trying to find the best device affinity -// to some already allocated reusable devices. +// to some already allocated reusable devices. Then, if there are still devices needed to be allocated, we continue to find the +// most optimal allocation for them. func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( - allocatedDevices, availableDevices, unallocatedDevices sets.String, devicesToAllocate int, - allocatedAffinityGroupIds map[machine.AffinityPriority]sets.String, idToAffinityGroupMap map[string]affinityGroup, + allocatedDevices, availableDevices sets.String, devicesToAllocate int, + idToAffinityGroupMap map[string]affinityGroup, + affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, ) (sets.String, error) { - // Unallocated available devices are retrieved by getting an intersection of total unallocated devices and the available devices to be allocated. - unallocatedAvailableDevices := unallocatedDevices.Intersection(availableDevices) + // Find all the affinity group ids that the allocated devices belong to and their respective priorities + allocatedAffinityGroupIds := s.findAllAffinityGroupIdsByPriority(allocatedDevices.UnsortedList(), affinityGroupsByPriority) // From the highest priority to the lowest priority, get the group IDs of the devices that are already allocated // and try to allocate from those groups. 
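// A minimal sketch of the resulting Bind flow, assuming the signatures introduced above (an editor's
// paraphrase, not lines of the patched file): the same helper now runs twice,
//
//	allocatedDevices, err := s.allocateAvailableDevicesWithAffinity(sets.NewString(), reusableDevicesSet,
//		devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority) // pass 1: reusable devices
//	allocatedDevices, err = s.allocateAvailableDevicesWithAffinity(allocatedDevices, unallocatedDevicesSet,
//		devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority) // pass 2: remaining devices
//
// and the helper itself falls through to allocateCandidateDevices for any devices still missing after the
// affinity-guided step.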
@@ -248,7 +229,7 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( return nil, fmt.Errorf("affinity group %v not found", groupID) } - deviceIntersection := sets.NewString(group.unallocatedDevices...).Intersection(unallocatedAvailableDevices) + deviceIntersection := sets.NewString(group.unallocatedDevices...).Intersection(availableDevices) if _, ok = intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok { intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0) } @@ -271,6 +252,12 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( } } + // After allocating devices that have affinity to the allocated devices, find an optimal allocation for the rest of the available devices + allocatedDevices, err := s.allocateCandidateDevices(availableDevices, devicesToAllocate, affinityGroupsByPriority, allocatedDevices) + if err != nil { + return nil, err + } + return allocatedDevices, nil } @@ -301,13 +288,14 @@ func (s *DeviceAffinityStrategy) mergePossibleAllocationsAndSort(possibleAllocat return mergedAllocations } -func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap( +func (s *DeviceAffinityStrategy) getIntersectionToPossibleAllocations( groups []affinityGroup, availableDevicesSet, allocatedDevices sets.String, ) map[int][]possibleAllocation { intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation) for _, group := range groups { // Find intersection of affinity group and the available reusable devices deviceIntersection := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet) + // Ignore groups with no intersection as there is no affinity to the group if len(deviceIntersection) == 0 { continue } From 0cd4251d248ece7c884ed4ea3f22d9d08e1e7e24 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Tue, 4 Nov 2025 16:01:22 +0800 Subject: [PATCH 31/52] refactor: make allocation recursive to simplify logic refactor: make allocation recursive to simplify logic refactor: make allocation recursive to simplify logic --- .../strategies/deviceaffinity/bind.go | 205 +++++++----------- .../strategies/deviceaffinity/bind_test.go | 71 ++++++ 2 files changed, 155 insertions(+), 121 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index 8469139672..df773ffab3 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -76,10 +76,8 @@ func (s *DeviceAffinityStrategy) Bind( affinityMap := ctx.DeviceTopology.GroupDeviceAffinity() affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) - idToAffinityGroupMap := s.getAffinityGroupById(affinityGroupByPriority) - - // Allocate reusable devices first - allocatedDevices, err := s.allocateAvailableDevicesWithAffinity(sets.NewString(), reusableDevicesSet, devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority) + allocatedDevices, err := s.allocateCandidateDevices(reusableDevicesSet, unallocatedDevicesSet, devicesToAllocate, + affinityGroupByPriority, sets.NewString(), false, false) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -94,22 +92,6 @@ func (s *DeviceAffinityStrategy) Bind( }, nil } - // Next, allocate available devices - if allocatedDevices, err = s.allocateAvailableDevicesWithAffinity(allocatedDevices, 
unallocatedDevicesSet, devicesToAllocate, idToAffinityGroupMap, affinityGroupByPriority); err != nil { - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: fmt.Sprintf("failed to allocate available devices with affinity: %v", err), - }, fmt.Errorf("failed to allocate available devices with affinity: %v", err) - } - - // Return result once we have allocated all the devices - if len(allocatedDevices) == devicesToAllocate { - return &allocate.AllocationResult{ - Success: true, - AllocatedDevices: allocatedDevices.UnsortedList(), - }, nil - } - return &allocate.AllocationResult{ Success: false, ErrorMessage: fmt.Sprintf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)), @@ -163,21 +145,47 @@ func (s *DeviceAffinityStrategy) getAffinityGroupById(affinityGroupByPriority ma // allocateCandidateDevices finds the best allocation given some candidate devices by finding those devices with the best affinity to each other. func (s *DeviceAffinityStrategy) allocateCandidateDevices( - candidateDevicesSet sets.String, devicesToAllocate int, - affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, allocatedDevices sets.String, + reusableDevicesSet, totalAvailableDevicesSet sets.String, devicesToAllocate int, + defaultAffinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, + allocatedDevices sets.String, isFindAvailableDeviceAffinity, hasAllocatedReusable bool, ) (sets.String, error) { // Retrieve all unallocated devices by getting the intersection of reusable devices and unallocated devices. // Devices that are already allocated should be excluded. - availableDevicesSet := candidateDevicesSet.Difference(allocatedDevices) + var candidateDevicesSet sets.String + + if !hasAllocatedReusable { + candidateDevicesSet = reusableDevicesSet + } else { + candidateDevicesSet = totalAvailableDevicesSet + } + + availableDevicesSet := candidateDevicesSet + // Only subtract out the available devices with the allocated devices if we are trying to allocate from scratch + if !isFindAvailableDeviceAffinity { + availableDevicesSet = availableDevicesSet.Difference(allocatedDevices) + } // If the available reusable devices is less than or equal to request, we need to allocate all of them remainingQuantity := devicesToAllocate - len(allocatedDevices) if availableDevicesSet.Len() <= remainingQuantity { allocatedDevices = availableDevicesSet + + // If we are now allocating reusable devices, make another recursive call to allocate available devices + if !hasAllocatedReusable { + return s.allocateCandidateDevices(reusableDevicesSet, totalAvailableDevicesSet, devicesToAllocate, + defaultAffinityGroupsByPriority, allocatedDevices, !isFindAvailableDeviceAffinity, true) + } return allocatedDevices, nil } - for allocatedDevices.Len() < devicesToAllocate { + var affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup + if isFindAvailableDeviceAffinity { + affinityGroupsByPriority = s.findAllAllocatedAffinityGroupsByPriority(allocatedDevices.UnsortedList(), defaultAffinityGroupsByPriority) + } else { + affinityGroupsByPriority = defaultAffinityGroupsByPriority + } + + for { // Otherwise, we need to allocate these devices by their affinity for priority := 0; priority < len(affinityGroupsByPriority); priority++ { groups, ok := affinityGroupsByPriority[machine.AffinityPriority(priority)] @@ -185,9 +193,9 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( return nil, fmt.Errorf("affinity priority %v not found", priority) } - 
intersectionToPossibleAllocationsMap := s.getIntersectionToPossibleAllocations(groups, availableDevicesSet, allocatedDevices) + candidateDeviceSizeToPossibleAllocationsMap := s.getCandidateDeviceSizeToPossibleAllocations(groups, availableDevicesSet, allocatedDevices) - allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityGroupsByPriority)-1) + allocateByIntersectionRes := s.allocateByIntersection(candidateDeviceSizeToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityGroupsByPriority)-1) if allocateByIntersectionRes.err != nil { return nil, allocateByIntersectionRes.err } @@ -196,69 +204,15 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( return allocatedDevices, nil } } - } - - return allocatedDevices, nil -} -// allocateAvailableDevicesWithAffinity allocates devices from a set of available devices by trying to find the best device affinity -// to some already allocated reusable devices. Then, if there are still devices needed to be allocated, we continue to find the -// most optimal allocation for them. -func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity( - allocatedDevices, availableDevices sets.String, devicesToAllocate int, - idToAffinityGroupMap map[string]affinityGroup, - affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, -) (sets.String, error) { - // Find all the affinity group ids that the allocated devices belong to and their respective priorities - allocatedAffinityGroupIds := s.findAllAffinityGroupIdsByPriority(allocatedDevices.UnsortedList(), affinityGroupsByPriority) - - // From the highest priority to the lowest priority, get the group IDs of the devices that are already allocated - // and try to allocate from those groups. 
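// A rough sketch of the recursion introduced here, with the flag pair (isFindAvailableDeviceAffinity,
// hasAllocatedReusable): Bind enters at (false, false) to allocate from the reusable devices; the tail call
// flips to (true, true) to pick available devices that have affinity with what is already allocated; and a
// final tail call at (false, true) falls back to an unconstrained pass over all remaining available devices.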
- for priority := 0; priority < len(allocatedAffinityGroupIds); priority++ { - groupIDs, ok := allocatedAffinityGroupIds[machine.AffinityPriority(priority)] - if !ok { - return nil, fmt.Errorf("unallocated affinity group ids in priority level %v not found", priority) - } - - intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation) - - for groupID := range groupIDs { - // Get affinity group - group, ok := idToAffinityGroupMap[groupID] - if !ok { - return nil, fmt.Errorf("affinity group %v not found", groupID) - } - - deviceIntersection := sets.NewString(group.unallocatedDevices...).Intersection(availableDevices) - if _, ok = intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok { - intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0) - } - - intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{ - // The number of unallocated devices in the group is retrieved by taking a difference between - // the unallocated devices in the group and the already allocated devices - unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(), - candidateDevices: deviceIntersection.UnsortedList(), - }) - } - - allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(allocatedAffinityGroupIds)-1) - if allocateByIntersectionRes.err != nil { - return nil, allocateByIntersectionRes.err - } - - if allocateByIntersectionRes.finished { - return allocatedDevices, nil + // isFindAvailableDeviceAffinity is true if we are trying to find available devices that have affinity with the already allocated reusable device + if isFindAvailableDeviceAffinity || allocatedDevices.Len() >= devicesToAllocate { + break } } - // After allocating devices that have affinity to the allocated devices, find an optimal allocation for the rest of the available devices - allocatedDevices, err := s.allocateCandidateDevices(availableDevices, devicesToAllocate, affinityGroupsByPriority, allocatedDevices) - if err != nil { - return nil, err - } - - return allocatedDevices, nil + return s.allocateCandidateDevices(reusableDevicesSet, totalAvailableDevicesSet, devicesToAllocate, + defaultAffinityGroupsByPriority, allocatedDevices, !isFindAvailableDeviceAffinity, true) } // mergePossibleAllocationsAndSort merges the possible allocations by their unallocated size and sorts them in ascending order of their unallocated size. @@ -288,29 +242,32 @@ func (s *DeviceAffinityStrategy) mergePossibleAllocationsAndSort(possibleAllocat return mergedAllocations } -func (s *DeviceAffinityStrategy) getIntersectionToPossibleAllocations( +// getCandidateDeviceSizeToPossibleAllocations calculates the candidate devices given an affinity group and available devices by +// taking the intersection of the two groups. Then, it returns a map of size of candidate devices to an array of possible allocations, +// where one possible allocation contains the number of unallocated devices in an affinity group, and the candidate devices calculated earlier. 
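+// For illustration (device IDs are hypothetical): with availableDevicesSet {gpu-1, gpu-2, gpu-5}, a group whose
+// unallocated devices are [gpu-1 gpu-2 gpu-3], another group with [gpu-5 gpu-6], and nothing allocated yet, the result is
+// {2: [{unallocatedSize: 3, candidateDevices: [gpu-1 gpu-2]}], 1: [{unallocatedSize: 2, candidateDevices: [gpu-5]}]}.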
+func (s *DeviceAffinityStrategy) getCandidateDeviceSizeToPossibleAllocations( groups []affinityGroup, availableDevicesSet, allocatedDevices sets.String, ) map[int][]possibleAllocation { - intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation) + candidateDeviceSizeToPossibleAllocationsMap := make(map[int][]possibleAllocation) for _, group := range groups { - // Find intersection of affinity group and the available reusable devices - deviceIntersection := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet) + // Find intersection of affinity group and the available reusable devices, these will be the set of allocatable candidate devices in a group + candidateDevices := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet) // Ignore groups with no intersection as there is no affinity to the group - if len(deviceIntersection) == 0 { + if len(candidateDevices) == 0 { continue } - if _, ok := intersectionToPossibleAllocationsMap[len(deviceIntersection)]; !ok { - intersectionToPossibleAllocationsMap[len(deviceIntersection)] = make([]possibleAllocation, 0) + if _, ok := candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)]; !ok { + candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)] = make([]possibleAllocation, 0) } - intersectionToPossibleAllocationsMap[len(deviceIntersection)] = append(intersectionToPossibleAllocationsMap[len(deviceIntersection)], possibleAllocation{ + candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)] = append(candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)], possibleAllocation{ // The number of unallocated devices in the group is retrieved by taking a difference between // the unallocated devices in the group and the already allocated devices unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(), - candidateDevices: deviceIntersection, + candidateDevices: candidateDevices, }) } - return intersectionToPossibleAllocationsMap + return candidateDeviceSizeToPossibleAllocationsMap } func getDeviceIntersection(unallocatedDevices []string, availableDevices sets.String) []string { @@ -324,46 +281,47 @@ func getDeviceIntersection(unallocatedDevices []string, availableDevices sets.St } // allocateByIntersection allocates devices by the following algorithm -// 1. Sort the intersection sizes of possible allocations in descending order, we want to allocate devices with larger intersection size with an affinity group. -// 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize +// 1. Sort the candidate devices size of possible allocations in descending order, as we want to prioritize devices with larger candidate devices size with an affinity group. +// 2. For each candidate devices size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize // bin-packing (try to fill up an affinity group that is already allocated with other devices). -// 3. For each intersection size, allocate devices in the order of the sorted possible allocations. -// 4. If a possible allocation has a number of intersected devices larger than the devices needed for allocation, we go to the next priority and try to find an allocation from there. +// 3. For each candidate device size, allocate devices in the order of the sorted possible allocations. +// 4. 
If a possible allocation has more candidate devices than the number of devices still needed, we go to the next priority and try to find an allocation there.
 // 5. If we are currently at the last affinity priority level, we go through the other possible allocations (that are in sorted ascending order of number of unallocated devices)
 // to fill up the remaining devices.
 func (s *DeviceAffinityStrategy) allocateByIntersection(
-	intersectionToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String,
+	candidateDevicesSizeToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String,
 	devicesToAllocate int, isLastPriority bool,
 ) allocationByIntersectionResult {
 	// Sort the intersection sizes of possible allocations in descending order because we want to process the larger intersections first.
 	// A larger intersection means that we are able to find more devices that have an affinity with an affinity group.
-	intersectionSizes := make([]int, 0, len(intersectionToPossibleAllocationsMap))
-	for intersectionSize := range intersectionToPossibleAllocationsMap {
-		intersectionSizes = append(intersectionSizes, intersectionSize)
+	candidateDevicesSizes := make([]int, 0, len(candidateDevicesSizeToPossibleAllocationsMap))
+	for candidateSize := range candidateDevicesSizeToPossibleAllocationsMap {
+		candidateDevicesSizes = append(candidateDevicesSizes, candidateSize)
 	}
-	sort.Slice(intersectionSizes, func(i, j int) bool {
-		return intersectionSizes[i] > intersectionSizes[j]
+	sort.Slice(candidateDevicesSizes, func(i, j int) bool {
+		return candidateDevicesSizes[i] > candidateDevicesSizes[j]
 	})
 
 	// If there is an intersection size that is larger than or equal to the number of devices needed for allocation,
 	// find the smallest intersection size. This is so that we try to reduce fragmentation as much as possible.
-	// For example, if we have 1 device to allocate, and we have intersectionSizes of 2 and 1, we want to allocate to the group with
+	// For example, if we have 1 device to allocate, and we have candidateDevicesSizes of 2 and 1, we want to allocate to the group with
 	// intersection of size 1, as this means that we are able to successfully do bin-packing (fill up an affinity group
 	// that is already allocated with other devices)
-	start := 0
-	for i, intersectionSize := range intersectionSizes {
-		if intersectionSize <= devicesToAllocate {
+	// Find the starting index: the first size <= devicesToAllocate, or the last one if none qualifies
+	start := len(candidateDevicesSizes) - 1
+	for i, size := range candidateDevicesSizes {
+		if size <= devicesToAllocate {
 			start = i
 			break
 		}
 	}
 
-	klog.Infof("intersection to possible allocations map: %v", intersectionToPossibleAllocationsMap)
+	klog.Infof("candidate device size to possible allocations map: %v", candidateDevicesSizeToPossibleAllocationsMap)
 
-	for i := start; i < len(intersectionSizes); i++ {
-		intersectionSize := intersectionSizes[i]
-		possibleAllocations, ok := intersectionToPossibleAllocationsMap[intersectionSize]
+	for i := start; i < len(candidateDevicesSizes); i++ {
+		intersectionSize := candidateDevicesSizes[i]
+		possibleAllocations, ok := candidateDevicesSizeToPossibleAllocationsMap[intersectionSize]
 		if !ok {
 			return allocationByIntersectionResult{
 				finished: false,
@@ -404,22 +362,27 @@
 	}
 }
 
-// findAllAffinityGroupIdsByPriority finds the affinity group ids of the allocated devices by affinity priority level.
-func (s *DeviceAffinityStrategy) findAllAffinityGroupIdsByPriority( +func (s *DeviceAffinityStrategy) findAllAllocatedAffinityGroupsByPriority( allocatedDevices []string, affinityMap map[machine.AffinityPriority][]affinityGroup, -) map[machine.AffinityPriority]sets.String { - affinityGroupIds := make(map[machine.AffinityPriority]sets.String) +) map[machine.AffinityPriority][]affinityGroup { + affinityGroupsByPriority := make(map[machine.AffinityPriority][]affinityGroup) + seenIds := sets.NewString() for _, device := range allocatedDevices { for priority, groups := range affinityMap { for _, group := range groups { if sets.NewString(group.unallocatedDevices...).Has(device) { - if _, ok := affinityGroupIds[priority]; !ok { - affinityGroupIds[priority] = sets.NewString() + if seenIds.Has(group.id) { + continue + } + + if _, ok := affinityGroupsByPriority[priority]; !ok { + affinityGroupsByPriority[priority] = make([]affinityGroup, 0) } - affinityGroupIds[priority].Insert(group.id) + affinityGroupsByPriority[priority] = append(affinityGroupsByPriority[priority], group) + seenIds.Insert(group.id) } } } } - return affinityGroupIds + return affinityGroupsByPriority } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go index 283504e0ac..1c04fe97ea 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go @@ -1654,6 +1654,77 @@ func TestBind_NumberOfDevicesAllocated(t *testing.T) { isRandom: true, expectedResultSize: 2, }, + { + name: "allocate first level of affinity priority devices first, then second level of affinity priority, both have affinity to reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-0"}, + DeviceRequest: 3, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-4", "gpu-5"}, + }, + }, + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-0"}, + 1: {"gpu-0", "gpu-4", "gpu-5"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-3", "gpu-6", "gpu-7"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-6", "gpu-7"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-0", "gpu-1", "gpu-5"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-0", "gpu-1", "gpu-4"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-2", "gpu-3", "gpu-7"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-2", "gpu-3", "gpu-6"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-4", "gpu-6", "gpu-7"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-0", "gpu-1", "gpu-4"}, + Success: true, + }, + }, } for _, tt := range tests 
{ From 3caa8c15a9f84474a06804935f75c5cbb7961c86 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 5 Nov 2025 11:15:00 +0800 Subject: [PATCH 32/52] fix(gpu): handle NUMA node edge cases and improve logging - Add logging for missing device affinity provider - Fix negative NUMA node handling in GPU memory allocation - Simplify GPU topology lookup logic - Ensure proper handling of zero quantity resource requests --- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 66 +++++++++++++------ pkg/util/machine/device.go | 2 + 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index e8113a2c7f..6fac568bca 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -156,18 +156,10 @@ func (p *GPUMemPlugin) calculateHints( numaToAvailableGPUCount := make(map[int]float64) numaToMostAllocatedGPUMemory := make(map[int]float64) - for gpuID, s := range machineState { - if s == nil { - continue - } - + for gpuID, info := range gpuTopology.Devices { + s := machineState[gpuID] allocated := s.GetQuantityAllocated() if allocated+perGPUMemory <= float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) { - info, ok := gpuTopology.Devices[gpuID] - if !ok { - return nil, fmt.Errorf("gpu %s not found in gpuTopology", gpuID) - } - for _, numaNode := range info.GetNUMANode() { numaToAvailableGPUCount[numaNode] += 1 numaToMostAllocatedGPUMemory[numaNode] = math.Max(allocated, numaToMostAllocatedGPUMemory[numaNode]) @@ -301,11 +293,22 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) ( perNUMAAllocated := alloc.Quantity if len(alloc.NUMANodes) > 0 { perNUMAAllocated = alloc.Quantity / float64(len(alloc.NUMANodes)) - } - - for _, nodeID := range alloc.NUMANodes { + for _, nodeID := range alloc.NUMANodes { + if nodeID < 0 { + nodeID = 0 + } + topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ + Node: uint64(nodeID), + ResourceValue: perNUMAAllocated, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } else { topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ - Node: uint64(nodeID), ResourceValue: perNUMAAllocated, Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), @@ -360,14 +363,36 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca aggregatedAllocatableQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) aggregatedCapacityQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) gpuMemoryAllocatablePerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) - if len(deviceInfo.NumaNodes) > 1 { + if len(deviceInfo.NumaNodes) > 0 { gpuMemoryAllocatablePerGPUNUMA = gpuMemoryAllocatablePerGPUNUMA / float64(len(deviceInfo.NumaNodes)) - } - for _, numaID := range deviceInfo.NumaNodes { + for _, numaID := range deviceInfo.NumaNodes { + if numaID < 0 { + numaID = 0 + } + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + 
topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } else { + // if deviceInfo.NumaNodes is empty, then it means the device is not NUMA aware topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ ResourceValue: gpuMemoryAllocatablePerGPUNUMA, Name: deviceID, - Node: uint64(numaID), Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ consts.ResourceAnnotationKeyResourceIdentifier: "", @@ -376,7 +401,6 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ ResourceValue: gpuMemoryAllocatablePerGPUNUMA, Name: deviceID, - Node: uint64(numaID), Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ consts.ResourceAnnotationKeyResourceIdentifier: "", @@ -401,8 +425,8 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca func (p *GPUMemPlugin) Allocate( resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, ) (*pluginapi.ResourceAllocationResponse, error) { - _, exists := resourceReq.Annotations[p.ResourceName()] - if !exists { + quantity, exists := resourceReq.ResourceRequests[p.ResourceName()] + if !exists || quantity == 0 { general.InfoS("No GPU memory annotation detected and no GPU memory requested, returning empty response", "podNamespace", resourceReq.PodNamespace, "podName", resourceReq.PodName, diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index 445d50c19c..bb4b99de4c 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -85,6 +85,8 @@ func (r *DeviceTopologyRegistry) SetDeviceTopology(deviceName string, deviceTopo if ok { topologyAffinityProvider.SetDeviceAffinity(deviceTopology) general.Infof("set device affinity provider for device %s, %v", deviceName, deviceTopology) + } else { + general.Infof("no device affinity provider found for device %s", deviceName) } return topologyProvider.SetDeviceTopology(deviceTopology) From 1ade96f2fc9f275f3d26457b5e3a163f5028a6dc Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 5 Nov 2025 11:16:00 +0800 Subject: [PATCH 33/52] feat(gpu): add canonical strategy implementation and refactor gpu memory strategy implement canonical strategy with separate filter and bind logic simplify gpu memory strategy by removing numa affinity check register canonical strategy as default allocation strategy --- .../gpu/strategy/allocate/manager/defaults.go | 3 +- .../allocate/strategies/canonical/bind.go | 80 ++++++++++++++++++ .../strategies/canonical/canonical.go | 81 ------------------- .../allocate/strategies/canonical/filter.go | 42 ++++++++++ .../allocate/strategies/gpu_memory/filter.go | 10 +-- 5 files changed, 125 insertions(+), 91 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go index 29b59f1e22..5202c9b862 100644 --- 
a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go @@ -54,7 +54,8 @@ func registerDefaultBindingStrategies(manager *StrategyManager) { // registerDefaultAllocationStrategies register allocation strategies func registerDefaultAllocationStrategies(manager *StrategyManager) { - if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, []string{gpu_memory.StrategyNameGPUMemory}, + if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, + []string{canonical.StrategyNameCanonical, gpu_memory.StrategyNameGPUMemory}, gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical); err != nil { general.Errorf("Failed to register gpu-memory-default strategy: %v", err) } diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go new file mode 100644 index 0000000000..fb3c8c65e7 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go @@ -0,0 +1,80 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package canonical + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// Bind binds the sorted GPU devices to the allocation context +// It creates allocation info for the selected devices +func (s *CanonicalStrategy) Bind( + ctx *allocate.AllocationContext, sortedDevices []string, +) (*allocate.AllocationResult, error) { + valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) + if !valid { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: errMsg, + }, fmt.Errorf(errMsg) + } + + devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) + allocatedDevices := sets.NewString() + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + if devicesToAllocate == allocatedDevices.Len() { + return true + } + } + return false + } + + // First try to bind reusable devices + if allocateDevices(ctx.DeviceReq.ReusableDevices...) { + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + // Then try to bind devices from sorted list + if allocateDevices(sortedDevices...) 
{ + general.InfoS("Successfully bound devices", + "podNamespace", ctx.ResourceReq.PodNamespace, + "podName", ctx.ResourceReq.PodName, + "containerName", ctx.ResourceReq.ContainerName, + "allocatedDevices", allocatedDevices.List()) + + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), + }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go index fecb92a717..a63094487c 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go @@ -17,14 +17,7 @@ limitations under the License. package canonical import ( - "fmt" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/sets" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" - "github.com/kubewharf/katalyst-core/pkg/util/general" ) const ( @@ -48,77 +41,3 @@ var ( func (s *CanonicalStrategy) Name() string { return StrategyNameCanonical } - -// Bind binds the sorted GPU devices to the allocation context -// It creates allocation info for the selected devices -func (s *CanonicalStrategy) Bind( - ctx *allocate.AllocationContext, sortedDevices []string, -) (*allocate.AllocationResult, error) { - valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) - if !valid { - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: errMsg, - }, fmt.Errorf(errMsg) - } - - devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) - allocatedDevices := sets.NewString() - allocateDevices := func(devices ...string) bool { - for _, device := range devices { - allocatedDevices.Insert(device) - if devicesToAllocate == allocatedDevices.Len() { - return true - } - } - return false - } - - // First try to bind reusable devices - if allocateDevices(ctx.DeviceReq.ReusableDevices...) { - return &allocate.AllocationResult{ - AllocatedDevices: allocatedDevices.UnsortedList(), - Success: true, - }, nil - } - - // Then try to bind devices from sorted list - if allocateDevices(sortedDevices...) { - general.InfoS("Successfully bound devices", - "podNamespace", ctx.ResourceReq.PodNamespace, - "podName", ctx.ResourceReq.PodName, - "containerName", ctx.ResourceReq.ContainerName, - "allocatedDevices", allocatedDevices.List()) - - return &allocate.AllocationResult{ - AllocatedDevices: allocatedDevices.UnsortedList(), - Success: true, - }, nil - } - - return &allocate.AllocationResult{ - Success: false, - ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), - }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) -} - -// Filter filters the available devices based on whether they are already occupied. -// The assumption is that each device can only be allocated to one container at most. -// It only returns devices that are not occupied yet. 
-func (s *CanonicalStrategy) Filter(
-	ctx *allocate.AllocationContext, allAvailableDevices []string,
-) ([]string, error) {
-	machineState, ok := ctx.MachineState[v1.ResourceName(ctx.DeviceReq.DeviceName)]
-	if !ok {
-		return nil, fmt.Errorf("machine state for %s is not available", ctx.DeviceReq.DeviceName)
-	}
-
-	filteredDevices := sets.NewString()
-	for _, device := range allAvailableDevices {
-		if machineState.IsRequestSatisfied(device, 1, 1) {
-			filteredDevices.Insert(device)
-		}
-	}
-
-	return filteredDevices.UnsortedList(), nil
-}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go
new file mode 100644
index 0000000000..7782527ae5
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go
@@ -0,0 +1,42 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package canonical
+
+import (
+	"k8s.io/apimachinery/pkg/util/sets"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
+	gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
+)
+
+// Filter filters the available devices by NUMA affinity: when hint nodes are set on the
+// allocation context, it only returns devices whose NUMA affinity matches those hint nodes;
+// when no hint nodes are set, every available device passes through unfiltered.
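+// For illustration: with HintNodes {0}, only devices whose NUMA affinity in ctx.DeviceTopology includes
+// node 0 survive the filter; with an empty HintNodes set, all of allAvailableDevices are returned unchanged.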
+func (s *CanonicalStrategy) Filter( + ctx *allocate.AllocationContext, allAvailableDevices []string, +) ([]string, error) { + filteredDevices := sets.NewString() + for _, device := range allAvailableDevices { + if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.DeviceTopology, ctx.HintNodes) { + continue + } + + filteredDevices.Insert(device) + } + + return filteredDevices.UnsortedList(), nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go index 08cc5fd62b..56c65fcd71 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter.go @@ -23,7 +23,6 @@ import ( "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" - gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" "github.com/kubewharf/katalyst-core/pkg/util/general" ) @@ -63,16 +62,9 @@ func (s *GPUMemoryStrategy) filterGPUDevices( gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()) - machineState, ok := ctx.MachineState[consts.ResourceGPUMemory] - if !ok { - return nil, fmt.Errorf("machine state for %s is not available", consts.ResourceGPUMemory) - } + machineState := ctx.MachineState[consts.ResourceGPUMemory] filteredDevices := sets.NewString() for _, device := range allAvailableDevices { - if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.DeviceTopology, ctx.HintNodes) { - continue - } - if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, gpuMemoryAllocatablePerGPU) { general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", device, gpuMemoryAllocatablePerGPU, machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) From 2514d8592e0f983d8cb4570e0dbafdc89a0b4e99 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Wed, 5 Nov 2025 11:16:50 +0800 Subject: [PATCH 34/52] refactor(gpu/strategy): optimize device affinity allocation algorithm - Simplify affinity group structure by using sets.String for unallocated devices - Implement more efficient allocation strategy with priority-based processing - Remove unused types and consolidate allocation logic - Add helper methods for group evaluation and sorting - Improve error handling and early termination conditions --- .../strategies/deviceaffinity/bind.go | 454 +++++++++--------- 1 file changed, 227 insertions(+), 227 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index df773ffab3..d71b992235 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -22,7 +22,6 @@ import ( "github.com/google/uuid" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/klog/v2" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" @@ -34,21 +33,7 @@ import ( // It is uniquely identified by an id. 
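 //
 // For example (hypothetical IDs, for illustration only): an NVLink-connected
 // clique {gpu-0, gpu-1, gpu-2, gpu-3} at one priority level forms a single
 // affinityGroup, and unallocatedDevices tracks whichever of those four
 // devices are still free.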
type affinityGroup struct { id string - unallocatedDevices []string -} - -// possibleAllocation refers to information about a certain affinity group, which includes the number of unallocated devices in the group, -// and candidateDevices refer to a set of devices that we can potentially allocate in an affinity group. -type possibleAllocation struct { - unallocatedSize int - candidateDevices []string -} - -// allocationByIntersectionResult is the result of allocating devices by maximizing intersection size of possible allocations -// with an affinity group. -type allocationByIntersectionResult struct { - finished bool - err error + unallocatedDevices sets.String } // Bind binds the sorted devices to the allocation context by searching for the devices that have affinity to each other. @@ -74,10 +59,13 @@ func (s *DeviceAffinityStrategy) Bind( // Get a map of affinity groups that is grouped by priority affinityMap := ctx.DeviceTopology.GroupDeviceAffinity() - affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) - allocatedDevices, err := s.allocateCandidateDevices(reusableDevicesSet, unallocatedDevicesSet, devicesToAllocate, - affinityGroupByPriority, sets.NewString(), false, false) + // Get affinity groups organized by priority level + affinityGroupsMap := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) + + // Allocate reusable devices first + allocatedDevices, err := s.allocateCandidateDevices(affinityGroupsMap, + reusableDevicesSet.Intersection(unallocatedDevicesSet), devicesToAllocate, sets.NewString()) if err != nil { return &allocate.AllocationResult{ Success: false, @@ -92,6 +80,25 @@ func (s *DeviceAffinityStrategy) Bind( }, nil } + // Next, allocate left available devices + availableDevices := unallocatedDevicesSet.Difference(allocatedDevices) + allocatedDevices, err = s.allocateCandidateDevices(affinityGroupsMap, + availableDevices, devicesToAllocate, allocatedDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to allocate available devices with affinity: %v", err), + }, fmt.Errorf("failed to allocate available devices with affinity: %v", err) + } + + // Return result once we have allocated all the devices + if len(allocatedDevices) == devicesToAllocate { + return &allocate.AllocationResult{ + Success: true, + AllocatedDevices: allocatedDevices.UnsortedList(), + }, nil + } + return &allocate.AllocationResult{ Success: false, ErrorMessage: fmt.Sprintf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)), @@ -118,10 +125,10 @@ func (s *DeviceAffinityStrategy) getAffinityGroups( // Calculate the number of unallocated devices for each affinity group for _, devices := range affinityDevices { - unallocatedDevices := make(machine.DeviceIDs, 0) + unallocatedDevices := sets.NewString() for _, device := range devices { if unallocatedDevicesSet.Has(device) { - unallocatedDevices = append(unallocatedDevices, device) + unallocatedDevices.Insert(device) } } affinityGroups = append(affinityGroups, affinityGroup{ @@ -133,256 +140,249 @@ func (s *DeviceAffinityStrategy) getAffinityGroups( return affinityGroups } -func (s *DeviceAffinityStrategy) getAffinityGroupById(affinityGroupByPriority map[machine.AffinityPriority][]affinityGroup) map[string]affinityGroup { - idToAffinityGroupMap := make(map[string]affinityGroup) - for _, groups := range affinityGroupByPriority { - for _, group := range groups { - idToAffinityGroupMap[group.id] = 
group - } - } - return idToAffinityGroupMap -} - -// allocateCandidateDevices finds the best allocation given some candidate devices by finding those devices with the best affinity to each other. +// allocateCandidateDevices optimally allocates GPU devices based on affinity priorities. +// This method implements a sophisticated allocation strategy that: +// 1. Prioritizes device groups with higher affinity levels +// 2. Minimizes fragmentation by selecting devices with strong mutual affinity +// 3. Balances between fulfilling exact requirements and maintaining optimal groupings +// +// Parameters: +// - affinityGroupsMap: Mapping of affinity priorities to device groups with those priorities +// - candidateDevicesSet: Set of available devices that can be allocated +// - devicesToAllocate: Total number of devices that need to be allocated +// - allocatedDevices: Set of devices that have already been allocated in previous iterations +// +// Returns: +// - sets.String: The complete set of allocated devices after this allocation round +// - error: Any error encountered during the allocation process func (s *DeviceAffinityStrategy) allocateCandidateDevices( - reusableDevicesSet, totalAvailableDevicesSet sets.String, devicesToAllocate int, - defaultAffinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup, - allocatedDevices sets.String, isFindAvailableDeviceAffinity, hasAllocatedReusable bool, + affinityGroupsMap map[machine.AffinityPriority][]affinityGroup, + candidateDevicesSet sets.String, + devicesToAllocate int, + allocatedDevices sets.String, ) (sets.String, error) { - // Retrieve all unallocated devices by getting the intersection of reusable devices and unallocated devices. - // Devices that are already allocated should be excluded. - var candidateDevicesSet sets.String - - if !hasAllocatedReusable { - candidateDevicesSet = reusableDevicesSet - } else { - candidateDevicesSet = totalAvailableDevicesSet - } - - availableDevicesSet := candidateDevicesSet - // Only subtract out the available devices with the allocated devices if we are trying to allocate from scratch - if !isFindAvailableDeviceAffinity { - availableDevicesSet = availableDevicesSet.Difference(allocatedDevices) + // Early termination conditions + if len(allocatedDevices) == devicesToAllocate || len(candidateDevicesSet) == 0 { + return allocatedDevices, nil } - // If the available reusable devices is less than or equal to request, we need to allocate all of them - remainingQuantity := devicesToAllocate - len(allocatedDevices) - if availableDevicesSet.Len() <= remainingQuantity { - allocatedDevices = availableDevicesSet + // Calculate remaining devices needed + remainingDevicesToAllocate := devicesToAllocate - len(allocatedDevices) - // If we are now allocating reusable devices, make another recursive call to allocate available devices - if !hasAllocatedReusable { - return s.allocateCandidateDevices(reusableDevicesSet, totalAvailableDevicesSet, devicesToAllocate, - defaultAffinityGroupsByPriority, allocatedDevices, !isFindAvailableDeviceAffinity, true) - } + // Fast path: If we need all remaining candidates, allocate them all + if remainingDevicesToAllocate >= len(candidateDevicesSet) { + allocatedDevices = allocatedDevices.Union(candidateDevicesSet) return allocatedDevices, nil } - var affinityGroupsByPriority map[machine.AffinityPriority][]affinityGroup - if isFindAvailableDeviceAffinity { - affinityGroupsByPriority = s.findAllAllocatedAffinityGroupsByPriority(allocatedDevices.UnsortedList(), 
defaultAffinityGroupsByPriority) - } else { - affinityGroupsByPriority = defaultAffinityGroupsByPriority - } - - for { - // Otherwise, we need to allocate these devices by their affinity - for priority := 0; priority < len(affinityGroupsByPriority); priority++ { - groups, ok := affinityGroupsByPriority[machine.AffinityPriority(priority)] - if !ok { - return nil, fmt.Errorf("affinity priority %v not found", priority) - } + // Process affinity groups from highest to lowest priority + for priority := 0; priority < len(affinityGroupsMap); priority++ { + affinityPriority := machine.AffinityPriority(priority) + affinityGroups, exists := affinityGroupsMap[affinityPriority] + if !exists || len(affinityGroups) == 0 { + continue + } - candidateDeviceSizeToPossibleAllocationsMap := s.getCandidateDeviceSizeToPossibleAllocations(groups, availableDevicesSet, allocatedDevices) + // Prepare group information for evaluation + groupInfos := s.prepareGroupInfos(affinityGroups, candidateDevicesSet, allocatedDevices) + if len(groupInfos) == 0 { + continue + } - allocateByIntersectionRes := s.allocateByIntersection(candidateDeviceSizeToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityGroupsByPriority)-1) - if allocateByIntersectionRes.err != nil { - return nil, allocateByIntersectionRes.err - } + // Sort groups by allocation suitability + s.sortGroupsByPriority(groupInfos, remainingDevicesToAllocate) - if allocateByIntersectionRes.finished { - return allocatedDevices, nil - } + // Try to allocate from the best matching groups + if result, fullyAllocated := s.tryAllocateFromGroups( + groupInfos, remainingDevicesToAllocate, allocatedDevices, devicesToAllocate, + ); fullyAllocated { + return result, nil } - // isFindAvailableDeviceAffinity is true if we are trying to find available devices that have affinity with the already allocated reusable device - if isFindAvailableDeviceAffinity || allocatedDevices.Len() >= devicesToAllocate { - break + // For the lowest priority, use more flexible allocation strategies + if priority == len(affinityGroupsMap)-1 { + return s.handleLowestPriorityAllocation( + groupInfos, affinityGroupsMap, candidateDevicesSet, + devicesToAllocate, allocatedDevices, remainingDevicesToAllocate, + ) } } - return s.allocateCandidateDevices(reusableDevicesSet, totalAvailableDevicesSet, devicesToAllocate, - defaultAffinityGroupsByPriority, allocatedDevices, !isFindAvailableDeviceAffinity, true) + return allocatedDevices, nil } -// mergePossibleAllocationsAndSort merges the possible allocations by their unallocated size and sorts them in ascending order of their unallocated size. -func (s *DeviceAffinityStrategy) mergePossibleAllocationsAndSort(possibleAllocations []possibleAllocation) []possibleAllocation { - merged := make(map[int][]string) - for _, alloc := range possibleAllocations { - if _, ok := merged[alloc.unallocatedSize]; !ok { - merged[alloc.unallocatedSize] = make([]string, 0) +// prepareGroupInfos processes affinity groups and extracts relevant allocation information. +// This helper method filters out groups with no candidate devices and calculates +// the intersection between group devices and available candidates. 
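+//
+// A worked example (hypothetical IDs): for a group whose unallocatedDevices is
+// {gpu-0, gpu-1, gpu-2}, with candidateDevicesSet={gpu-0, gpu-1} and
+// allocatedDevices={gpu-2}, the resulting groupInfo carries
+// candidates={gpu-0, gpu-1}, allocated={gpu-2}, and unallocated={gpu-0, gpu-1}.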
+func (s *DeviceAffinityStrategy) prepareGroupInfos( + affinityGroups []affinityGroup, + candidateDevicesSet sets.String, + allocatedDevices sets.String, +) []groupInfo { + groupInfos := make([]groupInfo, 0, len(affinityGroups)) + + for _, group := range affinityGroups { + // Find devices in this group that are also candidates + candidates := group.unallocatedDevices.Intersection(candidateDevicesSet) + if candidates.Len() == 0 { + continue // Skip groups with no matching candidates } - merged[alloc.unallocatedSize] = append(merged[alloc.unallocatedSize], alloc.candidateDevices...) - } - mergedAllocations := make([]possibleAllocation, 0, len(merged)) - for unallocatedSize, intersected := range merged { - mergedAllocations = append(mergedAllocations, possibleAllocation{ - unallocatedSize: unallocatedSize, - candidateDevices: intersected, - }) - } - - // Sort possible allocations by their unallocated size in ascending order - // To support bin-packing, we prioritize allocation of devices in groups that have other allocated devices. - sort.Slice(mergedAllocations, func(i, j int) bool { - return mergedAllocations[i].unallocatedSize < mergedAllocations[j].unallocatedSize - }) + // Calculate unallocated and allocated device sets for this group + unallocated := group.unallocatedDevices.Difference(allocatedDevices) + if unallocated.Len() == 0 { + continue // Skip groups where all devices are already allocated + } - return mergedAllocations -} + allocated := group.unallocatedDevices.Intersection(allocatedDevices) -// getCandidateDeviceSizeToPossibleAllocations calculates the candidate devices given an affinity group and available devices by -// taking the intersection of the two groups. Then, it returns a map of size of candidate devices to an array of possible allocations, -// where one possible allocation contains the number of unallocated devices in an affinity group, and the candidate devices calculated earlier. 
-func (s *DeviceAffinityStrategy) getCandidateDeviceSizeToPossibleAllocations( - groups []affinityGroup, availableDevicesSet, allocatedDevices sets.String, -) map[int][]possibleAllocation { - candidateDeviceSizeToPossibleAllocationsMap := make(map[int][]possibleAllocation) - for _, group := range groups { - // Find intersection of affinity group and the available reusable devices, these will be the set of allocatable candidate devices in a group - candidateDevices := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet) - // Ignore groups with no intersection as there is no affinity to the group - if len(candidateDevices) == 0 { - continue - } - if _, ok := candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)]; !ok { - candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)] = make([]possibleAllocation, 0) - } - candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)] = append(candidateDeviceSizeToPossibleAllocationsMap[len(candidateDevices)], possibleAllocation{ - // The number of unallocated devices in the group is retrieved by taking a difference between - // the unallocated devices in the group and the already allocated devices - unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(), - candidateDevices: candidateDevices, + groupInfos = append(groupInfos, groupInfo{ + group: group, + candidates: candidates, + allocated: allocated, + unallocated: unallocated, }) } - return candidateDeviceSizeToPossibleAllocationsMap + return groupInfos } -func getDeviceIntersection(unallocatedDevices []string, availableDevices sets.String) []string { - deviceIntersection := make([]string, 0) - for _, device := range unallocatedDevices { - if availableDevices.Has(device) { - deviceIntersection = append(deviceIntersection, device) +// sortGroupsByPriority sorts affinity groups based on allocation suitability. +// The sorting criteria are: +// 1. Proximity to the exact number of devices needed (closer is better) +// 2. Total unallocated devices (smaller is better to minimize fragmentation) +// 3. Already allocated devices (larger is better to maintain consistency) +func (s *DeviceAffinityStrategy) sortGroupsByPriority( + groupInfos []groupInfo, + remainingDevicesToAllocate int, +) { + sort.Slice(groupInfos, func(i, j int) bool { + // Calculate absolute difference from needed devices + diffI := abs(groupInfos[i].candidates.Len() - remainingDevicesToAllocate) + diffJ := abs(groupInfos[j].candidates.Len() - remainingDevicesToAllocate) + + // Prefer groups closer to the exact number needed + if diffI != diffJ { + return diffI < diffJ } - } - return deviceIntersection -} - -// allocateByIntersection allocates devices by the following algorithm -// 1. Sort the candidate devices size of possible allocations in descending order, as we want to prioritize devices with larger candidate devices size with an affinity group. -// 2. For each candidate devices size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize -// bin-packing (try to fill up an affinity group that is already allocated with other devices). -// 3. For each candidate device size, allocate devices in the order of the sorted possible allocations. -// 4. If a possible allocation has a number of candidate devices larger than the devices needed for allocation, we go to the next priority and try to find an allocation from there. -// 5. 
If we are currently at the last affinity priority level, we go through the other possible allocations (that are in sorted ascending order of number of unallocated devices) -// to fill up the remaining devices. -func (s *DeviceAffinityStrategy) allocateByIntersection( - candidateDevicesSizeToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String, - devicesToAllocate int, isLastPriority bool, -) allocationByIntersectionResult { - // Sort the intersection sizes of possible allocations in descending order because we want to process the larger intersections first. - // A larger intersection means that we are able to find more devices that have an affinity with an affinity group. - candidateDevicesSizes := make([]int, 0, len(candidateDevicesSizeToPossibleAllocationsMap)) - for candidateSize := range candidateDevicesSizeToPossibleAllocationsMap { - candidateDevicesSizes = append(candidateDevicesSizes, candidateSize) - } - sort.Slice(candidateDevicesSizes, func(i, j int) bool { - return candidateDevicesSizes[i] > candidateDevicesSizes[j] - }) + // Prefer groups with fewer unallocated devices to reduce fragmentation + if groupInfos[i].unallocated.Len() != groupInfos[j].unallocated.Len() { + return groupInfos[i].unallocated.Len() < groupInfos[j].unallocated.Len() + } - // If there is an intersection size that is larger than or equal to the number of devices needed for allocation, - // find the smallest intersection size. This is so that we try to reduce fragmentation as much as possible. - // For example, if we have 1 device to allocate, and we have candidateDevicesSizes of 2 and 1, we want to allocate to the group with - // intersection of size 1, as this means that we are able to successfully do bin-packing (fill up an affinity group that - // that is already allocated with other devices) - // Find the starting index: first size <= devicesToAllocate, or last one if none - start := len(candidateDevicesSizes) - 1 - for i, size := range candidateDevicesSizes { - if size <= devicesToAllocate { - start = i - break + // Prefer groups with more already allocated devices for consistency + if groupInfos[i].allocated.Len() != groupInfos[j].allocated.Len() { + return groupInfos[i].allocated.Len() > groupInfos[j].allocated.Len() } - } - klog.Infof("intersection to possible allocations map: %v", candidateDevicesSizeToPossibleAllocationsMap) + return groupInfos[i].group.id < groupInfos[j].group.id + }) +} - for i := start; i < len(candidateDevicesSizes); i++ { - intersectionSize := candidateDevicesSizes[i] - possibleAllocations, ok := candidateDevicesSizeToPossibleAllocationsMap[intersectionSize] - if !ok { - return allocationByIntersectionResult{ - finished: false, - err: fmt.Errorf("possible reusable devices of intersection size %v not found", intersectionSize), +// tryAllocateFromGroups attempts to allocate devices from the prioritized groups. +// It first tries to find an exact match, then falls back to partial allocations. 
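+// Note that the partial-allocation fallback itself lives in
+// handleLowestPriorityAllocation: this function only commits a group whose
+// candidates can cover the entire remaining request, and returns false
+// otherwise so that the caller can fall through to the flexible path.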
+func (s *DeviceAffinityStrategy) tryAllocateFromGroups( + groupInfos []groupInfo, + remainingDevicesToAllocate int, + allocatedDevices sets.String, + devicesToAllocate int, +) (sets.String, bool) { + // Try to find groups that can exactly satisfy the remaining requirement + for _, group := range groupInfos { + // Check if this group can satisfy the exact remaining requirement and + // ensure affinity allocation if there are already allocated devices + if remainingDevicesToAllocate <= group.candidates.Len() && + !(allocatedDevices.Len() > 0 && group.allocated.Len() <= 0) { + + // Add all candidate devices from this group + for _, device := range group.candidates.List() { + allocatedDevices.Insert(device) + if len(allocatedDevices) == devicesToAllocate { + return allocatedDevices, true // Fully allocated + } } + return allocatedDevices, true } + } - mergedPossibleAllocations := s.mergePossibleAllocationsAndSort(possibleAllocations) + return allocatedDevices, false // Not fully allocated +} - klog.Infof("possible allocations: %v", mergedPossibleAllocations) +// handleLowestPriorityAllocation implements flexible allocation strategies for the lowest priority. +// This method is more permissive in its allocation strategy to ensure device requirements are met. +func (s *DeviceAffinityStrategy) handleLowestPriorityAllocation( + groupInfos []groupInfo, + affinityGroupsMap map[machine.AffinityPriority][]affinityGroup, + candidateDevicesSet sets.String, + devicesToAllocate int, + allocatedDevices sets.String, + remainingDevicesToAllocate int, +) (sets.String, error) { + // First try to allocate entire groups that fit within the remaining requirement and + // ensure affinity allocation if there are already allocated devices + for _, group := range groupInfos { + if remainingDevicesToAllocate >= group.candidates.Len() && + !(allocatedDevices.Len() > 0 && group.allocated.Len() <= 0) { + + // Allocate all devices from this group + allocatedDevices = allocatedDevices.Union(group.candidates) + + // Recursively allocate the remaining devices + return s.allocateCandidateDevices( + affinityGroupsMap, + candidateDevicesSet.Difference(group.candidates), + devicesToAllocate, + allocatedDevices, + ) + } + } - for _, possibleAlloc := range mergedPossibleAllocations { - // If devices of possible allocation size is larger than the devices needed, and it is not the last priority level, - // go to the next priority and try to allocate - remainingToAllocate := devicesToAllocate - allocatedDevices.Len() - if !isLastPriority && len(possibleAlloc.candidateDevices) > remainingToAllocate { - return allocationByIntersectionResult{ - finished: false, - err: nil, - } + // If no exact matches, try partial allocations from larger groups + for _, group := range groupInfos { + // Check if this group can contribute to the remaining requirement + if remainingDevicesToAllocate >= group.candidates.Len() { + // Allocate all devices from this group and continue + allocatedDevices = allocatedDevices.Union(group.candidates) + + return s.allocateCandidateDevices( + affinityGroupsMap, + candidateDevicesSet.Difference(group.candidates), + devicesToAllocate, + allocatedDevices, + ) + } else { + // Recursively allocate a subset of devices from this group + devices, err := s.allocateCandidateDevices( + affinityGroupsMap, + group.candidates, + remainingDevicesToAllocate, + group.allocated, + ) + if err != nil { + return nil, err } - for _, device := range possibleAlloc.candidateDevices { - allocatedDevices.Insert(device) - if 
allocatedDevices.Len() == devicesToAllocate { - return allocationByIntersectionResult{ - finished: true, - err: nil, - } - } - } + return allocatedDevices.Union(devices), nil } } - return allocationByIntersectionResult{ - finished: false, - err: nil, - } + return allocatedDevices, nil } -func (s *DeviceAffinityStrategy) findAllAllocatedAffinityGroupsByPriority( - allocatedDevices []string, affinityMap map[machine.AffinityPriority][]affinityGroup, -) map[machine.AffinityPriority][]affinityGroup { - affinityGroupsByPriority := make(map[machine.AffinityPriority][]affinityGroup) - seenIds := sets.NewString() - for _, device := range allocatedDevices { - for priority, groups := range affinityMap { - for _, group := range groups { - if sets.NewString(group.unallocatedDevices...).Has(device) { - if seenIds.Has(group.id) { - continue - } - - if _, ok := affinityGroupsByPriority[priority]; !ok { - affinityGroupsByPriority[priority] = make([]affinityGroup, 0) - } - affinityGroupsByPriority[priority] = append(affinityGroupsByPriority[priority], group) - seenIds.Insert(group.id) - } - } - } +// abs returns the absolute value of an integer. +func abs(x int) int { + if x < 0 { + return -x } - return affinityGroupsByPriority + return x +} + +// groupInfo contains pre-calculated information about an affinity group +// to optimize the allocation process by avoiding repeated calculations. +type groupInfo struct { + group affinityGroup + candidates sets.String // Devices in this group that are also candidates + allocated sets.String // Devices in this group that are already allocated + unallocated sets.String // Devices in this group that are not yet allocated } From bb637622d6989c4f8a4c1c0cd235cb7ccff9c584 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Wed, 5 Nov 2025 15:01:51 +0800 Subject: [PATCH 35/52] refactor: simplify the grouping of device affinity --- .../strategies/deviceaffinity/bind_test.go | 2 +- pkg/util/machine/device.go | 103 ++++-------------- 2 files changed, 23 insertions(+), 82 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go index 1c04fe97ea..2dcec69535 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go @@ -1129,7 +1129,7 @@ func TestBind_NumberOfDevicesAllocated(t *testing.T) { }, }, }, - sortedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"}, // Allocate gpu-1 and gpu-5 first because they are reusable devices // Allocate gpu-2 next because they have affinity with gpu-1 at the highest affinity priority (level 0) // Allocate gpu-6 and gpu-8 next because they have affinity with gpu-5 at the next highest affinity priority (level 1) diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go index bb4b99de4c..974118a3d2 100644 --- a/pkg/util/machine/device.go +++ b/pkg/util/machine/device.go @@ -19,7 +19,6 @@ package machine import ( "fmt" "sort" - "strings" "sync" "k8s.io/apimachinery/pkg/util/sets" @@ -149,98 +148,40 @@ type DeviceTopology struct { Devices map[string]DeviceInfo } -// GroupDeviceAffinity forms a topology graph such that all groups of DeviceIDs are within a certain affinity priority level -// It 
preserves sub-group boundaries and eliminates duplicates
-// E.g. if priority 0 has groups [1, 3, 5, 6] and [0, 2, 4, 7], priority 1 will be either [1, 3, 5, 6, 0, 2, 4, 7] or [0, 2, 4, 7, 1, 3, 5, 6] and not any other permutation
-// This is to ensure that the higher priority affinity groups keep its permutation when it is in lower priority affinity group.
+// GroupDeviceAffinity forms a topology map in which all groups of DeviceIDs that share affinity at a certain priority level are collected under that priority, with duplicate groups removed
 func (t *DeviceTopology) GroupDeviceAffinity() map[AffinityPriority][]DeviceIDs {
 	deviceAffinityGroup := make(map[AffinityPriority][]DeviceIDs)
-
-	// Collect unique groups per priority
-	uniqueGroups := make(map[AffinityPriority]map[string]DeviceIDs)
-
-	for id, deviceInfo := range t.Devices {
-		for priority, group := range deviceInfo.DeviceAffinity {
-			// Ensure the device itself is included
-			if !slices.Contains(group, id) {
-				group = append(group, id)
+	for deviceId, deviceInfo := range t.Devices {
+		for priority, affinityDeviceIDs := range deviceInfo.DeviceAffinity {
+			// Add the device itself to the group if it is not already included
+			if !slices.Contains(affinityDeviceIDs, deviceId) {
+				affinityDeviceIDs = append(affinityDeviceIDs, deviceId)
 			}
-
-			// Sort for consistent deduplication key
-			sortedGroup := make([]string, len(group))
-			copy(sortedGroup, group)
-			sort.Strings(sortedGroup)
-
-			key := strings.Join(sortedGroup, ",")
-			if _, ok := uniqueGroups[priority]; !ok {
-				uniqueGroups[priority] = make(map[string]DeviceIDs)
+			// Sort the strings for easier deduplication
+			sort.Strings(affinityDeviceIDs)
+			if _, ok := deviceAffinityGroup[priority]; !ok {
+				deviceAffinityGroup[priority] = make([]DeviceIDs, 0)
 			}
-			uniqueGroups[priority][key] = group
-		}
-	}
-
-	// Iterate priorities in order
-	for priority := 0; ; priority++ {
-		groupsMap, ok := uniqueGroups[AffinityPriority(priority)]
-		if !ok || len(groupsMap) == 0 {
-			break // no more groups at this priority
-		}
-		// Build lower-group map for merging (priority > 0)
-		lowerGroupMap := make(map[string]int)
-		if priority > 0 {
-			for idx, g := range deviceAffinityGroup[AffinityPriority(priority-1)] {
-				for _, d := range g {
-					lowerGroupMap[d] = idx
-				}
+			// Add the affinityDeviceIDs to the priority level if it is not already there
+			if !containsGroup(deviceAffinityGroup[priority], affinityDeviceIDs) {
+				deviceAffinityGroup[priority] = append(deviceAffinityGroup[priority], affinityDeviceIDs)
 			}
-		}
-		for _, group := range groupsMap {
-			if priority > 0 {
-				// Merge according to lower-priority group boundaries
-				lowerGroups := make(map[int][]string)
-				for _, d := range group {
-					if idx, ok := lowerGroupMap[d]; ok {
-						lowerGroups[idx] = append(lowerGroups[idx], d)
-					} else {
-						lowerGroups[-1] = append(lowerGroups[-1], d)
-					}
-				}
-
-				merged := []string{}
-				for idx := 0; idx < len(deviceAffinityGroup[AffinityPriority(priority-1)]); idx++ {
-					if devs, ok := lowerGroups[idx]; ok {
-						merged = append(merged, devs...)
-					}
-				}
-				if devs, ok := lowerGroups[-1]; ok {
-					merged = append(merged, devs...)
- } - group = merged - } - - // Deduplicate final groups - key := strings.Join(group, ",") - if _, ok := deviceAffinityGroup[AffinityPriority(priority)]; !ok { - deviceAffinityGroup[AffinityPriority(priority)] = []DeviceIDs{} - } - alreadyExists := false - for _, g := range deviceAffinityGroup[AffinityPriority(priority)] { - if strings.Join(g, ",") == key { - alreadyExists = true - break - } - } - if !alreadyExists { - deviceAffinityGroup[AffinityPriority(priority)] = append(deviceAffinityGroup[AffinityPriority(priority)], group) - } } } - return deviceAffinityGroup } +func containsGroup(groups []DeviceIDs, candidate DeviceIDs) bool { + for _, g := range groups { + if slices.Equal(g, candidate) { + return true + } + } + return false +} + func (t *DeviceTopology) GetDeviceAffinityMap(deviceId string) (map[AffinityPriority]DeviceIDs, error) { info, ok := t.Devices[deviceId] if !ok { From 197f959a2fb1d4b8d08ad5b24e3134c8df7d5651 Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Thu, 6 Nov 2025 11:29:58 +0800 Subject: [PATCH 36/52] fix: handling of nil device req --- .../qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go | 8 ++++++++ .../qrm-plugins/gpu/strategy/allocate/manager/helper.go | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 6fac568bca..753da338ea 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -434,6 +434,14 @@ func (p *GPUMemPlugin) Allocate( return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil } + if deviceReq == nil { + general.InfoS("Nil device request, returning empty response", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName) + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil + } + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go index 3720ce7ad8..fe78af5de7 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go @@ -41,7 +41,7 @@ func AllocateGPUUsingStrategy( qosLevel string, ) (*allocate.AllocationResult, error) { // Get hint nodes - hintNodes, err := machine.NewCPUSetUint64(resourceReq.GetHint().GetNodes()...) + hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) 
if err != nil { return &allocate.AllocationResult{ Success: false, From 698f27abfc71844f4f79c3a828ccf7598e17223a Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Mon, 10 Nov 2025 15:40:22 +0800 Subject: [PATCH 37/52] chore: add unit tests --- go.mod | 2 +- go.sum | 4 +- .../qrm-plugins/gpu/baseplugin/base_test.go | 175 +++++ .../custom_device_plugin_stub.go | 97 +++ .../gpu/customdeviceplugin/gpu/gpu_test.go | 329 +++++++++ .../gpu/customdeviceplugin/rdma/rdma.go | 26 +- .../gpu/customdeviceplugin/rdma/rdma_test.go | 562 +++++++++++++++ .../gpu/resourceplugin/gpumemory/gpu_mem.go | 3 +- .../resourceplugin/resource_plugin_stub.go | 81 +++ pkg/agent/qrm-plugins/gpu/state/state_test.go | 199 +++++ .../qrm-plugins/gpu/staticpolicy/policy.go | 25 +- .../gpu/staticpolicy/policy_test.go | 682 ++++++++++++++++++ .../strategies/canonical/bind_test.go | 143 ++++ .../strategies/canonical/filter_test.go | 85 +++ .../strategies/gpu_memory/filter_test.go | 220 ++++++ .../strategies/gpu_memory/sort_test.go | 178 +++++ .../gpu/strategy/allocate/strategies/util.go | 4 - pkg/agent/qrm-plugins/gpu/util/util.go | 2 +- pkg/util/machine/device.go | 35 +- pkg/util/machine/device_stub.go | 43 ++ pkg/util/machine/device_test.go | 448 ++++++++++++ 21 files changed, 3302 insertions(+), 41 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go create mode 100644 pkg/agent/qrm-plugins/gpu/state/state_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go create mode 100644 pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go create mode 100644 pkg/util/machine/device_stub.go create mode 100644 pkg/util/machine/device_test.go diff --git a/go.mod b/go.mod index 2ef5e3a055..74ac733272 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251029040321-5a97019a9b2c + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index 938e286d15..ece20c05f7 100644 --- a/go.sum +++ b/go.sum @@ -583,8 +583,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= -github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1 h1:GZ9+zAV1u5pP5xTqcshvNpr0qRIkl/Gu6/YFd2RFT9A= -github.com/luomingmeng/katalyst-api v0.0.0-20250922112104-28800baae7c1/go.mod 
h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= +github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b h1:CJnM7Ow3Wm+QzvUrrqmGlId8UXOWNGAKi09mTnU/QQs= +github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go new file mode 100644 index 0000000000..00f0453cb9 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go @@ -0,0 +1,175 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baseplugin + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func TestBasePlugin_PackAllocationResponse(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + req *pluginapi.ResourceRequest + allocationInfo *state.AllocationInfo + annotations map[string]string + resourceName string + expectedResp *pluginapi.ResourceAllocationResponse + expectedErr bool + }{ + { + name: "nil allocation info", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + }, + resourceName: "test-resource", + expectedErr: true, + }, + { + name: "nil request", + allocationInfo: &state.AllocationInfo{ + 
AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + }, + resourceName: "test-resource", + expectedErr: true, + }, + { + name: "basic case", + req: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + ResourceName: "test-resource", + }, + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + }, + annotations: map[string]string{ + "test-key": "test-value", + }, + resourceName: "test-resource", + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodUid: "test-uid", + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: "test-resource", + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + "test-resource": { + IsNodeResource: true, + IsScalarResource: true, + AllocatedQuantity: 4, + Annotations: map[string]string{ + "test-key": "test-value", + }, + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + basePlugin, err := NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + resp, err := basePlugin.PackAllocationResponse(tt.req, tt.allocationInfo, tt.annotations, tt.resourceName) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedResp, resp) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go new file mode 100644 index 0000000000..baab6a2dae --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go @@ -0,0 +1,97 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package customdeviceplugin + +import ( + "fmt" + + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" +) + +type CustomDevicePluginStub struct { + *baseplugin.BasePlugin +} + +func NewCustomDevicePluginStub(base *baseplugin.BasePlugin) CustomDevicePlugin { + return &CustomDevicePluginStub{ + BasePlugin: base, + } +} + +func (c CustomDevicePluginStub) DeviceNames() []string { + return []string{"custom-device-plugin-stub"} +} + +func (c CustomDevicePluginStub) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +func (c CustomDevicePluginStub) UpdateAllocatableAssociatedDevices(_ *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil +} + +func (c CustomDevicePluginStub) DefaultAccompanyResourceName() string { + return "resource-plugin-stub" +} + +func (c CustomDevicePluginStub) AllocateAssociatedDevice(resReq *pluginapi.ResourceRequest, _ *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + // Simply check if the accompany resource has been allocated + // If it has been allocated, allocate the associated device + accompanyResourceAllocation := c.State.GetAllocationInfo(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName) + if accompanyResourceAllocation != nil { + c.State.SetAllocationInfo(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName, &state.AllocationInfo{}, false) + return &pluginapi.AssociatedDeviceAllocationResponse{}, nil + } else { + return nil, fmt.Errorf("accompany resource %s has not been allocated", accompanyResourceName) + } +} + +type CustomDevicePluginStub2 struct { + *baseplugin.BasePlugin +} + +func NewCustomDevicePluginStub2(base *baseplugin.BasePlugin) CustomDevicePlugin { + return &CustomDevicePluginStub2{ + BasePlugin: base, + } +} + +func (c CustomDevicePluginStub2) DeviceNames() []string { + return []string{"custom-device-plugin-stub-2"} +} + +func (c CustomDevicePluginStub2) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +func (c CustomDevicePluginStub2) UpdateAllocatableAssociatedDevices(_ *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil +} + +func (c CustomDevicePluginStub2) DefaultAccompanyResourceName() string { + return "resource-plugin-stub" +} + +func (c CustomDevicePluginStub2) AllocateAssociatedDevice(resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + // Simply allocate the associated device + c.State.SetAllocationInfo(v1.ResourceName(deviceReq.DeviceName), resReq.PodUid, resReq.ContainerName, &state.AllocationInfo{}, false) + return &pluginapi.AssociatedDeviceAllocationResponse{}, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go new file mode 
100644 index 0000000000..a1f7246b2f --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go @@ -0,0 +1,329 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + conf.GPUDeviceNames = []string{"test-gpu"} + + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + basePlugin.State = stateImpl + + return basePlugin +} + +func TestGPUDevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewGPUDevicePlugin(basePlugin) + + // Update topology with associated devices + req := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "test-gpu", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-gpu-0", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 0, + }, + }, + }, + }, + 
{ + ID: "test-gpu-1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + }, + }, + } + + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify device topology is updated + gpuDevicePlugin := devicePlugin.(*GPUDevicePlugin) + deviceTopology, numaTopologyReady, err := gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + assert.NoError(t, err) + assert.True(t, numaTopologyReady) + assert.NotNil(t, deviceTopology) + + expectedDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + "test-gpu-1": { + NumaNodes: []int{1}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + }, + } + + assert.Equal(t, expectedDeviceTopology, deviceTopology) +} + +func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + allocationInfo *state.AllocationInfo + accompanyResourceAllocationInfo *state.AllocationInfo + deviceReq *pluginapi.DeviceRequest + deviceTopology *machine.DeviceTopology + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceAllocationResponse + }{ + { + name: "Allocation already exists", + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 1, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 1, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-gpu-0", "test-gpu-1"}, + }, + }, + }, + { + name: "gpu memory allocation exists", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 2, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 2, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + }, + "test-gpu-1": { + NumaNodes: []int{1}, + }, + "test-gpu-2": { + NumaNodes: []int{0}, + }, + "test-gpu-3": { + NumaNodes: []int{1}, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-gpu-0", "test-gpu-1"}, + }, + }, + }, + { + name: "device topology does not exist", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + 
TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 2, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 2, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + expectedErr: true, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewGPUDevicePlugin(basePlugin) + + if tt.allocationInfo != nil { + basePlugin.State.SetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) + } + + if tt.accompanyResourceAllocationInfo != nil { + basePlugin.State.SetAllocationInfo(v1.ResourceName(defaultAccompanyResourceName), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false) + } + + if tt.deviceTopology != nil { + err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.GPUDeviceType, tt.deviceTopology) + assert.NoError(t, err) + } + + resourceReq := &pluginapi.ResourceRequest{ + PodUid: tt.podUID, + ContainerName: tt.containerName, + } + + resp, err := devicePlugin.AllocateAssociatedDevice(resourceReq, tt.deviceReq, "test") + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateAllocatedDevicesResult(t, tt.expectedResp, resp) + + // Verify state is updated + allocationInfo := basePlugin.State.GetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName) + assert.NotNil(t, allocationInfo) + } + }) + } +} + +func evaluateAllocatedDevicesResult(t *testing.T, expectedResp, actualResp *pluginapi.AssociatedDeviceAllocationResponse) { + if expectedResp.AllocationResult == nil && actualResp.AllocationResult == nil { + return + } + + if expectedResp.AllocationResult != nil && actualResp.AllocationResult == nil { + t.Errorf("expected allocation result %v, but got nil", expectedResp.AllocationResult) + return + } + + if actualResp.AllocationResult != nil && expectedResp.AllocationResult == nil { + t.Errorf("expected nil allocation result, but got %v", actualResp.AllocationResult) + return + } + + assert.ElementsMatch(t, expectedResp.AllocationResult.AllocatedDevices, actualResp.AllocationResult.AllocatedDevices) +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index 89dad61899..a649f8d364 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -151,7 +151,7 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice( topologyAwareAllocations[deviceID] = state.Allocation{ Quantity: 1, - NUMANodes: info.GetNUMANode(), + NUMANodes: info.GetNUMANodes(), } } @@ -236,7 +236,8 @@ func (p *RDMADevicePlugin) allocateWithNoAccompanyResource( return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices)) } -// TODO: first allocate to reusable devices, then allocate to available devices proportionally based on numa affinity +// allocateWithAccompanyResource allocates the rdma devices by first allocating the reusable devices, then allocating the +// available devices proportionally by ensuring NUMA 
affinity with the accompany resource func (p *RDMADevicePlugin) allocateWithAccompanyResource( deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string, ) ([]string, error) { @@ -248,26 +249,21 @@ func (p *RDMADevicePlugin) allocateWithAccompanyResource( return nil, fmt.Errorf("failed to get device type for accompany resource %s: %v", accompanyResourceName, err) } - accompanyAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName) - if accompanyAllocationInfo == nil || accompanyAllocationInfo.TopologyAwareAllocations == nil { - err = fmt.Errorf("get allocation info of the resource %s for pod %s/%s, container: %s failed with error: %v", - resReq.ResourceName, resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) - general.Errorf("%s", err.Error()) - return nil, err - } + // Allocate all the reusable devices first + allocatedDevices := sets.NewString(deviceReq.ReusableDevices...) // Get ratio of accompany resource to target device - accompanyResourceToTargetDeviceRatio := p.State.GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyResourceName, deviceReq.DeviceName) + accompanyResourceToTargetDeviceRatio := p.State.GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyDeviceType, gpuconsts.RDMADeviceType) // Allocate target device according to ratio of accompany resource to target device podResourceEntries := p.State.GetPodResourceEntries() - totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName) + totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName) rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) * accompanyResourceToTargetDeviceRatio)) // For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same // numa nodes as the gpu and allocate them - accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyResourceName, deviceReq.DeviceName) + accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyDeviceType, gpuconsts.RDMADeviceType) if err != nil { general.Warningf("failed to get gpu to rdma affinity map: %v", err) return nil, err @@ -275,13 +271,15 @@ func (p *RDMADevicePlugin) allocateWithAccompanyResource( machineState := p.State.GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)] - allocatedDevices := sets.NewString() allocateDevices := func(devices ...string) bool { for _, device := range devices { - allocatedDevices.Insert(device) if allocatedDevices.Len() >= rdmaToBeAllocated { return true } + allocatedDevices.Insert(device) + } + if allocatedDevices.Len() >= rdmaToBeAllocated { + return true } return false } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go new file mode 100644 index 0000000000..600f564a90 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go @@ -0,0 +1,562 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
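+A minimal, runnable sketch of the two-phase selection described in the
+allocateWithAccompanyResource doc comment above; pickRDMA and its inputs are
+hypothetical stand-ins, not the real implementation:
+
+	package main
+
+	import (
+		"fmt"
+		"math"
+	)
+
+	// pickRDMA takes the pod's reusable devices first, then fills the
+	// remaining quota with available devices that share a NUMA node with
+	// the accompany (GPU) allocation.
+	func pickRDMA(quota int, reusable, available []string, numaOf map[string]int, gpuNUMAs map[int]bool) []string {
+		picked := append([]string{}, reusable...)
+		for _, d := range available {
+			if len(picked) >= quota {
+				break
+			}
+			if gpuNUMAs[numaOf[d]] {
+				picked = append(picked, d)
+			}
+		}
+		return picked
+	}
+
+	func main() {
+		// The quota scales the container's GPU count by the device-count
+		// ratio, e.g. 1 RDMA NIC per 2 GPUs on a node with 4 NICs and 8 GPUs.
+		quota := int(math.Ceil(4 * 0.5)) // 4 GPUs allocated => 2 NICs
+		gpuNUMAs := map[int]bool{0: true}
+		numaOf := map[string]int{"rdma-0": 0, "rdma-1": 1, "rdma-2": 0, "rdma-3": 1}
+		fmt.Println(pickRDMA(quota, nil, []string{"rdma-0", "rdma-1", "rdma-2", "rdma-3"}, numaOf, gpuNUMAs))
+		// => [rdma-0 rdma-2]
+	}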
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rdma + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + conf.RDMADeviceNames = []string{"test-rdma"} + + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + basePlugin.State = stateImpl + + // Register gpu device type and gpu device topology provider as it is an accompany resource for rdma + basePlugin.RegisterDeviceNameToType([]string{"test-gpu"}, gpuconsts.GPUDeviceType) + gpuTopologyProvider := machine.NewDeviceTopologyProvider([]string{"test-gpu"}) + basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider) + + return basePlugin +} + +func TestRDMADevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewRDMADevicePlugin(basePlugin) + + // Update topology with associated devices + req := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "test-rdma", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-rdma-0", + Topology: 
&pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 0, + }, + }, + }, + }, + { + ID: "test-rdma-1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + }, + }, + } + + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify device topology is updated + gpuDevicePlugin := devicePlugin.(*RDMADevicePlugin) + deviceTopology, numaTopologyReady, err := gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType) + assert.NoError(t, err) + assert.True(t, numaTopologyReady) + assert.NotNil(t, deviceTopology) + + expectedDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + "test-rdma-1": { + NumaNodes: []int{1}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + }, + } + + assert.Equal(t, expectedDeviceTopology, deviceTopology) +} + +func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + allocationInfo *state.AllocationInfo + accompanyResourceAllocationInfo *state.AllocationInfo + accompanyResourceName string + deviceReq *pluginapi.DeviceRequest + deviceTopology *machine.DeviceTopology + accompanyDeviceTopology *machine.DeviceTopology + machineState state.AllocationResourcesMap + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceAllocationResponse + }{ + { + name: "Allocation already exists", + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-rdma-0": { + Quantity: 1, + NUMANodes: []int{0}, + }, + "test-rdma-1": { + Quantity: 1, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + AvailableDevices: []string{"test-rdma-2", "test-rdma-3"}, + ReusableDevices: []string{"test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-1"}, + }, + }, + }, + { + name: "No accompany resource allocates by best effort, allocate reusable devices first", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: []string{"test-rdma-2", "test-rdma-3"}, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-2", "test-rdma-3"}, + }, + }, + }, + { + name: "No accompany resource allocates by 
best effort, no reusable devices, only allocate available devices with NUMA affinity", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-2"}, + }, + }, + }, + { + name: "No accompany resource allocates by best effort, no reusable devices, skip devices that are already allocated", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": { + PodEntries: map[string]state.ContainerEntries{ + "pod-uid2": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-rdma-1": { + PodEntries: map[string]state.ContainerEntries{ + "pod-uid3": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-2", "test-rdma-3"}, + }, + }, + }, + { + name: "Accompany resource has been allocated", + podUID: "test-pod", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + // Ratio of 1 rdma device per 2 gpu devices + accompanyDeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + }, + "test-gpu-1": { + NumaNodes: []int{1}, + }, + "test-gpu-2": { + NumaNodes: []int{0}, + }, + "test-gpu-3": { + NumaNodes: []int{1}, + }, + "test-gpu-4": { + NumaNodes: []int{0}, + }, + "test-gpu-5": { + NumaNodes: []int{1}, + }, + "test-gpu-6": { + NumaNodes: []int{0}, + }, + "test-gpu-7": { + NumaNodes: []int{1}, + }, + }, + }, + accompanyResourceName: "test-gpu", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: 
state.Allocation{ + Quantity: 4, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 1, + }, + "test-gpu-2": { + Quantity: 1, + }, + "test-gpu-4": { + Quantity: 1, + }, + "test-gpu-6": { + Quantity: 1, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + gpuconsts.GPUDeviceType: { + "test-gpu-0": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-1": {}, + "test-gpu-2": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-3": {}, + "test-gpu-4": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-5": {}, + "test-gpu-6": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-7": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-2"}, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewRDMADevicePlugin(basePlugin) + + if tt.allocationInfo != nil { + basePlugin.State.SetAllocationInfo(gpuconsts.RDMADeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) + } + + if tt.accompanyResourceAllocationInfo != nil && tt.accompanyResourceName != "" { + accompanyResourceType, err := basePlugin.GetResourceTypeFromDeviceName(tt.accompanyResourceName) + assert.NoError(t, err) + basePlugin.State.SetAllocationInfo(v1.ResourceName(accompanyResourceType), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false) + } + + if tt.deviceTopology != nil { + err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.RDMADeviceType, tt.deviceTopology) + assert.NoError(t, err) + } + + if tt.accompanyResourceName != "" && tt.accompanyDeviceTopology != nil { + accompanyResourceType, err := basePlugin.GetResourceTypeFromDeviceName(tt.accompanyResourceName) + assert.NoError(t, err) + err = basePlugin.DeviceTopologyRegistry.SetDeviceTopology(accompanyResourceType, tt.accompanyDeviceTopology) + assert.NoError(t, err) + } + + if tt.machineState != nil { + basePlugin.State.SetMachineState(tt.machineState, false) + } + + resourceReq := &pluginapi.ResourceRequest{ + PodUid: tt.podUID, + ContainerName: tt.containerName, + } + + resp, err := devicePlugin.AllocateAssociatedDevice(resourceReq, tt.deviceReq, tt.accompanyResourceName) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateAllocatedDevicesResult(t, tt.expectedResp, resp) + + // Verify state is updated + allocationInfo := basePlugin.State.GetAllocationInfo(gpuconsts.RDMADeviceType, tt.podUID, 
tt.containerName) + assert.NotNil(t, allocationInfo) + } + }) + } +} + +func evaluateAllocatedDevicesResult(t *testing.T, expectedResp, actualResp *pluginapi.AssociatedDeviceAllocationResponse) { + if expectedResp.AllocationResult == nil && actualResp.AllocationResult == nil { + return + } + + if expectedResp.AllocationResult != nil && actualResp.AllocationResult == nil { + t.Errorf("expected allocation result %v, but got nil", expectedResp.AllocationResult) + return + } + + if actualResp.AllocationResult != nil && expectedResp.AllocationResult == nil { + t.Errorf("expected nil allocation result, but got %v", actualResp.AllocationResult) + return + } + + assert.ElementsMatch(t, expectedResp.AllocationResult.AllocatedDevices, actualResp.AllocationResult.AllocatedDevices) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 753da338ea..31696d6cca 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -160,7 +160,7 @@ func (p *GPUMemPlugin) calculateHints( s := machineState[gpuID] allocated := s.GetQuantityAllocated() if allocated+perGPUMemory <= float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) { - for _, numaNode := range info.GetNUMANode() { + for _, numaNode := range info.GetNUMANodes() { numaToAvailableGPUCount[numaNode] += 1 numaToMostAllocatedGPUMemory[numaNode] = math.Max(allocated, numaToMostAllocatedGPUMemory[numaNode]) } @@ -517,7 +517,6 @@ func (p *GPUMemPlugin) Allocate( } // Use the strategy framework to allocate GPU memory - // TODO: What happens when deviceReq is nil? result, err := manager.AllocateGPUUsingStrategy( resourceReq, deviceReq, diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go new file mode 100644 index 0000000000..db65623868 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go @@ -0,0 +1,81 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
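+A small, runnable sketch of the per-NUMA bookkeeping that calculateHints in
+gpu_mem.go above performs; the GPU inventory and quantities here are assumed
+purely for illustration:
+
+	package main
+
+	import (
+		"fmt"
+		"math"
+	)
+
+	func main() {
+		type gpu struct {
+			numaNodes []int
+			allocated float64
+		}
+		gpus := map[string]gpu{
+			"gpu-0": {[]int{0}, 2}, // full: contributes to no NUMA node
+			"gpu-1": {[]int{0}, 0},
+			"gpu-2": {[]int{1}, 1},
+			"gpu-3": {[]int{1}, 0},
+		}
+		perGPUMemory, allocatablePerGPU := 1.0, 2.0
+
+		numaToAvailableGPUCount := map[int]int{}
+		numaToMostAllocatedGPUMemory := map[int]float64{}
+		for _, g := range gpus {
+			// A GPU counts toward each of its NUMA nodes only if it still
+			// has room for the per-GPU share of the request.
+			if g.allocated+perGPUMemory <= allocatablePerGPU {
+				for _, n := range g.numaNodes {
+					numaToAvailableGPUCount[n]++
+					numaToMostAllocatedGPUMemory[n] = math.Max(g.allocated, numaToMostAllocatedGPUMemory[n])
+				}
+			}
+		}
+		fmt.Println(numaToAvailableGPUCount, numaToMostAllocatedGPUMemory)
+		// => map[0:1 1:2] map[0:0 1:1]
+	}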
+*/ + +package resourceplugin + +import ( + "fmt" + + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" +) + +type ResourcePluginStub struct { + *baseplugin.BasePlugin +} + +func NewResourcePluginStub(base *baseplugin.BasePlugin) ResourcePlugin { + return &ResourcePluginStub{BasePlugin: base} +} + +func (r ResourcePluginStub) ResourceName() string { + return "resource-plugin-stub" +} + +func (r ResourcePluginStub) GetTopologyHints(_ *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) { + return &pluginapi.ResourceHintsResponse{}, nil +} + +func (r ResourcePluginStub) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { + // Simply returns a fixed response if the podUID and containerName is found in the state, otherwise return an error + allocationInfo := r.State.GetAllocationInfo(v1.ResourceName(r.ResourceName()), podUID, containerName) + if allocationInfo == nil { + return nil, fmt.Errorf("allocationInfo is nil") + } + + // Simply returns a fixed response + return &pluginapi.GetTopologyAwareResourcesResponse{ + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + r.ResourceName(): {}, + }, + }, + }, nil +} + +func (r ResourcePluginStub) GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) { + // Simply return a fixed response + return &gpuconsts.AllocatableResource{ + ResourceName: r.ResourceName(), + }, nil +} + +func (r ResourcePluginStub) Allocate(resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest) (*pluginapi.ResourceAllocationResponse, error) { + // Simply save resource request and device request in state + if resourceReq.ResourceName == r.ResourceName() { + r.State.SetAllocationInfo(v1.ResourceName(r.ResourceName()), resourceReq.PodUid, resourceReq.ContainerName, &state.AllocationInfo{}, false) + } + + if deviceReq != nil { + r.State.SetAllocationInfo(v1.ResourceName(deviceReq.DeviceName), resourceReq.PodUid, resourceReq.ContainerName, &state.AllocationInfo{}, false) + } + + return &pluginapi.ResourceAllocationResponse{}, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state_test.go b/pkg/agent/qrm-plugins/gpu/state/state_test.go new file mode 100644 index 0000000000..b595b3467b --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/state_test.go @@ -0,0 +1,199 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
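+The ResourcePluginStub above is exercised by policy_test.go later in this
+patch; the usage pattern, in short (makeTestStaticPolicy is the test helper
+defined there):
+
+	policy := makeTestStaticPolicy(t)
+	policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin))
+
+	req := &pluginapi.ResourceRequest{
+		PodUid:           string(uuid.NewUUID()),
+		ContainerName:    "test",
+		ResourceName:     "resource-plugin-stub", // ResourcePluginStub.ResourceName()
+		ResourceRequests: map[string]float64{"resource-plugin-stub": 2},
+	}
+	_, err := policy.Allocate(context.Background(), req)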
+*/ + +package state + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" +) + +func TestAllocationResourcesMap_GetRatioOfAccompanyResourceToTargetResource(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + accompanyResourceName string + targetResourceName string + arm AllocationResourcesMap + want float64 + }{ + { + name: "normal case", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + "accompany3": {}, + "accompany4": {}, + }, + v1.ResourceName("targetResource"): { + "target1": {}, + "target2": {}, + }, + }, + want: 2.0, + }, + { + name: "got a ratio that is a fraction", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + }, + v1.ResourceName("targetResource"): { + "target1": {}, + "target2": {}, + "target3": {}, + "target4": {}, + }, + }, + want: 0.5, + }, + { + name: "no devices for target resource", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + }, + }, + want: 0, + }, + { + name: "no devices for accompany resource and target resource", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{}, + want: 0, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := tt.arm.GetRatioOfAccompanyResourceToTargetResource(tt.accompanyResourceName, tt.targetResourceName) + if got != tt.want { + t.Errorf("GetRatioOfAccompanyResourceToTargetResource() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPodResourceEntries_GetTotalAllocatedResourceOfContainer(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + resourceName v1.ResourceName + podUID string + containerName string + pre PodResourceEntries + wantTotalAllocationQuantity int + wantAllocationIDs sets.String + }{ + { + name: "normal case", + resourceName: v1.ResourceName("testResource"), + podUID: "podUID", + containerName: "containerName", + pre: PodResourceEntries{ + v1.ResourceName("testResource"): { + "podUID2": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-1": { + Quantity: 1, + }, + "test-2": { + Quantity: 1, + }, + }, + }, + }, + "podUID": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-3": { + Quantity: 1, + }, + "test-4": { + Quantity: 1, + }, + }, + }, + }, + }, + }, + wantTotalAllocationQuantity: 2, + wantAllocationIDs: sets.NewString("test-3", "test-4"), + }, + { + name: "no allocation", + resourceName: v1.ResourceName("testResource"), + podUID: "podUID", + containerName: "containerName", + pre: PodResourceEntries{ + v1.ResourceName("testResource"): { + "podUID2": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-1": { + Quantity: 1, + }, + "test-2": { + Quantity: 1, + }, + }, + }, + }, + }, + }, + wantTotalAllocationQuantity: 0, + wantAllocationIDs: sets.NewString(), + }, + } + + for _, tt 
:= range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + gotTotalAllocationQuantity, gotAllocationIDs := tt.pre.GetTotalAllocatedResourceOfContainer(tt.resourceName, tt.podUID, tt.containerName) + assert.Equal(t, tt.wantTotalAllocationQuantity, gotTotalAllocationQuantity) + + assert.ElementsMatch(t, tt.wantAllocationIDs.UnsortedList(), gotAllocationIDs.UnsortedList()) + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 384b825733..3c572d1591 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -92,10 +92,10 @@ func NewStaticPolicy( customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), } - if err = policyImplement.registerResourcePlugins(); err != nil { + if err = policyImplement.registerDefaultResourcePlugins(); err != nil { return false, agent.ComponentStub{}, fmt.Errorf("failed to register resource plugins: %w", err) } - if err = policyImplement.registerCustomDevicePlugins(); err != nil { + if err = policyImplement.registerDefaultCustomDevicePlugins(); err != nil { return false, agent.ComponentStub{}, fmt.Errorf("failed to register custom device plugins: %w", err) } @@ -315,6 +315,11 @@ func (p *StaticPolicy) mergeTopologyAwareResourcesResponse( result.PodNamespace = resp.PodNamespace } + if resp.ContainerTopologyAwareResources == nil { + general.Errorf("container topology aware resources is nil for pod %s/%s, namespace %s", podUID, containerName, result.PodNamespace) + return nil, fmt.Errorf("container topology aware resources is nil for pod %s/%s, namespace %s", podUID, containerName, result.PodNamespace) + } + for resourceName, resource := range resp.ContainerTopologyAwareResources.AllocatedResources { allocatedResources[resourceName] = resource } @@ -693,7 +698,7 @@ func (p *StaticPolicy) allocateAssociatedDevice( return devicePlugin.AllocateAssociatedDevice(resReq, deviceReq, accompanyResourceName) } -func (p *StaticPolicy) registerResourcePlugins() error { +func (p *StaticPolicy) registerDefaultResourcePlugins() error { allInitFuncs := resourcepluginregistry.GetRegisteredResourcePlugin() for _, initFunc := range allInitFuncs { resourcePlugin := initFunc(p.BasePlugin) @@ -703,7 +708,7 @@ func (p *StaticPolicy) registerResourcePlugins() error { return nil } -func (p *StaticPolicy) registerCustomDevicePlugins() error { +func (p *StaticPolicy) registerDefaultCustomDevicePlugins() error { allInitFuncs := devicepluginregistry.GetRegisteredCustomDevicePlugin() for name := range allInitFuncs { initFunc := allInitFuncs[name] @@ -726,3 +731,15 @@ func (p *StaticPolicy) getCustomDevicePlugin(deviceName string) customdeviceplug customDevicePlugin := p.customDevicePlugins[deviceName] return customDevicePlugin } + +func (p *StaticPolicy) RegisterResourcePlugin(resourcePlugin resourceplugin.ResourcePlugin) { + p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin +} + +func (p *StaticPolicy) RegisterCustomDevicePlugin(plugin customdeviceplugin.CustomDevicePlugin) { + deviceNames := plugin.DeviceNames() + for _, deviceName := range deviceNames { + p.customDevicePlugins[deviceName] = plugin + p.associatedDeviceNames.Insert(deviceName) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go new file mode 100644 index 0000000000..e75b1fede3 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go @@ -0,0 
+1,682 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package staticpolicy + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +const ( + testResourcePluginName = "resource-plugin-stub" + testCustomDevicePluginName = "custom-device-plugin-stub" + testCustomDevicePluginName2 = "custom-device-plugin-stub-2" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func TestNewStaticPolicy(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + + _, policy, err := NewStaticPolicy(agentCtx, conf, nil, "test") + assert.NoError(t, err) + assert.NotNil(t, policy) +} + +func makeTestStaticPolicy(t *testing.T) *StaticPolicy { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + deviceTopologyRegistry := machine.NewDeviceTopologyRegistry() + + basePlugin := &baseplugin.BasePlugin{ + Conf: conf, + Emitter: metrics.DummyMetrics{}, + 
MetaServer: agentCtx.MetaServer, + AgentCtx: agentCtx, + PodAnnotationKeptKeys: []string{}, + PodLabelKeptKeys: []string{}, + State: stateImpl, + DeviceTopologyRegistry: deviceTopologyRegistry, + DefaultResourceStateGeneratorRegistry: state.NewDefaultResourceStateGeneratorRegistry(), + } + + staticPolicy := &StaticPolicy{ + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), + associatedDeviceNames: sets.NewString(), + } + + err = staticPolicy.registerDefaultResourcePlugins() + assert.NoError(t, err) + + err = staticPolicy.registerDefaultCustomDevicePlugins() + assert.NoError(t, err) + + return staticPolicy +} + +func TestStaticPolicy_Allocate(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + // Register stubbed resource plugin + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Check state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testResourcePluginName, podUID, testName) + fmt.Println(allocationInfo) + assert.NotNil(t, allocationInfo) + + // Allocating to an invalid resource plugin returns error + invalidReq := &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: "invalid-plugin", + ResourceRequests: map[string]float64{ + "invalid-plugin": 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = policy.Allocate(context.Background(), invalidReq) + assert.Error(t, err) +} + +func TestStaticPolicy_RemovePod(t *testing.T) { + t.Parallel() + policy := makeTestStaticPolicy(t) + + // Register stubbed resource plugin + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + deviceTopologyProviderStub := machine.NewDeviceTopologyProviderStub() + + testDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-1": {}, + }, + } + err := deviceTopologyProviderStub.SetDeviceTopology(testDeviceTopology) + assert.NoError(t, err) + + policy.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(testResourcePluginName, deviceTopologyProviderStub) + + policy.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(testResourcePluginName, + state.NewGenericDefaultResourceStateGenerator(testResourcePluginName, policy.DeviceTopologyRegistry)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = 
policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Remove pod + _, err = policy.RemovePod(context.Background(), &pluginapi.RemovePodRequest{ + PodUid: podUID, + }) + assert.NoError(t, err) + + // Check state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testResourcePluginName, podUID, testName) + assert.Nil(t, allocationInfo) +} + +func TestStaticPolicy_GetTopologyHints(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + resp, err := policy.GetTopologyHints(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Getting topology hints from an invalid resource plugin returns error + invalidReq := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: "invalid-plugin", + ResourceRequests: map[string]float64{ + "invalid-plugin": 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = policy.GetTopologyHints(context.Background(), invalidReq) + assert.Error(t, err) +} + +func TestStaticPolicy_GetTopologyAwareResources(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + testName := "test" + podUID := string(uuid.NewUUID()) + + // Allocate first + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Get topology aware resources + getTopologyAwareResourcesReq := &pluginapi.GetTopologyAwareResourcesRequest{ + PodUid: podUID, + ContainerName: testName, + } + + resp, err := policy.GetTopologyAwareResources(context.Background(), getTopologyAwareResourcesReq) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Test error handling + invalidReq := &pluginapi.GetTopologyAwareResourcesRequest{ + PodUid: podUID, + ContainerName: "invalid-container", + } + + _, err = policy.GetTopologyAwareResources(context.Background(), invalidReq) + assert.Error(t, err) +} + +func TestStaticPolicy_mergeTopologyAwareResourcesResponse(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + respList []*pluginapi.GetTopologyAwareResourcesResponse + expectedResp *pluginapi.GetTopologyAwareResourcesResponse + expectedErr bool + }{ + { + name: "test merging of response", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + 
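+				// Merge rules under test (mergeTopologyAwareResourcesResponse in
+				// policy.go above): AllocatedResources maps are unioned across the
+				// per-plugin responses, every response must agree on pod name and
+				// namespace, and a nil ContainerTopologyAwareResources is an error.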
PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedResp: &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + "test-resource-2": {}, + }, + }, + }, + }, + { + name: "pod name is not the same, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name-2", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedErr: true, + }, + { + name: "pod namespace is not the same, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace-2", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedErr: true, + }, + { + name: "container topology aware resources is nil, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: nil, + }, + }, + expectedErr: true, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + policy := makeTestStaticPolicy(t) + + resp, err := policy.mergeTopologyAwareResourcesResponse(tt.podUID, tt.containerName, 
tt.respList) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedResp, resp) + } + }) + } +} + +func TestStaticPolicy_GetTopologyAwareAllocatableResources(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + testName := "test" + podUID := string(uuid.NewUUID()) + + // Allocate first + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + getTopologyAwareAllocatableResourcesReq := &pluginapi.GetTopologyAwareAllocatableResourcesRequest{} + resp, err := policy.GetTopologyAwareAllocatableResources(context.Background(), getTopologyAwareAllocatableResourcesReq) + assert.Error(t, err) + assert.Nil(t, resp) +} + +func TestStaticPolicy_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub(policy.BasePlugin)) + + req := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: testCustomDevicePluginName, + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-device-1", + }, + }, + } + + resp, err := policy.UpdateAllocatableAssociatedDevices(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Test error handling for no device request + noDeviceRequest := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: testCustomDevicePluginName, + } + + resp, err = policy.UpdateAllocatableAssociatedDevices(context.Background(), noDeviceRequest) + assert.Error(t, err) + assert.Nil(t, resp) + + // Test error handling for non-existent custom device plugin + invalidReq := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "non-existent-device", + } + + resp, err = policy.UpdateAllocatableAssociatedDevices(context.Background(), invalidReq) + assert.Error(t, err) + assert.Nil(t, resp) +} + +func TestStaticPolicy_AllocateAssociatedDevices(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub2(policy.BasePlugin)) + + podUID := string(uuid.NewUUID()) + + testName := "test" + + req := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + }, + DeviceName: testCustomDevicePluginName, + AccompanyResourceName: 
testResourcePluginName, + } + + resp, err := policy.AllocateAssociatedDevice(context.Background(), req) + + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify in state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testCustomDevicePluginName, podUID, testName) + assert.NotNil(t, allocationInfo) + + podUID = string(uuid.NewUUID()) + // Error handling if there is no target device + noTargetDeviceReq := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + }, + DeviceName: "invalid-device", + AccompanyResourceName: testResourcePluginName, + } + + resp, err = policy.AllocateAssociatedDevice(context.Background(), noTargetDeviceReq) + + assert.Error(t, err) + assert.Nil(t, resp) + + // Accompany resource is another custom device plugin + req = &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + { + DeviceName: testCustomDevicePluginName2, + }, + }, + DeviceName: testCustomDevicePluginName, + AccompanyResourceName: testCustomDevicePluginName2, + } + + resp, err = policy.AllocateAssociatedDevice(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify state + allocationInfo = stateImpl.GetAllocationInfo(testCustomDevicePluginName, podUID, testName) + assert.NotNil(t, allocationInfo) + + allocationInfo = stateImpl.GetAllocationInfo(testCustomDevicePluginName2, podUID, testName) + assert.NotNil(t, allocationInfo) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go new file mode 100644 index 0000000000..6093729181 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go @@ -0,0 +1,143 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
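+A runnable stand-in for the Bind contract the tests below encode: reusable
+devices are taken first, then the pre-sorted available devices, and asking
+for more devices than the topology holds is an error. bind here is
+hypothetical, not the canonical strategy's real code:
+
+	package main
+
+	import "fmt"
+
+	func bind(request int, reusable, sorted []string) ([]string, error) {
+		seen := map[string]bool{}
+		var out []string
+		take := func(ds []string) {
+			for _, d := range ds {
+				if len(out) >= request || seen[d] {
+					continue
+				}
+				seen[d] = true
+				out = append(out, d)
+			}
+		}
+		take(reusable) // phase 1: devices the pod already holds
+		take(sorted)   // phase 2: remaining devices in sorted order
+		if len(out) < request {
+			return nil, fmt.Errorf("requested %d devices, only %d bindable", request, len(out))
+		}
+		return out, nil
+	}
+
+	func main() {
+		fmt.Println(bind(4, []string{"gpu-1", "gpu-2"}, []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}))
+		// => [gpu-1 gpu-2 gpu-3 gpu-4] <nil>, matching the last case below
+	}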
+*/ + +package canonical + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestCanonicalStrategy_Bind(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedResult *allocate.AllocationResult + expectedErr bool + }{ + { + name: "device topology is nil", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "device request is greater than available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "bind all the reusable devices first", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + ReusableDevices: []string{"gpu-1", "gpu-2"}, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2"}, + }, + }, + { + name: "bind all the reusable devices first and then the available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 4, + ReusableDevices: []string{"gpu-1", "gpu-2"}, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + canonicalStrategy := NewCanonicalStrategy() + result, err := canonicalStrategy.Bind(tt.ctx, tt.sortedDevices) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedResult.AllocatedDevices, result.AllocatedDevices) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go new file mode 100644 index 0000000000..b53e40c3f3 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Katalyst Authors. 
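+A runnable stand-in for the Filter step the tests below encode: with no hint
+nodes the device list passes through untouched; otherwise a device is kept
+only if all of its NUMA nodes fall inside the hint set (which is why gpu-2 on
+NUMA {2,3} is dropped for hint {0,1,2}). filter here is hypothetical:
+
+	package main
+
+	import "fmt"
+
+	func filter(hint map[int]bool, numaOf map[string][]int, devices []string) []string {
+		if len(hint) == 0 {
+			return devices
+		}
+		var out []string
+		for _, d := range devices {
+			ok := true
+			for _, n := range numaOf[d] {
+				if !hint[n] {
+					ok = false
+					break
+				}
+			}
+			if ok {
+				out = append(out, d)
+			}
+		}
+		return out
+	}
+
+	func main() {
+		numaOf := map[string][]int{"gpu-1": {0, 1}, "gpu-2": {2, 3}}
+		fmt.Println(filter(map[int]bool{0: true, 1: true, 2: true}, numaOf, []string{"gpu-1", "gpu-2"}))
+		// => [gpu-1]
+	}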
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package canonical + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestCanonicalStrategy_Filter(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + availableDevices []string + expectedFilteredDevices []string + }{ + { + name: "empty hint nodes does not filter", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0, 1}, + }, + "gpu-2": { + NumaNodes: []int{2, 3}, + }, + }, + }, + }, + availableDevices: []string{"gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-1", "gpu-2"}, + }, + { + name: "filtered devices by hint nodes", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0, 1}, + }, + "gpu-2": { + NumaNodes: []int{2, 3}, + }, + }, + }, + HintNodes: machine.NewCPUSet(0, 1, 2), + }, + availableDevices: []string{"gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-1"}, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + canonicalStrategy := NewCanonicalStrategy() + actualFilteredDevices, err := canonicalStrategy.Filter(tt.ctx, tt.availableDevices) + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedFilteredDevices, actualFilteredDevices, "filtered devices are not equal") + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go new file mode 100644 index 0000000000..4b206afc7d --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go @@ -0,0 +1,220 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
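+A runnable stand-in for the GPU-memory Filter the tests below encode: each
+chosen GPU must host request/deviceCount memory on top of what it already
+serves, and a zero or absent GPU-memory request leaves the candidate list
+untouched. filterByMemory is hypothetical, not the strategy's real code:
+
+	package main
+
+	import "fmt"
+
+	func filterByMemory(avail []string, allocated map[string]float64, reqMem float64, devices int, allocatablePerGPU float64) []string {
+		if reqMem == 0 {
+			return avail
+		}
+		per := reqMem / float64(devices) // per-GPU share of the request
+		var out []string
+		for _, d := range avail {
+			if allocated[d]+per <= allocatablePerGPU {
+				out = append(out, d)
+			}
+		}
+		return out
+	}
+
+	func main() {
+		allocated := map[string]float64{"gpu-0": 2, "gpu-3": 1} // gpu-1/gpu-2 are free
+		fmt.Println(filterByMemory([]string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}, allocated, 4, 2, 2))
+		// => [gpu-1 gpu-2]: gpu-0 is full, gpu-3 cannot fit another 2 units
+	}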
+*/ + +package gpu_memory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestGPUMemoryStrategy_Filter(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + availableDevices []string + expectedFilteredDevices []string + expectedErr bool + }{ + { + name: "gpu topology is nil", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + expectedErr: true, + }, + { + name: "gpu memory does not exist, just allocate every device", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceMemoryBandwidth): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "gpu memory is 0, so we use all the available devices", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 0, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "allocate available devices with available gpu memory", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "exclude devices with not enough gpu memory", + ctx: &allocate.AllocationContext{ + DeviceTopology: 
&machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + // 2 GB allocated + "gpu-0": { + PodEntries: map[string]state.ContainerEntries{ + "pod-0": { + "container-0": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + }, + }, + }, + }, + }, + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": { + // 1 GB allocated + PodEntries: map[string]state.ContainerEntries{ + "pod-1": { + "container-0": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 0.5, + }, + }, + "container-1": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 0.5, + }, + }, + }, + }, + }, + }, + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}, + expectedFilteredDevices: []string{"gpu-1", "gpu-2"}, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + strategy := NewGPUMemoryStrategy() + filteredDevices, err := strategy.Filter(tt.ctx, tt.availableDevices) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedFilteredDevices, filteredDevices) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go new file mode 100644 index 0000000000..3511861ba8 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go @@ -0,0 +1,178 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpu_memory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestGPUMemoryStrategy_Sort(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + filteredDevices []string + expectedSortedDevices []string + expectedErr bool + }{ + { + name: "nil gpu topology", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "gpu memory is 0 returns all available devices without sorting", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 0, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedSortedDevices: []string{"gpu-1", "gpu-2"}, + }, + { + name: "devices are sorted by NUMA affinity first", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + // gpu-1 has NUMA affinity but gpu-2 does not + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{1}, + }, + "gpu-2": { + NumaNodes: []int{0}, + }, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + HintNodes: machine.NewCPUSet(0), + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedSortedDevices: []string{"gpu-2", "gpu-1"}, + }, + { + name: "for devices with NUMA affinity, they are sorted by available memory in ascending order", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0}, + }, + "gpu-2": { + NumaNodes: []int{1}, + }, + "gpu-3": { + NumaNodes: []int{2}, + }, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(4, resource.DecimalSI), + }, + HintNodes: machine.NewCPUSet(0, 1), + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + "gpu-1": { + PodEntries: map[string]state.ContainerEntries{ + "pod-0": { + "container-0": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "gpu-2": { + PodEntries: map[string]state.ContainerEntries{ + "pod-1": { + "container-1": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + "container-2": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2", "gpu-3"}, + expectedSortedDevices: []string{"gpu-2", "gpu-1", "gpu-3"}, + }, + } + + 
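+	// Sanity note for the last case above: gpu-1 and gpu-2 both sit on the
+	// hint nodes {0, 1}, so they sort ahead of gpu-3, whose NUMA node 2 is
+	// outside the hint set. Between the two, less available memory wins:
+	// gpu-2 has 2 of its 4 allocatable units allocated (2 free) while gpu-1
+	// has 1 allocated (3 free), hence the expected order gpu-2, gpu-1, gpu-3.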
for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			strategy := NewGPUMemoryStrategy()
+			sortedDevices, err := strategy.Sort(tt.ctx, tt.filteredDevices)
+
+			if tt.expectedErr {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tt.expectedSortedDevices, sortedDevices)
+			}
+		})
+	}
+}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go
index 105825e7f6..fb5eeaa5e1 100644
--- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go
@@ -28,10 +28,6 @@ func IsBindingContextValid(ctx *allocate.AllocationContext, sortedDevices []stri
 		return false, "GPU topology is nil"
 	}
 
-	if len(sortedDevices) == 0 && ctx.DeviceReq.DeviceRequest > 0 {
-		return false, "no devices to bind"
-	}
-
 	// Determine how many devices to allocate
 	devicesToAllocate := int(ctx.DeviceReq.DeviceRequest)
 	if devicesToAllocate > len(sortedDevices) {
diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go
index 3168bf6e92..26d92d9126 100644
--- a/pkg/agent/qrm-plugins/gpu/util/util.go
+++ b/pkg/agent/qrm-plugins/gpu/util/util.go
@@ -70,7 +70,7 @@ func IsNUMAAffinityDevice(
 		return false
 	}
 
-	return machine.NewCPUSet(info.GetNUMANode()...).IsSubsetOf(hintNodes)
+	return machine.NewCPUSet(info.GetNUMANodes()...).IsSubsetOf(hintNodes)
 }
 
 // GetGPUCount extracts GPU count from resource request
diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go
index 974118a3d2..4d2bc923b9 100644
--- a/pkg/util/machine/device.go
+++ b/pkg/util/machine/device.go
@@ -111,7 +111,7 @@ func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTo
 	return provider.GetDeviceTopology()
 }
 
-// GetDeviceNUMAAffinity retrieves a map of a certain device to the list of devices that it has an affinity with.
+// GetDeviceNUMAAffinity retrieves a map from each device of type deviceA to the list of deviceB devices that it has an affinity with.
 // A device is considered to have an affinity with another device if they are on the exact same NUMA node(s)
 func (r *DeviceTopologyRegistry) GetDeviceNUMAAffinity(deviceA, deviceB string) (map[string][]string, error) {
 	deviceTopologyKey, numaReady, err := r.GetDeviceTopology(deviceA)
@@ -134,7 +134,10 @@ func (r *DeviceTopologyRegistry) GetDeviceNUMAAffinity(deviceA, deviceB string)
 	for keyName, keyInfo := range deviceTopologyKey.Devices {
 		devicesWithAffinity := make([]string, 0)
 		for valueName, valueInfo := range deviceTopologyValue.Devices {
-			if sets.NewInt(keyInfo.GetNUMANode()...).Equal(sets.NewInt(valueInfo.GetNUMANode()...)) {
+			deviceKeyNUMANodes := keyInfo.GetNUMANodes()
+			deviceValueNUMANodes := valueInfo.GetNUMANodes()
+
+			if len(deviceKeyNUMANodes) != 0 && sets.NewInt(deviceKeyNUMANodes...).Equal(sets.NewInt(deviceValueNUMANodes...)) {
 				devicesWithAffinity = append(devicesWithAffinity, valueName)
 			}
 		}
@@ -148,7 +151,17 @@ type DeviceTopology struct {
 	Devices map[string]DeviceInfo
 }
 
-// GroupDeviceAffinity forms a topology graph such that all groups of DeviceIDs within a certain affinity priority level
+// GroupDeviceAffinity forms a topology graph such that all devices within a DeviceIDs group have an affinity with each other.
+// They are differentiated by their affinity priority level.
+// E.g. 
Output: +// +// { +// 0: {{"gpu-0", "gpu-1"}, {"gpu-2", "gpu-3"}}, +// 1: {{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}} +// } +// +// means that gpu-0 and gpu-1 have an affinity with each other, gpu-2 and gpu-3 have an affinity with each other in affinity priority 0. +// and gpu-0, gpu-1, gpu-2, and gpu-3 have an affinity with each other in affinity priority 1. func (t *DeviceTopology) GroupDeviceAffinity() map[AffinityPriority][]DeviceIDs { deviceAffinityGroup := make(map[AffinityPriority][]DeviceIDs) for deviceId, deviceInfo := range t.Devices { @@ -182,14 +195,6 @@ func containsGroup(groups []DeviceIDs, candidate DeviceIDs) bool { return false } -func (t *DeviceTopology) GetDeviceAffinityMap(deviceId string) (map[AffinityPriority]DeviceIDs, error) { - info, ok := t.Devices[deviceId] - if !ok { - return nil, fmt.Errorf("failed to find device %s in device topology", deviceId) - } - return info.DeviceAffinity, nil -} - type DeviceInfo struct { Health string NumaNodes []int @@ -206,7 +211,7 @@ type AffinityPriority int type DeviceIDs []string -func (i DeviceInfo) GetNUMANode() []int { +func (i DeviceInfo) GetNUMANodes() []int { if i.NumaNodes == nil { return []int{} } @@ -240,7 +245,7 @@ func (p *deviceTopologyProviderImpl) SetDeviceTopology(deviceTopology *DeviceTop p.mutex.Lock() defer p.mutex.Unlock() if deviceTopology == nil { - return fmt.Errorf("deviceTopology is nil") + return fmt.Errorf("deviceTopology is nil when setting device topology") } p.deviceTopology = deviceTopology @@ -252,6 +257,10 @@ func (p *deviceTopologyProviderImpl) GetDeviceTopology() (*DeviceTopology, bool, p.mutex.RLock() defer p.mutex.RUnlock() + if p.deviceTopology == nil { + return nil, false, fmt.Errorf("deviceTopology is nil when getting device topology") + } + return p.deviceTopology, p.numaTopologyReady, nil } diff --git a/pkg/util/machine/device_stub.go b/pkg/util/machine/device_stub.go new file mode 100644 index 0000000000..6e9e97796d --- /dev/null +++ b/pkg/util/machine/device_stub.go @@ -0,0 +1,43 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machine + +import "sync" + +type deviceTopologyProviderStub struct { + mutex sync.RWMutex + deviceTopology *DeviceTopology +} + +func NewDeviceTopologyProviderStub() DeviceTopologyProvider { + return &deviceTopologyProviderStub{} +} + +func (d *deviceTopologyProviderStub) SetDeviceTopology(deviceTopology *DeviceTopology) error { + d.mutex.Lock() + defer d.mutex.Unlock() + + d.deviceTopology = deviceTopology + return nil +} + +func (d *deviceTopologyProviderStub) GetDeviceTopology() (*DeviceTopology, bool, error) { + d.mutex.RLock() + defer d.mutex.RUnlock() + + return d.deviceTopology, true, nil +} diff --git a/pkg/util/machine/device_test.go b/pkg/util/machine/device_test.go new file mode 100644 index 0000000000..d9399627b8 --- /dev/null +++ b/pkg/util/machine/device_test.go @@ -0,0 +1,448 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machine + +import ( + "sort" + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDeviceTopologyRegistry_GetDeviceNUMAAffinity(t *testing.T) { + t.Parallel() + + npuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": {NumaNodes: []int{0}}, + "npu-1": {NumaNodes: []int{1}}, + "npu-2": {NumaNodes: []int{0, 1}}, + }, + } + + gpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "gpu-0": {NumaNodes: []int{0}}, + "gpu-1": {NumaNodes: []int{1}}, + "gpu-2": {NumaNodes: []int{2}}, + }, + } + + xpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "xpu-0": {NumaNodes: []int{0}}, + "xpu-1": {NumaNodes: []int{1}}, + "xpu-2": {NumaNodes: nil}, + }, + } + + dpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "dpu-0": {NumaNodes: []int{1}}, + "dpu-1": {NumaNodes: []int{0}}, + "dpu-2": {NumaNodes: []int{}}, + }, + } + + // Register device topology providers + registry := NewDeviceTopologyRegistry() + registry.RegisterDeviceTopologyProvider("npu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("gpu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("xpu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("dpu", NewDeviceTopologyProviderStub()) + err := registry.SetDeviceTopology("npu", npuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("gpu", gpuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("xpu", xpuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("dpu", dpuTopology) + assert.NoError(t, err) + + tests := []struct { + name string + deviceA string + deviceB string + expected map[string][]string + expectedErr bool + }{ + { + name: "npu to gpu affinity", + deviceA: "npu", + deviceB: "gpu", + expected: map[string][]string{ + "npu-0": {"gpu-0"}, + "npu-1": {"gpu-1"}, + "npu-2": {}, + }, + }, + { + name: "non-existent device A", + deviceA: "invalid device", + deviceB: "gpu", + expectedErr: true, + }, + { + name: "non-existent device B", + deviceA: "npu", + deviceB: "invalid device", + expectedErr: true, + }, + { + name: "devices with empty numa nodes are not considered to have affinity with each other", + deviceA: "xpu", + deviceB: "dpu", + expected: map[string][]string{ + "xpu-0": {"dpu-1"}, + "xpu-1": {"dpu-0"}, + "xpu-2": {}, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + actual, err := registry.GetDeviceNUMAAffinity(tt.deviceA, tt.deviceB) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateDeviceNUMAAffinity(t, actual, tt.expected) + } + }) + } +} + +func TestDeviceTopology_GroupDeviceAffinity(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + deviceTopology *DeviceTopology + expectedDeviceAffinity map[AffinityPriority][]DeviceIDs + }{ + { + name: "test simple affinity of 2 devices to 1 group with only affinity 
priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + }, + }, + { + name: "test simple affinity of 4 devices to 1 group with only affinity priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-2", "npu-3"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1", "npu-2"}, + }, + }, + "npu-4": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-5", "npu-6", "npu-7"}, + }, + }, + "npu-5": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-6", "npu-7"}, + }, + }, + "npu-6": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-5", "npu-7"}, + }, + }, + "npu-7": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-5", "npu-6"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"}), DeviceIDs([]string{"npu-4", "npu-5", "npu-6", "npu-7"})}, + }, + }, + { + name: "device topology includes self for one affinity level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + }, + }, + { + name: "test simple affinity of 2 devices to 1 group with 2 affinity priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1"}, + 1: {"npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0"}, + 1: {"npu-0", "npu-2", "npu-3"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-3"}, + 1: {"npu-0", "npu-1", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2"}, + 1: {"npu-0", "npu-1", "npu-2"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + 1: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"})}, + }, + }, + { + name: "device topology includes self for 2 affinity levels", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + 
DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-0", "npu-1"},
+							1: {"npu-0", "npu-1", "npu-2", "npu-3"},
+						},
+					},
+					"npu-1": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-0", "npu-1"},
+							1: {"npu-0", "npu-1", "npu-2", "npu-3"},
+						},
+					},
+					"npu-2": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-2", "npu-3"},
+							1: {"npu-0", "npu-1", "npu-2", "npu-3"},
+						},
+					},
+					"npu-3": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-2", "npu-3"},
+							1: {"npu-0", "npu-1", "npu-2", "npu-3"},
+						},
+					},
+				},
+			},
+			expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{
+				0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})},
+				1: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"})},
+			},
+		},
+		{
+			name: "unsorted device topology has no effect on result",
+			deviceTopology: &DeviceTopology{
+				Devices: map[string]DeviceInfo{
+					"npu-0": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-2", "npu-1", "npu-3"},
+						},
+					},
+					"npu-1": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-3", "npu-0", "npu-2"},
+						},
+					},
+					"npu-2": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-1", "npu-0", "npu-3"},
+						},
+					},
+					"npu-3": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-0", "npu-2", "npu-1"},
+						},
+					},
+					"npu-4": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-6", "npu-5", "npu-7"},
+						},
+					},
+					"npu-5": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-7", "npu-4", "npu-6"},
+						},
+					},
+					"npu-6": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-5", "npu-4", "npu-7"},
+						},
+					},
+					"npu-7": {
+						DeviceAffinity: map[AffinityPriority]DeviceIDs{
+							0: {"npu-6", "npu-4", "npu-5"},
+						},
+					},
+				},
+			},
+			expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{
+				0: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"}), DeviceIDs([]string{"npu-4", "npu-5", "npu-6", "npu-7"})},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			deviceAffinity := tt.deviceTopology.GroupDeviceAffinity()
+			evaluateDeviceAffinity(t, deviceAffinity, tt.expectedDeviceAffinity)
+		})
+	}
+}
+
+// note: parameters are ordered (actual, expected) to match the call sites above.
+func evaluateDeviceNUMAAffinity(t *testing.T, actualDeviceNUMAAffinity, expectedDeviceNUMAAffinity map[string][]string) {
+	if len(actualDeviceNUMAAffinity) != len(expectedDeviceNUMAAffinity) {
+		t.Errorf("deviceNUMAAffinity lengths don't match, expected %d, got %d", len(expectedDeviceNUMAAffinity), len(actualDeviceNUMAAffinity))
+		return
+	}
+
+	for device, expected := range expectedDeviceNUMAAffinity {
+		actual, ok := actualDeviceNUMAAffinity[device]
+		if !ok {
+			t.Errorf("expected device numa affinity for device %v, but it is not found", device)
+			return
+		}
+
+		assert.ElementsMatch(t, expected, actual, "device numa affinities are not equal")
+	}
+}
+
+// note: parameters are ordered (actual, expected) to match the call sites above.
+func evaluateDeviceAffinity(t *testing.T, actualDeviceAffinity, expectedDeviceAffinity map[AffinityPriority][]DeviceIDs) {
+	if len(actualDeviceAffinity) != len(expectedDeviceAffinity) {
+		t.Errorf("expected %d affinities, got %d", len(expectedDeviceAffinity), len(actualDeviceAffinity))
+		return
+	}
+
+	for priority, expected := range expectedDeviceAffinity {
+		actual, ok := actualDeviceAffinity[priority]
+		if !ok {
+			t.Errorf("expected affinities for priority %v, but it is not found", priority)
+			return
+		}
+
+		if !equalDeviceIDsGroupsIgnoreOrder(t, expected, actual) {
+			return
+		}
+	}
+}
+
+func equalDeviceIDsGroupsIgnoreOrder(t *testing.T, expected, actual 
[]DeviceIDs) bool {
+	if len(expected) != len(actual) {
+		t.Errorf("expected %d devices, got %d", len(expected), len(actual))
+		return false
+	}
+
+	// Convert each DeviceIDs slice into a normalized, comparable form
+	normalize := func(groups []DeviceIDs) []string {
+		res := make([]string, len(groups))
+		for i, group := range groups {
+			sorted := append([]string{}, group...)
+			sort.Strings(sorted)
+			res[i] = strings.Join(sorted, ",")
+		}
+		sort.Strings(res)
+		return res
+	}
+
+	normalizedExp := normalize(expected)
+	normalizedAct := normalize(actual)
+
+	for i := range normalizedExp {
+		if normalizedExp[i] != normalizedAct[i] {
+			t.Errorf("expected %s, got %s", normalizedExp[i], normalizedAct[i])
+			return false
+		}
+	}
+
+	return true
+}
From 622e87e27fb15b2d211ddb37500efe9efbaa62ab Mon Sep 17 00:00:00 2001
From: luomingmeng
Date: Tue, 11 Nov 2025 11:31:30 +0800
Subject: [PATCH 38/52] refactor(gpumemory): move nil device request check after qos validation

The nil device request check is moved after qos validation and the
existing-allocation fast path, so requests that already hold an
allocation get their response back first; only fresh allocations now
short-circuit on a nil device request, returning an empty response so
the container is re-allocated after device allocation.

---
 .../gpu/resourceplugin/gpumemory/gpu_mem.go | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
index 31696d6cca..61a48e2ff0 100644
--- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
+++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go
@@ -434,14 +434,6 @@ func (p *GPUMemPlugin) Allocate(
 		return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil
 	}
 
-	if deviceReq == nil {
-		general.InfoS("Nil device request, returning empty response",
-			"podNamespace", resourceReq.PodNamespace,
-			"podName", resourceReq.PodName,
-			"containerName", resourceReq.ContainerName)
-		return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil
-	}
-
 	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys)
 	if err != nil {
 		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
@@ -504,6 +496,15 @@
 		return resp, nil
 	}
 
+	if deviceReq == nil {
+		general.InfoS("Nil device request, returning empty response",
+			"podNamespace", resourceReq.PodNamespace,
+			"podName", resourceReq.PodName,
+			"containerName", resourceReq.ContainerName)
+		// if deviceReq is nil, return empty response to re-allocate after device allocation
+		return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil
+	}
+
 	// Get GPU topology
 	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
 	if err != nil {
From 766b8a75bda8705bf536ff9499069e9264dab914 Mon Sep 17 00:00:00 2001
From: luomingmeng
Date: Tue, 11 Nov 2025 14:25:37 +0800
Subject: [PATCH 39/52] fix(gpu): skip zero requests in GetGPUCount and optimize logging

remove redundant gpuCount and gpuNames logging in GetTopologyHints and Allocate
move GetGPUCount call after initial checks in GetTopologyHints
use deviceReq.DeviceRequest instead of gpuCount for memory calculation

---
 .../gpu/resourceplugin/gpumemory/gpu_mem.go | 31 ++++++++-----------
 pkg/agent/qrm-plugins/gpu/util/util.go      |  5 +++
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git 
a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 61a48e2ff0..88de958ea3 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -71,21 +71,13 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.Conf.GPUDeviceNames) - if err != nil { - general.Errorf("getGPUCount failed from req %v with error: %v", req, err) - return nil, fmt.Errorf("getGPUCount failed with error: %v", err) - } - general.InfoS("called", "podNamespace", req.PodNamespace, "podName", req.PodName, "containerName", req.ContainerName, "qosLevel", qosLevel, "reqAnnotations", req.Annotations, - "gpuMemory", gpuMemory, - "gpuNames", gpuNames.List(), - "gpuCount", gpuCount) + "gpuMemory", gpuMemory) p.Lock() defer func() { @@ -126,6 +118,14 @@ func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *p } } + gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.Conf.GPUDeviceNames) + if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + + general.Infof("gpuCount: %f, gpuNames: %v", gpuCount, gpuNames.List()) + // otherwise, calculate hint for container without allocated memory if hints == nil { var calculateErr error @@ -447,12 +447,6 @@ func (p *GPUMemPlugin) Allocate( return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) } - gpuCount, gpuNames, err := gpuutil.GetGPUCount(resourceReq, p.Conf.GPUDeviceNames) - if err != nil { - general.Errorf("getGPUCount failed from req %v with error: %v", resourceReq, err) - return nil, fmt.Errorf("getGPUCount failed with error: %v", err) - } - general.InfoS("called", "podNamespace", resourceReq.PodNamespace, "podName", resourceReq.PodName, @@ -460,8 +454,7 @@ func (p *GPUMemPlugin) Allocate( "qosLevel", qosLevel, "reqAnnotations", resourceReq.Annotations, "gpuMemory", gpuMemory, - "gpuNames", gpuNames.List(), - "gpuCount", gpuCount) + "deviceReq", deviceReq.String()) p.Lock() defer func() { @@ -505,6 +498,8 @@ func (p *GPUMemPlugin) Allocate( return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil } + general.Infof("deviceReq: %v", deviceReq.String()) + // Get GPU topology gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) if err != nil { @@ -552,7 +547,7 @@ func (p *GPUMemPlugin) Allocate( TopologyAwareAllocations: make(map[string]state.Allocation), } - gpuMemoryPerGPU := gpuMemory / gpuCount + gpuMemoryPerGPU := gpuMemory / float64(deviceReq.DeviceRequest) for _, deviceID := range result.AllocatedDevices { info, ok := gpuTopology.Devices[deviceID] if !ok { diff --git a/pkg/agent/qrm-plugins/gpu/util/util.go b/pkg/agent/qrm-plugins/gpu/util/util.go index 26d92d9126..c8c1331593 100644 --- a/pkg/agent/qrm-plugins/gpu/util/util.go +++ b/pkg/agent/qrm-plugins/gpu/util/util.go @@ -83,6 +83,11 @@ func GetGPUCount(req *pluginapi.ResourceRequest, deviceNames []string) (float64, if err != nil && !errors.IsNotFound(err) { return 0, nil, err } + + if request == 0 { + continue + } + gpuCount += request gpuNames.Insert(resourceName) } From a74fc3e1827d1f800da5fc7d1538c8cfdeb6c6b4 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Fri, 
14 Nov 2025 17:16:48 +0800 Subject: [PATCH 40/52] feat(gpumemory): add numa binding check and health status filter filter out unhealthy gpu devices when calculating topology hints skip numa binding hints for non-numa binding requests --- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 88de958ea3..e37050396e 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -22,6 +22,7 @@ import ( "sort" "sync" + deviceplugin "k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" @@ -38,6 +39,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" "github.com/kubewharf/katalyst-core/pkg/util/metric" + qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos" ) type GPUMemPlugin struct { @@ -58,6 +60,14 @@ func (p *GPUMemPlugin) ResourceName() string { } func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { + // if not numa binding, return nil hints to let kubelet choose numa node randomly + if !qosutil.AnnotationsIndicateNUMABinding(req.Annotations) { + return util.PackResourceHintsResponse(req, string(consts.ResourceGPUMemory), + map[string]*pluginapi.ListOfTopologyHints{ + string(consts.ResourceGPUMemory): nil, // indicates that there is no numa preference + }) + } + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", @@ -157,7 +167,12 @@ func (p *GPUMemPlugin) calculateHints( numaToAvailableGPUCount := make(map[int]float64) numaToMostAllocatedGPUMemory := make(map[int]float64) for gpuID, info := range gpuTopology.Devices { + if info.Health != deviceplugin.Healthy { + continue + } + s := machineState[gpuID] + // todo: get allocated quantity according to qos level allocated := s.GetQuantityAllocated() if allocated+perGPUMemory <= float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) { for _, numaNode := range info.GetNUMANodes() { From 11efc3511279fc76006210ac704ef95fce784f83 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 17 Nov 2025 10:41:24 +0800 Subject: [PATCH 41/52] fix(gpumemory): handle unhealthy devices and correct capacity values Set allocatable memory to zero for unhealthy GPU devices and use separate capacity values instead of reusing allocatable values. This ensures accurate resource accounting for both healthy and unhealthy devices. 
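As a rough sketch of the intended accounting (a standalone toy example,
not the plugin's code: the device names, the 16 GiB figure, and the
"Healthy" string stand in for the real device-plugin health constant and
the configured GPUMemoryAllocatablePerGPU):

    package main

    import "fmt"

    // toy stand-in for the topology fields this patch consults
    type deviceInfo struct {
    	health    string
    	numaNodes []int
    }

    func main() {
    	const perGPU = 16.0 // assumed GiB per GPU
    	devices := map[string]deviceInfo{
    		"gpu-0": {health: "Healthy", numaNodes: []int{0, 1}},
    		"gpu-1": {health: "Unhealthy", numaNodes: []int{0, 1}},
    	}
    	for id, d := range devices {
    		capacity := perGPU    // capacity keeps reporting the full device size
    		allocatable := perGPU // allocatable is zeroed for unhealthy devices
    		if d.health != "Healthy" {
    			allocatable = 0
    		}
    		// both quantities are split evenly across the device's NUMA nodes
    		for _, numa := range d.numaNodes {
    			fmt.Printf("%s numa%d: capacity=%.1f allocatable=%.1f\n", id, numa,
    				capacity/float64(len(d.numaNodes)), allocatable/float64(len(d.numaNodes)))
    		}
    	}
    }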
--- .../qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index e37050396e..bec09e3fdf 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -378,8 +378,13 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca aggregatedAllocatableQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) aggregatedCapacityQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) gpuMemoryAllocatablePerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + gpuMemoryCapacityPerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + if deviceInfo.Health != deviceplugin.Healthy { + gpuMemoryAllocatablePerGPUNUMA = 0 + } if len(deviceInfo.NumaNodes) > 0 { gpuMemoryAllocatablePerGPUNUMA = gpuMemoryAllocatablePerGPUNUMA / float64(len(deviceInfo.NumaNodes)) + gpuMemoryCapacityPerGPUNUMA = gpuMemoryCapacityPerGPUNUMA / float64(len(deviceInfo.NumaNodes)) for _, numaID := range deviceInfo.NumaNodes { if numaID < 0 { numaID = 0 @@ -394,7 +399,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + ResourceValue: gpuMemoryCapacityPerGPUNUMA, Name: deviceID, Node: uint64(numaID), Type: string(v1alpha1.TopologyTypeGPU), @@ -414,7 +419,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + ResourceValue: gpuMemoryCapacityPerGPUNUMA, Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ From 4f8457d69015934a0e93ae59198524599dbdddc9 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 17 Nov 2025 12:40:11 +0800 Subject: [PATCH 42/52] refactor(qrm): remove unused state file directory fields Clean up GenericQRMPluginConfiguration by removing unused StateFileDirectory and InMemoryStateFileDirectory fields to simplify the struct. 
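For reviewers, a minimal compile-level sketch of the struct after this
cleanup (field list abridged to what is visible in the diff below; the
construction values are hypothetical):

    package main

    type GenericQRMPluginConfiguration struct {
    	QRMPluginSocketDirs      []string
    	ExtraStateFileAbsPath    string
    	PodDebugAnnoKeys         []string
    	UseKubeletReservedConfig bool
    	PodAnnotationKeptKeys    []string
    	PodLabelKeptKeys         []string
    }

    func main() {
    	// Any literal still setting the removed fields now fails to compile,
    	// which is how lingering users of StateFileDirectory or
    	// InMemoryStateFileDirectory would surface.
    	_ = GenericQRMPluginConfiguration{
    		QRMPluginSocketDirs: []string{"/var/lib/katalyst/plugin-socks"},
    	}
    }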
---
 pkg/config/agent/qrm/qrm_base.go | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pkg/config/agent/qrm/qrm_base.go b/pkg/config/agent/qrm/qrm_base.go
index ffa851047e..9f90e6a512 100644
--- a/pkg/config/agent/qrm/qrm_base.go
+++ b/pkg/config/agent/qrm/qrm_base.go
@@ -22,12 +22,10 @@ import (
 )
 
 type GenericQRMPluginConfiguration struct {
-	StateFileDirectory         string
-	InMemoryStateFileDirectory string
-	QRMPluginSocketDirs        []string
-	ExtraStateFileAbsPath      string
-	PodDebugAnnoKeys           []string
-	UseKubeletReservedConfig   bool
+	QRMPluginSocketDirs      []string
+	ExtraStateFileAbsPath    string
+	PodDebugAnnoKeys         []string
+	UseKubeletReservedConfig bool
 	// PodAnnotationKeptKeys indicates pod annotation keys will be kept in qrm state
 	PodAnnotationKeptKeys []string
 	// PodLabelKeptKeys indicates pod label keys will be kept in qrm state
From 880bba413ae89b646b0bd88f336bf62faa470b75 Mon Sep 17 00:00:00 2001
From: luomingmeng
Date: Thu, 20 Nov 2025 11:17:08 +0800
Subject: [PATCH 43/52] fix(gpumemory): handle numa topology not ready case gracefully

Return nil instead of an error when the numa topology is not ready: log
the failure for the affected plugin and continue aggregating results
from the remaining resource plugins.

---
 pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go      | 11 +++++++++--
 pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go | 11 ++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
index 3c572d1591..9bd6520678 100644
--- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go
@@ -262,7 +262,8 @@ func (p *StaticPolicy) GetTopologyAwareResources(
 	for _, resourcePlugin := range p.resourcePlugins {
 		allocatedResource, err := resourcePlugin.GetTopologyAwareResources(req.PodUid, req.ContainerName)
 		if err != nil {
-			return nil, fmt.Errorf("failed to get topology aware resources for plugin %s: %w", resourcePlugin.ResourceName(), err)
+			general.Errorf("failed to get topology aware resources for plugin %s: %v", resourcePlugin.ResourceName(), err)
+			continue
 		}
 
 		if allocatedResource == nil {
@@ -347,8 +348,14 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources(
 	for _, resourcePlugin := range p.resourcePlugins {
 		allocatableResource, err := resourcePlugin.GetTopologyAwareAllocatableResources()
 		if err != nil {
-			return nil, fmt.Errorf("failed to get topology aware allocatable resources for plugin %s: %w", resourcePlugin.ResourceName(), err)
+			general.Errorf("failed to get topology aware allocatable resources for plugin %s: %v", resourcePlugin.ResourceName(), err)
+			continue
+		}
+
+		if allocatableResource == nil {
+			continue
 		}
+
 		allocatableResources[allocatableResource.ResourceName] = allocatableResource.AllocatableTopologyAwareResource
 	}
 
diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go
index e75b1fede3..2f8a90930d 100644
--- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go
+++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go
@@ -326,14 +326,15 @@ func TestStaticPolicy_GetTopologyAwareResources(t *testing.T) {
 	assert.NoError(t, err)
 	assert.NotNil(t, resp)
 
-	// Test error handling
+	// Invalid request does not return error
 	invalidReq := &pluginapi.GetTopologyAwareResourcesRequest{
 		PodUid:        podUID,
 		ContainerName: "invalid-container",
 	}
 
-	_, err = policy.GetTopologyAwareResources(context.Background(), invalidReq)
-	assert.Error(t, err)
+	resp, err = 
policy.GetTopologyAwareResources(context.Background(), invalidReq) + assert.NoError(t, err) + assert.NotNil(t, resp) } func TestStaticPolicy_mergeTopologyAwareResourcesResponse(t *testing.T) { @@ -522,8 +523,8 @@ func TestStaticPolicy_GetTopologyAwareAllocatableResources(t *testing.T) { getTopologyAwareAllocatableResourcesReq := &pluginapi.GetTopologyAwareAllocatableResourcesRequest{} resp, err := policy.GetTopologyAwareAllocatableResources(context.Background(), getTopologyAwareAllocatableResourcesReq) - assert.Error(t, err) - assert.Nil(t, resp) + assert.NoError(t, err) + assert.NotNil(t, resp) } func TestStaticPolicy_UpdateAllocatableAssociatedDevices(t *testing.T) { From 9ecafcc536ba29c10c004ae1ff39c064946d09bb Mon Sep 17 00:00:00 2001 From: "justin.cheng" Date: Mon, 24 Nov 2025 10:05:06 +0800 Subject: [PATCH 44/52] chore: add context to interface methods --- .../custom_device_plugin_stub.go | 13 +++---- .../gpu/customdeviceplugin/gpu/gpu.go | 7 ++-- .../gpu/customdeviceplugin/gpu/gpu_test.go | 5 +-- .../gpu/customdeviceplugin/interface.go | 8 +++-- .../gpu/customdeviceplugin/rdma/rdma.go | 7 ++-- .../gpu/customdeviceplugin/rdma/rdma_test.go | 5 +-- .../gpu/resourceplugin/gpumemory/gpu_mem.go | 9 ++--- .../gpu/resourceplugin/interface.go | 10 +++--- .../resourceplugin/resource_plugin_stub.go | 9 ++--- .../qrm-plugins/gpu/staticpolicy/policy.go | 36 +++++++++---------- 10 files changed, 60 insertions(+), 49 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go index baab6a2dae..b2b5aa6e3b 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go @@ -17,6 +17,7 @@ limitations under the License. 
package customdeviceplugin import ( + "context" "fmt" v1 "k8s.io/api/core/v1" @@ -40,11 +41,11 @@ func (c CustomDevicePluginStub) DeviceNames() []string { return []string{"custom-device-plugin-stub"} } -func (c CustomDevicePluginStub) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { +func (c CustomDevicePluginStub) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { return &pluginapi.AssociatedDeviceHintsResponse{}, nil } -func (c CustomDevicePluginStub) UpdateAllocatableAssociatedDevices(_ *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (c CustomDevicePluginStub) UpdateAllocatableAssociatedDevices(context.Context, *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil } @@ -52,7 +53,7 @@ func (c CustomDevicePluginStub) DefaultAccompanyResourceName() string { return "resource-plugin-stub" } -func (c CustomDevicePluginStub) AllocateAssociatedDevice(resReq *pluginapi.ResourceRequest, _ *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { +func (c CustomDevicePluginStub) AllocateAssociatedDevice(_ context.Context, resReq *pluginapi.ResourceRequest, _ *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { // Simply check if the accompany resource has been allocated // If it has been allocated, allocate the associated device accompanyResourceAllocation := c.State.GetAllocationInfo(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName) @@ -78,11 +79,11 @@ func (c CustomDevicePluginStub2) DeviceNames() []string { return []string{"custom-device-plugin-stub-2"} } -func (c CustomDevicePluginStub2) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { +func (c CustomDevicePluginStub2) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { return &pluginapi.AssociatedDeviceHintsResponse{}, nil } -func (c CustomDevicePluginStub2) UpdateAllocatableAssociatedDevices(_ *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (c CustomDevicePluginStub2) UpdateAllocatableAssociatedDevices(context.Context, *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil } @@ -90,7 +91,7 @@ func (c CustomDevicePluginStub2) DefaultAccompanyResourceName() string { return "resource-plugin-stub" } -func (c CustomDevicePluginStub2) AllocateAssociatedDevice(resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { +func (c CustomDevicePluginStub2) AllocateAssociatedDevice(_ context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string) (*pluginapi.AssociatedDeviceAllocationResponse, error) { // Simply allocate the associated device c.State.SetAllocationInfo(v1.ResourceName(deviceReq.DeviceName), resReq.PodUid, resReq.ContainerName, &state.AllocationInfo{}, false) 
return &pluginapi.AssociatedDeviceAllocationResponse{}, nil diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index 0ebdfe13cc..d29229c074 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -17,6 +17,7 @@ limitations under the License. package gpu import ( + "context" "fmt" v1 "k8s.io/api/core/v1" @@ -66,16 +67,16 @@ func (p *GPUDevicePlugin) DeviceNames() []string { return p.deviceNames } -func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.GPUDeviceType) } -func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { +func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { return &pluginapi.AssociatedDeviceHintsResponse{}, nil } func (p *GPUDevicePlugin) AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string, + ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go index a1f7246b2f..621401afaf 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go @@ -17,6 +17,7 @@ limitations under the License. package gpu import ( + "context" "testing" "github.com/stretchr/testify/assert" @@ -118,7 +119,7 @@ func TestGPUDevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { }, } - resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(req) + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(context.Background(), req) assert.NoError(t, err) assert.NotNil(t, resp) @@ -295,7 +296,7 @@ func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { ContainerName: tt.containerName, } - resp, err := devicePlugin.AllocateAssociatedDevice(resourceReq, tt.deviceReq, "test") + resp, err := devicePlugin.AllocateAssociatedDevice(context.Background(), resourceReq, tt.deviceReq, "test") if tt.expectedErr { assert.Error(t, err) } else { diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go index c2f221e49a..ea446e2df6 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -17,6 +17,8 @@ limitations under the License. 
package customdeviceplugin import ( + "context" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" ) @@ -24,13 +26,13 @@ type CustomDevicePlugin interface { // DeviceNames returns a list of all possible names for this device DeviceNames() []string - GetAssociatedDeviceTopologyHints(*pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) + GetAssociatedDeviceTopologyHints(ctx context.Context, request *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) - UpdateAllocatableAssociatedDevices(*pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) + UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) DefaultAccompanyResourceName() string AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, + ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) } diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go index a649f8d364..b04d9a014e 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go @@ -17,6 +17,7 @@ limitations under the License. package rdma import ( + "context" "fmt" "math" @@ -63,17 +64,17 @@ func (p *RDMADevicePlugin) DeviceNames() []string { return p.deviceNames } -func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { +func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType) } -func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(_ *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { +func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { return &pluginapi.AssociatedDeviceHintsResponse{}, nil } // AllocateAssociatedDevice check if rdma is allocated to other containers, make sure they do not share rdma func (p *RDMADevicePlugin) AllocateAssociatedDevice( - resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, + ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) (*pluginapi.AssociatedDeviceAllocationResponse, error) { qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) if err != nil { diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go index 600f564a90..37975a32cc 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package rdma import ( + "context" "testing" "github.com/stretchr/testify/assert" @@ -123,7 +124,7 @@ func TestRDMADevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { }, } - resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(req) + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(context.Background(), req) assert.NoError(t, err) assert.NotNil(t, resp) @@ -528,7 +529,7 @@ func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) { ContainerName: tt.containerName, } - resp, err := devicePlugin.AllocateAssociatedDevice(resourceReq, tt.deviceReq, tt.accompanyResourceName) + resp, err := devicePlugin.AllocateAssociatedDevice(context.Background(), resourceReq, tt.deviceReq, tt.accompanyResourceName) if tt.expectedErr { assert.Error(t, err) } else { diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index bec09e3fdf..46deb5ad86 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -17,6 +17,7 @@ limitations under the License. package gpumemory import ( + "context" "fmt" "math" "sort" @@ -59,7 +60,7 @@ func (p *GPUMemPlugin) ResourceName() string { return string(consts.ResourceGPUMemory) } -func (p *GPUMemPlugin) GetTopologyHints(req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { +func (p *GPUMemPlugin) GetTopologyHints(ctx context.Context, req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { // if not numa binding, return nil hints to let kubelet choose numa node randomly if !qosutil.AnnotationsIndicateNUMABinding(req.Annotations) { return util.PackResourceHintsResponse(req, string(consts.ResourceGPUMemory), @@ -295,7 +296,7 @@ func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints( } } -func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { +func (p *GPUMemPlugin) GetTopologyAwareResources(ctx context.Context, podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName) @@ -356,7 +357,7 @@ func (p *GPUMemPlugin) GetTopologyAwareResources(podUID, containerName string) ( return resp, nil } -func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) { +func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) (*gpuconsts.AllocatableResource, error) { general.InfofV(4, "called") p.Lock() @@ -443,7 +444,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources() (*gpuconsts.Alloca } func (p *GPUMemPlugin) Allocate( - resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + ctx context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, ) (*pluginapi.ResourceAllocationResponse, error) { quantity, exists := resourceReq.ResourceRequests[p.ResourceName()] if !exists || quantity == 0 { diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go index ba76ac6351..f368b40939 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -17,6 +17,8 @@ limitations under the License. 
package resourceplugin import ( + "context" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" @@ -26,13 +28,13 @@ import ( type ResourcePlugin interface { ResourceName() string - GetTopologyHints(*pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) + GetTopologyHints(ctx context.Context, request *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) - GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) + GetTopologyAwareResources(ctx context.Context, podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) - GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) + GetTopologyAwareAllocatableResources(ctx context.Context) (*gpuconsts.AllocatableResource, error) Allocate( - resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + ctx context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, ) (*pluginapi.ResourceAllocationResponse, error) } diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go index db65623868..57b28089c6 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go @@ -17,6 +17,7 @@ limitations under the License. package resourceplugin import ( + "context" "fmt" v1 "k8s.io/api/core/v1" @@ -39,11 +40,11 @@ func (r ResourcePluginStub) ResourceName() string { return "resource-plugin-stub" } -func (r ResourcePluginStub) GetTopologyHints(_ *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) { +func (r ResourcePluginStub) GetTopologyHints(context.Context, *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) { return &pluginapi.ResourceHintsResponse{}, nil } -func (r ResourcePluginStub) GetTopologyAwareResources(podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { +func (r ResourcePluginStub) GetTopologyAwareResources(_ context.Context, podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { // Simply returns a fixed response if the podUID and containerName is found in the state, otherwise return an error allocationInfo := r.State.GetAllocationInfo(v1.ResourceName(r.ResourceName()), podUID, containerName) if allocationInfo == nil { @@ -60,14 +61,14 @@ func (r ResourcePluginStub) GetTopologyAwareResources(podUID, containerName stri }, nil } -func (r ResourcePluginStub) GetTopologyAwareAllocatableResources() (*gpuconsts.AllocatableResource, error) { +func (r ResourcePluginStub) GetTopologyAwareAllocatableResources(context.Context) (*gpuconsts.AllocatableResource, error) { // Simply return a fixed response return &gpuconsts.AllocatableResource{ ResourceName: r.ResourceName(), }, nil } -func (r ResourcePluginStub) Allocate(resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest) (*pluginapi.ResourceAllocationResponse, error) { +func (r ResourcePluginStub) Allocate(_ context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest) (*pluginapi.ResourceAllocationResponse, error) { // Simply save resource request and device request in state if resourceReq.ResourceName == r.ResourceName() { r.State.SetAllocationInfo(v1.ResourceName(r.ResourceName()), resourceReq.PodUid, 
resourceReq.ContainerName, &state.AllocationInfo{}, false) diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index 9bd6520678..e8624fdb92 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -189,7 +189,7 @@ func (p *StaticPolicy) ResourceName() string { // GetTopologyHints returns hints of corresponding resources func (p *StaticPolicy) GetTopologyHints( - _ context.Context, + ctx context.Context, req *pluginapi.ResourceRequest, ) (resp *pluginapi.ResourceHintsResponse, err error) { general.InfofV(4, "called") @@ -204,7 +204,7 @@ func (p *StaticPolicy) GetTopologyHints( if resourcePlugin == nil { return nil, fmt.Errorf("failed to find resource plugin by name %s", req.ResourceName) } - return resourcePlugin.GetTopologyHints(req) + return resourcePlugin.GetTopologyHints(ctx, req) } // GetPodTopologyHints returns hints of corresponding resources @@ -216,7 +216,7 @@ func (p *StaticPolicy) GetPodTopologyHints( } func (p *StaticPolicy) RemovePod( - _ context.Context, + ctx context.Context, req *pluginapi.RemovePodRequest, ) (*pluginapi.RemovePodResponse, error) { if req == nil { @@ -246,7 +246,7 @@ func (p *StaticPolicy) GetResourcesAllocation( // GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format func (p *StaticPolicy) GetTopologyAwareResources( - _ context.Context, + ctx context.Context, req *pluginapi.GetTopologyAwareResourcesRequest, ) (*pluginapi.GetTopologyAwareResourcesResponse, error) { general.InfofV(4, "called") @@ -260,7 +260,7 @@ func (p *StaticPolicy) GetTopologyAwareResources( // Get topology aware resources for all resource plugins allocatedResourcesList := make([]*pluginapi.GetTopologyAwareResourcesResponse, 0) for _, resourcePlugin := range p.resourcePlugins { - allocatedResource, err := resourcePlugin.GetTopologyAwareResources(req.PodUid, req.ContainerName) + allocatedResource, err := resourcePlugin.GetTopologyAwareResources(ctx, req.PodUid, req.ContainerName) if err != nil { general.Errorf("failed to get topology aware resources for plugin %s: %v", resourcePlugin.ResourceName(), err) continue @@ -332,7 +332,7 @@ func (p *StaticPolicy) mergeTopologyAwareResourcesResponse( // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format func (p *StaticPolicy) GetTopologyAwareAllocatableResources( - _ context.Context, + ctx context.Context, req *pluginapi.GetTopologyAwareAllocatableResourcesRequest, ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { general.InfofV(4, "called") @@ -346,7 +346,7 @@ func (p *StaticPolicy) GetTopologyAwareAllocatableResources( // Get topology aware allocatable resources for all resource plugins allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource) for _, resourcePlugin := range p.resourcePlugins { - allocatableResource, err := resourcePlugin.GetTopologyAwareAllocatableResources() + allocatableResource, err := resourcePlugin.GetTopologyAwareAllocatableResources(ctx) if err != nil { general.Errorf("failed to get topology aware allocatable resources for plugin %s: %v", resourcePlugin.ResourceName(), err) continue @@ -381,7 +381,7 @@ func (p *StaticPolicy) GetResourcePluginOptions( // plugin can allocate corresponding resource for the container // according to resource request func (p *StaticPolicy) Allocate( - _ context.Context, + ctx context.Context, req 
*pluginapi.ResourceRequest, ) (resp *pluginapi.ResourceAllocationResponse, err error) { if req == nil { @@ -401,7 +401,7 @@ func (p *StaticPolicy) Allocate( return nil, fmt.Errorf("failed to get resource plugin by name %s", req.ResourceName) } - resp, err = resourcePlugin.Allocate(req, nil) + resp, err = resourcePlugin.Allocate(ctx, req, nil) return resp, err } @@ -579,7 +579,7 @@ func (p *StaticPolicy) clearResidualState( } func (p *StaticPolicy) UpdateAllocatableAssociatedDevices( - _ context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, + ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, ) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { if request == nil || len(request.Devices) == 0 { return nil, fmt.Errorf("request is nil") @@ -590,7 +590,7 @@ func (p *StaticPolicy) UpdateAllocatableAssociatedDevices( return nil, fmt.Errorf("no custom device plugin found for device %s", request.DeviceName) } - return customDevicePlugin.UpdateAllocatableAssociatedDevices(request) + return customDevicePlugin.UpdateAllocatableAssociatedDevices(ctx, request) } func (*StaticPolicy) GetAssociatedDeviceTopologyHints( @@ -603,7 +603,7 @@ func (*StaticPolicy) GetAssociatedDeviceTopologyHints( // 1. Find the resource plugin that corresponds to the accompanyResourceName and allocate // 2. Find the custom device plugin that corresponds to the deviceName and allocate func (p *StaticPolicy) AllocateAssociatedDevice( - _ context.Context, req *pluginapi.AssociatedDeviceRequest, + ctx context.Context, req *pluginapi.AssociatedDeviceRequest, ) (resp *pluginapi.AssociatedDeviceAllocationResponse, respErr error) { var isAccompanyResourcePlugin bool var isAccompanyCustomDevicePlugin bool @@ -648,7 +648,7 @@ func (p *StaticPolicy) AllocateAssociatedDevice( // Check if accompany resource maps to a resource plugin; if it does, allocate it first accompanyResourcePlugin := p.getResourcePlugin(req.AccompanyResourceName) if accompanyResourcePlugin != nil { - _, err := accompanyResourcePlugin.Allocate(req.ResourceRequest, targetDeviceReq) + _, err := accompanyResourcePlugin.Allocate(ctx, req.ResourceRequest, targetDeviceReq) if err != nil { return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", req.AccompanyResourceName, err) } @@ -669,7 +669,7 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("nil accompany device request") } - _, err := p.allocateAssociatedDevice(accompanyCustomDevicePlugin, req.ResourceRequest, accompanyDeviceReq, "") + _, err := p.allocateAssociatedDevice(ctx, accompanyCustomDevicePlugin, req.ResourceRequest, accompanyDeviceReq, "") if err != nil { return nil, fmt.Errorf("AllocateAssociatedDevice accompany resource %s failed with error: %v", req.AccompanyResourceName, err) } @@ -683,18 +683,18 @@ func (p *StaticPolicy) AllocateAssociatedDevice( return nil, fmt.Errorf("no custom device plugin found for target device %s", req.DeviceName) } - return p.allocateAssociatedDevice(targetCustomDevicePlugin, req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) + return p.allocateAssociatedDevice(ctx, targetCustomDevicePlugin, req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) } func (p *StaticPolicy) allocateAssociatedDevice( - devicePlugin customdeviceplugin.CustomDevicePlugin, + ctx context.Context, devicePlugin customdeviceplugin.CustomDevicePlugin, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, ) 
(*pluginapi.AssociatedDeviceAllocationResponse, error) { defaultAccompanyResourceName := devicePlugin.DefaultAccompanyResourceName() if defaultAccompanyResourceName != "" && accompanyResourceName != defaultAccompanyResourceName { accompanyResourcePlugin := p.getResourcePlugin(defaultAccompanyResourceName) if accompanyResourcePlugin != nil { - _, err := accompanyResourcePlugin.Allocate(resReq, deviceReq) + _, err := accompanyResourcePlugin.Allocate(ctx, resReq, deviceReq) if err != nil { _ = p.removeContainer(resReq.PodUid, resReq.ContainerName, v1.ResourceName(defaultAccompanyResourceName)) return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", defaultAccompanyResourceName, err) @@ -702,7 +702,7 @@ func (p *StaticPolicy) allocateAssociatedDevice( } } - return devicePlugin.AllocateAssociatedDevice(resReq, deviceReq, accompanyResourceName) + return devicePlugin.AllocateAssociatedDevice(ctx, resReq, deviceReq, accompanyResourceName) } func (p *StaticPolicy) registerDefaultResourcePlugins() error { From 3a7254ffd432d1c05f7d8a8f87c26382fdde88e5 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 24 Nov 2025 11:01:30 +0800 Subject: [PATCH 45/52] feat(gpu): add ShareGPUManager for device sharing eligibility

implement ShareGPUManager to determine GPU device sharing eligibility based on pod indicators
add periodic sync and caching for efficient decision making
integrate with base plugin and update device state handling

--- pkg/agent/qrm-plugins/gpu/baseplugin/base.go | 8 + .../gpu/baseplugin/share_gpu_manager.go | 251 ++++++++++++++ .../gpu/baseplugin/share_gpu_manager_test.go | 310 ++++++++++++++++++ .../gpu/customdeviceplugin/gpu/gpu.go | 3 + .../gpu/resourceplugin/gpumemory/gpu_mem.go | 21 +- .../qrm-plugins/gpu/staticpolicy/policy.go | 5 + 6 files changed, 588 insertions(+), 10 deletions(-) create mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go create mode 100644 pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go index f4c9101759..a63aef6bb8 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -53,6 +53,8 @@ type BasePlugin struct { State state.State // Registry of device topology providers DeviceTopologyRegistry *machine.DeviceTopologyRegistry + // ShareGPUManager decides whether each GPU device is eligible for shared use + ShareGPUManager ShareGPUManager // Registry of default resource state generators DefaultResourceStateGeneratorRegistry *state.DefaultResourceStateGeneratorRegistry @@ -76,6 +78,7 @@ func NewBasePlugin( DeviceTopologyRegistry: machine.NewDeviceTopologyRegistry(), DefaultResourceStateGeneratorRegistry: state.NewDefaultResourceStateGeneratorRegistry(), + ShareGPUManager: NewShareGPUManager(), deviceNameToTypeMap: make(map[string]string), }, nil @@ -93,6 +96,11 @@ func (p *BasePlugin) InitState() error { return nil } +func (p *BasePlugin) Run(stopCh <-chan struct{}) error { + go p.ShareGPUManager.Run(p, stopCh) + return nil +} + func (p *BasePlugin) PackAllocationResponse( req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, resourceAllocationAnnotations map[string]string, resourceName string, diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go new file mode 100644 index 0000000000..f1bd9432f4 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go @@ -0,0
+1,251 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baseplugin + +import ( + "context" + "sync" + "time" + + "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// This module determines whether each GPU device is eligible for shared usage +// (ShareGPU) based on per-pod extended indicators fetched from MetaServer. It +// periodically scans the GPU machine state maintained by BasePlugin and builds +// a snapshot map `shareGPUMap` that records the ShareGPU decision for each +// device ID. +// Decision rule: Only main containers are considered; if any main container on +// a device disables ShareGPU, the device is marked non-shareable. To reduce +// repeated external queries, a per-sync in-memory cache keyed by `pod UID` is +// used to memoize `EnableShareGPU` decisions. + +// ShareGPUManager determines per-device ShareGPU eligibility and maintains a +// periodic snapshot. Safe for concurrent reads via `EnableShareGPU`. +// +// Time Complexity: +// - `sync`: O(D + C) where D is number of devices and C is number of main +// containers scanned; per-pod indicator lookups are amortized O(1) via cache. +// - `EnableShareGPU`: O(1) map read. +// - `Allocate`: O(A) over `TopologyAwareAllocations` entries when marking. +// +// Potential Errors: +// - External indicator fetch may fail; treated conservatively and wrapped. +// - A failed fetch marks the affected pod, and thus its devices, non-shareable. +type ShareGPUManager interface { + // Allocate processes an allocation of a main container. If the pod's + // indicator disables ShareGPU, all involved device IDs in + // `TopologyAwareAllocations` are marked non-shareable in the snapshot. + // Params: + // - ctx: request-scoped context for external calls + // - allocationInfo: container allocation metadata; ignored if nil or non-main + // Returns: none; updates internal snapshot + // Errors: any external errors are logged and wrapped internally + Allocate(ctx context.Context, allocationInfo *state.AllocationInfo) + + // EnableShareGPU returns the cached ShareGPU decision for a given device ID. + // Params: + // - id: device ID string + // Returns: true if the device is shareable; false if unknown or disallowed + EnableShareGPU(id string) bool + + // Run starts the periodic synchronization loop until `stopCh` is closed. + // Params: + // - basePlugin: plugin providing machine state and MetaServer + // - stopCh: channel to stop the loop + // Notes: blocking method; should be called in a separate goroutine.
+ Run(*BasePlugin, <-chan struct{}) } +type shareGPUManager struct { + sync.RWMutex + + shareGPUMap map[string]bool + basePlugin *BasePlugin +} + +// NewShareGPUManager creates a new ShareGPUManager instance. +// Returns: a manager with an empty snapshot cache. +func NewShareGPUManager() ShareGPUManager { + return &shareGPUManager{ + shareGPUMap: make(map[string]bool), + } +} + +// EnableShareGPU returns the cached ShareGPU decision for a given device ID. +// If the device ID is not present in the latest snapshot, it returns false. +// Complexity: O(1) map lookup. +func (s *shareGPUManager) EnableShareGPU(id string) bool { + s.RLock() + defer s.RUnlock() + + return s.shareGPUMap[id] +} + +// Allocate marks involved device IDs as non-shareable if the pod disables +// ShareGPU. Non-main containers are ignored. +// Complexity: O(A) where A is number of device IDs in TopologyAwareAllocations. +func (s *shareGPUManager) Allocate(ctx context.Context, allocationInfo *state.AllocationInfo) { + if allocationInfo == nil || !allocationInfo.CheckMainContainer() { + return + } + + enableShareGPU := s.evaluateContainerDeviceShareStatus(ctx, allocationInfo, nil) + if enableShareGPU { + return + } + + s.Lock() + defer s.Unlock() + for id := range allocationInfo.TopologyAwareAllocations { + s.shareGPUMap[id] = false + } +} + +// Run starts the periodic synchronization loop that refreshes ShareGPU +// decisions. The loop: +// - immediately performs a synchronization once for faster readiness; +// - then schedules periodic syncs every 30 seconds; +// - stops when `stopCh` is closed. +// Note: The method is blocking; callers should invoke it in a goroutine. +func (s *shareGPUManager) Run(basePlugin *BasePlugin, stopCh <-chan struct{}) { + ctx, cancel := context.WithCancel(context.Background()) + s.basePlugin = basePlugin + + go func() { + <-stopCh + cancel() + }() + + s.sync(ctx) + wait.UntilWithContext(ctx, s.sync, 30*time.Second) +} + +// sync refreshes the ShareGPU decisions by scanning machine state. +// Complexity: O(D + C) per invocation. +func (s *shareGPUManager) sync(ctx context.Context) { + if s.basePlugin == nil { + general.Infof("share gpu manager sync failed, basePlugin is nil") + return + } + + s.Lock() + defer s.Unlock() + machineState, ok := s.basePlugin.State.GetMachineState()[gpuconsts.GPUDeviceType] + if !ok { + general.Infof("share gpu manager found no GPU machine state; skipping") + return + } + + // Build a fresh snapshot with per-sync indicator cache. + shareGPUMap := make(map[string]bool, len(machineState)) + indicatorCache := make(map[types.UID]bool) + for id, alloc := range machineState { + shareGPUMap[id] = s.evaluateDeviceShareStatus(ctx, alloc, indicatorCache) + } + + s.shareGPUMap = shareGPUMap +} + +// evaluateDeviceShareStatus scans main containers for a device and returns true +// if and only if all of them enable ShareGPU. Any error retrieving a pod's +// indicators marks that pod non-shareable, so the device is conservatively +// reported as non-shareable (pessimistic on errors). +// Complexity: O(Cd) where Cd is number of main containers on the device. +func (s *shareGPUManager) evaluateDeviceShareStatus(ctx context.Context, alloc *state.AllocationState, cache map[types.UID]bool) bool { + if alloc == nil { + return false + } + + // Default to shareable and short-circuit to false once a disallowed pod is found.
+ for _, containerEntries := range alloc.PodEntries { + for _, container := range containerEntries { + if !container.CheckMainContainer() { + continue + } + + enableShareGPU := s.evaluateContainerDeviceShareStatus(ctx, container, cache) + if !enableShareGPU { + return false + } + } + } + + return true +} + +// evaluateContainerDeviceShareStatus checks a single container's pod-level indicator. +// If a cache is provided, it memoizes decisions by pod UID. +// Complexity: O(1) with cache hit; O(ExternalCall) otherwise. +func (s *shareGPUManager) evaluateContainerDeviceShareStatus(ctx context.Context, container *state.AllocationInfo, cache map[types.UID]bool) bool { + podMeta := s.preparePodMeta(container) + + if cache != nil { + if v, ok := cache[podMeta.UID]; ok { + return v + } + } + + enableShareGPU, err := s.getPodEnableShareGPU(ctx, podMeta) + if err != nil { + general.Infof("share gpu manager: fetching extended indicators failed for pod %s/%s: %v", podMeta.Namespace, podMeta.Name, err) + if cache != nil { + cache[podMeta.UID] = false + } + return false + } + + if cache != nil { + cache[podMeta.UID] = enableShareGPU + } + return enableShareGPU +} + +// getPodEnableShareGPU queries MetaServer for the pod's `EnableShareGPU` indicator. +// Returns: boolean indicator value; error when external call fails. +// Errors: wrapped with context using `pkg/errors`. +func (s *shareGPUManager) getPodEnableShareGPU(ctx context.Context, podMeta metav1.ObjectMeta) (bool, error) { + enableShareGPU := false + indicators := v1alpha1.ReclaimResourceIndicators{} + baseLine, err := s.basePlugin.MetaServer.ServiceExtendedIndicator(ctx, podMeta, &indicators) + if err != nil { + return false, errors.Wrapf(err, "ServiceExtendedIndicator failed for pod %s/%s", podMeta.Namespace, podMeta.Name) + } + + if !baseLine && indicators.EnableShareGPU != nil { + enableShareGPU = *indicators.EnableShareGPU + } + + return enableShareGPU, nil +} + +func (s *shareGPUManager) preparePodMeta(info *state.AllocationInfo) metav1.ObjectMeta { + return metav1.ObjectMeta{ + UID: types.UID(info.PodUid), + Namespace: info.PodNamespace, + Name: info.PodName, + Labels: info.Labels, + Annotations: info.Annotations, + } +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go new file mode 100644 index 0000000000..8a3a46e2a0 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go @@ -0,0 +1,310 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package baseplugin + +import ( + "context" + "fmt" + "sync" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" + workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" + commonstate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + gpustate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metaserver/spd" +) + +// fakeSPM is a lightweight ServiceProfilingManager for tests. +type fakeSPM struct { + behaviors map[types.UID]struct { + baseline bool + enable *bool + err error + } + calls int +} + +func (f *fakeSPM) ServiceExtendedIndicator(_ context.Context, podMeta metav1.ObjectMeta, indicators interface{}) (bool, error) { + f.calls++ + b := f.behaviors[podMeta.UID] + // fill EnableShareGPU if indicators is of proper type + if ind, ok := indicators.(*v1alpha1.ReclaimResourceIndicators); ok { + ind.EnableShareGPU = b.enable + } + if b.err != nil { + return false, b.err + } + return b.baseline, nil +} + +// Stubs for unused interface methods +func (f *fakeSPM) ServiceBusinessPerformanceLevel(context.Context, metav1.ObjectMeta) (spd.PerformanceLevel, error) { + return spd.PerformanceLevelPerfect, nil +} + +func (f *fakeSPM) ServiceBusinessPerformanceScore(context.Context, metav1.ObjectMeta) (float64, error) { + return spd.MaxPerformanceScore, nil +} + +func (f *fakeSPM) ServiceSystemPerformanceTarget(context.Context, metav1.ObjectMeta) (spd.IndicatorTarget, error) { + return spd.IndicatorTarget{}, nil +} + +func (f *fakeSPM) ServiceBaseline(context.Context, metav1.ObjectMeta) (bool, error) { + return false, nil +} + +func (f *fakeSPM) ServiceAggregateMetrics(context.Context, metav1.ObjectMeta, v1.ResourceName, bool, workloadapis.Aggregator, workloadapis.Aggregator) ([]resource.Quantity, error) { + return nil, nil +} +func (f *fakeSPM) Run(context.Context) {} + +// fakeState provides minimal state for tests. 
+type fakeState struct { + machine gpustate.AllocationResourcesMap +} + +func (f *fakeState) GetMachineState() gpustate.AllocationResourcesMap { + return f.machine +} +func (f *fakeState) GetPodResourceEntries() gpustate.PodResourceEntries { return nil } +func (f *fakeState) GetPodEntries(resourceName v1.ResourceName) gpustate.PodEntries { return nil } +func (f *fakeState) GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *gpustate.AllocationInfo { + return nil +} +func (f *fakeState) SetMachineState(gpustate.AllocationResourcesMap, bool) {} +func (f *fakeState) SetResourceState(v1.ResourceName, gpustate.AllocationMap, bool) {} +func (f *fakeState) SetPodResourceEntries(gpustate.PodResourceEntries, bool) {} +func (f *fakeState) SetAllocationInfo(v1.ResourceName, string, string, *gpustate.AllocationInfo, bool) { +} +func (f *fakeState) Delete(v1.ResourceName, string, string, bool) {} +func (f *fakeState) ClearState() {} +func (f *fakeState) StoreState() error { return nil } + +func makeContainer(podUID, podNS, podName, containerName string, main bool, ids ...string) *gpustate.AllocationInfo { + t := pluginapi.ContainerType_SIDECAR.String() + if main { + t = pluginapi.ContainerType_MAIN.String() + } + topo := make(map[string]gpustate.Allocation) + for _, id := range ids { + topo[id] = gpustate.Allocation{Quantity: 1} + } + return &gpustate.AllocationInfo{ + AllocationMeta: gpustate.AllocationInfo{AllocationMeta: commonstate.AllocationMeta{ + PodUid: podUID, + PodNamespace: podNS, + PodName: podName, + ContainerName: containerName, + ContainerType: t, + }}.AllocationMeta, + AllocatedAllocation: gpustate.Allocation{Quantity: 1}, + TopologyAwareAllocations: topo, + } +} + +func Test_EnableShareGPU_DefaultFalse(t *testing.T) { + t.Parallel() + m := NewShareGPUManager().(*shareGPUManager) + if got := m.EnableShareGPU("non-existent"); got { + t.Fatalf("expected false for unknown id, got true") + } +} + +func Test_Allocate_MarkFalse_OnIndicatorFalse(t *testing.T) { + t.Parallel() + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: ptrBool(false)}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u1", "ns", "pod", "c", true, "GPU-1", "GPU-2") + m.Allocate(context.Background(), alloc) + + if m.EnableShareGPU("GPU-1") || m.EnableShareGPU("GPU-2") { + t.Fatalf("expected devices marked false after Allocate") + } +} + +func Test_Sync_BuildsSnapshot_WithCache(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + + // Build machine state: one device with two main containers of same pod + ce := gpustate.ContainerEntries{ + "c1": makeContainer("u1", "ns", "pod", "c1", true, "GPU-1"), + "c2": makeContainer("u1", "ns", "pod", "c2", true, "GPU-1"), + } + allocState := &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": ce}} + machine := gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): gpustate.AllocationMap{"GPU-1": allocState}} + + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = &BasePlugin{MetaServer: bp.MetaServer, State: &fakeState{machine: machine}} + 
m.sync(context.Background()) + + if !m.EnableShareGPU("GPU-1") { + t.Fatalf("expected GPU-1 shareable") + } + if spm.calls != 1 { + t.Fatalf("expected 1 indicator call due to cache, got %d", spm.calls) + } +} + +func Test_Concurrency_Safety(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u1", "ns", "pod", "c", true, "GPU-1") + + var wg sync.WaitGroup + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Writer goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + m.Allocate(ctx, alloc) + time.Sleep(time.Millisecond) + } + }() + + // Reader goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + _ = m.EnableShareGPU("GPU-1") + time.Sleep(time.Millisecond) + } + }() + + wg.Wait() +} + +func Test_Run_StartsAndStops(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}, State: &fakeState{machine: gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): gpustate.AllocationMap{"GPU-1": &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": gpustate.ContainerEntries{"c": makeContainer("u1", "ns", "pod", "c", true, "GPU-1")}}}}}}} + m := NewShareGPUManager() + + stopCh := make(chan struct{}) + go m.Run(bp, stopCh) + time.Sleep(50 * time.Millisecond) + close(stopCh) +} + +func Test_Allocate_Ignores_NonMain(t *testing.T) { + t.Parallel() + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{}} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u2", "ns", "pod", "c", false, "GPU-1") + m.Allocate(context.Background(), alloc) + if m.EnableShareGPU("GPU-1") { + t.Fatalf("non-main container should not mark device") + } +} + +func ptrBool(b bool) *bool { v := b; return &v } + +func BenchmarkSync_WithIndicatorCache(b *testing.B) { + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + + // Build a large machine state: N devices, each with M main containers from the same pod + const N, M = 50, 20 + am := make(gpustate.AllocationMap, N) + for i := 0; i < N; i++ { + ce := make(gpustate.ContainerEntries, M) + for j := 0; j < M; j++ { + cname := fmt.Sprintf("c%c", 'A'+j) + ce[cname] = makeContainer("u1", "ns", "pod", cname, true, fmt.Sprintf("GPU-%c", 'A'+i)) + } + am[fmt.Sprintf("GPU-%c", 'A'+i)] = &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": ce}} + } + machine := gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): am} + + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = &BasePlugin{MetaServer: bp.MetaServer, State: &fakeState{machine: machine}} + + b.ResetTimer() + for i := 
0; i < b.N; i++ { + spm.calls = 0 + m.sync(context.Background()) + } + + // Expect only 1 external call per sync due to cache (one unique pod UID) + if spm.calls != 1 { + b.Fatalf("expected 1 indicator call per sync, got %d", spm.calls) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go index d29229c074..a9ba9530a4 100644 --- a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go @@ -218,6 +218,9 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice( "qosLevel", qosLevel, "allocatedDevices", allocatedDevices) + // call shareGPUManager to update GPU device state + p.BasePlugin.ShareGPUManager.Allocate(ctx, gpuDeviceAllocationInfo) + return &pluginapi.AssociatedDeviceAllocationResponse{ AllocationResult: &pluginapi.AssociatedDeviceAllocation{ AllocatedDevices: allocatedDevices, diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 46deb5ad86..0831c81dcc 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -378,20 +378,21 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) for deviceID, deviceInfo := range gpuTopology.Devices { aggregatedAllocatableQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) aggregatedCapacityQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) - gpuMemoryAllocatablePerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) - gpuMemoryCapacityPerGPUNUMA := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) - if deviceInfo.Health != deviceplugin.Healthy { - gpuMemoryAllocatablePerGPUNUMA = 0 + gpuMemoryAllocatablePerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + gpuMemoryCapacityPerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + if deviceInfo.Health != deviceplugin.Healthy || !p.ShareGPUManager.EnableShareGPU(deviceID) { + // if the device is not healthy or not enabled to share, then set allocatable to 0 + gpuMemoryAllocatablePerGPU = 0 } if len(deviceInfo.NumaNodes) > 0 { - gpuMemoryAllocatablePerGPUNUMA = gpuMemoryAllocatablePerGPUNUMA / float64(len(deviceInfo.NumaNodes)) - gpuMemoryCapacityPerGPUNUMA = gpuMemoryCapacityPerGPUNUMA / float64(len(deviceInfo.NumaNodes)) + gpuMemoryAllocatablePerGPU = gpuMemoryAllocatablePerGPU / float64(len(deviceInfo.NumaNodes)) + gpuMemoryCapacityPerGPU = gpuMemoryCapacityPerGPU / float64(len(deviceInfo.NumaNodes)) for _, numaID := range deviceInfo.NumaNodes { if numaID < 0 { numaID = 0 } topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + ResourceValue: gpuMemoryAllocatablePerGPU, Name: deviceID, Node: uint64(numaID), Type: string(v1alpha1.TopologyTypeGPU), @@ -400,7 +401,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryCapacityPerGPUNUMA, + ResourceValue: gpuMemoryCapacityPerGPU, Name: deviceID, Node: uint64(numaID), Type: string(v1alpha1.TopologyTypeGPU), @@ -412,7 +413,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) } else { // if deviceInfo.NumaNodes is empty, then it means the device is not 
NUMA aware topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryAllocatablePerGPUNUMA, + ResourceValue: gpuMemoryAllocatablePerGPU, Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ @@ -420,7 +421,7 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) }, }) topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ - ResourceValue: gpuMemoryCapacityPerGPUNUMA, + ResourceValue: gpuMemoryCapacityPerGPU, Name: deviceID, Type: string(v1alpha1.TopologyTypeGPU), Annotations: map[string]string{ diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go index e8624fdb92..1a91438820 100644 --- a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -151,6 +151,11 @@ func (p *StaticPolicy) Start() (err error) { general.Errorf("start %v failed, err: %v", gpuconsts.ClearResidualState, err) } + err = p.BasePlugin.Run(p.stopCh) + if err != nil { + return fmt.Errorf("share gpu manager run failed with error: %w", err) + } + go wait.Until(func() { periodicalhandler.ReadyToStartHandlersByGroup(appqrm.QRMGPUPluginPeriodicalHandlerGroupName) }, 5*time.Second, p.stopCh) From 649398fe2b96f309ab5a158851405776e4d5bc84 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 24 Nov 2025 14:05:50 +0800 Subject: [PATCH 46/52] feat(eviction): add reclaimed GPU resources eviction plugin Introduce a new eviction plugin for GPU resources to handle eviction based on GPU topology and allocations. The plugin reuses the generic ZoneResourcesPlugin with zoneType=GPU to preserve behavior while adding specific GPU resource handling capabilities. Includes unit tests to verify functionality. 
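A minimal sketch of the zone-level rule the reused ZoneResourcesPlugin applies (its doc comment later in this series states HARD_MET with GREATER_THAN semantics when used > threshold(total), with zero-quantity resources skippable). The names zoneThresholdMet, used, allocatable, threshold and skipZero are illustrative only, not fields of the plugin:

// zoneThresholdMet illustrates the per-zone check: for one GPU zone and one
// resource, pressure is signaled once the summed reclaimed requests exceed
// threshold * allocatable. skipZero mirrors SkipZeroQuantityResourceNames.
func zoneThresholdMet(used, allocatable, threshold float64, skipZero bool) bool {
	if allocatable == 0 && skipZero {
		return false // empty zones are ignored when configured to skip
	}
	return used > threshold*allocatable
}

With the unit test's setup below (threshold 0.5, a GPU zone with 2 allocatable nvidia.com/gpu, and two reclaimed pods requesting 1 each), used 2 exceeds 0.5*2, so the threshold is met and GetTopEvictionPods can select a victim.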
--- pkg/agent/evictionmanager/manager.go | 1 + .../resource/reclaimed_gpu_resources.go | 82 +++++++++ .../resource/reclaimed_gpu_resources_test.go | 161 ++++++++++++++++++ 3 files changed, 244 insertions(+) create mode 100644 pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go create mode 100644 pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go diff --git a/pkg/agent/evictionmanager/manager.go b/pkg/agent/evictionmanager/manager.go index 48c8ed3eb3..1457930842 100644 --- a/pkg/agent/evictionmanager/manager.go +++ b/pkg/agent/evictionmanager/manager.go @@ -143,6 +143,7 @@ func NewInnerEvictionPluginInitializers() map[string]plugin.InitFunc { innerEvictionPluginInitializers := make(map[string]plugin.InitFunc) innerEvictionPluginInitializers[resource.ReclaimedResourcesEvictionPluginName] = resource.NewReclaimedResourcesEvictionPlugin innerEvictionPluginInitializers[resource.ReclaimedNumaResourcesEvictionPluginName] = resource.NewReclaimedNumaResourcesEvictionPlugin + innerEvictionPluginInitializers[resource.ReclaimedGPUResourcesEvictionPluginName] = resource.NewReclaimedGPUResourcesEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameNumaMemoryPressure] = memory.NewNumaMemoryPressureEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameSystemMemoryPressure] = memory.NewSystemPressureEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameRssOveruse] = memory.NewRssOveruseEvictionPlugin diff --git a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go new file mode 100644 index 0000000000..07189750dc --- /dev/null +++ b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go @@ -0,0 +1,82 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resource + +import ( + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/events" + + "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + "github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin" + "github.com/kubewharf/katalyst-core/pkg/client" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/process" +) + +const ( + ReclaimedGPUResourcesEvictionPluginName = "reclaimed-gpu-resource-pressure-eviction-plugin" +) + +type ReclaimedGPUResourcesPlugin struct { + *process.StopControl + *ZoneResourcesPlugin +} + +// NewReclaimedGPUResourcesEvictionPlugin constructs a GPU topology-aware eviction plugin. +// It wires threshold/deletion/tolerance getters from dynamic configuration and +// reuses the generic ZoneResourcesPlugin with zoneType=GPU to preserve behavior. 
+func NewReclaimedGPUResourcesEvictionPlugin(_ *client.GenericClientSet, _ events.EventRecorder, + metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, conf *config.Configuration, +) plugin.EvictionPlugin { + reclaimedThresholdGetter := func(resourceName v1.ResourceName) *float64 { + if threshold, ok := conf.GetDynamicConfiguration().EvictionThreshold[resourceName]; !ok { + return nil + } else { + return &threshold + } + } + + deletionGracePeriodGetter := func() int64 { + return conf.GetDynamicConfiguration().ReclaimedResourcesEvictionConfiguration.DeletionGracePeriod + } + + thresholdMetToleranceDurationGetter := func() int64 { + return conf.GetDynamicConfiguration().ThresholdMetToleranceDuration + } + + p := NewZoneResourcesPlugin( + ReclaimedGPUResourcesEvictionPluginName, + v1alpha1.TopologyTypeGPU, + metaServer, + emitter, + nil, + reclaimedThresholdGetter, + deletionGracePeriodGetter, + thresholdMetToleranceDurationGetter, + conf.SkipZeroQuantityResourceNames, + conf.CheckReclaimedQoSForPod, + ) + + return &ReclaimedGPUResourcesPlugin{ + StopControl: process.NewStopControl(time.Time{}), + ZoneResourcesPlugin: p, + } +} diff --git a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go new file mode 100644 index 0000000000..deb21399c2 --- /dev/null +++ b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go @@ -0,0 +1,161 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package resource + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/events" + + nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1" + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnr" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/node" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +func TestNewReclaimedGPUResourcesEvictionPlugin_Behavior(t *testing.T) { + t.Parallel() + + testNodeName := "gpu-node" + testConf, err := options.NewOptions().Config() + if err != nil { + t.Fatalf("config error: %v", err) + } + testConf.NodeName = testNodeName + // configure threshold for GPU resource + if testConf.GetDynamicConfiguration().EvictionThreshold == nil { + testConf.GetDynamicConfiguration().EvictionThreshold = map[corev1.ResourceName]float64{} + } + testConf.GetDynamicConfiguration().EvictionThreshold[corev1.ResourceName("nvidia.com/gpu")] = 0.5 + + // pods + podA := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-a", + Namespace: "default", + UID: "uid-a", + Annotations: map[string]string{ + "katalyst.kubewharf.io/qos_level": "reclaimed_cores", + }, + }, + } + podB := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-b", + Namespace: "default", + UID: "uid-b", + Annotations: map[string]string{ + "katalyst.kubewharf.io/qos_level": "reclaimed_cores", + }, + }, + } + pods := []*corev1.Pod{podA, podB} + + // CNR with GPU topology and allocations + cnrObj := &nodev1alpha1.CustomNodeResource{ + ObjectMeta: metav1.ObjectMeta{Name: testNodeName}, + Status: nodev1alpha1.CustomNodeResourceStatus{ + TopologyZone: []*nodev1alpha1.TopologyZone{ + { + Name: "0", + Type: nodev1alpha1.TopologyTypeGPU, + Resources: nodev1alpha1.Resources{ + Allocatable: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"), + }, + }, + Allocations: []*nodev1alpha1.Allocation{ + { + Consumer: "default/pod-a/uid-a", + Requests: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + }, + { + Consumer: "default/pod-b/uid-b", + Requests: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + } + + ctx, err := katalyst_base.GenerateFakeGenericContext(nil, []runtime.Object{cnrObj}, nil) + if err != nil { + t.Fatalf("context error: %v", err) + } + + ms := &metaserver.MetaServer{ + MetaAgent: &agent.MetaAgent{ + PodFetcher: &pod.PodFetcherStub{PodList: pods}, + NodeFetcher: node.NewRemoteNodeFetcher(testConf.BaseConfiguration, testConf.NodeConfiguration, + ctx.Client.KubeClient.CoreV1().Nodes()), + CNRFetcher: cnr.NewCachedCNRFetcher(testConf.BaseConfiguration, testConf.CNRConfiguration, + ctx.Client.InternalClient.NodeV1alpha1().CustomNodeResources()), + }, + } + + plugin := NewReclaimedGPUResourcesEvictionPlugin(ctx.Client, &events.FakeRecorder{}, ms, metrics.DummyMetrics{}, testConf) + if plugin == nil { + t.Fatalf("plugin nil") + } + + // ThresholdMet + met, err := 
plugin.ThresholdMet(context.TODO(), &pluginapi.GetThresholdMetRequest{}) + if err != nil { + t.Fatalf("threshold error: %v", err) + } + if met == nil { + t.Fatalf("threshold nil") + } + + // GetTopEvictionPods + podsResp, err := plugin.GetTopEvictionPods(context.TODO(), &pluginapi.GetTopEvictionPodsRequest{ + ActivePods: pods, + TopN: 1, + EvictionScope: met.EvictionScope, + }) + if err != nil { + t.Fatalf("top eviction error: %v", err) + } + if len(podsResp.GetTargetPods()) == 0 { + t.Fatalf("no target pods") + } + + // GetEvictPods (currently returns empty but should be non-nil) + evictResp, err := plugin.GetEvictPods(context.TODO(), &pluginapi.GetEvictPodsRequest{ActivePods: pods}) + if err != nil { + t.Fatalf("evict pods error: %v", err) + } + if evictResp == nil { + t.Fatalf("evict resp nil") + } +} From 09b42dcc97c3dc135d41980c882a4ebcb0bf226d Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 24 Nov 2025 14:06:28 +0800 Subject: [PATCH 47/52] build: update katalyst-api dependency version --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 74ac733272..ca6acb5cc0 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,7 @@ require ( ) replace ( - github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index ece20c05f7..1701a16c1e 100644 --- a/go.sum +++ b/go.sum @@ -583,8 +583,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= -github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b h1:CJnM7Ow3Wm+QzvUrrqmGlId8UXOWNGAKi09mTnU/QQs= -github.com/luomingmeng/katalyst-api v0.0.0-20251114092935-4d2194badf1b/go.mod h1:76Qriw/zHXLGhunpskZpJ9zJI3Dwf2hWnz9z8kkjR64= +github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97 h1:1vDUtGKw5ZI9yAKcbWQw+1y3BvQi3Sp4nGhCCi00vZs= +github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97/go.mod h1:BZMVGVl3EP0eCn5xsDgV41/gjYkoh43abIYxrB10e3k= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= From b2a1de21fdec27a075e6a419daa7f4ea39545f11 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 9 Dec 2025 15:32:57 -0800 Subject: [PATCH 48/52] fix(resources-eviction): add error healthz state update in threshold checks Ensure healthz state is updated when errors occur during threshold checks in both resources and zone resources plugins. 
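Both hunks that follow apply the same deferred-error-capture idiom. A self-contained sketch of the pattern, with updateHealthz standing in for general.UpdateHealthzStateByError:

package main

import "fmt"

// updateHealthz is a stand-in for general.UpdateHealthzStateByError.
func updateHealthz(name string, err error) error {
	fmt.Printf("healthz[%s]: err=%v\n", name, err)
	return nil
}

// check declares err before the defer, so the closure observes whatever value
// err holds on return, including assignments made by `x, err := ...` in the
// same scope.
func check(fail bool) error {
	var err error
	defer func() {
		_ = updateHealthz("demo-plugin", err)
	}()

	if fail {
		err = fmt.Errorf("list pods failed")
		return err
	}
	return nil
}

func main() {
	_ = check(true)  // healthz records the failure
	_ = check(false) // healthz records recovery (nil error)
}

One caveat the pattern relies on: `activePods, err := ...` at function scope reuses the declared err, but shadowing err with := inside a nested block would make the deferred update report nil.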
--- pkg/agent/evictionmanager/plugin/resource/resources.go | 5 +++++ pkg/agent/evictionmanager/plugin/resource/zone_resources.go | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/pkg/agent/evictionmanager/plugin/resource/resources.go b/pkg/agent/evictionmanager/plugin/resource/resources.go index 5908a757dd..b7163bb647 100644 --- a/pkg/agent/evictionmanager/plugin/resource/resources.go +++ b/pkg/agent/evictionmanager/plugin/resource/resources.go @@ -108,6 +108,11 @@ func (b *ResourcesEvictionPlugin) Start() { // ThresholdMet evicts pods when the best-effort resource usage is greater than // the supply (after considering toleration). func (b *ResourcesEvictionPlugin) ThresholdMet(ctx context.Context, _ *pluginapi.GetThresholdMetRequest) (*pluginapi.ThresholdMetResponse, error) { + var err error + defer func() { + _ = general.UpdateHealthzStateByError(b.pluginName, err) + }() + activePods, err := b.metaServer.GetPodList(ctx, native.PodIsActive) if err != nil { errMsg := fmt.Sprintf("failed to list pods from metaServer: %v", err) diff --git a/pkg/agent/evictionmanager/plugin/resource/zone_resources.go b/pkg/agent/evictionmanager/plugin/resource/zone_resources.go index 0e17453fcf..4f511927d5 100644 --- a/pkg/agent/evictionmanager/plugin/resource/zone_resources.go +++ b/pkg/agent/evictionmanager/plugin/resource/zone_resources.go @@ -121,6 +121,11 @@ func (p *ZoneResourcesPlugin) Start() { // - Skips resources with zero total when configured in skip list. // - Returns HARD_MET with GREATER_THAN semantics when used > threshold(total). func (p *ZoneResourcesPlugin) ThresholdMet(ctx context.Context, _ *pluginapi.GetThresholdMetRequest) (*pluginapi.ThresholdMetResponse, error) { + var err error + defer func() { + _ = general.UpdateHealthzStateByError(p.pluginName, err) + }() + activePods, err := p.metaServer.GetPodList(ctx, native.PodIsActive) if err != nil { errWrapped := fmt.Errorf("list pods from metaServer: %w", err) From 4295c6a9fb70770039107a200b064fdfe4a61beb Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 12 Jan 2026 20:14:54 +0800 Subject: [PATCH 49/52] fix(gpu): ignore reclaimed containers in device share evaluation

Add CheckReclaimed condition to skip reclaimed containers when evaluating device share status
Add test case to verify reclaimed containers are ignored

--- .../gpu/baseplugin/share_gpu_manager.go | 2 +- .../gpu/baseplugin/share_gpu_manager_test.go | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go index f1bd9432f4..4fe1d47e79 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go @@ -181,7 +181,7 @@ func (s *shareGPUManager) evaluateDeviceShareStatus(ctx context.Context, alloc * // Default to shareable and short-circuit to false once a disallowed pod is found.
for _, containerEntries := range alloc.PodEntries { for _, container := range containerEntries { - if !container.CheckMainContainer() { + if !container.CheckMainContainer() || container.CheckReclaimed() { continue } diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go index 8a3a46e2a0..5f0e491f4c 100644 --- a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go @@ -31,6 +31,7 @@ import ( "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" + apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" commonstate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" gpustate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" @@ -268,6 +269,24 @@ func Test_Allocate_Ignores_NonMain(t *testing.T) { } } +func Test_evaluateDeviceShareStatus_Ignores_reclaimed(t *testing.T) { + t.Parallel() + m := NewShareGPUManager().(*shareGPUManager) + alloc := makeContainer("u2", "ns", "pod", "c", true, "GPU-1") + alloc.QoSLevel = apiconsts.PodAnnotationQoSLevelReclaimedCores + as := &gpustate.AllocationState{ + PodEntries: map[string]gpustate.ContainerEntries{ + "u2": { + "c": alloc, + }, + }, + } + + if !m.evaluateDeviceShareStatus(context.Background(), as, make(map[types.UID]bool)) { + t.Fatalf("reclaimed container should not mark device") + } +} + func ptrBool(b bool) *bool { v := b; return &v } func BenchmarkSync_WithIndicatorCache(b *testing.B) { From 1ac77a8e2b9a157218cb50ae2412ffaad2b560ec Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Mon, 12 Jan 2026 21:21:28 +0800 Subject: [PATCH 50/52] feat(eviction): add gpu memory threshold to reclaimed resources eviction --- .../dynamic/adminqos/eviction/reclaimed_resources_eviction.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go b/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go index e969c9374d..4f7f1faeb5 100644 --- a/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go +++ b/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go @@ -35,6 +35,7 @@ func NewReclaimedResourcesEvictionOptions() *ReclaimedResourcesEvictionOptions { EvictionThreshold: native.ResourceThreshold{ consts.ReclaimedResourceMilliCPU: 5.0, consts.ReclaimedResourceMemory: 5.0, + consts.ResourceGPUMemory: 5.0, }, GracePeriod: 60, ThresholdMetToleranceDuration: 0, From 1f727f45c6c4081f2321a91679772805947d96f1 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 13 Jan 2026 19:15:42 +0800 Subject: [PATCH 51/52] fix(gpumemory): correct gpu memory allocation calculation Move aggregation of allocatable and capacity quantities after health check to ensure accurate totals for unhealthy or non-shared devices --- pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go index 0831c81dcc..eac2833070 100644 --- a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ 
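A toy illustration (not the plugin code) of the accounting bug this commit fixes: when the configured per-GPU quantity was added to the aggregate before the health/share check zeroed it, unhealthy or non-shared devices still inflated the node-level allocatable total. The numbers are made up (three 80 GiB GPUs, one ineligible):

package main

import "fmt"

func main() {
	perGPU := 80.0
	eligible := []bool{true, false, true} // healthy and share-enabled?

	var before, after float64
	for _, ok := range eligible {
		before += perGPU // old order: count first, zero later

		v := perGPU
		if !ok {
			v = 0 // new order: zero first, then aggregate
		}
		after += v
	}

	fmt.Println(before, after) // 240 vs 160
}

Capacity is intentionally left at the full per-device value in the real code; only the allocatable side is zeroed for ineligible devices.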
-376,14 +376,14 @@ func (p *GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(gpuTopology.Devices)) var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 for deviceID, deviceInfo := range gpuTopology.Devices { - aggregatedAllocatableQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) - aggregatedCapacityQuantity += float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) gpuMemoryAllocatablePerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) gpuMemoryCapacityPerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) if deviceInfo.Health != deviceplugin.Healthy || !p.ShareGPUManager.EnableShareGPU(deviceID) { // if the device is not healthy or not enabled to share, then set allocatable to 0 gpuMemoryAllocatablePerGPU = 0 } + aggregatedAllocatableQuantity += gpuMemoryAllocatablePerGPU + aggregatedCapacityQuantity += gpuMemoryCapacityPerGPU if len(deviceInfo.NumaNodes) > 0 { gpuMemoryAllocatablePerGPU = gpuMemoryAllocatablePerGPU / float64(len(deviceInfo.NumaNodes)) gpuMemoryCapacityPerGPU = gpuMemoryCapacityPerGPU / float64(len(deviceInfo.NumaNodes)) From b13e4d2afab93d3c6e3756c69d2408f4bfc56362 Mon Sep 17 00:00:00 2001 From: luomingmeng Date: Tue, 13 Jan 2026 19:30:06 +0800 Subject: [PATCH 52/52] feat(evictionmanager): add GPU-specific threshold tolerance duration Add constant thresholdMetToleranceDurationForGPU to set a fixed 15-second tolerance duration for GPU resource eviction, replacing the dynamic configuration value. --- .../plugin/resource/reclaimed_gpu_resources.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go index 07189750dc..08fff40113 100644 --- a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go +++ b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go @@ -35,6 +35,10 @@ const ( ReclaimedGPUResourcesEvictionPluginName = "reclaimed-gpu-resource-pressure-eviction-plugin" ) +const ( + thresholdMetToleranceDurationForGPU = 15 +) + type ReclaimedGPUResourcesPlugin struct { *process.StopControl *ZoneResourcesPlugin @@ -59,7 +63,7 @@ func NewReclaimedGPUResourcesEvictionPlugin(_ *client.GenericClientSet, _ events } thresholdMetToleranceDurationGetter := func() int64 { - return conf.GetDynamicConfiguration().ThresholdMetToleranceDuration + return int64(thresholdMetToleranceDurationForGPU) } p := NewZoneResourcesPlugin(