diff --git a/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go new file mode 100644 index 0000000000..e98d9a6c7e --- /dev/null +++ b/cmd/katalyst-agent/app/agent/qrm/gpu_plugin.go @@ -0,0 +1,67 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package qrm + +import ( + "fmt" + "strings" + "sync" + + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + phconsts "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler/consts" + "github.com/kubewharf/katalyst-core/pkg/config" +) + +const ( + QRMPluginNameGPU = "qrm_gpu_plugin" +) + +var QRMGPUPluginPeriodicalHandlerGroupName = strings.Join([]string{ + QRMPluginNameGPU, + phconsts.PeriodicalHandlersGroupNameSuffix, +}, phconsts.GroupNameSeparator) + +// gpuPolicyInitializers is used to store the initializing function for gpu resource plugin policies +var gpuPolicyInitializers sync.Map + +// RegisterGPUPolicyInitializer is used to register user-defined resource plugin init functions +func RegisterGPUPolicyInitializer(name string, initFunc agent.InitFunc) { + gpuPolicyInitializers.Store(name, initFunc) +} + +// getGPUPolicyInitializers returns those policies with initialized functions +func getGPUPolicyInitializers() map[string]agent.InitFunc { + agents := make(map[string]agent.InitFunc) + gpuPolicyInitializers.Range(func(key, value interface{}) bool { + agents[key.(string)] = value.(agent.InitFunc) + return true + 
}) + return agents +} + +// InitQRMGPUPlugins initializes the gpu QRM plugins +func InitQRMGPUPlugins(agentCtx *agent.GenericContext, conf *config.Configuration, extraConf interface{}, agentName string) (bool, agent.Component, error) { + initializers := getGPUPolicyInitializers() + policyName := conf.GPUQRMPluginConfig.PolicyName + + initFunc, ok := initializers[policyName] + if !ok { + return false, agent.ComponentStub{}, fmt.Errorf("invalid policy name %v for gpu resource plugin", policyName) + } + + return initFunc(agentCtx, conf, extraConf, agentName) +} diff --git a/cmd/katalyst-agent/app/enableagents.go b/cmd/katalyst-agent/app/enableagents.go index 615ef00c93..713ea6925a 100644 --- a/cmd/katalyst-agent/app/enableagents.go +++ b/cmd/katalyst-agent/app/enableagents.go @@ -24,6 +24,7 @@ import ( "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm" _ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu" + _ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu" _ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/io" _ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory" _ "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/network" @@ -57,6 +58,7 @@ func init() { agentInitializers.Store(qrm.QRMPluginNameMemory, AgentStarter{Init: qrm.InitQRMMemoryPlugins}) agentInitializers.Store(qrm.QRMPluginNameNetwork, AgentStarter{Init: qrm.InitQRMNetworkPlugins}) agentInitializers.Store(qrm.QRMPluginNameIO, AgentStarter{Init: qrm.InitQRMIOPlugins}) + agentInitializers.Store(qrm.QRMPluginNameGPU, AgentStarter{Init: qrm.InitQRMGPUPlugins}) } // RegisterAgentInitializer is used to register user-defined agents diff --git a/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go b/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go index e969c9374d..4f7f1faeb5 100644 --- 
a/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go +++ b/cmd/katalyst-agent/app/options/dynamic/adminqos/eviction/reclaimed_resources_eviction.go @@ -35,6 +35,7 @@ func NewReclaimedResourcesEvictionOptions() *ReclaimedResourcesEvictionOptions { EvictionThreshold: native.ResourceThreshold{ consts.ReclaimedResourceMilliCPU: 5.0, consts.ReclaimedResourceMemory: 5.0, + consts.ResourceGPUMemory: 5.0, }, GracePeriod: 60, ThresholdMetToleranceDuration: 0, diff --git a/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go new file mode 100644 index 0000000000..28b09fdec1 --- /dev/null +++ b/cmd/katalyst-agent/app/options/qrm/gpu_plugin.go @@ -0,0 +1,75 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package qrm + +import ( + "k8s.io/apimachinery/pkg/api/resource" + cliflag "k8s.io/component-base/cli/flag" + + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options/qrm/gpustrategy" + qrmconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" +) + +type GPUOptions struct { + PolicyName string + GPUDeviceNames []string + GPUMemoryAllocatablePerGPU string + SkipGPUStateCorruption bool + RDMADeviceNames []string + + GPUStrategyOptions *gpustrategy.GPUStrategyOptions +} + +func NewGPUOptions() *GPUOptions { + return &GPUOptions{ + PolicyName: "static", + GPUDeviceNames: []string{"nvidia.com/gpu"}, + GPUMemoryAllocatablePerGPU: "100", + RDMADeviceNames: []string{}, + GPUStrategyOptions: gpustrategy.NewGPUStrategyOptions(), + } +} + +func (o *GPUOptions) AddFlags(fss *cliflag.NamedFlagSets) { + fs := fss.FlagSet("gpu_resource_plugin") + + fs.StringVar(&o.PolicyName, "gpu-resource-plugin-policy", + o.PolicyName, "The policy gpu resource plugin should use") + fs.StringSliceVar(&o.GPUDeviceNames, "gpu-resource-names", o.GPUDeviceNames, "The name of the GPU resource") + fs.StringVar(&o.GPUMemoryAllocatablePerGPU, "gpu-memory-allocatable-per-gpu", + o.GPUMemoryAllocatablePerGPU, "The total memory allocatable for each GPU, e.g. 
100") + fs.BoolVar(&o.SkipGPUStateCorruption, "skip-gpu-state-corruption", + o.SkipGPUStateCorruption, "skip gpu state corruption, and it will be used after updating state properties") + fs.StringSliceVar(&o.RDMADeviceNames, "rdma-resource-names", o.RDMADeviceNames, "The name of the RDMA resource") + o.GPUStrategyOptions.AddFlags(fss) +} + +func (o *GPUOptions) ApplyTo(conf *qrmconfig.GPUQRMPluginConfig) error { + conf.PolicyName = o.PolicyName + conf.GPUDeviceNames = o.GPUDeviceNames + gpuMemory, err := resource.ParseQuantity(o.GPUMemoryAllocatablePerGPU) + if err != nil { + return err + } + conf.GPUMemoryAllocatablePerGPU = gpuMemory + conf.SkipGPUStateCorruption = o.SkipGPUStateCorruption + conf.RDMADeviceNames = o.RDMADeviceNames + if err := o.GPUStrategyOptions.ApplyTo(conf.GPUStrategyConfig); err != nil { + return err + } + return nil +} diff --git a/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go b/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go new file mode 100644 index 0000000000..470ec40012 --- /dev/null +++ b/cmd/katalyst-agent/app/options/qrm/gpustrategy/allocate.go @@ -0,0 +1,59 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpustrategy + +import ( + "strings" + + cliflag "k8s.io/component-base/cli/flag" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) + +type AllocateStrategyOptions struct { + CustomFilteringStrategies map[string]string + CustomSortingStrategy map[string]string + CustomBindingStrategy map[string]string + CustomAllocationStrategy map[string]string +} + +func NewGPUAllocateStrategyOptions() *AllocateStrategyOptions { + return &AllocateStrategyOptions{} +} + +func (o *AllocateStrategyOptions) AddFlags(fss *cliflag.NamedFlagSets) { + fs := fss.FlagSet("allocate_strategy") + fs.StringToStringVar(&o.CustomFilteringStrategies, "gpu-allocate-custom-filtering-strategies", + o.CustomFilteringStrategies, "The filtering strategies for each resource, e.g. gpu:filtering1/filtering2") + fs.StringToStringVar(&o.CustomSortingStrategy, "gpu-allocate-custom-sorting-strategy", o.CustomSortingStrategy, "The sorting strategy for each resource") + fs.StringToStringVar(&o.CustomBindingStrategy, "gpu-allocate-custom-binding-strategy", o.CustomBindingStrategy, "The binding strategy for each resource") + fs.StringToStringVar(&o.CustomAllocationStrategy, "gpu-allocate-custom-allocation-strategy", o.CustomAllocationStrategy, "The allocation strategy for each resource") +} + +func (o *AllocateStrategyOptions) ApplyTo(c *gpustrategy.AllocateStrategyConfig) error { + for resourceName, strategies := range o.CustomFilteringStrategies { + filteringStrategies := strings.Split(strategies, "/") + for _, strategyName := range filteringStrategies { + c.CustomFilteringStrategies[resourceName] = append(c.CustomFilteringStrategies[resourceName], strategyName) + } + } + + c.CustomSortingStrategy = o.CustomSortingStrategy + c.CustomBindingStrategy = o.CustomBindingStrategy + c.CustomAllocationStrategy = o.CustomAllocationStrategy + return nil +} diff --git a/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go 
b/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go new file mode 100644 index 0000000000..ed60713090 --- /dev/null +++ b/cmd/katalyst-agent/app/options/qrm/gpustrategy/strategy_base.go @@ -0,0 +1,44 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpustrategy + +import ( + cliflag "k8s.io/component-base/cli/flag" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) + +type GPUStrategyOptions struct { + *AllocateStrategyOptions +} + +func NewGPUStrategyOptions() *GPUStrategyOptions { + return &GPUStrategyOptions{ + AllocateStrategyOptions: NewGPUAllocateStrategyOptions(), + } +} + +func (o *GPUStrategyOptions) AddFlags(fss *cliflag.NamedFlagSets) { + o.AllocateStrategyOptions.AddFlags(fss) +} + +func (o *GPUStrategyOptions) ApplyTo(conf *gpustrategy.GPUStrategyConfig) error { + if err := o.AllocateStrategyOptions.ApplyTo(conf.AllocateStrategyConfig); err != nil { + return err + } + return nil +} diff --git a/cmd/katalyst-agent/app/options/qrm/qrm_base.go b/cmd/katalyst-agent/app/options/qrm/qrm_base.go index c02a6ddd9d..139c26ae77 100644 --- a/cmd/katalyst-agent/app/options/qrm/qrm_base.go +++ b/cmd/katalyst-agent/app/options/qrm/qrm_base.go @@ -88,6 +88,7 @@ type QRMPluginsOptions struct { MemoryOptions *MemoryOptions NetworkOptions *NetworkOptions IOOptions *IOOptions + GPUOptions *GPUOptions } func NewQRMPluginsOptions() *QRMPluginsOptions { @@ -96,6 +97,7 @@ func NewQRMPluginsOptions() 
*QRMPluginsOptions { MemoryOptions: NewMemoryOptions(), NetworkOptions: NewNetworkOptions(), IOOptions: NewIOOptions(), + GPUOptions: NewGPUOptions(), } } @@ -104,6 +106,7 @@ func (o *QRMPluginsOptions) AddFlags(fss *cliflag.NamedFlagSets) { o.MemoryOptions.AddFlags(fss) o.NetworkOptions.AddFlags(fss) o.IOOptions.AddFlags(fss) + o.GPUOptions.AddFlags(fss) } func (o *QRMPluginsOptions) ApplyTo(conf *qrmconfig.QRMPluginsConfiguration) error { @@ -119,5 +122,8 @@ func (o *QRMPluginsOptions) ApplyTo(conf *qrmconfig.QRMPluginsConfiguration) err if err := o.IOOptions.ApplyTo(conf.IOQRMPluginConfig); err != nil { return err } + if err := o.GPUOptions.ApplyTo(conf.GPUQRMPluginConfig); err != nil { + return err + } return nil } diff --git a/go.mod b/go.mod index 56953cf18e..ca6acb5cc0 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/golang/mock v1.6.0 github.com/golang/protobuf v1.5.3 github.com/google/cadvisor v0.44.2 + github.com/google/go-cmp v0.5.9 github.com/google/uuid v1.3.0 github.com/h2non/gock v1.2.0 github.com/klauspost/cpuid/v2 v2.2.6 @@ -100,7 +101,6 @@ require ( github.com/godbus/dbus/v5 v5.0.6 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/gnostic v0.6.9 // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect @@ -175,6 +175,7 @@ require ( ) replace ( + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97 k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 @@ -196,7 +197,7 @@ replace ( k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6 k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6 k8s.io/kubectl => k8s.io/kubectl v0.24.6 - 
k8s.io/kubelet => github.com/kubewharf/kubelet v1.24.6-kubewharf.9 + k8s.io/kubelet => github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2 k8s.io/kubernetes => k8s.io/kubernetes v1.24.6 k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6 k8s.io/metrics => k8s.io/metrics v0.24.6 diff --git a/go.sum b/go.sum index 816e098731..1701a16c1e 100644 --- a/go.sum +++ b/go.sum @@ -574,10 +574,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/katalyst-api v0.5.8-0.20251212030746-894fa2521a86 h1:GCqe9PcoTQ7akNDyAmavhnSrPV7sMAoYJ5jKEaJg4Ac= -github.com/kubewharf/katalyst-api v0.5.8-0.20251212030746-894fa2521a86/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= -github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc= -github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/libopenstorage/openstorage v1.0.0/go.mod h1:Sp1sIObHjat1BeXhfMqLZ14wnOzEhNx2YQedreMcUyc= @@ -587,6 +583,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= +github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97 h1:1vDUtGKw5ZI9yAKcbWQw+1y3BvQi3Sp4nGhCCi00vZs= 
+github.com/luomingmeng/katalyst-api v0.0.0-20251225062836-a81d80885d97/go.mod h1:BZMVGVl3EP0eCn5xsDgV41/gjYkoh43abIYxrB10e3k= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= @@ -937,6 +935,8 @@ github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5 github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca/go.mod h1:ce1O1j6UtZfjr22oyGxGLbauSBp2YVXpARAosm7dHBg= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2 h1:km3N0XyOxD5yh/xdKwLdXnMx01wFRKeDz4/CT8ui8a0= +github.com/yehlemias/kubelet v0.0.0-20250929105636-c5bb000496f2/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= diff --git a/pkg/agent/evictionmanager/manager.go b/pkg/agent/evictionmanager/manager.go index 48c8ed3eb3..1457930842 100644 --- a/pkg/agent/evictionmanager/manager.go +++ b/pkg/agent/evictionmanager/manager.go @@ -143,6 +143,7 @@ func NewInnerEvictionPluginInitializers() map[string]plugin.InitFunc { innerEvictionPluginInitializers := make(map[string]plugin.InitFunc) innerEvictionPluginInitializers[resource.ReclaimedResourcesEvictionPluginName] = resource.NewReclaimedResourcesEvictionPlugin innerEvictionPluginInitializers[resource.ReclaimedNumaResourcesEvictionPluginName] = 
resource.NewReclaimedNumaResourcesEvictionPlugin + innerEvictionPluginInitializers[resource.ReclaimedGPUResourcesEvictionPluginName] = resource.NewReclaimedGPUResourcesEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameNumaMemoryPressure] = memory.NewNumaMemoryPressureEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameSystemMemoryPressure] = memory.NewSystemPressureEvictionPlugin innerEvictionPluginInitializers[memory.EvictionPluginNameRssOveruse] = memory.NewRssOveruseEvictionPlugin diff --git a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go new file mode 100644 index 0000000000..08fff40113 --- /dev/null +++ b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources.go @@ -0,0 +1,86 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package resource + +import ( + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/events" + + "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + "github.com/kubewharf/katalyst-core/pkg/agent/evictionmanager/plugin" + "github.com/kubewharf/katalyst-core/pkg/client" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/process" +) + +const ( + ReclaimedGPUResourcesEvictionPluginName = "reclaimed-gpu-resource-pressure-eviction-plugin" +) + +const ( + thresholdMetToleranceDurationForGPU = 15 +) + +type ReclaimedGPUResourcesPlugin struct { + *process.StopControl + *ZoneResourcesPlugin +} + +// NewReclaimedGPUResourcesEvictionPlugin constructs a GPU topology-aware eviction plugin. +// It wires threshold/deletion/tolerance getters from dynamic configuration and +// reuses the generic ZoneResourcesPlugin with zoneType=GPU to preserve behavior. 
+func NewReclaimedGPUResourcesEvictionPlugin(_ *client.GenericClientSet, _ events.EventRecorder, + metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, conf *config.Configuration, +) plugin.EvictionPlugin { + reclaimedThresholdGetter := func(resourceName v1.ResourceName) *float64 { + if threshold, ok := conf.GetDynamicConfiguration().EvictionThreshold[resourceName]; !ok { + return nil + } else { + return &threshold + } + } + + deletionGracePeriodGetter := func() int64 { + return conf.GetDynamicConfiguration().ReclaimedResourcesEvictionConfiguration.DeletionGracePeriod + } + + thresholdMetToleranceDurationGetter := func() int64 { + return int64(thresholdMetToleranceDurationForGPU) + } + + p := NewZoneResourcesPlugin( + ReclaimedGPUResourcesEvictionPluginName, + v1alpha1.TopologyTypeGPU, + metaServer, + emitter, + nil, + reclaimedThresholdGetter, + deletionGracePeriodGetter, + thresholdMetToleranceDurationGetter, + conf.SkipZeroQuantityResourceNames, + conf.CheckReclaimedQoSForPod, + ) + + return &ReclaimedGPUResourcesPlugin{ + StopControl: process.NewStopControl(time.Time{}), + ZoneResourcesPlugin: p, + } +} diff --git a/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go new file mode 100644 index 0000000000..deb21399c2 --- /dev/null +++ b/pkg/agent/evictionmanager/plugin/resource/reclaimed_gpu_resources_test.go @@ -0,0 +1,161 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resource + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/events" + + nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + pluginapi "github.com/kubewharf/katalyst-api/pkg/protocol/evictionplugin/v1alpha1" + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnr" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/node" + "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +func TestNewReclaimedGPUResourcesEvictionPlugin_Behavior(t *testing.T) { + t.Parallel() + + testNodeName := "gpu-node" + testConf, err := options.NewOptions().Config() + if err != nil { + t.Fatalf("config error: %v", err) + } + testConf.NodeName = testNodeName + // configure threshold for GPU resource + if testConf.GetDynamicConfiguration().EvictionThreshold == nil { + testConf.GetDynamicConfiguration().EvictionThreshold = map[corev1.ResourceName]float64{} + } + testConf.GetDynamicConfiguration().EvictionThreshold[corev1.ResourceName("nvidia.com/gpu")] = 0.5 + + // pods + podA := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-a", + Namespace: "default", + UID: "uid-a", + Annotations: map[string]string{ + "katalyst.kubewharf.io/qos_level": "reclaimed_cores", + }, + }, + } + podB := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-b", + Namespace: "default", + UID: "uid-b", + Annotations: map[string]string{ + "katalyst.kubewharf.io/qos_level": 
"reclaimed_cores", + }, + }, + } + pods := []*corev1.Pod{podA, podB} + + // CNR with GPU topology and allocations + cnrObj := &nodev1alpha1.CustomNodeResource{ + ObjectMeta: metav1.ObjectMeta{Name: testNodeName}, + Status: nodev1alpha1.CustomNodeResourceStatus{ + TopologyZone: []*nodev1alpha1.TopologyZone{ + { + Name: "0", + Type: nodev1alpha1.TopologyTypeGPU, + Resources: nodev1alpha1.Resources{ + Allocatable: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"), + }, + }, + Allocations: []*nodev1alpha1.Allocation{ + { + Consumer: "default/pod-a/uid-a", + Requests: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + }, + { + Consumer: "default/pod-b/uid-b", + Requests: &corev1.ResourceList{ + corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + } + + ctx, err := katalyst_base.GenerateFakeGenericContext(nil, []runtime.Object{cnrObj}, nil) + if err != nil { + t.Fatalf("context error: %v", err) + } + + ms := &metaserver.MetaServer{ + MetaAgent: &agent.MetaAgent{ + PodFetcher: &pod.PodFetcherStub{PodList: pods}, + NodeFetcher: node.NewRemoteNodeFetcher(testConf.BaseConfiguration, testConf.NodeConfiguration, + ctx.Client.KubeClient.CoreV1().Nodes()), + CNRFetcher: cnr.NewCachedCNRFetcher(testConf.BaseConfiguration, testConf.CNRConfiguration, + ctx.Client.InternalClient.NodeV1alpha1().CustomNodeResources()), + }, + } + + plugin := NewReclaimedGPUResourcesEvictionPlugin(ctx.Client, &events.FakeRecorder{}, ms, metrics.DummyMetrics{}, testConf) + if plugin == nil { + t.Fatalf("plugin nil") + } + + // ThresholdMet + met, err := plugin.ThresholdMet(context.TODO(), &pluginapi.GetThresholdMetRequest{}) + if err != nil { + t.Fatalf("threshold error: %v", err) + } + if met == nil { + t.Fatalf("threshold nil") + } + + // GetTopEvictionPods + podsResp, err := plugin.GetTopEvictionPods(context.TODO(), &pluginapi.GetTopEvictionPodsRequest{ + ActivePods: pods, + 
TopN: 1, + EvictionScope: met.EvictionScope, + }) + if err != nil { + t.Fatalf("top eviction error: %v", err) + } + if len(podsResp.GetTargetPods()) == 0 { + t.Fatalf("no target pods") + } + + // GetEvictPods (currently returns empty but should be non-nil) + evictResp, err := plugin.GetEvictPods(context.TODO(), &pluginapi.GetEvictPodsRequest{ActivePods: pods}) + if err != nil { + t.Fatalf("evict pods error: %v", err) + } + if evictResp == nil { + t.Fatalf("evict resp nil") + } +} diff --git a/pkg/agent/evictionmanager/plugin/resource/resources.go b/pkg/agent/evictionmanager/plugin/resource/resources.go index 5908a757dd..b7163bb647 100644 --- a/pkg/agent/evictionmanager/plugin/resource/resources.go +++ b/pkg/agent/evictionmanager/plugin/resource/resources.go @@ -108,6 +108,11 @@ func (b *ResourcesEvictionPlugin) Start() { // ThresholdMet evict pods when the beset effort resources usage is greater than // the supply (after considering toleration). func (b *ResourcesEvictionPlugin) ThresholdMet(ctx context.Context, _ *pluginapi.GetThresholdMetRequest) (*pluginapi.ThresholdMetResponse, error) { + var err error + defer func() { + _ = general.UpdateHealthzStateByError(b.pluginName, err) + }() + activePods, err := b.metaServer.GetPodList(ctx, native.PodIsActive) if err != nil { errMsg := fmt.Sprintf("failed to list pods from metaServer: %v", err) diff --git a/pkg/agent/evictionmanager/plugin/resource/zone_resources.go b/pkg/agent/evictionmanager/plugin/resource/zone_resources.go index 0e17453fcf..4f511927d5 100644 --- a/pkg/agent/evictionmanager/plugin/resource/zone_resources.go +++ b/pkg/agent/evictionmanager/plugin/resource/zone_resources.go @@ -121,6 +121,11 @@ func (p *ZoneResourcesPlugin) Start() { // - Skips resources with zero total when configured in skip list. // - Returns HARD_MET with GREATER_THAN semantics when used > threshold(total). 
func (p *ZoneResourcesPlugin) ThresholdMet(ctx context.Context, _ *pluginapi.GetThresholdMetRequest) (*pluginapi.ThresholdMetResponse, error) { + var err error + defer func() { + _ = general.UpdateHealthzStateByError(p.pluginName, err) + }() + activePods, err := p.metaServer.GetPodList(ctx, native.PodIsActive) if err != nil { errWrapped := fmt.Errorf("list pods from metaServer: %w", err) diff --git a/pkg/agent/orm/endpoint/resource_plugin_stub.go b/pkg/agent/orm/endpoint/resource_plugin_stub.go index 990118f7ed..425189b2fe 100644 --- a/pkg/agent/orm/endpoint/resource_plugin_stub.go +++ b/pkg/agent/orm/endpoint/resource_plugin_stub.go @@ -52,6 +52,11 @@ type Stub struct { getTopologyAwareAllocatableResourcesFunc stubGetTopologyAwareAllocatableResourcesFunc getTopologyAwareResourcesFunc stubGetTopologyAwareResourcesFunc + // allocAssociatedDeviceFunc is used for handling associated device allocation requests. + allocAssociatedDeviceFunc stubAllocAssociatedDeviceFunc + // getAssociatedTopologyHintsFunc is used for handling associated device get topology hints requests. 
+ getAssociatedTopologyHintsFunc stubGetAssociatedDeviceTopologyHintsFunc + registrationStatus chan watcherapi.RegistrationStatus // for testing endpoint string // for testing } @@ -69,6 +74,10 @@ type stubGetTopologyAwareAllocatableResourcesFunc func(r *pluginapi.GetTopologyA type stubGetTopologyAwareResourcesFunc func(r *pluginapi.GetTopologyAwareResourcesRequest) (*pluginapi.GetTopologyAwareResourcesResponse, error) +type stubAllocAssociatedDeviceFunc func(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) + +type stubGetAssociatedDeviceTopologyHintsFunc func(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) + func defaultAllocFunc(r *pluginapi.ResourceRequest) (*pluginapi.ResourceAllocationResponse, error) { var response pluginapi.ResourceAllocationResponse @@ -80,6 +89,16 @@ func defaultGetAllocFunc(r *pluginapi.GetResourcesAllocationRequest) (*pluginapi return &response, nil } +func defaultAllocateAssociatedDeviceFunc(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + var response pluginapi.AssociatedDeviceAllocationResponse + return &response, nil +} + +func defaultGetAssociatedDeviceTopologyHintsFunc(r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + var response pluginapi.AssociatedDeviceHintsResponse + return &response, nil +} + // NewResourcePluginStub returns an initialized ResourcePlugin Stub. 
func NewResourcePluginStub(socket string, name string, preStartContainerFlag bool) *Stub {
 	return &Stub{
@@ -89,8 +108,10 @@ func NewResourcePluginStub(socket string, name string, preStartContainerFlag boo
 		stop:   make(chan interface{}),
 
-		allocFunc1: defaultAllocFunc,
-		allocFunc2: defaultGetAllocFunc,
+		allocFunc1:                     defaultAllocFunc,
+		allocFunc2:                     defaultGetAllocFunc,
+		allocAssociatedDeviceFunc:      defaultAllocateAssociatedDeviceFunc,
+		getAssociatedTopologyHintsFunc: defaultGetAssociatedDeviceTopologyHintsFunc,
 	}
 }
@@ -103,6 +124,11 @@ func (m *Stub) SetGetAllocFunc(f stubAllocFunc2) {
 	m.allocFunc2 = f
 }
 
+// SetAssociatedDeviceFunc sets the allocation function for associated devices.
+func (m *Stub) SetAssociatedDeviceFunc(f stubAllocAssociatedDeviceFunc) {
+	m.allocAssociatedDeviceFunc = f
+}
+
 func (m *Stub) SetGetTopologyAwareAllocatableResourcesFunc(f stubGetTopologyAwareAllocatableResourcesFunc) {
 	m.getTopologyAwareAllocatableResourcesFunc = f
 }
@@ -289,9 +315,26 @@ func (m *Stub) GetPodTopologyHints(ctx context.Context, r *pluginapi.PodResource
 	return &pluginapi.PodResourceHintsResponse{}, nil
 }
 
-// Notify the resource plugin that the pod has beed deleted,
+// RemovePod notifies the resource plugin that the pod has been deleted,
 // and the plugin should do some clear-up work.
 func (m *Stub) RemovePod(ctx context.Context, r *pluginapi.RemovePodRequest) (*pluginapi.RemovePodResponse, error) {
 	log.Printf("RemovePod, %+v", r)
 	return &pluginapi.RemovePodResponse{}, nil
 }
+
+func (m *Stub) UpdateAllocatableAssociatedDevices(ctx context.Context, r *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	log.Printf("UpdateAllocatableAssociatedDevices, %+v", r)
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
+
+// AllocateAssociatedDevice is the gRPC implementation of the allocation function for associated devices.
+// It calls the registered allocation function.
+func (m *Stub) AllocateAssociatedDevice(ctx context.Context, r *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + log.Printf("AllocateAssociatedDevice, %+v", r) + return m.allocAssociatedDeviceFunc(r) +} + +func (m *Stub) GetAssociatedDeviceTopologyHints(ctx context.Context, request *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + log.Printf("GetAssociatedDeviceTopologyHints, %+v", request) + return m.getAssociatedTopologyHintsFunc(request) +} diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go index d1de701337..8f1d17d1f4 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go @@ -85,6 +85,8 @@ const ( // and adjust resource requirements and configurations type DynamicPolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer + name string stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go index 471f737075..48ad1a609a 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_hint_handlers.go @@ -259,7 +259,6 @@ func (p *DynamicPolicy) calculateHints( numaBound = minNUMAsCountNeeded + 1 } - preferredHintIndexes := []int{} var availableNumaHints []*pluginapi.TopologyHint machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) { maskCount := mask.Count() @@ -295,10 +294,6 @@ func (p *DynamicPolicy) calculateHints( Nodes: machine.MaskToUInt64Array(mask), Preferred: preferred, }) - - if preferred { - preferredHintIndexes = append(preferredHintIndexes, len(availableNumaHints)-1) - } }) // todo support numa_binding without numa_exclusive in the future diff --git a/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go 
b/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go index e220770356..3417f17615 100644 --- a/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go @@ -58,6 +58,8 @@ const ( // NativePolicy is a policy compatible with Kubernetes native semantics and is used in topology-aware scheduling scenarios. type NativePolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer + name string stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go new file mode 100644 index 0000000000..a63aef6bb8 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base.go @@ -0,0 +1,229 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package baseplugin + +import ( + "fmt" + "sync" + + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +const ( + GPUPluginStateFileName = "gpu_plugin_state" +) + +// BasePlugin is a shared plugin that provides common functionalities and fields for GPU resource plugins and custom device plugins. +type BasePlugin struct { + mu sync.RWMutex + Conf *config.Configuration + + Emitter metrics.MetricEmitter + MetaServer *metaserver.MetaServer + AgentCtx *agent.GenericContext + + PodAnnotationKeptKeys []string + PodLabelKeptKeys []string + + // Map of checkpoints for each sub-plugin + State state.State + // Registry of device topology providers + DeviceTopologyRegistry *machine.DeviceTopologyRegistry + // ShareGPUManager is a manager that manages share GPU devices + ShareGPUManager ShareGPUManager + + // Registry of default resource state generators + DefaultResourceStateGeneratorRegistry *state.DefaultResourceStateGeneratorRegistry + + // Map of specific device name to device type + deviceNameToTypeMap map[string]string +} + +func NewBasePlugin( + agentCtx *agent.GenericContext, conf *config.Configuration, wrappedEmitter metrics.MetricEmitter, +) (*BasePlugin, error) { + return &BasePlugin{ + Conf: conf, + + Emitter: wrappedEmitter, + MetaServer: agentCtx.MetaServer, + AgentCtx: agentCtx, + + PodAnnotationKeptKeys: conf.PodAnnotationKeptKeys, + PodLabelKeptKeys: conf.PodLabelKeptKeys, + + DeviceTopologyRegistry: 
machine.NewDeviceTopologyRegistry(), + DefaultResourceStateGeneratorRegistry: state.NewDefaultResourceStateGeneratorRegistry(), + ShareGPUManager: NewShareGPUManager(), + + deviceNameToTypeMap: make(map[string]string), + }, nil +} + +// InitState initializes the state of the plugin. +func (p *BasePlugin) InitState() error { + stateImpl, err := state.NewCheckpointState(p.Conf.QRMPluginsConfiguration, p.Conf.GenericQRMPluginConfiguration.StateFileDirectory, GPUPluginStateFileName, + gpuconsts.GPUResourcePluginPolicyNameStatic, p.DefaultResourceStateGeneratorRegistry, p.Conf.SkipGPUStateCorruption, p.Emitter) + if err != nil { + return fmt.Errorf("NewCheckpointState failed with error: %v", err) + } + + p.State = stateImpl + return nil +} + +func (p *BasePlugin) Run(stopCh <-chan struct{}) error { + go p.ShareGPUManager.Run(p, stopCh) + return nil +} + +func (p *BasePlugin) PackAllocationResponse( + req *pluginapi.ResourceRequest, allocationInfo *state.AllocationInfo, + resourceAllocationAnnotations map[string]string, resourceName string, +) (*pluginapi.ResourceAllocationResponse, error) { + if allocationInfo == nil { + return nil, fmt.Errorf("packAllocationResponse got nil allocationInfo") + } else if req == nil { + return nil, fmt.Errorf("packAllocationResponse got nil request") + } + + return &pluginapi.ResourceAllocationResponse{ + PodUid: req.PodUid, + PodNamespace: req.PodNamespace, + PodName: req.PodName, + ContainerName: req.ContainerName, + ContainerType: req.ContainerType, + ContainerIndex: req.ContainerIndex, + PodRole: req.PodRole, + PodType: req.PodType, + ResourceName: req.ResourceName, + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + resourceName: { + IsNodeResource: true, + IsScalarResource: true, // to avoid re-allocating + AllocatedQuantity: allocationInfo.AllocatedAllocation.Quantity, + Annotations: resourceAllocationAnnotations, + ResourceHints: &pluginapi.ListOfTopologyHints{ + 
Hints: []*pluginapi.TopologyHint{ + req.Hint, + }, + }, + }, + }, + }, + Labels: general.DeepCopyMap(req.Labels), + Annotations: general.DeepCopyMap(req.Annotations), + }, nil +} + +// UpdateAllocatableAssociatedDevicesByDeviceType updates the topology provider with topology information of the +// given device type. +func (p *BasePlugin) UpdateAllocatableAssociatedDevicesByDeviceType( + request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, deviceType string, +) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + deviceTopology := &machine.DeviceTopology{ + Devices: make(map[string]machine.DeviceInfo, len(request.Devices)), + } + + for _, device := range request.Devices { + var numaNode []int + if device.Topology != nil { + numaNode = make([]int, 0, len(device.Topology.Nodes)) + + for _, node := range device.Topology.Nodes { + if node == nil { + continue + } + numaNode = append(numaNode, int(node.ID)) + } + } + + deviceTopology.Devices[device.ID] = machine.DeviceInfo{ + Health: device.Health, + NumaNodes: numaNode, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + } + } + + err := p.DeviceTopologyRegistry.SetDeviceTopology(deviceType, deviceTopology) + if err != nil { + general.Errorf("set device topology failed with error: %v", err) + return nil, fmt.Errorf("set device topology failed with error: %v", err) + } + + general.Infof("got device %s topology success: %v", request.DeviceName, deviceTopology) + + return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil +} + +// GenerateResourceStateFromPodEntries returns an AllocationMap of a certain resource based on pod entries +// 1. If podEntries is nil, it will get pod entries from state +// 2. 
If the generator is not found, it will return an error +func (p *BasePlugin) GenerateResourceStateFromPodEntries( + resourceName string, + podEntries state.PodEntries, +) (state.AllocationMap, error) { + if podEntries == nil { + podEntries = p.State.GetPodEntries(v1.ResourceName(resourceName)) + } + + generator, ok := p.DefaultResourceStateGeneratorRegistry.GetGenerator(resourceName) + if !ok { + return nil, fmt.Errorf("could not find generator for resource %s", resourceName) + } + + return state.GenerateResourceStateFromPodEntries(podEntries, generator) +} + +func (p *BasePlugin) GenerateMachineStateFromPodEntries( + podResourceEntries state.PodResourceEntries, +) (state.AllocationResourcesMap, error) { + return state.GenerateMachineStateFromPodEntries(podResourceEntries, p.DefaultResourceStateGeneratorRegistry) +} + +// RegisterDeviceNameToType is used to map device name to device type. +// For example, we may have multiple device names for a same device type, e.g. "nvidia.com/gpu" and "nvidia.com/be-gpu", +// so we map them to the same device type, which allows us to allocate them interchangeably. 
+func (p *BasePlugin) RegisterDeviceNameToType(resourceNames []string, deviceType string) { + for _, resourceName := range resourceNames { + p.deviceNameToTypeMap[resourceName] = deviceType + } +} + +func (p *BasePlugin) GetResourceTypeFromDeviceName(deviceName string) (string, error) { + deviceType, ok := p.deviceNameToTypeMap[deviceName] + if !ok { + return "", fmt.Errorf("no device type found for device name %s", deviceName) + } + return deviceType, nil +} + +// RegisterTopologyAffinityProvider is a hook to set device affinity for a certain device type +func (p *BasePlugin) RegisterTopologyAffinityProvider(deviceType string, deviceAffinityProvider machine.DeviceAffinityProvider) { + p.DeviceTopologyRegistry.RegisterTopologyAffinityProvider(deviceType, deviceAffinityProvider) +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go b/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go new file mode 100644 index 0000000000..00f0453cb9 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/base_test.go @@ -0,0 +1,175 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package baseplugin + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func TestBasePlugin_PackAllocationResponse(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + req *pluginapi.ResourceRequest + allocationInfo *state.AllocationInfo + annotations map[string]string + resourceName string + expectedResp *pluginapi.ResourceAllocationResponse + expectedErr bool + }{ + { + name: "nil allocation info", + req: &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + 
ContainerIndex: 0, + }, + resourceName: "test-resource", + expectedErr: true, + }, + { + name: "nil request", + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + }, + resourceName: "test-resource", + expectedErr: true, + }, + { + name: "basic case", + req: &pluginapi.ResourceRequest{ + PodUid: "test-uid", + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + ResourceName: "test-resource", + }, + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + }, + annotations: map[string]string{ + "test-key": "test-value", + }, + resourceName: "test-resource", + expectedResp: &pluginapi.ResourceAllocationResponse{ + PodUid: "test-uid", + PodNamespace: "test", + PodName: "test", + ContainerName: "test", + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: "test-resource", + AllocationResult: &pluginapi.ResourceAllocation{ + ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ + "test-resource": { + IsNodeResource: true, + IsScalarResource: true, + AllocatedQuantity: 4, + Annotations: map[string]string{ + "test-key": "test-value", + }, + ResourceHints: &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: []uint64{0, 1}, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + basePlugin, err := NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + resp, err := basePlugin.PackAllocationResponse(tt.req, tt.allocationInfo, tt.annotations, tt.resourceName) + if tt.expectedErr { + assert.Error(t, err) + } else { + 
assert.NoError(t, err) + assert.Equal(t, tt.expectedResp, resp) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go new file mode 100644 index 0000000000..4fe1d47e79 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager.go @@ -0,0 +1,251 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package baseplugin + +import ( + "context" + "sync" + "time" + + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/util/general" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// This module determines whether each GPU device is eligible for shared usage +// (ShareGPU) based on per-pod extended indicators fetched from MetaServer. It +// periodically scans the GPU machine state maintained by BasePlugin and builds +// a snapshot map `shareGPUMap` that records the ShareGPU decision for each +// device ID. +// Decision rule: Only main containers are considered; if any main container on +// a device disables ShareGPU, the device is marked non-shareable. 
To reduce +// repeated external queries, a per-sync in-memory cache keyed by `pod UID` is +// used to memoize `EnableShareGPU` decisions. + +// ShareGPUManager determines per-device ShareGPU eligibility and maintains a +// periodic snapshot. Safe for concurrent reads via `EnableShareGPU`. +// +// Time Complexity: +// - `sync`: O(D + C) where D is number of devices and C is number of main +// containers scanned; per-pod indicator lookups are amortized O(1) via cache. +// - `EnableShareGPU`: O(1) map read. +// - `Allocate`: O(A) over `TopologyAwareAllocations` entries when marking. +// +// Potential Errors: +// - External indicator fetch may fail; treated conservatively and wrapped. +// - Internal panics are recovered to keep the manager running. +type ShareGPUManager interface { + // Allocate processes an allocation of a main container. If the pod's + // indicator disables ShareGPU, all involved device IDs in + // `TopologyAwareAllocations` are marked non-shareable in the snapshot. + // Params: + // - ctx: request-scoped context for external calls + // - allocationInfo: container allocation metadata; ignored if nil or non-main + // Returns: none; updates internal snapshot + // Errors: any external errors are logged and wrapped internally + Allocate(ctx context.Context, allocationInfo *state.AllocationInfo) + + // EnableShareGPU returns the cached ShareGPU decision for a given device ID. + // Params: + // - id: device ID string + // Returns: true if the device is shareable; false if unknown or disallowed + EnableShareGPU(id string) bool + + // Run starts the periodic synchronization loop until `stopCh` is closed. + // Params: + // - basePlugin: plugin providing machine state and MetaServer + // - stopCh: channel to stop the loop + // Notes: blocking method; should be called in a separate goroutine. 
+	Run(*BasePlugin, <-chan struct{})
+}
+type shareGPUManager struct {
+	sync.RWMutex
+
+	shareGPUMap map[string]bool
+	basePlugin  *BasePlugin
+}
+
+// NewShareGPUManager creates a new ShareGPUManager instance.
+// Returns: a manager with an empty snapshot cache.
+func NewShareGPUManager() ShareGPUManager {
+	return &shareGPUManager{
+		shareGPUMap: make(map[string]bool),
+	}
+}
+
+// EnableShareGPU returns the cached ShareGPU decision for a given device ID.
+// If the device ID is not present in the latest snapshot, it returns false.
+// Complexity: O(1) map lookup.
+func (s *shareGPUManager) EnableShareGPU(id string) bool {
+	s.RLock()
+	defer s.RUnlock()
+
+	return s.shareGPUMap[id]
+}
+
+// Allocate marks involved device IDs as non-shareable if the pod disables
+// ShareGPU. Non-main containers are ignored.
+// Complexity: O(A) where A is number of device IDs in TopologyAwareAllocations.
+func (s *shareGPUManager) Allocate(ctx context.Context, allocationInfo *state.AllocationInfo) {
+	if allocationInfo == nil || !allocationInfo.CheckMainContainer() {
+		return
+	}
+
+	enableShareGPU := s.evaluateContainerDeviceShareStatus(ctx, allocationInfo, nil)
+	if enableShareGPU {
+		return
+	}
+
+	s.Lock()
+	defer s.Unlock()
+	for id := range allocationInfo.TopologyAwareAllocations {
+		s.shareGPUMap[id] = false
+	}
+}
+
+// Run starts the periodic synchronization loop that refreshes ShareGPU
+// decisions. The loop:
+// - immediately performs a synchronization once for faster readiness;
+// - then schedules periodic syncs every 30 seconds;
+// - stops when `stopCh` is closed.
+// Note: The method is blocking; callers should invoke it in a goroutine.
+func (s *shareGPUManager) Run(basePlugin *BasePlugin, stopCh <-chan struct{}) {
+	ctx, cancel := context.WithCancel(context.Background())
+	s.basePlugin = basePlugin
+
+	go func() {
+		<-stopCh
+		cancel()
+	}()
+
+	s.sync(ctx)
+	wait.UntilWithContext(ctx, s.sync, 30*time.Second)
+}
+
+// sync refreshes the ShareGPU decisions by scanning machine state.
+// Complexity: O(D + C) per invocation.
+func (s *shareGPUManager) sync(ctx context.Context) {
+	if s.basePlugin == nil {
+		general.Infof("share gpu manager sync failed, basePlugin is nil")
+		return
+	}
+
+	s.Lock()
+	defer s.Unlock()
+	machineState, ok := s.basePlugin.State.GetMachineState()[gpuconsts.GPUDeviceType]
+	if !ok {
+		general.Infof("share gpu manager found no GPU machine state; skipping")
+		return
+	}
+
+	// Build a fresh snapshot with per-sync indicator cache.
+	shareGPUMap := make(map[string]bool, len(machineState))
+	indicatorCache := make(map[types.UID]bool)
+	for id, alloc := range machineState {
+		shareGPUMap[id] = s.evaluateDeviceShareStatus(ctx, alloc, indicatorCache)
+	}
+
+	s.shareGPUMap = shareGPUMap
+}
+
+// evaluateDeviceShareStatus scans main containers for a device and returns true
+// if and only if all of them enable ShareGPU. Any error retrieving indicators
+// is treated conservatively: the affected container is regarded as
+// non-shareable, which marks the whole device as non-shareable.
+// Complexity: O(Cd) where Cd is number of main containers on the device.
+func (s *shareGPUManager) evaluateDeviceShareStatus(ctx context.Context, alloc *state.AllocationState, cache map[types.UID]bool) bool {
+	if alloc == nil {
+		return false
+	}
+
+	// Default to shareable and short-circuit to false once a disallowed pod is found.
+ for _, containerEntries := range alloc.PodEntries { + for _, container := range containerEntries { + if !container.CheckMainContainer() || container.CheckReclaimed() { + continue + } + + enableShareGPU := s.evaluateContainerDeviceShareStatus(ctx, container, cache) + if !enableShareGPU { + return false + } + } + } + + return true +} + +// evaluateContainerDeviceShareStatus checks a single container's pod-level indicator. +// If a cache is provided, it memoizes decisions by pod UID. +// Complexity: O(1) with cache hit; O(ExternalCall) otherwise. +func (s *shareGPUManager) evaluateContainerDeviceShareStatus(ctx context.Context, container *state.AllocationInfo, cache map[types.UID]bool) bool { + podMeta := s.preparePodMeta(container) + + if cache != nil { + if v, ok := cache[podMeta.UID]; ok { + return v + } + } + + enableShareGPU, err := s.getPodEnableShareGPU(ctx, podMeta) + if err != nil { + general.Infof("share gpu manager: fetching extended indicators failed for pod %s/%s: %v", podMeta.Namespace, podMeta.Name, err) + if cache != nil { + cache[podMeta.UID] = false + } + return false + } + + if cache != nil { + cache[podMeta.UID] = enableShareGPU + } + return enableShareGPU +} + +// getPodEnableShareGPU queries MetaServer for the pod's `EnableShareGPU` indicator. +// Returns: boolean indicator value; error when external call fails. +// Errors: wrapped with context using `pkg/errors`. 
+func (s *shareGPUManager) getPodEnableShareGPU(ctx context.Context, podMeta metav1.ObjectMeta) (bool, error) { + enableShareGPU := false + indicators := v1alpha1.ReclaimResourceIndicators{} + baseLine, err := s.basePlugin.MetaServer.ServiceExtendedIndicator(ctx, podMeta, &indicators) + if err != nil { + return false, errors.Wrapf(err, "ServiceExtendedIndicator failed for pod %s/%s", podMeta.Namespace, podMeta.Name) + } + + if !baseLine && indicators.EnableShareGPU != nil { + enableShareGPU = *indicators.EnableShareGPU + } + + return enableShareGPU, nil +} + +func (s *shareGPUManager) preparePodMeta(info *state.AllocationInfo) metav1.ObjectMeta { + return metav1.ObjectMeta{ + UID: types.UID(info.PodUid), + Namespace: info.PodNamespace, + Name: info.PodName, + Labels: info.Labels, + Annotations: info.Annotations, + } +} diff --git a/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go new file mode 100644 index 0000000000..5f0e491f4c --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/baseplugin/share_gpu_manager_test.go @@ -0,0 +1,329 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package baseplugin + +import ( + "context" + "fmt" + "sync" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" + workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" + apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" + commonstate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + gpustate "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metaserver/spd" +) + +// fakeSPM is a lightweight ServiceProfilingManager for tests. +type fakeSPM struct { + behaviors map[types.UID]struct { + baseline bool + enable *bool + err error + } + calls int +} + +func (f *fakeSPM) ServiceExtendedIndicator(_ context.Context, podMeta metav1.ObjectMeta, indicators interface{}) (bool, error) { + f.calls++ + b := f.behaviors[podMeta.UID] + // fill EnableShareGPU if indicators is of proper type + if ind, ok := indicators.(*v1alpha1.ReclaimResourceIndicators); ok { + ind.EnableShareGPU = b.enable + } + if b.err != nil { + return false, b.err + } + return b.baseline, nil +} + +// Stubs for unused interface methods +func (f *fakeSPM) ServiceBusinessPerformanceLevel(context.Context, metav1.ObjectMeta) (spd.PerformanceLevel, error) { + return spd.PerformanceLevelPerfect, nil +} + +func (f *fakeSPM) ServiceBusinessPerformanceScore(context.Context, metav1.ObjectMeta) (float64, error) { + return spd.MaxPerformanceScore, nil +} + +func (f *fakeSPM) ServiceSystemPerformanceTarget(context.Context, metav1.ObjectMeta) (spd.IndicatorTarget, error) { + return spd.IndicatorTarget{}, nil +} + +func (f *fakeSPM) 
ServiceBaseline(context.Context, metav1.ObjectMeta) (bool, error) { + return false, nil +} + +func (f *fakeSPM) ServiceAggregateMetrics(context.Context, metav1.ObjectMeta, v1.ResourceName, bool, workloadapis.Aggregator, workloadapis.Aggregator) ([]resource.Quantity, error) { + return nil, nil +} +func (f *fakeSPM) Run(context.Context) {} + +// fakeState provides minimal state for tests. +type fakeState struct { + machine gpustate.AllocationResourcesMap +} + +func (f *fakeState) GetMachineState() gpustate.AllocationResourcesMap { + return f.machine +} +func (f *fakeState) GetPodResourceEntries() gpustate.PodResourceEntries { return nil } +func (f *fakeState) GetPodEntries(resourceName v1.ResourceName) gpustate.PodEntries { return nil } +func (f *fakeState) GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *gpustate.AllocationInfo { + return nil +} +func (f *fakeState) SetMachineState(gpustate.AllocationResourcesMap, bool) {} +func (f *fakeState) SetResourceState(v1.ResourceName, gpustate.AllocationMap, bool) {} +func (f *fakeState) SetPodResourceEntries(gpustate.PodResourceEntries, bool) {} +func (f *fakeState) SetAllocationInfo(v1.ResourceName, string, string, *gpustate.AllocationInfo, bool) { +} +func (f *fakeState) Delete(v1.ResourceName, string, string, bool) {} +func (f *fakeState) ClearState() {} +func (f *fakeState) StoreState() error { return nil } + +func makeContainer(podUID, podNS, podName, containerName string, main bool, ids ...string) *gpustate.AllocationInfo { + t := pluginapi.ContainerType_SIDECAR.String() + if main { + t = pluginapi.ContainerType_MAIN.String() + } + topo := make(map[string]gpustate.Allocation) + for _, id := range ids { + topo[id] = gpustate.Allocation{Quantity: 1} + } + return &gpustate.AllocationInfo{ + AllocationMeta: gpustate.AllocationInfo{AllocationMeta: commonstate.AllocationMeta{ + PodUid: podUID, + PodNamespace: podNS, + PodName: podName, + ContainerName: containerName, + ContainerType: t, + 
}}.AllocationMeta, + AllocatedAllocation: gpustate.Allocation{Quantity: 1}, + TopologyAwareAllocations: topo, + } +} + +func Test_EnableShareGPU_DefaultFalse(t *testing.T) { + t.Parallel() + m := NewShareGPUManager().(*shareGPUManager) + if got := m.EnableShareGPU("non-existent"); got { + t.Fatalf("expected false for unknown id, got true") + } +} + +func Test_Allocate_MarkFalse_OnIndicatorFalse(t *testing.T) { + t.Parallel() + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: ptrBool(false)}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u1", "ns", "pod", "c", true, "GPU-1", "GPU-2") + m.Allocate(context.Background(), alloc) + + if m.EnableShareGPU("GPU-1") || m.EnableShareGPU("GPU-2") { + t.Fatalf("expected devices marked false after Allocate") + } +} + +func Test_Sync_BuildsSnapshot_WithCache(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + + // Build machine state: one device with two main containers of same pod + ce := gpustate.ContainerEntries{ + "c1": makeContainer("u1", "ns", "pod", "c1", true, "GPU-1"), + "c2": makeContainer("u1", "ns", "pod", "c2", true, "GPU-1"), + } + allocState := &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": ce}} + machine := gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): gpustate.AllocationMap{"GPU-1": allocState}} + + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = &BasePlugin{MetaServer: bp.MetaServer, State: &fakeState{machine: machine}} + m.sync(context.Background()) + + if 
!m.EnableShareGPU("GPU-1") { + t.Fatalf("expected GPU-1 shareable") + } + if spm.calls != 1 { + t.Fatalf("expected 1 indicator call due to cache, got %d", spm.calls) + } +} + +func Test_Concurrency_Safety(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u1", "ns", "pod", "c", true, "GPU-1") + + var wg sync.WaitGroup + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Writer goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + m.Allocate(ctx, alloc) + time.Sleep(time.Millisecond) + } + }() + + // Reader goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 100; i++ { + _ = m.EnableShareGPU("GPU-1") + time.Sleep(time.Millisecond) + } + }() + + wg.Wait() +} + +func Test_Run_StartsAndStops(t *testing.T) { + t.Parallel() + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}, State: &fakeState{machine: gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): gpustate.AllocationMap{"GPU-1": &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": gpustate.ContainerEntries{"c": makeContainer("u1", "ns", "pod", "c", true, "GPU-1")}}}}}}} + m := NewShareGPUManager() + + stopCh := make(chan struct{}) + go m.Run(bp, stopCh) + time.Sleep(50 * time.Millisecond) + close(stopCh) +} + +func Test_Allocate_Ignores_NonMain(t *testing.T) { + t.Parallel() + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable 
*bool + err error + }{}} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = bp + + alloc := makeContainer("u2", "ns", "pod", "c", false, "GPU-1") + m.Allocate(context.Background(), alloc) + if m.EnableShareGPU("GPU-1") { + t.Fatalf("non-main container should not mark device") + } +} + +func Test_evaluateDeviceShareStatus_Ignores_reclaimed(t *testing.T) { + t.Parallel() + m := NewShareGPUManager().(*shareGPUManager) + alloc := makeContainer("u2", "ns", "pod", "c", true, "GPU-1") + alloc.QoSLevel = apiconsts.PodAnnotationQoSLevelReclaimedCores + as := &gpustate.AllocationState{ + PodEntries: map[string]gpustate.ContainerEntries{ + "u2": { + "c": alloc, + }, + }, + } + + if !m.evaluateDeviceShareStatus(context.Background(), as, make(map[types.UID]bool)) { + t.Fatalf("reclaimed container should not mark device") + } +} + +func ptrBool(b bool) *bool { v := b; return &v } + +func BenchmarkSync_WithIndicatorCache(b *testing.B) { + enable := ptrBool(true) + spm := &fakeSPM{behaviors: map[types.UID]struct { + baseline bool + enable *bool + err error + }{ + types.UID("u1"): {baseline: false, enable: enable}, + }} + bp := &BasePlugin{MetaServer: &metaserver.MetaServer{ServiceProfilingManager: spm}} + + // Build a large machine state: N devices, each with M main containers from the same pod + const N, M = 50, 20 + am := make(gpustate.AllocationMap, N) + for i := 0; i < N; i++ { + ce := make(gpustate.ContainerEntries, M) + for j := 0; j < M; j++ { + cname := fmt.Sprintf("c%c", 'A'+j) + ce[cname] = makeContainer("u1", "ns", "pod", cname, true, fmt.Sprintf("GPU-%c", 'A'+i)) + } + am[fmt.Sprintf("GPU-%c", 'A'+i)] = &gpustate.AllocationState{PodEntries: gpustate.PodEntries{"u1": ce}} + } + machine := gpustate.AllocationResourcesMap{v1.ResourceName(gpuconsts.GPUDeviceType): am} + + m := NewShareGPUManager().(*shareGPUManager) + m.basePlugin = &BasePlugin{MetaServer: bp.MetaServer, 
State: &fakeState{machine: machine}} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + spm.calls = 0 + m.sync(context.Background()) + } + + // Expect only 1 external call per sync due to cache (one unique pod UID) + if spm.calls != 1 { + b.Fatalf("expected 1 indicator call per sync, got %d", spm.calls) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/consts/consts.go b/pkg/agent/qrm-plugins/gpu/consts/consts.go new file mode 100644 index 0000000000..e8ca95567b --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/consts/consts.go @@ -0,0 +1,55 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package consts
+
+import (
+	"time"
+
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
+)
+
+// AllocatedResource wraps a topology-aware resource that has already been
+// allocated, together with the identity of the owning pod.
+type AllocatedResource struct {
+	ResourceName string
+	PodName      string
+	PodNamespace string
+	*pluginapi.TopologyAwareResource
+}
+
+// AllocatableResource wraps an allocatable topology-aware resource reported
+// for this node.
+type AllocatableResource struct {
+	ResourceName string
+	*pluginapi.AllocatableTopologyAwareResource
+}
+
+const (
+	// GPUDeviceType and RDMADeviceType are the device-type keys used when
+	// registering topology providers and resource-state generators.
+	GPUDeviceType  = "gpu_device"
+	RDMADeviceType = "rdma_device"
+
+	// GPUResourcePluginPolicyNameStatic is the policy name of static gpu resource plugin
+	GPUResourcePluginPolicyNameStatic = string(consts.ResourcePluginPolicyNameStatic)
+
+	// GPUPluginDynamicPolicyName is the fully qualified plugin+policy name;
+	// ClearResidualState is the name of the residual-state cleanup job derived from it.
+	GPUPluginDynamicPolicyName = qrm.QRMPluginNameGPU + "_" + GPUResourcePluginPolicyNameStatic
+	ClearResidualState         = GPUPluginDynamicPolicyName + "_clear_residual_state"
+
+	// GPUMemPluginName is the resource-plugin name for gpu memory.
+	GPUMemPluginName = "gpu_mem_resource_plugin"
+
+	// StateCheckPeriod / StateCheckTolerationTimes / MaxResidualTime tune residual-state
+	// cleanup — presumably an entry must be seen as residual several consecutive checks
+	// (or exceed MaxResidualTime) before being cleared; TODO confirm against the consumer.
+	StateCheckPeriod          = 30 * time.Second
+	StateCheckTolerationTimes = 3
+	MaxResidualTime           = 5 * time.Minute
+)
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go
new file mode 100644
index 0000000000..b2b5aa6e3b
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/custom_device_plugin_stub.go
@@ -0,0 +1,98 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package customdeviceplugin
+
+import (
+	"context"
+	"fmt"
+
+	v1 "k8s.io/api/core/v1"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+)
+
+// CustomDevicePluginStub is a trivial CustomDevicePlugin implementation that only
+// allocates its device when the accompany resource has already been allocated.
+type CustomDevicePluginStub struct {
+	*baseplugin.BasePlugin
+}
+
+// NewCustomDevicePluginStub returns a stub plugin backed by the given base plugin.
+func NewCustomDevicePluginStub(base *baseplugin.BasePlugin) CustomDevicePlugin {
+	return &CustomDevicePluginStub{
+		BasePlugin: base,
+	}
+}
+
+// DeviceNames returns the single device name served by this stub.
+func (c CustomDevicePluginStub) DeviceNames() []string {
+	return []string{"custom-device-plugin-stub"}
+}
+
+// GetAssociatedDeviceTopologyHints returns an empty hints response.
+func (c CustomDevicePluginStub) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
+	return &pluginapi.AssociatedDeviceHintsResponse{}, nil
+}
+
+// UpdateAllocatableAssociatedDevices returns an empty response without recording anything.
+func (c CustomDevicePluginStub) UpdateAllocatableAssociatedDevices(context.Context, *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
+
+// DefaultAccompanyResourceName returns the stub accompany resource name.
+func (c CustomDevicePluginStub) DefaultAccompanyResourceName() string {
+	return "resource-plugin-stub"
+}
+
+// AllocateAssociatedDevice allocates the associated device only if the accompany
+// resource has already been allocated for this container.
+func (c CustomDevicePluginStub) AllocateAssociatedDevice(_ context.Context, resReq *pluginapi.ResourceRequest, _ *pluginapi.DeviceRequest, accompanyResourceName string) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
+	// Guard clause: without a prior accompany-resource allocation there is nothing to do.
+	accompanyResourceAllocation := c.State.GetAllocationInfo(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName)
+	if accompanyResourceAllocation == nil {
+		return nil, fmt.Errorf("accompany resource %s has not been allocated", accompanyResourceName)
+	}
+	// NOTE(review): this overwrites the accompany resource's allocation with an empty
+	// AllocationInfo — looks intentional for a stub, but confirm before reuse outside tests.
+	c.State.SetAllocationInfo(v1.ResourceName(accompanyResourceName), resReq.PodUid, resReq.ContainerName, &state.AllocationInfo{}, false)
+	return &pluginapi.AssociatedDeviceAllocationResponse{}, nil
+}
+
+// CustomDevicePluginStub2 is a stub that unconditionally allocates its device.
+type CustomDevicePluginStub2 struct {
+	*baseplugin.BasePlugin
+}
+
+// NewCustomDevicePluginStub2 returns the unconditionally-allocating stub plugin.
+func NewCustomDevicePluginStub2(base *baseplugin.BasePlugin) CustomDevicePlugin {
+	return &CustomDevicePluginStub2{
+		BasePlugin: base,
+	}
+}
+
+// DeviceNames returns the single device name served by this stub.
+func (c CustomDevicePluginStub2) DeviceNames() []string {
+	return []string{"custom-device-plugin-stub-2"}
+}
+
+// GetAssociatedDeviceTopologyHints returns an empty hints response.
+func (c CustomDevicePluginStub2) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
+	return &pluginapi.AssociatedDeviceHintsResponse{}, nil
+}
+
+// UpdateAllocatableAssociatedDevices returns an empty response without recording anything.
+func (c CustomDevicePluginStub2) UpdateAllocatableAssociatedDevices(context.Context, *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	return &pluginapi.UpdateAllocatableAssociatedDevicesResponse{}, nil
+}
+
+// DefaultAccompanyResourceName returns the stub accompany resource name.
+func (c CustomDevicePluginStub2) DefaultAccompanyResourceName() string {
+	return "resource-plugin-stub"
+}
+
+// AllocateAssociatedDevice simply records an empty allocation for the requested
+// device and reports success unconditionally.
+func (c CustomDevicePluginStub2) AllocateAssociatedDevice(_ context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
+	c.State.SetAllocationInfo(v1.ResourceName(deviceReq.DeviceName), resReq.PodUid, resReq.ContainerName, &state.AllocationInfo{}, false)
+	return &pluginapi.AssociatedDeviceAllocationResponse{}, nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
new file mode 100644
index 0000000000..a9ba9530a4
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go
@@ -0,0 +1,229 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package gpu
+
+import (
+	"context"
+	"fmt"
+
+	v1 "k8s.io/api/core/v1"
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+
+	"github.com/kubewharf/katalyst-api/pkg/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
+	gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
+	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager"
+	qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/machine"
+)
+
+// GPUCustomDevicePluginName is the registration name of the gpu custom device plugin.
+const GPUCustomDevicePluginName = "gpu-custom-device-plugin"
+
+const (
+	// defaultAccompanyResourceName is the resource (gpu memory) whose allocation
+	// normally precedes gpu device allocation.
+	defaultAccompanyResourceName = string(consts.ResourceGPUMemory)
+)
+
+// GPUDevicePlugin allocates gpu devices as associated devices of the gpu memory resource.
+type GPUDevicePlugin struct {
+	*baseplugin.BasePlugin
+	deviceNames []string
+}
+
+// NewGPUDevicePlugin registers the gpu topology provider and default resource-state
+// generator on the base plugin and returns the gpu custom device plugin.
+func NewGPUDevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin {
+	gpuTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.GPUDeviceNames)
+	base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider)
+	base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.GPUDeviceType,
+		state.NewGenericDefaultResourceStateGenerator(gpuconsts.GPUDeviceType, base.DeviceTopologyRegistry))
+	base.RegisterDeviceNameToType(base.Conf.GPUDeviceNames, gpuconsts.GPUDeviceType)
+
+	return &GPUDevicePlugin{
+		BasePlugin:  base,
+		deviceNames: base.Conf.GPUDeviceNames,
+	}
+}
+
+// DefaultAccompanyResourceName returns the gpu memory resource name.
+func (p *GPUDevicePlugin) DefaultAccompanyResourceName() string {
+	return defaultAccompanyResourceName
+}
+
+// DeviceNames returns the configured gpu device names.
+func (p *GPUDevicePlugin) DeviceNames() []string {
+	return p.deviceNames
+}
+
+// UpdateAllocatableAssociatedDevices records the allocatable gpu devices and their topology.
+func (p *GPUDevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
+	return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.GPUDeviceType)
+}
+
+// GetAssociatedDeviceTopologyHints returns an empty hints response.
+func (p *GPUDevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
+	return &pluginapi.AssociatedDeviceHintsResponse{}, nil
+}
+
+// AllocateAssociatedDevice allocates gpu devices for the requesting container.
+// It reuses an existing gpu allocation when one is recorded, otherwise derives
+// the devices from the gpu memory allocation, falling back to the gpu allocate
+// strategy when no gpu memory allocation exists; the result is persisted in
+// state and propagated to the share-gpu manager.
+func (p *GPUDevicePlugin) AllocateAssociatedDevice(
+	ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, _ string,
+) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
+	qosLevel, err := qrmutil.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys)
+	if err != nil {
+		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
+			resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err)
+		general.Errorf("%s", err.Error())
+		return nil, err
+	}
+
+	general.InfoS("called",
+		"podNamespace", resReq.PodNamespace,
+		"podName", resReq.PodName,
+		"containerName", resReq.ContainerName,
+		"qosLevel", qosLevel,
+		"reqAnnotations", resReq.Annotations,
+		"resourceRequests", resReq.ResourceRequests,
+		"deviceName", deviceReq.DeviceName,
+		"resourceHint", resReq.Hint,
+		"deviceHint", deviceReq.Hint,
+		"availableDevices", deviceReq.AvailableDevices,
+		"reusableDevices", deviceReq.ReusableDevices,
+		"deviceRequest", deviceReq.DeviceRequest,
+	)
+
+	// Fast path: an earlier gpu allocation for this container is already recorded.
+	gpuAllocationInfo := p.State.GetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName)
+	if gpuAllocationInfo != nil {
+		if gpuAllocationInfo.TopologyAwareAllocations == nil {
+			return nil, fmt.Errorf("GPU topology aware allocation info is nil")
+		}
+		allocatedDevices := make([]string, 0, len(gpuAllocationInfo.TopologyAwareAllocations))
+		for gpuID := range gpuAllocationInfo.TopologyAwareAllocations {
+			allocatedDevices = append(allocatedDevices, gpuID)
+		}
+		return &pluginapi.AssociatedDeviceAllocationResponse{
+			AllocationResult: &pluginapi.AssociatedDeviceAllocation{
+				AllocatedDevices: allocatedDevices,
+			},
+		}, nil
+	}
+
+	// Fetch the gpu topology once up front; both allocation paths below need it
+	// (previously it was fetched twice on the strategy path).
+	gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType)
+	if err != nil {
+		general.Warningf("failed to get gpu topology: %v", err)
+		return nil, fmt.Errorf("failed to get gpu topology: %w", err)
+	}
+	if !numaTopologyReady {
+		general.Warningf("numa topology is not ready")
+		return nil, fmt.Errorf("numa topology is not ready")
+	}
+
+	var allocatedDevices []string
+	memoryAllocationInfo := p.State.GetAllocationInfo(v1.ResourceName(defaultAccompanyResourceName), resReq.PodUid, resReq.ContainerName)
+	// GPU memory should have been allocated at this stage. When it has, gpu devices
+	// were chosen alongside it, so reuse those; otherwise run the allocate strategy.
+	if memoryAllocationInfo == nil || memoryAllocationInfo.TopologyAwareAllocations == nil {
+		general.InfoS("GPU memory allocation info is nil, invoking GPU allocate strategy",
+			"podNamespace", resReq.PodNamespace,
+			"podName", resReq.PodName,
+			"containerName", resReq.ContainerName)
+
+		// Use the strategy framework to allocate GPU devices
+		result, err := manager.AllocateGPUUsingStrategy(
+			resReq,
+			deviceReq,
+			gpuTopology,
+			p.Conf.GPUQRMPluginConfig,
+			p.Emitter,
+			p.MetaServer,
+			p.State.GetMachineState(),
+			qosLevel,
+		)
+		if err != nil {
+			return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err)
+		}
+		if !result.Success {
+			return nil, fmt.Errorf("GPU allocation failed: %s", result.ErrorMessage)
+		}
+		allocatedDevices = result.AllocatedDevices
+	} else {
+		for gpuID := range memoryAllocationInfo.TopologyAwareAllocations {
+			allocatedDevices = append(allocatedDevices, gpuID)
+		}
+	}
+
+	// Save gpu device allocations in state.
+	numaNodes := machine.NewCPUSet()
+	gpuDeviceTopologyAwareAllocations := make(map[string]state.Allocation, len(allocatedDevices))
+	for _, deviceID := range allocatedDevices {
+		info, ok := gpuTopology.Devices[deviceID]
+		if !ok {
+			return nil, fmt.Errorf("failed to get gpu info for device: %s", deviceID)
+		}
+
+		gpuDeviceTopologyAwareAllocations[deviceID] = state.Allocation{
+			Quantity:  1,
+			NUMANodes: info.NumaNodes,
+		}
+		numaNodes.Add(info.NumaNodes...)
+	}
+
+	gpuDeviceAllocationInfo := &state.AllocationInfo{
+		AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel),
+		AllocatedAllocation: state.Allocation{
+			Quantity:  float64(len(allocatedDevices)),
+			NUMANodes: numaNodes.ToSliceInt(),
+		},
+	}
+	gpuDeviceAllocationInfo.TopologyAwareAllocations = gpuDeviceTopologyAwareAllocations
+
+	// TODO: state can be updated using the actual resource name
+	p.State.SetAllocationInfo(gpuconsts.GPUDeviceType, resReq.PodUid, resReq.ContainerName, gpuDeviceAllocationInfo, false)
+	resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.GPUDeviceType, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate gpu device state from pod entries: %v", err)
+	}
+	p.State.SetResourceState(gpuconsts.GPUDeviceType, resourceState, true)
+
+	general.InfoS("allocated gpu devices",
+		"podNamespace", resReq.PodNamespace,
+		"podName", resReq.PodName,
+		"containerName", resReq.ContainerName,
+		"qosLevel", qosLevel,
+		"allocatedDevices", allocatedDevices)
+
+	// call shareGPUManager to update GPU device state
+	p.BasePlugin.ShareGPUManager.Allocate(ctx, gpuDeviceAllocationInfo)
+
+	return &pluginapi.AssociatedDeviceAllocationResponse{
+		AllocationResult: &pluginapi.AssociatedDeviceAllocation{
+			AllocatedDevices: allocatedDevices,
+		},
+	}, nil
+}
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go
new file mode 100644
index 0000000000..621401afaf
--- /dev/null
+++ 
b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu_test.go @@ -0,0 +1,330 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpu + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + 
metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + conf.GPUDeviceNames = []string{"test-gpu"} + + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + basePlugin.State = stateImpl + + return basePlugin +} + +func TestGPUDevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewGPUDevicePlugin(basePlugin) + + // Update topology with associated devices + req := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "test-gpu", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-gpu-0", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 0, + }, + }, + }, + }, + { + ID: "test-gpu-1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + }, + }, + } + + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify device topology is updated + gpuDevicePlugin := devicePlugin.(*GPUDevicePlugin) + deviceTopology, numaTopologyReady, err := 
gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + assert.NoError(t, err) + assert.True(t, numaTopologyReady) + assert.NotNil(t, deviceTopology) + + expectedDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + "test-gpu-1": { + NumaNodes: []int{1}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + }, + } + + assert.Equal(t, expectedDeviceTopology, deviceTopology) +} + +func TestGPUDevicePlugin_AllocateAssociatedDevice(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + allocationInfo *state.AllocationInfo + accompanyResourceAllocationInfo *state.AllocationInfo + deviceReq *pluginapi.DeviceRequest + deviceTopology *machine.DeviceTopology + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceAllocationResponse + }{ + { + name: "Allocation already exists", + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 1, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 1, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-gpu-0", "test-gpu-1"}, + }, + }, + }, + { + name: "gpu memory allocation exists", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: 
[]int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 2, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 2, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + }, + "test-gpu-1": { + NumaNodes: []int{1}, + }, + "test-gpu-2": { + NumaNodes: []int{0}, + }, + "test-gpu-3": { + NumaNodes: []int{1}, + }, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-gpu-0", "test-gpu-1"}, + }, + }, + }, + { + name: "device topology does not exist", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 2, + NUMANodes: []int{0}, + }, + "test-gpu-1": { + Quantity: 2, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-gpu", + AvailableDevices: []string{"test-gpu-2", "test-gpu-3"}, + ReusableDevices: []string{"test-gpu-2", "test-gpu-3"}, + DeviceRequest: 2, + }, + expectedErr: true, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewGPUDevicePlugin(basePlugin) + + if tt.allocationInfo != nil { + basePlugin.State.SetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) + } + + if 
tt.accompanyResourceAllocationInfo != nil { + basePlugin.State.SetAllocationInfo(v1.ResourceName(defaultAccompanyResourceName), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false) + } + + if tt.deviceTopology != nil { + err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.GPUDeviceType, tt.deviceTopology) + assert.NoError(t, err) + } + + resourceReq := &pluginapi.ResourceRequest{ + PodUid: tt.podUID, + ContainerName: tt.containerName, + } + + resp, err := devicePlugin.AllocateAssociatedDevice(context.Background(), resourceReq, tt.deviceReq, "test") + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateAllocatedDevicesResult(t, tt.expectedResp, resp) + + // Verify state is updated + allocationInfo := basePlugin.State.GetAllocationInfo(gpuconsts.GPUDeviceType, tt.podUID, tt.containerName) + assert.NotNil(t, allocationInfo) + } + }) + } +} + +func evaluateAllocatedDevicesResult(t *testing.T, expectedResp, actualResp *pluginapi.AssociatedDeviceAllocationResponse) { + if expectedResp.AllocationResult == nil && actualResp.AllocationResult == nil { + return + } + + if expectedResp.AllocationResult != nil && actualResp.AllocationResult == nil { + t.Errorf("expected allocation result %v, but got nil", expectedResp.AllocationResult) + return + } + + if actualResp.AllocationResult != nil && expectedResp.AllocationResult == nil { + t.Errorf("expected nil allocation result, but got %v", actualResp.AllocationResult) + return + } + + assert.ElementsMatch(t, expectedResp.AllocationResult.AllocatedDevices, actualResp.AllocationResult.AllocatedDevices) +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go new file mode 100644 index 0000000000..ea446e2df6 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/interface.go @@ -0,0 +1,38 @@ +/* +Copyright 2022 The Katalyst Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package customdeviceplugin
+
+import (
+	"context"
+
+	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
+)
+
+// CustomDevicePlugin is implemented by device-specific plugins (e.g. gpu, rdma,
+// and the test stubs) that manage associated devices for the QRM gpu plugin.
+type CustomDevicePlugin interface {
+	// DeviceNames returns a list of all possible names for this device
+	DeviceNames() []string
+
+	// GetAssociatedDeviceTopologyHints returns topology hints for the requested associated device.
+	GetAssociatedDeviceTopologyHints(ctx context.Context, request *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error)
+
+	// UpdateAllocatableAssociatedDevices refreshes the allocatable devices and their topology.
+	UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error)
+
+	// DefaultAccompanyResourceName returns the resource whose allocation accompanies this
+	// device's allocation; implementations may return an empty string when none is required.
+	DefaultAccompanyResourceName() string
+
+	// AllocateAssociatedDevice performs the device allocation for the given resource
+	// request and device request.
+	AllocateAssociatedDevice(
+		ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string,
+	) (*pluginapi.AssociatedDeviceAllocationResponse, error)
+}
diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go
new file mode 100644
index 0000000000..b04d9a014e
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go
@@ -0,0 +1,309 @@
+/*
+Copyright 2022 The Katalyst Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rdma + +import ( + "context" + "fmt" + "math" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +const RDMACustomDevicePluginName = "rdma-custom-device-plugin" + +type RDMADevicePlugin struct { + *baseplugin.BasePlugin + deviceNames []string +} + +func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { + rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.RDMADeviceNames) + base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider) + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType, + state.NewGenericDefaultResourceStateGenerator(gpuconsts.RDMADeviceType, base.DeviceTopologyRegistry)) + base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType) + + return 
&RDMADevicePlugin{ + BasePlugin: base, + deviceNames: base.Conf.RDMADeviceNames, + } +} + +func (p *RDMADevicePlugin) DefaultAccompanyResourceName() string { + return "" +} + +func (p *RDMADevicePlugin) DeviceNames() []string { + return p.deviceNames +} + +func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType) +} + +func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +// AllocateAssociatedDevice check if rdma is allocated to other containers, make sure they do not share rdma +func (p *RDMADevicePlugin) AllocateAssociatedDevice( + ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + general.InfoS("called", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", resReq.Annotations, + "resourceRequests", resReq.ResourceRequests, + "deviceName", deviceReq.DeviceName, + "resourceHint", resReq.Hint, + "deviceHint", deviceReq.Hint, + "availableDevices", deviceReq.AvailableDevices, + "reusableDevices", deviceReq.ReusableDevices, + "deviceRequest", 
 deviceReq.DeviceRequest,
+	)
+
+	// Check if there is state for the device name
+	rdmaAllocationInfo := p.State.GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName)
+	if rdmaAllocationInfo != nil && rdmaAllocationInfo.TopologyAwareAllocations != nil {
+		allocatedDevices := make([]string, 0, len(rdmaAllocationInfo.TopologyAwareAllocations))
+		for rdmaID := range rdmaAllocationInfo.TopologyAwareAllocations {
+			allocatedDevices = append(allocatedDevices, rdmaID)
+		}
+		return &pluginapi.AssociatedDeviceAllocationResponse{
+			AllocationResult: &pluginapi.AssociatedDeviceAllocation{
+				AllocatedDevices: allocatedDevices,
+			},
+		}, nil
+	}
+
+	rdmaTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get rdma device topology: %v", err)
+	}
+	if !numaTopologyReady {
+		return nil, fmt.Errorf("rdma device topology is not ready")
+	}
+
+	hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...)
+ if err != nil { + general.Warningf("failed to get hint nodes: %v", err) + return nil, err + } + + var allocatedRdmaDevices []string + + // No accompany resource name + if accompanyResourceName == "" { + allocatedRdmaDevices, err = p.allocateWithNoAccompanyResource(deviceReq, rdmaTopology, hintNodes) + if err != nil { + return nil, fmt.Errorf("failed to allocate with no accompany resource: %v", err) + } + } else { + allocatedRdmaDevices, err = p.allocateWithAccompanyResource(deviceReq, resReq, accompanyResourceName) + if err != nil { + return nil, fmt.Errorf("failed to allocate with accompany resource: %v", err) + } + } + + // Modify rdma state + topologyAwareAllocations := make(map[string]state.Allocation) + for _, deviceID := range allocatedRdmaDevices { + info, ok := rdmaTopology.Devices[deviceID] + if !ok { + return nil, fmt.Errorf("failed to get rdma info for device %s", deviceID) + } + + topologyAwareAllocations[deviceID] = state.Allocation{ + Quantity: 1, + NUMANodes: info.GetNUMANodes(), + } + } + + allocationInfo := &state.AllocationInfo{ + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel), + AllocatedAllocation: state.Allocation{ + Quantity: 1, + NUMANodes: hintNodes.ToSliceInt(), + }, + } + + allocationInfo.TopologyAwareAllocations = topologyAwareAllocations + p.State.SetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName, allocationInfo, false) + resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.RDMADeviceType, nil) + if err != nil { + return nil, fmt.Errorf("failed to generate rdma device state from pod entries: %v", err) + } + + p.State.SetResourceState(gpuconsts.RDMADeviceType, resourceState, true) + + general.InfoS("allocated rdma devices", + "podNamespace", resReq.PodNamespace, + "podName", resReq.PodName, + "containerName", resReq.ContainerName, + "qosLevel", qosLevel, + "allocatedRdmaDevices", allocatedRdmaDevices) + + return 
&pluginapi.AssociatedDeviceAllocationResponse{
+		AllocationResult: &pluginapi.AssociatedDeviceAllocation{
+			AllocatedDevices: allocatedRdmaDevices,
+		},
+	}, nil
+}
+
+// allocateWithNoAccompanyResource allocates the RDMA devices on a best-effort basis,
+// making sure that they fit the hint nodes.
+func (p *RDMADevicePlugin) allocateWithNoAccompanyResource(
+	deviceReq *pluginapi.DeviceRequest, rdmaTopology *machine.DeviceTopology, hintNodes machine.CPUSet,
+) ([]string, error) {
+	reqQuantity := deviceReq.GetDeviceRequest()
+
+	machineState, ok := p.State.GetMachineState()[gpuconsts.RDMADeviceType]
+	if !ok {
+		return nil, fmt.Errorf("no machine state for resource %s", gpuconsts.RDMADeviceType)
+	}
+
+	allocatedDevices := sets.NewString()
+	allocateDevices := func(devices ...string) bool {
+		for _, device := range devices {
+			allocatedDevices.Insert(device)
+			if allocatedDevices.Len() >= int(reqQuantity) {
+				return true
+			}
+		}
+		return false
+	}
+
+	availableDevices := deviceReq.GetAvailableDevices()
+	reusableDevices := deviceReq.GetReusableDevices()
+
+	// allocate reusable devices first
+	allocated := allocateDevices(reusableDevices...)
+ if allocated { + return allocatedDevices.UnsortedList(), nil + } + + for _, device := range availableDevices { + if !gpuutil.IsNUMAAffinityDevice(device, rdmaTopology, hintNodes) { + continue + } + + if !machineState.IsRequestSatisfied(device, 1, 1) { + general.Infof("available numa affinity rdma %s is already allocated", device) + continue + } + + if allocateDevices(device) { + return allocatedDevices.UnsortedList(), nil + } + } + + return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices)) +} + +// allocateWithAccompanyResource allocates the rdma devices by first allocating the reusable devices, then allocating the +// available devices proportionally by ensuring NUMA affinity with the accompany resource +func (p *RDMADevicePlugin) allocateWithAccompanyResource( + deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string, +) ([]string, error) { + var err error + + // Find out the accompany devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of accompany device + accompanyDeviceType, err := p.GetResourceTypeFromDeviceName(accompanyResourceName) + if err != nil { + return nil, fmt.Errorf("failed to get device type for accompany resource %s: %v", accompanyResourceName, err) + } + + // Allocate all the reusable devices first + allocatedDevices := sets.NewString(deviceReq.ReusableDevices...) 
+ + // Get ratio of accompany resource to target device + accompanyResourceToTargetDeviceRatio := p.State.GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyDeviceType, gpuconsts.RDMADeviceType) + + // Allocate target device according to ratio of accompany resource to target device + podResourceEntries := p.State.GetPodResourceEntries() + totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName) + + rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) * accompanyResourceToTargetDeviceRatio)) + + // For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same + // numa nodes as the gpu and allocate them + accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyDeviceType, gpuconsts.RDMADeviceType) + if err != nil { + general.Warningf("failed to get gpu to rdma affinity map: %v", err) + return nil, err + } + + machineState := p.State.GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)] + + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + if allocatedDevices.Len() >= rdmaToBeAllocated { + return true + } + allocatedDevices.Insert(device) + } + if allocatedDevices.Len() >= rdmaToBeAllocated { + return true + } + return false + } + + for accompanyResourceId := range accompanyResourceIds { + rdmaDevices, ok := accompanyResourceToRdmaAffinityMap[accompanyResourceId] + if !ok { + general.Warningf("failed to get rdma device with accompany device id: %s", accompanyResourceId) + continue + } + + // Iterate through the rdma devices and check if they are already allocated + for _, rdmaDevice := range rdmaDevices { + if !machineState.IsRequestSatisfied(rdmaDevice, 1, 1) { + continue + } + + if allocateDevices(rdmaDevice) { + return allocatedDevices.UnsortedList(), nil + } + } + } + + // Did not find enough 
available rdma devices to allocate, return the devices that are already allocated + return allocatedDevices.UnsortedList(), nil +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go new file mode 100644 index 0000000000..37975a32cc --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma_test.go @@ -0,0 +1,563 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rdma + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = 
[]string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func makeTestBasePlugin(t *testing.T) *baseplugin.BasePlugin { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + conf.RDMADeviceNames = []string{"test-rdma"} + + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, metrics.DummyMetrics{}) + assert.NoError(t, err) + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + basePlugin.State = stateImpl + + // Register gpu device type and gpu device topology provider as it is an accompany resource for rdma + basePlugin.RegisterDeviceNameToType([]string{"test-gpu"}, gpuconsts.GPUDeviceType) + gpuTopologyProvider := machine.NewDeviceTopologyProvider([]string{"test-gpu"}) + basePlugin.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.GPUDeviceType, gpuTopologyProvider) + + return basePlugin +} + +func TestRDMADevicePlugin_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewRDMADevicePlugin(basePlugin) + + // Update topology with associated devices + req := 
&pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "test-rdma", + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-rdma-0", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 0, + }, + }, + }, + }, + { + ID: "test-rdma-1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + }, + }, + } + + resp, err := devicePlugin.UpdateAllocatableAssociatedDevices(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify device topology is updated + gpuDevicePlugin := devicePlugin.(*RDMADevicePlugin) + deviceTopology, numaTopologyReady, err := gpuDevicePlugin.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType) + assert.NoError(t, err) + assert.True(t, numaTopologyReady) + assert.NotNil(t, deviceTopology) + + expectedDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + "test-rdma-1": { + NumaNodes: []int{1}, + DeviceAffinity: make(map[machine.AffinityPriority]machine.DeviceIDs), + }, + }, + } + + assert.Equal(t, expectedDeviceTopology, deviceTopology) +} + +func TestRDMADevicePlugin_AllocateAssociatedDevices(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + allocationInfo *state.AllocationInfo + accompanyResourceAllocationInfo *state.AllocationInfo + accompanyResourceName string + deviceReq *pluginapi.DeviceRequest + deviceTopology *machine.DeviceTopology + accompanyDeviceTopology *machine.DeviceTopology + machineState state.AllocationResourcesMap + expectedErr bool + expectedResp *pluginapi.AssociatedDeviceAllocationResponse + }{ + { + name: "Allocation already exists", + allocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + NUMANodes: []int{0, 1}, + }, + TopologyAwareAllocations: 
map[string]state.Allocation{ + "test-rdma-0": { + Quantity: 1, + NUMANodes: []int{0}, + }, + "test-rdma-1": { + Quantity: 1, + NUMANodes: []int{1}, + }, + }, + }, + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + AvailableDevices: []string{"test-rdma-2", "test-rdma-3"}, + ReusableDevices: []string{"test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-1"}, + }, + }, + }, + { + name: "No accompany resource allocates by best effort, allocate reusable devices first", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: []string{"test-rdma-2", "test-rdma-3"}, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-2", "test-rdma-3"}, + }, + }, + }, + { + name: "No accompany resource allocates by best effort, no reusable devices, only allocate available devices with NUMA affinity", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": 
{ + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-2"}, + }, + }, + }, + { + name: "No accompany resource allocates by best effort, no reusable devices, skip devices that are already allocated", + podUID: string(uuid.NewUUID()), + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": { + PodEntries: map[string]state.ContainerEntries{ + "pod-uid2": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-rdma-1": { + PodEntries: map[string]state.ContainerEntries{ + "pod-uid3": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: 
&pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-2", "test-rdma-3"}, + }, + }, + }, + { + name: "Accompany resource has been allocated", + podUID: "test-pod", + containerName: "test-container", + deviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-rdma-0": { + NumaNodes: []int{0}, + }, + "test-rdma-1": { + NumaNodes: []int{1}, + }, + "test-rdma-2": { + NumaNodes: []int{0}, + }, + "test-rdma-3": { + NumaNodes: []int{1}, + }, + }, + }, + // Ratio of 1 rdma device per 2 gpu devices + accompanyDeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-gpu-0": { + NumaNodes: []int{0}, + }, + "test-gpu-1": { + NumaNodes: []int{1}, + }, + "test-gpu-2": { + NumaNodes: []int{0}, + }, + "test-gpu-3": { + NumaNodes: []int{1}, + }, + "test-gpu-4": { + NumaNodes: []int{0}, + }, + "test-gpu-5": { + NumaNodes: []int{1}, + }, + "test-gpu-6": { + NumaNodes: []int{0}, + }, + "test-gpu-7": { + NumaNodes: []int{1}, + }, + }, + }, + accompanyResourceName: "test-gpu", + accompanyResourceAllocationInfo: &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 4, + }, + TopologyAwareAllocations: map[string]state.Allocation{ + "test-gpu-0": { + Quantity: 1, + }, + "test-gpu-2": { + Quantity: 1, + }, + "test-gpu-4": { + Quantity: 1, + }, + "test-gpu-6": { + Quantity: 1, + }, + }, + }, + machineState: state.AllocationResourcesMap{ + gpuconsts.RDMADeviceType: { + "test-rdma-0": {}, + "test-rdma-1": {}, + "test-rdma-2": {}, + "test-rdma-3": {}, + }, + gpuconsts.GPUDeviceType: { + "test-gpu-0": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-1": {}, + "test-gpu-2": { + PodEntries: 
map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-3": {}, + "test-gpu-4": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-5": {}, + "test-gpu-6": { + PodEntries: map[string]state.ContainerEntries{ + "test-pod": { + "test-container": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "test-gpu-7": {}, + }, + }, + deviceReq: &pluginapi.DeviceRequest{ + DeviceName: "test-rdma", + ReusableDevices: nil, + AvailableDevices: []string{"test-rdma-0", "test-rdma-1", "test-rdma-2", "test-rdma-3"}, + DeviceRequest: 2, + Hint: &pluginapi.TopologyHint{ + Nodes: []uint64{0, 1}, + }, + }, + expectedResp: &pluginapi.AssociatedDeviceAllocationResponse{ + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ + AllocatedDevices: []string{"test-rdma-0", "test-rdma-2"}, + }, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + basePlugin := makeTestBasePlugin(t) + devicePlugin := NewRDMADevicePlugin(basePlugin) + + if tt.allocationInfo != nil { + basePlugin.State.SetAllocationInfo(gpuconsts.RDMADeviceType, tt.podUID, tt.containerName, tt.allocationInfo, false) + } + + if tt.accompanyResourceAllocationInfo != nil && tt.accompanyResourceName != "" { + accompanyResourceType, err := basePlugin.GetResourceTypeFromDeviceName(tt.accompanyResourceName) + assert.NoError(t, err) + basePlugin.State.SetAllocationInfo(v1.ResourceName(accompanyResourceType), tt.podUID, tt.containerName, tt.accompanyResourceAllocationInfo, false) + } + + if tt.deviceTopology != nil { + err := basePlugin.DeviceTopologyRegistry.SetDeviceTopology(gpuconsts.RDMADeviceType, tt.deviceTopology) + assert.NoError(t, err) + } + + if tt.accompanyResourceName != "" && 
tt.accompanyDeviceTopology != nil { + accompanyResourceType, err := basePlugin.GetResourceTypeFromDeviceName(tt.accompanyResourceName) + assert.NoError(t, err) + err = basePlugin.DeviceTopologyRegistry.SetDeviceTopology(accompanyResourceType, tt.accompanyDeviceTopology) + assert.NoError(t, err) + } + + if tt.machineState != nil { + basePlugin.State.SetMachineState(tt.machineState, false) + } + + resourceReq := &pluginapi.ResourceRequest{ + PodUid: tt.podUID, + ContainerName: tt.containerName, + } + + resp, err := devicePlugin.AllocateAssociatedDevice(context.Background(), resourceReq, tt.deviceReq, tt.accompanyResourceName) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateAllocatedDevicesResult(t, tt.expectedResp, resp) + + // Verify state is updated + allocationInfo := basePlugin.State.GetAllocationInfo(gpuconsts.RDMADeviceType, tt.podUID, tt.containerName) + assert.NotNil(t, allocationInfo) + } + }) + } +} + +func evaluateAllocatedDevicesResult(t *testing.T, expectedResp, actualResp *pluginapi.AssociatedDeviceAllocationResponse) { + if expectedResp.AllocationResult == nil && actualResp.AllocationResult == nil { + return + } + + if expectedResp.AllocationResult != nil && actualResp.AllocationResult == nil { + t.Errorf("expected allocation result %v, but got nil", expectedResp.AllocationResult) + return + } + + if actualResp.AllocationResult != nil && expectedResp.AllocationResult == nil { + t.Errorf("expected nil allocation result, but got %v", actualResp.AllocationResult) + return + } + + assert.ElementsMatch(t, expectedResp.AllocationResult.AllocatedDevices, actualResp.AllocationResult.AllocatedDevices) +} diff --git a/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go new file mode 100644 index 0000000000..5a52bf7552 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry/registry.go @@ -0,0 +1,41 @@ +/* +Copyright 2022 
The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registry + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma" +) + +type initFunc func(plugin *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin + +var customDevicePluginsMap = make(map[string]initFunc) + +func RegisterCustomDevicePlugin(deviceName string, initFunc initFunc) { + customDevicePluginsMap[deviceName] = initFunc +} + +func GetRegisteredCustomDevicePlugin() map[string]initFunc { + return customDevicePluginsMap +} + +func init() { + RegisterCustomDevicePlugin(gpu.GPUCustomDevicePluginName, gpu.NewGPUDevicePlugin) + RegisterCustomDevicePlugin(rdma.RDMACustomDevicePluginName, rdma.NewRDMADevicePlugin) +} diff --git a/pkg/agent/qrm-plugins/gpu/gpu.go b/pkg/agent/qrm-plugins/gpu/gpu.go new file mode 100644 index 0000000000..62010cd178 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/gpu.go @@ -0,0 +1,27 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpu + +import ( + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/staticpolicy" +) + +func init() { + qrm.RegisterGPUPolicyInitializer(gpuconsts.GPUResourcePluginPolicyNameStatic, staticpolicy.NewStaticPolicy) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go new file mode 100644 index 0000000000..eac2833070 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory/gpu_mem.go @@ -0,0 +1,642 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpumemory + +import ( + "context" + "fmt" + "math" + "sort" + "sync" + + deviceplugin "k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" + "github.com/kubewharf/katalyst-core/pkg/util/metric" + qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos" +) + +type GPUMemPlugin struct { + sync.Mutex + *baseplugin.BasePlugin +} + +func NewGPUMemPlugin(base *baseplugin.BasePlugin) resourceplugin.ResourcePlugin { + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(string(consts.ResourceGPUMemory), + state.NewGenericDefaultResourceStateGenerator(gpuconsts.GPUDeviceType, base.DeviceTopologyRegistry)) + return &GPUMemPlugin{ + BasePlugin: base, + } +} + +func (p *GPUMemPlugin) ResourceName() string { + return string(consts.ResourceGPUMemory) +} + +func (p *GPUMemPlugin) GetTopologyHints(ctx context.Context, req *pluginapi.ResourceRequest) (resp *pluginapi.ResourceHintsResponse, err error) { + // if not numa binding, return nil hints to let kubelet choose numa node randomly + if 
!qosutil.AnnotationsIndicateNUMABinding(req.Annotations) { + return util.PackResourceHintsResponse(req, string(consts.ResourceGPUMemory), + map[string]*pluginapi.ListOfTopologyHints{ + string(consts.ResourceGPUMemory): nil, // indicates that there is no numa preference + }) + } + + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, req, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(req.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + } + + general.InfoS("called", + "podNamespace", req.PodNamespace, + "podName", req.PodName, + "containerName", req.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", req.Annotations, + "gpuMemory", gpuMemory) + + p.Lock() + defer func() { + if err := p.State.StoreState(); err != nil { + general.ErrorS(err, "store state failed", "podName", req.PodName, "containerName", req.ContainerName) + } + p.Unlock() + if err != nil { + metricTags := []metrics.MetricTag{ + {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, + } + _ = p.Emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, metricTags...) + } + }() + + var hints map[string]*pluginapi.ListOfTopologyHints + machineState := p.State.GetMachineState()[consts.ResourceGPUMemory] + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, req.PodUid, req.ContainerName) + + if allocationInfo != nil { + hints = regenerateGPUMemoryHints(allocationInfo, false) + + // regenerateHints failed. need to clear container record and re-calculate. 
+ if hints == nil { + podEntries := p.State.GetPodEntries(consts.ResourceGPUMemory) + delete(podEntries[req.PodUid], req.ContainerName) + if len(podEntries[req.PodUid]) == 0 { + delete(podEntries, req.PodUid) + } + + var err error + machineState, err = p.GenerateResourceStateFromPodEntries(string(consts.ResourceGPUMemory), podEntries) + if err != nil { + general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", + req.PodNamespace, req.PodName, req.ContainerName, err) + return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) + } + } + } + + gpuCount, gpuNames, err := gpuutil.GetGPUCount(req, p.Conf.GPUDeviceNames) + if err != nil { + general.Errorf("getGPUCount failed from req %v with error: %v", req, err) + return nil, fmt.Errorf("getGPUCount failed with error: %v", err) + } + + general.Infof("gpuCount: %f, gpuNames: %v", gpuCount, gpuNames.List()) + + // otherwise, calculate hint for container without allocated memory + if hints == nil { + var calculateErr error + // calculate hint for container without allocated cpus + hints, calculateErr = p.calculateHints(gpuMemory, gpuCount, machineState, req) + if calculateErr != nil { + return nil, fmt.Errorf("calculateHints failed with error: %v", calculateErr) + } + } + + return util.PackResourceHintsResponse(req, p.ResourceName(), hints) +} + +func (p *GPUMemPlugin) calculateHints( + gpuMemory float64, gpuReq float64, machineState state.AllocationMap, req *pluginapi.ResourceRequest, +) (map[string]*pluginapi.ListOfTopologyHints, error) { + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + return nil, err + } + + if !numaTopologyReady { + return nil, fmt.Errorf("numa topology is not ready") + } + + perGPUMemory := gpuMemory / gpuReq + general.Infof("gpuMemory: %f, gpuReq: %f, perGPUMemory: %f", gpuMemory, gpuReq, perGPUMemory) + + numaToAvailableGPUCount := 
make(map[int]float64) + numaToMostAllocatedGPUMemory := make(map[int]float64) + for gpuID, info := range gpuTopology.Devices { + if info.Health != deviceplugin.Healthy { + continue + } + + s := machineState[gpuID] + // todo: get allocated quantity according to qos level + allocated := s.GetQuantityAllocated() + if allocated+perGPUMemory <= float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) { + for _, numaNode := range info.GetNUMANodes() { + numaToAvailableGPUCount[numaNode] += 1 + numaToMostAllocatedGPUMemory[numaNode] = math.Max(allocated, numaToMostAllocatedGPUMemory[numaNode]) + } + } + } + + numaNodes := make([]int, 0, p.MetaServer.NumNUMANodes) + for numaNode := range p.MetaServer.NUMAToCPUs { + numaNodes = append(numaNodes, numaNode) + } + sort.Ints(numaNodes) + + minNUMAsCountNeeded, _, err := gpuutil.GetNUMANodesCountToFitGPUReq(gpuReq, p.MetaServer.CPUTopology, gpuTopology) + if err != nil { + return nil, err + } + + numaCountPerSocket, err := p.MetaServer.NUMAsPerSocket() + if err != nil { + return nil, fmt.Errorf("NUMAsPerSocket failed with error: %v", err) + } + + numaBound := len(numaNodes) + if numaBound > machine.LargeNUMAsPoint { + // [TODO]: to discuss refine minNUMAsCountNeeded+1 + numaBound = minNUMAsCountNeeded + 1 + } + + var availableNumaHints []*pluginapi.TopologyHint + machine.IterateBitMasks(numaNodes, numaBound, func(mask machine.BitMask) { + maskCount := mask.Count() + if maskCount < minNUMAsCountNeeded { + return + } + + maskBits := mask.GetBits() + numaCountNeeded := mask.Count() + + allAvailableGPUsCountInMask := float64(0) + for _, nodeID := range maskBits { + allAvailableGPUsCountInMask += numaToAvailableGPUCount[nodeID] + } + + if allAvailableGPUsCountInMask < gpuReq { + return + } + + crossSockets, err := machine.CheckNUMACrossSockets(maskBits, p.MetaServer.CPUTopology) + if err != nil { + return + } else if numaCountNeeded <= numaCountPerSocket && crossSockets { + return + } + + preferred := maskCount == minNUMAsCountNeeded + 
availableNumaHints = append(availableNumaHints, &pluginapi.TopologyHint{ + Nodes: machine.MaskToUInt64Array(mask), + Preferred: preferred, + }) + }) + + // prefer numa nodes with most allocated gpu memory + p.preferGPUMemoryMostAllocatedHints(availableNumaHints, numaToMostAllocatedGPUMemory) + + // NOTE: because grpc is inability to distinguish between an empty array and nil, + // we return an error instead of an empty array. + // we should resolve this issue if we need to manage multi-resource in one plugin. + if len(availableNumaHints) == 0 { + general.Warningf("got no available gpu memory hints for pod: %s/%s, container: %s", + req.PodNamespace, req.PodName, req.ContainerName) + return nil, gpuutil.ErrNoAvailableGPUMemoryHints + } + + return map[string]*pluginapi.ListOfTopologyHints{ + p.ResourceName(): { + Hints: availableNumaHints, + }, + }, nil +} + +func (p *GPUMemPlugin) preferGPUMemoryMostAllocatedHints( + hints []*pluginapi.TopologyHint, numaToMostAllocatedGPUMemory map[int]float64, +) { + hintGPUMemoryMostAllocated := make(map[int]float64) + for index, hint := range hints { + if !hint.Preferred { + continue + } + + gpuMemoryMostAllocated := float64(0) + for _, nodeID := range hint.Nodes { + gpuMemoryMostAllocated = math.Max(gpuMemoryMostAllocated, numaToMostAllocatedGPUMemory[int(nodeID)]) + } + hintGPUMemoryMostAllocated[index] = gpuMemoryMostAllocated + } + + mostAllocatedHintIndex := -1 + for index, hint := range hints { + if !hint.Preferred { + continue + } + + if mostAllocatedHintIndex == -1 || hintGPUMemoryMostAllocated[index] > hintGPUMemoryMostAllocated[mostAllocatedHintIndex] { + mostAllocatedHintIndex = index + } + } + + if mostAllocatedHintIndex < 0 { + return + } + + for index, hint := range hints { + if !hint.Preferred || mostAllocatedHintIndex == index { + continue + } + hint.Preferred = false + } +} + +func (p *GPUMemPlugin) GetTopologyAwareResources(ctx context.Context, podUID, containerName string) 
(*pluginapi.GetTopologyAwareResourcesResponse, error) { + general.InfofV(4, "called") + + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, podUID, containerName) + if allocationInfo == nil { + return nil, nil + } + + topologyAwareQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(allocationInfo.TopologyAwareAllocations)) + for deviceID, alloc := range allocationInfo.TopologyAwareAllocations { + perNUMAAllocated := alloc.Quantity + if len(alloc.NUMANodes) > 0 { + perNUMAAllocated = alloc.Quantity / float64(len(alloc.NUMANodes)) + for _, nodeID := range alloc.NUMANodes { + if nodeID < 0 { + nodeID = 0 + } + topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ + Node: uint64(nodeID), + ResourceValue: perNUMAAllocated, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } else { + topologyAwareQuantityList = append(topologyAwareQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: perNUMAAllocated, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } + + resp := &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: podUID, + PodName: allocationInfo.PodName, + PodNamespace: allocationInfo.PodNamespace, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: containerName, + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + p.ResourceName(): { + IsNodeResource: true, + IsScalarResource: true, + AggregatedQuantity: allocationInfo.AllocatedAllocation.Quantity, + OriginalAggregatedQuantity: allocationInfo.AllocatedAllocation.Quantity, + TopologyAwareQuantityList: topologyAwareQuantityList, + OriginalTopologyAwareQuantityList: topologyAwareQuantityList, + }, + }, + }, + } + + return resp, nil +} + +func (p 
*GPUMemPlugin) GetTopologyAwareAllocatableResources(ctx context.Context) (*gpuconsts.AllocatableResource, error) { + general.InfofV(4, "called") + + p.Lock() + defer p.Unlock() + + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + return nil, err + } + + if !numaTopologyReady { + return nil, fmt.Errorf("numa topology is not ready") + } + + topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(gpuTopology.Devices)) + topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(gpuTopology.Devices)) + var aggregatedAllocatableQuantity, aggregatedCapacityQuantity float64 + for deviceID, deviceInfo := range gpuTopology.Devices { + gpuMemoryAllocatablePerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + gpuMemoryCapacityPerGPU := float64(p.Conf.GPUMemoryAllocatablePerGPU.Value()) + if deviceInfo.Health != deviceplugin.Healthy || !p.ShareGPUManager.EnableShareGPU(deviceID) { + // if the device is not healthy or not enabled to share, then set allocatable to 0 + gpuMemoryAllocatablePerGPU = 0 + } + aggregatedAllocatableQuantity += gpuMemoryAllocatablePerGPU + aggregatedCapacityQuantity += gpuMemoryCapacityPerGPU + if len(deviceInfo.NumaNodes) > 0 { + gpuMemoryAllocatablePerGPU = gpuMemoryAllocatablePerGPU / float64(len(deviceInfo.NumaNodes)) + gpuMemoryCapacityPerGPU = gpuMemoryCapacityPerGPU / float64(len(deviceInfo.NumaNodes)) + for _, numaID := range deviceInfo.NumaNodes { + if numaID < 0 { + numaID = 0 + } + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPU, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, 
&pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryCapacityPerGPU, + Name: deviceID, + Node: uint64(numaID), + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } else { + // if deviceInfo.NumaNodes is empty, then it means the device is not NUMA aware + topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryAllocatablePerGPU, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ + ResourceValue: gpuMemoryCapacityPerGPU, + Name: deviceID, + Type: string(v1alpha1.TopologyTypeGPU), + Annotations: map[string]string{ + consts.ResourceAnnotationKeyResourceIdentifier: "", + }, + }) + } + } + + return &gpuconsts.AllocatableResource{ + ResourceName: p.ResourceName(), + AllocatableTopologyAwareResource: &pluginapi.AllocatableTopologyAwareResource{ + IsNodeResource: true, + IsScalarResource: true, + AggregatedAllocatableQuantity: aggregatedAllocatableQuantity, + TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, + AggregatedCapacityQuantity: aggregatedCapacityQuantity, + TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, + }, + }, nil +} + +func (p *GPUMemPlugin) Allocate( + ctx context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, +) (*pluginapi.ResourceAllocationResponse, error) { + quantity, exists := resourceReq.ResourceRequests[p.ResourceName()] + if !exists || quantity == 0 { + general.InfoS("No GPU memory annotation detected and no GPU memory requested, returning empty response", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", 
resourceReq.ContainerName) + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil + } + + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resourceReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) + if err != nil { + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", + resourceReq.PodNamespace, resourceReq.PodName, resourceReq.ContainerName, err) + general.Errorf("%s", err.Error()) + return nil, err + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(resourceReq.ResourceRequests, p.ResourceName(), false) + if err != nil { + return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) + } + + general.InfoS("called", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName, + "qosLevel", qosLevel, + "reqAnnotations", resourceReq.Annotations, + "gpuMemory", gpuMemory, + "deviceReq", deviceReq.String()) + + p.Lock() + defer func() { + if err := p.State.StoreState(); err != nil { + general.ErrorS(err, "store state failed", "podName", resourceReq.PodName, "containerName", resourceReq.ContainerName) + } + p.Unlock() + if err != nil { + metricTags := []metrics.MetricTag{ + {Key: "error_message", Val: metric.MetricTagValueFormat(err)}, + } + _ = p.Emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, metricTags...) 
+ } + }() + + // currently, not to deal with init containers + if resourceReq.ContainerType == pluginapi.ContainerType_INIT { + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil + } else if resourceReq.ContainerType == pluginapi.ContainerType_SIDECAR { + // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating + return p.PackAllocationResponse(resourceReq, &state.AllocationInfo{}, nil, p.ResourceName()) + } + + allocationInfo := p.State.GetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName) + if allocationInfo != nil { + resp, packErr := p.PackAllocationResponse(resourceReq, allocationInfo, nil, p.ResourceName()) + if packErr != nil { + general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", + resourceReq.PodNamespace, resourceReq.PodName, resourceReq.ContainerName, packErr) + return nil, fmt.Errorf("packAllocationResponse failed with error: %w", packErr) + } + return resp, nil + } + + if deviceReq == nil { + general.InfoS("Nil device request, returning empty response", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName) + // if deviceReq is nil, return empty response to re-allocate after device allocation + return util.CreateEmptyAllocationResponse(resourceReq, p.ResourceName()), nil + } + + general.Infof("deviceReq: %v", deviceReq.String()) + + // Get GPU topology + gpuTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.GPUDeviceType) + if err != nil { + general.Warningf("failed to get gpu topology: %v", err) + return nil, fmt.Errorf("failed to get gpu topology: %v", err) + } + + if !numaTopologyReady { + general.Warningf("numa topology is not ready") + return nil, fmt.Errorf("numa topology is not ready") + } + + // Use the strategy framework to allocate GPU memory + result, err := manager.AllocateGPUUsingStrategy( + 
resourceReq, + deviceReq, + gpuTopology, + p.Conf.GPUQRMPluginConfig, + p.Emitter, + p.MetaServer, + p.State.GetMachineState(), + qosLevel, + ) + if err != nil { + return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err) + } + + if !result.Success { + return nil, fmt.Errorf("GPU allocation failed: %s", result.ErrorMessage) + } + + // get hint nodes from request + hintNodes, err := machine.NewCPUSetUint64(resourceReq.GetHint().GetNodes()...) + if err != nil { + general.Warningf("failed to get hint nodes: %v", err) + return nil, fmt.Errorf("failed to get hint nodes: %w", err) + } + + newAllocation := &state.AllocationInfo{ + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resourceReq, commonstate.EmptyOwnerPoolName, qosLevel), + AllocatedAllocation: state.Allocation{ + Quantity: gpuMemory, + NUMANodes: hintNodes.ToSliceInt(), + }, + TopologyAwareAllocations: make(map[string]state.Allocation), + } + + gpuMemoryPerGPU := gpuMemory / float64(deviceReq.DeviceRequest) + for _, deviceID := range result.AllocatedDevices { + info, ok := gpuTopology.Devices[deviceID] + if !ok { + return nil, fmt.Errorf("failed to get gpu info for device: %s", deviceID) + } + + newAllocation.TopologyAwareAllocations[deviceID] = state.Allocation{ + Quantity: gpuMemoryPerGPU, + NUMANodes: info.NumaNodes, + } + } + + // Set allocation info in state + p.State.SetAllocationInfo(consts.ResourceGPUMemory, resourceReq.PodUid, resourceReq.ContainerName, newAllocation, false) + + machineState, stateErr := p.GenerateResourceStateFromPodEntries(string(consts.ResourceGPUMemory), nil) + if stateErr != nil { + general.ErrorS(stateErr, "GenerateResourceStateFromPodEntries failed", + "podNamespace", resourceReq.PodNamespace, + "podName", resourceReq.PodName, + "containerName", resourceReq.ContainerName, + "gpuMemory", gpuMemory) + return nil, fmt.Errorf("GenerateResourceStateFromPodEntries failed with error: %v", stateErr) + } + + // update state cache + 
p.State.SetResourceState(consts.ResourceGPUMemory, machineState, true) + + return p.PackAllocationResponse(resourceReq, newAllocation, nil, p.ResourceName()) +} + +// regenerateGPUMemoryHints regenerates hints for container that'd already been allocated gpu memory, +// and regenerateHints will assemble hints based on already-existed AllocationInfo, +// without any calculation logics at all +func regenerateGPUMemoryHints( + allocationInfo *state.AllocationInfo, regenerate bool, +) map[string]*pluginapi.ListOfTopologyHints { + if allocationInfo == nil { + general.Errorf("RegenerateHints got nil allocationInfo") + return nil + } + + hints := map[string]*pluginapi.ListOfTopologyHints{} + if regenerate { + general.ErrorS(nil, "need to regenerate hints", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "podUID", allocationInfo.PodUid, + "containerName", allocationInfo.ContainerName) + + return nil + } + + allocatedNumaNodes := machine.NewCPUSet(allocationInfo.AllocatedAllocation.NUMANodes...) + + general.InfoS("regenerating machineInfo hints, gpu memory was already allocated to pod", + "podNamespace", allocationInfo.PodNamespace, + "podName", allocationInfo.PodName, + "containerName", allocationInfo.ContainerName, + "hint", allocatedNumaNodes) + hints[string(consts.ResourceGPUMemory)] = &pluginapi.ListOfTopologyHints{ + Hints: []*pluginapi.TopologyHint{ + { + Nodes: allocatedNumaNodes.ToSliceUInt64(), + Preferred: true, + }, + }, + } + return hints +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go new file mode 100644 index 0000000000..f368b40939 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/interface.go @@ -0,0 +1,40 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourceplugin + +import ( + "context" + + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" +) + +// ResourcePlugin knows how to handle resource requests for a specific resource type. +type ResourcePlugin interface { + ResourceName() string + + GetTopologyHints(ctx context.Context, request *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) + + GetTopologyAwareResources(ctx context.Context, podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) + + GetTopologyAwareAllocatableResources(ctx context.Context) (*gpuconsts.AllocatableResource, error) + + Allocate( + ctx context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, + ) (*pluginapi.ResourceAllocationResponse, error) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go new file mode 100644 index 0000000000..a7cefbaf7e --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/registry/registry.go @@ -0,0 +1,40 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registry + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/gpumemory" +) + +type InitFunc func(plugin *baseplugin.BasePlugin) resourceplugin.ResourcePlugin + +var resourcePluginsMap = make(map[string]InitFunc) + +func RegisterResourcePlugin(pluginName string, initFunc InitFunc) { + resourcePluginsMap[pluginName] = initFunc +} + +func GetRegisteredResourcePlugin() map[string]InitFunc { + return resourcePluginsMap +} + +func init() { + RegisterResourcePlugin(gpuconsts.GPUMemPluginName, gpumemory.NewGPUMemPlugin) +} diff --git a/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go new file mode 100644 index 0000000000..57b28089c6 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/resourceplugin/resource_plugin_stub.go @@ -0,0 +1,82 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourceplugin + +import ( + "context" + "fmt" + + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" +) + +type ResourcePluginStub struct { + *baseplugin.BasePlugin +} + +func NewResourcePluginStub(base *baseplugin.BasePlugin) ResourcePlugin { + return &ResourcePluginStub{BasePlugin: base} +} + +func (r ResourcePluginStub) ResourceName() string { + return "resource-plugin-stub" +} + +func (r ResourcePluginStub) GetTopologyHints(context.Context, *pluginapi.ResourceRequest) (*pluginapi.ResourceHintsResponse, error) { + return &pluginapi.ResourceHintsResponse{}, nil +} + +func (r ResourcePluginStub) GetTopologyAwareResources(_ context.Context, podUID, containerName string) (*pluginapi.GetTopologyAwareResourcesResponse, error) { + // Simply returns a fixed response if the podUID and containerName is found in the state, otherwise return an error + allocationInfo := r.State.GetAllocationInfo(v1.ResourceName(r.ResourceName()), podUID, containerName) + if allocationInfo == nil { + return nil, fmt.Errorf("allocationInfo is nil") + } + + // Simply returns a fixed response + return &pluginapi.GetTopologyAwareResourcesResponse{ + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + r.ResourceName(): {}, + }, + }, + }, nil +} + +func (r ResourcePluginStub) GetTopologyAwareAllocatableResources(context.Context) (*gpuconsts.AllocatableResource, error) { + // Simply return a fixed response + return &gpuconsts.AllocatableResource{ + ResourceName: r.ResourceName(), + }, nil +} + +func (r ResourcePluginStub) 
Allocate(_ context.Context, resourceReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest) (*pluginapi.ResourceAllocationResponse, error) { + // Simply save resource request and device request in state + if resourceReq.ResourceName == r.ResourceName() { + r.State.SetAllocationInfo(v1.ResourceName(r.ResourceName()), resourceReq.PodUid, resourceReq.ContainerName, &state.AllocationInfo{}, false) + } + + if deviceReq != nil { + r.State.SetAllocationInfo(v1.ResourceName(deviceReq.DeviceName), resourceReq.PodUid, resourceReq.ContainerName, &state.AllocationInfo{}, false) + } + + return &pluginapi.ResourceAllocationResponse{}, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/state/checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go new file mode 100644 index 0000000000..fcd518ff97 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/checkpoint.go @@ -0,0 +1,62 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package state + +import ( + "encoding/json" + + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" +) + +var _ checkpointmanager.Checkpoint = &GPUPluginCheckpoint{} + +type GPUPluginCheckpoint struct { + PolicyName string `json:"policyName"` + MachineState AllocationResourcesMap `json:"machineState"` + PodResourceEntries PodResourceEntries `json:"pod_entries"` + Checksum checksum.Checksum `json:"checksum"` +} + +func NewGPUPluginCheckpoint() *GPUPluginCheckpoint { + return &GPUPluginCheckpoint{ + PodResourceEntries: make(PodResourceEntries), + MachineState: make(AllocationResourcesMap), + } +} + +// MarshalCheckpoint returns marshaled checkpoint +func (cp *GPUPluginCheckpoint) MarshalCheckpoint() ([]byte, error) { + // make sure checksum wasn't set before, so it doesn't affect output checksum + cp.Checksum = 0 + cp.Checksum = checksum.New(cp) + return json.Marshal(*cp) +} + +// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint +func (cp *GPUPluginCheckpoint) UnmarshalCheckpoint(blob []byte) error { + return json.Unmarshal(blob, cp) +} + +// VerifyChecksum verifies that current checksum of checkpoint is valid +func (cp *GPUPluginCheckpoint) VerifyChecksum() error { + ck := cp.Checksum + cp.Checksum = 0 + err := ck.Verify(cp) + cp.Checksum = ck + return err +} diff --git a/pkg/agent/qrm-plugins/gpu/state/interface.go b/pkg/agent/qrm-plugins/gpu/state/interface.go new file mode 100644 index 0000000000..753a3efa6a --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/interface.go @@ -0,0 +1,60 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + v1 "k8s.io/api/core/v1" +) + +// reader is used to get information from local states +type reader interface { + GetMachineState() AllocationResourcesMap + GetPodResourceEntries() PodResourceEntries + GetPodEntries(resourceName v1.ResourceName) PodEntries + GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo +} + +// writer is used to store information into local states, +// and it also provides functionality to maintain the local files +type writer interface { + SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool) + SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, persist bool) + SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) + SetAllocationInfo( + resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool, + ) + + Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool) + ClearState() + StoreState() error +} + +// DefaultResourceStateGenerator interface is used to generate default resource state for each resource +type DefaultResourceStateGenerator interface { + GenerateDefaultResourceState() (AllocationMap, error) +} + +// ReadonlyState interface only provides methods for tracking pod assignments +type ReadonlyState interface { + reader +} + +// State interface provides methods for tracking and setting pod assignments +type State interface { + writer + ReadonlyState +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state.go 
b/pkg/agent/qrm-plugins/gpu/state/state.go new file mode 100644 index 0000000000..7833025b15 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/state.go @@ -0,0 +1,375 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "encoding/json" + "sync" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +type AllocationInfo struct { + commonstate.AllocationMeta `json:",inline"` + + AllocatedAllocation Allocation `json:"allocated_allocation"` + TopologyAwareAllocations map[string]Allocation `json:"topology_aware_allocations"` +} + +type Allocation struct { + // Quantity refers to the amount of device allocated + Quantity float64 `json:"quantity"` + NUMANodes []int `json:"numa_nodes"` +} + +func (a *Allocation) Clone() Allocation { + if a == nil { + return Allocation{} + } + + numaNodes := make([]int, len(a.NUMANodes)) + copy(numaNodes, a.NUMANodes) + return Allocation{ + Quantity: a.Quantity, + NUMANodes: numaNodes, + } +} + +type ( + ContainerEntries map[string]*AllocationInfo // Keyed by container name + PodEntries map[string]ContainerEntries // Keyed by pod UID + PodResourceEntries map[v1.ResourceName]PodEntries // Keyed by resource name +) + +type AllocationState struct { + PodEntries PodEntries `json:"pod_entries"` +} + +type AllocationMap 
map[string]*AllocationState // AllocationMap keyed by device name i.e. GPU-fef8089b-4820-abfc-e83e-94318197576e + +type AllocationResourcesMap map[v1.ResourceName]AllocationMap // AllocationResourcesMap keyed by resource name i.e. v1.ResourceName("nvidia.com/gpu") + +func (i *AllocationInfo) String() string { + if i == nil { + return "" + } + + contentBytes, err := json.Marshal(i) + if err != nil { + general.LoggerWithPrefix("AllocationInfo.String", general.LoggingPKGFull).Errorf("marshal AllocationInfo failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (i *AllocationInfo) Clone() *AllocationInfo { + if i == nil { + return nil + } + + clone := &AllocationInfo{ + AllocationMeta: *i.AllocationMeta.Clone(), + AllocatedAllocation: i.AllocatedAllocation.Clone(), + } + + if i.TopologyAwareAllocations != nil { + clone.TopologyAwareAllocations = make(map[string]Allocation) + for k, v := range i.TopologyAwareAllocations { + clone.TopologyAwareAllocations[k] = v.Clone() + } + } + + return clone +} + +func (e PodEntries) String() string { + if e == nil { + return "" + } + + contentBytes, err := json.Marshal(e) + if err != nil { + general.LoggerWithPrefix("PodEntries.String", general.LoggingPKGFull).Errorf("marshal PodEntries failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (e PodEntries) Clone() PodEntries { + clone := make(PodEntries) + for podUID, containerEntries := range e { + clone[podUID] = make(ContainerEntries) + for containerName, allocationInfo := range containerEntries { + clone[podUID][containerName] = allocationInfo.Clone() + } + } + return clone +} + +func (e PodEntries) GetAllocationInfo(uid string, name string) *AllocationInfo { + if e == nil { + return nil + } + + if containerEntries, ok := e[uid]; ok { + if allocationInfo, ok := containerEntries[name]; ok { + return allocationInfo.Clone() + } + } + return nil +} + +func (e PodEntries) SetAllocationInfo(podUID string, containerName 
string, allocationInfo *AllocationInfo) { + if e == nil { + return + } + + if _, ok := e[podUID]; !ok { + e[podUID] = make(ContainerEntries) + } + + e[podUID][containerName] = allocationInfo.Clone() +} + +func (pre PodResourceEntries) String() string { + if pre == nil { + return "" + } + + contentBytes, err := json.Marshal(pre) + if err != nil { + general.LoggerWithPrefix("PodResourceEntries.String", general.LoggingPKGFull).Errorf("[PodResourceEntries.String] marshal PodResourceEntries failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (pre PodResourceEntries) Clone() PodResourceEntries { + if pre == nil { + return nil + } + + clone := make(PodResourceEntries) + for resourceName, podEntries := range pre { + clone[resourceName] = podEntries.Clone() + } + return clone +} + +func (pre PodResourceEntries) RemovePod(podUID string) { + if pre == nil { + return + } + + for _, podEntries := range pre { + delete(podEntries, podUID) + } +} + +// GetTotalAllocatedResourceOfContainer returns the total allocated resource quantity of a container together with +// the specific resource IDs that are allocated. 
+func (pre PodResourceEntries) GetTotalAllocatedResourceOfContainer( + resourceName v1.ResourceName, podUID, containerName string, +) (int, sets.String) { + if podEntries, ok := pre[resourceName]; ok { + if allocationInfo := podEntries.GetAllocationInfo(podUID, containerName); allocationInfo != nil { + totalAllocationQuantity := int(allocationInfo.AllocatedAllocation.Quantity) + allocationIDs := sets.NewString() + for id := range allocationInfo.TopologyAwareAllocations { + allocationIDs.Insert(id) + } + return totalAllocationQuantity, allocationIDs + } + } + return 0, nil +} + +func (as *AllocationState) String() string { + if as == nil { + return "" + } + + contentBytes, err := json.Marshal(as) + if err != nil { + general.LoggerWithPrefix("AllocationState.String", general.LoggingPKGFull).Errorf("[AllocationState.String]marshal AllocationState failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (as *AllocationState) Clone() *AllocationState { + if as == nil { + return nil + } + + return &AllocationState{ + PodEntries: as.PodEntries.Clone(), + } +} + +func (as *AllocationState) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { + if as == nil { + return + } + + if as.PodEntries == nil { + as.PodEntries = make(PodEntries) + } + + if _, ok := as.PodEntries[podUID]; !ok { + as.PodEntries[podUID] = make(ContainerEntries) + } + + as.PodEntries[podUID][containerName] = allocationInfo.Clone() +} + +func (am AllocationMap) String() string { + if am == nil { + return "" + } + + contentBytes, err := json.Marshal(am) + if err != nil { + general.LoggerWithPrefix("AllocationMap.String", general.LoggingPKGFull).Errorf("[AllocationMap.String]marshal AllocationMap failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (am AllocationMap) Clone() AllocationMap { + if am == nil { + return nil + } + + clone := make(AllocationMap) + for id, ns := range am { + clone[id] = ns.Clone() + } 
+ return clone +} + +func (arm AllocationResourcesMap) String() string { + if arm == nil { + return "" + } + + contentBytes, err := json.Marshal(arm) + if err != nil { + klog.Errorf("[AllocationResourcesMap.String] marshal AllocationResourcesMap failed with error: %v", err) + return "" + } + return string(contentBytes) +} + +func (arm AllocationResourcesMap) Clone() AllocationResourcesMap { + clone := make(AllocationResourcesMap) + for resourceName, am := range arm { + clone[resourceName] = am.Clone() + } + return clone +} + +// GetRatioOfAccompanyResourceToTargetResource returns the ratio of total accompany resource to total target resource. +// For example, if the total number of accompany resource is 4 and the total number of target resource is 2, +// the ratio is 2. +func (arm AllocationResourcesMap) GetRatioOfAccompanyResourceToTargetResource(accompanyResourceName, targetResourceName string) float64 { + // Only the device counts are needed, so read the maps directly instead of deep-cloning them. + accompanyResourceMap := arm[v1.ResourceName(accompanyResourceName)] + accompanyResourceNumber := accompanyResourceMap.getNumberDevices() + + targetResourceMap := arm[v1.ResourceName(targetResourceName)] + targetResourceNumber := targetResourceMap.getNumberDevices() + + if targetResourceNumber == 0 { + return 0 + } + + return float64(accompanyResourceNumber) / float64(targetResourceNumber) +} + +func (as *AllocationState) GetQuantityAllocated() float64 { + if as == nil { + return 0 + } + + quantityAllocated := float64(0) + for _, podEntries := range as.PodEntries { + for _, allocationInfo := range podEntries { + quantityAllocated += allocationInfo.AllocatedAllocation.Quantity + } + } + return quantityAllocated +} + +func (am AllocationMap) GetQuantityAllocated(id string) float64 { + if am == nil { + return 0 + } + + return am[id].GetQuantityAllocated() +} + +func (am AllocationMap) IsRequestSatisfied(id string, request float64, allocatable
float64) bool { + allocated := am.GetQuantityAllocated(id) + return allocatable-allocated >= request +} + +func (am AllocationMap) getNumberDevices() int { + if am == nil { + return 0 + } + + return len(am) +} + +type DefaultResourceStateGeneratorRegistry struct { + mutex sync.RWMutex + generators map[string]DefaultResourceStateGenerator +} + +func NewDefaultResourceStateGeneratorRegistry() *DefaultResourceStateGeneratorRegistry { + return &DefaultResourceStateGeneratorRegistry{ + generators: make(map[string]DefaultResourceStateGenerator), + } +} + +func (r *DefaultResourceStateGeneratorRegistry) RegisterResourceStateGenerator(resourceName string, generator DefaultResourceStateGenerator) { + r.mutex.Lock() + defer r.mutex.Unlock() + + r.generators[resourceName] = generator +} + +func (r *DefaultResourceStateGeneratorRegistry) GetGenerators() map[string]DefaultResourceStateGenerator { + r.mutex.RLock() + defer r.mutex.RUnlock() + + return r.generators +} + +func (r *DefaultResourceStateGeneratorRegistry) GetGenerator(resourceName string) (DefaultResourceStateGenerator, bool) { + r.mutex.RLock() + defer r.mutex.RUnlock() + + generator, ok := r.generators[resourceName] + return generator, ok +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go new file mode 100644 index 0000000000..11b9207285 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/state_checkpoint.go @@ -0,0 +1,285 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "errors" + "fmt" + "path" + "reflect" + "sync" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +const ( + metricMetaCacheStoreStateDuration = "metacache_store_state_duration" +) + +var ( + _ State = &stateCheckpoint{} + generalLog = general.LoggerWithPrefix("gpu_plugin", general.LoggingPKGFull) +) + +// stateCheckpoint is an in-memory implementation of State; +// everytime we want to read or write states, those requests will always +// go to in-memory State, and then go to disk State, i.e. in write-back mode +type stateCheckpoint struct { + sync.RWMutex + cache State + policyName string + checkpointManager checkpointmanager.CheckpointManager + checkpointName string + // when we add new properties to checkpoint, + // it will cause checkpoint corruption and we should skip it + skipStateCorruption bool + emitter metrics.MetricEmitter +} + +func (s *stateCheckpoint) SetMachineState(allocationResourcesMap AllocationResourcesMap, persist bool) { + s.Lock() + defer s.Unlock() + + s.cache.SetMachineState(allocationResourcesMap, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store machineState to checkpoint error") + } + } +} + +func (s *stateCheckpoint) SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, persist bool) { + s.Lock() + defer s.Unlock() + + s.cache.SetResourceState(resourceName, allocationMap, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store resource state to checkpoint error") + } + } +} + +func (s *stateCheckpoint) 
SetPodResourceEntries(podResourceEntries PodResourceEntries, persist bool) { + s.Lock() + defer s.Unlock() + + s.cache.SetPodResourceEntries(podResourceEntries, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store pod entries to checkpoint error", "err") + } + } +} + +func (s *stateCheckpoint) SetAllocationInfo( + resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, persist bool, +) { + s.Lock() + defer s.Unlock() + + s.cache.SetAllocationInfo(resourceName, podUID, containerName, allocationInfo, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store allocationInfo to checkpoint error") + } + } +} + +func (s *stateCheckpoint) Delete(resourceName v1.ResourceName, podUID, containerName string, persist bool) { + s.Lock() + defer s.Unlock() + + s.cache.Delete(resourceName, podUID, containerName, persist) + if persist { + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store state after delete operation to checkpoint error") + } + } +} + +func (s *stateCheckpoint) ClearState() { + s.Lock() + defer s.Unlock() + + s.cache.ClearState() + err := s.storeState() + if err != nil { + generalLog.ErrorS(err, "store state after clear operation to checkpoint error") + } +} + +func (s *stateCheckpoint) StoreState() error { + s.Lock() + defer s.Unlock() + return s.storeState() +} + +func (s *stateCheckpoint) GetMachineState() AllocationResourcesMap { + s.RLock() + defer s.RUnlock() + + return s.cache.GetMachineState() +} + +func (s *stateCheckpoint) GetPodResourceEntries() PodResourceEntries { + s.RLock() + defer s.RUnlock() + + return s.cache.GetPodResourceEntries() +} + +func (s *stateCheckpoint) GetPodEntries(resourceName v1.ResourceName) PodEntries { + s.RLock() + defer s.RUnlock() + + return s.cache.GetPodEntries(resourceName) +} + +func (s *stateCheckpoint) GetAllocationInfo( + resourceName v1.ResourceName, podUID, containerName string, 
+) *AllocationInfo { + s.RLock() + defer s.RUnlock() + + return s.cache.GetAllocationInfo(resourceName, podUID, containerName) +} + +func (s *stateCheckpoint) storeState() error { + startTime := time.Now() + general.InfoS("called") + defer func() { + elapsed := time.Since(startTime) + general.InfoS("finished", "duration", elapsed) + _ = s.emitter.StoreFloat64(metricMetaCacheStoreStateDuration, float64(elapsed/time.Millisecond), metrics.MetricTypeNameRaw) + }() + checkpoint := NewGPUPluginCheckpoint() + checkpoint.PolicyName = s.policyName + checkpoint.MachineState = s.cache.GetMachineState() + checkpoint.PodResourceEntries = s.cache.GetPodResourceEntries() + + err := s.checkpointManager.CreateCheckpoint(s.checkpointName, checkpoint) + if err != nil { + generalLog.ErrorS(err, "could not save checkpoint") + return err + } + return nil +} + +func (s *stateCheckpoint) restoreState(defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry) error { + s.Lock() + defer s.Unlock() + var err error + var foundAndSkippedStateCorruption bool + + checkpoint := NewGPUPluginCheckpoint() + if err = s.checkpointManager.GetCheckpoint(s.checkpointName, checkpoint); err != nil { + if errors.Is(err, cmerrors.ErrCheckpointNotFound) { + return s.storeState() + } else if errors.Is(err, cmerrors.ErrCorruptCheckpoint) { + if !s.skipStateCorruption { + return err + } + + foundAndSkippedStateCorruption = true + generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err) + } else { + return err + } + } + + if s.policyName != checkpoint.PolicyName && !s.skipStateCorruption { + return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", s.policyName, checkpoint.PolicyName) + } + + machineState, err := GenerateMachineStateFromPodEntries(checkpoint.PodResourceEntries, defaultResourceStateGenerators) + if err != nil { + return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) + } + + s.cache.SetMachineState(machineState, 
false) + s.cache.SetPodResourceEntries(checkpoint.PodResourceEntries, false) + + if !reflect.DeepEqual(machineState, checkpoint.MachineState) { + generalLog.Warningf("machine state changed: "+ + "machineState: %s; checkpointMachineState: %s", + machineState.String(), checkpoint.MachineState.String()) + + err = s.storeState() + if err != nil { + return fmt.Errorf("storeState when machine state changed failed with error: %v", err) + } + } + + if foundAndSkippedStateCorruption { + generalLog.Infof("found and skipped state corruption, we shoud store to rectify the checksum") + + err = s.storeState() + if err != nil { + return fmt.Errorf("storeState failed with error: %v", err) + } + } + + generalLog.InfoS("state checkpoint: restored state from checkpoint") + + return nil +} + +func NewCheckpointState( + conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string, + defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry, + skipStateCorruption bool, emitter metrics.MetricEmitter, +) (State, error) { + checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) + if err != nil { + return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) + } + + defaultCache, err := NewGPUPluginState(conf, defaultResourceStateGenerators) + if err != nil { + return nil, fmt.Errorf("NewGPUPluginState failed with error: %v", err) + } + + sc := &stateCheckpoint{ + cache: defaultCache, + policyName: policyName, + checkpointManager: checkpointManager, + checkpointName: checkpointName, + skipStateCorruption: skipStateCorruption, + emitter: emitter, + } + + if err := sc.restoreState(defaultResourceStateGenerators); err != nil { + return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+ + "the gpu plugin checkpoint file %q before restarting Kubelet", + err, path.Join(stateDir, checkpointName)) + } + + return sc, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state_mem.go 
b/pkg/agent/qrm-plugins/gpu/state/state_mem.go new file mode 100644 index 0000000000..89793c5a6b --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/state_mem.go @@ -0,0 +1,173 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "fmt" + "sync" + + v1 "k8s.io/api/core/v1" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" +) + +// gpuPluginState is an in-memory implementation of State; +// everytime we want to read or write states, those requests will always +// go to in-memory State, and then go to disk State, i.e. 
in write-back mode +type gpuPluginState struct { + sync.RWMutex + + qrmConf *qrm.QRMPluginsConfiguration + defaultResourceStateGenerators *DefaultResourceStateGeneratorRegistry + + machineState AllocationResourcesMap + podResourceEntries PodResourceEntries +} + +func NewGPUPluginState( + conf *qrm.QRMPluginsConfiguration, + resourceStateGeneratorRegistry *DefaultResourceStateGeneratorRegistry, +) (State, error) { + generalLog.InfoS("initializing new gpu plugin in-memory state store") + + defaultMachineState, err := GenerateMachineState(resourceStateGeneratorRegistry) + if err != nil { + return nil, fmt.Errorf("GenerateMachineState failed with error: %w", err) + } + + return &gpuPluginState{ + qrmConf: conf, + machineState: defaultMachineState, + defaultResourceStateGenerators: resourceStateGeneratorRegistry, + podResourceEntries: make(PodResourceEntries), + }, nil +} + +func (s *gpuPluginState) SetMachineState(allocationResourcesMap AllocationResourcesMap, _ bool) { + s.Lock() + defer s.Unlock() + s.machineState = allocationResourcesMap.Clone() + generalLog.InfoS("updated gpu plugin machine state", + "GPUMap", allocationResourcesMap.String()) +} + +func (s *gpuPluginState) SetResourceState(resourceName v1.ResourceName, allocationMap AllocationMap, _ bool) { + s.Lock() + defer s.Unlock() + s.machineState[resourceName] = allocationMap.Clone() + generalLog.InfoS("updated gpu plugin resource state", + "resourceName", resourceName, + "allocationMap", allocationMap.String()) +} + +func (s *gpuPluginState) SetPodResourceEntries(podResourceEntries PodResourceEntries, _ bool) { + s.Lock() + defer s.Unlock() + s.podResourceEntries = podResourceEntries.Clone() +} + +func (s *gpuPluginState) SetAllocationInfo( + resourceName v1.ResourceName, podUID, containerName string, allocationInfo *AllocationInfo, _ bool, +) { + s.Lock() + defer s.Unlock() + + if _, ok := s.podResourceEntries[resourceName]; !ok { + s.podResourceEntries[resourceName] = make(PodEntries) + } + + if _, ok := 
s.podResourceEntries[resourceName][podUID]; !ok { + s.podResourceEntries[resourceName][podUID] = make(ContainerEntries) + } + + s.podResourceEntries[resourceName][podUID][containerName] = allocationInfo.Clone() + generalLog.InfoS("updated gpu plugin pod resource entries", + "podUID", podUID, + "containerName", containerName, + "allocationInfo", allocationInfo.String()) +} + +func (s *gpuPluginState) Delete(resourceName v1.ResourceName, podUID, containerName string, _ bool) { + s.Lock() + defer s.Unlock() + + if _, ok := s.podResourceEntries[resourceName]; !ok { + return + } + + if _, ok := s.podResourceEntries[resourceName][podUID]; !ok { + return + } + + delete(s.podResourceEntries[resourceName][podUID], containerName) + if len(s.podResourceEntries[resourceName][podUID]) == 0 { + delete(s.podResourceEntries[resourceName], podUID) + } + + generalLog.InfoS("deleted container entry", "podUID", podUID, "containerName", containerName) +} + +func (s *gpuPluginState) ClearState() { + s.Lock() + defer s.Unlock() + + machineState, err := GenerateMachineState(s.defaultResourceStateGenerators) + if err != nil { + generalLog.ErrorS(err, "failed to generate machine state") + } + s.machineState = machineState + s.podResourceEntries = make(PodResourceEntries) + + generalLog.InfoS("cleared state") +} + +func (s *gpuPluginState) StoreState() error { + // nothing to do + return nil +} + +func (s *gpuPluginState) GetMachineState() AllocationResourcesMap { + s.RLock() + defer s.RUnlock() + + return s.machineState.Clone() +} + +func (s *gpuPluginState) GetPodResourceEntries() PodResourceEntries { + s.RLock() + defer s.RUnlock() + + return s.podResourceEntries.Clone() +} + +func (s *gpuPluginState) GetPodEntries(resourceName v1.ResourceName) PodEntries { + s.RLock() + defer s.RUnlock() + + return s.podResourceEntries[resourceName].Clone() +} + +func (s *gpuPluginState) GetAllocationInfo(resourceName v1.ResourceName, podUID, containerName string) *AllocationInfo { + s.RLock() + defer 
s.RUnlock() + + if res, ok := s.podResourceEntries[resourceName][podUID][containerName]; ok { + return res.Clone() + } + + return nil +} diff --git a/pkg/agent/qrm-plugins/gpu/state/state_test.go b/pkg/agent/qrm-plugins/gpu/state/state_test.go new file mode 100644 index 0000000000..b595b3467b --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/state_test.go @@ -0,0 +1,199 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" +) + +func TestAllocationResourcesMap_GetRatioOfAccompanyResourceToTargetResource(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + accompanyResourceName string + targetResourceName string + arm AllocationResourcesMap + want float64 + }{ + { + name: "normal case", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + "accompany3": {}, + "accompany4": {}, + }, + v1.ResourceName("targetResource"): { + "target1": {}, + "target2": {}, + }, + }, + want: 2.0, + }, + { + name: "got a ratio that is a fraction", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + }, + 
v1.ResourceName("targetResource"): { + "target1": {}, + "target2": {}, + "target3": {}, + "target4": {}, + }, + }, + want: 0.5, + }, + { + name: "no devices for target resource", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{ + v1.ResourceName("accompanyResource"): { + "accompany1": {}, + "accompany2": {}, + }, + }, + want: 0, + }, + { + name: "no devices for accompany resource and target resource", + accompanyResourceName: "accompanyResource", + targetResourceName: "targetResource", + arm: AllocationResourcesMap{}, + want: 0, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := tt.arm.GetRatioOfAccompanyResourceToTargetResource(tt.accompanyResourceName, tt.targetResourceName) + if got != tt.want { + t.Errorf("GetRatioOfAccompanyResourceToTargetResource() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPodResourceEntries_GetTotalAllocatedResourceOfContainer(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + resourceName v1.ResourceName + podUID string + containerName string + pre PodResourceEntries + wantTotalAllocationQuantity int + wantAllocationIDs sets.String + }{ + { + name: "normal case", + resourceName: v1.ResourceName("testResource"), + podUID: "podUID", + containerName: "containerName", + pre: PodResourceEntries{ + v1.ResourceName("testResource"): { + "podUID2": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-1": { + Quantity: 1, + }, + "test-2": { + Quantity: 1, + }, + }, + }, + }, + "podUID": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-3": { + Quantity: 1, + }, + "test-4": { + Quantity: 1, + }, + }, + }, + }, + }, + }, + wantTotalAllocationQuantity: 2, + wantAllocationIDs: sets.NewString("test-3", "test-4"), + }, + { + 
name: "no allocation", + resourceName: v1.ResourceName("testResource"), + podUID: "podUID", + containerName: "containerName", + pre: PodResourceEntries{ + v1.ResourceName("testResource"): { + "podUID2": { + "containerName": { + AllocatedAllocation: Allocation{ + Quantity: 2, + }, + TopologyAwareAllocations: map[string]Allocation{ + "test-1": { + Quantity: 1, + }, + "test-2": { + Quantity: 1, + }, + }, + }, + }, + }, + }, + wantTotalAllocationQuantity: 0, + wantAllocationIDs: sets.NewString(), + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + gotTotalAllocationQuantity, gotAllocationIDs := tt.pre.GetTotalAllocatedResourceOfContainer(tt.resourceName, tt.podUID, tt.containerName) + assert.Equal(t, tt.wantTotalAllocationQuantity, gotTotalAllocationQuantity) + + assert.ElementsMatch(t, tt.wantAllocationIDs.UnsortedList(), gotAllocationIDs.UnsortedList()) + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/state/util.go b/pkg/agent/qrm-plugins/gpu/state/util.go new file mode 100644 index 0000000000..8863220cdb --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/state/util.go @@ -0,0 +1,140 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package state + +import ( + "fmt" + + v1 "k8s.io/api/core/v1" + + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +// GenerateMachineState returns an empty AllocationResourcesMap for all resource names. 
+func GenerateMachineState( + defaultMachineStateGenerators *DefaultResourceStateGeneratorRegistry, +) (AllocationResourcesMap, error) { + if defaultMachineStateGenerators == nil { + return nil, fmt.Errorf("cannot generate machine state from nil defaultMachineStateGenerators") + } + + allocationResourcesMap := make(AllocationResourcesMap) + for resourceName, generator := range defaultMachineStateGenerators.GetGenerators() { + allocationMap, err := generator.GenerateDefaultResourceState() + if err != nil { + return nil, fmt.Errorf("GenerateDefaultResourceState for resource %s failed with error: %v", resourceName, err) + } + allocationResourcesMap[v1.ResourceName(resourceName)] = allocationMap + } + + return allocationResourcesMap, nil +} + +// GenerateMachineStateFromPodEntries returns AllocationResourcesMap for allocated resources based on +// resource name along with existed pod resource entries. +func GenerateMachineStateFromPodEntries( + podResourceEntries PodResourceEntries, + defaultMachineStateGenerators *DefaultResourceStateGeneratorRegistry, +) (AllocationResourcesMap, error) { + if defaultMachineStateGenerators == nil { + return nil, fmt.Errorf("cannot generate machine state from nil resourceStateGeneratorRegistry") + } + + machineState := make(AllocationResourcesMap) + for resourceName, podEntries := range podResourceEntries { + generator, ok := defaultMachineStateGenerators.GetGenerator(string(resourceName)) + if !ok { + return nil, fmt.Errorf("GetGenerator for resource %s failed", resourceName) + } + + allocationMap, err := GenerateResourceStateFromPodEntries(podEntries, generator) + if err != nil { + return nil, fmt.Errorf("GenerateResourceStateFromPodEntries for resource %s failed with error: %v", resourceName, err) + } + machineState[resourceName] = allocationMap + } + + return machineState, nil +} + +// GenerateResourceStateFromPodEntries returns an AllocationMap of a certain resource based on pod entries +func GenerateResourceStateFromPodEntries( + 
podEntries PodEntries, + generator DefaultResourceStateGenerator, +) (AllocationMap, error) { + machineState, err := generator.GenerateDefaultResourceState() + if err != nil { + return nil, fmt.Errorf("GenerateDefaultResourceState failed with error: %v", err) + } + + for deviceID, allocationState := range machineState { + for podUID, containerEntries := range podEntries { + for containerName, allocationInfo := range containerEntries { + if containerName != "" && allocationInfo != nil { + allocation, ok := allocationInfo.TopologyAwareAllocations[deviceID] + if !ok { + continue + } + alloc := allocationInfo.Clone() + alloc.AllocatedAllocation = allocation.Clone() + alloc.TopologyAwareAllocations = map[string]Allocation{deviceID: allocation} + allocationState.SetAllocationInfo(podUID, containerName, alloc) + } + } + } + machineState[deviceID] = allocationState + } + + return machineState, nil +} + +type genericDefaultResourceStateGenerator struct { + resourceName string + topologyRegistry *machine.DeviceTopologyRegistry +} + +func NewGenericDefaultResourceStateGenerator( + resourceName string, + topologyRegistry *machine.DeviceTopologyRegistry, +) DefaultResourceStateGenerator { + return &genericDefaultResourceStateGenerator{resourceName: resourceName, topologyRegistry: topologyRegistry} +} + +// GenerateDefaultResourceState return a default resource state by topology +func (g *genericDefaultResourceStateGenerator) GenerateDefaultResourceState() (AllocationMap, error) { + if g == nil { + return nil, fmt.Errorf("nil DefaultResourceStateGenerator") + } + + if g.topologyRegistry == nil { + return nil, fmt.Errorf("topology provider registry must not be nil") + } + + topology, _, err := g.topologyRegistry.GetDeviceTopology(g.resourceName) + if err != nil { + return nil, fmt.Errorf("topology provider registry failed with error: %v", err) + } + + resourceState := make(AllocationMap) + for deviceID := range topology.Devices { + resourceState[deviceID] = &AllocationState{ + 
PodEntries: make(PodEntries), + } + } + + return resourceState, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go new file mode 100644 index 0000000000..1a91438820 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy.go @@ -0,0 +1,757 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package staticpolicy + +import ( + "context" + "fmt" + "sync" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-api/pkg/plugins/skeleton" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + appqrm "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + devicepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin/registry" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + resourcepluginregistry "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin/registry" + 
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" + "github.com/kubewharf/katalyst-core/pkg/config" + dynamicconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// StaticPolicy is the static gpu policy +type StaticPolicy struct { + sync.RWMutex + pluginapi.UnimplementedResourcePluginServer + *baseplugin.BasePlugin + + name string + stopCh chan struct{} + started bool + + emitter metrics.MetricEmitter + + residualHitMap map[string]int64 + + associatedDeviceNames sets.String + resourcePlugins map[string]resourceplugin.ResourcePlugin + customDevicePlugins map[string]customdeviceplugin.CustomDevicePlugin +} + +// NewStaticPolicy returns a static gpu policy +func NewStaticPolicy( + agentCtx *agent.GenericContext, conf *config.Configuration, + _ interface{}, agentName string, +) (bool, agent.Component, error) { + wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{ + Key: util.QRMPluginPolicyTagName, + Val: gpuconsts.GPUResourcePluginPolicyNameStatic, + }) + + basePlugin, err := baseplugin.NewBasePlugin(agentCtx, conf, wrappedEmitter) + if err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to create base plugin: %w", err) + } + + policyImplement := &StaticPolicy{ + emitter: wrappedEmitter, + stopCh: make(chan struct{}), + name: fmt.Sprintf("%s_%s", agentName, gpuconsts.GPUResourcePluginPolicyNameStatic), + residualHitMap: make(map[string]int64), + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + associatedDeviceNames: sets.NewString(), + customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), 
+ } + + if err = policyImplement.registerDefaultResourcePlugins(); err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to register resource plugins: %w", err) + } + if err = policyImplement.registerDefaultCustomDevicePlugins(); err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to register custom device plugins: %w", err) + } + + // init state must be done after resource plugins and custom device plugins are registered + err = policyImplement.InitState() + if err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("failed to init state: %w", err) + } + + pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, + func(key string, value int64) { + _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw) + }) + if err != nil { + return false, agent.ComponentStub{}, fmt.Errorf("static policy new plugin wrapper failed with error: %v", err) + } + + return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil +} + +var _ skeleton.QRMPlugin = (*StaticPolicy)(nil) + +// Start starts this plugin +func (p *StaticPolicy) Start() (err error) { + general.Infof("called") + + p.Lock() + defer func() { + if !p.started { + if err == nil { + p.started = true + } else { + close(p.stopCh) + } + } + p.Unlock() + }() + + if p.started { + general.Infof("already started") + return nil + } + + p.stopCh = make(chan struct{}) + + go wait.Until(func() { + _ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw) + }, time.Second*30, p.stopCh) + + err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(gpuconsts.ClearResidualState, general.HealthzCheckStateNotReady, + appqrm.QRMGPUPluginPeriodicalHandlerGroupName, p.clearResidualState, gpuconsts.StateCheckPeriod, gpuconsts.StateCheckTolerationTimes) + if err != nil { + general.Errorf("start %v failed, err: %v", gpuconsts.ClearResidualState, err) + } + + err = p.BasePlugin.Run(p.stopCh) + if err != nil { + 
return fmt.Errorf("gpu plugin run failed with error: %w", err) + } + + go wait.Until(func() { + periodicalhandler.ReadyToStartHandlersByGroup(appqrm.QRMGPUPluginPeriodicalHandlerGroupName) + }, 5*time.Second, p.stopCh) + + return nil +} + +// Stop stops this plugin +func (p *StaticPolicy) Stop() error { + p.Lock() + defer func() { + p.started = false + p.Unlock() + general.Infof("stopped") + }() + + if !p.started { + general.Warningf("already stopped") + return nil + } + + close(p.stopCh) + + return nil +} + +// Name returns the name of this plugin +func (p *StaticPolicy) Name() string { + return p.name +} + +// ResourceName returns resource names managed by this plugin +func (p *StaticPolicy) ResourceName() string { + return string(consts.ResourceGPUMemory) +} + +// GetTopologyHints returns hints of corresponding resources +func (p *StaticPolicy) GetTopologyHints( + ctx context.Context, + req *pluginapi.ResourceRequest, +) (resp *pluginapi.ResourceHintsResponse, err error) { + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("GetTopologyHints got nil req") + } + + p.RLock() + defer p.RUnlock() + + resourcePlugin := p.getResourcePlugin(req.ResourceName) + if resourcePlugin == nil { + return nil, fmt.Errorf("failed to find resource plugin by name %s", req.ResourceName) + } + return resourcePlugin.GetTopologyHints(ctx, req) +} + +// GetPodTopologyHints returns hints of corresponding resources +func (p *StaticPolicy) GetPodTopologyHints( + _ context.Context, + req *pluginapi.PodResourceRequest, +) (resp *pluginapi.PodResourceHintsResponse, err error) { + return nil, util.ErrNotImplemented +} + +func (p *StaticPolicy) RemovePod( + ctx context.Context, + req *pluginapi.RemovePodRequest, +) (*pluginapi.RemovePodResponse, error) { + if req == nil { + return nil, fmt.Errorf("RemovePod got nil req") + } + + p.Lock() + defer p.Unlock() + + // For every resource plugin and custom resource plugin, remove pod from their state + if err := 
p.removePod(req.PodUid); err != nil { + general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid) + return nil, err + } + + return &pluginapi.RemovePodResponse{}, nil +} + +// GetResourcesAllocation returns allocation results of corresponding resources +func (p *StaticPolicy) GetResourcesAllocation( + _ context.Context, + _ *pluginapi.GetResourcesAllocationRequest, +) (*pluginapi.GetResourcesAllocationResponse, error) { + general.InfofV(4, "called") + return &pluginapi.GetResourcesAllocationResponse{}, nil +} + +// GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format +func (p *StaticPolicy) GetTopologyAwareResources( + ctx context.Context, + req *pluginapi.GetTopologyAwareResourcesRequest, +) (*pluginapi.GetTopologyAwareResourcesResponse, error) { + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("GetTopologyAwareResources got nil req") + } + + p.RLock() + defer p.RUnlock() + + // Get topology aware resources for all resource plugins + allocatedResourcesList := make([]*pluginapi.GetTopologyAwareResourcesResponse, 0) + for _, resourcePlugin := range p.resourcePlugins { + allocatedResource, err := resourcePlugin.GetTopologyAwareResources(ctx, req.PodUid, req.ContainerName) + if err != nil { + general.Errorf("failed to get topology aware resources for plugin %s: %v", resourcePlugin.ResourceName(), err) + continue + } + + if allocatedResource == nil { + continue + } + allocatedResourcesList = append(allocatedResourcesList, allocatedResource) + } + + // Merge the respective response into one response + resp, err := p.mergeTopologyAwareResourcesResponse(req.PodUid, req.ContainerName, allocatedResourcesList) + if err != nil { + return nil, fmt.Errorf("failed to merge topology aware resources: %w", err) + } + + return resp, nil +} + +// mergeTopologyAwareResourcesResponse takes the separate topology aware resources response from the different sub-plugins and +// merge them into one 
response. +func (p *StaticPolicy) mergeTopologyAwareResourcesResponse( + podUID, containerName string, respList []*pluginapi.GetTopologyAwareResourcesResponse, +) (*pluginapi.GetTopologyAwareResourcesResponse, error) { + result := &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: podUID, + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: containerName, + }, + } + + allocatedResources := make(map[string]*pluginapi.TopologyAwareResource) + for _, resp := range respList { + if resp == nil { + continue + } + + if result.PodName != "" && result.PodName != resp.PodName { + general.Errorf("pod name %s not match, expect %s", resp.PodName, result.PodName) + return nil, fmt.Errorf("pod name %s not match, expect %s", resp.PodName, result.PodName) + } + + if result.PodNamespace != "" && result.PodNamespace != resp.PodNamespace { + general.Errorf("pod namespace %s not match, expect %s", resp.PodNamespace, result.PodNamespace) + return nil, fmt.Errorf("pod namespace %s not match, expect %s", resp.PodNamespace, result.PodNamespace) + } + + if result.PodName == "" { + result.PodName = resp.PodName + } + if result.PodNamespace == "" { + result.PodNamespace = resp.PodNamespace + } + + if resp.ContainerTopologyAwareResources == nil { + general.Errorf("container topology aware resources is nil for pod %s/%s, namespace %s", podUID, containerName, result.PodNamespace) + return nil, fmt.Errorf("container topology aware resources is nil for pod %s/%s, namespace %s", podUID, containerName, result.PodNamespace) + } + + for resourceName, resource := range resp.ContainerTopologyAwareResources.AllocatedResources { + allocatedResources[resourceName] = resource + } + } + + result.ContainerTopologyAwareResources.AllocatedResources = allocatedResources + return result, nil +} + +// GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format +func (p *StaticPolicy) GetTopologyAwareAllocatableResources( 
 + ctx context.Context, + req *pluginapi.GetTopologyAwareAllocatableResourcesRequest, +) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { + general.InfofV(4, "called") + if req == nil { + return nil, fmt.Errorf("GetTopologyAwareAllocatableResources got nil req") + } + + p.RLock() + defer p.RUnlock() + + // Get topology aware allocatable resources for all resource plugins + allocatableResources := make(map[string]*pluginapi.AllocatableTopologyAwareResource) + for _, resourcePlugin := range p.resourcePlugins { + allocatableResource, err := resourcePlugin.GetTopologyAwareAllocatableResources(ctx) + if err != nil { + general.Errorf("failed to get topology aware allocatable resources for plugin %s: %v", resourcePlugin.ResourceName(), err) + continue + } + + if allocatableResource == nil { + continue + } + + allocatableResources[allocatableResource.ResourceName] = allocatableResource.AllocatableTopologyAwareResource + } + + return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{ + AllocatableResources: allocatableResources, + }, nil +} + +// GetResourcePluginOptions returns options to be communicated with Resource Manager +func (p *StaticPolicy) GetResourcePluginOptions( + context.Context, + *pluginapi.Empty, +) (*pluginapi.ResourcePluginOptions, error) { + return &pluginapi.ResourcePluginOptions{ + PreStartRequired: false, + WithTopologyAlignment: true, + NeedReconcile: false, + AssociatedDevices: p.associatedDeviceNames.List(), + }, nil +} + +// Allocate is called during pod admit so that the resource +// plugin can allocate corresponding resource for the container +// according to resource request +func (p *StaticPolicy) Allocate( + ctx context.Context, + req *pluginapi.ResourceRequest, +) (resp *pluginapi.ResourceAllocationResponse, err error) { + if req == nil { + return nil, fmt.Errorf("Allocate got nil req") + } + + p.Lock() + defer func() { + if err != nil { + _ = p.removeContainer(req.PodUid, req.ContainerName, 
v1.ResourceName(req.ResourceName)) + } + p.Unlock() + }() + + resourcePlugin := p.getResourcePlugin(req.ResourceName) + if resourcePlugin == nil { + return nil, fmt.Errorf("failed to get resource plugin by name %s", req.ResourceName) + } + + resp, err = resourcePlugin.Allocate(ctx, req, nil) + return resp, err +} + +// AllocateForPod is called during pod admit so that the resource +// plugin can allocate corresponding resource for the pod +// according to resource request +func (p *StaticPolicy) AllocateForPod( + _ context.Context, + req *pluginapi.PodResourceRequest, +) (resp *pluginapi.PodResourceAllocationResponse, err error) { + return nil, util.ErrNotImplemented +} + +// PreStartContainer is called, if indicated by resource plugin during registration phase, +// before each container start. Resource plugin can run resource specific operations +// such as resetting the resource before making resources available to the container +func (p *StaticPolicy) PreStartContainer( + context.Context, + *pluginapi.PreStartContainerRequest, +) (*pluginapi.PreStartContainerResponse, error) { + return &pluginapi.PreStartContainerResponse{}, nil +} + +func (p *StaticPolicy) removePod(podUID string) error { + return p.removeWithUpdate(podUID, func(podResourceEntries state.PodResourceEntries) bool { + found := false + for _, podEntries := range podResourceEntries { + if podEntries[podUID] != nil { + found = true + } + delete(podEntries, podUID) + } + return found + }) +} + +// removeContainer removes container entry given the specific podUID, container name and resource name. 
+func (p *StaticPolicy) removeContainer(podUID, containerName string, resourceName v1.ResourceName) error { + return p.removeWithUpdate(podUID, func(podResourceEntries state.PodResourceEntries) bool { + found := false + for resource, podEntries := range podResourceEntries { + if resourceName != resource { + continue + } + + if _, ok := podEntries[podUID]; !ok { + continue + } + + if _, ok := podEntries[podUID][containerName]; ok { + found = true + } + delete(podEntries[podUID], containerName) + } + return found + }) +} + +func (p *StaticPolicy) removeWithUpdate( + podUID string, removeFn func(podResourceEntries state.PodResourceEntries) bool, +) error { + podResourceEntries := p.State.GetPodResourceEntries() + + found := removeFn(podResourceEntries) + if !found { + return nil + } + + machineState, err := p.GenerateMachineStateFromPodEntries(podResourceEntries) + if err != nil { + general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) + return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) + } + + p.State.SetPodResourceEntries(podResourceEntries, false) + p.State.SetMachineState(machineState, false) + + if err := p.State.StoreState(); err != nil { + general.Errorf("store state failed with error: %v", err) + return err + } + + return nil +} + +// clearResidualState is used to clean residual pods in local state +func (p *StaticPolicy) clearResidualState( + _ *config.Configuration, + _ interface{}, + _ *dynamicconfig.DynamicAgentConfiguration, + _ metrics.MetricEmitter, + _ *metaserver.MetaServer, +) { + general.Infof("exec") + var ( + err error + podList []*v1.Pod + ) + residualSet := make(map[string]bool) + + defer func() { + _ = general.UpdateHealthzStateByError(gpuconsts.ClearResidualState, err) + }() + + if p.MetaServer == nil { + general.Errorf("nil metaServer") + return + } + + ctx := context.Background() + podList, err = p.MetaServer.GetPodList(ctx, nil) + if err != nil { + 
general.Errorf("get pod list failed: %v", err) + return + } + + podSet := sets.NewString() + for _, pod := range podList { + podSet.Insert(fmt.Sprintf("%v", pod.UID)) + } + + p.Lock() + defer p.Unlock() + + podResourceEntries := p.State.GetPodResourceEntries() + for _, podEntries := range podResourceEntries { + for podUID := range podEntries { + if !podSet.Has(podUID) { + residualSet[podUID] = true + p.residualHitMap[podUID] += 1 + general.Infof("found pod: %s with state but doesn't show up in pod watcher, hit count: %d", podUID, p.residualHitMap[podUID]) + } + } + } + + podsToDelete := sets.NewString() + for podUID, hitCount := range p.residualHitMap { + if !residualSet[podUID] { + general.Infof("already found pod: %s in pod watcher or its state is cleared, delete it from residualHitMap", podUID) + delete(p.residualHitMap, podUID) + continue + } + + if time.Duration(hitCount)*gpuconsts.StateCheckPeriod >= gpuconsts.MaxResidualTime { + podsToDelete.Insert(podUID) + } + } + + if podsToDelete.Len() > 0 { + for { + podUID, found := podsToDelete.PopAny() + if !found { + break + } + + general.Infof("clear residual pod: %s in state", podUID) + podResourceEntries.RemovePod(podUID) + } + + machineState, err := p.GenerateMachineStateFromPodEntries(podResourceEntries) + if err != nil { + general.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) + return + } + + p.State.SetPodResourceEntries(podResourceEntries, false) + p.State.SetMachineState(machineState, false) + + err = p.State.StoreState() + if err != nil { + general.Errorf("store state failed: %v", err) + return + } + } +} + +func (p *StaticPolicy) UpdateAllocatableAssociatedDevices( + ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest, +) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) { + if request == nil || len(request.Devices) == 0 { + return nil, fmt.Errorf("request is nil") + } + + customDevicePlugin := p.getCustomDevicePlugin(request.DeviceName) 
+ if customDevicePlugin == nil { + return nil, fmt.Errorf("no custom device plugin found for device %s", request.DeviceName) + } + + return customDevicePlugin.UpdateAllocatableAssociatedDevices(ctx, request) +} + +func (*StaticPolicy) GetAssociatedDeviceTopologyHints( + _ context.Context, _ *pluginapi.AssociatedDeviceRequest, +) (*pluginapi.AssociatedDeviceHintsResponse, error) { + return &pluginapi.AssociatedDeviceHintsResponse{}, nil +} + +// AllocateAssociatedDevice allocates a device in this sequence: +// 1. Find the resource plugin that corresponds to the accompanyResourceName and allocate +// 2. Find the custom device plugin that corresponds to the deviceName and allocate +func (p *StaticPolicy) AllocateAssociatedDevice( + ctx context.Context, req *pluginapi.AssociatedDeviceRequest, +) (resp *pluginapi.AssociatedDeviceAllocationResponse, respErr error) { + var isAccompanyResourcePlugin bool + var isAccompanyCustomDevicePlugin bool + if req == nil || req.ResourceRequest == nil || req.DeviceRequest == nil { + return nil, fmt.Errorf("req is nil") + } + + p.Lock() + defer func() { + // Reset state for accompany resource and target resource if there is an error + if respErr != nil { + if isAccompanyResourcePlugin { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(req.AccompanyResourceName)) + } + if isAccompanyCustomDevicePlugin { + accompanyDeviceType, _ := p.GetResourceTypeFromDeviceName(req.AccompanyResourceName) + if accompanyDeviceType != "" { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(accompanyDeviceType)) + } + } + deviceType, _ := p.GetResourceTypeFromDeviceName(req.DeviceName) + if deviceType != "" { + _ = p.removeContainer(req.ResourceRequest.PodUid, req.ResourceRequest.ContainerName, v1.ResourceName(deviceType)) + } + } + p.Unlock() + }() + + // Find the target device that we want to allocate for + var targetDeviceReq 
*pluginapi.DeviceRequest + for _, deviceRequest := range req.DeviceRequest { + if deviceRequest.DeviceName == req.DeviceName { + targetDeviceReq = deviceRequest + } + } + + if targetDeviceReq == nil { + return nil, fmt.Errorf("no target device plugin found for target device %s", req.DeviceName) + } + + // Allocate accompany resource + // Check if accompany resource maps to a resource plugin; if it does, allocate it first + accompanyResourcePlugin := p.getResourcePlugin(req.AccompanyResourceName) + if accompanyResourcePlugin != nil { + _, err := accompanyResourcePlugin.Allocate(ctx, req.ResourceRequest, targetDeviceReq) + if err != nil { + return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", req.AccompanyResourceName, err) + } + isAccompanyResourcePlugin = true + } else { + // Accompany resource maps to a custom device plugin; allocate for it + accompanyCustomDevicePlugin := p.getCustomDevicePlugin(req.AccompanyResourceName) + if accompanyCustomDevicePlugin != nil { + // Get device request for accompany device + var accompanyDeviceReq *pluginapi.DeviceRequest + for _, deviceRequest := range req.DeviceRequest { + if deviceRequest.DeviceName == req.AccompanyResourceName { + accompanyDeviceReq = deviceRequest + } + } + + if accompanyDeviceReq == nil { + return nil, fmt.Errorf("nil accompany device request") + } + + _, err := p.allocateAssociatedDevice(ctx, accompanyCustomDevicePlugin, req.ResourceRequest, accompanyDeviceReq, "") + if err != nil { + return nil, fmt.Errorf("AllocateAssociatedDevice accompany resource %s failed with error: %v", req.AccompanyResourceName, err) + } + isAccompanyCustomDevicePlugin = true + } + } + + // Allocate target custom device + targetCustomDevicePlugin := p.getCustomDevicePlugin(req.DeviceName) + if targetCustomDevicePlugin == nil { + return nil, fmt.Errorf("no custom device plugin found for target device %s", req.DeviceName) + } + + return p.allocateAssociatedDevice(ctx, targetCustomDevicePlugin, 
req.ResourceRequest, targetDeviceReq, req.AccompanyResourceName) +} + +func (p *StaticPolicy) allocateAssociatedDevice( + ctx context.Context, devicePlugin customdeviceplugin.CustomDevicePlugin, + resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { + defaultAccompanyResourceName := devicePlugin.DefaultAccompanyResourceName() + if defaultAccompanyResourceName != "" && accompanyResourceName != defaultAccompanyResourceName { + accompanyResourcePlugin := p.getResourcePlugin(defaultAccompanyResourceName) + if accompanyResourcePlugin != nil { + _, err := accompanyResourcePlugin.Allocate(ctx, resReq, deviceReq) + if err != nil { + _ = p.removeContainer(resReq.PodUid, resReq.ContainerName, v1.ResourceName(defaultAccompanyResourceName)) + return nil, fmt.Errorf("allocate accompany resource %s failed with error: %v", defaultAccompanyResourceName, err) + } + } + } + + return devicePlugin.AllocateAssociatedDevice(ctx, resReq, deviceReq, accompanyResourceName) +} + +func (p *StaticPolicy) registerDefaultResourcePlugins() error { + allInitFuncs := resourcepluginregistry.GetRegisteredResourcePlugin() + for _, initFunc := range allInitFuncs { + resourcePlugin := initFunc(p.BasePlugin) + p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin + general.Infof("Registered resource plugin: %s", resourcePlugin.ResourceName()) + } + return nil +} + +func (p *StaticPolicy) registerDefaultCustomDevicePlugins() error { + allInitFuncs := devicepluginregistry.GetRegisteredCustomDevicePlugin() + for name := range allInitFuncs { + initFunc := allInitFuncs[name] + customDevicePlugin := initFunc(p.BasePlugin) + deviceNames := customDevicePlugin.DeviceNames() + for _, deviceName := range deviceNames { + p.customDevicePlugins[deviceName] = customDevicePlugin + p.associatedDeviceNames.Insert(deviceName) + } + } + return nil +} + +func (p *StaticPolicy) 
getResourcePlugin(resourceName string) resourceplugin.ResourcePlugin { + resourcePlugin := p.resourcePlugins[resourceName] + return resourcePlugin +} + +func (p *StaticPolicy) getCustomDevicePlugin(deviceName string) customdeviceplugin.CustomDevicePlugin { + customDevicePlugin := p.customDevicePlugins[deviceName] + return customDevicePlugin +} + +func (p *StaticPolicy) RegisterResourcePlugin(resourcePlugin resourceplugin.ResourcePlugin) { + p.resourcePlugins[resourcePlugin.ResourceName()] = resourcePlugin +} + +func (p *StaticPolicy) RegisterCustomDevicePlugin(plugin customdeviceplugin.CustomDevicePlugin) { + deviceNames := plugin.DeviceNames() + for _, deviceName := range deviceNames { + p.customDevicePlugins[deviceName] = plugin + p.associatedDeviceNames.Insert(deviceName) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go new file mode 100644 index 0000000000..2f8a90930d --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/staticpolicy/policy_test.go @@ -0,0 +1,683 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package staticpolicy + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/uuid" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + katalyst_base "github.com/kubewharf/katalyst-core/cmd/base" + "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/resourceplugin" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +const ( + testResourcePluginName = "resource-plugin-stub" + testCustomDevicePluginName = "custom-device-plugin-stub" + testCustomDevicePluginName2 = "custom-device-plugin-stub-2" +) + +func generateTestConfiguration(t *testing.T) *config.Configuration { + conf := config.NewConfiguration() + tmpDir := t.TempDir() + conf.QRMPluginSocketDirs = []string{tmpDir} + conf.CheckpointManagerDir = tmpDir + + return conf +} + +func generateTestGenericContext(t *testing.T, conf *config.Configuration) *agent.GenericContext { + genericCtx, err := katalyst_base.GenerateFakeGenericContext([]runtime.Object{}) + if err != nil { + t.Fatalf("unable to generate test generic context: %v", err) + } + + metaServer, err := metaserver.NewMetaServer(genericCtx.Client, metrics.DummyMetrics{}, conf) + if err != nil { + t.Fatalf("unable to generate test meta server: %v", err) + } + + agentCtx := &agent.GenericContext{ + GenericContext: genericCtx, + MetaServer: metaServer, + PluginManager: nil, + } + + agentCtx.MetaServer = metaServer + return agentCtx +} + +func 
TestNewStaticPolicy(t *testing.T) { + t.Parallel() + + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + + _, policy, err := NewStaticPolicy(agentCtx, conf, nil, "test") + assert.NoError(t, err) + assert.NotNil(t, policy) +} + +func makeTestStaticPolicy(t *testing.T) *StaticPolicy { + conf := generateTestConfiguration(t) + agentCtx := generateTestGenericContext(t, conf) + + tmpDir := t.TempDir() + conf.GenericQRMPluginConfiguration.StateFileDirectory = tmpDir + + stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, tmpDir, "test", "test-policy", state.NewDefaultResourceStateGeneratorRegistry(), true, metrics.DummyMetrics{}) + assert.NoError(t, err) + + deviceTopologyRegistry := machine.NewDeviceTopologyRegistry() + + basePlugin := &baseplugin.BasePlugin{ + Conf: conf, + Emitter: metrics.DummyMetrics{}, + MetaServer: agentCtx.MetaServer, + AgentCtx: agentCtx, + PodAnnotationKeptKeys: []string{}, + PodLabelKeptKeys: []string{}, + State: stateImpl, + DeviceTopologyRegistry: deviceTopologyRegistry, + DefaultResourceStateGeneratorRegistry: state.NewDefaultResourceStateGeneratorRegistry(), + } + + staticPolicy := &StaticPolicy{ + BasePlugin: basePlugin, + resourcePlugins: make(map[string]resourceplugin.ResourcePlugin), + customDevicePlugins: make(map[string]customdeviceplugin.CustomDevicePlugin), + associatedDeviceNames: sets.NewString(), + } + + err = staticPolicy.registerDefaultResourcePlugins() + assert.NoError(t, err) + + err = staticPolicy.registerDefaultCustomDevicePlugins() + assert.NoError(t, err) + + return staticPolicy +} + +func TestStaticPolicy_Allocate(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + // Register stubbed resource plugin + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req 
:= &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Check state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testResourcePluginName, podUID, testName) + fmt.Println(allocationInfo) + assert.NotNil(t, allocationInfo) + + // Allocating to an invalid resource plugin returns error + invalidReq := &pluginapi.ResourceRequest{ + PodUid: string(uuid.NewUUID()), + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: "invalid-plugin", + ResourceRequests: map[string]float64{ + "invalid-plugin": 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = policy.Allocate(context.Background(), invalidReq) + assert.Error(t, err) +} + +func TestStaticPolicy_RemovePod(t *testing.T) { + t.Parallel() + policy := makeTestStaticPolicy(t) + + // Register stubbed resource plugin + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + deviceTopologyProviderStub := machine.NewDeviceTopologyProviderStub() + + testDeviceTopology := &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "test-1": {}, + }, + } + err := deviceTopologyProviderStub.SetDeviceTopology(testDeviceTopology) + assert.NoError(t, err) + + policy.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(testResourcePluginName, deviceTopologyProviderStub) + + policy.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(testResourcePluginName, + 
state.NewGenericDefaultResourceStateGenerator(testResourcePluginName, policy.DeviceTopologyRegistry)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Remove pod + _, err = policy.RemovePod(context.Background(), &pluginapi.RemovePodRequest{ + PodUid: podUID, + }) + assert.NoError(t, err) + + // Check state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testResourcePluginName, podUID, testName) + assert.Nil(t, allocationInfo) +} + +func TestStaticPolicy_GetTopologyHints(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + + testName := "test" + podUID := string(uuid.NewUUID()) + + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + resp, err := policy.GetTopologyHints(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Getting topology hints from an invalid resource plugin returns error + invalidReq := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: 
"invalid-plugin", + ResourceRequests: map[string]float64{ + "invalid-plugin": 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err = policy.GetTopologyHints(context.Background(), invalidReq) + assert.Error(t, err) +} + +func TestStaticPolicy_GetTopologyAwareResources(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + testName := "test" + podUID := string(uuid.NewUUID()) + + // Allocate first + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + // Get topology aware resources + getTopologyAwareResourcesReq := &pluginapi.GetTopologyAwareResourcesRequest{ + PodUid: podUID, + ContainerName: testName, + } + + resp, err := policy.GetTopologyAwareResources(context.Background(), getTopologyAwareResourcesReq) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Invalid request does not return error + invalidReq := &pluginapi.GetTopologyAwareResourcesRequest{ + PodUid: podUID, + ContainerName: "invalid-container", + } + + resp, err = policy.GetTopologyAwareResources(context.Background(), invalidReq) + assert.NoError(t, err) + assert.NotNil(t, resp) +} + +func TestStaticPolicy_mergeTopologyAwareResourcesResponse(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + podUID string + containerName string + respList []*pluginapi.GetTopologyAwareResourcesResponse + expectedResp *pluginapi.GetTopologyAwareResourcesResponse + expectedErr bool + }{ + { + name: "test merging of response", + podUID: "test-pod", + 
containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedResp: &pluginapi.GetTopologyAwareResourcesResponse{ + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + "test-resource-2": {}, + }, + }, + }, + }, + { + name: "pod name is not the same, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name-2", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedErr: true, + }, + { + name: "pod namespace is not the 
same, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace-2", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-2": {}, + }, + }, + }, + }, + expectedErr: true, + }, + { + name: "container topology aware resources is nil, return an error", + podUID: "test-pod", + containerName: "test-container", + respList: []*pluginapi.GetTopologyAwareResourcesResponse{ + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ + ContainerName: "test-container", + AllocatedResources: map[string]*pluginapi.TopologyAwareResource{ + "test-resource-1": {}, + }, + }, + }, + { + PodUid: "test-pod", + PodName: "test-pod-name", + PodNamespace: "test-pod-namespace", + ContainerTopologyAwareResources: nil, + }, + }, + expectedErr: true, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + policy := makeTestStaticPolicy(t) + + resp, err := policy.mergeTopologyAwareResourcesResponse(tt.podUID, tt.containerName, tt.respList) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedResp, resp) + } + }) + } +} + +func TestStaticPolicy_GetTopologyAwareAllocatableResources(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + 
policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + testName := "test" + podUID := string(uuid.NewUUID()) + + // Allocate first + req := &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + } + + _, err := policy.Allocate(context.Background(), req) + assert.NoError(t, err) + + getTopologyAwareAllocatableResourcesReq := &pluginapi.GetTopologyAwareAllocatableResourcesRequest{} + resp, err := policy.GetTopologyAwareAllocatableResources(context.Background(), getTopologyAwareAllocatableResourcesReq) + assert.NoError(t, err) + assert.NotNil(t, resp) +} + +func TestStaticPolicy_UpdateAllocatableAssociatedDevices(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub(policy.BasePlugin)) + + req := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: testCustomDevicePluginName, + Devices: []*pluginapi.AssociatedDevice{ + { + ID: "test-device-1", + }, + }, + } + + resp, err := policy.UpdateAllocatableAssociatedDevices(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Test error handling for no device request + noDeviceRequest := &pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: testCustomDevicePluginName, + } + + resp, err = policy.UpdateAllocatableAssociatedDevices(context.Background(), noDeviceRequest) + assert.Error(t, err) + assert.Nil(t, resp) + + // Test error handling for non-existent custom device plugin + invalidReq := 
&pluginapi.UpdateAllocatableAssociatedDevicesRequest{ + DeviceName: "non-existent-device", + } + + resp, err = policy.UpdateAllocatableAssociatedDevices(context.Background(), invalidReq) + assert.Error(t, err) + assert.Nil(t, resp) +} + +func TestStaticPolicy_AllocateAssociatedDevices(t *testing.T) { + t.Parallel() + + policy := makeTestStaticPolicy(t) + policy.RegisterResourcePlugin(resourceplugin.NewResourcePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub(policy.BasePlugin)) + policy.RegisterCustomDevicePlugin(customdeviceplugin.NewCustomDevicePluginStub2(policy.BasePlugin)) + + podUID := string(uuid.NewUUID()) + + testName := "test" + + req := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ContainerIndex: 0, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + }, + DeviceName: testCustomDevicePluginName, + AccompanyResourceName: testResourcePluginName, + } + + resp, err := policy.AllocateAssociatedDevice(context.Background(), req) + + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify in state + stateImpl := policy.State + allocationInfo := stateImpl.GetAllocationInfo(testCustomDevicePluginName, podUID, testName) + assert.NotNil(t, allocationInfo) + + podUID = string(uuid.NewUUID()) + // Error handling if there is no target device + noTargetDeviceReq := &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: 
testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + }, + DeviceName: "invalid-device", + AccompanyResourceName: testResourcePluginName, + } + + resp, err = policy.AllocateAssociatedDevice(context.Background(), noTargetDeviceReq) + + assert.Error(t, err) + assert.Nil(t, resp) + + // Accompany resource is another custom device plugin + req = &pluginapi.AssociatedDeviceRequest{ + ResourceRequest: &pluginapi.ResourceRequest{ + PodUid: podUID, + PodNamespace: testName, + PodName: testName, + ContainerName: testName, + ContainerType: pluginapi.ContainerType_MAIN, + ResourceName: testResourcePluginName, + ResourceRequests: map[string]float64{ + testResourcePluginName: 2, + }, + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + DeviceRequest: []*pluginapi.DeviceRequest{ + { + DeviceName: testCustomDevicePluginName, + }, + { + DeviceName: testCustomDevicePluginName2, + }, + }, + DeviceName: testCustomDevicePluginName, + AccompanyResourceName: testCustomDevicePluginName2, + } + + resp, err = policy.AllocateAssociatedDevice(context.Background(), req) + assert.NoError(t, err) + assert.NotNil(t, resp) + + // Verify state + allocationInfo = stateImpl.GetAllocationInfo(testCustomDevicePluginName, podUID, testName) + assert.NotNil(t, allocationInfo) + + allocationInfo = stateImpl.GetAllocationInfo(testCustomDevicePluginName2, podUID, testName) + assert.NotNil(t, allocationInfo) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go new file mode 100644 index 0000000000..5202c9b862 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/defaults.go @@ -0,0 +1,70 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// registerDefaultFilterStrategies register filtering strategies
+func registerDefaultFilterStrategies(manager *StrategyManager) { + if err := manager.RegisterFilteringStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { + general.Errorf("Failed to register filtering strategy: %v", err) + } + + if err := manager.RegisterFilteringStrategy(canonical.NewCanonicalStrategy()); err != nil { + general.Errorf("Failed to register filtering strategy: %v", err) + } +} + +// registerDefaultSortingStrategies register sorting strategies +func registerDefaultSortingStrategies(manager *StrategyManager) { + if err := manager.RegisterSortingStrategy(gpu_memory.NewGPUMemoryStrategy()); err != nil { + general.Errorf("Failed to register sorting strategy: %v", err) + } +} + +// registerDefaultBindingStrategies register binding strategies +func registerDefaultBindingStrategies(manager *StrategyManager) { + if err := manager.RegisterBindingStrategy(canonical.NewCanonicalStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", 
err) + } + + if err := manager.RegisterBindingStrategy(deviceaffinity.NewDeviceAffinityStrategy()); err != nil { + general.Errorf("Failed to register binding strategy: %v", err) + } +} + +// registerDefaultAllocationStrategies register allocation strategies +func registerDefaultAllocationStrategies(manager *StrategyManager) { + if err := manager.RegisterGenericAllocationStrategy(allocationStrategyNameDefault, + []string{canonical.StrategyNameCanonical, gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical); err != nil { + general.Errorf("Failed to register gpu-memory-default strategy: %v", err) + } +} + +// registerDefaultStrategies registers the default strategies +func registerDefaultStrategies(manager *StrategyManager) { + registerDefaultFilterStrategies(manager) + registerDefaultSortingStrategies(manager) + registerDefaultBindingStrategies(manager) + registerDefaultAllocationStrategies(manager) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go new file mode 100644 index 0000000000..fe78af5de7 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/helper.go @@ -0,0 +1,68 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package manager + +import ( + "fmt" + + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/metaserver" + "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +// AllocateGPUUsingStrategy performs GPU allocation using the strategy framework +func AllocateGPUUsingStrategy( + resourceReq *pluginapi.ResourceRequest, + deviceReq *pluginapi.DeviceRequest, + gpuTopology *machine.DeviceTopology, + gpuConfig *qrm.GPUQRMPluginConfig, + emitter metrics.MetricEmitter, + metaServer *metaserver.MetaServer, + machineState state.AllocationResourcesMap, + qosLevel string, +) (*allocate.AllocationResult, error) { + // Get hint nodes + hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to get hint nodes: %v", err), + }, err + } + + // Create allocation context + ctx := &allocate.AllocationContext{ + ResourceReq: resourceReq, + DeviceReq: deviceReq, + DeviceTopology: gpuTopology, + GPUQRMPluginConfig: gpuConfig, + Emitter: emitter, + MetaServer: metaServer, + MachineState: machineState, + QoSLevel: qosLevel, + HintNodes: hintNodes, + } + + // Get the global strategy manager and perform allocation + manager := GetGlobalStrategyManager() + return manager.AllocateUsingStrategy(ctx) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go new file mode 100644 index 0000000000..f77b4d1fa6 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager.go @@ -0,0 +1,193 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package manager + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +const ( + allocationStrategyNameDefault = "default" +) + +// StrategyManager manages the selection of allocation strategies based on resource names +type StrategyManager struct { + *registry.StrategyRegistry + + // Mapping from resource name to strategy name + resourceToStrategy map[string]string + + // Default strategy to use when no specific strategy is configured + defaultStrategy string + + // Mutex for thread-safe access + mutex sync.RWMutex +} + +// NewStrategyManager creates a new strategy manager +func NewStrategyManager() *StrategyManager { + return &StrategyManager{ + StrategyRegistry: registry.NewStrategyRegistry(), + resourceToStrategy: make(map[string]string), + defaultStrategy: allocationStrategyNameDefault, + } +} + +// RegisterStrategyForResource registers a strategy for a specific resource name +func (m *StrategyManager) RegisterStrategyForResource(resourceName, strategyName string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + // Check if the strategy exists + _, err := m.GetAllocationStrategy(strategyName) + if err != nil { + 
return fmt.Errorf("strategy %s not found: %v", strategyName, err) + } + + m.resourceToStrategy[resourceName] = strategyName + general.Infof("Registered strategy %s for resource %s", strategyName, resourceName) + return nil +} + +func (m *StrategyManager) GetDefaultStrategy() (allocate.AllocationStrategy, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + + return m.GetAllocationStrategy(m.defaultStrategy) +} + +// SetDefaultStrategy sets the default strategy to use when no specific strategy is configured +func (m *StrategyManager) SetDefaultStrategy(strategyName string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + // Check if the strategy exists + _, err := m.GetAllocationStrategy(strategyName) + if err != nil { + return fmt.Errorf("strategy %s not found: %v", strategyName, err) + } + + m.defaultStrategy = strategyName + general.Infof("Set default strategy to %s", strategyName) + return nil +} + +// GetStrategyForResource returns the strategy name for a given resource +func (m *StrategyManager) GetStrategyForResource(resourceName string) string { + m.mutex.RLock() + defer m.mutex.RUnlock() + + if strategyName, exists := m.resourceToStrategy[resourceName]; exists { + return strategyName + } + + return m.defaultStrategy +} + +// getAllocationStrategyForResource returns the allocation strategy for a given resource +func (m *StrategyManager) getAllocationStrategyForResource(customAllocationStrategy map[string]string, resourceName string) (allocate.AllocationStrategy, error) { + var strategyName string + if customStrategy, exists := customAllocationStrategy[resourceName]; exists { + strategyName = customStrategy + } else { + strategyName = m.GetStrategyForResource(resourceName) + } + + return m.GetAllocationStrategy(strategyName) +} + +// AllocateUsingStrategy performs allocation using the appropriate strategy for the resource +func (m *StrategyManager) AllocateUsingStrategy(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { + // 
Determine the device name + resourceName := ctx.DeviceReq.DeviceName + customAllocationStrategy := ctx.GPUQRMPluginConfig.CustomAllocationStrategy + + // Get the strategy for this resource + strategy, err := m.getAllocationStrategyForResource(customAllocationStrategy, resourceName) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to get strategy for resource %s: %v", resourceName, err), + }, fmt.Errorf("failed to get strategy for resource %s: %v", resourceName, err) + } + + general.InfoS("Using strategy for allocation", + "resourceName", resourceName, + "strategyName", strategy.Name(), + "podNamespace", ctx.ResourceReq.PodNamespace, + "podName", ctx.ResourceReq.PodName, + "containerName", ctx.ResourceReq.ContainerName) + + // Perform allocation using the strategy + return strategy.Allocate(ctx) +} + +// RegisterGenericAllocationStrategy registers a complete generic allocation strategy with the given name +func (m *StrategyManager) RegisterGenericAllocationStrategy( + name string, filteringNames []string, sortingName, bindingName string, +) error { + var filteringList []allocate.FilteringStrategy + for _, fn := range filteringNames { + filtering, err := m.StrategyRegistry.GetFilteringStrategy(fn) + if err != nil { + return fmt.Errorf("filtering strategy %s not found: %v", fn, err) + } + filteringList = append(filteringList, filtering) + } + + sorting, err := m.StrategyRegistry.GetSortingStrategy(sortingName) + if err != nil { + return fmt.Errorf("sorting strategy %s not found: %v", sortingName, err) + } + + binding, err := m.StrategyRegistry.GetBindingStrategy(bindingName) + if err != nil { + return fmt.Errorf("binding strategy %s not found: %v", bindingName, err) + } + + err = m.StrategyRegistry.RegisterAllocationStrategy(allocation.NewGenericAllocationStrategy(name, m.StrategyRegistry, filteringList, sorting, binding)) + if err != nil { + return fmt.Errorf("register allocation strategy %s failed: %v", 
name, err) + } + + general.Infof("Registered allocation strategy: %s (filtering: %s, sorting: %s, binding: %s)", + name, filteringNames, sortingName, bindingName) + return nil +} + +// Global strategy manager instance +var ( + globalStrategyManager *StrategyManager + once sync.Once +) + +// GetGlobalStrategyManager returns the global strategy manager instance +func GetGlobalStrategyManager() *StrategyManager { + once.Do(func() { + globalStrategyManager = NewStrategyManager() + + // Register default strategies + registerDefaultStrategies(globalStrategyManager) + }) + return globalStrategyManager +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go new file mode 100644 index 0000000000..5a9a075249 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager/manager_test.go @@ -0,0 +1,61 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package manager + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory" +) + +func TestStrategyManager(t *testing.T) { + t.Parallel() + + manager := NewStrategyManager() + + registerDefaultStrategies(manager) + + // Test default strategy + assert.Equal(t, "default", manager.defaultStrategy) + + // Test setting default strategy + err := manager.SetDefaultStrategy("test-allocation") + assert.Error(t, err) // Should fail because test-allocation is not registered yet + + // Test registering strategy for resource + err = manager.RegisterStrategyForResource("test-resource", "test-allocation") + assert.Error(t, err) // Should fail because test-allocation is not registered yet + + err = manager.RegisterGenericAllocationStrategy("test-allocation", []string{gpu_memory.StrategyNameGPUMemory}, + gpu_memory.StrategyNameGPUMemory, canonical.StrategyNameCanonical) + assert.NoError(t, err) + + // Now test registering strategy for resource + err = manager.RegisterStrategyForResource("test-resource", "test-allocation") + assert.NoError(t, err) + + // Test getting strategy for resource + strategyName := manager.GetStrategyForResource("test-resource") + assert.Equal(t, "test-allocation", strategyName) + + // Test getting default strategy for non-existent resource + strategyName = manager.GetStrategyForResource("non-existent-resource") + assert.Equal(t, "default", strategyName) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go new file mode 100644 index 0000000000..48b58014f9 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry.go @@ -0,0 +1,151 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registry + +import ( + "fmt" + "sync" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +type StrategyRegistry struct { + filteringStrategies map[string]allocate.FilteringStrategy + sortingStrategies map[string]allocate.SortingStrategy + bindingStrategies map[string]allocate.BindingStrategy + allocationStrategies map[string]allocate.AllocationStrategy + // Mutex for thread-safe access to registries + registryMutex sync.RWMutex +} + +// NewStrategyRegistry creates a new instance of StrategyRegistry +func NewStrategyRegistry() *StrategyRegistry { + return &StrategyRegistry{ + filteringStrategies: make(map[string]allocate.FilteringStrategy), + sortingStrategies: make(map[string]allocate.SortingStrategy), + bindingStrategies: make(map[string]allocate.BindingStrategy), + allocationStrategies: make(map[string]allocate.AllocationStrategy), + } +} + +// RegisterFilteringStrategy registers a filtering strategy with the given name +func (r *StrategyRegistry) RegisterFilteringStrategy(strategy allocate.FilteringStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.filteringStrategies[strategy.Name()]; exists { + return fmt.Errorf("filtering strategy with name %s already registered", strategy.Name()) + } + + r.filteringStrategies[strategy.Name()] = strategy + general.Infof("Registered filtering 
strategy: %s", strategy.Name()) + return nil +} + +// RegisterSortingStrategy registers a sorting strategy with the given name +func (r *StrategyRegistry) RegisterSortingStrategy(strategy allocate.SortingStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.sortingStrategies[strategy.Name()]; exists { + return fmt.Errorf("sorting strategy with name %s already registered", strategy.Name()) + } + + r.sortingStrategies[strategy.Name()] = strategy + general.Infof("Registered sorting strategy: %s", strategy.Name()) + return nil +} + +// RegisterBindingStrategy registers a binding strategy with the given name +func (r *StrategyRegistry) RegisterBindingStrategy(strategy allocate.BindingStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.bindingStrategies[strategy.Name()]; exists { + return fmt.Errorf("binding strategy with name %s already registered", strategy.Name()) + } + + r.bindingStrategies[strategy.Name()] = strategy + general.Infof("Registered binding strategy: %s", strategy.Name()) + return nil +} + +func (r *StrategyRegistry) RegisterAllocationStrategy(strategy allocate.AllocationStrategy) error { + r.registryMutex.Lock() + defer r.registryMutex.Unlock() + + if _, exists := r.allocationStrategies[strategy.Name()]; exists { + return fmt.Errorf("allocation strategy with name %s already registered", strategy.Name()) + } + + r.allocationStrategies[strategy.Name()] = strategy + general.Infof("Registered allocation strategy: %s", strategy.Name()) + return nil +} + +// GetFilteringStrategy returns the filtering strategy with the given name +func (r *StrategyRegistry) GetFilteringStrategy(name string) (allocate.FilteringStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.filteringStrategies[name] + if !exists { + return nil, fmt.Errorf("filtering strategy %s not found", name) + } + + return strategy, nil +} + +// 
GetSortingStrategy returns the sorting strategy with the given name +func (r *StrategyRegistry) GetSortingStrategy(name string) (allocate.SortingStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.sortingStrategies[name] + if !exists { + return nil, fmt.Errorf("sorting strategy %s not found", name) + } + + return strategy, nil +} + +// GetBindingStrategy returns the binding strategy with the given name +func (r *StrategyRegistry) GetBindingStrategy(name string) (allocate.BindingStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.bindingStrategies[name] + if !exists { + return nil, fmt.Errorf("binding strategy %s not found", name) + } + + return strategy, nil +} + +// GetAllocationStrategy returns the allocation strategy with the given name +func (r *StrategyRegistry) GetAllocationStrategy(name string) (allocate.AllocationStrategy, error) { + r.registryMutex.RLock() + defer r.registryMutex.RUnlock() + + strategy, exists := r.allocationStrategies[name] + if !exists { + return nil, fmt.Errorf("allocation strategy %s not found", name) + } + + return strategy, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go new file mode 100644 index 0000000000..d013c6a985 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry/registry_test.go @@ -0,0 +1,97 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registry + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +) + +type dummyStrategy struct { + name string +} + +func (s *dummyStrategy) Name() string { + return s.name +} + +func (s *dummyStrategy) Filter(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Sort(_ *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + return allAvailableDevices, nil +} + +func (s *dummyStrategy) Bind(_ *allocate.AllocationContext, _ []string) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + +func (s *dummyStrategy) Allocate(_ *allocate.AllocationContext) (*allocate.AllocationResult, error) { + return &allocate.AllocationResult{}, nil +} + +func TestStrategyRegistry(t *testing.T) { + t.Parallel() + + registry := NewStrategyRegistry() + // Test filtering strategy registration + filteringStrategy := &dummyStrategy{name: "test-filtering"} + err := registry.RegisterFilteringStrategy(filteringStrategy) + assert.NoError(t, err) + + // Test duplicate registration + err = registry.RegisterFilteringStrategy(filteringStrategy) + assert.Error(t, err) + + // Test filtering strategy retrieval + retrievedStrategy, err := registry.GetFilteringStrategy("test-filtering") + assert.NoError(t, err) + assert.Equal(t, "test-filtering", retrievedStrategy.Name()) + + // Test non-existent strategy + _, err = registry.GetFilteringStrategy("non-existent") + assert.Error(t, err) + + // Test sorting strategy registration + sortingStrategy := &dummyStrategy{name: "test-sorting"} + err = registry.RegisterSortingStrategy(sortingStrategy) + assert.NoError(t, err) + + // Test strategy retrieval + retrievedSortingStrategy, err := 
registry.GetSortingStrategy("test-sorting") + assert.NoError(t, err) + assert.Equal(t, "test-sorting", retrievedSortingStrategy.Name()) + + // Test binding strategy registration + bindingStrategy := &dummyStrategy{name: "test-binding"} + err = registry.RegisterBindingStrategy(bindingStrategy) + assert.NoError(t, err) + + // Test allocation strategy registration + allocatingStrategy := &dummyStrategy{name: "test-allocation"} + err = registry.RegisterAllocationStrategy(allocatingStrategy) + assert.NoError(t, err) + + // Test allocation strategy retrieval + allocationStrategy, err := registry.GetAllocationStrategy("test-allocation") + assert.NoError(t, err) + assert.Equal(t, "test-allocation", allocationStrategy.Name()) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go new file mode 100644 index 0000000000..f5c55eaf2d --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/allocation/generic.go @@ -0,0 +1,176 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package allocation + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/registry" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// GenericAllocationStrategy combines filtering, sorting, and binding strategies +type GenericAllocationStrategy struct { + name string + registry *registry.StrategyRegistry + filteringStrategies []allocate.FilteringStrategy + sortingStrategy allocate.SortingStrategy + bindingStrategy allocate.BindingStrategy +} + +// NewGenericAllocationStrategy creates a new allocation strategy with the given components +func NewGenericAllocationStrategy(name string, + registry *registry.StrategyRegistry, + filtering []allocate.FilteringStrategy, + sorting allocate.SortingStrategy, + binding allocate.BindingStrategy, +) *GenericAllocationStrategy { + return &GenericAllocationStrategy{ + name: name, + registry: registry, + filteringStrategies: filtering, + sortingStrategy: sorting, + bindingStrategy: binding, + } +} + +var _ allocate.AllocationStrategy = &GenericAllocationStrategy{} + +func (s *GenericAllocationStrategy) Name() string { + return s.name +} + +func (s *GenericAllocationStrategy) Clone(name string) *GenericAllocationStrategy { + filteringStrategies := make([]allocate.FilteringStrategy, len(s.filteringStrategies)) + copy(filteringStrategies, s.filteringStrategies) + return &GenericAllocationStrategy{ + name: name, + registry: s.registry, + filteringStrategies: filteringStrategies, + sortingStrategy: s.sortingStrategy, + bindingStrategy: s.bindingStrategy, + } +} + +// Allocate performs the allocation using the combined strategies +func (s *GenericAllocationStrategy) Allocate(ctx *allocate.AllocationContext) (*allocate.AllocationResult, error) { + var err error + resourceName := ctx.DeviceReq.DeviceName + allAvailableDevices := append(ctx.DeviceReq.ReusableDevices, ctx.DeviceReq.AvailableDevices...) 
+ // Apply filtering strategy + for _, fs := range s.getFilteringStrategies(ctx, resourceName) { + allAvailableDevices, err = fs.Filter(ctx, allAvailableDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + } + + // Apply sorting strategy + sortedDevices, err := s.getSortingStrategy(ctx, resourceName).Sort(ctx, allAvailableDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + + // Apply binding strategy + result, err := s.getBindingStrategy(ctx, resourceName).Bind(ctx, sortedDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: err.Error(), + }, err + } + + return result, nil +} + +// GetFilteringStrategy returns the filtering strategy +func (s *GenericAllocationStrategy) GetFilteringStrategy() []allocate.FilteringStrategy { + return s.filteringStrategies +} + +// SetFilteringStrategy sets the filtering strategy +func (s *GenericAllocationStrategy) SetFilteringStrategy(filteringStrategies []allocate.FilteringStrategy) { + s.filteringStrategies = filteringStrategies +} + +// GetSortingStrategy returns the sorting strategy +func (s *GenericAllocationStrategy) GetSortingStrategy() allocate.SortingStrategy { + return s.sortingStrategy +} + +// SetSortingStrategy sets the sorting strategy +func (s *GenericAllocationStrategy) SetSortingStrategy(sortingStrategy allocate.SortingStrategy) { + s.sortingStrategy = sortingStrategy +} + +// GetBindingStrategy returns the binding strategy +func (s *GenericAllocationStrategy) GetBindingStrategy() allocate.BindingStrategy { + return s.bindingStrategy +} + +// SetBindingStrategy sets the binding strategy +func (s *GenericAllocationStrategy) SetBindingStrategy(bindingStrategy allocate.BindingStrategy) { + s.bindingStrategy = bindingStrategy +} + +func (s *GenericAllocationStrategy) getFilteringStrategies(ctx *allocate.AllocationContext, 
resourceName string) []allocate.FilteringStrategy { + if strategyNames, ok := ctx.GPUQRMPluginConfig.CustomFilteringStrategies[resourceName]; ok { + filteringStrategies := make([]allocate.FilteringStrategy, len(strategyNames)) + for _, fs := range strategyNames { + fs, err := s.registry.GetFilteringStrategy(fs) + if err != nil { + general.Errorf("failed to get filtering strategy %s: %v", fs, err) + continue + } + filteringStrategies = append(filteringStrategies, fs) + } + return filteringStrategies + } else { + return s.filteringStrategies + } +} + +func (s *GenericAllocationStrategy) getSortingStrategy(ctx *allocate.AllocationContext, resourceName string) allocate.SortingStrategy { + if strategyName, ok := ctx.GPUQRMPluginConfig.CustomSortingStrategy[resourceName]; ok { + sortingStrategy, err := s.registry.GetSortingStrategy(strategyName) + if err != nil { + general.Errorf("failed to get sorting strategy %s: %v", strategyName, err) + sortingStrategy = s.sortingStrategy + } + return sortingStrategy + } else { + return s.sortingStrategy + } +} + +func (s *GenericAllocationStrategy) getBindingStrategy(ctx *allocate.AllocationContext, resourceName string) allocate.BindingStrategy { + if strategyName, ok := ctx.GPUQRMPluginConfig.CustomBindingStrategy[resourceName]; ok { + bindingStrategy, err := s.registry.GetBindingStrategy(strategyName) + if err != nil { + general.Errorf("failed to get binding strategy %s: %v", strategyName, err) + bindingStrategy = s.bindingStrategy + } + return bindingStrategy + } else { + return s.bindingStrategy + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go new file mode 100644 index 0000000000..fb3c8c65e7 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind.go @@ -0,0 +1,80 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package canonical + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// Bind binds the sorted GPU devices to the allocation context +// It creates allocation info for the selected devices +func (s *CanonicalStrategy) Bind( + ctx *allocate.AllocationContext, sortedDevices []string, +) (*allocate.AllocationResult, error) { + valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) + if !valid { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: errMsg, + }, fmt.Errorf(errMsg) + } + + devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) + allocatedDevices := sets.NewString() + allocateDevices := func(devices ...string) bool { + for _, device := range devices { + allocatedDevices.Insert(device) + if devicesToAllocate == allocatedDevices.Len() { + return true + } + } + return false + } + + // First try to bind reusable devices + if allocateDevices(ctx.DeviceReq.ReusableDevices...) { + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + // Then try to bind devices from sorted list + if allocateDevices(sortedDevices...) 
{ + general.InfoS("Successfully bound devices", + "podNamespace", ctx.ResourceReq.PodNamespace, + "podName", ctx.ResourceReq.PodName, + "containerName", ctx.ResourceReq.ContainerName, + "allocatedDevices", allocatedDevices.List()) + + return &allocate.AllocationResult{ + AllocatedDevices: allocatedDevices.UnsortedList(), + Success: true, + }, nil + } + + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)), + }, fmt.Errorf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go new file mode 100644 index 0000000000..6093729181 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/bind_test.go @@ -0,0 +1,143 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package canonical + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestCanonicalStrategy_Bind(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedResult *allocate.AllocationResult + expectedErr bool + }{ + { + name: "device topology is nil", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "device request is greater than available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "bind all the reusable devices first", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + ReusableDevices: []string{"gpu-1", "gpu-2"}, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2"}, + 
}, + }, + { + name: "bind all the reusable devices first and then the available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + PodNamespace: "default", + PodName: "podName", + ContainerName: "containerName", + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 4, + ReusableDevices: []string{"gpu-1", "gpu-2"}, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + canonicalStrategy := NewCanonicalStrategy() + result, err := canonicalStrategy.Bind(tt.ctx, tt.sortedDevices) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedResult.AllocatedDevices, result.AllocatedDevices) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go new file mode 100644 index 0000000000..a63094487c --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/canonical.go @@ -0,0 +1,43 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package canonical + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +) + +const ( + StrategyNameCanonical = "canonical" +) + +// CanonicalStrategy binds GPU devices to the allocation context +type CanonicalStrategy struct{} + +// NewCanonicalStrategy creates a new default binding strategy +func NewCanonicalStrategy() *CanonicalStrategy { + return &CanonicalStrategy{} +} + +var ( + _ allocate.BindingStrategy = &CanonicalStrategy{} + _ allocate.FilteringStrategy = &CanonicalStrategy{} +) + +// Name returns the name of the binding strategy +func (s *CanonicalStrategy) Name() string { + return StrategyNameCanonical +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go new file mode 100644 index 0000000000..7782527ae5 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter.go @@ -0,0 +1,42 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package canonical + +import ( + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" +) + +// Filter filters the available devices based on whether they are already occupied. +// The assumption is that each device can only be allocated to one container at most. +// It only returns devices that are not occupied yet. +func (s *CanonicalStrategy) Filter( + ctx *allocate.AllocationContext, allAvailableDevices []string, +) ([]string, error) { + filteredDevices := sets.NewString() + for _, device := range allAvailableDevices { + if !ctx.HintNodes.IsEmpty() && !gpuutil.IsNUMAAffinityDevice(device, ctx.DeviceTopology, ctx.HintNodes) { + continue + } + + filteredDevices.Insert(device) + } + + return filteredDevices.UnsortedList(), nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go new file mode 100644 index 0000000000..b53e40c3f3 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/canonical/filter_test.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package canonical + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestCanonicalStrategy_Filter(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + availableDevices []string + expectedFilteredDevices []string + }{ + { + name: "empty hint nodes does not filter", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0, 1}, + }, + "gpu-2": { + NumaNodes: []int{2, 3}, + }, + }, + }, + }, + availableDevices: []string{"gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-1", "gpu-2"}, + }, + { + name: "filtered devices by hint nodes", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0, 1}, + }, + "gpu-2": { + NumaNodes: []int{2, 3}, + }, + }, + }, + HintNodes: machine.NewCPUSet(0, 1, 2), + }, + availableDevices: []string{"gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-1"}, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + canonicalStrategy := NewCanonicalStrategy() + actualFilteredDevices, err := canonicalStrategy.Filter(tt.ctx, tt.availableDevices) + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedFilteredDevices, actualFilteredDevices, "filtered devices are not equal") + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go new file mode 100644 index 0000000000..d71b992235 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -0,0 +1,388 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package deviceaffinity + +import ( + "fmt" + "sort" + + "github.com/google/uuid" + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +// affinityGroup is a group of devices that have affinity to each other. +// It is uniquely identified by an id. +type affinityGroup struct { + id string + unallocatedDevices sets.String +} + +// Bind binds the sorted devices to the allocation context by searching for the devices that have affinity to each other. +func (s *DeviceAffinityStrategy) Bind( + ctx *allocate.AllocationContext, sortedDevices []string, +) (*allocate.AllocationResult, error) { + general.InfoS("device affinity strategy binding called", + "available devices", sortedDevices) + + valid, errMsg := strategies.IsBindingContextValid(ctx, sortedDevices) + if !valid { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: errMsg, + }, fmt.Errorf(errMsg) + } + + devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) + reusableDevicesSet := sets.NewString(ctx.DeviceReq.ReusableDevices...) + + // All devices that are passed into the strategy are unallocated devices + unallocatedDevicesSet := sets.NewString(sortedDevices...) 
+ + // Get a map of affinity groups that is grouped by priority + affinityMap := ctx.DeviceTopology.GroupDeviceAffinity() + + // Get affinity groups organized by priority level + affinityGroupsMap := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet) + + // Allocate reusable devices first + allocatedDevices, err := s.allocateCandidateDevices(affinityGroupsMap, + reusableDevicesSet.Intersection(unallocatedDevicesSet), devicesToAllocate, sets.NewString()) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to allocate reusable devices: %v", err), + }, fmt.Errorf("failed to allocate reusable devices: %v", err) + } + + if len(allocatedDevices) == devicesToAllocate { + return &allocate.AllocationResult{ + Success: true, + AllocatedDevices: allocatedDevices.UnsortedList(), + }, nil + } + + // Next, allocate left available devices + availableDevices := unallocatedDevicesSet.Difference(allocatedDevices) + allocatedDevices, err = s.allocateCandidateDevices(affinityGroupsMap, + availableDevices, devicesToAllocate, allocatedDevices) + if err != nil { + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("failed to allocate available devices with affinity: %v", err), + }, fmt.Errorf("failed to allocate available devices with affinity: %v", err) + } + + // Return result once we have allocated all the devices + if len(allocatedDevices) == devicesToAllocate { + return &allocate.AllocationResult{ + Success: true, + AllocatedDevices: allocatedDevices.UnsortedList(), + }, nil + } + + return &allocate.AllocationResult{ + Success: false, + ErrorMessage: fmt.Sprintf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)), + }, fmt.Errorf("not enough devices to allocate: need %d, have %d", devicesToAllocate, len(allocatedDevices)) +} + +// getAffinityGroupsByPriority forms a map of affinityGroup by priority. 
+func (s *DeviceAffinityStrategy) getAffinityGroupsByPriority( + affinityMap map[machine.AffinityPriority][]machine.DeviceIDs, unallocatedDevicesSet sets.String, +) map[machine.AffinityPriority][]affinityGroup { + affinityGroupsMap := make(map[machine.AffinityPriority][]affinityGroup) + for priority, affinityDevices := range affinityMap { + affinityGroupsMap[priority] = s.getAffinityGroups(affinityDevices, unallocatedDevicesSet) + } + + return affinityGroupsMap +} + +// getAffinityGroups forms a list of affinityGroup with unallocated devices. +func (s *DeviceAffinityStrategy) getAffinityGroups( + affinityDevices []machine.DeviceIDs, unallocatedDevicesSet sets.String, +) []affinityGroup { + affinityGroups := make([]affinityGroup, 0, len(affinityDevices)) + + // Calculate the number of unallocated devices for each affinity group + for _, devices := range affinityDevices { + unallocatedDevices := sets.NewString() + for _, device := range devices { + if unallocatedDevicesSet.Has(device) { + unallocatedDevices.Insert(device) + } + } + affinityGroups = append(affinityGroups, affinityGroup{ + unallocatedDevices: unallocatedDevices, + id: uuid.NewString(), + }) + } + + return affinityGroups +} + +// allocateCandidateDevices optimally allocates GPU devices based on affinity priorities. +// This method implements a sophisticated allocation strategy that: +// 1. Prioritizes device groups with higher affinity levels +// 2. Minimizes fragmentation by selecting devices with strong mutual affinity +// 3. 
Balances between fulfilling exact requirements and maintaining optimal groupings +// +// Parameters: +// - affinityGroupsMap: Mapping of affinity priorities to device groups with those priorities +// - candidateDevicesSet: Set of available devices that can be allocated +// - devicesToAllocate: Total number of devices that need to be allocated +// - allocatedDevices: Set of devices that have already been allocated in previous iterations +// +// Returns: +// - sets.String: The complete set of allocated devices after this allocation round +// - error: Any error encountered during the allocation process +func (s *DeviceAffinityStrategy) allocateCandidateDevices( + affinityGroupsMap map[machine.AffinityPriority][]affinityGroup, + candidateDevicesSet sets.String, + devicesToAllocate int, + allocatedDevices sets.String, +) (sets.String, error) { + // Early termination conditions + if len(allocatedDevices) == devicesToAllocate || len(candidateDevicesSet) == 0 { + return allocatedDevices, nil + } + + // Calculate remaining devices needed + remainingDevicesToAllocate := devicesToAllocate - len(allocatedDevices) + + // Fast path: If we need all remaining candidates, allocate them all + if remainingDevicesToAllocate >= len(candidateDevicesSet) { + allocatedDevices = allocatedDevices.Union(candidateDevicesSet) + return allocatedDevices, nil + } + + // Process affinity groups from highest to lowest priority + for priority := 0; priority < len(affinityGroupsMap); priority++ { + affinityPriority := machine.AffinityPriority(priority) + affinityGroups, exists := affinityGroupsMap[affinityPriority] + if !exists || len(affinityGroups) == 0 { + continue + } + + // Prepare group information for evaluation + groupInfos := s.prepareGroupInfos(affinityGroups, candidateDevicesSet, allocatedDevices) + if len(groupInfos) == 0 { + continue + } + + // Sort groups by allocation suitability + s.sortGroupsByPriority(groupInfos, remainingDevicesToAllocate) + + // Try to allocate from the best 
matching groups + if result, fullyAllocated := s.tryAllocateFromGroups( + groupInfos, remainingDevicesToAllocate, allocatedDevices, devicesToAllocate, + ); fullyAllocated { + return result, nil + } + + // For the lowest priority, use more flexible allocation strategies + if priority == len(affinityGroupsMap)-1 { + return s.handleLowestPriorityAllocation( + groupInfos, affinityGroupsMap, candidateDevicesSet, + devicesToAllocate, allocatedDevices, remainingDevicesToAllocate, + ) + } + } + + return allocatedDevices, nil +} + +// prepareGroupInfos processes affinity groups and extracts relevant allocation information. +// This helper method filters out groups with no candidate devices and calculates +// the intersection between group devices and available candidates. +func (s *DeviceAffinityStrategy) prepareGroupInfos( + affinityGroups []affinityGroup, + candidateDevicesSet sets.String, + allocatedDevices sets.String, +) []groupInfo { + groupInfos := make([]groupInfo, 0, len(affinityGroups)) + + for _, group := range affinityGroups { + // Find devices in this group that are also candidates + candidates := group.unallocatedDevices.Intersection(candidateDevicesSet) + if candidates.Len() == 0 { + continue // Skip groups with no matching candidates + } + + // Calculate unallocated and allocated device sets for this group + unallocated := group.unallocatedDevices.Difference(allocatedDevices) + if unallocated.Len() == 0 { + continue // Skip groups where all devices are already allocated + } + + allocated := group.unallocatedDevices.Intersection(allocatedDevices) + + groupInfos = append(groupInfos, groupInfo{ + group: group, + candidates: candidates, + allocated: allocated, + unallocated: unallocated, + }) + } + + return groupInfos +} + +// sortGroupsByPriority sorts affinity groups based on allocation suitability. +// The sorting criteria are: +// 1. Proximity to the exact number of devices needed (closer is better) +// 2. 
Total unallocated devices (smaller is better to minimize fragmentation) +// 3. Already allocated devices (larger is better to maintain consistency) +func (s *DeviceAffinityStrategy) sortGroupsByPriority( + groupInfos []groupInfo, + remainingDevicesToAllocate int, +) { + sort.Slice(groupInfos, func(i, j int) bool { + // Calculate absolute difference from needed devices + diffI := abs(groupInfos[i].candidates.Len() - remainingDevicesToAllocate) + diffJ := abs(groupInfos[j].candidates.Len() - remainingDevicesToAllocate) + + // Prefer groups closer to the exact number needed + if diffI != diffJ { + return diffI < diffJ + } + + // Prefer groups with fewer unallocated devices to reduce fragmentation + if groupInfos[i].unallocated.Len() != groupInfos[j].unallocated.Len() { + return groupInfos[i].unallocated.Len() < groupInfos[j].unallocated.Len() + } + + // Prefer groups with more already allocated devices for consistency + if groupInfos[i].allocated.Len() != groupInfos[j].allocated.Len() { + return groupInfos[i].allocated.Len() > groupInfos[j].allocated.Len() + } + + return groupInfos[i].group.id < groupInfos[j].group.id + }) +} + +// tryAllocateFromGroups attempts to allocate devices from the prioritized groups. +// It first tries to find an exact match, then falls back to partial allocations. 
+func (s *DeviceAffinityStrategy) tryAllocateFromGroups( + groupInfos []groupInfo, + remainingDevicesToAllocate int, + allocatedDevices sets.String, + devicesToAllocate int, +) (sets.String, bool) { + // Try to find groups that can exactly satisfy the remaining requirement + for _, group := range groupInfos { + // Check if this group can satisfy the exact remaining requirement and + // ensure affinity allocation if there are already allocated devices + if remainingDevicesToAllocate <= group.candidates.Len() && + !(allocatedDevices.Len() > 0 && group.allocated.Len() <= 0) { + + // Add all candidate devices from this group + for _, device := range group.candidates.List() { + allocatedDevices.Insert(device) + if len(allocatedDevices) == devicesToAllocate { + return allocatedDevices, true // Fully allocated + } + } + return allocatedDevices, true + } + } + + return allocatedDevices, false // Not fully allocated +} + +// handleLowestPriorityAllocation implements flexible allocation strategies for the lowest priority. +// This method is more permissive in its allocation strategy to ensure device requirements are met. 
func (s *DeviceAffinityStrategy) handleLowestPriorityAllocation(
	groupInfos []groupInfo,
	affinityGroupsMap map[machine.AffinityPriority][]affinityGroup,
	candidateDevicesSet sets.String,
	devicesToAllocate int,
	allocatedDevices sets.String,
	remainingDevicesToAllocate int,
) (sets.String, error) {
	// First try to allocate an entire group that fits within the remaining
	// requirement, and (when something is already allocated) only a group that
	// shares affinity with the already-allocated devices.
	for _, group := range groupInfos {
		if remainingDevicesToAllocate >= group.candidates.Len() &&
			!(allocatedDevices.Len() > 0 && group.allocated.Len() <= 0) {

			// Allocate all candidate devices from this group.
			allocatedDevices = allocatedDevices.Union(group.candidates)

			// Recursively allocate the remainder from the candidates left over
			// after removing this group's devices.
			return s.allocateCandidateDevices(
				affinityGroupsMap,
				candidateDevicesSet.Difference(group.candidates),
				devicesToAllocate,
				allocatedDevices,
			)
		}
	}

	// If no exact matches, try partial allocations from larger groups.
	// NOTE(review): both branches below return, so only the first (best-sorted)
	// group is ever examined by this loop — confirm that is intentional rather
	// than a missing fall-through to later groups.
	for _, group := range groupInfos {
		// Check if this group can contribute to the remaining requirement.
		if remainingDevicesToAllocate >= group.candidates.Len() {
			// Group fits entirely: take all of it and recurse for the rest.
			allocatedDevices = allocatedDevices.Union(group.candidates)

			return s.allocateCandidateDevices(
				affinityGroupsMap,
				candidateDevicesSet.Difference(group.candidates),
				devicesToAllocate,
				allocatedDevices,
			)
		} else {
			// Group is larger than needed: recursively pick a best-affinity
			// subset of this group's candidates, seeded with the devices of the
			// group that are already allocated.
			devices, err := s.allocateCandidateDevices(
				affinityGroupsMap,
				group.candidates,
				remainingDevicesToAllocate,
				group.allocated,
			)
			if err != nil {
				return nil, err
			}

			return allocatedDevices.Union(devices), nil
		}
	}

	return allocatedDevices, nil
}

// abs returns the absolute value of an integer.
func abs(x int) int {
	if x < 0 {
		return -x
	}
	return x
}

// groupInfo contains pre-calculated information about an affinity group
// to optimize the allocation process by avoiding repeated set calculations.
type groupInfo struct {
	group       affinityGroup // The affinity group this info was derived from
	candidates  sets.String   // Devices in this group that are also candidates
	allocated   sets.String   // Devices in this group that are already allocated
	unallocated sets.String   // Devices in this group that are not yet allocated
}
diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go
new file mode 100644
index 0000000000..2dcec69535
--- /dev/null
+++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go
@@ -0,0 +1,2179 @@
/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+*/ + +package deviceaffinity + +import ( + "reflect" + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + v1 "k8s.io/api/core/v1" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestBind_NumberOfDevicesAllocated(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedResult *allocate.AllocationResult + expectedErr bool + isRandom bool + expectedResultSize int + }{ + { + name: "able to allocate 1 device in affinity group of size 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 1, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + 
sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + isRandom: true, + expectedResultSize: 1, + }, + { + name: "able to allocate 2 devices in affinity group of size 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + isRandom: true, + expectedResultSize: 2, + }, + { + name: "able to allocate 3 devices in affinity size of group 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 3, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + isRandom: true, + expectedResultSize: 3, + }, + { + name: "able to allocate 4 devices in affinity size of group 2", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + Success: true, + }, + }, + { + name: "able to allocate 2 devices in affinity size of group 4", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + isRandom: true, + expectedResultSize: 2, + }, + { + name: "allocate all reusable devices first", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2"}, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2"}, + Success: true, + }, + }, + { + name: "allocate the reusable devices with the best affinity to each other first", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-4"}, + DeviceRequest: 2, // should allocate gpu-3 and 
gpu-4 + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3", "gpu-4"}, + Success: true, + }, + }, + { + name: "supports bin-packing of 1 allocated device", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-3"}, // gpu-4 is already allocated, so we should allocate gpu-3 to support bin-packing + DeviceRequest: 1, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3"}, + // gpu-4 is already allocated, so we allocate gpu-3 for bin-packing + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3"}, + Success: true, + }, + }, + { + name: "supports bin-packing of 1 request with only available devices", + ctx: 
&allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 1, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu1", "gpu2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3"}, + Success: true, + }, + }, + { + name: "supports of bin-packing of 2 allocated devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-5", "gpu-6"}, // gpu-5 and gou-6 should be allocated because gpu-7 is already allocated and this supports bin-packing + DeviceRequest: 2, + }, + // Level 0: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: 
map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "bin-packing of more allocated devices are preferred over less allocated devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7", "gpu-9", "gpu-10"}, + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + // [gpu-1, gpu-2, gpu-3, gpu-4] already has 2 allocated devices, [gpu-5, gpu-6, gpu-7, gpu-8] has 1 allocated device, [gpu-9, gpu-10, gpu-11, gpu-12] has no allocated devices + // To support bin-packing, we will allocate gpu-2 and gpu-3 first, then allocate gpu-6 and gpu-7 + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-2", "gpu-3", "gpu-6", "gpu-7"}, + Success: true, + }, + }, + { + name: "finds first level of device affinity, then finds second level of device affinity", + ctx: 
&allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7"}, // gpu-1 and gpu-2 have affinity in 1st level, gpu-5 and gpu-7 have affinity in 2nd level + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + // Allocate gpu-1, gpu-2 in first level, then allocate gpu-3 as it has affinity with gpu-1 and gpu-2 + // Then allocate gpu-5 as gpu-6 is already allocated + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7", "gpu-8"}, + expectedResult: 
&allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5"}, + Success: true, + }, + }, + { + name: "allocate reusable devices first, then allocate available devices with affinity to the allocated reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5"}, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + "gpu": { + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + "gpu-5": {}, + "gpu-6": {}, + "gpu-7": {}, + "gpu-8": {}, + }, + }, + }, + // gpu-1 and gpu-5 are already allocated as they are reusable + // gpu-2 and gpu-6 should be allocated as they have affinity to the already allocated gpu-1 and gpu-5 + sortedDevices: []string{"gpu-1", "gpu-5", "gpu-2", "gpu-6", "gpu-7", "gpu-8", "gpu-3", "gpu-4"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: 
[]string{"gpu-1", "gpu-2", "gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "after allocating available devices with affinity to reusable devices, still not enough devices, allocate more available devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5"}, + DeviceRequest: 6, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + // gpu-1 and gpu-5 are allocated because they are reusable devices + // gpu-2 and gpu-6 should be allocated as they have affinity to the already allocated gpu-1 and gpu-5 + // gpu-3 and gpu-4 should be allocated as they have affinity to each other + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "allocation of reusable devices in descending order of intersection 
size", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-9"}, + DeviceRequest: 6, + }, + // 1 level: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + 
}, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-9"}, + // Should allocate gpu-1, gpu-2, gpu-3 and gpu-4 first because they have the most intersection size with an affinity group + // Followed by gpu-5 and gpu-6 as they have the second most intersection size with an affinity group + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6"}, + Success: true, + }, + }, + { + name: "allocation of available devices in descending order of intersection size after allocating all reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5", "gpu-9", "gpu-10"}, + DeviceRequest: 8, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11"}, + // gpu-6 and gpu-12 is allocated + // Allocate gpu-1, gpu-5, gpu-9 and gpu-10 first because they are reusable devices + // Allocate gpu-2, gpu-3 and gpu-4 next because they have the most intersection with an affinity group + // Between (gpu-7, gpu-8) and (gpu-10, gpu-11), allocate gpu-10 and gpu-11 because the affinity group has fewer unallocated devices (bin-packing). + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-9", "gpu-10", "gpu-11"}, + Success: true, + }, + }, + { + name: "allocate available devices that have affinity with reusable devices from highest to lowest priority", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-5"}, + DeviceRequest: 6, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8], [gpu-9, gpu-10], [gpu-11, gpu-12] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8], [gpu-9, gpu-10, gpu-11, gpu-12] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + 
DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10"}, + 1: {"gpu-10", "gpu-11", "gpu-12"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9"}, + 1: {"gpu-9", "gpu-11", "gpu-12"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-12"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-11"}, + 1: {"gpu-9", "gpu-10", "gpu-11"}, + }, + }, + }, + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + "gpu": { + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + "gpu-5": {}, + "gpu-6": {}, + "gpu-7": {}, + "gpu-8": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + // Allocate gpu-1 and gpu-5 first because they are reusable devices + // Allocate gpu-2 next because they have affinity with 
gpu-1 at the highest affinity priority (level 0) + // Allocate gpu-6 and gpu-8 next because they have affinity with gpu-5 at the next highest affinity priority (level 1) + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + Success: true, + }, + }, + { + name: "allocation of odd number of devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-3", "gpu-5"}, + DeviceRequest: 5, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + 
}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + // Allocate gpu-1, gpu-3 and gpu-5 first because they are reusable devices + // Allocate gpu-2 and gpu-4 next because they have affinity with gpu-1 and gpu-3 at the highest affinity priority (level 0) + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"}, + Success: true, + }, + }, + { + name: "allocation of available devices for 1st level of affinity priority if there are no reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{}, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-3", "gpu-4"}, + // No reusable devices to allocate. + // Allocate gpu-3 and gpu-4 because they have the best level-0 affinity to each other (gpu-2 is already allocated, so gpu-1 has no available level-0 partner).
+ expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-3", "gpu-4"}, + Success: true, + }, + }, + { + name: "when first priority level is not able to determine an allocation, go to the next priority level to allocate", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + // All 
of the above devices are reusable devices + // At first priority level, they all give an intersection size of 2, but there are 6 of them, and we only need 4 of them + // Since there is another priority level, we go to that priority level to get the best device affinity + // (gpu-1 and gpu-2), (gpu-5 and gpu-6), (gpu-7, gpu-8) are affinity groups at priority level 0 + // But (gpu-5, gpu-6, gpu-7, gpu-8) are affinity groups at priority level 1, so we allocate gpu-5, gpu-6, gpu-7, gpu-8 + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + Success: true, + }, + }, + { + name: "reusable devices are bin-packed, so we allocate the remaining available devices without considering affinity with the reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-4"}, + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: 
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + // gpu-1 and gpu-4 are allocated because they are reusable devices. + // gpu-2 and gpu-3 are already allocated so we cannot find any affinity to the reusable devices. + // allocate gpu-7 and gpu-8 because they have affinity to one another. + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-4", "gpu-7", "gpu-8"}, + Success: true, + }, + }, + { + name: "insufficient devices to allocate, causing an error", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2"}, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2"}, + // gpu-3 and gpu-4 are already allocated + // gpu-1 and gpu-2 are not enough to satisfy the request + expectedResult: &allocate.AllocationResult{ + Success: false, + }, + expectedErr: true, + }, + { + name: "if there is another priority level, allocate only the max intersection and then go to the next priority level and allocate", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: 
[]string{"gpu-1", "gpu-5", "gpu-6", "gpu-7"}, + DeviceRequest: 3, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 1: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + "gpu": { + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + "gpu-4": {}, + "gpu-5": {}, + "gpu-6": {}, + "gpu-7": {}, + "gpu-8": {}, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-5", "gpu-6", "gpu-7"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-5", "gpu-6", "gpu-7"}, + Success: true, + }, + }, + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 8 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: 
&pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-9": { + DeviceAffinity:
map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + isRandom: true, + expectedResultSize: 2, + }, + { + name: "allocate first level of affinity priority devices first, then second 
level of affinity priority, both have affinity to reusable devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-0"}, + DeviceRequest: 3, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-4", "gpu-5"}, + }, + }, + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-0"}, + 1: {"gpu-0", "gpu-4", "gpu-5"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-3", "gpu-6", "gpu-7"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-6", "gpu-7"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-0", "gpu-1", "gpu-5"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-0", "gpu-1", "gpu-4"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-2", "gpu-3", "gpu-7"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-2", "gpu-3", "gpu-6"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-4", "gpu-6", "gpu-7"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-0", "gpu-1", "gpu-4"}, + Success: true, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + deviceBindingStrategy := NewDeviceAffinityStrategy() + result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) + if (err != nil) != 
tt.expectedErr { + t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) + } + verifyAllocationResult(t, result, tt.expectedResult, tt.isRandom, tt.expectedResultSize) + }) + } +} + +func TestBind_DeviceAffinity(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + sortedDevices []string + expectedErr bool + expectedAffinityPriorityLevel int + }{ + { + name: "1 level of device affinity, 2 devices in a group", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 2, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedAffinityPriorityLevel: 0, + }, + { + name: "2 devices in affinity priority 0, 4 devices in affinity priority 1", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: 
"container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2"}, + 1: {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1"}, + 1: {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6"}, + 1: {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5"}, + 1: {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-8"}, + 1: {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-7"}, + 1: {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedAffinityPriorityLevel: 1, + }, + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 4 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + DeviceRequest: 4, + }, + 
DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-11", "gpu-12", "gpu-13", 
"gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + expectedAffinityPriorityLevel: 0, + }, + { + name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 8 devices", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + DeviceRequest: 8, + }, + 
DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-2", "gpu-3", "gpu-4"}, + 1: {"gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-3", "gpu-4"}, + 1: {"gpu-1", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-4"}, + 1: {"gpu-1", "gpu-2", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-1", "gpu-2", "gpu-3"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-6", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-7", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-8"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-5", "gpu-6", "gpu-7"}, + 1: {"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"}, + }, + }, + "gpu-9": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-10", "gpu-11", "gpu-12"}, + 1: {"gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-10": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-11", "gpu-12"}, + 1: {"gpu-9", "gpu-11", "gpu-12", "gpu-13", 
"gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-11": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-12"}, + 1: {"gpu-9", "gpu-10", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-12": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-9", "gpu-10", "gpu-11"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-13": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-14", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-14", "gpu-15", "gpu-16"}, + }, + }, + "gpu-14": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-15", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-15", "gpu-16"}, + }, + }, + "gpu-15": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-16"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-16"}, + }, + }, + "gpu-16": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + 0: {"gpu-13", "gpu-14", "gpu-15"}, + 1: {"gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8", "gpu-9", "gpu-10", "gpu-11", "gpu-12", "gpu-13", "gpu-14", "gpu-15", "gpu-16"}, + expectedAffinityPriorityLevel: 1, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + deviceBindingStrategy := NewDeviceAffinityStrategy() + result, err := deviceBindingStrategy.Bind(tt.ctx, tt.sortedDevices) + if (err != nil) != tt.expectedErr { + t.Errorf("Bind() error = %v, expectedErr %v", err, tt.expectedErr) + } + + verifyResultIsAffinity(t, result, tt.ctx.DeviceTopology, tt.expectedAffinityPriorityLevel) + }) + } +} + +func verifyAllocationResult( + t *testing.T, result 
*allocate.AllocationResult, expectedResult *allocate.AllocationResult, isRandom bool, + expectedResultSize int, +) { + if isRandom { + if len(result.AllocatedDevices) != expectedResultSize { + t.Errorf("result.AllocatedDevices = %v, expectedResultSize = %v", result.AllocatedDevices, expectedResultSize) + } + return + } + if (result == nil) != (expectedResult == nil) { + t.Errorf("result = %v, expectedResult = %v", result, expectedResult) + return + } + if result.Success != expectedResult.Success { + t.Errorf("result.Success = %v, expectedResult.Success = %v", result.Success, expectedResult.Success) + return + } + if len(result.AllocatedDevices) != len(expectedResult.AllocatedDevices) { + t.Errorf("result.AllocatedDevices = %v, expectedResult.AllocatedDevices = %v", result.AllocatedDevices, expectedResult.AllocatedDevices) + return + } + if diff := cmp.Diff(result.AllocatedDevices, expectedResult.AllocatedDevices, + cmpopts.SortSlices(func(a, b string) bool { return a < b }), + ); diff != "" { + t.Errorf("Bind() mismatch (-got +want):\n%s", diff) + } +} + +func verifyResultIsAffinity( + t *testing.T, result *allocate.AllocationResult, topology *machine.DeviceTopology, + expectedAffinityPriorityLevel int, +) { + affinityMap := topology.GroupDeviceAffinity() + priorityLevelDevices := affinityMap[machine.AffinityPriority(expectedAffinityPriorityLevel)] + + sort.Slice(result.AllocatedDevices, func(i, j int) bool { + return result.AllocatedDevices[i] < result.AllocatedDevices[j] + }) + + for _, deviceIDs := range priorityLevelDevices { + sort.Slice(deviceIDs, func(i, j int) bool { return deviceIDs[i] < deviceIDs[j] }) + if reflect.DeepEqual(deviceIDs, machine.DeviceIDs(result.AllocatedDevices)) { + return + } + } + + t.Errorf("result = %v, did not find it within an affinity group", result.AllocatedDevices) +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/device_affinity.go 
package deviceaffinity

import (
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"
)

const (
	// StrategyNameDeviceAffinity is the registered name of this binding strategy.
	StrategyNameDeviceAffinity = "deviceAffinity"
)

// DeviceAffinityStrategy knows how to bind devices that have affinity to each other.
type DeviceAffinityStrategy struct{}

// NewDeviceAffinityStrategy creates a new device affinity binding strategy.
func NewDeviceAffinityStrategy() *DeviceAffinityStrategy {
	return &DeviceAffinityStrategy{}
}

// Compile-time check that DeviceAffinityStrategy implements allocate.BindingStrategy.
var _ allocate.BindingStrategy = &DeviceAffinityStrategy{}

// Name returns the name of the binding strategy.
func (s *DeviceAffinityStrategy) Name() string {
	return StrategyNameDeviceAffinity
}
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpu_memory + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// Filter filters the available GPU devices based on available GPU memory +// It returns devices that have enough available memory for the request +func (s *GPUMemoryStrategy) Filter(ctx *allocate.AllocationContext, allAvailableDevices []string) ([]string, error) { + if ctx.DeviceTopology == nil { + return nil, fmt.Errorf("GPU topology is nil") + } + + _, gpuMemory, err := util.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) + if err != nil { + general.Warningf("getReqQuantityFromResourceReq failed with error: %v, use default available devices", err) + return allAvailableDevices, nil + } + + if gpuMemory == 0 { + general.Infof("GPU Memory is 0, use default available devices") + return allAvailableDevices, nil + } + + filteredDevices, err := s.filterGPUDevices(ctx, gpuMemory, allAvailableDevices) + if err != nil { + return nil, err + } + + return filteredDevices, nil +} + +func (s *GPUMemoryStrategy) filterGPUDevices( + ctx *allocate.AllocationContext, + gpuMemoryRequest float64, + allAvailableDevices []string, +) ([]string, error) { + 
gpuRequest := ctx.DeviceReq.GetDeviceRequest() + gpuMemoryPerGPU := gpuMemoryRequest / float64(gpuRequest) + gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()) + + machineState := ctx.MachineState[consts.ResourceGPUMemory] + filteredDevices := sets.NewString() + for _, device := range allAvailableDevices { + if !machineState.IsRequestSatisfied(device, gpuMemoryPerGPU, gpuMemoryAllocatablePerGPU) { + general.Warningf("must include gpu %s has enough memory to allocate, gpuMemoryAllocatable: %f, gpuMemoryAllocated: %f, gpuMemoryPerGPU: %f", + device, gpuMemoryAllocatablePerGPU, machineState.GetQuantityAllocated(device), gpuMemoryPerGPU) + continue + } + + filteredDevices.Insert(device) + } + + return filteredDevices.UnsortedList(), nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go new file mode 100644 index 0000000000..4b206afc7d --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/filter_test.go @@ -0,0 +1,220 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpu_memory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestGPUMemoryStrategy_Filter(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + availableDevices []string + expectedFilteredDevices []string + expectedErr bool + }{ + { + name: "gpu topology is nil", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + expectedErr: true, + }, + { + name: "gpu memory does not exist, just allocate every device", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceMemoryBandwidth): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "gpu memory is 0, so we use all the available devices", + ctx: &allocate.AllocationContext{ + 
DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 0, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "allocate available devices with available gpu memory", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + }, + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + expectedFilteredDevices: []string{"gpu-0", "gpu-1", "gpu-2"}, + }, + { + name: "exclude devices with not enough gpu memory", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-0": {}, + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 4, + }, + }, + DeviceReq: &v1alpha1.DeviceRequest{ + DeviceRequest: 2, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + 
}, + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + // 2 GB allocated + "gpu-0": { + PodEntries: map[string]state.ContainerEntries{ + "pod-0": { + "container-0": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 2, + }, + }, + }, + }, + }, + "gpu-1": {}, + "gpu-2": {}, + "gpu-3": { + // 1 GB allocated + PodEntries: map[string]state.ContainerEntries{ + "pod-1": { + "container-0": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 0.5, + }, + }, + "container-1": &state.AllocationInfo{ + AllocatedAllocation: state.Allocation{ + Quantity: 0.5, + }, + }, + }, + }, + }, + }, + }, + }, + availableDevices: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}, + expectedFilteredDevices: []string{"gpu-1", "gpu-2"}, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + strategy := NewGPUMemoryStrategy() + filteredDevices, err := strategy.Filter(tt.ctx, tt.availableDevices) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.ElementsMatch(t, tt.expectedFilteredDevices, filteredDevices) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go new file mode 100644 index 0000000000..06d0881eab --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/gpu_memory.go @@ -0,0 +1,41 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package gpu_memory

import "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate"

const (
	// StrategyNameGPUMemory is the registered name of this strategy.
	StrategyNameGPUMemory = "gpu-memory"
)

// GPUMemoryStrategy filters and sorts GPU devices based on available GPU memory.
type GPUMemoryStrategy struct{}

// Compile-time checks that GPUMemoryStrategy implements both the filtering and
// the sorting strategy interfaces.
var (
	_ allocate.FilteringStrategy = &GPUMemoryStrategy{}
	_ allocate.SortingStrategy   = &GPUMemoryStrategy{}
)

// NewGPUMemoryStrategy creates a new GPU memory filtering/sorting strategy.
func NewGPUMemoryStrategy() *GPUMemoryStrategy {
	return &GPUMemoryStrategy{}
}

// Name returns the name of the strategy.
func (s *GPUMemoryStrategy) Name() string {
	return StrategyNameGPUMemory
}
+*/ + +package gpu_memory + +import ( + "fmt" + "sort" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" + qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" +) + +// Sort sorts the filtered GPU devices based on available GPU memory +// It prioritizes devices with less available memory and considers NUMA affinity +func (s *GPUMemoryStrategy) Sort(ctx *allocate.AllocationContext, filteredDevices []string) ([]string, error) { + if ctx.DeviceTopology == nil { + return nil, fmt.Errorf("GPU topology is nil") + } + + _, gpuMemory, err := qrmutil.GetQuantityFromResourceRequests(ctx.ResourceReq.ResourceRequests, string(consts.ResourceGPUMemory), false) + if err != nil { + general.Warningf("getReqQuantityFromResourceReq failed with error: %v, use default filtered devices", err) + return filteredDevices, nil + } + + if gpuMemory == 0 { + general.Infof("GPU Memory is 0, use default available devices") + return filteredDevices, nil + } + + gpuMemoryAllocatablePerGPU := float64(ctx.GPUQRMPluginConfig.GPUMemoryAllocatablePerGPU.Value()) + machineState := ctx.MachineState[consts.ResourceGPUMemory] + + // Create a slice of device info with available memory + type deviceInfo struct { + ID string + AvailableMemory float64 + NUMAAffinity bool + } + + devices := make([]deviceInfo, 0, len(filteredDevices)) + + for _, deviceID := range filteredDevices { + availableMemory := gpuMemoryAllocatablePerGPU - machineState.GetQuantityAllocated(deviceID) + devices = append(devices, deviceInfo{ + ID: deviceID, + AvailableMemory: availableMemory, + NUMAAffinity: util.IsNUMAAffinityDevice(deviceID, ctx.DeviceTopology, ctx.HintNodes), + }) + } + + // Sort devices: first by NUMA affinity (preferred), then by available memory (ascending) + sort.Slice(devices, func(i, j int) 
bool { + // If both devices have NUMA affinity or both don't, sort by available memory + if devices[i].NUMAAffinity == devices[j].NUMAAffinity { + return devices[i].AvailableMemory < devices[j].AvailableMemory + } + + // Prefer devices with NUMA affinity + return devices[i].NUMAAffinity && !devices[j].NUMAAffinity + }) + + // Extract sorted device IDs + sortedDevices := make([]string, len(devices)) + for i, device := range devices { + sortedDevices[i] = device.ID + } + + general.InfoS("Sorted devices", "count", len(sortedDevices), "devices", sortedDevices) + return sortedDevices, nil +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go new file mode 100644 index 0000000000..3511861ba8 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/gpu_memory/sort_test.go @@ -0,0 +1,178 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpu_memory + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + "github.com/kubewharf/katalyst-api/pkg/consts" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +func TestGPUMemoryStrategy_Sort(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ctx *allocate.AllocationContext + filteredDevices []string + expectedSortedDevices []string + expectedErr bool + }{ + { + name: "nil gpu topology", + ctx: &allocate.AllocationContext{ + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedErr: true, + }, + { + name: "gpu memory is 0 returns all available devices without sorting", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": {}, + "gpu-2": {}, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 0, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedSortedDevices: []string{"gpu-1", "gpu-2"}, + }, + { + name: "devices are sorted by NUMA affinity first", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + // gpu-1 has NUMA affinity but gpu-2 does not + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{1}, + }, + "gpu-2": { + NumaNodes: []int{0}, + }, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + GPUQRMPluginConfig: 
&qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + HintNodes: machine.NewCPUSet(0), + }, + filteredDevices: []string{"gpu-1", "gpu-2"}, + expectedSortedDevices: []string{"gpu-2", "gpu-1"}, + }, + { + name: "for devices with NUMA affinity, they are sorted by available memory in ascending order", + ctx: &allocate.AllocationContext{ + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + NumaNodes: []int{0}, + }, + "gpu-2": { + NumaNodes: []int{1}, + }, + "gpu-3": { + NumaNodes: []int{2}, + }, + }, + }, + ResourceReq: &v1alpha1.ResourceRequest{ + ResourceRequests: map[string]float64{ + string(consts.ResourceGPUMemory): 1, + }, + }, + GPUQRMPluginConfig: &qrm.GPUQRMPluginConfig{ + GPUMemoryAllocatablePerGPU: *resource.NewQuantity(4, resource.DecimalSI), + }, + HintNodes: machine.NewCPUSet(0, 1), + MachineState: map[v1.ResourceName]state.AllocationMap{ + consts.ResourceGPUMemory: { + "gpu-1": { + PodEntries: map[string]state.ContainerEntries{ + "pod-0": { + "container-0": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + "gpu-2": { + PodEntries: map[string]state.ContainerEntries{ + "pod-1": { + "container-1": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + "container-2": { + AllocatedAllocation: state.Allocation{ + Quantity: 1, + }, + }, + }, + }, + }, + }, + }, + }, + filteredDevices: []string{"gpu-1", "gpu-2", "gpu-3"}, + expectedSortedDevices: []string{"gpu-2", "gpu-1", "gpu-3"}, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + strategy := NewGPUMemoryStrategy() + sortedDevices, err := strategy.Sort(tt.ctx, tt.filteredDevices) + + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedSortedDevices, sortedDevices) + } + }) + } +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go 
b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go new file mode 100644 index 0000000000..fb5eeaa5e1 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/util.go @@ -0,0 +1,38 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package strategies + +import ( + "fmt" + + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate" +) + +// IsBindingContextValid checks if the context given for binding is valid +func IsBindingContextValid(ctx *allocate.AllocationContext, sortedDevices []string) (bool, string) { + if ctx.DeviceTopology == nil { + return false, "GPU topology is nil" + } + + // Determine how many devices to allocate + devicesToAllocate := int(ctx.DeviceReq.DeviceRequest) + if devicesToAllocate > len(sortedDevices) { + return false, fmt.Sprintf("not enough devices: need %d, have %d", devicesToAllocate, len(sortedDevices)) + } + + return true, "" +} diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go new file mode 100644 index 0000000000..09d5c9b574 --- /dev/null +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/types.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
package allocate

import (
	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"

	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
	"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
	"github.com/kubewharf/katalyst-core/pkg/metaserver"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
	"github.com/kubewharf/katalyst-core/pkg/util/machine"
)

// AllocationContext contains all the information needed for GPU allocation.
type AllocationContext struct {
	ResourceReq        *pluginapi.ResourceRequest // the resource request being allocated for
	DeviceReq          *pluginapi.DeviceRequest   // how many devices are requested
	DeviceTopology     *machine.DeviceTopology    // device topology of the node
	GPUQRMPluginConfig *qrm.GPUQRMPluginConfig    // GPU plugin configuration
	Emitter            metrics.MetricEmitter      // metrics emitter
	MetaServer         *metaserver.MetaServer     // node meta server
	MachineState       state.AllocationResourcesMap
	QoSLevel           string
	HintNodes          machine.CPUSet // NUMA hint nodes for the request
}

// AllocationResult contains the result of GPU allocation.
type AllocationResult struct {
	AllocatedDevices []string // IDs of the devices that were allocated
	Success          bool     // whether the allocation succeeded
	ErrorMessage     string   // failure reason when Success is false
}

// FilteringStrategy defines the interface for filtering GPU devices.
type FilteringStrategy interface {
	// Name returns the name of the filtering strategy.
	Name() string

	// Filter filters the available GPU devices based on the allocation context.
	// Returns a list of filtered device IDs.
	Filter(ctx *AllocationContext, allAvailableDevices []string) ([]string, error)
}

// SortingStrategy defines the interface for sorting GPU devices.
type SortingStrategy interface {
	// Name returns the name of the sorting strategy.
	Name() string

	// Sort sorts the filtered GPU devices based on the allocation context.
	// Returns a prioritized list of device IDs.
	Sort(ctx *AllocationContext, filteredDevices []string) ([]string, error)
}

// BindingStrategy defines the interface for binding GPU devices.
type BindingStrategy interface {
	// Name returns the name of the binding strategy.
	Name() string

	// Bind binds the sorted GPU devices to the allocation context.
	// Returns the final allocation result.
	Bind(ctx *AllocationContext, sortedDevices []string) (*AllocationResult, error)
}

// AllocationStrategy defines the interface for performing a complete GPU
// allocation from an allocation context.
type AllocationStrategy interface {
	// Name returns the name of the allocation strategy.
	Name() string

	// Allocate performs the allocation using the combined strategies.
	Allocate(ctx *AllocationContext) (*AllocationResult, error)
}
+*/ + +package util + +import ( + "fmt" + "math" + + pkgerrors "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" + + qrmutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" +) + +var ErrNoAvailableGPUMemoryHints = pkgerrors.New("no available gpu memory hints") + +func GetNUMANodesCountToFitGPUReq( + gpuReq float64, cpuTopology *machine.CPUTopology, gpuTopology *machine.DeviceTopology, +) (int, int, error) { + if gpuTopology == nil { + return 0, 0, fmt.Errorf("GetNUMANodesCountToFitGPUReq got nil gpuTopology") + } + + numaCount := cpuTopology.NumNUMANodes + if numaCount == 0 { + return 0, 0, fmt.Errorf("there is no NUMA in cpuTopology") + } + + if len(gpuTopology.Devices)%numaCount != 0 { + general.Warningf("GPUs count %d cannot be evenly divisible by NUMA count %d", len(gpuTopology.Devices), numaCount) + } + + gpusPerNUMA := (len(gpuTopology.Devices) + numaCount - 1) / numaCount + numaCountNeeded := int(math.Ceil(gpuReq / float64(gpusPerNUMA))) + if numaCountNeeded == 0 { + numaCountNeeded = 1 + } + if numaCountNeeded > numaCount { + return 0, 0, fmt.Errorf("invalid gpu req: %.3f in topology with NUMAs count: %d and GPUs count: %d", gpuReq, numaCount, len(gpuTopology.Devices)) + } + + gpusCountNeededPerNUMA := int(math.Ceil(gpuReq / float64(numaCountNeeded))) + return numaCountNeeded, gpusCountNeededPerNUMA, nil +} + +func IsNUMAAffinityDevice( + device string, deviceTopology *machine.DeviceTopology, hintNodes machine.CPUSet, +) bool { + info, ok := deviceTopology.Devices[device] + if !ok { + general.Errorf("failed to find device info for device %s", device) + return false + } + + return machine.NewCPUSet(info.GetNUMANodes()...).IsSubsetOf(hintNodes) +} + +// GetGPUCount extracts GPU count from resource request +func 
GetGPUCount(req *pluginapi.ResourceRequest, deviceNames []string) (float64, sets.String, error) { + gpuCount := float64(0) + gpuNames := sets.NewString() + + for _, resourceName := range deviceNames { + _, request, err := qrmutil.GetQuantityFromResourceRequests(req.ResourceRequests, resourceName, false) + if err != nil && !errors.IsNotFound(err) { + return 0, nil, err + } + + if request == 0 { + continue + } + + gpuCount += request + gpuNames.Insert(resourceName) + } + + if gpuCount == 0 { + return 0, gpuNames, fmt.Errorf("no available GPU count") + } + + return gpuCount, gpuNames, nil +} diff --git a/pkg/agent/qrm-plugins/io/staticpolicy/policy.go b/pkg/agent/qrm-plugins/io/staticpolicy/policy.go index f0e11f9d91..39c943b217 100644 --- a/pkg/agent/qrm-plugins/io/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/io/staticpolicy/policy.go @@ -48,6 +48,7 @@ const ( // StaticPolicy is the static io policy type StaticPolicy struct { sync.Mutex + pluginapi.UnimplementedResourcePluginServer name string stopCh chan struct{} diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go index 6db20be45e..9b5a83f67f 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go @@ -96,6 +96,7 @@ const ( type DynamicPolicy struct { sync.RWMutex + pluginapi.UnimplementedResourcePluginServer stopCh chan struct{} started bool diff --git a/pkg/agent/qrm-plugins/network/state/state_net.go b/pkg/agent/qrm-plugins/network/state/state_mem.go similarity index 100% rename from pkg/agent/qrm-plugins/network/state/state_net.go rename to pkg/agent/qrm-plugins/network/state/state_mem.go diff --git a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go index 328984c592..ece3b452a9 100644 --- a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go @@ -78,6 
+78,7 @@ const ( // StaticPolicy is the static network policy type StaticPolicy struct { sync.Mutex + pluginapi.UnimplementedResourcePluginServer name string stopCh chan struct{} diff --git a/pkg/agent/qrm-plugins/util/util.go b/pkg/agent/qrm-plugins/util/util.go index 5eaca3dfd7..99158b1f16 100644 --- a/pkg/agent/qrm-plugins/util/util.go +++ b/pkg/agent/qrm-plugins/util/util.go @@ -25,6 +25,8 @@ import ( "strings" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" @@ -43,27 +45,32 @@ func GetQuantityFromResourceReq(req *pluginapi.ResourceRequest) (int, float64, e return 0, 0, fmt.Errorf("invalid req.ResourceRequests length: %d", len(req.ResourceRequests)) } - for key := range req.ResourceRequests { - switch key { - case string(v1.ResourceCPU): - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - case string(apiconsts.ReclaimedResourceMilliCPU): - return general.Max(int(math.Ceil(req.ResourceRequests[key]/1000.0)), 0), req.ResourceRequests[key] / 1000.0, nil - case string(v1.ResourceMemory), string(apiconsts.ReclaimedResourceMemory): - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - case string(apiconsts.ResourceNetBandwidth): - if req.Annotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue { - general.Infof("detect %s: %s, return %s: 0 instead of %s: %.2f", - PodAnnotationQuantityFromQRMDeclarationKey, PodAnnotationQuantityFromQRMDeclarationTrue, key, key, req.ResourceRequests[key]) - return 0, 0, nil - } - return general.Max(int(math.Ceil(req.ResourceRequests[key])), 0), req.ResourceRequests[key], nil - default: - return 0, 0, fmt.Errorf("invalid request resource name: %s", key) + return GetQuantityFromResourceRequests(req.ResourceRequests, req.ResourceName, 
IsQuantityFromQRMDeclaration(req.Annotations)) +} + +func GetQuantityFromResourceRequests(resourceRequests map[string]float64, resourceName string, isQuantityFromQRMDeclaration bool) (int, float64, error) { + quantity, ok := resourceRequests[resourceName] + if !ok { + return 0, 0, errors.NewNotFound(schema.GroupResource{}, resourceName) + } + + switch resourceName { + case string(apiconsts.ReclaimedResourceMilliCPU): + return general.Max(int(math.Ceil(quantity/1000.0)), 0), quantity / 1000.0, nil + case string(apiconsts.ResourceNetBandwidth): + if isQuantityFromQRMDeclaration { + general.Infof("detect %s: %s, return %s: 0 instead of %s: %.2f", + PodAnnotationQuantityFromQRMDeclarationKey, PodAnnotationQuantityFromQRMDeclarationTrue, resourceName, resourceName, quantity) + return 0, 0, nil } + return general.Max(int(math.Ceil(quantity)), 0), quantity, nil + default: + return general.Max(int(math.Ceil(quantity)), 0), quantity, nil } +} - return 0, 0, fmt.Errorf("unexpected end") +func IsQuantityFromQRMDeclaration(podAnnotations map[string]string) bool { + return podAnnotations[PodAnnotationQuantityFromQRMDeclarationKey] == PodAnnotationQuantityFromQRMDeclarationTrue } // IsDebugPod returns true if the pod annotations show up any configurable debug key @@ -371,3 +378,20 @@ func GetPodAggregatedRequestResource(req *pluginapi.ResourceRequest) (int, float return 0, 0, fmt.Errorf("not support resource name: %s", req.ResourceName) } } + +// CreateEmptyAllocationResponse creates an empty allocation response +func CreateEmptyAllocationResponse(resourceReq *pluginapi.ResourceRequest, resourceName string) *pluginapi.ResourceAllocationResponse { + return &pluginapi.ResourceAllocationResponse{ + PodUid: resourceReq.PodUid, + PodNamespace: resourceReq.PodNamespace, + PodName: resourceReq.PodName, + ContainerName: resourceReq.ContainerName, + ContainerType: resourceReq.ContainerType, + ContainerIndex: resourceReq.ContainerIndex, + PodRole: resourceReq.PodRole, + PodType: 
resourceReq.PodType, + ResourceName: resourceName, + Labels: general.DeepCopyMap(resourceReq.Labels), + Annotations: general.DeepCopyMap(resourceReq.Annotations), + } +} diff --git a/pkg/agent/qrm-plugins/util/util_test.go b/pkg/agent/qrm-plugins/util/util_test.go index ddcd12377c..fe8f85a0f5 100644 --- a/pkg/agent/qrm-plugins/util/util_test.go +++ b/pkg/agent/qrm-plugins/util/util_test.go @@ -17,12 +17,13 @@ limitations under the License. package util import ( - "fmt" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" "github.com/kubewharf/katalyst-api/pkg/consts" @@ -41,6 +42,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }{ { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceCPU), ResourceRequests: map[string]float64{ string(v1.ResourceCPU): 123, }, @@ -49,6 +51,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ReclaimedResourceMilliCPU), ResourceRequests: map[string]float64{ string(consts.ReclaimedResourceMilliCPU): 234001, }, @@ -57,6 +60,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceMemory), ResourceRequests: map[string]float64{ string(v1.ResourceMemory): 256, }, @@ -65,6 +69,7 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(consts.ReclaimedResourceMemory), ResourceRequests: map[string]float64{ string(consts.ReclaimedResourceMemory): 1345, }, @@ -73,18 +78,19 @@ func TestGetQuantityFromResourceReq(t *testing.T) { }, { req: &pluginapi.ResourceRequest{ + ResourceName: string(v1.ResourceCPU), ResourceRequests: map[string]float64{ "test": 1345, }, }, - err: fmt.Errorf("invalid request resource name: %s", "test"), + 
err: errors.NewNotFound(schema.GroupResource{}, string(v1.ResourceCPU)), }, } for _, tc := range testCases { res, _, err := GetQuantityFromResourceReq(tc.req) if tc.err != nil { - as.NotNil(err) + as.Equal(tc.err, err) } else { as.EqualValues(tc.result, res) } diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go index 48b47588f8..15d3b4d870 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go @@ -349,8 +349,8 @@ func (p *topologyAdapterImpl) Run(ctx context.Context, handler func()) error { // validatePodResourcesServerResponse validate pod resources server response, if the resource is empty, // maybe the kubelet or qrm plugin is restarting -func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1. - AllocatableResourcesResponse, listPodResourcesResponse *podresv1.ListPodResourcesResponse, +func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.AllocatableResourcesResponse, + listPodResourcesResponse *podresv1.ListPodResourcesResponse, ) error { if len(p.needValidationResources) > 0 { if allocatableResourcesResponse == nil { @@ -396,6 +396,11 @@ func (p *topologyAdapterImpl) addNumaSocketChildrenZoneNodes(generator *util.Top continue } + if parentZoneNode == nil { + // skip the resource which doesn't have parent zone node + continue + } + err = generator.AddNode(parentZoneNode, zoneNode) if err != nil { errList = append(errList, err) @@ -1133,6 +1138,12 @@ func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQu }, } + if identifier, ok := quantity.Annotations[apiconsts.ResourceAnnotationKeyResourceIdentifier]; ok && len(identifier) == 0 { + // if quantity has resource identifier annotation, but it is empty, it means it is unique and the 
parent zone node + // already exists, we can just return the zone node and nil parent zone node + return zoneNode, nil, nil + } + switch quantity.TopologyLevel { case podresv1.TopologyLevel_NUMA: parentZoneNode := util.GenerateNumaZoneNode(nodeID) @@ -1141,7 +1152,9 @@ func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQu parentZoneNode := util.GenerateSocketZoneNode(nodeID) return zoneNode, &parentZoneNode, nil default: - return zoneNode, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel) + // if quantity topology level is not numa or socket, it means that the zone is a child of socket or numa, + // and the zone node is determined by the quantity name or its resource identifier if existed. + return zoneNode, nil, nil } } } diff --git a/pkg/config/agent/qrm/gpu_plugin.go b/pkg/config/agent/qrm/gpu_plugin.go new file mode 100644 index 0000000000..f610910083 --- /dev/null +++ b/pkg/config/agent/qrm/gpu_plugin.go @@ -0,0 +1,44 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package qrm + +import ( + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/gpustrategy" +) + +type GPUQRMPluginConfig struct { + // PolicyName is used to switch between several strategies + PolicyName string + // GPUDeviceNames is the names of the GPU device + GPUDeviceNames []string + // RDMADeviceNames is the names of the RDMA device + RDMADeviceNames []string + // GPUMemoryAllocatablePerGPU is the total memory allocatable for each GPU + GPUMemoryAllocatablePerGPU resource.Quantity + // SkipGPUStateCorruption skip gpu state corruption, and it will be used after updating state properties + SkipGPUStateCorruption bool + + *gpustrategy.GPUStrategyConfig +} + +func NewGPUQRMPluginConfig() *GPUQRMPluginConfig { + return &GPUQRMPluginConfig{ + GPUStrategyConfig: gpustrategy.NewGPUStrategyConfig(), + } +} diff --git a/pkg/config/agent/qrm/gpustrategy/allocate.go b/pkg/config/agent/qrm/gpustrategy/allocate.go new file mode 100644 index 0000000000..2653b1ecb8 --- /dev/null +++ b/pkg/config/agent/qrm/gpustrategy/allocate.go @@ -0,0 +1,28 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpustrategy + +type AllocateStrategyConfig struct { + CustomFilteringStrategies map[string][]string + CustomSortingStrategy map[string]string + CustomBindingStrategy map[string]string + CustomAllocationStrategy map[string]string +} + +func NewGPUAllocateStrategyConfig() *AllocateStrategyConfig { + return &AllocateStrategyConfig{} +} diff --git a/pkg/config/agent/qrm/gpustrategy/strategy_base.go b/pkg/config/agent/qrm/gpustrategy/strategy_base.go new file mode 100644 index 0000000000..a901f80bdc --- /dev/null +++ b/pkg/config/agent/qrm/gpustrategy/strategy_base.go @@ -0,0 +1,27 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpustrategy + +type GPUStrategyConfig struct { + *AllocateStrategyConfig +} + +func NewGPUStrategyConfig() *GPUStrategyConfig { + return &GPUStrategyConfig{ + AllocateStrategyConfig: NewGPUAllocateStrategyConfig(), + } +} diff --git a/pkg/config/agent/qrm/qrm_base.go b/pkg/config/agent/qrm/qrm_base.go index 18136bb490..9f90e6a512 100644 --- a/pkg/config/agent/qrm/qrm_base.go +++ b/pkg/config/agent/qrm/qrm_base.go @@ -22,12 +22,10 @@ import ( ) type GenericQRMPluginConfiguration struct { - StateFileDirectory string - InMemoryStateFileDirectory string - QRMPluginSocketDirs []string - ExtraStateFileAbsPath string - PodDebugAnnoKeys []string - UseKubeletReservedConfig bool + QRMPluginSocketDirs []string + ExtraStateFileAbsPath string + PodDebugAnnoKeys []string + UseKubeletReservedConfig bool // PodAnnotationKeptKeys indicates pod annotation keys will be kept in qrm state PodAnnotationKeptKeys []string // PodLabelKeptKeys indicates pod label keys will be kept in qrm state @@ -50,6 +48,7 @@ type QRMPluginsConfiguration struct { *MemoryQRMPluginConfig *NetworkQRMPluginConfig *IOQRMPluginConfig + *GPUQRMPluginConfig } func NewGenericQRMPluginConfiguration() *GenericQRMPluginConfiguration { @@ -69,5 +68,6 @@ func NewQRMPluginsConfiguration() *QRMPluginsConfiguration { MemoryQRMPluginConfig: NewMemoryQRMPluginConfig(), NetworkQRMPluginConfig: NewNetworkQRMPluginConfig(), IOQRMPluginConfig: NewIOQRMPluginConfig(), + GPUQRMPluginConfig: NewGPUQRMPluginConfig(), } } diff --git a/pkg/util/machine/device.go b/pkg/util/machine/device.go new file mode 100644 index 0000000000..4d2bc923b9 --- /dev/null +++ b/pkg/util/machine/device.go @@ -0,0 +1,308 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machine + +import ( + "fmt" + "sort" + "sync" + + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/utils/strings/slices" + + "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/native" +) + +type DeviceTopologyProvider interface { + GetDeviceTopology() (*DeviceTopology, bool, error) + SetDeviceTopology(*DeviceTopology) error +} + +// DeviceTopologyRegistry is a registry of all topology providers that knows how to provide topology information of machine devices +type DeviceTopologyRegistry struct { + mux sync.RWMutex + + // deviceTopologyProviders is a mapping of device name to their respective topology provider + deviceTopologyProviders map[string]DeviceTopologyProvider + + // deviceTopologyAffinityProviders is a mapping of device name to their respective affinity provider + deviceTopologyAffinityProviders map[string]DeviceAffinityProvider +} + +func NewDeviceTopologyRegistry() *DeviceTopologyRegistry { + return &DeviceTopologyRegistry{ + deviceTopologyProviders: make(map[string]DeviceTopologyProvider), + deviceTopologyAffinityProviders: make(map[string]DeviceAffinityProvider), + } +} + +// RegisterDeviceTopologyProvider registers a device topology provider for the specified device name. 
+func (r *DeviceTopologyRegistry) RegisterDeviceTopologyProvider( + deviceName string, deviceTopologyProvider DeviceTopologyProvider, +) { + r.mux.Lock() + defer r.mux.Unlock() + + r.deviceTopologyProviders[deviceName] = deviceTopologyProvider +} + +func (r *DeviceTopologyRegistry) RegisterTopologyAffinityProvider( + deviceName string, deviceAffinityProvider DeviceAffinityProvider, +) { + r.mux.Lock() + defer r.mux.Unlock() + + r.deviceTopologyAffinityProviders[deviceName] = deviceAffinityProvider +} + +// SetDeviceTopology sets the device topology for the specified device name. +func (r *DeviceTopologyRegistry) SetDeviceTopology(deviceName string, deviceTopology *DeviceTopology) error { + r.mux.Lock() + defer r.mux.Unlock() + + topologyProvider, ok := r.deviceTopologyProviders[deviceName] + if !ok { + return fmt.Errorf("no device topology provider found for device %s", deviceName) + } + + topologyAffinityProvider, ok := r.deviceTopologyAffinityProviders[deviceName] + if ok { + topologyAffinityProvider.SetDeviceAffinity(deviceTopology) + general.Infof("set device affinity provider for device %s, %v", deviceName, deviceTopology) + } else { + general.Infof("no device affinity provider found for device %s", deviceName) + } + + return topologyProvider.SetDeviceTopology(deviceTopology) +} + +// GetAllDeviceTopologyProviders returns all registered device topology providers. +func (r *DeviceTopologyRegistry) GetAllDeviceTopologyProviders() map[string]DeviceTopologyProvider { + r.mux.RLock() + defer r.mux.RUnlock() + + return r.deviceTopologyProviders +} + +// GetDeviceTopology gets the device topology for the specified device name. 
+func (r *DeviceTopologyRegistry) GetDeviceTopology(deviceName string) (*DeviceTopology, bool, error) { + r.mux.RLock() + defer r.mux.RUnlock() + + provider, ok := r.deviceTopologyProviders[deviceName] + if !ok { + return nil, false, fmt.Errorf("no device topology provider found for device %s", deviceName) + } + return provider.GetDeviceTopology() +} + +// GetDeviceNUMAAffinity retrieves a map of a certain device A to the list of devices in device B that it has an affinity with. +// A device is considered to have an affinity with another device if they are on the exact same NUMA node(s) +func (r *DeviceTopologyRegistry) GetDeviceNUMAAffinity(deviceA, deviceB string) (map[string][]string, error) { + deviceTopologyKey, numaReady, err := r.GetDeviceTopology(deviceA) + if err != nil { + return nil, fmt.Errorf("error getting device topology for device %s: %v", deviceA, err) + } + if !numaReady { + return nil, fmt.Errorf("device topology for device %s is not ready", deviceA) + } + + deviceTopologyValue, numaReady, err := r.GetDeviceTopology(deviceB) + if err != nil { + return nil, fmt.Errorf("error getting device topology for device %s: %v", deviceB, err) + } + if !numaReady { + return nil, fmt.Errorf("device topology for device %s is not ready", deviceB) + } + + deviceAffinity := make(map[string][]string) + for keyName, keyInfo := range deviceTopologyKey.Devices { + devicesWithAffinity := make([]string, 0) + for valueName, valueInfo := range deviceTopologyValue.Devices { + deviceKeyNUMANodes := keyInfo.GetNUMANodes() + deviceValueNUMANodes := valueInfo.GetNUMANodes() + + if len(deviceKeyNUMANodes) != 0 && sets.NewInt(deviceKeyNUMANodes...).Equal(sets.NewInt(deviceValueNUMANodes...)) { + devicesWithAffinity = append(devicesWithAffinity, valueName) + } + } + deviceAffinity[keyName] = devicesWithAffinity + } + + return deviceAffinity, nil +} + +type DeviceTopology struct { + Devices map[string]DeviceInfo +} + +// GroupDeviceAffinity forms a topology graph such that all 
devices within a DeviceIDs group have an affinity with each other. +// They are differentiated by their affinity priority level. +// E.g. Output: +// +// { +// 0: {{"gpu-0", "gpu-1"}, {"gpu-2", "gpu-3"}}, +// 1: {{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}} +// } +// +// means that gpu-0 and gpu-1 have an affinity with each other, gpu-2 and gpu-3 have an affinity with each other in affinity priority 0. +// and gpu-0, gpu-1, gpu-2, and gpu-3 have an affinity with each other in affinity priority 1. +func (t *DeviceTopology) GroupDeviceAffinity() map[AffinityPriority][]DeviceIDs { + deviceAffinityGroup := make(map[AffinityPriority][]DeviceIDs) + for deviceId, deviceInfo := range t.Devices { + for priority, affinityDeviceIDs := range deviceInfo.DeviceAffinity { + // Add itself in the group if it is not already included + if !slices.Contains(affinityDeviceIDs, deviceId) { + affinityDeviceIDs = append(affinityDeviceIDs, deviceId) + } + // Sort the strings for easier deduplication + sort.Strings(affinityDeviceIDs) + if _, ok := deviceAffinityGroup[priority]; !ok { + deviceAffinityGroup[priority] = make([]DeviceIDs, 0) + } + + // Add the affinityDeviceIDs to the priority level if it is not already there + if !containsGroup(deviceAffinityGroup[priority], affinityDeviceIDs) { + deviceAffinityGroup[priority] = append(deviceAffinityGroup[priority], affinityDeviceIDs) + } + + } + } + return deviceAffinityGroup +} + +func containsGroup(groups []DeviceIDs, candidate DeviceIDs) bool { + for _, g := range groups { + if slices.Equal(g, candidate) { + return true + } + } + return false +} + +type DeviceInfo struct { + Health string + NumaNodes []int + // DeviceAffinity is the map of priority level to the other deviceIds that a particular deviceId has an affinity with + DeviceAffinity map[AffinityPriority]DeviceIDs +} + +// AffinityPriority is the level of affinity that a deviceId has with another deviceId. 
+// The lowest affinityPriority value is 0, and in this level, devices have the most affinity with one another, +// so it is of highest priority to try to allocate these devices together. +// As the affinityPriority value increases, devices do not have as much affinity with each other, +// so it is of lower priority to try to allocate these devices together. +type AffinityPriority int + +type DeviceIDs []string + +func (i DeviceInfo) GetNUMANodes() []int { + if i.NumaNodes == nil { + return []int{} + } + return i.NumaNodes +} + +type deviceTopologyProviderImpl struct { + mutex sync.RWMutex + resourceNames []string + + deviceTopology *DeviceTopology + numaTopologyReady bool +} + +func NewDeviceTopologyProvider(resourceNames []string) DeviceTopologyProvider { + deviceTopology, err := initDeviceTopology(resourceNames) + if err != nil { + deviceTopology = getEmptyDeviceTopology() + general.Warningf("initDeviceTopology failed with error: %v", err) + } else { + general.Infof("initDeviceTopology success: %v", deviceTopology) + } + + return &deviceTopologyProviderImpl{ + resourceNames: resourceNames, + deviceTopology: deviceTopology, + } +} + +func (p *deviceTopologyProviderImpl) SetDeviceTopology(deviceTopology *DeviceTopology) error { + p.mutex.Lock() + defer p.mutex.Unlock() + if deviceTopology == nil { + return fmt.Errorf("deviceTopology is nil when setting device topology") + } + + p.deviceTopology = deviceTopology + p.numaTopologyReady = checkDeviceNUMATopologyReady(deviceTopology) + return nil +} + +func (p *deviceTopologyProviderImpl) GetDeviceTopology() (*DeviceTopology, bool, error) { + p.mutex.RLock() + defer p.mutex.RUnlock() + + if p.deviceTopology == nil { + return nil, false, fmt.Errorf("deviceTopology is nil when getting device topology") + } + + return p.deviceTopology, p.numaTopologyReady, nil +} + +func getEmptyDeviceTopology() *DeviceTopology { + return &DeviceTopology{ + Devices: make(map[string]DeviceInfo), + } +} + +func 
initDeviceTopology(resourceNames []string) (*DeviceTopology, error) { + deviceTopology := getEmptyDeviceTopology() + + kubeletCheckpoint, err := native.GetKubeletCheckpoint() + if err != nil { + general.Errorf("Failed to get kubelet checkpoint: %v", err) + return deviceTopology, nil + } + + _, registeredDevs := kubeletCheckpoint.GetDataInLatestFormat() + for _, resourceName := range resourceNames { + devices, ok := registeredDevs[resourceName] + if !ok { + continue + } + + for _, id := range devices { + // get NUMA node from UpdateAllocatableAssociatedDevices + deviceTopology.Devices[id] = DeviceInfo{} + } + } + return deviceTopology, nil +} + +func checkDeviceNUMATopologyReady(topology *DeviceTopology) bool { + if topology == nil { + return false + } + + for _, device := range topology.Devices { + if device.NumaNodes == nil { + return false + } + } + return true +} diff --git a/pkg/util/machine/device_affinity.go b/pkg/util/machine/device_affinity.go new file mode 100644 index 0000000000..5a5fc0f222 --- /dev/null +++ b/pkg/util/machine/device_affinity.go @@ -0,0 +1,23 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package machine + +// DeviceAffinityProvider knows how to form affinity between devices +type DeviceAffinityProvider interface { + // SetDeviceAffinity modifies DeviceTopology by retrieving each device's affinity to other devices + SetDeviceAffinity(*DeviceTopology) +} diff --git a/pkg/util/machine/device_stub.go b/pkg/util/machine/device_stub.go new file mode 100644 index 0000000000..6e9e97796d --- /dev/null +++ b/pkg/util/machine/device_stub.go @@ -0,0 +1,43 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machine + +import "sync" + +type deviceTopologyProviderStub struct { + mutex sync.RWMutex + deviceTopology *DeviceTopology +} + +func NewDeviceTopologyProviderStub() DeviceTopologyProvider { + return &deviceTopologyProviderStub{} +} + +func (d *deviceTopologyProviderStub) SetDeviceTopology(deviceTopology *DeviceTopology) error { + d.mutex.Lock() + defer d.mutex.Unlock() + + d.deviceTopology = deviceTopology + return nil +} + +func (d *deviceTopologyProviderStub) GetDeviceTopology() (*DeviceTopology, bool, error) { + d.mutex.RLock() + defer d.mutex.RUnlock() + + return d.deviceTopology, true, nil +} diff --git a/pkg/util/machine/device_test.go b/pkg/util/machine/device_test.go new file mode 100644 index 0000000000..d9399627b8 --- /dev/null +++ b/pkg/util/machine/device_test.go @@ -0,0 +1,448 @@ +/* +Copyright 2022 The Katalyst Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machine + +import ( + "sort" + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDeviceTopologyRegistry_GetDeviceNUMAAffinity(t *testing.T) { + t.Parallel() + + npuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": {NumaNodes: []int{0}}, + "npu-1": {NumaNodes: []int{1}}, + "npu-2": {NumaNodes: []int{0, 1}}, + }, + } + + gpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "gpu-0": {NumaNodes: []int{0}}, + "gpu-1": {NumaNodes: []int{1}}, + "gpu-2": {NumaNodes: []int{2}}, + }, + } + + xpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "xpu-0": {NumaNodes: []int{0}}, + "xpu-1": {NumaNodes: []int{1}}, + "xpu-2": {NumaNodes: nil}, + }, + } + + dpuTopology := &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "dpu-0": {NumaNodes: []int{1}}, + "dpu-1": {NumaNodes: []int{0}}, + "dpu-2": {NumaNodes: []int{}}, + }, + } + + // Register device topology providers + registry := NewDeviceTopologyRegistry() + registry.RegisterDeviceTopologyProvider("npu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("gpu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("xpu", NewDeviceTopologyProviderStub()) + registry.RegisterDeviceTopologyProvider("dpu", NewDeviceTopologyProviderStub()) + err := registry.SetDeviceTopology("npu", npuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("gpu", 
gpuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("xpu", xpuTopology) + assert.NoError(t, err) + err = registry.SetDeviceTopology("dpu", dpuTopology) + assert.NoError(t, err) + + tests := []struct { + name string + deviceA string + deviceB string + expected map[string][]string + expectedErr bool + }{ + { + name: "npu to gpu affinity", + deviceA: "npu", + deviceB: "gpu", + expected: map[string][]string{ + "npu-0": {"gpu-0"}, + "npu-1": {"gpu-1"}, + "npu-2": {}, + }, + }, + { + name: "non-existent device A", + deviceA: "invalid device", + deviceB: "gpu", + expectedErr: true, + }, + { + name: "non-existent device B", + deviceA: "npu", + deviceB: "invalid device", + expectedErr: true, + }, + { + name: "devices with empty numa nodes are not considered to have affinity with each other", + deviceA: "xpu", + deviceB: "dpu", + expected: map[string][]string{ + "xpu-0": {"dpu-1"}, + "xpu-1": {"dpu-0"}, + "xpu-2": {}, + }, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + actual, err := registry.GetDeviceNUMAAffinity(tt.deviceA, tt.deviceB) + if tt.expectedErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + evaluateDeviceNUMAAffinity(t, actual, tt.expected) + } + }) + } +} + +func TestDeviceTopology_GroupDeviceAffinity(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + deviceTopology *DeviceTopology + expectedDeviceAffinity map[AffinityPriority][]DeviceIDs + }{ + { + name: "test simple affinity of 2 devices to 1 group with only affinity priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2"}, + }, 
+ }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + }, + }, + { + name: "test simple affinity of 4 devices to 1 group with only affinity priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-2", "npu-3"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1", "npu-2"}, + }, + }, + "npu-4": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-5", "npu-6", "npu-7"}, + }, + }, + "npu-5": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-6", "npu-7"}, + }, + }, + "npu-6": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-5", "npu-7"}, + }, + }, + "npu-7": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-4", "npu-5", "npu-6"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"}), DeviceIDs([]string{"npu-4", "npu-5", "npu-6", "npu-7"})}, + }, + }, + { + name: "device topology includes self for one affinity level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: 
{DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + }, + }, + { + name: "test simple affinity of 2 devices to 1 group with 2 affinity priority level", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-1"}, + 1: {"npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0"}, + 1: {"npu-0", "npu-2", "npu-3"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-3"}, + 1: {"npu-0", "npu-1", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2"}, + 1: {"npu-0", "npu-1", "npu-2"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + 1: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"})}, + }, + }, + { + name: "device topology includes self for 2 affinity levels", + deviceTopology: &DeviceTopology{ + Devices: map[string]DeviceInfo{ + "npu-0": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + 1: {"npu-0", "npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-1": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-0", "npu-1"}, + 1: {"npu-0", "npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-2": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + 1: {"npu-0", "npu-1", "npu-2", "npu-3"}, + }, + }, + "npu-3": { + DeviceAffinity: map[AffinityPriority]DeviceIDs{ + 0: {"npu-2", "npu-3"}, + 1: {"npu-0", "npu-1", "npu-2", "npu-3"}, + }, + }, + }, + }, + expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{ + 0: {DeviceIDs([]string{"npu-0", "npu-1"}), DeviceIDs([]string{"npu-2", "npu-3"})}, + 1: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"})}, + }, + }, + { + name: "unsorted device topology has no effect on result", + deviceTopology: &DeviceTopology{ 
+			Devices: map[string]DeviceInfo{
+				"npu-0": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-2", "npu-1", "npu-3"},
+					},
+				},
+				"npu-1": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-3", "npu-0", "npu-2"},
+					},
+				},
+				"npu-2": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-1", "npu-0", "npu-3"},
+					},
+				},
+				"npu-3": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-0", "npu-2", "npu-1"},
+					},
+				},
+				"npu-4": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-6", "npu-5", "npu-7"},
+					},
+				},
+				"npu-5": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-7", "npu-4", "npu-6"},
+					},
+				},
+				"npu-6": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-5", "npu-4", "npu-7"},
+					},
+				},
+				"npu-7": {
+					DeviceAffinity: map[AffinityPriority]DeviceIDs{
+						0: {"npu-6", "npu-4", "npu-5"},
+					},
+				},
+			},
+		},
+		expectedDeviceAffinity: map[AffinityPriority][]DeviceIDs{
+			0: {DeviceIDs([]string{"npu-0", "npu-1", "npu-2", "npu-3"}), DeviceIDs([]string{"npu-4", "npu-5", "npu-6", "npu-7"})},
+		},
+	},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			deviceAffinity := tt.deviceTopology.GroupDeviceAffinity()
+			evaluateDeviceAffinity(t, tt.expectedDeviceAffinity, deviceAffinity) // expected first, actual second — matches the helper's signature
+		})
+	}
+}
+
+// evaluateDeviceNUMAAffinity asserts that the actual device->NUMA mapping matches the expected one,
+// comparing per-device slices order-insensitively. Parameter order is (expected, actual).
+func evaluateDeviceNUMAAffinity(t *testing.T, expectedDeviceNUMAAffinity, actualDeviceNUMAAffinity map[string][]string) {
+	if len(actualDeviceNUMAAffinity) != len(expectedDeviceNUMAAffinity) {
+		t.Errorf("deviceNUMAAffinity lengths don't match, expected %d, got %d", len(expectedDeviceNUMAAffinity), len(actualDeviceNUMAAffinity))
+		return
+	}
+
+	for device, expected := range expectedDeviceNUMAAffinity {
+		actual, ok := actualDeviceNUMAAffinity[device]
+		if !ok {
+			t.Errorf("expected device numa affinity for device %v, but it is not found", device)
+			return
+		}
+
+		assert.ElementsMatch(t, expected, actual, "device numa affinity are not equal")
+	}
+}
+
+// evaluateDeviceAffinity asserts that the actual per-priority affinity groups match the
+// expected ones, ignoring group order and intra-group device order. Parameter order is (expected, actual).
+func evaluateDeviceAffinity(t *testing.T, expectedDeviceAffinity, actualDeviceAffinity map[AffinityPriority][]DeviceIDs) {
+	if len(actualDeviceAffinity) != len(expectedDeviceAffinity) {
+		t.Errorf("expected %d affinities, got %d", len(expectedDeviceAffinity), len(actualDeviceAffinity))
+		return
+	}
+
+	for priority, expected := range expectedDeviceAffinity {
+		actual, ok := actualDeviceAffinity[priority]
+		if !ok {
+			t.Errorf("expected affinities for priority %v, but it is not found", priority)
+			return
+		}
+
+		if !equalDeviceIDsGroupsIgnoreOrder(t, expected, actual) {
+			return
+		}
+	}
+}
+
+// equalDeviceIDsGroupsIgnoreOrder reports whether two sets of device-ID groups are equal,
+// ignoring both the order of the groups and the order of IDs within each group.
+func equalDeviceIDsGroupsIgnoreOrder(t *testing.T, expected, actual []DeviceIDs) bool {
+	if len(expected) != len(actual) {
+		t.Errorf("expected %d devices, got %d", len(expected), len(actual))
+		return false
+	}
+
+	// Convert each DeviceIDs slice into a normalized, comparable form
+	normalize := func(groups []DeviceIDs) []string {
+		res := make([]string, len(groups))
+		for i, group := range groups {
+			sorted := append([]string{}, group...)
+			sort.Strings(sorted)
+			res[i] = strings.Join(sorted, ",")
+		}
+		sort.Strings(res)
+		return res
+	}
+
+	normalizedExp := normalize(expected)
+	normalizedAct := normalize(actual)
+
+	for i := range normalizedExp {
+		if normalizedExp[i] != normalizedAct[i] {
+			t.Errorf("expected %s, got %s", normalizedExp[i], normalizedAct[i]) // expected value first, then the actual one
+			return false
+		}
+	}
+
+	return true
+}
diff --git a/pkg/util/native/kubelet.go b/pkg/util/native/kubelet.go
index 91eafcbbd3..8c7dc22028 100644
--- a/pkg/util/native/kubelet.go
+++ b/pkg/util/native/kubelet.go
@@ -26,13 +26,20 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/pkg/errors"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/client-go/discovery"
 	"k8s.io/client-go/rest"
+	"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
 )
 
 const (
 	defaultTimeout = time.Second * 10
+
+	// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
+	kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"
 )
 
 // MemoryReservation specifies the memory reservation of different types for each NUMA node
@@ -198,3 +205,21 @@ func insecureConfig(host, tokenFile string) (*rest.Config, error) {
 		BearerTokenFile: tokenFile,
 	}, nil
 }
+
+func GetKubeletCheckpoint() (checkpoint.DeviceManagerCheckpoint, error) {
+	checkpointManager, err := checkpointmanager.NewCheckpointManager(v1beta1.DevicePluginPath)
+	if err != nil {
+		return nil, errors.Wrap(err, "new checkpoint manager failed")
+	}
+
+	registeredDevs := make(map[string][]string)
+	devEntries := make([]checkpoint.PodDevicesEntry, 0)
+	cp := checkpoint.New(devEntries, registeredDevs)
+
+	err = checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp)
+	if err != nil {
+		return nil, errors.Wrap(err, "get checkpoint failed")
+	}
+
+	return cp, nil
+}