-
Notifications
You must be signed in to change notification settings - Fork 213
OCPSTRAT-2485: Introduce feature gate based inclusion/exclusion of manifests #1273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9004329
8163e7d
dad68ce
ed06ae4
d206d70
cc44477
714d49f
309f0be
f46468a
0f4973f
4becfc9
f723c0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,8 @@ require ( | |
| k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 | ||
| ) | ||
|
|
||
| replace github.com/openshift/library-go => github.com/JoelSpeed/library-go v0.0.0-20251217102247-bcd739ac7ffa | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # Check the full go.mod file to understand context
cat go.modRepository: openshift/cluster-version-operator Length of output: 4437 🏁 Script executed: # Look for any git history or PR context about this change
git log --oneline -n 20 -- go.modRepository: openshift/cluster-version-operator Length of output: 142 🏁 Script executed: # Search for any documentation or comments about library-go dependency
rg -i "library-go" --type md --type txt -A 2 -B 2Repository: openshift/cluster-version-operator Length of output: 60 Replace to personal library-go fork must be reverted before merge The 🤖 Prompt for AI Agents |
||
|
|
||
| require ( | ||
| cel.dev/expr v0.24.0 // indirect | ||
| github.com/antlr4-go/antlr/v4 v4.13.0 // indirect | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ import ( | |
| "time" | ||
|
|
||
| "github.com/spf13/cobra" | ||
| "k8s.io/apimachinery/pkg/util/sets" | ||
|
|
||
| "github.com/openshift/cluster-version-operator/pkg/payload" | ||
| ) | ||
|
|
@@ -30,7 +31,7 @@ func newTaskGraphCmd() *cobra.Command { | |
|
|
||
| func runTaskGraphCmd(cmd *cobra.Command, args []string) error { | ||
| manifestDir := args[0] | ||
| release, err := payload.LoadUpdate(manifestDir, "", "", "", payload.DefaultClusterProfile, nil) | ||
| release, err := payload.LoadUpdate(manifestDir, "", "", "", payload.DefaultClusterProfile, nil, sets.Set[string]{}) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't handle feature sets today, does it need to handle feature gates?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not think so. It would be nice, but as you have stated, the file is not being kept up-to-date with new functionalities, so I believe it's okay not to implement it for now. |
||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: This file is not to be merged, this is just for testing purposes |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: dev-preview-created | ||
| namespace: openshift-cluster-version | ||
| annotations: | ||
| include.release.openshift.io/self-managed-high-availability: "true" | ||
| release.openshift.io/feature-gates: "Example,Example2" | ||
| data: | ||
| example: "this should be created on dev-preview-only as it's annotated with Example and Example2" | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: tech-preview-created | ||
| namespace: openshift-cluster-version | ||
| annotations: | ||
| include.release.openshift.io/self-managed-high-availability: "true" | ||
| release.openshift.io/feature-gates: "Example,-Example2" | ||
| data: | ||
| example: "this should be created on tech-preview-only as it's annotated with Example and not Example2" | ||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: default-created | ||
| namespace: openshift-cluster-version | ||
| annotations: | ||
| include.release.openshift.io/self-managed-high-availability: "true" | ||
| release.openshift.io/feature-gates: "-Example,-Example2" | ||
| data: | ||
| example: "this should be created on default-only as it's annotated with negation for Example and Example2" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ import ( | |
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/types" | ||
| utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||
| "k8s.io/apimachinery/pkg/util/sets" | ||
| "k8s.io/apimachinery/pkg/util/wait" | ||
| informerscorev1 "k8s.io/client-go/informers/core/v1" | ||
| "k8s.io/client-go/kubernetes" | ||
|
|
@@ -120,6 +121,7 @@ type Operator struct { | |
| cmConfigLister listerscorev1.ConfigMapNamespaceLister | ||
| cmConfigManagedLister listerscorev1.ConfigMapNamespaceLister | ||
| proxyLister configlistersv1.ProxyLister | ||
| featureGateLister configlistersv1.FeatureGateLister | ||
| cacheSynced []cache.InformerSynced | ||
|
|
||
| // queue tracks applying updates to a cluster. | ||
|
|
@@ -189,6 +191,10 @@ type Operator struct { | |
| // featurechangestopper controller will detect when cluster feature gate config changes and shutdown the CVO. | ||
| enabledFeatureGates featuregates.CvoGateChecker | ||
|
|
||
| // featureGatesMutex protects access to enabledManifestFeatureGates | ||
| featureGatesMutex sync.RWMutex | ||
| enabledManifestFeatureGates sets.Set[string] | ||
|
|
||
| clusterProfile string | ||
| uid types.UID | ||
|
|
||
|
|
@@ -213,6 +219,7 @@ func New( | |
| cmConfigManagedInformer informerscorev1.ConfigMapInformer, | ||
| proxyInformer configinformersv1.ProxyInformer, | ||
| operatorInformerFactory operatorexternalversions.SharedInformerFactory, | ||
| featureGateInformer configinformersv1.FeatureGateInformer, | ||
| client clientset.Interface, | ||
| kubeClient kubernetes.Interface, | ||
| operatorClient operatorclientset.Interface, | ||
|
|
@@ -248,9 +255,9 @@ func New( | |
| kubeClient: kubeClient, | ||
| operatorClient: operatorClient, | ||
| eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: namespace}), | ||
| queue: workqueue.NewTypedRateLimitingQueueWithConfig[any](workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "clusterversion"}), | ||
| availableUpdatesQueue: workqueue.NewTypedRateLimitingQueueWithConfig[any](workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "availableupdates"}), | ||
| upgradeableQueue: workqueue.NewTypedRateLimitingQueueWithConfig[any](workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "upgradeable"}), | ||
| queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "clusterversion"}), | ||
| availableUpdatesQueue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "availableupdates"}), | ||
| upgradeableQueue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[any](), workqueue.TypedRateLimitingQueueConfig[any]{Name: "upgradeable"}), | ||
|
|
||
| hypershift: hypershift, | ||
| exclude: exclude, | ||
|
|
@@ -276,6 +283,9 @@ func New( | |
| if _, err := coInformer.Informer().AddEventHandler(optr.clusterOperatorEventHandler()); err != nil { | ||
| return nil, err | ||
| } | ||
| if _, err := featureGateInformer.Informer().AddEventHandler(optr.featureGateEventHandler()); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| optr.coLister = coInformer.Lister() | ||
| optr.cacheSynced = append(optr.cacheSynced, coInformer.Informer().HasSynced) | ||
|
|
@@ -287,6 +297,12 @@ func New( | |
| optr.cmConfigLister = cmConfigInformer.Lister().ConfigMaps(internal.ConfigNamespace) | ||
| optr.cmConfigManagedLister = cmConfigManagedInformer.Lister().ConfigMaps(internal.ConfigManagedNamespace) | ||
|
|
||
| optr.featureGateLister = featureGateInformer.Lister() | ||
| optr.cacheSynced = append(optr.cacheSynced, featureGateInformer.Informer().HasSynced) | ||
|
|
||
| // Initialize cluster feature gates | ||
| optr.initializeFeatureGates() | ||
|
|
||
| // make sure this is initialized after all the listers are initialized | ||
| optr.upgradeableChecks = optr.defaultUpgradeableChecks() | ||
|
|
||
|
|
@@ -318,7 +334,7 @@ func (optr *Operator) LoadInitialPayload(ctx context.Context, restConfig *rest.C | |
| } | ||
|
|
||
| update, err := payload.LoadUpdate(optr.defaultPayloadDir(), optr.release.Image, optr.exclude, string(optr.requiredFeatureSet), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing that has come to my mind from looking more and more at the code. With the proposed changes, the CVO could now be working with two different FeatureGate generations at a moment. The feature set is received quite differently from the feature gates; however, both are used simultaneously at a concrete moment for evaluation, to include or not to include a manifest. Due to parallelism, graceful shutdowns, the following could maybe happen? Given the Include() logic allows for logic like: Include only when default and gate is enabled: Include only when techpreview and gate is enabled: to state the desire to apply a manifest only when a desired feature set is set AND when a desired feature gate is enabled/disabled.
I suppose this also highlights the fact that a feature set is set independently of enabled gates, which are set by a cluster operator. However, what about the time between these two actions? The CVO may be applying manifests with a setting that’s only temporary. Was this considered? I am not sure how a CVO could evaluate that the An additional example may be the current initialization, which tries to get enabled gates. However, due to an error when fetching, the CVO could start applying manifest given a configuration (requiredFeatureSet, noEnabledGates), which may cause a similar issue. Manifests with Everything would be dandy, given that we eventually reach the desired cluster state. However, for example, an object deletion requires a specific manifest annotation for deletion [2]. There seem to be areas for non-deterministic application of manifests, which could theoretically leave some existing objects in the cluster (be that non-important manifests or important manifests introducing potential problems, be that even a deployment with an exploitable vulnerability not intended to be deployed in production environments).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I may also be misunderstanding/not clear on the usage of the annotation, to-be feature gate usage, and relevant constraints 👀
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or should the usage of a feature-set or feature-gate annotation disallow the usage of the other in a given manifest? Throwing some ideas and questions into the air.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, so this is quite an interesting thread, with a few things to unpick
This means apply when the gate is enabled, it has no bearing on the featureset, so this would apply whether the gate was enable in TP, DP, Default or Custom. This is the intended use case.
This is not how I was intending for anyone to use the feature here, you would use one, or the other. Long term, I don't want anyone beyond the openshift/api repo to use the old featureset distinction. I can't think of a use case presently where featureset is better than featuregate for general features, nor can I think of a case where you would want to use both. Logically, if the manifest is required as part of the feature in TP, why wouldn't it also be required as part of the feature when it is promoted to default? 🤔
I wonder if we want to update the feature gate watching so that featureset and featuregate update in the same way, either synchronously, or by both causing a kill on change and having the pod restart? I was trying to avoid pod restarts where possible, are you sure the featureset causes a restart? Or is that just when gates pertinent to CVOs operation change?
Changes to the FeatureGate are generally additive. In fact, only in CustomNoUpgrade can a feature gate currently go from On to Off. So in theory, when the featureset changes, any manifest that does get applied before the gates are updated should also be applied once the gates are updated. There might be a small period where not all manifests are being applied, but I don't think there should be a case where some get applied that weren't eventually meant to. Now, in the future, we plan to use gates to remove features, that's where things might be different. We will have manifests with a gate enabled meaning the feature is on, apply the deployment etc, and then with the gate disabled, apply the tombstone version. Again, there should be no need to use featureset here.
I'll do this. We aren't expecting them to be used in tandem, and I cannot currently think of a use case for this, so this makes sense to avoid potential races (though I'm fairly confident we would be alright anyway) |
||
| optr.clusterProfile, configv1.KnownClusterVersionCapabilities) | ||
| optr.clusterProfile, configv1.KnownClusterVersionCapabilities, optr.getEnabledFeatureGates()) | ||
|
|
||
| if err != nil { | ||
| return nil, fmt.Errorf("the local release contents are invalid - no current version can be determined from disk: %v", err) | ||
|
|
@@ -779,7 +795,7 @@ func (optr *Operator) sync(ctx context.Context, key string) error { | |
| } | ||
|
|
||
| // inform the config sync loop about our desired state | ||
| status := optr.configSync.Update(ctx, config.Generation, desired, config, state) | ||
| status := optr.configSync.Update(ctx, config.Generation, desired, config, state, optr.getEnabledFeatureGates()) | ||
|
|
||
| // write cluster version status | ||
| return optr.syncStatus(ctx, original, config, status, errs) | ||
|
|
@@ -1084,6 +1100,97 @@ func (optr *Operator) HTTPClient() (*http.Client, error) { | |
| }, nil | ||
| } | ||
|
|
||
| // featureGateEventHandler handles changes to FeatureGate objects and updates the cluster feature gates | ||
| func (optr *Operator) featureGateEventHandler() cache.ResourceEventHandler { | ||
| workQueueKey := optr.queueKey() | ||
| return cache.ResourceEventHandlerFuncs{ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do have a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You cannot delete the FeatureGate resource, there is an admission time block on this, so, there should never be a delete event coming through the informer. If there were, I'm not exactly sure what you'd do here anyway, so I felt it was ok to omit the delete handler on this occasion |
||
| AddFunc: func(obj interface{}) { | ||
| if optr.updateEnabledFeatureGates(obj) { | ||
| optr.queue.Add(workQueueKey) | ||
| } | ||
| }, | ||
| UpdateFunc: func(old, new interface{}) { | ||
| if optr.updateEnabledFeatureGates(new) { | ||
| optr.queue.Add(workQueueKey) | ||
| } | ||
| }, | ||
| } | ||
| } | ||
|
|
||
| // initializeFeatureGates initializes the cluster feature gates from the current FeatureGate object | ||
| func (optr *Operator) initializeFeatureGates() { | ||
| // Try to load initial state from the cluster FeatureGate object | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could take an extra consideration for |
||
| if optr.featureGateLister != nil { | ||
| if featureGate, err := optr.featureGateLister.Get("cluster"); err == nil { | ||
| optr.updateEnabledFeatureGates(featureGate) | ||
| } | ||
| } | ||
|
Comment on lines
+1123
to
+1127
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are silently ignoring the error. This will result in the default enabled gates being set. This fact may not logged anywhere. We are logging the fact when no enabled gates are found when extracting. |
||
| } | ||
|
|
||
| // updateEnabledFeatureGates updates the cluster feature gates based on a FeatureGate object | ||
| func (optr *Operator) updateEnabledFeatureGates(obj interface{}) bool { | ||
|
Comment on lines
+1130
to
+1131
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we please add a description for the meaning of the return value? |
||
| featureGate, ok := obj.(*configv1.FeatureGate) | ||
| if !ok { | ||
| klog.Warningf("Expected FeatureGate object but got %T", obj) | ||
| return false | ||
| } | ||
|
|
||
| newGates := optr.extractEnabledGates(featureGate) | ||
|
|
||
| // Check if gates actually changed to avoid unnecessary work | ||
| if !optr.enabledManifestFeatureGates.Equal(newGates) { | ||
| optr.featureGatesMutex.Lock() | ||
| defer optr.featureGatesMutex.Unlock() | ||
|
Comment on lines
+1140
to
+1143
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, I thought I had fixed this one, let me check I pushed everything 🤔 |
||
|
|
||
| klog.V(2).Infof("Cluster feature gates changed from %v to %v", | ||
| sets.List(optr.enabledManifestFeatureGates), sets.List(newGates)) | ||
|
|
||
| optr.enabledManifestFeatureGates = newGates | ||
|
|
||
| return true | ||
| } | ||
|
|
||
| return false | ||
| } | ||
|
|
||
| // getEnabledFeatureGates returns a copy of the current cluster feature gates for safe consumption | ||
| func (optr *Operator) getEnabledFeatureGates() sets.Set[string] { | ||
| optr.featureGatesMutex.RLock() | ||
| defer optr.featureGatesMutex.RUnlock() | ||
|
|
||
| // Return a copy to prevent external modification | ||
| result := sets.Set[string]{} | ||
| for gate := range optr.enabledManifestFeatureGates { | ||
| result.Insert(gate) | ||
| } | ||
| return result | ||
| } | ||
|
|
||
| // extractEnabledGates extracts the list of enabled feature gates for the current cluster version | ||
| func (optr *Operator) extractEnabledGates(featureGate *configv1.FeatureGate) sets.Set[string] { | ||
| enabledGates := sets.Set[string]{} | ||
|
|
||
| // Find the feature gate details for the current cluster version | ||
| currentVersion := optr.enabledFeatureGates.DesiredVersion() | ||
| for _, details := range featureGate.Status.FeatureGates { | ||
| if details.Version == currentVersion { | ||
| for _, enabled := range details.Enabled { | ||
| enabledGates.Insert(string(enabled.Name)) | ||
| } | ||
| klog.V(4).Infof("Found %d enabled feature gates for version %s: %v", | ||
| enabledGates.Len(), currentVersion, sets.List(enabledGates)) | ||
| break | ||
| } | ||
| } | ||
|
|
||
| // If no matching version found, log a warning but continue with empty set | ||
| if enabledGates.Len() == 0 { | ||
| klog.V(2).Infof("No feature gates found for current version %s, using empty set", currentVersion) | ||
| } | ||
|
Comment on lines
+1186
to
+1189
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The comment states
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would probably leave this as info personally |
||
|
|
||
| return enabledGates | ||
| } | ||
|
|
||
| // shouldReconcileCVOConfiguration returns whether the CVO should reconcile its configuration using the API server. | ||
| // | ||
| // enabledFeatureGates must be initialized before the function is called. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we include an explanation in the commit for why the change was made?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I can update that. This was a result of me actually implementing the manifest filtering completely through the render step. Previously it didn't actually do most filtering which meant that manifests without a cluster profile annotation were applied, now, we need a cluster profile annotation on all files to be rendered, and this one was missing it