From fb88d04d7cca634446c1fb9eb8d395d86ab30f0f Mon Sep 17 00:00:00 2001
From: mehabhalodiya
Date: Wed, 9 Jul 2025 19:50:26 +0530
Subject: [PATCH] OCP 40556 fix

Signed-off-by: mehabhalodiya
---
 .../scalability/replace_loadedmaster_nodes.sh |  66 ++++++++++
 .../replace_loadedmaster_nodes_env.sh         |   6 +
 .../scalability/replace_loadedworker_nodes.sh |   2 +-
 .../scalability/replace_master_nodes.py       |  84 +++++++++++++
 .../clouds/master-node-machineset-aws.yaml    |  78 ++++++++++++
 ...eplacenodes.py => replace_worker_nodes.py} |   5 +-
 .../scripts/scalability/utils/ocp_utils.py    | 114 +++++++++++++++++-
 7 files changed, 349 insertions(+), 6 deletions(-)
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml
 rename perfscale_regression_ci/scripts/scalability/{replacenodes.py => replace_worker_nodes.py} (92%)

diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
new file mode 100644
index 00000000000..4a8d169cf55
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+##################################################################################################
+## Auth=mbhalodi@redhat.com
+## Description: Script for replacing master nodes with larger master nodes
+## Polarion test case: OCP-40556 - Replace loaded control plane nodes with larger instances
+## https://polarion.engineering.redhat.com/polarion/#/project/OSE/workitem?id=OCP-40556
+## Customer related: https://access.redhat.com/support/cases/#/case/02864921
+## Pre-requisites:
+### Cluster with 3 m5.4xlarge master nodes and 30 m5.xlarge worker nodes on AWS, or an equivalent size of these profiles:
+#### AWS: IPI-on-AWS-OVN
+#### GCP: IPI-on-GCP-OVN
+#### Azure: IPI-on-Azure-OVN
+#### Alicloud: IPI-on-Alibabacloud-OVN
+## kube-burner config:
+### e2e-benchmarking/workloads/kube-burner/workloads/cluster-density/cluster-density.yml
+## Parameters: cloud provider, number of replicas, and number of JOB_ITERATIONS
+##################################################################################################
+
+source ./replace_loadedmaster_nodes_env.sh
+source ../common.sh
+source ../../utils/run_workload.sh
+
+# If parameters are set from upstream CI, overwrite params
+echo "Upstream PARAMETERS set to $PARAMETERS"
+export params=(${PARAMETERS:-aws 3 120})
+echo "params is $params"
+
+export CLOUD=${params[0]:-"aws"}
+export REPLICAS=${params[1]:-"3"}
+export JOB_ITERATIONS=${params[2]:-"120"}
+
+echo "Testing with $CLOUD $REPLICAS $JOB_ITERATIONS"
+
+# Install dittybopper to check resource usage
+install_dittybopper
+if [ $? -eq 0 ];
+then
+    # Cluster health check prior to testing
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Run workload on current master nodes."
+    run_kube-burner-ocp-wrapper
+    sleep 180
+    echo "Update the master machineset instance type and replace existing machines one at a time."
+    cd ./replace_nodes/clouds
+    . ./${CLOUD}.sh
+    cd ../..
+
+    python ./replace_master_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
+    echo "Existing master machines replaced and new nodes are up."
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Cleanup existing workload namespaces."
+    delete_project_by_label kube-burner-job=$WORKLOAD
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Rerun workload on new machineset."
+    run_kube-burner-ocp-wrapper
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Test complete!"
+    echo "Verify test results as defined in Polarion test case."
+    exit 0
+else
+    echo "Failed to install dittybopper."
+    exit 1
+fi
diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
new file mode 100644
index 00000000000..7fa1f317f27
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
@@ -0,0 +1,6 @@
+export WORKLOAD=cluster-density-v2
+export ITERATIONS=120
+export OPENSHIFT_MASTER_NODE_INSTANCE_TYPE="m5.4xlarge"
+export GC="false"
+export PPROF="false"
+export CHURN="false"
diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
index dcc9ff8f41d..ddc332ad095 100755
--- a/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
@@ -45,7 +45,7 @@ then
     cd ./replace_nodes/clouds
     . ./${CLOUD}.sh
     cd ../..
-    python ./replacenodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
+    python ./replace_worker_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
     echo "Existing machines scaled down and new nodes are up."
     sleep 180
     python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
diff --git a/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py b/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
new file mode 100644
index 00000000000..8fb09f46023
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import datetime
+import utils.ocp_utils as ocp_utils
+import time
+
+n = len(sys.argv)
+print(f"Total arguments passed: {n}")
+
+cloud_type = sys.argv[1]
+num_masters_to_maintain = int(sys.argv[2])
+new_master_instance_type = sys.argv[3] if n > 3 else "m5.4xlarge"  # instance type passed in by the wrapper script
+
+cluster_name = os.environ.get('CLUSTER_NAME')
+if not cluster_name:
+    print("Error: CLUSTER_NAME environment variable not set. 
Please set it before running.") + sys.exit(1) + +master_machineset_name = f"{cluster_name}-master" + +print(f"Preparing to upgrade master nodes to instance type: {new_master_instance_type}") +print(f"Targeting existing master machineset: {master_machineset_name}") + +# --- Phase 1: Update the Existing Master MachineSet Template --- +print(f"\n--- Phase 1: Updating Machineset Template ---") +print(f"Updating instance type of master machineset {master_machineset_name} to {new_master_instance_type}...") +ocp_utils.update_master_machineset(master_machineset_name, new_master_instance_type) +print(f"Master machineset {master_machineset_name} template updated.") + +# --- Phase 2: One-by-One Rolling Replacement --- +print(f"\n--- Phase 2: Starting Rolling Replacement ---") +old_master_node_names = ocp_utils.run("oc get nodes -l node-role.kubernetes.io/master --no-headers -o name").strip().split('\n') +old_master_node_names = [name.split('/')[1].strip() for name in old_master_node_names if name.strip()] + +if len(old_master_node_names) != num_masters_to_maintain: + print(f"Error: Expected {num_masters_to_maintain} masters to replace, but found {len(old_master_node_names)}. Please ensure the cluster is in a stable state before proceeding.") + sys.exit(1) + +old_master_node_names.sort() +print(f"Identified {len(old_master_node_names)} existing master nodes for replacement: {old_master_node_names}") + +print(f"Starting one-by-one rolling replacement for {num_masters_to_maintain} nodes.") + +for i, old_node_to_decommission in enumerate(old_master_node_names): + print(f"\n--- Replacement Iteration {i+1} of {num_masters_to_maintain} ---") + print(f"Selected master node for upgrade: {old_node_to_decommission}") + + try: + # Step 1: Cordon the old node + print(f"Cordoning node {old_node_to_decommission}...") + ocp_utils.run(f"oc adm cordon {old_node_to_decommission}") + + # Step 2: Get the Machine object name for this node + old_machine_name = ocp_utils.get_machine_name_from_node(old_node_to_decommission) + print(f"Found Machine {old_machine_name} for node {old_node_to_decommission}.") + + # Step 3: Delete the Machine object to trigger a replacement + print(f"Deleting Machine {old_machine_name} in namespace openshift-machine-api to trigger replacement...") + ocp_utils.run(f"oc delete machine {old_machine_name} -n openshift-machine-api") + + # Step 4: Wait for a new master node to appear and become ready + # The machineset controller will provision a new machine with the updated instance size. + print(f"Waiting for new master node to appear with instance type {new_master_instance_type} and become Ready...") + # This function should wait for the total number of masters to return to 'num_masters_to_maintain' + # with the correct instance type. + ocp_utils.wait_for_master_node_creation(num_masters_to_maintain, new_master_instance_type) + + # Step 5: Uncordon the node once the new instance is ready + # The new instance will have the same hostname/node name, so we uncordon that name. + print(f"New instance is Ready. Uncordoning node {old_node_to_decommission}...") + ocp_utils.run(f"oc adm uncordon {old_node_to_decommission}") + + print(f"Node {old_node_to_decommission} successfully upgraded to instance type {new_master_instance_type}.") + + except Exception as e: + print(f"Error during upgrade of master node {old_node_to_decommission}: {e}") + print("This is a critical error during master replacement. 
Aborting.") + sys.exit(1) + +print("\n--- Final Master Node Upgrade Completed ---") +ocp_utils.cluster_health_check() +print("--- Final Cluster Health Check Complete ---") diff --git a/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml b/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml new file mode 100644 index 00000000000..6b31368f40c --- /dev/null +++ b/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml @@ -0,0 +1,78 @@ +apiVersion: machine.openshift.io/v1beta1 +kind: MachineSet +metadata: + creationTimestamp: null + labels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machine-role: master + machine.openshift.io/cluster-api-machine-type: master + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + name: ${CLUSTER_NAME}-master-new + namespace: openshift-machine-api +spec: + replicas: 0 # Initial replicas set to 0. The script will scale this up. + selector: + matchLabels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + template: + metadata: + creationTimestamp: null + labels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machine-role: master + machine.openshift.io/cluster-api-machine-type: master + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + spec: + metadata: + creationTimestamp: null + labels: + node-role.kubernetes.io/master: "" + providerSpec: + value: + ami: + id: ${AMI_ID} + apiVersion: awsproviderconfig.openshift.io/v1beta1 + canIPForward: false + credentialsSecret: + name: aws-cloud-credentials + deletionProtection: false + disks: + - autoDelete: false + boot: true + image: ${MASTER_MACHINESET_IMAGE} + labels: null + sizeGb: ${OPENSHIFT_MASTER_NODE_VOLUME_SIZE} + type: ${OPENSHIFT_MASTER_NODE_VOLUME_TYPE} + deviceIndex: 0 + iamInstanceProfile: + id: ${CLUSTER_NAME}-master-profile + instanceType: ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE} + kind: AWSMachineProviderConfig + metadata: + creationTimestamp: null + placement: + availabilityZone: ${CLUSTER_REGION}a + region: ${CLUSTER_REGION} + publicIp: true + securityGroups: + - filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-node + - filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-lb + subnet: + filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-subnet-private-${CLUSTER_REGION}a + tags: + - name: kubernetes.io/cluster/${CLUSTER_NAME}-master + value: owned + userDataSecret: + name: master-user-data + versions: + kubelet: "" \ No newline at end of file diff --git a/perfscale_regression_ci/scripts/scalability/replacenodes.py b/perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py similarity index 92% rename from perfscale_regression_ci/scripts/scalability/replacenodes.py rename to perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py index 6bf00c60f03..164a4f708e1 100755 --- a/perfscale_regression_ci/scripts/scalability/replacenodes.py +++ b/perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py @@ -22,7 +22,7 @@ print('Start time: Scaling up new machineset {} at '.format(final_machine_set) + str(datetime.datetime.now())) ocp_utils.scale_machine_replicas(final_machine_set, replicas) -ocp_utils.wait_for_node_creation(replicas, new_worker_instance_type) +ocp_utils.wait_for_worker_node_creation(replicas, 
new_worker_instance_type) print('End time: Finished scaling up {} at '.format(final_machine_set) + str(datetime.datetime.now())) machines_sets=ocp_utils.run("oc get machinesets -A -o name --no-headers").split('\n') @@ -47,9 +47,8 @@ while int(replicas) >= 1: replicas = int(replicas) - 1 ocp_utils.scale_machine_replicas(machineset, replicas) - ocp_utils.wait_for_node_deletion(machineset, replicas) + ocp_utils.wait_for_worker_node_deletion(machineset, replicas) print('End time: All nodes deleted from {} at '.format(machineset) + str(datetime.datetime.now())) print() ocp_utils.cluster_health_check() #delete_machineset(machineset) #After scaling down do not delete machinesets for this scenario - \ No newline at end of file diff --git a/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py b/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py index e1290ab0b73..6d1318635ff 100644 --- a/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py +++ b/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py @@ -3,6 +3,7 @@ import subprocess import time import sys +import json def run(command, print_out=False): try: @@ -44,7 +45,7 @@ def scale_machine_replicas(machine_set, replicas): else: break -def wait_for_node_deletion(machine_set, wanted_replicas): +def wait_for_worker_node_deletion(machine_set, wanted_replicas): machine_name = machine_set.split('/')[-1] cmd = "oc get machines -l machine.openshift.io/cluster-api-machineset=%s -n openshift-machine-api --no-headers | grep worker | wc -l" % (machine_name) while True: @@ -71,7 +72,7 @@ def wait_for_node_deletion(machine_set, wanted_replicas): break print() -def wait_for_node_creation(wanted_replicas, new_worker_instance_type): +def wait_for_worker_node_creation(wanted_replicas, new_worker_instance_type): cmd = "oc get nodes -l node.kubernetes.io/instance-type=%s -n openshift-machine-api --no-headers | grep worker | wc -l" % (new_worker_instance_type) while True: try: @@ -97,6 +98,59 @@ def wait_for_node_creation(wanted_replicas, new_worker_instance_type): continue print() +def wait_for_master_node_deletion(machine_set, wanted_replicas): + machine_name = machine_set.split('/')[-1] + cmd = "oc get machines -l machine.openshift.io/cluster-api-machineset=%s -n openshift-machine-api --no-headers | grep control-plane,master | wc -l" % (machine_name) + while True: + try: + replicas=run(cmd) + if "No resources found" in replicas: + print("No resources found") + return + except ValueError as err: + print (err) + continue + else: + break + while int(wanted_replicas) != int(replicas): + print('wanted vs. actual replicas ' + str(wanted_replicas) +" " + str(replicas)) + time.sleep(5) + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + if "No resources found" in replicas: + print("No resources found") + break + print() + +def wait_for_master_node_creation(wanted_replicas, new_master_instance_type): + cmd = "oc get nodes -l node.kubernetes.io/instance-type=%s -n openshift-machine-api --no-headers | grep control-plane,master | wc -l" % (new_master_instance_type) + while True: + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + else: + if "No resources found" in replicas: + continue + else: + break + while int(wanted_replicas) != int(replicas): + print('wanted vs. 
actual replicas ' + str(wanted_replicas) +" " + str(replicas)) + time.sleep(5) + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + if "No resources found" in replicas: + print ("No resources found") + continue + print() + def delete_machineset(machineset): cmd = "oc delete %s -n openshift-machine-api" % (machineset) while True: @@ -119,3 +173,59 @@ def cluster_health_check(): print (err) else: break + +def update_master_machineset(master_machineset_name, new_master_instance_type): + """ + Updates the instanceType of the specified master machineset. + :param machineset_name: The name of the master machineset (e.g., "cluster-name-master"). + :param new_instance_type: The new instance type to set (e.g., "m5.4xlarge"). + """ + # Construct the JSON patch payload + patch_payload = { + "spec": { + "template": { + "spec": { + "providerSpec": { + "value": { + "instanceType": new_master_instance_type + } + } + } + } + } + } + + # Convert payload to a JSON string + patch_json = json.dumps(patch_payload) + + # Construct the oc patch command + # Use --type=merge for simplicity, or --type=json --patch='[{"op": "replace", "path": "/spec/template/spec/providerSpec/value/instanceType", "value": "new_type"}]' + # The merge strategy is often easier for simple field updates. + cmd = f"oc patch machineset {master_machineset_name} -n openshift-machine-api --type=merge -p '{patch_json}'" + + try: + run(cmd) + print(f"Successfully patched machineset {master_machineset_name} with new instance type {new_master_instance_type}.") + except Exception as e: + print(f"Failed to patch machineset {master_machineset_name}: {e}") + raise + +def get_machine_name_from_node(node_name): + """ + Gets the Machine object name associated with a given node name. + """ + cmd = f"oc get machines -n openshift-machine-api -o custom-columns=NAME:.metadata.name,NODE_NAME:.status.nodeRef.name --no-headers" + output = run(cmd) + + machine_name = None + for line in output.splitlines(): + parts = line.split() + if len(parts) == 2 and parts[1] == node_name: + if machine_name: + raise ValueError(f"Found multiple Machine objects for node {node_name}. This is unexpected.") + machine_name = parts[0] + + if not machine_name: + raise ValueError(f"No Machine object found for node {node_name} in openshift-machine-api namespace.") + + return machine_name
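
For quick reference, a minimal usage sketch of the new scenario (assumptions: the cluster already matches the pre-requisites listed in replace_loadedmaster_nodes.sh, KUBECONFIG is set, and mycluster-abc12 is a hypothetical infrastructure name standing in for the real one):

    # The new script is added with mode 100644 (not executable), so invoke it through bash.
    cd perfscale_regression_ci/scripts/scalability
    export CLUSTER_NAME=mycluster-abc12   # hypothetical name; required by replace_master_nodes.py
    export PARAMETERS="aws 3 120"         # cloud provider, replicas, job iterations
    bash ./replace_loadedmaster_nodes.sh

    # The merge patch built by update_master_machineset() in utils/ocp_utils.py is equivalent to:
    oc patch machineset mycluster-abc12-master -n openshift-machine-api --type=merge \
      -p '{"spec": {"template": {"spec": {"providerSpec": {"value": {"instanceType": "m5.4xlarge"}}}}}}'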