@@ -0,0 +1,66 @@
#!/bin/bash
##################################################################################################
## Auth=mbhalodi@redhat.com
## Description: Script for replacing loaded master nodes with larger instances
## Polarion test case: OCP-40556 - Replace loaded control plane nodes with larger instances
## https://polarion.engineering.redhat.com/polarion/#/project/OSE/workitem?id=OCP-40556
## Customer related: https://access.redhat.com/support/cases/#/case/02864921
## Pre-requisites:
### Cluster with 3 m5.4xlarge master nodes and 30 m5.xlarge worker nodes on AWS, or the equivalent sizes for these profiles:
#### AWS: IPI-on-AWS-OVN
#### GCP: IPI-on-GCP-OVN
#### Azure: IPI-on-Azure-OVN
#### Alicloud: IPI-on-Alibabacloud-OVN
## kube-burner config:
### e2e-benchmarking/workloads/kube-burner/workloads/cluster-density/cluster-density.yml
## Parameters: Cloud provider, number of replicas and number of JOB_ITERATIONS
###################################################################################################

source ./replace_loadedmaster_nodes_env.sh
source ../common.sh
source ../../utils/run_workload.sh

# If PARAMETERS is set by the upstream CI, it overrides the defaults below
echo "Upstream PARAMETERS set to $PARAMETERS"
params=(${PARAMETERS:-aws 3 120})
echo "params is ${params[*]}"

export CLOUD=${params[0]:-"aws"}
export REPLICAS=${params[1]:-"3"}
export JOB_ITERATIONS=${params[2]:-"120"}
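# Example (assumed CI usage): PARAMETERS="gcp 3 120" would run the same flow
# against a GCP cluster with 3 master replicas and 120 kube-burner job iterations.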

echo "Testing with $CLOUD $REPLICAS $JOB_ITERATIONS"

# Install dittybopper to check resource usage
install_dittybopper

if [ $? -eq 0 ];
then
# Cluster health check prior to testing
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Run workload on current master nodes machineset."
run_kube-burner-ocp-wrapper
sleep 180
echo "Deploy new machineset and scale down one machine at a time from existing machinesets."
cd ./replace_nodes/clouds
. ./${CLOUD}.sh
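# ${CLOUD}.sh is expected to export the cloud-specific values consumed by the
# new master machineset template (e.g. AMI_ID, CLUSTER_REGION, volume size/type);
# the exact variables per cloud live in those scripts and are not shown here.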
cd ../..
python ./replace_master_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
echo "Existing machines scaled down and new nodes are up."
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Cleanup existing workload namespaces."
delete_project_by_label kube-burner-job=$WORKLOAD
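# kube-burner labels the namespaces it creates with kube-burner-job=<job name>,
# so this removes the cluster-density projects left behind by the first run.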
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Rerun workload on new machineset."
run_kube-burner-ocp-wrapper
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Test complete!"
echo "Verify test results as defined in Polarion test case."
exit 0
else
echo "Failed to install dittybopper."
exit 1
fi
@@ -0,0 +1,6 @@
export WORKLOAD=cluster-density-v2
export ITERATIONS=120
export OPENSHIFT_MASTER_NODE_INSTANCE_TYPE="m5.4xlarge"
export GC="false"
export PPROF="false"
export CHURN="false"
@@ -45,7 +45,7 @@ then
cd ./replace_nodes/clouds
. ./${CLOUD}.sh
cd ../..
-python ./replacenodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
+python ./replace_worker_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
echo "Existing machines scaled down and new nodes are up."
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
@@ -0,0 +1,84 @@
#!/usr/bin/env python

import sys
import os
import datetime
import utils.ocp_utils as ocp_utils
import time

n = len(sys.argv)
print(f"Total arguments passed: {n}")

cloud_type = sys.argv[1]
num_masters_to_maintain = int(sys.argv[2])
new_master_instance_type = sys.argv[3] if n > 3 else "m5.4xlarge"

cluster_name = os.environ.get('CLUSTER_NAME')
if not cluster_name:
    print("Error: CLUSTER_NAME environment variable not set. Please set it before running.")
    sys.exit(1)

master_machineset_name = f"{cluster_name}-master"

print(f"Preparing to upgrade master nodes to instance type: {new_master_instance_type}")
print(f"Targeting existing master machineset: {master_machineset_name}")

# --- Phase 1: Update the Existing Master MachineSet Template ---
print(f"\n--- Phase 1: Updating Machineset Template ---")
print(f"Updating instance type of master machineset {master_machineset_name} to {new_master_instance_type}...")
ocp_utils.update_master_machineset(master_machineset_name, new_master_instance_type)
print(f"Master machineset {master_machineset_name} template updated.")

# --- Phase 2: One-by-One Rolling Replacement ---
print(f"\n--- Phase 2: Starting Rolling Replacement ---")
old_master_node_names = ocp_utils.run("oc get nodes -l node-role.kubernetes.io/master --no-headers -o name").strip().split('\n')
old_master_node_names = [name.split('/')[1].strip() for name in old_master_node_names if name.strip()]
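# "oc get nodes ... -o name" returns entries like "node/ip-10-0-x-x.ec2.internal";
# keep only the node name after the "node/" prefix.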

if len(old_master_node_names) != num_masters_to_maintain:
    print(f"Error: Expected {num_masters_to_maintain} masters to replace, but found {len(old_master_node_names)}. Please ensure the cluster is in a stable state before proceeding.")
    sys.exit(1)

old_master_node_names.sort()
print(f"Identified {len(old_master_node_names)} existing master nodes for replacement: {old_master_node_names}")

print(f"Starting one-by-one rolling replacement for {num_masters_to_maintain} nodes.")

for i, old_node_to_decommission in enumerate(old_master_node_names):
    print(f"\n--- Replacement Iteration {i+1} of {num_masters_to_maintain} ---")
    print(f"Selected master node for upgrade: {old_node_to_decommission}")

    try:
        # Step 1: Cordon the old node
        print(f"Cordoning node {old_node_to_decommission}...")
        ocp_utils.run(f"oc adm cordon {old_node_to_decommission}")

        # Step 2: Get the Machine object name for this node
        old_machine_name = ocp_utils.get_machine_name_from_node(old_node_to_decommission)
        print(f"Found Machine {old_machine_name} for node {old_node_to_decommission}.")

        # Step 3: Delete the Machine object to trigger a replacement
        print(f"Deleting Machine {old_machine_name} in namespace openshift-machine-api to trigger replacement...")
        ocp_utils.run(f"oc delete machine {old_machine_name} -n openshift-machine-api")

        # Step 4: Wait for a new master node to appear and become ready
        # The machineset controller will provision a new machine with the updated instance size.
        print(f"Waiting for new master node to appear with instance type {new_master_instance_type} and become Ready...")
        # This function should wait for the total number of masters to return to 'num_masters_to_maintain'
        # with the correct instance type.
        ocp_utils.wait_for_master_node_creation(num_masters_to_maintain, new_master_instance_type)

        # Step 5: Uncordon the node once the new instance is ready
        # The new instance will have the same hostname/node name, so we uncordon that name.
        print(f"New instance is Ready. Uncordoning node {old_node_to_decommission}...")
        ocp_utils.run(f"oc adm uncordon {old_node_to_decommission}")

        print(f"Node {old_node_to_decommission} successfully upgraded to instance type {new_master_instance_type}.")

    except Exception as e:
        print(f"Error during upgrade of master node {old_node_to_decommission}: {e}")
        print("This is a critical error during master replacement. Aborting.")
        sys.exit(1)

print("\n--- Final Master Node Upgrade Completed ---")
ocp_utils.cluster_health_check()
print("--- Final Cluster Health Check Complete ---")
@@ -0,0 +1,78 @@
apiVersion: machine.openshift.io/v1beta1
kind: MachineSet
metadata:
  creationTimestamp: null
  labels:
    machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
    machine.openshift.io/cluster-api-machine-role: master
    machine.openshift.io/cluster-api-machine-type: master
    machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
  name: ${CLUSTER_NAME}-master-new
  namespace: openshift-machine-api
spec:
  replicas: 0 # Initial replicas set to 0. The script will scale this up.
  selector:
    matchLabels:
      machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
      machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
  template:
    metadata:
      creationTimestamp: null
      labels:
        machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
        machine.openshift.io/cluster-api-machine-role: master
        machine.openshift.io/cluster-api-machine-type: master
        machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
    spec:
      metadata:
        creationTimestamp: null
        labels:
          node-role.kubernetes.io/master: ""
      providerSpec:
        value:
          ami:
            id: ${AMI_ID}
          apiVersion: awsproviderconfig.openshift.io/v1beta1
          canIPForward: false
          credentialsSecret:
            name: aws-cloud-credentials
          deletionProtection: false
          disks:
          - autoDelete: false
            boot: true
            image: ${MASTER_MACHINESET_IMAGE}
            labels: null
            sizeGb: ${OPENSHIFT_MASTER_NODE_VOLUME_SIZE}
            type: ${OPENSHIFT_MASTER_NODE_VOLUME_TYPE}
          deviceIndex: 0
          iamInstanceProfile:
            id: ${CLUSTER_NAME}-master-profile
          instanceType: ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
          kind: AWSMachineProviderConfig
          metadata:
            creationTimestamp: null
          placement:
            availabilityZone: ${CLUSTER_REGION}a
            region: ${CLUSTER_REGION}
          publicIp: true
          securityGroups:
          - filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-node
          - filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-lb
          subnet:
            filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-subnet-private-${CLUSTER_REGION}a
          tags:
          - name: kubernetes.io/cluster/${CLUSTER_NAME}-master
            value: owned
          userDataSecret:
            name: master-user-data
          versions:
            kubelet: ""
@@ -22,7 +22,7 @@

print('Start time: Scaling up new machineset {} at '.format(final_machine_set) + str(datetime.datetime.now()))
ocp_utils.scale_machine_replicas(final_machine_set, replicas)
-ocp_utils.wait_for_node_creation(replicas, new_worker_instance_type)
+ocp_utils.wait_for_worker_node_creation(replicas, new_worker_instance_type)
print('End time: Finished scaling up {} at '.format(final_machine_set) + str(datetime.datetime.now()))

machines_sets=ocp_utils.run("oc get machinesets -A -o name --no-headers").split('\n')
@@ -47,9 +47,8 @@
while int(replicas) >= 1:
    replicas = int(replicas) - 1
    ocp_utils.scale_machine_replicas(machineset, replicas)
-    ocp_utils.wait_for_node_deletion(machineset, replicas)
+    ocp_utils.wait_for_worker_node_deletion(machineset, replicas)
print('End time: All nodes deleted from {} at '.format(machineset) + str(datetime.datetime.now()))
print()
ocp_utils.cluster_health_check()
#delete_machineset(machineset) #After scaling down do not delete machinesets for this scenario
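# Reference sketch (assumption): scale_machine_replicas() is defined in
# utils/ocp_utils.py and is expected to wrap something equivalent to
#   oc scale --replicas=<n> machineset/<name> -n openshift-machine-api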
