From fb88d04d7cca634446c1fb9eb8d395d86ab30f0f Mon Sep 17 00:00:00 2001
From: mehabhalodiya
Date: Wed, 9 Jul 2025 19:50:26 +0530
Subject: [PATCH] OCP 40556 fix

Signed-off-by: mehabhalodiya
---
 .../scalability/replace_loadedmaster_nodes.sh |  66 ++++++++++
 .../replace_loadedmaster_nodes_env.sh         |   6 +
 .../scalability/replace_loadedworker_nodes.sh |   2 +-
 .../scalability/replace_master_nodes.py       |  84 +++++++++++++
 .../clouds/master-node-machineset-aws.yaml    |  78 ++++++++++++
 ...eplacenodes.py => replace_worker_nodes.py} |   5 +-
 .../scripts/scalability/utils/ocp_utils.py    | 114 +++++++++++++++++-
 7 files changed, 349 insertions(+), 6 deletions(-)
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
 create mode 100644 perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml
 rename perfscale_regression_ci/scripts/scalability/{replacenodes.py => replace_worker_nodes.py} (92%)

diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
new file mode 100644
index 00000000000..4a8d169cf55
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+##################################################################################################
+## Auth=mbhalodi@redhat.com
+## Description: Script for replacing master nodes with larger master nodes
+## Polarion test case: OCP-40556 - Replace loaded control plane nodes with larger instances
+## https://polarion.engineering.redhat.com/polarion/#/project/OSE/workitem?id=OCP-40556
+## Customer related: https://access.redhat.com/support/cases/#/case/02864921
+## Pre-requisites:
+### Cluster with 3 m5.4xlarge master nodes and 30 m5.xlarge worker nodes on AWS, or an equivalent size of these profiles:
+#### AWS: IPI-on-AWS-OVN
+#### GCP: IPI-on-GCP-OVN
+#### Azure: IPI-on-Azure-OVN
+#### Alicloud: IPI-on-Alibabacloud-OVN
+## kube-burner config:
+### e2e-benchmarking/workloads/kube-burner/workloads/cluster-density/cluster-density.yml
+## Parameters: cloud provider, number of replicas, and number of JOB_ITERATIONS
+##################################################################################################
+
+source ./replace_loadedmaster_nodes_env.sh
+source ../common.sh
+source ../../utils/run_workload.sh
+
+# If parameters are set from upstream CI, overwrite params
+echo "Upstream PARAMETERS set to $PARAMETERS"
+export params=(${PARAMETERS:-aws 3 120})
+echo "params is $params"
+
+export CLOUD=${params[0]:-"aws"}
+export REPLICAS=${params[1]:-"3"}
+export JOB_ITERATIONS=${params[2]:-"120"}
+
+echo "Testing with $CLOUD $REPLICAS $JOB_ITERATIONS"
+
+# Install dittybopper to check resource usage
+install_dittybopper
+if [ $? -eq 0 ];
+then
+    # Cluster health check prior to testing
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Run workload on current master nodes."
+    run_kube-burner-ocp-wrapper
+    sleep 180
+    echo "Update the master machineset instance type and replace existing machines one at a time."
+    cd ./replace_nodes/clouds
+    . ./${CLOUD}.sh
+    cd ../..
+
+    python ./replace_master_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
+    echo "Existing master machines replaced and new nodes are up."
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Cleanup existing workload namespaces."
+    delete_project_by_label kube-burner-job=$WORKLOAD
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Rerun workload on new machineset."
+    run_kube-burner-ocp-wrapper
+    sleep 180
+    python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
+    echo "Test complete!"
+    echo "Verify test results as defined in Polarion test case."
+    exit 0
+else
+    echo "Failed to install dittybopper."
+    exit 1
+fi
diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
new file mode 100644
index 00000000000..7fa1f317f27
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedmaster_nodes_env.sh
@@ -0,0 +1,6 @@
+export WORKLOAD=cluster-density-v2
+export ITERATIONS=120
+export OPENSHIFT_MASTER_NODE_INSTANCE_TYPE="m5.4xlarge"
+export GC="false"
+export PPROF="false"
+export CHURN="false"
diff --git a/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh b/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
index dcc9ff8f41d..ddc332ad095 100755
--- a/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
+++ b/perfscale_regression_ci/scripts/scalability/replace_loadedworker_nodes.sh
@@ -45,7 +45,7 @@ then
     cd ./replace_nodes/clouds
     . ./${CLOUD}.sh
     cd ../..
-    python ./replacenodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
+    python ./replace_worker_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
     echo "Existing machines scaled down and new nodes are up."
     sleep 180
     python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
diff --git a/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py b/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
new file mode 100644
index 00000000000..8fb09f46023
--- /dev/null
+++ b/perfscale_regression_ci/scripts/scalability/replace_master_nodes.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import datetime
+import utils.ocp_utils as ocp_utils
+import time
+
+n = len(sys.argv)
+print(f"Total arguments passed: {n}")
+
+cloud_type = sys.argv[1]
+num_masters_to_maintain = int(sys.argv[2])
+new_master_instance_type = sys.argv[3] if n > 3 else "m5.4xlarge"  # instance type passed in by the wrapper script
+
+cluster_name = os.environ.get('CLUSTER_NAME')
+if not cluster_name:
+    print("Error: CLUSTER_NAME environment variable not set. 
Please set it before running.") + sys.exit(1) + +master_machineset_name = f"{cluster_name}-master" + +print(f"Preparing to upgrade master nodes to instance type: {new_master_instance_type}") +print(f"Targeting existing master machineset: {master_machineset_name}") + +# --- Phase 1: Update the Existing Master MachineSet Template --- +print(f"\n--- Phase 1: Updating Machineset Template ---") +print(f"Updating instance type of master machineset {master_machineset_name} to {new_master_instance_type}...") +ocp_utils.update_master_machineset(master_machineset_name, new_master_instance_type) +print(f"Master machineset {master_machineset_name} template updated.") + +# --- Phase 2: One-by-One Rolling Replacement --- +print(f"\n--- Phase 2: Starting Rolling Replacement ---") +old_master_node_names = ocp_utils.run("oc get nodes -l node-role.kubernetes.io/master --no-headers -o name").strip().split('\n') +old_master_node_names = [name.split('/')[1].strip() for name in old_master_node_names if name.strip()] + +if len(old_master_node_names) != num_masters_to_maintain: + print(f"Error: Expected {num_masters_to_maintain} masters to replace, but found {len(old_master_node_names)}. Please ensure the cluster is in a stable state before proceeding.") + sys.exit(1) + +old_master_node_names.sort() +print(f"Identified {len(old_master_node_names)} existing master nodes for replacement: {old_master_node_names}") + +print(f"Starting one-by-one rolling replacement for {num_masters_to_maintain} nodes.") + +for i, old_node_to_decommission in enumerate(old_master_node_names): + print(f"\n--- Replacement Iteration {i+1} of {num_masters_to_maintain} ---") + print(f"Selected master node for upgrade: {old_node_to_decommission}") + + try: + # Step 1: Cordon the old node + print(f"Cordoning node {old_node_to_decommission}...") + ocp_utils.run(f"oc adm cordon {old_node_to_decommission}") + + # Step 2: Get the Machine object name for this node + old_machine_name = ocp_utils.get_machine_name_from_node(old_node_to_decommission) + print(f"Found Machine {old_machine_name} for node {old_node_to_decommission}.") + + # Step 3: Delete the Machine object to trigger a replacement + print(f"Deleting Machine {old_machine_name} in namespace openshift-machine-api to trigger replacement...") + ocp_utils.run(f"oc delete machine {old_machine_name} -n openshift-machine-api") + + # Step 4: Wait for a new master node to appear and become ready + # The machineset controller will provision a new machine with the updated instance size. + print(f"Waiting for new master node to appear with instance type {new_master_instance_type} and become Ready...") + # This function should wait for the total number of masters to return to 'num_masters_to_maintain' + # with the correct instance type. + ocp_utils.wait_for_master_node_creation(num_masters_to_maintain, new_master_instance_type) + + # Step 5: Uncordon the node once the new instance is ready + # The new instance will have the same hostname/node name, so we uncordon that name. + print(f"New instance is Ready. Uncordoning node {old_node_to_decommission}...") + ocp_utils.run(f"oc adm uncordon {old_node_to_decommission}") + + print(f"Node {old_node_to_decommission} successfully upgraded to instance type {new_master_instance_type}.") + + except Exception as e: + print(f"Error during upgrade of master node {old_node_to_decommission}: {e}") + print("This is a critical error during master replacement. 
Aborting.") + sys.exit(1) + +print("\n--- Final Master Node Upgrade Completed ---") +ocp_utils.cluster_health_check() +print("--- Final Cluster Health Check Complete ---") diff --git a/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml b/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml new file mode 100644 index 00000000000..6b31368f40c --- /dev/null +++ b/perfscale_regression_ci/scripts/scalability/replace_nodes/clouds/master-node-machineset-aws.yaml @@ -0,0 +1,78 @@ +apiVersion: machine.openshift.io/v1beta1 +kind: MachineSet +metadata: + creationTimestamp: null + labels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machine-role: master + machine.openshift.io/cluster-api-machine-type: master + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + name: ${CLUSTER_NAME}-master-new + namespace: openshift-machine-api +spec: + replicas: 0 # Initial replicas set to 0. The script will scale this up. + selector: + matchLabels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + template: + metadata: + creationTimestamp: null + labels: + machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME} + machine.openshift.io/cluster-api-machine-role: master + machine.openshift.io/cluster-api-machine-type: master + machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new + spec: + metadata: + creationTimestamp: null + labels: + node-role.kubernetes.io/master: "" + providerSpec: + value: + ami: + id: ${AMI_ID} + apiVersion: awsproviderconfig.openshift.io/v1beta1 + canIPForward: false + credentialsSecret: + name: aws-cloud-credentials + deletionProtection: false + disks: + - autoDelete: false + boot: true + image: ${MASTER_MACHINESET_IMAGE} + labels: null + sizeGb: ${OPENSHIFT_MASTER_NODE_VOLUME_SIZE} + type: ${OPENSHIFT_MASTER_NODE_VOLUME_TYPE} + deviceIndex: 0 + iamInstanceProfile: + id: ${CLUSTER_NAME}-master-profile + instanceType: ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE} + kind: AWSMachineProviderConfig + metadata: + creationTimestamp: null + placement: + availabilityZone: ${CLUSTER_REGION}a + region: ${CLUSTER_REGION} + publicIp: true + securityGroups: + - filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-node + - filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-lb + subnet: + filters: + - name: tag:Name + values: + - ${CLUSTER_NAME}-subnet-private-${CLUSTER_REGION}a + tags: + - name: kubernetes.io/cluster/${CLUSTER_NAME}-master + value: owned + userDataSecret: + name: master-user-data + versions: + kubelet: "" \ No newline at end of file diff --git a/perfscale_regression_ci/scripts/scalability/replacenodes.py b/perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py similarity index 92% rename from perfscale_regression_ci/scripts/scalability/replacenodes.py rename to perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py index 6bf00c60f03..164a4f708e1 100755 --- a/perfscale_regression_ci/scripts/scalability/replacenodes.py +++ b/perfscale_regression_ci/scripts/scalability/replace_worker_nodes.py @@ -22,7 +22,7 @@ print('Start time: Scaling up new machineset {} at '.format(final_machine_set) + str(datetime.datetime.now())) ocp_utils.scale_machine_replicas(final_machine_set, replicas) -ocp_utils.wait_for_node_creation(replicas, new_worker_instance_type) +ocp_utils.wait_for_worker_node_creation(replicas, 
new_worker_instance_type) print('End time: Finished scaling up {} at '.format(final_machine_set) + str(datetime.datetime.now())) machines_sets=ocp_utils.run("oc get machinesets -A -o name --no-headers").split('\n') @@ -47,9 +47,8 @@ while int(replicas) >= 1: replicas = int(replicas) - 1 ocp_utils.scale_machine_replicas(machineset, replicas) - ocp_utils.wait_for_node_deletion(machineset, replicas) + ocp_utils.wait_for_worker_node_deletion(machineset, replicas) print('End time: All nodes deleted from {} at '.format(machineset) + str(datetime.datetime.now())) print() ocp_utils.cluster_health_check() #delete_machineset(machineset) #After scaling down do not delete machinesets for this scenario - \ No newline at end of file diff --git a/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py b/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py index e1290ab0b73..6d1318635ff 100644 --- a/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py +++ b/perfscale_regression_ci/scripts/scalability/utils/ocp_utils.py @@ -3,6 +3,7 @@ import subprocess import time import sys +import json def run(command, print_out=False): try: @@ -44,7 +45,7 @@ def scale_machine_replicas(machine_set, replicas): else: break -def wait_for_node_deletion(machine_set, wanted_replicas): +def wait_for_worker_node_deletion(machine_set, wanted_replicas): machine_name = machine_set.split('/')[-1] cmd = "oc get machines -l machine.openshift.io/cluster-api-machineset=%s -n openshift-machine-api --no-headers | grep worker | wc -l" % (machine_name) while True: @@ -71,7 +72,7 @@ def wait_for_node_deletion(machine_set, wanted_replicas): break print() -def wait_for_node_creation(wanted_replicas, new_worker_instance_type): +def wait_for_worker_node_creation(wanted_replicas, new_worker_instance_type): cmd = "oc get nodes -l node.kubernetes.io/instance-type=%s -n openshift-machine-api --no-headers | grep worker | wc -l" % (new_worker_instance_type) while True: try: @@ -97,6 +98,59 @@ def wait_for_node_creation(wanted_replicas, new_worker_instance_type): continue print() +def wait_for_master_node_deletion(machine_set, wanted_replicas): + machine_name = machine_set.split('/')[-1] + cmd = "oc get machines -l machine.openshift.io/cluster-api-machineset=%s -n openshift-machine-api --no-headers | grep control-plane,master | wc -l" % (machine_name) + while True: + try: + replicas=run(cmd) + if "No resources found" in replicas: + print("No resources found") + return + except ValueError as err: + print (err) + continue + else: + break + while int(wanted_replicas) != int(replicas): + print('wanted vs. actual replicas ' + str(wanted_replicas) +" " + str(replicas)) + time.sleep(5) + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + if "No resources found" in replicas: + print("No resources found") + break + print() + +def wait_for_master_node_creation(wanted_replicas, new_master_instance_type): + cmd = "oc get nodes -l node.kubernetes.io/instance-type=%s -n openshift-machine-api --no-headers | grep control-plane,master | wc -l" % (new_master_instance_type) + while True: + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + else: + if "No resources found" in replicas: + continue + else: + break + while int(wanted_replicas) != int(replicas): + print('wanted vs. 
actual replicas ' + str(wanted_replicas) +" " + str(replicas)) + time.sleep(5) + try: + replicas=run(cmd) + except ValueError as err: + print (err) + continue + if "No resources found" in replicas: + print ("No resources found") + continue + print() + def delete_machineset(machineset): cmd = "oc delete %s -n openshift-machine-api" % (machineset) while True: @@ -119,3 +173,59 @@ def cluster_health_check(): print (err) else: break + +def update_master_machineset(master_machineset_name, new_master_instance_type): + """ + Updates the instanceType of the specified master machineset. + :param machineset_name: The name of the master machineset (e.g., "cluster-name-master"). + :param new_instance_type: The new instance type to set (e.g., "m5.4xlarge"). + """ + # Construct the JSON patch payload + patch_payload = { + "spec": { + "template": { + "spec": { + "providerSpec": { + "value": { + "instanceType": new_master_instance_type + } + } + } + } + } + } + + # Convert payload to a JSON string + patch_json = json.dumps(patch_payload) + + # Construct the oc patch command + # Use --type=merge for simplicity, or --type=json --patch='[{"op": "replace", "path": "/spec/template/spec/providerSpec/value/instanceType", "value": "new_type"}]' + # The merge strategy is often easier for simple field updates. + cmd = f"oc patch machineset {master_machineset_name} -n openshift-machine-api --type=merge -p '{patch_json}'" + + try: + run(cmd) + print(f"Successfully patched machineset {master_machineset_name} with new instance type {new_master_instance_type}.") + except Exception as e: + print(f"Failed to patch machineset {master_machineset_name}: {e}") + raise + +def get_machine_name_from_node(node_name): + """ + Gets the Machine object name associated with a given node name. + """ + cmd = f"oc get machines -n openshift-machine-api -o custom-columns=NAME:.metadata.name,NODE_NAME:.status.nodeRef.name --no-headers" + output = run(cmd) + + machine_name = None + for line in output.splitlines(): + parts = line.split() + if len(parts) == 2 and parts[1] == node_name: + if machine_name: + raise ValueError(f"Found multiple Machine objects for node {node_name}. This is unexpected.") + machine_name = parts[0] + + if not machine_name: + raise ValueError(f"No Machine object found for node {node_name} in openshift-machine-api namespace.") + + return machine_name
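
For quick reference, a minimal usage sketch of the new scenario (assumptions: the cluster already matches the pre-requisites listed in replace_loadedmaster_nodes.sh, KUBECONFIG is set, and mycluster-abc12 is a hypothetical infrastructure name standing in for the real one):

    # The new script is added with mode 100644 (not executable), so invoke it through bash.
    cd perfscale_regression_ci/scripts/scalability
    export CLUSTER_NAME=mycluster-abc12   # hypothetical name; required by replace_master_nodes.py
    export PARAMETERS="aws 3 120"         # cloud provider, replicas, job iterations
    bash ./replace_loadedmaster_nodes.sh

    # The merge patch built by update_master_machineset() in utils/ocp_utils.py is equivalent to:
    oc patch machineset mycluster-abc12-master -n openshift-machine-api --type=merge \
      -p '{"spec": {"template": {"spec": {"providerSpec": {"value": {"instanceType": "m5.4xlarge"}}}}}}'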