@@ -0,0 +1,66 @@
#!/bin/bash
##################################################################################################
## Auth=mbhalodi@redhat.com
## Description: Script for replacing loaded master nodes with larger instances
## Polarion test case: OCP-40556 - Replace loaded control plane nodes with larger instances
## https://polarion.engineering.redhat.com/polarion/#/project/OSE/workitem?id=OCP-40556
## Customer related: https://access.redhat.com/support/cases/#/case/02864921
## Pre-requisites:
### Cluster with 3 m5.4xlarge master nodes and 30 m5.xlarge worker nodes on AWS, or the equivalent sizes for these profiles:
#### AWS: IPI-on-AWS-OVN
#### GCP: IPI-on-GCP-OVN
#### Azure: IPI-on-Azure-OVN
#### Alicloud: IPI-on-Alibabacloud-OVN
## kube-burner config:
### e2e-benchmarking/workloads/kube-burner/workloads/cluster-density/cluster-density.yml
## Parameters: Cloud provider, number of replicas and number of JOB_ITERATIONS
###################################################################################################

source ./replace_loadedmaster_nodes_env.sh
source ../common.sh
source ../../utils/run_workload.sh

# If PARAMETERS is set by the upstream CI, it overrides the defaults below
echo "Upstream PARAMETERS set to $PARAMETERS"
params=(${PARAMETERS:-aws 3 120})
echo "params is ${params[*]}"

export CLOUD=${params[0]:-"aws"}
export REPLICAS=${params[1]:-"3"}
export JOB_ITERATIONS=${params[2]:-"120"}
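# Example (assumed CI usage): PARAMETERS="gcp 3 120" would run the same flow
# against a GCP cluster with 3 master replicas and 120 kube-burner job iterations.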

echo "Testing with $CLOUD $REPLICAS $JOB_ITERATIONS"

# Install dittybopper to check resource usage
install_dittybopper

if [ $? -eq 0 ];
then
# Cluster health check prior to testing
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Run workload on current master nodes machineset."
run_kube-burner-ocp-wrapper
sleep 180
echo "Deploy new machineset and scale down one machine at a time from existing machinesets."
cd ./replace_nodes/clouds
. ./${CLOUD}.sh
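# ${CLOUD}.sh is expected to export the cloud-specific values consumed by the
# new master machineset template (e.g. AMI_ID, CLUSTER_REGION, volume size/type);
# the exact variables per cloud live in those scripts and are not shown here.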
cd ../..
python ./replace_master_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
echo "Existing machines scaled down and new nodes are up."
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Cleanup existing workload namespaces."
delete_project_by_label kube-burner-job=$WORKLOAD
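# kube-burner labels the namespaces it creates with kube-burner-job=<job name>,
# so this removes the cluster-density projects left behind by the first run.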
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Rerun workload on new machineset."
run_kube-burner-ocp-wrapper
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
echo "Test complete!"
echo "Verify test results as defined in Polarion test case."
exit 0
else
echo "Failed to install dittybopper."
exit 1
fi
@@ -0,0 +1,6 @@
export WORKLOAD=cluster-density-v2
export ITERATIONS=120
export OPENSHIFT_MASTER_NODE_INSTANCE_TYPE="m5.4xlarge"
export GC="false"
export PPROF="false"
export CHURN="false"
@@ -45,7 +45,7 @@ then
cd ./replace_nodes/clouds
. ./${CLOUD}.sh
cd ../..
-python ./replacenodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
+python ./replace_worker_nodes.py ${CLOUD} ${REPLICAS} ${OPENSHIFT_WORKER_NODE_INSTANCE_TYPE}
echo "Existing machines scaled down and new nodes are up."
sleep 180
python -c "import utils.ocp_utils as ocp_utils; ocp_utils.cluster_health_check()"
@@ -0,0 +1,84 @@
#!/usr/bin/env python

import sys
import os
import datetime
import utils.ocp_utils as ocp_utils
import time

n = len(sys.argv)
print(f"Total arguments passed: {n}")

cloud_type = sys.argv[1]
num_masters_to_maintain = int(sys.argv[2])
new_master_instance_type = sys.argv[3] if n > 3 else "m5.4xlarge"

cluster_name = os.environ.get('CLUSTER_NAME')
if not cluster_name:
    print("Error: CLUSTER_NAME environment variable not set. Please set it before running.")
    sys.exit(1)

master_machineset_name = f"{cluster_name}-master"

print(f"Preparing to upgrade master nodes to instance type: {new_master_instance_type}")
print(f"Targeting existing master machineset: {master_machineset_name}")

# --- Phase 1: Update the Existing Master MachineSet Template ---
print(f"\n--- Phase 1: Updating Machineset Template ---")
print(f"Updating instance type of master machineset {master_machineset_name} to {new_master_instance_type}...")
ocp_utils.update_master_machineset(master_machineset_name, new_master_instance_type)
print(f"Master machineset {master_machineset_name} template updated.")

# --- Phase 2: One-by-One Rolling Replacement ---
print(f"\n--- Phase 2: Starting Rolling Replacement ---")
old_master_node_names = ocp_utils.run("oc get nodes -l node-role.kubernetes.io/master --no-headers -o name").strip().split('\n')
old_master_node_names = [name.split('/')[1].strip() for name in old_master_node_names if name.strip()]
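# "oc get nodes ... -o name" returns entries like "node/ip-10-0-x-x.ec2.internal";
# keep only the node name after the "node/" prefix.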

if len(old_master_node_names) != num_masters_to_maintain:
    print(f"Error: Expected {num_masters_to_maintain} masters to replace, but found {len(old_master_node_names)}. Please ensure the cluster is in a stable state before proceeding.")
    sys.exit(1)

old_master_node_names.sort()
print(f"Identified {len(old_master_node_names)} existing master nodes for replacement: {old_master_node_names}")

print(f"Starting one-by-one rolling replacement for {num_masters_to_maintain} nodes.")

for i, old_node_to_decommission in enumerate(old_master_node_names):
    print(f"\n--- Replacement Iteration {i+1} of {num_masters_to_maintain} ---")
    print(f"Selected master node for upgrade: {old_node_to_decommission}")

    try:
        # Step 1: Cordon the old node
        print(f"Cordoning node {old_node_to_decommission}...")
        ocp_utils.run(f"oc adm cordon {old_node_to_decommission}")

        # Step 2: Get the Machine object name for this node
        old_machine_name = ocp_utils.get_machine_name_from_node(old_node_to_decommission)
        print(f"Found Machine {old_machine_name} for node {old_node_to_decommission}.")

        # Step 3: Delete the Machine object to trigger a replacement
        print(f"Deleting Machine {old_machine_name} in namespace openshift-machine-api to trigger replacement...")
        ocp_utils.run(f"oc delete machine {old_machine_name} -n openshift-machine-api")

        # Step 4: Wait for a new master node to appear and become ready
        # The machineset controller will provision a new machine with the updated instance size.
        print(f"Waiting for new master node to appear with instance type {new_master_instance_type} and become Ready...")
        # This function should wait for the total number of masters to return to 'num_masters_to_maintain'
        # with the correct instance type.
        ocp_utils.wait_for_master_node_creation(num_masters_to_maintain, new_master_instance_type)

        # Step 5: Uncordon the node once the new instance is ready
        # The new instance will have the same hostname/node name, so we uncordon that name.
        print(f"New instance is Ready. Uncordoning node {old_node_to_decommission}...")
        ocp_utils.run(f"oc adm uncordon {old_node_to_decommission}")

        print(f"Node {old_node_to_decommission} successfully upgraded to instance type {new_master_instance_type}.")

    except Exception as e:
        print(f"Error during upgrade of master node {old_node_to_decommission}: {e}")
        print("This is a critical error during master replacement. Aborting.")
        sys.exit(1)

print("\n--- Final Master Node Upgrade Completed ---")
ocp_utils.cluster_health_check()
print("--- Final Cluster Health Check Complete ---")
@@ -0,0 +1,78 @@
apiVersion: machine.openshift.io/v1beta1
kind: MachineSet
metadata:
  creationTimestamp: null
  labels:
    machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
    machine.openshift.io/cluster-api-machine-role: master
    machine.openshift.io/cluster-api-machine-type: master
    machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
  name: ${CLUSTER_NAME}-master-new
  namespace: openshift-machine-api
spec:
  replicas: 0 # Initial replicas set to 0. The script will scale this up.
  selector:
    matchLabels:
      machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
      machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
  template:
    metadata:
      creationTimestamp: null
      labels:
        machine.openshift.io/cluster-api-cluster: ${CLUSTER_NAME}
        machine.openshift.io/cluster-api-machine-role: master
        machine.openshift.io/cluster-api-machine-type: master
        machine.openshift.io/cluster-api-machineset: ${CLUSTER_NAME}-master-new
    spec:
      metadata:
        creationTimestamp: null
        labels:
          node-role.kubernetes.io/master: ""
      providerSpec:
        value:
          ami:
            id: ${AMI_ID}
          apiVersion: awsproviderconfig.openshift.io/v1beta1
          canIPForward: false
          credentialsSecret:
            name: aws-cloud-credentials
          deletionProtection: false
          disks:
          - autoDelete: false
            boot: true
            image: ${MASTER_MACHINESET_IMAGE}
            labels: null
            sizeGb: ${OPENSHIFT_MASTER_NODE_VOLUME_SIZE}
            type: ${OPENSHIFT_MASTER_NODE_VOLUME_TYPE}
          deviceIndex: 0
          iamInstanceProfile:
            id: ${CLUSTER_NAME}-master-profile
          instanceType: ${OPENSHIFT_MASTER_NODE_INSTANCE_TYPE}
          kind: AWSMachineProviderConfig
          metadata:
            creationTimestamp: null
          placement:
            availabilityZone: ${CLUSTER_REGION}a
            region: ${CLUSTER_REGION}
          publicIp: true
          securityGroups:
          - filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-node
          - filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-lb
          subnet:
            filters:
            - name: tag:Name
              values:
              - ${CLUSTER_NAME}-subnet-private-${CLUSTER_REGION}a
          tags:
          - name: kubernetes.io/cluster/${CLUSTER_NAME}-master
            value: owned
          userDataSecret:
            name: master-user-data
          versions:
            kubelet: ""
@@ -22,7 +22,7 @@

print('Start time: Scaling up new machineset {} at '.format(final_machine_set) + str(datetime.datetime.now()))
ocp_utils.scale_machine_replicas(final_machine_set, replicas)
-ocp_utils.wait_for_node_creation(replicas, new_worker_instance_type)
+ocp_utils.wait_for_worker_node_creation(replicas, new_worker_instance_type)
print('End time: Finished scaling up {} at '.format(final_machine_set) + str(datetime.datetime.now()))

machines_sets=ocp_utils.run("oc get machinesets -A -o name --no-headers").split('\n')
@@ -47,9 +47,8 @@
while int(replicas) >= 1:
    replicas = int(replicas) - 1
    ocp_utils.scale_machine_replicas(machineset, replicas)
-    ocp_utils.wait_for_node_deletion(machineset, replicas)
+    ocp_utils.wait_for_worker_node_deletion(machineset, replicas)
print('End time: All nodes deleted from {} at '.format(machineset) + str(datetime.datetime.now()))
print()
ocp_utils.cluster_health_check()
#delete_machineset(machineset) #After scaling down do not delete machinesets for this scenario
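# Reference sketch (assumption): scale_machine_replicas() is defined in
# utils/ocp_utils.py and is expected to wrap something equivalent to
#   oc scale --replicas=<n> machineset/<name> -n openshift-machine-api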
