diff --git a/customization/SageMakerHyperPod/cli_utility/00_setup/.gitignore b/customization/SageMakerHyperPod/cli_utility/00_setup/.gitignore new file mode 100644 index 00000000..f1b27cf2 --- /dev/null +++ b/customization/SageMakerHyperPod/cli_utility/00_setup/.gitignore @@ -0,0 +1,6 @@ +# Generated configuration files +.stack_arn +params.json +env_vars +og_cluster_config.json +updated_cluster_config.json \ No newline at end of file diff --git a/customization/SageMakerHyperPod/cli_utility/00_setup/cleanup_infrastructure.sh b/customization/SageMakerHyperPod/cli_utility/00_setup/cleanup_infrastructure.sh new file mode 100755 index 00000000..e942a3f3 --- /dev/null +++ b/customization/SageMakerHyperPod/cli_utility/00_setup/cleanup_infrastructure.sh @@ -0,0 +1,334 @@ +#!/usr/bin/env bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +# ๐Ÿงน HyperPod EKS Infrastructure Cleanup Script +# +# This script safely removes the HyperPod EKS infrastructure created by create_infrastructure.sh: +# - Deletes CloudFormation stack and all associated resources +# - Removes EKS cluster, VPC, subnets, and security groups +# - Cleans up IAM roles and compute instances +# - Removes local configuration files +# +# Prerequisites: +# - AWS CLI configured with appropriate permissions +# - .stack_arn file from the original deployment +# +# What this script does: +# 1. Validates stack existence and permissions +# 2. Shows resources that will be deleted +# 3. Initiates CloudFormation stack deletion +# 4. Monitors deletion progress +# 5. Cleans up local files + +set -e + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_step() { + echo -e "${BLUE}[STEP]${NC} $1" +} + +# Function to check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Function to prompt user for yes/no +prompt_yes_no() { + while true; do + read -p "$1 (y/n): " yn + case $yn in + [Yy]* ) return 0;; + [Nn]* ) return 1;; + * ) echo "Please answer yes or no.";; + esac + done +} + +echo "==========================================" +echo "๐Ÿงน HyperPod EKS Infrastructure Cleanup" +echo "==========================================" +echo +echo "This script will DELETE all AWS infrastructure created for your HyperPod cluster:" +echo " โ€ข EKS cluster and all workloads" +echo " โ€ข EC2 instances and Auto Scaling groups" +echo " โ€ข VPC, subnets, and networking components" +echo " โ€ข Security groups and IAM roles" +echo " โ€ข Load balancers and storage volumes" +echo +print_warning "โš ๏ธ THIS ACTION CANNOT BE UNDONE!" 
+print_warning "โš ๏ธ ALL DATA AND WORKLOADS WILL BE PERMANENTLY LOST!" +echo + +# Step 1: Validate prerequisites +print_step "Step 1: Validating Prerequisites" + +# Check if AWS CLI is available +if ! command_exists aws; then + print_error "AWS CLI not found. Please install AWS CLI first." + exit 1 +fi + +print_status "AWS CLI found: $(aws --version)" + +# Check if .stack_arn file exists +if [ ! -f .stack_arn ]; then + print_error "Stack ARN file (.stack_arn) not found." + print_error "This file is created by create_infrastructure.sh and contains the stack identifier." + echo + print_status "Alternative: You can manually specify the stack details" + if prompt_yes_no "Do you want to manually enter stack information?"; then + read -p "Enter CloudFormation stack name: " stack_name + read -p "Enter AWS region: " region + stack_identifier="$stack_name" + else + exit 1 + fi +else + # Read stack ARN from file + stack_arn=$(cat .stack_arn) + print_status "Found stack ARN: $stack_arn" + + # Extract region and stack name from ARN + region=$(echo "$stack_arn" | cut -d':' -f4) + stack_name=$(echo "$stack_arn" | cut -d'/' -f2) + stack_identifier="$stack_arn" +fi + +print_status "Stack name: $stack_name" +print_status "Region: $region" + +# Step 2: Validate stack exists +print_step "Step 2: Validating Stack Status" + +print_status "Checking if stack exists and is accessible..." +if ! 
aws cloudformation describe-stacks --stack-name "$stack_identifier" --region "$region" >/dev/null 2>&1; then + print_error "Cannot access stack '$stack_name' in region '$region'" + print_error "Please check:" + print_error " โ€ข Stack name and region are correct" + print_error " โ€ข AWS credentials have proper permissions" + print_error " โ€ข Stack hasn't already been deleted" + exit 1 +fi + +# Get stack status +stack_status=$(aws cloudformation describe-stacks \ + --stack-name "$stack_identifier" \ + --region "$region" \ + --query 'Stacks[0].StackStatus' \ + --output text) + +print_status "Current stack status: $stack_status" + +# Check if stack is in a deletable state +case $stack_status in + DELETE_IN_PROGRESS) + print_warning "Stack deletion is already in progress" + if prompt_yes_no "Monitor the existing deletion process?"; then + # Skip to monitoring step + deletion_in_progress=true + else + exit 0 + fi + ;; + DELETE_COMPLETE) + print_status "Stack has already been deleted" + if prompt_yes_no "Clean up local files?"; then + # Skip to cleanup step + skip_deletion=true + else + exit 0 + fi + ;; + CREATE_IN_PROGRESS|UPDATE_IN_PROGRESS|UPDATE_ROLLBACK_IN_PROGRESS) + print_error "Stack is currently being modified (status: $stack_status)" + print_error "Please wait for the current operation to complete before deleting" + exit 1 + ;; +esac + +# Step 3: Show resources to be deleted +if [ "$skip_deletion" != "true" ] && [ "$deletion_in_progress" != "true" ]; then + print_step "Step 3: Resources to be Deleted" + + print_status "Retrieving stack resources..." 
+ echo + echo "The following AWS resources will be PERMANENTLY DELETED:" + echo + + # List stack resources + aws cloudformation list-stack-resources \ + --stack-name "$stack_identifier" \ + --region "$region" \ + --query 'StackResourceSummaries[*].[ResourceType,LogicalResourceId,PhysicalResourceId]' \ + --output table + + echo + print_warning "๐Ÿ’ฐ Deletion will stop all charges for these resources" + print_warning "๐Ÿ“Š Any data stored in EBS volumes or databases will be lost" + print_warning "๐Ÿ”ง Running workloads and jobs will be terminated" +fi + +# Step 4: Confirm deletion +if [ "$skip_deletion" != "true" ] && [ "$deletion_in_progress" != "true" ]; then + print_step "Step 4: Deletion Confirmation" + + echo + print_warning "โš ๏ธ FINAL WARNING: This will permanently delete all infrastructure!" + print_warning "โš ๏ธ Make sure you have backed up any important data!" + echo + + if ! prompt_yes_no "Are you absolutely sure you want to DELETE the entire HyperPod infrastructure?"; then + print_status "Deletion cancelled by user" + exit 0 + fi + + echo + if ! prompt_yes_no "Type 'yes' to confirm - this is your last chance to cancel"; then + print_status "Deletion cancelled by user" + exit 0 + fi +fi + +# Step 5: Delete stack +if [ "$skip_deletion" != "true" ] && [ "$deletion_in_progress" != "true" ]; then + print_step "Step 5: Initiating Stack Deletion" + + print_status "Starting CloudFormation stack deletion..." + aws cloudformation delete-stack \ + --stack-name "$stack_identifier" \ + --region "$region" + + print_status "Stack deletion initiated successfully" +fi + +# Step 6: Monitor deletion +print_step "Step 6: Monitoring Deletion Progress" + +print_status "Monitoring deletion progress (this may take 15-20 minutes)..." 
+print_status "You can also monitor progress in the AWS CloudFormation console" +echo + +# Function to check stack status +check_stack_status() { + aws cloudformation describe-stacks \ + --stack-name "$1" \ + --region "$2" \ + --query 'Stacks[0].StackStatus' \ + --output text 2>/dev/null || echo "DELETE_COMPLETE" +} + +# Monitor deletion with progress updates +start_time=$(date +%s) +while true; do + status=$(check_stack_status "$stack_identifier" "$region") + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + + # Format elapsed time + elapsed_minutes=$((elapsed_time / 60)) + elapsed_seconds=$((elapsed_time % 60)) + + case $status in + DELETE_COMPLETE) + print_status "Stack deleted successfully after ${elapsed_minutes}m ${elapsed_seconds}s!" + break + ;; + DELETE_IN_PROGRESS) + echo -ne "\r\033[K${GREEN}[INFO]${NC} Stack deletion in progress... (${elapsed_minutes}m ${elapsed_seconds}s elapsed)" + sleep 30 + ;; + DELETE_FAILED) + print_error "Stack deletion failed after ${elapsed_minutes}m ${elapsed_seconds}s" + print_error "Some resources may need manual cleanup" + print_error "Check the AWS CloudFormation console for error details" + exit 1 + ;; + *) + print_error "Unexpected stack status: $status" + exit 1 + ;; + esac +done + +# Step 7: Clean up local files +print_step "Step 7: Cleaning Up Local Files" + +echo +print_status "Cleaning up local configuration files..." 
+ +files_to_clean=( + ".stack_arn" + "params.json" + "og_cluster_config.json" + "updated_cluster_config.json" + "env_vars" +) + +cleaned_files=() +for file in "${files_to_clean[@]}"; do + if [ -f "$file" ]; then + rm -f "$file" + cleaned_files+=("$file") + fi +done + +if [ ${#cleaned_files[@]} -gt 0 ]; then + print_status "Removed local files:" + for file in "${cleaned_files[@]}"; do + echo " โ€ข $file" + done +else + print_status "No local files found to clean up" +fi + +# Final completion message +echo +print_status "==========================================" +print_status "โœ… Infrastructure cleanup completed!" +print_status "==========================================" +echo +print_status "What was deleted:" +echo " โ€ข CloudFormation stack and all resources" +echo " โ€ข EKS cluster and compute instances" +echo " โ€ข VPC, subnets, and networking components" +echo " โ€ข IAM roles and security groups" +echo " โ€ข Local configuration files" +echo +print_status "๐Ÿ’ฐ All AWS charges for these resources have stopped" +print_status "๐Ÿ”„ You can run create_infrastructure.sh again to recreate the infrastructure" +echo +print_warning "Note: If you had any persistent data (EFS, databases), verify it was backed up" diff --git a/customization/SageMakerHyperPod/cli_utility/00_setup/create_cfn_stack.sh b/customization/SageMakerHyperPod/cli_utility/00_setup/create_cfn_stack.sh index eb698479..7f49a0c2 100755 --- a/customization/SageMakerHyperPod/cli_utility/00_setup/create_cfn_stack.sh +++ b/customization/SageMakerHyperPod/cli_utility/00_setup/create_cfn_stack.sh @@ -1,7 +1,40 @@ #!/usr/bin/env bash -# CloudFormation Stack Creation Script for HyperPod EKS -# This script handles the initial setup and CloudFormation stack creation +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# ๐Ÿš€ HyperPod EKS Infrastructure Setup Script +# +# This script creates the foundational AWS infrastructure for HyperPod clusters: +# - Sets up EKS cluster for Kubernetes orchestration +# - Creates VPC, subnets, and networking components +# - Configures IAM roles and security groups +# - Provisions initial compute instances for ML workloads +# +# Prerequisites: +# - AWS CLI configured with appropriate permissions +# - Sudo access for installing required tools +# - Internet connection for downloading dependencies +# +# What this script does: +# 1. Installs required tools (jq, AWS CLI) +# 2. Collects configuration parameters interactively +# 3. Downloads CloudFormation template from AWS samples +# 4. Creates and monitors CloudFormation stack deployment +# 5. 
Saves stack information for subsequent setup steps set -e @@ -62,24 +95,33 @@ prompt_input() { } echo "==========================================" -echo "HyperPod EKS CloudFormation Stack Creation" +echo "๐Ÿš€ HyperPod EKS Infrastructure Setup" echo "==========================================" echo - +echo "This script will create the foundational AWS infrastructure for your HyperPod cluster." +echo "The process typically takes 20-30 minutes and includes:" +echo " โ€ข EKS cluster creation" +echo " โ€ข VPC and networking setup" +echo " โ€ข IAM roles and security configuration" +echo " โ€ข Initial compute instance provisioning" +echo +print_warning "Sudo access required for installing system dependencies (jq, AWS CLI)" # Check if we have sudo access if ! sudo -v >/dev/null 2>&1; then print_error "This script requires sudo access for installing packages. Please run with sudo privileges." exit 1 fi -# Step 0: Installations -print_step "Step 0: Checking and Installing Required Tools" +# Step 0: Prerequisites Installation +print_step "Step 0: Installing Required Dependencies" +echo "Installing tools needed for CloudFormation deployment and JSON processing..." +echo -# 0.0 Install jq -print_status "Checking jq installation..." +# 0.0 Install jq (JSON processor) +print_status "Checking jq installation (needed for JSON parameter processing)..." if ! command_exists jq; then - print_warning "jq not found. Installing..." - if prompt_yes_no "Do you want to install jq?"; then + print_warning "jq not found. This tool is required for processing CloudFormation parameters." + if prompt_yes_no "Install jq JSON processor?"; then if [ "$(uname)" == "Darwin" ]; then brew install jq else @@ -95,10 +137,10 @@ else fi # 0.1 Install AWS CLI -print_status "Checking AWS CLI installation..." +print_status "Checking AWS CLI installation (required for CloudFormation operations)..." if ! command_exists aws; then - print_warning "AWS CLI not found. Installing..." 
- if prompt_yes_no "Do you want to install AWS CLI?"; then + print_warning "AWS CLI not found. This is required for creating AWS infrastructure." + if prompt_yes_no "Install AWS CLI version 2?"; then curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" unzip awscliv2.zip sudo ./aws/install --update @@ -113,7 +155,10 @@ else fi echo -print_step "Step 1: Configuration Parameters" +print_step "Step 1: Infrastructure Configuration" +echo "Configure your HyperPod cluster settings. Press Enter to use defaults or provide custom values." +echo "Tip: For production workloads, consider using larger instance types and counts." +echo # Initialize parameter arrays param_keys=( @@ -156,26 +201,19 @@ param_defaults=( ) param_descriptions=( - "Create new EKS cluster stack" - "Name of the EKS cluster" - # "Security Group ID for the cluster" - # "Create new VPC stack" - # "VPC ID to use" - # "NAT Gateway ID" - "NAT Recovery" - "HyperPod cluster name" - "Prefix for resource names" - "Availability Zone ID" - # "AcceleratedThreadsPerCore" - # "AcceleratedLifeCycleConfigOnCreate" - # "AcceleratedInstanceGroupName" - # "EnableInstanceStressCheck" - "Instance type for accelerated instances" - "Number of accelerated instances" - "Create general purpose instance group" + "๐Ÿง Create new EKS cluster stack (recommended: true for new setups)" + "๐Ÿ“ EKS cluster name (will be used for kubectl configuration)" + "๐Ÿ”„ Node recovery strategy (None/Automatic - how to handle failed nodes)" + "๐Ÿท๏ธ HyperPod cluster name (unique identifier for your ML cluster)" + "๐Ÿ’ผ Resource name prefix (helps organize AWS resources)" + "๐ŸŒ Availability Zone ID (e.g., use1-az2 for us-east-1b)" + "โšก GPU instance type (ml.g5.8xlarge=8 A10G GPUs, ml.p5.48xlarge=8 H100 GPUs)" + "๐Ÿ“Š Number of GPU instances to launch initially" + "๐Ÿ› ๏ธ Create general purpose instances (false saves costs, true adds CPU nodes)" ) -print_status "Please review and configure the following 
parameters:" +print_status "Configure your HyperPod infrastructure parameters:" +echo "Each parameter has a recommended default value. Customize as needed for your use case." echo # Create a temporary file to store parameters @@ -225,17 +263,34 @@ if ! prompt_yes_no "Do the parameters look correct?"; then exit 1 fi -# Step 3: Download and run CloudFormation stack -print_step "Step 3: Downloading CloudFormation template and creating stack" +# Step 3: Infrastructure Deployment +print_step "Step 3: Deploying AWS Infrastructure" +echo "Downloading the official AWS CloudFormation template for HyperPod EKS integration..." +echo "This template creates a complete ML infrastructure stack including:" +echo " โ€ข EKS cluster with managed node groups" +echo " โ€ข VPC with public/private subnets" +echo " โ€ข Security groups and IAM roles" +echo " โ€ข HyperPod cluster configuration" +echo -print_status "Downloading main-stack.yaml..." +print_status "Downloading CloudFormation template from AWS samples repository..." curl -O https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml -stack_name=$(prompt_input "Enter stack name" "hp-eks-test-stack") -region=$(prompt_input "Enter AWS region" "us-east-1") +echo +print_status "Final deployment configuration:" +stack_name=$(prompt_input "Enter CloudFormation stack name (must be unique in your account)" "hp-eks-test-stack") +region=$(prompt_input "Enter AWS region (where your infrastructure will be created)" "us-east-1") +echo +print_status "Stack will be created in region: $region" +print_status "Stack name: $stack_name" -print_status "Creating CloudFormation stack: $stack_name" -if prompt_yes_no "Proceed with stack creation?"; then +echo +print_warning "IMPORTANT: Stack creation will incur AWS charges for the resources created." 
+print_status "Ready to create CloudFormation stack: $stack_name" +print_status "Estimated deployment time: 20-30 minutes" +print_status "Resources that will be created: EKS cluster, EC2 instances, VPC, IAM roles" +echo +if prompt_yes_no "Proceed with infrastructure deployment?"; then # Create the stack and capture the stack ARN stack_arn=$(aws cloudformation create-stack \ --stack-name "$stack_name" \ @@ -246,8 +301,12 @@ if prompt_yes_no "Proceed with stack creation?"; then --query 'StackId' \ --output text) - print_status "Stack creation initiated with ARN: $stack_arn" - print_status "Waiting for stack to complete (this may take 20-30 minutes)..." + print_status "Stack deployment initiated successfully!" + print_status "Stack ARN: $stack_arn" + echo + print_status "Monitoring deployment progress (this typically takes 20-30 minutes)..." + print_status "You can also monitor progress in the AWS CloudFormation console" + echo # Function to check stack status check_stack_status() { @@ -292,12 +351,194 @@ if prompt_yes_no "Proceed with stack creation?"; then # Save the stack ARN to a file for future reference echo "$stack_arn" > .stack_arn - print_status "Stack ARN saved to .stack_arn" + print_status "Stack ARN saved to .stack_arn file for next setup step" else print_warning "Stack creation skipped." fi -print_status "==========================================" -print_status "CloudFormation stack creation completed!" -print_status "Next step: Run create_hp_cluster.sh to configure the cluster" -print_status "==========================================" +# Function to display infrastructure status +display_infrastructure_status() { + local stack_name="$1" + local region="$2" + + echo + print_step "Infrastructure Status Report" + echo "Retrieving current infrastructure information from AWS..." + echo + + # Check if stack exists + if ! 
aws cloudformation describe-stacks --stack-name "$stack_name" --region "$region" >/dev/null 2>&1; then + print_warning "Stack '$stack_name' not found or not accessible" + return 1 + fi + + # Get stack outputs + print_status "๐Ÿ“‹ CloudFormation Stack Information:" + aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'Stacks[0].{StackName:StackName,Status:StackStatus,CreationTime:CreationTime}' \ + --output table + + # Get EKS cluster info + eks_cluster_name=$(aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'Stacks[0].Outputs[?OutputKey==`EKSClusterName`].OutputValue' \ + --output text 2>/dev/null || echo "N/A") + + if [ "$eks_cluster_name" != "N/A" ] && [ -n "$eks_cluster_name" ]; then + echo + print_status "๐Ÿ—๏ธ EKS Cluster Information:" + aws eks describe-cluster \ + --name "$eks_cluster_name" \ + --region "$region" \ + --query 'cluster.{Name:name,Status:status,Version:version,Endpoint:endpoint,CreatedAt:createdAt}' \ + --output table 2>/dev/null || print_warning "EKS cluster details not accessible" + + echo + print_status "๐Ÿ‘ฅ EKS Node Groups:" + aws eks list-nodegroups \ + --cluster-name "$eks_cluster_name" \ + --region "$region" \ + --query 'nodegroups' \ + --output table 2>/dev/null || print_warning "Node groups not accessible" + fi + + # Get VPC info + vpc_id=$(aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'Stacks[0].Outputs[?OutputKey==`VpcId`].OutputValue' \ + --output text 2>/dev/null || echo "N/A") + + if [ "$vpc_id" != "N/A" ] && [ -n "$vpc_id" ]; then + echo + print_status "๐ŸŒ VPC Information:" + aws ec2 describe-vpcs \ + --vpc-ids "$vpc_id" \ + --region "$region" \ + --query 'Vpcs[0].{VpcId:VpcId,CidrBlock:CidrBlock,State:State}' \ + --output table 2>/dev/null || print_warning "VPC details not accessible" + + echo + print_status "๐Ÿ”— Subnets:" + aws ec2 describe-subnets \ + --filters 
"Name=vpc-id,Values=$vpc_id" \ + --region "$region" \ + --query 'Subnets[].{SubnetId:SubnetId,CidrBlock:CidrBlock,AvailabilityZone:AvailabilityZone,Type:Tags[?Key==`Name`].Value|[0]}' \ + --output table 2>/dev/null || print_warning "Subnet details not accessible" + fi + + # Get EC2 instances + echo + print_status "๐Ÿ’ป EC2 Instances (HyperPod nodes):" + aws ec2 describe-instances \ + --region "$region" \ + --filters "Name=tag:aws:cloudformation:stack-name,Values=$stack_name" "Name=instance-state-name,Values=running,pending,stopping,stopped" \ + --query 'Reservations[].Instances[].{InstanceId:InstanceId,InstanceType:InstanceType,State:State.Name,LaunchTime:LaunchTime,PrivateIpAddress:PrivateIpAddress}' \ + --output table 2>/dev/null || print_warning "EC2 instances not accessible" + + # Get SageMaker HyperPod cluster info + hyperpod_cluster_name=$(aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'Stacks[0].Outputs[?OutputKey==`HyperPodClusterName`].OutputValue' \ + --output text 2>/dev/null || echo "N/A") + + if [ "$hyperpod_cluster_name" != "N/A" ] && [ -n "$hyperpod_cluster_name" ]; then + echo + print_status "๐Ÿš€ SageMaker HyperPod Cluster:" + aws sagemaker describe-cluster \ + --cluster-name "$hyperpod_cluster_name" \ + --region "$region" \ + --query '{ClusterName:ClusterName,ClusterStatus:ClusterStatus,CreationTime:CreationTime,InstanceGroups:InstanceGroups[].{GroupName:InstanceGroupName,InstanceType:InstanceType,InstanceCount:CurrentCount}}' \ + --output table 2>/dev/null || print_warning "HyperPod cluster details not accessible" + fi + + # Get IAM roles created by the stack + echo + print_status "๐Ÿ” IAM Roles:" + aws cloudformation list-stack-resources \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'StackResourceSummaries[?ResourceType==`AWS::IAM::Role`].{LogicalId:LogicalResourceId,PhysicalId:PhysicalResourceId,Status:ResourceStatus}' \ + --output table 2>/dev/null || print_warning "IAM 
roles not accessible" + + # Get security groups + echo + print_status "๐Ÿ›ก๏ธ Security Groups:" + aws cloudformation list-stack-resources \ + --stack-name "$stack_name" \ + --region "$region" \ + --query 'StackResourceSummaries[?ResourceType==`AWS::EC2::SecurityGroup`].{LogicalId:LogicalResourceId,PhysicalId:PhysicalResourceId,Status:ResourceStatus}' \ + --output table 2>/dev/null || print_warning "Security groups not accessible" + + # Summary of installed tools + echo + print_status "๐Ÿ› ๏ธ Installed Tools Summary:" + printf "%-20s %-30s %-10s\n" "Tool" "Version" "Status" + printf "%-20s %-30s %-10s\n" "----" "-------" "------" + + # Check jq + if command_exists jq; then + jq_version=$(jq --version 2>/dev/null || echo "Unknown") + printf "%-20s %-30s %-10s\n" "jq" "$jq_version" "โœ… Installed" + else + printf "%-20s %-30s %-10s\n" "jq" "N/A" "โŒ Missing" + fi + + # Check AWS CLI + if command_exists aws; then + aws_version=$(aws --version 2>/dev/null | cut -d' ' -f1 || echo "Unknown") + printf "%-20s %-30s %-10s\n" "AWS CLI" "$aws_version" "โœ… Installed" + else + printf "%-20s %-30s %-10s\n" "AWS CLI" "N/A" "โŒ Missing" + fi + + # Check kubectl (if available) + if command_exists kubectl; then + kubectl_version=$(kubectl version --client --short 2>/dev/null | cut -d' ' -f3 || echo "Unknown") + printf "%-20s %-30s %-10s\n" "kubectl" "$kubectl_version" "โœ… Installed" + else + printf "%-20s %-30s %-10s\n" "kubectl" "N/A" "โš ๏ธ Not installed" + fi + + echo + print_status "๐Ÿ“Š Resource Cost Estimation:" + echo "Note: Use AWS Pricing Calculator for detailed cost estimates" + echo "https://calculator.aws" +} + +echo +print_status "===========================================" +print_status "โœ… Infrastructure deployment completed successfully!" 
+print_status "===========================================" +echo +print_status "What was created:" +echo " โ€ข EKS cluster for Kubernetes orchestration" +echo " โ€ข VPC with secure networking configuration" +echo " โ€ข IAM roles with appropriate permissions" +echo " โ€ข Initial compute instances for ML workloads" +echo + +# Display comprehensive infrastructure status +if [ -n "$stack_name" ] && [ -n "$region" ]; then + display_infrastructure_status "$stack_name" "$region" +fi + +echo +print_status "Next Steps:" +echo " 1. Run './create_hp_cluster.sh' to configure the HyperPod cluster" +echo " 2. This will install kubectl, eksctl, and helm" +echo " 3. Configure cluster access and restricted instance groups" +echo +print_status "Files created:" +echo " โ€ข .stack_arn - Contains your CloudFormation stack ARN" +echo " โ€ข params.json - Your infrastructure configuration" +echo " โ€ข main-stack.yaml - CloudFormation template" +echo +print_status "Cleanup:" +echo " โ€ข Run './cleanup_infrastructure.sh' to delete all infrastructure when done" +echo +print_warning "Keep the .stack_arn file - it's needed for the next setup step!" diff --git a/customization/SageMakerHyperPod/cli_utility/00_setup/create_hyperpod_clusters.sh b/customization/SageMakerHyperPod/cli_utility/00_setup/create_hyperpod_clusters.sh new file mode 100755 index 00000000..e433c93c --- /dev/null +++ b/customization/SageMakerHyperPod/cli_utility/00_setup/create_hyperpod_clusters.sh @@ -0,0 +1,575 @@ +#!/usr/bin/env bash + +# ============================================================================== +# Amazon SageMaker HyperPod Cluster Configuration Script +# ============================================================================== +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# ============================================================================== +# DESCRIPTION: +# This script configures an Amazon SageMaker HyperPod cluster with EKS integration. +# It handles the complete setup process including tool installation, cluster +# configuration, and restricted instance group creation. +# +# PREREQUISITES: +# - AWS CLI installed and configured with appropriate credentials +# - CloudFormation stack created (run create_infrastructure.sh first) +# - .stack_arn file present (created by create_infrastructure.sh) +# - sagemaker-2017-07-24.normal.json service model file +# - create_config.sh script in the same directory +# +# WHAT THIS SCRIPT DOES: +# 1. Installs required tools (kubectl, eksctl, helm) if missing +# 2. Configures environment variables from CloudFormation stack +# 3. Updates kubeconfig for EKS cluster access +# 4. Configures SageMaker service model for HyperPod operations +# 5. Retrieves existing cluster configuration +# 6. Creates new restricted instance groups with custom settings +# 7. 
# 7. Updates IAM roles with necessary permissions
# 8. Applies the new configuration to the HyperPod cluster
# 9. Verifies the configuration update
#
# USAGE:
#   ./create_hyperpod_clusters.sh
#
# OUTPUT FILES:
#   - env_vars: Environment variables for the session
#   - og_cluster_config.json: Original cluster configuration backup
#   - updated_cluster_config.json: New cluster configuration
#
# AUTHOR: AWS Solutions Team
# VERSION: 1.0
# ==============================================================================

# Exit on the first unhandled error; fail a pipeline if any stage fails.
set -eo pipefail

# Color codes for enhanced user experience
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Enhanced output functions with better formatting.
# Each prints a single tagged, colorized line to stdout.
print_status() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_step() {
    echo -e "${BLUE}[STEP]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_header() {
    echo -e "${PURPLE}$1${NC}"
}

# Return 0 if the named command is available on PATH.
command_exists() {
    command -v "$1" >/dev/null 2>&1
}

# Prompt until the user answers yes or no.
# Returns: 0 for yes, 1 for no.
prompt_yes_no() {
    while true; do
        # -r: do not let read mangle backslashes in the answer
        read -rp "$1 (y/n): " yn
        case $yn in
            [Yy]* ) return 0;;
            [Nn]* ) return 1;;
            * ) echo "Please answer yes or no.";;
        esac
    done
}

# Prompt for a value with an optional default.
# Arguments: $1 - prompt text, $2 - default value (may be empty)
# Outputs:   the entered value (or the default) on stdout.
prompt_input() {
    local prompt="$1"
    local default="$2"
    local input

    if [ -n "$default" ]; then
        read -rp "$prompt [$default]: " input
        echo "${input:-$default}"
    else
        read -rp "$prompt: " input
        echo "$input"
    fi
}

# Display script header
echo
print_header "==============================================================================="
print_header "🚀 Amazon SageMaker HyperPod Cluster Configuration"
print_header "==============================================================================="
echo
print_status "This script will configure your HyperPod cluster with the following steps:"
echo "   📦 Install required tools (kubectl, eksctl, helm)"
echo "   🔧 Configure environment variables and AWS settings"
echo "   ⚙️ Update Kubernetes configuration for EKS access"
echo "   🔐 Configure SageMaker service model and IAM permissions"
echo "   📋 Retrieve and update cluster configuration"
echo "   🎯 Create restricted instance groups with custom settings"
echo "   ✅ Verify and apply the new configuration"
echo
print_warning "⚠️ Ensure you have run create_infrastructure.sh successfully before proceeding"
echo

# Check prerequisites with detailed error messages
print_step "Validating Prerequisites"

if [ ! -f .stack_arn ]; then
    print_error "Stack ARN file (.stack_arn) not found."
    print_error "This file is created by create_infrastructure.sh and contains the CloudFormation stack identifier."
    # BUG FIX: error message previously said "ccreate_infrastructure.sh"
    print_error "Please run create_infrastructure.sh first to create the required infrastructure."
    exit 1
fi

if [ ! -f create_config.sh ]; then
    print_error "create_config.sh not found in current directory."
    print_error "This script is required to generate environment variables."
    exit 1
fi

# jq is used extensively below to build the new cluster configuration;
# fail fast with a clear message instead of mid-script.
if ! command_exists jq; then
    print_error "jq is required but not installed. Please install jq and re-run."
    exit 1
fi

print_success "All prerequisite files found!"
echo

# Step 1: Install required tools
print_step "Step 1: Installing Required Tools"

# Install kubectl (pinned to the Amazon EKS 1.30 distribution)
print_status "Checking kubectl installation..."
if ! command_exists kubectl; then
    print_warning "kubectl not found. Installing..."
    if prompt_yes_no "Do you want to install kubectl?"; then
        curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.30.4/2024-09-11/bin/linux/amd64/kubectl
        chmod +x ./kubectl
        mkdir -p "$HOME/bin" && cp ./kubectl "$HOME/bin/kubectl" && export PATH="$HOME/bin:$PATH"
        echo 'export PATH=$HOME/bin:$PATH' >> ~/.bashrc
        rm ./kubectl
        print_success "kubectl installed successfully"
    else
        print_error "kubectl is required. Exiting."
        exit 1
    fi
else
    print_success "kubectl is already installed: $(kubectl version --client --short 2>/dev/null || echo 'version check failed')"
fi

# Install eksctl (latest release tarball for this platform)
print_status "Checking eksctl installation..."
if ! command_exists eksctl; then
    print_warning "eksctl not found. Installing..."
    if prompt_yes_no "Do you want to install eksctl?"; then
        ARCH=amd64
        PLATFORM="$(uname -s)_$ARCH"
        curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz"
        tar -xzf "eksctl_$PLATFORM.tar.gz" -C /tmp && rm "eksctl_$PLATFORM.tar.gz"
        sudo mv /tmp/eksctl /usr/local/bin
        print_success "eksctl installed successfully"
    else
        print_error "eksctl is required. Exiting."
        exit 1
    fi
else
    print_success "eksctl is already installed: $(eksctl version)"
fi

# Install helm via the official installer script
print_status "Checking helm installation..."
if ! command_exists helm; then
    print_warning "helm not found. Installing..."
    if prompt_yes_no "Do you want to install helm?"; then
        curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
        chmod 700 get_helm.sh
        ./get_helm.sh
        rm get_helm.sh
        print_success "helm installed successfully"
    else
        print_error "helm is required. Exiting."
        exit 1
    fi
else
    print_success "helm is already installed: $(helm version --short)"
fi

# Step 2: Set environment variables
print_step "Step 2: Setting Environment Variables"

# Parse stack name and region out of the CloudFormation stack ARN
# (format: arn:aws:cloudformation:<region>:<account>:stack/<name>/<uuid>).
stack_arn=$(cat .stack_arn)
stack_name=$(echo "$stack_arn" | cut -d'/' -f2)
region=$(echo "$stack_arn" | cut -d':' -f4)

export STACK_ID="$stack_name"
export AWS_REGION="$region"

print_status "Environment variables set:"
echo "   • STACK_ID=$STACK_ID"
echo "   • AWS_REGION=$AWS_REGION"

# Step 3: Configure environment
print_step "Step 3: Configuring Environment"

print_status "Running create_config.sh to generate environment variables..."
./create_config.sh

# Source the environment variables produced by create_config.sh;
# guard against the file not having been generated.
print_status "Loading environment variables..."
if [ ! -f env_vars ]; then
    print_error "env_vars was not generated by create_config.sh. Cannot continue."
    exit 1
fi
source env_vars

# Step 4: Update kubeconfig
print_step "Step 4: Updating Kubernetes Configuration"

if [ -n "$EKS_CLUSTER_NAME" ]; then
    print_status "Updating kubeconfig for EKS cluster: $EKS_CLUSTER_NAME"
    aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION"
    print_success "Kubernetes configuration updated successfully!"
else
    print_error "EKS_CLUSTER_NAME not set. Please check create_config.sh output."
    exit 1
fi

# Step 5: Install SageMaker Python package
print_step "Step 5: Installing SageMaker Python Package"

print_status "Checking SageMaker Python package installation..."
if ! python3 -c "import sagemaker" 2>/dev/null; then
    print_warning "SageMaker Python package not found. Installing..."
    if prompt_yes_no "Do you want to install the SageMaker Python package?"; then
        # Prefer pipx (isolated environment) when available; otherwise pip3.
        # BUG FIX: the previous macOS branch ran "brew install sagemaker",
        # but no such Homebrew formula exists -- pip3 works on macOS too
        # (typically installing into brew's Python).
        if command_exists pipx; then
            print_status "Installing SageMaker via pipx (isolated environment)..."
            pipx install sagemaker
        else
            print_status "Installing SageMaker via pip3..."
            pip3 install sagemaker
        fi
        print_success "SageMaker Python package installed successfully!"
    else
        print_error "SageMaker Python package is required. Exiting."
        exit 1
    fi
else
    print_success "SageMaker Python package is already installed"
fi

# Step 6: Get cluster configuration
print_step "Step 6: Retrieving Cluster Configuration"

print_status "Retrieving existing HyperPod cluster configuration..."
if [ -z "$HYPERPOD_CLUSTER_NAME" ]; then
    print_error "HYPERPOD_CLUSTER_NAME is not set. Please check if env_vars was sourced correctly."
    exit 1
fi

print_status "Using HyperPod cluster: $HYPERPOD_CLUSTER_NAME"
# describe-cluster accepts either a name or an ARN. Prefer the ARN when the
# name is not already one; fall back to the plain name if HYPERPOD_CLUSTER_ARN
# is unset so we never pass an empty identifier to the AWS CLI.
if [[ "$HYPERPOD_CLUSTER_NAME" == arn:* ]]; then
    cluster_identifier="$HYPERPOD_CLUSTER_NAME"
else
    cluster_identifier="${HYPERPOD_CLUSTER_ARN:-$HYPERPOD_CLUSTER_NAME}"
fi

if ! aws sagemaker describe-cluster --cluster-name "$cluster_identifier" --region "$AWS_REGION" > og_cluster_config.json; then
    print_error "Failed to retrieve cluster configuration."
    print_error "Please verify the cluster exists and you have proper permissions."
    exit 1
fi

print_success "Original cluster configuration saved to og_cluster_config.json"

# Step 7: Create new cluster configuration
print_step "Step 7: Configuring Restricted Instance Groups"

print_status "Creating new cluster configuration with restricted instance groups..."
echo
print_status "📝 Please provide configuration for the restricted instance group:"

# Get values for RestrictedInstanceGroups
restricted_instance_count=$(prompt_input "Instance count for restricted group" "2")
restricted_instance_type=$(prompt_input "Instance type for restricted group" "ml.p5.48xlarge")

# Validate early: this value is later passed to jq via --argjson, where a
# non-numeric string would produce a cryptic parse error.
if ! [[ "$restricted_instance_count" =~ ^[0-9]+$ ]]; then
    print_error "Instance count must be a positive integer (got: $restricted_instance_count)."
    exit 1
fi

if ! jq -e '.InstanceGroups[0].ExecutionRole' og_cluster_config.json >/dev/null 2>&1; then
    print_error "Invalid cluster configuration. Could not find ExecutionRole in og_cluster_config.json"
    exit 1
fi

# Get execution role from original config and verify it exists
original_execution_role=$(jq -r '.InstanceGroups[0].ExecutionRole' og_cluster_config.json)
if [ -z "$original_execution_role" ] || [ "$original_execution_role" = "null" ]; then
    print_error "Could not find ExecutionRole in original configuration"
    exit 1
fi

print_status "Configuring IAM permissions for execution role..."
# The role name is the last path segment of the role ARN.
role_name=$(basename "$original_execution_role")
if ! aws iam get-role --role-name "$role_name" > /dev/null 2>&1; then
    print_error "Could not find execution role. Please ensure the role exists."
    exit 1
fi

# Update trust relationship so the SageMaker service can assume the role.
print_status "Updating IAM trust relationship for role: $role_name"
trust_policy='{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}'

if ! aws iam update-assume-role-policy --role-name "$role_name" --policy-document "$trust_policy"; then
    print_error "Failed to update trust relationship. Please ensure you have sufficient permissions."
    exit 1
fi

# Attach an inline policy granting the permissions HyperPod needs.
print_status "Updating IAM role permissions..."
role_policy='{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeSubnets",
                "ec2:DescribeVpcs",
                "ec2:DescribeSecurityGroups",
                "iam:PassRole",
                "sagemaker:*"
            ],
            "Resource": "*"
        }
    ]
}'

if ! aws iam put-role-policy --role-name "$role_name" --policy-name "HyperPodClusterPolicy" --policy-document "$role_policy"; then
    print_error "Failed to update role permissions. Please ensure you have sufficient permissions."
    exit 1
fi

# Use the same execution role as existing instance groups
restricted_execution_role="$original_execution_role"
print_success "IAM role configured with updated permissions: $restricted_execution_role"

echo
print_status "💾 Storage Configuration:"
restricted_volume_size=$(prompt_input "EBS volume size in GB" "500")

echo
print_status "🗂️ FSx Lustre Configuration:"
restricted_fsx_size=$(prompt_input "FSx Lustre size in GiB" "12000")
restricted_fsx_throughput=$(prompt_input "FSx Lustre per unit storage throughput" "125")

# Validate numeric inputs before they reach jq --argjson (see above).
for val in "$restricted_volume_size" "$restricted_fsx_size" "$restricted_fsx_throughput"; do
    if ! [[ "$val" =~ ^[0-9]+$ ]]; then
        print_error "Storage and FSx values must be positive integers (got: $val)."
        exit 1
    fi
done

# Create updated cluster configuration
print_status "Generating updated cluster configuration..."
# Get VPC config from original configuration.
# NOTE(review): vpc_config is validated but not referenced again in this
# script -- presumably kept as a sanity check; confirm before removing.
vpc_config=$(jq -c '.VpcConfig' og_cluster_config.json)
if [ -z "$vpc_config" ] || [ "$vpc_config" = "null" ]; then
    print_error "Could not find VPC configuration in original configuration"
    exit 1
fi

# Get LifeCycleConfig from existing instance group (required for new groups)
lifecycle_config=$(jq -c '.InstanceGroups[0].LifeCycleConfig' og_cluster_config.json)
if [ "$lifecycle_config" = "null" ]; then
    print_error "Could not find LifeCycleConfig in original configuration"
    exit 1
fi

# Create new restricted instance group with required LifeCycleConfig and FSxLustreConfig
print_status "Creating restricted instance group with FSx Lustre configuration..."
# Build the restricted instance group as JSON. Using jq -n with --arg/--argjson
# keeps all user-supplied values correctly quoted and typed instead of
# interpolating them into a JSON string by hand.
restricted_instance_group=$(jq -n \
    --argjson count "$restricted_instance_count" \
    --arg name "restricted-instance-group" \
    --arg type "$restricted_instance_type" \
    --arg role "$restricted_execution_role" \
    --argjson threads 1 \
    --argjson volume "$restricted_volume_size" \
    --argjson fsx_size "$restricted_fsx_size" \
    --argjson fsx_throughput "$restricted_fsx_throughput" \
    --argjson lifecycle "$lifecycle_config" \
    '{
        "InstanceCount": $count,
        "InstanceGroupName": $name,
        "InstanceType": $type,
        "ExecutionRole": $role,
        "ThreadsPerCore": $threads,
        "LifeCycleConfig": $lifecycle,
        "InstanceStorageConfigs": [
            {
                "EbsVolumeConfig": {
                    "VolumeSizeInGB": $volume
                }
            }
        ],
        "EnvironmentConfig": {
            "FSxLustreConfig": {
                "SizeInGiB": $fsx_size,
                "PerUnitStorageThroughput": $fsx_throughput
            }
        }
    }')

# Debug: Show the restricted instance group
print_status "📋 Restricted instance group configuration:"
echo "$restricted_instance_group" | jq '.'

# Reshape existing instance groups from describe-cluster output into the
# update-cluster request shape (TargetCount -> InstanceCount) and drop any
# null-valued fields from each group.
existing_instance_groups=$(jq -c '.InstanceGroups | map({
    InstanceCount: .TargetCount,
    InstanceGroupName: .InstanceGroupName,
    InstanceType: .InstanceType,
    LifeCycleConfig: .LifeCycleConfig,
    ExecutionRole: .ExecutionRole,
    ThreadsPerCore: .ThreadsPerCore,
    InstanceStorageConfigs: .InstanceStorageConfigs,
    OnStartDeepHealthChecks: .OnStartDeepHealthChecks,
    EnvironmentConfig: .EnvironmentConfig
} | del(.[] | nulls))' og_cluster_config.json)

# Add the new restricted instance group to existing ones.
# BUG FIX: the jq program must be SINGLE-quoted so $new is resolved by jq
# (bound via --argjson). With double quotes the shell expanded the unset
# shell variable $new to an empty string, making the program ". + []" --
# the restricted group was silently never added to the configuration.
print_status "Combining existing and new instance groups..."
all_instance_groups=$(echo "$existing_instance_groups" | jq --argjson new "$restricted_instance_group" '. + [$new]')

# Debug: Show total instance groups count
total_groups=$(echo "$all_instance_groups" | jq 'length')
print_status "Total instance groups after adding restricted group: $total_groups"

cat > updated_cluster_config.json << EOF
{
    "ClusterName": "${HYPERPOD_CLUSTER_NAME}",
    "InstanceGroups": ${all_instance_groups}
}
EOF

# Validate the generated file is well-formed JSON before sending it to the API
if ! jq '.' updated_cluster_config.json > /dev/null 2>&1; then
    print_error "Invalid JSON generated in updated_cluster_config.json"
    exit 1
fi

print_success "Updated cluster configuration created: updated_cluster_config.json"
if prompt_yes_no "Do you want to review the new configuration?"; then
    echo
    print_status "📄 New Cluster Configuration:"
    jq '.' updated_cluster_config.json
    echo
    if ! prompt_yes_no "Does the configuration look correct?"; then
        print_error "Please edit updated_cluster_config.json manually before continuing."
        exit 1
    fi
fi

# Step 8: Update cluster configuration
print_step "Step 8: Applying Cluster Configuration"

print_status "Updating HyperPod cluster with new configuration..."
if aws sagemaker update-cluster --cluster-name "$cluster_identifier" --cli-input-json file://updated_cluster_config.json; then
    print_success "Cluster configuration update initiated successfully!"
else
    print_error "Failed to update cluster configuration. Check AWS console for details."
    exit 1
fi

# Step 9: Verify cluster update
print_step "Step 9: Verifying Configuration Update"

print_status "Waiting for cluster update to propagate..."
sleep 10

# NOTE(review): this only confirms describe-cluster succeeds, not that the
# update completed -- the update itself is asynchronous.
print_status "Verifying updated cluster configuration..."
if aws sagemaker describe-cluster --region "$AWS_REGION" --cluster-name "$cluster_identifier" > /dev/null 2>&1; then
    print_success "Cluster configuration updated successfully!"
else
    print_warning "Cluster update may still be in progress. Check AWS console for status."
fi

# Final status and completion summary
echo
print_header "==============================================================================="
print_success "🎉 HyperPod Cluster Configuration Completed Successfully!"
print_header "==============================================================================="
echo

# Display configuration summary
print_status "📊 Configuration Summary:"
echo "   • Cluster Name: $HYPERPOD_CLUSTER_NAME"
echo "   • AWS Region: $AWS_REGION"
echo "   • EKS Cluster: $EKS_CLUSTER_NAME"
echo "   • Restricted Instance Count: $restricted_instance_count"
echo "   • Restricted Instance Type: $restricted_instance_type"
echo "   • EBS Volume Size: ${restricted_volume_size}GB"
echo "   • FSx Lustre Size: ${restricted_fsx_size}GiB"
echo "   • FSx Throughput: ${restricted_fsx_throughput}MB/s/TiB"
echo

print_status "📁 Generated Files:"
echo "   • env_vars - Environment variables for this session"
echo "   • og_cluster_config.json - Original cluster configuration backup"
echo "   • updated_cluster_config.json - New cluster configuration"
echo

print_status "🔍 Verification Commands:"
echo "   • Check EKS nodes: kubectl get nodes"
echo "   • Check HyperPod status: aws sagemaker describe-cluster --cluster-name '$cluster_identifier'"
echo "   • View cluster in console: https://console.aws.amazon.com/sagemaker/home?region=$AWS_REGION#/hyperpod"
echo

print_status "🚀 Next Steps:"
echo "   1. Verify EKS cluster nodes are ready"
echo "   2. Check HyperPod cluster status in AWS Console"
echo "   3. Deploy your machine learning workloads"
echo "   4. Monitor cluster performance and scaling"
echo

# Cleanup temporary files with user confirmation
if [ -f "og_cluster_config.json" ] || [ -f "updated_cluster_config.json" ]; then
    echo
    if prompt_yes_no "🧹 Clean up temporary configuration files?"; then
        rm -f og_cluster_config.json updated_cluster_config.json
        print_success "Temporary files cleaned up successfully."
    else
        print_status "Temporary files preserved for your reference."
    fi
fi

echo
print_success "✅ Script execution completed at $(date)"
print_status "💡 For troubleshooting, check the AWS CloudFormation and SageMaker consoles."
echo