From e6e89095064ae1ffd97e9fba6fcfb2ed4e23d63e Mon Sep 17 00:00:00 2001 From: alex-504 Date: Tue, 1 Jul 2025 14:41:37 +0900 Subject: [PATCH] Added CloudWatch and Alert resources to complete the task requirements --- ISSUES_FOUND.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++- README.md | 8 ++++++++ terraform/main.tf | 31 ++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/ISSUES_FOUND.md b/ISSUES_FOUND.md index ab52048..13ed6f6 100644 --- a/ISSUES_FOUND.md +++ b/ISSUES_FOUND.md @@ -151,4 +151,50 @@ These "gotchas" are actually excellent learning opportunities: - **Environment Configuration**: Understand 12-factor app principles - **Error Handling**: Learn proper error handling patterns - **Security**: Understand container and application security -- **Infrastructure as Code**: Start from scratch with proper structure \ No newline at end of file +- **Infrastructure as Code**: Start from scratch with proper structure + +## Issue: ECS App Could Not Connect to RDS (Database Connection Refused) + +### **Symptoms:** +- ECS tasks were repeatedly stopping with exit code 1. +- CloudWatch logs showed: + - `sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused` +- Health check endpoint was not responding; app was not running. + +### **Root Cause:** +- The application was trying to connect to a PostgreSQL database at `localhost` inside the container, but the actual database was running on AWS RDS in a private subnet. +- The ECS task definition was only passing `DB_HOST` or missing the correct `DATABASE_URL` environment variable, so the app defaulted to `localhost`. + +### **Troubleshooting Steps:** +1. **Checked ECS task status:** + - Noticed tasks were stopping quickly after launch. +2. **Enabled CloudWatch logging:** + - Updated ECS task definition to send logs to CloudWatch for easier debugging. +3. 
**Reviewed CloudWatch logs:** + - Found clear Python stack trace showing connection attempts to `localhost`. +4. **Reviewed environment variables:** + - Realized the app expected a `DATABASE_URL` env var, not just `DB_HOST`. +5. **Checked RDS endpoint and credentials:** + - Confirmed the correct RDS endpoint, username, and password. + +### **Solution:** +- Updated the ECS task definition in Terraform to set the `DATABASE_URL` environment variable: + ```hcl + environment = [ + { + name = "DATABASE_URL" + value = "postgresql://beer_admin:${var.db_password}@${var.db_host}:5432/beer_catalog" + } + ] + ``` +- Added `db_host` as a Terraform variable and set it to the RDS endpoint. +- Re-applied Terraform to update the ECS service and task definition. +- Verified the app was running and healthy by hitting the `/health` endpoint. + +### **Key Learnings:** +- Always check CloudWatch logs for ECS task failures—they provide detailed error messages. +- Ensure your app's environment variables match what the code expects (e.g., `DATABASE_URL`). +- Use Terraform variables to avoid hardcoding sensitive or environment-specific values. +- ECS task definition changes require a new revision and service update. + +--- \ No newline at end of file diff --git a/README.md b/README.md index 1608e88..5b66a66 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,14 @@ This project uses AWS ECS Fargate to run the Beer Catalog app in a scalable, man - The ECS service is set to run 1 copy of your app by default. You can scale this by changing the `desired_count` in the Terraform code. - For production, consider adding a load balancer and auto-scaling. +## Monitoring & Alerts + +- **CloudWatch Alarms:** + - `ECS-Task-Failures`: Triggers if any ECS task fails in the `beer-catalog-service`. + - `RDS-High-CPU`: Triggers if RDS CPU utilization exceeds 80%. +- Both alarms are provisioned via Terraform and can be viewed in the AWS CloudWatch console. 
+- See `cloudwatch_alarms.png` for a screenshot of the alarms in the AWS Console.
+
diff --git a/terraform/main.tf b/terraform/main.tf
index 4109974..7cf97d5 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -270,3 +270,49 @@ resource "aws_cloudwatch_log_group" "ecs_app" {
   name              = "/ecs/beer-catalog-app"
   retention_in_days = 7
 }
+
+# Alarm when the service has no RUNNING task.
+# AWS/ECS publishes no "ServiceTaskFailures" metric, so an alarm on it never
+# receives data. Every running task reports CPUUtilization once a minute, so
+# SampleCount over a 60-second period tracks the service's running-task count.
+# NOTE(review): no alarm_actions are configured, so state changes are only
+# visible in the console; attach an SNS topic if notifications are needed.
+resource "aws_cloudwatch_metric_alarm" "ecs_task_failures" {
+  alarm_name          = "ECS-Task-Failures"
+  alarm_description   = "Alarm if the beer-catalog-service has no running ECS task (task failed or stopped)"
+  namespace           = "AWS/ECS"
+  metric_name         = "CPUUtilization"
+  statistic           = "SampleCount"
+  period              = 60
+  evaluation_periods  = 1
+  comparison_operator = "LessThanThreshold"
+  threshold           = 1
+  # With zero tasks no datapoints are published at all; treat that as a breach
+  # rather than leaving the alarm in INSUFFICIENT_DATA.
+  treat_missing_data = "breaching"
+
+  dimensions = {
+    ClusterName = aws_ecs_cluster.main.name
+    ServiceName = aws_ecs_service.app.name
+  }
+}
+
+# Alarm on sustained database load: average CPU above 80% for two
+# consecutive 5-minute periods.
+resource "aws_cloudwatch_metric_alarm" "rds_high_cpu" {
+  alarm_name          = "RDS-High-CPU"
+  alarm_description   = "Alarm if RDS CPU utilization exceeds 80%"
+  namespace           = "AWS/RDS"
+  metric_name         = "CPUUtilization"
+  statistic           = "Average"
+  period              = 300
+  evaluation_periods  = 2
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = 80
+
+  dimensions = {
+    # Reference the DB identifier explicitly: in AWS provider v5+ `id` is the
+    # DBI resource ID, which does not match the CloudWatch dimension value.
+    DBInstanceIdentifier = aws_db_instance.beer_database.identifier
+  }
+}