From e6e89095064ae1ffd97e9fba6fcfb2ed4e23d63e Mon Sep 17 00:00:00 2001
From: alex-504 <alexandre.ealimentos@gmail.com>
Date: Tue, 1 Jul 2025 14:41:37 +0900
Subject: [PATCH] Added CloudWatch and Alert resources to complete the task
 requirements

---
 ISSUES_FOUND.md   | 48 ++++++++++++++++++++++++++++++++++++++++++++++-
 README.md         |  8 ++++++++
 terraform/main.tf | 31 ++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/ISSUES_FOUND.md b/ISSUES_FOUND.md
index ab52048..13ed6f6 100644
--- a/ISSUES_FOUND.md
+++ b/ISSUES_FOUND.md
@@ -151,4 +151,50 @@ These "gotchas" are actually excellent learning opportunities:
 - **Environment Configuration**: Understand 12-factor app principles
 - **Error Handling**: Learn proper error handling patterns
 - **Security**: Understand container and application security
-- **Infrastructure as Code**: Start from scratch with proper structure 
\ No newline at end of file
+- **Infrastructure as Code**: Start from scratch with proper structure
+
+## Issue: ECS App Could Not Connect to RDS (Database Connection Refused)
+
+### **Symptoms:**
+- ECS tasks were repeatedly stopping with exit code 1.
+- CloudWatch logs showed:
+  - `sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused`
+- Health check endpoint was not responding; app was not running.
+
+### **Root Cause:**
+- The application was trying to connect to a PostgreSQL database at `localhost` inside the container, but the actual database was running on AWS RDS in a private subnet.
+- The ECS task definition passed only `DB_HOST` and did not set the `DATABASE_URL` environment variable that the app reads, so the app fell back to its default of `localhost`.
+
+### **Troubleshooting Steps:**
+1. **Checked ECS task status:**
+   - Noticed tasks were stopping quickly after launch.
+2. **Enabled CloudWatch logging:**
+   - Updated ECS task definition to send logs to CloudWatch for easier debugging.
+3. **Reviewed CloudWatch logs:**
+   - Found clear Python stack trace showing connection attempts to `localhost`.
+4. **Reviewed environment variables:**
+   - Realized the app expected a `DATABASE_URL` env var, not just `DB_HOST`.
+5. **Checked RDS endpoint and credentials:**
+   - Confirmed the correct RDS endpoint, username, and password.
+
+### **Solution:**
+- Updated the ECS task definition in Terraform to set the `DATABASE_URL` environment variable:
+  ```hcl
+  environment = [
+    {
+      name  = "DATABASE_URL"
+      value = "postgresql://beer_admin:${var.db_password}@${var.db_host}:5432/beer_catalog"
+    }
+  ]
+  ```
+- Added `db_host` as a Terraform variable and set it to the RDS endpoint.
+- Re-applied Terraform to update the ECS service and task definition.
+- Verified the app was running and healthy by hitting the `/health` endpoint.
+
+### **Key Learnings:**
+- Always check CloudWatch logs for ECS task failures—they provide detailed error messages.
+- Ensure your app's environment variables match what the code expects (e.g., `DATABASE_URL`).
+- Use Terraform variables to avoid hardcoding sensitive or environment-specific values.
+- ECS task definition changes require a new revision and service update.
+
+--- 
\ No newline at end of file
diff --git a/README.md b/README.md
index 1608e88..5b66a66 100644
--- a/README.md
+++ b/README.md
@@ -294,6 +294,14 @@ This project uses AWS ECS Fargate to run the Beer Catalog app in a scalable, man
 - The ECS service is set to run 1 copy of your app by default. You can scale this by changing the `desired_count` in the Terraform code.
 - For production, consider adding a load balancer and auto-scaling.
 
+## Monitoring & Alerts
+
+- **CloudWatch Alarms:**
+  - `ECS-Task-Failures`: Triggers if any ECS task fails in the `beer-catalog-service`.
+  - `RDS-High-CPU`: Triggers if RDS CPU utilization exceeds 80%.
+- Both alarms are provisioned via Terraform and can be viewed in the AWS CloudWatch console. They have no notification actions configured, so a state change is visible only in the console.
+- See `cloudwatch_alarms.png` for a screenshot of the alarms in the AWS Console.
+
 
 
 
diff --git a/terraform/main.tf b/terraform/main.tf
index 4109974..7cf97d5 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -270,3 +270,34 @@ resource "aws_cloudwatch_log_group" "ecs_app" {
   name              = "/ecs/beer-catalog-app"
   retention_in_days = 7
 }
+
+resource "aws_cloudwatch_metric_alarm" "ecs_task_failures" { # Fires when the service reports no running task.
+  alarm_name          = "ECS-Task-Failures"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "RunningTaskCount"      # AWS/ECS publishes no "ServiceTaskFailures" metric, so the old alarm never got data
+  namespace           = "ECS/ContainerInsights" # NOTE(review): requires Container Insights to be enabled on the cluster - confirm
+  period              = 60                      # seconds
+  statistic           = "Minimum"               # any sample below 1 within the period trips the alarm
+  threshold           = 1
+  alarm_description   = "Alarm if the beer-catalog-service has no running ECS task (task failed or was stopped)"
+  dimensions = {
+    ClusterName = aws_ecs_cluster.main.name
+    ServiceName = aws_ecs_service.app.name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "rds_high_cpu" { # Fires when average RDS CPU exceeds 80% for two consecutive periods.
+  alarm_name          = "RDS-High-CPU"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 2 # both periods must breach before the alarm fires
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/RDS"
+  period              = 300 # seconds
+  statistic           = "Average"
+  threshold           = 80 # percent
+  alarm_description   = "Alarm if RDS CPU utilization exceeds 80%"
+  dimensions = {
+    DBInstanceIdentifier = aws_db_instance.beer_database.identifier # not `.id`: on AWS provider >= 5.0 `id` is the DBI resource ID and matches no metric
+  }
+}