From d8153f7efd922bc9d9f13c06116ba2d49ac3fe43 Mon Sep 17 00:00:00 2001 From: tysker Date: Mon, 12 Jan 2026 09:25:45 +0100 Subject: [PATCH 1/3] add new app version to ansible group_vars folder --- ansible/group_vars/app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/group_vars/app.yml b/ansible/group_vars/app.yml index b6f3cab..ee15808 100644 --- a/ansible/group_vars/app.yml +++ b/ansible/group_vars/app.yml @@ -1,4 +1,4 @@ -app_image: "ghcr.io/tysker/cloud_devops_app:77ecd38" +app_image: "ghcr.io/tysker/cloud_devops_app:0950da9" app_container_name: "cloud-devops-app" app_container_port: 5000 app_public_port: 80 From 255978a545f5f85fca54aa5156164ffbd96cc3c8 Mon Sep 17 00:00:00 2001 From: tysker Date: Mon, 12 Jan 2026 11:12:10 +0100 Subject: [PATCH 2/3] fix(prom template) change prom. job from node-explorer to node --- ansible/roles/prometheus/templates/prometheus.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 index 59a0934..679a126 100644 --- a/ansible/roles/prometheus/templates/prometheus.yml.j2 +++ b/ansible/roles/prometheus/templates/prometheus.yml.j2 @@ -3,7 +3,7 @@ global: evaluation_interval: 15s scrape_configs: - - job_name: "node-exporter" + - job_name: "node" static_configs: - targets: - "{{ hostvars['monitoring-1'].ansible_host }}:9100" From 95c2eb67f94b685287d271ac9ba1edc75e6311d7 Mon Sep 17 00:00:00 2001 From: tysker Date: Mon, 12 Jan 2026 13:07:22 +0100 Subject: [PATCH 3/3] docs(readme,checklist) update documentation so it resembles current state --- README.md | 109 +++++++++++++++++++++++++++++++++++--- ansible/README.md | 4 +- docs/project-checklist.md | 27 ++++++---- 3 files changed, 120 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index f801303..feb75d0 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ ## Project Description This repository is a complete 
end-to-end DevOps learning project built around a small Python Flask -application. The goal is to gradually build a realistic production-like environment that includes: +application. All access follows a bastion-based, non-root security model. + +The goal is to gradually build a realistic production-like environment that includes: - containerization with Docker - CI/CD pipelines (GitHub Actions) @@ -16,6 +18,8 @@ application. The goal is to gradually build a realistic production-like environm The project grows in clear stages. Each stage is documented with **what was done**, **why it matters**, and **how it was implemented**, so it becomes both a learning journal and a portfolio project. +**Current status:** Stages 1–10 completed. Application is deployed, monitored with Prometheus and Grafana, and accessible via HTTP. Next step: TLS and reverse proxy. + ## Structure Current project layout: @@ -24,19 +28,32 @@ Current project layout: cloud_devops_lab/ ├── ansible │   ├── ansible.cfg +│   ├── ansible.log │   ├── group_vars -│   │   └── all.yml +│   │   ├── all.yml +│   │   ├── app.yml +│   │   └── monitoring.yml │   ├── hosts.ini │   ├── playbooks -│   │   └── bootstrap.yml +│   │   ├── bootstrap_1.yml +│   │   ├── bootstrap_2.yml +│   │   ├── deploy_app.yml +│   │   ├── monitoring_grafana.yml +│   │   ├── monitoring_node_exporter.yml +│   │   └── monitoring_prometheus.yml │   ├── README.md │   └── roles -│   ├── bootstrap_users +│   ├── bootstrap_user │   ├── common +│   ├── deploy_app │   ├── docker +│   ├── grafana +│   ├── node_exporter +│   ├── prometheus │   └── ssh_hardening ├── app │   ├── Dockerfile +│   ├── gunicorn.conf.py │   ├── requirements.txt │   ├── src │   │   ├── app.py @@ -50,6 +67,8 @@ cloud_devops_lab/ │   ├── lib │   ├── lib64 -> lib │   └── pyvenv.cfg +├── docs +│   └── project-checklist.md ├── infrastructure │   └── terraform │   ├── main.tf @@ -74,6 +93,8 @@ cloud_devops_lab/ - Linode for server hosting - Cloudflare (DNS) - Domain registrar 
+- Grafana +- Prometheus ## Running the Application Locally @@ -104,8 +125,8 @@ The project is built in incremental stages. Each stage adds a new DevOps capabil - Stage 7: SSH hardening - Stage 8: Docker installation (via Ansible) - Stage 9: Application deployment -- Stage 10: Monitoring stack (Prometheus & Grafana) -- Stage 11: TLS certificates & reverse proxy +- Stage 10: Monitoring stack (Prometheus & Grafana) +- Stage 11: TLS certificates & reverse proxy (Caddy) ### Stage 1 — Flask Application @@ -143,7 +164,6 @@ foundation for CI/CD pipelines, registries, deployment automation, and infrastru 1. Build image: `docker build -t cloud-devops-app:0.1 .` 2. Run container: `docker run -p 5000:5000 cloud-devops-app:0.1` 3. Test health endpoint: `curl http://localhost:5000/health` -4. Test metrics: `curl http://localhost:5000/metrics/custom` ### Stage 3 — CI/CD Pipeline (GHCR Integration) @@ -289,6 +309,74 @@ A repeatable deployment reduces manual steps and ensures consistent environments - Exposed HTTP on port 80 mapped to container port 5000. - Added an Ansible health check against `/health`. +### Stage 10 — Monitoring stack (Prometheus & Grafana) + +This stage introduces full observability for both the infrastructure and the application. + +#### Part 1 — Node Exporter + +**What:** +Deployed Node Exporter on the application and monitoring servers. + +**Why:** +Host-level metrics (CPU, memory, disk, network) are essential for understanding system health and capacity. + +**How:** +- Installed Node Exporter via Docker using Ansible. +- Metrics exposed on port `9100`. +- Targets scraped via private IPs. + +--- + +#### Part 2 — Prometheus + +**What:** +Deployed Prometheus on the monitoring server. + +**Why:** +Prometheus acts as the central metrics collection and storage system. + +**How:** +- Prometheus deployed via Docker using Ansible. +- Configuration rendered from a template (`prometheus.yml`). 
+- Scrapes: + - Node Exporter on app + monitoring servers + - Flask application metrics +- Persistent data directory mounted on the host. + +--- + +#### Part 3 — Grafana + +**What:** +Deployed Grafana for metrics visualization. + +**Why:** +Metrics are only useful if they can be explored and visualized effectively. + +**How:** +- Grafana deployed via Docker using Ansible. +- Prometheus configured as a data source. +- Access restricted to SSH port forwarding (no public exposure). +- Imported **Node Exporter Full** dashboard (ID 1860). + +--- + +#### Part 4 — Flask application metrics + +**What:** +Exposed application metrics in Prometheus format. + +**Why:** +Application-level observability enables insight into runtime behavior, performance, and stability. + +**How:** +- Added `/metrics` endpoint using `prometheus_client`. +- Removed the earlier JSON-based metrics endpoint. +- Prometheus scrapes the app at: + - `http://<app-private-ip>:80/metrics` +- Metrics verified in Prometheus and visualized in Grafana. + ### Access Model - Direct SSH access is allowed only to the jump server. @@ -329,7 +417,12 @@ A chronological log describing the work done in each stage. - Procced to Stage 7: SSH hardening - Procced to Stage 8: Docker installation (via Ansible) - Procced to Stage 9: Application deployment using Docker and GHCR -- Procced to Stage 10: Stage 10: Monitoring stack (Prometheus & Grafana) +- Procced to Stage 10: Monitoring stack (Prometheus & Grafana) +- Procced to Stage 11: TLS certificates & reverse proxy (Caddy) + +Stage 11 will introduce HTTPS, automatic TLS certificates, and a reverse proxy +in front of the application. This enables secure traffic, prepares the setup +for Cloudflare proxying, and allows stricter firewall rules on the application server. ## Git Workflow & Conventions diff --git a/ansible/README.md b/ansible/README.md index 57e2c05..a162641 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -12,10 +12,12 @@ all servers in the Cloud DevOps Lab. 
## Roles ### common + - Verifies basic connectivity (`ping`) - Used as a baseline dependency for all other roles ### bootstrap_users + - Creates a non-root `devops` user - Adds the user to the `sudo` group - Configures passwordless sudo (temporary) @@ -26,5 +28,5 @@ all servers in the Cloud DevOps Lab. Run from the `ansible/` directory: ```bash -ansible-playbook playbooks/bootstrap.yml +ansible-playbook playbooks/<playbook>.yml ``` diff --git a/docs/project-checklist.md b/docs/project-checklist.md index 481eb8f..e420b52 100644 --- a/docs/project-checklist.md +++ b/docs/project-checklist.md @@ -29,7 +29,7 @@ completed, what is in progress, and what belongs to future expansion. - [x] Cloudflare in DNS-only mode (proxy disabled) - [ ] Terraform-managed DNS records (Cloudflare provider) - [ ] Stable DNS target via reserved IP -- [ ] Decide exposure model for monitoring (private vs public) +- [x] Decide exposure model for monitoring (private vs public) --- @@ -124,19 +124,24 @@ completed, what is in progress, and what belongs to future expansion. 
### Prometheus -- [ ] Prometheus deployed on monitoring server -- [ ] Scrape node exporters -- [ ] Scrape application metrics -- [ ] Retention and storage configured -- [ ] Alert rules defined +- [x] Prometheus deployed on monitoring server +- [x] Scrape node exporters (app + monitoring via private IPs) +- [x] Scrape application metrics (`/metrics` scraped as job `flask_app`) +- [ ] Retention and storage tuned/configured explicitly (beyond defaults) +- [ ] Alert rules defined ### Grafana -- [ ] Grafana deployed on monitoring server -- [ ] Prometheus datasource configured -- [ ] Node exporter dashboards imported -- [ ] App dashboards created -- [ ] Access control (auth / private access) +### Grafana + +- [x] Grafana deployed on monitoring server +- [x] Prometheus datasource configured +- [x] Node exporter dashboards imported +- [x] App dashboards created (custom panels for `flask_app`): + - `process_resident_memory_bytes` + - `rate(process_cpu_seconds_total[5m])` + - `rate(python_gc_objects_collected_total[5m])` +- [ ] Access control hardening (no default creds, users/roles; still private via SSH tunnel) ---