diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index b3f307b..0c6d0fd 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -1,7 +1,6 @@ [defaults] inventory = hosts.ini roles_path = roles -remote_user = devops host_key_checking = False retry_files_enabled = False timeout = 30 @@ -14,10 +13,11 @@ fact_caching_timeout = 86400 # logging & output log_path = ./ansible.log -stdout_callback = yaml +stdout_callback = ansible.builtin.default +result_format = yaml -# pyhton interpreter path -interpreter_python = "{{ansible_python_interpreter}}" +# python interpreter path +interpreter_python = /usr/bin/python3 [ssh_connection] ssh_args = -o ForwardAgent=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index e746599..424a62b 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -4,12 +4,10 @@ ansible_python_interpreter: /usr/bin/python3 devops_user: devops devops_public_key: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/linode.pub') }}" -# app image -app_image: "ghcr.io/tysker/cloud_devops_app:77ecd38" -app_container_name: "cloud-devops-app" -app_container_port: 5000 -app_public_port: 80 - # github account ghcr_username: "tysker" ghcr_token: "{{ lookup('env', 'GHCR_TOKEN') }}" + +# node-exporter +node_exporter_image: "prom/node-exporter:v1.8.1" +node_exporter_port: 9100 diff --git a/ansible/group_vars/app.yml b/ansible/group_vars/app.yml new file mode 100644 index 0000000..b6f3cab --- /dev/null +++ b/ansible/group_vars/app.yml @@ -0,0 +1,4 @@ +app_image: "ghcr.io/tysker/cloud_devops_app:77ecd38" +app_container_name: "cloud-devops-app" +app_container_port: 5000 +app_public_port: 80 diff --git a/ansible/group_vars/monitoring.yml b/ansible/group_vars/monitoring.yml new file mode 100644 index 0000000..a21633e --- /dev/null +++ b/ansible/group_vars/monitoring.yml @@ -0,0 +1,10 @@ +# prometheus +prometheus_image: "prom/prometheus:v2.52.0" +prometheus_port: 9090 +prometheus_config_dir: 
"/opt/prometheus" +prometheus_data_dir: "/opt/prometheus/data" + +# grafana +grafana_image: "grafana/grafana:10.4.3" +grafana_port: 3000 +grafana_data_dir: "/opt/grafana/data" diff --git a/ansible/hosts.ini b/ansible/hosts.ini index d0d558d..8ecc281 100644 --- a/ansible/hosts.ini +++ b/ansible/hosts.ini @@ -1,14 +1,13 @@ [bastion] -jump-1 ansible_host=172.105.80.74 +jump-1 ansible_host=172.104.228.45 [app] -app-1 ansible_host=192.168.133.230 ansible_ssh_common_args='-o ProxyJump=devops@172.105.80.74' +app-1 ansible_host=192.168.137.27 ansible_ssh_common_args='-o ProxyJump=devops@172.104.228.45' [monitoring] -monitoring-1 ansible_host=192.168.133.104 ansible_ssh_common_args='-o ProxyJump=devops@172.105.80.74' +monitoring-1 ansible_host=192.168.137.82 ansible_ssh_common_args='-o ProxyJump=devops@172.104.228.45' [all:children] bastion app monitoring - diff --git a/ansible/playbooks/bootstrap_1.yml b/ansible/playbooks/bootstrap_1.yml new file mode 100644 index 0000000..c1ffdec --- /dev/null +++ b/ansible/playbooks/bootstrap_1.yml @@ -0,0 +1,14 @@ +# Bastion first: hosts.ini reaches app and monitoring through ProxyJump=devops@<bastion>, so the devops user must already exist on the bastion. +- name: Bootstrap bastion (initial) + hosts: bastion + remote_user: root + roles: + - common + - bootstrap_user + +- name: Bootstrap app and monitoring servers (initial) + hosts: app:monitoring + remote_user: root + roles: + - common + - bootstrap_user diff --git a/ansible/playbooks/bootstrap.yml b/ansible/playbooks/bootstrap_2.yml similarity index 68% rename from ansible/playbooks/bootstrap.yml rename to ansible/playbooks/bootstrap_2.yml index 766d8e4..a604758 100644 --- a/ansible/playbooks/bootstrap.yml +++ b/ansible/playbooks/bootstrap_2.yml @@ -1,14 +1,13 @@ -- name: Bootstrap all servers +- name: Harden SSH (after devops exists) hosts: all - gather_facts: false + remote_user: devops become: true roles: - - common - - bootstrap_users - ssh_hardening - name: Install Docker on app and monitoring servers hosts: app:monitoring + remote_user: devops gather_facts: true become: true roles: diff --git a/ansible/playbooks/deploy_app.yml b/ansible/playbooks/deploy_app.yml index def8e08..0e4fe3b 100644 --- a/ansible/playbooks/deploy_app.yml +++ 
b/ansible/playbooks/deploy_app.yml @@ -1,5 +1,6 @@ - name: Deploy Flask app container hosts: app + remote_user: devops gather_facts: true become: true roles: diff --git a/ansible/playbooks/monitoring_grafana.yml b/ansible/playbooks/monitoring_grafana.yml new file mode 100644 index 0000000..ed6e604 --- /dev/null +++ b/ansible/playbooks/monitoring_grafana.yml @@ -0,0 +1,7 @@ +- name: Deploy Grafana on monitoring server + hosts: monitoring + remote_user: devops + gather_facts: true + become: true + roles: + - grafana diff --git a/ansible/playbooks/monitoring_node_exporter.yml b/ansible/playbooks/monitoring_node_exporter.yml new file mode 100644 index 0000000..5039c37 --- /dev/null +++ b/ansible/playbooks/monitoring_node_exporter.yml @@ -0,0 +1,7 @@ +- name: Deploy Node Exporter on app and monitoring servers + hosts: app:monitoring + remote_user: devops + gather_facts: true + become: true + roles: + - node_exporter diff --git a/ansible/playbooks/monitoring_prometheus.yml b/ansible/playbooks/monitoring_prometheus.yml new file mode 100644 index 0000000..81a8674 --- /dev/null +++ b/ansible/playbooks/monitoring_prometheus.yml @@ -0,0 +1,7 @@ +- name: Deploy Prometheus on monitoring server + hosts: monitoring + gather_facts: true + remote_user: devops + become: true + roles: + - prometheus diff --git a/ansible/roles/bootstrap_users/tasks/main.yml b/ansible/roles/bootstrap_user/tasks/main.yml similarity index 100% rename from ansible/roles/bootstrap_users/tasks/main.yml rename to ansible/roles/bootstrap_user/tasks/main.yml diff --git a/ansible/roles/common/handlers/main.yml b/ansible/roles/common/handlers/main.yml deleted file mode 100644 index e69de29..0000000 diff --git a/ansible/roles/grafana/tasks/main.yml b/ansible/roles/grafana/tasks/main.yml new file mode 100644 index 0000000..3869f7d --- /dev/null +++ b/ansible/roles/grafana/tasks/main.yml @@ -0,0 +1,21 @@ +- name: Ensure Grafana data directory exists + ansible.builtin.file: + path: "{{ grafana_data_dir }}" + state: 
directory + owner: "472" + group: "472" + mode: "0755" + +- name: Ensure Grafana container is running + community.docker.docker_container: + name: grafana + image: "{{ grafana_image }}" + state: started + restart_policy: unless-stopped + network_mode: host + volumes: + - "{{ grafana_data_dir }}:/var/lib/grafana" + env: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: "{{ lookup('ansible.builtin.env', 'GRAFANA_ADMIN_PASSWORD', default=undef()) }}" # read from the control-node environment; templating fails if it is unset + GF_USERS_ALLOW_SIGN_UP: "false" diff --git a/ansible/roles/node_exporter/tasks/main.yml b/ansible/roles/node_exporter/tasks/main.yml new file mode 100644 index 0000000..ba1a5a6 --- /dev/null +++ b/ansible/roles/node_exporter/tasks/main.yml @@ -0,0 +1,12 @@ +- name: Ensure Node Exporter container is running + community.docker.docker_container: + name: node-exporter + image: "{{ node_exporter_image }}" + state: started + restart_policy: unless-stopped + network_mode: host + pid_mode: host + read_only: true + command: ["--path.rootfs=/host", "--web.listen-address=:{{ node_exporter_port }}"] + volumes: + - "/:/host:ro,rslave" diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml new file mode 100644 index 0000000..bb595c6 --- /dev/null +++ b/ansible/roles/prometheus/tasks/main.yml @@ -0,0 +1,40 @@ +- name: Ensure Prometheus directories exist + become: true + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "65534" + group: "65534" + mode: "0755" + loop: + - "{{ prometheus_config_dir }}" + - "{{ prometheus_data_dir }}" + +- name: Render Prometheus configuration + register: prometheus_config_render + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ prometheus_config_dir }}/prometheus.yml" + owner: root + group: root + mode: "0644" + +- name: Ensure Prometheus container is running + community.docker.docker_container: + name: prometheus + image: "{{ prometheus_image }}" + state: started + # Restart only when the rendered config changed, so repeated runs stay idempotent. + restart: "{{ prometheus_config_render is changed }}" + restart_policy: unless-stopped + network_mode: host + command: + - "--config.file={{ prometheus_config_dir }}/prometheus.yml" + - "--storage.tsdb.path={{ prometheus_data_dir }}" + - 
"--web.listen-address=0.0.0.0:{{ prometheus_port }}" + volumes: + - "{{ prometheus_config_dir }}/prometheus.yml:{{ prometheus_config_dir }}/prometheus.yml:ro" + - "{{ prometheus_data_dir }}:{{ prometheus_data_dir }}" + read_only: true + tmpfs: + - /tmp diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 new file mode 100644 index 0000000..8340de1 --- /dev/null +++ b/ansible/roles/prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "node" + static_configs: + - targets: + - "{{ hostvars['monitoring-1'].ansible_host }}:{{ node_exporter_port }}" + - "{{ hostvars['app-1'].ansible_host }}:{{ node_exporter_port }}" + diff --git a/docs/project-checklist.md b/docs/project-checklist.md new file mode 100644 index 0000000..481eb8f --- /dev/null +++ b/docs/project-checklist.md @@ -0,0 +1,190 @@ +# Cloud DevOps Lab — Project Roadmap + +This document is the **authoritative roadmap and checklist** for the Cloud DevOps Lab. +It reflects the real implementation state of the project and defines what has been +completed, what is in progress, and what belongs to future expansion. + +--- + +## 1. Servers & Networking (Terraform / Linode) + +- [x] Create three servers (jump, app, monitoring) +- [x] Shared private network between all servers +- [x] Jump server as the only public SSH entry point +- [x] App server has public IP for HTTP access (temporary) +- [x] Monitoring server private-only +- [x] Firewall rules enforced via Terraform +- [ ] Reserved IPv4 address for app server (stable DNS target) +- [ ] Remote Terraform state backend (S3-compatible / Terraform Cloud) +- [ ] Terraform CI checks (`fmt`, `validate`, `tflint`) +- [ ] Document Terraform module contracts (inputs/outputs) + +--- + +## 2. 
DNS & Domain (Cloudflare + Registrar) + +- [x] Domain registered (simply.com) +- [x] Nameservers delegated to Cloudflare +- [x] DNS records created for app server +- [x] Cloudflare in DNS-only mode (proxy disabled) +- [ ] Terraform-managed DNS records (Cloudflare provider) +- [ ] Stable DNS target via reserved IP +- [ ] Decide exposure model for monitoring (private vs public) + +--- + +## 3. Access Control & SSH Security (Ansible) + +- [x] Non-root `devops` user created on all servers +- [x] Password authentication disabled +- [x] Challenge-response authentication disabled +- [x] Root SSH login disabled +- [x] SSH access restricted via `AllowUsers` +- [x] Bastion (jump host) enforced +- [x] SSH agent forwarding configured and documented +- [x] Ansible runs as `devops` with `become` +- [ ] Restrict SSH on jump server to trusted IP ranges +- [ ] Explicit SSH hardening parameters (`MaxAuthTries`, `LoginGraceTime`) +- [ ] Fail2ban on jump server +- [ ] Break-glass access procedure documented + +--- + +## 4. Firewalls & Host Hardening + +- [x] Linode firewalls applied to all servers +- [x] SSH allowed to app/monitoring only from jump private IP +- [x] Inbound policy DROP, outbound ACCEPT +- [x] App firewall allows HTTP (80) +- [ ] Firewall rules reviewed and minimized +- [ ] Automatic security updates (unattended-upgrades) +- [ ] Disable unused services and packages +- [ ] Basic system auditing and log retention + +--- + +## 5. Secrets Management + +- [x] Terraform secrets via environment variables +- [x] GitHub Actions secrets for CI +- [ ] Ansible Vault for runtime secrets +- [ ] Encrypted `.env` files generated by Ansible +- [ ] Secret rotation strategy documented +- [ ] Optional: HashiCorp Vault (Roadmap Part 2) + +--- + +## 6. 
Container Runtime (Docker) + +- [x] Docker installed via Ansible (app + monitoring) +- [x] Docker not installed on jump server +- [x] `devops` user added to docker group +- [x] Application container deployed +- [x] Restart policy (`unless-stopped`) +- [x] Healthcheck implemented +- [ ] Container runs as non-root user +- [ ] Resource limits (CPU/memory) +- [ ] Log rotation for Docker containers +- [ ] Migrate app deployment to Docker Compose + +--- + +## 7. CI/CD (GitHub Actions + GHCR) + +- [x] Docker image built in CI +- [x] Images pushed to GHCR +- [x] Immutable SHA tags used for deployment +- [x] Deployment via Ansible using pinned image +- [ ] Linting and unit tests in CI +- [ ] Container vulnerability scanning (Trivy/Grype) +- [ ] SBOM generation +- [ ] Promotion workflow (staging → production) +- [ ] Semantic versioning strategy + +--- + +## 8. Application Deployment + +- [x] App deployed via Ansible +- [x] Health endpoint validated automatically +- [x] HTTP exposed on port 80 +- [ ] Bind app container to localhost only (via reverse proxy) +- [ ] Blue/green or rolling deployment strategy +- [ ] Rollback procedure documented + +--- + +## 9. Monitoring & Observability + +### Node Exporter + +- [x] Node Exporter deployed on app server +- [x] Node Exporter deployed on monitoring server +- [x] Metrics verified on port 9100 + +### Prometheus + +- [ ] Prometheus deployed on monitoring server +- [ ] Scrape node exporters +- [ ] Scrape application metrics +- [ ] Retention and storage configured +- [ ] Alert rules defined + +### Grafana + +- [ ] Grafana deployed on monitoring server +- [ ] Prometheus datasource configured +- [ ] Node exporter dashboards imported +- [ ] App dashboards created +- [ ] Access control (auth / private access) + +--- + +## 10. 
TLS, Reverse Proxy & Edge Security + +- [ ] Reverse proxy (Nginx / Caddy / Traefik) +- [ ] HTTPS via Let’s Encrypt or Cloudflare origin certs +- [ ] App container bound to localhost +- [ ] Cloudflare proxy enabled +- [ ] Origin access restricted to Cloudflare IPs +- [ ] Security headers enforced (HSTS, etc.) + +--- + +# Roadmap — Part 2 (After Core Project) + +### Reliability & Scaling + +- [ ] Reserved IP + NodeBalancer +- [ ] Blue/green or canary deployments +- [ ] Automated rollbacks +- [ ] Load testing + +### Security Maturity + +- [ ] Centralized logging (Loki / ELK) +- [ ] SSO for Grafana +- [ ] WAF rules and rate limiting +- [ ] Image signing (Cosign) + +### Infrastructure Maturity + +- [ ] Remote Terraform state + workspaces +- [ ] Terraform → Ansible dynamic inventory +- [ ] Pre-commit hooks +- [ ] Policy as Code (OPA / Conftest) + +### Orchestration (Choose One) + +- [ ] Docker Compose (production-grade) +- [ ] Docker Swarm +- [ ] Kubernetes (k3s, ingress, cert-manager, GitOps) + +--- + +## Project Rules + +- Every stage starts with a new git branch +- No manual changes on servers +- Pinned versions only +- Documentation updated at the end of each stage diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 082e5d3..6f14484 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -1,43 +1,9 @@ -# resource "linode_instance" "jump" { -# label = "${var.project_name}-${var.environment}-jump" -# region = var.region -# type = var.instance_type -# image = var.image -# -# authorized_keys = [chomp(file(var.ssh_public_key_path))] -# -# private_ip = true -# } -# -# resource "linode_instance" "app" { -# label = "${var.project_name}-${var.environment}-app" -# region = var.region -# type = var.instance_type -# image = var.image -# -# authorized_keys = [chomp(file(var.ssh_public_key_path))] -# -# -# private_ip = true -# } -# -# resource "linode_instance" "monitoring" { -# label = 
"${var.project_name}-${var.environment}-monitoring" -# region = var.region -# type = var.instance_type -# image = var.image -# -# authorized_keys = [chomp(file(var.ssh_public_key_path))] -# -# private_ip = true -# } - module "jump" { source = "./modules/compute" label = "${var.project_name}-${var.environment}-jump" region = var.region - instance_type = var.instance_type + instance_type = var.instance_type_1gb image = var.image authorized_keys = [chomp(file(var.ssh_public_key_path))] } @@ -47,7 +13,7 @@ module "app" { label = "${var.project_name}-${var.environment}-app" region = var.region - instance_type = var.instance_type + instance_type = var.instance_type_2gb image = var.image authorized_keys = [chomp(file(var.ssh_public_key_path))] } @@ -57,7 +23,7 @@ module "monitoring" { label = "${var.project_name}-${var.environment}-monitoring" region = var.region - instance_type = var.instance_type + instance_type = var.instance_type_2gb image = var.image authorized_keys = [chomp(file(var.ssh_public_key_path))] } @@ -98,6 +64,14 @@ resource "linode_firewall" "app_fw" { ipv4 = ["192.168.0.0/16"] } + inbound { + label = "allow-http" + action = "ACCEPT" + protocol = "TCP" + ports = "80" + ipv4 = ["0.0.0.0/0"] + } + inbound_policy = "DROP" outbound_policy = "ACCEPT" @@ -123,14 +97,6 @@ resource "linode_firewall" "monitoring_fw" { ipv4 = ["192.168.0.0/16"] } - inbound { - label = "allow-http" - action = "ACCEPT" - protocol = "TCP" - ports = "80" - ipv4 = ["0.0.0.0/0"] - } - inbound_policy = "DROP" outbound_policy = "ACCEPT" diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index 34d508c..00a4fe3 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -10,12 +10,18 @@ variable "region" { default = "eu-central" } -variable "instance_type" { - description = "Linode instance type" +variable "instance_type_1gb" { + description = "Linode instance type 1GB ram" type = string default = "g6-nanode-1" } 
+variable "instance_type_2gb" { + description = "Linode instance type 2GB ram" + type = string + default = "g6-standard-1" +} + variable "ssh_public_key_path" { description = "Path to the SSH public key used to access servers" type = string