diff --git a/charts/iap/.helmignore b/charts/iap/.helmignore index 3ecbbcb..f3edbe5 100644 --- a/charts/iap/.helmignore +++ b/charts/iap/.helmignore @@ -24,5 +24,4 @@ # Itential patterns to ignore lab-setup values-* -tests/ .github diff --git a/charts/iap/Chart.yaml b/charts/iap/Chart.yaml index 21fbf94..fdc5475 100644 --- a/charts/iap/Chart.yaml +++ b/charts/iap/Chart.yaml @@ -10,7 +10,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.7.0 +version: 1.8.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/iap/templates/tests/test-connection.yaml b/charts/iap/templates/tests/test-connection.yaml deleted file mode 100644 index 206760e..0000000 --- a/charts/iap/templates/tests/test-connection.yaml +++ /dev/null @@ -1,209 +0,0 @@ -{{- if .Values.tests.enabled }} -apiVersion: v1 -kind: Pod -metadata: - name: {{ include "iap.fullname" . }}-test-connection - labels: - {{- include "iap.labels" . 
| nindent 4 }} - annotations: - "helm.sh/hook": test - "helm.sh/hook-weight": "1" - "helm.sh/hook-delete-policy": before-hook-creation -spec: - restartPolicy: Never - containers: - - name: connection-test - image: curlimages/curl:latest # Changed to curl image for better tools - command: - - /bin/sh - - -c - - | - echo "๐Ÿš€ Starting enhanced connection test to {{ .Values.ingress.loadBalancer.host }}:{{ .Values.service.port | default 3443 }}" - echo "โฐ Starting at: $(date)" - - # Configuration - target_host="{{ .Values.ingress.loadBalancer.host }}" - target_port="{{ .Values.service.port | default 3443 }}" - max_wait_time={{ .Values.tests.connectionTimeout | default 300 }} # Increased default - dns_retry_interval=10 - connection_retry_interval=3 # Faster retries - stability_tests=5 # More stability tests - success_threshold=3 # Need 3/5 to pass - - echo "๐Ÿ“‹ Test configuration:" - echo " Target host: ${target_host}" - echo " Target port: ${target_port}" - echo " Max wait time: ${max_wait_time}s" - echo " Connection retry interval: ${connection_retry_interval}s" - echo " Stability tests: ${stability_tests}" - echo " Success threshold: ${success_threshold}/${stability_tests}" - echo "" - - # Step 1: DNS Resolution (quick since it worked before) - echo "๐Ÿ” Phase 1: DNS Resolution Test" - if nslookup "${target_host}" > /dev/null 2>&1; then - resolved_ip=$(nslookup "${target_host}" | grep -A1 "Name:" | grep "Address:" | awk '{print $2}' | head -1) - echo "โœ… DNS resolution successful!" 
- echo "๐ŸŒ Resolved IP: ${resolved_ip:-"could not extract IP"}" - else - echo "โŒ DNS resolution failed" - exit 1 - fi - - # Step 2: Enhanced connectivity test with better diagnostics - echo "" - echo "๐Ÿ”Œ Phase 2: Enhanced Connectivity Test" - connection_wait_time=0 - connection_successful=false - success_count=0 - attempt=0 - - while [ $connection_wait_time -lt $max_wait_time ] && [ "$connection_successful" = "false" ]; do - attempt=$((attempt + 1)) - echo "๐Ÿ”Œ Testing connection (attempt ${attempt}) at $(date)..." - - # Test with multiple methods for better diagnostics - nc_result=false - curl_result=false - - # Method 1: netcat test - if timeout 10 nc -z "${target_host}" "${target_port}" > /dev/null 2>&1; then - nc_result=true - echo " โœ… netcat: TCP connection successful" - else - echo " โŒ netcat: TCP connection failed" - fi - - # Method 2: curl test (for HTTPS/HTTP services) - if timeout 15 curl -k -s --connect-timeout 10 --max-time 15 \ - "https://${target_host}:${target_port}" > /dev/null 2>&1; then - curl_result=true - echo " โœ… curl: HTTPS connection successful" - else - echo " โŒ curl: HTTPS connection failed" - fi - - # Consider it successful if either method works - if [ "$nc_result" = "true" ] || [ "$curl_result" = "true" ]; then - success_count=$((success_count + 1)) - echo " ๐ŸŽฏ Connection attempt ${attempt} successful (success count: ${success_count})" - - # Require 2 consecutive successes to consider it stable - if [ $success_count -ge 2 ]; then - connection_successful=true - echo " ๐ŸŽ‰ Connection established after ${attempt} attempts!" - fi - else - success_count=0 # Reset success count on failure - echo " โŒ Both connection methods failed" - fi - - if [ "$connection_successful" = "false" ]; then - echo " โณ Waiting ${connection_retry_interval}s before retry... 
(${connection_wait_time}s elapsed)" - sleep $connection_retry_interval - connection_wait_time=$((connection_wait_time + connection_retry_interval)) - fi - done - - if [ "$connection_successful" = "false" ]; then - echo "โŒ Connection test failed after ${connection_wait_time}s" - echo "" - echo "๐Ÿ” Diagnostic Information:" - - # Additional diagnostics - echo "๐Ÿ“ Final DNS check:" - nslookup "${target_host}" 2>&1 - - echo "๐Ÿ” Detailed connection attempt:" - timeout 10 nc -v "${target_host}" "${target_port}" 2>&1 || echo "Detailed netcat failed" - - echo "๐ŸŒ Network trace (first 3 hops):" - timeout 10 traceroute -m 3 "${target_host}" 2>/dev/null || echo "traceroute not available" - - echo "๐Ÿ’ก Possible issues:" - echo " - Service is still starting up (StatefulSet can take time)" - echo " - Load balancer is not fully configured" - echo " - Health checks are interfering with connections" - echo " - Resource constraints causing intermittent availability" - echo " - Certificate/TLS issues (port ${target_port})" - - exit 1 - fi - - # Step 3: Enhanced stability test - echo "" - echo "โœ… Phase 3: Enhanced Stability Test" - echo "๐Ÿ”„ Running ${stability_tests} stability tests over 30 seconds..." - - passed_tests=0 - failed_tests=0 - - for test_num in $(seq 1 $stability_tests); do - echo "๐Ÿงช Stability test ${test_num}/${stability_tests} at $(date)..." 
- - # Test both netcat and curl again - nc_stable=false - curl_stable=false - - if timeout 8 nc -z "${target_host}" "${target_port}" > /dev/null 2>&1; then - nc_stable=true - fi - - if timeout 12 curl -k -s --connect-timeout 8 --max-time 12 \ - "https://${target_host}:${target_port}" > /dev/null 2>&1; then - curl_stable=true - fi - - if [ "$nc_stable" = "true" ] || [ "$curl_stable" = "true" ]; then - passed_tests=$((passed_tests + 1)) - echo " โœ… Test ${test_num} passed (nc:${nc_stable}, curl:${curl_stable})" - else - failed_tests=$((failed_tests + 1)) - echo " โŒ Test ${test_num} failed (nc:${nc_stable}, curl:${curl_stable})" - fi - - # Wait between tests (except last one) - if [ $test_num -lt $stability_tests ]; then - sleep 6 - fi - done - - echo "" - echo "๐Ÿ“Š Stability Test Results:" - echo " Passed: ${passed_tests}/${stability_tests}" - echo " Failed: ${failed_tests}/${stability_tests}" - echo " Success rate: $((passed_tests * 100 / stability_tests))%" - - if [ $passed_tests -ge $success_threshold ]; then - echo "โœ… Stability test PASSED (${passed_tests}/${stability_tests} >= ${success_threshold})" - - # Final summary - echo "" - echo "๐ŸŽ‰ Connection test completed successfully!" - echo "๐Ÿ“Š Final Summary:" - echo " Target: ${target_host}:${target_port}" - echo " Connection attempts: ${attempt}" - echo " Stability: ${passed_tests}/${stability_tests} tests passed" - echo " Overall status: HEALTHY with acceptable intermittency" - echo "โฐ Completed at: $(date)" - - exit 0 - else - echo "โŒ Stability test FAILED (${passed_tests}/${stability_tests} < ${success_threshold})" - echo "" - echo "โš ๏ธ The service is reachable but unstable." - echo "๐Ÿ’ก This often indicates:" - echo " - StatefulSet pods are still starting up" - echo " - Load balancer health checks are cycling" - echo " - Resource limits causing pod restarts" - echo " - Network connectivity issues" - echo "" - echo "๐Ÿ” Recommended actions:" - echo " 1. 
Check pod status: kubectl get pods -n {{ .Release.Namespace }}" - echo " 2. Check pod logs: kubectl logs -n {{ .Release.Namespace }}" - echo " 3. Check service endpoints: kubectl get endpoints -n {{ .Release.Namespace }}" - echo " 4. Wait longer and re-run the test" - - exit 1 - fi -{{- end }} \ No newline at end of file diff --git a/charts/iap/templates/tests/test-post-install.yaml b/charts/iap/templates/tests/test-post-install.yaml new file mode 100644 index 0000000..8184ea7 --- /dev/null +++ b/charts/iap/templates/tests/test-post-install.yaml @@ -0,0 +1,310 @@ +{{- if .Values.postInstallTests.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ include "iap.fullname" . }}-test-post-install" + labels: + {{- include "iap.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": before-hook-creation +spec: + ttlSecondsAfterFinished: {{ .Values.postInstallTests.ttlSecondsAfterFinished | default 300 }} + template: + metadata: + labels: + {{- include "iap.labels" . | nindent 8 }} + spec: + restartPolicy: Never + serviceAccountName: "{{ include "iap.fullname" . }}-test" + containers: + - name: post-install-test + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -c + - | + echo "๐Ÿš€ Starting Platform Post-Installation Comprehensive Test Suite..." 
+ echo "โฐ Test started at: $(date)" + echo "" + + # Test configuration + max_wait={{ .Values.postInstallTests.readinessTimeout | default 300 }} + wait_time=0 + retry_interval=15 + replica_count={{ .Values.replicaCount | default 1 }} + namespace="{{ .Release.Namespace }}" + protocol="{{- if .Values.useTLS }}https{{- else }}http{{- end }}" + expected_version="{{ .Values.image.tag }}" + + # Test paths + health_path="/health/status" + version_path="/version" + + # Process configuration + required_processes="Pronghorn_core Pronghorn_Jst_Application Pronghorn_TemplateBuilder_Application Pronghorn_AGManager_Application Pronghorn_Tags_Application Pronghorn_FormBuilder_Application Pronghorn_JsonForms_Application Pronghorn_MOP_Application Pronghorn_AutomationStudio_Application Pronghorn_OperationsManager_Application Pronghorn_GatewayManager_Application Pronghorn_WorkflowBuilder_Application Pronghorn_WorkFlowEngine_Application Pronghorn_Search_Application" + optional_processes="{{- range .Values.postInstallTests.processCountTest.optionalProcesses }}{{ . 
| replace " " "_" }} {{- end }}" + + # Count processes + required_count=$(echo ${required_processes} | wc -w) + optional_count=$(echo ${optional_processes} | wc -w) + + echo "๐Ÿ“‹ Test Configuration:" + echo " Protocol: ${protocol}" + echo " Replicas: ${replica_count}" + echo " Namespace: ${namespace}" + echo " Expected version: ${expected_version}" + echo " Required processes: ${required_count}" + echo " Optional processes: ${optional_count}" + echo " Max wait time: ${max_wait}s" + echo " Retry interval: ${retry_interval}s" + echo "" + + # Function to test health endpoint for a pod + test_health_endpoint() { + local pod_index=$1 + local pod_url="${protocol}://{{ .Values.service.name }}-headless-${pod_index}.${namespace}.svc.cluster.local:{{ .Values.service.port }}${health_path}" + + echo " ๐Ÿ” Health check for pod ${pod_index}: ${pod_url}" + + if timeout 15 curl -k -s --connect-timeout 10 --max-time 10 "${pod_url}" > /tmp/health_response_${pod_index} 2>/dev/null; then + response=$(cat /tmp/health_response_${pod_index}) + + # Parse health status + apps_status=$(echo "${response}" | grep -o '"apps":"[^"]*"' | cut -d'"' -f4) + adapters_status=$(echo "${response}" | grep -o '"adapters":"[^"]*"' | cut -d'"' -f4) + redis_status=$(echo "${response}" | grep -o '{"service":"redis","status":"[^"]*"}' | grep -o '"status":"[^"]*"' | cut -d'"' -f4) + mongo_status=$(echo "${response}" | grep -o '{"service":"mongo","status":"[^"]*"}' | grep -o '"status":"[^"]*"' | cut -d'"' -f4) + + # Check all components + all_healthy=true + if [ "${apps_status}" != "running" ]; then + echo " โŒ Apps status: ${apps_status} (expected: running)" + all_healthy=false + else + echo " โœ… Apps: running" + fi + + if [ "${adapters_status}" != "running" ]; then + echo " โŒ Adapters status: ${adapters_status} (expected: running)" + all_healthy=false + else + echo " โœ… Adapters: running" + fi + + if [ "${redis_status}" != "running" ]; then + echo " โŒ Redis status: ${redis_status} (expected: 
running)" + all_healthy=false + else + echo " โœ… Redis: running" + fi + + if [ "${mongo_status}" != "running" ]; then + echo " โŒ Mongo status: ${mongo_status} (expected: running)" + all_healthy=false + else + echo " โœ… Mongo: running" + fi + + [ "${all_healthy}" = "true" ] && return 0 || return 1 + else + echo " โŒ Health endpoint request failed" + return 1 + fi + } + + # Function to test version endpoint for a pod + test_version_endpoint() { + local pod_index=$1 + local pod_url="${protocol}://{{ .Values.service.name }}-headless-${pod_index}.${namespace}.svc.cluster.local:{{ .Values.service.port }}${version_path}" + + echo " ๐Ÿ” Version check for pod ${pod_index}: ${pod_url}" + + if timeout 15 curl -k -s --connect-timeout 10 --max-time 10 "${pod_url}" > /tmp/version_response_${pod_index} 2>/dev/null; then + response=$(cat /tmp/version_response_${pod_index}) + echo " ๐Ÿ“„ Version Installed: ${response}" + + if echo "${response}" | grep -q "${expected_version}"; then + echo " โœ… Version matches: ${expected_version}" + return 0 + else + echo " โŒ Version mismatch - expected: ${expected_version}" + return 1 + fi + else + echo " โŒ Version endpoint request failed" + return 1 + fi + } + + # Function to test process count for a pod + test_process_count() { + local pod_index=$1 + local pod_name="{{ include "iap.fullname" . }}-${pod_index}" + + echo " ๐Ÿ” Process check for pod ${pod_index}: ${pod_name}" + + # Check if pod exists and is running + if ! kubectl get pod "${pod_name}" -n "${namespace}" >/dev/null 2>&1; then + echo " โŒ Pod ${pod_name} not found" + return 1 + fi + + local pod_status=$(kubectl get pod "${pod_name}" -n "${namespace}" -o jsonpath='{.status.phase}') + if [ "${pod_status}" != "Running" ]; then + echo " โŒ Pod ${pod_name} is not running (status: ${pod_status})" + return 1 + fi + + # Get process list + if ! 
process_list=$(kubectl exec -n "${namespace}" "${pod_name}" -c iap -- ps aux 2>/dev/null); then + echo " โŒ Failed to get process list from pod" + return 1 + fi + + # Check required processes + required_found=0 + required_missing=0 + missing_list="" + + for process in ${required_processes}; do + process_name=$(echo "${process}" | sed 's/_/ /g') + if echo "${process_list}" | grep -q "${process_name}"; then + required_found=$((required_found + 1)) + else + required_missing=$((required_missing + 1)) + missing_list="${missing_list}\\n - ${process_name}" + fi + done + + # Check optional processes + optional_found=0 + for process in ${optional_processes}; do + process_name=$(echo "${process}" | sed 's/_/ /g') + if echo "${process_list}" | grep -q "${process_name}"; then + optional_found=$((optional_found + 1)) + fi + done + + echo " ๐Ÿ“Š Processes: ${required_found}/${required_count} required, ${optional_found}/${optional_count} optional" + + if [ ${required_missing} -eq 0 ]; then + echo " โœ… All required processes running" + [ ${optional_found} -gt 0 ] && echo " ๐ŸŽ‰ ${optional_found} optional process(es) also running" + return 0 + else + echo " โŒ Missing ${required_missing} required process(es)" + return 1 + fi + } + + # Function to run comprehensive test for a single pod + test_pod_comprehensive() { + local pod_index=$1 + echo "๐Ÿ” Testing pod ${pod_index} - Comprehensive Check" + + local health_ok=false + local version_ok=false + local process_ok=false + + # Test health endpoint + if test_health_endpoint $pod_index; then + health_ok=true + fi + + # Test version endpoint + if test_version_endpoint $pod_index; then + version_ok=true + fi + + # Test process count + if test_process_count $pod_index; then + process_ok=true + fi + + # Summary for this pod + echo "" + echo " ๐Ÿ“Š Pod ${pod_index} Summary:" + echo " Health: $([ "$health_ok" = "true" ] && echo "โœ… PASS" || echo "โŒ FAIL")" + echo " Version: $([ "$version_ok" = "true" ] && echo "โœ… PASS" || echo "โŒ 
FAIL")" + echo " Processes: $([ "$process_ok" = "true" ] && echo "โœ… PASS" || echo "โŒ FAIL")" + + # Pod passes if all tests pass + [ "$health_ok" = "true" ] && [ "$version_ok" = "true" ] && [ "$process_ok" = "true" ] + } + + # Main test loop + echo "๐Ÿ”„ Starting comprehensive test cycle..." + all_tests_passed=false + + while [ $wait_time -lt $max_wait ] && [ "$all_tests_passed" = "false" ]; do + echo "" + echo "=== Test Round $(( (wait_time / retry_interval) + 1 )) ===" + + passed_count=0 + failed_count=0 + + # Test each replica + for i in $(seq 0 $((replica_count - 1))); do + if test_pod_comprehensive $i; then + passed_count=$((passed_count + 1)) + else + failed_count=$((failed_count + 1)) + fi + echo "" + done + + echo "" + echo "๐Ÿ“Š Round Summary:" + echo " Pods passed all tests: ${passed_count}/${replica_count}" + echo " Pods with failures: ${failed_count}/${replica_count}" + echo " Time elapsed: ${wait_time}s" + + if [ $passed_count -eq $replica_count ]; then + all_tests_passed=true + echo " โœ… All pods passed comprehensive testing!" + else + echo " โณ Waiting ${retry_interval}s before next test round..." + sleep $retry_interval + wait_time=$((wait_time + retry_interval)) + fi + done + + # Final results + if [ "$all_tests_passed" = "true" ]; then + echo "" + echo "" + echo "๐ŸŽ‰ Platform POST-INSTALLATION TESTS PASSED!" + echo "๐Ÿ“‹ Final Results:" + echo " โœ… All ${replica_count} pod(s) passed comprehensive testing" + echo " โœ… Health endpoints: All components healthy" + echo " โœ… Version verification: Expected version deployed" + echo " โœ… Process verification: All required processes running" + echo " โฑ๏ธ Total test duration: ${wait_time}s" + echo "โฐ Test completed at: $(date)" + + exit 0 + else + echo "" + echo "" + echo "โŒ Platform POST-INSTALLATION TESTS FAILED!" 
+ echo "๐Ÿ’” Not all pods passed comprehensive testing after ${max_wait}s" + echo "" + echo "๐Ÿ” Final diagnostic information available in pod logs" + echo "๐Ÿ’ก Common issues:" + echo " - Application still starting up" + echo " - Resource constraints" + echo " - Configuration issues" + echo " - Network connectivity problems" + echo "" + echo "๐Ÿ”ง Troubleshooting:" + echo " 1. Check pod logs: kubectl logs -n ${namespace} -c iap" + echo " 2. Check pod status: kubectl get pods -n ${namespace}" + echo " 3. Check resource usage: kubectl top pod -n ${namespace}" + echo " 4. Verify configuration and secrets" + + exit 1 + fi +{{- end }} \ No newline at end of file diff --git a/charts/iap/templates/tests/test-rbac.yaml b/charts/iap/templates/tests/test-rbac.yaml new file mode 100644 index 0000000..94d9fe1 --- /dev/null +++ b/charts/iap/templates/tests/test-rbac.yaml @@ -0,0 +1,50 @@ +{{- if .Values.postInstallTests.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "{{ include "iap.fullname" . }}-test" + labels: + {{- include "iap.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "-1" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: "{{ include "iap.fullname" . }}-test" + labels: + {{- include "iap.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "-1" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +- apiGroups: [""] + resources: ["pods/exec"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: "{{ include "iap.fullname" . }}-test" + labels: + {{- include "iap.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": test + "helm.sh/hook-weight": "-1" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: "{{ include "iap.fullname" . }}-test" +subjects: +- kind: ServiceAccount + name: "{{ include "iap.fullname" . }}-test" + namespace: {{ .Release.Namespace }} +{{- end }} \ No newline at end of file diff --git a/charts/iap/templates/tests/test-readiness.yaml b/charts/iap/templates/tests/test-readiness.yaml deleted file mode 100644 index 7bf4e48..0000000 --- a/charts/iap/templates/tests/test-readiness.yaml +++ /dev/null @@ -1,70 +0,0 @@ -{{- if .Values.tests.enabled }} -{{- $protocol := "http" -}} -{{- if .Values.useTLS -}} - {{- $protocol = "https" -}} -{{- end -}} -apiVersion: v1 -kind: Pod -metadata: - name: "{{ include "iap.fullname" . }}-test-readiness" - annotations: - "helm.sh/hook": test - "helm.sh/hook-weight": "2" - "helm.sh/hook-delete-policy": before-hook-creation -spec: - restartPolicy: Never - containers: - - name: readiness-test - image: curlimages/curl:latest - command: - - /bin/sh - - -c - - | - echo "Waiting for StatefulSet to be ready..." - - # Configuration - max_wait={{ .Values.tests.readinessTimeout | default 600 }} - wait_time=0 - replica_count={{ .Values.replicaCount | default 1 }} - protocol="{{ $protocol }}" - - echo "Configuration:" - echo " Max wait: ${max_wait}s" - echo " Replica count: ${replica_count}" - echo " Protocol: ${protocol}" - - while [ $wait_time -lt $max_wait ]; do - ready_count=0 - - echo "=== Check round $(($wait_time / 10 + 1)) ===" - - # Check each replica - for i in $(seq 0 $((replica_count - 1))); do - - pod_url="${protocol}://{{ include "iap.fullname" . 
}}-${i}.{{ .Values.service.name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.applicationPort }}/health/status" - - echo "Testing pod ${i}: ${pod_url}" - - if curl -f -s --connect-timeout 5 --max-time 10 "${pod_url}"; then - ready_count=$((ready_count + 1)) - echo "โœ… Replica ${i} is ready" - else - echo "โณ Replica ${i} is not ready yet" - fi - done - - if [ $ready_count -eq $replica_count ]; then - echo "๐ŸŽ‰ All replicas are ready! (${ready_count}/${replica_count})" - exit 0 - fi - - echo "๐Ÿ“Š Status: ($ready_count/$replica_count ready) - ${wait_time}s elapsed" - sleep 10 - wait_time=$((wait_time + 10)) # This was missing! - done - - echo "โŒ Timeout waiting for StatefulSet to be ready after ${max_wait}s" - echo "๐Ÿ“Š Final status: ($ready_count/$replica_count ready)" - exit 1 - -{{- end }} \ No newline at end of file diff --git a/charts/iap/values.yaml b/charts/iap/values.yaml index d35c113..99f7a0a 100644 --- a/charts/iap/values.yaml +++ b/charts/iap/values.yaml @@ -56,13 +56,23 @@ initAdapterInstaller: # branch: "v1.2.3" # Clones specific branch/tag -tests: +# Post-installation test configuration +postInstallTests: + # -- Enable or disable all post-install tests (helm test command) enabled: true + # -- Maximum time to wait for tests to pass before failing (in seconds) readinessTimeout: 300 - image: - repository: busybox - tag: latest - pullPolicy: IfNotPresent + # -- Time to live after test completion (in seconds) + # Job, pod resources including logs will be automatically deleted after this duration + ttlSecondsAfterFinished: 300 # 5 minutes default + # Process count verification configuration + processCountTest: + # -- Optional Itential applications that won't cause test failure if missing + optionalProcesses: + - "Pronghorn_LifecycleManager_Application" + - "Pronghorn_ConfigurationManager_Application" + - "Pronghorn_NSOManager_Application" + - "Pronghorn_AppArtifacts_Application" certManager: # -- Toggles the use of cert-manager for 
managing the TLS certificates. Setting this to false diff --git a/docs/helm-tests.md b/docs/helm-tests.md new file mode 100644 index 0000000..8dda7ba --- /dev/null +++ b/docs/helm-tests.md @@ -0,0 +1,386 @@ +# Platform Helm Tests + +This document describes the Helm tests available for the Itential Automation Platform (Platform) Helm chart. Helm tests are Kubernetes Jobs that run validation checks after a Helm chart is deployed to verify that your release is working correctly. + +## Overview + +Helm tests provide automated validation of your Platform deployment by running comprehensive checks against the deployed application. The Platform test suite validates: + +- โœ… **Health Endpoint**: Application health and all components (apps, adapters, redis, mongo) status +- โœ… **Version Verification**: Deployed version matches expected image tag +- โœ… **Process Count**: All required Pronghorn processes are running +- โœ… **Multi-Pod Support**: Tests all replicas individually via headless services +- โœ… **Automatic Cleanup**: Tests self-delete after configurable TTL period + +## Running Tests + +### Basic Test Execution + +Run the comprehensive post-install test for a deployed release: + +```bash +helm test -n +``` + +**Example:** +```bash +helm test iap -n development +``` + +### With Options + +```bash +# Run tests with extended timeout (default: 5 minutes) +helm test -n --timeout 10m + +# Run tests and keep detailed output +helm test -n --debug + +# Clean up previous test runs first +helm test -n --cleanup +``` + +### Viewing Test Results + +```bash +# View test job logs (recommended) +kubectl logs job/-test-post-install -n + +# View test job status +kubectl get jobs -n | grep test + +# Check test job details +kubectl describe job -test-post-install -n + +# View all test-related resources +kubectl get pods,jobs -n -l "helm.sh/hook=test" +``` + +## Test Configuration + +Tests are configured in the `values.yaml` file under the `postInstallTests` section: + +```yaml +# 
Post-installation test configuration +postInstallTests: + # -- Enable or disable all post-install tests (helm test command) + enabled: true + # -- Maximum time to wait for tests to pass before failing (in seconds) + readinessTimeout: 300 + # -- Time to live after test completion (in seconds) + # Job, pod resources including logs will be automatically deleted after this duration + ttlSecondsAfterFinished: 300 # 5 minutes default + # Process count verification configuration + processCountTest: + # -- Optional Itential applications that won't cause test failure if missing + optionalProcesses: + - "Pronghorn_LifecycleManager_Application" + - "Pronghorn_ConfigurationManager_Application" + - "Pronghorn_NSOManager_Application" + - "Pronghorn_AppArtifacts_Application" +``` + +### Configuration Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `postInstallTests.enabled` | Enable or disable all tests | `true` | +| `postInstallTests.readinessTimeout` | Maximum time to wait for application readiness (seconds) | `300` | +| `postInstallTests.ttlSecondsAfterFinished` | Automatic cleanup delay after test completion (seconds) | `300` | +| `postInstallTests.processCountTest.optionalProcesses` | List of optional processes that won't fail tests if missing | See values | + +## Comprehensive Post-Install Test + +**File**: `templates/tests/test-post-install.yaml` +**Resource Type**: `Job` (batch/v1) +**Hook Weight**: `1` +**Purpose**: Validates complete Platform deployment health and functionality + +### What it Tests + +The comprehensive test performs three validation checks on each pod replica: + +#### 1. Health Endpoint Validation +- โœ… Health endpoint (`/health/status`) returns HTTP 200 +- โœ… All replica pods respond individually via headless services +- โœ… Component status verification: + - `apps`: running + - `adapters`: running + - `redis`: running + - `mongo`: running + +#### 2. 
Version Verification +- โœ… Version endpoint (`/version`) returns expected version +- โœ… Deployed version matches `values.image.tag` +- โœ… Version consistency across all replicas + +#### 3. Process Count Validation +- โœ… **Required Processes** (14 core processes): + - Pronghorn_core + - Pronghorn_Jst_Application + - Pronghorn_TemplateBuilder_Application + - Pronghorn_AGManager_Application + - Pronghorn_Tags_Application + - Pronghorn_FormBuilder_Application + - Pronghorn_JsonForms_Application + - Pronghorn_MOP_Application + - Pronghorn_AutomationStudio_Application + - Pronghorn_OperationsManager_Application + - Pronghorn_GatewayManager_Application + - Pronghorn_WorkflowBuilder_Application + - Pronghorn_WorkFlowEngine_Application + - Pronghorn_Search_Application + +- โ„น๏ธ **Optional Processes** (configurable): + - Pronghorn_LifecycleManager_Application + - Pronghorn_ConfigurationManager_Application + - Pronghorn_NSOManager_Application + - Pronghorn_AppArtifacts_Application + +### Test Execution Flow + +1. **Initialization**: Configure test parameters and display test setup +2. **Multi-Round Testing**: Retry logic with 15-second intervals until timeout +3. **Per-Pod Validation**: Test each replica individually using headless services +4. **Comprehensive Scoring**: All three test types must pass for pod success +5. **Success Criteria**: All replicas must pass all tests +6. **Automatic Cleanup**: Job and pods deleted after TTL expires + +### Example Output + +``` +๐Ÿš€ Starting Platform Post-Installation Comprehensive Test Suite... +โฐ Test started at: Wed Nov 26 17:14:37 UTC 2025 + +๐Ÿ“‹ Test Configuration: + Protocol: https + Replicas: 1 + Namespace: na + Expected version: 6.1.1 + Required processes: 14 + Optional processes: 4 + Max wait time: 300s + Retry interval: 15s + +๐Ÿ”„ Starting comprehensive test cycle... 
+ +=== Test Round 1 === +๐Ÿ” Testing pod 0 - Comprehensive Check + ๐Ÿ” Health check for pod 0: https://iap-service-headless-0.local:443/health/status + โœ… Health endpoint responded + โœ… Apps: running + โœ… Adapters: running + โœ… Redis: running + โœ… Mongo: running + ๐Ÿ” Version check for pod 0: https://iap-service-headless-0.local:443/version + โœ… Version endpoint responded + ๐Ÿ“„ Version Installed: "6.1.1" + โœ… Version matches: 6.1.1 + ๐Ÿ” Process check for pod 0: iap-na-0 + ๐Ÿ“Š Processes: 14/14 required, 1/4 optional + โœ… All required processes running + ๐Ÿ“Š Pod 0 Summary: + Health: โœ… PASS + Version: โœ… PASS + Processes: โœ… PASS + +๐Ÿ“Š Round Summary: + Pods passed all tests: 1/1 + Pods with failures: 0/1 + Time elapsed: 0s + โœ… All pods passed comprehensive testing! + +๐ŸŽ‰ Platform POST-INSTALLATION TESTS PASSED! +๐Ÿ“‹ Final Results: + โœ… All 1 pod(s) passed comprehensive testing + โœ… Health endpoints: All components healthy + โœ… Version verification: Expected version deployed + โœ… Process verification: All required processes running + โฑ๏ธ Total test duration: 0s +โฐ Test completed at: Wed Nov 26 17:14:39 UTC 2025 +``` + +### Service Discovery + +The test uses **headless service URLs** for reliable pod-specific communication: + +- **Format**: `https://{service.name}-headless-{pod-index}.{namespace}.svc.cluster.local:{service.port}/{endpoint}` +- **Example**: `https://iap-service-headless-0.na.svc.cluster.local:443/health/status` + +This approach provides: +- Direct pod-to-pod communication +- Independence from external load balancers +- Faster, more reliable internal cluster networking +- Individual pod validation (important for StatefulSets) + +### Automatic Cleanup (TTL) + +The test uses Kubernetes Job with `ttlSecondsAfterFinished` for automatic resource cleanup: + +- โฑ๏ธ **Default TTL**: 300 seconds (5 minutes) +- ๐Ÿ” **Debugging Window**: Logs available during entire TTL period +- ๐Ÿ—‘๏ธ **Auto-Cleanup**: Job and pod resources 
automatically deleted after TTL +- โš™๏ธ **Configurable**: Adjust via `postInstallTests.ttlSecondsAfterFinished` + +**Timeline Example:** +1. Test runs and completes (success/failure) +2. TTL countdown begins (5 minutes) +3. Logs remain accessible: `kubectl logs job/iap-test-post-install -n na` +4. After 5 minutes: Job and pod are automatically deleted +5. No manual cleanup required + +### Troubleshooting + +If tests fail, check the following in order: + +#### 1. View Test Logs +```bash +# Get detailed test output +kubectl logs job/-test-post-install -n + +# If job is deleted, check recent events +kubectl get events -n --sort-by='.lastTimestamp' +``` + +#### 2. Check Application Status +```bash +# Verify IAP pods are running +kubectl get pods -n + +# Check application logs +kubectl logs -n -c iap + +# Verify services and endpoints +kubectl get svc,endpoints -n +``` + +#### 3. Manual Health Check +```bash +# Port-forward and test endpoints directly +kubectl port-forward 3443:3443 -n + +# Test health endpoint +curl -k https://localhost:3443/health/status + +# Test version endpoint +curl -k https://localhost:3443/version +``` + +#### 4. 
Process Validation
+```bash
+# Check processes in pod
+kubectl exec <pod-name> -n <namespace> -c iap -- ps aux | grep -i pronghorn
+```
+
+### Common Failure Scenarios
+
+| Issue | Symptoms | Solution |
+|-------|----------|----------|
+| **Application Starting** | Health/version endpoints fail initially | Wait longer or increase `readinessTimeout` |
+| **Database Connectivity** | Health endpoint shows mongo/redis not running | Check database connections and credentials |
+| **Resource Constraints** | Missing processes or pods failing | Increase CPU/memory requests/limits |
+| **Network Policies** | Connection timeouts | Verify network policies allow test pod → Platform communication |
+| **TLS Issues** | SSL/certificate errors | Check certificate configuration and trust chain |
+| **Version Mismatch** | Version check fails | Verify `values.image.tag` matches actual deployment |
+
+## Integration with CI/CD
+
+Helm tests integrate seamlessly with deployment pipelines:
+
+```bash
+#!/bin/bash
+set -e
+
+echo "Deploying Platform..."
+helm upgrade --install iap-prod ./chart -f values-prod.yaml -n production
+
+echo "Running comprehensive post-install tests..." 
+if helm test iap-prod -n production --timeout 10m; then + echo "โœ… Platform deployment validation successful" + echo "๐Ÿš€ Application is ready for traffic" +else + echo "โŒ Platform deployment validation failed" + echo "๐Ÿ“‹ Test logs:" + kubectl logs job/iap-prod-test-post-install -n production + exit 1 +fi +``` + +### GitLab CI Example + +```yaml +deploy_and_test: + stage: deploy + script: + - helm upgrade --install ${CI_ENVIRONMENT_NAME} ./chart -f values-${CI_ENVIRONMENT_NAME}.yaml -n ${CI_ENVIRONMENT_NAME} + - helm test ${CI_ENVIRONMENT_NAME} -n ${CI_ENVIRONMENT_NAME} --timeout 15m + environment: + name: ${CI_COMMIT_REF_SLUG} + only: + - main + - develop +``` + +## Development and Customization + +### Adding Optional Processes + +To add new optional processes that won't fail tests if missing: + +```yaml +postInstallTests: + processCountTest: + optionalProcesses: + - "Pronghorn_LifecycleManager_Application" + - "Pronghorn_ConfigurationManager_Application" + - "Pronghorn_NSOManager_Application" + - "Pronghorn_AppArtifacts_Application" + - "Pronghorn_CustomApp_Application" # Add your custom process +``` + +### Adjusting Timeouts + +For slower environments or longer startup times: + +```yaml +postInstallTests: + readinessTimeout: 600 # 10 minutes instead of default 5 + ttlSecondsAfterFinished: 1800 # Keep logs for 30 minutes +``` + +### Environment-Specific Configuration + +Different environments may need different test configurations: + +```yaml +# values-dev.yaml - More lenient for development +postInstallTests: + readinessTimeout: 600 + ttlSecondsAfterFinished: 1800 # Keep logs longer for debugging + +# values-prod.yaml - Strict for production +postInstallTests: + readinessTimeout: 300 + ttlSecondsAfterFinished: 300 # Clean up quickly +``` + +## Best Practices + +1. **Run tests after every deployment** to catch issues early +2. **Monitor test results** in CI/CD pipelines for automated validation +3. 
**Adjust timeouts** based on environment performance characteristics +4. **Review test logs** when failures occur for detailed diagnostics +5. **Keep TTL reasonable** - balance debugging needs with resource cleanup +6. **Test in staging** before deploying to production environments + +## Contributing + +When modifying the test suite: + +1. Update test templates in `templates/tests/test-post-install.yaml` +2. Update values files if new configuration options are needed +3. Test changes against running deployment +4. Update this documentation with new features or changes +5. Ensure backward compatibility with existing deployments + +For questions or issues with Helm tests, check the [Troubleshooting](#troubleshooting) section or review the application deployment logs. \ No newline at end of file