diff --git a/cmd/testserver/run.go b/cmd/testserver/run.go
index c3acf553..3e1858a6 100644
--- a/cmd/testserver/run.go
+++ b/cmd/testserver/run.go
@@ -19,6 +19,7 @@ import (
 	"k8s.io/client-go/util/flowcontrol"
 
 	"github.com/castai/cluster-controller/internal/helm"
+	"github.com/castai/cluster-controller/internal/metrics"
 	"github.com/castai/cluster-controller/loadtest"
 	"github.com/castai/cluster-controller/loadtest/scenarios"
 )
@@ -28,6 +29,8 @@ func run(ctx context.Context) error {
 	cfg := loadtest.GetConfig()
 
 	logger.Info("creating test server")
+	loadtest.RegisterTestMetrics(metrics.GetRegistry())
+
 	testServer := loadtest.NewTestServer(logger, loadtest.TestServerConfig{
 		MaxActionsPerCall:        1000,
 		TimeoutWaitingForActions: 60 * time.Second,
@@ -51,7 +54,7 @@ func run(ctx context.Context) error {
 	// Choose scenarios below by adding/removing/etc. instances of scenarios.XXX()
 	// All scenarios in the list run in parallel (but not necessarily at the same time if preparation takes different time).
 	testScenarios := []scenarios.TestScenario{
-		scenarios.CheckNodeStatus(5000, logger),
+		scenarios.DrainNode(1, 10, logger),
 	}
 
 	logger.Info("Starting continuous test scenario execution")
@@ -88,8 +91,10 @@ func run(ctx context.Context) error {
 
 		if len(receivedErrors) > 0 {
 			logger.Error(fmt.Sprintf("Iteration %d completed with (%d) errors: %v", iteration, len(receivedErrors), errors.Join(receivedErrors...)))
+			loadtest.IncrementTestRunFailure(len(receivedErrors))
 		} else {
 			logger.Info(fmt.Sprintf("Iteration %d completed successfully", iteration))
+			loadtest.IncrementTestRunSuccess()
 		}
 
 		logger.Info("Waiting 1 minute before next iteration")
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index a0fd6334..712313ce 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -30,3 +30,8 @@ func NewMetricsMux() *http.ServeMux {
 func Gather() ([]*dto.MetricFamily, error) {
 	return registry.Gather()
 }
+
+// GetRegistry returns the prometheus registry for registering additional metrics.
+func GetRegistry() prometheus.Registerer {
+	return registry
+}
diff --git a/loadtest/http.go b/loadtest/http.go
index c0e106ec..49e7aa46 100644
--- a/loadtest/http.go
+++ b/loadtest/http.go
@@ -7,10 +7,19 @@ import (
 	"net/http"
 
 	"github.com/castai/cluster-controller/internal/castai"
+	"github.com/castai/cluster-controller/internal/metrics"
 )
 
 func NewHttpServer(ctx context.Context, cfg Config, testServer *CastAITestServer) error {
-	http.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions", func(w http.ResponseWriter, r *http.Request) {
+	mux := http.NewServeMux()
+
+	// Register metrics endpoint - metrics.NewMetricsMux() already serves "/metrics" itself, so the request is forwarded to it unchanged.
+	metricsMux := metrics.NewMetricsMux()
+	mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
+		metricsMux.ServeHTTP(w, r)
+	})
+
+	mux.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions", func(w http.ResponseWriter, r *http.Request) {
 		result, err := testServer.GetActions(r.Context(), "")
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
@@ -29,7 +38,7 @@ func NewHttpServer(ctx context.Context, cfg Config, testServer *CastAITestServer
 		}
 	})
 
-	http.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions/{action_id}/ack", func(w http.ResponseWriter, r *http.Request) {
+	mux.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions/{action_id}/ack", func(w http.ResponseWriter, r *http.Request) {
 		actionID := r.PathValue("action_id")
 		var req castai.AckClusterActionRequest
 		err := json.NewDecoder(r.Body).Decode(&req)
@@ -45,7 +54,7 @@ func NewHttpServer(ctx context.Context, cfg Config, testServer *CastAITestServer
 		}
 	})
 
-	http.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions/logs", func(w http.ResponseWriter, r *http.Request) {
+	mux.HandleFunc("/v1/kubernetes/clusters/{cluster_id}/actions/logs", func(w http.ResponseWriter, r *http.Request) {
 		var req castai.LogEntry
 		err := json.NewDecoder(r.Body).Decode(&req)
 		if err != nil {
@@ -61,5 +70,5 @@ func NewHttpServer(ctx context.Context, cfg Config, testServer *CastAITestServer
 	})
 
 	//nolint:gosec // Missing timeouts are not a real issue here.
-	return http.ListenAndServe(fmt.Sprintf(":%d", cfg.Port), nil)
+	return http.ListenAndServe(fmt.Sprintf(":%d", cfg.Port), mux)
 }
diff --git a/loadtest/metrics.go b/loadtest/metrics.go
new file mode 100644
index 00000000..076eaefb
--- /dev/null
+++ b/loadtest/metrics.go
@@ -0,0 +1,38 @@
+package loadtest
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// testRunCounter tracks test run iterations by status.
+var testRunCounter = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Name: "test_run_total",
+		Help: "Count of test run iterations by status (success/failed).",
+	},
+	[]string{"status"},
+)
+
+// testRunErrorsCounter tracks total number of errors in failed test runs.
+var testRunErrorsCounter = prometheus.NewCounter(
+	prometheus.CounterOpts{
+		Name: "test_run_errors_total",
+		Help: "Total count of errors across all failed test runs.",
+	},
+)
+
+func IncrementTestRunSuccess() {
+	testRunCounter.With(prometheus.Labels{"status": "success"}).Inc()
+}
+
+func IncrementTestRunFailure(errorCount int) {
+	testRunCounter.With(prometheus.Labels{"status": "failed"}).Inc()
+	testRunErrorsCounter.Add(float64(errorCount))
+}
+
+func RegisterTestMetrics(registry prometheus.Registerer) {
+	registry.MustRegister(
+		testRunCounter,
+		testRunErrorsCounter,
+	)
+}
diff --git a/loadtest/scenarios/drain_node.go b/loadtest/scenarios/drain_node.go
index 15d1d1e8..024ec4b1 100644
--- a/loadtest/scenarios/drain_node.go
+++ b/loadtest/scenarios/drain_node.go
@@ -68,7 +68,6 @@ func (s *drainNodeScenario) Preparation(ctx context.Context, namespace string, c
 	deployment.Namespace = namespace
 	//nolint:gosec // Not afraid of overflow here.
 	deployment.Spec.Replicas = lo.ToPtr(int32(s.deploymentReplicas))
-	deployment.Spec.Template.Spec.NodeName = nodeName
 
 	_, err = clientset.AppsV1().Deployments(namespace).Create(ctx, deployment, metav1.CreateOptions{})
 	if err != nil {
@@ -92,7 +91,9 @@ func (s *drainNodeScenario) Preparation(ctx context.Context, namespace string, c
 		})
 	}
 
-	return errGroup.Wait()
+	err = errGroup.Wait()
+
+	return err
 }
 
 func (s *drainNodeScenario) Cleanup(ctx context.Context, namespace string, clientset kubernetes.Interface) error {
@@ -160,7 +161,8 @@ func (s *drainNodeScenario) Run(ctx context.Context, _ string, _ kubernetes.Inte
 			CreatedAt: time.Now().UTC(),
 			ActionDrainNode: &castai.ActionDrainNode{
 				NodeName:            node.Name,
-				NodeID:              "",
+				ProviderId:          node.Name,
+				NodeID:              node.Name,
 				DrainTimeoutSeconds: 60,
 				Force:               true,
 			},
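
A minimal sketch of how the new counters in loadtest/metrics.go could be exercised from a package-level test. The test name and file are illustrative (not part of this patch), and the assertions assume the package-level counters start at zero in the test process.

package loadtest

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// Illustrative test: registers the collectors on a fresh registry and checks
// that the increment helpers move the expected series.
func TestTestRunCounters(t *testing.T) {
	registry := prometheus.NewRegistry()
	RegisterTestMetrics(registry)

	IncrementTestRunSuccess()
	IncrementTestRunFailure(3)

	if got := testutil.ToFloat64(testRunCounter.WithLabelValues("success")); got != 1 {
		t.Errorf("want 1 successful run, got %v", got)
	}
	if got := testutil.ToFloat64(testRunCounter.WithLabelValues("failed")); got != 1 {
		t.Errorf("want 1 failed run, got %v", got)
	}
	if got := testutil.ToFloat64(testRunErrorsCounter); got != 3 {
		t.Errorf("want 3 recorded errors, got %v", got)
	}
}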
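
A second sketch, checking the /metrics wiring end to end. It assumes metrics.NewMetricsMux() serves the same shared registry that GetRegistry() returns, which is what the loadtest/http.go change relies on; the test name is again illustrative.

package loadtest

import (
	"io"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"

	"github.com/castai/cluster-controller/internal/metrics"
)

// Illustrative test: register the load-test counters on the shared registry,
// serve the metrics mux on a temporary listener, and inspect the scrape output.
func TestMetricsEndpointExposesTestRunCounters(t *testing.T) {
	RegisterTestMetrics(metrics.GetRegistry())
	IncrementTestRunSuccess()

	srv := httptest.NewServer(metrics.NewMetricsMux())
	defer srv.Close()

	resp, err := http.Get(srv.URL + "/metrics")
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	if !strings.Contains(string(body), "test_run_total") {
		t.Errorf("expected scrape output to contain test_run_total, got:\n%s", body)
	}
}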