diff --git a/charts/README.md b/charts/README.md index d6e844c0..0643b7a8 100644 --- a/charts/README.md +++ b/charts/README.md @@ -44,7 +44,7 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | ports.api | int | `8000` | API server port | | ports.health | int | `8080` | Health check endpoint port | | ports.metrics | int | `9090` | Prometheus metrics endpoint port | -| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. | +| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. | | config.existingConfigMap | string | `""` | Use an existing ConfigMap instead of generating one. When set, all other `config.*` values are ignored. | | config.server | object | `{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}` | HTTP server settings | | config.server.hostname | string | `""` | Public hostname advertised by the API (leave empty for auto-detect) | @@ -93,13 +93,13 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | config.logging.masking.enabled | bool | `true` | Enable log masking | | config.logging.masking.headers | list | `["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]` | HTTP headers whose values are redacted in logs | | config.logging.masking.fields | list | `["password","secret","token","api_key","access_token","refresh_token","client_secret"]` | Field names whose values are redacted in logs | -| config.metrics | object | `{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}}` | Prometheus metrics endpoint settings | +| config.metrics | object | `{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}}` | Prometheus metrics endpoint settings | | config.metrics.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for in-cluster access) | | config.metrics.port | int | `9090` | Listen port (must match `ports.metrics`) | | config.metrics.tls | object | `{"enabled":false}` | TLS configuration for the metrics endpoint | | config.metrics.tls.enabled | bool | `false` | Enable TLS on the metrics endpoint | | config.metrics.label_metrics_inclusion_duration | string | `"168h"` | Duration window for label-based metric inclusion | -| config.metrics.deletion_stuck_threshold | string | `"30m"` | Threshold after which a deletion is considered stuck | +| config.metrics.reconciliation_stuck_threshold | string | `"10m"` | Threshold after which a pending reconciliation is considered stuck | | config.health | object | `{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}}` | Health check endpoint settings | | config.health.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for probe access) | | config.health.port | int | `8080` | Listen port (must match `ports.health`) | diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml index 79aac88d..8489f728 100644 --- a/charts/templates/prometheusrule.yaml +++ b/charts/templates/prometheusrule.yaml @@ -15,30 +15,30 @@ metadata: {{- end }} spec: groups: - - name: hyperfleet-api-deletion + - name: hyperfleet-api-reconciliation rules: - - alert: HyperFleetResourceDeletionStuckWarning - expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + - alert: HyperFleetResourceReconciliationStuckWarning + expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 for: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} labels: severity: warning annotations: - summary: "HyperFleet resources stuck in Pending Deletion state" + summary: "HyperFleet resources stuck pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in - Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been + pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} (alert delay). runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }} - - alert: HyperFleetResourceDeletionStuckCritical - expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + - alert: HyperFleetResourceReconciliationStuckCritical + expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 for: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} labels: severity: critical annotations: - summary: "HyperFleet resources timed out in Pending Deletion state" + summary: "HyperFleet resources timed out pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in - Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been + pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} (alert delay). Immediate investigation required. runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }} {{- end }} diff --git a/charts/values.schema.json b/charts/values.schema.json index 2d52392f..eb8d20ed 100644 --- a/charts/values.schema.json +++ b/charts/values.schema.json @@ -351,9 +351,9 @@ "type": "string", "description": "Duration window for including label-based metrics (Go duration, e.g. 168h)" }, - "deletion_stuck_threshold": { + "reconciliation_stuck_threshold": { "type": "string", - "description": "Duration after which a pending deletion is considered stuck (Go duration, e.g. 30m)" + "description": "Duration after which a pending reconciliation is considered stuck (Go duration, e.g. 10m)" } } }, diff --git a/charts/values.yaml b/charts/values.yaml index 4e8918bd..477328aa 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -178,8 +178,8 @@ config: # -- Duration window for label-based metric inclusion label_metrics_inclusion_duration: 168h - # -- Threshold after which a deletion is considered stuck - deletion_stuck_threshold: 30m + # -- Threshold after which a pending reconciliation is considered stuck + reconciliation_stuck_threshold: 10m # -- Health check endpoint settings health: diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 040bcc54..6f8df5c6 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -131,11 +131,11 @@ func runServe(cmd *cobra.Command, args []string) { ).Info("Logger initialized") if sf := environments.Environment().Database.SessionFactory; sf != nil { - if err := metrics.RegisterCollector( + if err := metrics.RegisterReconciliationCollector( sf.DirectDB(), - environments.Environment().Config.Metrics.DeletionStuckThreshold, + environments.Environment().Config.Metrics.ReconciliationStuckThreshold, ); err != nil { - logger.WithError(ctx, err).Error("Failed to register pending deletion collector") + logger.WithError(ctx, err).Error("Failed to register reconciliation collector") } } diff --git a/cmd/hyperfleet-api/server/metrics_middleware.go b/cmd/hyperfleet-api/server/metrics_middleware.go index 54ac053a..14ed8800 100755 --- a/cmd/hyperfleet-api/server/metrics_middleware.go +++ b/cmd/hyperfleet-api/server/metrics_middleware.go @@ -113,7 +113,7 @@ func ResetMetricCollectors() { requestCountMetric.Reset() requestDurationMetric.Reset() db_metrics.ResetMetrics() - metrics.ResetMetrics() + metrics.ResetReconciliationMetrics() buildInfoMetric.Reset() buildInfoMetric.With(prometheus.Labels{ metricsComponentLabel: metricsComponentValue, diff --git a/pkg/config/flags.go b/pkg/config/flags.go index 0bb4cef1..b43a1027 100644 --- a/pkg/config/flags.go +++ b/pkg/config/flags.go @@ -89,8 +89,8 @@ func AddMetricsFlags(cmd *cobra.Command) { cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics") cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration, "Duration for cluster telemetry label inclusion") - cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold, - "Duration after which a pending deletion resource is considered stuck") + cmd.Flags().Duration("metrics-reconciliation-stuck-threshold", defaults.ReconciliationStuckThreshold, + "Duration after which a pending reconciliation resource is considered stuck") } // AddHealthFlags adds health check configuration flags following standard naming diff --git a/pkg/config/loader.go b/pkg/config/loader.go index 0d173822..74990077 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -353,7 +353,7 @@ func (l *ConfigLoader) bindAllEnvVars() { l.bindEnv("metrics.port") l.bindEnv("metrics.tls.enabled") l.bindEnv("metrics.label_metrics_inclusion_duration") - l.bindEnv("metrics.deletion_stuck_threshold") + l.bindEnv("metrics.reconciliation_stuck_threshold") // Health config l.bindEnv("health.host") @@ -421,8 +421,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) { l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file")) l.bindPFlag("metrics.label_metrics_inclusion_duration", cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration")) - l.bindPFlag("metrics.deletion_stuck_threshold", - cmd.Flags().Lookup("metrics-deletion-stuck-threshold")) + l.bindPFlag("metrics.reconciliation_stuck_threshold", + cmd.Flags().Lookup("metrics-reconciliation-stuck-threshold")) // Health flags: --health-* -> health.* l.bindPFlag("health.host", cmd.Flags().Lookup("health-host")) diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go index 1ee67ea4..0b6ec28a 100755 --- a/pkg/config/metrics.go +++ b/pkg/config/metrics.go @@ -14,7 +14,7 @@ type MetricsConfig struct { TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"` Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"` LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll - DeletionStuckThreshold time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll + ReconciliationStuckThreshold time.Duration `mapstructure:"reconciliation_stuck_threshold" json:"reconciliation_stuck_threshold" validate:"required"` //nolint:lll } // NewMetricsConfig returns default MetricsConfig values @@ -27,14 +27,14 @@ func NewMetricsConfig() *MetricsConfig { Enabled: false, }, LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days - DeletionStuckThreshold: 30 * time.Minute, + ReconciliationStuckThreshold: 10 * time.Minute, } } // Validate validates MetricsConfig fields that struct tags cannot enforce func (m *MetricsConfig) Validate() error { - if m.DeletionStuckThreshold <= 0 { - return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold) + if m.ReconciliationStuckThreshold <= 0 { + return fmt.Errorf("ReconciliationStuckThreshold must be positive, got %v", m.ReconciliationStuckThreshold) } return nil } diff --git a/pkg/metrics/deletion.go b/pkg/metrics/deletion.go deleted file mode 100644 index 6ec55873..00000000 --- a/pkg/metrics/deletion.go +++ /dev/null @@ -1,169 +0,0 @@ -/* -Copyright (c) 2026 Red Hat, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "context" - "database/sql" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" -) - -const metricsSubsystem = "hyperfleet_api" - -const ( - labelResourceType = "resource_type" - labelComponent = "component" - labelVersion = "version" -) - -const componentValue = "api" - -var deletionLabels = []string{labelResourceType, labelComponent, labelVersion} - -var pendingDeletionDurationBuckets = []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} - -var pendingDeletionTotal = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: metricsSubsystem, - Name: "resource_pending_deletion_total", - Help: "Total number of resources that entered the Pending Deletion state (deleted_time set).", - }, - deletionLabels, -) - -var pendingDeletionDuration = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: metricsSubsystem, - Name: "resource_pending_deletion_duration_seconds", - Help: "Duration from pending deletion (deleted_time set) to hard-delete completion in seconds.", - Buckets: pendingDeletionDurationBuckets, - }, - deletionLabels, -) - -var registerOnce sync.Once - -func RegisterMetrics() { - registerOnce.Do(func() { - prometheus.MustRegister(pendingDeletionTotal) - prometheus.MustRegister(pendingDeletionDuration) - }) -} - -func init() { - RegisterMetrics() -} - -func RecordPendingDeletion(resourceType string) { - pendingDeletionTotal.With(prometheus.Labels{ - labelResourceType: resourceType, - labelComponent: componentValue, - labelVersion: api.Version, - }).Inc() -} - -func ObservePendingDeletionDuration(resourceType string, deletedAt time.Time) { - duration := time.Since(deletedAt).Seconds() - pendingDeletionDuration.With(prometheus.Labels{ - labelResourceType: resourceType, - labelComponent: componentValue, - labelVersion: api.Version, - }).Observe(duration) -} - -func ResetMetrics() { - pendingDeletionTotal.Reset() - pendingDeletionDuration.Reset() -} - -// PendingDeletionCollector implements prometheus.Collector to report the number of -// resources stuck in Pending Deletion state beyond a configurable threshold. -// It queries the database on each Prometheus scrape. -const defaultQueryTimeout = 30 * time.Second - -type PendingDeletionCollector struct { - stuckDesc *prometheus.Desc - db *sql.DB - stuckThreshold time.Duration - queryTimeout time.Duration -} - -func NewPendingDeletionCollector(db *sql.DB, stuckThreshold time.Duration) *PendingDeletionCollector { - return &PendingDeletionCollector{ - db: db, - stuckThreshold: stuckThreshold, - queryTimeout: defaultQueryTimeout, - stuckDesc: prometheus.NewDesc( - metricsSubsystem+"_resource_pending_deletion_stuck", - "Number of resources in Pending Deletion state beyond the stuck threshold.", - []string{labelResourceType}, - prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version}, - ), - } -} - -func (c *PendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.stuckDesc -} - -// stuckQueries maps resource types to their pre-built SQL queries. -// Table names are compile-time constants — no user input in SQL strings. -var stuckQueries = []struct { - query string - resourceType string -}{ - {"SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1", "cluster"}, - {"SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1", "nodepool"}, -} - -func (c *PendingDeletionCollector) Collect(ch chan<- prometheus.Metric) { - if c == nil || c.db == nil { - return - } - - threshold := time.Now().UTC().Add(-c.stuckThreshold) - - for _, q := range stuckQueries { - ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout) - var count int64 - row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants - if err := row.Scan(&count); err != nil { - cancel() - logger.With(ctx, "resource_type", q.resourceType).WithError(err).Error("Failed to query pending deletion resources") - continue - } - cancel() - - ch <- prometheus.MustNewConstMetric( - c.stuckDesc, - prometheus.GaugeValue, - float64(count), - q.resourceType, - ) - } -} - -func RegisterCollector(db *sql.DB, stuckThreshold time.Duration) error { - collector := NewPendingDeletionCollector(db, stuckThreshold) - return prometheus.Register(collector) -} diff --git a/pkg/metrics/deletion_test.go b/pkg/metrics/deletion_test.go deleted file mode 100644 index 2fac3ca3..00000000 --- a/pkg/metrics/deletion_test.go +++ /dev/null @@ -1,267 +0,0 @@ -/* -Copyright (c) 2026 Red Hat, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "testing" - "time" - - . "github.com/onsi/gomega" - "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" -) - -const ( - testPendingDeletionTotalMetric = "hyperfleet_api_resource_pending_deletion_total" - testPendingDeletionDurationMetric = "hyperfleet_api_resource_pending_deletion_duration_seconds" - testPendingDeletionStuckMetric = "hyperfleet_api_resource_pending_deletion_stuck" - testResourceCluster = "cluster" - testResourceNodepool = "nodepool" -) - -func TestMetricsSubsystem(t *testing.T) { - RegisterTestingT(t) - Expect(metricsSubsystem).To(Equal("hyperfleet_api")) -} - -func TestPendingDeletionTotalIsRegistered(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - found = true - Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionTotalMetric+" metric should be registered") -} - -func TestRecordPendingDeletion_IncrementsCounter(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - RecordPendingDeletion(testResourceCluster) - RecordPendingDeletion(testResourceNodepool) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var clusterCount, nodepoolCount float64 - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - clusterCount = metric.GetCounter().GetValue() - } - if labels["resource_type"] == testResourceNodepool { - nodepoolCount = metric.GetCounter().GetValue() - } - } - break - } - } - Expect(clusterCount).To(Equal(2.0)) - Expect(nodepoolCount).To(Equal(1.0)) -} - -func TestRecordPendingDeletion_Labels(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - found = true - Expect(labels["component"]).To(Equal("api")) - Expect(labels["version"]).To(Equal(api.Version)) - } - } - break - } - } - Expect(found).To(BeTrue(), "pending deletion total metric with expected labels should exist") -} - -func TestPendingDeletionDurationIsRegistered(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - found = true - Expect(mf.GetType()).To(Equal(dto.MetricType_HISTOGRAM)) - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") -} - -func TestObservePendingDeletionDuration_RecordsValue(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - deletedAt := time.Now().Add(-10 * time.Second) - ObservePendingDeletionDuration(testResourceCluster, deletedAt) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - found = true - Expect(metric.GetHistogram().GetSampleCount()).To(BeEquivalentTo(1)) - Expect(metric.GetHistogram().GetSampleSum()).To(BeNumerically(">=", 10.0)) - } - } - break - } - } - Expect(found).To(BeTrue(), "pending deletion duration metric with cluster label should exist") -} - -func TestPendingDeletionDurationBuckets(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - expectedBuckets := []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} - - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-1*time.Second)) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - found = true - for _, metric := range mf.GetMetric() { - histogram := metric.GetHistogram() - buckets := histogram.GetBucket() - Expect(buckets).To(HaveLen(len(expectedBuckets))) - for i, b := range buckets { - Expect(b.GetUpperBound()).To(Equal(expectedBuckets[i])) - } - } - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") -} - -func TestResetMetrics_ClearsAllDeletionMetrics(t *testing.T) { - RegisterTestingT(t) - - RecordPendingDeletion(testResourceCluster) - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) - - ResetMetrics() - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion total should be empty after reset") - } - if mf.GetName() == testPendingDeletionDurationMetric { - Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion duration should be empty after reset") - } - } -} - -func TestPendingDeletionCollector_Describe(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - ch := make(chan *prometheus.Desc, 10) - collector.Describe(ch) - close(ch) - - var descs []*prometheus.Desc - for desc := range ch { - descs = append(descs, desc) - } - - Expect(descs).To(HaveLen(1)) - Expect(descs[0].String()).To(ContainSubstring("resource_pending_deletion_stuck")) -} - -func TestPendingDeletionCollector_CollectWithNilDB(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - ch := make(chan prometheus.Metric, 10) - collector.Collect(ch) - close(ch) - - var collectedMetrics []prometheus.Metric - for m := range ch { - collectedMetrics = append(collectedMetrics, m) - } - - Expect(collectedMetrics).To(BeEmpty()) -} - -func TestStuckDescriptor(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - descStr := collector.stuckDesc.String() - - Expect(descStr).To(ContainSubstring("hyperfleet_api_resource_pending_deletion_stuck")) - Expect(descStr).To(ContainSubstring("resource_type")) - Expect(descStr).To(ContainSubstring("component")) - Expect(descStr).To(ContainSubstring("version")) -} - -func labelsToMap(metric *dto.Metric) map[string]string { - labels := make(map[string]string) - for _, lp := range metric.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - return labels -} diff --git a/pkg/metrics/reconciliation.go b/pkg/metrics/reconciliation.go new file mode 100644 index 00000000..b48a2fc4 --- /dev/null +++ b/pkg/metrics/reconciliation.go @@ -0,0 +1,194 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "database/sql" + "fmt" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" +) + +const metricsSubsystem = "hyperfleet_api" + +const ( + labelResourceType = "resource_type" + labelComponent = "component" + labelVersion = "version" + labelIsDelete = "is_delete" +) + +const componentValue = "api" + +const defaultQueryTimeout = 30 * time.Second + +var reconciliationLabels = []string{labelResourceType, labelIsDelete} + +var reconciliationStartedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: metricsSubsystem, + Name: "reconciliation_started_total", + Help: "Total number of resources that entered the unreconciled state " + + "(Reconciled condition transitioned to False).", + ConstLabels: prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version}, + }, + reconciliationLabels, +) + +var reconciliationRegisterOnce sync.Once + +func RegisterReconciliationMetrics() { + reconciliationRegisterOnce.Do(func() { + prometheus.MustRegister(reconciliationStartedTotal) + }) +} + +func init() { + RegisterReconciliationMetrics() +} + +func RecordReconciliationStarted(resourceType string, isDelete bool) { + reconciliationStartedTotal.With(prometheus.Labels{ + labelResourceType: resourceType, + labelIsDelete: fmt.Sprintf("%t", isDelete), + }).Inc() +} + +func ResetReconciliationMetrics() { + reconciliationStartedTotal.Reset() +} + +type ReconciliationCollector struct { + pendingDesc *prometheus.Desc + stuckDesc *prometheus.Desc + durationDesc *prometheus.Desc + + db *sql.DB + stuckThreshold time.Duration + queryTimeout time.Duration +} + +func NewReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) *ReconciliationCollector { + constLabels := prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version} + variableLabels := []string{labelResourceType, labelIsDelete} + + return &ReconciliationCollector{ + db: db, + stuckThreshold: stuckThreshold, + queryTimeout: defaultQueryTimeout, + pendingDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation", + "Number of resources currently pending reconciliation (Reconciled=False).", + variableLabels, + constLabels, + ), + stuckDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation_stuck", + "Number of resources pending reconciliation beyond the stuck threshold.", + variableLabels, + constLabels, + ), + durationDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation_stuck_duration_seconds", + "Maximum duration in seconds that any resource has been stuck pending reconciliation.", + variableLabels, + constLabels, + ), + } +} + +func (c *ReconciliationCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.pendingDesc + ch <- c.stuckDesc + ch <- c.durationDesc +} + +// reconciliationQuery uses a CTE to parse JSONB once per row, then computes +// all three metrics (pending count, stuck count, max stuck duration) in a +// single query — 1 round-trip instead of 3. +// +//nolint:lll // SQL readability — breaking these lines across Go string boundaries would harm clarity +const reconciliationQuery = ` +WITH unreconciled AS ( + SELECT 'cluster' AS resource_type, + CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete, + (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time + FROM clusters + WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False' + UNION ALL + SELECT 'nodepool' AS resource_type, + CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete, + (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time + FROM node_pools + WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False' +) +SELECT resource_type, + is_delete, + COUNT(*) AS pending, + COUNT(*) FILTER (WHERE transition_time < $1) AS stuck, + COALESCE(MAX(EXTRACT(EPOCH FROM (NOW() - transition_time))) FILTER (WHERE transition_time < $1), 0) AS max_duration +FROM unreconciled +GROUP BY resource_type, is_delete` + +func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) { + if c == nil || c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout) + defer cancel() + + threshold := time.Now().UTC().Add(-c.stuckThreshold) + + rows, err := c.db.QueryContext(ctx, reconciliationQuery, threshold) //nolint:gosec // compile-time SQL + if err != nil { + logger.WithError(ctx, err).Error("Failed to query reconciliation metrics") + return + } + defer rows.Close() + + for rows.Next() { + var resourceType, isDelete string + var pending, stuck int64 + var maxDuration float64 + + if err := rows.Scan(&resourceType, &isDelete, &pending, &stuck, &maxDuration); err != nil { + logger.WithError(ctx, err).Error("Failed to scan reconciliation metric row") + continue + } + + labels := []string{resourceType, isDelete} + ch <- prometheus.MustNewConstMetric(c.pendingDesc, prometheus.GaugeValue, float64(pending), labels...) + ch <- prometheus.MustNewConstMetric(c.stuckDesc, prometheus.GaugeValue, float64(stuck), labels...) + ch <- prometheus.MustNewConstMetric(c.durationDesc, prometheus.GaugeValue, maxDuration, labels...) + } + + if err := rows.Err(); err != nil { + logger.WithError(ctx, err).Error("Error iterating reconciliation metric rows") + } +} + +func RegisterReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) error { + collector := NewReconciliationCollector(db, stuckThreshold) + return prometheus.Register(collector) +} diff --git a/pkg/metrics/reconciliation_test.go b/pkg/metrics/reconciliation_test.go new file mode 100644 index 00000000..fa2a5d75 --- /dev/null +++ b/pkg/metrics/reconciliation_test.go @@ -0,0 +1,218 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" +) + +const ( + testReconciliationStartedTotalMetric = "hyperfleet_api_reconciliation_started_total" + testResourceCluster = "cluster" + testResourceNodepool = "nodepool" +) + +func labelsToMap(metric *dto.Metric) map[string]string { + labels := make(map[string]string) + for _, lp := range metric.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + return labels +} + +func TestReconciliationStartedTotalIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) + break + } + } + Expect(found).To(BeTrue(), testReconciliationStartedTotalMetric+" metric should be registered") +} + +func TestRecordReconciliationStarted_IncrementsCounter(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + RecordReconciliationStarted(testResourceCluster, false) + RecordReconciliationStarted(testResourceNodepool, true) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var clusterCount, nodepoolDeleteCount float64 + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster && labels["is_delete"] == "false" { + clusterCount = metric.GetCounter().GetValue() + } + if labels["resource_type"] == testResourceNodepool && labels["is_delete"] == "true" { + nodepoolDeleteCount = metric.GetCounter().GetValue() + } + } + break + } + } + Expect(clusterCount).To(Equal(2.0)) + Expect(nodepoolDeleteCount).To(Equal(1.0)) +} + +func TestRecordReconciliationStarted_Labels(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(labels["is_delete"]).To(Equal("false")) + Expect(labels["component"]).To(Equal("api")) + Expect(labels["version"]).To(Equal(api.Version)) + } + } + break + } + } + Expect(found).To(BeTrue(), "reconciliation started metric with expected labels should exist") +} + +func TestRecordReconciliationStarted_IsDeleteLabelValues(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, true) + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + isDeleteValues := map[string]bool{} + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + isDeleteValues[labels["is_delete"]] = true + } + } + break + } + } + Expect(isDeleteValues).To(HaveKey("true")) + Expect(isDeleteValues).To(HaveKey("false")) +} + +func TestResetReconciliationMetrics_ClearsAllMetrics(t *testing.T) { + RegisterTestingT(t) + + RecordReconciliationStarted(testResourceCluster, false) + + ResetReconciliationMetrics() + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "reconciliation started total should be empty after reset") + } + } +} + +func TestReconciliationCollector_Describe(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + ch := make(chan *prometheus.Desc, 10) + collector.Describe(ch) + close(ch) + + var descs []*prometheus.Desc + for desc := range ch { + descs = append(descs, desc) + } + + Expect(descs).To(HaveLen(3)) +} + +func TestReconciliationCollector_CollectWithNilDB(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + var collectedMetrics []prometheus.Metric + for m := range ch { + collectedMetrics = append(collectedMetrics, m) + } + + Expect(collectedMetrics).To(BeEmpty()) +} + +func TestReconciliationCollector_DescriptorNames(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + + Expect(collector.pendingDesc.String()).To(ContainSubstring("resource_pending_reconciliation")) + Expect(collector.stuckDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck")) + Expect(collector.durationDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck_duration_seconds")) +} + +func TestReconciliationCollector_DescriptorLabels(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + + for _, desc := range []*prometheus.Desc{collector.pendingDesc, collector.stuckDesc, collector.durationDesc} { + descStr := desc.String() + Expect(descStr).To(ContainSubstring("resource_type")) + Expect(descStr).To(ContainSubstring("is_delete")) + Expect(descStr).To(ContainSubstring("component")) + Expect(descStr).To(ContainSubstring("version")) + } +} diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go index 92e0c585..3d403e6f 100644 --- a/pkg/services/cluster.go +++ b/pkg/services/cluster.go @@ -147,8 +147,6 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu return nil, handleSoftDeleteError(api.ResourceTypeCluster, saveErr) } - metrics.RecordPendingDeletion("cluster") - cluster, svcErr := s.UpdateClusterStatusFromAdapters(ctx, id) if svcErr != nil { return nil, svcErr @@ -228,6 +226,8 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( hasChildResources = exists } + prevReconciledStatus := extractPrevReconciledStatus(ctx, cluster.StatusConditions) + reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{ ResourceGeneration: cluster.Generation, RefTime: refTime, @@ -238,6 +238,11 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( HasChildResources: hasChildResources, }) + if reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { + metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil) + } + allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go index 79a1462f..5150ff21 100644 --- a/pkg/services/node_pool.go +++ b/pkg/services/node_pool.go @@ -11,7 +11,6 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) //go:generate mockgen-v0.6.0 -source=node_pool.go -package=services -destination=node_pool_mock.go @@ -187,8 +186,6 @@ func (s *sqlNodePoolService) SoftDelete(ctx context.Context, nodePoolID string) return nil, handleSoftDeleteError(api.ResourceTypeNodePool, err) } - metrics.RecordPendingDeletion("nodepool") - updated, svcErr := s.UpdateNodePoolStatusFromAdapters(ctx, nodePool.ID) if svcErr != nil { return nil, svcErr @@ -207,12 +204,10 @@ func (s *sqlNodePoolService) CascadeSoftDelete( deletedTime = time.Now().UTC().Truncate(time.Microsecond) } - var newlyDeleted int for _, np := range nodePools { if np.DeletedTime == nil { np.MarkDeleted(deletedBy, deletedTime) np.IncrementGeneration() - newlyDeleted++ } } @@ -224,10 +219,6 @@ func (s *sqlNodePoolService) CascadeSoftDelete( return handleSoftDeleteError(api.ResourceTypeNodePool, err) } - for range newlyDeleted { - metrics.RecordPendingDeletion("nodepool") - } - return nil } diff --git a/pkg/services/status_helpers.go b/pkg/services/status_helpers.go index d1c4767a..1ac42d81 100644 --- a/pkg/services/status_helpers.go +++ b/pkg/services/status_helpers.go @@ -8,8 +8,17 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/config" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) +func extractPrevReconciledStatus(ctx context.Context, raw []byte) *api.ResourceConditionStatus { + prevReconciled, _, _ := parsePrevConditions(ctx, raw) + if prevReconciled == nil { + return nil + } + return &prevReconciled.Status +} + // computeNodePoolConditionsJSON aggregates adapter statuses into marshaled conditions JSON. // Returns nil if conditions are unchanged relative to np.StatusConditions. func computeNodePoolConditionsJSON( @@ -18,6 +27,8 @@ func computeNodePoolConditionsJSON( adapterStatuses []*api.AdapterStatus, requiredAdapters []string, ) ([]byte, *errors.ServiceError) { + prevReconciledStatus := extractPrevReconciledStatus(ctx, np.StatusConditions) + reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{ ResourceGeneration: np.Generation, RefTime: nodePoolRefTime(np), @@ -27,6 +38,11 @@ func computeNodePoolConditionsJSON( AdapterStatuses: adapterStatuses, }) + if reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { + metrics.RecordReconciliationStarted("nodepool", np.DeletedTime != nil) + } + allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) diff --git a/test/integration/deletion_metrics_test.go b/test/integration/deletion_metrics_test.go deleted file mode 100644 index 2bfe8cc7..00000000 --- a/test/integration/deletion_metrics_test.go +++ /dev/null @@ -1,95 +0,0 @@ -package integration - -import ( - "net/http" - "testing" - "time" - - . "github.com/onsi/gomega" - "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api/openapi" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" - "github.com/openshift-hyperfleet/hyperfleet-api/test" -) - -func TestPendingDeletionCollector_Integration(t *testing.T) { - t.Run("given soft-deleted resources older than threshold, collector reports them as stuck", func(t *testing.T) { - RegisterTestingT(t) - h, client := test.RegisterIntegration(t) - account := h.NewRandAccount() - ctx := h.NewAuthenticatedContext(account) - - cluster, err := h.Factories.NewClusters(h.NewID()) - Expect(err).NotTo(HaveOccurred()) - npResp, err := client.CreateNodePoolWithResponse( - ctx, cluster.ID, openapi.CreateNodePoolJSONRequestBody(newNodePoolInput("stuck-np")), - test.WithAuthToken(ctx), - ) - Expect(err).NotTo(HaveOccurred()) - Expect(npResp.StatusCode()).To(Equal(http.StatusCreated)) - - _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) - Expect(err).NotTo(HaveOccurred()) - - // Backdate deleted_time to 1 hour ago so resources exceed the 30m threshold - db := h.DBFactory.New(ctx) - pastTime := time.Now().UTC().Add(-1 * time.Hour) - Expect(db.Exec("UPDATE clusters SET deleted_time = ? WHERE id = ?", pastTime, cluster.ID).Error).NotTo(HaveOccurred()) - err = db.Exec("UPDATE node_pools SET deleted_time = ? WHERE owner_id = ?", pastTime, cluster.ID).Error - Expect(err).NotTo(HaveOccurred()) - - rawDB := h.DBFactory.DirectDB() - collector := metrics.NewPendingDeletionCollector(rawDB, 30*time.Minute) - - collected := collectStuckMetrics(t, collector) - - Expect(collected["cluster"]).To(BeNumerically(">=", 1), "should report at least 1 stuck cluster") - Expect(collected["nodepool"]).To(BeNumerically(">=", 1), "should report at least 1 stuck nodepool") - }) - - t.Run("given soft-deleted resources within threshold, collector reports zero stuck", func(t *testing.T) { - RegisterTestingT(t) - h, client := test.RegisterIntegration(t) - account := h.NewRandAccount() - ctx := h.NewAuthenticatedContext(account) - - cluster, err := h.Factories.NewClusters(h.NewID()) - Expect(err).NotTo(HaveOccurred()) - - _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) - Expect(err).NotTo(HaveOccurred()) - - rawDB := h.DBFactory.DirectDB() - // Use a very long threshold so the just-deleted cluster is NOT stuck - collector := metrics.NewPendingDeletionCollector(rawDB, 24*time.Hour) - - collected := collectStuckMetrics(t, collector) - - clusterValue, ok := collected["cluster"] - Expect(ok).To(BeTrue(), "collector should emit a cluster series") - Expect(clusterValue).To(Equal(0.0), "recently deleted cluster should not be stuck") - }) -} - -func collectStuckMetrics(t *testing.T, collector *metrics.PendingDeletionCollector) map[string]float64 { - t.Helper() - RegisterTestingT(t) - - ch := make(chan prometheus.Metric, 10) - collector.Collect(ch) - close(ch) - - result := make(map[string]float64) - for m := range ch { - pb := &dto.Metric{} - Expect(m.Write(pb)).To(Succeed()) - for _, lp := range pb.GetLabel() { - if lp.GetName() == "resource_type" { - result[lp.GetValue()] = pb.GetGauge().GetValue() - } - } - } - return result -} diff --git a/test/integration/reconciliation_metrics_test.go b/test/integration/reconciliation_metrics_test.go new file mode 100644 index 00000000..b44e12ce --- /dev/null +++ b/test/integration/reconciliation_metrics_test.go @@ -0,0 +1,210 @@ +package integration + +import ( + "strings" + "testing" + "time" + + . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/test" + "github.com/openshift-hyperfleet/hyperfleet-api/test/factories" +) + +const ( + resourceCluster = "cluster" + isDeleteFalse = "false" + isDeleteTrue = "true" +) + +func TestReconciliationCollector_Integration(t *testing.T) { + t.Run("pending reconciliation resources are reported with correct labels", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-5 * time.Minute) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + Expect(pending).NotTo(BeEmpty(), "should report pending reconciliation metrics") + + var found bool + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report pending cluster with is_delete=false") + }) + + t.Run("stuck resources beyond threshold are reported", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + stuck := collected["stuck"] + Expect(stuck).NotTo(BeEmpty(), "should report stuck reconciliation metrics") + + var found bool + for _, m := range stuck { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report stuck cluster") + }) + + t.Run("resources within threshold are pending but not stuck", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + recentTime := time.Now().UTC().Add(-2 * time.Minute) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, recentTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + var pendingCount float64 + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + pendingCount = m.value + } + } + Expect(pendingCount).To(BeNumerically(">=", 1), "resource should be pending") + + stuck := collected["stuck"] + var stuckCount float64 + for _, m := range stuck { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + stuckCount = m.value + } + } + Expect(stuckCount).To(Equal(0.0), "recent resource should not be stuck") + }) + + t.Run("max stuck duration is reported for stuck resources", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + duration := collected["duration"] + Expect(duration).NotTo(BeEmpty(), "should report stuck duration metrics") + + var found bool + for _, m := range duration { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 3500), "stuck duration should be ~3600 seconds") + } + } + Expect(found).To(BeTrue(), "should report stuck duration for cluster") + }) + + t.Run("soft-deleted resources are reported with is_delete=true", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + cluster, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + ctx := h.NewAuthenticatedContext(h.NewRandAccount()) + db := h.DBFactory.New(ctx) + deletedTime := time.Now().UTC().Add(-1 * time.Hour) + err = db.Exec( + "UPDATE clusters SET deleted_time = ? WHERE id = ?", deletedTime, cluster.ID, + ).Error + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + var found bool + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteTrue { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report soft-deleted cluster with is_delete=true") + }) +} + +type collectedMetric struct { + labels map[string]string + value float64 +} + +func collectReconciliationMetrics( + t *testing.T, collector *metrics.ReconciliationCollector, +) map[string][]collectedMetric { + t.Helper() + RegisterTestingT(t) + + ch := make(chan prometheus.Metric, 20) + collector.Collect(ch) + close(ch) + + result := map[string][]collectedMetric{ + "pending": {}, + "stuck": {}, + "duration": {}, + } + + for m := range ch { + pb := &dto.Metric{} + Expect(m.Write(pb)).To(Succeed()) + + desc := m.Desc().String() + labels := make(map[string]string) + for _, lp := range pb.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + + cm := collectedMetric{labels: labels, value: pb.GetGauge().GetValue()} + + switch { + case strings.Contains(desc, "stuck_duration_seconds"): + result["duration"] = append(result["duration"], cm) + case strings.Contains(desc, "reconciliation_stuck"): + result["stuck"] = append(result["stuck"], cm) + case strings.Contains(desc, "pending_reconciliation"): + result["pending"] = append(result["pending"], cm) + } + } + return result +}