From 17d74dfbf00a561cee17272905a0abe8e35c5a7b Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Tue, 20 Feb 2024 14:27:27 +0100 Subject: [PATCH] chore: rework prometheus metrics registration fail if metric registration panics Signed-off-by: Mario Constanti --- apiserver/controllers/controllers.go | 30 ++++---------- cmd/garm/main.go | 6 +++ doc/config_metrics.md | 14 +++---- metrics/health.go | 2 +- metrics/instance.go | 2 +- metrics/metrics.go | 60 +++++++++++++++++----------- metrics/pool.go | 4 -- metrics/webhooks.go | 2 +- runner/metrics/health.go | 4 +- runner/metrics/instance.go | 6 +-- runner/metrics/metrics.go | 12 +++--- 11 files changed, 68 insertions(+), 74 deletions(-) diff --git a/apiserver/controllers/controllers.go b/apiserver/controllers/controllers.go index 36b63891..acfee0b9 100644 --- a/apiserver/controllers/controllers.go +++ b/apiserver/controllers/controllers.go @@ -106,35 +106,23 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo signature := r.Header.Get("X-Hub-Signature-256") hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type") - controllerInfo, err := a.r.GetControllerInfo(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info") - return - } - if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil { if errors.Is(err, gErrors.ErrNotFound) { metrics.WebhooksReceived.WithLabelValues( - "false", // label: valid - "owner_unknown", // label: reason - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: controller_id + "false", // label: valid + "owner_unknown", // label: reason ).Inc() slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?") return } else if strings.Contains(err.Error(), "signature") { // TODO: check error type metrics.WebhooksReceived.WithLabelValues( - "false", // label: valid - "signature_invalid", // label: reason - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: controller_id + "false", // label: valid + "signature_invalid", // label: reason ).Inc() } else { metrics.WebhooksReceived.WithLabelValues( - "false", // label: valid - "unknown", // label: reason - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: controller_id + "false", // label: valid + "unknown", // label: reason ).Inc() } @@ -142,10 +130,8 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo return } metrics.WebhooksReceived.WithLabelValues( - "true", // label: valid - "", // label: reason - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: controller_id + "true", // label: valid + "", // label: reason ).Inc() } diff --git a/cmd/garm/main.go b/cmd/garm/main.go index 327f09bd..dfac21a9 100644 --- a/cmd/garm/main.go +++ b/cmd/garm/main.go @@ -35,6 +35,7 @@ import ( "github.com/cloudbase/garm/config" "github.com/cloudbase/garm/database" "github.com/cloudbase/garm/database/common" + "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/runner" runnerMetrics "github.com/cloudbase/garm/runner/metrics" garmUtil "github.com/cloudbase/garm/util" @@ -219,6 +220,11 @@ func main() { slog.InfoContext(ctx, "setting up metric routes") router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware) + slog.InfoContext(ctx, "register metrics") + if err := metrics.RegisterMetrics(); err != nil { + log.Fatal(err) + } + slog.InfoContext(ctx, "start metrics collection") runnerMetrics.CollectObjectMetric(ctx, runner, time.NewTicker(cfg.Metrics.Duration())) } diff --git a/doc/config_metrics.md b/doc/config_metrics.md index 65ae1057..17161616 100644 --- a/doc/config_metrics.md +++ b/doc/config_metrics.md @@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it ## Common metrics -| Metric name | Type | Labels | Description | -|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `garm_health` | Gauge | `controller_id`=<controller id>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. | -| `garm_webhooks_received` | Counter | `controller_id`=<controller id>
`name`=<hostname> | This is a counter that increments every time GARM receives a webhook from GitHub. | +| Metric name | Type | Labels | Description | +|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `garm_health` | Gauge | `controller_id`=<controller id>
`callback_url`=<callback url>
`controller_webhook_url`=<controller webhook url>
`metadata_url`=<metadata url>
`webhook_url`=<webhook url>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. | +| `garm_webhooks_received` | Counter | `valid`=<valid request>
`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. | ## Enterprise metrics @@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it ## Runner metrics -| Metric name | Type | Labels | Description | -|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------| -| `garm_runner_status` | Gauge | `controller_id`=<controller id>
`hostname`=<hostname>
`name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | +| Metric name | Type | Labels | Description | +|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------| +| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | More metrics will be added in the future. diff --git a/metrics/health.go b/metrics/health.go index 8c8d6bcd..d1f5e969 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -9,5 +9,5 @@ var ( Namespace: metricsNamespace, Name: "health", Help: "Health of the garm", - }, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"}) + }, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"}) ) diff --git a/metrics/instance.go b/metrics/instance.go index e37d80fe..a4ac66bf 100644 --- a/metrics/instance.go +++ b/metrics/instance.go @@ -10,5 +10,5 @@ var ( Subsystem: metricsRunnerSubsystem, Name: "status", Help: "Status of the instance", - }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}) + }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"}) ) diff --git a/metrics/metrics.go b/metrics/metrics.go index 2bdec860..7dec272c 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -13,29 +13,41 @@ const metricsRepositorySubsystem = "repository" const metricsEnterpriseSubsystem = "enterprise" const metricsWebhookSubsystem = "webhook" -func init() { - // runner metrics - prometheus.MustRegister(InstanceStatus) - // organization metrics - prometheus.MustRegister(OrganizationInfo) - prometheus.MustRegister(OrganizationPoolManagerStatus) - // enterprise metrics - prometheus.MustRegister(EnterpriseInfo) - prometheus.MustRegister(EnterprisePoolManagerStatus) - // repository metrics - prometheus.MustRegister(RepositoryInfo) - prometheus.MustRegister(RepositoryPoolManagerStatus) - // provider metrics - prometheus.MustRegister(ProviderInfo) - // pool metrics - prometheus.MustRegister(PoolInfo) - prometheus.MustRegister(PoolStatus) - prometheus.MustRegister(PoolMaxRunners) - prometheus.MustRegister(PoolMinIdleRunners) - prometheus.MustRegister(PoolBootstrapTimeout) - // health metrics - prometheus.MustRegister(GarmHealth) - // webhook metrics - prometheus.MustRegister(WebhooksReceived) +// RegisterMetrics registers all the metrics +func RegisterMetrics() error { + var collectors []prometheus.Collector + collectors = append(collectors, + // runner metrics + InstanceStatus, + // organization metrics + OrganizationInfo, + OrganizationPoolManagerStatus, + // enterprise metrics + EnterpriseInfo, + EnterprisePoolManagerStatus, + // repository metrics + RepositoryInfo, + RepositoryPoolManagerStatus, + // provider metrics + ProviderInfo, + // pool metrics + PoolInfo, + PoolStatus, + PoolMaxRunners, + PoolMinIdleRunners, + PoolBootstrapTimeout, + // health metrics + GarmHealth, + // webhook metrics + WebhooksReceived, + ) + + for _, c := range collectors { + if err := prometheus.Register(c); err != nil { + return err + } + } + + return nil } diff --git a/metrics/pool.go b/metrics/pool.go index 764ad706..5803af90 100644 --- a/metrics/pool.go +++ b/metrics/pool.go @@ -10,10 +10,6 @@ var ( Subsystem: metricsPoolSubsystem, Name: "info", Help: "Info of the pool", - // ConstLabels: prometheus.Labels{ - // "controller_id": metricControllerValue, - // "hostname": metricHostnameValue, - // }, }, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}) PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ diff --git a/metrics/webhooks.go b/metrics/webhooks.go index 7b314eb6..14b6492c 100644 --- a/metrics/webhooks.go +++ b/metrics/webhooks.go @@ -8,5 +8,5 @@ var ( Subsystem: metricsWebhookSubsystem, Name: "received", Help: "The total number of webhooks received", - }, []string{"valid", "reason", "hostname", "controller_id"}) + }, []string{"valid", "reason"}) ) diff --git a/runner/metrics/health.go b/runner/metrics/health.go index c84dca45..d70adf10 100644 --- a/runner/metrics/health.go +++ b/runner/metrics/health.go @@ -9,14 +9,12 @@ import ( ) func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error { - metrics.GarmHealth.WithLabelValues( - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: id controllerInfo.MetadataURL, // label: metadata_url controllerInfo.CallbackURL, // label: callback_url controllerInfo.WebhookURL, // label: webhook_url controllerInfo.ControllerWebhookURL, // label: controller_webhook_url + controllerInfo.ControllerID.String(), // label: controller_id ).Set(1) return nil } diff --git a/runner/metrics/instance.go b/runner/metrics/instance.go index 9fdb0740..a32fa081 100644 --- a/runner/metrics/instance.go +++ b/runner/metrics/instance.go @@ -4,13 +4,12 @@ import ( "context" "github.com/cloudbase/garm/metrics" - "github.com/cloudbase/garm/params" "github.com/cloudbase/garm/runner" ) // CollectInstanceMetric collects the metrics for the runner instances // reflecting the statuses and the pool they belong to. -func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error { +func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error { // reset metrics metrics.InstanceStatus.Reset() @@ -63,10 +62,7 @@ func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo poolNames[instance.PoolID].Name, // label: pool_owner poolNames[instance.PoolID].Type, // label: pool_type instance.PoolID, // label: pool_id - controllerInfo.Hostname, // label: hostname - controllerInfo.ControllerID.String(), // label: controller_id poolNames[instance.PoolID].ProviderName, // label: provider - ).Set(1) } return nil diff --git a/runner/metrics/metrics.go b/runner/metrics/metrics.go index 5427118f..911226dc 100644 --- a/runner/metrics/metrics.go +++ b/runner/metrics/metrics.go @@ -57,17 +57,17 @@ func CollectObjectMetric(ctx context.Context, r *runner.Runner, ticker *time.Tic slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics") } + slog.DebugContext(ctx, "collecting instance metrics") + err = CollectInstanceMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics") + } + slog.DebugContext(ctx, "collecting health metrics") err = CollectHealthMetric(ctx, r, controllerInfo) if err != nil { slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics") } - - slog.DebugContext(ctx, "collecting instance metrics") - err = CollectInstanceMetric(ctx, r, controllerInfo) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics") - } } } }()