chore: rework prometheus metrics registration

fail if metric registration panics

Signed-off-by: Mario Constanti <mario.constanti@mercedes-benz.com>
This commit is contained in:
Mario Constanti 2024-02-20 14:27:27 +01:00
parent 3e025dda2f
commit 17d74dfbf0
11 changed files with 68 additions and 74 deletions

View file

@ -106,35 +106,23 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
controllerInfo, err := a.r.GetControllerInfo(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
return
}
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"owner_unknown", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
"false", // label: valid
"owner_unknown", // label: reason
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"signature_invalid", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
"false", // label: valid
"signature_invalid", // label: reason
).Inc()
} else {
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"unknown", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
"false", // label: valid
"unknown", // label: reason
).Inc()
}
@ -142,10 +130,8 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
return
}
metrics.WebhooksReceived.WithLabelValues(
"true", // label: valid
"", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
"true", // label: valid
"", // label: reason
).Inc()
}

View file

@ -35,6 +35,7 @@ import (
"github.com/cloudbase/garm/config"
"github.com/cloudbase/garm/database"
"github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
@ -219,6 +220,11 @@ func main() {
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
slog.InfoContext(ctx, "register metrics")
if err := metrics.RegisterMetrics(); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(ctx, runner, time.NewTicker(cfg.Metrics.Duration()))
}

View file

@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
## Common metrics
| Metric name | Type | Labels | Description |
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
| Metric name | Type | Labels | Description |
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`callback_url`=&lt;callback url&gt; <br>`controller_webhook_url`=&lt;controller webhook url&gt; <br>`metadata_url`=&lt;metadata url&gt; <br>`webhook_url`=&lt;webhook url&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `valid`=&lt;valid request&gt; <br>`reason`=&lt;reason for invalid requests&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
## Enterprise metrics
@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
| Metric name | Type | Labels | Description |
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `controller_id`=&lt;controller id&gt; <br>`hostname`=&lt;hostname&gt; <br>`name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
More metrics will be added in the future.

View file

@ -9,5 +9,5 @@ var (
Namespace: metricsNamespace,
Name: "health",
Help: "Health of the garm",
}, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"})
}, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
)

View file

@ -10,5 +10,5 @@ var (
Subsystem: metricsRunnerSubsystem,
Name: "status",
Help: "Status of the instance",
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"})
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
)

View file

@ -13,29 +13,41 @@ const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook"
func init() {
// runner metrics
prometheus.MustRegister(InstanceStatus)
// organization metrics
prometheus.MustRegister(OrganizationInfo)
prometheus.MustRegister(OrganizationPoolManagerStatus)
// enterprise metrics
prometheus.MustRegister(EnterpriseInfo)
prometheus.MustRegister(EnterprisePoolManagerStatus)
// repository metrics
prometheus.MustRegister(RepositoryInfo)
prometheus.MustRegister(RepositoryPoolManagerStatus)
// provider metrics
prometheus.MustRegister(ProviderInfo)
// pool metrics
prometheus.MustRegister(PoolInfo)
prometheus.MustRegister(PoolStatus)
prometheus.MustRegister(PoolMaxRunners)
prometheus.MustRegister(PoolMinIdleRunners)
prometheus.MustRegister(PoolBootstrapTimeout)
// health metrics
prometheus.MustRegister(GarmHealth)
// webhook metrics
prometheus.MustRegister(WebhooksReceived)
// RegisterMetrics registers all the metrics
func RegisterMetrics() error {
var collectors []prometheus.Collector
collectors = append(collectors,
// runner metrics
InstanceStatus,
// organization metrics
OrganizationInfo,
OrganizationPoolManagerStatus,
// enterprise metrics
EnterpriseInfo,
EnterprisePoolManagerStatus,
// repository metrics
RepositoryInfo,
RepositoryPoolManagerStatus,
// provider metrics
ProviderInfo,
// pool metrics
PoolInfo,
PoolStatus,
PoolMaxRunners,
PoolMinIdleRunners,
PoolBootstrapTimeout,
// health metrics
GarmHealth,
// webhook metrics
WebhooksReceived,
)
for _, c := range collectors {
if err := prometheus.Register(c); err != nil {
return err
}
}
return nil
}

View file

@ -10,10 +10,6 @@ var (
Subsystem: metricsPoolSubsystem,
Name: "info",
Help: "Info of the pool",
// ConstLabels: prometheus.Labels{
// "controller_id": metricControllerValue,
// "hostname": metricHostnameValue,
// },
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{

View file

@ -8,5 +8,5 @@ var (
Subsystem: metricsWebhookSubsystem,
Name: "received",
Help: "The total number of webhooks received",
}, []string{"valid", "reason", "hostname", "controller_id"})
}, []string{"valid", "reason"})
)

View file

@ -9,14 +9,12 @@ import (
)
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
metrics.GarmHealth.WithLabelValues(
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: id
controllerInfo.MetadataURL, // label: metadata_url
controllerInfo.CallbackURL, // label: callback_url
controllerInfo.WebhookURL, // label: webhook_url
controllerInfo.ControllerWebhookURL, // label: controller_webhook_url
controllerInfo.ControllerID.String(), // label: controller_id
).Set(1)
return nil
}

View file

@ -4,13 +4,12 @@ import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectInstanceMetric collects the metrics for the runner instances
// reflecting the statuses and the pool they belong to.
func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
// reset metrics
metrics.InstanceStatus.Reset()
@ -63,10 +62,7 @@ func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo
poolNames[instance.PoolID].Name, // label: pool_owner
poolNames[instance.PoolID].Type, // label: pool_type
instance.PoolID, // label: pool_id
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
poolNames[instance.PoolID].ProviderName, // label: provider
).Set(1)
}
return nil

View file

@ -57,17 +57,17 @@ func CollectObjectMetric(ctx context.Context, r *runner.Runner, ticker *time.Tic
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics")
}
slog.DebugContext(ctx, "collecting instance metrics")
err = CollectInstanceMetric(ctx, r)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
}
slog.DebugContext(ctx, "collecting health metrics")
err = CollectHealthMetric(ctx, r, controllerInfo)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics")
}
slog.DebugContext(ctx, "collecting instance metrics")
err = CollectInstanceMetric(ctx, r, controllerInfo)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
}
}
}
}()