chore: rework prometheus metrics registration
fail if metric registration panics Signed-off-by: Mario Constanti <mario.constanti@mercedes-benz.com>
This commit is contained in:
parent
3e025dda2f
commit
17d74dfbf0
11 changed files with 68 additions and 74 deletions
|
|
@ -106,35 +106,23 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
|
|||
signature := r.Header.Get("X-Hub-Signature-256")
|
||||
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
|
||||
|
||||
controllerInfo, err := a.r.GetControllerInfo(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
|
||||
return
|
||||
}
|
||||
|
||||
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
|
||||
if errors.Is(err, gErrors.ErrNotFound) {
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"owner_unknown", // label: reason
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
"false", // label: valid
|
||||
"owner_unknown", // label: reason
|
||||
).Inc()
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
|
||||
return
|
||||
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"signature_invalid", // label: reason
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
"false", // label: valid
|
||||
"signature_invalid", // label: reason
|
||||
).Inc()
|
||||
} else {
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"unknown", // label: reason
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
"false", // label: valid
|
||||
"unknown", // label: reason
|
||||
).Inc()
|
||||
}
|
||||
|
||||
|
|
@ -142,10 +130,8 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
|
|||
return
|
||||
}
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"true", // label: valid
|
||||
"", // label: reason
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
"true", // label: valid
|
||||
"", // label: reason
|
||||
).Inc()
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import (
|
|||
"github.com/cloudbase/garm/config"
|
||||
"github.com/cloudbase/garm/database"
|
||||
"github.com/cloudbase/garm/database/common"
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
|
||||
garmUtil "github.com/cloudbase/garm/util"
|
||||
|
|
@ -219,6 +220,11 @@ func main() {
|
|||
slog.InfoContext(ctx, "setting up metric routes")
|
||||
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
|
||||
|
||||
slog.InfoContext(ctx, "register metrics")
|
||||
if err := metrics.RegisterMetrics(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
slog.InfoContext(ctx, "start metrics collection")
|
||||
runnerMetrics.CollectObjectMetric(ctx, runner, time.NewTicker(cfg.Metrics.Duration()))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
|
|||
|
||||
## Common metrics
|
||||
|
||||
| Metric name | Type | Labels | Description |
|
||||
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
|
||||
| `garm_health` | Gauge | `controller_id`=<controller id> <br>`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
|
||||
| `garm_webhooks_received` | Counter | `controller_id`=<controller id> <br>`name`=<hostname> | This is a counter that increments every time GARM receives a webhook from GitHub. |
|
||||
| Metric name | Type | Labels | Description |
|
||||
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
|
||||
| `garm_health` | Gauge | `controller_id`=<controller id> <br>`callback_url`=<callback url> <br>`controller_webhook_url`=<controller webhook url> <br>`metadata_url`=<metadata url> <br>`webhook_url`=<webhook url> <br>`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
|
||||
| `garm_webhooks_received` | Counter | `valid`=<valid request> <br>`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. |
|
||||
|
||||
## Enterprise metrics
|
||||
|
||||
|
|
@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
|
|||
|
||||
## Runner metrics
|
||||
|
||||
| Metric name | Type | Labels | Description |
|
||||
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
|
||||
| `garm_runner_status` | Gauge | `controller_id`=<controller id> <br>`hostname`=<hostname> <br>`name`=<runner name> <br>`pool_owner`=<owner name> <br>`pool_type`=<repository\|organization\|enterprise> <br>`provider`=<provider name> <br>`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown> <br>`status`=<idle\|pending\|terminated\|installing\|failed\|active> <br> | This is a gauge value that gives us details about the runners garm spawns |
|
||||
| Metric name | Type | Labels | Description |
|
||||
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
|
||||
| `garm_runner_status` | Gauge | `name`=<runner name> <br>`pool_owner`=<owner name> <br>`pool_type`=<repository\|organization\|enterprise> <br>`provider`=<provider name> <br>`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown> <br>`status`=<idle\|pending\|terminated\|installing\|failed\|active> <br> | This is a gauge value that gives us details about the runners garm spawns |
|
||||
|
||||
More metrics will be added in the future.
|
||||
|
||||
|
|
|
|||
|
|
@ -9,5 +9,5 @@ var (
|
|||
Namespace: metricsNamespace,
|
||||
Name: "health",
|
||||
Help: "Health of the garm",
|
||||
}, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"})
|
||||
}, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -10,5 +10,5 @@ var (
|
|||
Subsystem: metricsRunnerSubsystem,
|
||||
Name: "status",
|
||||
Help: "Status of the instance",
|
||||
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"})
|
||||
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -13,29 +13,41 @@ const metricsRepositorySubsystem = "repository"
|
|||
const metricsEnterpriseSubsystem = "enterprise"
|
||||
const metricsWebhookSubsystem = "webhook"
|
||||
|
||||
func init() {
|
||||
// runner metrics
|
||||
prometheus.MustRegister(InstanceStatus)
|
||||
// organization metrics
|
||||
prometheus.MustRegister(OrganizationInfo)
|
||||
prometheus.MustRegister(OrganizationPoolManagerStatus)
|
||||
// enterprise metrics
|
||||
prometheus.MustRegister(EnterpriseInfo)
|
||||
prometheus.MustRegister(EnterprisePoolManagerStatus)
|
||||
// repository metrics
|
||||
prometheus.MustRegister(RepositoryInfo)
|
||||
prometheus.MustRegister(RepositoryPoolManagerStatus)
|
||||
// provider metrics
|
||||
prometheus.MustRegister(ProviderInfo)
|
||||
// pool metrics
|
||||
prometheus.MustRegister(PoolInfo)
|
||||
prometheus.MustRegister(PoolStatus)
|
||||
prometheus.MustRegister(PoolMaxRunners)
|
||||
prometheus.MustRegister(PoolMinIdleRunners)
|
||||
prometheus.MustRegister(PoolBootstrapTimeout)
|
||||
// health metrics
|
||||
prometheus.MustRegister(GarmHealth)
|
||||
// webhook metrics
|
||||
prometheus.MustRegister(WebhooksReceived)
|
||||
// RegisterMetrics registers all the metrics
|
||||
func RegisterMetrics() error {
|
||||
|
||||
var collectors []prometheus.Collector
|
||||
collectors = append(collectors,
|
||||
// runner metrics
|
||||
InstanceStatus,
|
||||
// organization metrics
|
||||
OrganizationInfo,
|
||||
OrganizationPoolManagerStatus,
|
||||
// enterprise metrics
|
||||
EnterpriseInfo,
|
||||
EnterprisePoolManagerStatus,
|
||||
// repository metrics
|
||||
RepositoryInfo,
|
||||
RepositoryPoolManagerStatus,
|
||||
// provider metrics
|
||||
ProviderInfo,
|
||||
// pool metrics
|
||||
PoolInfo,
|
||||
PoolStatus,
|
||||
PoolMaxRunners,
|
||||
PoolMinIdleRunners,
|
||||
PoolBootstrapTimeout,
|
||||
// health metrics
|
||||
GarmHealth,
|
||||
// webhook metrics
|
||||
WebhooksReceived,
|
||||
)
|
||||
|
||||
for _, c := range collectors {
|
||||
if err := prometheus.Register(c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,10 +10,6 @@ var (
|
|||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the pool",
|
||||
// ConstLabels: prometheus.Labels{
|
||||
// "controller_id": metricControllerValue,
|
||||
// "hostname": metricHostnameValue,
|
||||
// },
|
||||
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
|
||||
|
||||
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
|
|
|
|||
|
|
@ -8,5 +8,5 @@ var (
|
|||
Subsystem: metricsWebhookSubsystem,
|
||||
Name: "received",
|
||||
Help: "The total number of webhooks received",
|
||||
}, []string{"valid", "reason", "hostname", "controller_id"})
|
||||
}, []string{"valid", "reason"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -9,14 +9,12 @@ import (
|
|||
)
|
||||
|
||||
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||
|
||||
metrics.GarmHealth.WithLabelValues(
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: id
|
||||
controllerInfo.MetadataURL, // label: metadata_url
|
||||
controllerInfo.CallbackURL, // label: callback_url
|
||||
controllerInfo.WebhookURL, // label: webhook_url
|
||||
controllerInfo.ControllerWebhookURL, // label: controller_webhook_url
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
).Set(1)
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,13 +4,12 @@ import (
|
|||
"context"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/params"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
// CollectInstanceMetric collects the metrics for the runner instances
|
||||
// reflecting the statuses and the pool they belong to.
|
||||
func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||
func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.InstanceStatus.Reset()
|
||||
|
|
@ -63,10 +62,7 @@ func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo
|
|||
poolNames[instance.PoolID].Name, // label: pool_owner
|
||||
poolNames[instance.PoolID].Type, // label: pool_type
|
||||
instance.PoolID, // label: pool_id
|
||||
controllerInfo.Hostname, // label: hostname
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
poolNames[instance.PoolID].ProviderName, // label: provider
|
||||
|
||||
).Set(1)
|
||||
}
|
||||
return nil
|
||||
|
|
|
|||
|
|
@ -57,17 +57,17 @@ func CollectObjectMetric(ctx context.Context, r *runner.Runner, ticker *time.Tic
|
|||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics")
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting instance metrics")
|
||||
err = CollectInstanceMetric(ctx, r)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting health metrics")
|
||||
err = CollectHealthMetric(ctx, r, controllerInfo)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics")
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting instance metrics")
|
||||
err = CollectInstanceMetric(ctx, r, controllerInfo)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue