diff --git a/apiserver/controllers/controllers.go b/apiserver/controllers/controllers.go
index 36b63891..acfee0b9 100644
--- a/apiserver/controllers/controllers.go
+++ b/apiserver/controllers/controllers.go
@@ -106,35 +106,23 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
- controllerInfo, err := a.r.GetControllerInfo(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
- return
- }
-
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
metrics.WebhooksReceived.WithLabelValues(
- "false", // label: valid
- "owner_unknown", // label: reason
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: controller_id
+ "false", // label: valid
+ "owner_unknown", // label: reason
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
metrics.WebhooksReceived.WithLabelValues(
- "false", // label: valid
- "signature_invalid", // label: reason
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: controller_id
+ "false", // label: valid
+ "signature_invalid", // label: reason
).Inc()
} else {
metrics.WebhooksReceived.WithLabelValues(
- "false", // label: valid
- "unknown", // label: reason
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: controller_id
+ "false", // label: valid
+ "unknown", // label: reason
).Inc()
}
@@ -142,10 +130,8 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
return
}
metrics.WebhooksReceived.WithLabelValues(
- "true", // label: valid
- "", // label: reason
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: controller_id
+ "true", // label: valid
+ "", // label: reason
).Inc()
}
diff --git a/cmd/garm/main.go b/cmd/garm/main.go
index 327f09bd..dfac21a9 100644
--- a/cmd/garm/main.go
+++ b/cmd/garm/main.go
@@ -35,6 +35,7 @@ import (
"github.com/cloudbase/garm/config"
"github.com/cloudbase/garm/database"
"github.com/cloudbase/garm/database/common"
+ "github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
@@ -219,6 +220,11 @@ func main() {
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
+ slog.InfoContext(ctx, "register metrics")
+ if err := metrics.RegisterMetrics(); err != nil {
+ log.Fatal(err)
+ }
+
slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(ctx, runner, time.NewTicker(cfg.Metrics.Duration()))
}
diff --git a/doc/config_metrics.md b/doc/config_metrics.md
index 65ae1057..17161616 100644
--- a/doc/config_metrics.md
+++ b/doc/config_metrics.md
@@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
## Common metrics
-| Metric name | Type | Labels | Description |
-|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
-| `garm_health` | Gauge | `controller_id`=<controller id>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
-| `garm_webhooks_received` | Counter | `controller_id`=<controller id>
`name`=<hostname> | This is a counter that increments every time GARM receives a webhook from GitHub. |
+| Metric name | Type | Labels | Description |
+|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
+| `garm_health` | Gauge | `controller_id`=<controller id>
`callback_url`=<callback url>
`controller_webhook_url`=<controller webhook url>
`metadata_url`=<metadata url>
`webhook_url`=<webhook url>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
+| `garm_webhooks_received` | Counter | `valid`=<valid request>
`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. |
## Enterprise metrics
@@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
-| Metric name | Type | Labels | Description |
-|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
-| `garm_runner_status` | Gauge | `controller_id`=<controller id>
`hostname`=<hostname>
`name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
+| Metric name | Type | Labels | Description |
+|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
+| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
More metrics will be added in the future.
diff --git a/metrics/health.go b/metrics/health.go
index 8c8d6bcd..d1f5e969 100644
--- a/metrics/health.go
+++ b/metrics/health.go
@@ -9,5 +9,5 @@ var (
Namespace: metricsNamespace,
Name: "health",
Help: "Health of the garm",
- }, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"})
+ }, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
)
diff --git a/metrics/instance.go b/metrics/instance.go
index e37d80fe..a4ac66bf 100644
--- a/metrics/instance.go
+++ b/metrics/instance.go
@@ -10,5 +10,5 @@ var (
Subsystem: metricsRunnerSubsystem,
Name: "status",
Help: "Status of the instance",
- }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"})
+ }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
)
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 2bdec860..7dec272c 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -13,29 +13,41 @@ const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook"
-func init() {
- // runner metrics
- prometheus.MustRegister(InstanceStatus)
- // organization metrics
- prometheus.MustRegister(OrganizationInfo)
- prometheus.MustRegister(OrganizationPoolManagerStatus)
- // enterprise metrics
- prometheus.MustRegister(EnterpriseInfo)
- prometheus.MustRegister(EnterprisePoolManagerStatus)
- // repository metrics
- prometheus.MustRegister(RepositoryInfo)
- prometheus.MustRegister(RepositoryPoolManagerStatus)
- // provider metrics
- prometheus.MustRegister(ProviderInfo)
- // pool metrics
- prometheus.MustRegister(PoolInfo)
- prometheus.MustRegister(PoolStatus)
- prometheus.MustRegister(PoolMaxRunners)
- prometheus.MustRegister(PoolMinIdleRunners)
- prometheus.MustRegister(PoolBootstrapTimeout)
- // health metrics
- prometheus.MustRegister(GarmHealth)
- // webhook metrics
- prometheus.MustRegister(WebhooksReceived)
+// RegisterMetrics registers all the metrics
+func RegisterMetrics() error {
+ var collectors []prometheus.Collector
+ collectors = append(collectors,
+ // runner metrics
+ InstanceStatus,
+ // organization metrics
+ OrganizationInfo,
+ OrganizationPoolManagerStatus,
+ // enterprise metrics
+ EnterpriseInfo,
+ EnterprisePoolManagerStatus,
+ // repository metrics
+ RepositoryInfo,
+ RepositoryPoolManagerStatus,
+ // provider metrics
+ ProviderInfo,
+ // pool metrics
+ PoolInfo,
+ PoolStatus,
+ PoolMaxRunners,
+ PoolMinIdleRunners,
+ PoolBootstrapTimeout,
+ // health metrics
+ GarmHealth,
+ // webhook metrics
+ WebhooksReceived,
+ )
+
+ for _, c := range collectors {
+ if err := prometheus.Register(c); err != nil {
+ return err
+ }
+ }
+
+ return nil
}
diff --git a/metrics/pool.go b/metrics/pool.go
index 764ad706..5803af90 100644
--- a/metrics/pool.go
+++ b/metrics/pool.go
@@ -10,10 +10,6 @@ var (
Subsystem: metricsPoolSubsystem,
Name: "info",
Help: "Info of the pool",
- // ConstLabels: prometheus.Labels{
- // "controller_id": metricControllerValue,
- // "hostname": metricHostnameValue,
- // },
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
diff --git a/metrics/webhooks.go b/metrics/webhooks.go
index 7b314eb6..14b6492c 100644
--- a/metrics/webhooks.go
+++ b/metrics/webhooks.go
@@ -8,5 +8,5 @@ var (
Subsystem: metricsWebhookSubsystem,
Name: "received",
Help: "The total number of webhooks received",
- }, []string{"valid", "reason", "hostname", "controller_id"})
+ }, []string{"valid", "reason"})
)
diff --git a/runner/metrics/health.go b/runner/metrics/health.go
index c84dca45..d70adf10 100644
--- a/runner/metrics/health.go
+++ b/runner/metrics/health.go
@@ -9,14 +9,12 @@ import (
)
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
-
metrics.GarmHealth.WithLabelValues(
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: id
controllerInfo.MetadataURL, // label: metadata_url
controllerInfo.CallbackURL, // label: callback_url
controllerInfo.WebhookURL, // label: webhook_url
controllerInfo.ControllerWebhookURL, // label: controller_webhook_url
+ controllerInfo.ControllerID.String(), // label: controller_id
).Set(1)
return nil
}
diff --git a/runner/metrics/instance.go b/runner/metrics/instance.go
index 9fdb0740..a32fa081 100644
--- a/runner/metrics/instance.go
+++ b/runner/metrics/instance.go
@@ -4,13 +4,12 @@ import (
"context"
"github.com/cloudbase/garm/metrics"
- "github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectInstanceMetric collects the metrics for the runner instances
// reflecting the statuses and the pool they belong to.
-func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
+func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
// reset metrics
metrics.InstanceStatus.Reset()
@@ -63,10 +62,7 @@ func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo
poolNames[instance.PoolID].Name, // label: pool_owner
poolNames[instance.PoolID].Type, // label: pool_type
instance.PoolID, // label: pool_id
- controllerInfo.Hostname, // label: hostname
- controllerInfo.ControllerID.String(), // label: controller_id
poolNames[instance.PoolID].ProviderName, // label: provider
-
).Set(1)
}
return nil
diff --git a/runner/metrics/metrics.go b/runner/metrics/metrics.go
index 5427118f..911226dc 100644
--- a/runner/metrics/metrics.go
+++ b/runner/metrics/metrics.go
@@ -57,17 +57,17 @@ func CollectObjectMetric(ctx context.Context, r *runner.Runner, ticker *time.Tic
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics")
}
+ slog.DebugContext(ctx, "collecting instance metrics")
+ err = CollectInstanceMetric(ctx, r)
+ if err != nil {
+ slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
+ }
+
slog.DebugContext(ctx, "collecting health metrics")
err = CollectHealthMetric(ctx, r, controllerInfo)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics")
}
-
- slog.DebugContext(ctx, "collecting instance metrics")
- err = CollectInstanceMetric(ctx, r, controllerInfo)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
- }
}
}
}()