diff --git a/apiserver/controllers/controllers.go b/apiserver/controllers/controllers.go
index d7dd5f44..7c33695e 100644
--- a/apiserver/controllers/controllers.go
+++ b/apiserver/controllers/controllers.go
@@ -37,7 +37,7 @@ import (
)
func NewAPIController(r *runner.Runner, authenticator *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
- controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
+ controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext(context.Background()))
if err != nil {
return nil, errors.Wrap(err, "failed to get controller info")
}
@@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
}
}
-func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
- controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
- // If labels are empty, not attempt will be made to record webhook.
- return []string{}
- }
- return []string{
- valid, reason,
- controllerInfo.Hostname, controllerInfo.ControllerID.String(),
- }
-}
-
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
@@ -119,31 +106,33 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
- var labelValues []string
- defer func() {
- if len(labelValues) == 0 {
- return
- }
- if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
- }
- }()
-
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
- labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
+ metrics.WebhooksReceived.WithLabelValues(
+ "false", // label: valid
+ "owner_unknown", // label: reason
+ ).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
- labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
+ metrics.WebhooksReceived.WithLabelValues(
+ "false", // label: valid
+ "signature_invalid", // label: reason
+ ).Inc()
} else {
- labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
+ metrics.WebhooksReceived.WithLabelValues(
+ "false", // label: valid
+ "unknown", // label: reason
+ ).Inc()
}
handleError(ctx, w, err)
return
}
- labelValues = a.webhookMetricLabelValues(ctx, "true", "")
+ metrics.WebhooksReceived.WithLabelValues(
+ "true", // label: valid
+ "", // label: reason
+ ).Inc()
}
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
diff --git a/auth/context.go b/auth/context.go
index bce9f25e..71d29d27 100644
--- a/auth/context.go
+++ b/auth/context.go
@@ -238,8 +238,10 @@ func UserID(ctx context.Context) string {
// GetAdminContext will return an admin context. This can be used internally
// when fetching users.
-func GetAdminContext() context.Context {
- ctx := context.Background()
+func GetAdminContext(ctx context.Context) context.Context {
+ if ctx == nil {
+ ctx = context.Background()
+ }
ctx = SetUserID(ctx, "")
ctx = SetAdmin(ctx, true)
ctx = SetIsEnabled(ctx, true)
diff --git a/cmd/garm/main.go b/cmd/garm/main.go
index 4b10fbaa..df7a82e8 100644
--- a/cmd/garm/main.go
+++ b/cmd/garm/main.go
@@ -37,6 +37,7 @@ import (
"github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
+ runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
"github.com/cloudbase/garm/util/appdefaults"
"github.com/cloudbase/garm/websocket"
@@ -214,13 +215,18 @@ func main() {
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
+ // start the metrics collector
if cfg.Metrics.Enable {
- slog.InfoContext(ctx, "registering prometheus metrics collectors")
- if err := metrics.RegisterCollectors(runner); err != nil {
- log.Fatal(err)
- }
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
+
+ slog.InfoContext(ctx, "register metrics")
+ if err := metrics.RegisterMetrics(); err != nil {
+ log.Fatal(err)
+ }
+
+ slog.InfoContext(ctx, "start metrics collection")
+ runnerMetrics.CollectObjectMetric(ctx, runner, cfg.Metrics.Duration())
}
if cfg.Default.DebugServer {
diff --git a/config/config.go b/config/config.go
index 26505ae3..a12b91a9 100644
--- a/config/config.go
+++ b/config/config.go
@@ -456,8 +456,39 @@ func (t *TLSConfig) Validate() error {
}
type Metrics struct {
+ // DisableAuth, when true, disables JWT authentication on the
+ // metrics API endpoint
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
- Enable bool `toml:"enable" json:"enable"`
+ // Enable defines whether the API endpoint for metrics collection
+ // is enabled
+ Enable bool `toml:"enable" json:"enable"`
+ // Period defines the interval at which internal metrics are updated
+ // and propagated to the /metrics endpoint
+ Period time.Duration `toml:"period" json:"period"`
+}
+
+// ParseDuration parses the configured duration and returns a time.Duration of 0
+// if the duration is invalid.
+func (m *Metrics) ParseDuration() (time.Duration, error) {
+ duration, err := time.ParseDuration(fmt.Sprint(m.Period))
+ if err != nil {
+ return 0, err
+ }
+ return duration, nil
+}
+
+// Duration returns the configured duration or the default duration if no value
+// is configured or the configured value is invalid.
+func (m *Metrics) Duration() time.Duration {
+ duration, err := m.ParseDuration()
+ if err != nil {
+ slog.With(slog.Any("error", err)).Error(fmt.Sprintf("defined duration %s is invalid", m.Period))
+ }
+ if duration == 0 {
+ slog.Debug(fmt.Sprintf("using default duration %s for metrics update interval", appdefaults.DefaultMetricsUpdateInterval))
+ return appdefaults.DefaultMetricsUpdateInterval
+ }
+ return duration
}
// APIServer holds configuration for the API server
diff --git a/doc/config_metrics.md b/doc/config_metrics.md
index 8eaeb214..17161616 100644
--- a/doc/config_metrics.md
+++ b/doc/config_metrics.md
@@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
## Common metrics
-| Metric name | Type | Labels | Description |
-|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
-| `garm_health` | Gauge | `controller_id`=<controller id>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
-| `garm_webhooks_received` | Counter | `controller_id`=<controller id>
`name`=<hostname> | This is a counter that increments every time GARM receives a webhook from GitHub. |
+| Metric name | Type | Labels | Description |
+|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
+| `garm_health` | Gauge | `controller_id`=<controller id>
`callback_url`=<callback url>
`controller_webhook_url`=<controller webhook url>
`metadata_url`=<metadata url>
`webhook_url`=<webhook url>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
+| `garm_webhooks_received` | Counter | `valid`=<valid request>
`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. |
## Enterprise metrics
@@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
-| Metric name | Type | Labels | Description |
-|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
-| `garm_runner_status` | Gauge | `controller_id`=<controller id>
`hostname`=<hostname>
`name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
+| Metric name | Type | Labels | Description |
+|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
+| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
More metrics will be added in the future.
@@ -60,15 +60,27 @@ Metrics are disabled by default. To enable them, add the following to your confi
```toml
[metrics]
-# Toggle metrics. If set to false, the API endpoint for metrics collection will
-# be disabled.
-enable = true
+
# Toggle to disable authentication (not recommended) on the metrics endpoint.
# If you do disable authentication, I encourage you to put a reverse proxy in front
# of garm and limit which systems can access that particular endpoint. Ideally, you
# would enable some kind of authentication using the reverse proxy, if the built-in auth
# is not sufficient for your needs.
-disable_auth = false
+#
+# Default: false
+disable_auth = true
+
+# Toggle metrics. If set to false, the API endpoint for metrics collection will
+# be disabled.
+#
+# Default: false
+enable = true
+
+# period is the interval at which GARM refreshes the internal metrics it exposes on the
+# /metrics endpoint for controller-specific objects (e.g. runners, pools)
+#
+# Default: "60s"
+period = "30s"
```
You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it.
diff --git a/metrics/enterprise.go b/metrics/enterprise.go
index ddba98aa..f8382edf 100644
--- a/metrics/enterprise.go
+++ b/metrics/enterprise.go
@@ -1,50 +1,21 @@
package metrics
import (
- "log/slog"
- "strconv"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectOrganizationMetric collects the metrics for the enterprise objects
-func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
+var (
+ EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsEnterpriseSubsystem,
+ Name: "info",
+ Help: "Info of the enterprise",
+ }, []string{"name", "id"})
- enterprises, err := c.runner.ListEnterprises(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
- return
- }
-
- for _, enterprise := range enterprises {
-
- enterpriseInfo, err := prometheus.NewConstMetric(
- c.enterpriseInfo,
- prometheus.GaugeValue,
- 1,
- enterprise.Name, // label: name
- enterprise.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
- continue
- }
- ch <- enterpriseInfo
-
- enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
- c.enterprisePoolManagerStatus,
- prometheus.GaugeValue,
- bool2float64(enterprise.PoolManagerStatus.IsRunning),
- enterprise.Name, // label: name
- enterprise.ID, // label: id
- strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
- continue
- }
- ch <- enterprisePoolManagerStatus
- }
-}
+ EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsEnterpriseSubsystem,
+ Name: "pool_manager_status",
+ Help: "Status of the enterprise pool manager",
+ }, []string{"name", "id", "running"})
+)
diff --git a/metrics/health.go b/metrics/health.go
index da4b67dc..d1f5e969 100644
--- a/metrics/health.go
+++ b/metrics/health.go
@@ -1,22 +1,13 @@
package metrics
import (
- "log/slog"
-
"github.com/prometheus/client_golang/prometheus"
)
-func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- m, err := prometheus.NewConstMetric(
- c.healthMetric,
- prometheus.GaugeValue,
- 1,
- hostname,
- controllerID,
- )
- if err != nil {
- slog.With(slog.Any("error", err)).Error("error on creating health metric")
- return
- }
- ch <- m
-}
+var (
+ GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Name: "health",
+ Help: "Health of the garm",
+ }, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
+)
diff --git a/metrics/instance.go b/metrics/instance.go
index d89409ed..a4ac66bf 100644
--- a/metrics/instance.go
+++ b/metrics/instance.go
@@ -1,79 +1,14 @@
package metrics
import (
- "log/slog"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectInstanceMetric collects the metrics for the runner instances
-// reflecting the statuses and the pool they belong to.
-func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
-
- instances, err := c.runner.ListAllInstances(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances")
- return
- }
-
- pools, err := c.runner.ListAllPools(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
- return
- }
-
- type poolInfo struct {
- Name string
- Type string
- ProviderName string
- }
-
- poolNames := make(map[string]poolInfo)
- for _, pool := range pools {
- if pool.EnterpriseName != "" {
- poolNames[pool.ID] = poolInfo{
- Name: pool.EnterpriseName,
- Type: string(pool.PoolType()),
- ProviderName: pool.ProviderName,
- }
- } else if pool.OrgName != "" {
- poolNames[pool.ID] = poolInfo{
- Name: pool.OrgName,
- Type: string(pool.PoolType()),
- ProviderName: pool.ProviderName,
- }
- } else {
- poolNames[pool.ID] = poolInfo{
- Name: pool.RepoName,
- Type: string(pool.PoolType()),
- ProviderName: pool.ProviderName,
- }
- }
- }
-
- for _, instance := range instances {
-
- m, err := prometheus.NewConstMetric(
- c.instanceMetric,
- prometheus.GaugeValue,
- 1,
- instance.Name, // label: name
- string(instance.Status), // label: status
- string(instance.RunnerStatus), // label: runner_status
- poolNames[instance.PoolID].Name, // label: pool_owner
- poolNames[instance.PoolID].Type, // label: pool_type
- instance.PoolID, // label: pool_id
- hostname, // label: hostname
- controllerID, // label: controller_id
- poolNames[instance.PoolID].ProviderName, // label: provider
- )
-
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric")
- continue
- }
- ch <- m
- }
-}
+var (
+ InstanceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsRunnerSubsystem,
+ Name: "status",
+ Help: "Status of the instance",
+ }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
+)
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 61d02015..7dec272c 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -1,206 +1,53 @@
package metrics
import (
- "log/slog"
-
- "github.com/cloudbase/garm/auth"
- "github.com/cloudbase/garm/params"
- "github.com/cloudbase/garm/runner"
-
- "github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)
-const metricsNamespace = "garm_"
-const metricsRunnerSubsystem = "runner_"
-const metricsPoolSubsystem = "pool_"
-const metricsProviderSubsystem = "provider_"
-const metricsOrganizationSubsystem = "organization_"
-const metricsRepositorySubsystem = "repository_"
-const metricsEnterpriseSubsystem = "enterprise_"
-const metricsWebhookSubsystem = "webhook_"
+const metricsNamespace = "garm"
+const metricsRunnerSubsystem = "runner"
+const metricsPoolSubsystem = "pool"
+const metricsProviderSubsystem = "provider"
+const metricsOrganizationSubsystem = "organization"
+const metricsRepositorySubsystem = "repository"
+const metricsEnterpriseSubsystem = "enterprise"
+const metricsWebhookSubsystem = "webhook"
-var webhooksReceived *prometheus.CounterVec = nil
+// RegisterMetrics registers all the metrics
+func RegisterMetrics() error {
-// RecordWebhookWithLabels will increment a webhook metric identified by specific
-// values. If metrics are disabled, this function is a noop.
-func RecordWebhookWithLabels(lvs ...string) error {
- if webhooksReceived == nil {
- // not registered. Noop
- return nil
+ var collectors []prometheus.Collector
+ collectors = append(collectors,
+ // runner metrics
+ InstanceStatus,
+ // organization metrics
+ OrganizationInfo,
+ OrganizationPoolManagerStatus,
+ // enterprise metrics
+ EnterpriseInfo,
+ EnterprisePoolManagerStatus,
+ // repository metrics
+ RepositoryInfo,
+ RepositoryPoolManagerStatus,
+ // provider metrics
+ ProviderInfo,
+ // pool metrics
+ PoolInfo,
+ PoolStatus,
+ PoolMaxRunners,
+ PoolMinIdleRunners,
+ PoolBootstrapTimeout,
+ // health metrics
+ GarmHealth,
+ // webhook metrics
+ WebhooksReceived,
+ )
+
+ for _, c := range collectors {
+ if err := prometheus.Register(c); err != nil {
+ return err
+ }
}
- counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...)
- if err != nil {
- return errors.Wrap(err, "recording metric")
- }
- counter.Inc()
return nil
}
-
-func RegisterCollectors(runner *runner.Runner) error {
- if webhooksReceived != nil {
- // Already registered.
- return nil
- }
-
- garmCollector, err := NewGarmCollector(runner)
- if err != nil {
- return errors.Wrap(err, "getting collector")
- }
-
- if err := prometheus.Register(garmCollector); err != nil {
- return errors.Wrap(err, "registering collector")
- }
-
- // metric to count total webhooks received
- // at this point the webhook is not yet authenticated and
- // we don't know if it's meant for us or not
- webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
- Name: metricsNamespace + metricsWebhookSubsystem + "received",
- Help: "The total number of webhooks received",
- }, []string{"valid", "reason", "hostname", "controller_id"})
-
- err = prometheus.Register(webhooksReceived)
- if err != nil {
- return errors.Wrap(err, "registering webhooks recv counter")
- }
- return nil
-}
-
-type GarmCollector struct {
- healthMetric *prometheus.Desc
- instanceMetric *prometheus.Desc
-
- // pool metrics
- poolInfo *prometheus.Desc
- poolStatus *prometheus.Desc
- poolMaxRunners *prometheus.Desc
- poolMinIdleRunners *prometheus.Desc
- poolBootstrapTimeout *prometheus.Desc
-
- // provider metrics
- providerInfo *prometheus.Desc
-
- organizationInfo *prometheus.Desc
- organizationPoolManagerStatus *prometheus.Desc
- repositoryInfo *prometheus.Desc
- repositoryPoolManagerStatus *prometheus.Desc
- enterpriseInfo *prometheus.Desc
- enterprisePoolManagerStatus *prometheus.Desc
-
- runner *runner.Runner
- cachedControllerInfo params.ControllerInfo
-}
-
-func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) {
- controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
- if err != nil {
- return nil, errors.Wrap(err, "fetching controller info")
- }
- return &GarmCollector{
- runner: r,
- instanceMetric: prometheus.NewDesc(
- metricsNamespace+metricsRunnerSubsystem+"status",
- "Status of the runner",
- []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil,
- ),
- healthMetric: prometheus.NewDesc(
- metricsNamespace+"health",
- "Health of the runner",
- []string{"hostname", "controller_id"}, nil,
- ),
- poolInfo: prometheus.NewDesc(
- metricsNamespace+metricsPoolSubsystem+"info",
- "Information of the pool",
- []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil,
- ),
- poolStatus: prometheus.NewDesc(
- metricsNamespace+metricsPoolSubsystem+"status",
- "Status of the pool",
- []string{"id", "enabled"}, nil,
- ),
- poolMaxRunners: prometheus.NewDesc(
- metricsNamespace+metricsPoolSubsystem+"max_runners",
- "Max runners of the pool",
- []string{"id"}, nil,
- ),
- poolMinIdleRunners: prometheus.NewDesc(
- metricsNamespace+metricsPoolSubsystem+"min_idle_runners",
- "Min idle runners of the pool",
- []string{"id"}, nil,
- ),
- poolBootstrapTimeout: prometheus.NewDesc(
- metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout",
- "Bootstrap timeout of the pool",
- []string{"id"}, nil,
- ),
- providerInfo: prometheus.NewDesc(
- metricsNamespace+metricsProviderSubsystem+"info",
- "Info of the provider",
- []string{"name", "type", "description"}, nil,
- ),
- organizationInfo: prometheus.NewDesc(
- metricsNamespace+metricsOrganizationSubsystem+"info",
- "Info of the organization",
- []string{"name", "id"}, nil,
- ),
- organizationPoolManagerStatus: prometheus.NewDesc(
- metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status",
- "Status of the organization pool manager",
- []string{"name", "id", "running"}, nil,
- ),
- repositoryInfo: prometheus.NewDesc(
- metricsNamespace+metricsRepositorySubsystem+"info",
- "Info of the organization",
- []string{"name", "owner", "id"}, nil,
- ),
- repositoryPoolManagerStatus: prometheus.NewDesc(
- metricsNamespace+metricsRepositorySubsystem+"pool_manager_status",
- "Status of the repository pool manager",
- []string{"name", "id", "running"}, nil,
- ),
- enterpriseInfo: prometheus.NewDesc(
- metricsNamespace+metricsEnterpriseSubsystem+"info",
- "Info of the organization",
- []string{"name", "id"}, nil,
- ),
- enterprisePoolManagerStatus: prometheus.NewDesc(
- metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status",
- "Status of the enterprise pool manager",
- []string{"name", "id", "running"}, nil,
- ),
-
- cachedControllerInfo: controllerInfo,
- }, nil
-}
-
-func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- c.instanceMetric
- ch <- c.healthMetric
- ch <- c.poolInfo
- ch <- c.poolStatus
- ch <- c.poolMaxRunners
- ch <- c.poolMinIdleRunners
- ch <- c.providerInfo
- ch <- c.organizationInfo
- ch <- c.organizationPoolManagerStatus
- ch <- c.enterpriseInfo
- ch <- c.enterprisePoolManagerStatus
-}
-
-func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
- controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext())
- if err != nil {
- slog.With(slog.Any("error", err)).Error("failed to get controller info")
- return
- }
-
- c.CollectInstanceMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectHealthMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectPoolMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectProviderMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectOrganizationMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectRepositoryMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
- c.CollectEnterpriseMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
-}
diff --git a/metrics/organization.go b/metrics/organization.go
index e5ea8292..38d7c611 100644
--- a/metrics/organization.go
+++ b/metrics/organization.go
@@ -1,50 +1,21 @@
package metrics
import (
- "log/slog"
- "strconv"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectOrganizationMetric collects the metrics for the organization objects
-func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
+var (
+ OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsOrganizationSubsystem,
+ Name: "info",
+ Help: "Info of the organization",
+ }, []string{"name", "id"})
- organizations, err := c.runner.ListOrganizations(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
- return
- }
-
- for _, organization := range organizations {
-
- organizationInfo, err := prometheus.NewConstMetric(
- c.organizationInfo,
- prometheus.GaugeValue,
- 1,
- organization.Name, // label: name
- organization.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric")
- continue
- }
- ch <- organizationInfo
-
- organizationPoolManagerStatus, err := prometheus.NewConstMetric(
- c.organizationPoolManagerStatus,
- prometheus.GaugeValue,
- bool2float64(organization.PoolManagerStatus.IsRunning),
- organization.Name, // label: name
- organization.ID, // label: id
- strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric")
- continue
- }
- ch <- organizationPoolManagerStatus
- }
-}
+ OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsOrganizationSubsystem,
+ Name: "pool_manager_status",
+ Help: "Status of the organization pool manager",
+ }, []string{"name", "id", "running"})
+)
diff --git a/metrics/pool.go b/metrics/pool.go
index 110812bd..5803af90 100644
--- a/metrics/pool.go
+++ b/metrics/pool.go
@@ -1,121 +1,42 @@
package metrics
import (
- "log/slog"
- "strconv"
- "strings"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectPoolMetric collects the metrics for the pool objects
-func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
+var (
+ PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsPoolSubsystem,
+ Name: "info",
+ Help: "Info of the pool",
+ }, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
- pools, err := c.runner.ListAllPools(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
- return
- }
+ PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsPoolSubsystem,
+ Name: "status",
+ Help: "Status of the pool",
+ }, []string{"id", "enabled"})
- type poolInfo struct {
- Name string
- Type string
- }
+ PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsPoolSubsystem,
+ Name: "max_runners",
+ Help: "Maximum number of runners in the pool",
+ }, []string{"id"})
- poolNames := make(map[string]poolInfo)
- for _, pool := range pools {
- if pool.EnterpriseName != "" {
- poolNames[pool.ID] = poolInfo{
- Name: pool.EnterpriseName,
- Type: string(pool.PoolType()),
- }
- } else if pool.OrgName != "" {
- poolNames[pool.ID] = poolInfo{
- Name: pool.OrgName,
- Type: string(pool.PoolType()),
- }
- } else {
- poolNames[pool.ID] = poolInfo{
- Name: pool.RepoName,
- Type: string(pool.PoolType()),
- }
- }
+ PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsPoolSubsystem,
+ Name: "min_idle_runners",
+ Help: "Minimum number of idle runners in the pool",
+ }, []string{"id"})
- var poolTags []string
- for _, tag := range pool.Tags {
- poolTags = append(poolTags, tag.Name)
- }
-
- poolInfo, err := prometheus.NewConstMetric(
- c.poolInfo,
- prometheus.GaugeValue,
- 1,
- pool.ID, // label: id
- pool.Image, // label: image
- pool.Flavor, // label: flavor
- pool.Prefix, // label: prefix
- string(pool.OSType), // label: os_type
- string(pool.OSArch), // label: os_arch
- strings.Join(poolTags, ","), // label: tags
- pool.ProviderName, // label: provider
- poolNames[pool.ID].Name, // label: pool_owner
- poolNames[pool.ID].Type, // label: pool_type
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric")
- continue
- }
- ch <- poolInfo
-
- poolStatus, err := prometheus.NewConstMetric(
- c.poolStatus,
- prometheus.GaugeValue,
- bool2float64(pool.Enabled),
- pool.ID, // label: id
- strconv.FormatBool(pool.Enabled), // label: enabled
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric")
- continue
- }
- ch <- poolStatus
-
- poolMaxRunners, err := prometheus.NewConstMetric(
- c.poolMaxRunners,
- prometheus.GaugeValue,
- float64(pool.MaxRunners),
- pool.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric")
- continue
- }
- ch <- poolMaxRunners
-
- poolMinIdleRunners, err := prometheus.NewConstMetric(
- c.poolMinIdleRunners,
- prometheus.GaugeValue,
- float64(pool.MinIdleRunners),
- pool.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric")
- continue
- }
- ch <- poolMinIdleRunners
-
- poolBootstrapTimeout, err := prometheus.NewConstMetric(
- c.poolBootstrapTimeout,
- prometheus.GaugeValue,
- float64(pool.RunnerBootstrapTimeout),
- pool.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric")
- continue
- }
- ch <- poolBootstrapTimeout
- }
-}
+ PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsPoolSubsystem,
+ Name: "bootstrap_timeout",
+ Help: "Runner bootstrap timeout in the pool",
+ }, []string{"id"})
+)
diff --git a/metrics/provider.go b/metrics/provider.go
index e51f295e..5034a7e9 100644
--- a/metrics/provider.go
+++ b/metrics/provider.go
@@ -1,36 +1,14 @@
package metrics
import (
- "log/slog"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectPoolMetric collects the metrics for the pool objects
-func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
-
- providers, err := c.runner.ListProviders(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
- return
- }
-
- for _, provider := range providers {
-
- providerInfo, err := prometheus.NewConstMetric(
- c.providerInfo,
- prometheus.GaugeValue,
- 1,
- provider.Name, // label: name
- string(provider.ProviderType), // label: type
- provider.Description, // label: description
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric")
- continue
- }
- ch <- providerInfo
- }
-}
+// ProviderInfo is a gauge vector with one series per provider, labelled with
+// the provider's name, type and description.
+var ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+	Namespace: metricsNamespace,
+	Subsystem: metricsProviderSubsystem,
+	Name:      "info",
+	Help:      "Info of the provider",
+}, []string{"name", "type", "description"})
diff --git a/metrics/repository.go b/metrics/repository.go
index b778782e..a84dd120 100644
--- a/metrics/repository.go
+++ b/metrics/repository.go
@@ -1,51 +1,21 @@
package metrics
import (
- "log/slog"
- "strconv"
-
- "github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
-// CollectOrganizationMetric collects the metrics for the repository objects
-func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
- ctx := auth.GetAdminContext()
+var (
+	RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: metricsNamespace,
+		Subsystem: metricsRepositorySubsystem,
+		Name:      "info",
+		Help:      "Info of the repository",
+	}, []string{"name", "id"})
- repositories, err := c.runner.ListRepositories(ctx)
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
- return
- }
-
- for _, repository := range repositories {
-
- repositoryInfo, err := prometheus.NewConstMetric(
- c.repositoryInfo,
- prometheus.GaugeValue,
- 1,
- repository.Name, // label: name
- repository.Owner, // label: owner
- repository.ID, // label: id
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric")
- continue
- }
- ch <- repositoryInfo
-
- repositoryPoolManagerStatus, err := prometheus.NewConstMetric(
- c.repositoryPoolManagerStatus,
- prometheus.GaugeValue,
- bool2float64(repository.PoolManagerStatus.IsRunning),
- repository.Name, // label: name
- repository.ID, // label: id
- strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
- )
- if err != nil {
- slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric")
- continue
- }
- ch <- repositoryPoolManagerStatus
- }
-}
+	RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: metricsNamespace,
+		Subsystem: metricsRepositorySubsystem,
+		Name:      "pool_manager_status",
+		Help:      "Status of the repository pool manager",
+	}, []string{"name", "id", "running"})
+)
diff --git a/metrics/util.go b/metrics/util.go
index ae2d7087..b2edb580 100644
--- a/metrics/util.go
+++ b/metrics/util.go
@@ -1,6 +1,6 @@
package metrics
-func bool2float64(b bool) float64 {
+func Bool2float64(b bool) float64 {
if b {
return 1
}
diff --git a/metrics/webhooks.go b/metrics/webhooks.go
new file mode 100644
index 00000000..14b6492c
--- /dev/null
+++ b/metrics/webhooks.go
@@ -0,0 +1,12 @@
+package metrics
+
+import "github.com/prometheus/client_golang/prometheus"
+
+// WebhooksReceived counts the webhooks received, partitioned by the "valid"
+// and "reason" labels.
+var WebhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
+	Namespace: metricsNamespace,
+	Subsystem: metricsWebhookSubsystem,
+	Name:      "received",
+	Help:      "The total number of webhooks received",
+}, []string{"valid", "reason"})
diff --git a/runner/enterprises_test.go b/runner/enterprises_test.go
index 809577a7..922f1bff 100644
--- a/runner/enterprises_test.go
+++ b/runner/enterprises_test.go
@@ -60,7 +60,7 @@ type EnterpriseTestSuite struct {
}
func (s *EnterpriseTestSuite) SetupTest() {
- adminCtx := auth.GetAdminContext()
+ adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
diff --git a/runner/metrics/enterprise.go b/runner/metrics/enterprise.go
new file mode 100644
index 00000000..8cce89d6
--- /dev/null
+++ b/runner/metrics/enterprise.go
@@ -0,0 +1,36 @@
+package metrics
+
+import (
+ "context"
+ "strconv"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectEnterpriseMetric rebuilds the enterprise info and pool manager status
+// gauges from the enterprises returned by r.ListEnterprises.
+func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error {
+	// Clear the previous series so that only the enterprises listed below are exported.
+	metrics.EnterpriseInfo.Reset()
+	metrics.EnterprisePoolManagerStatus.Reset()
+
+	enterprises, err := r.ListEnterprises(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, ent := range enterprises {
+		name, id := ent.Name, ent.ID
+		running := ent.PoolManagerStatus.IsRunning
+
+		metrics.EnterpriseInfo.WithLabelValues(name, id).Set(1)
+
+		metrics.EnterprisePoolManagerStatus.WithLabelValues(
+			name,                        // label: name
+			id,                          // label: id
+			strconv.FormatBool(running), // label: running
+		).Set(metrics.Bool2float64(running))
+	}
+	return nil
+}
diff --git a/runner/metrics/health.go b/runner/metrics/health.go
new file mode 100644
index 00000000..d70adf10
--- /dev/null
+++ b/runner/metrics/health.go
@@ -0,0 +1,20 @@
+package metrics
+
+import (
+ "context"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/params"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectHealthMetric sets the GarmHealth gauge to 1, labelled with the URLs and
+// controller ID from controllerInfo. It never fails; ctx and r are unused.
+func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
+	metrics.GarmHealth.WithLabelValues(
+		controllerInfo.MetadataURL, controllerInfo.CallbackURL, // labels: metadata_url, callback_url
+		controllerInfo.WebhookURL, controllerInfo.ControllerWebhookURL, // labels: webhook_url, controller_webhook_url
+		controllerInfo.ControllerID.String(), // label: controller_id
+	).Set(1)
+	return nil
+}
diff --git a/runner/metrics/instance.go b/runner/metrics/instance.go
new file mode 100644
index 00000000..a32fa081
--- /dev/null
+++ b/runner/metrics/instance.go
@@ -0,0 +1,69 @@
+package metrics
+
+import (
+ "context"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectInstanceMetric rebuilds the InstanceStatus gauge: one series with value 1
+// per runner instance, labelled with the instance's own status fields plus the
+// owner name, type and provider of the pool it belongs to.
+//
+// The gauge is reset before anything is listed, so if either listing call fails
+// the error is returned and the gauge is left empty.
+func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
+	// reset metrics
+	metrics.InstanceStatus.Reset()
+
+	instances, err := r.ListAllInstances(ctx)
+	if err != nil {
+		return err
+	}
+
+	pools, err := r.ListAllPools(ctx)
+	if err != nil {
+		return err
+	}
+
+	// poolMeta carries the pool-derived label values for an instance.
+	type poolMeta struct {
+		owner    string
+		kind     string
+		provider string
+	}
+
+	// Index the pools by ID so each instance can be matched to its pool below.
+	byPoolID := make(map[string]poolMeta)
+	for _, pool := range pools {
+		// Owner name precedence: enterprise, then organization, then repository.
+		owner := pool.RepoName
+		switch {
+		case pool.EnterpriseName != "":
+			owner = pool.EnterpriseName
+		case pool.OrgName != "":
+			owner = pool.OrgName
+		}
+		byPoolID[pool.ID] = poolMeta{
+			owner:    owner,
+			kind:     string(pool.PoolType()),
+			provider: pool.ProviderName,
+		}
+	}
+
+	for _, instance := range instances {
+		// A pool ID missing from the index yields empty pool labels.
+		meta := byPoolID[instance.PoolID]
+		metrics.InstanceStatus.WithLabelValues(
+			instance.Name,                 // label: name
+			string(instance.Status),       // label: status
+			string(instance.RunnerStatus), // label: runner_status
+			meta.owner,                    // label: pool_owner
+			meta.kind,                     // label: pool_type
+			instance.PoolID,               // label: pool_id
+			meta.provider,                 // label: provider
+		).Set(1)
+	}
+	return nil
+}
diff --git a/runner/metrics/metrics.go b/runner/metrics/metrics.go
new file mode 100644
index 00000000..59fc6bbb
--- /dev/null
+++ b/runner/metrics/metrics.go
@@ -0,0 +1,90 @@
+package metrics
+
+import (
+ "context"
+ "log/slog"
+ "time"
+
+ "github.com/cloudbase/garm/auth"
+ "github.com/cloudbase/garm/params"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectObjectMetric collects all object metrics once, synchronously, and then
+// starts a goroutine that repeats the collection every duration until ctx is
+// done. duration goes to time.NewTicker, which panics if it is not positive.
+func CollectObjectMetric(ctx context.Context, r *runner.Runner, duration time.Duration) {
+	ctx = auth.GetAdminContext(ctx)
+
+	// Looked up once and reused by every run; a lookup failure is only logged.
+	controllerInfo, err := r.GetControllerInfo(ctx)
+	if err != nil {
+		slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info")
+	}
+
+	collect := func() {
+		slog.InfoContext(ctx, "collecting metrics")
+		if err := collectMetrics(ctx, r, controllerInfo); err != nil {
+			slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics")
+		}
+	}
+	collect() // first run happens now rather than after the first tick
+
+	go func() {
+		ticker := time.NewTicker(duration)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				collect()
+			}
+		}
+	}()
+}
+
+// collectMetrics runs every collector in a fixed order, logging a debug line
+// before each, and returns the first error; collectors after it are not run.
+func collectMetrics(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
+	steps := []struct {
+		msg string
+		run func() error
+	}{
+		{
+			msg: "collecting organization metrics",
+			run: func() error { return CollectOrganizationMetric(ctx, r) },
+		},
+		{
+			msg: "collecting enterprise metrics",
+			run: func() error { return CollectEnterpriseMetric(ctx, r) },
+		},
+		{
+			msg: "collecting repository metrics",
+			run: func() error { return CollectRepositoryMetric(ctx, r) },
+		},
+		{
+			msg: "collecting provider metrics",
+			run: func() error { return CollectProviderMetric(ctx, r) },
+		},
+		{
+			msg: "collecting pool metrics",
+			run: func() error { return CollectPoolMetric(ctx, r) },
+		},
+		{
+			msg: "collecting instance metrics",
+			run: func() error { return CollectInstanceMetric(ctx, r) },
+		},
+		{
+			msg: "collecting health metrics",
+			run: func() error { return CollectHealthMetric(ctx, r, controllerInfo) },
+		},
+	}
+	for _, step := range steps {
+		slog.DebugContext(ctx, step.msg)
+		if err := step.run(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runner/metrics/organization.go b/runner/metrics/organization.go
new file mode 100644
index 00000000..0be9ced6
--- /dev/null
+++ b/runner/metrics/organization.go
@@ -0,0 +1,36 @@
+package metrics
+
+import (
+ "context"
+ "strconv"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectOrganizationMetric rebuilds the organization info and pool manager
+// status gauges from the organizations returned by r.ListOrganizations.
+func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error {
+	// Clear the previous series so that only the organizations listed below are exported.
+	metrics.OrganizationInfo.Reset()
+	metrics.OrganizationPoolManagerStatus.Reset()
+
+	orgs, err := r.ListOrganizations(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, org := range orgs {
+		name, id := org.Name, org.ID
+		running := org.PoolManagerStatus.IsRunning
+
+		metrics.OrganizationInfo.WithLabelValues(name, id).Set(1)
+
+		metrics.OrganizationPoolManagerStatus.WithLabelValues(
+			name,                        // label: name
+			id,                          // label: id
+			strconv.FormatBool(running), // label: running
+		).Set(metrics.Bool2float64(running))
+	}
+	return nil
+}
diff --git a/runner/metrics/pool.go b/runner/metrics/pool.go
new file mode 100644
index 00000000..817cb104
--- /dev/null
+++ b/runner/metrics/pool.go
@@ -0,0 +1,87 @@
+package metrics
+
+import (
+ "context"
+ "strconv"
+ "strings"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectPoolMetric rebuilds the pool gauges from the pools returned by
+// r.ListAllPools. For every pool it sets:
+//   - PoolInfo to 1, carrying the descriptive labels
+//   - PoolStatus to Bool2float64 of the pool's Enabled flag
+//   - PoolMaxRunners, PoolMinIdleRunners and PoolBootstrapTimeout to the
+//     matching pool field converted to float64
+//
+// All five gauges are reset before the pools are listed, so when listing fails
+// the error is returned and the gauges are left empty.
+func CollectPoolMetric(ctx context.Context, r *runner.Runner) error {
+	// reset metrics: every gauge is rebuilt from scratch on each call, so only
+	// the pools listed below end up exported.
+	metrics.PoolInfo.Reset()
+	metrics.PoolStatus.Reset()
+	metrics.PoolMaxRunners.Reset()
+	metrics.PoolMinIdleRunners.Reset()
+	metrics.PoolBootstrapTimeout.Reset()
+
+	pools, err := r.ListAllPools(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, pool := range pools {
+		// Values shared by more than one gauge below.
+		id := pool.ID
+		enabled := pool.Enabled
+
+		// Owner name precedence: enterprise, then organization, then repository.
+		owner := pool.RepoName
+		switch {
+		case pool.EnterpriseName != "":
+			owner = pool.EnterpriseName
+		case pool.OrgName != "":
+			owner = pool.OrgName
+		}
+		poolType := string(pool.PoolType())
+
+		// The tags label is the comma-separated list of the pool's tag names.
+		var tagNames []string
+		for _, tag := range pool.Tags {
+			tagNames = append(tagNames, tag.Name)
+		}
+		tags := strings.Join(tagNames, ",")
+
+		// Info gauge: the value is constant, the labels carry the data.
+		metrics.PoolInfo.WithLabelValues(
+			id,                  // label: id
+			pool.Image,          // label: image
+			pool.Flavor,         // label: flavor
+			pool.Prefix,         // label: prefix
+			string(pool.OSType), // label: os_type
+			string(pool.OSArch), // label: os_arch
+			tags,                // label: tags
+			pool.ProviderName,   // label: provider
+			owner,               // label: pool_owner
+			poolType,            // label: pool_type
+		).Set(1)
+
+		metrics.PoolStatus.WithLabelValues(
+			id,                          // label: id
+			strconv.FormatBool(enabled), // label: enabled
+		).Set(metrics.Bool2float64(enabled))
+
+		// The remaining gauges are labelled with the pool ID only.
+		maxRunners := float64(pool.MaxRunners)
+		metrics.PoolMaxRunners.WithLabelValues(id).Set(maxRunners)
+
+		minIdleRunners := float64(pool.MinIdleRunners)
+		metrics.PoolMinIdleRunners.WithLabelValues(id).Set(minIdleRunners)
+
+		bootstrapTimeout := float64(pool.RunnerBootstrapTimeout)
+		metrics.PoolBootstrapTimeout.WithLabelValues(id).Set(bootstrapTimeout)
+	}
+	return nil
+}
diff --git a/runner/metrics/provider.go b/runner/metrics/provider.go
new file mode 100644
index 00000000..398f8ee3
--- /dev/null
+++ b/runner/metrics/provider.go
@@ -0,0 +1,27 @@
+package metrics
+
+import (
+ "context"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectProviderMetric rebuilds the ProviderInfo gauge, setting one series to 1
+// for each provider returned by r.ListProviders.
+func CollectProviderMetric(ctx context.Context, r *runner.Runner) error {
+	// reset metrics
+	metrics.ProviderInfo.Reset()
+
+	providers, err := r.ListProviders(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range providers {
+		// label order: name, type, description
+		name, kind, description := p.Name, string(p.ProviderType), p.Description
+		metrics.ProviderInfo.WithLabelValues(name, kind, description).Set(1)
+	}
+	return nil
+}
diff --git a/runner/metrics/repository.go b/runner/metrics/repository.go
new file mode 100644
index 00000000..ba2fab29
--- /dev/null
+++ b/runner/metrics/repository.go
@@ -0,0 +1,35 @@
+package metrics
+
+import (
+ "context"
+ "strconv"
+
+ "github.com/cloudbase/garm/metrics"
+ "github.com/cloudbase/garm/runner"
+)
+
+// CollectRepositoryMetric rebuilds the repository gauges from r.ListRepositories.
+func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error {
+	// reset metrics; only the Repository* gauges, the Enterprise* ones belong to CollectEnterpriseMetric
+	metrics.RepositoryInfo.Reset()
+	metrics.RepositoryPoolManagerStatus.Reset()
+
+	repositories, err := r.ListRepositories(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, repository := range repositories {
+		metrics.RepositoryInfo.WithLabelValues(
+			repository.Name, // label: name
+			repository.ID,   // label: id
+		).Set(1)
+
+		metrics.RepositoryPoolManagerStatus.WithLabelValues(
+			repository.Name, // label: name
+			repository.ID,   // label: id
+			strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
+		).Set(metrics.Bool2float64(repository.PoolManagerStatus.IsRunning))
+	}
+	return nil
+}
diff --git a/runner/organizations_test.go b/runner/organizations_test.go
index 3aa3e427..167dbccf 100644
--- a/runner/organizations_test.go
+++ b/runner/organizations_test.go
@@ -60,7 +60,7 @@ type OrgTestSuite struct {
}
func (s *OrgTestSuite) SetupTest() {
- adminCtx := auth.GetAdminContext()
+ adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
diff --git a/runner/pools_test.go b/runner/pools_test.go
index db112b69..94fee6b6 100644
--- a/runner/pools_test.go
+++ b/runner/pools_test.go
@@ -47,7 +47,7 @@ type PoolTestSuite struct {
}
func (s *PoolTestSuite) SetupTest() {
- adminCtx := auth.GetAdminContext()
+ adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
diff --git a/runner/repositories_test.go b/runner/repositories_test.go
index 4c918124..e5b59ade 100644
--- a/runner/repositories_test.go
+++ b/runner/repositories_test.go
@@ -59,7 +59,7 @@ type RepoTestSuite struct {
}
func (s *RepoTestSuite) SetupTest() {
- adminCtx := auth.GetAdminContext()
+ adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
@@ -90,7 +90,7 @@ func (s *RepoTestSuite) SetupTest() {
var minIdleRunners uint = 20
providerMock := runnerCommonMocks.NewProvider(s.T())
fixtures := &RepoTestFixtures{
- AdminContext: auth.GetAdminContext(),
+ AdminContext: auth.GetAdminContext(context.Background()),
Store: db,
StoreRepos: repos,
Providers: map[string]common.Provider{
diff --git a/util/appdefaults/appdefaults.go b/util/appdefaults/appdefaults.go
index d0d86976..26faa667 100644
--- a/util/appdefaults/appdefaults.go
+++ b/util/appdefaults/appdefaults.go
@@ -27,4 +27,7 @@ const (
// uploadBaseURL is the default URL for guthub uploads.
GithubDefaultUploadBaseURL = "https://uploads.github.com/"
+
+ // DefaultMetricsUpdateInterval is the default interval between metrics data updates.
+ DefaultMetricsUpdateInterval = 60 * time.Second
)