diff --git a/apiserver/controllers/controllers.go b/apiserver/controllers/controllers.go index d7dd5f44..f8a8c3b3 100644 --- a/apiserver/controllers/controllers.go +++ b/apiserver/controllers/controllers.go @@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) { } } -func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string { - controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext()) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info") - // If labels are empty, not attempt will be made to record webhook. - return []string{} - } - return []string{ - valid, reason, - controllerInfo.Hostname, controllerInfo.ControllerID.String(), - } -} - func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) { defer r.Body.Close() body, err := io.ReadAll(r.Body) @@ -119,31 +106,47 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo signature := r.Header.Get("X-Hub-Signature-256") hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type") - var labelValues []string - defer func() { - if len(labelValues) == 0 { - return - } - if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric") - } - }() + controllerInfo, err := a.r.GetControllerInfo(ctx) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info") + return + } if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil { if errors.Is(err, gErrors.ErrNotFound) { - labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown") + metrics.WebhooksReceived.WithLabelValues( + "false", // label: valid + "owner_unknown", // label: reason + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: controller_id + ).Inc() slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?") return } else if strings.Contains(err.Error(), "signature") { // TODO: check error type - labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid") + metrics.WebhooksReceived.WithLabelValues( + "false", // label: valid + "signature_invalid", // label: reason + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: controller_id + ).Inc() } else { - labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown") + metrics.WebhooksReceived.WithLabelValues( + "false", // label: valid + "unknown", // label: reason + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: controller_id + ).Inc() } handleError(ctx, w, err) return } - labelValues = a.webhookMetricLabelValues(ctx, "true", "") + metrics.WebhooksReceived.WithLabelValues( + "true", // label: valid + "", // label: reason + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: controller_id + ).Inc() } func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) { diff --git a/cmd/garm/main.go b/cmd/garm/main.go index 4b10fbaa..16e411e7 100644 --- a/cmd/garm/main.go +++ b/cmd/garm/main.go @@ -35,8 +35,8 @@ import ( "github.com/cloudbase/garm/config" "github.com/cloudbase/garm/database" "github.com/cloudbase/garm/database/common" - "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/runner" + runnerMetrics "github.com/cloudbase/garm/runner/metrics" garmUtil "github.com/cloudbase/garm/util" "github.com/cloudbase/garm/util/appdefaults" "github.com/cloudbase/garm/websocket" @@ -214,13 +214,13 @@ func main() { router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement) + // start the metrics collector if cfg.Metrics.Enable { - slog.InfoContext(ctx, "registering prometheus metrics collectors") - if err := metrics.RegisterCollectors(runner); err != nil { - log.Fatal(err) - } slog.InfoContext(ctx, "setting up metric routes") router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware) + + slog.InfoContext(ctx, "start metrics collection") + runnerMetrics.CollectObjectMetric(runner, time.NewTicker(cfg.Metrics.Period)) } if cfg.Default.DebugServer { diff --git a/config/config.go b/config/config.go index 26505ae3..7ab850a9 100644 --- a/config/config.go +++ b/config/config.go @@ -456,8 +456,9 @@ func (t *TLSConfig) Validate() error { } type Metrics struct { - DisableAuth bool `toml:"disable_auth" json:"disable-auth"` - Enable bool `toml:"enable" json:"enable"` + DisableAuth bool `toml:"disable_auth" json:"disable-auth"` + Enable bool `toml:"enable" json:"enable"` + Period time.Duration `toml:"period" json:"period"` } // APIServer holds configuration for the API server diff --git a/metrics/enterprise.go b/metrics/enterprise.go index ddba98aa..f8382edf 100644 --- a/metrics/enterprise.go +++ b/metrics/enterprise.go @@ -1,50 +1,21 @@ package metrics import ( - "log/slog" - "strconv" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectOrganizationMetric collects the metrics for the enterprise objects -func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() +var ( + EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsEnterpriseSubsystem, + Name: "info", + Help: "Info of the enterprise", + }, []string{"name", "id"}) - enterprises, err := c.runner.ListEnterprises(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") - return - } - - for _, enterprise := range enterprises { - - enterpriseInfo, err := prometheus.NewConstMetric( - c.enterpriseInfo, - prometheus.GaugeValue, - 1, - enterprise.Name, // label: name - enterprise.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric") - continue - } - ch <- enterpriseInfo - - enterprisePoolManagerStatus, err := prometheus.NewConstMetric( - c.enterprisePoolManagerStatus, - prometheus.GaugeValue, - bool2float64(enterprise.PoolManagerStatus.IsRunning), - enterprise.Name, // label: name - enterprise.ID, // label: id - strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric") - continue - } - ch <- enterprisePoolManagerStatus - } -} + EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsEnterpriseSubsystem, + Name: "pool_manager_status", + Help: "Status of the enterprise pool manager", + }, []string{"name", "id", "running"}) +) diff --git a/metrics/health.go b/metrics/health.go index da4b67dc..8c8d6bcd 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -1,22 +1,13 @@ package metrics import ( - "log/slog" - "github.com/prometheus/client_golang/prometheus" ) -func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - m, err := prometheus.NewConstMetric( - c.healthMetric, - prometheus.GaugeValue, - 1, - hostname, - controllerID, - ) - if err != nil { - slog.With(slog.Any("error", err)).Error("error on creating health metric") - return - } - ch <- m -} +var ( + GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Name: "health", + Help: "Health of the garm", + }, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"}) +) diff --git a/metrics/instance.go b/metrics/instance.go index d89409ed..e37d80fe 100644 --- a/metrics/instance.go +++ b/metrics/instance.go @@ -1,79 +1,14 @@ package metrics import ( - "log/slog" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectInstanceMetric collects the metrics for the runner instances -// reflecting the statuses and the pool they belong to. -func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() - - instances, err := c.runner.ListAllInstances(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances") - return - } - - pools, err := c.runner.ListAllPools(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") - return - } - - type poolInfo struct { - Name string - Type string - ProviderName string - } - - poolNames := make(map[string]poolInfo) - for _, pool := range pools { - if pool.EnterpriseName != "" { - poolNames[pool.ID] = poolInfo{ - Name: pool.EnterpriseName, - Type: string(pool.PoolType()), - ProviderName: pool.ProviderName, - } - } else if pool.OrgName != "" { - poolNames[pool.ID] = poolInfo{ - Name: pool.OrgName, - Type: string(pool.PoolType()), - ProviderName: pool.ProviderName, - } - } else { - poolNames[pool.ID] = poolInfo{ - Name: pool.RepoName, - Type: string(pool.PoolType()), - ProviderName: pool.ProviderName, - } - } - } - - for _, instance := range instances { - - m, err := prometheus.NewConstMetric( - c.instanceMetric, - prometheus.GaugeValue, - 1, - instance.Name, // label: name - string(instance.Status), // label: status - string(instance.RunnerStatus), // label: runner_status - poolNames[instance.PoolID].Name, // label: pool_owner - poolNames[instance.PoolID].Type, // label: pool_type - instance.PoolID, // label: pool_id - hostname, // label: hostname - controllerID, // label: controller_id - poolNames[instance.PoolID].ProviderName, // label: provider - ) - - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric") - continue - } - ch <- m - } -} +var ( + InstanceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsRunnerSubsystem, + Name: "status", + Help: "Status of the instance", + }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}) +) diff --git a/metrics/metrics.go b/metrics/metrics.go index 61d02015..2bdec860 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -1,206 +1,41 @@ package metrics import ( - "log/slog" - - "github.com/cloudbase/garm/auth" - "github.com/cloudbase/garm/params" - "github.com/cloudbase/garm/runner" - - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" ) -const metricsNamespace = "garm_" -const metricsRunnerSubsystem = "runner_" -const metricsPoolSubsystem = "pool_" -const metricsProviderSubsystem = "provider_" -const metricsOrganizationSubsystem = "organization_" -const metricsRepositorySubsystem = "repository_" -const metricsEnterpriseSubsystem = "enterprise_" -const metricsWebhookSubsystem = "webhook_" - -var webhooksReceived *prometheus.CounterVec = nil - -// RecordWebhookWithLabels will increment a webhook metric identified by specific -// values. If metrics are disabled, this function is a noop. -func RecordWebhookWithLabels(lvs ...string) error { - if webhooksReceived == nil { - // not registered. Noop - return nil - } - - counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...) - if err != nil { - return errors.Wrap(err, "recording metric") - } - counter.Inc() - return nil -} - -func RegisterCollectors(runner *runner.Runner) error { - if webhooksReceived != nil { - // Already registered. - return nil - } - - garmCollector, err := NewGarmCollector(runner) - if err != nil { - return errors.Wrap(err, "getting collector") - } - - if err := prometheus.Register(garmCollector); err != nil { - return errors.Wrap(err, "registering collector") - } - - // metric to count total webhooks received - // at this point the webhook is not yet authenticated and - // we don't know if it's meant for us or not - webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: metricsNamespace + metricsWebhookSubsystem + "received", - Help: "The total number of webhooks received", - }, []string{"valid", "reason", "hostname", "controller_id"}) - - err = prometheus.Register(webhooksReceived) - if err != nil { - return errors.Wrap(err, "registering webhooks recv counter") - } - return nil -} - -type GarmCollector struct { - healthMetric *prometheus.Desc - instanceMetric *prometheus.Desc - - // pool metrics - poolInfo *prometheus.Desc - poolStatus *prometheus.Desc - poolMaxRunners *prometheus.Desc - poolMinIdleRunners *prometheus.Desc - poolBootstrapTimeout *prometheus.Desc +const metricsNamespace = "garm" +const metricsRunnerSubsystem = "runner" +const metricsPoolSubsystem = "pool" +const metricsProviderSubsystem = "provider" +const metricsOrganizationSubsystem = "organization" +const metricsRepositorySubsystem = "repository" +const metricsEnterpriseSubsystem = "enterprise" +const metricsWebhookSubsystem = "webhook" +func init() { + // runner metrics + prometheus.MustRegister(InstanceStatus) + // organization metrics + prometheus.MustRegister(OrganizationInfo) + prometheus.MustRegister(OrganizationPoolManagerStatus) + // enterprise metrics + prometheus.MustRegister(EnterpriseInfo) + prometheus.MustRegister(EnterprisePoolManagerStatus) + // repository metrics + prometheus.MustRegister(RepositoryInfo) + prometheus.MustRegister(RepositoryPoolManagerStatus) // provider metrics - providerInfo *prometheus.Desc + prometheus.MustRegister(ProviderInfo) + // pool metrics + prometheus.MustRegister(PoolInfo) + prometheus.MustRegister(PoolStatus) + prometheus.MustRegister(PoolMaxRunners) + prometheus.MustRegister(PoolMinIdleRunners) + prometheus.MustRegister(PoolBootstrapTimeout) + // health metrics + prometheus.MustRegister(GarmHealth) + // webhook metrics + prometheus.MustRegister(WebhooksReceived) - organizationInfo *prometheus.Desc - organizationPoolManagerStatus *prometheus.Desc - repositoryInfo *prometheus.Desc - repositoryPoolManagerStatus *prometheus.Desc - enterpriseInfo *prometheus.Desc - enterprisePoolManagerStatus *prometheus.Desc - - runner *runner.Runner - cachedControllerInfo params.ControllerInfo -} - -func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) { - controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext()) - if err != nil { - return nil, errors.Wrap(err, "fetching controller info") - } - return &GarmCollector{ - runner: r, - instanceMetric: prometheus.NewDesc( - metricsNamespace+metricsRunnerSubsystem+"status", - "Status of the runner", - []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil, - ), - healthMetric: prometheus.NewDesc( - metricsNamespace+"health", - "Health of the runner", - []string{"hostname", "controller_id"}, nil, - ), - poolInfo: prometheus.NewDesc( - metricsNamespace+metricsPoolSubsystem+"info", - "Information of the pool", - []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil, - ), - poolStatus: prometheus.NewDesc( - metricsNamespace+metricsPoolSubsystem+"status", - "Status of the pool", - []string{"id", "enabled"}, nil, - ), - poolMaxRunners: prometheus.NewDesc( - metricsNamespace+metricsPoolSubsystem+"max_runners", - "Max runners of the pool", - []string{"id"}, nil, - ), - poolMinIdleRunners: prometheus.NewDesc( - metricsNamespace+metricsPoolSubsystem+"min_idle_runners", - "Min idle runners of the pool", - []string{"id"}, nil, - ), - poolBootstrapTimeout: prometheus.NewDesc( - metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout", - "Bootstrap timeout of the pool", - []string{"id"}, nil, - ), - providerInfo: prometheus.NewDesc( - metricsNamespace+metricsProviderSubsystem+"info", - "Info of the provider", - []string{"name", "type", "description"}, nil, - ), - organizationInfo: prometheus.NewDesc( - metricsNamespace+metricsOrganizationSubsystem+"info", - "Info of the organization", - []string{"name", "id"}, nil, - ), - organizationPoolManagerStatus: prometheus.NewDesc( - metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status", - "Status of the organization pool manager", - []string{"name", "id", "running"}, nil, - ), - repositoryInfo: prometheus.NewDesc( - metricsNamespace+metricsRepositorySubsystem+"info", - "Info of the organization", - []string{"name", "owner", "id"}, nil, - ), - repositoryPoolManagerStatus: prometheus.NewDesc( - metricsNamespace+metricsRepositorySubsystem+"pool_manager_status", - "Status of the repository pool manager", - []string{"name", "id", "running"}, nil, - ), - enterpriseInfo: prometheus.NewDesc( - metricsNamespace+metricsEnterpriseSubsystem+"info", - "Info of the organization", - []string{"name", "id"}, nil, - ), - enterprisePoolManagerStatus: prometheus.NewDesc( - metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status", - "Status of the enterprise pool manager", - []string{"name", "id", "running"}, nil, - ), - - cachedControllerInfo: controllerInfo, - }, nil -} - -func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.instanceMetric - ch <- c.healthMetric - ch <- c.poolInfo - ch <- c.poolStatus - ch <- c.poolMaxRunners - ch <- c.poolMinIdleRunners - ch <- c.providerInfo - ch <- c.organizationInfo - ch <- c.organizationPoolManagerStatus - ch <- c.enterpriseInfo - ch <- c.enterprisePoolManagerStatus -} - -func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) { - controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext()) - if err != nil { - slog.With(slog.Any("error", err)).Error("failed to get controller info") - return - } - - c.CollectInstanceMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectHealthMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectPoolMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectProviderMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectOrganizationMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectRepositoryMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) - c.CollectEnterpriseMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String()) } diff --git a/metrics/organization.go b/metrics/organization.go index e5ea8292..38d7c611 100644 --- a/metrics/organization.go +++ b/metrics/organization.go @@ -1,50 +1,21 @@ package metrics import ( - "log/slog" - "strconv" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectOrganizationMetric collects the metrics for the organization objects -func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() +var ( + OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsOrganizationSubsystem, + Name: "info", + Help: "Info of the organization", + }, []string{"name", "id"}) - organizations, err := c.runner.ListOrganizations(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") - return - } - - for _, organization := range organizations { - - organizationInfo, err := prometheus.NewConstMetric( - c.organizationInfo, - prometheus.GaugeValue, - 1, - organization.Name, // label: name - organization.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric") - continue - } - ch <- organizationInfo - - organizationPoolManagerStatus, err := prometheus.NewConstMetric( - c.organizationPoolManagerStatus, - prometheus.GaugeValue, - bool2float64(organization.PoolManagerStatus.IsRunning), - organization.Name, // label: name - organization.ID, // label: id - strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric") - continue - } - ch <- organizationPoolManagerStatus - } -} + OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsOrganizationSubsystem, + Name: "pool_manager_status", + Help: "Status of the organization pool manager", + }, []string{"name", "id", "running"}) +) diff --git a/metrics/pool.go b/metrics/pool.go index 110812bd..5803af90 100644 --- a/metrics/pool.go +++ b/metrics/pool.go @@ -1,121 +1,42 @@ package metrics import ( - "log/slog" - "strconv" - "strings" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectPoolMetric collects the metrics for the pool objects -func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() +var ( + PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsPoolSubsystem, + Name: "info", + Help: "Info of the pool", + }, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}) - pools, err := c.runner.ListAllPools(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") - return - } + PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsPoolSubsystem, + Name: "status", + Help: "Status of the pool", + }, []string{"id", "enabled"}) - type poolInfo struct { - Name string - Type string - } + PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsPoolSubsystem, + Name: "max_runners", + Help: "Maximum number of runners in the pool", + }, []string{"id"}) - poolNames := make(map[string]poolInfo) - for _, pool := range pools { - if pool.EnterpriseName != "" { - poolNames[pool.ID] = poolInfo{ - Name: pool.EnterpriseName, - Type: string(pool.PoolType()), - } - } else if pool.OrgName != "" { - poolNames[pool.ID] = poolInfo{ - Name: pool.OrgName, - Type: string(pool.PoolType()), - } - } else { - poolNames[pool.ID] = poolInfo{ - Name: pool.RepoName, - Type: string(pool.PoolType()), - } - } + PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsPoolSubsystem, + Name: "min_idle_runners", + Help: "Minimum number of idle runners in the pool", + }, []string{"id"}) - var poolTags []string - for _, tag := range pool.Tags { - poolTags = append(poolTags, tag.Name) - } - - poolInfo, err := prometheus.NewConstMetric( - c.poolInfo, - prometheus.GaugeValue, - 1, - pool.ID, // label: id - pool.Image, // label: image - pool.Flavor, // label: flavor - pool.Prefix, // label: prefix - string(pool.OSType), // label: os_type - string(pool.OSArch), // label: os_arch - strings.Join(poolTags, ","), // label: tags - pool.ProviderName, // label: provider - poolNames[pool.ID].Name, // label: pool_owner - poolNames[pool.ID].Type, // label: pool_type - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric") - continue - } - ch <- poolInfo - - poolStatus, err := prometheus.NewConstMetric( - c.poolStatus, - prometheus.GaugeValue, - bool2float64(pool.Enabled), - pool.ID, // label: id - strconv.FormatBool(pool.Enabled), // label: enabled - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric") - continue - } - ch <- poolStatus - - poolMaxRunners, err := prometheus.NewConstMetric( - c.poolMaxRunners, - prometheus.GaugeValue, - float64(pool.MaxRunners), - pool.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric") - continue - } - ch <- poolMaxRunners - - poolMinIdleRunners, err := prometheus.NewConstMetric( - c.poolMinIdleRunners, - prometheus.GaugeValue, - float64(pool.MinIdleRunners), - pool.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric") - continue - } - ch <- poolMinIdleRunners - - poolBootstrapTimeout, err := prometheus.NewConstMetric( - c.poolBootstrapTimeout, - prometheus.GaugeValue, - float64(pool.RunnerBootstrapTimeout), - pool.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric") - continue - } - ch <- poolBootstrapTimeout - } -} + PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsPoolSubsystem, + Name: "bootstrap_timeout", + Help: "Runner bootstrap timeout in the pool", + }, []string{"id"}) +) diff --git a/metrics/provider.go b/metrics/provider.go index e51f295e..5034a7e9 100644 --- a/metrics/provider.go +++ b/metrics/provider.go @@ -1,36 +1,14 @@ package metrics import ( - "log/slog" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectPoolMetric collects the metrics for the pool objects -func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() - - providers, err := c.runner.ListProviders(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") - return - } - - for _, provider := range providers { - - providerInfo, err := prometheus.NewConstMetric( - c.providerInfo, - prometheus.GaugeValue, - 1, - provider.Name, // label: name - string(provider.ProviderType), // label: type - provider.Description, // label: description - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric") - continue - } - ch <- providerInfo - } -} +var ( + ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsProviderSubsystem, + Name: "info", + Help: "Info of the organization", + }, []string{"name", "type", "description"}) +) diff --git a/metrics/repository.go b/metrics/repository.go index b778782e..a84dd120 100644 --- a/metrics/repository.go +++ b/metrics/repository.go @@ -1,51 +1,21 @@ package metrics import ( - "log/slog" - "strconv" - - "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" ) -// CollectOrganizationMetric collects the metrics for the repository objects -func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { - ctx := auth.GetAdminContext() +var ( + RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsRepositorySubsystem, + Name: "info", + Help: "Info of the enterprise", + }, []string{"name", "id"}) - repositories, err := c.runner.ListRepositories(ctx) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") - return - } - - for _, repository := range repositories { - - repositoryInfo, err := prometheus.NewConstMetric( - c.repositoryInfo, - prometheus.GaugeValue, - 1, - repository.Name, // label: name - repository.Owner, // label: owner - repository.ID, // label: id - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric") - continue - } - ch <- repositoryInfo - - repositoryPoolManagerStatus, err := prometheus.NewConstMetric( - c.repositoryPoolManagerStatus, - prometheus.GaugeValue, - bool2float64(repository.PoolManagerStatus.IsRunning), - repository.Name, // label: name - repository.ID, // label: id - strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running - ) - if err != nil { - slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric") - continue - } - ch <- repositoryPoolManagerStatus - } -} + RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsRepositorySubsystem, + Name: "pool_manager_status", + Help: "Status of the enterprise pool manager", + }, []string{"name", "id", "running"}) +) diff --git a/metrics/util.go b/metrics/util.go index ae2d7087..b2edb580 100644 --- a/metrics/util.go +++ b/metrics/util.go @@ -1,6 +1,6 @@ package metrics -func bool2float64(b bool) float64 { +func Bool2float64(b bool) float64 { if b { return 1 } diff --git a/metrics/webhooks.go b/metrics/webhooks.go new file mode 100644 index 00000000..7b314eb6 --- /dev/null +++ b/metrics/webhooks.go @@ -0,0 +1,12 @@ +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +var ( + WebhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWebhookSubsystem, + Name: "received", + Help: "The total number of webhooks received", + }, []string{"valid", "reason", "hostname", "controller_id"}) +) diff --git a/runner/metrics/enterprise.go b/runner/metrics/enterprise.go new file mode 100644 index 00000000..8cce89d6 --- /dev/null +++ b/runner/metrics/enterprise.go @@ -0,0 +1,36 @@ +package metrics + +import ( + "context" + "strconv" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/runner" +) + +// CollectOrganizationMetric collects the metrics for the enterprise objects +func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error { + + // reset metrics + metrics.EnterpriseInfo.Reset() + metrics.EnterprisePoolManagerStatus.Reset() + + enterprises, err := r.ListEnterprises(ctx) + if err != nil { + return err + } + + for _, enterprise := range enterprises { + metrics.EnterpriseInfo.WithLabelValues( + enterprise.Name, // label: name + enterprise.ID, // label: id + ).Set(1) + + metrics.EnterprisePoolManagerStatus.WithLabelValues( + enterprise.Name, // label: name + enterprise.ID, // label: id + strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running + ).Set(metrics.Bool2float64(enterprise.PoolManagerStatus.IsRunning)) + } + return nil +} diff --git a/runner/metrics/health.go b/runner/metrics/health.go new file mode 100644 index 00000000..c84dca45 --- /dev/null +++ b/runner/metrics/health.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "context" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/params" + "github.com/cloudbase/garm/runner" +) + +func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error { + + metrics.GarmHealth.WithLabelValues( + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: id + controllerInfo.MetadataURL, // label: metadata_url + controllerInfo.CallbackURL, // label: callback_url + controllerInfo.WebhookURL, // label: webhook_url + controllerInfo.ControllerWebhookURL, // label: controller_webhook_url + ).Set(1) + return nil +} diff --git a/runner/metrics/instance.go b/runner/metrics/instance.go new file mode 100644 index 00000000..9fdb0740 --- /dev/null +++ b/runner/metrics/instance.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "context" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/params" + "github.com/cloudbase/garm/runner" +) + +// CollectInstanceMetric collects the metrics for the runner instances +// reflecting the statuses and the pool they belong to. +func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error { + + // reset metrics + metrics.InstanceStatus.Reset() + + instances, err := r.ListAllInstances(ctx) + if err != nil { + return err + } + + pools, err := r.ListAllPools(ctx) + if err != nil { + return err + } + + type poolInfo struct { + Name string + Type string + ProviderName string + } + + poolNames := make(map[string]poolInfo) + for _, pool := range pools { + if pool.EnterpriseName != "" { + poolNames[pool.ID] = poolInfo{ + Name: pool.EnterpriseName, + Type: string(pool.PoolType()), + ProviderName: pool.ProviderName, + } + } else if pool.OrgName != "" { + poolNames[pool.ID] = poolInfo{ + Name: pool.OrgName, + Type: string(pool.PoolType()), + ProviderName: pool.ProviderName, + } + } else { + poolNames[pool.ID] = poolInfo{ + Name: pool.RepoName, + Type: string(pool.PoolType()), + ProviderName: pool.ProviderName, + } + } + } + + for _, instance := range instances { + + metrics.InstanceStatus.WithLabelValues( + instance.Name, // label: name + string(instance.Status), // label: status + string(instance.RunnerStatus), // label: runner_status + poolNames[instance.PoolID].Name, // label: pool_owner + poolNames[instance.PoolID].Type, // label: pool_type + instance.PoolID, // label: pool_id + controllerInfo.Hostname, // label: hostname + controllerInfo.ControllerID.String(), // label: controller_id + poolNames[instance.PoolID].ProviderName, // label: provider + + ).Set(1) + } + return nil +} diff --git a/runner/metrics/metrics.go b/runner/metrics/metrics.go new file mode 100644 index 00000000..577d67f7 --- /dev/null +++ b/runner/metrics/metrics.go @@ -0,0 +1,70 @@ +package metrics + +import ( + "log/slog" + "time" + + "github.com/cloudbase/garm/auth" + "github.com/cloudbase/garm/runner" +) + +func CollectObjectMetric(r *runner.Runner, timer *time.Ticker) { + + ctx := auth.GetAdminContext() + + controllerInfo, err := r.GetControllerInfo(ctx) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info") + } + + go func() { + // we wan't to initiate the collection immediately + for ; true; <-timer.C { + + slog.InfoContext(ctx, "collecting metrics") + + var err error + slog.DebugContext(ctx, "collecting organization metrics") + err = CollectOrganizationMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organization metrics") + } + + slog.DebugContext(ctx, "collecting enterprise metrics") + err = CollectEnterpriseMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprise metrics") + } + + slog.DebugContext(ctx, "collecting repository metrics") + err = CollectRepositoryMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repository metrics") + } + + slog.DebugContext(ctx, "collecting provider metrics") + err = CollectProviderMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect provider metrics") + } + + slog.DebugContext(ctx, "collecting pool metrics") + err = CollectPoolMetric(ctx, r) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics") + } + + slog.DebugContext(ctx, "collecting health metrics") + err = CollectHealthMetric(ctx, r, controllerInfo) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics") + } + + slog.DebugContext(ctx, "collecting instance metrics") + err = CollectInstanceMetric(ctx, r, controllerInfo) + if err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics") + } + } + }() +} diff --git a/runner/metrics/organization.go b/runner/metrics/organization.go new file mode 100644 index 00000000..0be9ced6 --- /dev/null +++ b/runner/metrics/organization.go @@ -0,0 +1,36 @@ +package metrics + +import ( + "context" + "strconv" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/runner" +) + +// CollectOrganizationMetric collects the metrics for the organization objects +func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error { + + // reset metrics + metrics.OrganizationInfo.Reset() + metrics.OrganizationPoolManagerStatus.Reset() + + organizations, err := r.ListOrganizations(ctx) + if err != nil { + return err + } + + for _, organization := range organizations { + metrics.OrganizationInfo.WithLabelValues( + organization.Name, // label: name + organization.ID, // label: id + ).Set(1) + + metrics.OrganizationPoolManagerStatus.WithLabelValues( + organization.Name, // label: name + organization.ID, // label: id + strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running + ).Set(metrics.Bool2float64(organization.PoolManagerStatus.IsRunning)) + } + return nil +} diff --git a/runner/metrics/pool.go b/runner/metrics/pool.go new file mode 100644 index 00000000..817cb104 --- /dev/null +++ b/runner/metrics/pool.go @@ -0,0 +1,87 @@ +package metrics + +import ( + "context" + "strconv" + "strings" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/runner" +) + +// CollectPoolMetric collects the metrics for the pool objects +func CollectPoolMetric(ctx context.Context, r *runner.Runner) error { + + // reset metrics + metrics.PoolInfo.Reset() + metrics.PoolStatus.Reset() + metrics.PoolMaxRunners.Reset() + metrics.PoolMinIdleRunners.Reset() + metrics.PoolBootstrapTimeout.Reset() + + pools, err := r.ListAllPools(ctx) + if err != nil { + return err + } + + type poolInfo struct { + Name string + Type string + } + + poolNames := make(map[string]poolInfo) + for _, pool := range pools { + if pool.EnterpriseName != "" { + poolNames[pool.ID] = poolInfo{ + Name: pool.EnterpriseName, + Type: string(pool.PoolType()), + } + } else if pool.OrgName != "" { + poolNames[pool.ID] = poolInfo{ + Name: pool.OrgName, + Type: string(pool.PoolType()), + } + } else { + poolNames[pool.ID] = poolInfo{ + Name: pool.RepoName, + Type: string(pool.PoolType()), + } + } + + var poolTags []string + for _, tag := range pool.Tags { + poolTags = append(poolTags, tag.Name) + } + + metrics.PoolInfo.WithLabelValues( + pool.ID, // label: id + pool.Image, // label: image + pool.Flavor, // label: flavor + pool.Prefix, // label: prefix + string(pool.OSType), // label: os_type + string(pool.OSArch), // label: os_arch + strings.Join(poolTags, ","), // label: tags + pool.ProviderName, // label: provider + poolNames[pool.ID].Name, // label: pool_owner + poolNames[pool.ID].Type, // label: pool_type + ).Set(1) + + metrics.PoolStatus.WithLabelValues( + pool.ID, // label: id + strconv.FormatBool(pool.Enabled), // label: enabled + ).Set(metrics.Bool2float64(pool.Enabled)) + + metrics.PoolMaxRunners.WithLabelValues( + pool.ID, // label: id + ).Set(float64(pool.MaxRunners)) + + metrics.PoolMinIdleRunners.WithLabelValues( + pool.ID, // label: id + ).Set(float64(pool.MinIdleRunners)) + + metrics.PoolBootstrapTimeout.WithLabelValues( + pool.ID, // label: id + ).Set(float64(pool.RunnerBootstrapTimeout)) + } + return nil +} diff --git a/runner/metrics/provider.go b/runner/metrics/provider.go new file mode 100644 index 00000000..398f8ee3 --- /dev/null +++ b/runner/metrics/provider.go @@ -0,0 +1,27 @@ +package metrics + +import ( + "context" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/runner" +) + +func CollectProviderMetric(ctx context.Context, r *runner.Runner) error { + + // reset metrics + metrics.ProviderInfo.Reset() + + providers, err := r.ListProviders(ctx) + if err != nil { + return err + } + for _, provider := range providers { + metrics.ProviderInfo.WithLabelValues( + provider.Name, // label: name + string(provider.ProviderType), // label: type + provider.Description, // label: description + ).Set(1) + } + return nil +} diff --git a/runner/metrics/repository.go b/runner/metrics/repository.go new file mode 100644 index 00000000..ba2fab29 --- /dev/null +++ b/runner/metrics/repository.go @@ -0,0 +1,35 @@ +package metrics + +import ( + "context" + "strconv" + + "github.com/cloudbase/garm/metrics" + "github.com/cloudbase/garm/runner" +) + +func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error { + + // reset metrics + metrics.EnterpriseInfo.Reset() + metrics.EnterprisePoolManagerStatus.Reset() + + repositories, err := r.ListRepositories(ctx) + if err != nil { + return err + } + + for _, repository := range repositories { + metrics.EnterpriseInfo.WithLabelValues( + repository.Name, // label: name + repository.ID, // label: id + ).Set(1) + + metrics.EnterprisePoolManagerStatus.WithLabelValues( + repository.Name, // label: name + repository.ID, // label: id + strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running + ).Set(metrics.Bool2float64(repository.PoolManagerStatus.IsRunning)) + } + return nil +}