chore: refactor metrics endpoint

Refactoring is needed to make the metrics package usable from within the
runner package, so that further metrics can be added there.

This change also makes the metrics collector independent of requests to
the /metrics endpoint.

Signed-off-by: Mario Constanti <mario.constanti@mercedes-benz.com>
This commit is contained in:
Mario Constanti 2024-02-19 16:22:32 +01:00
parent f68cf98d67
commit 1d8d9459eb
21 changed files with 564 additions and 590 deletions

View file

@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
} }
} }
func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
// If labels are empty, not attempt will be made to record webhook.
return []string{}
}
return []string{
valid, reason,
controllerInfo.Hostname, controllerInfo.ControllerID.String(),
}
}
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) { func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
defer r.Body.Close() defer r.Body.Close()
body, err := io.ReadAll(r.Body) body, err := io.ReadAll(r.Body)
@ -119,31 +106,47 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256") signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type") hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
var labelValues []string controllerInfo, err := a.r.GetControllerInfo(ctx)
defer func() { if err != nil {
if len(labelValues) == 0 { slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
return return
} }
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
}
}()
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil { if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) { if errors.Is(err, gErrors.ErrNotFound) {
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown") metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"owner_unknown", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?") slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type } else if strings.Contains(err.Error(), "signature") { // TODO: check error type
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid") metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"signature_invalid", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
).Inc()
} else { } else {
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown") metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"unknown", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
).Inc()
} }
handleError(ctx, w, err) handleError(ctx, w, err)
return return
} }
labelValues = a.webhookMetricLabelValues(ctx, "true", "") metrics.WebhooksReceived.WithLabelValues(
"true", // label: valid
"", // label: reason
controllerInfo.Hostname, // label: hostname
controllerInfo.ControllerID.String(), // label: controller_id
).Inc()
} }
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) { func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {

View file

@ -35,8 +35,8 @@ import (
"github.com/cloudbase/garm/config" "github.com/cloudbase/garm/config"
"github.com/cloudbase/garm/database" "github.com/cloudbase/garm/database"
"github.com/cloudbase/garm/database/common" "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner" "github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util" garmUtil "github.com/cloudbase/garm/util"
"github.com/cloudbase/garm/util/appdefaults" "github.com/cloudbase/garm/util/appdefaults"
"github.com/cloudbase/garm/websocket" "github.com/cloudbase/garm/websocket"
@ -214,13 +214,13 @@ func main() {
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement) router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
// start the metrics collector
if cfg.Metrics.Enable { if cfg.Metrics.Enable {
slog.InfoContext(ctx, "registering prometheus metrics collectors")
if err := metrics.RegisterCollectors(runner); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "setting up metric routes") slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware) router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(runner, time.NewTicker(cfg.Metrics.Period))
} }
if cfg.Default.DebugServer { if cfg.Default.DebugServer {

View file

@ -456,8 +456,9 @@ func (t *TLSConfig) Validate() error {
} }
type Metrics struct { type Metrics struct {
DisableAuth bool `toml:"disable_auth" json:"disable-auth"` DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
Enable bool `toml:"enable" json:"enable"` Enable bool `toml:"enable" json:"enable"`
Period time.Duration `toml:"period" json:"period"`
} }
// APIServer holds configuration for the API server // APIServer holds configuration for the API server

View file

@ -1,50 +1,21 @@
package metrics package metrics
import ( import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectOrganizationMetric collects the metrics for the enterprise objects var (
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ctx := auth.GetAdminContext() Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "info",
Help: "Info of the enterprise",
}, []string{"name", "id"})
enterprises, err := c.runner.ListEnterprises(ctx) EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
if err != nil { Namespace: metricsNamespace,
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") Subsystem: metricsEnterpriseSubsystem,
return Name: "pool_manager_status",
} Help: "Status of the enterprise pool manager",
}, []string{"name", "id", "running"})
for _, enterprise := range enterprises { )
enterpriseInfo, err := prometheus.NewConstMetric(
c.enterpriseInfo,
prometheus.GaugeValue,
1,
enterprise.Name, // label: name
enterprise.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
continue
}
ch <- enterpriseInfo
enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
c.enterprisePoolManagerStatus,
prometheus.GaugeValue,
bool2float64(enterprise.PoolManagerStatus.IsRunning),
enterprise.Name, // label: name
enterprise.ID, // label: id
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
continue
}
ch <- enterprisePoolManagerStatus
}
}

View file

@ -1,22 +1,13 @@
package metrics package metrics
import ( import (
"log/slog"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { var (
m, err := prometheus.NewConstMetric( GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
c.healthMetric, Namespace: metricsNamespace,
prometheus.GaugeValue, Name: "health",
1, Help: "Health of the garm",
hostname, }, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"})
controllerID, )
)
if err != nil {
slog.With(slog.Any("error", err)).Error("error on creating health metric")
return
}
ch <- m
}

View file

@ -1,79 +1,14 @@
package metrics package metrics
import ( import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectInstanceMetric collects the metrics for the runner instances var (
// reflecting the statuses and the pool they belong to. InstanceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { Namespace: metricsNamespace,
ctx := auth.GetAdminContext() Subsystem: metricsRunnerSubsystem,
Name: "status",
instances, err := c.runner.ListAllInstances(ctx) Help: "Status of the instance",
if err != nil { }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"})
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances") )
return
}
pools, err := c.runner.ListAllPools(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
return
}
type poolInfo struct {
Name string
Type string
ProviderName string
}
poolNames := make(map[string]poolInfo)
for _, pool := range pools {
if pool.EnterpriseName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.EnterpriseName,
Type: string(pool.PoolType()),
ProviderName: pool.ProviderName,
}
} else if pool.OrgName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.OrgName,
Type: string(pool.PoolType()),
ProviderName: pool.ProviderName,
}
} else {
poolNames[pool.ID] = poolInfo{
Name: pool.RepoName,
Type: string(pool.PoolType()),
ProviderName: pool.ProviderName,
}
}
}
for _, instance := range instances {
m, err := prometheus.NewConstMetric(
c.instanceMetric,
prometheus.GaugeValue,
1,
instance.Name, // label: name
string(instance.Status), // label: status
string(instance.RunnerStatus), // label: runner_status
poolNames[instance.PoolID].Name, // label: pool_owner
poolNames[instance.PoolID].Type, // label: pool_type
instance.PoolID, // label: pool_id
hostname, // label: hostname
controllerID, // label: controller_id
poolNames[instance.PoolID].ProviderName, // label: provider
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric")
continue
}
ch <- m
}
}

View file

@ -1,206 +1,41 @@
package metrics package metrics
import ( import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
const metricsNamespace = "garm_" const metricsNamespace = "garm"
const metricsRunnerSubsystem = "runner_" const metricsRunnerSubsystem = "runner"
const metricsPoolSubsystem = "pool_" const metricsPoolSubsystem = "pool"
const metricsProviderSubsystem = "provider_" const metricsProviderSubsystem = "provider"
const metricsOrganizationSubsystem = "organization_" const metricsOrganizationSubsystem = "organization"
const metricsRepositorySubsystem = "repository_" const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise_" const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook_" const metricsWebhookSubsystem = "webhook"
var webhooksReceived *prometheus.CounterVec = nil
// RecordWebhookWithLabels will increment a webhook metric identified by specific
// values. If metrics are disabled, this function is a noop.
func RecordWebhookWithLabels(lvs ...string) error {
if webhooksReceived == nil {
// not registered. Noop
return nil
}
counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...)
if err != nil {
return errors.Wrap(err, "recording metric")
}
counter.Inc()
return nil
}
func RegisterCollectors(runner *runner.Runner) error {
if webhooksReceived != nil {
// Already registered.
return nil
}
garmCollector, err := NewGarmCollector(runner)
if err != nil {
return errors.Wrap(err, "getting collector")
}
if err := prometheus.Register(garmCollector); err != nil {
return errors.Wrap(err, "registering collector")
}
// metric to count total webhooks received
// at this point the webhook is not yet authenticated and
// we don't know if it's meant for us or not
webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: metricsNamespace + metricsWebhookSubsystem + "received",
Help: "The total number of webhooks received",
}, []string{"valid", "reason", "hostname", "controller_id"})
err = prometheus.Register(webhooksReceived)
if err != nil {
return errors.Wrap(err, "registering webhooks recv counter")
}
return nil
}
type GarmCollector struct {
healthMetric *prometheus.Desc
instanceMetric *prometheus.Desc
// pool metrics
poolInfo *prometheus.Desc
poolStatus *prometheus.Desc
poolMaxRunners *prometheus.Desc
poolMinIdleRunners *prometheus.Desc
poolBootstrapTimeout *prometheus.Desc
func init() {
// runner metrics
prometheus.MustRegister(InstanceStatus)
// organization metrics
prometheus.MustRegister(OrganizationInfo)
prometheus.MustRegister(OrganizationPoolManagerStatus)
// enterprise metrics
prometheus.MustRegister(EnterpriseInfo)
prometheus.MustRegister(EnterprisePoolManagerStatus)
// repository metrics
prometheus.MustRegister(RepositoryInfo)
prometheus.MustRegister(RepositoryPoolManagerStatus)
// provider metrics // provider metrics
providerInfo *prometheus.Desc prometheus.MustRegister(ProviderInfo)
// pool metrics
prometheus.MustRegister(PoolInfo)
prometheus.MustRegister(PoolStatus)
prometheus.MustRegister(PoolMaxRunners)
prometheus.MustRegister(PoolMinIdleRunners)
prometheus.MustRegister(PoolBootstrapTimeout)
// health metrics
prometheus.MustRegister(GarmHealth)
// webhook metrics
prometheus.MustRegister(WebhooksReceived)
organizationInfo *prometheus.Desc
organizationPoolManagerStatus *prometheus.Desc
repositoryInfo *prometheus.Desc
repositoryPoolManagerStatus *prometheus.Desc
enterpriseInfo *prometheus.Desc
enterprisePoolManagerStatus *prometheus.Desc
runner *runner.Runner
cachedControllerInfo params.ControllerInfo
}
func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) {
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
if err != nil {
return nil, errors.Wrap(err, "fetching controller info")
}
return &GarmCollector{
runner: r,
instanceMetric: prometheus.NewDesc(
metricsNamespace+metricsRunnerSubsystem+"status",
"Status of the runner",
[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil,
),
healthMetric: prometheus.NewDesc(
metricsNamespace+"health",
"Health of the runner",
[]string{"hostname", "controller_id"}, nil,
),
poolInfo: prometheus.NewDesc(
metricsNamespace+metricsPoolSubsystem+"info",
"Information of the pool",
[]string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil,
),
poolStatus: prometheus.NewDesc(
metricsNamespace+metricsPoolSubsystem+"status",
"Status of the pool",
[]string{"id", "enabled"}, nil,
),
poolMaxRunners: prometheus.NewDesc(
metricsNamespace+metricsPoolSubsystem+"max_runners",
"Max runners of the pool",
[]string{"id"}, nil,
),
poolMinIdleRunners: prometheus.NewDesc(
metricsNamespace+metricsPoolSubsystem+"min_idle_runners",
"Min idle runners of the pool",
[]string{"id"}, nil,
),
poolBootstrapTimeout: prometheus.NewDesc(
metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout",
"Bootstrap timeout of the pool",
[]string{"id"}, nil,
),
providerInfo: prometheus.NewDesc(
metricsNamespace+metricsProviderSubsystem+"info",
"Info of the provider",
[]string{"name", "type", "description"}, nil,
),
organizationInfo: prometheus.NewDesc(
metricsNamespace+metricsOrganizationSubsystem+"info",
"Info of the organization",
[]string{"name", "id"}, nil,
),
organizationPoolManagerStatus: prometheus.NewDesc(
metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status",
"Status of the organization pool manager",
[]string{"name", "id", "running"}, nil,
),
repositoryInfo: prometheus.NewDesc(
metricsNamespace+metricsRepositorySubsystem+"info",
"Info of the organization",
[]string{"name", "owner", "id"}, nil,
),
repositoryPoolManagerStatus: prometheus.NewDesc(
metricsNamespace+metricsRepositorySubsystem+"pool_manager_status",
"Status of the repository pool manager",
[]string{"name", "id", "running"}, nil,
),
enterpriseInfo: prometheus.NewDesc(
metricsNamespace+metricsEnterpriseSubsystem+"info",
"Info of the organization",
[]string{"name", "id"}, nil,
),
enterprisePoolManagerStatus: prometheus.NewDesc(
metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status",
"Status of the enterprise pool manager",
[]string{"name", "id", "running"}, nil,
),
cachedControllerInfo: controllerInfo,
}, nil
}
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.instanceMetric
ch <- c.healthMetric
ch <- c.poolInfo
ch <- c.poolStatus
ch <- c.poolMaxRunners
ch <- c.poolMinIdleRunners
ch <- c.providerInfo
ch <- c.organizationInfo
ch <- c.organizationPoolManagerStatus
ch <- c.enterpriseInfo
ch <- c.enterprisePoolManagerStatus
}
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext())
if err != nil {
slog.With(slog.Any("error", err)).Error("failed to get controller info")
return
}
c.CollectInstanceMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectHealthMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectPoolMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectProviderMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectOrganizationMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectRepositoryMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
c.CollectEnterpriseMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
} }

View file

@ -1,50 +1,21 @@
package metrics package metrics
import ( import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectOrganizationMetric collects the metrics for the organization objects var (
func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ctx := auth.GetAdminContext() Namespace: metricsNamespace,
Subsystem: metricsOrganizationSubsystem,
Name: "info",
Help: "Info of the organization",
}, []string{"name", "id"})
organizations, err := c.runner.ListOrganizations(ctx) OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
if err != nil { Namespace: metricsNamespace,
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") Subsystem: metricsOrganizationSubsystem,
return Name: "pool_manager_status",
} Help: "Status of the organization pool manager",
}, []string{"name", "id", "running"})
for _, organization := range organizations { )
organizationInfo, err := prometheus.NewConstMetric(
c.organizationInfo,
prometheus.GaugeValue,
1,
organization.Name, // label: name
organization.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric")
continue
}
ch <- organizationInfo
organizationPoolManagerStatus, err := prometheus.NewConstMetric(
c.organizationPoolManagerStatus,
prometheus.GaugeValue,
bool2float64(organization.PoolManagerStatus.IsRunning),
organization.Name, // label: name
organization.ID, // label: id
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric")
continue
}
ch <- organizationPoolManagerStatus
}
}

View file

@ -1,121 +1,42 @@
package metrics package metrics
import ( import (
"log/slog"
"strconv"
"strings"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectPoolMetric collects the metrics for the pool objects var (
func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ctx := auth.GetAdminContext() Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "info",
Help: "Info of the pool",
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
pools, err := c.runner.ListAllPools(ctx) PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
if err != nil { Namespace: metricsNamespace,
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") Subsystem: metricsPoolSubsystem,
return Name: "status",
} Help: "Status of the pool",
}, []string{"id", "enabled"})
type poolInfo struct { PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name string Namespace: metricsNamespace,
Type string Subsystem: metricsPoolSubsystem,
} Name: "max_runners",
Help: "Maximum number of runners in the pool",
}, []string{"id"})
poolNames := make(map[string]poolInfo) PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
for _, pool := range pools { Namespace: metricsNamespace,
if pool.EnterpriseName != "" { Subsystem: metricsPoolSubsystem,
poolNames[pool.ID] = poolInfo{ Name: "min_idle_runners",
Name: pool.EnterpriseName, Help: "Minimum number of idle runners in the pool",
Type: string(pool.PoolType()), }, []string{"id"})
}
} else if pool.OrgName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.OrgName,
Type: string(pool.PoolType()),
}
} else {
poolNames[pool.ID] = poolInfo{
Name: pool.RepoName,
Type: string(pool.PoolType()),
}
}
var poolTags []string PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{
for _, tag := range pool.Tags { Namespace: metricsNamespace,
poolTags = append(poolTags, tag.Name) Subsystem: metricsPoolSubsystem,
} Name: "bootstrap_timeout",
Help: "Runner bootstrap timeout in the pool",
poolInfo, err := prometheus.NewConstMetric( }, []string{"id"})
c.poolInfo, )
prometheus.GaugeValue,
1,
pool.ID, // label: id
pool.Image, // label: image
pool.Flavor, // label: flavor
pool.Prefix, // label: prefix
string(pool.OSType), // label: os_type
string(pool.OSArch), // label: os_arch
strings.Join(poolTags, ","), // label: tags
pool.ProviderName, // label: provider
poolNames[pool.ID].Name, // label: pool_owner
poolNames[pool.ID].Type, // label: pool_type
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric")
continue
}
ch <- poolInfo
poolStatus, err := prometheus.NewConstMetric(
c.poolStatus,
prometheus.GaugeValue,
bool2float64(pool.Enabled),
pool.ID, // label: id
strconv.FormatBool(pool.Enabled), // label: enabled
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric")
continue
}
ch <- poolStatus
poolMaxRunners, err := prometheus.NewConstMetric(
c.poolMaxRunners,
prometheus.GaugeValue,
float64(pool.MaxRunners),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric")
continue
}
ch <- poolMaxRunners
poolMinIdleRunners, err := prometheus.NewConstMetric(
c.poolMinIdleRunners,
prometheus.GaugeValue,
float64(pool.MinIdleRunners),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric")
continue
}
ch <- poolMinIdleRunners
poolBootstrapTimeout, err := prometheus.NewConstMetric(
c.poolBootstrapTimeout,
prometheus.GaugeValue,
float64(pool.RunnerBootstrapTimeout),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric")
continue
}
ch <- poolBootstrapTimeout
}
}

View file

@ -1,36 +1,14 @@
package metrics package metrics
import ( import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectPoolMetric collects the metrics for the pool objects var (
func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ctx := auth.GetAdminContext() Namespace: metricsNamespace,
Subsystem: metricsProviderSubsystem,
providers, err := c.runner.ListProviders(ctx) Name: "info",
if err != nil { Help: "Info of the organization",
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") }, []string{"name", "type", "description"})
return )
}
for _, provider := range providers {
providerInfo, err := prometheus.NewConstMetric(
c.providerInfo,
prometheus.GaugeValue,
1,
provider.Name, // label: name
string(provider.ProviderType), // label: type
provider.Description, // label: description
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric")
continue
}
ch <- providerInfo
}
}

View file

@ -1,51 +1,21 @@
package metrics package metrics
import ( import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
// CollectOrganizationMetric collects the metrics for the repository objects var (
func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) { RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
ctx := auth.GetAdminContext() Namespace: metricsNamespace,
Subsystem: metricsRepositorySubsystem,
Name: "info",
Help: "Info of the enterprise",
}, []string{"name", "id"})
repositories, err := c.runner.ListRepositories(ctx) RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
if err != nil { Namespace: metricsNamespace,
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") Subsystem: metricsRepositorySubsystem,
return Name: "pool_manager_status",
} Help: "Status of the enterprise pool manager",
}, []string{"name", "id", "running"})
for _, repository := range repositories { )
repositoryInfo, err := prometheus.NewConstMetric(
c.repositoryInfo,
prometheus.GaugeValue,
1,
repository.Name, // label: name
repository.Owner, // label: owner
repository.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric")
continue
}
ch <- repositoryInfo
repositoryPoolManagerStatus, err := prometheus.NewConstMetric(
c.repositoryPoolManagerStatus,
prometheus.GaugeValue,
bool2float64(repository.PoolManagerStatus.IsRunning),
repository.Name, // label: name
repository.ID, // label: id
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric")
continue
}
ch <- repositoryPoolManagerStatus
}
}

View file

@ -1,6 +1,6 @@
package metrics package metrics
func bool2float64(b bool) float64 { func Bool2float64(b bool) float64 {
if b { if b {
return 1 return 1
} }

12
metrics/webhooks.go Normal file
View file

@ -0,0 +1,12 @@
package metrics
import "github.com/prometheus/client_golang/prometheus"
var (
// WebhooksReceived is a counter vector for the total number of webhooks
// received. Each series is identified by four label values, in this order:
// "valid", "reason", "hostname" and "controller_id". The metric name is
// assembled from metricsNamespace, metricsWebhookSubsystem and "received".
WebhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWebhookSubsystem,
Name: "received",
Help: "The total number of webhooks received",
}, []string{"valid", "reason", "hostname", "controller_id"})
)

View file

@ -0,0 +1,36 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectEnterpriseMetric refreshes the enterprise gauges from the
// enterprises returned by the runner.
//
// Both enterprise gauge vectors are reset first and then repopulated with one
// series per listed enterprise. If listing the enterprises fails, the error is
// returned as-is and nothing is repopulated.
func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error {
	// Drop all previously exported series before rebuilding them.
	metrics.EnterpriseInfo.Reset()
	metrics.EnterprisePoolManagerStatus.Reset()

	listed, err := r.ListEnterprises(ctx)
	if err != nil {
		return err
	}

	for _, ent := range listed {
		running := ent.PoolManagerStatus.IsRunning

		// Info series: constant value 1, identity carried in the labels.
		infoGauge := metrics.EnterpriseInfo.WithLabelValues(
			ent.Name, // label: name
			ent.ID,   // label: id
		)
		infoGauge.Set(1)

		// Status series: the running state is exposed both as a label and
		// as the gauge value.
		statusGauge := metrics.EnterprisePoolManagerStatus.WithLabelValues(
			ent.Name,                    // label: name
			ent.ID,                      // label: id
			strconv.FormatBool(running), // label: running
		)
		statusGauge.Set(metrics.Bool2float64(running))
	}

	return nil
}

22
runner/metrics/health.go Normal file
View file

@ -0,0 +1,22 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectHealthMetric sets the garm health gauge to 1 for the series
// identified by the given controller info.
//
// ctx and r are accepted but not used by this function, and it always
// returns nil.
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
	// Order must match the label names declared on metrics.GarmHealth.
	labelValues := []string{
		controllerInfo.Hostname,              // label: hostname
		controllerInfo.ControllerID.String(), // label: controller_id
		controllerInfo.MetadataURL,           // label: metadata_url
		controllerInfo.CallbackURL,           // label: callback_url
		controllerInfo.WebhookURL,            // label: webhook_url
		controllerInfo.ControllerWebhookURL,  // label: controller_webhook_url
	}

	metrics.GarmHealth.WithLabelValues(labelValues...).Set(1)
	return nil
}

View file

@ -0,0 +1,73 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectInstanceMetric rebuilds the instance status gauge: one series with
// value 1 per runner instance, labelled with the instance's own state and
// with details of the pool it belongs to.
//
// The gauge vector is reset before anything is listed. An error from listing
// instances or pools is returned as-is, leaving the vector empty.
func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
	// Drop all previously exported series before rebuilding them.
	metrics.InstanceStatus.Reset()

	instances, err := r.ListAllInstances(ctx)
	if err != nil {
		return err
	}

	pools, err := r.ListAllPools(ctx)
	if err != nil {
		return err
	}

	// poolMeta holds the pool attributes that end up as instance labels.
	type poolMeta struct {
		owner    string
		kind     string
		provider string
	}

	byPoolID := make(map[string]poolMeta)
	for _, p := range pools {
		// Owner precedence: enterprise name, then organization name,
		// otherwise the repository name.
		owner := p.RepoName
		switch {
		case p.EnterpriseName != "":
			owner = p.EnterpriseName
		case p.OrgName != "":
			owner = p.OrgName
		}

		byPoolID[p.ID] = poolMeta{
			owner:    owner,
			kind:     string(p.PoolType()),
			provider: p.ProviderName,
		}
	}

	for _, inst := range instances {
		// An instance whose pool is not in the map gets empty pool labels
		// (zero-value poolMeta).
		meta := byPoolID[inst.PoolID]

		metrics.InstanceStatus.WithLabelValues(
			inst.Name,                            // label: name
			string(inst.Status),                  // label: status
			string(inst.RunnerStatus),            // label: runner_status
			meta.owner,                           // label: pool_owner
			meta.kind,                            // label: pool_type
			inst.PoolID,                          // label: pool_id
			controllerInfo.Hostname,              // label: hostname
			controllerInfo.ControllerID.String(), // label: controller_id
			meta.provider,                        // label: provider
		).Set(1)
	}

	return nil
}

70
runner/metrics/metrics.go Normal file
View file

@ -0,0 +1,70 @@
package metrics
import (
"log/slog"
"time"
"github.com/cloudbase/garm/auth"
"github.com/cloudbase/garm/runner"
)
// CollectObjectMetric starts a background goroutine that runs every object
// metric collector once right away and then again on each tick of timer.
//
// Controller info is fetched a single time up front, using the admin context,
// and passed to the health and instance collectors on every round. A failure
// to fetch it is only logged: collection still starts, using whatever value
// GetControllerInfo returned alongside the error.
//
// A collector that fails is logged and does not prevent the remaining
// collectors, or later rounds, from running. The loop contains no break or
// return, so the goroutine is never stopped from within this function.
func CollectObjectMetric(r *runner.Runner, timer *time.Ticker) {
	ctx := auth.GetAdminContext()

	controllerInfo, err := r.GetControllerInfo(ctx)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info")
	}

	// Collectors in the order they run each round, together with the debug
	// message logged before the call and the error message logged on failure.
	steps := []struct {
		startMsg string
		failMsg  string
		collect  func() error
	}{
		{
			startMsg: "collecting organization metrics",
			failMsg:  "cannot collect organization metrics",
			collect:  func() error { return CollectOrganizationMetric(ctx, r) },
		},
		{
			startMsg: "collecting enterprise metrics",
			failMsg:  "cannot collect enterprise metrics",
			collect:  func() error { return CollectEnterpriseMetric(ctx, r) },
		},
		{
			startMsg: "collecting repository metrics",
			failMsg:  "cannot collect repository metrics",
			collect:  func() error { return CollectRepositoryMetric(ctx, r) },
		},
		{
			startMsg: "collecting provider metrics",
			failMsg:  "cannot collect provider metrics",
			collect:  func() error { return CollectProviderMetric(ctx, r) },
		},
		{
			startMsg: "collecting pool metrics",
			failMsg:  "cannot collect pool metrics",
			collect:  func() error { return CollectPoolMetric(ctx, r) },
		},
		{
			startMsg: "collecting health metrics",
			failMsg:  "cannot collect health metrics",
			collect:  func() error { return CollectHealthMetric(ctx, r, controllerInfo) },
		},
		{
			startMsg: "collecting instance metrics",
			failMsg:  "cannot collect instance metrics",
			collect:  func() error { return CollectInstanceMetric(ctx, r, controllerInfo) },
		},
	}

	go func() {
		// The channel receive is the loop's post statement, so the first
		// round runs immediately and each later round waits for a tick.
		for ; true; <-timer.C {
			slog.InfoContext(ctx, "collecting metrics")

			for _, step := range steps {
				slog.DebugContext(ctx, step.startMsg)
				if err := step.collect(); err != nil {
					slog.With(slog.Any("error", err)).ErrorContext(ctx, step.failMsg)
				}
			}
		}
	}()
}

View file

@ -0,0 +1,36 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectOrganizationMetric rebuilds the organization gauges from the
// runner's current organization listing.
//
// For every organization it sets the info gauge to 1 and the pool manager
// status gauge to the numeric form of the manager's IsRunning flag. Both
// gauges are reset before the listing call, so an error from
// ListOrganizations is returned with them already reset.
func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error {
	metrics.OrganizationInfo.Reset()
	metrics.OrganizationPoolManagerStatus.Reset()

	orgs, err := r.ListOrganizations(ctx)
	if err != nil {
		return err
	}

	for _, org := range orgs {
		running := org.PoolManagerStatus.IsRunning

		metrics.OrganizationInfo.WithLabelValues(
			org.Name, // label: name
			org.ID,   // label: id
		).Set(1)

		// The running state is exported twice: as a label and as the value.
		metrics.OrganizationPoolManagerStatus.WithLabelValues(
			org.Name,                    // label: name
			org.ID,                      // label: id
			strconv.FormatBool(running), // label: running
		).Set(metrics.Bool2float64(running))
	}

	return nil
}

87
runner/metrics/pool.go Normal file
View file

@ -0,0 +1,87 @@
package metrics
import (
"context"
"strconv"
"strings"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectPoolMetric rebuilds the pool gauges from the runner's current pool
// listing.
//
// Per pool it publishes an info series (set to 1) describing image, flavor,
// prefix, OS, tags, provider and owner, plus one series each for the enabled
// state, the maximum runner count, the minimum idle runner count and the
// runner bootstrap timeout. All five gauges are reset before the listing
// call, so an error from ListAllPools is returned with them already reset.
func CollectPoolMetric(ctx context.Context, r *runner.Runner) error {
	metrics.PoolInfo.Reset()
	metrics.PoolStatus.Reset()
	metrics.PoolMaxRunners.Reset()
	metrics.PoolMinIdleRunners.Reset()
	metrics.PoolBootstrapTimeout.Reset()

	pools, err := r.ListAllPools(ctx)
	if err != nil {
		return err
	}

	for _, pool := range pools {
		// Owner precedence: enterprise, then organization, then repository.
		owner := pool.RepoName
		switch {
		case pool.EnterpriseName != "":
			owner = pool.EnterpriseName
		case pool.OrgName != "":
			owner = pool.OrgName
		}
		poolType := string(pool.PoolType())

		// Tags are flattened into one comma-separated label value; a pool
		// without tags yields an empty string.
		tagNames := make([]string, 0, len(pool.Tags))
		for _, tag := range pool.Tags {
			tagNames = append(tagNames, tag.Name)
		}

		metrics.PoolInfo.WithLabelValues(
			pool.ID,                     // label: id
			pool.Image,                  // label: image
			pool.Flavor,                 // label: flavor
			pool.Prefix,                 // label: prefix
			string(pool.OSType),         // label: os_type
			string(pool.OSArch),         // label: os_arch
			strings.Join(tagNames, ","), // label: tags
			pool.ProviderName,           // label: provider
			owner,                       // label: pool_owner
			poolType,                    // label: pool_type
		).Set(1)

		// The enabled state is exported twice: as a label and as the value.
		metrics.PoolStatus.WithLabelValues(
			pool.ID,                          // label: id
			strconv.FormatBool(pool.Enabled), // label: enabled
		).Set(metrics.Bool2float64(pool.Enabled))

		metrics.PoolMaxRunners.WithLabelValues(
			pool.ID, // label: id
		).Set(float64(pool.MaxRunners))

		metrics.PoolMinIdleRunners.WithLabelValues(
			pool.ID, // label: id
		).Set(float64(pool.MinIdleRunners))

		metrics.PoolBootstrapTimeout.WithLabelValues(
			pool.ID, // label: id
		).Set(float64(pool.RunnerBootstrapTimeout))
	}

	return nil
}

View file

@ -0,0 +1,27 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectProviderMetric rebuilds the provider info gauge from the runner's
// current provider listing, publishing one series set to 1 per provider and
// labelled with its name, type and description.
//
// The gauge is reset before the listing call, so an error from ListProviders
// is returned with it already reset.
func CollectProviderMetric(ctx context.Context, r *runner.Runner) error {
	metrics.ProviderInfo.Reset()

	providerList, err := r.ListProviders(ctx)
	if err != nil {
		return err
	}

	for _, p := range providerList {
		metrics.ProviderInfo.WithLabelValues(
			p.Name,                 // label: name
			string(p.ProviderType), // label: type
			p.Description,          // label: description
		).Set(1)
	}

	return nil
}

View file

@ -0,0 +1,35 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectRepositoryMetric publishes one info series (set to 1) and one pool
// manager status series per repository returned by ListRepositories.
//
// NOTE(review): the gauges reset and populated here are EnterpriseInfo and
// EnterprisePoolManagerStatus, not repository-specific ones. This looks like
// a copy-paste slip. CollectObjectMetric calls this function right after
// CollectEnterpriseMetric, so if that collector uses the same gauges its
// values are wiped and replaced with repository data on every round. Confirm
// the names and label sets of the repository gauges in the metrics package
// before switching, since a label count mismatch panics in WithLabelValues.
//
// Both gauges are reset before the listing call, so an error from
// ListRepositories is returned with them already reset.
func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error {
	metrics.EnterpriseInfo.Reset()
	metrics.EnterprisePoolManagerStatus.Reset()

	repos, err := r.ListRepositories(ctx)
	if err != nil {
		return err
	}

	for _, repo := range repos {
		running := repo.PoolManagerStatus.IsRunning

		metrics.EnterpriseInfo.WithLabelValues(
			repo.Name, // label: name
			repo.ID,   // label: id
		).Set(1)

		// The running state is exported twice: as a label and as the value.
		metrics.EnterprisePoolManagerStatus.WithLabelValues(
			repo.Name,                   // label: name
			repo.ID,                     // label: id
			strconv.FormatBool(running), // label: running
		).Set(metrics.Bool2float64(running))
	}

	return nil
}