Merge pull request #216 from mercedes-benz/extended_metrics

chore: refactor metrics endpoint
This commit is contained in:
Gabriel 2024-02-20 17:46:22 +02:00 committed by GitHub
commit e108140eb6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 645 additions and 606 deletions

View file

@ -37,7 +37,7 @@ import (
)
func NewAPIController(r *runner.Runner, authenticator *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext(context.Background()))
if err != nil {
return nil, errors.Wrap(err, "failed to get controller info")
}
@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
}
}
// webhookMetricLabelValues assembles the label values used to record a webhook
// metric: the given valid and reason values followed by the controller
// hostname and controller ID. If the controller info cannot be fetched, the
// error is logged and an empty slice is returned.
func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
	info, err := a.r.GetControllerInfo(auth.GetAdminContext())
	if err == nil {
		return []string{
			valid,
			reason,
			info.Hostname,
			info.ControllerID.String(),
		}
	}

	slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
	// If labels are empty, no attempt will be made to record the webhook.
	return []string{}
}
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
@ -119,31 +106,33 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
var labelValues []string
defer func() {
if len(labelValues) == 0 {
return
}
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
}
}()
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"owner_unknown", // label: reason
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"signature_invalid", // label: reason
).Inc()
} else {
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"unknown", // label: reason
).Inc()
}
handleError(ctx, w, err)
return
}
labelValues = a.webhookMetricLabelValues(ctx, "true", "")
metrics.WebhooksReceived.WithLabelValues(
"true", // label: valid
"", // label: reason
).Inc()
}
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {

View file

@ -238,8 +238,10 @@ func UserID(ctx context.Context) string {
// GetAdminContext will return an admin context. This can be used internally
// when fetching users.
func GetAdminContext() context.Context {
ctx := context.Background()
func GetAdminContext(ctx context.Context) context.Context {
if ctx == nil {
ctx = context.Background()
}
ctx = SetUserID(ctx, "")
ctx = SetAdmin(ctx, true)
ctx = SetIsEnabled(ctx, true)

View file

@ -37,6 +37,7 @@ import (
"github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
"github.com/cloudbase/garm/util/appdefaults"
"github.com/cloudbase/garm/websocket"
@ -214,13 +215,18 @@ func main() {
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
// start the metrics collector
if cfg.Metrics.Enable {
slog.InfoContext(ctx, "registering prometheus metrics collectors")
if err := metrics.RegisterCollectors(runner); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
slog.InfoContext(ctx, "register metrics")
if err := metrics.RegisterMetrics(); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(ctx, runner, cfg.Metrics.Duration())
}
if cfg.Default.DebugServer {

View file

@ -456,8 +456,39 @@ func (t *TLSConfig) Validate() error {
}
// Metrics holds the configuration of the metrics endpoint.
type Metrics struct {
// DisableAuth defines whether JWT authentication is turned off for the
// metrics API endpoint.
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
Enable bool `toml:"enable" json:"enable"`
// Enable defines whether the API endpoint for metrics collection is
// enabled.
Enable bool `toml:"enable" json:"enable"`
// Period defines the interval at which internal metrics are updated
// and propagated to the /metrics endpoint.
Period time.Duration `toml:"period" json:"period"`
}
// ParseDuration returns the configured metrics update period.
//
// Period is already a time.Duration, so it is returned as is rather than being
// formatted to a string and parsed back again. The error result is kept so
// existing callers keep compiling; it is always nil.
func (m *Metrics) ParseDuration() (time.Duration, error) {
	return m.Period, nil
}
// Duration returns the configured metrics update interval, or
// appdefaults.DefaultMetricsUpdateInterval if no value is configured or the
// configured value is invalid. Zero and negative values both fall back to the
// default.
func (m *Metrics) Duration() time.Duration {
	duration, err := m.ParseDuration()
	if err != nil {
		slog.With(slog.Any("error", err)).Error(fmt.Sprintf("defined duration %s is invalid", m.Period))
	}
	// Zero means "not configured". A negative value can never be a usable
	// update interval (e.g. time.NewTicker panics on non-positive durations),
	// so it is treated the same way instead of being handed to the caller.
	if duration <= 0 {
		slog.Debug(fmt.Sprintf("using default duration %s for metrics update interval", appdefaults.DefaultMetricsUpdateInterval))
		return appdefaults.DefaultMetricsUpdateInterval
	}
	return duration
}
// APIServer holds configuration for the API server

View file

@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
## Common metrics
| Metric name | Type | Labels | Description |
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
| Metric name | Type | Labels | Description |
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`callback_url`=&lt;callback url&gt; <br>`controller_webhook_url`=&lt;controller webhook url&gt; <br>`metadata_url`=&lt;metadata url&gt; <br>`webhook_url`=&lt;webhook url&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `valid`=&lt;valid request&gt; <br>`reason`=&lt;reason for invalid requests&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
## Enterprise metrics
@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
| Metric name | Type | Labels | Description |
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `controller_id`=&lt;controller id&gt; <br>`hostname`=&lt;hostname&gt; <br>`name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`pool_id`=&lt;pool id&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
More metrics will be added in the future.
@ -60,15 +60,27 @@ Metrics are disabled by default. To enable them, add the following to your confi
```toml
[metrics]
# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
enable = true
# Toggle to disable authentication (not recommended) on the metrics endpoint.
# If you do disable authentication, I encourage you to put a reverse proxy in front
# of garm and limit which systems can access that particular endpoint. Ideally, you
# would enable some kind of authentication using the reverse proxy, if the built-in auth
# is not sufficient for your needs.
disable_auth = false
#
# Default: false
disable_auth = true
# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
#
# Default: false
enable = true
# period is the interval at which the internal metrics about controller-specific
# objects (e.g. runners, pools, etc.) exposed on the /metrics endpoint are updated.
#
# Default: "60s"
period = "30s"
```
You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it.

View file

@ -1,50 +1,21 @@
package metrics
import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectOrganizationMetric collects the metrics for the enterprise objects
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()
var (
// EnterpriseInfo is the enterprise info gauge vector, labelled by name and id.
EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "info",
Help: "Info of the enterprise",
}, []string{"name", "id"})
enterprises, err := c.runner.ListEnterprises(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
return
}
for _, enterprise := range enterprises {
enterpriseInfo, err := prometheus.NewConstMetric(
c.enterpriseInfo,
prometheus.GaugeValue,
1,
enterprise.Name, // label: name
enterprise.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
continue
}
ch <- enterpriseInfo
enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
c.enterprisePoolManagerStatus,
prometheus.GaugeValue,
bool2float64(enterprise.PoolManagerStatus.IsRunning),
enterprise.Name, // label: name
enterprise.ID, // label: id
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
continue
}
ch <- enterprisePoolManagerStatus
}
}
// EnterprisePoolManagerStatus is the enterprise pool manager status gauge
// vector, labelled by name, id and running.
EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "pool_manager_status",
Help: "Status of the enterprise pool manager",
}, []string{"name", "id", "running"})
)

View file

@ -1,22 +1,13 @@
package metrics
import (
"log/slog"
"github.com/prometheus/client_golang/prometheus"
)
// CollectHealthMetric sends the health gauge, with a constant value of 1 and
// labelled with the given hostname and controller ID, on ch. If the metric
// cannot be built, the error is logged and nothing is sent.
func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
	healthGauge, err := prometheus.NewConstMetric(c.healthMetric, prometheus.GaugeValue, 1, hostname, controllerID)
	if err == nil {
		ch <- healthGauge
		return
	}
	slog.With(slog.Any("error", err)).Error("error on creating health metric")
}
// GarmHealth is the gauge vector backing the garm health metric. It is
// labelled by the metadata, callback, webhook and controller webhook URLs and
// by the controller ID.
var GarmHealth = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: metricsNamespace,
		Name:      "health",
		Help:      "Health of the garm",
	},
	[]string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"},
)

View file

@ -1,79 +1,14 @@
package metrics
import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectInstanceMetric sends one gauge (value 1) per runner instance on ch.
// Each gauge carries the instance name and statuses, the owner, type, ID and
// provider of the pool the instance belongs to, and the given hostname and
// controller ID. Listing failures are logged and abort the collection; a
// failure to build a single metric is logged and that instance is skipped.
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
	ctx := auth.GetAdminContext()

	instances, err := c.runner.ListAllInstances(ctx)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances")
		return
	}

	pools, err := c.runner.ListAllPools(ctx)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
		return
	}

	type poolDetails struct {
		owner    string
		kind     string
		provider string
	}

	// Index the pools by ID. The owner is the enterprise name if set,
	// otherwise the organization name if set, otherwise the repository name.
	detailsByPoolID := make(map[string]poolDetails)
	for _, pool := range pools {
		owner := pool.RepoName
		switch {
		case pool.EnterpriseName != "":
			owner = pool.EnterpriseName
		case pool.OrgName != "":
			owner = pool.OrgName
		}
		detailsByPoolID[pool.ID] = poolDetails{
			owner:    owner,
			kind:     string(pool.PoolType()),
			provider: pool.ProviderName,
		}
	}

	for _, instance := range instances {
		// Instances whose pool is not in the index get empty pool labels.
		details := detailsByPoolID[instance.PoolID]

		metric, err := prometheus.NewConstMetric(
			c.instanceMetric,
			prometheus.GaugeValue,
			1,
			instance.Name,                 // label: name
			string(instance.Status),       // label: status
			string(instance.RunnerStatus), // label: runner_status
			details.owner,                 // label: pool_owner
			details.kind,                  // label: pool_type
			instance.PoolID,               // label: pool_id
			hostname,                      // label: hostname
			controllerID,                  // label: controller_id
			details.provider,              // label: provider
		)
		if err != nil {
			slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric")
			continue
		}
		ch <- metric
	}
}
// InstanceStatus is the gauge vector backing the runner status metric. It is
// labelled by instance name, instance status, runner status, pool owner, pool
// type, pool ID and provider.
var InstanceStatus = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: metricsNamespace,
		Subsystem: metricsRunnerSubsystem,
		Name:      "status",
		Help:      "Status of the instance",
	},
	[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"},
)

View file

@ -1,206 +1,53 @@
package metrics
import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)
const metricsNamespace = "garm_"
const metricsRunnerSubsystem = "runner_"
const metricsPoolSubsystem = "pool_"
const metricsProviderSubsystem = "provider_"
const metricsOrganizationSubsystem = "organization_"
const metricsRepositorySubsystem = "repository_"
const metricsEnterpriseSubsystem = "enterprise_"
const metricsWebhookSubsystem = "webhook_"
// Namespace and subsystem names passed to the prometheus metric options of the
// metrics declared in this package.
const (
	metricsNamespace             = "garm"
	metricsRunnerSubsystem       = "runner"
	metricsPoolSubsystem         = "pool"
	metricsProviderSubsystem     = "provider"
	metricsOrganizationSubsystem = "organization"
	metricsRepositorySubsystem   = "repository"
	metricsEnterpriseSubsystem   = "enterprise"
	metricsWebhookSubsystem      = "webhook"
)
var webhooksReceived *prometheus.CounterVec = nil
// RegisterMetrics registers all the metrics
func RegisterMetrics() error {
// RecordWebhookWithLabels will increment a webhook metric identified by specific
// values. If metrics are disabled, this function is a noop.
func RecordWebhookWithLabels(lvs ...string) error {
if webhooksReceived == nil {
// not registered. Noop
return nil
var collectors []prometheus.Collector
collectors = append(collectors,
// runner metrics
InstanceStatus,
// organization metrics
OrganizationInfo,
OrganizationPoolManagerStatus,
// enterprise metrics
EnterpriseInfo,
EnterprisePoolManagerStatus,
// repository metrics
RepositoryInfo,
RepositoryPoolManagerStatus,
// provider metrics
ProviderInfo,
// pool metrics
PoolInfo,
PoolStatus,
PoolMaxRunners,
PoolMinIdleRunners,
PoolBootstrapTimeout,
// health metrics
GarmHealth,
// webhook metrics
WebhooksReceived,
)
for _, c := range collectors {
if err := prometheus.Register(c); err != nil {
return err
}
}
counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...)
if err != nil {
return errors.Wrap(err, "recording metric")
}
counter.Inc()
return nil
}
// RegisterCollectors registers the garm collector and the webhooks counter
// with the default prometheus registry. Once a registration has succeeded,
// further calls are a noop.
//
// It returns an error if the collector cannot be created or if either
// registration fails.
func RegisterCollectors(runner *runner.Runner) error {
	if webhooksReceived != nil {
		// Already registered.
		return nil
	}

	garmCollector, err := NewGarmCollector(runner)
	if err != nil {
		return errors.Wrap(err, "getting collector")
	}

	if err := prometheus.Register(garmCollector); err != nil {
		return errors.Wrap(err, "registering collector")
	}

	// metric to count total webhooks received
	// at this point the webhook is not yet authenticated and
	// we don't know if it's meant for us or not
	counter := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: metricsNamespace + metricsWebhookSubsystem + "received",
		Help: "The total number of webhooks received",
	}, []string{"valid", "reason", "hostname", "controller_id"})

	if err := prometheus.Register(counter); err != nil {
		return errors.Wrap(err, "registering webhooks recv counter")
	}

	// Publish the counter only after it has been registered. Otherwise a
	// failed registration would leave a non-nil, unregistered counter behind,
	// which later calls would mistake for "already registered".
	webhooksReceived = counter
	return nil
}
// GarmCollector holds the metric descriptors of the garm metrics together with
// the runner the collect methods query when metrics are collected.
type GarmCollector struct {
// health and runner instance metrics
healthMetric *prometheus.Desc
instanceMetric *prometheus.Desc
// pool metrics
poolInfo *prometheus.Desc
poolStatus *prometheus.Desc
poolMaxRunners *prometheus.Desc
poolMinIdleRunners *prometheus.Desc
poolBootstrapTimeout *prometheus.Desc
// provider metrics
providerInfo *prometheus.Desc
// organization metrics
organizationInfo *prometheus.Desc
organizationPoolManagerStatus *prometheus.Desc
// repository metrics
repositoryInfo *prometheus.Desc
repositoryPoolManagerStatus *prometheus.Desc
// enterprise metrics
enterpriseInfo *prometheus.Desc
enterprisePoolManagerStatus *prometheus.Desc
// runner is the source of the data exposed by the metrics.
runner *runner.Runner
// cachedControllerInfo is the controller info fetched when the collector
// is created.
cachedControllerInfo params.ControllerInfo
}
// NewGarmCollector creates a GarmCollector for the given runner, with one
// descriptor per exposed metric. The controller info is fetched once and
// stored on the collector.
//
// It returns an error if the controller info cannot be fetched.
func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) {
	controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
	if err != nil {
		return nil, errors.Wrap(err, "fetching controller info")
	}
	return &GarmCollector{
		runner: r,
		instanceMetric: prometheus.NewDesc(
			metricsNamespace+metricsRunnerSubsystem+"status",
			"Status of the runner",
			[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil,
		),
		healthMetric: prometheus.NewDesc(
			metricsNamespace+"health",
			"Health of the runner",
			[]string{"hostname", "controller_id"}, nil,
		),
		poolInfo: prometheus.NewDesc(
			metricsNamespace+metricsPoolSubsystem+"info",
			"Information of the pool",
			[]string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil,
		),
		poolStatus: prometheus.NewDesc(
			metricsNamespace+metricsPoolSubsystem+"status",
			"Status of the pool",
			[]string{"id", "enabled"}, nil,
		),
		poolMaxRunners: prometheus.NewDesc(
			metricsNamespace+metricsPoolSubsystem+"max_runners",
			"Max runners of the pool",
			[]string{"id"}, nil,
		),
		poolMinIdleRunners: prometheus.NewDesc(
			metricsNamespace+metricsPoolSubsystem+"min_idle_runners",
			"Min idle runners of the pool",
			[]string{"id"}, nil,
		),
		poolBootstrapTimeout: prometheus.NewDesc(
			metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout",
			"Bootstrap timeout of the pool",
			[]string{"id"}, nil,
		),
		providerInfo: prometheus.NewDesc(
			metricsNamespace+metricsProviderSubsystem+"info",
			"Info of the provider",
			[]string{"name", "type", "description"}, nil,
		),
		organizationInfo: prometheus.NewDesc(
			metricsNamespace+metricsOrganizationSubsystem+"info",
			"Info of the organization",
			[]string{"name", "id"}, nil,
		),
		organizationPoolManagerStatus: prometheus.NewDesc(
			metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status",
			"Status of the organization pool manager",
			[]string{"name", "id", "running"}, nil,
		),
		repositoryInfo: prometheus.NewDesc(
			metricsNamespace+metricsRepositorySubsystem+"info",
			// The help text previously said "organization" (copy/paste slip).
			"Info of the repository",
			[]string{"name", "owner", "id"}, nil,
		),
		repositoryPoolManagerStatus: prometheus.NewDesc(
			metricsNamespace+metricsRepositorySubsystem+"pool_manager_status",
			"Status of the repository pool manager",
			[]string{"name", "id", "running"}, nil,
		),
		enterpriseInfo: prometheus.NewDesc(
			metricsNamespace+metricsEnterpriseSubsystem+"info",
			// The help text previously said "organization" (copy/paste slip).
			"Info of the enterprise",
			[]string{"name", "id"}, nil,
		),
		enterprisePoolManagerStatus: prometheus.NewDesc(
			metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status",
			"Status of the enterprise pool manager",
			[]string{"name", "id", "running"}, nil,
		),
		cachedControllerInfo: controllerInfo,
	}, nil
}
// Describe sends the descriptor of every metric this collector can emit on ch.
//
// Every descriptor used by the collect methods has to be listed here,
// including the pool bootstrap timeout and the repository descriptors that
// were previously left out.
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.instanceMetric
	ch <- c.healthMetric
	ch <- c.poolInfo
	ch <- c.poolStatus
	ch <- c.poolMaxRunners
	ch <- c.poolMinIdleRunners
	ch <- c.poolBootstrapTimeout
	ch <- c.providerInfo
	ch <- c.organizationInfo
	ch <- c.organizationPoolManagerStatus
	ch <- c.repositoryInfo
	ch <- c.repositoryPoolManagerStatus
	ch <- c.enterpriseInfo
	ch <- c.enterprisePoolManagerStatus
}
// Collect fetches the controller info and then runs every collect method in
// turn, passing each one the controller hostname and controller ID. If the
// controller info cannot be fetched, the error is logged and nothing is
// collected.
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
	info, err := c.runner.GetControllerInfo(auth.GetAdminContext())
	if err != nil {
		slog.With(slog.Any("error", err)).Error("failed to get controller info")
		return
	}

	hostname := info.Hostname
	controllerID := info.ControllerID.String()

	// The order matches the order in which the metrics were emitted before.
	collectFuncs := []func(chan<- prometheus.Metric, string, string){
		c.CollectInstanceMetric,
		c.CollectHealthMetric,
		c.CollectPoolMetric,
		c.CollectProviderMetric,
		c.CollectOrganizationMetric,
		c.CollectRepositoryMetric,
		c.CollectEnterpriseMetric,
	}
	for _, collect := range collectFuncs {
		collect(ch, hostname, controllerID)
	}
}

View file

@ -1,50 +1,21 @@
package metrics
import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectOrganizationMetric collects the metrics for the organization objects
func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()
var (
// OrganizationInfo is the organization info gauge vector, labelled by name
// and id.
OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsOrganizationSubsystem,
Name: "info",
Help: "Info of the organization",
}, []string{"name", "id"})
organizations, err := c.runner.ListOrganizations(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
return
}
for _, organization := range organizations {
organizationInfo, err := prometheus.NewConstMetric(
c.organizationInfo,
prometheus.GaugeValue,
1,
organization.Name, // label: name
organization.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric")
continue
}
ch <- organizationInfo
organizationPoolManagerStatus, err := prometheus.NewConstMetric(
c.organizationPoolManagerStatus,
prometheus.GaugeValue,
bool2float64(organization.PoolManagerStatus.IsRunning),
organization.Name, // label: name
organization.ID, // label: id
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric")
continue
}
ch <- organizationPoolManagerStatus
}
}
// OrganizationPoolManagerStatus is the organization pool manager status gauge
// vector, labelled by name, id and running.
OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsOrganizationSubsystem,
Name: "pool_manager_status",
Help: "Status of the organization pool manager",
}, []string{"name", "id", "running"})
)

View file

@ -1,121 +1,42 @@
package metrics
import (
"log/slog"
"strconv"
"strings"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectPoolMetric collects the metrics for the pool objects
func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()
var (
// PoolInfo is the pool info gauge vector, labelled by pool id, image, flavor,
// prefix, OS type, OS arch, tags, provider, pool owner and pool type.
PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "info",
Help: "Info of the pool",
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
pools, err := c.runner.ListAllPools(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
return
}
// PoolStatus is the pool status gauge vector, labelled by pool id and enabled.
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "status",
Help: "Status of the pool",
}, []string{"id", "enabled"})
type poolInfo struct {
Name string
Type string
}
// PoolMaxRunners is the gauge vector for the maximum number of runners in a
// pool, labelled by pool id.
PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "max_runners",
Help: "Maximum number of runners in the pool",
}, []string{"id"})
poolNames := make(map[string]poolInfo)
for _, pool := range pools {
if pool.EnterpriseName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.EnterpriseName,
Type: string(pool.PoolType()),
}
} else if pool.OrgName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.OrgName,
Type: string(pool.PoolType()),
}
} else {
poolNames[pool.ID] = poolInfo{
Name: pool.RepoName,
Type: string(pool.PoolType()),
}
}
// PoolMinIdleRunners is the gauge vector for the minimum number of idle
// runners in a pool, labelled by pool id.
PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "min_idle_runners",
Help: "Minimum number of idle runners in the pool",
}, []string{"id"})
var poolTags []string
for _, tag := range pool.Tags {
poolTags = append(poolTags, tag.Name)
}
poolInfo, err := prometheus.NewConstMetric(
c.poolInfo,
prometheus.GaugeValue,
1,
pool.ID, // label: id
pool.Image, // label: image
pool.Flavor, // label: flavor
pool.Prefix, // label: prefix
string(pool.OSType), // label: os_type
string(pool.OSArch), // label: os_arch
strings.Join(poolTags, ","), // label: tags
pool.ProviderName, // label: provider
poolNames[pool.ID].Name, // label: pool_owner
poolNames[pool.ID].Type, // label: pool_type
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric")
continue
}
ch <- poolInfo
poolStatus, err := prometheus.NewConstMetric(
c.poolStatus,
prometheus.GaugeValue,
bool2float64(pool.Enabled),
pool.ID, // label: id
strconv.FormatBool(pool.Enabled), // label: enabled
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric")
continue
}
ch <- poolStatus
poolMaxRunners, err := prometheus.NewConstMetric(
c.poolMaxRunners,
prometheus.GaugeValue,
float64(pool.MaxRunners),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric")
continue
}
ch <- poolMaxRunners
poolMinIdleRunners, err := prometheus.NewConstMetric(
c.poolMinIdleRunners,
prometheus.GaugeValue,
float64(pool.MinIdleRunners),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric")
continue
}
ch <- poolMinIdleRunners
poolBootstrapTimeout, err := prometheus.NewConstMetric(
c.poolBootstrapTimeout,
prometheus.GaugeValue,
float64(pool.RunnerBootstrapTimeout),
pool.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric")
continue
}
ch <- poolBootstrapTimeout
}
}
// PoolBootstrapTimeout is the gauge vector for the runner bootstrap timeout of
// a pool, labelled by pool id.
PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsPoolSubsystem,
Name: "bootstrap_timeout",
Help: "Runner bootstrap timeout in the pool",
}, []string{"id"})
)

View file

@ -1,36 +1,14 @@
package metrics
import (
"log/slog"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectProviderMetric sends one info gauge (value 1) per provider on ch,
// labelled with the provider name, type and description. A listing failure is
// logged and aborts the collection; a failure to build a single metric is
// logged and that provider is skipped.
func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
	ctx := auth.GetAdminContext()

	providers, err := c.runner.ListProviders(ctx)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
		return
	}

	for _, p := range providers {
		metric, err := prometheus.NewConstMetric(
			c.providerInfo,
			prometheus.GaugeValue,
			1,
			p.Name,                 // label: name
			string(p.ProviderType), // label: type
			p.Description,          // label: description
		)
		if err == nil {
			ch <- metric
			continue
		}
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric")
	}
}
var (
	// ProviderInfo is the provider info gauge vector, labelled by provider
	// name, type and description.
	ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: metricsNamespace,
		Subsystem: metricsProviderSubsystem,
		Name:      "info",
		// The help text previously said "organization" (copy/paste slip).
		Help: "Info of the provider",
	}, []string{"name", "type", "description"})
)

View file

@ -1,51 +1,21 @@
package metrics
import (
"log/slog"
"strconv"
"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)
// CollectOrganizationMetric collects the metrics for the repository objects
func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()
var (
// RepositoryInfo is the repository info gauge vector, labelled by name and id.
RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: metricsNamespace,
	Subsystem: metricsRepositorySubsystem,
	Name:      "info",
	// The help text previously said "enterprise" (copy/paste slip).
	Help: "Info of the repository",
}, []string{"name", "id"})
repositories, err := c.runner.ListRepositories(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
return
}
for _, repository := range repositories {
repositoryInfo, err := prometheus.NewConstMetric(
c.repositoryInfo,
prometheus.GaugeValue,
1,
repository.Name, // label: name
repository.Owner, // label: owner
repository.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric")
continue
}
ch <- repositoryInfo
repositoryPoolManagerStatus, err := prometheus.NewConstMetric(
c.repositoryPoolManagerStatus,
prometheus.GaugeValue,
bool2float64(repository.PoolManagerStatus.IsRunning),
repository.Name, // label: name
repository.ID, // label: id
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric")
continue
}
ch <- repositoryPoolManagerStatus
}
}
// RepositoryPoolManagerStatus is the repository pool manager status gauge
// vector, labelled by name, id and running.
RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: metricsNamespace,
	Subsystem: metricsRepositorySubsystem,
	Name:      "pool_manager_status",
	// The help text previously said "enterprise" (copy/paste slip).
	Help: "Status of the repository pool manager",
}, []string{"name", "id", "running"})
)

View file

@ -1,6 +1,6 @@
package metrics
func bool2float64(b bool) float64 {
func Bool2float64(b bool) float64 {
if b {
return 1
}

12
metrics/webhooks.go Normal file
View file

@ -0,0 +1,12 @@
package metrics
import "github.com/prometheus/client_golang/prometheus"
// WebhooksReceived is a counter vector in the webhook subsystem. Each series
// is identified by the "valid" and "reason" labels.
var WebhooksReceived = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: metricsNamespace,
		Subsystem: metricsWebhookSubsystem,
		Name:      "received",
		Help:      "The total number of webhooks received",
	},
	[]string{"valid", "reason"},
)

View file

@ -60,7 +60,7 @@ type EnterpriseTestSuite struct {
}
func (s *EnterpriseTestSuite) SetupTest() {
adminCtx := auth.GetAdminContext()
adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())

View file

@ -0,0 +1,36 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectEnterpriseMetric rebuilds the enterprise gauges from the current
// list of enterprises returned by the runner.
//
// Both enterprise vectors are reset before the list is fetched, so when
// ListEnterprises fails the error is returned and the vectors stay empty
// until the next successful collection.
func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error {
	// Start from a clean slate so series of deleted enterprises disappear.
	metrics.EnterpriseInfo.Reset()
	metrics.EnterprisePoolManagerStatus.Reset()

	list, err := r.ListEnterprises(ctx)
	if err != nil {
		return err
	}

	for _, ent := range list {
		running := ent.PoolManagerStatus.IsRunning

		// labels: name, id
		metrics.EnterpriseInfo.WithLabelValues(ent.Name, ent.ID).Set(1)

		// labels: name, id, running
		status := metrics.EnterprisePoolManagerStatus.WithLabelValues(
			ent.Name,
			ent.ID,
			strconv.FormatBool(running),
		)
		status.Set(metrics.Bool2float64(running))
	}

	return nil
}

20
runner/metrics/health.go Normal file
View file

@ -0,0 +1,20 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectHealthMetric sets the GarmHealth gauge to 1 for the series described
// by the given controller info.
//
// The ctx and r parameters are not used by this function. It always returns
// nil.
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
	// Label order: metadata_url, callback_url, webhook_url,
	// controller_webhook_url, controller_id.
	health := metrics.GarmHealth.WithLabelValues(
		controllerInfo.MetadataURL,
		controllerInfo.CallbackURL,
		controllerInfo.WebhookURL,
		controllerInfo.ControllerWebhookURL,
		controllerInfo.ControllerID.String(),
	)
	health.Set(1)

	return nil
}

View file

@ -0,0 +1,69 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectInstanceMetric rebuilds the InstanceStatus gauge from the current
// list of runner instances, enriching every instance with the owner, type and
// provider of the pool it belongs to.
//
// The vector is reset before anything is listed; if listing instances or
// pools fails the error is returned and the vector stays empty.
func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
	// Drop every previously exported series before re-populating.
	metrics.InstanceStatus.Reset()

	instances, err := r.ListAllInstances(ctx)
	if err != nil {
		return err
	}

	pools, err := r.ListAllPools(ctx)
	if err != nil {
		return err
	}

	// poolDetails holds the pool attributes that end up as instance labels.
	type poolDetails struct {
		owner    string
		kind     string
		provider string
	}

	// Index pools by ID so each instance can be resolved with one lookup.
	byPoolID := make(map[string]poolDetails, len(pools))
	for _, p := range pools {
		// The owner is the enterprise name if set, otherwise the
		// organization name if set, otherwise the repository name.
		owner := p.RepoName
		switch {
		case p.EnterpriseName != "":
			owner = p.EnterpriseName
		case p.OrgName != "":
			owner = p.OrgName
		}

		byPoolID[p.ID] = poolDetails{
			owner:    owner,
			kind:     string(p.PoolType()),
			provider: p.ProviderName,
		}
	}

	for _, inst := range instances {
		// An instance whose pool is not in the index gets empty
		// pool_owner, pool_type and provider labels (map zero value).
		details := byPoolID[inst.PoolID]

		// labels: name, status, runner_status, pool_owner, pool_type,
		// pool_id, provider
		metrics.InstanceStatus.WithLabelValues(
			inst.Name,
			string(inst.Status),
			string(inst.RunnerStatus),
			details.owner,
			details.kind,
			inst.PoolID,
			details.provider,
		).Set(1)
	}

	return nil
}

90
runner/metrics/metrics.go Normal file
View file

@ -0,0 +1,90 @@
package metrics
import (
"context"
"log/slog"
"time"
"github.com/cloudbase/garm/auth"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner"
)
// CollectObjectMetric runs one metrics collection synchronously and then
// starts a goroutine that repeats the collection every duration until ctx is
// done.
//
// The controller info is fetched once, up front. If that fetch fails the
// error is logged and collection proceeds with whatever value
// GetControllerInfo returned alongside the error. Collection errors are
// logged and never returned to the caller.
func CollectObjectMetric(ctx context.Context, r *runner.Runner, duration time.Duration) {
	ctx = auth.GetAdminContext(ctx)

	// Controller info feeds the health metric.
	controllerInfo, err := r.GetControllerInfo(ctx)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info")
	}

	// runOnce performs a single collection pass and logs any failure.
	runOnce := func() {
		slog.InfoContext(ctx, "collecting metrics")
		if collectErr := collectMetrics(ctx, r, controllerInfo); collectErr != nil {
			slog.With(slog.Any("error", collectErr)).ErrorContext(ctx, "cannot collect metrics")
		}
	}

	// Collect right away instead of waiting for the first tick.
	runOnce()

	go func() {
		ticker := time.NewTicker(duration)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				runOnce()
			case <-ctx.Done():
				return
			}
		}
	}()
}
// collectMetrics runs every collector in a fixed order: organization,
// enterprise, repository, provider, pool, instance, health. It stops at the
// first collector that fails and returns its error; later collectors are not
// run in that case.
func collectMetrics(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
	steps := []struct {
		logMsg  string
		collect func() error
	}{
		{"collecting organization metrics", func() error { return CollectOrganizationMetric(ctx, r) }},
		{"collecting enterprise metrics", func() error { return CollectEnterpriseMetric(ctx, r) }},
		{"collecting repository metrics", func() error { return CollectRepositoryMetric(ctx, r) }},
		{"collecting provider metrics", func() error { return CollectProviderMetric(ctx, r) }},
		{"collecting pool metrics", func() error { return CollectPoolMetric(ctx, r) }},
		{"collecting instance metrics", func() error { return CollectInstanceMetric(ctx, r) }},
		{"collecting health metrics", func() error { return CollectHealthMetric(ctx, r, controllerInfo) }},
	}

	for _, step := range steps {
		slog.DebugContext(ctx, step.logMsg)
		if err := step.collect(); err != nil {
			return err
		}
	}

	return nil
}

View file

@ -0,0 +1,36 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectOrganizationMetric rebuilds the organization gauges from the current
// list of organizations returned by the runner.
//
// Both organization vectors are reset before the list is fetched, so when
// ListOrganizations fails the error is returned and the vectors stay empty
// until the next successful collection.
func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error {
	// Start from a clean slate so series of deleted organizations disappear.
	metrics.OrganizationInfo.Reset()
	metrics.OrganizationPoolManagerStatus.Reset()

	list, err := r.ListOrganizations(ctx)
	if err != nil {
		return err
	}

	for _, org := range list {
		running := org.PoolManagerStatus.IsRunning

		// labels: name, id
		metrics.OrganizationInfo.WithLabelValues(org.Name, org.ID).Set(1)

		// labels: name, id, running
		status := metrics.OrganizationPoolManagerStatus.WithLabelValues(
			org.Name,
			org.ID,
			strconv.FormatBool(running),
		)
		status.Set(metrics.Bool2float64(running))
	}

	return nil
}

87
runner/metrics/pool.go Normal file
View file

@ -0,0 +1,87 @@
package metrics
import (
"context"
"strconv"
"strings"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectPoolMetric rebuilds all pool gauges (info, status, max runners,
// min idle runners, bootstrap timeout) from the current list of pools.
//
// Every pool vector is reset before the list is fetched; if ListAllPools
// fails the error is returned and the vectors stay empty.
func CollectPoolMetric(ctx context.Context, r *runner.Runner) error {
	// Drop every previously exported series before re-populating.
	metrics.PoolInfo.Reset()
	metrics.PoolStatus.Reset()
	metrics.PoolMaxRunners.Reset()
	metrics.PoolMinIdleRunners.Reset()
	metrics.PoolBootstrapTimeout.Reset()

	pools, err := r.ListAllPools(ctx)
	if err != nil {
		return err
	}

	for _, p := range pools {
		// The owner is the enterprise name if set, otherwise the
		// organization name if set, otherwise the repository name.
		owner := p.RepoName
		switch {
		case p.EnterpriseName != "":
			owner = p.EnterpriseName
		case p.OrgName != "":
			owner = p.OrgName
		}
		kind := string(p.PoolType())

		// Tags are flattened into one comma-separated label value.
		tagNames := make([]string, 0, len(p.Tags))
		for _, t := range p.Tags {
			tagNames = append(tagNames, t.Name)
		}

		// labels: id, image, flavor, prefix, os_type, os_arch, tags,
		// provider, pool_owner, pool_type
		metrics.PoolInfo.WithLabelValues(
			p.ID,
			p.Image,
			p.Flavor,
			p.Prefix,
			string(p.OSType),
			string(p.OSArch),
			strings.Join(tagNames, ","),
			p.ProviderName,
			owner,
			kind,
		).Set(1)

		// labels: id, enabled
		metrics.PoolStatus.
			WithLabelValues(p.ID, strconv.FormatBool(p.Enabled)).
			Set(metrics.Bool2float64(p.Enabled))

		// The remaining gauges carry only the pool id label.
		metrics.PoolMaxRunners.WithLabelValues(p.ID).Set(float64(p.MaxRunners))
		metrics.PoolMinIdleRunners.WithLabelValues(p.ID).Set(float64(p.MinIdleRunners))
		metrics.PoolBootstrapTimeout.WithLabelValues(p.ID).Set(float64(p.RunnerBootstrapTimeout))
	}

	return nil
}

View file

@ -0,0 +1,27 @@
package metrics
import (
"context"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectProviderMetric rebuilds the ProviderInfo gauge from the current list
// of providers returned by the runner.
//
// The vector is reset before the list is fetched; if ListProviders fails the
// error is returned and the vector stays empty.
func CollectProviderMetric(ctx context.Context, r *runner.Runner) error {
	// Start from a clean slate so series of removed providers disappear.
	metrics.ProviderInfo.Reset()

	list, err := r.ListProviders(ctx)
	if err != nil {
		return err
	}

	for _, p := range list {
		// labels: name, type, description
		info := metrics.ProviderInfo.WithLabelValues(
			p.Name,
			string(p.ProviderType),
			p.Description,
		)
		info.Set(1)
	}

	return nil
}

View file

@ -0,0 +1,35 @@
package metrics
import (
"context"
"strconv"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
)
// CollectRepositoryMetric rebuilds the repository gauges from the current
// list of repositories returned by the runner.
//
// It operates on the repository vectors (RepositoryInfo and
// RepositoryPoolManagerStatus) only; the enterprise vectors are owned by
// CollectEnterpriseMetric and must not be touched here, otherwise the
// enterprise series would be wiped and replaced with repository data.
//
// Both repository vectors are reset before the list is fetched, so when
// ListRepositories fails the error is returned and the vectors stay empty
// until the next successful collection.
func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error {
	// reset metrics
	metrics.RepositoryInfo.Reset()
	metrics.RepositoryPoolManagerStatus.Reset()

	repositories, err := r.ListRepositories(ctx)
	if err != nil {
		return err
	}

	for _, repository := range repositories {
		metrics.RepositoryInfo.WithLabelValues(
			repository.Name, // label: name
			repository.ID,   // label: id
		).Set(1)

		metrics.RepositoryPoolManagerStatus.WithLabelValues(
			repository.Name, // label: name
			repository.ID,   // label: id
			strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
		).Set(metrics.Bool2float64(repository.PoolManagerStatus.IsRunning))
	}

	return nil
}

View file

@ -60,7 +60,7 @@ type OrgTestSuite struct {
}
func (s *OrgTestSuite) SetupTest() {
adminCtx := auth.GetAdminContext()
adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())

View file

@ -47,7 +47,7 @@ type PoolTestSuite struct {
}
func (s *PoolTestSuite) SetupTest() {
adminCtx := auth.GetAdminContext()
adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())

View file

@ -59,7 +59,7 @@ type RepoTestSuite struct {
}
func (s *RepoTestSuite) SetupTest() {
adminCtx := auth.GetAdminContext()
adminCtx := auth.GetAdminContext(context.Background())
// create testing sqlite database
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
@ -90,7 +90,7 @@ func (s *RepoTestSuite) SetupTest() {
var minIdleRunners uint = 20
providerMock := runnerCommonMocks.NewProvider(s.T())
fixtures := &RepoTestFixtures{
AdminContext: auth.GetAdminContext(),
AdminContext: auth.GetAdminContext(context.Background()),
Store: db,
StoreRepos: repos,
Providers: map[string]common.Provider{

View file

@ -27,4 +27,7 @@ const (
// GithubDefaultUploadBaseURL is the default URL for GitHub uploads.
GithubDefaultUploadBaseURL = "https://uploads.github.com/"
// metrics data update interval
DefaultMetricsUpdateInterval = 60 * time.Second
)