Merge pull request #216 from mercedes-benz/extended_metrics
chore: refactor metrics endpoint
This commit is contained in:
commit
e108140eb6
28 changed files with 645 additions and 606 deletions
|
|
@ -37,7 +37,7 @@ import (
|
|||
)
|
||||
|
||||
func NewAPIController(r *runner.Runner, authenticator *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
|
||||
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
|
||||
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext(context.Background()))
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "failed to get controller info")
|
||||
}
|
||||
|
|
@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
|
|||
}
|
||||
}
|
||||
|
||||
func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
|
||||
controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
|
||||
// If labels are empty, not attempt will be made to record webhook.
|
||||
return []string{}
|
||||
}
|
||||
return []string{
|
||||
valid, reason,
|
||||
controllerInfo.Hostname, controllerInfo.ControllerID.String(),
|
||||
}
|
||||
}
|
||||
|
||||
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
|
||||
defer r.Body.Close()
|
||||
body, err := io.ReadAll(r.Body)
|
||||
|
|
@ -119,31 +106,33 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
|
|||
signature := r.Header.Get("X-Hub-Signature-256")
|
||||
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
|
||||
|
||||
var labelValues []string
|
||||
defer func() {
|
||||
if len(labelValues) == 0 {
|
||||
return
|
||||
}
|
||||
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
|
||||
}
|
||||
}()
|
||||
|
||||
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
|
||||
if errors.Is(err, gErrors.ErrNotFound) {
|
||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"owner_unknown", // label: reason
|
||||
).Inc()
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
|
||||
return
|
||||
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
|
||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"signature_invalid", // label: reason
|
||||
).Inc()
|
||||
} else {
|
||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"false", // label: valid
|
||||
"unknown", // label: reason
|
||||
).Inc()
|
||||
}
|
||||
|
||||
handleError(ctx, w, err)
|
||||
return
|
||||
}
|
||||
labelValues = a.webhookMetricLabelValues(ctx, "true", "")
|
||||
metrics.WebhooksReceived.WithLabelValues(
|
||||
"true", // label: valid
|
||||
"", // label: reason
|
||||
).Inc()
|
||||
}
|
||||
|
||||
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
|
||||
|
|
|
|||
|
|
@ -238,8 +238,10 @@ func UserID(ctx context.Context) string {
|
|||
|
||||
// GetAdminContext will return an admin context. This can be used internally
|
||||
// when fetching users.
|
||||
func GetAdminContext() context.Context {
|
||||
ctx := context.Background()
|
||||
func GetAdminContext(ctx context.Context) context.Context {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
ctx = SetUserID(ctx, "")
|
||||
ctx = SetAdmin(ctx, true)
|
||||
ctx = SetIsEnabled(ctx, true)
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ import (
|
|||
"github.com/cloudbase/garm/database/common"
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
|
||||
garmUtil "github.com/cloudbase/garm/util"
|
||||
"github.com/cloudbase/garm/util/appdefaults"
|
||||
"github.com/cloudbase/garm/websocket"
|
||||
|
|
@ -214,13 +215,18 @@ func main() {
|
|||
|
||||
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
|
||||
|
||||
// start the metrics collector
|
||||
if cfg.Metrics.Enable {
|
||||
slog.InfoContext(ctx, "registering prometheus metrics collectors")
|
||||
if err := metrics.RegisterCollectors(runner); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
slog.InfoContext(ctx, "setting up metric routes")
|
||||
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
|
||||
|
||||
slog.InfoContext(ctx, "register metrics")
|
||||
if err := metrics.RegisterMetrics(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
slog.InfoContext(ctx, "start metrics collection")
|
||||
runnerMetrics.CollectObjectMetric(ctx, runner, cfg.Metrics.Duration())
|
||||
}
|
||||
|
||||
if cfg.Default.DebugServer {
|
||||
|
|
|
|||
|
|
@ -456,8 +456,39 @@ func (t *TLSConfig) Validate() error {
|
|||
}
|
||||
|
||||
// Metrics holds the configuration of the metrics endpoint.
type Metrics struct {
	// DisableAuth defines whether JWT authentication is disabled on the
	// metrics API endpoint.
	DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
	// Enable defines whether the API endpoint for metrics collection is
	// enabled.
	Enable bool `toml:"enable" json:"enable"`
	// Period defines the interval at which internal metrics are updated
	// and propagated to the /metrics endpoint.
	Period time.Duration `toml:"period" json:"period"`
}

// ParseDuration returns the configured metrics update period.
//
// Period is already a time.Duration, so it is returned as is. Formatting it
// to a string and parsing it back is a lossless round trip that can never
// fail. The error result is always nil and is kept only so existing callers
// keep compiling.
func (m *Metrics) ParseDuration() (time.Duration, error) {
	return m.Period, nil
}
|
||||
|
||||
// Duration returns the configured duration or the default duration if no value
|
||||
// is configured or the configured value is invalid.
|
||||
func (m *Metrics) Duration() time.Duration {
|
||||
duration, err := m.ParseDuration()
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).Error(fmt.Sprintf("defined duration %s is invalid", m.Period))
|
||||
}
|
||||
if duration == 0 {
|
||||
slog.Debug(fmt.Sprintf("using default duration %s for metrics update interval", appdefaults.DefaultMetricsUpdateInterval))
|
||||
return appdefaults.DefaultMetricsUpdateInterval
|
||||
}
|
||||
return duration
|
||||
}
|
||||
|
||||
// APIServer holds configuration for the API server
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it
|
|||
|
||||
## Common metrics
|
||||
|
||||
| Metric name | Type | Labels | Description |
|
||||
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
|
||||
| `garm_health` | Gauge | `controller_id`=<controller id> <br>`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
|
||||
| `garm_webhooks_received` | Counter | `controller_id`=<controller id> <br>`name`=<hostname> | This is a counter that increments every time GARM receives a webhook from GitHub. |
|
||||
| Metric name | Type | Labels | Description |
|
||||
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
|
||||
| `garm_health` | Gauge | `controller_id`=<controller id> <br>`callback_url`=<callback url> <br>`controller_webhook_url`=<controller webhook url> <br>`metadata_url`=<metadata url> <br>`webhook_url`=<webhook url> <br>`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
|
||||
| `garm_webhooks_received` | Counter | `valid`=<valid request> <br>`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. |
|
||||
|
||||
## Enterprise metrics
|
||||
|
||||
|
|
@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it
|
|||
|
||||
## Runner metrics
|
||||
|
||||
| Metric name | Type | Labels | Description |
|
||||
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
|
||||
| `garm_runner_status` | Gauge | `controller_id`=<controller id> <br>`hostname`=<hostname> <br>`name`=<runner name> <br>`pool_owner`=<owner name> <br>`pool_type`=<repository\|organization\|enterprise> <br>`provider`=<provider name> <br>`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown> <br>`status`=<idle\|pending\|terminated\|installing\|failed\|active> <br> | This is a gauge value that gives us details about the runners garm spawns |
|
||||
| Metric name | Type | Labels | Description |
|
||||
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
|
||||
| `garm_runner_status` | Gauge | `name`=<runner name> <br>`pool_owner`=<owner name> <br>`pool_type`=<repository\|organization\|enterprise> <br>`provider`=<provider name> <br>`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown> <br>`status`=<idle\|pending\|terminated\|installing\|failed\|active> <br> | This is a gauge value that gives us details about the runners garm spawns |
|
||||
|
||||
More metrics will be added in the future.
|
||||
|
||||
|
|
@ -60,15 +60,27 @@ Metrics are disabled by default. To enable them, add the following to your confi
|
|||
|
||||
```toml
|
||||
[metrics]
|
||||
# Toggle metrics. If set to false, the API endpoint for metrics collection will
|
||||
# be disabled.
|
||||
enable = true
|
||||
|
||||
# Toggle to disable authentication (not recommended) on the metrics endpoint.
|
||||
# If you do disable authentication, I encourage you to put a reverse proxy in front
|
||||
# of garm and limit which systems can access that particular endpoint. Ideally, you
|
||||
# would enable some kind of authentication using the reverse proxy, if the built-in auth
|
||||
# is not sufficient for your needs.
|
||||
disable_auth = false
|
||||
#
|
||||
# Default: false
|
||||
disable_auth = true
|
||||
|
||||
# Toggle metrics. If set to false, the API endpoint for metrics collection will
|
||||
# be disabled.
|
||||
#
|
||||
# Default: false
|
||||
enable = true
|
||||
|
||||
# period is the time interval at which the /metrics endpoint will update internal metrics about
|
||||
# controller specific objects (e.g. runners, pools, etc.)
|
||||
#
|
||||
# Default: "60s"
|
||||
period = "30s"
|
||||
```
|
||||
|
||||
You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it.
|
||||
|
|
|
|||
|
|
@ -1,50 +1,21 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectEnterpriseMetric collects the metrics for the enterprise objects
|
||||
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
var (
|
||||
EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsEnterpriseSubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the enterprise",
|
||||
}, []string{"name", "id"})
|
||||
|
||||
enterprises, err := c.runner.ListEnterprises(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
||||
return
|
||||
}
|
||||
|
||||
for _, enterprise := range enterprises {
|
||||
|
||||
enterpriseInfo, err := prometheus.NewConstMetric(
|
||||
c.enterpriseInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
enterprise.Name, // label: name
|
||||
enterprise.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
|
||||
continue
|
||||
}
|
||||
ch <- enterpriseInfo
|
||||
|
||||
enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
|
||||
c.enterprisePoolManagerStatus,
|
||||
prometheus.GaugeValue,
|
||||
bool2float64(enterprise.PoolManagerStatus.IsRunning),
|
||||
enterprise.Name, // label: name
|
||||
enterprise.ID, // label: id
|
||||
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
|
||||
continue
|
||||
}
|
||||
ch <- enterprisePoolManagerStatus
|
||||
}
|
||||
}
|
||||
EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsEnterpriseSubsystem,
|
||||
Name: "pool_manager_status",
|
||||
Help: "Status of the enterprise pool manager",
|
||||
}, []string{"name", "id", "running"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,22 +1,13 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
m, err := prometheus.NewConstMetric(
|
||||
c.healthMetric,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
hostname,
|
||||
controllerID,
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).Error("error on creating health metric")
|
||||
return
|
||||
}
|
||||
ch <- m
|
||||
}
|
||||
var (
|
||||
GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "health",
|
||||
Help: "Health of the garm",
|
||||
}, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,79 +1,14 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectInstanceMetric collects the metrics for the runner instances
|
||||
// reflecting the statuses and the pool they belong to.
|
||||
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
|
||||
instances, err := c.runner.ListAllInstances(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances")
|
||||
return
|
||||
}
|
||||
|
||||
pools, err := c.runner.ListAllPools(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
|
||||
return
|
||||
}
|
||||
|
||||
type poolInfo struct {
|
||||
Name string
|
||||
Type string
|
||||
ProviderName string
|
||||
}
|
||||
|
||||
poolNames := make(map[string]poolInfo)
|
||||
for _, pool := range pools {
|
||||
if pool.EnterpriseName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.EnterpriseName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
} else if pool.OrgName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.OrgName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
} else {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.RepoName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, instance := range instances {
|
||||
|
||||
m, err := prometheus.NewConstMetric(
|
||||
c.instanceMetric,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
instance.Name, // label: name
|
||||
string(instance.Status), // label: status
|
||||
string(instance.RunnerStatus), // label: runner_status
|
||||
poolNames[instance.PoolID].Name, // label: pool_owner
|
||||
poolNames[instance.PoolID].Type, // label: pool_type
|
||||
instance.PoolID, // label: pool_id
|
||||
hostname, // label: hostname
|
||||
controllerID, // label: controller_id
|
||||
poolNames[instance.PoolID].ProviderName, // label: provider
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric")
|
||||
continue
|
||||
}
|
||||
ch <- m
|
||||
}
|
||||
}
|
||||
var (
|
||||
InstanceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsRunnerSubsystem,
|
||||
Name: "status",
|
||||
Help: "Status of the instance",
|
||||
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,206 +1,53 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/cloudbase/garm/params"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
const metricsNamespace = "garm_"
|
||||
const metricsRunnerSubsystem = "runner_"
|
||||
const metricsPoolSubsystem = "pool_"
|
||||
const metricsProviderSubsystem = "provider_"
|
||||
const metricsOrganizationSubsystem = "organization_"
|
||||
const metricsRepositorySubsystem = "repository_"
|
||||
const metricsEnterpriseSubsystem = "enterprise_"
|
||||
const metricsWebhookSubsystem = "webhook_"
|
||||
const metricsNamespace = "garm"
|
||||
const metricsRunnerSubsystem = "runner"
|
||||
const metricsPoolSubsystem = "pool"
|
||||
const metricsProviderSubsystem = "provider"
|
||||
const metricsOrganizationSubsystem = "organization"
|
||||
const metricsRepositorySubsystem = "repository"
|
||||
const metricsEnterpriseSubsystem = "enterprise"
|
||||
const metricsWebhookSubsystem = "webhook"
|
||||
|
||||
var webhooksReceived *prometheus.CounterVec = nil
|
||||
// RegisterMetrics registers all the metrics
|
||||
func RegisterMetrics() error {
|
||||
|
||||
// RecordWebhookWithLabels will increment a webhook metric identified by specific
|
||||
// values. If metrics are disabled, this function is a noop.
|
||||
func RecordWebhookWithLabels(lvs ...string) error {
|
||||
if webhooksReceived == nil {
|
||||
// not registered. Noop
|
||||
return nil
|
||||
var collectors []prometheus.Collector
|
||||
collectors = append(collectors,
|
||||
// runner metrics
|
||||
InstanceStatus,
|
||||
// organization metrics
|
||||
OrganizationInfo,
|
||||
OrganizationPoolManagerStatus,
|
||||
// enterprise metrics
|
||||
EnterpriseInfo,
|
||||
EnterprisePoolManagerStatus,
|
||||
// repository metrics
|
||||
RepositoryInfo,
|
||||
RepositoryPoolManagerStatus,
|
||||
// provider metrics
|
||||
ProviderInfo,
|
||||
// pool metrics
|
||||
PoolInfo,
|
||||
PoolStatus,
|
||||
PoolMaxRunners,
|
||||
PoolMinIdleRunners,
|
||||
PoolBootstrapTimeout,
|
||||
// health metrics
|
||||
GarmHealth,
|
||||
// webhook metrics
|
||||
WebhooksReceived,
|
||||
)
|
||||
|
||||
for _, c := range collectors {
|
||||
if err := prometheus.Register(c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "recording metric")
|
||||
}
|
||||
counter.Inc()
|
||||
return nil
|
||||
}
|
||||
|
||||
func RegisterCollectors(runner *runner.Runner) error {
|
||||
if webhooksReceived != nil {
|
||||
// Already registered.
|
||||
return nil
|
||||
}
|
||||
|
||||
garmCollector, err := NewGarmCollector(runner)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "getting collector")
|
||||
}
|
||||
|
||||
if err := prometheus.Register(garmCollector); err != nil {
|
||||
return errors.Wrap(err, "registering collector")
|
||||
}
|
||||
|
||||
// metric to count total webhooks received
|
||||
// at this point the webhook is not yet authenticated and
|
||||
// we don't know if it's meant for us or not
|
||||
webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: metricsNamespace + metricsWebhookSubsystem + "received",
|
||||
Help: "The total number of webhooks received",
|
||||
}, []string{"valid", "reason", "hostname", "controller_id"})
|
||||
|
||||
err = prometheus.Register(webhooksReceived)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "registering webhooks recv counter")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type GarmCollector struct {
|
||||
healthMetric *prometheus.Desc
|
||||
instanceMetric *prometheus.Desc
|
||||
|
||||
// pool metrics
|
||||
poolInfo *prometheus.Desc
|
||||
poolStatus *prometheus.Desc
|
||||
poolMaxRunners *prometheus.Desc
|
||||
poolMinIdleRunners *prometheus.Desc
|
||||
poolBootstrapTimeout *prometheus.Desc
|
||||
|
||||
// provider metrics
|
||||
providerInfo *prometheus.Desc
|
||||
|
||||
organizationInfo *prometheus.Desc
|
||||
organizationPoolManagerStatus *prometheus.Desc
|
||||
repositoryInfo *prometheus.Desc
|
||||
repositoryPoolManagerStatus *prometheus.Desc
|
||||
enterpriseInfo *prometheus.Desc
|
||||
enterprisePoolManagerStatus *prometheus.Desc
|
||||
|
||||
runner *runner.Runner
|
||||
cachedControllerInfo params.ControllerInfo
|
||||
}
|
||||
|
||||
func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) {
|
||||
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "fetching controller info")
|
||||
}
|
||||
return &GarmCollector{
|
||||
runner: r,
|
||||
instanceMetric: prometheus.NewDesc(
|
||||
metricsNamespace+metricsRunnerSubsystem+"status",
|
||||
"Status of the runner",
|
||||
[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil,
|
||||
),
|
||||
healthMetric: prometheus.NewDesc(
|
||||
metricsNamespace+"health",
|
||||
"Health of the runner",
|
||||
[]string{"hostname", "controller_id"}, nil,
|
||||
),
|
||||
poolInfo: prometheus.NewDesc(
|
||||
metricsNamespace+metricsPoolSubsystem+"info",
|
||||
"Information of the pool",
|
||||
[]string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil,
|
||||
),
|
||||
poolStatus: prometheus.NewDesc(
|
||||
metricsNamespace+metricsPoolSubsystem+"status",
|
||||
"Status of the pool",
|
||||
[]string{"id", "enabled"}, nil,
|
||||
),
|
||||
poolMaxRunners: prometheus.NewDesc(
|
||||
metricsNamespace+metricsPoolSubsystem+"max_runners",
|
||||
"Max runners of the pool",
|
||||
[]string{"id"}, nil,
|
||||
),
|
||||
poolMinIdleRunners: prometheus.NewDesc(
|
||||
metricsNamespace+metricsPoolSubsystem+"min_idle_runners",
|
||||
"Min idle runners of the pool",
|
||||
[]string{"id"}, nil,
|
||||
),
|
||||
poolBootstrapTimeout: prometheus.NewDesc(
|
||||
metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout",
|
||||
"Bootstrap timeout of the pool",
|
||||
[]string{"id"}, nil,
|
||||
),
|
||||
providerInfo: prometheus.NewDesc(
|
||||
metricsNamespace+metricsProviderSubsystem+"info",
|
||||
"Info of the provider",
|
||||
[]string{"name", "type", "description"}, nil,
|
||||
),
|
||||
organizationInfo: prometheus.NewDesc(
|
||||
metricsNamespace+metricsOrganizationSubsystem+"info",
|
||||
"Info of the organization",
|
||||
[]string{"name", "id"}, nil,
|
||||
),
|
||||
organizationPoolManagerStatus: prometheus.NewDesc(
|
||||
metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status",
|
||||
"Status of the organization pool manager",
|
||||
[]string{"name", "id", "running"}, nil,
|
||||
),
|
||||
repositoryInfo: prometheus.NewDesc(
|
||||
metricsNamespace+metricsRepositorySubsystem+"info",
|
||||
"Info of the organization",
|
||||
[]string{"name", "owner", "id"}, nil,
|
||||
),
|
||||
repositoryPoolManagerStatus: prometheus.NewDesc(
|
||||
metricsNamespace+metricsRepositorySubsystem+"pool_manager_status",
|
||||
"Status of the repository pool manager",
|
||||
[]string{"name", "id", "running"}, nil,
|
||||
),
|
||||
enterpriseInfo: prometheus.NewDesc(
|
||||
metricsNamespace+metricsEnterpriseSubsystem+"info",
|
||||
"Info of the organization",
|
||||
[]string{"name", "id"}, nil,
|
||||
),
|
||||
enterprisePoolManagerStatus: prometheus.NewDesc(
|
||||
metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status",
|
||||
"Status of the enterprise pool manager",
|
||||
[]string{"name", "id", "running"}, nil,
|
||||
),
|
||||
|
||||
cachedControllerInfo: controllerInfo,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- c.instanceMetric
|
||||
ch <- c.healthMetric
|
||||
ch <- c.poolInfo
|
||||
ch <- c.poolStatus
|
||||
ch <- c.poolMaxRunners
|
||||
ch <- c.poolMinIdleRunners
|
||||
ch <- c.providerInfo
|
||||
ch <- c.organizationInfo
|
||||
ch <- c.organizationPoolManagerStatus
|
||||
ch <- c.enterpriseInfo
|
||||
ch <- c.enterprisePoolManagerStatus
|
||||
}
|
||||
|
||||
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext())
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).Error("failed to get controller info")
|
||||
return
|
||||
}
|
||||
|
||||
c.CollectInstanceMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectHealthMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectPoolMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectProviderMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectOrganizationMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectRepositoryMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
c.CollectEnterpriseMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,50 +1,21 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectOrganizationMetric collects the metrics for the organization objects
|
||||
func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
var (
|
||||
OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsOrganizationSubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the organization",
|
||||
}, []string{"name", "id"})
|
||||
|
||||
organizations, err := c.runner.ListOrganizations(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
||||
return
|
||||
}
|
||||
|
||||
for _, organization := range organizations {
|
||||
|
||||
organizationInfo, err := prometheus.NewConstMetric(
|
||||
c.organizationInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
organization.Name, // label: name
|
||||
organization.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric")
|
||||
continue
|
||||
}
|
||||
ch <- organizationInfo
|
||||
|
||||
organizationPoolManagerStatus, err := prometheus.NewConstMetric(
|
||||
c.organizationPoolManagerStatus,
|
||||
prometheus.GaugeValue,
|
||||
bool2float64(organization.PoolManagerStatus.IsRunning),
|
||||
organization.Name, // label: name
|
||||
organization.ID, // label: id
|
||||
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric")
|
||||
continue
|
||||
}
|
||||
ch <- organizationPoolManagerStatus
|
||||
}
|
||||
}
|
||||
OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsOrganizationSubsystem,
|
||||
Name: "pool_manager_status",
|
||||
Help: "Status of the organization pool manager",
|
||||
}, []string{"name", "id", "running"})
|
||||
)
|
||||
|
|
|
|||
143
metrics/pool.go
143
metrics/pool.go
|
|
@ -1,121 +1,42 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectPoolMetric collects the metrics for the pool objects
|
||||
func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
var (
|
||||
PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the pool",
|
||||
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
|
||||
|
||||
pools, err := c.runner.ListAllPools(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
|
||||
return
|
||||
}
|
||||
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "status",
|
||||
Help: "Status of the pool",
|
||||
}, []string{"id", "enabled"})
|
||||
|
||||
type poolInfo struct {
|
||||
Name string
|
||||
Type string
|
||||
}
|
||||
PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "max_runners",
|
||||
Help: "Maximum number of runners in the pool",
|
||||
}, []string{"id"})
|
||||
|
||||
poolNames := make(map[string]poolInfo)
|
||||
for _, pool := range pools {
|
||||
if pool.EnterpriseName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.EnterpriseName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
} else if pool.OrgName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.OrgName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
} else {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.RepoName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
}
|
||||
PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "min_idle_runners",
|
||||
Help: "Minimum number of idle runners in the pool",
|
||||
}, []string{"id"})
|
||||
|
||||
var poolTags []string
|
||||
for _, tag := range pool.Tags {
|
||||
poolTags = append(poolTags, tag.Name)
|
||||
}
|
||||
|
||||
poolInfo, err := prometheus.NewConstMetric(
|
||||
c.poolInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
pool.ID, // label: id
|
||||
pool.Image, // label: image
|
||||
pool.Flavor, // label: flavor
|
||||
pool.Prefix, // label: prefix
|
||||
string(pool.OSType), // label: os_type
|
||||
string(pool.OSArch), // label: os_arch
|
||||
strings.Join(poolTags, ","), // label: tags
|
||||
pool.ProviderName, // label: provider
|
||||
poolNames[pool.ID].Name, // label: pool_owner
|
||||
poolNames[pool.ID].Type, // label: pool_type
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric")
|
||||
continue
|
||||
}
|
||||
ch <- poolInfo
|
||||
|
||||
poolStatus, err := prometheus.NewConstMetric(
|
||||
c.poolStatus,
|
||||
prometheus.GaugeValue,
|
||||
bool2float64(pool.Enabled),
|
||||
pool.ID, // label: id
|
||||
strconv.FormatBool(pool.Enabled), // label: enabled
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric")
|
||||
continue
|
||||
}
|
||||
ch <- poolStatus
|
||||
|
||||
poolMaxRunners, err := prometheus.NewConstMetric(
|
||||
c.poolMaxRunners,
|
||||
prometheus.GaugeValue,
|
||||
float64(pool.MaxRunners),
|
||||
pool.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric")
|
||||
continue
|
||||
}
|
||||
ch <- poolMaxRunners
|
||||
|
||||
poolMinIdleRunners, err := prometheus.NewConstMetric(
|
||||
c.poolMinIdleRunners,
|
||||
prometheus.GaugeValue,
|
||||
float64(pool.MinIdleRunners),
|
||||
pool.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric")
|
||||
continue
|
||||
}
|
||||
ch <- poolMinIdleRunners
|
||||
|
||||
poolBootstrapTimeout, err := prometheus.NewConstMetric(
|
||||
c.poolBootstrapTimeout,
|
||||
prometheus.GaugeValue,
|
||||
float64(pool.RunnerBootstrapTimeout),
|
||||
pool.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric")
|
||||
continue
|
||||
}
|
||||
ch <- poolBootstrapTimeout
|
||||
}
|
||||
}
|
||||
PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsPoolSubsystem,
|
||||
Name: "bootstrap_timeout",
|
||||
Help: "Runner bootstrap timeout in the pool",
|
||||
}, []string{"id"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,36 +1,14 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectPoolMetric collects the metrics for the pool objects
|
||||
func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
|
||||
providers, err := c.runner.ListProviders(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
||||
return
|
||||
}
|
||||
|
||||
for _, provider := range providers {
|
||||
|
||||
providerInfo, err := prometheus.NewConstMetric(
|
||||
c.providerInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
provider.Name, // label: name
|
||||
string(provider.ProviderType), // label: type
|
||||
provider.Description, // label: description
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric")
|
||||
continue
|
||||
}
|
||||
ch <- providerInfo
|
||||
}
|
||||
}
|
||||
var (
|
||||
ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsProviderSubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the organization",
|
||||
}, []string{"name", "type", "description"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,51 +1,21 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// CollectOrganizationMetric collects the metrics for the repository objects
|
||||
func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
||||
ctx := auth.GetAdminContext()
|
||||
var (
|
||||
RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsRepositorySubsystem,
|
||||
Name: "info",
|
||||
Help: "Info of the enterprise",
|
||||
}, []string{"name", "id"})
|
||||
|
||||
repositories, err := c.runner.ListRepositories(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
||||
return
|
||||
}
|
||||
|
||||
for _, repository := range repositories {
|
||||
|
||||
repositoryInfo, err := prometheus.NewConstMetric(
|
||||
c.repositoryInfo,
|
||||
prometheus.GaugeValue,
|
||||
1,
|
||||
repository.Name, // label: name
|
||||
repository.Owner, // label: owner
|
||||
repository.ID, // label: id
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric")
|
||||
continue
|
||||
}
|
||||
ch <- repositoryInfo
|
||||
|
||||
repositoryPoolManagerStatus, err := prometheus.NewConstMetric(
|
||||
c.repositoryPoolManagerStatus,
|
||||
prometheus.GaugeValue,
|
||||
bool2float64(repository.PoolManagerStatus.IsRunning),
|
||||
repository.Name, // label: name
|
||||
repository.ID, // label: id
|
||||
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
|
||||
)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric")
|
||||
continue
|
||||
}
|
||||
ch <- repositoryPoolManagerStatus
|
||||
}
|
||||
}
|
||||
RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsRepositorySubsystem,
|
||||
Name: "pool_manager_status",
|
||||
Help: "Status of the enterprise pool manager",
|
||||
}, []string{"name", "id", "running"})
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
package metrics
|
||||
|
||||
func bool2float64(b bool) float64 {
|
||||
func Bool2float64(b bool) float64 {
|
||||
if b {
|
||||
return 1
|
||||
}
|
||||
|
|
|
|||
12
metrics/webhooks.go
Normal file
12
metrics/webhooks.go
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
package metrics
|
||||
|
||||
import "github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
var (
|
||||
WebhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsWebhookSubsystem,
|
||||
Name: "received",
|
||||
Help: "The total number of webhooks received",
|
||||
}, []string{"valid", "reason"})
|
||||
)
|
||||
|
|
@ -60,7 +60,7 @@ type EnterpriseTestSuite struct {
|
|||
}
|
||||
|
||||
func (s *EnterpriseTestSuite) SetupTest() {
|
||||
adminCtx := auth.GetAdminContext()
|
||||
adminCtx := auth.GetAdminContext(context.Background())
|
||||
|
||||
// create testing sqlite database
|
||||
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
|
||||
|
|
|
|||
36
runner/metrics/enterprise.go
Normal file
36
runner/metrics/enterprise.go
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
// CollectOrganizationMetric collects the metrics for the enterprise objects
|
||||
func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.EnterpriseInfo.Reset()
|
||||
metrics.EnterprisePoolManagerStatus.Reset()
|
||||
|
||||
enterprises, err := r.ListEnterprises(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, enterprise := range enterprises {
|
||||
metrics.EnterpriseInfo.WithLabelValues(
|
||||
enterprise.Name, // label: name
|
||||
enterprise.ID, // label: id
|
||||
).Set(1)
|
||||
|
||||
metrics.EnterprisePoolManagerStatus.WithLabelValues(
|
||||
enterprise.Name, // label: name
|
||||
enterprise.ID, // label: id
|
||||
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
|
||||
).Set(metrics.Bool2float64(enterprise.PoolManagerStatus.IsRunning))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
20
runner/metrics/health.go
Normal file
20
runner/metrics/health.go
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/params"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||
metrics.GarmHealth.WithLabelValues(
|
||||
controllerInfo.MetadataURL, // label: metadata_url
|
||||
controllerInfo.CallbackURL, // label: callback_url
|
||||
controllerInfo.WebhookURL, // label: webhook_url
|
||||
controllerInfo.ControllerWebhookURL, // label: controller_webhook_url
|
||||
controllerInfo.ControllerID.String(), // label: controller_id
|
||||
).Set(1)
|
||||
return nil
|
||||
}
|
||||
69
runner/metrics/instance.go
Normal file
69
runner/metrics/instance.go
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
// CollectInstanceMetric collects the metrics for the runner instances
|
||||
// reflecting the statuses and the pool they belong to.
|
||||
func CollectInstanceMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.InstanceStatus.Reset()
|
||||
|
||||
instances, err := r.ListAllInstances(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
pools, err := r.ListAllPools(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
type poolInfo struct {
|
||||
Name string
|
||||
Type string
|
||||
ProviderName string
|
||||
}
|
||||
|
||||
poolNames := make(map[string]poolInfo)
|
||||
for _, pool := range pools {
|
||||
if pool.EnterpriseName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.EnterpriseName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
} else if pool.OrgName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.OrgName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
} else {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.RepoName,
|
||||
Type: string(pool.PoolType()),
|
||||
ProviderName: pool.ProviderName,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, instance := range instances {
|
||||
|
||||
metrics.InstanceStatus.WithLabelValues(
|
||||
instance.Name, // label: name
|
||||
string(instance.Status), // label: status
|
||||
string(instance.RunnerStatus), // label: runner_status
|
||||
poolNames[instance.PoolID].Name, // label: pool_owner
|
||||
poolNames[instance.PoolID].Type, // label: pool_type
|
||||
instance.PoolID, // label: pool_id
|
||||
poolNames[instance.PoolID].ProviderName, // label: provider
|
||||
).Set(1)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
90
runner/metrics/metrics.go
Normal file
90
runner/metrics/metrics.go
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/cloudbase/garm/auth"
|
||||
"github.com/cloudbase/garm/params"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
func CollectObjectMetric(ctx context.Context, r *runner.Runner, duration time.Duration) {
|
||||
ctx = auth.GetAdminContext(ctx)
|
||||
|
||||
// get controller info for health metrics
|
||||
controllerInfo, err := r.GetControllerInfo(ctx)
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info")
|
||||
}
|
||||
|
||||
// we do not want to wait until the first ticker happens
|
||||
// for that we start an initial collection immediately
|
||||
slog.InfoContext(ctx, "collecting metrics")
|
||||
if err := collectMetrics(ctx, r, controllerInfo); err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics")
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(duration)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
slog.InfoContext(ctx, "collecting metrics")
|
||||
|
||||
if err := collectMetrics(ctx, r, controllerInfo); err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics")
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func collectMetrics(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||
slog.DebugContext(ctx, "collecting organization metrics")
|
||||
err := CollectOrganizationMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting enterprise metrics")
|
||||
err = CollectEnterpriseMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting repository metrics")
|
||||
err = CollectRepositoryMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting provider metrics")
|
||||
err = CollectProviderMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting pool metrics")
|
||||
err = CollectPoolMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting instance metrics")
|
||||
err = CollectInstanceMetric(ctx, r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.DebugContext(ctx, "collecting health metrics")
|
||||
err = CollectHealthMetric(ctx, r, controllerInfo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
36
runner/metrics/organization.go
Normal file
36
runner/metrics/organization.go
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
// CollectOrganizationMetric collects the metrics for the organization objects
|
||||
func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.OrganizationInfo.Reset()
|
||||
metrics.OrganizationPoolManagerStatus.Reset()
|
||||
|
||||
organizations, err := r.ListOrganizations(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, organization := range organizations {
|
||||
metrics.OrganizationInfo.WithLabelValues(
|
||||
organization.Name, // label: name
|
||||
organization.ID, // label: id
|
||||
).Set(1)
|
||||
|
||||
metrics.OrganizationPoolManagerStatus.WithLabelValues(
|
||||
organization.Name, // label: name
|
||||
organization.ID, // label: id
|
||||
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
|
||||
).Set(metrics.Bool2float64(organization.PoolManagerStatus.IsRunning))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
87
runner/metrics/pool.go
Normal file
87
runner/metrics/pool.go
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
// CollectPoolMetric collects the metrics for the pool objects
|
||||
func CollectPoolMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.PoolInfo.Reset()
|
||||
metrics.PoolStatus.Reset()
|
||||
metrics.PoolMaxRunners.Reset()
|
||||
metrics.PoolMinIdleRunners.Reset()
|
||||
metrics.PoolBootstrapTimeout.Reset()
|
||||
|
||||
pools, err := r.ListAllPools(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
type poolInfo struct {
|
||||
Name string
|
||||
Type string
|
||||
}
|
||||
|
||||
poolNames := make(map[string]poolInfo)
|
||||
for _, pool := range pools {
|
||||
if pool.EnterpriseName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.EnterpriseName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
} else if pool.OrgName != "" {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.OrgName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
} else {
|
||||
poolNames[pool.ID] = poolInfo{
|
||||
Name: pool.RepoName,
|
||||
Type: string(pool.PoolType()),
|
||||
}
|
||||
}
|
||||
|
||||
var poolTags []string
|
||||
for _, tag := range pool.Tags {
|
||||
poolTags = append(poolTags, tag.Name)
|
||||
}
|
||||
|
||||
metrics.PoolInfo.WithLabelValues(
|
||||
pool.ID, // label: id
|
||||
pool.Image, // label: image
|
||||
pool.Flavor, // label: flavor
|
||||
pool.Prefix, // label: prefix
|
||||
string(pool.OSType), // label: os_type
|
||||
string(pool.OSArch), // label: os_arch
|
||||
strings.Join(poolTags, ","), // label: tags
|
||||
pool.ProviderName, // label: provider
|
||||
poolNames[pool.ID].Name, // label: pool_owner
|
||||
poolNames[pool.ID].Type, // label: pool_type
|
||||
).Set(1)
|
||||
|
||||
metrics.PoolStatus.WithLabelValues(
|
||||
pool.ID, // label: id
|
||||
strconv.FormatBool(pool.Enabled), // label: enabled
|
||||
).Set(metrics.Bool2float64(pool.Enabled))
|
||||
|
||||
metrics.PoolMaxRunners.WithLabelValues(
|
||||
pool.ID, // label: id
|
||||
).Set(float64(pool.MaxRunners))
|
||||
|
||||
metrics.PoolMinIdleRunners.WithLabelValues(
|
||||
pool.ID, // label: id
|
||||
).Set(float64(pool.MinIdleRunners))
|
||||
|
||||
metrics.PoolBootstrapTimeout.WithLabelValues(
|
||||
pool.ID, // label: id
|
||||
).Set(float64(pool.RunnerBootstrapTimeout))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
27
runner/metrics/provider.go
Normal file
27
runner/metrics/provider.go
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
func CollectProviderMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.ProviderInfo.Reset()
|
||||
|
||||
providers, err := r.ListProviders(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, provider := range providers {
|
||||
metrics.ProviderInfo.WithLabelValues(
|
||||
provider.Name, // label: name
|
||||
string(provider.ProviderType), // label: type
|
||||
provider.Description, // label: description
|
||||
).Set(1)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
35
runner/metrics/repository.go
Normal file
35
runner/metrics/repository.go
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
|
||||
"github.com/cloudbase/garm/metrics"
|
||||
"github.com/cloudbase/garm/runner"
|
||||
)
|
||||
|
||||
func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error {
|
||||
|
||||
// reset metrics
|
||||
metrics.EnterpriseInfo.Reset()
|
||||
metrics.EnterprisePoolManagerStatus.Reset()
|
||||
|
||||
repositories, err := r.ListRepositories(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, repository := range repositories {
|
||||
metrics.EnterpriseInfo.WithLabelValues(
|
||||
repository.Name, // label: name
|
||||
repository.ID, // label: id
|
||||
).Set(1)
|
||||
|
||||
metrics.EnterprisePoolManagerStatus.WithLabelValues(
|
||||
repository.Name, // label: name
|
||||
repository.ID, // label: id
|
||||
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
|
||||
).Set(metrics.Bool2float64(repository.PoolManagerStatus.IsRunning))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
@ -60,7 +60,7 @@ type OrgTestSuite struct {
|
|||
}
|
||||
|
||||
func (s *OrgTestSuite) SetupTest() {
|
||||
adminCtx := auth.GetAdminContext()
|
||||
adminCtx := auth.GetAdminContext(context.Background())
|
||||
|
||||
// create testing sqlite database
|
||||
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ type PoolTestSuite struct {
|
|||
}
|
||||
|
||||
func (s *PoolTestSuite) SetupTest() {
|
||||
adminCtx := auth.GetAdminContext()
|
||||
adminCtx := auth.GetAdminContext(context.Background())
|
||||
|
||||
// create testing sqlite database
|
||||
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ type RepoTestSuite struct {
|
|||
}
|
||||
|
||||
func (s *RepoTestSuite) SetupTest() {
|
||||
adminCtx := auth.GetAdminContext()
|
||||
adminCtx := auth.GetAdminContext(context.Background())
|
||||
|
||||
// create testing sqlite database
|
||||
dbCfg := garmTesting.GetTestSqliteDBConfig(s.T())
|
||||
|
|
@ -90,7 +90,7 @@ func (s *RepoTestSuite) SetupTest() {
|
|||
var minIdleRunners uint = 20
|
||||
providerMock := runnerCommonMocks.NewProvider(s.T())
|
||||
fixtures := &RepoTestFixtures{
|
||||
AdminContext: auth.GetAdminContext(),
|
||||
AdminContext: auth.GetAdminContext(context.Background()),
|
||||
Store: db,
|
||||
StoreRepos: repos,
|
||||
Providers: map[string]common.Provider{
|
||||
|
|
|
|||
|
|
@ -27,4 +27,7 @@ const (
|
|||
|
||||
// GithubDefaultUploadBaseURL is the default URL for GitHub uploads.
|
||||
GithubDefaultUploadBaseURL = "https://uploads.github.com/"
|
||||
|
||||
// metrics data update interval
|
||||
DefaultMetricsUpdateInterval = 60 * time.Second
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue