chore: refactor metrics endpoint
Refactoring is needed to make the metrics package usable from within the runner package for further metrics. This change also makes the metric collector independent of requests to the /metrics endpoint. Signed-off-by: Mario Constanti <mario.constanti@mercedes-benz.com>
This commit is contained in:
parent
f68cf98d67
commit
1d8d9459eb
21 changed files with 564 additions and 590 deletions
|
|
@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
|
|
||||||
controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
|
|
||||||
// If labels are empty, not attempt will be made to record webhook.
|
|
||||||
return []string{}
|
|
||||||
}
|
|
||||||
return []string{
|
|
||||||
valid, reason,
|
|
||||||
controllerInfo.Hostname, controllerInfo.ControllerID.String(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
|
func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
|
||||||
defer r.Body.Close()
|
defer r.Body.Close()
|
||||||
body, err := io.ReadAll(r.Body)
|
body, err := io.ReadAll(r.Body)
|
||||||
|
|
@ -119,31 +106,47 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
|
||||||
signature := r.Header.Get("X-Hub-Signature-256")
|
signature := r.Header.Get("X-Hub-Signature-256")
|
||||||
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
|
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
|
||||||
|
|
||||||
var labelValues []string
|
controllerInfo, err := a.r.GetControllerInfo(ctx)
|
||||||
defer func() {
|
if err != nil {
|
||||||
if len(labelValues) == 0 {
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
|
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
|
||||||
if errors.Is(err, gErrors.ErrNotFound) {
|
if errors.Is(err, gErrors.ErrNotFound) {
|
||||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
|
metrics.WebhooksReceived.WithLabelValues(
|
||||||
|
"false", // label: valid
|
||||||
|
"owner_unknown", // label: reason
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: controller_id
|
||||||
|
).Inc()
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
|
||||||
return
|
return
|
||||||
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
|
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
|
||||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
|
metrics.WebhooksReceived.WithLabelValues(
|
||||||
|
"false", // label: valid
|
||||||
|
"signature_invalid", // label: reason
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: controller_id
|
||||||
|
).Inc()
|
||||||
} else {
|
} else {
|
||||||
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
|
metrics.WebhooksReceived.WithLabelValues(
|
||||||
|
"false", // label: valid
|
||||||
|
"unknown", // label: reason
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: controller_id
|
||||||
|
).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
handleError(ctx, w, err)
|
handleError(ctx, w, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
labelValues = a.webhookMetricLabelValues(ctx, "true", "")
|
metrics.WebhooksReceived.WithLabelValues(
|
||||||
|
"true", // label: valid
|
||||||
|
"", // label: reason
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: controller_id
|
||||||
|
).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
|
func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|
|
||||||
|
|
@ -35,8 +35,8 @@ import (
|
||||||
"github.com/cloudbase/garm/config"
|
"github.com/cloudbase/garm/config"
|
||||||
"github.com/cloudbase/garm/database"
|
"github.com/cloudbase/garm/database"
|
||||||
"github.com/cloudbase/garm/database/common"
|
"github.com/cloudbase/garm/database/common"
|
||||||
"github.com/cloudbase/garm/metrics"
|
|
||||||
"github.com/cloudbase/garm/runner"
|
"github.com/cloudbase/garm/runner"
|
||||||
|
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
|
||||||
garmUtil "github.com/cloudbase/garm/util"
|
garmUtil "github.com/cloudbase/garm/util"
|
||||||
"github.com/cloudbase/garm/util/appdefaults"
|
"github.com/cloudbase/garm/util/appdefaults"
|
||||||
"github.com/cloudbase/garm/websocket"
|
"github.com/cloudbase/garm/websocket"
|
||||||
|
|
@ -214,13 +214,13 @@ func main() {
|
||||||
|
|
||||||
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
|
router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)
|
||||||
|
|
||||||
|
// start the metrics collector
|
||||||
if cfg.Metrics.Enable {
|
if cfg.Metrics.Enable {
|
||||||
slog.InfoContext(ctx, "registering prometheus metrics collectors")
|
|
||||||
if err := metrics.RegisterCollectors(runner); err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
slog.InfoContext(ctx, "setting up metric routes")
|
slog.InfoContext(ctx, "setting up metric routes")
|
||||||
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
|
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)
|
||||||
|
|
||||||
|
slog.InfoContext(ctx, "start metrics collection")
|
||||||
|
runnerMetrics.CollectObjectMetric(runner, time.NewTicker(cfg.Metrics.Period))
|
||||||
}
|
}
|
||||||
|
|
||||||
if cfg.Default.DebugServer {
|
if cfg.Default.DebugServer {
|
||||||
|
|
|
||||||
|
|
@ -456,8 +456,9 @@ func (t *TLSConfig) Validate() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
type Metrics struct {
|
type Metrics struct {
|
||||||
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
|
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
|
||||||
Enable bool `toml:"enable" json:"enable"`
|
Enable bool `toml:"enable" json:"enable"`
|
||||||
|
Period time.Duration `toml:"period" json:"period"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// APIServer holds configuration for the API server
|
// APIServer holds configuration for the API server
|
||||||
|
|
|
||||||
|
|
@ -1,50 +1,21 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectOrganizationMetric collects the metrics for the enterprise objects
|
var (
|
||||||
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
ctx := auth.GetAdminContext()
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsEnterpriseSubsystem,
|
||||||
|
Name: "info",
|
||||||
|
Help: "Info of the enterprise",
|
||||||
|
}, []string{"name", "id"})
|
||||||
|
|
||||||
enterprises, err := c.runner.ListEnterprises(ctx)
|
EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
if err != nil {
|
Namespace: metricsNamespace,
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
Subsystem: metricsEnterpriseSubsystem,
|
||||||
return
|
Name: "pool_manager_status",
|
||||||
}
|
Help: "Status of the enterprise pool manager",
|
||||||
|
}, []string{"name", "id", "running"})
|
||||||
for _, enterprise := range enterprises {
|
)
|
||||||
|
|
||||||
enterpriseInfo, err := prometheus.NewConstMetric(
|
|
||||||
c.enterpriseInfo,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
enterprise.Name, // label: name
|
|
||||||
enterprise.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- enterpriseInfo
|
|
||||||
|
|
||||||
enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
|
|
||||||
c.enterprisePoolManagerStatus,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
bool2float64(enterprise.PoolManagerStatus.IsRunning),
|
|
||||||
enterprise.Name, // label: name
|
|
||||||
enterprise.ID, // label: id
|
|
||||||
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- enterprisePoolManagerStatus
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,22 +1,13 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
var (
|
||||||
m, err := prometheus.NewConstMetric(
|
GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
c.healthMetric,
|
Namespace: metricsNamespace,
|
||||||
prometheus.GaugeValue,
|
Name: "health",
|
||||||
1,
|
Help: "Health of the garm",
|
||||||
hostname,
|
}, []string{"hostname", "controller_id", "metadata_url", "callback_url", "webhook_url", "controller_webhook_url"})
|
||||||
controllerID,
|
)
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).Error("error on creating health metric")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
ch <- m
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,79 +1,14 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectInstanceMetric collects the metrics for the runner instances
|
var (
|
||||||
// reflecting the statuses and the pool they belong to.
|
InstanceStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
Namespace: metricsNamespace,
|
||||||
ctx := auth.GetAdminContext()
|
Subsystem: metricsRunnerSubsystem,
|
||||||
|
Name: "status",
|
||||||
instances, err := c.runner.ListAllInstances(ctx)
|
Help: "Status of the instance",
|
||||||
if err != nil {
|
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"})
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances")
|
)
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
pools, err := c.runner.ListAllPools(ctx)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
type poolInfo struct {
|
|
||||||
Name string
|
|
||||||
Type string
|
|
||||||
ProviderName string
|
|
||||||
}
|
|
||||||
|
|
||||||
poolNames := make(map[string]poolInfo)
|
|
||||||
for _, pool := range pools {
|
|
||||||
if pool.EnterpriseName != "" {
|
|
||||||
poolNames[pool.ID] = poolInfo{
|
|
||||||
Name: pool.EnterpriseName,
|
|
||||||
Type: string(pool.PoolType()),
|
|
||||||
ProviderName: pool.ProviderName,
|
|
||||||
}
|
|
||||||
} else if pool.OrgName != "" {
|
|
||||||
poolNames[pool.ID] = poolInfo{
|
|
||||||
Name: pool.OrgName,
|
|
||||||
Type: string(pool.PoolType()),
|
|
||||||
ProviderName: pool.ProviderName,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
poolNames[pool.ID] = poolInfo{
|
|
||||||
Name: pool.RepoName,
|
|
||||||
Type: string(pool.PoolType()),
|
|
||||||
ProviderName: pool.ProviderName,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, instance := range instances {
|
|
||||||
|
|
||||||
m, err := prometheus.NewConstMetric(
|
|
||||||
c.instanceMetric,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
instance.Name, // label: name
|
|
||||||
string(instance.Status), // label: status
|
|
||||||
string(instance.RunnerStatus), // label: runner_status
|
|
||||||
poolNames[instance.PoolID].Name, // label: pool_owner
|
|
||||||
poolNames[instance.PoolID].Type, // label: pool_type
|
|
||||||
instance.PoolID, // label: pool_id
|
|
||||||
hostname, // label: hostname
|
|
||||||
controllerID, // label: controller_id
|
|
||||||
poolNames[instance.PoolID].ProviderName, // label: provider
|
|
||||||
)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- m
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,206 +1,41 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/cloudbase/garm/params"
|
|
||||||
"github.com/cloudbase/garm/runner"
|
|
||||||
|
|
||||||
"github.com/pkg/errors"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
const metricsNamespace = "garm_"
|
const metricsNamespace = "garm"
|
||||||
const metricsRunnerSubsystem = "runner_"
|
const metricsRunnerSubsystem = "runner"
|
||||||
const metricsPoolSubsystem = "pool_"
|
const metricsPoolSubsystem = "pool"
|
||||||
const metricsProviderSubsystem = "provider_"
|
const metricsProviderSubsystem = "provider"
|
||||||
const metricsOrganizationSubsystem = "organization_"
|
const metricsOrganizationSubsystem = "organization"
|
||||||
const metricsRepositorySubsystem = "repository_"
|
const metricsRepositorySubsystem = "repository"
|
||||||
const metricsEnterpriseSubsystem = "enterprise_"
|
const metricsEnterpriseSubsystem = "enterprise"
|
||||||
const metricsWebhookSubsystem = "webhook_"
|
const metricsWebhookSubsystem = "webhook"
|
||||||
|
|
||||||
var webhooksReceived *prometheus.CounterVec = nil
|
|
||||||
|
|
||||||
// RecordWebhookWithLabels will increment a webhook metric identified by specific
|
|
||||||
// values. If metrics are disabled, this function is a noop.
|
|
||||||
func RecordWebhookWithLabels(lvs ...string) error {
|
|
||||||
if webhooksReceived == nil {
|
|
||||||
// not registered. Noop
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
counter, err := webhooksReceived.GetMetricWithLabelValues(lvs...)
|
|
||||||
if err != nil {
|
|
||||||
return errors.Wrap(err, "recording metric")
|
|
||||||
}
|
|
||||||
counter.Inc()
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func RegisterCollectors(runner *runner.Runner) error {
|
|
||||||
if webhooksReceived != nil {
|
|
||||||
// Already registered.
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
garmCollector, err := NewGarmCollector(runner)
|
|
||||||
if err != nil {
|
|
||||||
return errors.Wrap(err, "getting collector")
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := prometheus.Register(garmCollector); err != nil {
|
|
||||||
return errors.Wrap(err, "registering collector")
|
|
||||||
}
|
|
||||||
|
|
||||||
// metric to count total webhooks received
|
|
||||||
// at this point the webhook is not yet authenticated and
|
|
||||||
// we don't know if it's meant for us or not
|
|
||||||
webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
||||||
Name: metricsNamespace + metricsWebhookSubsystem + "received",
|
|
||||||
Help: "The total number of webhooks received",
|
|
||||||
}, []string{"valid", "reason", "hostname", "controller_id"})
|
|
||||||
|
|
||||||
err = prometheus.Register(webhooksReceived)
|
|
||||||
if err != nil {
|
|
||||||
return errors.Wrap(err, "registering webhooks recv counter")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type GarmCollector struct {
|
|
||||||
healthMetric *prometheus.Desc
|
|
||||||
instanceMetric *prometheus.Desc
|
|
||||||
|
|
||||||
// pool metrics
|
|
||||||
poolInfo *prometheus.Desc
|
|
||||||
poolStatus *prometheus.Desc
|
|
||||||
poolMaxRunners *prometheus.Desc
|
|
||||||
poolMinIdleRunners *prometheus.Desc
|
|
||||||
poolBootstrapTimeout *prometheus.Desc
|
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// runner metrics
|
||||||
|
prometheus.MustRegister(InstanceStatus)
|
||||||
|
// organization metrics
|
||||||
|
prometheus.MustRegister(OrganizationInfo)
|
||||||
|
prometheus.MustRegister(OrganizationPoolManagerStatus)
|
||||||
|
// enterprise metrics
|
||||||
|
prometheus.MustRegister(EnterpriseInfo)
|
||||||
|
prometheus.MustRegister(EnterprisePoolManagerStatus)
|
||||||
|
// repository metrics
|
||||||
|
prometheus.MustRegister(RepositoryInfo)
|
||||||
|
prometheus.MustRegister(RepositoryPoolManagerStatus)
|
||||||
// provider metrics
|
// provider metrics
|
||||||
providerInfo *prometheus.Desc
|
prometheus.MustRegister(ProviderInfo)
|
||||||
|
// pool metrics
|
||||||
|
prometheus.MustRegister(PoolInfo)
|
||||||
|
prometheus.MustRegister(PoolStatus)
|
||||||
|
prometheus.MustRegister(PoolMaxRunners)
|
||||||
|
prometheus.MustRegister(PoolMinIdleRunners)
|
||||||
|
prometheus.MustRegister(PoolBootstrapTimeout)
|
||||||
|
// health metrics
|
||||||
|
prometheus.MustRegister(GarmHealth)
|
||||||
|
// webhook metrics
|
||||||
|
prometheus.MustRegister(WebhooksReceived)
|
||||||
|
|
||||||
organizationInfo *prometheus.Desc
|
|
||||||
organizationPoolManagerStatus *prometheus.Desc
|
|
||||||
repositoryInfo *prometheus.Desc
|
|
||||||
repositoryPoolManagerStatus *prometheus.Desc
|
|
||||||
enterpriseInfo *prometheus.Desc
|
|
||||||
enterprisePoolManagerStatus *prometheus.Desc
|
|
||||||
|
|
||||||
runner *runner.Runner
|
|
||||||
cachedControllerInfo params.ControllerInfo
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewGarmCollector(r *runner.Runner) (*GarmCollector, error) {
|
|
||||||
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.Wrap(err, "fetching controller info")
|
|
||||||
}
|
|
||||||
return &GarmCollector{
|
|
||||||
runner: r,
|
|
||||||
instanceMetric: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsRunnerSubsystem+"status",
|
|
||||||
"Status of the runner",
|
|
||||||
[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id", "provider"}, nil,
|
|
||||||
),
|
|
||||||
healthMetric: prometheus.NewDesc(
|
|
||||||
metricsNamespace+"health",
|
|
||||||
"Health of the runner",
|
|
||||||
[]string{"hostname", "controller_id"}, nil,
|
|
||||||
),
|
|
||||||
poolInfo: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsPoolSubsystem+"info",
|
|
||||||
"Information of the pool",
|
|
||||||
[]string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"}, nil,
|
|
||||||
),
|
|
||||||
poolStatus: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsPoolSubsystem+"status",
|
|
||||||
"Status of the pool",
|
|
||||||
[]string{"id", "enabled"}, nil,
|
|
||||||
),
|
|
||||||
poolMaxRunners: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsPoolSubsystem+"max_runners",
|
|
||||||
"Max runners of the pool",
|
|
||||||
[]string{"id"}, nil,
|
|
||||||
),
|
|
||||||
poolMinIdleRunners: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsPoolSubsystem+"min_idle_runners",
|
|
||||||
"Min idle runners of the pool",
|
|
||||||
[]string{"id"}, nil,
|
|
||||||
),
|
|
||||||
poolBootstrapTimeout: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsPoolSubsystem+"bootstrap_timeout",
|
|
||||||
"Bootstrap timeout of the pool",
|
|
||||||
[]string{"id"}, nil,
|
|
||||||
),
|
|
||||||
providerInfo: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsProviderSubsystem+"info",
|
|
||||||
"Info of the provider",
|
|
||||||
[]string{"name", "type", "description"}, nil,
|
|
||||||
),
|
|
||||||
organizationInfo: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsOrganizationSubsystem+"info",
|
|
||||||
"Info of the organization",
|
|
||||||
[]string{"name", "id"}, nil,
|
|
||||||
),
|
|
||||||
organizationPoolManagerStatus: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsOrganizationSubsystem+"pool_manager_status",
|
|
||||||
"Status of the organization pool manager",
|
|
||||||
[]string{"name", "id", "running"}, nil,
|
|
||||||
),
|
|
||||||
repositoryInfo: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsRepositorySubsystem+"info",
|
|
||||||
"Info of the organization",
|
|
||||||
[]string{"name", "owner", "id"}, nil,
|
|
||||||
),
|
|
||||||
repositoryPoolManagerStatus: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsRepositorySubsystem+"pool_manager_status",
|
|
||||||
"Status of the repository pool manager",
|
|
||||||
[]string{"name", "id", "running"}, nil,
|
|
||||||
),
|
|
||||||
enterpriseInfo: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsEnterpriseSubsystem+"info",
|
|
||||||
"Info of the organization",
|
|
||||||
[]string{"name", "id"}, nil,
|
|
||||||
),
|
|
||||||
enterprisePoolManagerStatus: prometheus.NewDesc(
|
|
||||||
metricsNamespace+metricsEnterpriseSubsystem+"pool_manager_status",
|
|
||||||
"Status of the enterprise pool manager",
|
|
||||||
[]string{"name", "id", "running"}, nil,
|
|
||||||
),
|
|
||||||
|
|
||||||
cachedControllerInfo: controllerInfo,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
||||||
ch <- c.instanceMetric
|
|
||||||
ch <- c.healthMetric
|
|
||||||
ch <- c.poolInfo
|
|
||||||
ch <- c.poolStatus
|
|
||||||
ch <- c.poolMaxRunners
|
|
||||||
ch <- c.poolMinIdleRunners
|
|
||||||
ch <- c.providerInfo
|
|
||||||
ch <- c.organizationInfo
|
|
||||||
ch <- c.organizationPoolManagerStatus
|
|
||||||
ch <- c.enterpriseInfo
|
|
||||||
ch <- c.enterprisePoolManagerStatus
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
|
|
||||||
controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext())
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).Error("failed to get controller info")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
c.CollectInstanceMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectHealthMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectPoolMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectProviderMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectOrganizationMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectRepositoryMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
c.CollectEnterpriseMetric(ch, controllerInfo.Hostname, controllerInfo.ControllerID.String())
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,50 +1,21 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectOrganizationMetric collects the metrics for the organization objects
|
var (
|
||||||
func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
OrganizationInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
ctx := auth.GetAdminContext()
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsOrganizationSubsystem,
|
||||||
|
Name: "info",
|
||||||
|
Help: "Info of the organization",
|
||||||
|
}, []string{"name", "id"})
|
||||||
|
|
||||||
organizations, err := c.runner.ListOrganizations(ctx)
|
OrganizationPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
if err != nil {
|
Namespace: metricsNamespace,
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
Subsystem: metricsOrganizationSubsystem,
|
||||||
return
|
Name: "pool_manager_status",
|
||||||
}
|
Help: "Status of the organization pool manager",
|
||||||
|
}, []string{"name", "id", "running"})
|
||||||
for _, organization := range organizations {
|
)
|
||||||
|
|
||||||
organizationInfo, err := prometheus.NewConstMetric(
|
|
||||||
c.organizationInfo,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
organization.Name, // label: name
|
|
||||||
organization.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- organizationInfo
|
|
||||||
|
|
||||||
organizationPoolManagerStatus, err := prometheus.NewConstMetric(
|
|
||||||
c.organizationPoolManagerStatus,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
bool2float64(organization.PoolManagerStatus.IsRunning),
|
|
||||||
organization.Name, // label: name
|
|
||||||
organization.ID, // label: id
|
|
||||||
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- organizationPoolManagerStatus
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
143
metrics/pool.go
143
metrics/pool.go
|
|
@ -1,121 +1,42 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectPoolMetric collects the metrics for the pool objects
|
var (
|
||||||
func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
PoolInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
ctx := auth.GetAdminContext()
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsPoolSubsystem,
|
||||||
|
Name: "info",
|
||||||
|
Help: "Info of the pool",
|
||||||
|
}, []string{"id", "image", "flavor", "prefix", "os_type", "os_arch", "tags", "provider", "pool_owner", "pool_type"})
|
||||||
|
|
||||||
pools, err := c.runner.ListAllPools(ctx)
|
PoolStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
if err != nil {
|
Namespace: metricsNamespace,
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools")
|
Subsystem: metricsPoolSubsystem,
|
||||||
return
|
Name: "status",
|
||||||
}
|
Help: "Status of the pool",
|
||||||
|
}, []string{"id", "enabled"})
|
||||||
|
|
||||||
type poolInfo struct {
|
PoolMaxRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
Name string
|
Namespace: metricsNamespace,
|
||||||
Type string
|
Subsystem: metricsPoolSubsystem,
|
||||||
}
|
Name: "max_runners",
|
||||||
|
Help: "Maximum number of runners in the pool",
|
||||||
|
}, []string{"id"})
|
||||||
|
|
||||||
poolNames := make(map[string]poolInfo)
|
PoolMinIdleRunners = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
for _, pool := range pools {
|
Namespace: metricsNamespace,
|
||||||
if pool.EnterpriseName != "" {
|
Subsystem: metricsPoolSubsystem,
|
||||||
poolNames[pool.ID] = poolInfo{
|
Name: "min_idle_runners",
|
||||||
Name: pool.EnterpriseName,
|
Help: "Minimum number of idle runners in the pool",
|
||||||
Type: string(pool.PoolType()),
|
}, []string{"id"})
|
||||||
}
|
|
||||||
} else if pool.OrgName != "" {
|
|
||||||
poolNames[pool.ID] = poolInfo{
|
|
||||||
Name: pool.OrgName,
|
|
||||||
Type: string(pool.PoolType()),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
poolNames[pool.ID] = poolInfo{
|
|
||||||
Name: pool.RepoName,
|
|
||||||
Type: string(pool.PoolType()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var poolTags []string
|
PoolBootstrapTimeout = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
for _, tag := range pool.Tags {
|
Namespace: metricsNamespace,
|
||||||
poolTags = append(poolTags, tag.Name)
|
Subsystem: metricsPoolSubsystem,
|
||||||
}
|
Name: "bootstrap_timeout",
|
||||||
|
Help: "Runner bootstrap timeout in the pool",
|
||||||
poolInfo, err := prometheus.NewConstMetric(
|
}, []string{"id"})
|
||||||
c.poolInfo,
|
)
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
pool.ID, // label: id
|
|
||||||
pool.Image, // label: image
|
|
||||||
pool.Flavor, // label: flavor
|
|
||||||
pool.Prefix, // label: prefix
|
|
||||||
string(pool.OSType), // label: os_type
|
|
||||||
string(pool.OSArch), // label: os_arch
|
|
||||||
strings.Join(poolTags, ","), // label: tags
|
|
||||||
pool.ProviderName, // label: provider
|
|
||||||
poolNames[pool.ID].Name, // label: pool_owner
|
|
||||||
poolNames[pool.ID].Type, // label: pool_type
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- poolInfo
|
|
||||||
|
|
||||||
poolStatus, err := prometheus.NewConstMetric(
|
|
||||||
c.poolStatus,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
bool2float64(pool.Enabled),
|
|
||||||
pool.ID, // label: id
|
|
||||||
strconv.FormatBool(pool.Enabled), // label: enabled
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- poolStatus
|
|
||||||
|
|
||||||
poolMaxRunners, err := prometheus.NewConstMetric(
|
|
||||||
c.poolMaxRunners,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
float64(pool.MaxRunners),
|
|
||||||
pool.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- poolMaxRunners
|
|
||||||
|
|
||||||
poolMinIdleRunners, err := prometheus.NewConstMetric(
|
|
||||||
c.poolMinIdleRunners,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
float64(pool.MinIdleRunners),
|
|
||||||
pool.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- poolMinIdleRunners
|
|
||||||
|
|
||||||
poolBootstrapTimeout, err := prometheus.NewConstMetric(
|
|
||||||
c.poolBootstrapTimeout,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
float64(pool.RunnerBootstrapTimeout),
|
|
||||||
pool.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- poolBootstrapTimeout
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,36 +1,14 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectPoolMetric collects the metrics for the pool objects
|
var (
|
||||||
func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
ProviderInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
ctx := auth.GetAdminContext()
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsProviderSubsystem,
|
||||||
providers, err := c.runner.ListProviders(ctx)
|
Name: "info",
|
||||||
if err != nil {
|
Help: "Info of the organization",
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
}, []string{"name", "type", "description"})
|
||||||
return
|
)
|
||||||
}
|
|
||||||
|
|
||||||
for _, provider := range providers {
|
|
||||||
|
|
||||||
providerInfo, err := prometheus.NewConstMetric(
|
|
||||||
c.providerInfo,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
provider.Name, // label: name
|
|
||||||
string(provider.ProviderType), // label: type
|
|
||||||
provider.Description, // label: description
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- providerInfo
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,51 +1,21 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/cloudbase/garm/auth"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CollectOrganizationMetric collects the metrics for the repository objects
|
var (
|
||||||
func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
|
RepositoryInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
ctx := auth.GetAdminContext()
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsRepositorySubsystem,
|
||||||
|
Name: "info",
|
||||||
|
Help: "Info of the enterprise",
|
||||||
|
}, []string{"name", "id"})
|
||||||
|
|
||||||
repositories, err := c.runner.ListRepositories(ctx)
|
RepositoryPoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
if err != nil {
|
Namespace: metricsNamespace,
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
|
Subsystem: metricsRepositorySubsystem,
|
||||||
return
|
Name: "pool_manager_status",
|
||||||
}
|
Help: "Status of the enterprise pool manager",
|
||||||
|
}, []string{"name", "id", "running"})
|
||||||
for _, repository := range repositories {
|
)
|
||||||
|
|
||||||
repositoryInfo, err := prometheus.NewConstMetric(
|
|
||||||
c.repositoryInfo,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
1,
|
|
||||||
repository.Name, // label: name
|
|
||||||
repository.Owner, // label: owner
|
|
||||||
repository.ID, // label: id
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- repositoryInfo
|
|
||||||
|
|
||||||
repositoryPoolManagerStatus, err := prometheus.NewConstMetric(
|
|
||||||
c.repositoryPoolManagerStatus,
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
bool2float64(repository.PoolManagerStatus.IsRunning),
|
|
||||||
repository.Name, // label: name
|
|
||||||
repository.ID, // label: id
|
|
||||||
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ch <- repositoryPoolManagerStatus
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
func bool2float64(b bool) float64 {
|
func Bool2float64(b bool) float64 {
|
||||||
if b {
|
if b {
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
|
||||||
12
metrics/webhooks.go
Normal file
12
metrics/webhooks.go
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import "github.com/prometheus/client_golang/prometheus"
|
||||||
|
|
||||||
|
var (
|
||||||
|
WebhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Namespace: metricsNamespace,
|
||||||
|
Subsystem: metricsWebhookSubsystem,
|
||||||
|
Name: "received",
|
||||||
|
Help: "The total number of webhooks received",
|
||||||
|
}, []string{"valid", "reason", "hostname", "controller_id"})
|
||||||
|
)
|
||||||
36
runner/metrics/enterprise.go
Normal file
36
runner/metrics/enterprise.go
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CollectOrganizationMetric collects the metrics for the enterprise objects
|
||||||
|
func CollectEnterpriseMetric(ctx context.Context, r *runner.Runner) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.EnterpriseInfo.Reset()
|
||||||
|
metrics.EnterprisePoolManagerStatus.Reset()
|
||||||
|
|
||||||
|
enterprises, err := r.ListEnterprises(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, enterprise := range enterprises {
|
||||||
|
metrics.EnterpriseInfo.WithLabelValues(
|
||||||
|
enterprise.Name, // label: name
|
||||||
|
enterprise.ID, // label: id
|
||||||
|
).Set(1)
|
||||||
|
|
||||||
|
metrics.EnterprisePoolManagerStatus.WithLabelValues(
|
||||||
|
enterprise.Name, // label: name
|
||||||
|
enterprise.ID, // label: id
|
||||||
|
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
|
||||||
|
).Set(metrics.Bool2float64(enterprise.PoolManagerStatus.IsRunning))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
22
runner/metrics/health.go
Normal file
22
runner/metrics/health.go
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/params"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
func CollectHealthMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||||
|
|
||||||
|
metrics.GarmHealth.WithLabelValues(
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: id
|
||||||
|
controllerInfo.MetadataURL, // label: metadata_url
|
||||||
|
controllerInfo.CallbackURL, // label: callback_url
|
||||||
|
controllerInfo.WebhookURL, // label: webhook_url
|
||||||
|
controllerInfo.ControllerWebhookURL, // label: controller_webhook_url
|
||||||
|
).Set(1)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
73
runner/metrics/instance.go
Normal file
73
runner/metrics/instance.go
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/params"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CollectInstanceMetric collects the metrics for the runner instances
|
||||||
|
// reflecting the statuses and the pool they belong to.
|
||||||
|
func CollectInstanceMetric(ctx context.Context, r *runner.Runner, controllerInfo params.ControllerInfo) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.InstanceStatus.Reset()
|
||||||
|
|
||||||
|
instances, err := r.ListAllInstances(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
pools, err := r.ListAllPools(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
type poolInfo struct {
|
||||||
|
Name string
|
||||||
|
Type string
|
||||||
|
ProviderName string
|
||||||
|
}
|
||||||
|
|
||||||
|
poolNames := make(map[string]poolInfo)
|
||||||
|
for _, pool := range pools {
|
||||||
|
if pool.EnterpriseName != "" {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.EnterpriseName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
ProviderName: pool.ProviderName,
|
||||||
|
}
|
||||||
|
} else if pool.OrgName != "" {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.OrgName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
ProviderName: pool.ProviderName,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.RepoName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
ProviderName: pool.ProviderName,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, instance := range instances {
|
||||||
|
|
||||||
|
metrics.InstanceStatus.WithLabelValues(
|
||||||
|
instance.Name, // label: name
|
||||||
|
string(instance.Status), // label: status
|
||||||
|
string(instance.RunnerStatus), // label: runner_status
|
||||||
|
poolNames[instance.PoolID].Name, // label: pool_owner
|
||||||
|
poolNames[instance.PoolID].Type, // label: pool_type
|
||||||
|
instance.PoolID, // label: pool_id
|
||||||
|
controllerInfo.Hostname, // label: hostname
|
||||||
|
controllerInfo.ControllerID.String(), // label: controller_id
|
||||||
|
poolNames[instance.PoolID].ProviderName, // label: provider
|
||||||
|
|
||||||
|
).Set(1)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
70
runner/metrics/metrics.go
Normal file
70
runner/metrics/metrics.go
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/auth"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
func CollectObjectMetric(r *runner.Runner, timer *time.Ticker) {
|
||||||
|
|
||||||
|
ctx := auth.GetAdminContext()
|
||||||
|
|
||||||
|
controllerInfo, err := r.GetControllerInfo(ctx)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot get controller info")
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
// we wan't to initiate the collection immediately
|
||||||
|
for ; true; <-timer.C {
|
||||||
|
|
||||||
|
slog.InfoContext(ctx, "collecting metrics")
|
||||||
|
|
||||||
|
var err error
|
||||||
|
slog.DebugContext(ctx, "collecting organization metrics")
|
||||||
|
err = CollectOrganizationMetric(ctx, r)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organization metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting enterprise metrics")
|
||||||
|
err = CollectEnterpriseMetric(ctx, r)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprise metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting repository metrics")
|
||||||
|
err = CollectRepositoryMetric(ctx, r)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repository metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting provider metrics")
|
||||||
|
err = CollectProviderMetric(ctx, r)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect provider metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting pool metrics")
|
||||||
|
err = CollectPoolMetric(ctx, r)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect pool metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting health metrics")
|
||||||
|
err = CollectHealthMetric(ctx, r, controllerInfo)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect health metrics")
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.DebugContext(ctx, "collecting instance metrics")
|
||||||
|
err = CollectInstanceMetric(ctx, r, controllerInfo)
|
||||||
|
if err != nil {
|
||||||
|
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect instance metrics")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
36
runner/metrics/organization.go
Normal file
36
runner/metrics/organization.go
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CollectOrganizationMetric collects the metrics for the organization objects
|
||||||
|
func CollectOrganizationMetric(ctx context.Context, r *runner.Runner) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.OrganizationInfo.Reset()
|
||||||
|
metrics.OrganizationPoolManagerStatus.Reset()
|
||||||
|
|
||||||
|
organizations, err := r.ListOrganizations(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, organization := range organizations {
|
||||||
|
metrics.OrganizationInfo.WithLabelValues(
|
||||||
|
organization.Name, // label: name
|
||||||
|
organization.ID, // label: id
|
||||||
|
).Set(1)
|
||||||
|
|
||||||
|
metrics.OrganizationPoolManagerStatus.WithLabelValues(
|
||||||
|
organization.Name, // label: name
|
||||||
|
organization.ID, // label: id
|
||||||
|
strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running
|
||||||
|
).Set(metrics.Bool2float64(organization.PoolManagerStatus.IsRunning))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
87
runner/metrics/pool.go
Normal file
87
runner/metrics/pool.go
Normal file
|
|
@ -0,0 +1,87 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CollectPoolMetric collects the metrics for the pool objects
|
||||||
|
func CollectPoolMetric(ctx context.Context, r *runner.Runner) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.PoolInfo.Reset()
|
||||||
|
metrics.PoolStatus.Reset()
|
||||||
|
metrics.PoolMaxRunners.Reset()
|
||||||
|
metrics.PoolMinIdleRunners.Reset()
|
||||||
|
metrics.PoolBootstrapTimeout.Reset()
|
||||||
|
|
||||||
|
pools, err := r.ListAllPools(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
type poolInfo struct {
|
||||||
|
Name string
|
||||||
|
Type string
|
||||||
|
}
|
||||||
|
|
||||||
|
poolNames := make(map[string]poolInfo)
|
||||||
|
for _, pool := range pools {
|
||||||
|
if pool.EnterpriseName != "" {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.EnterpriseName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
}
|
||||||
|
} else if pool.OrgName != "" {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.OrgName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
poolNames[pool.ID] = poolInfo{
|
||||||
|
Name: pool.RepoName,
|
||||||
|
Type: string(pool.PoolType()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var poolTags []string
|
||||||
|
for _, tag := range pool.Tags {
|
||||||
|
poolTags = append(poolTags, tag.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics.PoolInfo.WithLabelValues(
|
||||||
|
pool.ID, // label: id
|
||||||
|
pool.Image, // label: image
|
||||||
|
pool.Flavor, // label: flavor
|
||||||
|
pool.Prefix, // label: prefix
|
||||||
|
string(pool.OSType), // label: os_type
|
||||||
|
string(pool.OSArch), // label: os_arch
|
||||||
|
strings.Join(poolTags, ","), // label: tags
|
||||||
|
pool.ProviderName, // label: provider
|
||||||
|
poolNames[pool.ID].Name, // label: pool_owner
|
||||||
|
poolNames[pool.ID].Type, // label: pool_type
|
||||||
|
).Set(1)
|
||||||
|
|
||||||
|
metrics.PoolStatus.WithLabelValues(
|
||||||
|
pool.ID, // label: id
|
||||||
|
strconv.FormatBool(pool.Enabled), // label: enabled
|
||||||
|
).Set(metrics.Bool2float64(pool.Enabled))
|
||||||
|
|
||||||
|
metrics.PoolMaxRunners.WithLabelValues(
|
||||||
|
pool.ID, // label: id
|
||||||
|
).Set(float64(pool.MaxRunners))
|
||||||
|
|
||||||
|
metrics.PoolMinIdleRunners.WithLabelValues(
|
||||||
|
pool.ID, // label: id
|
||||||
|
).Set(float64(pool.MinIdleRunners))
|
||||||
|
|
||||||
|
metrics.PoolBootstrapTimeout.WithLabelValues(
|
||||||
|
pool.ID, // label: id
|
||||||
|
).Set(float64(pool.RunnerBootstrapTimeout))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
27
runner/metrics/provider.go
Normal file
27
runner/metrics/provider.go
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
func CollectProviderMetric(ctx context.Context, r *runner.Runner) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.ProviderInfo.Reset()
|
||||||
|
|
||||||
|
providers, err := r.ListProviders(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, provider := range providers {
|
||||||
|
metrics.ProviderInfo.WithLabelValues(
|
||||||
|
provider.Name, // label: name
|
||||||
|
string(provider.ProviderType), // label: type
|
||||||
|
provider.Description, // label: description
|
||||||
|
).Set(1)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
35
runner/metrics/repository.go
Normal file
35
runner/metrics/repository.go
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/cloudbase/garm/metrics"
|
||||||
|
"github.com/cloudbase/garm/runner"
|
||||||
|
)
|
||||||
|
|
||||||
|
func CollectRepositoryMetric(ctx context.Context, r *runner.Runner) error {
|
||||||
|
|
||||||
|
// reset metrics
|
||||||
|
metrics.EnterpriseInfo.Reset()
|
||||||
|
metrics.EnterprisePoolManagerStatus.Reset()
|
||||||
|
|
||||||
|
repositories, err := r.ListRepositories(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, repository := range repositories {
|
||||||
|
metrics.EnterpriseInfo.WithLabelValues(
|
||||||
|
repository.Name, // label: name
|
||||||
|
repository.ID, // label: id
|
||||||
|
).Set(1)
|
||||||
|
|
||||||
|
metrics.EnterprisePoolManagerStatus.WithLabelValues(
|
||||||
|
repository.Name, // label: name
|
||||||
|
repository.ID, // label: id
|
||||||
|
strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running
|
||||||
|
).Set(metrics.Bool2float64(repository.PoolManagerStatus.IsRunning))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue