Merge pull request #217 from mercedes-benz/add_counter_metrics

extend metrics for github and provider executions
This commit is contained in:
Gabriel 2024-02-22 09:35:41 +02:00 committed by GitHub
commit dd6f1e48b8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 428 additions and 9 deletions

View file

@ -48,11 +48,18 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| Metric name | Type | Labels | Description |
|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| `garm_runner_operations_total` | Counter | `provider`=&lt;provider name&gt; <br>`operation`=&lt;CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop&gt; | This is a counter that increments every time a runner operation is performed |
| `garm_runner_errors_total` | Counter | `provider`=&lt;provider name&gt; <br>`operation`=&lt;CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop&gt; | This is a counter that increments every time a runner operation errored |
More metrics will be added in the future.
## Github metrics
| Metric name | Type | Labels | Description |
|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
| `garm_github_operations_total` | Counter | `operation`=&lt;ListRunners\|CreateRegistrationToken\|...&gt; <br>`scope`=&lt;Organization\|Repository\|Enterprise&gt; | This is a counter that increments every time a github operation is performed |
| `garm_github_errors_total` | Counter | `operation`=&lt;ListRunners\|CreateRegistrationToken\|...&gt; <br>`scope`=&lt;Organization\|Repository\|Enterprise&gt; | This is a counter that increments every time a github operation errored |
## Enabling metrics

19
metrics/github.go Normal file
View file

@ -0,0 +1,19 @@
package metrics
import "github.com/prometheus/client_golang/prometheus"
// GithubOperationCount is a counter vector, labelled by "operation" and
// "scope", that is incremented for every attempted github operation.
var GithubOperationCount = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: metricsNamespace,
		Subsystem: metricsGithubSubsystem,
		Name:      "operations_total",
		Help:      "Total number of github operation attempts",
	},
	[]string{"operation", "scope"},
)

// GithubOperationFailedCount is a counter vector, labelled by "operation" and
// "scope", that is incremented for every github operation that returned an
// error.
var GithubOperationFailedCount = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: metricsNamespace,
		Subsystem: metricsGithubSubsystem,
		Name:      "errors_total",
		Help:      "Total number of failed github operation attempts",
	},
	[]string{"operation", "scope"},
)

View file

@ -11,4 +11,18 @@ var (
Name: "status",
Help: "Status of the instance",
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
InstanceOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsRunnerSubsystem,
Name: "operations_total",
Help: "Total number of instance operation attempts",
}, []string{"operation", "provider"})
InstanceOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsRunnerSubsystem,
Name: "errors_total",
Help: "Total number of failed instance operation attempts",
}, []string{"operation", "provider"})
)

View file

@ -12,12 +12,15 @@ const metricsOrganizationSubsystem = "organization"
const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook"
const metricsGithubSubsystem = "github"
// RegisterMetrics registers all the metrics
func RegisterMetrics() error {
var collectors []prometheus.Collector
collectors = append(collectors,
// metrics created during the periodic update of the metrics
//
// runner metrics
InstanceStatus,
// organization metrics
@ -39,6 +42,16 @@ func RegisterMetrics() error {
PoolBootstrapTimeout,
// health metrics
GarmHealth,
// metrics used within normal garm operations
// e.g. count instance creations, count github api calls, ...
//
// runner instances
InstanceOperationCount,
InstanceOperationFailedCount,
// github
GithubOperationCount,
GithubOperationFailedCount,
// webhook metrics
WebhooksReceived,
)

View file

@ -13,6 +13,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@ -85,8 +86,16 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g
}
for {
metrics.GithubOperationCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
runnerGroups, ghResp, err := r.ghcEnterpriseCli.ListRunnerGroups(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -123,8 +132,16 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
metrics.GithubOperationCount.WithLabelValues(
"GenerateEnterpriseJITConfig", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
jitConfig, resp, err := r.ghcEnterpriseCli.GenerateEnterpriseJITConfig(ctx, r.cfg.Name, &req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GenerateEnterpriseJITConfig", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@ -134,7 +151,17 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
runner = jitConfig.Runner
defer func() {
if err != nil && runner != nil {
metrics.GithubOperationCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
_, innerErr := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runner.GetID())
if innerErr != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
}
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(), "organization", r.cfg.Name)
@ -166,8 +193,16 @@ func (r *enterprise) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
metrics.GithubOperationCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@ -212,8 +247,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
metrics.GithubOperationCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
runners, ghResp, err := r.ghcEnterpriseCli.ListRunners(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -231,8 +274,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {
func (r *enterprise) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
metrics.GithubOperationCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
tools, ghResp, err := r.ghcEnterpriseCli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -255,7 +306,19 @@ func (r *enterprise) FetchDbInstances() ([]params.Instance, error) {
}
// RemoveGithubRunner asks the enterprise GitHub client to remove the runner
// identified by runnerID. Every call increments the github operations
// counter; a failed call additionally increments the github errors counter.
//
// The *github.Response is returned together with the error, exactly as the
// underlying client call produced them, so callers can still inspect the HTTP
// status of a failed request.
func (r *enterprise) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
	metrics.GithubOperationCount.WithLabelValues(
		"RemoveRunner",              // label: operation
		metricsLabelEnterpriseScope, // label: scope
	).Inc()
	ghResp, err := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID)
	if err != nil {
		metrics.GithubOperationFailedCount.WithLabelValues(
			"RemoveRunner",              // label: operation
			metricsLabelEnterpriseScope, // label: scope
		).Inc()
	}
	// Forward the response even on error instead of replacing it with nil.
	return ghResp, err
}
func (r *enterprise) ListPools() ([]params.Pool, error) {
@ -275,9 +338,18 @@ func (r *enterprise) JwtToken() string {
}
func (r *enterprise) GetGithubRegistrationToken() (string, error) {
metrics.GithubOperationCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
tk, ghResp, err := r.ghcEnterpriseCli.CreateRegistrationToken(r.ctx, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching registration token")
}

View file

@ -27,6 +27,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@ -97,8 +98,16 @@ func (r *organization) findRunnerGroupByName(ctx context.Context, name string) (
}
for {
metrics.GithubOperationCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
runnerGroups, ghResp, err := r.ghcli.ListOrganizationRunnerGroups(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -135,8 +144,16 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
metrics.GithubOperationCount.WithLabelValues(
"GenerateOrgJITConfig", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
jitConfig, resp, err := r.ghcli.GenerateOrgJITConfig(ctx, r.cfg.Name, &req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GenerateOrgJITConfig", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@ -146,7 +163,17 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p
runner = jitConfig.GetRunner()
defer func() {
if err != nil && runner != nil {
metrics.GithubOperationCount.WithLabelValues(
"RemoveOrganizationRunner", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
_, innerErr := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runner.GetID())
if innerErr != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"RemoveOrganizationRunner", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
}
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(), "organization", r.cfg.Name)
@ -178,8 +205,16 @@ func (r *organization) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
metrics.GithubOperationCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Organization.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@ -223,8 +258,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
metrics.GithubOperationCount.WithLabelValues(
"ListOrganizationRunners", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
runners, ghResp, err := r.ghcli.ListOrganizationRunners(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrganizationRunners", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -243,8 +286,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) {
func (r *organization) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
metrics.GithubOperationCount.WithLabelValues(
"ListOrganizationRunnerApplicationDownloads", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
tools, ghResp, err := r.ghcli.ListOrganizationRunnerApplicationDownloads(r.ctx, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrganizationRunnerApplicationDownloads", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools")
}
@ -267,7 +318,21 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) {
}
// RemoveGithubRunner asks the GitHub client to remove the organization runner
// identified by runnerID. Every call increments the github operations
// counter; a failed call additionally increments the github errors counter.
//
// The operation label is "RemoveOrganizationRunner", matching the client
// method that is invoked and the label used for the same call elsewhere in
// this type.
//
// The *github.Response is returned together with the error, exactly as the
// underlying client call produced them, so callers can still inspect the HTTP
// status of a failed request.
func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
	metrics.GithubOperationCount.WithLabelValues(
		"RemoveOrganizationRunner",    // label: operation
		metricsLabelOrganizationScope, // label: scope
	).Inc()
	ghResp, err := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
	if err != nil {
		metrics.GithubOperationFailedCount.WithLabelValues(
			"RemoveOrganizationRunner",    // label: operation
			metricsLabelOrganizationScope, // label: scope
		).Inc()
	}
	// Forward the response even on error instead of replacing it with nil.
	return ghResp, err
}
func (r *organization) ListPools() ([]params.Pool, error) {
@ -287,9 +352,17 @@ func (r *organization) JwtToken() string {
}
func (r *organization) GetGithubRegistrationToken() (string, error) {
metrics.GithubOperationCount.WithLabelValues(
"CreateOrganizationRegistrationToken", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
tk, ghResp, err := r.ghcli.CreateOrganizationRegistrationToken(r.ctx, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateOrganizationRegistrationToken", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token")
}
@ -340,8 +413,16 @@ func (r *organization) listHooks(ctx context.Context) ([]*github.Hook, error) {
}
var allHooks []*github.Hook
for {
metrics.GithubOperationCount.WithLabelValues(
"ListOrgHooks", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
hooks, ghResp, err := r.ghcli.ListOrgHooks(ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrgHooks", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusNotFound {
return nil, runnerErrors.NewBadRequestError("organization not found or your PAT does not have access to manage webhooks")
}
@ -366,12 +447,30 @@ func (r *organization) InstallHook(ctx context.Context, req *github.Hook) (param
return params.HookInfo{}, errors.Wrap(err, "validating hook request")
}
metrics.GithubOperationCount.WithLabelValues(
"CreateOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
hook, _, err := r.ghcli.CreateOrgHook(ctx, r.cfg.Name, req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
return params.HookInfo{}, errors.Wrap(err, "creating organization hook")
}
metrics.GithubOperationCount.WithLabelValues(
"PingOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
if _, err := r.ghcli.PingOrgHook(ctx, r.cfg.Name, hook.GetID()); err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"PingOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to ping hook", "hook_id", hook.GetID())
}
@ -386,8 +485,16 @@ func (r *organization) UninstallHook(ctx context.Context, url string) error {
for _, hook := range allHooks {
if hook.Config["url"] == url {
metrics.GithubOperationCount.WithLabelValues(
"DeleteOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
_, err = r.ghcli.DeleteOrgHook(ctx, r.cfg.Name, hook.GetID())
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"DeleteOrgHook", // label: operation
metricsLabelOrganizationScope, // label: scope
).Inc()
return errors.Wrap(err, "deleting hook")
}
return nil

View file

@ -57,6 +57,10 @@ const (
// before we give up.
// TODO: make this configurable(?)
maxCreateAttempts = 5
metricsLabelEnterpriseScope = "Enterprise"
metricsLabelRepositoryScope = "Repository"
metricsLabelOrganizationScope = "Organization"
)
type keyMutex struct {

View file

@ -27,6 +27,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@ -99,8 +100,16 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
metrics.GithubOperationCount.WithLabelValues(
"GenerateRepoJITConfig", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
jitConfig, resp, err := r.ghcli.GenerateRepoJITConfig(ctx, r.cfg.Owner, r.cfg.Name, &req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GenerateRepoJITConfig", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@ -110,7 +119,17 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par
defer func() {
if err != nil && runner != nil {
metrics.GithubOperationCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
_, innerErr := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runner.GetID())
if innerErr != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
}
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(),
@ -144,8 +163,16 @@ func (r *repository) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
metrics.GithubOperationCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@ -189,8 +216,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
metrics.GithubOperationCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
runners, ghResp, err := r.ghcli.ListRunners(r.ctx, r.cfg.Owner, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@ -209,8 +244,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) {
func (r *repository) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
metrics.GithubOperationCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
tools, ghResp, err := r.ghcli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Owner, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools")
}
@ -233,7 +276,20 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) {
}
// RemoveGithubRunner asks the GitHub client to remove the repository runner
// identified by runnerID. Every call increments the github operations
// counter; a failed call additionally increments the github errors counter.
//
// The *github.Response is returned together with the error, exactly as the
// underlying client call produced them, so callers can still inspect the HTTP
// status of a failed request.
func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
	metrics.GithubOperationCount.WithLabelValues(
		"RemoveRunner",              // label: operation
		metricsLabelRepositoryScope, // label: scope
	).Inc()
	ghResp, err := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
	if err != nil {
		metrics.GithubOperationFailedCount.WithLabelValues(
			"RemoveRunner",              // label: operation
			metricsLabelRepositoryScope, // label: scope
		).Inc()
	}
	// Forward the response even on error instead of replacing it with nil.
	return ghResp, err
}
func (r *repository) ListPools() ([]params.Pool, error) {
@ -253,9 +309,17 @@ func (r *repository) JwtToken() string {
}
func (r *repository) GetGithubRegistrationToken() (string, error) {
metrics.GithubOperationCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
tk, ghResp, err := r.ghcli.CreateRegistrationToken(r.ctx, r.cfg.Owner, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token")
}
@ -305,8 +369,16 @@ func (r *repository) listHooks(ctx context.Context) ([]*github.Hook, error) {
}
var allHooks []*github.Hook
for {
metrics.GithubOperationCount.WithLabelValues(
"ListRepoHooks", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
hooks, ghResp, err := r.ghcli.ListRepoHooks(ctx, r.cfg.Owner, r.cfg.Name, &opts)
if err != nil {
// The call failed: record it on the error counter. Incrementing the
// operations counter here would count the same attempt twice and leave
// ListRepoHooks failures unreported.
metrics.GithubOperationFailedCount.WithLabelValues(
	"ListRepoHooks",             // label: operation
	metricsLabelRepositoryScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusNotFound {
return nil, runnerErrors.NewBadRequestError("repository not found or your PAT does not have access to manage webhooks")
}
@ -331,12 +403,30 @@ func (r *repository) InstallHook(ctx context.Context, req *github.Hook) (params.
return params.HookInfo{}, errors.Wrap(err, "validating hook request")
}
metrics.GithubOperationCount.WithLabelValues(
"CreateRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
hook, _, err := r.ghcli.CreateRepoHook(ctx, r.cfg.Owner, r.cfg.Name, req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
return params.HookInfo{}, errors.Wrap(err, "creating repository hook")
}
metrics.GithubOperationCount.WithLabelValues(
"PingRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
if _, err := r.ghcli.PingRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID()); err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"PingRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(
ctx, "failed to ping hook",
"hook_id", hook.GetID(),
@ -355,8 +445,16 @@ func (r *repository) UninstallHook(ctx context.Context, url string) error {
for _, hook := range allHooks {
if hook.Config["url"] == url {
metrics.GithubOperationCount.WithLabelValues(
"DeleteRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
_, err = r.ghcli.DeleteRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID())
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"DeleteRepoHook", // label: operation
metricsLabelRepositoryScope, // label: scope
).Inc()
return errors.Wrap(err, "deleting hook")
}
return nil

View file

@ -14,6 +14,7 @@ import (
garmErrors "github.com/cloudbase/garm-provider-common/errors"
garmExec "github.com/cloudbase/garm-provider-common/util/exec"
"github.com/cloudbase/garm/config"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
@ -82,17 +83,34 @@ func (e *external) CreateInstance(ctx context.Context, bootstrapParams commonPar
return commonParams.ProviderInstance{}, errors.Wrap(err, "serializing bootstrap params")
}
metrics.InstanceOperationCount.WithLabelValues(
"CreateInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
out, err := garmExec.Exec(ctx, e.execPath, asJs, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"CreateInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param commonParams.ProviderInstance
if err := json.Unmarshal(out, &param); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"CreateInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
if err := e.validateResult(ctx, param); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"CreateInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
@ -113,10 +131,18 @@ func (e *external) DeleteInstance(ctx context.Context, instance string) error {
}
asEnv = append(asEnv, e.environmentVariables...)
metrics.InstanceOperationCount.WithLabelValues(
"DeleteInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) || exitErr.ExitCode() != execution.ExitCodeNotFound {
metrics.InstanceOperationFailedCount.WithLabelValues(
"DeleteInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
@ -134,19 +160,35 @@ func (e *external) GetInstance(ctx context.Context, instance string) (commonPara
}
asEnv = append(asEnv, e.environmentVariables...)
// TODO(gabriel-samfira): handle error types. Of particular insterest is to
// TODO(gabriel-samfira): handle error types. Of particular interest is to
// know when the error is ErrNotFound.
metrics.InstanceOperationCount.WithLabelValues(
"GetInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"GetInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param commonParams.ProviderInstance
if err := json.Unmarshal(out, &param); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"GetInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
if err := e.validateResult(ctx, param); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"GetInstance", // label: operation
e.cfg.Name, // label: provider
).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
@ -163,19 +205,36 @@ func (e *external) ListInstances(ctx context.Context, poolID string) ([]commonPa
}
asEnv = append(asEnv, e.environmentVariables...)
metrics.InstanceOperationCount.WithLabelValues(
"ListInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"ListInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param []commonParams.ProviderInstance
if err := json.Unmarshal(out, &param); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"ListInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
ret := make([]commonParams.ProviderInstance, len(param))
for idx, inst := range param {
if err := e.validateResult(ctx, inst); err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"ListInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
ret[idx] = inst
@ -192,8 +251,17 @@ func (e *external) RemoveAllInstances(ctx context.Context) error {
}
asEnv = append(asEnv, e.environmentVariables...)
metrics.InstanceOperationCount.WithLabelValues(
"RemoveAllInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"RemoveAllInstances", // label: operation
e.cfg.Name, // label: provider
).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil
@ -209,8 +277,16 @@ func (e *external) Stop(ctx context.Context, instance string, force bool) error
}
asEnv = append(asEnv, e.environmentVariables...)
metrics.InstanceOperationCount.WithLabelValues(
"Stop", // label: operation
e.cfg.Name, // label: provider
).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"Stop", // label: operation
e.cfg.Name, // label: provider
).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil
@ -226,8 +302,17 @@ func (e *external) Start(ctx context.Context, instance string) error {
}
asEnv = append(asEnv, e.environmentVariables...)
metrics.InstanceOperationCount.WithLabelValues(
"Start", // label: operation
e.cfg.Name, // label: provider
).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
metrics.InstanceOperationFailedCount.WithLabelValues(
"Start", // label: operation
e.cfg.Name, // label: provider
).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil