// Copyright 2022 Cloudbase Solutions SRL
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

package pool

import (
	"context"
	"crypto/rand"
	"errors"
	"fmt"
	"log/slog"
	"math"
	"math/big"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/go-github/v72/github"
	"github.com/google/uuid"
	"golang.org/x/sync/errgroup"

	runnerErrors "github.com/cloudbase/garm-provider-common/errors"
	commonParams "github.com/cloudbase/garm-provider-common/params"
	"github.com/cloudbase/garm-provider-common/util"
	"github.com/cloudbase/garm/auth"
	"github.com/cloudbase/garm/cache"
	dbCommon "github.com/cloudbase/garm/database/common"
	"github.com/cloudbase/garm/database/watcher"
	"github.com/cloudbase/garm/locking"
	"github.com/cloudbase/garm/params"
	"github.com/cloudbase/garm/runner/common"
	garmUtil "github.com/cloudbase/garm/util"
	ghClient "github.com/cloudbase/garm/util/github"
	"github.com/cloudbase/garm/util/github/scalesets"
)

var (
	poolIDLabelprefix     = "runner-pool-id"
	controllerLabelPrefix = "runner-controller-id"
	// We tag runners that have been spawned as a result of a queued job with the job ID
	// that spawned them. There is no way to guarantee that the runner spawned in response to a particular
	// job will be picked up by that job. We mark them so that, in the very likely event that the runner
	// has picked up a different job, we can clear the lock on the job that spawned it.
	// The job it picked up would already be transitioned to in_progress, so it will be ignored by the
	// consume loop.
	jobLabelPrefix = "in_response_to_job"
)

const (
	// maxCreateAttempts is the number of times we will attempt to create an instance
	// before we give up.
	//
	// nolint:golangci-lint,godox
	// TODO: make this configurable(?)
	maxCreateAttempts = 5
)
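
// NewEntityPoolManager returns a common.PoolManager for the given forge entity
// (repository, organization or enterprise). It builds the forge client, validates
// the webhook secret, registers a database watcher consumer and wires up the
// instance delete backoff and, for GitHub entities, the scale set client.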
func NewEntityPoolManager(ctx context.Context, entity params.ForgeEntity, instanceTokenGetter auth.InstanceTokenGetter, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) {
	ctx = garmUtil.WithSlogContext(
		ctx,
		slog.Any("pool_mgr", entity.String()),
		slog.Any("endpoint", entity.Credentials.Endpoint.Name),
		slog.Any("pool_type", entity.EntityType),
	)
	ghc, err := ghClient.Client(ctx, entity)
	if err != nil {
		return nil, fmt.Errorf("error getting github client: %w", err)
	}

	if entity.WebhookSecret == "" {
		return nil, fmt.Errorf("webhook secret is empty")
	}

	controllerInfo, err := store.ControllerInfo()
	if err != nil {
		return nil, fmt.Errorf("error getting controller info: %w", err)
	}

	consumerID := fmt.Sprintf("pool-manager-%s-%s", entity.String(), entity.Credentials.Endpoint.Name)
	slog.InfoContext(ctx, "registering consumer", "consumer_id", consumerID)
	consumer, err := watcher.RegisterConsumer(
		ctx, consumerID,
		composeWatcherFilters(entity),
	)
	if err != nil {
		return nil, fmt.Errorf("error registering consumer: %w", err)
	}

	wg := &sync.WaitGroup{}
	backoff, err := locking.NewInstanceDeleteBackoff(ctx)
	if err != nil {
		return nil, fmt.Errorf("error creating backoff: %w", err)
	}

	var scaleSetCli *scalesets.ScaleSetClient
	if entity.Credentials.ForgeType == params.GithubEndpointType {
		scaleSetCli, err = scalesets.NewClient(ghc)
		if err != nil {
			return nil, fmt.Errorf("failed to get scalesets client: %w", err)
		}
	}
	repo := &basePoolManager{
		ctx:                 ctx,
		consumerID:          consumerID,
		entity:              entity,
		ghcli:               ghc,
		scaleSetClient:      scaleSetCli,
		controllerInfo:      controllerInfo,
		instanceTokenGetter: instanceTokenGetter,

		store:     store,
		providers: providers,
		quit:      make(chan struct{}),
		wg:        wg,
		backoff:   backoff,
		consumer:  consumer,
	}

	return repo, nil
}
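
// basePoolManager holds the state needed to manage the pools of a single forge
// entity: the forge client, controller info, registered providers, cached runner
// tools and the synchronization primitives used by the reconciliation loops.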
type basePoolManager struct {
	ctx                 context.Context
	consumerID          string
	entity              params.ForgeEntity
	ghcli               common.GithubClient
	scaleSetClient      *scalesets.ScaleSetClient
	controllerInfo      params.ControllerInfo
	instanceTokenGetter auth.InstanceTokenGetter
	consumer            dbCommon.Consumer

	store dbCommon.Store

	providers map[string]common.Provider
	tools     []commonParams.RunnerApplicationDownload
	quit      chan struct{}

	managerIsRunning   bool
	managerErrorReason string

	mux     sync.Mutex
	wg      *sync.WaitGroup
	backoff locking.InstanceDeleteBackoff
}
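
// getProviderBaseParams returns the common provider parameters (pool and
// controller info) passed to provider calls, guarded by the manager mutex.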
func (r *basePoolManager) getProviderBaseParams(pool params.Pool) common.ProviderBaseParams {
	r.mux.Lock()
	defer r.mux.Unlock()

	return common.ProviderBaseParams{
		PoolInfo:       pool,
		ControllerInfo: r.controllerInfo,
	}
}
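
// HandleWorkflowJob processes a workflow job webhook. It records or updates the
// job in the database and, depending on the action ("queued", "completed" or
// "in_progress"), updates the state of the runner that handled the job and breaks
// the lock on any queued job that originally triggered that runner.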
func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error {
	if err := r.ValidateOwner(job); err != nil {
		slog.ErrorContext(r.ctx, "failed to validate owner", "error", err)
		return fmt.Errorf("error validating owner: %w", err)
	}

	// We see events where the labels seem to be missing. We should ignore these,
	// as we can't know if we should handle them or not.
	if len(job.WorkflowJob.Labels) == 0 {
		slog.WarnContext(r.ctx, "job has no labels", "workflow_job", job.WorkflowJob.Name)
		return nil
	}

	jobParams, err := r.paramsWorkflowJobToParamsJob(job)
	if err != nil {
		slog.ErrorContext(r.ctx, "failed to convert job to params", "error", err)
		return fmt.Errorf("error converting job to params: %w", err)
	}

	var triggeredBy int64
	defer func() {
		if jobParams.WorkflowJobID == 0 {
			return
		}
		// We're updating the job in the database, regardless of whether it was successful or not,
		// or whether it was meant for this pool or not. Github will send the same job data to all hierarchies
		// that have been configured to work with garm. Updating the job at all levels should yield the same
		// outcome in the db.
		_, err := r.store.GetJobByID(r.ctx, jobParams.WorkflowJobID)
		if err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to get job",
					"job_id", jobParams.WorkflowJobID)
				return
			}

			// This job is new to us. Check if we have a pool that can handle it.
			potentialPools := cache.FindPoolsMatchingAllTags(r.entity.ID, jobParams.Labels)
			if len(potentialPools) == 0 {
				slog.WarnContext(
					r.ctx, "no pools matching tags; not recording job",
					"requested_tags", strings.Join(jobParams.Labels, ", "))
				return
			}
		}

		if _, jobErr := r.store.CreateOrUpdateJob(r.ctx, jobParams); jobErr != nil {
			slog.With(slog.Any("error", jobErr)).ErrorContext(
				r.ctx, "failed to update job", "job_id", jobParams.WorkflowJobID)
		}

		if triggeredBy != 0 && jobParams.WorkflowJobID != triggeredBy {
			// The triggeredBy value is only set by the "in_progress" webhook. The runner that
			// transitioned to in_progress was created as a result of a different queued job. If that job is
			// still queued and we don't remove the lock, it will linger until the lock timeout is reached.
			// That may take a long time, so we break the lock here and allow it to be scheduled again.
			if err := r.store.BreakLockJobIsQueued(r.ctx, triggeredBy); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to break lock for job",
					"job_id", triggeredBy)
			}
		}
	}()

	switch job.Action {
	case "queued":
		// Record the job in the database. Queued jobs will be picked up by the consumeQueuedJobs() method
		// when reconciling.
	case "completed":
		// If the job was not assigned to a runner, we can ignore it.
		if jobParams.RunnerName == "" {
			slog.InfoContext(
				r.ctx, "job never got assigned to a runner, ignoring")
			return nil
		}

		fromCache, ok := cache.GetInstanceCache(jobParams.RunnerName)
		if !ok {
			return nil
		}

		if _, ok := cache.GetEntityPool(r.entity.ID, fromCache.PoolID); !ok {
			slog.DebugContext(r.ctx, "instance belongs to a pool not managed by this entity", "pool_id", fromCache.PoolID)
			return nil
		}

		// update instance workload state.
		if _, err := r.setInstanceRunnerStatus(jobParams.RunnerName, params.RunnerTerminated); err != nil {
			if errors.Is(err, runnerErrors.ErrNotFound) {
				return nil
			}
			slog.With(slog.Any("error", err)).ErrorContext(
				r.ctx, "failed to update runner status",
				"runner_name", util.SanitizeLogEntry(jobParams.RunnerName))
			return fmt.Errorf("error updating runner: %w", err)
		}
		slog.DebugContext(
			r.ctx, "marking instance as pending_delete",
			"runner_name", util.SanitizeLogEntry(jobParams.RunnerName))
		if _, err := r.setInstanceStatus(jobParams.RunnerName, commonParams.InstancePendingDelete, nil); err != nil {
			if errors.Is(err, runnerErrors.ErrNotFound) {
				return nil
			}
			slog.With(slog.Any("error", err)).ErrorContext(
				r.ctx, "failed to update runner status",
				"runner_name", util.SanitizeLogEntry(jobParams.RunnerName))
			return fmt.Errorf("error updating runner: %w", err)
		}
	case "in_progress":
		fromCache, ok := cache.GetInstanceCache(jobParams.RunnerName)
		if !ok {
			slog.DebugContext(r.ctx, "instance not found in cache", "runner_name", jobParams.RunnerName)
			return nil
		}

		pool, ok := cache.GetEntityPool(r.entity.ID, fromCache.PoolID)
		if !ok {
			slog.DebugContext(r.ctx, "instance belongs to a pool not managed by this entity", "pool_id", fromCache.PoolID)
			return nil
		}
		// update instance workload state.
		instance, err := r.setInstanceRunnerStatus(jobParams.RunnerName, params.RunnerActive)
		if err != nil {
			if errors.Is(err, runnerErrors.ErrNotFound) {
				return nil
			}
			slog.With(slog.Any("error", err)).ErrorContext(
				r.ctx, "failed to update runner status",
				"runner_name", util.SanitizeLogEntry(jobParams.RunnerName))
			return fmt.Errorf("error updating runner: %w", err)
		}
		// Set triggeredBy here so we break the lock on any potential queued job.
		triggeredBy = jobIDFromLabels(instance.AditionalLabels)

		// A runner has picked up the job, and is now running it. It may need to be replaced if the pool has
		// a minimum number of idle runners configured.
		if err := r.ensureIdleRunnersForOnePool(pool); err != nil {
			slog.With(slog.Any("error", err)).ErrorContext(
				r.ctx, "error ensuring idle runners for pool",
				"pool_id", pool.ID)
		}
	}
	return nil
}
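
// jobIDFromLabels extracts the job ID from a runner's additional labels. Runners
// spawned for a specific queued job carry a label prefixed with jobLabelPrefix,
// followed by the job ID. If no such label exists or the ID cannot be parsed,
// 0 is returned.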
func jobIDFromLabels(labels []string) int64 {
	for _, lbl := range labels {
		if strings.HasPrefix(lbl, jobLabelPrefix) {
			trimLength := min(len(jobLabelPrefix)+1, len(lbl))
			jobID, err := strconv.ParseInt(lbl[trimLength:], 10, 64)
			if err != nil {
				return 0
			}
			return jobID
		}
	}
	return 0
}
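
// startLoopForFunction runs f on a ticker with the given interval until the
// daemon context is cancelled or this worker is stopped. Unless alwaysRun is set,
// iterations are skipped while the manager is not in a running state.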
func (r *basePoolManager) startLoopForFunction(f func() error, interval time.Duration, name string, alwaysRun bool) {
	slog.InfoContext(
		r.ctx, "starting loop for entity",
		"loop_name", name)
	ticker := time.NewTicker(interval)
	r.wg.Add(1)

	defer func() {
		slog.InfoContext(
			r.ctx, "pool loop exited",
			"loop_name", name)
		ticker.Stop()
		r.wg.Done()
	}()

	for {
		shouldRun := r.managerIsRunning
		if alwaysRun {
			shouldRun = true
		}
		switch shouldRun {
		case true:
			select {
			case <-ticker.C:
				if err := f(); err != nil {
					slog.With(slog.Any("error", err)).ErrorContext(
						r.ctx, "error in loop",
						"loop_name", name)
					if errors.Is(err, runnerErrors.ErrUnauthorized) {
						r.SetPoolRunningState(false, err.Error())
					}
				}
			case <-r.ctx.Done():
				// daemon is shutting down.
				return
			case <-r.quit:
				// this worker was stopped.
				return
			}
		default:
			select {
			case <-r.ctx.Done():
				// daemon is shutting down.
				return
			case <-r.quit:
				// this worker was stopped.
				return
			default:
				r.waitForTimeoutOrCancelled(common.BackoffTimer)
			}
		}
	}
}
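
// updateTools refreshes the cached runner tools (runner application downloads)
// for this entity and flips the manager running state accordingly.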
func (r *basePoolManager) updateTools() error {
	tools, err := cache.GetGithubToolsCache(r.entity.ID)
	if err != nil {
		slog.With(slog.Any("error", err)).ErrorContext(
			r.ctx, "failed to update tools for entity", "entity", r.entity.String())
		r.SetPoolRunningState(false, err.Error())
		return fmt.Errorf("failed to update tools for entity %s: %w", r.entity.String(), err)
	}

	r.mux.Lock()
	r.tools = tools
	r.mux.Unlock()

	slog.DebugContext(r.ctx, "successfully updated tools")
	r.SetPoolRunningState(true, "")
	return nil
}

// cleanupOrphanedProviderRunners compares runners in github with local runners and removes
// any local runners that are not present in Github. Runners that are "idle" in our
// provider, but do not exist in github, will be removed. This can happen if
// garm was offline while a job was executed by a github action. When this
// happens, github will remove the ephemeral worker and send a webhook our way.
// If we were offline and did not process the webhook, the instance will linger.
// We need to remove it from the provider and database.
func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []forgeRunner) error {
	dbInstances, err := r.store.ListEntityInstances(r.ctx, r.entity)
	if err != nil {
		return fmt.Errorf("error fetching instances from db: %w", err)
	}

	runnerNames := map[string]bool{}
	for _, run := range runners {
		if !isManagedRunner(labelsFromRunner(run), r.controllerInfo.ControllerID.String()) {
			slog.DebugContext(
				r.ctx, "runner is not managed by a pool we manage",
				"runner_name", run.Name)
			continue
		}
		runnerNames[run.Name] = true
	}

	for _, instance := range dbInstances {
		if instance.ScaleSetID != 0 {
			// ignore scale set instances.
			continue
		}

		lockAcquired := locking.TryLock(instance.Name, r.consumerID)
		if !lockAcquired {
			slog.DebugContext(
				r.ctx, "failed to acquire lock for instance",
				"runner_name", instance.Name)
			continue
		}
		defer locking.Unlock(instance.Name, false)

		switch instance.Status {
		case commonParams.InstancePendingCreate,
			commonParams.InstancePendingDelete, commonParams.InstancePendingForceDelete:
			// this instance is in the process of being created or is awaiting deletion.
			// Instances in pending_create did not get a chance to register themselves in
			// github, so we let them be for now.
			continue
		}
		pool, err := r.store.GetEntityPool(r.ctx, r.entity, instance.PoolID)
		if err != nil {
			return fmt.Errorf("error fetching instance pool info: %w", err)
		}

		switch instance.RunnerStatus {
		case params.RunnerPending, params.RunnerInstalling:
			if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
				// runner is still installing. We give it a chance to finish.
				slog.DebugContext(
					r.ctx, "runner is still installing, give it a chance to finish",
					"runner_name", instance.Name)
				continue
			}
		}

		if time.Since(instance.UpdatedAt).Minutes() < 5 {
			// instance was updated recently. We give it a chance to register itself in github.
			slog.DebugContext(
				r.ctx, "instance was updated recently, skipping check",
				"runner_name", instance.Name)
			continue
		}

		if ok := runnerNames[instance.Name]; !ok {
			// Set pending_delete on DB field. Allow consolidate() to remove it.
			if _, err := r.setInstanceStatus(instance.Name, commonParams.InstancePendingDelete, nil); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to update runner",
					"runner_name", instance.Name)
				return fmt.Errorf("error updating runner: %w", err)
			}
		}
	}
	return nil
}

// reapTimedOutRunners will mark as pending_delete any runner that has a status
// of "running" in the provider, but that has not registered with Github, and has
// received no new updates in the configured timeout interval.
func (r *basePoolManager) reapTimedOutRunners(runners []forgeRunner) error {
	dbInstances, err := r.store.ListEntityInstances(r.ctx, r.entity)
	if err != nil {
		return fmt.Errorf("error fetching instances from db: %w", err)
	}

	runnersByName := map[string]forgeRunner{}
	for _, run := range runners {
		if !isManagedRunner(labelsFromRunner(run), r.controllerInfo.ControllerID.String()) {
			slog.DebugContext(
				r.ctx, "runner is not managed by a pool we manage",
				"runner_name", run.Name)
			continue
		}
		runnersByName[run.Name] = run
	}

	for _, instance := range dbInstances {
		if instance.ScaleSetID != 0 {
			// ignore scale set instances.
			continue
		}

		slog.DebugContext(
			r.ctx, "attempting to lock instance",
			"runner_name", instance.Name)
		lockAcquired := locking.TryLock(instance.Name, r.consumerID)
		if !lockAcquired {
			slog.DebugContext(
				r.ctx, "failed to acquire lock for instance",
				"runner_name", instance.Name)
			continue
		}
		defer locking.Unlock(instance.Name, false)

		pool, err := r.store.GetEntityPool(r.ctx, r.entity, instance.PoolID)
		if err != nil {
			return fmt.Errorf("error fetching instance pool info: %w", err)
		}
		if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
			continue
		}

		// There are 3 cases (currently) where we consider a runner as timed out:
		//   * The runner never joined github within the pool timeout
		//   * The runner managed to join github, but the setup process failed later and the runner
		//     never started on the instance.
		//   * A JIT config was created, but the runner never joined github.
		if runner, ok := runnersByName[instance.Name]; !ok || runner.Status == "offline" {
			slog.InfoContext(
				r.ctx, "reaping timed-out/failed runner",
				"runner_name", instance.Name)
			if err := r.DeleteRunner(instance, false, false); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to update runner status",
					"runner_name", instance.Name)
				return fmt.Errorf("error updating runner: %w", err)
			}
		}
	}
	return nil
}

// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []forgeRunner) error {
	poolInstanceCache := map[string][]commonParams.ProviderInstance{}
	g, ctx := errgroup.WithContext(r.ctx)
	for _, runner := range runners {
		if !isManagedRunner(labelsFromRunner(runner), r.controllerInfo.ControllerID.String()) {
			slog.DebugContext(
				r.ctx, "runner is not managed by a pool we manage",
				"runner_name", runner.Name)
			continue
		}

		status := runner.Status
		if status != "offline" {
			// Runner is online. Ignore it.
			continue
		}

		dbInstance, err := r.store.GetInstance(r.ctx, runner.Name)
		if err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				return fmt.Errorf("error fetching instance from DB: %w", err)
			}
			// We no longer have a DB entry for this instance, and the runner appears offline in github.
			// Previous forceful removal may have failed?
			slog.InfoContext(
				r.ctx, "Runner has no database entry in garm, removing from github",
				"runner_name", runner.Name)
			if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.ID); err != nil {
				// Removed in the meantime?
				if errors.Is(err, runnerErrors.ErrNotFound) {
					continue
				}
				return fmt.Errorf("error removing runner: %w", err)
			}
			continue
		}
		if dbInstance.ScaleSetID != 0 {
			// ignore scale set instances.
			continue
		}

		switch dbInstance.Status {
		case commonParams.InstancePendingDelete, commonParams.InstanceDeleting:
			// already marked for deletion or is in the process of being deleted.
			// Let consolidate take care of it.
			continue
		case commonParams.InstancePendingCreate, commonParams.InstanceCreating:
			// instance is still being created. We give it a chance to finish.
			slog.DebugContext(
				r.ctx, "instance is still being created, give it a chance to finish",
				"runner_name", dbInstance.Name)
			continue
		case commonParams.InstanceRunning:
			// this check is not strictly needed, but can help avoid unnecessary strain on the provider.
			// At worst, we will have a runner that is offline in github for 5 minutes before we reap it.
			if time.Since(dbInstance.UpdatedAt).Minutes() < 5 {
				// instance was updated recently. We give it a chance to register itself in github.
				slog.DebugContext(
					r.ctx, "instance was updated recently, skipping check",
					"runner_name", dbInstance.Name)
				continue
			}
		}

		pool, err := r.store.GetEntityPool(r.ctx, r.entity, dbInstance.PoolID)
		if err != nil {
			return fmt.Errorf("error fetching pool: %w", err)
		}

		// check if the provider still has the instance.
		provider, ok := r.providers[dbInstance.ProviderName]
		if !ok {
			return fmt.Errorf("unknown provider %s for pool %s", dbInstance.ProviderName, dbInstance.PoolID)
		}

		var poolInstances []commonParams.ProviderInstance
		poolInstances, ok = poolInstanceCache[dbInstance.PoolID]
		if !ok {
			slog.DebugContext(
				r.ctx, "updating instances cache for pool",
				"pool_id", pool.ID)
			listInstancesParams := common.ListInstancesParams{
				ListInstancesV011: common.ListInstancesV011Params{
					ProviderBaseParams: r.getProviderBaseParams(pool),
				},
			}
			poolInstances, err = provider.ListInstances(r.ctx, pool.ID, listInstancesParams)
			if err != nil {
				return fmt.Errorf("error fetching instances for pool %s: %w", dbInstance.PoolID, err)
			}
			poolInstanceCache[dbInstance.PoolID] = poolInstances
		}

		lockAcquired := locking.TryLock(dbInstance.Name, r.consumerID)
		if !lockAcquired {
			slog.DebugContext(
				r.ctx, "failed to acquire lock for instance",
				"runner_name", dbInstance.Name)
			continue
		}

		// See: https://golang.org/doc/faq#closures_and_goroutines
		runner := runner
		g.Go(func() error {
			deleteMux := false
			defer func() {
				locking.Unlock(dbInstance.Name, deleteMux)
			}()
			providerInstance, ok := instanceInList(dbInstance.Name, poolInstances)
			if !ok {
				// The runner instance is no longer on the provider, and it appears offline in github.
				// It should be safe to force remove it.
				slog.InfoContext(
					r.ctx, "Runner instance is no longer on the provider, removing from github",
					"runner_name", dbInstance.Name)
				if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.ID); err != nil {
					// Removed in the meantime?
					if errors.Is(err, runnerErrors.ErrNotFound) {
						slog.DebugContext(
							r.ctx, "runner disappeared from github",
							"runner_name", dbInstance.Name)
					} else {
						return fmt.Errorf("error removing runner from github: %w", err)
					}
				}
				// Remove the database entry for the runner.
				slog.InfoContext(
					r.ctx, "Removing from database",
					"runner_name", dbInstance.Name)
				if err := r.store.DeleteInstance(ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
					return fmt.Errorf("error removing runner from database: %w", err)
				}
				deleteMux = true
				return nil
			}

			if providerInstance.Status == commonParams.InstanceRunning {
				// instance is running, but github reports the runner as offline. Log the event.
				// This scenario may require manual intervention.
				// Perhaps it just came online and github did not yet change its status?
				slog.WarnContext(
					r.ctx, "instance is online but github reports runner as offline",
					"runner_name", dbInstance.Name)
				return nil
			}

			slog.InfoContext(
				r.ctx, "instance was found in stopped state; starting",
				"runner_name", dbInstance.Name)

			startParams := common.StartParams{
				StartV011: common.StartV011Params{
					ProviderBaseParams: r.getProviderBaseParams(pool),
				},
			}
			if err := provider.Start(r.ctx, dbInstance.ProviderID, startParams); err != nil {
				return fmt.Errorf("error starting instance %s: %w", dbInstance.ProviderID, err)
			}
			return nil
		})
	}
	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("error removing orphaned github runners: %w", err)
	}
	return nil
}
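
// waitForErrorGroupOrContextCancelled waits for the error group to finish or for
// the manager context to be cancelled, whichever happens first.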
func (r *basePoolManager) waitForErrorGroupOrContextCancelled(g *errgroup.Group) error {
	if g == nil {
		return nil
	}

	done := make(chan error, 1)
	go func() {
		waitErr := g.Wait()
		done <- waitErr
	}()

	select {
	case err := <-done:
		return err
	case <-r.ctx.Done():
		return r.ctx.Err()
	}
}
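
// setInstanceRunnerStatus updates the runner (workload) status of the named
// instance in the database and returns the updated instance.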
func (r *basePoolManager) setInstanceRunnerStatus(runnerName string, status params.RunnerStatus) (params.Instance, error) {
	updateParams := params.UpdateInstanceParams{
		RunnerStatus: status,
	}
	instance, err := r.store.UpdateInstance(r.ctx, runnerName, updateParams)
	if err != nil {
		return params.Instance{}, fmt.Errorf("error updating runner state: %w", err)
	}
	return instance, nil
}
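
// setInstanceStatus updates the lifecycle status of the named instance in the
// database, optionally recording a provider fault, and returns the updated instance.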
func (r *basePoolManager) setInstanceStatus(runnerName string, status commonParams.InstanceStatus, providerFault []byte) (params.Instance, error) {
	updateParams := params.UpdateInstanceParams{
		Status:        status,
		ProviderFault: providerFault,
	}

	instance, err := r.store.UpdateInstance(r.ctx, runnerName, updateParams)
	if err != nil {
		return params.Instance{}, fmt.Errorf("error updating runner state: %w", err)
	}
	return instance, nil
}
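
// AddRunner creates a new instance record for the given pool. When the provider
// and forge support it, a JIT runner configuration is generated up front and
// recorded on the instance, along with the agent ID of the pre-registered runner.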
func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditionalLabels []string) (err error) {
	pool, err := r.store.GetEntityPool(r.ctx, r.entity, poolID)
	if err != nil {
		return fmt.Errorf("error fetching pool: %w", err)
	}

	provider, ok := r.providers[pool.ProviderName]
	if !ok {
		return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
	}

	name := fmt.Sprintf("%s-%s", pool.GetRunnerPrefix(), util.NewID())
	labels := r.getLabelsForInstance(pool)

	jitConfig := make(map[string]string)
	var runner *github.Runner

	if !provider.DisableJITConfig() && r.entity.Credentials.ForgeType != params.GiteaEndpointType {
		jitConfig, runner, err = r.ghcli.GetEntityJITConfig(ctx, name, pool, labels)
		if err != nil {
			return fmt.Errorf("failed to generate JIT config: %w", err)
		}
	}

	createParams := params.CreateInstanceParams{
		Name:              name,
		Status:            commonParams.InstancePendingCreate,
		RunnerStatus:      params.RunnerPending,
		OSArch:            pool.OSArch,
		OSType:            pool.OSType,
		CallbackURL:       r.controllerInfo.CallbackURL,
		MetadataURL:       r.controllerInfo.MetadataURL,
		CreateAttempt:     1,
		GitHubRunnerGroup: pool.GitHubRunnerGroup,
		AditionalLabels:   aditionalLabels,
		JitConfiguration:  jitConfig,
	}

	if runner != nil {
		createParams.AgentID = runner.GetID()
	}

	instance, err := r.store.CreateInstance(r.ctx, poolID, createParams)
	if err != nil {
		return fmt.Errorf("error creating instance: %w", err)
	}

	defer func() {
		if err != nil {
			if instance.ID != "" {
				if err := r.DeleteRunner(instance, false, false); err != nil {
					slog.With(slog.Any("error", err)).ErrorContext(
						ctx, "failed to cleanup instance",
						"runner_name", instance.Name)
				}
			}

			if runner != nil {
				runnerCleanupErr := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
				if runnerCleanupErr != nil {
					slog.With(slog.Any("error", runnerCleanupErr)).ErrorContext(
						ctx, "failed to remove runner",
						"gh_runner_id", runner.GetID())
				}
			}
		}
	}()

	return nil
}
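
// Status reports whether the pool manager is running and, if it is not, the
// reason it stopped.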
func (r *basePoolManager) Status() params.PoolManagerStatus {
	r.mux.Lock()
	defer r.mux.Unlock()
	return params.PoolManagerStatus{
		IsRunning:     r.managerIsRunning,
		FailureReason: r.managerErrorReason,
	}
}
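
// waitForTimeoutOrCancelled sleeps for the given timeout, returning early if the
// daemon context is cancelled or this worker is stopped.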
func (r *basePoolManager) waitForTimeoutOrCancelled(timeout time.Duration) {
	slog.DebugContext(
		r.ctx, fmt.Sprintf("sleeping for %.2f minutes", timeout.Minutes()))
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-timer.C:
	case <-r.ctx.Done():
	case <-r.quit:
	}
}
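
// SetPoolRunningState records whether the pool manager is running, along with an
// optional failure reason.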
func (r *basePoolManager) SetPoolRunningState(isRunning bool, failureReason string) {
	r.mux.Lock()
	r.managerErrorReason = failureReason
	r.managerIsRunning = isRunning
	r.mux.Unlock()
}
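
// getLabelsForInstance returns the labels a runner should be registered with:
// the pool tags plus the controller ID and pool ID labels used to identify
// runners managed by this controller.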
func (r *basePoolManager) getLabelsForInstance(pool params.Pool) []string {
	labels := []string{}
	for _, tag := range pool.Tags {
		labels = append(labels, tag.Name)
	}
	labels = append(labels, r.controllerLabel())
	labels = append(labels, r.poolLabel(pool.ID))
	return labels
}
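
// addInstanceToProvider creates the compute resource for a previously recorded
// instance in the pool's provider, then updates the database record with the
// details reported by the provider. If creation fails, the partially created
// provider resource is cleaned up.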
func (r *basePoolManager) addInstanceToProvider(instance params.Instance) error {
	pool, err := r.store.GetEntityPool(r.ctx, r.entity, instance.PoolID)
	if err != nil {
		return fmt.Errorf("error fetching pool: %w", err)
	}

	provider, ok := r.providers[pool.ProviderName]
	if !ok {
		return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
	}

	jwtValidity := pool.RunnerTimeout()

	jwtToken, err := r.instanceTokenGetter.NewInstanceJWTToken(instance, r.entity, pool.PoolType(), jwtValidity)
	if err != nil {
		return fmt.Errorf("error fetching instance jwt token: %w", err)
	}

	hasJITConfig := len(instance.JitConfiguration) > 0

	bootstrapArgs := commonParams.BootstrapInstance{
		Name:              instance.Name,
		Tools:             r.tools,
		RepoURL:           r.entity.ForgeURL(),
		MetadataURL:       instance.MetadataURL,
		CallbackURL:       instance.CallbackURL,
		InstanceToken:     jwtToken,
		OSArch:            pool.OSArch,
		OSType:            pool.OSType,
		Flavor:            pool.Flavor,
		Image:             pool.Image,
		ExtraSpecs:        pool.ExtraSpecs,
		PoolID:            instance.PoolID,
		CACertBundle:      r.entity.Credentials.CABundle,
		GitHubRunnerGroup: instance.GitHubRunnerGroup,
		JitConfigEnabled:  hasJITConfig,
	}

	if !hasJITConfig {
		// We still need the labels here for situations where we don't have a JIT config generated.
		// This can happen if GARM is used against an instance of GHES older than version 3.10.
		// The labels field should be ignored by providers if JIT config is enabled.
		bootstrapArgs.Labels = r.getLabelsForInstance(pool)
	}

	var instanceIDToDelete string

	defer func() {
		if instanceIDToDelete != "" {
			deleteInstanceParams := common.DeleteInstanceParams{
				DeleteInstanceV011: common.DeleteInstanceV011Params{
					ProviderBaseParams: r.getProviderBaseParams(pool),
				},
			}
			if err := provider.DeleteInstance(r.ctx, instanceIDToDelete, deleteInstanceParams); err != nil {
				if !errors.Is(err, runnerErrors.ErrNotFound) {
					slog.With(slog.Any("error", err)).ErrorContext(
						r.ctx, "failed to cleanup instance",
						"provider_id", instanceIDToDelete)
				}
			}
		}
	}()

	createInstanceParams := common.CreateInstanceParams{
		CreateInstanceV011: common.CreateInstanceV011Params{
			ProviderBaseParams: r.getProviderBaseParams(pool),
		},
	}
	providerInstance, err := provider.CreateInstance(r.ctx, bootstrapArgs, createInstanceParams)
	if err != nil {
		instanceIDToDelete = instance.Name
		return fmt.Errorf("error creating instance: %w", err)
	}

	if providerInstance.Status == commonParams.InstanceError {
		instanceIDToDelete = instance.ProviderID
		if instanceIDToDelete == "" {
			instanceIDToDelete = instance.Name
		}
	}

	updateInstanceArgs := r.updateArgsFromProviderInstance(providerInstance)
	if _, err := r.store.UpdateInstance(r.ctx, instance.Name, updateInstanceArgs); err != nil {
		return fmt.Errorf("error updating instance: %w", err)
	}
	return nil
}

// paramsWorkflowJobToParamsJob returns a params.Job from a params.WorkflowJob, and additionally determines
// if the runner belongs to this pool or not. It will always return a valid params.Job, even if it errs out.
// This allows us to still update the job in the database, even if we determined that it wasn't necessarily meant
// for this pool.
// If garm manages multiple hierarchies (repos, org, enterprise) which involve the same repo, we will get a hook
// whenever a job involving our repo triggers a hook. So even if the job is picked up by a runner at the enterprise
// level, the repo and org still get a hook.
// We even get a hook if a particular job is picked up by a GitHub hosted runner. We don't know who will pick up the job
// until the "in_progress" event is sent and we can see which runner picked it up.
//
// We save the details of that job at every level, because we want to at least update the status of the job. We make
// decisions based on the status of saved jobs. A "queued" job will prompt garm to search for an appropriate pool
// and spin up a runner there if no other idle runner exists to pick it up.
func (r *basePoolManager) paramsWorkflowJobToParamsJob(job params.WorkflowJob) (params.Job, error) {
	asUUID, err := uuid.Parse(r.ID())
	if err != nil {
		return params.Job{}, fmt.Errorf("error parsing pool ID as UUID: %w", err)
	}

	jobParams := params.Job{
		WorkflowJobID:   job.WorkflowJob.ID,
		Action:          job.Action,
		RunID:           job.WorkflowJob.RunID,
		Status:          job.WorkflowJob.Status,
		Conclusion:      job.WorkflowJob.Conclusion,
		StartedAt:       job.WorkflowJob.StartedAt,
		CompletedAt:     job.WorkflowJob.CompletedAt,
		Name:            job.WorkflowJob.Name,
		GithubRunnerID:  job.WorkflowJob.RunnerID,
		RunnerName:      job.WorkflowJob.RunnerName,
		RunnerGroupID:   job.WorkflowJob.RunnerGroupID,
		RunnerGroupName: job.WorkflowJob.RunnerGroupName,
		RepositoryName:  job.Repository.Name,
		RepositoryOwner: job.Repository.Owner.Login,
		Labels:          job.WorkflowJob.Labels,
	}

	switch r.entity.EntityType {
	case params.ForgeEntityTypeEnterprise:
		jobParams.EnterpriseID = &asUUID
	case params.ForgeEntityTypeRepository:
		jobParams.RepoID = &asUUID
	case params.ForgeEntityTypeOrganization:
		jobParams.OrgID = &asUUID
	default:
		return jobParams, fmt.Errorf("unknown pool type: %s", r.entity.EntityType)
	}

	return jobParams, nil
}
|
|
|
|
|
|
2022-10-20 17:22:47 +03:00
|
|
|
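// poolLabel returns the "<prefix>=<pool ID>" label used to tag a runner with the
// pool it belongs to; controllerLabel below does the same for the controller ID.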
func (r *basePoolManager) poolLabel(poolID string) string {
	return fmt.Sprintf("%s=%s", poolIDLabelprefix, poolID)
}

func (r *basePoolManager) controllerLabel() string {
	return fmt.Sprintf("%s=%s", controllerLabelPrefix, r.controllerInfo.ControllerID.String())
}

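// updateArgsFromProviderInstance maps the fields reported by the provider
// (provider ID, OS details, addresses, status and any provider fault) onto an
// UpdateInstanceParams value that can be persisted to the store.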
func (r *basePoolManager) updateArgsFromProviderInstance(providerInstance commonParams.ProviderInstance) params.UpdateInstanceParams {
	return params.UpdateInstanceParams{
		ProviderID:    providerInstance.ProviderID,
		OSName:        providerInstance.OSName,
		OSVersion:     providerInstance.OSVersion,
		Addresses:     providerInstance.Addresses,
		Status:        providerInstance.Status,
		ProviderFault: providerInstance.ProviderFault,
	}
}

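// scaleDownOnePool removes surplus idle runners from a single pool. Runners
// qualify for removal only if they are running, report as idle and have not been
// updated within the short grace period checked below. After a scale down, queued
// jobs older than 10 minutes that match this pool's labels are considered stale
// and removed from the database.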
func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool) error {
	slog.DebugContext(
		ctx, "scaling down pool",
		"pool_id", pool.ID)
	if !pool.Enabled {
		slog.DebugContext(
			ctx, "pool is disabled, skipping scale down",
			"pool_id", pool.ID)
		return nil
	}

	existingInstances, err := r.store.ListPoolInstances(r.ctx, pool.ID)
	if err != nil {
		return fmt.Errorf("failed to list instances for pool %s: %w", pool.ID, err)
	}

	idleWorkers := []params.Instance{}
	for _, inst := range existingInstances {
		// Idle runners that have been spawned and are still idle after a couple of minutes
		// are taken into consideration for scale-down. This grace period prevents a situation
		// where a "queued" workflow triggers the creation of a new idle runner, and this
		// routine reaps an idle runner before it has a chance to pick up a job.
		if inst.RunnerStatus == params.RunnerIdle && inst.Status == commonParams.InstanceRunning && time.Since(inst.UpdatedAt).Minutes() > 2 {
			idleWorkers = append(idleWorkers, inst)
		}
	}

	if len(idleWorkers) == 0 {
		return nil
	}

	surplus := float64(len(idleWorkers) - pool.MinIdleRunnersAsInt())

	if surplus <= 0 {
		return nil
	}

	scaleDownFactor := 0.5 // could be configurable
	numScaleDown := int(math.Ceil(surplus * scaleDownFactor))

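	// For illustration: with 7 idle runners and MinIdleRunners set to 2, the
	// surplus is 5, so ceil(5 * 0.5) = 3 runners are removed on this pass and the
	// remainder is handled on subsequent runs of the scale down loop.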
	if numScaleDown <= 0 || numScaleDown > len(idleWorkers) {
		return fmt.Errorf("invalid number of instances to scale down: %v, check your scaleDownFactor: %v", numScaleDown, scaleDownFactor)
	}

	g, _ := errgroup.WithContext(ctx)

	for _, instanceToDelete := range idleWorkers[:numScaleDown] {
		instanceToDelete := instanceToDelete

		lockAcquired := locking.TryLock(instanceToDelete.Name, r.consumerID)
		if !lockAcquired {
			slog.ErrorContext(
				ctx, "failed to acquire lock for instance",
				"runner_name", instanceToDelete.Name)
			continue
		}
		defer locking.Unlock(instanceToDelete.Name, false)

		g.Go(func() error {
			slog.InfoContext(
				ctx, "scaling down idle worker from pool",
				"runner_name", instanceToDelete.Name,
				"pool_id", pool.ID)
			if err := r.DeleteRunner(instanceToDelete, false, false); err != nil {
				return fmt.Errorf("failed to delete instance %s: %w", instanceToDelete.ID, err)
			}
			return nil
		})
	}

	if numScaleDown > 0 {
		// We just scaled down a runner for this pool. That means that if we have jobs that are
		// still queued in our DB, and those jobs should match this pool but have not been picked
		// up by a runner, they are most likely stale and can be removed. For now, we can simply
		// remove jobs older than 10 minutes.
		//
		// nolint:golangci-lint,godox
		// TODO: should probably allow additional filters to list functions. Would help to filter by date
		// instead of returning a bunch of results and filtering manually.
		queued, err := r.store.ListEntityJobsByStatus(r.ctx, r.entity.EntityType, r.entity.ID, params.JobStatusQueued)
		if err != nil && !errors.Is(err, runnerErrors.ErrNotFound) {
			return fmt.Errorf("error listing queued jobs: %w", err)
		}

		for _, job := range queued {
			if time.Since(job.CreatedAt).Minutes() > 10 && pool.HasRequiredLabels(job.Labels) {
				if err := r.store.DeleteJob(ctx, job.WorkflowJobID); err != nil && !errors.Is(err, runnerErrors.ErrNotFound) {
					slog.With(slog.Any("error", err)).ErrorContext(
						ctx, "failed to delete job",
						"job_id", job.WorkflowJobID)
				}
			}
		}
	}

	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("failed to scale down pool %s: %w", pool.ID, err)
	}
	return nil
}

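// addRunnerToPool adds a single runner to the given pool, after checking that the
// pool is enabled and that the pool's maximum runner count has not been reached.
// Any additional labels are passed through to AddRunner.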
func (r *basePoolManager) addRunnerToPool(pool params.Pool, additionalLabels []string) error {
	if !pool.Enabled {
		return fmt.Errorf("pool %s is disabled", pool.ID)
	}

	poolInstanceCount, err := r.store.PoolInstanceCount(r.ctx, pool.ID)
	if err != nil {
		return fmt.Errorf("failed to list pool instances: %w", err)
	}

	if poolInstanceCount >= int64(pool.MaxRunnersAsInt()) {
		return fmt.Errorf("max workers (%d) reached for pool %s", pool.MaxRunners, pool.ID)
	}

	if err := r.AddRunner(r.ctx, pool.ID, additionalLabels); err != nil {
		return fmt.Errorf("failed to add new instance for pool %s: %w", pool.ID, err)
	}
	return nil
}

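// ensureIdleRunnersForOnePool tops up a pool with idle runners until
// MinIdleRunners is satisfied, without exceeding MaxRunners. For illustration:
// with MinIdleRunners=3, MaxRunners=5 and 4 existing instances of which only 1 is
// idle or pending, the needed delta is 2, but the projected count of 6 exceeds
// the maximum, so only 1 new runner is created.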
func (r *basePoolManager) ensureIdleRunnersForOnePool(pool params.Pool) error {
	if !pool.Enabled || pool.MinIdleRunners == 0 {
		return nil
	}

	existingInstances, err := r.store.ListPoolInstances(r.ctx, pool.ID)
	if err != nil {
		return fmt.Errorf("failed to ensure minimum idle workers for pool %s: %w", pool.ID, err)
	}

	if uint(len(existingInstances)) >= pool.MaxRunners {
		slog.DebugContext(
			r.ctx, "max workers reached for pool, skipping idle worker creation",
			"max_runners", pool.MaxRunners,
			"pool_id", pool.ID)
		return nil
	}

	idleOrPendingWorkers := []params.Instance{}
	for _, inst := range existingInstances {
		if inst.RunnerStatus != params.RunnerActive && inst.RunnerStatus != params.RunnerTerminated {
			idleOrPendingWorkers = append(idleOrPendingWorkers, inst)
		}
	}

	var required int
	if len(idleOrPendingWorkers) < pool.MinIdleRunnersAsInt() {
		// get the needed delta.
		required = pool.MinIdleRunnersAsInt() - len(idleOrPendingWorkers)

		projectedInstanceCount := len(existingInstances) + required

		var projected uint
		if projectedInstanceCount > 0 {
			projected = uint(projectedInstanceCount)
		}
		if projected > pool.MaxRunners {
			// ensure we don't go above max workers
			delta := projectedInstanceCount - pool.MaxRunnersAsInt()
			required -= delta
		}
	}

	for i := 0; i < required; i++ {
		slog.InfoContext(
			r.ctx, "adding new idle worker to pool",
			"pool_id", pool.ID)
		if err := r.AddRunner(r.ctx, pool.ID, nil); err != nil {
			return fmt.Errorf("failed to add new instance for pool %s: %w", pool.ID, err)
		}
	}
	return nil
}

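// retryFailedInstancesForOnePool re-queues instances that ended up in an error
// state. For every errored instance that has not yet exhausted maxCreateAttempts,
// any leftover resources are first removed from the provider, then the instance
// is set back to pending_create (incrementing CreateAttempt) so the normal
// creation loop picks it up again.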
func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, pool params.Pool) error {
	if !pool.Enabled {
		return nil
	}
	slog.DebugContext(
		ctx, "running retry failed instances for pool",
		"pool_id", pool.ID)

	existingInstances, err := r.store.ListPoolInstances(r.ctx, pool.ID)
	if err != nil {
		return fmt.Errorf("failed to list instances for pool %s: %w", pool.ID, err)
	}

	g, errCtx := errgroup.WithContext(ctx)
	for _, instance := range existingInstances {
		instance := instance

		if instance.Status != commonParams.InstanceError {
			continue
		}
		if instance.CreateAttempt >= maxCreateAttempts {
			continue
		}

		slog.DebugContext(
			ctx, "attempting to retry failed instance",
			"runner_name", instance.Name)
		lockAcquired := locking.TryLock(instance.Name, r.consumerID)
		if !lockAcquired {
			slog.DebugContext(
				ctx, "failed to acquire lock for instance",
				"runner_name", instance.Name)
			continue
		}

		g.Go(func() error {
			defer locking.Unlock(instance.Name, false)
			slog.DebugContext(
				ctx, "attempting to clean up any previous instance",
				"runner_name", instance.Name)
			// nolint:golangci-lint,godox
			// NOTE(gabriel-samfira): this is done in parallel. If there are many failed instances
			// this has the potential to create many API requests to the target provider.
			// TODO(gabriel-samfira): implement request throttling.
			if err := r.deleteInstanceFromProvider(errCtx, instance); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					ctx, "failed to delete instance from provider",
					"runner_name", instance.Name)
				// Bail here, otherwise we risk creating multiple failing instances, and losing track
				// of them. If Create instance failed to return a proper provider ID, we rely on the
				// name to delete the instance. If we don't bail here, and end up with multiple
				// instances with the same name, using the name to clean up failed instances will fail
				// on any subsequent call, unless the external or native provider takes into account
				// non-unique names and loops over all of them. Something which is extremely hacky and
				// which we would rather avoid.
				return err
			}
			slog.DebugContext(
				ctx, "cleanup of previously failed instance complete",
				"runner_name", instance.Name)
			// nolint:golangci-lint,godox
			// TODO(gabriel-samfira): Incrementing CreateAttempt should be done within a transaction.
			// It's fairly safe to do here (for now), as there should be no other code path that updates
			// an instance in this state.
			tokenFetched := len(instance.JitConfiguration) > 0
			updateParams := params.UpdateInstanceParams{
				CreateAttempt: instance.CreateAttempt + 1,
				TokenFetched:  &tokenFetched,
				Status:        commonParams.InstancePendingCreate,
				RunnerStatus:  params.RunnerPending,
			}
			slog.DebugContext(
				ctx, "queueing previously failed instance for retry",
				"runner_name", instance.Name)
			// Set instance to pending create and wait for retry.
			if _, err := r.store.UpdateInstance(r.ctx, instance.Name, updateParams); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					ctx, "failed to update runner status",
					"runner_name", instance.Name)
			}
			return nil
		})
	}
	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("failed to retry failed instances for pool %s: %w", pool.ID, err)
	}
	return nil
}

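// retryFailedInstances runs retryFailedInstancesForOnePool for every pool that
// belongs to this entity, fanning the work out through an errgroup.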
func (r *basePoolManager) retryFailedInstances() error {
	pools := cache.GetEntityPools(r.entity.ID)
	g, ctx := errgroup.WithContext(r.ctx)
	for _, pool := range pools {
		pool := pool
		g.Go(func() error {
			if err := r.retryFailedInstancesForOnePool(ctx, pool); err != nil {
				return fmt.Errorf("retrying failed instances for pool %s: %w", pool.ID, err)
			}
			return nil
		})
	}

	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("retrying failed instances: %w", err)
	}

	return nil
}

func (r *basePoolManager) scaleDown() error {
	pools := cache.GetEntityPools(r.entity.ID)
	g, ctx := errgroup.WithContext(r.ctx)
	for _, pool := range pools {
		pool := pool
		g.Go(func() error {
			slog.DebugContext(
				ctx, "running scale down for pool",
				"pool_id", pool.ID)
			return r.scaleDownOnePool(ctx, pool)
		})
	}
	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("failed to scale down: %w", err)
	}
	return nil
}

func (r *basePoolManager) ensureMinIdleRunners() error {
	pools := cache.GetEntityPools(r.entity.ID)
	g, _ := errgroup.WithContext(r.ctx)
	for _, pool := range pools {
		pool := pool
		g.Go(func() error {
			return r.ensureIdleRunnersForOnePool(pool)
		})
	}

	if err := r.waitForErrorGroupOrContextCancelled(g); err != nil {
		return fmt.Errorf("failed to ensure minimum idle workers: %w", err)
	}
	return nil
}

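// deleteInstanceFromProvider asks the provider that created an instance to remove
// it. If the provider never returned a provider ID for the instance, the instance
// name is used as the identifier instead.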
func (r *basePoolManager) deleteInstanceFromProvider(ctx context.Context, instance params.Instance) error {
	pool, err := r.store.GetEntityPool(r.ctx, r.entity, instance.PoolID)
	if err != nil {
		return fmt.Errorf("error fetching pool: %w", err)
	}

	provider, ok := r.providers[instance.ProviderName]
	if !ok {
		return fmt.Errorf("unknown provider %s for pool %s", instance.ProviderName, instance.PoolID)
	}

	identifier := instance.ProviderID
	if identifier == "" {
		// The provider did not return a provider ID; fall back to the instance name.
		identifier = instance.Name
	}

	slog.DebugContext(
		ctx, "calling delete instance on provider",
		"runner_name", instance.Name,
		"provider_id", identifier)

	deleteInstanceParams := common.DeleteInstanceParams{
		DeleteInstanceV011: common.DeleteInstanceV011Params{
			ProviderBaseParams: r.getProviderBaseParams(pool),
		},
	}
	if err := provider.DeleteInstance(ctx, identifier, deleteInstanceParams); err != nil {
		return fmt.Errorf("error removing instance: %w", err)
	}

	return nil
}

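// sleepWithCancel sleeps for the given duration, but returns early (reporting
// canceled == true) if the pool manager is stopped or its context is canceled.
// A zero duration returns immediately without sleeping.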
func (r *basePoolManager) sleepWithCancel(sleepTime time.Duration) (canceled bool) {
	if sleepTime == 0 {
		return false
	}
	ticker := time.NewTicker(sleepTime)
	defer ticker.Stop()

	select {
	case <-ticker.C:
		return false
	case <-r.quit:
	case <-r.ctx.Done():
	}
	return true
}

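// deletePendingInstances reaps instances marked as pending_delete or
// pending_force_delete. Each removal runs in its own goroutine with a small
// random jitter to avoid hammering the database, and failures are recorded in a
// backoff so the same instance is not retried immediately. Force-deleted
// instances are removed from the database even if the provider call fails.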
func (r *basePoolManager) deletePendingInstances() error {
	instances, err := r.store.ListEntityInstances(r.ctx, r.entity)
	if err != nil {
		return fmt.Errorf("failed to fetch instances from store: %w", err)
	}

	slog.DebugContext(
		r.ctx, "removing instances in pending_delete")
	for _, instance := range instances {
		if instance.ScaleSetID != 0 {
			// instance is part of a scale set. Skip.
			continue
		}

		if instance.Status != commonParams.InstancePendingDelete && instance.Status != commonParams.InstancePendingForceDelete {
			// not in pending_delete status. Skip.
			continue
		}

		slog.InfoContext(
			r.ctx, "removing instance from pool",
			"runner_name", instance.Name,
			"pool_id", instance.PoolID)
		lockAcquired := locking.TryLock(instance.Name, r.consumerID)
		if !lockAcquired {
			slog.InfoContext(
				r.ctx, "failed to acquire lock for instance",
				"runner_name", instance.Name)
			continue
		}

		shouldProcess, deadline := r.backoff.ShouldProcess(instance.Name)
		if !shouldProcess {
			slog.DebugContext(
				r.ctx, "backoff in effect for instance",
				"runner_name", instance.Name, "deadline", deadline)
			locking.Unlock(instance.Name, false)
			continue
		}

		go func(instance params.Instance) (err error) {
			// Prevent a thundering herd. This should alleviate some of the
			// "database is locked" errors in sqlite3.
			num, err := rand.Int(rand.Reader, big.NewInt(2000))
			if err != nil {
				return fmt.Errorf("failed to generate random number: %w", err)
			}
			jitter := time.Duration(num.Int64()) * time.Millisecond
			if canceled := r.sleepWithCancel(jitter); canceled {
				return nil
			}

			currentStatus := instance.Status
			deleteMux := false
			defer func() {
				locking.Unlock(instance.Name, deleteMux)
				if deleteMux {
					// deleteMux is set only when the instance was successfully removed.
					// We can use it as a marker to signal that the backoff is no longer
					// needed.
					r.backoff.Delete(instance.Name)
				}
			}()
			defer func(instance params.Instance) {
				if err != nil {
					slog.With(slog.Any("error", err)).ErrorContext(
						r.ctx, "failed to remove instance",
						"runner_name", instance.Name)
					// failed to remove from provider. Set status to previous value, which will retry
					// the operation.
					if _, err := r.setInstanceStatus(instance.Name, currentStatus, []byte(err.Error())); err != nil {
						slog.With(slog.Any("error", err)).ErrorContext(
							r.ctx, "failed to update runner status",
							"runner_name", instance.Name)
					}
					r.backoff.RecordFailure(instance.Name)
				}
			}(instance)

			if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceDeleting, nil); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to update runner status",
					"runner_name", instance.Name)
				return err
			}

			slog.DebugContext(
				r.ctx, "removing instance from provider",
				"runner_name", instance.Name)
			err = r.deleteInstanceFromProvider(r.ctx, instance)
			if err != nil {
				if currentStatus != commonParams.InstancePendingForceDelete {
					return fmt.Errorf("failed to remove instance from provider: %w", err)
				}
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to remove instance from provider (continuing anyway)",
					"instance", instance.Name)
			}
			slog.InfoContext(
				r.ctx, "removing instance from database",
				"runner_name", instance.Name)
			if deleteErr := r.store.DeleteInstance(r.ctx, instance.PoolID, instance.Name); deleteErr != nil {
				return fmt.Errorf("failed to delete instance from database: %w", deleteErr)
			}
			deleteMux = true
			slog.InfoContext(
				r.ctx, "instance was successfully removed",
				"runner_name", instance.Name)
			return nil
		}(instance) //nolint
	}

	return nil
}

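// addPendingInstances picks up instances in pending_create, transitions them to
// creating and spawns a goroutine that asks the provider to build them. The
// status change happens before the goroutine starts so that a subsequent run of
// this loop does not try to create the same runner twice.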
func (r *basePoolManager) addPendingInstances() error {
	// nolint:golangci-lint,godox
	// TODO: filter instances by status.
	instances, err := r.store.ListEntityInstances(r.ctx, r.entity)
	if err != nil {
		return fmt.Errorf("failed to fetch instances from store: %w", err)
	}
	for _, instance := range instances {
		if instance.ScaleSetID != 0 {
			// instance is part of a scale set. Skip.
			continue
		}

		if instance.Status != commonParams.InstancePendingCreate {
			// not in pending_create status. Skip.
			continue
		}

		slog.DebugContext(
			r.ctx, "attempting to acquire lock for instance",
			"runner_name", instance.Name,
			"action", "create_pending")
		lockAcquired := locking.TryLock(instance.Name, r.consumerID)
		if !lockAcquired {
			slog.DebugContext(
				r.ctx, "failed to acquire lock for instance",
				"runner_name", instance.Name)
			continue
		}

		// Set the instance to "creating" before launching the goroutine. This will ensure that addPendingInstances()
		// won't attempt to create the runner a second time.
		if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceCreating, nil); err != nil {
			slog.With(slog.Any("error", err)).ErrorContext(
				r.ctx, "failed to update runner status",
				"runner_name", instance.Name)
			locking.Unlock(instance.Name, false)
			// We failed to transition the instance to "creating". Skip it for now; garm will retry
			// creating this instance when the loop runs again. Proceeding anyway could leave us
			// with multiple instances for the same runner.
			continue
		}

		go func(instance params.Instance) {
			defer locking.Unlock(instance.Name, false)
			slog.InfoContext(
				r.ctx, "creating instance in pool",
				"runner_name", instance.Name,
				"pool_id", instance.PoolID)
			if err := r.addInstanceToProvider(instance); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to add instance to provider",
					"runner_name", instance.Name)
				errAsBytes := []byte(err.Error())
				if _, statusErr := r.setInstanceStatus(instance.Name, commonParams.InstanceError, errAsBytes); statusErr != nil {
					slog.With(slog.Any("error", statusErr)).ErrorContext(
						r.ctx, "failed to update runner status",
						"runner_name", instance.Name)
				}
				slog.With(slog.Any("error", err)).ErrorContext(
					r.ctx, "failed to create instance in provider",
					"runner_name", instance.Name)
			}
		}(instance)
	}

	return nil
}

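// Wait blocks until all of the pool manager's background workers have finished,
// or returns a timeout error if they have not stopped within 60 seconds.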
func (r *basePoolManager) Wait() error {
	done := make(chan struct{})
	timer := time.NewTimer(60 * time.Second)
	go func() {
		r.wg.Wait()
		timer.Stop()
		close(done)
	}()
	select {
	case <-done:
	case <-timer.C:
		return runnerErrors.NewTimeoutError("waiting for pool to stop")
	}
	return nil
}

func (r *basePoolManager) runnerCleanup() (err error) {
	slog.DebugContext(
		r.ctx, "running runner cleanup")
	runners, err := r.GetGithubRunners()
	if err != nil {
		return fmt.Errorf("failed to fetch github runners: %w", err)
	}

	if err := r.reapTimedOutRunners(runners); err != nil {
		return fmt.Errorf("failed to reap timed out runners: %w", err)
	}

	if err := r.cleanupOrphanedRunners(runners); err != nil {
		return fmt.Errorf("failed to cleanup orphaned runners: %w", err)
	}

	return nil
}

func (r *basePoolManager) cleanupOrphanedRunners(runners []forgeRunner) error {
	if err := r.cleanupOrphanedProviderRunners(runners); err != nil {
		return fmt.Errorf("error cleaning orphaned instances: %w", err)
	}

	if err := r.cleanupOrphanedGithubRunners(runners); err != nil {
		return fmt.Errorf("error cleaning orphaned github runners: %w", err)
	}

	return nil
}

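// Start launches the pool manager's background loops. Tool information is
// fetched first; once that initial update completes (and unless the manager is
// stopped in the meantime), the cleanup, scale down, consolidation and job
// queue consumer loops are started.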
func (r *basePoolManager) Start() error {
|
Add force delete runner
This branch adds the ability to forcefully remove a runner from GARM.
When the operator wishes to manually remove a runner, the workflow is as
follows:
* Check that the runner exists in GitHub. If it does, attempt to
remove it. An error here indicates that the runner may be processing
a job. In this case, we don't continue and the operator gets immediate
feedback from the API.
* Mark the runner in the database as pending_delete
* Allow the consolidate loop to reap it from the provider and remove it
from the database.
Removing the instance from the provider is async. If the provider errs out,
GARM will keep trying to remove it in perpetuity until the provider succedes.
In situations where the provider is misconfigured, this will never happen, leaving
the instance in a permanent state of pending_delete.
A provider may fail for various reasons. Either credentials have expired, the
API endpoint has changed, the provider is misconfigured or the operator may just
have removed it from the config before cleaning up the runners. While some cases
are recoverable, some are not. We cannot have a situation in which we cannot clean
resources in garm because of a misconfiguration.
This change adds the pending_force_delete instance status. Instances marked with
this status, will be removed from GARM even if the provider reports an error.
The GARM cli has been modified to give new meaning to the --force-remove-runner
option. This option in the CLI is no longer mandatory. Instead, setting it will mark
the runner with the new pending_force_delete status. Omitting it will mark the runner
with the old status of pending_delete.
Fixes: #160
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2023-10-12 06:15:36 +00:00
|
|
|
initialToolUpdate := make(chan struct{}, 1)
|
|
|
|
|
go func() {
|
2024-03-01 19:10:30 +00:00
|
|
|
slog.Info("running initial tool update")
|
2025-05-14 21:09:02 +00:00
|
|
|
for {
|
|
|
|
|
slog.DebugContext(r.ctx, "waiting for tools to be available")
|
|
|
|
|
hasTools, stopped := r.waitForToolsOrCancel()
|
|
|
|
|
if stopped {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if hasTools {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-03-01 19:10:30 +00:00
|
|
|
if err := r.updateTools(); err != nil {
|
|
|
|
|
slog.With(slog.Any("error", err)).Error("failed to update tools")
|
|
|
|
|
}
|
Add force delete runner
This branch adds the ability to forcefully remove a runner from GARM.
When the operator wishes to manually remove a runner, the workflow is as
follows:
* Check that the runner exists in GitHub. If it does, attempt to
remove it. An error here indicates that the runner may be processing
a job. In this case, we don't continue and the operator gets immediate
feedback from the API.
* Mark the runner in the database as pending_delete
* Allow the consolidate loop to reap it from the provider and remove it
from the database.
Removing the instance from the provider is async. If the provider errs out,
GARM will keep trying to remove it in perpetuity until the provider succedes.
In situations where the provider is misconfigured, this will never happen, leaving
the instance in a permanent state of pending_delete.
A provider may fail for various reasons. Either credentials have expired, the
API endpoint has changed, the provider is misconfigured or the operator may just
have removed it from the config before cleaning up the runners. While some cases
are recoverable, some are not. We cannot have a situation in which we cannot clean
resources in garm because of a misconfiguration.
This change adds the pending_force_delete instance status. Instances marked with
this status, will be removed from GARM even if the provider reports an error.
The GARM cli has been modified to give new meaning to the --force-remove-runner
option. This option in the CLI is no longer mandatory. Instead, setting it will mark
the runner with the new pending_force_delete status. Omitting it will mark the runner
with the old status of pending_delete.
Fixes: #160
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2023-10-12 06:15:36 +00:00
|
|
|
initialToolUpdate <- struct{}{}
|
|
|
|
|
}()
|
|
|
|
|
|
2024-06-20 15:28:56 +00:00
|
|
|
go r.runWatcher()
|
Add force delete runner
This branch adds the ability to forcefully remove a runner from GARM.
When the operator wishes to manually remove a runner, the workflow is as
follows:
* Check that the runner exists in GitHub. If it does, attempt to
remove it. An error here indicates that the runner may be processing
a job. In this case, we don't continue and the operator gets immediate
feedback from the API.
* Mark the runner in the database as pending_delete
* Allow the consolidate loop to reap it from the provider and remove it
from the database.
Removing the instance from the provider is async. If the provider errs out,
GARM will keep trying to remove it in perpetuity until the provider succedes.
In situations where the provider is misconfigured, this will never happen, leaving
the instance in a permanent state of pending_delete.
A provider may fail for various reasons. Either credentials have expired, the
API endpoint has changed, the provider is misconfigured or the operator may just
have removed it from the config before cleaning up the runners. While some cases
are recoverable, some are not. We cannot have a situation in which we cannot clean
resources in garm because of a misconfiguration.
This change adds the pending_force_delete instance status. Instances marked with
this status, will be removed from GARM even if the provider reports an error.
The GARM cli has been modified to give new meaning to the --force-remove-runner
option. This option in the CLI is no longer mandatory. Instead, setting it will mark
the runner with the new pending_force_delete status. Omitting it will mark the runner
with the old status of pending_delete.
Fixes: #160
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2023-10-12 06:15:36 +00:00
|
|
|
go func() {
|
2024-03-01 19:10:30 +00:00
|
|
|
select {
|
|
|
|
|
case <-r.quit:
|
|
|
|
|
return
|
|
|
|
|
case <-r.ctx.Done():
|
|
|
|
|
return
|
|
|
|
|
case <-initialToolUpdate:
|
|
|
|
|
}
|
Add force delete runner
This branch adds the ability to forcefully remove a runner from GARM.
When the operator wishes to manually remove a runner, the workflow is as
follows:
* Check that the runner exists in GitHub. If it does, attempt to
remove it. An error here indicates that the runner may be processing
a job. In this case, we don't continue and the operator gets immediate
feedback from the API.
* Mark the runner in the database as pending_delete
* Allow the consolidate loop to reap it from the provider and remove it
from the database.
Removing the instance from the provider is async. If the provider errs out,
GARM will keep trying to remove it in perpetuity until the provider succedes.
In situations where the provider is misconfigured, this will never happen, leaving
the instance in a permanent state of pending_delete.
A provider may fail for various reasons. Either credentials have expired, the
API endpoint has changed, the provider is misconfigured or the operator may just
have removed it from the config before cleaning up the runners. While some cases
are recoverable, some are not. We cannot have a situation in which we cannot clean
resources in garm because of a misconfiguration.
This change adds the pending_force_delete instance status. Instances marked with
this status, will be removed from GARM even if the provider reports an error.
The GARM cli has been modified to give new meaning to the --force-remove-runner
option. This option in the CLI is no longer mandatory. Instead, setting it will mark
the runner with the new pending_force_delete status. Omitting it will mark the runner
with the old status of pending_delete.
Fixes: #160
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2023-10-12 06:15:36 +00:00
|
|
|
defer close(initialToolUpdate)
|
|
|
|
|
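// Once the initial tool update is done, start the periodic loops: runner reaping, scale down,
// instance consolidation, tool updates and the queued job consumer.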
go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false)
|
|
|
|
|
go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false)
|
2024-03-10 15:21:39 +00:00
|
|
|
// always run the delete pending instances routine. This way we can still remove existing runners, even if the pool is not running.
|
|
|
|
|
go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", true)
|
2023-10-12 06:15:36 +00:00
|
|
|
go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false)
|
|
|
|
|
go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false)
|
|
|
|
|
go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false)
|
|
|
|
|
go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true)
|
|
|
|
|
go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false)
|
|
|
|
|
}()
|
2022-05-04 16:27:24 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-20 17:22:47 +03:00
|
|
|
func (r *basePoolManager) Stop() error {
|
2022-05-04 16:27:24 +00:00
|
|
|
close(r.quit)
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-20 17:22:47 +03:00
|
|
|
func (r *basePoolManager) WebhookSecret() string {
|
2024-03-17 10:21:41 +00:00
|
|
|
return r.entity.WebhookSecret
|
2022-05-04 16:27:24 +00:00
|
|
|
}
|
2022-05-05 07:54:52 +00:00
|
|
|
|
2022-10-20 17:22:47 +03:00
|
|
|
func (r *basePoolManager) ID() string {
|
2024-03-17 10:21:41 +00:00
|
|
|
return r.entity.ID
|
2022-05-05 07:54:52 +00:00
|
|
|
}
|
2022-07-07 16:48:00 +00:00
|
|
|
|
2023-10-12 06:15:36 +00:00
|
|
|
// DeleteRunner will delete a runner from a pool. If forceRemove is set to true, any error received from
|
|
|
|
|
// the IaaS provider will be ignored and deletion will continue.
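// If bypassGHUnauthorizedError is set, an unauthorized response from GitHub will not block the
// removal; the runner is still marked for deletion.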
|
2024-03-10 15:21:39 +00:00
|
|
|
func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove, bypassGHUnauthorizedError bool) error {
|
|
|
|
|
if !r.managerIsRunning && !bypassGHUnauthorizedError {
|
2024-03-17 10:21:41 +00:00
|
|
|
return runnerErrors.NewConflictError("pool manager is not running for %s", r.entity.String())
|
2022-10-20 17:22:47 +03:00
|
|
|
}
|
2025-04-27 19:34:44 +00:00
|
|
|
|
2022-07-07 16:48:00 +00:00
|
|
|
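// Only attempt to remove the runner from GitHub if it registered an agent ID.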
if runner.AgentID != 0 {
|
2025-04-27 19:34:44 +00:00
|
|
|
if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.AgentID); err != nil {
|
|
|
|
|
if errors.Is(err, runnerErrors.ErrUnauthorized) {
|
|
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "failed to remove runner from github")
|
|
|
|
|
// Mark the pool as offline from this point forward
|
|
|
|
|
r.SetPoolRunningState(false, fmt.Sprintf("failed to remove runner: %q", err))
|
|
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "failed to remove runner")
|
|
|
|
|
if bypassGHUnauthorizedError {
|
|
|
|
|
slog.Info("bypass github unauthorized error is set, marking runner for deletion")
|
2024-03-10 15:21:39 +00:00
|
|
|
} else {
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error removing runner: %w", err)
|
2024-03-10 15:21:39 +00:00
|
|
|
}
|
2025-04-27 19:34:44 +00:00
|
|
|
} else {
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error removing runner: %w", err)
|
2022-07-07 16:48:00 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-12 06:15:36 +00:00
|
|
|
instanceStatus := commonParams.InstancePendingDelete
|
|
|
|
|
if forceRemove {
|
|
|
|
|
instanceStatus = commonParams.InstancePendingForceDelete
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.InfoContext(
|
|
|
|
|
r.ctx, "setting instance status",
|
|
|
|
|
"runner_name", runner.Name,
|
|
|
|
|
"status", instanceStatus)
|
2023-10-12 06:15:36 +00:00
|
|
|
if _, err := r.setInstanceStatus(runner.Name, instanceStatus, nil); err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "failed to update runner",
|
|
|
|
|
"runner_name", runner.Name)
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error updating runner: %w", err)
|
2022-07-07 16:48:00 +00:00
|
|
|
}
|
|
|
|
|
return nil
|
|
|
|
|
}
|
2023-04-10 00:03:49 +00:00
|
|
|
|
2023-07-04 10:48:14 +00:00
|
|
|
// consumeQueuedJobs will pull all the known jobs from the database and attempt to create a new
|
|
|
|
|
// runner in one of the pools it manages, if it matches the requested labels.
|
2023-04-10 00:03:49 +00:00
|
|
|
// This is a best effort attempt to consume queued jobs. We do not have any real way to know which
|
|
|
|
|
// runner from which pool will pick up a job we react to here. For example, the same job may be received
|
|
|
|
|
// by an enterprise manager, an org manager AND a repo manager. If an idle runner from another pool
|
|
|
|
|
// picks up the job after we created a runner in this pool, we will have an extra runner that may or may not
|
|
|
|
|
// have a job waiting for it.
|
|
|
|
|
// This is not a huge problem, as we have scale down logic which should remove any idle runners that have not
|
|
|
|
|
// picked up a job within a certain time frame. Also, the logic here should ensure that all known
|
|
|
|
|
// queued jobs will eventually be consumed.
|
|
|
|
|
//
|
|
|
|
|
// NOTE: jobs that were created while the garm instance was down will be unknown to garm itself and will linger
|
|
|
|
|
// in queued state if the pools defined in garm have a minimum idle runner value set to 0. Simply put, garm won't
|
|
|
|
|
// know about the queued jobs that we didn't get a webhook for. Listing all jobs on startup is not feasible, as
|
|
|
|
|
// an enterprise may have thousands of repos and thousands of jobs in queued state. To fetch all jobs for an
|
|
|
|
|
// enterprise, we'd have to list all repos, and for each repo list all jobs currently in queued state. This is
|
|
|
|
|
// not desirable by any measure.
|
2023-07-04 10:48:14 +00:00
|
|
|
//
|
|
|
|
|
// One way to handle situations where garm comes up after a longer period of time is to temporarily max out the
|
|
|
|
|
// min-idle-runner setting on pools, or at least raise it above 0. The idle runners will start to consume jobs, and
|
|
|
|
|
// as they do so, new idle runners will be spun up in their stead. New jobs will be recorded in the DB as they come in,
|
|
|
|
|
// so those will trigger the creation of a runner. The jobs we don't know about will be dealt with by the idle runners.
|
|
|
|
|
// Once jobs are consumed, you can set min-idle-runners to 0 again.
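// For example, something along these lines (illustrative only; the exact garm-cli flags may differ):
//
//	garm-cli pool update <pool-id> --min-idle-runners=2
//	# wait for the lingering queued jobs to be picked up, then restore the old value:
//	garm-cli pool update <pool-id> --min-idle-runners=0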
|
2023-04-10 00:03:49 +00:00
|
|
|
func (r *basePoolManager) consumeQueuedJobs() error {
|
2024-03-17 10:21:41 +00:00
|
|
|
queued, err := r.store.ListEntityJobsByStatus(r.ctx, r.entity.EntityType, r.entity.ID, params.JobStatusQueued)
|
2023-04-10 00:03:49 +00:00
|
|
|
if err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error listing queued jobs: %w", err)
|
2023-04-10 00:03:49 +00:00
|
|
|
}
|
|
|
|
|
|
Add pool balancing strategy
This change adds the ability to specify the pool balancing strategy to
use when processing queued jobs. Before this change, GARM would round-robin
through all pools that matched the set of tags requested by queued jobs.
When round-robin (default) is used for an entity (repo, org or enterprise)
and you have 2 pools defined for that entity with a common set of tags that
match 10 jobs (for example), then those jobs would trigger the creation of
a new runner in each of the two pools in turn. Job 1 would go to pool 1,
job 2 would go to pool 2, job 3 to pool 1, job 4 to pool 2 and so on.
When "stack" is used, those same 10 jobs would trigger the creation of a
new runner in the pool with the highest priority, every time.
In both cases, if a pool is full, the next one would be tried automatically.
For the stack case, this would mean that if pool 2 had a priority of 10 and
pool 1 would have a priority of 5, pool 2 would be saturated first, then
pool 1.
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2024-03-14 20:04:34 +00:00
|
|
|
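// Cache the pools matching each unique set of job labels so the database is queried only once per
// label set; pools are later iterated according to the entity's balancing strategy.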
poolsCache := poolsForTags{
|
2024-06-20 15:28:56 +00:00
|
|
|
poolCacheType: r.entity.GetPoolBalancerType(),
|
2024-03-14 20:04:34 +00:00
|
|
|
}
|
|
|
|
|
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(
|
|
|
|
|
r.ctx, "found queued jobs",
|
|
|
|
|
"job_count", len(queued))
|
2023-04-10 00:03:49 +00:00
|
|
|
for _, job := range queued {
|
|
|
|
|
if job.LockedBy != uuid.Nil && job.LockedBy.String() != r.ID() {
|
|
|
|
|
// Job is already locked by another entity.
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(
|
|
|
|
|
r.ctx, "job is locked",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID,
|
2024-01-05 23:32:16 +00:00
|
|
|
"locking_entity", job.LockedBy.String())
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-06 17:54:35 +00:00
|
|
|
if time.Since(job.UpdatedAt) < time.Second*r.controllerInfo.JobBackoff() {
|
2023-04-10 00:03:49 +00:00
|
|
|
// give the idle runners a chance to pick up the job.
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(
|
2024-07-01 10:27:31 +00:00
|
|
|
r.ctx, "job backoff not reached", "backoff_interval", r.controllerInfo.MinimumJobAgeBackoff,
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-24 00:22:51 +00:00
|
|
|
if time.Since(job.UpdatedAt) >= time.Minute*10 {
|
Prevent abusing the GH API
On large deployments with many jobs, we cannot check each job that
we recorded in the DB against the GH API.
Before this change, if a job was updated more than 10 minutes ago,
garm would check against the GH api if that job still existed. While
this approach allowed us to maintain a consistent view over which jobs
still exist and which are stale, it had the potential of spamming the
GH API, leading to rate limiting.
This change uses the scale-down loop as an indicator for job staleness.
If a job remains in queued state in our DB, but has disappeared from GH
or was serviced by another runner and we never got the hook (garm was down
or GH had an issue - happened in the past), then garm will spin up a new
runner for it. If that runner or any other runner is scaled down, we check
if we have jobs in the queue that should have matched that runner. If we did,
there is a high chance that the job no longer exists in GH and we can remove
the job from the queue.
Of course, there is a chance that GH is having issues and the job is never
pushed to the runner, but we can't really account for everything. In this case
I'd rather avoid rate limiting ourselves.
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2023-12-15 22:33:22 +00:00
|
|
|
// Job is still queued in our db, 10 minutes after a matching runner
|
|
|
|
|
// was spawned. Unlock it and try again. A different job may have picked up
|
|
|
|
|
// the runner.
|
2025-07-18 07:51:50 +00:00
|
|
|
if err := r.store.UnlockJob(r.ctx, job.WorkflowJobID, r.ID()); err != nil {
|
2024-02-22 09:30:20 +01:00
|
|
|
// nolint:golangci-lint,godox
|
2023-04-10 00:03:49 +00:00
|
|
|
// TODO: Implement a cache? Should we return here?
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "failed to unlock job",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-23 15:03:55 +00:00
|
|
|
if job.LockedBy.String() == r.ID() {
|
2024-02-22 09:30:20 +01:00
|
|
|
// nolint:golangci-lint,godox
|
2023-06-23 15:03:55 +00:00
|
|
|
// Job is locked by us. We must have already attempted to create a runner for it. Skip.
|
|
|
|
|
// TODO(gabriel-samfira): create an in-memory state of existing runners that we can easily
|
|
|
|
|
// check for existing pending or idle runners. If we can't find any, attempt to allocate another
|
|
|
|
|
// runner.
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(
|
|
|
|
|
r.ctx, "job is locked by us",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-06-23 15:03:55 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-27 11:50:04 +00:00
|
|
|
poolRR, ok := poolsCache.Get(job.Labels)
|
|
|
|
|
if !ok {
|
2024-03-17 10:21:41 +00:00
|
|
|
potentialPools, err := r.store.FindPoolsMatchingAllTags(r.ctx, r.entity.EntityType, r.entity.ID, job.Labels)
|
2023-06-27 11:50:04 +00:00
|
|
|
if err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "error finding pools matching labels")
|
2023-06-27 11:50:04 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
poolRR = poolsCache.Add(job.Labels, potentialPools)
|
2023-04-10 00:03:49 +00:00
|
|
|
}
|
|
|
|
|
|
2023-06-27 11:50:04 +00:00
|
|
|
if poolRR.Len() == 0 {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(r.ctx, "could not find pools with labels", "requested_labels", strings.Join(job.Labels, ","))
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
runnerCreated := false
|
2025-07-18 07:51:50 +00:00
|
|
|
if err := r.store.LockJob(r.ctx, job.WorkflowJobID, r.ID()); err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "could not lock job",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2023-06-24 00:22:51 +00:00
|
|
|
|
|
|
|
|
jobLabels := []string{
|
2025-07-18 07:51:50 +00:00
|
|
|
fmt.Sprintf("%s=%d", jobLabelPrefix, job.WorkflowJobID),
|
2023-06-24 00:22:51 +00:00
|
|
|
}
|
2023-06-28 14:50:59 +00:00
|
|
|
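// Try each matching pool, in the order decided by the balancer, until one of them accepts a new runner for this job.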
for i := 0; i < poolRR.Len(); i++ {
|
2023-06-27 11:50:04 +00:00
|
|
|
pool, err := poolRR.Next()
|
|
|
|
|
if err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "could not find a pool to create a runner for job",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-06-27 11:50:04 +00:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.InfoContext(
|
|
|
|
|
r.ctx, "attempting to create a runner in pool",
|
|
|
|
|
"pool_id", pool.ID,
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-06-24 00:22:51 +00:00
|
|
|
if err := r.addRunnerToPool(pool, jobLabels); err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "could not add runner to pool",
|
|
|
|
|
"pool_id", pool.ID)
|
2023-04-10 00:03:49 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.DebugContext(r.ctx, "a new runner was added as a response to queued job",
|
|
|
|
|
"pool_id", pool.ID,
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2023-04-10 00:03:49 +00:00
|
|
|
runnerCreated = true
|
|
|
|
|
break
|
|
|
|
|
}
|
2023-06-27 11:50:04 +00:00
|
|
|
|
2023-04-10 00:03:49 +00:00
|
|
|
if !runnerCreated {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.WarnContext(
|
|
|
|
|
r.ctx, "could not create a runner for job; unlocking",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
|
|
|
|
if err := r.store.UnlockJob(r.ctx, job.WorkflowJobID, r.ID()); err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "failed to unlock job",
|
2025-07-18 07:51:50 +00:00
|
|
|
"job_id", job.WorkflowJobID)
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error unlocking job: %w", err)
|
2023-04-10 00:03:49 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-06-23 21:14:22 +00:00
|
|
|
|
|
|
|
|
if err := r.store.DeleteCompletedJobs(r.ctx); err != nil {
|
2024-01-05 23:32:16 +00:00
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
r.ctx, "failed to delete completed jobs")
|
2023-06-23 21:14:22 +00:00
|
|
|
}
|
2023-04-10 00:03:49 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
2023-08-15 17:19:06 +00:00
|
|
|
|
2024-03-18 10:56:49 +00:00
|
|
|
func (r *basePoolManager) UninstallWebhook(ctx context.Context) error {
|
2024-06-20 15:28:56 +00:00
|
|
|
if r.controllerInfo.ControllerWebhookURL == "" {
|
2025-08-16 19:31:58 +00:00
|
|
|
return runnerErrors.NewBadRequestError("controller webhook url is empty")
|
2024-03-18 10:56:49 +00:00
|
|
|
}
|
|
|
|
|
|
2024-03-17 10:21:41 +00:00
|
|
|
allHooks, err := r.listHooks(ctx)
|
|
|
|
|
if err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return fmt.Errorf("error listing hooks: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
|
2024-03-18 10:14:31 +00:00
|
|
|
var controllerHookID int64
|
|
|
|
|
var baseHook string
|
2024-06-20 15:28:56 +00:00
|
|
|
trimmedBase := strings.TrimRight(r.controllerInfo.WebhookURL, "/")
|
|
|
|
|
trimmedController := strings.TrimRight(r.controllerInfo.ControllerWebhookURL, "/")
|
2024-03-18 10:14:31 +00:00
|
|
|
|
2024-03-17 10:21:41 +00:00
|
|
|
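// Look for the hook pointing at this controller's webhook URL so it can be removed; also note whether
// a hook pointing at the base webhook URL exists, since that one is never removed automatically.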
for _, hook := range allHooks {
|
2024-03-18 10:14:31 +00:00
|
|
|
hookInfo := hookToParamsHookInfo(hook)
|
|
|
|
|
info := strings.TrimRight(hookInfo.URL, "/")
|
|
|
|
|
if strings.EqualFold(info, trimmedController) {
|
|
|
|
|
controllerHookID = hook.GetID()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if strings.EqualFold(info, trimmedBase) {
|
|
|
|
|
baseHook = hookInfo.URL
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if controllerHookID != 0 {
|
|
|
|
|
_, err = r.ghcli.DeleteEntityHook(ctx, controllerHookID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("deleting hook: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
2024-03-18 10:14:31 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if baseHook != "" {
|
|
|
|
|
return runnerErrors.NewBadRequestError("base hook found (%s) and must be deleted manually", baseHook)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
2024-03-18 10:14:31 +00:00
|
|
|
|
2024-03-17 10:21:41 +00:00
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (r *basePoolManager) InstallHook(ctx context.Context, req *github.Hook) (params.HookInfo, error) {
|
|
|
|
|
allHooks, err := r.listHooks(ctx)
|
|
|
|
|
if err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return params.HookInfo{}, fmt.Errorf("error listing hooks: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
|
2024-06-20 15:28:56 +00:00
|
|
|
if err := validateHookRequest(r.controllerInfo.ControllerID.String(), r.controllerInfo.WebhookURL, allHooks, req); err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return params.HookInfo{}, fmt.Errorf("error validating hook request: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hook, err := r.ghcli.CreateEntityHook(ctx, req)
|
|
|
|
|
if err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return params.HookInfo{}, fmt.Errorf("error creating entity hook: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if _, err := r.ghcli.PingEntityHook(ctx, hook.GetID()); err != nil {
|
|
|
|
|
slog.With(slog.Any("error", err)).ErrorContext(
|
|
|
|
|
ctx, "failed to ping hook",
|
|
|
|
|
"hook_id", hook.GetID(),
|
|
|
|
|
"entity", r.entity)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return hookToParamsHookInfo(hook), nil
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-16 09:11:45 +00:00
|
|
|
func (r *basePoolManager) InstallWebhook(ctx context.Context, param params.InstallWebhookParams) (params.HookInfo, error) {
|
2024-06-20 15:28:56 +00:00
|
|
|
if r.controllerInfo.ControllerWebhookURL == "" {
|
2025-08-16 19:31:58 +00:00
|
|
|
return params.HookInfo{}, runnerErrors.NewBadRequestError("controller webhook url is empty")
|
2023-08-15 17:19:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
insecureSSL := "0"
|
|
|
|
|
if param.InsecureSSL {
|
|
|
|
|
insecureSSL = "1"
|
|
|
|
|
}
|
|
|
|
|
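// The hook delivers only workflow_job events to this controller's webhook URL and is signed with the
// entity's webhook secret.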
req := &github.Hook{
|
2025-04-28 13:19:27 +00:00
|
|
|
Active: github.Ptr(true),
|
|
|
|
|
Config: &github.HookConfig{
|
|
|
|
|
ContentType: github.Ptr("json"),
|
|
|
|
|
InsecureSSL: github.Ptr(insecureSSL),
|
|
|
|
|
URL: github.Ptr(r.controllerInfo.ControllerWebhookURL),
|
|
|
|
|
Secret: github.Ptr(r.WebhookSecret()),
|
2023-08-15 17:19:06 +00:00
|
|
|
},
|
|
|
|
|
Events: []string{
|
|
|
|
|
"workflow_job",
|
|
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-17 10:21:41 +00:00
|
|
|
return r.InstallHook(ctx, req)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (r *basePoolManager) ValidateOwner(job params.WorkflowJob) error {
|
|
|
|
|
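// Make sure the webhook payload refers to the entity (repo, org or enterprise) this pool manager serves.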
switch r.entity.EntityType {
|
2025-05-12 21:47:13 +00:00
|
|
|
case params.ForgeEntityTypeRepository:
|
2024-03-17 10:21:41 +00:00
|
|
|
if !strings.EqualFold(job.Repository.Name, r.entity.Name) || !strings.EqualFold(job.Repository.Owner.Login, r.entity.Owner) {
|
|
|
|
|
return runnerErrors.NewBadRequestError("job not meant for this pool manager")
|
|
|
|
|
}
|
2025-05-12 21:47:13 +00:00
|
|
|
case params.ForgeEntityTypeOrganization:
|
2025-05-16 23:58:39 +00:00
|
|
|
if !strings.EqualFold(job.GetOrgName(r.entity.Credentials.ForgeType), r.entity.Owner) {
|
2024-03-17 10:21:41 +00:00
|
|
|
return runnerErrors.NewBadRequestError("job not meant for this pool manager")
|
|
|
|
|
}
|
2025-05-12 21:47:13 +00:00
|
|
|
case params.ForgeEntityTypeEnterprise:
|
2024-03-17 10:21:41 +00:00
|
|
|
if !strings.EqualFold(job.Enterprise.Slug, r.entity.Owner) {
|
|
|
|
|
return runnerErrors.NewBadRequestError("job not meant for this pool manager")
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
return runnerErrors.NewBadRequestError("unknown entity type")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-28 10:08:19 +00:00
|
|
|
func (r *basePoolManager) GithubRunnerRegistrationToken() (string, error) {
|
2024-03-17 10:21:41 +00:00
|
|
|
tk, ghResp, err := r.ghcli.CreateEntityRegistrationToken(r.ctx)
|
|
|
|
|
if err != nil {
|
|
|
|
|
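// A 401 from GitHub most likely means the credentials are no longer valid; surface it as an
// unauthorized error rather than a generic failure.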
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
|
2025-08-16 19:31:58 +00:00
|
|
|
return "", runnerErrors.NewUnauthorizedError("error fetching token")
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
2025-08-16 19:31:58 +00:00
|
|
|
return "", fmt.Errorf("error creating runner token: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
return *tk.Token, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (r *basePoolManager) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
|
|
|
|
|
tools, ghResp, err := r.ghcli.ListEntityRunnerApplicationDownloads(r.ctx)
|
|
|
|
|
if err != nil {
|
|
|
|
|
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
|
2025-08-16 19:31:58 +00:00
|
|
|
return nil, runnerErrors.NewUnauthorizedError("error fetching tools")
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
2025-08-16 19:31:58 +00:00
|
|
|
return nil, fmt.Errorf("error fetching runner tools: %w", err)
|
2024-03-17 10:21:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret := []commonParams.RunnerApplicationDownload{}
|
|
|
|
|
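// Convert the GitHub client types into the provider-agnostic common params type, skipping nil entries.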
for _, tool := range tools {
|
|
|
|
|
if tool == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
ret = append(ret, commonParams.RunnerApplicationDownload(*tool))
|
|
|
|
|
}
|
|
|
|
|
return ret, nil
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-16 09:11:45 +00:00
|
|
|
func (r *basePoolManager) GetWebhookInfo(ctx context.Context) (params.HookInfo, error) {
|
2024-03-17 10:28:35 +00:00
|
|
|
allHooks, err := r.listHooks(ctx)
|
|
|
|
|
if err != nil {
|
2025-08-16 19:31:58 +00:00
|
|
|
return params.HookInfo{}, fmt.Errorf("error listing hooks: %w", err)
|
2024-03-17 10:28:35 +00:00
|
|
|
}
|
2024-06-20 15:28:56 +00:00
|
|
|
trimmedBase := strings.TrimRight(r.controllerInfo.WebhookURL, "/")
|
|
|
|
|
trimmedController := strings.TrimRight(r.controllerInfo.ControllerWebhookURL, "/")
|
2024-03-18 10:14:31 +00:00
|
|
|
|
|
|
|
|
var controllerHookInfo *params.HookInfo
|
|
|
|
|
var baseHookInfo *params.HookInfo
|
2024-03-17 10:28:35 +00:00
|
|
|
|
|
|
|
|
for _, hook := range allHooks {
|
|
|
|
|
hookInfo := hookToParamsHookInfo(hook)
|
2024-03-18 10:14:31 +00:00
|
|
|
info := strings.TrimRight(hookInfo.URL, "/")
|
|
|
|
|
if strings.EqualFold(info, trimmedController) {
|
|
|
|
|
controllerHookInfo = &hookInfo
|
|
|
|
|
break
|
2024-03-17 10:28:35 +00:00
|
|
|
}
|
2024-03-18 10:14:31 +00:00
|
|
|
if strings.EqualFold(info, trimmedBase) {
|
|
|
|
|
baseHookInfo = &hookInfo
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Return the controller hook info if available.
|
|
|
|
|
if controllerHookInfo != nil {
|
|
|
|
|
return *controllerHookInfo, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Fall back to base hook info if defined.
|
|
|
|
|
if baseHookInfo != nil {
|
|
|
|
|
return *baseHookInfo, nil
|
2024-03-17 10:28:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return params.HookInfo{}, runnerErrors.NewNotFoundError("hook not found")
|
2023-08-16 09:11:45 +00:00
|
|
|
}
|
2023-08-28 09:44:18 +00:00
|
|
|
|
|
|
|
|
func (r *basePoolManager) RootCABundle() (params.CertificateBundle, error) {
|
2024-06-20 15:28:56 +00:00
|
|
|
return r.entity.Credentials.RootCertificateBundle()
|
2023-08-28 09:44:18 +00:00
|
|
|
}
|