// Copyright 2022 Cloudbase Solutions SRL
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

package runner
import (
	"context"
	"crypto/hmac"
	"crypto/sha1" //nolint:golangci-lint,gosec // sha1 is used for github webhooks
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"hash"
	"log/slog"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/juju/clock"
	"github.com/juju/retry"
	"github.com/pkg/errors"
	"golang.org/x/sync/errgroup"

	runnerErrors "github.com/cloudbase/garm-provider-common/errors"
	commonParams "github.com/cloudbase/garm-provider-common/params"
	"github.com/cloudbase/garm-provider-common/util"
	"github.com/cloudbase/garm/auth"
	"github.com/cloudbase/garm/config"
	dbCommon "github.com/cloudbase/garm/database/common"
	"github.com/cloudbase/garm/params"
	"github.com/cloudbase/garm/runner/common"
	"github.com/cloudbase/garm/runner/pool"
	"github.com/cloudbase/garm/runner/providers"
)

func NewRunner(ctx context.Context, cfg config.Config, db dbCommon.Store) (*Runner, error) {
	ctrlID, err := db.ControllerInfo()
	if err != nil {
		return nil, errors.Wrap(err, "fetching controller info")
	}

	providers, err := providers.LoadProvidersFromConfig(ctx, cfg, ctrlID.ControllerID.String())
	if err != nil {
		return nil, errors.Wrap(err, "loading providers")
	}

	creds := map[string]config.Github{}

	for _, ghcreds := range cfg.Github {
		creds[ghcreds.Name] = ghcreds
	}

	poolManagerCtrl := &poolManagerCtrl{
		controllerID:  ctrlID.ControllerID.String(),
		config:        cfg,
		credentials:   creds,
		repositories:  map[string]common.PoolManager{},
		organizations: map[string]common.PoolManager{},
		enterprises:   map[string]common.PoolManager{},
	}
	runner := &Runner{
		ctx:             ctx,
		config:          cfg,
		store:           db,
		poolManagerCtrl: poolManagerCtrl,
		providers:       providers,
		credentials:     creds,
		controllerID:    ctrlID.ControllerID,
	}

	if err := runner.loadReposOrgsAndEnterprises(); err != nil {
		return nil, errors.Wrap(err, "loading pool managers")
	}

	return runner, nil
}
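
// Typical wiring, as a sketch only (configuration parsing and the database
// store are set up by the GARM daemon, not by this package):
//
//	r, err := runner.NewRunner(ctx, cfg, db)
//	if err != nil {
//		return err
//	}
//	if err := r.Start(); err != nil {
//		return err
//	}
//	defer r.Stop() //nolint:errcheck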

type poolManagerCtrl struct {
	mux sync.Mutex

	controllerID string
	config       config.Config
	credentials  map[string]config.Github

	repositories  map[string]common.PoolManager
	organizations map[string]common.PoolManager
	enterprises   map[string]common.PoolManager
}

func (p *poolManagerCtrl) CreateRepoPoolManager(ctx context.Context, repo params.Repository, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	cfgInternal, err := p.getInternalConfig(ctx, repo.CredentialsName, repo.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}
	entity := params.GithubEntity{
		Owner:         repo.Owner,
		Name:          repo.Name,
		ID:            repo.ID,
		WebhookSecret: repo.WebhookSecret,
		EntityType:    params.GithubEntityTypeRepository,
	}
	poolManager, err := pool.NewEntityPoolManager(ctx, entity, cfgInternal, providers, store)
	if err != nil {
		return nil, errors.Wrap(err, "creating repo pool manager")
	}
	p.repositories[repo.ID] = poolManager
	return poolManager, nil
}

func (p *poolManagerCtrl) UpdateRepoPoolManager(ctx context.Context, repo params.Repository) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.repositories[repo.ID]
	if !ok {
		return nil, errors.Wrapf(runnerErrors.ErrNotFound, "repository %s/%s pool manager not loaded", repo.Owner, repo.Name)
	}

	internalCfg, err := p.getInternalConfig(ctx, repo.CredentialsName, repo.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}

	newState := params.UpdatePoolStateParams{
		WebhookSecret:  repo.WebhookSecret,
		InternalConfig: &internalCfg,
	}

	if err := poolMgr.RefreshState(newState); err != nil {
		return nil, errors.Wrap(err, "updating repo pool manager")
	}
	return poolMgr, nil
}

func (p *poolManagerCtrl) GetRepoPoolManager(repo params.Repository) (common.PoolManager, error) {
	if repoPoolMgr, ok := p.repositories[repo.ID]; ok {
		return repoPoolMgr, nil
	}
	return nil, errors.Wrapf(runnerErrors.ErrNotFound, "repository %s/%s pool manager not loaded", repo.Owner, repo.Name)
}

func (p *poolManagerCtrl) DeleteRepoPoolManager(repo params.Repository) error {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.repositories[repo.ID]
	if ok {
		if err := poolMgr.Stop(); err != nil {
			return errors.Wrap(err, "stopping repo pool manager")
		}
		delete(p.repositories, repo.ID)
	}
	return nil
}

func (p *poolManagerCtrl) GetRepoPoolManagers() (map[string]common.PoolManager, error) {
	return p.repositories, nil
}

func (p *poolManagerCtrl) CreateOrgPoolManager(ctx context.Context, org params.Organization, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	cfgInternal, err := p.getInternalConfig(ctx, org.CredentialsName, org.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}
	entity := params.GithubEntity{
		Owner:         org.Name,
		ID:            org.ID,
		WebhookSecret: org.WebhookSecret,
		EntityType:    params.GithubEntityTypeOrganization,
	}
	poolManager, err := pool.NewEntityPoolManager(ctx, entity, cfgInternal, providers, store)
	if err != nil {
		return nil, errors.Wrap(err, "creating org pool manager")
	}
	p.organizations[org.ID] = poolManager
	return poolManager, nil
}

func (p *poolManagerCtrl) UpdateOrgPoolManager(ctx context.Context, org params.Organization) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.organizations[org.ID]
	if !ok {
		return nil, errors.Wrapf(runnerErrors.ErrNotFound, "org %s pool manager not loaded", org.Name)
	}

	internalCfg, err := p.getInternalConfig(ctx, org.CredentialsName, org.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}

	newState := params.UpdatePoolStateParams{
		WebhookSecret:  org.WebhookSecret,
		InternalConfig: &internalCfg,
	}

	if err := poolMgr.RefreshState(newState); err != nil {
		return nil, errors.Wrap(err, "updating org pool manager")
	}
	return poolMgr, nil
}

func (p *poolManagerCtrl) GetOrgPoolManager(org params.Organization) (common.PoolManager, error) {
	if orgPoolMgr, ok := p.organizations[org.ID]; ok {
		return orgPoolMgr, nil
	}
	return nil, errors.Wrapf(runnerErrors.ErrNotFound, "organization %s pool manager not loaded", org.Name)
}

func (p *poolManagerCtrl) DeleteOrgPoolManager(org params.Organization) error {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.organizations[org.ID]
	if ok {
		if err := poolMgr.Stop(); err != nil {
			return errors.Wrap(err, "stopping org pool manager")
		}
		delete(p.organizations, org.ID)
	}
	return nil
}

func (p *poolManagerCtrl) GetOrgPoolManagers() (map[string]common.PoolManager, error) {
	return p.organizations, nil
}

func (p *poolManagerCtrl) CreateEnterprisePoolManager(ctx context.Context, enterprise params.Enterprise, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	cfgInternal, err := p.getInternalConfig(ctx, enterprise.CredentialsName, enterprise.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}

	entity := params.GithubEntity{
		Owner:         enterprise.Name,
		ID:            enterprise.ID,
		WebhookSecret: enterprise.WebhookSecret,
		EntityType:    params.GithubEntityTypeEnterprise,
	}
	poolManager, err := pool.NewEntityPoolManager(ctx, entity, cfgInternal, providers, store)
	if err != nil {
		return nil, errors.Wrap(err, "creating enterprise pool manager")
	}
	p.enterprises[enterprise.ID] = poolManager
	return poolManager, nil
}

func (p *poolManagerCtrl) UpdateEnterprisePoolManager(ctx context.Context, enterprise params.Enterprise) (common.PoolManager, error) {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.enterprises[enterprise.ID]
	if !ok {
		return nil, errors.Wrapf(runnerErrors.ErrNotFound, "enterprise %s pool manager not loaded", enterprise.Name)
	}

	internalCfg, err := p.getInternalConfig(ctx, enterprise.CredentialsName, enterprise.GetBalancerType())
	if err != nil {
		return nil, errors.Wrap(err, "fetching internal config")
	}

	newState := params.UpdatePoolStateParams{
		WebhookSecret:  enterprise.WebhookSecret,
		InternalConfig: &internalCfg,
	}

	if err := poolMgr.RefreshState(newState); err != nil {
		return nil, errors.Wrap(err, "updating enterprise pool manager")
	}
	return poolMgr, nil
}

func (p *poolManagerCtrl) GetEnterprisePoolManager(enterprise params.Enterprise) (common.PoolManager, error) {
	if enterprisePoolMgr, ok := p.enterprises[enterprise.ID]; ok {
		return enterprisePoolMgr, nil
	}
	return nil, errors.Wrapf(runnerErrors.ErrNotFound, "enterprise %s pool manager not loaded", enterprise.Name)
}

func (p *poolManagerCtrl) DeleteEnterprisePoolManager(enterprise params.Enterprise) error {
	p.mux.Lock()
	defer p.mux.Unlock()

	poolMgr, ok := p.enterprises[enterprise.ID]
	if ok {
		if err := poolMgr.Stop(); err != nil {
			return errors.Wrap(err, "stopping enterprise pool manager")
		}
		delete(p.enterprises, enterprise.ID)
	}
	return nil
}

func (p *poolManagerCtrl) GetEnterprisePoolManagers() (map[string]common.PoolManager, error) {
	return p.enterprises, nil
}
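
// Pool balancing note: when queued jobs match more than one pool belonging to
// the same entity (repo, org or enterprise), the default round-robin strategy
// rotates new runners across the matching pools, while the "stack" strategy
// always fills the highest priority pool first and only moves on when a pool
// is full. The strategy selected on the entity is propagated to its pool
// manager through params.Internal.PoolBalancerType below.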

func (p *poolManagerCtrl) getInternalConfig(ctx context.Context, credsName string, poolBalancerType params.PoolBalancerType) (params.Internal, error) {
	creds, ok := p.credentials[credsName]
	if !ok {
		return params.Internal{}, runnerErrors.NewBadRequestError("invalid credential name (%s)", credsName)
	}

	caBundle, err := creds.CACertBundle()
	if err != nil {
		return params.Internal{}, fmt.Errorf("fetching CA bundle for creds: %w", err)
	}

	var controllerWebhookURL string
	if p.config.Default.WebhookURL != "" {
		controllerWebhookURL = fmt.Sprintf("%s/%s", p.config.Default.WebhookURL, p.controllerID)
	}
	httpClient, err := creds.HTTPClient(ctx)
	if err != nil {
		return params.Internal{}, fmt.Errorf("fetching http client for creds: %w", err)
	}
	return params.Internal{
		ControllerID:         p.controllerID,
		InstanceCallbackURL:  p.config.Default.CallbackURL,
		InstanceMetadataURL:  p.config.Default.MetadataURL,
		BaseWebhookURL:       p.config.Default.WebhookURL,
		ControllerWebhookURL: controllerWebhookURL,
		JWTSecret:            p.config.JWTAuth.Secret,
		PoolBalancerType:     poolBalancerType,
		GithubCredentialsDetails: params.GithubCredentials{
			Name:          creds.Name,
			Description:   creds.Description,
			BaseURL:       creds.BaseEndpoint(),
			APIBaseURL:    creds.APIEndpoint(),
			UploadBaseURL: creds.UploadEndpoint(),
			CABundle:      caBundle,
			HTTPClient:    httpClient,
		},
	}, nil
}

type Runner struct {
	mux sync.Mutex

	config config.Config
	ctx    context.Context
	store  dbCommon.Store

	poolManagerCtrl PoolManagerController

	providers   map[string]common.Provider
	credentials map[string]config.Github

	controllerInfo params.ControllerInfo
	controllerID   uuid.UUID
}

// GetControllerInfo returns the controller id and the hostname.
// This data might be used in metrics and logging.
func (r *Runner) GetControllerInfo(ctx context.Context) (params.ControllerInfo, error) {
	if !auth.IsAdmin(ctx) {
		return params.ControllerInfo{}, runnerErrors.ErrUnauthorized
	}
	// It is unlikely that fetching the hostname will encounter an error on a standard
	// Linux (or Windows) system, but since os.Hostname() can fail, we need to at least retry
	// a few times before giving up.
	// This retries 10 times within one second. While it has the potential to give us a
	// one second delay before returning either the hostname or an error, I expect this
	// to succeed on the first try.
	// As a side note, Windows requires a reboot for the hostname change to take effect,
	// so if we'll ever support Windows as a target system, the hostname can be cached.
	var hostname string
	err := retry.Call(retry.CallArgs{
		Func: func() error {
			var err error
			hostname, err = os.Hostname()
			if err != nil {
				return errors.Wrap(err, "fetching hostname")
			}
			return nil
		},
		Attempts: 10,
		Delay:    100 * time.Millisecond,
		Clock:    clock.WallClock,
	})
	if err != nil {
		return params.ControllerInfo{}, errors.Wrap(err, "fetching hostname")
	}
	r.controllerInfo.Hostname = hostname
	var controllerWebhook string
	if r.controllerID != uuid.Nil && r.config.Default.WebhookURL != "" {
		controllerWebhook = fmt.Sprintf("%s/%s", r.config.Default.WebhookURL, r.controllerID.String())
	}
	return params.ControllerInfo{
		ControllerID:         r.controllerID,
		Hostname:             hostname,
		MetadataURL:          r.config.Default.MetadataURL,
		CallbackURL:          r.config.Default.CallbackURL,
		WebhookURL:           r.config.Default.WebhookURL,
		ControllerWebhookURL: controllerWebhook,
	}, nil
}

func (r *Runner) ListCredentials(ctx context.Context) ([]params.GithubCredentials, error) {
	if !auth.IsAdmin(ctx) {
		return nil, runnerErrors.ErrUnauthorized
	}
	ret := []params.GithubCredentials{}

	for _, val := range r.config.Github {
		ret = append(ret, params.GithubCredentials{
			Name:          val.Name,
			Description:   val.Description,
			BaseURL:       val.BaseEndpoint(),
			APIBaseURL:    val.APIEndpoint(),
			UploadBaseURL: val.UploadEndpoint(),
			AuthType:      params.GithubAuthType(val.AuthType),
		})
	}
	return ret, nil
}

func (r *Runner) ListProviders(ctx context.Context) ([]params.Provider, error) {
	if !auth.IsAdmin(ctx) {
		return nil, runnerErrors.ErrUnauthorized
	}
	ret := []params.Provider{}

	for _, val := range r.providers {
		ret = append(ret, val.AsParams())
	}
	return ret, nil
}

func (r *Runner) loadReposOrgsAndEnterprises() error {
	r.mux.Lock()
	defer r.mux.Unlock()

	repos, err := r.store.ListRepositories(r.ctx)
	if err != nil {
		return errors.Wrap(err, "fetching repositories")
	}

	orgs, err := r.store.ListOrganizations(r.ctx)
	if err != nil {
		return errors.Wrap(err, "fetching organizations")
	}

	enterprises, err := r.store.ListEnterprises(r.ctx)
	if err != nil {
		return errors.Wrap(err, "fetching enterprises")
	}

	g, _ := errgroup.WithContext(r.ctx)
	for _, repo := range repos {
		repo := repo
		g.Go(func() error {
			slog.InfoContext(
				r.ctx, "creating pool manager for repo",
				"repo_owner", repo.Owner, "repo_name", repo.Name)
			_, err := r.poolManagerCtrl.CreateRepoPoolManager(r.ctx, repo, r.providers, r.store)
			return err
		})
	}

	for _, org := range orgs {
		org := org
		g.Go(func() error {
			slog.InfoContext(r.ctx, "creating pool manager for organization", "org_name", org.Name)
			_, err := r.poolManagerCtrl.CreateOrgPoolManager(r.ctx, org, r.providers, r.store)
			return err
		})
	}

	for _, enterprise := range enterprises {
		enterprise := enterprise
		g.Go(func() error {
			slog.InfoContext(r.ctx, "creating pool manager for enterprise", "enterprise_name", enterprise.Name)
			_, err := r.poolManagerCtrl.CreateEnterprisePoolManager(r.ctx, enterprise, r.providers, r.store)
			return err
		})
	}

	if err := r.waitForErrorGroupOrTimeout(g); err != nil {
		return fmt.Errorf("failed to create pool managers: %w", err)
	}
	return nil
}

func (r *Runner) Start() error {
	r.mux.Lock()
	defer r.mux.Unlock()

	repositories, err := r.poolManagerCtrl.GetRepoPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch repo pool managers")
	}

	organizations, err := r.poolManagerCtrl.GetOrgPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch org pool managers")
	}

	enterprises, err := r.poolManagerCtrl.GetEnterprisePoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch enterprise pool managers")
	}

	g, _ := errgroup.WithContext(r.ctx)
	for _, repo := range repositories {
		repo := repo
		g.Go(func() error {
			return repo.Start()
		})
	}

	for _, org := range organizations {
		org := org
		g.Go(func() error {
			return org.Start()
		})
	}

	for _, enterprise := range enterprises {
		enterprise := enterprise
		g.Go(func() error {
			return enterprise.Start()
		})
	}

	if err := r.waitForErrorGroupOrTimeout(g); err != nil {
		return fmt.Errorf("failed to start pool managers: %w", err)
	}
	return nil
}

// waitForErrorGroupOrTimeout waits for all goroutines in the error group to
// finish, but gives up after 60 seconds so a misbehaving pool manager cannot
// block startup or shutdown indefinitely.
func (r *Runner) waitForErrorGroupOrTimeout(g *errgroup.Group) error {
	if g == nil {
		return nil
	}

	done := make(chan error, 1)
	go func() {
		done <- g.Wait()
	}()

	select {
	case err := <-done:
		return err
	case <-time.After(60 * time.Second):
		return fmt.Errorf("timed out waiting for pool managers")
	}
}

func (r *Runner) Stop() error {
	r.mux.Lock()
	defer r.mux.Unlock()

	repos, err := r.poolManagerCtrl.GetRepoPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch repo pool managers")
	}

	orgs, err := r.poolManagerCtrl.GetOrgPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch org pool managers")
	}

	enterprises, err := r.poolManagerCtrl.GetEnterprisePoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch enterprise pool managers")
	}

	g, _ := errgroup.WithContext(r.ctx)

	for _, repo := range repos {
		poolMgr := repo
		g.Go(func() error {
			err := poolMgr.Stop()
			if err != nil {
				return fmt.Errorf("failed to stop repo pool manager: %w", err)
			}
			return poolMgr.Wait()
		})
	}

	for _, org := range orgs {
		poolMgr := org
		g.Go(func() error {
			err := poolMgr.Stop()
			if err != nil {
				return fmt.Errorf("failed to stop org pool manager: %w", err)
			}
			return poolMgr.Wait()
		})
	}

	for _, enterprise := range enterprises {
		poolMgr := enterprise
		g.Go(func() error {
			err := poolMgr.Stop()
			if err != nil {
				return fmt.Errorf("failed to stop enterprise pool manager: %w", err)
			}
			return poolMgr.Wait()
		})
	}

	if err := r.waitForErrorGroupOrTimeout(g); err != nil {
		return fmt.Errorf("failed to stop pool managers: %w", err)
	}
	return nil
}

func (r *Runner) Wait() error {
	r.mux.Lock()
	defer r.mux.Unlock()

	var wg sync.WaitGroup

	repos, err := r.poolManagerCtrl.GetRepoPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch repo pool managers")
	}

	orgs, err := r.poolManagerCtrl.GetOrgPoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch org pool managers")
	}

	enterprises, err := r.poolManagerCtrl.GetEnterprisePoolManagers()
	if err != nil {
		return errors.Wrap(err, "fetch enterprise pool managers")
	}

	for poolID, repo := range repos {
		wg.Add(1)
		go func(id string, poolMgr common.PoolManager) {
			defer wg.Done()
			if err := poolMgr.Wait(); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id, "pool_mgr_id", poolMgr.ID())
			}
		}(poolID, repo)
	}

	for poolID, org := range orgs {
		wg.Add(1)
		go func(id string, poolMgr common.PoolManager) {
			defer wg.Done()
			if err := poolMgr.Wait(); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id)
			}
		}(poolID, org)
	}

	for poolID, enterprise := range enterprises {
		wg.Add(1)
		go func(id string, poolMgr common.PoolManager) {
			defer wg.Done()
			if err := poolMgr.Wait(); err != nil {
				slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id)
			}
		}(poolID, enterprise)
	}

	wg.Wait()
	return nil
}

func (r *Runner) validateHookBody(signature, secret string, body []byte) error {
	if secret == "" {
		return runnerErrors.NewMissingSecretError("missing secret to validate webhook signature")
	}

	if signature == "" {
		// A secret was set in our config, but a signature was not received
		// from Github. Authentication of the body cannot be done.
		return runnerErrors.NewUnauthorizedError("missing github signature")
	}

	sigParts := strings.SplitN(signature, "=", 2)
	if len(sigParts) != 2 {
		// We expect the signature from github to be of the format:
		// hashType=hashValue
		// ie: sha256=1fc917c7ad66487470e466c0ad40ddd45b9f7730a4b43e1b2542627f0596bbdc
		return runnerErrors.NewBadRequestError("invalid signature format")
	}

	var hashFunc func() hash.Hash
	switch sigParts[0] {
	case "sha256":
		hashFunc = sha256.New
	case "sha1":
		hashFunc = sha1.New
	default:
		return runnerErrors.NewBadRequestError("unknown signature type")
	}

	mac := hmac.New(hashFunc, []byte(secret))
	_, err := mac.Write(body)
	if err != nil {
		return errors.Wrap(err, "failed to compute HMAC")
	}
	expectedMAC := hex.EncodeToString(mac.Sum(nil))

	if !hmac.Equal([]byte(sigParts[1]), []byte(expectedMAC)) {
		return runnerErrors.NewUnauthorizedError("signature mismatch")
	}

	return nil
}
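
// For reference, the signature validated above is the standard GitHub webhook
// HMAC header. A test client could produce a matching value roughly like this
// (illustrative sketch, not part of the GARM API):
//
//	mac := hmac.New(sha256.New, []byte(secret))
//	mac.Write(body)
//	signature := "sha256=" + hex.EncodeToString(mac.Sum(nil))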

func (r *Runner) DispatchWorkflowJob(hookTargetType, signature string, jobData []byte) error {
	if len(jobData) == 0 {
		return runnerErrors.NewBadRequestError("missing job data")
	}

	var job params.WorkflowJob
	if err := json.Unmarshal(jobData, &job); err != nil {
		return errors.Wrapf(runnerErrors.ErrBadRequest, "invalid job data: %s", err)
	}

	var poolManager common.PoolManager
	var err error

	switch HookTargetType(hookTargetType) {
	case RepoHook:
		slog.DebugContext(
			r.ctx, "got hook for repo",
			"repo_owner", util.SanitizeLogEntry(job.Repository.Owner.Login),
			"repo_name", util.SanitizeLogEntry(job.Repository.Name))
		poolManager, err = r.findRepoPoolManager(job.Repository.Owner.Login, job.Repository.Name)
	case OrganizationHook:
		slog.DebugContext(
			r.ctx, "got hook for organization",
			"organization", util.SanitizeLogEntry(job.Organization.Login))
		poolManager, err = r.findOrgPoolManager(job.Organization.Login)
	case EnterpriseHook:
		slog.DebugContext(
			r.ctx, "got hook for enterprise",
			"enterprise", util.SanitizeLogEntry(job.Enterprise.Slug))
		poolManager, err = r.findEnterprisePoolManager(job.Enterprise.Slug)
	default:
		return runnerErrors.NewBadRequestError("cannot handle hook target type %s", hookTargetType)
	}

	if err != nil {
		// We don't have a repository or organization configured that
		// can handle this workflow job.
		return errors.Wrap(err, "fetching poolManager")
	}

	// We found a pool. Validate the webhook job. If a secret is configured,
	// we make sure that the source of this workflow job is valid.
	secret := poolManager.WebhookSecret()
	if err := r.validateHookBody(signature, secret, jobData); err != nil {
		return errors.Wrap(err, "validating webhook data")
	}

	if err := poolManager.HandleWorkflowJob(job); err != nil {
		return errors.Wrap(err, "handling workflow job")
	}

	return nil
}
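
// A caller (GARM's webhook API handler) would typically hand this method the
// raw request body along with the signature header that GitHub sends, roughly
// (sketch only; the actual handler and header handling live in the API layer):
//
//	signature := req.Header.Get("X-Hub-Signature-256")
//	err := runnerInstance.DispatchWorkflowJob(targetType, signature, body)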

func (r *Runner) appendTagsToCreatePoolParams(param params.CreatePoolParams) (params.CreatePoolParams, error) {
	if err := param.Validate(); err != nil {
		return params.CreatePoolParams{}, errors.Wrapf(runnerErrors.ErrBadRequest, "validating params: %s", err)
	}

	if !IsSupportedOSType(param.OSType) {
		return params.CreatePoolParams{}, runnerErrors.NewBadRequestError("invalid OS type %s", param.OSType)
	}

	if !IsSupportedArch(param.OSArch) {
		return params.CreatePoolParams{}, runnerErrors.NewBadRequestError("invalid OS architecture %s", param.OSArch)
	}

	_, ok := r.providers[param.ProviderName]
	if !ok {
		return params.CreatePoolParams{}, runnerErrors.NewBadRequestError("no such provider %s", param.ProviderName)
	}

	newTags, err := r.processTags(string(param.OSArch), param.OSType, param.Tags)
	if err != nil {
		return params.CreatePoolParams{}, errors.Wrap(err, "processing tags")
	}

	param.Tags = newTags

	return param, nil
}

func (r *Runner) processTags(osArch string, osType commonParams.OSType, tags []string) ([]string, error) {
	// GitHub automatically adds the "self-hosted" tag as well as the OS type (linux, windows, etc)
	// and architecture (arm, x64, etc) to all self-hosted runners. When a workflow job comes in, we try
	// to find a pool based on the labels that are set in the workflow. If we don't explicitly define these
	// default tags for each pool, and the user targets these labels, we won't be able to match any pools.
	// The downside is that all pools with the same OS and arch will have these default labels. Users should
	// set distinct and unique labels on each pool, and explicitly target those labels, or risk assigning
	// the job to the wrong worker type.
	ghArch, err := util.ResolveToGithubArch(osArch)
	if err != nil {
		return nil, errors.Wrap(err, "invalid arch")
	}

	ghOSType, err := util.ResolveToGithubTag(osType)
	if err != nil {
		return nil, errors.Wrap(err, "invalid os type")
	}

	labels := []string{
		"self-hosted",
		ghArch,
		ghOSType,
	}

	for _, val := range tags {
		if val != "self-hosted" && val != ghArch && val != ghOSType {
			labels = append(labels, val)
		}
	}

	return labels, nil
}
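
// Illustrative example (the exact OS and arch label strings are whatever
// util.ResolveToGithubArch and util.ResolveToGithubTag return for the pool's
// platform): a linux/amd64 pool created with the tags ["gpu", "large"] ends up
// with the labels ["self-hosted", <gh-arch>, <gh-os>, "gpu", "large"], mirroring
// the default labels GitHub attaches to self-hosted runners.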

func (r *Runner) GetInstance(ctx context.Context, instanceName string) (params.Instance, error) {
	if !auth.IsAdmin(ctx) {
		return params.Instance{}, runnerErrors.ErrUnauthorized
	}

	instance, err := r.store.GetInstanceByName(ctx, instanceName)
	if err != nil {
		return params.Instance{}, errors.Wrap(err, "fetching instance")
	}
	return instance, nil
}

func (r *Runner) ListAllInstances(ctx context.Context) ([]params.Instance, error) {
	if !auth.IsAdmin(ctx) {
		return nil, runnerErrors.ErrUnauthorized
	}

	instances, err := r.store.ListAllInstances(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "fetching instances")
	}
	return instances, nil
}

func (r *Runner) AddInstanceStatusMessage(ctx context.Context, param params.InstanceUpdateMessage) error {
	instanceID := auth.InstanceID(ctx)
	if instanceID == "" {
		return runnerErrors.ErrUnauthorized
	}

	if err := r.store.AddInstanceEvent(ctx, instanceID, params.StatusEvent, params.EventInfo, param.Message); err != nil {
		return errors.Wrap(err, "adding status update")
	}

	updateParams := params.UpdateInstanceParams{
		RunnerStatus: param.Status,
	}

	if param.AgentID != nil {
		updateParams.AgentID = *param.AgentID
	}

	if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil {
		return errors.Wrap(err, "updating runner agent ID")
	}

	return nil
}

func (r *Runner) UpdateSystemInfo(ctx context.Context, param params.UpdateSystemInfoParams) error {
	instanceID := auth.InstanceID(ctx)
	if instanceID == "" {
		slog.ErrorContext(ctx, "missing instance ID")
		return runnerErrors.ErrUnauthorized
	}

	if param.OSName == "" && param.OSVersion == "" && param.AgentID == nil {
		// Nothing to update
		return nil
	}

	updateParams := params.UpdateInstanceParams{
		OSName:    param.OSName,
		OSVersion: param.OSVersion,
	}

	if param.AgentID != nil {
		updateParams.AgentID = *param.AgentID
	}

	if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil {
		return errors.Wrap(err, "updating runner system info")
	}

	return nil
}

func (r *Runner) getPoolManagerFromInstance(ctx context.Context, instance params.Instance) (common.PoolManager, error) {
	pool, err := r.store.GetPoolByID(ctx, instance.PoolID)
	if err != nil {
		return nil, errors.Wrap(err, "fetching pool")
	}

	var poolMgr common.PoolManager

	switch {
	case pool.RepoID != "":
		repo, err := r.store.GetRepositoryByID(ctx, pool.RepoID)
		if err != nil {
			return nil, errors.Wrap(err, "fetching repo")
		}
		poolMgr, err = r.findRepoPoolManager(repo.Owner, repo.Name)
		if err != nil {
			return nil, errors.Wrapf(err, "fetching pool manager for repo %s", pool.RepoName)
		}
	case pool.OrgID != "":
		org, err := r.store.GetOrganizationByID(ctx, pool.OrgID)
		if err != nil {
			return nil, errors.Wrap(err, "fetching org")
		}
		poolMgr, err = r.findOrgPoolManager(org.Name)
		if err != nil {
			return nil, errors.Wrapf(err, "fetching pool manager for org %s", pool.OrgName)
		}
	case pool.EnterpriseID != "":
		enterprise, err := r.store.GetEnterpriseByID(ctx, pool.EnterpriseID)
		if err != nil {
			return nil, errors.Wrap(err, "fetching enterprise")
		}
		poolMgr, err = r.findEnterprisePoolManager(enterprise.Name)
		if err != nil {
			return nil, errors.Wrapf(err, "fetching pool manager for enterprise %s", pool.EnterpriseName)
		}
	}

	return poolMgr, nil
}

// DeleteRunner removes a runner from a pool. If forceDelete is true, GARM ignores
// any provider errors and still attempts to remove the runner from GitHub and then
// from the database. Force-deleted runners are marked with the pending_force_delete
// status, which allows cleanup to proceed even when a provider is misconfigured or
// keeps failing.
func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDelete, bypassGithubUnauthorized bool) error {
	if !auth.IsAdmin(ctx) {
		return runnerErrors.ErrUnauthorized
	}

	instance, err := r.store.GetInstanceByName(ctx, instanceName)
	if err != nil {
		return errors.Wrap(err, "fetching instance")
	}

	switch instance.Status {
	case commonParams.InstanceRunning, commonParams.InstanceError,
		commonParams.InstancePendingForceDelete, commonParams.InstancePendingDelete:
	default:
		validStates := []string{
			string(commonParams.InstanceRunning),
			string(commonParams.InstanceError),
			string(commonParams.InstancePendingForceDelete),
			string(commonParams.InstancePendingDelete),
		}
		return runnerErrors.NewBadRequestError("runner must be in one of the following states: %q", strings.Join(validStates, ", "))
	}

	poolMgr, err := r.getPoolManagerFromInstance(ctx, instance)
	if err != nil {
		return errors.Wrap(err, "fetching pool manager for instance")
	}

	if err := poolMgr.DeleteRunner(instance, forceDelete, bypassGithubUnauthorized); err != nil {
		return errors.Wrap(err, "removing runner")
	}
	return nil
}