Add force delete runner

This branch adds the ability to forcefully remove a runner from GARM.

When the operator wishes to manually remove a runner, the workflow is as
follows:

* Check that the runner exists in GitHub. If it does, attempt to
  remove it. An error here indicates that the runner may be processing
  a job. In this case, we don't continue and the operator gets immediate
  feedback from the API.
* Mark the runner in the database as pending_delete
* Allow the consolidate loop to reap it from the provider and remove it
  from the database.

Removing the instance from the provider is async. If the provider errs out,
GARM will keep trying to remove it in perpetuity until the provider succedes.

In situations where the provider is misconfigured, this will never happen, leaving
the instance in a permanent state of pending_delete.

A provider may fail for various reasons. Either credentials have expired, the
API endpoint has changed, the provider is misconfigured or the operator may just
have removed it from the config before cleaning up the runners. While some cases
are recoverable, some are not. We cannot have a situation in which we cannot clean
resources in garm because of a misconfiguration.

This change adds the pending_force_delete instance status. Instances marked with
this status, will be removed from GARM even if the provider reports an error.

The GARM cli has been modified to give new meaning to the --force-remove-runner
option. This option in the CLI is no longer mandatory. Instead, setting it will mark
the runner with the new pending_force_delete status. Omitting it will mark the runner
with the old status of pending_delete.

Fixes: #160

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2023-10-12 06:15:36 +00:00
parent 7f4f4bd7e1
commit d09f12dfd8
19 changed files with 221 additions and 88 deletions

View file

@ -18,6 +18,7 @@ import (
"encoding/json"
"log"
"net/http"
"strconv"
gErrors "github.com/cloudbase/garm-provider-common/errors"
"github.com/cloudbase/garm/apiserver/params"
@ -121,6 +122,12 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
// in: path
// required: true
//
// + name: forceRemove
// description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
// type: boolean
// in: query
// required: false
//
// Responses:
// default: APIErrorResponse
func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) {
@ -138,7 +145,8 @@ func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Req
return
}
if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil {
forceRemove, _ := strconv.ParseBool(r.URL.Query().Get("forceRemove"))
if err := a.r.DeleteRunner(ctx, instanceName, forceRemove); err != nil {
log.Printf("removing runner: %s", err)
handleError(w, err)
return

View file

@ -597,6 +597,10 @@ paths:
name: instanceName
required: true
type: string
- description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
in: query
name: forceRemove
type: boolean
responses:
default:
description: APIErrorResponse

View file

@ -14,6 +14,7 @@ import (
"github.com/go-openapi/runtime"
cr "github.com/go-openapi/runtime/client"
"github.com/go-openapi/strfmt"
"github.com/go-openapi/swag"
)
// NewDeleteInstanceParams creates a new DeleteInstanceParams object,
@ -61,6 +62,12 @@ DeleteInstanceParams contains all the parameters to send to the API endpoint
*/
type DeleteInstanceParams struct {
/* ForceRemove.
If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
*/
ForceRemove *bool
/* InstanceName.
Runner instance name.
@ -120,6 +127,17 @@ func (o *DeleteInstanceParams) SetHTTPClient(client *http.Client) {
o.HTTPClient = client
}
// WithForceRemove adds the forceRemove to the delete instance params
func (o *DeleteInstanceParams) WithForceRemove(forceRemove *bool) *DeleteInstanceParams {
o.SetForceRemove(forceRemove)
return o
}
// SetForceRemove adds the forceRemove to the delete instance params
func (o *DeleteInstanceParams) SetForceRemove(forceRemove *bool) {
o.ForceRemove = forceRemove
}
// WithInstanceName adds the instanceName to the delete instance params
func (o *DeleteInstanceParams) WithInstanceName(instanceName string) *DeleteInstanceParams {
o.SetInstanceName(instanceName)
@ -139,6 +157,23 @@ func (o *DeleteInstanceParams) WriteToRequest(r runtime.ClientRequest, reg strfm
}
var res []error
if o.ForceRemove != nil {
// query param forceRemove
var qrForceRemove bool
if o.ForceRemove != nil {
qrForceRemove = *o.ForceRemove
}
qForceRemove := swag.FormatBool(qrForceRemove)
if qForceRemove != "" {
if err := r.SetQueryParam("forceRemove", qForceRemove); err != nil {
return err
}
}
}
// path param instanceName
if err := r.SetPathParam("instanceName", o.InstanceName); err != nil {
return err

View file

@ -187,12 +187,9 @@ to either cancel the workflow or wait for it to finish.
return fmt.Errorf("requires a runner name")
}
if !forceRemove {
return fmt.Errorf("use --force-remove-runner=true to remove a runner")
}
deleteInstanceReq := apiClientInstances.NewDeleteInstanceParams()
deleteInstanceReq.InstanceName = args[0]
deleteInstanceReq.ForceRemove = &forceRemove
if err := apiCli.Instances.DeleteInstance(deleteInstanceReq, authToken); err != nil {
return err
}
@ -207,7 +204,7 @@ func init() {
runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.")
runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "enterprise", "all")
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner")
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Forcefully remove a runner. If set to true, GARM will ignore provider errors when removing the runner.")
runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner")
runnerCmd.AddCommand(

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -1578,12 +1578,13 @@ func (_m *Store) UpdateUser(ctx context.Context, user string, param params.Updat
return r0, r1
}
// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewStore(t interface {
type mockConstructorTestingTNewStore interface {
mock.TestingT
Cleanup(func())
}) *Store {
}
// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewStore(t mockConstructorTestingTNewStore) *Store {
mock := &Store{}
mock.Mock.Test(t)

2
go.mod
View file

@ -4,7 +4,7 @@ go 1.20
require (
github.com/BurntSushi/toml v1.2.1
github.com/cloudbase/garm-provider-common v0.1.0
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700
github.com/go-openapi/errors v0.20.4
github.com/go-openapi/runtime v0.26.0
github.com/go-openapi/strfmt v0.21.7

4
go.sum
View file

@ -22,8 +22,8 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn
github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudbase/garm-provider-common v0.1.0 h1:gc2n8nsLjt7G3InAfqZ+75iZjSIUkIx86d6/DFA2+jc=
github.com/cloudbase/garm-provider-common v0.1.0/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc=
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700 h1:ZCJ1zZ2WI/37ffzpRsu7t5zzShAMThhYsXw7bBNKBR0=
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -730,12 +730,13 @@ func (_m *GithubClient) RemoveRunner(ctx context.Context, owner string, repo str
return r0, r1
}
// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewGithubClient(t interface {
type mockConstructorTestingTNewGithubClient interface {
mock.TestingT
Cleanup(func())
}) *GithubClient {
}
// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewGithubClient(t mockConstructorTestingTNewGithubClient) *GithubClient {
mock := &GithubClient{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -215,12 +215,13 @@ func (_m *GithubEnterpriseClient) RemoveRunner(ctx context.Context, enterprise s
return r0, r1
}
// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewGithubEnterpriseClient(t interface {
type mockConstructorTestingTNewGithubEnterpriseClient interface {
mock.TestingT
Cleanup(func())
}) *GithubEnterpriseClient {
}
// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewGithubEnterpriseClient(t mockConstructorTestingTNewGithubEnterpriseClient) *GithubEnterpriseClient {
mock := &GithubEnterpriseClient{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -171,12 +171,13 @@ func (_m *OrganizationHooks) PingOrgHook(ctx context.Context, org string, id int
return r0, r1
}
// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewOrganizationHooks(t interface {
type mockConstructorTestingTNewOrganizationHooks interface {
mock.TestingT
Cleanup(func())
}) *OrganizationHooks {
}
// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewOrganizationHooks(t mockConstructorTestingTNewOrganizationHooks) *OrganizationHooks {
mock := &OrganizationHooks{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -14,6 +14,20 @@ type PoolManager struct {
mock.Mock
}
// DeleteRunner provides a mock function with given fields: runner, forceRemove
func (_m *PoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error {
ret := _m.Called(runner, forceRemove)
var r0 error
if rf, ok := ret.Get(0).(func(params.Instance, bool) error); ok {
r0 = rf(runner, forceRemove)
} else {
r0 = ret.Error(0)
}
return r0
}
// ForceDeleteRunner provides a mock function with given fields: runner
func (_m *PoolManager) ForceDeleteRunner(runner params.Instance) error {
ret := _m.Called(runner)
@ -250,12 +264,13 @@ func (_m *PoolManager) WebhookSecret() string {
return r0
}
// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewPoolManager(t interface {
type mockConstructorTestingTNewPoolManager interface {
mock.TestingT
Cleanup(func())
}) *PoolManager {
}
// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewPoolManager(t mockConstructorTestingTNewPoolManager) *PoolManager {
mock := &PoolManager{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -160,12 +160,13 @@ func (_m *Provider) Stop(ctx context.Context, instance string, force bool) error
return r0
}
// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewProvider(t interface {
type mockConstructorTestingTNewProvider interface {
mock.TestingT
Cleanup(func())
}) *Provider {
}
// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewProvider(t mockConstructorTestingTNewProvider) *Provider {
mock := &Provider{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -171,12 +171,13 @@ func (_m *RepositoryHooks) PingRepoHook(ctx context.Context, owner string, repo
return r0, r1
}
// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewRepositoryHooks(t interface {
type mockConstructorTestingTNewRepositoryHooks interface {
mock.TestingT
Cleanup(func())
}) *RepositoryHooks {
}
// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewRepositoryHooks(t mockConstructorTestingTNewRepositoryHooks) *RepositoryHooks {
mock := &RepositoryHooks{}
mock.Mock.Test(t)

View file

@ -38,22 +38,54 @@ const (
//go:generate mockery --all
type PoolManager interface {
// ID returns the ID of the entity (repo, org, enterprise)
ID() string
// WebhookSecret returns the unencrypted webhook secret associated with the webhook installed
// in GitHub for GARM. For GARM to receive webhook events for an entity, either the operator or
// GARM will have to create a webhook in GitHub which points to the GARM API server. To authenticate
// the webhook, a webhook secret is used. This function returns that secret.
WebhookSecret() string
// GithubRunnerRegistrationToken returns a new registration token for a github runner. This is used
// for GHES installations that have not yet upgraded to a version >= 3.10. Starting with 3.10, we use
// just-in-time runners, which no longer require exposing a runner registration token.
GithubRunnerRegistrationToken() (string, error)
// HandleWorkflowJob handles a workflow job meant for a particular entity. When a webhook is fired for
// a repo, org or enterprise, we determine the destination of that webhook, retrieve the pool manager
// for it and call this function with the WorkflowJob as a parameter.
HandleWorkflowJob(job params.WorkflowJob) error
// RefreshState allows us to update webhook secrets and configuration for a pool manager.
RefreshState(param params.UpdatePoolStateParams) error
// ForceDeleteRunner will attempt to remove a runner from the pool.
//
// Deprecated: FunctionName is deprecated. Use DeleteRunner instead.
ForceDeleteRunner(runner params.Instance) error
// DeleteRunner will attempt to remove a runner from the pool. If forceRemove is true, any error
// received from the provider will be ignored and we will procede to remove the runner from the database.
// An error received while attempting to remove from GitHub (other than 404) will still stop the deletion
// process. This can happen if the runner is already processing a job. At which point, you can simply cancel
// the job in github. Doing so will prompt GARM to reap the runner automatically.
DeleteRunner(runner params.Instance, forceRemove bool) error
// InstallWebhook will create a webhook in github for the entity associated with this pool manager.
InstallWebhook(ctx context.Context, param params.InstallWebhookParams) (params.HookInfo, error)
// GetWebhookInfo will return information about the webhook installed in github for the entity associated
GetWebhookInfo(ctx context.Context) (params.HookInfo, error)
// UninstallWebhook will remove the webhook installed in github for the entity associated with this pool manager.
UninstallWebhook(ctx context.Context) error
// RootCABundle will return a CA bundle that must be installed on all runners in order to properly validate
// x509 certificates used by various systems involved. This CA bundle is defined in the GARM config file and
// can include multiple CA certificates for the GARM api server, GHES server and any provider API endpoint that
// may use internal or self signed certificates.
RootCABundle() (params.CertificateBundle, error)
// PoolManager lifecycle functions. Start/stop pool.
// Start will start the pool manager and all associated workers.
Start() error
// Stop will stop the pool manager and all associated workers.
Stop() error
// Status will return the current status of the pool manager.
Status() params.PoolManagerStatus
// Wait will block until the pool manager has stopped.
Wait() error
}

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -373,12 +373,13 @@ func (_m *PoolManagerController) UpdateRepoPoolManager(ctx context.Context, repo
return r0, r1
}
// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewPoolManagerController(t interface {
type mockConstructorTestingTNewPoolManagerController interface {
mock.TestingT
Cleanup(func())
}) *PoolManagerController {
}
// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewPoolManagerController(t mockConstructorTestingTNewPoolManagerController) *PoolManagerController {
mock := &PoolManagerController{}
mock.Mock.Test(t)

View file

@ -381,7 +381,7 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne
switch commonParams.InstanceStatus(instance.Status) {
case commonParams.InstancePendingCreate,
commonParams.InstancePendingDelete:
commonParams.InstancePendingDelete, commonParams.InstancePendingForceDelete:
// this instance is in the process of being created or is awaiting deletion.
// Instances in pending_create did not get a chance to register themselves in,
// github so we let them be for now.
@ -454,13 +454,14 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
continue
}
// There are 2 cases (currently) where we consider a runner as timed out:
// There are 3 cases (currently) where we consider a runner as timed out:
// * The runner never joined github within the pool timeout
// * The runner managed to join github, but the setup process failed later and the runner
// never started on the instance.
// * A JIT config was created, but the runner never joined github.
if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" {
r.log("reaping timed-out/failed runner %s", instance.Name)
if err := r.ForceDeleteRunner(instance); err != nil {
if err := r.DeleteRunner(instance, false); err != nil {
r.log("failed to update runner %s status: %s", instance.Name, err)
return errors.Wrap(err, "updating runner")
}
@ -726,7 +727,7 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
defer func() {
if err != nil {
if instance.ID != "" {
if err := r.ForceDeleteRunner(instance); err != nil {
if err := r.DeleteRunner(instance, false); err != nil {
r.log("failed to cleanup instance: %s", instance.Name)
}
}
@ -1030,7 +1031,7 @@ func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool
g.Go(func() error {
r.log("scaling down idle worker %s from pool %s\n", instanceToDelete.Name, pool.ID)
if err := r.ForceDeleteRunner(instanceToDelete); err != nil {
if err := r.DeleteRunner(instanceToDelete, false); err != nil {
return fmt.Errorf("failed to delete instance %s: %w", instanceToDelete.ID, err)
}
return nil
@ -1272,7 +1273,7 @@ func (r *basePoolManager) deletePendingInstances() error {
r.log("removing instances in pending_delete")
for _, instance := range instances {
if instance.Status != commonParams.InstancePendingDelete {
if instance.Status != commonParams.InstancePendingDelete && instance.Status != commonParams.InstancePendingForceDelete {
// not in pending_delete status. Skip.
continue
}
@ -1284,6 +1285,7 @@ func (r *basePoolManager) deletePendingInstances() error {
continue
}
currentStatus := instance.Status
// Set the status to deleting before launching the goroutine that removes
// the runner from the provider (which can take a long time).
if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceDeleting, nil); err != nil {
@ -1300,9 +1302,9 @@ func (r *basePoolManager) deletePendingInstances() error {
defer func(instance params.Instance) {
if err != nil {
r.log("failed to remove instance %s: %s", instance.Name, err)
// failed to remove from provider. Set the status back to pending_delete, which
// will retry the operation.
if _, err := r.setInstanceStatus(instance.Name, commonParams.InstancePendingDelete, nil); err != nil {
// failed to remove from provider. Set status to previous value, which will retry
// the operation.
if _, err := r.setInstanceStatus(instance.Name, currentStatus, nil); err != nil {
r.log("failed to update runner %s status: %s", instance.Name, err)
}
}
@ -1311,7 +1313,10 @@ func (r *basePoolManager) deletePendingInstances() error {
r.log("removing instance %s from provider", instance.Name)
err = r.deleteInstanceFromProvider(r.ctx, instance)
if err != nil {
return fmt.Errorf("failed to remove instance from provider: %w", err)
if currentStatus != commonParams.InstancePendingForceDelete {
return fmt.Errorf("failed to remove instance from provider: %w", err)
}
log.Printf("failed to remove instance %s from provider (continuing anyway): %s", instance.Name, err)
}
r.log("removing instance %s from database", instance.Name)
if deleteErr := r.store.DeleteInstance(r.ctx, instance.PoolID, instance.Name); deleteErr != nil {
@ -1397,18 +1402,14 @@ func (r *basePoolManager) runnerCleanup() (err error) {
return fmt.Errorf("failed to reap timed out runners: %w", err)
}
if err := r.cleanupOrphanedRunners(); err != nil {
if err := r.cleanupOrphanedRunners(runners); err != nil {
return fmt.Errorf("failed to cleanup orphaned runners: %w", err)
}
return nil
}
func (r *basePoolManager) cleanupOrphanedRunners() error {
runners, err := r.helper.GetGithubRunners()
if err != nil {
return errors.Wrap(err, "fetching github runners")
}
func (r *basePoolManager) cleanupOrphanedRunners(runners []*github.Runner) error {
if err := r.cleanupOrphanedProviderRunners(runners); err != nil {
return errors.Wrap(err, "cleaning orphaned instances")
}
@ -1421,16 +1422,24 @@ func (r *basePoolManager) cleanupOrphanedRunners() error {
}
func (r *basePoolManager) Start() error {
go r.updateTools() //nolint
initialToolUpdate := make(chan struct{}, 1)
go func() {
r.updateTools() //nolint
initialToolUpdate <- struct{}{}
}()
go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false)
go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false)
go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false)
go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false)
go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false)
go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false)
go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true)
go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false)
go func() {
<-initialToolUpdate
defer close(initialToolUpdate)
go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false)
go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false)
go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false)
go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false)
go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false)
go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false)
go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true)
go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false)
}()
return nil
}
@ -1455,7 +1464,16 @@ func (r *basePoolManager) ID() string {
return r.helper.ID()
}
// ForceDeleteRunner will delete a runner from a pool.
//
// Deprecated: Use DeleteRunner instead.
func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error {
return r.DeleteRunner(runner, true)
}
// Delete runner will delete a runner from a pool. If forceRemove is set to true, any error received from
// the IaaS provider will be ignored and deletion will continue.
func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error {
if !r.managerIsRunning {
return runnerErrors.NewConflictError("pool manager is not running for %s", r.helper.String())
}
@ -1485,9 +1503,14 @@ func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error {
}
}
}
r.log("setting instance status for: %v", runner.Name)
if _, err := r.setInstanceStatus(runner.Name, commonParams.InstancePendingDelete, nil); err != nil {
instanceStatus := commonParams.InstancePendingDelete
if forceRemove {
instanceStatus = commonParams.InstancePendingForceDelete
}
r.log("setting instance status for %v to %v", runner.Name, instanceStatus)
if _, err := r.setInstanceStatus(runner.Name, instanceStatus, nil); err != nil {
r.log("failed to update runner %s status: %s", runner.Name, err)
return errors.Wrap(err, "updating runner")
}

View file

@ -898,7 +898,17 @@ func (r *Runner) getPoolManagerFromInstance(ctx context.Context, instance params
return poolMgr, nil
}
// ForceDeleteRunner will attempt to remove a runner from a pool.
//
// Deprecated: FunctionName is deprecated. Use DeleteRunner instead.
func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error {
return r.DeleteRunner(ctx, instanceName, true)
}
// DeleteRunner removes a runner from a pool. If forceDelete is true, GARM will ignore any provider errors
// that may occur, and attempt to remove the runner from GitHub and then the database, regardless of provider
// errors.
func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDelete bool) error {
if !auth.IsAdmin(ctx) {
return runnerErrors.ErrUnauthorized
}
@ -909,7 +919,8 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err
}
switch instance.Status {
case commonParams.InstanceRunning, commonParams.InstanceError:
case commonParams.InstanceRunning, commonParams.InstanceError,
commonParams.InstancePendingForceDelete, commonParams.InstancePendingDelete:
default:
return runnerErrors.NewBadRequestError("runner must be in %q or %q state", commonParams.InstanceRunning, commonParams.InstanceError)
}
@ -919,7 +930,7 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err
return errors.Wrap(err, "fetching pool manager for instance")
}
if err := poolMgr.ForceDeleteRunner(instance); err != nil {
if err := poolMgr.DeleteRunner(instance, forceDelete); err != nil {
return errors.Wrap(err, "removing runner")
}
return nil

View file

@ -39,14 +39,15 @@ const (
)
const (
InstanceRunning InstanceStatus = "running"
InstanceStopped InstanceStatus = "stopped"
InstanceError InstanceStatus = "error"
InstancePendingDelete InstanceStatus = "pending_delete"
InstanceDeleting InstanceStatus = "deleting"
InstancePendingCreate InstanceStatus = "pending_create"
InstanceCreating InstanceStatus = "creating"
InstanceStatusUnknown InstanceStatus = "unknown"
InstanceRunning InstanceStatus = "running"
InstanceStopped InstanceStatus = "stopped"
InstanceError InstanceStatus = "error"
InstancePendingDelete InstanceStatus = "pending_delete"
InstancePendingForceDelete InstanceStatus = "pending_force_delete"
InstanceDeleting InstanceStatus = "deleting"
InstancePendingCreate InstanceStatus = "pending_create"
InstanceCreating InstanceStatus = "creating"
InstanceStatusUnknown InstanceStatus = "unknown"
)
const (

2
vendor/modules.txt vendored
View file

@ -14,7 +14,7 @@ github.com/cespare/xxhash/v2
# github.com/chzyer/readline v1.5.1
## explicit; go 1.15
github.com/chzyer/readline
# github.com/cloudbase/garm-provider-common v0.1.0
# github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700
## explicit; go 1.20
github.com/cloudbase/garm-provider-common/cloudconfig
github.com/cloudbase/garm-provider-common/defaults