Merge pull request #183 from gabriel-samfira/force-remove-runner

Add force delete runner
This commit is contained in:
Gabriel 2023-11-21 20:56:24 +02:00 committed by GitHub
commit 05e179604d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
29 changed files with 359 additions and 104 deletions

View file

@ -18,6 +18,7 @@ import (
"encoding/json"
"log"
"net/http"
"strconv"
gErrors "github.com/cloudbase/garm-provider-common/errors"
"github.com/cloudbase/garm/apiserver/params"
@ -121,6 +122,12 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
// in: path
// required: true
//
// + name: forceRemove
// description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
// type: boolean
// in: query
// required: false
//
// Responses:
// default: APIErrorResponse
func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) {
@ -138,7 +145,8 @@ func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Req
return
}
if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil {
forceRemove, _ := strconv.ParseBool(r.URL.Query().Get("forceRemove"))
if err := a.r.DeleteRunner(ctx, instanceName, forceRemove); err != nil {
log.Printf("removing runner: %s", err)
handleError(w, err)
return

View file

@ -597,6 +597,10 @@ paths:
name: instanceName
required: true
type: string
- description: If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
in: query
name: forceRemove
type: boolean
responses:
default:
description: APIErrorResponse

View file

@ -14,6 +14,7 @@ import (
"github.com/go-openapi/runtime"
cr "github.com/go-openapi/runtime/client"
"github.com/go-openapi/strfmt"
"github.com/go-openapi/swag"
)
// NewDeleteInstanceParams creates a new DeleteInstanceParams object,
@ -61,6 +62,12 @@ DeleteInstanceParams contains all the parameters to send to the API endpoint
*/
type DeleteInstanceParams struct {
/* ForceRemove.
If true GARM will ignore any provider error when removing the runner and will continue to remove the runner from github and the GARM database.
*/
ForceRemove *bool
/* InstanceName.
Runner instance name.
@ -120,6 +127,17 @@ func (o *DeleteInstanceParams) SetHTTPClient(client *http.Client) {
o.HTTPClient = client
}
// WithForceRemove adds the forceRemove to the delete instance params
func (o *DeleteInstanceParams) WithForceRemove(forceRemove *bool) *DeleteInstanceParams {
o.SetForceRemove(forceRemove)
return o
}
// SetForceRemove adds the forceRemove to the delete instance params
func (o *DeleteInstanceParams) SetForceRemove(forceRemove *bool) {
o.ForceRemove = forceRemove
}
// WithInstanceName adds the instanceName to the delete instance params
func (o *DeleteInstanceParams) WithInstanceName(instanceName string) *DeleteInstanceParams {
o.SetInstanceName(instanceName)
@ -139,6 +157,23 @@ func (o *DeleteInstanceParams) WriteToRequest(r runtime.ClientRequest, reg strfm
}
var res []error
if o.ForceRemove != nil {
// query param forceRemove
var qrForceRemove bool
if o.ForceRemove != nil {
qrForceRemove = *o.ForceRemove
}
qForceRemove := swag.FormatBool(qrForceRemove)
if qForceRemove != "" {
if err := r.SetQueryParam("forceRemove", qForceRemove); err != nil {
return err
}
}
}
// path param instanceName
if err := r.SetPathParam("instanceName", o.InstanceName); err != nil {
return err

View file

@ -187,12 +187,9 @@ to either cancel the workflow or wait for it to finish.
return fmt.Errorf("requires a runner name")
}
if !forceRemove {
return fmt.Errorf("use --force-remove-runner=true to remove a runner")
}
deleteInstanceReq := apiClientInstances.NewDeleteInstanceParams()
deleteInstanceReq.InstanceName = args[0]
deleteInstanceReq.ForceRemove = &forceRemove
if err := apiCli.Instances.DeleteInstance(deleteInstanceReq, authToken); err != nil {
return err
}
@ -207,7 +204,7 @@ func init() {
runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.")
runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "enterprise", "all")
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner")
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Forcefully remove a runner. If set to true, GARM will ignore provider errors when removing the runner.")
runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner")
runnerCmd.AddCommand(

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -1578,12 +1578,13 @@ func (_m *Store) UpdateUser(ctx context.Context, user string, param params.Updat
return r0, r1
}
// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewStore(t interface {
type mockConstructorTestingTNewStore interface {
mock.TestingT
Cleanup(func())
}) *Store {
}
// NewStore creates a new instance of Store. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewStore(t mockConstructorTestingTNewStore) *Store {
mock := &Store{}
mock.Mock.Test(t)

2
go.mod
View file

@ -4,7 +4,7 @@ go 1.20
require (
github.com/BurntSushi/toml v1.2.1
github.com/cloudbase/garm-provider-common v0.1.0
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700
github.com/go-openapi/errors v0.20.4
github.com/go-openapi/runtime v0.26.0
github.com/go-openapi/strfmt v0.21.7

4
go.sum
View file

@ -22,8 +22,8 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn
github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cloudbase/garm-provider-common v0.1.0 h1:gc2n8nsLjt7G3InAfqZ+75iZjSIUkIx86d6/DFA2+jc=
github.com/cloudbase/garm-provider-common v0.1.0/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc=
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700 h1:ZCJ1zZ2WI/37ffzpRsu7t5zzShAMThhYsXw7bBNKBR0=
github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700/go.mod h1:igxJRT3OlykERYc6ssdRQXcb+BCaeSfnucg6I0OSoDc=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -730,12 +730,13 @@ func (_m *GithubClient) RemoveRunner(ctx context.Context, owner string, repo str
return r0, r1
}
// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewGithubClient(t interface {
type mockConstructorTestingTNewGithubClient interface {
mock.TestingT
Cleanup(func())
}) *GithubClient {
}
// NewGithubClient creates a new instance of GithubClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewGithubClient(t mockConstructorTestingTNewGithubClient) *GithubClient {
mock := &GithubClient{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -215,12 +215,13 @@ func (_m *GithubEnterpriseClient) RemoveRunner(ctx context.Context, enterprise s
return r0, r1
}
// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewGithubEnterpriseClient(t interface {
type mockConstructorTestingTNewGithubEnterpriseClient interface {
mock.TestingT
Cleanup(func())
}) *GithubEnterpriseClient {
}
// NewGithubEnterpriseClient creates a new instance of GithubEnterpriseClient. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewGithubEnterpriseClient(t mockConstructorTestingTNewGithubEnterpriseClient) *GithubEnterpriseClient {
mock := &GithubEnterpriseClient{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -171,12 +171,13 @@ func (_m *OrganizationHooks) PingOrgHook(ctx context.Context, org string, id int
return r0, r1
}
// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewOrganizationHooks(t interface {
type mockConstructorTestingTNewOrganizationHooks interface {
mock.TestingT
Cleanup(func())
}) *OrganizationHooks {
}
// NewOrganizationHooks creates a new instance of OrganizationHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewOrganizationHooks(t mockConstructorTestingTNewOrganizationHooks) *OrganizationHooks {
mock := &OrganizationHooks{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -14,6 +14,20 @@ type PoolManager struct {
mock.Mock
}
// DeleteRunner provides a mock function with given fields: runner, forceRemove
func (_m *PoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error {
ret := _m.Called(runner, forceRemove)
var r0 error
if rf, ok := ret.Get(0).(func(params.Instance, bool) error); ok {
r0 = rf(runner, forceRemove)
} else {
r0 = ret.Error(0)
}
return r0
}
// ForceDeleteRunner provides a mock function with given fields: runner
func (_m *PoolManager) ForceDeleteRunner(runner params.Instance) error {
ret := _m.Called(runner)
@ -250,12 +264,13 @@ func (_m *PoolManager) WebhookSecret() string {
return r0
}
// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewPoolManager(t interface {
type mockConstructorTestingTNewPoolManager interface {
mock.TestingT
Cleanup(func())
}) *PoolManager {
}
// NewPoolManager creates a new instance of PoolManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewPoolManager(t mockConstructorTestingTNewPoolManager) *PoolManager {
mock := &PoolManager{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -160,12 +160,13 @@ func (_m *Provider) Stop(ctx context.Context, instance string, force bool) error
return r0
}
// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewProvider(t interface {
type mockConstructorTestingTNewProvider interface {
mock.TestingT
Cleanup(func())
}) *Provider {
}
// NewProvider creates a new instance of Provider. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewProvider(t mockConstructorTestingTNewProvider) *Provider {
mock := &Provider{}
mock.Mock.Test(t)

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -171,12 +171,13 @@ func (_m *RepositoryHooks) PingRepoHook(ctx context.Context, owner string, repo
return r0, r1
}
// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewRepositoryHooks(t interface {
type mockConstructorTestingTNewRepositoryHooks interface {
mock.TestingT
Cleanup(func())
}) *RepositoryHooks {
}
// NewRepositoryHooks creates a new instance of RepositoryHooks. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewRepositoryHooks(t mockConstructorTestingTNewRepositoryHooks) *RepositoryHooks {
mock := &RepositoryHooks{}
mock.Mock.Test(t)

View file

@ -38,22 +38,54 @@ const (
//go:generate mockery --all
type PoolManager interface {
// ID returns the ID of the entity (repo, org, enterprise)
ID() string
// WebhookSecret returns the unencrypted webhook secret associated with the webhook installed
// in GitHub for GARM. For GARM to receive webhook events for an entity, either the operator or
// GARM will have to create a webhook in GitHub which points to the GARM API server. To authenticate
// the webhook, a webhook secret is used. This function returns that secret.
WebhookSecret() string
// GithubRunnerRegistrationToken returns a new registration token for a github runner. This is used
// for GHES installations that have not yet upgraded to a version >= 3.10. Starting with 3.10, we use
// just-in-time runners, which no longer require exposing a runner registration token.
GithubRunnerRegistrationToken() (string, error)
// HandleWorkflowJob handles a workflow job meant for a particular entity. When a webhook is fired for
// a repo, org or enterprise, we determine the destination of that webhook, retrieve the pool manager
// for it and call this function with the WorkflowJob as a parameter.
HandleWorkflowJob(job params.WorkflowJob) error
// RefreshState allows us to update webhook secrets and configuration for a pool manager.
RefreshState(param params.UpdatePoolStateParams) error
// ForceDeleteRunner will attempt to remove a runner from the pool.
//
// Deprecated: FunctionName is deprecated. Use DeleteRunner instead.
ForceDeleteRunner(runner params.Instance) error
// DeleteRunner will attempt to remove a runner from the pool. If forceRemove is true, any error
// received from the provider will be ignored and we will procede to remove the runner from the database.
// An error received while attempting to remove from GitHub (other than 404) will still stop the deletion
// process. This can happen if the runner is already processing a job. At which point, you can simply cancel
// the job in github. Doing so will prompt GARM to reap the runner automatically.
DeleteRunner(runner params.Instance, forceRemove bool) error
// InstallWebhook will create a webhook in github for the entity associated with this pool manager.
InstallWebhook(ctx context.Context, param params.InstallWebhookParams) (params.HookInfo, error)
// GetWebhookInfo will return information about the webhook installed in github for the entity associated
GetWebhookInfo(ctx context.Context) (params.HookInfo, error)
// UninstallWebhook will remove the webhook installed in github for the entity associated with this pool manager.
UninstallWebhook(ctx context.Context) error
// RootCABundle will return a CA bundle that must be installed on all runners in order to properly validate
// x509 certificates used by various systems involved. This CA bundle is defined in the GARM config file and
// can include multiple CA certificates for the GARM api server, GHES server and any provider API endpoint that
// may use internal or self signed certificates.
RootCABundle() (params.CertificateBundle, error)
// PoolManager lifecycle functions. Start/stop pool.
// Start will start the pool manager and all associated workers.
Start() error
// Stop will stop the pool manager and all associated workers.
Stop() error
// Status will return the current status of the pool manager.
Status() params.PoolManagerStatus
// Wait will block until the pool manager has stopped.
Wait() error
}

View file

@ -1,4 +1,4 @@
// Code generated by mockery v0.0.0-dev. DO NOT EDIT.
// Code generated by mockery v2.28.1. DO NOT EDIT.
package mocks
@ -373,12 +373,13 @@ func (_m *PoolManagerController) UpdateRepoPoolManager(ctx context.Context, repo
return r0, r1
}
// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewPoolManagerController(t interface {
type mockConstructorTestingTNewPoolManagerController interface {
mock.TestingT
Cleanup(func())
}) *PoolManagerController {
}
// NewPoolManagerController creates a new instance of PoolManagerController. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewPoolManagerController(t mockConstructorTestingTNewPoolManagerController) *PoolManagerController {
mock := &PoolManagerController{}
mock.Mock.Test(t)

View file

@ -381,7 +381,7 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne
switch commonParams.InstanceStatus(instance.Status) {
case commonParams.InstancePendingCreate,
commonParams.InstancePendingDelete:
commonParams.InstancePendingDelete, commonParams.InstancePendingForceDelete:
// this instance is in the process of being created or is awaiting deletion.
// Instances in pending_create did not get a chance to register themselves in,
// github so we let them be for now.
@ -454,13 +454,14 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
continue
}
// There are 2 cases (currently) where we consider a runner as timed out:
// There are 3 cases (currently) where we consider a runner as timed out:
// * The runner never joined github within the pool timeout
// * The runner managed to join github, but the setup process failed later and the runner
// never started on the instance.
// * A JIT config was created, but the runner never joined github.
if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" {
r.log("reaping timed-out/failed runner %s", instance.Name)
if err := r.ForceDeleteRunner(instance); err != nil {
if err := r.DeleteRunner(instance, false); err != nil {
r.log("failed to update runner %s status: %s", instance.Name, err)
return errors.Wrap(err, "updating runner")
}
@ -726,7 +727,7 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
defer func() {
if err != nil {
if instance.ID != "" {
if err := r.ForceDeleteRunner(instance); err != nil {
if err := r.DeleteRunner(instance, false); err != nil {
r.log("failed to cleanup instance: %s", instance.Name)
}
}
@ -1030,7 +1031,7 @@ func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool
g.Go(func() error {
r.log("scaling down idle worker %s from pool %s\n", instanceToDelete.Name, pool.ID)
if err := r.ForceDeleteRunner(instanceToDelete); err != nil {
if err := r.DeleteRunner(instanceToDelete, false); err != nil {
return fmt.Errorf("failed to delete instance %s: %w", instanceToDelete.ID, err)
}
return nil
@ -1272,7 +1273,7 @@ func (r *basePoolManager) deletePendingInstances() error {
r.log("removing instances in pending_delete")
for _, instance := range instances {
if instance.Status != commonParams.InstancePendingDelete {
if instance.Status != commonParams.InstancePendingDelete && instance.Status != commonParams.InstancePendingForceDelete {
// not in pending_delete status. Skip.
continue
}
@ -1284,6 +1285,7 @@ func (r *basePoolManager) deletePendingInstances() error {
continue
}
currentStatus := instance.Status
// Set the status to deleting before launching the goroutine that removes
// the runner from the provider (which can take a long time).
if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceDeleting, nil); err != nil {
@ -1300,9 +1302,9 @@ func (r *basePoolManager) deletePendingInstances() error {
defer func(instance params.Instance) {
if err != nil {
r.log("failed to remove instance %s: %s", instance.Name, err)
// failed to remove from provider. Set the status back to pending_delete, which
// will retry the operation.
if _, err := r.setInstanceStatus(instance.Name, commonParams.InstancePendingDelete, nil); err != nil {
// failed to remove from provider. Set status to previous value, which will retry
// the operation.
if _, err := r.setInstanceStatus(instance.Name, currentStatus, nil); err != nil {
r.log("failed to update runner %s status: %s", instance.Name, err)
}
}
@ -1311,7 +1313,10 @@ func (r *basePoolManager) deletePendingInstances() error {
r.log("removing instance %s from provider", instance.Name)
err = r.deleteInstanceFromProvider(r.ctx, instance)
if err != nil {
return fmt.Errorf("failed to remove instance from provider: %w", err)
if currentStatus != commonParams.InstancePendingForceDelete {
return fmt.Errorf("failed to remove instance from provider: %w", err)
}
log.Printf("failed to remove instance %s from provider (continuing anyway): %s", instance.Name, err)
}
r.log("removing instance %s from database", instance.Name)
if deleteErr := r.store.DeleteInstance(r.ctx, instance.PoolID, instance.Name); deleteErr != nil {
@ -1397,18 +1402,14 @@ func (r *basePoolManager) runnerCleanup() (err error) {
return fmt.Errorf("failed to reap timed out runners: %w", err)
}
if err := r.cleanupOrphanedRunners(); err != nil {
if err := r.cleanupOrphanedRunners(runners); err != nil {
return fmt.Errorf("failed to cleanup orphaned runners: %w", err)
}
return nil
}
func (r *basePoolManager) cleanupOrphanedRunners() error {
runners, err := r.helper.GetGithubRunners()
if err != nil {
return errors.Wrap(err, "fetching github runners")
}
func (r *basePoolManager) cleanupOrphanedRunners(runners []*github.Runner) error {
if err := r.cleanupOrphanedProviderRunners(runners); err != nil {
return errors.Wrap(err, "cleaning orphaned instances")
}
@ -1421,16 +1422,24 @@ func (r *basePoolManager) cleanupOrphanedRunners() error {
}
func (r *basePoolManager) Start() error {
go r.updateTools() //nolint
initialToolUpdate := make(chan struct{}, 1)
go func() {
r.updateTools() //nolint
initialToolUpdate <- struct{}{}
}()
go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false)
go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false)
go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false)
go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false)
go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false)
go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false)
go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true)
go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false)
go func() {
<-initialToolUpdate
defer close(initialToolUpdate)
go r.startLoopForFunction(r.runnerCleanup, common.PoolReapTimeoutInterval, "timeout_reaper", false)
go r.startLoopForFunction(r.scaleDown, common.PoolScaleDownInterval, "scale_down", false)
go r.startLoopForFunction(r.deletePendingInstances, common.PoolConsilitationInterval, "consolidate[delete_pending]", false)
go r.startLoopForFunction(r.addPendingInstances, common.PoolConsilitationInterval, "consolidate[add_pending]", false)
go r.startLoopForFunction(r.ensureMinIdleRunners, common.PoolConsilitationInterval, "consolidate[ensure_min_idle]", false)
go r.startLoopForFunction(r.retryFailedInstances, common.PoolConsilitationInterval, "consolidate[retry_failed]", false)
go r.startLoopForFunction(r.updateTools, common.PoolToolUpdateInterval, "update_tools", true)
go r.startLoopForFunction(r.consumeQueuedJobs, common.PoolConsilitationInterval, "job_queue_consumer", false)
}()
return nil
}
@ -1455,7 +1464,16 @@ func (r *basePoolManager) ID() string {
return r.helper.ID()
}
// ForceDeleteRunner will delete a runner from a pool.
//
// Deprecated: Use DeleteRunner instead.
func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error {
return r.DeleteRunner(runner, true)
}
// Delete runner will delete a runner from a pool. If forceRemove is set to true, any error received from
// the IaaS provider will be ignored and deletion will continue.
func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove bool) error {
if !r.managerIsRunning {
return runnerErrors.NewConflictError("pool manager is not running for %s", r.helper.String())
}
@ -1485,9 +1503,14 @@ func (r *basePoolManager) ForceDeleteRunner(runner params.Instance) error {
}
}
}
r.log("setting instance status for: %v", runner.Name)
if _, err := r.setInstanceStatus(runner.Name, commonParams.InstancePendingDelete, nil); err != nil {
instanceStatus := commonParams.InstancePendingDelete
if forceRemove {
instanceStatus = commonParams.InstancePendingForceDelete
}
r.log("setting instance status for %v to %v", runner.Name, instanceStatus)
if _, err := r.setInstanceStatus(runner.Name, instanceStatus, nil); err != nil {
r.log("failed to update runner %s status: %s", runner.Name, err)
return errors.Wrap(err, "updating runner")
}

View file

@ -898,7 +898,17 @@ func (r *Runner) getPoolManagerFromInstance(ctx context.Context, instance params
return poolMgr, nil
}
// ForceDeleteRunner will attempt to remove a runner from a pool.
//
// Deprecated: FunctionName is deprecated. Use DeleteRunner instead.
func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error {
return r.DeleteRunner(ctx, instanceName, true)
}
// DeleteRunner removes a runner from a pool. If forceDelete is true, GARM will ignore any provider errors
// that may occur, and attempt to remove the runner from GitHub and then the database, regardless of provider
// errors.
func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDelete bool) error {
if !auth.IsAdmin(ctx) {
return runnerErrors.ErrUnauthorized
}
@ -909,7 +919,8 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err
}
switch instance.Status {
case commonParams.InstanceRunning, commonParams.InstanceError:
case commonParams.InstanceRunning, commonParams.InstanceError,
commonParams.InstancePendingForceDelete, commonParams.InstancePendingDelete:
default:
return runnerErrors.NewBadRequestError("runner must be in %q or %q state", commonParams.InstanceRunning, commonParams.InstanceError)
}
@ -919,7 +930,7 @@ func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) err
return errors.Wrap(err, "fetching pool manager for instance")
}
if err := poolMgr.ForceDeleteRunner(instance); err != nil {
if err := poolMgr.DeleteRunner(instance, forceDelete); err != nil {
return errors.Wrap(err, "removing runner")
}
return nil

View file

@ -50,6 +50,14 @@ description = "Local LXD installation"
protocol = "simplestreams"
skip_verify = false
[[provider]]
name = "test_external"
description = "external test provider"
provider_type = "external"
[provider.external]
config_file = "/etc/garm/test-provider/config"
provider_executable = "/etc/garm/test-provider/garm-external-provider"
[[github]]
name = "${CREDENTIALS_NAME}"
description = "GARM GitHub OAuth token"

View file

@ -387,9 +387,9 @@ func getInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWrit
return &getInstancesResponse.Payload, nil
}
func deleteInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWriter, instanceID string) error {
func deleteInstance(apiCli *client.GarmAPI, apiAuthToken runtime.ClientAuthInfoWriter, instanceID string, forceRemove bool) error {
return apiCli.Instances.DeleteInstance(
clientInstances.NewDeleteInstanceParams().WithInstanceName(instanceID),
clientInstances.NewDeleteInstanceParams().WithInstanceName(instanceID).WithForceRemove(&forceRemove),
apiAuthToken)
}

View file

@ -72,7 +72,7 @@ func GracefulCleanup() {
panic(err)
}
for _, instance := range poolInstances {
if err := deleteInstance(cli, authToken, instance.Name); err != nil {
if err := deleteInstance(cli, authToken, instance.Name, false); err != nil {
panic(err)
}
log.Printf("Instance %s deletion initiated", instance.Name)

View file

@ -33,7 +33,14 @@ func waitInstanceStatus(name string, status commonParams.InstanceStatus, runnerS
return nil, fmt.Errorf("timeout waiting for instance %s status to reach status %s and runner status %s", name, status, runnerStatus)
}
func waitInstanceToBeRemoved(name string, timeout time.Duration) error {
func DeleteInstance(name string, forceRemove bool) {
if err := deleteInstance(cli, authToken, name, forceRemove); err != nil {
panic(err)
}
log.Printf("Instance %s deletion initiated", name)
}
func WaitInstanceToBeRemoved(name string, timeout time.Duration) error {
var timeWaited time.Duration = 0
var instance *params.Instance
@ -67,7 +74,7 @@ func waitInstanceToBeRemoved(name string, timeout time.Duration) error {
return fmt.Errorf("instance %s was not removed within the timeout", name)
}
func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error {
func WaitPoolInstances(poolID string, status commonParams.InstanceStatus, runnerStatus params.RunnerStatus, timeout time.Duration) error {
var timeWaited time.Duration = 0
pool, err := getPool(cli, authToken, poolID)
@ -75,22 +82,22 @@ func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error {
return err
}
log.Printf("Waiting for pool %s to have all instances as idle running", poolID)
log.Printf("Waiting for pool %s instances to reach status: %s and runner status: %s", poolID, status, runnerStatus)
for timeWaited < timeout {
poolInstances, err := listPoolInstances(cli, authToken, poolID)
if err != nil {
return err
}
runningIdleCount := 0
instancesCount := 0
for _, instance := range poolInstances {
if instance.Status == commonParams.InstanceRunning && instance.RunnerStatus == params.RunnerIdle {
runningIdleCount++
if instance.Status == status && instance.RunnerStatus == runnerStatus {
instancesCount++
}
}
log.Printf("Pool min idle runners: %d, pool instances: %d, current pool running idle instances: %d", pool.MinIdleRunners, len(poolInstances), runningIdleCount)
if runningIdleCount == int(pool.MinIdleRunners) && runningIdleCount == len(poolInstances) {
log.Printf("Pool %s instance reached status: %s and runner status: %s: %d/%d", poolID, status, runnerStatus, instancesCount, len(poolInstances))
if instancesCount == int(pool.MinIdleRunners) && instancesCount == len(poolInstances) {
return nil
}
time.Sleep(5 * time.Second)
@ -99,5 +106,5 @@ func waitPoolRunningIdleInstances(poolID string, timeout time.Duration) error {
_ = dumpPoolInstancesDetails(pool.ID)
return fmt.Errorf("timeout waiting for pool %s to have all idle instances running", poolID)
return fmt.Errorf("timeout waiting for pool %s instances to reach status: %s and runner status: %s", poolID, status, runnerStatus)
}

View file

@ -41,13 +41,13 @@ func ValidateJobLifecycle(label string) {
}
// wait for instance to be removed
err = waitInstanceToBeRemoved(instance.Name, 5*time.Minute)
err = WaitInstanceToBeRemoved(instance.Name, 5*time.Minute)
if err != nil {
panic(err)
}
// wait for GARM to rebuild the pool running idle instances
err = waitPoolRunningIdleInstances(instance.PoolID, 6*time.Minute)
err = WaitPoolInstances(instance.PoolID, commonParams.InstanceRunning, params.RunnerIdle, 5*time.Minute)
if err != nil {
panic(err)
}

View file

@ -4,6 +4,7 @@ import (
"log"
"time"
commonParams "github.com/cloudbase/garm-provider-common/params"
"github.com/cloudbase/garm/params"
)
@ -100,7 +101,7 @@ func WaitOrgRunningIdleInstances(orgID string, timeout time.Duration) {
panic(err)
}
for _, pool := range orgPools {
err := waitPoolRunningIdleInstances(pool.ID, timeout)
err := WaitPoolInstances(pool.ID, commonParams.InstanceRunning, params.RunnerIdle, timeout)
if err != nil {
_ = dumpOrgInstancesDetails(orgID)
panic(err)

View file

@ -4,6 +4,7 @@ import (
"log"
"time"
commonParams "github.com/cloudbase/garm-provider-common/params"
"github.com/cloudbase/garm/params"
)
@ -95,13 +96,22 @@ func DeleteRepoPool(repoID, repoPoolID string) {
}
}
func DisableRepoPool(repoID, repoPoolID string) {
log.Printf("Disable repo %s pool %s", repoID, repoPoolID)
enabled := false
poolParams := params.UpdatePoolParams{Enabled: &enabled}
if _, err := updateRepoPool(cli, authToken, repoID, repoPoolID, poolParams); err != nil {
panic(err)
}
}
func WaitRepoRunningIdleInstances(repoID string, timeout time.Duration) {
repoPools, err := listRepoPools(cli, authToken, repoID)
if err != nil {
panic(err)
}
for _, pool := range repoPools {
err := waitPoolRunningIdleInstances(pool.ID, timeout)
err := WaitPoolInstances(pool.ID, commonParams.InstanceRunning, params.RunnerIdle, timeout)
if err != nil {
_ = dumpRepoInstancesDetails(repoID)
panic(err)

View file

@ -2,6 +2,7 @@ package main
import (
"fmt"
"log"
"os"
"time"
@ -34,6 +35,17 @@ var (
Tags: []string{"repo-runner"},
Enabled: true,
}
repoPoolParams2 = params.CreatePoolParams{
MaxRunners: 2,
MinIdleRunners: 0,
Flavor: "default",
Image: "ubuntu:22.04",
OSType: commonParams.Linux,
OSArch: commonParams.Amd64,
ProviderName: "test_external",
Tags: []string{"repo-runner-2"},
Enabled: true,
}
orgName = os.Getenv("ORG_NAME")
orgWebhookSecret = os.Getenv("ORG_WEBHOOK_SECRET")
@ -101,6 +113,29 @@ func main() {
repoPool = e2e.CreateRepoPool(repo.ID, repoPoolParams)
_ = e2e.UpdateRepoPool(repo.ID, repoPool.ID, repoPoolParams.MaxRunners, 1)
/////////////////////////////
// Test external provider ///
/////////////////////////////
repoPool2 := e2e.CreateRepoPool(repo.ID, repoPoolParams2)
_ = e2e.UpdateRepoPool(repo.ID, repoPool2.ID, repoPoolParams2.MaxRunners, 1)
err := e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstanceRunning, params.RunnerPending, 1*time.Minute)
if err != nil {
log.Printf("Failed to wait for instance to be running: %v", err)
}
repoPool2 = e2e.GetRepoPool(repo.ID, repoPool2.ID)
e2e.DisableRepoPool(repo.ID, repoPool2.ID)
e2e.DeleteInstance(repoPool2.Instances[0].Name, false)
err = e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstancePendingDelete, params.RunnerPending, 1*time.Minute)
if err != nil {
log.Printf("Failed to wait for instance to be running: %v", err)
}
e2e.DeleteInstance(repoPool2.Instances[0].Name, true) // delete instance with forceRemove
err = e2e.WaitInstanceToBeRemoved(repoPool2.Instances[0].Name, 1*time.Minute)
if err != nil {
log.Printf("Failed to wait for instance to be removed: %v", err)
}
e2e.DeleteRepoPool(repo.ID, repoPool2.ID)
///////////////////
// organizations //
///////////////////

View file

@ -0,0 +1,56 @@
#!/bin/bash
set -e
set -o pipefail
if [ ! -t 0 ]
then
INPUT=$(cat -)
fi
if [ -z "$GARM_PROVIDER_CONFIG_FILE" ]
then
echo "no config file specified in env"
exit 1
fi
source "$GARM_PROVIDER_CONFIG_FILE"
function CreateInstance() {
if [ -z "$INPUT" ]; then
echo "expected build params in stdin"
exit 1
fi
jq -rnc '{"provider_id": "test-provider-id", "name": "test-instance-name", "os_type": "linux", "os_name": "ubuntu", "os_version": "20.04", "os_arch": "x86_64", "status": "running"}'
}
case "$GARM_COMMAND" in
"CreateInstance")
CreateInstance
;;
"DeleteInstance")
echo "RemoveAllInstances not implemented"
exit 1
;;
"GetInstance")
echo "Get instance with id: ${GARM_INSTANCE_ID}"
;;
"ListInstances")
echo "List instances with pool id: ${GARM_POOL_ID}"
;;
"StartInstance")
echo "Start instance: ${GARM_INSTANCE_NAME} with id: ${GARM_INSTANCE_ID}"
;;
"StopInstance")
echo "Stop instance: ${GARM_INSTANCE_NAME} with id: ${GARM_INSTANCE_ID}"
;;
"RemoveAllInstances")
echo "RemoveAllInstances not implemented"
exit 1
;;
*)
echo "Invalid GARM provider command: \"$GARM_COMMAND\""
exit 1
;;
esac

View file

@ -5,6 +5,7 @@ DIR="$(dirname $0)"
BINARIES_DIR="$PWD/bin"
CONTRIB_DIR="$PWD/contrib"
CONFIG_DIR="$PWD/test/integration/config"
CONFIG_DIR_PROV="$PWD/test/integration/provider"
if [[ ! -f $BINARIES_DIR/garm ]] || [[ ! -f $BINARIES_DIR/garm-cli ]]; then
echo "ERROR: Please build GARM binaries first"
@ -46,6 +47,10 @@ sudo mkdir -p /etc/garm
cat $CONFIG_DIR/config.toml | envsubst | sudo tee /etc/garm/config.toml
sudo chown -R garm:garm /etc/garm
sudo mkdir /etc/garm/test-provider
sudo touch $CONFIG_DIR_PROV/config
sudo cp $CONFIG_DIR_PROV/* /etc/garm/test-provider
sudo mv $BINARIES_DIR/* /usr/local/bin/
sudo cp $CONTRIB_DIR/garm.service /etc/systemd/system/garm.service

View file

@ -39,14 +39,15 @@ const (
)
const (
InstanceRunning InstanceStatus = "running"
InstanceStopped InstanceStatus = "stopped"
InstanceError InstanceStatus = "error"
InstancePendingDelete InstanceStatus = "pending_delete"
InstanceDeleting InstanceStatus = "deleting"
InstancePendingCreate InstanceStatus = "pending_create"
InstanceCreating InstanceStatus = "creating"
InstanceStatusUnknown InstanceStatus = "unknown"
InstanceRunning InstanceStatus = "running"
InstanceStopped InstanceStatus = "stopped"
InstanceError InstanceStatus = "error"
InstancePendingDelete InstanceStatus = "pending_delete"
InstancePendingForceDelete InstanceStatus = "pending_force_delete"
InstanceDeleting InstanceStatus = "deleting"
InstancePendingCreate InstanceStatus = "pending_create"
InstanceCreating InstanceStatus = "creating"
InstanceStatusUnknown InstanceStatus = "unknown"
)
const (

2
vendor/modules.txt vendored
View file

@ -14,7 +14,7 @@ github.com/cespare/xxhash/v2
# github.com/chzyer/readline v1.5.1
## explicit; go 1.15
github.com/chzyer/readline
# github.com/cloudbase/garm-provider-common v0.1.0
# github.com/cloudbase/garm-provider-common v0.1.1-0.20231012061429-49001794e700
## explicit; go 1.20
github.com/cloudbase/garm-provider-common/cloudconfig
github.com/cloudbase/garm-provider-common/defaults