From 5390efbaabeefae10a80f5cd5fcd35384a7a7754 Mon Sep 17 00:00:00 2001 From: Gabriel Adrian Samfira Date: Wed, 29 Jun 2022 16:23:01 +0000 Subject: [PATCH] Add manual runner removal Runners can now be manually removed using the CLI. Some restrictions apply: * A runner must be idle in github. Github will not allow us to remove a runner that is running a workflow. * The runner status must be "running" Signed-off-by: Gabriel Adrian Samfira --- apiserver/controllers/instances.go | 27 +- apiserver/controllers/pools.go | 1 - apiserver/routers/routers.go | 9 +- cloudconfig/templates.go | 12 +- cmd/garm-cli/client/client.go | 14 + cmd/garm-cli/cmd/runner.go | 40 +++ .../azure/cloudconfig/install_runner.tpl | 11 +- .../openstack/cloudconfig/install_runner.tpl | 11 +- database/sql/instances.go | 4 + database/sql/models.go | 1 + database/sql/util.go | 2 + errors/errors.go | 3 +- params/params.go | 7 +- params/requests.go | 2 + runner/common/pool.go | 3 +- runner/pool/common.go | 245 +++++++++++------- runner/pool/interfaces.go | 2 +- runner/pool/organization.go | 5 +- runner/pool/repository.go | 5 +- runner/runner.go | 52 ++++ 20 files changed, 336 insertions(+), 120 deletions(-) diff --git a/apiserver/controllers/instances.go b/apiserver/controllers/instances.go index 86518d44..b7332b6c 100644 --- a/apiserver/controllers/instances.go +++ b/apiserver/controllers/instances.go @@ -58,14 +58,14 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques w.WriteHeader(http.StatusBadRequest) json.NewEncoder(w).Encode(params.APIErrorResponse{ Error: "Bad Request", - Details: "No pool ID specified", + Details: "No runner name specified", }) return } instance, err := a.r.GetInstance(ctx, instanceName) if err != nil { - log.Printf("listing pools: %s", err) + log.Printf("listing instances: %s", err) handleError(w, err) return } @@ -74,6 +74,29 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques json.NewEncoder(w).Encode(instance) } +func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + vars := mux.Vars(r) + instanceName, ok := vars["instanceName"] + if !ok { + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(params.APIErrorResponse{ + Error: "Bad Request", + Details: "No instance name specified", + }) + return + } + + if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil { + log.Printf("removing runner: %s", err) + handleError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) +} + func (a *APIController) ListRepoInstancesHandler(w http.ResponseWriter, r *http.Request) { ctx := r.Context() vars := mux.Vars(r) diff --git a/apiserver/controllers/pools.go b/apiserver/controllers/pools.go index ebd0a8e0..50692a95 100644 --- a/apiserver/controllers/pools.go +++ b/apiserver/controllers/pools.go @@ -88,7 +88,6 @@ func (a *APIController) DeletePoolByIDHandler(w http.ResponseWriter, r *http.Req w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) - } func (a *APIController) UpdatePoolByIDHandler(w http.ResponseWriter, r *http.Request) { diff --git a/apiserver/routers/routers.go b/apiserver/routers/routers.go index 880a1508..f9a88e48 100644 --- a/apiserver/routers/routers.go +++ b/apiserver/routers/routers.go @@ -78,12 +78,15 @@ func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddl ///////////// // Runners // ///////////// - // List runners - apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS") - apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS") // Get instance apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS") apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS") + // Delete runner + apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS") + apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS") + // List runners + apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS") + apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS") ///////////////////// // Repos and pools // diff --git a/cloudconfig/templates.go b/cloudconfig/templates.go index fbea1b20..0a4f203c 100644 --- a/cloudconfig/templates.go +++ b/cloudconfig/templates.go @@ -41,7 +41,8 @@ function sendStatus() { function success() { MSG="$1" - call "{\"status\": \"idle\", \"message\": \"$MSG\"}" + ID=$2 + call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}" } function fail() { @@ -72,7 +73,14 @@ sendStatus "installing runner service" sendStatus "starting service" ./svc.sh start || fail "failed to start service" -success "runner successfully installed" +set +e +AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9) +if [ $? -ne 0 ];then + fail "failed to get agent ID" +fi +set -e + +success "runner successfully installed" $AGENT_ID ` type InstallRunnerParams struct { diff --git a/cmd/garm-cli/client/client.go b/cmd/garm-cli/client/client.go index e63bdf38..a32801ad 100644 --- a/cmd/garm-cli/client/client.go +++ b/cmd/garm-cli/client/client.go @@ -151,6 +151,20 @@ func (c *Client) GetInstanceByName(instanceName string) (params.Instance, error) return response, nil } +func (c *Client) DeleteRunner(instanceName string) error { + url := fmt.Sprintf("%s/api/v1/instances/%s", c.Config.BaseURL, instanceName) + resp, err := c.client.R(). + Delete(url) + if err != nil || resp.IsError() { + apiErr, decErr := c.decodeAPIError(resp.Body()) + if decErr != nil { + return errors.Wrap(decErr, "sending request") + } + return fmt.Errorf("error deleting runner: %s", apiErr.Details) + } + return nil +} + func (c *Client) ListPoolInstances(poolID string) ([]params.Instance, error) { url := fmt.Sprintf("%s/api/v1/pools/%s/instances", c.Config.BaseURL, poolID) diff --git a/cmd/garm-cli/cmd/runner.go b/cmd/garm-cli/cmd/runner.go index ba5a8a01..87347c9f 100644 --- a/cmd/garm-cli/cmd/runner.go +++ b/cmd/garm-cli/cmd/runner.go @@ -27,6 +27,7 @@ var ( runnerRepository string runnerOrganization string runnerAll bool + forceRemove bool ) // runnerCmd represents the runner command @@ -133,15 +134,54 @@ var runnerShowCmd = &cobra.Command{ }, } +var runnerDeleteCmd = &cobra.Command{ + Use: "delete", + Short: "Remove a runner", + Aliases: []string{"remove", "rm", "del"}, + Long: `Remove a runner. + +This command deletes an existing runner. If it registered in Github +and we recorded an agent ID for it, we will attempt to remove it from +Github first, then mark the runner as pending_delete so it will be +cleaned up by the provider. + +NOTE: An active runner cannot be removed from Github. You will have +to either cancel the workflow or wait for it to finish. +`, + SilenceUsage: true, + RunE: func(cmd *cobra.Command, args []string) error { + if needsInit { + return needsInitError + } + + if len(args) == 0 { + return fmt.Errorf("requires a runner name") + } + + if !forceRemove { + return fmt.Errorf("use --force-remove-runner=true to remove a runner") + } + + if err := cli.DeleteRunner(args[0]); err != nil { + return err + } + return nil + }, +} + func init() { runnerListCmd.Flags().StringVarP(&runnerRepository, "repo", "r", "", "List all runners from all pools within this repository.") runnerListCmd.Flags().StringVarP(&runnerOrganization, "org", "o", "", "List all runners from all pools withing this organization.") runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.") runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "all") + runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner") + runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner") + runnerCmd.AddCommand( runnerListCmd, runnerShowCmd, + runnerDeleteCmd, ) rootCmd.AddCommand(runnerCmd) diff --git a/contrib/providers.d/azure/cloudconfig/install_runner.tpl b/contrib/providers.d/azure/cloudconfig/install_runner.tpl index 0453fe8c..14d88479 100644 --- a/contrib/providers.d/azure/cloudconfig/install_runner.tpl +++ b/contrib/providers.d/azure/cloudconfig/install_runner.tpl @@ -24,7 +24,8 @@ function sendStatus() { function success() { MSG="$1" - call "{\"status\": \"idle\", \"message\": \"$MSG\"}" + ID=$2 + call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}" } function fail() { @@ -57,5 +58,11 @@ sendStatus "installing runner service" sendStatus "starting service" ./svc.sh start || fail "failed to start service" -success "runner successfully installed" +set +e +AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9) +if [ $? -ne 0 ];then + fail "failed to get agent ID" +fi +set -e +success "runner successfully installed" $AGENT_ID diff --git a/contrib/providers.d/openstack/cloudconfig/install_runner.tpl b/contrib/providers.d/openstack/cloudconfig/install_runner.tpl index 0453fe8c..14d88479 100644 --- a/contrib/providers.d/openstack/cloudconfig/install_runner.tpl +++ b/contrib/providers.d/openstack/cloudconfig/install_runner.tpl @@ -24,7 +24,8 @@ function sendStatus() { function success() { MSG="$1" - call "{\"status\": \"idle\", \"message\": \"$MSG\"}" + ID=$2 + call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}" } function fail() { @@ -57,5 +58,11 @@ sendStatus "installing runner service" sendStatus "starting service" ./svc.sh start || fail "failed to start service" -success "runner successfully installed" +set +e +AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9) +if [ $? -ne 0 ];then + fail "failed to get agent ID" +fi +set -e +success "runner successfully installed" $AGENT_ID diff --git a/database/sql/instances.go b/database/sql/instances.go index 055095d0..e190a2e8 100644 --- a/database/sql/instances.go +++ b/database/sql/instances.go @@ -159,6 +159,10 @@ func (s *sqlDatabase) UpdateInstance(ctx context.Context, instanceID string, par return params.Instance{}, errors.Wrap(err, "updating instance") } + if param.AgentID != 0 { + instance.AgentID = param.AgentID + } + if param.ProviderID != "" { instance.ProviderID = ¶m.ProviderID } diff --git a/database/sql/models.go b/database/sql/models.go index a68e4069..2e9e4a33 100644 --- a/database/sql/models.go +++ b/database/sql/models.go @@ -116,6 +116,7 @@ type Instance struct { ProviderID *string `gorm:"uniqueIndex"` Name string `gorm:"uniqueIndex"` + AgentID int64 OSType config.OSType OSArch config.OSArch OSName string diff --git a/database/sql/util.go b/database/sql/util.go index 454e7888..e047b18c 100644 --- a/database/sql/util.go +++ b/database/sql/util.go @@ -31,6 +31,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance { ret := params.Instance{ ID: instance.ID.String(), ProviderID: id, + AgentID: instance.AgentID, Name: instance.Name, OSType: instance.OSType, OSName: instance.OSName, @@ -42,6 +43,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance { CallbackURL: instance.CallbackURL, StatusMessages: []params.StatusMessage{}, CreateAttempt: instance.CreateAttempt, + UpdatedAt: instance.UpdatedAt, } if len(instance.ProviderFault) > 0 { diff --git a/errors/errors.go b/errors/errors.go index b205fa45..c3a6400b 100644 --- a/errors/errors.go +++ b/errors/errors.go @@ -29,7 +29,8 @@ var ( // ErrBadRequest is returned is a malformed request is sent ErrBadRequest = NewBadRequestError("invalid request") // ErrTimeout is returned when a timeout occurs. - ErrTimeout = fmt.Errorf("timed out") + ErrTimeout = fmt.Errorf("timed out") + ErrUnprocessable = fmt.Errorf("cannot process request") ) type baseError struct { diff --git a/params/params.go b/params/params.go index 226b220a..2c0c649c 100644 --- a/params/params.go +++ b/params/params.go @@ -47,6 +47,8 @@ type Instance struct { // with the compute instance. We use this to identify the // instance in the provider. ProviderID string `json:"provider_id,omitempty"` + // AgentID is the github runner agent ID. + AgentID int64 `json:"agent_id"` // Name is the name associated with an instance. Depending on // the provider, this may or may not be useful in the context of // the provider, but we can use it internally to identify the @@ -73,8 +75,9 @@ type Instance struct { StatusMessages []StatusMessage `json:"status_messages,omitempty"` // Do not serialize sensitive info. - CallbackURL string `json:"-"` - CreateAttempt int `json:"-"` + CallbackURL string `json:"-"` + CreateAttempt int `json:"-"` + UpdatedAt time.Time `json:"updated_at"` } type BootstrapInstance struct { diff --git a/params/requests.go b/params/requests.go index 14be3751..b6acc737 100644 --- a/params/requests.go +++ b/params/requests.go @@ -153,6 +153,7 @@ type UpdateInstanceParams struct { Status common.InstanceStatus `json:"status,omitempty"` RunnerStatus common.RunnerStatus `json:"runner_status,omitempty"` ProviderFault []byte `json:"provider_fault,omitempty"` + AgentID int64 `json:"-"` CreateAttempt int `json:"-"` } @@ -186,4 +187,5 @@ type UpdateRepositoryParams struct { type InstanceUpdateMessage struct { Status common.RunnerStatus `json:"status"` Message string `json:"message"` + AgentID *int64 `json:"agent_id"` } diff --git a/runner/common/pool.go b/runner/common/pool.go index 9165db62..1d8b77fa 100644 --- a/runner/common/pool.go +++ b/runner/common/pool.go @@ -26,10 +26,11 @@ const ( ) type PoolManager interface { + ID() string WebhookSecret() string HandleWorkflowJob(job params.WorkflowJob) error RefreshState(param params.UpdatePoolStateParams) error - ID() string + ForceDeleteRunner(runner params.Instance) error // AddPool(ctx context.Context, pool params.Pool) error // PoolManager lifecycle functions. Start/stop pool. diff --git a/runner/pool/common.go b/runner/pool/common.go index 85923b86..0d508983 100644 --- a/runner/pool/common.go +++ b/runner/pool/common.go @@ -24,6 +24,7 @@ import ( "garm/runner/common" providerCommon "garm/runner/providers/common" "log" + "net/http" "strings" "sync" "time" @@ -69,11 +70,6 @@ type basePool struct { // If we were offline and did not process the webhook, the instance will linger. // We need to remove it from the provider and database. func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) error { - // runners, err := r.getGithubRunners() - // if err != nil { - // return errors.Wrap(err, "fetching github runners") - // } - dbInstances, err := r.helper.FetchDbInstances() if err != nil { return errors.Wrap(err, "fetching instances from db") @@ -85,13 +81,25 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro } for _, instance := range dbInstances { - if providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingCreate || providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingDelete { + switch providerCommon.InstanceStatus(instance.Status) { + case providerCommon.InstancePendingCreate, + providerCommon.InstancePendingDelete: // this instance is in the process of being created or is awaiting deletion. - // Instances in pending_Create did not get a chance to register themselves in, + // Instances in pending_create did not get a chance to register themselves in, // github so we let them be for now. continue } + if ok := runnerNames[instance.Name]; !ok { + // if instance.Status == providerCommon.InstanceRunning { + // if time.Since(instance.UpdatedAt).Minutes() < 20 { + // // Allow up to 20 minutes for instance to finish installing. + // // Anything beyond that is considered a timeout and the instance + // // is marked for deletion. + // // TODO(gabriel-samfira): Make the timeout configurable. + // continue + // } + // } // Set pending_delete on DB field. Allow consolidate() to remove it. if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { log.Printf("failed to update runner %s status", instance.Name) @@ -102,6 +110,95 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro return nil } +// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear +// as offline and for which we no longer have a local instance. +// This may happen if someone manually deletes the instance in the provider. We need to +// first remove the instance from github, and then from our database. +func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error { + for _, runner := range runners { + status := runner.GetStatus() + if status != "offline" { + // Runner is online. Ignore it. + continue + } + + dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name) + if err != nil { + if !errors.Is(err, runnerErrors.ErrNotFound) { + return errors.Wrap(err, "fetching instance from DB") + } + // We no longer have a DB entry for this instance, and the runner appears offline in github. + // Previous forceful removal may have failed? + log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name) + resp, err := r.helper.RemoveGithubRunner(*runner.ID) + if err != nil { + // Removed in the meantime? + if resp != nil && resp.StatusCode == http.StatusNotFound { + continue + } + return errors.Wrap(err, "removing runner") + } + continue + } + + if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete { + // already marked for deleting, which means the github workflow finished. + // Let consolidate take care of it. + continue + } + + pool, err := r.helper.GetPoolByID(dbInstance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching pool") + } + + // check if the provider still has the instance. + provider, ok := r.providers[pool.ProviderName] + if !ok { + return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID) + } + + // Check if the instance is still on the provider. + _, err = provider.GetInstance(r.ctx, dbInstance.Name) + if err != nil { + if !errors.Is(err, runnerErrors.ErrNotFound) { + return errors.Wrap(err, "fetching instance from provider") + } + // The runner instance is no longer on the provider, and it appears offline in github. + // It should be safe to force remove it. + log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name) + resp, err := r.helper.RemoveGithubRunner(*runner.ID) + if err != nil { + // Removed in the meantime? + if resp != nil && resp.StatusCode == http.StatusNotFound { + log.Printf("runner dissapeared from github") + } else { + return errors.Wrap(err, "removing runner from github") + } + } + // Remove the database entry for the runner. + log.Printf("Removing %s from database", dbInstance.Name) + if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil { + return errors.Wrap(err, "removing runner from database") + } + continue + } + + if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning { + // instance is running, but github reports runner as offline. Log the event. + // This scenario requires manual intervention. + // Perhaps it just came online and github did not yet change it's status? + log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name) + continue + } + //start the instance + if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil { + return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID) + } + } + return nil +} + func (r *basePool) fetchInstance(runnerName string) (params.Instance, error) { runner, err := r.store.GetInstanceByName(r.ctx, runnerName) if err != nil { @@ -363,18 +460,18 @@ func (r *basePool) HandleWorkflowJob(job params.WorkflowJob) error { log.Printf("no runner was assigned. Skipping.") return nil } + // update instance workload state. + if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil { + log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) + return errors.Wrap(err, "updating runner") + } log.Printf("marking instance %s as pending_delete", job.WorkflowJob.RunnerName) if err := r.setInstanceStatus(job.WorkflowJob.RunnerName, providerCommon.InstancePendingDelete, nil); err != nil { log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) return errors.Wrap(err, "updating runner") } - // update instance workload state. Set job_id in instance state. - if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil { - log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) - return errors.Wrap(err, "updating runner") - } case "in_progress": - // update instance workload state. Set job_id in instance state. + // update instance workload state. if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerActive); err != nil { log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) return errors.Wrap(err, "updating runner") @@ -523,84 +620,6 @@ func (r *basePool) ensureMinIdleRunners() { wg.Wait() } -// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear -// as offline and for which we no longer have a local instance. -// This may happen if someone manually deletes the instance in the provider. We need to -// first remove the instance from github, and then from our database. -func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error { - for _, runner := range runners { - status := runner.GetStatus() - if status != "offline" { - // Runner is online. Ignore it. - continue - } - - dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name) - if err != nil { - if !errors.Is(err, runnerErrors.ErrNotFound) { - return errors.Wrap(err, "fetching instance from DB") - } - // We no longer have a DB entry for this instance, and the runner appears offline in github. - // Previous forceful removal may have failed? - log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name) - if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil { - return errors.Wrap(err, "removing runner") - } - continue - } - - if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete { - // already marked for deleting, which means the github workflow finished. - // Let consolidate take care of it. - continue - } - - pool, err := r.helper.GetPoolByID(dbInstance.PoolID) - if err != nil { - return errors.Wrap(err, "fetching pool") - } - - // check if the provider still has the instance. - provider, ok := r.providers[pool.ProviderName] - if !ok { - return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID) - } - - // Check if the instance is still on the provider. - _, err = provider.GetInstance(r.ctx, dbInstance.Name) - if err != nil { - if !errors.Is(err, runnerErrors.ErrNotFound) { - return errors.Wrap(err, "fetching instance from provider") - } - // The runner instance is no longer on the provider, and it appears offline in github. - // It should be safe to force remove it. - log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name) - if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil { - return errors.Wrap(err, "removing runner from github") - } - // Remove the database entry for the runner. - log.Printf("Removing %s from database", dbInstance.Name) - if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil { - return errors.Wrap(err, "removing runner from database") - } - continue - } - - if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning { - // instance is running, but github reports runner as offline. Log the event. - // This scenario requires manual intervention. - // Perhaps it just came online and github did not yet change it's status? - log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name) - continue - } - //start the instance - if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil { - return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID) - } - } - return nil -} - func (r *basePool) deleteInstanceFromProvider(instance params.Instance) error { pool, err := r.helper.GetPoolByID(instance.PoolID) if err != nil { @@ -657,19 +676,51 @@ func (r *basePool) deletePendingInstances() { if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceDeleting, nil); err != nil { log.Printf("failed to update runner %s status", instance.Name) } - go func(instance params.Instance) { - if err := r.deleteInstanceFromProvider(instance); err != nil { - // failed to remove from provider. Set the status back to pending_delete, which - // will retry the operation. - if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { - log.Printf("failed to update runner %s status", instance.Name) + go func(instance params.Instance) (err error) { + defer func(instance params.Instance) { + if err != nil { + // failed to remove from provider. Set the status back to pending_delete, which + // will retry the operation. + if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { + log.Printf("failed to update runner %s status", instance.Name) + } } + }(instance) + + err = r.deleteInstanceFromProvider(instance) + if err != nil { log.Printf("failed to delete instance from provider: %+v", err) } + return }(instance) } } +func (r *basePool) ForceDeleteRunner(runner params.Instance) error { + if runner.AgentID != 0 { + resp, err := r.helper.RemoveGithubRunner(runner.AgentID) + if err != nil { + if resp != nil { + switch resp.StatusCode { + case http.StatusUnprocessableEntity: + return errors.Wrapf(runnerErrors.ErrUnprocessable, "removing runner: %q", err) + case http.StatusNotFound: + return errors.Wrapf(runnerErrors.ErrNotFound, "removing runner: %q", err) + default: + return errors.Wrap(err, "removing runner") + } + } + return errors.Wrap(err, "removing runner") + } + } + + if err := r.setInstanceStatus(runner.Name, providerCommon.InstancePendingDelete, nil); err != nil { + log.Printf("failed to update runner %s status", runner.Name) + return errors.Wrap(err, "updating runner") + } + return nil +} + func (r *basePool) addPendingInstances() { // TODO: filter instances by status. instances, err := r.helper.FetchDbInstances() diff --git a/runner/pool/interfaces.go b/runner/pool/interfaces.go index 08a2bfd5..6cc522a9 100644 --- a/runner/pool/interfaces.go +++ b/runner/pool/interfaces.go @@ -25,7 +25,7 @@ type poolHelper interface { GetGithubRunners() ([]*github.Runner, error) FetchTools() ([]*github.RunnerApplicationDownload, error) FetchDbInstances() ([]params.Instance, error) - RemoveGithubRunner(runnerID int64) error + RemoveGithubRunner(runnerID int64) (*github.Response, error) ListPools() ([]params.Pool, error) GithubURL() string JwtToken() string diff --git a/runner/pool/organization.go b/runner/pool/organization.go index e51e2247..067c535f 100644 --- a/runner/pool/organization.go +++ b/runner/pool/organization.go @@ -112,9 +112,8 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) { return r.store.ListOrgInstances(r.ctx, r.id) } -func (r *organization) RemoveGithubRunner(runnerID int64) error { - _, err := r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID) - return errors.Wrap(err, "removing runner") +func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) { + return r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID) } func (r *organization) ListPools() ([]params.Pool, error) { diff --git a/runner/pool/repository.go b/runner/pool/repository.go index 02a27b2c..3c12d5d6 100644 --- a/runner/pool/repository.go +++ b/runner/pool/repository.go @@ -114,9 +114,8 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) { return r.store.ListRepoInstances(r.ctx, r.id) } -func (r *repository) RemoveGithubRunner(runnerID int64) error { - _, err := r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID) - return errors.Wrap(err, "removing runner") +func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) { + return r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID) } func (r *repository) ListPools() ([]params.Pool, error) { diff --git a/runner/runner.go b/runner/runner.go index 1e584391..f9e5acfd 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -39,6 +39,7 @@ import ( "garm/params" "garm/runner/common" "garm/runner/providers" + providerCommon "garm/runner/providers/common" "garm/util" "github.com/pkg/errors" @@ -538,9 +539,60 @@ func (r *Runner) AddInstanceStatusMessage(ctx context.Context, param params.Inst RunnerStatus: param.Status, } + if param.AgentID != nil { + updateParams.AgentID = *param.AgentID + } + if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil { return errors.Wrap(err, "updating runner state") } return nil } + +func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error { + if !auth.IsAdmin(ctx) { + return runnerErrors.ErrUnauthorized + } + + instance, err := r.store.GetInstanceByName(ctx, instanceName) + if err != nil { + return errors.Wrap(err, "fetching instance") + } + + if instance.Status != providerCommon.InstanceRunning { + return runnerErrors.NewBadRequestError("runner must be in %q state", providerCommon.InstanceRunning) + } + + pool, err := r.store.GetPoolByID(ctx, instance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching pool") + } + + var poolMgr common.PoolManager + + if pool.RepoID != "" { + repo, err := r.store.GetRepositoryByID(ctx, pool.RepoID) + if err != nil { + return errors.Wrap(err, "fetching repo") + } + poolMgr, err = r.findRepoPoolManager(repo.Owner, repo.Name) + if err != nil { + return errors.Wrapf(err, "fetching pool manager for repo %s", pool.RepoName) + } + } else if pool.OrgID != "" { + org, err := r.store.GetOrganizationByID(ctx, pool.OrgID) + if err != nil { + return errors.Wrap(err, "fetching org") + } + poolMgr, err = r.findOrgPoolManager(org.Name) + if err != nil { + return errors.Wrapf(err, "fetching pool manager for org %s", pool.OrgName) + } + } + + if err := poolMgr.ForceDeleteRunner(instance); err != nil { + return errors.Wrap(err, "removing runner") + } + return nil +}