Add manual runner removal

Runners can now be manually removed using the CLI. Some restrictions apply:

  * A runner must be idle in GitHub. GitHub will not allow us to remove a runner
that is running a workflow.
  * The runner status must be "running".

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2022-06-29 16:23:01 +00:00
parent 1f419d0abc
commit 5390efbaab
20 changed files with 336 additions and 120 deletions

View file

@ -24,6 +24,7 @@ import (
"garm/runner/common"
providerCommon "garm/runner/providers/common"
"log"
"net/http"
"strings"
"sync"
"time"
@ -69,11 +70,6 @@ type basePool struct {
// If we were offline and did not process the webhook, the instance will linger.
// We need to remove it from the provider and database.
func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) error {
// runners, err := r.getGithubRunners()
// if err != nil {
// return errors.Wrap(err, "fetching github runners")
// }
dbInstances, err := r.helper.FetchDbInstances()
if err != nil {
return errors.Wrap(err, "fetching instances from db")
@ -85,13 +81,25 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
}
for _, instance := range dbInstances {
if providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingCreate || providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingDelete {
switch providerCommon.InstanceStatus(instance.Status) {
case providerCommon.InstancePendingCreate,
providerCommon.InstancePendingDelete:
// this instance is in the process of being created or is awaiting deletion.
// Instances in pending_Create did not get a chance to register themselves in,
// Instances in pending_create did not get a chance to register themselves in,
// github so we let them be for now.
continue
}
if ok := runnerNames[instance.Name]; !ok {
// if instance.Status == providerCommon.InstanceRunning {
// if time.Since(instance.UpdatedAt).Minutes() < 20 {
// // Allow up to 20 minutes for instance to finish installing.
// // Anything beyond that is considered a timeout and the instance
// // is marked for deletion.
// // TODO(gabriel-samfira): Make the timeout configurable.
// continue
// }
// }
// Set pending_delete on DB field. Allow consolidate() to remove it.
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
@ -102,6 +110,95 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
return nil
}
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
//
// For every runner whose status is "offline" the following cases are handled:
//   - no DB entry exists: the runner is removed from github.
//   - the DB entry is pending_delete: nothing is done here.
//   - the provider no longer has the instance: the runner is removed from github
//     and the DB entry is deleted.
//   - the provider has the instance and the DB status is running: the mismatch
//     is only logged.
//   - anything else: the provider is asked to start the instance.
//
// A 404 from github while removing a runner is tolerated. Any other error aborts
// the loop and is returned wrapped; remaining runners are not processed.
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
	for _, runner := range runners {
		if runner.GetStatus() != "offline" {
			// Runner is online. Ignore it.
			continue
		}
		if runner.Name == nil || runner.ID == nil {
			// Without a name we cannot look the runner up, and without an ID we
			// cannot remove it. Skip the entry instead of dereferencing nil.
			log.Printf("skipping offline github runner with missing name or ID")
			continue
		}
		runnerName, runnerID := *runner.Name, *runner.ID

		dbInstance, err := r.store.GetInstanceByName(r.ctx, runnerName)
		if err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				return errors.Wrap(err, "fetching instance from DB")
			}
			// We no longer have a DB entry for this instance, and the runner appears offline in github.
			// Previous forceful removal may have failed?
			log.Printf("Runner %s has no database entry in garm, removing from github", runnerName)
			resp, err := r.helper.RemoveGithubRunner(runnerID)
			if err != nil {
				// Removed in the meantime?
				if resp != nil && resp.StatusCode == http.StatusNotFound {
					continue
				}
				return errors.Wrap(err, "removing runner")
			}
			continue
		}

		if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
			// already marked for deleting, which means the github workflow finished.
			// Let consolidate take care of it.
			continue
		}

		pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
		if err != nil {
			return errors.Wrap(err, "fetching pool")
		}

		provider, ok := r.providers[pool.ProviderName]
		if !ok {
			return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
		}

		// Check if the instance is still on the provider.
		if _, err = provider.GetInstance(r.ctx, dbInstance.Name); err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				return errors.Wrap(err, "fetching instance from provider")
			}
			// The runner instance is no longer on the provider, and it appears offline in github.
			// It should be safe to force remove it.
			log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
			resp, err := r.helper.RemoveGithubRunner(runnerID)
			if err != nil {
				// Removed in the meantime? Anything other than a 404 is fatal.
				if resp == nil || resp.StatusCode != http.StatusNotFound {
					return errors.Wrap(err, "removing runner from github")
				}
				log.Printf("runner dissapeared from github")
			}
			// Remove the database entry for the runner.
			log.Printf("Removing %s from database", dbInstance.Name)
			if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
				return errors.Wrap(err, "removing runner from database")
			}
			continue
		}

		if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
			// instance is running, but github reports runner as offline. Log the event.
			// This scenario requires manual intervention.
			// Perhaps it just came online and github did not yet change its status?
			log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
			continue
		}

		// The instance exists on the provider but is not recorded as running:
		// ask the provider to start it.
		if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
			return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
		}
	}
	return nil
}
func (r *basePool) fetchInstance(runnerName string) (params.Instance, error) {
runner, err := r.store.GetInstanceByName(r.ctx, runnerName)
if err != nil {
@ -363,18 +460,18 @@ func (r *basePool) HandleWorkflowJob(job params.WorkflowJob) error {
log.Printf("no runner was assigned. Skipping.")
return nil
}
// update instance workload state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
}
log.Printf("marking instance %s as pending_delete", job.WorkflowJob.RunnerName)
if err := r.setInstanceStatus(job.WorkflowJob.RunnerName, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
}
// update instance workload state. Set job_id in instance state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
}
case "in_progress":
// update instance workload state. Set job_id in instance state.
// update instance workload state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerActive); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
@ -523,84 +620,6 @@ func (r *basePool) ensureMinIdleRunners() {
wg.Wait()
}
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
//
// The first error encountered aborts the loop and is returned wrapped; runners
// after the failing one are not processed.
//
// NOTE(review): this copy uses RemoveGithubRunner with a single error return
// value, so any removal failure (including a runner that is already gone from
// github) is returned to the caller.
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
for _, runner := range runners {
status := runner.GetStatus()
if status != "offline" {
// Runner is online. Ignore it.
continue
}
// Look the runner up by name in our database.
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from DB")
}
// We no longer have a DB entry for this instance, and the runner appears offline in github.
// Previous forceful removal may have failed?
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
// already marked for deleting, which means the github workflow finished.
// Let consolidate take care of it.
continue
}
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching pool")
}
// Resolve the provider that owns the pool this instance belongs to.
provider, ok := r.providers[pool.ProviderName]
if !ok {
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}
// Check if the instance is still on the provider.
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from provider")
}
// The runner instance is no longer on the provider, and it appears offline in github.
// It should be safe to force remove it.
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner from github")
}
// Remove the database entry for the runner.
log.Printf("Removing %s from database", dbInstance.Name)
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
return errors.Wrap(err, "removing runner from database")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
// instance is running, but github reports runner as offline. Log the event.
// This scenario requires manual intervention.
// Perhaps it just came online and github did not yet change its status?
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
continue
}
// The instance exists on the provider but its DB status is neither
// pending_delete nor running: ask the provider to start it.
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
}
}
return nil
}
func (r *basePool) deleteInstanceFromProvider(instance params.Instance) error {
pool, err := r.helper.GetPoolByID(instance.PoolID)
if err != nil {
@ -657,19 +676,51 @@ func (r *basePool) deletePendingInstances() {
if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceDeleting, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
}
go func(instance params.Instance) {
if err := r.deleteInstanceFromProvider(instance); err != nil {
// failed to remove from provider. Set the status back to pending_delete, which
// will retry the operation.
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
go func(instance params.Instance) (err error) {
defer func(instance params.Instance) {
if err != nil {
// failed to remove from provider. Set the status back to pending_delete, which
// will retry the operation.
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
}
}
}(instance)
err = r.deleteInstanceFromProvider(instance)
if err != nil {
log.Printf("failed to delete instance from provider: %+v", err)
}
return
}(instance)
}
}
// ForceDeleteRunner removes the given runner from github and marks its instance
// as pending_delete, leaving the actual teardown to the regular cleanup of
// pending instances.
//
// The github removal is only attempted when the runner has a non-zero AgentID.
// If github answers with 422 (presumably because the runner is busy with a
// workflow), an error wrapping ErrUnprocessable is returned and the instance
// status is left untouched. A 404 means github no longer knows the runner;
// that is not treated as a failure, so the local instance is still marked for
// deletion. Any other failure is returned wrapped.
func (r *basePool) ForceDeleteRunner(runner params.Instance) error {
	if runner.AgentID != 0 {
		resp, err := r.helper.RemoveGithubRunner(runner.AgentID)
		if err != nil {
			if resp == nil {
				return errors.Wrap(err, "removing runner")
			}
			switch resp.StatusCode {
			case http.StatusUnprocessableEntity:
				return errors.Wrapf(runnerErrors.ErrUnprocessable, "removing runner: %q", err)
			case http.StatusNotFound:
				// Already gone from github. There is nothing left to remove
				// there, so carry on and mark the instance for deletion.
				log.Printf("runner %s was not found in github", runner.Name)
			default:
				return errors.Wrap(err, "removing runner")
			}
		}
	}

	if err := r.setInstanceStatus(runner.Name, providerCommon.InstancePendingDelete, nil); err != nil {
		log.Printf("failed to update runner %s status", runner.Name)
		return errors.Wrap(err, "updating runner")
	}
	return nil
}
func (r *basePool) addPendingInstances() {
// TODO: filter instances by status.
instances, err := r.helper.FetchDbInstances()