Fix cleanup orphaned github runners

This commit is contained in:
Gabriel Adrian Samfira 2022-06-24 11:17:34 +00:00
parent 63824247be
commit 846bd672ad
4 changed files with 59 additions and 38 deletions

View file

@ -58,7 +58,7 @@ type Store interface {
FindOrganizationPoolByTags(ctx context.Context, orgID string, tags []string) (params.Pool, error)
CreateInstance(ctx context.Context, poolID string, param params.CreateInstanceParams) (params.Instance, error)
DeleteInstance(ctx context.Context, poolID string, instanceID string) error
DeleteInstance(ctx context.Context, poolID string, instanceName string) error
UpdateInstance(ctx context.Context, instanceID string, param params.UpdateInstanceParams) (params.Instance, error)
ListPoolInstances(ctx context.Context, poolID string) ([]params.Instance, error)

View file

@ -535,51 +535,67 @@ func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error
continue
}
removeRunner := false
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from DB")
}
// We no longer have a DB entry for this instance. Previous forceful
// removal may have failed?
removeRunner = true
} else {
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching pool")
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
// already marked for deleting. Let consolidate take care of it.
continue
}
// check if the provider still has the instance.
provider, ok := r.providers[pool.ProviderName]
if !ok {
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
// instance is running, but github reports runner as offline. Log the event.
// This scenario requires manual intervention.
// Perhaps it just came online and github did not yet change it's status?
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
continue
}
//start the instance
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
}
// we started the instance. Give it a chance to come online
continue
}
if removeRunner {
// We no longer have a DB entry for this instance, and the runner appears offline in github.
// Previous forceful removal may have failed?
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
// already marked for deleting, which means the github workflow finished.
// Let consolidate take care of it.
continue
}
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching pool")
}
// check if the provider still has the instance.
provider, ok := r.providers[pool.ProviderName]
if !ok {
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}
// Check if the instance is still on the provider.
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from provider")
}
// The runner instance is no longer on the provider, and it appears offline in github.
// It should be safe to force remove it.
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner from github")
}
// Remove the database entry for the runner.
log.Printf("Removing %s from database", dbInstance.Name)
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
return errors.Wrap(err, "removing runner from database")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
// instance is running, but github reports runner as offline. Log the event.
// This scenario requires manual intervention.
// Perhaps it just came online and github did not yet change it's status?
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
continue
}
//start the instance
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
}
}
return nil

View file

@ -121,6 +121,8 @@ func (e *external) GetInstance(ctx context.Context, instance string) (params.Ins
fmt.Sprintf("GARM_INSTANCE_ID=%s", instance),
}
// TODO(gabriel-samfira): handle error types. Of particular insterest is to
// know when the error is ErrNotFound.
out, err := exec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
return params.Instance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)

View file

@ -310,6 +310,9 @@ func (l *LXD) CreateInstance(ctx context.Context, bootstrapParams params.Bootstr
func (l *LXD) GetInstance(ctx context.Context, instanceName string) (params.Instance, error) {
instance, _, err := l.cli.GetInstanceFull(instanceName)
if err != nil {
if isNotFoundError(err) {
return params.Instance{}, errors.Wrapf(runnerErrors.ErrNotFound, "fetching instance: %q", err)
}
return params.Instance{}, errors.Wrap(err, "fetching instance")
}