Fix cleanup orphaned github runners
This commit is contained in:
parent
63824247be
commit
846bd672ad
4 changed files with 59 additions and 38 deletions
|
|
@ -58,7 +58,7 @@ type Store interface {
|
|||
FindOrganizationPoolByTags(ctx context.Context, orgID string, tags []string) (params.Pool, error)
|
||||
|
||||
CreateInstance(ctx context.Context, poolID string, param params.CreateInstanceParams) (params.Instance, error)
|
||||
DeleteInstance(ctx context.Context, poolID string, instanceID string) error
|
||||
DeleteInstance(ctx context.Context, poolID string, instanceName string) error
|
||||
UpdateInstance(ctx context.Context, instanceID string, param params.UpdateInstanceParams) (params.Instance, error)
|
||||
|
||||
ListPoolInstances(ctx context.Context, poolID string) ([]params.Instance, error)
|
||||
|
|
|
|||
|
|
@ -535,51 +535,67 @@ func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error
|
|||
continue
|
||||
}
|
||||
|
||||
removeRunner := false
|
||||
|
||||
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from DB")
|
||||
}
|
||||
// We no longer have a DB entry for this instance. Previous forceful
|
||||
// removal may have failed?
|
||||
removeRunner = true
|
||||
} else {
|
||||
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
|
||||
// already marked for deleting. Let consolidate take care of it.
|
||||
continue
|
||||
}
|
||||
// check if the provider still has the instance.
|
||||
provider, ok := r.providers[pool.ProviderName]
|
||||
if !ok {
|
||||
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
|
||||
// instance is running, but github reports runner as offline. Log the event.
|
||||
// This scenario requires manual intervention.
|
||||
// Perhaps it just came online and github did not yet change its status?
|
||||
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
|
||||
continue
|
||||
}
|
||||
//start the instance
|
||||
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
|
||||
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
|
||||
}
|
||||
// we started the instance. Give it a chance to come online
|
||||
continue
|
||||
}
|
||||
|
||||
if removeRunner {
|
||||
// We no longer have a DB entry for this instance, and the runner appears offline in github.
|
||||
// Previous forceful removal may have failed?
|
||||
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
|
||||
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
|
||||
// already marked for deleting, which means the github workflow finished.
|
||||
// Let consolidate take care of it.
|
||||
continue
|
||||
}
|
||||
|
||||
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
|
||||
// check if the provider still has the instance.
|
||||
provider, ok := r.providers[pool.ProviderName]
|
||||
if !ok {
|
||||
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
|
||||
}
|
||||
|
||||
// Check if the instance is still on the provider.
|
||||
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from provider")
|
||||
}
|
||||
// The runner instance is no longer on the provider, and it appears offline in github.
|
||||
// It should be safe to force remove it.
|
||||
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
|
||||
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
|
||||
return errors.Wrap(err, "removing runner from github")
|
||||
}
|
||||
// Remove the database entry for the runner.
|
||||
log.Printf("Removing %s from database", dbInstance.Name)
|
||||
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
|
||||
return errors.Wrap(err, "removing runner from database")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
|
||||
// instance is running, but github reports runner as offline. Log the event.
|
||||
// This scenario requires manual intervention.
|
||||
// Perhaps it just came online and github did not yet change its status?
|
||||
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
|
||||
continue
|
||||
}
|
||||
//start the instance
|
||||
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
|
||||
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
|
|
|||
2
runner/providers/external/external.go
vendored
2
runner/providers/external/external.go
vendored
|
|
@ -121,6 +121,8 @@ func (e *external) GetInstance(ctx context.Context, instance string) (params.Ins
|
|||
fmt.Sprintf("GARM_INSTANCE_ID=%s", instance),
|
||||
}
|
||||
|
||||
// TODO(gabriel-samfira): handle error types. Of particular interest is to
|
||||
// know when the error is ErrNotFound.
|
||||
out, err := exec.Exec(ctx, e.execPath, nil, asEnv)
|
||||
if err != nil {
|
||||
return params.Instance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
|
||||
|
|
|
|||
|
|
@ -310,6 +310,9 @@ func (l *LXD) CreateInstance(ctx context.Context, bootstrapParams params.Bootstr
|
|||
func (l *LXD) GetInstance(ctx context.Context, instanceName string) (params.Instance, error) {
|
||||
instance, _, err := l.cli.GetInstanceFull(instanceName)
|
||||
if err != nil {
|
||||
if isNotFoundError(err) {
|
||||
return params.Instance{}, errors.Wrapf(runnerErrors.ErrNotFound, "fetching instance: %q", err)
|
||||
}
|
||||
return params.Instance{}, errors.Wrap(err, "fetching instance")
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue