diff --git a/database/common/common.go b/database/common/common.go index 57d3c897..f710e3b1 100644 --- a/database/common/common.go +++ b/database/common/common.go @@ -58,7 +58,7 @@ type Store interface { FindOrganizationPoolByTags(ctx context.Context, orgID string, tags []string) (params.Pool, error) CreateInstance(ctx context.Context, poolID string, param params.CreateInstanceParams) (params.Instance, error) - DeleteInstance(ctx context.Context, poolID string, instanceID string) error + DeleteInstance(ctx context.Context, poolID string, instanceName string) error UpdateInstance(ctx context.Context, instanceID string, param params.UpdateInstanceParams) (params.Instance, error) ListPoolInstances(ctx context.Context, poolID string) ([]params.Instance, error) diff --git a/runner/pool/common.go b/runner/pool/common.go index 813f2504..85923b86 100644 --- a/runner/pool/common.go +++ b/runner/pool/common.go @@ -535,51 +535,67 @@ func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error continue } - removeRunner := false - dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name) if err != nil { if !errors.Is(err, runnerErrors.ErrNotFound) { return errors.Wrap(err, "fetching instance from DB") } - // We no longer have a DB entry for this instance. Previous forceful - // removal may have failed? - removeRunner = true - } else { - pool, err := r.helper.GetPoolByID(dbInstance.PoolID) - if err != nil { - return errors.Wrap(err, "fetching pool") - } - - if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete { - // already marked for deleting. Let consolidate take care of it. - continue - } - // check if the provider still has the instance. - provider, ok := r.providers[pool.ProviderName] - if !ok { - return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID) - } - - if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning { - // instance is running, but github reports runner as offline. Log the event. - // This scenario requires manual intervention. - // Perhaps it just came online and github did not yet change it's status? - log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name) - continue - } - //start the instance - if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil { - return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID) - } - // we started the instance. Give it a chance to come online - continue - } - - if removeRunner { + // We no longer have a DB entry for this instance, and the runner appears offline in github. + // Previous forceful removal may have failed? + log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name) if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil { return errors.Wrap(err, "removing runner") } + continue + } + + if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete { + // already marked for deleting, which means the github workflow finished. + // Let consolidate take care of it. + continue + } + + pool, err := r.helper.GetPoolByID(dbInstance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching pool") + } + + // check if the provider still has the instance. + provider, ok := r.providers[pool.ProviderName] + if !ok { + return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID) + } + + // Check if the instance is still on the provider. + _, err = provider.GetInstance(r.ctx, dbInstance.Name) + if err != nil { + if !errors.Is(err, runnerErrors.ErrNotFound) { + return errors.Wrap(err, "fetching instance from provider") + } + // The runner instance is no longer on the provider, and it appears offline in github. + // It should be safe to force remove it. + log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name) + if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil { + return errors.Wrap(err, "removing runner from github") + } + // Remove the database entry for the runner. + log.Printf("Removing %s from database", dbInstance.Name) + if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil { + return errors.Wrap(err, "removing runner from database") + } + continue + } + + if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning { + // instance is running, but github reports runner as offline. Log the event. + // This scenario requires manual intervention. + // Perhaps it just came online and github did not yet change it's status? + log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name) + continue + } + //start the instance + if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil { + return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID) } } return nil diff --git a/runner/providers/external/external.go b/runner/providers/external/external.go index dc3da784..63127cd5 100644 --- a/runner/providers/external/external.go +++ b/runner/providers/external/external.go @@ -121,6 +121,8 @@ func (e *external) GetInstance(ctx context.Context, instance string) (params.Ins fmt.Sprintf("GARM_INSTANCE_ID=%s", instance), } + // TODO(gabriel-samfira): handle error types. Of particular insterest is to + // know when the error is ErrNotFound. out, err := exec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { return params.Instance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) diff --git a/runner/providers/lxd/lxd.go b/runner/providers/lxd/lxd.go index 63dd5e89..7d8cb119 100644 --- a/runner/providers/lxd/lxd.go +++ b/runner/providers/lxd/lxd.go @@ -310,6 +310,9 @@ func (l *LXD) CreateInstance(ctx context.Context, bootstrapParams params.Bootstr func (l *LXD) GetInstance(ctx context.Context, instanceName string) (params.Instance, error) { instance, _, err := l.cli.GetInstanceFull(instanceName) if err != nil { + if isNotFoundError(err) { + return params.Instance{}, errors.Wrapf(runnerErrors.ErrNotFound, "fetching instance: %q", err) + } return params.Instance{}, errors.Wrap(err, "fetching instance") }