Fix leftover instances and refactor
Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
parent
004ad1f124
commit
f2ad7a3481
13 changed files with 178 additions and 81 deletions
|
|
@ -54,13 +54,6 @@ type PoolManager interface {
|
|||
// for it and call this function with the WorkflowJob as a parameter.
|
||||
HandleWorkflowJob(job params.WorkflowJob) error
|
||||
|
||||
// DeleteRunner will attempt to remove a runner from the pool. If forceRemove is true, any error
|
||||
// received from the provider will be ignored and we will proceed to remove the runner from the database.
|
||||
// An error received while attempting to remove from GitHub (other than 404) will still stop the deletion
|
||||
// process. This can happen if the runner is already processing a job. At which point, you can simply cancel
|
||||
// the job in github. Doing so will prompt GARM to reap the runner automatically.
|
||||
DeleteRunner(runner params.Instance, forceRemove, bypassGHUnauthorizedError bool) error
|
||||
|
||||
// InstallWebhook will create a webhook in github for the entity associated with this pool manager.
|
||||
InstallWebhook(ctx context.Context, param params.InstallWebhookParams) (params.HookInfo, error)
|
||||
// GetWebhookInfo will return information about the webhook installed in github for the entity associated
|
||||
|
|
@ -74,6 +67,8 @@ type PoolManager interface {
|
|||
// may use internal or self signed certificates.
|
||||
RootCABundle() (params.CertificateBundle, error)
|
||||
|
||||
SetPoolRunningState(isRunning bool, failureReason string)
|
||||
|
||||
// Start will start the pool manager and all associated workers.
|
||||
Start() error
|
||||
// Stop will stop the pool manager and all associated workers.
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ type GithubEntityOperations interface {
|
|||
PingEntityHook(ctx context.Context, id int64) (ret *github.Response, err error)
|
||||
ListEntityRunners(ctx context.Context, opts *github.ListOptions) (*github.Runners, *github.Response, error)
|
||||
ListEntityRunnerApplicationDownloads(ctx context.Context) ([]*github.RunnerApplicationDownload, *github.Response, error)
|
||||
RemoveEntityRunner(ctx context.Context, runnerID int64) (*github.Response, error)
|
||||
RemoveEntityRunner(ctx context.Context, runnerID int64) error
|
||||
CreateEntityRegistrationToken(ctx context.Context) (*github.RegistrationToken, *github.Response, error)
|
||||
GetEntityJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error)
|
||||
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ func (r *Runner) GetRunnerServiceName(ctx context.Context) (string, error) {
|
|||
"pool_id", instance.PoolID)
|
||||
return "", errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
entity, err = pool.GithubEntity()
|
||||
entity, err = pool.GetEntity()
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
ctx, "failed to get pool entity",
|
||||
|
|
@ -81,7 +81,7 @@ func (r *Runner) GetRunnerServiceName(ctx context.Context) (string, error) {
|
|||
"scale_set_id", instance.ScaleSetID)
|
||||
return "", errors.Wrap(err, "fetching scale set")
|
||||
}
|
||||
entity, err = scaleSet.GithubEntity()
|
||||
entity, err = scaleSet.GetEntity()
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
ctx, "failed to get scale set entity",
|
||||
|
|
|
|||
|
|
@ -349,7 +349,7 @@ func (r *basePoolManager) startLoopForFunction(f func() error, interval time.Dur
|
|||
r.ctx, "error in loop",
|
||||
"loop_name", name)
|
||||
if errors.Is(err, runnerErrors.ErrUnauthorized) {
|
||||
r.setPoolRunningState(false, err.Error())
|
||||
r.SetPoolRunningState(false, err.Error())
|
||||
}
|
||||
}
|
||||
case <-r.ctx.Done():
|
||||
|
|
@ -380,7 +380,7 @@ func (r *basePoolManager) updateTools() error {
|
|||
if err != nil {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
r.ctx, "failed to update tools for entity", "entity", r.entity.String())
|
||||
r.setPoolRunningState(false, err.Error())
|
||||
r.SetPoolRunningState(false, err.Error())
|
||||
return fmt.Errorf("failed to update tools for entity %s: %w", r.entity.String(), err)
|
||||
}
|
||||
r.mux.Lock()
|
||||
|
|
@ -388,7 +388,7 @@ func (r *basePoolManager) updateTools() error {
|
|||
r.mux.Unlock()
|
||||
|
||||
slog.DebugContext(r.ctx, "successfully updated tools")
|
||||
r.setPoolRunningState(true, "")
|
||||
r.SetPoolRunningState(true, "")
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
@ -565,16 +565,19 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner)
|
|||
slog.InfoContext(
|
||||
r.ctx, "Runner has no database entry in garm, removing from github",
|
||||
"runner_name", runner.GetName())
|
||||
resp, err := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
|
||||
if err != nil {
|
||||
if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID()); err != nil {
|
||||
// Removed in the meantime?
|
||||
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
||||
if errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
continue
|
||||
}
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if dbInstance.ScaleSetID != 0 {
|
||||
// ignore scale set instances.
|
||||
continue
|
||||
}
|
||||
|
||||
switch dbInstance.Status {
|
||||
case commonParams.InstancePendingDelete, commonParams.InstanceDeleting:
|
||||
|
|
@ -650,10 +653,9 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner)
|
|||
slog.InfoContext(
|
||||
r.ctx, "Runner instance is no longer on the provider, removing from github",
|
||||
"runner_name", dbInstance.Name)
|
||||
resp, err := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
|
||||
if err != nil {
|
||||
if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID()); err != nil {
|
||||
// Removed in the meantime?
|
||||
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
||||
if errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
slog.DebugContext(
|
||||
r.ctx, "runner disappeared from github",
|
||||
"runner_name", dbInstance.Name)
|
||||
|
|
@ -806,7 +808,7 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
|
|||
}
|
||||
|
||||
if runner != nil {
|
||||
_, runnerCleanupErr := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
|
||||
runnerCleanupErr := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
|
||||
if err != nil {
|
||||
slog.With(slog.Any("error", runnerCleanupErr)).ErrorContext(
|
||||
ctx, "failed to remove runner",
|
||||
|
|
@ -840,7 +842,7 @@ func (r *basePoolManager) waitForTimeoutOrCancelled(timeout time.Duration) {
|
|||
}
|
||||
}
|
||||
|
||||
func (r *basePoolManager) setPoolRunningState(isRunning bool, failureReason string) {
|
||||
func (r *basePoolManager) SetPoolRunningState(isRunning bool, failureReason string) {
|
||||
r.mux.Lock()
|
||||
r.managerErrorReason = failureReason
|
||||
r.managerIsRunning = isRunning
|
||||
|
|
@ -1660,45 +1662,22 @@ func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove, bypa
|
|||
if !r.managerIsRunning && !bypassGHUnauthorizedError {
|
||||
return runnerErrors.NewConflictError("pool manager is not running for %s", r.entity.String())
|
||||
}
|
||||
|
||||
if runner.AgentID != 0 {
|
||||
resp, err := r.ghcli.RemoveEntityRunner(r.ctx, runner.AgentID)
|
||||
if err != nil {
|
||||
if resp != nil {
|
||||
switch resp.StatusCode {
|
||||
case http.StatusUnprocessableEntity:
|
||||
return errors.Wrapf(runnerErrors.ErrBadRequest, "removing runner: %q", err)
|
||||
case http.StatusNotFound:
|
||||
// Runner may have been deleted by a finished job, or manually by the user.
|
||||
slog.DebugContext(
|
||||
r.ctx, "runner was not found in github",
|
||||
"agent_id", runner.AgentID)
|
||||
case http.StatusUnauthorized:
|
||||
slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "failed to remove runner from github")
|
||||
// Mark the pool as offline from this point forward
|
||||
r.setPoolRunningState(false, fmt.Sprintf("failed to remove runner: %q", err))
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
r.ctx, "failed to remove runner")
|
||||
if bypassGHUnauthorizedError {
|
||||
slog.Info("bypass github unauthorized error is set, marking runner for deletion")
|
||||
break
|
||||
}
|
||||
// evaluate the next switch case.
|
||||
fallthrough
|
||||
default:
|
||||
if err := r.ghcli.RemoveEntityRunner(r.ctx, runner.AgentID); err != nil {
|
||||
if errors.Is(err, runnerErrors.ErrUnauthorized) {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "failed to remove runner from github")
|
||||
// Mark the pool as offline from this point forward
|
||||
r.SetPoolRunningState(false, fmt.Sprintf("failed to remove runner: %q", err))
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
r.ctx, "failed to remove runner")
|
||||
if bypassGHUnauthorizedError {
|
||||
slog.Info("bypass github unauthorized error is set, marking runner for deletion")
|
||||
} else {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
} else {
|
||||
errResp := &github.ErrorResponse{}
|
||||
if errors.As(err, &errResp) {
|
||||
if errResp.Response != nil && errResp.Response.StatusCode == http.StatusUnauthorized && bypassGHUnauthorizedError {
|
||||
slog.Info("bypass github unauthorized error is set, marking runner for deletion")
|
||||
} else {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
} else {
|
||||
// We got a nil response. Assume we are in error.
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,8 +41,8 @@ func (s *stubGithubClient) ListEntityRunnerApplicationDownloads(_ context.Contex
|
|||
return nil, nil, s.err
|
||||
}
|
||||
|
||||
func (s *stubGithubClient) RemoveEntityRunner(_ context.Context, _ int64) (*github.Response, error) {
|
||||
return nil, s.err
|
||||
func (s *stubGithubClient) RemoveEntityRunner(_ context.Context, _ int64) error {
|
||||
return s.err
|
||||
}
|
||||
|
||||
func (s *stubGithubClient) CreateEntityRegistrationToken(_ context.Context) (*github.RegistrationToken, *github.Response, error) {
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ func (r *Runner) UpdatePoolByID(ctx context.Context, poolID string, param params
|
|||
return params.Pool{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners")
|
||||
}
|
||||
|
||||
entity, err := pool.GithubEntity()
|
||||
entity, err := pool.GetEntity()
|
||||
if err != nil {
|
||||
return params.Pool{}, errors.Wrap(err, "getting entity")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,8 @@ import (
|
|||
"github.com/cloudbase/garm/runner/common"
|
||||
"github.com/cloudbase/garm/runner/pool"
|
||||
"github.com/cloudbase/garm/runner/providers"
|
||||
"github.com/cloudbase/garm/util/github"
|
||||
"github.com/cloudbase/garm/util/github/scalesets"
|
||||
)
|
||||
|
||||
func NewRunner(ctx context.Context, cfg config.Config, db dbCommon.Store) (*Runner, error) {
|
||||
|
|
@ -849,13 +851,92 @@ func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDel
|
|||
return runnerErrors.NewBadRequestError("runner must be in one of the following states: %q", strings.Join(validStates, ", "))
|
||||
}
|
||||
|
||||
poolMgr, err := r.getPoolManagerFromInstance(ctx, instance)
|
||||
ghCli, ssCli, err := r.getGHCliFromInstance(ctx, instance)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool manager for instance")
|
||||
return errors.Wrap(err, "fetching github client")
|
||||
}
|
||||
|
||||
if err := poolMgr.DeleteRunner(instance, forceDelete, bypassGithubUnauthorized); err != nil {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
if instance.AgentID != 0 {
|
||||
if instance.ScaleSetID != 0 {
|
||||
err = ssCli.RemoveRunner(ctx, instance.AgentID)
|
||||
} else if instance.PoolID != "" {
|
||||
err = ghCli.RemoveEntityRunner(ctx, instance.AgentID)
|
||||
} else {
|
||||
return errors.New("instance does not have a pool or scale set")
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if errors.Is(err, runnerErrors.ErrUnauthorized) && instance.PoolID != "" {
|
||||
poolMgr, err := r.getPoolManagerFromInstance(ctx, instance)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool manager for instance")
|
||||
}
|
||||
poolMgr.SetPoolRunningState(false, fmt.Sprintf("failed to remove runner: %q", err))
|
||||
}
|
||||
if !bypassGithubUnauthorized {
|
||||
return errors.Wrap(err, "removing runner from github")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
instanceStatus := commonParams.InstancePendingDelete
|
||||
if forceDelete {
|
||||
instanceStatus = commonParams.InstancePendingForceDelete
|
||||
}
|
||||
|
||||
slog.InfoContext(
|
||||
r.ctx, "setting instance status",
|
||||
"runner_name", instance.Name,
|
||||
"status", instanceStatus)
|
||||
|
||||
updateParams := params.UpdateInstanceParams{
|
||||
Status: instanceStatus,
|
||||
}
|
||||
_, err = r.store.UpdateInstance(r.ctx, instance.Name, updateParams)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "updating runner state")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Runner) getGHCliFromInstance(ctx context.Context, instance params.Instance) (common.GithubClient, *scalesets.ScaleSetClient, error) {
|
||||
// TODO(gabriel-samfira): We can probably cache the entity.
|
||||
var entityGetter params.EntityGetter
|
||||
var err error
|
||||
if instance.PoolID != "" {
|
||||
entityGetter, err = r.store.GetPoolByID(ctx, instance.PoolID)
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
} else if instance.ScaleSetID != 0 {
|
||||
entityGetter, err = r.store.GetScaleSetByID(ctx, instance.ScaleSetID)
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "fetching scale set")
|
||||
}
|
||||
} else {
|
||||
return nil, nil, errors.New("instance does not have a pool or scale set")
|
||||
}
|
||||
|
||||
entity, err := entityGetter.GetEntity()
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "fetching entity")
|
||||
}
|
||||
|
||||
// Fetching the entity from the database will populate all fields, including credentials.
|
||||
entity, err = r.store.GetGithubEntity(ctx, entity.EntityType, entity.ID)
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "fetching entity")
|
||||
}
|
||||
|
||||
ghCli, err := github.Client(ctx, entity)
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "creating github client")
|
||||
}
|
||||
|
||||
scaleSetCli, err := scalesets.NewClient(ghCli)
|
||||
if err != nil {
|
||||
return nil, nil, errors.Wrap(err, "creating scaleset client")
|
||||
}
|
||||
return ghCli, scaleSetCli, nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ func (r *Runner) DeleteScaleSetByID(ctx context.Context, scaleSetID uint) error
|
|||
return runnerErrors.NewBadRequestError("scale set is enabled; disable it first")
|
||||
}
|
||||
|
||||
paramEntity, err := scaleSet.GithubEntity()
|
||||
paramEntity, err := scaleSet.GetEntity()
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "getting entity")
|
||||
}
|
||||
|
|
@ -137,7 +137,7 @@ func (r *Runner) UpdateScaleSetByID(ctx context.Context, scaleSetID uint, param
|
|||
return params.ScaleSet{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners")
|
||||
}
|
||||
|
||||
paramEntity, err := scaleSet.GithubEntity()
|
||||
paramEntity, err := scaleSet.GetEntity()
|
||||
if err != nil {
|
||||
return params.ScaleSet{}, errors.Wrap(err, "getting entity")
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue