From 15a13084417b3a42a358cbdbec71e2d0da27ad85 Mon Sep 17 00:00:00 2001 From: Gabriel Adrian Samfira Date: Wed, 29 Jun 2022 23:44:03 +0000 Subject: [PATCH] Add timeout functionality for pool runner bootstrap Pools can now define a bootstrap timeout for runners. The timeout can be defined per pool and indicates the amount of time after which a runner is considered defunct and removed. If a runner doesn't join github in the configured amount of time, and it receives no updates indicating that it is installing the runner via instance status updates, it is considered defunct. Signed-off-by: Gabriel Adrian Samfira --- apiserver/controllers/pools.go | 2 + cmd/garm-cli/cmd/pool.go | 25 ++++++++----- cmd/garm-cli/cmd/repo_pool.go | 20 +++++----- config/config.go | 5 +++ database/sql/models.go | 19 +++++----- database/sql/util.go | 27 ++++++++------ params/params.go | 38 +++++++++++-------- params/requests.go | 36 +++++++++--------- runner/organizations.go | 5 +++ runner/pool/common.go | 68 ++++++++++++++++++++++++++++------ runner/pools.go | 4 ++ runner/repositories.go | 5 +++ 12 files changed, 173 insertions(+), 81 deletions(-) diff --git a/apiserver/controllers/pools.go b/apiserver/controllers/pools.go index 50692a95..6f5fee96 100644 --- a/apiserver/controllers/pools.go +++ b/apiserver/controllers/pools.go @@ -62,6 +62,8 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques return } + pool.RunnerBootstrapTimeout = pool.RunnerTimeout() + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(pool) } diff --git a/cmd/garm-cli/cmd/pool.go b/cmd/garm-cli/cmd/pool.go index 4d30abba..574f650b 100644 --- a/cmd/garm-cli/cmd/pool.go +++ b/cmd/garm-cli/cmd/pool.go @@ -161,15 +161,16 @@ var poolAddCmd = &cobra.Command{ tags := strings.Split(poolTags, ",") newPoolParams := params.CreatePoolParams{ - ProviderName: poolProvider, - MaxRunners: poolMaxRunners, - MinIdleRunners: poolMinIdleRunners, - Image: poolImage, - Flavor: poolFlavor, - OSType: config.OSType(poolOSType), - OSArch: config.OSArch(poolOSArch), - Tags: tags, - Enabled: poolEnabled, + ProviderName: poolProvider, + MaxRunners: poolMaxRunners, + MinIdleRunners: poolMinIdleRunners, + Image: poolImage, + Flavor: poolFlavor, + OSType: config.OSType(poolOSType), + OSArch: config.OSArch(poolOSArch), + Tags: tags, + Enabled: poolEnabled, + RunnerBootstrapTimeout: poolRunnerBootstrapTimeout, } if err := newPoolParams.Validate(); err != nil { return err @@ -252,6 +253,10 @@ explicitly remove them using the runner delete command. poolUpdateParams.Enabled = &poolEnabled } + if cmd.Flags().Changed("runner-bootstrap-timeout") { + poolUpdateParams.RunnerBootstrapTimeout = poolRunnerBootstrapTimeout + } + pool, err := cli.UpdatePoolByID(args[0], poolUpdateParams) if err != nil { return err @@ -276,6 +281,7 @@ func init() { poolUpdateCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.") poolUpdateCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.") poolUpdateCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.") + poolUpdateCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.") poolAddCmd.Flags().StringVar(&poolProvider, "provider-name", "", "The name of the provider where runners will be created.") poolAddCmd.Flags().StringVar(&poolImage, "image", "", "The provider-specific image name to use for runners in this pool.") @@ -284,6 +290,7 @@ func init() { poolAddCmd.Flags().StringVar(&poolOSType, "os-type", "linux", "Operating system type (windows, linux, etc).") poolAddCmd.Flags().StringVar(&poolOSArch, "os-arch", "amd64", "Operating system architecture (amd64, arm, etc).") poolAddCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.") + poolAddCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.") poolAddCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.") poolAddCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.") poolAddCmd.MarkFlagRequired("provider-name") diff --git a/cmd/garm-cli/cmd/repo_pool.go b/cmd/garm-cli/cmd/repo_pool.go index 80b3058d..8e430ab6 100644 --- a/cmd/garm-cli/cmd/repo_pool.go +++ b/cmd/garm-cli/cmd/repo_pool.go @@ -25,15 +25,16 @@ import ( ) var ( - poolProvider string - poolMaxRunners uint - poolMinIdleRunners uint - poolImage string - poolFlavor string - poolOSType string - poolOSArch string - poolTags string - poolEnabled bool + poolProvider string + poolMaxRunners uint + poolMinIdleRunners uint + poolImage string + poolFlavor string + poolOSType string + poolOSArch string + poolTags string + poolEnabled bool + poolRunnerBootstrapTimeout uint ) // repoPoolCmd represents the pool command @@ -323,6 +324,7 @@ func formatOnePool(pool params.Pool) { t.AppendRow(table.Row{"OS Architecture", pool.OSArch}) t.AppendRow(table.Row{"Max Runners", pool.MaxRunners}) t.AppendRow(table.Row{"Min Idle Runners", pool.MinIdleRunners}) + t.AppendRow(table.Row{"Runner Bootstrap Timeout", pool.RunnerBootstrapTimeout}) t.AppendRow(table.Row{"Tags", strings.Join(tags, ", ")}) t.AppendRow(table.Row{"Belongs to", belongsTo}) t.AppendRow(table.Row{"Level", level}) diff --git a/config/config.go b/config/config.go index e5c10004..0e561d3b 100644 --- a/config/config.go +++ b/config/config.go @@ -64,6 +64,11 @@ const ( // DefaultPoolQueueSize is the default size for a pool queue. DefaultPoolQueueSize = 10 + // DefaultRunnerBootstrapTimeout is the default timeout in minutes a runner is + // considered to be defunct. If a runner does not join github in the alloted amount + // of time and no new updates have been made to it's state, it will be removed. + DefaultRunnerBootstrapTimeout = 20 + GithubBaseURL = "https://github.com" ) diff --git a/database/sql/models.go b/database/sql/models.go index 2e9e4a33..de3b9e24 100644 --- a/database/sql/models.go +++ b/database/sql/models.go @@ -54,15 +54,16 @@ type Tag struct { type Pool struct { Base - ProviderName string `gorm:"index:idx_pool_type"` - MaxRunners uint - MinIdleRunners uint - Image string `gorm:"index:idx_pool_type"` - Flavor string `gorm:"index:idx_pool_type"` - OSType config.OSType - OSArch config.OSArch - Tags []*Tag `gorm:"many2many:pool_tags;"` - Enabled bool + ProviderName string `gorm:"index:idx_pool_type"` + MaxRunners uint + MinIdleRunners uint + RunnerBootstrapTimeout uint + Image string `gorm:"index:idx_pool_type"` + Flavor string `gorm:"index:idx_pool_type"` + OSType config.OSType + OSArch config.OSArch + Tags []*Tag `gorm:"many2many:pool_tags;"` + Enabled bool RepoID uuid.UUID `gorm:"index"` Repository Repository `gorm:"foreignKey:RepoID"` diff --git a/database/sql/util.go b/database/sql/util.go index e047b18c..3852845e 100644 --- a/database/sql/util.go +++ b/database/sql/util.go @@ -87,17 +87,18 @@ func (s *sqlDatabase) sqlToCommonOrganization(org Organization) params.Organizat func (s *sqlDatabase) sqlToCommonPool(pool Pool) params.Pool { ret := params.Pool{ - ID: pool.ID.String(), - ProviderName: pool.ProviderName, - MaxRunners: pool.MaxRunners, - MinIdleRunners: pool.MinIdleRunners, - Image: pool.Image, - Flavor: pool.Flavor, - OSArch: pool.OSArch, - OSType: pool.OSType, - Enabled: pool.Enabled, - Tags: make([]params.Tag, len(pool.Tags)), - Instances: make([]params.Instance, len(pool.Instances)), + ID: pool.ID.String(), + ProviderName: pool.ProviderName, + MaxRunners: pool.MaxRunners, + MinIdleRunners: pool.MinIdleRunners, + Image: pool.Image, + Flavor: pool.Flavor, + OSArch: pool.OSArch, + OSType: pool.OSType, + Enabled: pool.Enabled, + Tags: make([]params.Tag, len(pool.Tags)), + Instances: make([]params.Instance, len(pool.Instances)), + RunnerBootstrapTimeout: pool.RunnerBootstrapTimeout, } if pool.RepoID != uuid.Nil { @@ -209,6 +210,10 @@ func (s *sqlDatabase) updatePool(pool Pool, param params.UpdatePoolParams) (para pool.OSType = param.OSType } + if param.RunnerBootstrapTimeout > 0 { + pool.RunnerBootstrapTimeout = param.RunnerBootstrapTimeout + } + if q := s.conn.Save(&pool); q.Error != nil { return params.Pool{}, errors.Wrap(q.Error, "saving database entry") } diff --git a/params/params.go b/params/params.go index 2c0c649c..e35cb298 100644 --- a/params/params.go +++ b/params/params.go @@ -111,21 +111,29 @@ type Tag struct { } type Pool struct { - ID string `json:"id"` - ProviderName string `json:"provider_name"` - MaxRunners uint `json:"max_runners"` - MinIdleRunners uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` - Tags []Tag `json:"tags"` - Enabled bool `json:"enabled"` - Instances []Instance `json:"instances"` - RepoID string `json:"repo_id,omitempty"` - RepoName string `json:"repo_name,omitempty"` - OrgID string `json:"org_id,omitempty"` - OrgName string `json:"org_name,omitempty"` + ID string `json:"id"` + ProviderName string `json:"provider_name"` + MaxRunners uint `json:"max_runners"` + MinIdleRunners uint `json:"min_idle_runners"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` + Tags []Tag `json:"tags"` + Enabled bool `json:"enabled"` + Instances []Instance `json:"instances"` + RepoID string `json:"repo_id,omitempty"` + RepoName string `json:"repo_name,omitempty"` + OrgID string `json:"org_id,omitempty"` + OrgName string `json:"org_name,omitempty"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` +} + +func (p *Pool) RunnerTimeout() uint { + if p.RunnerBootstrapTimeout == 0 { + return config.DefaultRunnerBootstrapTimeout + } + return p.RunnerBootstrapTimeout } type Internal struct { diff --git a/params/requests.go b/params/requests.go index b6acc737..30deab74 100644 --- a/params/requests.go +++ b/params/requests.go @@ -78,14 +78,15 @@ type NewUserParams struct { } type UpdatePoolParams struct { - Tags []string `json:"tags"` - Enabled *bool `json:"enabled"` - MaxRunners *uint `json:"max_runners"` - MinIdleRunners *uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` + Tags []string `json:"tags"` + Enabled *bool `json:"enabled"` + MaxRunners *uint `json:"max_runners"` + MinIdleRunners *uint `json:"min_idle_runners"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` } type CreateInstanceParams struct { @@ -101,15 +102,16 @@ type CreateInstanceParams struct { } type CreatePoolParams struct { - ProviderName string `json:"provider_name"` - MaxRunners uint `json:"max_runners"` - MinIdleRunners uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` - Tags []string `json:"tags"` - Enabled bool `json:"enabled"` + ProviderName string `json:"provider_name"` + MaxRunners uint `json:"max_runners"` + MinIdleRunners uint `json:"min_idle_runners"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` + Tags []string `json:"tags"` + Enabled bool `json:"enabled"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` } func (p *CreatePoolParams) Validate() error { diff --git a/runner/organizations.go b/runner/organizations.go index 2897df06..b9be1573 100644 --- a/runner/organizations.go +++ b/runner/organizations.go @@ -20,6 +20,7 @@ import ( "strings" "garm/auth" + "garm/config" runnerErrors "garm/errors" "garm/params" "garm/runner/common" @@ -202,6 +203,10 @@ func (r *Runner) CreateOrgPool(ctx context.Context, orgID string, param params.C return params.Pool{}, errors.Wrap(err, "fetching pool params") } + if param.RunnerBootstrapTimeout == 0 { + param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout + } + pool, err := r.store.CreateOrganizationPool(ctx, orgID, createPoolParams) if err != nil { return params.Pool{}, errors.Wrap(err, "creating pool") diff --git a/runner/pool/common.go b/runner/pool/common.go index 0d508983..86571e9b 100644 --- a/runner/pool/common.go +++ b/runner/pool/common.go @@ -91,15 +91,6 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro } if ok := runnerNames[instance.Name]; !ok { - // if instance.Status == providerCommon.InstanceRunning { - // if time.Since(instance.UpdatedAt).Minutes() < 20 { - // // Allow up to 20 minutes for instance to finish installing. - // // Anything beyond that is considered a timeout and the instance - // // is marked for deletion. - // // TODO(gabriel-samfira): Make the timeout configurable. - // continue - // } - // } // Set pending_delete on DB field. Allow consolidate() to remove it. if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { log.Printf("failed to update runner %s status", instance.Name) @@ -110,6 +101,42 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro return nil } +// reapTimedOutRunners will mark as pending_delete any runner that has a status +// of "running" in the provider, but that has not registered with Github, and has +// received no new updates in the configured timeout interval. +func (r *basePool) reapTimedOutRunners(runners []*github.Runner) error { + log.Printf("Checking for timed out runners") + dbInstances, err := r.helper.FetchDbInstances() + if err != nil { + return errors.Wrap(err, "fetching instances from db") + } + + runnerNames := map[string]bool{} + for _, run := range runners { + runnerNames[*run.Name] = true + } + + for _, instance := range dbInstances { + if ok := runnerNames[instance.Name]; !ok { + if instance.Status == providerCommon.InstanceRunning { + pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching instance pool info") + } + if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { + continue + } + log.Printf("reaping instance %s due to timeout", instance.Name) + if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { + log.Printf("failed to update runner %s status", instance.Name) + return errors.Wrap(err, "updating runner") + } + } + } + } + return nil +} + // cleanupOrphanedGithubRunners will forcefully remove any github runners that appear // as offline and for which we no longer have a local instance. // This may happen if someone manually deletes the instance in the provider. We need to @@ -312,8 +339,14 @@ func (r *basePool) AddRunner(ctx context.Context, poolID string) error { } func (r *basePool) loop() { + consolidateTimer := time.NewTicker(5 * time.Second) + reapTimer := time.NewTicker(5 * time.Minute) + toolUpdateTimer := time.NewTicker(3 * time.Hour) defer func() { log.Printf("repository %s loop exited", r.helper.String()) + consolidateTimer.Stop() + reapTimer.Stop() + toolUpdateTimer.Stop() close(r.done) }() log.Printf("starting loop for %s", r.helper.String()) @@ -330,10 +363,23 @@ func (r *basePool) loop() { for { select { - case <-time.After(5 * time.Second): + case <-reapTimer.C: + runners, err := r.helper.GetGithubRunners() + if err != nil { + log.Printf("error fetching github runners: %s", err) + continue + } + if err := r.reapTimedOutRunners(runners); err != nil { + log.Printf("failed to reap timed out runners: %q", err) + } + + if err := r.cleanupOrphanedGithubRunners(runners); err != nil { + log.Printf("failed to clean orphaned github runners: %q", err) + } + case <-consolidateTimer.C: // consolidate. r.consolidate() - case <-time.After(3 * time.Hour): + case <-toolUpdateTimer.C: // Update tools cache. tools, err := r.helper.FetchTools() if err != nil { diff --git a/runner/pools.go b/runner/pools.go index 1f704319..54c59c55 100644 --- a/runner/pools.go +++ b/runner/pools.go @@ -90,6 +90,10 @@ func (r *Runner) UpdatePoolByID(ctx context.Context, poolID string, param params minIdleRunners = *param.MinIdleRunners } + if param.RunnerBootstrapTimeout == 0 { + return params.Pool{}, runnerErrors.NewBadRequestError("runner_bootstrap_timeout cannot be 0") + } + if minIdleRunners > maxRunners { return params.Pool{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners") } diff --git a/runner/repositories.go b/runner/repositories.go index 347056ec..63387ef8 100644 --- a/runner/repositories.go +++ b/runner/repositories.go @@ -20,6 +20,7 @@ import ( "strings" "garm/auth" + "garm/config" runnerErrors "garm/errors" "garm/params" "garm/runner/common" @@ -202,6 +203,10 @@ func (r *Runner) CreateRepoPool(ctx context.Context, repoID string, param params return params.Pool{}, errors.Wrap(err, "fetching pool params") } + if param.RunnerBootstrapTimeout == 0 { + param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout + } + pool, err := r.store.CreateRepositoryPool(ctx, repoID, createPoolParams) if err != nil { return params.Pool{}, errors.Wrap(err, "creating pool")