Add timeout functionality for pool runner bootstrap

Pools can now define a bootstrap timeout for runners. The timeout can
be defined per pool and indicates the amount of time after which a runner
is considered defunct and removed.

If a runner does not join GitHub within the configured amount of time, and
no instance status updates arrive indicating that the runner is still being
installed, it is considered defunct.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2022-06-29 23:44:03 +00:00
parent 5390efbaab
commit 15a1308441
12 changed files with 173 additions and 81 deletions

View file

@ -62,6 +62,8 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques
return
}
pool.RunnerBootstrapTimeout = pool.RunnerTimeout()
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(pool)
}

View file

@ -161,15 +161,16 @@ var poolAddCmd = &cobra.Command{
tags := strings.Split(poolTags, ",")
newPoolParams := params.CreatePoolParams{
ProviderName: poolProvider,
MaxRunners: poolMaxRunners,
MinIdleRunners: poolMinIdleRunners,
Image: poolImage,
Flavor: poolFlavor,
OSType: config.OSType(poolOSType),
OSArch: config.OSArch(poolOSArch),
Tags: tags,
Enabled: poolEnabled,
ProviderName: poolProvider,
MaxRunners: poolMaxRunners,
MinIdleRunners: poolMinIdleRunners,
Image: poolImage,
Flavor: poolFlavor,
OSType: config.OSType(poolOSType),
OSArch: config.OSArch(poolOSArch),
Tags: tags,
Enabled: poolEnabled,
RunnerBootstrapTimeout: poolRunnerBootstrapTimeout,
}
if err := newPoolParams.Validate(); err != nil {
return err
@ -252,6 +253,10 @@ explicitly remove them using the runner delete command.
poolUpdateParams.Enabled = &poolEnabled
}
if cmd.Flags().Changed("runner-bootstrap-timeout") {
poolUpdateParams.RunnerBootstrapTimeout = poolRunnerBootstrapTimeout
}
pool, err := cli.UpdatePoolByID(args[0], poolUpdateParams)
if err != nil {
return err
@ -276,6 +281,7 @@ func init() {
poolUpdateCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.")
poolUpdateCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.")
poolUpdateCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.")
poolUpdateCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.")
poolAddCmd.Flags().StringVar(&poolProvider, "provider-name", "", "The name of the provider where runners will be created.")
poolAddCmd.Flags().StringVar(&poolImage, "image", "", "The provider-specific image name to use for runners in this pool.")
@ -284,6 +290,7 @@ func init() {
poolAddCmd.Flags().StringVar(&poolOSType, "os-type", "linux", "Operating system type (windows, linux, etc).")
poolAddCmd.Flags().StringVar(&poolOSArch, "os-arch", "amd64", "Operating system architecture (amd64, arm, etc).")
poolAddCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.")
poolAddCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.")
poolAddCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.")
poolAddCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.")
poolAddCmd.MarkFlagRequired("provider-name")

View file

@ -25,15 +25,16 @@ import (
)
// Package-level variables that back the pool sub-command flags.
var (
poolProvider string
poolMaxRunners uint
poolMinIdleRunners uint
poolImage string
poolFlavor string
poolOSType string
poolOSArch string
poolTags string
poolEnabled bool
poolProvider string
poolMaxRunners uint
poolMinIdleRunners uint
poolImage string
poolFlavor string
poolOSType string
poolOSArch string
poolTags string
poolEnabled bool
// poolRunnerBootstrapTimeout backs the --runner-bootstrap-timeout flag
// (minutes, flag default 20).
poolRunnerBootstrapTimeout uint
)
// repoPoolCmd represents the pool command
@ -323,6 +324,7 @@ func formatOnePool(pool params.Pool) {
t.AppendRow(table.Row{"OS Architecture", pool.OSArch})
t.AppendRow(table.Row{"Max Runners", pool.MaxRunners})
t.AppendRow(table.Row{"Min Idle Runners", pool.MinIdleRunners})
t.AppendRow(table.Row{"Runner Bootstrap Timeout", pool.RunnerBootstrapTimeout})
t.AppendRow(table.Row{"Tags", strings.Join(tags, ", ")})
t.AppendRow(table.Row{"Belongs to", belongsTo})
t.AppendRow(table.Row{"Level", level})

View file

@ -64,6 +64,11 @@ const (
// DefaultPoolQueueSize is the default size for a pool queue.
DefaultPoolQueueSize = 10
// DefaultRunnerBootstrapTimeout is the default timeout, in minutes, after which
// a runner is considered to be defunct. If a runner does not join github in the
// allotted amount of time and no new updates have been made to its state, it
// will be removed.
DefaultRunnerBootstrapTimeout = 20
GithubBaseURL = "https://github.com"
)

View file

@ -54,15 +54,16 @@ type Tag struct {
type Pool struct {
Base
ProviderName string `gorm:"index:idx_pool_type"`
MaxRunners uint
MinIdleRunners uint
Image string `gorm:"index:idx_pool_type"`
Flavor string `gorm:"index:idx_pool_type"`
OSType config.OSType
OSArch config.OSArch
Tags []*Tag `gorm:"many2many:pool_tags;"`
Enabled bool
ProviderName string `gorm:"index:idx_pool_type"`
MaxRunners uint
MinIdleRunners uint
RunnerBootstrapTimeout uint
Image string `gorm:"index:idx_pool_type"`
Flavor string `gorm:"index:idx_pool_type"`
OSType config.OSType
OSArch config.OSArch
Tags []*Tag `gorm:"many2many:pool_tags;"`
Enabled bool
RepoID uuid.UUID `gorm:"index"`
Repository Repository `gorm:"foreignKey:RepoID"`

View file

@ -87,17 +87,18 @@ func (s *sqlDatabase) sqlToCommonOrganization(org Organization) params.Organizat
func (s *sqlDatabase) sqlToCommonPool(pool Pool) params.Pool {
ret := params.Pool{
ID: pool.ID.String(),
ProviderName: pool.ProviderName,
MaxRunners: pool.MaxRunners,
MinIdleRunners: pool.MinIdleRunners,
Image: pool.Image,
Flavor: pool.Flavor,
OSArch: pool.OSArch,
OSType: pool.OSType,
Enabled: pool.Enabled,
Tags: make([]params.Tag, len(pool.Tags)),
Instances: make([]params.Instance, len(pool.Instances)),
ID: pool.ID.String(),
ProviderName: pool.ProviderName,
MaxRunners: pool.MaxRunners,
MinIdleRunners: pool.MinIdleRunners,
Image: pool.Image,
Flavor: pool.Flavor,
OSArch: pool.OSArch,
OSType: pool.OSType,
Enabled: pool.Enabled,
Tags: make([]params.Tag, len(pool.Tags)),
Instances: make([]params.Instance, len(pool.Instances)),
RunnerBootstrapTimeout: pool.RunnerBootstrapTimeout,
}
if pool.RepoID != uuid.Nil {
@ -209,6 +210,10 @@ func (s *sqlDatabase) updatePool(pool Pool, param params.UpdatePoolParams) (para
pool.OSType = param.OSType
}
if param.RunnerBootstrapTimeout > 0 {
pool.RunnerBootstrapTimeout = param.RunnerBootstrapTimeout
}
if q := s.conn.Save(&pool); q.Error != nil {
return params.Pool{}, errors.Wrap(q.Error, "saving database entry")
}

View file

@ -111,21 +111,29 @@ type Tag struct {
}
// Pool is the JSON-serializable description of a runner pool.
type Pool struct {
ID string `json:"id"`
ProviderName string `json:"provider_name"`
MaxRunners uint `json:"max_runners"`
MinIdleRunners uint `json:"min_idle_runners"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
Tags []Tag `json:"tags"`
Enabled bool `json:"enabled"`
Instances []Instance `json:"instances"`
RepoID string `json:"repo_id,omitempty"`
RepoName string `json:"repo_name,omitempty"`
OrgID string `json:"org_id,omitempty"`
OrgName string `json:"org_name,omitempty"`
ID string `json:"id"`
ProviderName string `json:"provider_name"`
MaxRunners uint `json:"max_runners"`
MinIdleRunners uint `json:"min_idle_runners"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
Tags []Tag `json:"tags"`
Enabled bool `json:"enabled"`
Instances []Instance `json:"instances"`
RepoID string `json:"repo_id,omitempty"`
RepoName string `json:"repo_name,omitempty"`
OrgID string `json:"org_id,omitempty"`
OrgName string `json:"org_name,omitempty"`
// RunnerBootstrapTimeout is the per-pool bootstrap timeout. RunnerTimeout
// treats a value of 0 as "use config.DefaultRunnerBootstrapTimeout".
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
}
// RunnerTimeout returns the bootstrap timeout that applies to this pool: the
// pool's own RunnerBootstrapTimeout when it is non-zero, otherwise
// config.DefaultRunnerBootstrapTimeout.
func (p *Pool) RunnerTimeout() uint {
	if timeout := p.RunnerBootstrapTimeout; timeout != 0 {
		return timeout
	}
	return config.DefaultRunnerBootstrapTimeout
}
type Internal struct {

View file

@ -78,14 +78,15 @@ type NewUserParams struct {
}
// UpdatePoolParams carries the JSON-encoded fields accepted when updating a pool.
type UpdatePoolParams struct {
Tags []string `json:"tags"`
Enabled *bool `json:"enabled"`
MaxRunners *uint `json:"max_runners"`
MinIdleRunners *uint `json:"min_idle_runners"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
Tags []string `json:"tags"`
Enabled *bool `json:"enabled"`
MaxRunners *uint `json:"max_runners"`
MinIdleRunners *uint `json:"min_idle_runners"`
// NOTE(review): unlike MaxRunners/MinIdleRunners this is not a pointer, so
// an omitted value cannot be told apart from an explicit 0.
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
}
type CreateInstanceParams struct {
@ -101,15 +102,16 @@ type CreateInstanceParams struct {
}
// CreatePoolParams carries the JSON-encoded settings supplied when creating a pool.
type CreatePoolParams struct {
ProviderName string `json:"provider_name"`
MaxRunners uint `json:"max_runners"`
MinIdleRunners uint `json:"min_idle_runners"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
Tags []string `json:"tags"`
Enabled bool `json:"enabled"`
ProviderName string `json:"provider_name"`
MaxRunners uint `json:"max_runners"`
MinIdleRunners uint `json:"min_idle_runners"`
Image string `json:"image"`
Flavor string `json:"flavor"`
OSType config.OSType `json:"os_type"`
OSArch config.OSArch `json:"os_arch"`
Tags []string `json:"tags"`
Enabled bool `json:"enabled"`
// RunnerBootstrapTimeout is expressed in minutes (see the
// --runner-bootstrap-timeout flag help text).
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
}
func (p *CreatePoolParams) Validate() error {

View file

@ -20,6 +20,7 @@ import (
"strings"
"garm/auth"
"garm/config"
runnerErrors "garm/errors"
"garm/params"
"garm/runner/common"
@ -202,6 +203,10 @@ func (r *Runner) CreateOrgPool(ctx context.Context, orgID string, param params.C
return params.Pool{}, errors.Wrap(err, "fetching pool params")
}
if param.RunnerBootstrapTimeout == 0 {
param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout
}
pool, err := r.store.CreateOrganizationPool(ctx, orgID, createPoolParams)
if err != nil {
return params.Pool{}, errors.Wrap(err, "creating pool")

View file

@ -91,15 +91,6 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
}
if ok := runnerNames[instance.Name]; !ok {
// if instance.Status == providerCommon.InstanceRunning {
// if time.Since(instance.UpdatedAt).Minutes() < 20 {
// // Allow up to 20 minutes for instance to finish installing.
// // Anything beyond that is considered a timeout and the instance
// // is marked for deletion.
// // TODO(gabriel-samfira): Make the timeout configurable.
// continue
// }
// }
// Set pending_delete on DB field. Allow consolidate() to remove it.
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
@ -110,6 +101,42 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
return nil
}
// reapTimedOutRunners marks as pending_delete every instance whose recorded
// status is "running" but that has not registered with Github and has received
// no updates within its pool's bootstrap timeout (Pool.RunnerTimeout, in
// minutes).
//
// runners is the list of runners currently known to Github; any instance whose
// name appears in it is left untouched. The function returns as soon as the
// instance list or a pool cannot be fetched, or an instance status update
// fails; instances later in the list are not examined in that case.
func (r *basePool) reapTimedOutRunners(runners []*github.Runner) error {
	log.Printf("Checking for timed out runners")
	dbInstances, err := r.helper.FetchDbInstances()
	if err != nil {
		return errors.Wrap(err, "fetching instances from db")
	}

	registered := make(map[string]bool, len(runners))
	for _, run := range runners {
		// Name is a pointer and may be unset; skip such entries instead of
		// panicking inside the pool loop goroutine.
		if run == nil || run.Name == nil {
			continue
		}
		registered[*run.Name] = true
	}

	for _, instance := range dbInstances {
		if registered[instance.Name] {
			// Already joined Github; nothing to reap.
			continue
		}
		if instance.Status != providerCommon.InstanceRunning {
			continue
		}

		pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
		if err != nil {
			return errors.Wrap(err, "fetching instance pool info")
		}

		// Still within the bootstrap window: give the runner more time.
		if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
			continue
		}

		log.Printf("reaping instance %s due to timeout", instance.Name)
		if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
			log.Printf("failed to update runner %s status", instance.Name)
			return errors.Wrap(err, "updating runner")
		}
	}
	return nil
}
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
@ -312,8 +339,14 @@ func (r *basePool) AddRunner(ctx context.Context, poolID string) error {
}
func (r *basePool) loop() {
consolidateTimer := time.NewTicker(5 * time.Second)
reapTimer := time.NewTicker(5 * time.Minute)
toolUpdateTimer := time.NewTicker(3 * time.Hour)
defer func() {
log.Printf("repository %s loop exited", r.helper.String())
consolidateTimer.Stop()
reapTimer.Stop()
toolUpdateTimer.Stop()
close(r.done)
}()
log.Printf("starting loop for %s", r.helper.String())
@ -330,10 +363,23 @@ func (r *basePool) loop() {
for {
select {
case <-time.After(5 * time.Second):
case <-reapTimer.C:
runners, err := r.helper.GetGithubRunners()
if err != nil {
log.Printf("error fetching github runners: %s", err)
continue
}
if err := r.reapTimedOutRunners(runners); err != nil {
log.Printf("failed to reap timed out runners: %q", err)
}
if err := r.cleanupOrphanedGithubRunners(runners); err != nil {
log.Printf("failed to clean orphaned github runners: %q", err)
}
case <-consolidateTimer.C:
// consolidate.
r.consolidate()
case <-time.After(3 * time.Hour):
case <-toolUpdateTimer.C:
// Update tools cache.
tools, err := r.helper.FetchTools()
if err != nil {

View file

@ -90,6 +90,10 @@ func (r *Runner) UpdatePoolByID(ctx context.Context, poolID string, param params
minIdleRunners = *param.MinIdleRunners
}
if param.RunnerBootstrapTimeout == 0 {
return params.Pool{}, runnerErrors.NewBadRequestError("runner_bootstrap_timeout cannot be 0")
}
if minIdleRunners > maxRunners {
return params.Pool{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners")
}

View file

@ -20,6 +20,7 @@ import (
"strings"
"garm/auth"
"garm/config"
runnerErrors "garm/errors"
"garm/params"
"garm/runner/common"
@ -202,6 +203,10 @@ func (r *Runner) CreateRepoPool(ctx context.Context, repoID string, param params
return params.Pool{}, errors.Wrap(err, "fetching pool params")
}
if param.RunnerBootstrapTimeout == 0 {
param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout
}
pool, err := r.store.CreateRepositoryPool(ctx, repoID, createPoolParams)
if err != nil {
return params.Pool{}, errors.Wrap(err, "creating pool")