Add timeout functionality for pool runner bootstrap
Pools can now define a bootstrap timeout for runners. The timeout can be defined per pool and indicates the amount of time after which a runner is considered defunct and removed. If a runner doesn't join github in the configured amount of time, and it receives no updates indicating that it is installing the runner via instance status updates, it is considered defunct. Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
parent
5390efbaab
commit
15a1308441
12 changed files with 173 additions and 81 deletions
|
|
@ -62,6 +62,8 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques
|
|||
return
|
||||
}
|
||||
|
||||
pool.RunnerBootstrapTimeout = pool.RunnerTimeout()
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(pool)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -161,15 +161,16 @@ var poolAddCmd = &cobra.Command{
|
|||
|
||||
tags := strings.Split(poolTags, ",")
|
||||
newPoolParams := params.CreatePoolParams{
|
||||
ProviderName: poolProvider,
|
||||
MaxRunners: poolMaxRunners,
|
||||
MinIdleRunners: poolMinIdleRunners,
|
||||
Image: poolImage,
|
||||
Flavor: poolFlavor,
|
||||
OSType: config.OSType(poolOSType),
|
||||
OSArch: config.OSArch(poolOSArch),
|
||||
Tags: tags,
|
||||
Enabled: poolEnabled,
|
||||
ProviderName: poolProvider,
|
||||
MaxRunners: poolMaxRunners,
|
||||
MinIdleRunners: poolMinIdleRunners,
|
||||
Image: poolImage,
|
||||
Flavor: poolFlavor,
|
||||
OSType: config.OSType(poolOSType),
|
||||
OSArch: config.OSArch(poolOSArch),
|
||||
Tags: tags,
|
||||
Enabled: poolEnabled,
|
||||
RunnerBootstrapTimeout: poolRunnerBootstrapTimeout,
|
||||
}
|
||||
if err := newPoolParams.Validate(); err != nil {
|
||||
return err
|
||||
|
|
@ -252,6 +253,10 @@ explicitly remove them using the runner delete command.
|
|||
poolUpdateParams.Enabled = &poolEnabled
|
||||
}
|
||||
|
||||
if cmd.Flags().Changed("runner-bootstrap-timeout") {
|
||||
poolUpdateParams.RunnerBootstrapTimeout = poolRunnerBootstrapTimeout
|
||||
}
|
||||
|
||||
pool, err := cli.UpdatePoolByID(args[0], poolUpdateParams)
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
@ -276,6 +281,7 @@ func init() {
|
|||
poolUpdateCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.")
|
||||
poolUpdateCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.")
|
||||
poolUpdateCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.")
|
||||
poolUpdateCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.")
|
||||
|
||||
poolAddCmd.Flags().StringVar(&poolProvider, "provider-name", "", "The name of the provider where runners will be created.")
|
||||
poolAddCmd.Flags().StringVar(&poolImage, "image", "", "The provider-specific image name to use for runners in this pool.")
|
||||
|
|
@ -284,6 +290,7 @@ func init() {
|
|||
poolAddCmd.Flags().StringVar(&poolOSType, "os-type", "linux", "Operating system type (windows, linux, etc).")
|
||||
poolAddCmd.Flags().StringVar(&poolOSArch, "os-arch", "amd64", "Operating system architecture (amd64, arm, etc).")
|
||||
poolAddCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.")
|
||||
poolAddCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.")
|
||||
poolAddCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.")
|
||||
poolAddCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.")
|
||||
poolAddCmd.MarkFlagRequired("provider-name")
|
||||
|
|
|
|||
|
|
@ -25,15 +25,16 @@ import (
|
|||
)
|
||||
|
||||
var (
|
||||
poolProvider string
|
||||
poolMaxRunners uint
|
||||
poolMinIdleRunners uint
|
||||
poolImage string
|
||||
poolFlavor string
|
||||
poolOSType string
|
||||
poolOSArch string
|
||||
poolTags string
|
||||
poolEnabled bool
|
||||
poolProvider string
|
||||
poolMaxRunners uint
|
||||
poolMinIdleRunners uint
|
||||
poolImage string
|
||||
poolFlavor string
|
||||
poolOSType string
|
||||
poolOSArch string
|
||||
poolTags string
|
||||
poolEnabled bool
|
||||
poolRunnerBootstrapTimeout uint
|
||||
)
|
||||
|
||||
// repoPoolCmd represents the pool command
|
||||
|
|
@ -323,6 +324,7 @@ func formatOnePool(pool params.Pool) {
|
|||
t.AppendRow(table.Row{"OS Architecture", pool.OSArch})
|
||||
t.AppendRow(table.Row{"Max Runners", pool.MaxRunners})
|
||||
t.AppendRow(table.Row{"Min Idle Runners", pool.MinIdleRunners})
|
||||
t.AppendRow(table.Row{"Runner Bootstrap Timeout", pool.RunnerBootstrapTimeout})
|
||||
t.AppendRow(table.Row{"Tags", strings.Join(tags, ", ")})
|
||||
t.AppendRow(table.Row{"Belongs to", belongsTo})
|
||||
t.AppendRow(table.Row{"Level", level})
|
||||
|
|
|
|||
|
|
@ -64,6 +64,11 @@ const (
|
|||
// DefaultPoolQueueSize is the default size for a pool queue.
|
||||
DefaultPoolQueueSize = 10
|
||||
|
||||
// DefaultRunnerBootstrapTimeout is the default timeout in minutes a runner is
|
||||
// considered to be defunct. If a runner does not join github in the alloted amount
|
||||
// of time and no new updates have been made to it's state, it will be removed.
|
||||
DefaultRunnerBootstrapTimeout = 20
|
||||
|
||||
GithubBaseURL = "https://github.com"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -54,15 +54,16 @@ type Tag struct {
|
|||
type Pool struct {
|
||||
Base
|
||||
|
||||
ProviderName string `gorm:"index:idx_pool_type"`
|
||||
MaxRunners uint
|
||||
MinIdleRunners uint
|
||||
Image string `gorm:"index:idx_pool_type"`
|
||||
Flavor string `gorm:"index:idx_pool_type"`
|
||||
OSType config.OSType
|
||||
OSArch config.OSArch
|
||||
Tags []*Tag `gorm:"many2many:pool_tags;"`
|
||||
Enabled bool
|
||||
ProviderName string `gorm:"index:idx_pool_type"`
|
||||
MaxRunners uint
|
||||
MinIdleRunners uint
|
||||
RunnerBootstrapTimeout uint
|
||||
Image string `gorm:"index:idx_pool_type"`
|
||||
Flavor string `gorm:"index:idx_pool_type"`
|
||||
OSType config.OSType
|
||||
OSArch config.OSArch
|
||||
Tags []*Tag `gorm:"many2many:pool_tags;"`
|
||||
Enabled bool
|
||||
|
||||
RepoID uuid.UUID `gorm:"index"`
|
||||
Repository Repository `gorm:"foreignKey:RepoID"`
|
||||
|
|
|
|||
|
|
@ -87,17 +87,18 @@ func (s *sqlDatabase) sqlToCommonOrganization(org Organization) params.Organizat
|
|||
|
||||
func (s *sqlDatabase) sqlToCommonPool(pool Pool) params.Pool {
|
||||
ret := params.Pool{
|
||||
ID: pool.ID.String(),
|
||||
ProviderName: pool.ProviderName,
|
||||
MaxRunners: pool.MaxRunners,
|
||||
MinIdleRunners: pool.MinIdleRunners,
|
||||
Image: pool.Image,
|
||||
Flavor: pool.Flavor,
|
||||
OSArch: pool.OSArch,
|
||||
OSType: pool.OSType,
|
||||
Enabled: pool.Enabled,
|
||||
Tags: make([]params.Tag, len(pool.Tags)),
|
||||
Instances: make([]params.Instance, len(pool.Instances)),
|
||||
ID: pool.ID.String(),
|
||||
ProviderName: pool.ProviderName,
|
||||
MaxRunners: pool.MaxRunners,
|
||||
MinIdleRunners: pool.MinIdleRunners,
|
||||
Image: pool.Image,
|
||||
Flavor: pool.Flavor,
|
||||
OSArch: pool.OSArch,
|
||||
OSType: pool.OSType,
|
||||
Enabled: pool.Enabled,
|
||||
Tags: make([]params.Tag, len(pool.Tags)),
|
||||
Instances: make([]params.Instance, len(pool.Instances)),
|
||||
RunnerBootstrapTimeout: pool.RunnerBootstrapTimeout,
|
||||
}
|
||||
|
||||
if pool.RepoID != uuid.Nil {
|
||||
|
|
@ -209,6 +210,10 @@ func (s *sqlDatabase) updatePool(pool Pool, param params.UpdatePoolParams) (para
|
|||
pool.OSType = param.OSType
|
||||
}
|
||||
|
||||
if param.RunnerBootstrapTimeout > 0 {
|
||||
pool.RunnerBootstrapTimeout = param.RunnerBootstrapTimeout
|
||||
}
|
||||
|
||||
if q := s.conn.Save(&pool); q.Error != nil {
|
||||
return params.Pool{}, errors.Wrap(q.Error, "saving database entry")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,21 +111,29 @@ type Tag struct {
|
|||
}
|
||||
|
||||
type Pool struct {
|
||||
ID string `json:"id"`
|
||||
ProviderName string `json:"provider_name"`
|
||||
MaxRunners uint `json:"max_runners"`
|
||||
MinIdleRunners uint `json:"min_idle_runners"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
Tags []Tag `json:"tags"`
|
||||
Enabled bool `json:"enabled"`
|
||||
Instances []Instance `json:"instances"`
|
||||
RepoID string `json:"repo_id,omitempty"`
|
||||
RepoName string `json:"repo_name,omitempty"`
|
||||
OrgID string `json:"org_id,omitempty"`
|
||||
OrgName string `json:"org_name,omitempty"`
|
||||
ID string `json:"id"`
|
||||
ProviderName string `json:"provider_name"`
|
||||
MaxRunners uint `json:"max_runners"`
|
||||
MinIdleRunners uint `json:"min_idle_runners"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
Tags []Tag `json:"tags"`
|
||||
Enabled bool `json:"enabled"`
|
||||
Instances []Instance `json:"instances"`
|
||||
RepoID string `json:"repo_id,omitempty"`
|
||||
RepoName string `json:"repo_name,omitempty"`
|
||||
OrgID string `json:"org_id,omitempty"`
|
||||
OrgName string `json:"org_name,omitempty"`
|
||||
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
|
||||
}
|
||||
|
||||
func (p *Pool) RunnerTimeout() uint {
|
||||
if p.RunnerBootstrapTimeout == 0 {
|
||||
return config.DefaultRunnerBootstrapTimeout
|
||||
}
|
||||
return p.RunnerBootstrapTimeout
|
||||
}
|
||||
|
||||
type Internal struct {
|
||||
|
|
|
|||
|
|
@ -78,14 +78,15 @@ type NewUserParams struct {
|
|||
}
|
||||
|
||||
type UpdatePoolParams struct {
|
||||
Tags []string `json:"tags"`
|
||||
Enabled *bool `json:"enabled"`
|
||||
MaxRunners *uint `json:"max_runners"`
|
||||
MinIdleRunners *uint `json:"min_idle_runners"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
Tags []string `json:"tags"`
|
||||
Enabled *bool `json:"enabled"`
|
||||
MaxRunners *uint `json:"max_runners"`
|
||||
MinIdleRunners *uint `json:"min_idle_runners"`
|
||||
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
}
|
||||
|
||||
type CreateInstanceParams struct {
|
||||
|
|
@ -101,15 +102,16 @@ type CreateInstanceParams struct {
|
|||
}
|
||||
|
||||
type CreatePoolParams struct {
|
||||
ProviderName string `json:"provider_name"`
|
||||
MaxRunners uint `json:"max_runners"`
|
||||
MinIdleRunners uint `json:"min_idle_runners"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
Tags []string `json:"tags"`
|
||||
Enabled bool `json:"enabled"`
|
||||
ProviderName string `json:"provider_name"`
|
||||
MaxRunners uint `json:"max_runners"`
|
||||
MinIdleRunners uint `json:"min_idle_runners"`
|
||||
Image string `json:"image"`
|
||||
Flavor string `json:"flavor"`
|
||||
OSType config.OSType `json:"os_type"`
|
||||
OSArch config.OSArch `json:"os_arch"`
|
||||
Tags []string `json:"tags"`
|
||||
Enabled bool `json:"enabled"`
|
||||
RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"`
|
||||
}
|
||||
|
||||
func (p *CreatePoolParams) Validate() error {
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import (
|
|||
"strings"
|
||||
|
||||
"garm/auth"
|
||||
"garm/config"
|
||||
runnerErrors "garm/errors"
|
||||
"garm/params"
|
||||
"garm/runner/common"
|
||||
|
|
@ -202,6 +203,10 @@ func (r *Runner) CreateOrgPool(ctx context.Context, orgID string, param params.C
|
|||
return params.Pool{}, errors.Wrap(err, "fetching pool params")
|
||||
}
|
||||
|
||||
if param.RunnerBootstrapTimeout == 0 {
|
||||
param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout
|
||||
}
|
||||
|
||||
pool, err := r.store.CreateOrganizationPool(ctx, orgID, createPoolParams)
|
||||
if err != nil {
|
||||
return params.Pool{}, errors.Wrap(err, "creating pool")
|
||||
|
|
|
|||
|
|
@ -91,15 +91,6 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
|
|||
}
|
||||
|
||||
if ok := runnerNames[instance.Name]; !ok {
|
||||
// if instance.Status == providerCommon.InstanceRunning {
|
||||
// if time.Since(instance.UpdatedAt).Minutes() < 20 {
|
||||
// // Allow up to 20 minutes for instance to finish installing.
|
||||
// // Anything beyond that is considered a timeout and the instance
|
||||
// // is marked for deletion.
|
||||
// // TODO(gabriel-samfira): Make the timeout configurable.
|
||||
// continue
|
||||
// }
|
||||
// }
|
||||
// Set pending_delete on DB field. Allow consolidate() to remove it.
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
|
|
@ -110,6 +101,42 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
|
|||
return nil
|
||||
}
|
||||
|
||||
// reapTimedOutRunners will mark as pending_delete any runner that has a status
|
||||
// of "running" in the provider, but that has not registered with Github, and has
|
||||
// received no new updates in the configured timeout interval.
|
||||
func (r *basePool) reapTimedOutRunners(runners []*github.Runner) error {
|
||||
log.Printf("Checking for timed out runners")
|
||||
dbInstances, err := r.helper.FetchDbInstances()
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instances from db")
|
||||
}
|
||||
|
||||
runnerNames := map[string]bool{}
|
||||
for _, run := range runners {
|
||||
runnerNames[*run.Name] = true
|
||||
}
|
||||
|
||||
for _, instance := range dbInstances {
|
||||
if ok := runnerNames[instance.Name]; !ok {
|
||||
if instance.Status == providerCommon.InstanceRunning {
|
||||
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instance pool info")
|
||||
}
|
||||
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
|
||||
continue
|
||||
}
|
||||
log.Printf("reaping instance %s due to timeout", instance.Name)
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
|
||||
// as offline and for which we no longer have a local instance.
|
||||
// This may happen if someone manually deletes the instance in the provider. We need to
|
||||
|
|
@ -312,8 +339,14 @@ func (r *basePool) AddRunner(ctx context.Context, poolID string) error {
|
|||
}
|
||||
|
||||
func (r *basePool) loop() {
|
||||
consolidateTimer := time.NewTicker(5 * time.Second)
|
||||
reapTimer := time.NewTicker(5 * time.Minute)
|
||||
toolUpdateTimer := time.NewTicker(3 * time.Hour)
|
||||
defer func() {
|
||||
log.Printf("repository %s loop exited", r.helper.String())
|
||||
consolidateTimer.Stop()
|
||||
reapTimer.Stop()
|
||||
toolUpdateTimer.Stop()
|
||||
close(r.done)
|
||||
}()
|
||||
log.Printf("starting loop for %s", r.helper.String())
|
||||
|
|
@ -330,10 +363,23 @@ func (r *basePool) loop() {
|
|||
|
||||
for {
|
||||
select {
|
||||
case <-time.After(5 * time.Second):
|
||||
case <-reapTimer.C:
|
||||
runners, err := r.helper.GetGithubRunners()
|
||||
if err != nil {
|
||||
log.Printf("error fetching github runners: %s", err)
|
||||
continue
|
||||
}
|
||||
if err := r.reapTimedOutRunners(runners); err != nil {
|
||||
log.Printf("failed to reap timed out runners: %q", err)
|
||||
}
|
||||
|
||||
if err := r.cleanupOrphanedGithubRunners(runners); err != nil {
|
||||
log.Printf("failed to clean orphaned github runners: %q", err)
|
||||
}
|
||||
case <-consolidateTimer.C:
|
||||
// consolidate.
|
||||
r.consolidate()
|
||||
case <-time.After(3 * time.Hour):
|
||||
case <-toolUpdateTimer.C:
|
||||
// Update tools cache.
|
||||
tools, err := r.helper.FetchTools()
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -90,6 +90,10 @@ func (r *Runner) UpdatePoolByID(ctx context.Context, poolID string, param params
|
|||
minIdleRunners = *param.MinIdleRunners
|
||||
}
|
||||
|
||||
if param.RunnerBootstrapTimeout == 0 {
|
||||
return params.Pool{}, runnerErrors.NewBadRequestError("runner_bootstrap_timeout cannot be 0")
|
||||
}
|
||||
|
||||
if minIdleRunners > maxRunners {
|
||||
return params.Pool{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import (
|
|||
"strings"
|
||||
|
||||
"garm/auth"
|
||||
"garm/config"
|
||||
runnerErrors "garm/errors"
|
||||
"garm/params"
|
||||
"garm/runner/common"
|
||||
|
|
@ -202,6 +203,10 @@ func (r *Runner) CreateRepoPool(ctx context.Context, repoID string, param params
|
|||
return params.Pool{}, errors.Wrap(err, "fetching pool params")
|
||||
}
|
||||
|
||||
if param.RunnerBootstrapTimeout == 0 {
|
||||
param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout
|
||||
}
|
||||
|
||||
pool, err := r.store.CreateRepositoryPool(ctx, repoID, createPoolParams)
|
||||
if err != nil {
|
||||
return params.Pool{}, errors.Wrap(err, "creating pool")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue