diff --git a/apiserver/controllers/pools.go b/apiserver/controllers/pools.go index 50692a95..6f5fee96 100644 --- a/apiserver/controllers/pools.go +++ b/apiserver/controllers/pools.go @@ -62,6 +62,8 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques return } + pool.RunnerBootstrapTimeout = pool.RunnerTimeout() + w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(pool) } diff --git a/cmd/garm-cli/cmd/pool.go b/cmd/garm-cli/cmd/pool.go index 4d30abba..574f650b 100644 --- a/cmd/garm-cli/cmd/pool.go +++ b/cmd/garm-cli/cmd/pool.go @@ -161,15 +161,16 @@ var poolAddCmd = &cobra.Command{ tags := strings.Split(poolTags, ",") newPoolParams := params.CreatePoolParams{ - ProviderName: poolProvider, - MaxRunners: poolMaxRunners, - MinIdleRunners: poolMinIdleRunners, - Image: poolImage, - Flavor: poolFlavor, - OSType: config.OSType(poolOSType), - OSArch: config.OSArch(poolOSArch), - Tags: tags, - Enabled: poolEnabled, + ProviderName: poolProvider, + MaxRunners: poolMaxRunners, + MinIdleRunners: poolMinIdleRunners, + Image: poolImage, + Flavor: poolFlavor, + OSType: config.OSType(poolOSType), + OSArch: config.OSArch(poolOSArch), + Tags: tags, + Enabled: poolEnabled, + RunnerBootstrapTimeout: poolRunnerBootstrapTimeout, } if err := newPoolParams.Validate(); err != nil { return err @@ -252,6 +253,10 @@ explicitly remove them using the runner delete command. poolUpdateParams.Enabled = &poolEnabled } + if cmd.Flags().Changed("runner-bootstrap-timeout") { + poolUpdateParams.RunnerBootstrapTimeout = poolRunnerBootstrapTimeout + } + pool, err := cli.UpdatePoolByID(args[0], poolUpdateParams) if err != nil { return err @@ -276,6 +281,7 @@ func init() { poolUpdateCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.") poolUpdateCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.") poolUpdateCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.") + poolUpdateCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.") poolAddCmd.Flags().StringVar(&poolProvider, "provider-name", "", "The name of the provider where runners will be created.") poolAddCmd.Flags().StringVar(&poolImage, "image", "", "The provider-specific image name to use for runners in this pool.") @@ -284,6 +290,7 @@ func init() { poolAddCmd.Flags().StringVar(&poolOSType, "os-type", "linux", "Operating system type (windows, linux, etc).") poolAddCmd.Flags().StringVar(&poolOSArch, "os-arch", "amd64", "Operating system architecture (amd64, arm, etc).") poolAddCmd.Flags().UintVar(&poolMaxRunners, "max-runners", 5, "The maximum number of runner this pool will create.") + poolAddCmd.Flags().UintVar(&poolRunnerBootstrapTimeout, "runner-bootstrap-timeout", 20, "Duration in minutes after which a runner is considered failed if it does not join Github.") poolAddCmd.Flags().UintVar(&poolMinIdleRunners, "min-idle-runners", 1, "Attempt to maintain a minimum of idle self-hosted runners of this type.") poolAddCmd.Flags().BoolVar(&poolEnabled, "enabled", false, "Enable this pool.") poolAddCmd.MarkFlagRequired("provider-name") diff --git a/cmd/garm-cli/cmd/repo_pool.go b/cmd/garm-cli/cmd/repo_pool.go index 80b3058d..8e430ab6 100644 --- a/cmd/garm-cli/cmd/repo_pool.go +++ b/cmd/garm-cli/cmd/repo_pool.go @@ -25,15 +25,16 @@ import ( ) var ( - poolProvider string - poolMaxRunners uint - poolMinIdleRunners uint - poolImage string - poolFlavor string - poolOSType string - poolOSArch string - poolTags string - poolEnabled bool + poolProvider string + poolMaxRunners uint + poolMinIdleRunners uint + poolImage string + poolFlavor string + poolOSType string + poolOSArch string + poolTags string + poolEnabled bool + poolRunnerBootstrapTimeout uint ) // repoPoolCmd represents the pool command @@ -323,6 +324,7 @@ func formatOnePool(pool params.Pool) { t.AppendRow(table.Row{"OS Architecture", pool.OSArch}) t.AppendRow(table.Row{"Max Runners", pool.MaxRunners}) t.AppendRow(table.Row{"Min Idle Runners", pool.MinIdleRunners}) + t.AppendRow(table.Row{"Runner Bootstrap Timeout", pool.RunnerBootstrapTimeout}) t.AppendRow(table.Row{"Tags", strings.Join(tags, ", ")}) t.AppendRow(table.Row{"Belongs to", belongsTo}) t.AppendRow(table.Row{"Level", level}) diff --git a/config/config.go b/config/config.go index e5c10004..0e561d3b 100644 --- a/config/config.go +++ b/config/config.go @@ -64,6 +64,11 @@ const ( // DefaultPoolQueueSize is the default size for a pool queue. DefaultPoolQueueSize = 10 + // DefaultRunnerBootstrapTimeout is the default timeout in minutes a runner is + // considered to be defunct. If a runner does not join github in the alloted amount + // of time and no new updates have been made to it's state, it will be removed. + DefaultRunnerBootstrapTimeout = 20 + GithubBaseURL = "https://github.com" ) diff --git a/database/sql/models.go b/database/sql/models.go index 2e9e4a33..de3b9e24 100644 --- a/database/sql/models.go +++ b/database/sql/models.go @@ -54,15 +54,16 @@ type Tag struct { type Pool struct { Base - ProviderName string `gorm:"index:idx_pool_type"` - MaxRunners uint - MinIdleRunners uint - Image string `gorm:"index:idx_pool_type"` - Flavor string `gorm:"index:idx_pool_type"` - OSType config.OSType - OSArch config.OSArch - Tags []*Tag `gorm:"many2many:pool_tags;"` - Enabled bool + ProviderName string `gorm:"index:idx_pool_type"` + MaxRunners uint + MinIdleRunners uint + RunnerBootstrapTimeout uint + Image string `gorm:"index:idx_pool_type"` + Flavor string `gorm:"index:idx_pool_type"` + OSType config.OSType + OSArch config.OSArch + Tags []*Tag `gorm:"many2many:pool_tags;"` + Enabled bool RepoID uuid.UUID `gorm:"index"` Repository Repository `gorm:"foreignKey:RepoID"` diff --git a/database/sql/util.go b/database/sql/util.go index e047b18c..3852845e 100644 --- a/database/sql/util.go +++ b/database/sql/util.go @@ -87,17 +87,18 @@ func (s *sqlDatabase) sqlToCommonOrganization(org Organization) params.Organizat func (s *sqlDatabase) sqlToCommonPool(pool Pool) params.Pool { ret := params.Pool{ - ID: pool.ID.String(), - ProviderName: pool.ProviderName, - MaxRunners: pool.MaxRunners, - MinIdleRunners: pool.MinIdleRunners, - Image: pool.Image, - Flavor: pool.Flavor, - OSArch: pool.OSArch, - OSType: pool.OSType, - Enabled: pool.Enabled, - Tags: make([]params.Tag, len(pool.Tags)), - Instances: make([]params.Instance, len(pool.Instances)), + ID: pool.ID.String(), + ProviderName: pool.ProviderName, + MaxRunners: pool.MaxRunners, + MinIdleRunners: pool.MinIdleRunners, + Image: pool.Image, + Flavor: pool.Flavor, + OSArch: pool.OSArch, + OSType: pool.OSType, + Enabled: pool.Enabled, + Tags: make([]params.Tag, len(pool.Tags)), + Instances: make([]params.Instance, len(pool.Instances)), + RunnerBootstrapTimeout: pool.RunnerBootstrapTimeout, } if pool.RepoID != uuid.Nil { @@ -209,6 +210,10 @@ func (s *sqlDatabase) updatePool(pool Pool, param params.UpdatePoolParams) (para pool.OSType = param.OSType } + if param.RunnerBootstrapTimeout > 0 { + pool.RunnerBootstrapTimeout = param.RunnerBootstrapTimeout + } + if q := s.conn.Save(&pool); q.Error != nil { return params.Pool{}, errors.Wrap(q.Error, "saving database entry") } diff --git a/params/params.go b/params/params.go index 2c0c649c..e35cb298 100644 --- a/params/params.go +++ b/params/params.go @@ -111,21 +111,29 @@ type Tag struct { } type Pool struct { - ID string `json:"id"` - ProviderName string `json:"provider_name"` - MaxRunners uint `json:"max_runners"` - MinIdleRunners uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` - Tags []Tag `json:"tags"` - Enabled bool `json:"enabled"` - Instances []Instance `json:"instances"` - RepoID string `json:"repo_id,omitempty"` - RepoName string `json:"repo_name,omitempty"` - OrgID string `json:"org_id,omitempty"` - OrgName string `json:"org_name,omitempty"` + ID string `json:"id"` + ProviderName string `json:"provider_name"` + MaxRunners uint `json:"max_runners"` + MinIdleRunners uint `json:"min_idle_runners"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` + Tags []Tag `json:"tags"` + Enabled bool `json:"enabled"` + Instances []Instance `json:"instances"` + RepoID string `json:"repo_id,omitempty"` + RepoName string `json:"repo_name,omitempty"` + OrgID string `json:"org_id,omitempty"` + OrgName string `json:"org_name,omitempty"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` +} + +func (p *Pool) RunnerTimeout() uint { + if p.RunnerBootstrapTimeout == 0 { + return config.DefaultRunnerBootstrapTimeout + } + return p.RunnerBootstrapTimeout } type Internal struct { diff --git a/params/requests.go b/params/requests.go index b6acc737..30deab74 100644 --- a/params/requests.go +++ b/params/requests.go @@ -78,14 +78,15 @@ type NewUserParams struct { } type UpdatePoolParams struct { - Tags []string `json:"tags"` - Enabled *bool `json:"enabled"` - MaxRunners *uint `json:"max_runners"` - MinIdleRunners *uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` + Tags []string `json:"tags"` + Enabled *bool `json:"enabled"` + MaxRunners *uint `json:"max_runners"` + MinIdleRunners *uint `json:"min_idle_runners"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` } type CreateInstanceParams struct { @@ -101,15 +102,16 @@ type CreateInstanceParams struct { } type CreatePoolParams struct { - ProviderName string `json:"provider_name"` - MaxRunners uint `json:"max_runners"` - MinIdleRunners uint `json:"min_idle_runners"` - Image string `json:"image"` - Flavor string `json:"flavor"` - OSType config.OSType `json:"os_type"` - OSArch config.OSArch `json:"os_arch"` - Tags []string `json:"tags"` - Enabled bool `json:"enabled"` + ProviderName string `json:"provider_name"` + MaxRunners uint `json:"max_runners"` + MinIdleRunners uint `json:"min_idle_runners"` + Image string `json:"image"` + Flavor string `json:"flavor"` + OSType config.OSType `json:"os_type"` + OSArch config.OSArch `json:"os_arch"` + Tags []string `json:"tags"` + Enabled bool `json:"enabled"` + RunnerBootstrapTimeout uint `json:"runner_bootstrap_timeout"` } func (p *CreatePoolParams) Validate() error { diff --git a/runner/organizations.go b/runner/organizations.go index 2897df06..b9be1573 100644 --- a/runner/organizations.go +++ b/runner/organizations.go @@ -20,6 +20,7 @@ import ( "strings" "garm/auth" + "garm/config" runnerErrors "garm/errors" "garm/params" "garm/runner/common" @@ -202,6 +203,10 @@ func (r *Runner) CreateOrgPool(ctx context.Context, orgID string, param params.C return params.Pool{}, errors.Wrap(err, "fetching pool params") } + if param.RunnerBootstrapTimeout == 0 { + param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout + } + pool, err := r.store.CreateOrganizationPool(ctx, orgID, createPoolParams) if err != nil { return params.Pool{}, errors.Wrap(err, "creating pool") diff --git a/runner/pool/common.go b/runner/pool/common.go index 0d508983..86571e9b 100644 --- a/runner/pool/common.go +++ b/runner/pool/common.go @@ -91,15 +91,6 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro } if ok := runnerNames[instance.Name]; !ok { - // if instance.Status == providerCommon.InstanceRunning { - // if time.Since(instance.UpdatedAt).Minutes() < 20 { - // // Allow up to 20 minutes for instance to finish installing. - // // Anything beyond that is considered a timeout and the instance - // // is marked for deletion. - // // TODO(gabriel-samfira): Make the timeout configurable. - // continue - // } - // } // Set pending_delete on DB field. Allow consolidate() to remove it. if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { log.Printf("failed to update runner %s status", instance.Name) @@ -110,6 +101,42 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro return nil } +// reapTimedOutRunners will mark as pending_delete any runner that has a status +// of "running" in the provider, but that has not registered with Github, and has +// received no new updates in the configured timeout interval. +func (r *basePool) reapTimedOutRunners(runners []*github.Runner) error { + log.Printf("Checking for timed out runners") + dbInstances, err := r.helper.FetchDbInstances() + if err != nil { + return errors.Wrap(err, "fetching instances from db") + } + + runnerNames := map[string]bool{} + for _, run := range runners { + runnerNames[*run.Name] = true + } + + for _, instance := range dbInstances { + if ok := runnerNames[instance.Name]; !ok { + if instance.Status == providerCommon.InstanceRunning { + pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching instance pool info") + } + if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { + continue + } + log.Printf("reaping instance %s due to timeout", instance.Name) + if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { + log.Printf("failed to update runner %s status", instance.Name) + return errors.Wrap(err, "updating runner") + } + } + } + } + return nil +} + // cleanupOrphanedGithubRunners will forcefully remove any github runners that appear // as offline and for which we no longer have a local instance. // This may happen if someone manually deletes the instance in the provider. We need to @@ -312,8 +339,14 @@ func (r *basePool) AddRunner(ctx context.Context, poolID string) error { } func (r *basePool) loop() { + consolidateTimer := time.NewTicker(5 * time.Second) + reapTimer := time.NewTicker(5 * time.Minute) + toolUpdateTimer := time.NewTicker(3 * time.Hour) defer func() { log.Printf("repository %s loop exited", r.helper.String()) + consolidateTimer.Stop() + reapTimer.Stop() + toolUpdateTimer.Stop() close(r.done) }() log.Printf("starting loop for %s", r.helper.String()) @@ -330,10 +363,23 @@ func (r *basePool) loop() { for { select { - case <-time.After(5 * time.Second): + case <-reapTimer.C: + runners, err := r.helper.GetGithubRunners() + if err != nil { + log.Printf("error fetching github runners: %s", err) + continue + } + if err := r.reapTimedOutRunners(runners); err != nil { + log.Printf("failed to reap timed out runners: %q", err) + } + + if err := r.cleanupOrphanedGithubRunners(runners); err != nil { + log.Printf("failed to clean orphaned github runners: %q", err) + } + case <-consolidateTimer.C: // consolidate. r.consolidate() - case <-time.After(3 * time.Hour): + case <-toolUpdateTimer.C: // Update tools cache. tools, err := r.helper.FetchTools() if err != nil { diff --git a/runner/pools.go b/runner/pools.go index 1f704319..54c59c55 100644 --- a/runner/pools.go +++ b/runner/pools.go @@ -90,6 +90,10 @@ func (r *Runner) UpdatePoolByID(ctx context.Context, poolID string, param params minIdleRunners = *param.MinIdleRunners } + if param.RunnerBootstrapTimeout == 0 { + return params.Pool{}, runnerErrors.NewBadRequestError("runner_bootstrap_timeout cannot be 0") + } + if minIdleRunners > maxRunners { return params.Pool{}, runnerErrors.NewBadRequestError("min_idle_runners cannot be larger than max_runners") } diff --git a/runner/repositories.go b/runner/repositories.go index 347056ec..63387ef8 100644 --- a/runner/repositories.go +++ b/runner/repositories.go @@ -20,6 +20,7 @@ import ( "strings" "garm/auth" + "garm/config" runnerErrors "garm/errors" "garm/params" "garm/runner/common" @@ -202,6 +203,10 @@ func (r *Runner) CreateRepoPool(ctx context.Context, repoID string, param params return params.Pool{}, errors.Wrap(err, "fetching pool params") } + if param.RunnerBootstrapTimeout == 0 { + param.RunnerBootstrapTimeout = config.DefaultRunnerBootstrapTimeout + } + pool, err := r.store.CreateRepositoryPool(ctx, repoID, createPoolParams) if err != nil { return params.Pool{}, errors.Wrap(err, "creating pool")