Add JIT config as part of instance create

We must create the DB entry for a runner with a JIT config included. Adding it later
via an update runs the risk of having the consolidate loop pick up the incomplete instance.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2023-08-26 19:40:01 +00:00
parent 8c507a9251
commit 019948acbe
7 changed files with 44 additions and 77 deletions

View file

@ -82,6 +82,7 @@ func (s *sqlDatabase) CreateInstance(ctx context.Context, poolID string, param p
GitHubRunnerGroup: param.GitHubRunnerGroup,
JitConfiguration: secret,
AditionalLabels: labels,
AgentID: param.AgentID,
}
q := s.conn.Create(&newInstance)
if q.Error != nil {

View file

@ -136,7 +136,8 @@ type CreateInstanceParams struct {
// GithubRunnerGroup is the github runner group to which the runner belongs.
// The runner group must be created by someone with access to the enterprise.
GitHubRunnerGroup string
CreateAttempt int `json:"-"`
CreateAttempt int `json:"-"`
AgentID int64 `json:"-"`
AditionalLabels []string
JitConfiguration map[string]string
}

View file

@ -104,15 +104,7 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g
return nil, errors.Wrap(runnerErrors.ErrNotFound, "runner group not found")
}
func (r *enterprise) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
if instance.AgentID != 0 {
return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest)
}
if instance.JitConfiguration != nil {
return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest)
}
func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
var rg int64 = 1
if pool.GitHubRunnerGroup != "" {
runnerGroup, err := r.findRunnerGroupByName(ctx, pool.GitHubRunnerGroup)
@ -123,7 +115,7 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance params.Instance,
}
req := github.GenerateJITConfigRequest{
Name: instance.Name,
Name: instance,
RunnerGroupID: rg,
Labels: labels,
// TODO(gabriel-samfira): Should we make this configurable?

View file

@ -37,7 +37,7 @@ type poolHelper interface {
GithubCLI() common.GithubClient
GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (map[string]string, *github.Runner, error)
GetJITConfig(ctx context.Context, instanceName string, pool params.Pool, labels []string) (map[string]string, *github.Runner, error)
FetchDbInstances() ([]params.Instance, error)
ListPools() ([]params.Pool, error)

View file

@ -116,15 +116,7 @@ func (r *organization) findRunnerGroupByName(ctx context.Context, name string) (
return nil, errors.Wrap(runnerErrors.ErrNotFound, "runner group not found")
}
func (r *organization) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
if instance.AgentID != 0 {
return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest)
}
if instance.JitConfiguration != nil {
return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest)
}
func (r *organization) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
var rg int64 = 1
if pool.GitHubRunnerGroup != "" {
runnerGroup, err := r.findRunnerGroupByName(ctx, pool.GitHubRunnerGroup)
@ -135,7 +127,7 @@ func (r *organization) GetJITConfig(ctx context.Context, instance params.Instanc
}
req := github.GenerateJITConfigRequest{
Name: instance.Name,
Name: instance,
RunnerGroupID: rg,
Labels: labels,
// TODO(gabriel-samfira): Should we make this configurable?

View file

@ -388,11 +388,18 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne
continue
}
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching instance pool info")
}
switch instance.RunnerStatus {
case params.RunnerPending, params.RunnerInstalling:
// runner is still installing. We give it a chance to finish.
r.log("runner %s is still installing, give it a chance to finish", instance.Name)
continue
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
// runner is still installing. We give it a chance to finish.
r.log("runner %s is still installing, give it a chance to finish", instance.Name)
continue
}
}
if time.Since(instance.UpdatedAt).Minutes() < 5 {
@ -451,20 +458,7 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
// * The runner never joined github within the pool timeout
// * The runner managed to join github, but the setup process failed later and the runner
// never started on the instance.
//
// There are several steps in the user data that sets up the runner:
// * Download and unarchive the runner from github (or used the cached version)
// * Configure runner (connects to github). At this point the runner is seen as offline.
// * Install the service
// * Set SELinux context (if SELinux is enabled)
// * Start the service (if successful, the runner will transition to "online")
// * Get the runner ID
//
// If we fail getting the runner ID after it's started, garm will set the runner status to "failed",
// even though, technically the runner is online and fully functional. This is why we check here for
// both the runner status as reported by GitHub and the runner status as reported by the provider.
// If the runner is "offline" and marked as "failed", it should be safe to reap it.
if runner, ok := runnersByName[instance.Name]; !ok || (runner.GetStatus() == "offline" && instance.RunnerStatus == params.RunnerFailed) {
if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" {
r.log("reaping timed-out/failed runner %s", instance.Name)
if err := r.ForceDeleteRunner(instance); err != nil {
r.log("failed to update runner %s status: %s", instance.Name, err)
@ -699,6 +693,12 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
}
name := fmt.Sprintf("%s-%s", pool.GetRunnerPrefix(), util.NewID())
labels := r.getLabelsForInstance(pool)
// Attempt to create JIT config
jitConfig, runner, err := r.helper.GetJITConfig(ctx, name, pool, labels)
if err != nil {
r.log("failed to get JIT config, falling back to registration token: %s", err)
}
createParams := params.CreateInstanceParams{
Name: name,
@ -711,6 +711,11 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
CreateAttempt: 1,
GitHubRunnerGroup: pool.GitHubRunnerGroup,
AditionalLabels: aditionalLabels,
JitConfiguration: jitConfig,
}
if runner != nil {
createParams.AgentID = runner.GetID()
}
instance, err := r.store.CreateInstance(r.ctx, poolID, createParams)
@ -719,38 +724,22 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
}
defer func() {
if err != nil && instance.ID != "" {
if err := r.ForceDeleteRunner(instance); err != nil {
r.log("failed to cleanup instance: %s", instance.Name)
if err != nil {
if instance.ID != "" {
if err := r.ForceDeleteRunner(instance); err != nil {
r.log("failed to cleanup instance: %s", instance.Name)
}
}
if runner != nil {
_, runnerCleanupErr := r.helper.RemoveGithubRunner(runner.GetID())
if err != nil {
r.log("failed to remove runner %d: %s", runner.GetID(), runnerCleanupErr)
}
}
}
}()
labels := r.getLabelsForInstance(pool)
// Attempt to create JIT config
jitConfig, runner, err := r.helper.GetJITConfig(ctx, instance, pool, labels)
if err == nil {
updateParams := params.UpdateInstanceParams{
AgentID: runner.GetID(),
// We're using a JIT config. Setting the TokenFetched will disable the registration token
// metadata endpoint.
TokenFetched: github.Bool(true),
JitConfiguration: jitConfig,
}
instance, err = r.updateInstance(instance.Name, updateParams)
if err != nil {
// The agent ID is not recorded in the instance, so the deferred ForceDeleteRunner() will not
// attempt to clean it up. We need to do it here.
_, runnerCleanupErr := r.helper.RemoveGithubRunner(runner.GetID())
if err != nil {
log.Printf("failed to remove runner %d: %s", runner.GetID(), runnerCleanupErr)
}
return errors.Wrap(err, "updating instance")
}
} else {
r.log("failed to get JIT config, falling back to registration token: %s", err)
}
return nil
}

View file

@ -88,17 +88,9 @@ type repository struct {
mux sync.Mutex
}
func (r *repository) GetJITConfig(ctx context.Context, instance params.Instance, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
if instance.AgentID != 0 {
return nil, nil, fmt.Errorf("instance already has an agent ID: %w", runnerErrors.ErrBadRequest)
}
if instance.JitConfiguration != nil {
return nil, nil, fmt.Errorf("instance already has a JIT configuration: %w", runnerErrors.ErrBadRequest)
}
func (r *repository) GetJITConfig(ctx context.Context, instance string, pool params.Pool, labels []string) (jitConfigMap map[string]string, runner *github.Runner, err error) {
req := github.GenerateJITConfigRequest{
Name: instance.Name,
Name: instance,
// At the repository level we only have the default runner group.
RunnerGroupID: 1,
Labels: labels,