Fix max runners check and leaky JIT runners

The check for max runners was added to CreateInstance(), but we crete the
JIT runners before we run the function to add a runner to the DB. The defer
function to clean up the JIT runner was being run after the error return
generated by CreateInstance. So the cleanup code never ran. Additionally
we would know that max runners was reached only after creating the JIT
runner. Which kills rate limits.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2026-02-11 22:54:50 +02:00 committed by Gabriel
parent 03c28d8598
commit 80e08e7db0

View file

@ -887,6 +887,19 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
}
}
defer func() {
if err != nil {
if runner != nil {
runnerCleanupErr := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
if err != nil {
slog.With(slog.Any("error", runnerCleanupErr)).ErrorContext(
ctx, "failed to remove runner",
"gh_runner_id", runner.GetID())
}
}
}
}()
createParams := params.CreateInstanceParams{
Name: name,
Status: commonParams.InstancePendingCreate,
@ -906,32 +919,10 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona
createParams.AgentID = runner.GetID()
}
instance, err := r.store.CreateInstance(r.ctx, poolID, createParams)
if err != nil {
if _, err := r.store.CreateInstance(r.ctx, poolID, createParams); err != nil {
return fmt.Errorf("error creating instance: %w", err)
}
defer func() {
if err != nil {
if instance.ID != "" {
if err := r.DeleteRunner(instance, false, false); err != nil {
slog.With(slog.Any("error", err)).ErrorContext(
ctx, "failed to cleanup instance",
"runner_name", instance.Name)
}
}
if runner != nil {
runnerCleanupErr := r.ghcli.RemoveEntityRunner(r.ctx, runner.GetID())
if err != nil {
slog.With(slog.Any("error", runnerCleanupErr)).ErrorContext(
ctx, "failed to remove runner",
"gh_runner_id", runner.GetID())
}
}
}
}()
return nil
}
@ -1247,6 +1238,15 @@ func (r *basePoolManager) addRunnerToPool(pool params.Pool, aditionalLabels []st
return fmt.Errorf("pool %s is disabled", pool.ID)
}
poolInstanceCount, err := r.store.PoolInstanceCount(r.ctx, pool.ID)
if err != nil {
return fmt.Errorf("failed to list pool instances: %w", err)
}
if poolInstanceCount >= int64(pool.MaxRunners) {
return fmt.Errorf("max workers (%d) reached for pool %s", pool.MaxRunners, pool.ID)
}
if err := r.AddRunner(r.ctx, pool.ID, aditionalLabels); err != nil {
return fmt.Errorf("failed to add new instance for pool %s: %w", pool.ID, err)
}