Delete all inactionable jobs

GARM cares about jobs in queued state for anything that requires
decision making. Anything else is purely informational.

This change cleans up all inactionable jobs and refuses to record jobs
that are not already in the database, have an inactionable state and
which do not have a runner we own handling them.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2026-02-10 14:08:44 +02:00 committed by Gabriel
parent eeb8bf6457
commit ec0fd6e3f8
9 changed files with 487 additions and 187 deletions

View file

@ -424,16 +424,19 @@ func (s *sqlDatabase) GetJobByID(_ context.Context, jobID int64) (params.Job, er
return sqlWorkflowJobToParamsJob(job)
}
// DeleteCompletedJobs deletes all completed jobs.
func (s *sqlDatabase) DeleteCompletedJobs(_ context.Context) error {
query := s.conn.Model(&WorkflowJob{}).Where("status = ?", params.JobStatusCompleted)
if err := query.Unscoped().Delete(&WorkflowJob{}); err.Error != nil {
if errors.Is(err.Error, gorm.ErrRecordNotFound) {
return nil
}
return err.Error
// DeleteInactionableJobs will delete jobs that are not in queued state and have no
// runner associated with them. This can happen if we have a pool that matches labels
// defined on a job, but the job itself was picked up by a runner we don't manage.
// When a job transitions from queued to anything else, GARM only uses them for informational
// purposes. So they are safe to delete.
// Also deletes completed jobs with GARM runners attached as they are no longer needed.
func (s *sqlDatabase) DeleteInactionableJobs(_ context.Context) error {
q := s.conn.
Unscoped().
Where("(status != ? AND instance_id IS NULL) OR (status = ? AND instance_id IS NOT NULL)", params.JobStatusQueued, params.JobStatusCompleted).
Delete(&WorkflowJob{})
if q.Error != nil {
return fmt.Errorf("deleting inactionable jobs: %w", q.Error)
}
return nil
}

189
database/sql/jobs_test.go Normal file
View file

@ -0,0 +1,189 @@
// Copyright 2025 Cloudbase Solutions SRL
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
package sql
import (
"context"
"fmt"
"testing"
"github.com/stretchr/testify/suite"
dbCommon "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/database/watcher"
garmTesting "github.com/cloudbase/garm/internal/testing"
"github.com/cloudbase/garm/params"
)
type JobsTestSuite struct {
suite.Suite
Store dbCommon.Store
adminCtx context.Context
}
func (s *JobsTestSuite) SetupTest() {
ctx := context.Background()
watcher.InitWatcher(ctx)
// Create testing sqlite database
db, err := NewSQLDatabase(ctx, garmTesting.GetTestSqliteDBConfig(s.T()))
if err != nil {
s.FailNow(fmt.Sprintf("failed to create db connection: %s", err))
}
s.Store = db
adminCtx := garmTesting.ImpersonateAdminContext(ctx, db, s.T())
s.adminCtx = adminCtx
}
func (s *JobsTestSuite) TearDownTest() {
watcher.CloseWatcher()
}
func TestJobsTestSuite(t *testing.T) {
suite.Run(t, new(JobsTestSuite))
}
// TestDeleteInactionableJobs verifies the deletion logic for jobs
func (s *JobsTestSuite) TestDeleteInactionableJobs() {
db := s.Store.(*sqlDatabase)
// Create mix of jobs to test all conditions:
// 1. Queued jobs (should NOT be deleted)
queuedJob := params.Job{
WorkflowJobID: 12345,
RunID: 67890,
Action: "test-action",
Status: string(params.JobStatusQueued),
Name: "queued-job",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err := s.Store.CreateOrUpdateJob(s.adminCtx, queuedJob)
s.Require().NoError(err)
// 2. In-progress job without instance (should be deleted)
inProgressNoInstance := params.Job{
WorkflowJobID: 12346,
RunID: 67890,
Action: "test-action",
Status: string(params.JobStatusInProgress),
Name: "inprogress-no-instance",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err = s.Store.CreateOrUpdateJob(s.adminCtx, inProgressNoInstance)
s.Require().NoError(err)
// 3. Completed job without instance (should be deleted)
completedNoInstance := params.Job{
WorkflowJobID: 12347,
RunID: 67890,
Action: "test-action",
Status: string(params.JobStatusCompleted),
Conclusion: "success",
Name: "completed-no-instance",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err = s.Store.CreateOrUpdateJob(s.adminCtx, completedNoInstance)
s.Require().NoError(err)
// Count total jobs before deletion
var countBefore int64
err = db.conn.Model(&WorkflowJob{}).Count(&countBefore).Error
s.Require().NoError(err)
s.Require().Equal(int64(3), countBefore, "Should have 3 jobs before deletion")
// Run deletion
err = s.Store.DeleteInactionableJobs(s.adminCtx)
s.Require().NoError(err)
// Count remaining jobs - should only have the queued job
var countAfter int64
err = db.conn.Model(&WorkflowJob{}).Count(&countAfter).Error
s.Require().NoError(err)
s.Require().Equal(int64(1), countAfter, "Should have 1 job remaining (queued)")
// Verify the remaining job is the queued one
var remaining WorkflowJob
err = db.conn.Where("workflow_job_id = ?", 12345).First(&remaining).Error
s.Require().NoError(err)
s.Require().Equal("queued", remaining.Status)
}
// TestDeleteInactionableJobs_AllScenarios verifies all deletion rules
func (s *JobsTestSuite) TestDeleteInactionableJobs_AllScenarios() {
db := s.Store.(*sqlDatabase)
// Rule 1: Queued jobs are NEVER deleted (regardless of instance_id)
queuedNoInstance := params.Job{
WorkflowJobID: 20001,
RunID: 67890,
Status: string(params.JobStatusQueued),
Name: "queued-no-instance",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err := s.Store.CreateOrUpdateJob(s.adminCtx, queuedNoInstance)
s.Require().NoError(err)
// Rule 2: Non-queued jobs WITHOUT instance_id ARE deleted
inProgressNoInstance := params.Job{
WorkflowJobID: 20002,
RunID: 67890,
Status: string(params.JobStatusInProgress),
Name: "inprogress-no-instance",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err = s.Store.CreateOrUpdateJob(s.adminCtx, inProgressNoInstance)
s.Require().NoError(err)
completedNoInstance := params.Job{
WorkflowJobID: 20003,
RunID: 67890,
Status: string(params.JobStatusCompleted),
Conclusion: "success",
Name: "completed-no-instance",
RepositoryName: "test-repo",
RepositoryOwner: "test-owner",
}
_, err = s.Store.CreateOrUpdateJob(s.adminCtx, completedNoInstance)
s.Require().NoError(err)
// Count jobs before deletion
var countBefore int64
err = db.conn.Model(&WorkflowJob{}).Count(&countBefore).Error
s.Require().NoError(err)
s.Require().Equal(int64(3), countBefore)
// Run deletion
err = s.Store.DeleteInactionableJobs(s.adminCtx)
s.Require().NoError(err)
// After deletion, only queued job should remain
var countAfter int64
err = db.conn.Model(&WorkflowJob{}).Count(&countAfter).Error
s.Require().NoError(err)
s.Require().Equal(int64(1), countAfter, "Only queued job should remain")
// Verify it's the queued job that remains
var jobs []WorkflowJob
err = db.conn.Find(&jobs).Error
s.Require().NoError(err)
s.Require().Len(jobs, 1)
s.Require().Equal(string(params.JobStatusQueued), jobs[0].Status)
}

View file

@ -400,7 +400,7 @@ type WorkflowJob struct {
Conclusion string
// Status is the phase of the lifecycle that the job is currently in.
// "queued", "in_progress" and "completed".
Status string
Status string `gorm:"index:idx_workflow_jobs_status_instance_id,priority:1"`
// Name is the name if the job that was triggered.
Name string
@ -409,7 +409,7 @@ type WorkflowJob struct {
GithubRunnerID int64
InstanceID *uuid.UUID `gorm:"index:idx_instance_job"`
InstanceID *uuid.UUID `gorm:"index:idx_instance_job;index:idx_workflow_jobs_status_instance_id,priority:2"`
Instance Instance `gorm:"foreignKey:InstanceID"`
RunnerGroupID int64