Add workflow and scaleset job ID to metrics and fixes
This change adds workflow job ID, scaleset job ID and workflow run ID to the metrics. This change also attempts to fix how jobs are recorded when a workflow is posted by a webhook, but the job is handled by a scale set. Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
parent
c8844e543c
commit
3ceb2f7ebb
4 changed files with 53 additions and 13 deletions
|
|
@ -63,7 +63,7 @@ func formatJobs(jobs []params.Job) {
|
|||
return
|
||||
}
|
||||
t := table.NewWriter()
|
||||
header := table.Row{"ID", "Name", "Status", "Conclusion", "Runner Name", "Repository", "Requested Labels", "Locked by"}
|
||||
header := table.Row{"Workflow Job ID", "Scale Set Job ID", "Name", "Status", "Conclusion", "Runner Name", "Repository", "Requested Labels", "Locked by"}
|
||||
t.AppendHeader(header)
|
||||
|
||||
for _, job := range jobs {
|
||||
|
|
@ -72,7 +72,7 @@ func formatJobs(jobs []params.Job) {
|
|||
if job.LockedBy != uuid.Nil {
|
||||
lockedBy = job.LockedBy.String()
|
||||
}
|
||||
t.AppendRow(table.Row{job.ID, job.Name, job.Status, job.Conclusion, job.RunnerName, repo, strings.Join(job.Labels, " "), lockedBy})
|
||||
t.AppendRow(table.Row{job.WorkflowJobID, job.ScaleSetJobID, job.Name, job.Status, job.Conclusion, job.RunnerName, repo, strings.Join(job.Labels, " "), lockedBy})
|
||||
t.AppendSeparator()
|
||||
}
|
||||
fmt.Println(t.Render())
|
||||
|
|
|
|||
|
|
@ -22,5 +22,16 @@ var JobStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|||
Namespace: metricsNamespace,
|
||||
Subsystem: metricsJobsSubsystem,
|
||||
Name: "status",
|
||||
Help: "List of jobs and their status",
|
||||
}, []string{"job_id", "name", "status", "conclusion", "runner_name", "repository", "requested_labels"})
|
||||
Help: "List of workflow jobs and their status",
|
||||
}, []string{
|
||||
"job_id",
|
||||
"workflow_job_id",
|
||||
"scaleset_job_id",
|
||||
"workflow_run_id",
|
||||
"name",
|
||||
"status",
|
||||
"conclusion",
|
||||
"runner_name",
|
||||
"repository",
|
||||
"requested_labels",
|
||||
})
|
||||
|
|
|
|||
|
|
@ -35,13 +35,16 @@ func CollectJobMetric(ctx context.Context, r *runner.Runner) error {
|
|||
|
||||
for _, job := range jobs {
|
||||
metrics.JobStatus.WithLabelValues(
|
||||
fmt.Sprintf("%d", job.ID), // label: job_id
|
||||
job.Name, // label: name
|
||||
job.Status, // label: status
|
||||
job.Conclusion, // label: conclusion
|
||||
job.RunnerName, // label: runner_name
|
||||
job.RepositoryName, // label: repository
|
||||
strings.Join(job.Labels, " "), // label: requested_labels
|
||||
fmt.Sprintf("%d", job.ID), // label: job_id
|
||||
fmt.Sprintf("%d", job.WorkflowJobID), // label: workflow_job_id
|
||||
job.ScaleSetJobID, // label: scaleset_job_id
|
||||
fmt.Sprintf("%d", job.WorkflowJobID), // label: scaleset_job_id
|
||||
job.Name, // label: name
|
||||
job.Status, // label: status
|
||||
job.Conclusion, // label: conclusion
|
||||
job.RunnerName, // label: runner_name
|
||||
job.RepositoryName, // label: repository
|
||||
strings.Join(job.Labels, " "), // label: requested_labels
|
||||
).Set(1)
|
||||
}
|
||||
return nil
|
||||
|
|
|
|||
|
|
@ -330,6 +330,11 @@ func (r *basePoolManager) handleCompletedJob(jobParams params.Job) error {
|
|||
|
||||
// handleInProgressJob processes an in-progress job webhook
|
||||
func (r *basePoolManager) handleInProgressJob(jobParams params.Job) (triggeredBy int64, err error) {
|
||||
if jobParams.RunnerName == "" {
|
||||
slog.DebugContext(r.ctx, "instance not found in job data", "workflow_job_id", jobParams.ID)
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// Mark runner as active (this also validates the instance exists)
|
||||
instance, err := r.setInstanceRunnerStatus(jobParams.RunnerName, params.RunnerActive)
|
||||
if err != nil {
|
||||
|
|
@ -393,6 +398,27 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error {
|
|||
return fmt.Errorf("error converting job to params: %w", err)
|
||||
}
|
||||
|
||||
// For in_progress/completed jobs, check if the runner belongs to a scale set.
|
||||
// Scale set jobs are handled by the scale set listener, not by webhooks.
|
||||
if job.Action == "in_progress" || job.Action == "completed" {
|
||||
if jobParams.RunnerName != "" {
|
||||
instance, err := r.store.GetInstance(r.ctx, jobParams.RunnerName)
|
||||
if err == nil && instance.ScaleSetID != 0 {
|
||||
slog.DebugContext(r.ctx, "job belongs to a scale set instance, skipping webhook processing",
|
||||
"runner_name", util.SanitizeLogEntry(jobParams.RunnerName),
|
||||
"scale_set_id", instance.ScaleSetID)
|
||||
// Clean up any orphaned queued job that may have been recorded
|
||||
// via webhook before we knew it belonged to a scale set.
|
||||
if err := r.store.DeleteJob(r.ctx, jobParams.WorkflowJobID); err != nil && !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
slog.With(slog.Any("error", err)).ErrorContext(
|
||||
r.ctx, "failed to delete orphaned webhook job for scale set instance",
|
||||
"job_id", jobParams.WorkflowJobID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process job based on action type
|
||||
var triggeredBy int64
|
||||
var actionErr error
|
||||
|
|
@ -400,10 +426,10 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error {
|
|||
switch job.Action {
|
||||
case "queued":
|
||||
// Queued jobs are just recorded; they'll be picked up by consumeQueuedJobs()
|
||||
case "completed":
|
||||
actionErr = r.handleCompletedJob(jobParams)
|
||||
case "in_progress":
|
||||
triggeredBy, actionErr = r.handleInProgressJob(jobParams)
|
||||
case "completed":
|
||||
actionErr = r.handleCompletedJob(jobParams)
|
||||
}
|
||||
|
||||
// Always persist job to database (success or failure)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue