fix: race condition in matrix job result state may result in failed jobs being marked as successful
In `setJobResult` there is no coordination between multiple jobs that
are completing, leading to a possible condition where `jobResult` can be
read from the matrix job as `""` by one-or-more jobs, marked as
`"failed"` by one-or-more jobs, and then marked as `"success"` by other
jobs.
```
==================
WARNING: DATA RACE
Read at 0x00c0006d08a0 by goroutine 29232:
code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult()
/.../forgejo-runner/act/runner/job_executor.go:173
+0x359
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6()
/.../forgejo-runner/act/runner/job_executor.go:118
+0x15d
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14()
/.../forgejo-runner/act/common/executor.go:183 +0x86
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7()
/.../forgejo-runner/act/runner/job_executor.go:161
+0x191
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16()
/.../forgejo-runner/act/common/executor.go:183 +0x86
...
Previous write at 0x00c0006d08a0 by goroutine 29234:
code.forgejo.org/forgejo/runner/v9/act/runner.(*RunContext).result()
/.../forgejo-runner/act/runner/run_context.go:897
+0x271
code.forgejo.org/forgejo/runner/v9/act/runner.setJobResult()
/.../forgejo-runner/act/runner/job_executor.go:181
+0x66e
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func6()
/.../forgejo-runner/act/runner/job_executor.go:118
+0x15d
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func14()
/.../forgejo-runner/act/common/executor.go:183 +0x86
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.func7()
/.../forgejo-runner/act/runner/job_executor.go:161
+0x191
code.forgejo.org/forgejo/runner/v9/act/runner.newJobExecutor.Executor.Finally.func16()
/.../forgejo-runner/act/common/executor.go:183 +0x86
...
==================
```
This commit is contained in:
parent
03fbeab01b
commit
c55251bbd5
3 changed files with 69 additions and 0 deletions
|
|
@ -162,6 +162,11 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
|
|||
func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success bool) {
|
||||
logger := common.Logger(ctx)
|
||||
|
||||
// As we're reading the matrix build's status (`rc.Run.Job().Result`), it's possible for it change in another
|
||||
// goroutine running `setJobResult` and invoking `.result(...)`. Prevent concurrent execution of `setJobResult`...
|
||||
rc.JobResultMutex.Lock()
|
||||
defer rc.JobResultMutex.Unlock()
|
||||
|
||||
jobResult := "success"
|
||||
// we have only one result for a whole matrix build, so we need
|
||||
// to keep an existing result state if we run a matrix
|
||||
|
|
|
|||
|
|
@ -5,7 +5,9 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"slices"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"code.forgejo.org/forgejo/runner/v9/act/common"
|
||||
"code.forgejo.org/forgejo/runner/v9/act/container"
|
||||
|
|
@ -333,3 +335,62 @@ func TestJobExecutorNewJobExecutor(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetJobResultConcurrency(t *testing.T) {
|
||||
jim := &jobInfoMock{}
|
||||
rc := &RunContext{
|
||||
Run: &model.Run{
|
||||
JobID: "test",
|
||||
Workflow: &model.Workflow{
|
||||
Jobs: map[string]*model.Job{
|
||||
"test": {
|
||||
Result: "success",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
jim.On("matrix").Return(map[string]interface{}{
|
||||
"python": []string{"3.10", "3.11", "3.12"},
|
||||
})
|
||||
|
||||
// Synthesize a race condition in setJobResult where, by reading data from the job matrix earlier and then
|
||||
// performing unsynchronzied writes to the same shared data structure, it can overwrite a failure status.
|
||||
//
|
||||
// Goroutine 1: Start marking job as success
|
||||
// (artificially suspended
|
||||
// by result() mock)
|
||||
// Goroutine 2: Mark job as failure
|
||||
// Goroutine 1: Finish marking job as success
|
||||
//
|
||||
// Correct behavior: Job is marked as a failure
|
||||
// Bug behavior: Job is marked as a success
|
||||
|
||||
var lastResult string
|
||||
jim.On("result", mock.Anything).Run(func(args mock.Arguments) {
|
||||
result := args.String(0)
|
||||
// Artificially suspend the "success" case so that the failure case races past it.
|
||||
if result == "success" {
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
rc.Run.Job().Result = result
|
||||
lastResult = result
|
||||
})
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
// Goroutine 1, mark as success:
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
setJobResult(t.Context(), jim, rc, true)
|
||||
}()
|
||||
// Goroutine 2, mark as failure:
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
setJobResult(t.Context(), jim, rc, false)
|
||||
}()
|
||||
wg.Wait()
|
||||
|
||||
assert.Equal(t, "failure", lastResult)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ import (
|
|||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
|
|
@ -56,6 +57,8 @@ type RunContext struct {
|
|||
randomName string
|
||||
networkName string
|
||||
networkCreated bool
|
||||
|
||||
JobResultMutex sync.Mutex
|
||||
}
|
||||
|
||||
func (rc *RunContext) AddMask(mask string) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue