From 646a0d761f5d544c109c8a61b4ff95a98bf947d2 Mon Sep 17 00:00:00 2001
From: Mathieu Fenniak
Date: Sun, 3 Aug 2025 07:01:46 +0000
Subject: [PATCH] fix: longer timeouts on job cleanup without waiting (v2)
 (#798)

Probably fixes #431 (see investigation notes:
https://code.forgejo.org/forgejo/runner/issues/431#issuecomment-50810)
by extending the timeout for all cleanup actions from 1 minute to 30
minutes, and by running cleanup only after the task status has been
reported back to Forgejo.

Cleanup logging now goes to the runner log rather than the job log, and
has been tweaked to be less verbose while carrying more useful
identifying information about the job in case of error.

Manual testing confirms that Forgejo receives the success/failure
status of the task before cleanup begins, that no further logging
reaches the task at that point, and that the longer cleanup timeout is
applied as expected.

This is an alternate, simplified version of #779.

- bug fixes
  - [PR](https://code.forgejo.org/forgejo/runner/pulls/798): fix: longer timeouts on job cleanup without waiting (v2)

Reviewed-on: https://code.forgejo.org/forgejo/runner/pulls/798
Reviewed-by: earl-warren
Co-authored-by: Mathieu Fenniak
Co-committed-by: Mathieu Fenniak
---
 act/runner/job_executor.go | 26 ++++++++++++++++----------
 act/runner/run_context.go  |  4 ++++
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go
index c3bcb4b8..f9ab36a0 100644
--- a/act/runner/job_executor.go
+++ b/act/runner/job_executor.go
@@ -20,6 +20,8 @@ type jobInfo interface {
 	result(result string)
 }
 
+const cleanupTimeout = 30 * time.Minute
+
 func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executor {
 	steps := make([]common.Executor, 0)
 	preSteps := make([]common.Executor, 0)
@@ -109,16 +111,23 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
 	postExecutor = postExecutor.Finally(func(ctx context.Context) error {
 		jobError := common.JobError(ctx)
+
+		// Fresh context to ensure job result output works even if prev. context was a cancelled job
+		ctx, cancel := context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), time.Minute)
+		defer cancel()
+
+		setJobResult(ctx, info, rc, jobError == nil)
+		setJobOutputs(ctx, rc)
+
 		var err error
 		if rc.Config.AutoRemove || jobError == nil {
-			// always allow 1 min for stopping and removing the runner, even if we were cancelled
-			ctx, cancel := context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), time.Minute)
+			// Separate timeout for cleanup tasks; logger is cleared so that cleanup logs go to runner, not job
+			ctx, cancel := context.WithTimeout(context.Background(), cleanupTimeout)
 			defer cancel()
 
 			logger := common.Logger(ctx)
-			logger.Infof("Cleaning up container for job %s", rc.JobName)
+			logger.Debugf("Cleaning up container for job %s", rc.jobContainerName())
 			if err = info.stopContainer()(ctx); err != nil {
-				logger.Errorf("Error while stop job container: %v", err)
+				logger.Errorf("Error while stopping job container %s: %v", rc.jobContainerName(), err)
 			}
 
 			if !rc.IsHostEnv(ctx) && rc.Config.ContainerNetworkMode == "" {
@@ -127,15 +136,12 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
 				// clean network in docker mode only
 				// if the value of `ContainerNetworkMode` is empty string,
 				// it means that the network to which containers are connecting is created by `act_runner`,
 				// so, we should remove the network at last.
 				networkName, _ := rc.networkName()
-				logger.Infof("Cleaning up network for job %s, and network name is: %s", rc.JobName, networkName)
+				logger.Debugf("Cleaning up network %s for job %s", networkName, rc.jobContainerName())
 				if err := container.NewDockerNetworkRemoveExecutor(networkName)(ctx); err != nil {
-					logger.Errorf("Error while cleaning network: %v", err)
+					logger.Errorf("Error while cleaning network %s: %v", networkName, err)
 				}
 			}
 		}
 
-		setJobResult(ctx, info, rc, jobError == nil)
-		setJobOutputs(ctx, rc)
-
 		return err
 	})
@@ -149,7 +155,7 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
 		if ctx.Err() == context.Canceled {
 			// in case of an aborted run, we still should execute the
 			// post steps to allow cleanup.
-			ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute)
+			ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), cleanupTimeout)
 			defer cancel()
 		}
 		return postExecutor(ctx)
diff --git a/act/runner/run_context.go b/act/runner/run_context.go
index 52b742b0..d1fc1cd8 100644
--- a/act/runner/run_context.go
+++ b/act/runner/run_context.go
@@ -515,6 +515,10 @@ func (rc *RunContext) prepareJobContainer(ctx context.Context) error {
 	}
 
 	rc.cleanUpJobContainer = func(ctx context.Context) error {
+		// reinit logger from ctx since cleanUpJobContainer is called after the job is complete, and using
+		// prepareJobContainer's logger could cause logs to continue to be appended to the finished job
+		logger := common.Logger(ctx)
+
 		reuseJobContainer := func(ctx context.Context) bool {
 			return rc.Config.ReuseContainers
 		}
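
For readers outside this codebase, the ordering the patch establishes (report the result first on a fresh context, then clean up under a separate, longer timeout) can be sketched with plain `context` plumbing. This is a minimal illustration, not the runner's actual API: `reportResult` and `stopContainer` are hypothetical stand-ins for the status-report and container-cleanup steps, and the durations mirror the patch's one-minute result window and 30-minute `cleanupTimeout`.

```go
package main

import (
	"context"
	"log"
	"time"
)

// reportResult and stopContainer are hypothetical stand-ins for the
// runner's real status-report and container-cleanup steps.
func reportResult(ctx context.Context, success bool) error {
	log.Printf("reporting success=%v", success)
	return ctx.Err()
}

func stopContainer(ctx context.Context) error {
	log.Print("stopping job container")
	return ctx.Err()
}

// finishJob mirrors the ordering the patch establishes: report the job
// result first, then run cleanup under its own, much longer timeout.
func finishJob(jobErr error) error {
	// A cancelled job context must not prevent Forgejo from learning the
	// outcome, so the report runs on a context derived from Background().
	reportCtx, cancelReport := context.WithTimeout(context.Background(), time.Minute)
	defer cancelReport()
	if err := reportResult(reportCtx, jobErr == nil); err != nil {
		log.Printf("result report failed: %v", err)
	}

	// Cleanup only starts after the result is reported, with a 30-minute
	// budget (the patch's cleanupTimeout) so a slow container stop can no
	// longer race the status update.
	cleanupCtx, cancelCleanup := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancelCleanup()
	return stopContainer(cleanupCtx)
}

func main() {
	if err := finishJob(nil); err != nil {
		log.Printf("cleanup failed: %v", err)
	}
}
```

Deriving both contexts from `context.Background()` is the heart of the fix: a cancelled job context can no longer abort the status report, and because the patch also drops the job logger when building the cleanup context, cleanup output lands in the runner log instead of being appended to a job that Forgejo already considers finished.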