Merge pull request #106 from gabriel-samfira/reap-failed-agent
Reap failed agent
This commit is contained in:
commit
05168ac994
3 changed files with 203 additions and 153 deletions
12
Makefile
12
Makefile
|
|
@ -6,10 +6,11 @@ USER_ID=$(shell ((docker --version | grep -q podman) && echo "0" || id -u))
|
|||
USER_GROUP=$(shell ((docker --version | grep -q podman) && echo "0" || id -g))
|
||||
ROOTDIR=$(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
GOPATH ?= $(shell go env GOPATH)
|
||||
VERSION ?= $(shell git describe --tags --match='v[0-9]*' --dirty --always)
|
||||
GO ?= go
|
||||
|
||||
|
||||
default: install
|
||||
default: build
|
||||
|
||||
.PHONY : build-static test install-lint-deps lint go-test fmt fmtcheck verify-vendor verify
|
||||
build-static:
|
||||
|
|
@ -18,9 +19,12 @@ build-static:
|
|||
docker run --rm -e USER_ID=$(USER_ID) -e USER_GROUP=$(USER_GROUP) -v $(PWD):/build/garm:z $(IMAGE_TAG) /build-static.sh
|
||||
@echo Binaries are available in $(PWD)/bin
|
||||
|
||||
install:
|
||||
@$(GO) install -tags osusergo,netgo,sqlite_omit_load_extension ./...
|
||||
@echo Binaries available in ${GOPATH}
|
||||
build:
|
||||
@echo Building garm ${VERSION}
|
||||
$(shell mkdir -p ./bin)
|
||||
@$(GO) build -ldflags "-s -w -X main.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm ./cmd/garm
|
||||
@$(GO) build -ldflags "-s -w -X github.com/cloudbase/garm/cmd/garm-cli/cmd.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm-cli ./cmd/garm-cli
|
||||
@echo Binaries are available in $(PWD)/bin
|
||||
|
||||
test: verify go-test
|
||||
|
||||
|
|
|
|||
|
|
@ -36,11 +36,11 @@ if [ -z "$METADATA_URL" ];then
|
|||
echo "no token is available and METADATA_URL is not set"
|
||||
exit 1
|
||||
fi
|
||||
GITHUB_TOKEN=$(curl --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/")
|
||||
GITHUB_TOKEN=$(curl --retry 5 --retry-max-time 5 --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/")
|
||||
|
||||
function call() {
|
||||
PAYLOAD="$1"
|
||||
curl --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)"
|
||||
curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)"
|
||||
}
|
||||
|
||||
function sendStatus() {
|
||||
|
|
@ -63,41 +63,41 @@ function fail() {
|
|||
# This will echo the version number in the filename. Given a file name like: actions-runner-osx-x64-2.299.1.tar.gz
|
||||
# this will output: 2.299.1
|
||||
function getRunnerVersion() {
|
||||
FILENAME="{{ .FileName }}"
|
||||
[[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]]
|
||||
echo $BASH_REMATCH
|
||||
FILENAME="{{ .FileName }}"
|
||||
[[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]]
|
||||
echo $BASH_REMATCH
|
||||
}
|
||||
|
||||
function getCachedToolsPath() {
|
||||
CACHED_RUNNER="/opt/cache/actions-runner/latest"
|
||||
if [ -d "$CACHED_RUNNER" ];then
|
||||
echo "$CACHED_RUNNER"
|
||||
return 0
|
||||
fi
|
||||
CACHED_RUNNER="/opt/cache/actions-runner/latest"
|
||||
if [ -d "$CACHED_RUNNER" ];then
|
||||
echo "$CACHED_RUNNER"
|
||||
return 0
|
||||
fi
|
||||
|
||||
VERSION=$(getRunnerVersion)
|
||||
if [ -z "$VERSION" ]; then
|
||||
return 0
|
||||
fi
|
||||
VERSION=$(getRunnerVersion)
|
||||
if [ -z "$VERSION" ]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
CACHED_RUNNER="/opt/cache/actions-runner/$VERSION"
|
||||
if [ -d "$CACHED_RUNNER" ];then
|
||||
echo "$CACHED_RUNNER"
|
||||
return 0
|
||||
fi
|
||||
return 0
|
||||
CACHED_RUNNER="/opt/cache/actions-runner/$VERSION"
|
||||
if [ -d "$CACHED_RUNNER" ];then
|
||||
echo "$CACHED_RUNNER"
|
||||
return 0
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
function downloadAndExtractRunner() {
|
||||
sendStatus "downloading tools from {{ .DownloadURL }}"
|
||||
if [ ! -z "{{ .TempDownloadToken }}" ]; then
|
||||
sendStatus "downloading tools from {{ .DownloadURL }}"
|
||||
if [ ! -z "{{ .TempDownloadToken }}" ]; then
|
||||
TEMP_TOKEN="Authorization: Bearer {{ .TempDownloadToken }}"
|
||||
fi
|
||||
curl -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools"
|
||||
mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder"
|
||||
sendStatus "extracting runner"
|
||||
tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner"
|
||||
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner"
|
||||
fi
|
||||
curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools"
|
||||
mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder"
|
||||
sendStatus "extracting runner"
|
||||
tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner"
|
||||
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner"
|
||||
}
|
||||
|
||||
TEMP_TOKEN=""
|
||||
|
|
@ -107,31 +107,59 @@ GH_RUNNER_GROUP="{{.GitHubRunnerGroup}}"
|
|||
# if it holds a value, it will be part of the command.
|
||||
RUNNER_GROUP_OPT=""
|
||||
if [ ! -z $GH_RUNNER_GROUP ];then
|
||||
RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP"
|
||||
RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP"
|
||||
fi
|
||||
|
||||
CACHED_RUNNER=$(getCachedToolsPath)
|
||||
if [ -z "$CACHED_RUNNER" ];then
|
||||
downloadAndExtractRunner
|
||||
sendStatus "installing dependencies"
|
||||
cd /home/{{ .RunnerUsername }}/actions-runner
|
||||
sudo ./bin/installdependencies.sh || fail "failed to install dependencies"
|
||||
downloadAndExtractRunner
|
||||
sendStatus "installing dependencies"
|
||||
cd /home/{{ .RunnerUsername }}/actions-runner
|
||||
sudo ./bin/installdependencies.sh || fail "failed to install dependencies"
|
||||
else
|
||||
sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner"
|
||||
cd /home/{{ .RunnerUsername }}/actions-runner
|
||||
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner"
|
||||
sendStatus "using cached runner found in $CACHED_RUNNER"
|
||||
sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner"
|
||||
cd /home/{{ .RunnerUsername }}/actions-runner
|
||||
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner"
|
||||
fi
|
||||
|
||||
|
||||
sendStatus "configuring runner"
|
||||
sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral || fail "failed to configure runner"
|
||||
set +e
|
||||
attempt=1
|
||||
while true; do
|
||||
ERROUT=$(mktemp)
|
||||
sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral 2>$ERROUT
|
||||
if [ $? -eq 0 ]; then
|
||||
rm $ERROUT || true
|
||||
sendStatus "runner successfully configured after $attempt attempt(s)"
|
||||
break
|
||||
fi
|
||||
LAST_ERR=$(cat $ERROUT)
|
||||
echo "$LAST_ERR"
|
||||
|
||||
# if the runner is already configured, remove it and try again. In the past configuring a runner
|
||||
# managed to register it but timed out later, resulting in an error.
|
||||
sudo -u {{ .RunnerUsername }} -- ./config.sh remove --token "$GITHUB_TOKEN" || true
|
||||
|
||||
if [ $attempt -gt 5 ];then
|
||||
rm $ERROUT || true
|
||||
fail "failed to configure runner: $LAST_ERR"
|
||||
fi
|
||||
|
||||
sendStatus "failed to configure runner (attempt $attempt): $LAST_ERR (retrying in 5 seconds)"
|
||||
attempt=$((attempt+1))
|
||||
rm $ERROUT || true
|
||||
sleep 5
|
||||
done
|
||||
set -e
|
||||
|
||||
sendStatus "installing runner service"
|
||||
./svc.sh install {{ .RunnerUsername }} || fail "failed to install service"
|
||||
|
||||
if [ -e "/sys/fs/selinux" ];then
|
||||
sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context"
|
||||
sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context"
|
||||
sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context"
|
||||
sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context"
|
||||
fi
|
||||
|
||||
sendStatus "starting service"
|
||||
|
|
@ -156,105 +184,105 @@ Param(
|
|||
$ErrorActionPreference="Stop"
|
||||
|
||||
function Invoke-FastWebRequest {
|
||||
[CmdletBinding()]
|
||||
Param(
|
||||
[Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)]
|
||||
[System.Uri]$Uri,
|
||||
[Parameter(Position=1)]
|
||||
[string]$OutFile,
|
||||
[Hashtable]$Headers=@{},
|
||||
[switch]$SkipIntegrityCheck=$false
|
||||
)
|
||||
PROCESS
|
||||
{
|
||||
if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type)
|
||||
{
|
||||
$assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http")
|
||||
}
|
||||
[CmdletBinding()]
|
||||
Param(
|
||||
[Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)]
|
||||
[System.Uri]$Uri,
|
||||
[Parameter(Position=1)]
|
||||
[string]$OutFile,
|
||||
[Hashtable]$Headers=@{},
|
||||
[switch]$SkipIntegrityCheck=$false
|
||||
)
|
||||
PROCESS
|
||||
{
|
||||
if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type)
|
||||
{
|
||||
$assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http")
|
||||
}
|
||||
|
||||
if(!$OutFile) {
|
||||
$OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1)
|
||||
if(!$OutFile) {
|
||||
throw "The ""OutFile"" parameter needs to be specified"
|
||||
}
|
||||
}
|
||||
if(!$OutFile) {
|
||||
$OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1)
|
||||
if(!$OutFile) {
|
||||
throw "The ""OutFile"" parameter needs to be specified"
|
||||
}
|
||||
}
|
||||
|
||||
$fragment = $Uri.Fragment.Trim('#')
|
||||
if ($fragment) {
|
||||
$details = $fragment.Split("=")
|
||||
$algorithm = $details[0]
|
||||
$hash = $details[1]
|
||||
}
|
||||
$fragment = $Uri.Fragment.Trim('#')
|
||||
if ($fragment) {
|
||||
$details = $fragment.Split("=")
|
||||
$algorithm = $details[0]
|
||||
$hash = $details[1]
|
||||
}
|
||||
|
||||
if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) {
|
||||
try {
|
||||
return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash)
|
||||
} catch {
|
||||
Remove-Item $OutFile
|
||||
}
|
||||
}
|
||||
if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) {
|
||||
try {
|
||||
return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash)
|
||||
} catch {
|
||||
Remove-Item $OutFile
|
||||
}
|
||||
}
|
||||
|
||||
$client = new-object System.Net.Http.HttpClient
|
||||
foreach ($k in $Headers.Keys){
|
||||
$client.DefaultRequestHeaders.Add($k, $Headers[$k])
|
||||
}
|
||||
$task = $client.GetStreamAsync($Uri)
|
||||
$response = $task.Result
|
||||
if($task.IsFaulted) {
|
||||
$msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status)
|
||||
if($task.Exception) {
|
||||
$msg += "Exception details: {0}" -f @($task.Exception)
|
||||
}
|
||||
Throw $msg
|
||||
}
|
||||
$outStream = New-Object IO.FileStream $OutFile, Create, Write, None
|
||||
$client = new-object System.Net.Http.HttpClient
|
||||
foreach ($k in $Headers.Keys){
|
||||
$client.DefaultRequestHeaders.Add($k, $Headers[$k])
|
||||
}
|
||||
$task = $client.GetStreamAsync($Uri)
|
||||
$response = $task.Result
|
||||
if($task.IsFaulted) {
|
||||
$msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status)
|
||||
if($task.Exception) {
|
||||
$msg += "Exception details: {0}" -f @($task.Exception)
|
||||
}
|
||||
Throw $msg
|
||||
}
|
||||
$outStream = New-Object IO.FileStream $OutFile, Create, Write, None
|
||||
|
||||
try {
|
||||
$totRead = 0
|
||||
$buffer = New-Object Byte[] 1MB
|
||||
while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) {
|
||||
$totRead += $read
|
||||
$outStream.Write($buffer, 0, $read);
|
||||
}
|
||||
}
|
||||
finally {
|
||||
$outStream.Close()
|
||||
}
|
||||
if(!$SkipIntegrityCheck -and $fragment) {
|
||||
Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash
|
||||
}
|
||||
}
|
||||
try {
|
||||
$totRead = 0
|
||||
$buffer = New-Object Byte[] 1MB
|
||||
while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) {
|
||||
$totRead += $read
|
||||
$outStream.Write($buffer, 0, $read);
|
||||
}
|
||||
}
|
||||
finally {
|
||||
$outStream.Close()
|
||||
}
|
||||
if(!$SkipIntegrityCheck -and $fragment) {
|
||||
Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function Import-Certificate() {
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CertificatePath,
|
||||
[parameter(Mandatory=$true)]
|
||||
[System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine",
|
||||
[parameter(Mandatory=$true)]
|
||||
[System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher"
|
||||
)
|
||||
PROCESS
|
||||
{
|
||||
$store = New-Object System.Security.Cryptography.X509Certificates.X509Store(
|
||||
$StoreName, $StoreLocation)
|
||||
$store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite)
|
||||
$cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
|
||||
$CertificatePath)
|
||||
$store.Add($cert)
|
||||
}
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CertificatePath,
|
||||
[parameter(Mandatory=$true)]
|
||||
[System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine",
|
||||
[parameter(Mandatory=$true)]
|
||||
[System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher"
|
||||
)
|
||||
PROCESS
|
||||
{
|
||||
$store = New-Object System.Security.Cryptography.X509Certificates.X509Store(
|
||||
$StoreName, $StoreLocation)
|
||||
$store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite)
|
||||
$cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
|
||||
$CertificatePath)
|
||||
$store.Add($cert)
|
||||
}
|
||||
}
|
||||
|
||||
function Invoke-APICall() {
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[object]$Payload,
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[object]$Payload,
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CallbackURL
|
||||
)
|
||||
)
|
||||
PROCESS{
|
||||
Invoke-WebRequest -UseBasicParsing -Method Post -Headers @{"Accept"="application/json"; "Authorization"="Bearer $Token"} -Uri $CallbackURL -Body (ConvertTo-Json $Payload) | Out-Null
|
||||
}
|
||||
|
|
@ -262,12 +290,12 @@ function Invoke-APICall() {
|
|||
|
||||
function Update-GarmStatus() {
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$Message,
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$Message,
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CallbackURL
|
||||
)
|
||||
)
|
||||
PROCESS{
|
||||
$body = @{
|
||||
"status"="installing"
|
||||
|
|
@ -279,14 +307,14 @@ function Update-GarmStatus() {
|
|||
|
||||
function Invoke-GarmSuccess() {
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$Message,
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[int64]$AgentID,
|
||||
[string]$Message,
|
||||
[parameter(Mandatory=$true)]
|
||||
[int64]$AgentID,
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CallbackURL
|
||||
)
|
||||
)
|
||||
PROCESS{
|
||||
$body = @{
|
||||
"status"="idle"
|
||||
|
|
@ -299,12 +327,12 @@ function Invoke-GarmSuccess() {
|
|||
|
||||
function Invoke-GarmFailure() {
|
||||
[CmdletBinding()]
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$Message,
|
||||
param (
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$Message,
|
||||
[parameter(Mandatory=$true)]
|
||||
[string]$CallbackURL
|
||||
)
|
||||
)
|
||||
PROCESS{
|
||||
$body = @{
|
||||
"status"="failed"
|
||||
|
|
|
|||
|
|
@ -365,26 +365,44 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
|
|||
return errors.Wrap(err, "fetching instances from db")
|
||||
}
|
||||
|
||||
runnerNames := map[string]bool{}
|
||||
runnersByName := map[string]*github.Runner{}
|
||||
for _, run := range runners {
|
||||
if !r.isManagedRunner(labelsFromRunner(run)) {
|
||||
log.Printf("runner %s is not managed by a pool belonging to %s", *run.Name, r.helper.String())
|
||||
continue
|
||||
}
|
||||
runnerNames[*run.Name] = true
|
||||
runnersByName[*run.Name] = run
|
||||
}
|
||||
|
||||
for _, instance := range dbInstances {
|
||||
if ok := runnerNames[instance.Name]; !ok {
|
||||
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instance pool info")
|
||||
}
|
||||
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
|
||||
continue
|
||||
}
|
||||
log.Printf("reaping instance %s due to timeout", instance.Name)
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instance pool info")
|
||||
}
|
||||
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
|
||||
continue
|
||||
}
|
||||
|
||||
// There are 2 cases (currently) where we consider a runner as timed out:
|
||||
// * The runner never joined github within the pool timeout
|
||||
// * The runner managed to join github, but the setup process failed later and the runner
|
||||
// never started on the instance.
|
||||
//
|
||||
// There are several steps in the user data that sets up the runner:
|
||||
// * Download and unarchive the runner from github (or used the cached version)
|
||||
// * Configure runner (connects to github). At this point the runner is seen as offline.
|
||||
// * Install the service
|
||||
// * Set SELinux context (if SELinux is enabled)
|
||||
// * Start the service (if successful, the runner will transition to "online")
|
||||
// * Get the runner ID
|
||||
//
|
||||
// If we fail getting the runner ID after it's started, garm will set the runner status to "failed",
|
||||
// even though, technically the runner is online and fully functional. This is why we check here for
|
||||
// both the runner status as reported by GitHub and the runner status as reported by the provider.
|
||||
// If the runner is "offline" and marked as "failed", it should be safe to reap it.
|
||||
if runner, ok := runnersByName[instance.Name]; !ok || (runner.GetStatus() == "offline" && instance.RunnerStatus == providerCommon.RunnerFailed) {
|
||||
log.Printf("reaping timed-out/failed runner %s", instance.Name)
|
||||
if err := r.ForceDeleteRunner(instance); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue