diff --git a/Makefile b/Makefile index 72fd651b..fbd2dbf2 100644 --- a/Makefile +++ b/Makefile @@ -6,10 +6,11 @@ USER_ID=$(shell ((docker --version | grep -q podman) && echo "0" || id -u)) USER_GROUP=$(shell ((docker --version | grep -q podman) && echo "0" || id -g)) ROOTDIR=$(dir $(abspath $(lastword $(MAKEFILE_LIST)))) GOPATH ?= $(shell go env GOPATH) +VERSION ?= $(shell git describe --tags --match='v[0-9]*' --dirty --always) GO ?= go -default: install +default: build .PHONY : build-static test install-lint-deps lint go-test fmt fmtcheck verify-vendor verify build-static: @@ -18,9 +19,12 @@ build-static: docker run --rm -e USER_ID=$(USER_ID) -e USER_GROUP=$(USER_GROUP) -v $(PWD):/build/garm:z $(IMAGE_TAG) /build-static.sh @echo Binaries are available in $(PWD)/bin -install: - @$(GO) install -tags osusergo,netgo,sqlite_omit_load_extension ./... - @echo Binaries available in ${GOPATH} +build: + @echo Building garm ${VERSION} + $(shell mkdir -p ./bin) + @$(GO) build -ldflags "-s -w -X main.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm ./cmd/garm + @$(GO) build -ldflags "-s -w -X github.com/cloudbase/garm/cmd/garm-cli/cmd.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm-cli ./cmd/garm-cli + @echo Binaries are available in $(PWD)/bin test: verify go-test diff --git a/cloudconfig/templates.go b/cloudconfig/templates.go index e088d2cd..e7c7e7db 100644 --- a/cloudconfig/templates.go +++ b/cloudconfig/templates.go @@ -36,11 +36,11 @@ if [ -z "$METADATA_URL" ];then echo "no token is available and METADATA_URL is not set" exit 1 fi -GITHUB_TOKEN=$(curl --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/") +GITHUB_TOKEN=$(curl --retry 5 --retry-max-time 5 --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/") function call() { PAYLOAD="$1" - curl --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)" + curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)" } function sendStatus() { @@ -63,41 +63,41 @@ function fail() { # This will echo the version number in the filename. Given a file name like: actions-runner-osx-x64-2.299.1.tar.gz # this will output: 2.299.1 function getRunnerVersion() { - FILENAME="{{ .FileName }}" - [[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]] - echo $BASH_REMATCH + FILENAME="{{ .FileName }}" + [[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]] + echo $BASH_REMATCH } function getCachedToolsPath() { - CACHED_RUNNER="/opt/cache/actions-runner/latest" - if [ -d "$CACHED_RUNNER" ];then - echo "$CACHED_RUNNER" - return 0 - fi + CACHED_RUNNER="/opt/cache/actions-runner/latest" + if [ -d "$CACHED_RUNNER" ];then + echo "$CACHED_RUNNER" + return 0 + fi - VERSION=$(getRunnerVersion) - if [ -z "$VERSION" ]; then - return 0 - fi + VERSION=$(getRunnerVersion) + if [ -z "$VERSION" ]; then + return 0 + fi - CACHED_RUNNER="/opt/cache/actions-runner/$VERSION" - if [ -d "$CACHED_RUNNER" ];then - echo "$CACHED_RUNNER" - return 0 - fi - return 0 + CACHED_RUNNER="/opt/cache/actions-runner/$VERSION" + if [ -d "$CACHED_RUNNER" ];then + echo "$CACHED_RUNNER" + return 0 + fi + return 0 } function downloadAndExtractRunner() { - sendStatus "downloading tools from {{ .DownloadURL }}" - if [ ! -z "{{ .TempDownloadToken }}" ]; then + sendStatus "downloading tools from {{ .DownloadURL }}" + if [ ! -z "{{ .TempDownloadToken }}" ]; then TEMP_TOKEN="Authorization: Bearer {{ .TempDownloadToken }}" - fi - curl -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools" - mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder" - sendStatus "extracting runner" - tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner" - chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner" + fi + curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools" + mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder" + sendStatus "extracting runner" + tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner" + chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner" } TEMP_TOKEN="" @@ -107,31 +107,59 @@ GH_RUNNER_GROUP="{{.GitHubRunnerGroup}}" # if it holds a value, it will be part of the command. RUNNER_GROUP_OPT="" if [ ! -z $GH_RUNNER_GROUP ];then - RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP" + RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP" fi CACHED_RUNNER=$(getCachedToolsPath) if [ -z "$CACHED_RUNNER" ];then - downloadAndExtractRunner - sendStatus "installing dependencies" - cd /home/{{ .RunnerUsername }}/actions-runner - sudo ./bin/installdependencies.sh || fail "failed to install dependencies" + downloadAndExtractRunner + sendStatus "installing dependencies" + cd /home/{{ .RunnerUsername }}/actions-runner + sudo ./bin/installdependencies.sh || fail "failed to install dependencies" else - sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner" - cd /home/{{ .RunnerUsername }}/actions-runner - chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner" + sendStatus "using cached runner found in $CACHED_RUNNER" + sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner" + cd /home/{{ .RunnerUsername }}/actions-runner + chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner" fi sendStatus "configuring runner" -sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral || fail "failed to configure runner" +set +e +attempt=1 +while true; do + ERROUT=$(mktemp) + sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral 2>$ERROUT + if [ $? -eq 0 ]; then + rm $ERROUT || true + sendStatus "runner successfully configured after $attempt attempt(s)" + break + fi + LAST_ERR=$(cat $ERROUT) + echo "$LAST_ERR" + + # if the runner is already configured, remove it and try again. In the past configuring a runner + # managed to register it but timed out later, resulting in an error. + sudo -u {{ .RunnerUsername }} -- ./config.sh remove --token "$GITHUB_TOKEN" || true + + if [ $attempt -gt 5 ];then + rm $ERROUT || true + fail "failed to configure runner: $LAST_ERR" + fi + + sendStatus "failed to configure runner (attempt $attempt): $LAST_ERR (retrying in 5 seconds)" + attempt=$((attempt+1)) + rm $ERROUT || true + sleep 5 +done +set -e sendStatus "installing runner service" ./svc.sh install {{ .RunnerUsername }} || fail "failed to install service" if [ -e "/sys/fs/selinux" ];then - sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context" - sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context" + sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context" + sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context" fi sendStatus "starting service" @@ -156,105 +184,105 @@ Param( $ErrorActionPreference="Stop" function Invoke-FastWebRequest { - [CmdletBinding()] - Param( - [Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)] - [System.Uri]$Uri, - [Parameter(Position=1)] - [string]$OutFile, - [Hashtable]$Headers=@{}, - [switch]$SkipIntegrityCheck=$false - ) - PROCESS - { - if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type) - { - $assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http") - } + [CmdletBinding()] + Param( + [Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)] + [System.Uri]$Uri, + [Parameter(Position=1)] + [string]$OutFile, + [Hashtable]$Headers=@{}, + [switch]$SkipIntegrityCheck=$false + ) + PROCESS + { + if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type) + { + $assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http") + } - if(!$OutFile) { - $OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1) - if(!$OutFile) { - throw "The ""OutFile"" parameter needs to be specified" - } - } + if(!$OutFile) { + $OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1) + if(!$OutFile) { + throw "The ""OutFile"" parameter needs to be specified" + } + } - $fragment = $Uri.Fragment.Trim('#') - if ($fragment) { - $details = $fragment.Split("=") - $algorithm = $details[0] - $hash = $details[1] - } + $fragment = $Uri.Fragment.Trim('#') + if ($fragment) { + $details = $fragment.Split("=") + $algorithm = $details[0] + $hash = $details[1] + } - if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) { - try { - return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash) - } catch { - Remove-Item $OutFile - } - } + if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) { + try { + return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash) + } catch { + Remove-Item $OutFile + } + } - $client = new-object System.Net.Http.HttpClient - foreach ($k in $Headers.Keys){ - $client.DefaultRequestHeaders.Add($k, $Headers[$k]) - } - $task = $client.GetStreamAsync($Uri) - $response = $task.Result - if($task.IsFaulted) { - $msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status) - if($task.Exception) { - $msg += "Exception details: {0}" -f @($task.Exception) - } - Throw $msg - } - $outStream = New-Object IO.FileStream $OutFile, Create, Write, None + $client = new-object System.Net.Http.HttpClient + foreach ($k in $Headers.Keys){ + $client.DefaultRequestHeaders.Add($k, $Headers[$k]) + } + $task = $client.GetStreamAsync($Uri) + $response = $task.Result + if($task.IsFaulted) { + $msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status) + if($task.Exception) { + $msg += "Exception details: {0}" -f @($task.Exception) + } + Throw $msg + } + $outStream = New-Object IO.FileStream $OutFile, Create, Write, None - try { - $totRead = 0 - $buffer = New-Object Byte[] 1MB - while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) { - $totRead += $read - $outStream.Write($buffer, 0, $read); - } - } - finally { - $outStream.Close() - } - if(!$SkipIntegrityCheck -and $fragment) { - Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash - } - } + try { + $totRead = 0 + $buffer = New-Object Byte[] 1MB + while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) { + $totRead += $read + $outStream.Write($buffer, 0, $read); + } + } + finally { + $outStream.Close() + } + if(!$SkipIntegrityCheck -and $fragment) { + Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash + } + } } function Import-Certificate() { - [CmdletBinding()] - param ( - [parameter(Mandatory=$true)] - [string]$CertificatePath, - [parameter(Mandatory=$true)] - [System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine", - [parameter(Mandatory=$true)] - [System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher" - ) - PROCESS - { - $store = New-Object System.Security.Cryptography.X509Certificates.X509Store( - $StoreName, $StoreLocation) - $store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite) - $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2( - $CertificatePath) - $store.Add($cert) - } + [CmdletBinding()] + param ( + [parameter(Mandatory=$true)] + [string]$CertificatePath, + [parameter(Mandatory=$true)] + [System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine", + [parameter(Mandatory=$true)] + [System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher" + ) + PROCESS + { + $store = New-Object System.Security.Cryptography.X509Certificates.X509Store( + $StoreName, $StoreLocation) + $store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite) + $cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2( + $CertificatePath) + $store.Add($cert) + } } function Invoke-APICall() { [CmdletBinding()] - param ( - [parameter(Mandatory=$true)] - [object]$Payload, + param ( + [parameter(Mandatory=$true)] + [object]$Payload, [parameter(Mandatory=$true)] [string]$CallbackURL - ) + ) PROCESS{ Invoke-WebRequest -UseBasicParsing -Method Post -Headers @{"Accept"="application/json"; "Authorization"="Bearer $Token"} -Uri $CallbackURL -Body (ConvertTo-Json $Payload) | Out-Null } @@ -262,12 +290,12 @@ function Invoke-APICall() { function Update-GarmStatus() { [CmdletBinding()] - param ( - [parameter(Mandatory=$true)] - [string]$Message, + param ( + [parameter(Mandatory=$true)] + [string]$Message, [parameter(Mandatory=$true)] [string]$CallbackURL - ) + ) PROCESS{ $body = @{ "status"="installing" @@ -279,14 +307,14 @@ function Update-GarmStatus() { function Invoke-GarmSuccess() { [CmdletBinding()] - param ( - [parameter(Mandatory=$true)] - [string]$Message, + param ( [parameter(Mandatory=$true)] - [int64]$AgentID, + [string]$Message, + [parameter(Mandatory=$true)] + [int64]$AgentID, [parameter(Mandatory=$true)] [string]$CallbackURL - ) + ) PROCESS{ $body = @{ "status"="idle" @@ -299,12 +327,12 @@ function Invoke-GarmSuccess() { function Invoke-GarmFailure() { [CmdletBinding()] - param ( - [parameter(Mandatory=$true)] - [string]$Message, + param ( + [parameter(Mandatory=$true)] + [string]$Message, [parameter(Mandatory=$true)] [string]$CallbackURL - ) + ) PROCESS{ $body = @{ "status"="failed" diff --git a/runner/pool/pool.go b/runner/pool/pool.go index dce4e741..e46bd4f0 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -365,26 +365,44 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error { return errors.Wrap(err, "fetching instances from db") } - runnerNames := map[string]bool{} + runnersByName := map[string]*github.Runner{} for _, run := range runners { if !r.isManagedRunner(labelsFromRunner(run)) { log.Printf("runner %s is not managed by a pool belonging to %s", *run.Name, r.helper.String()) continue } - runnerNames[*run.Name] = true + runnersByName[*run.Name] = run } for _, instance := range dbInstances { - if ok := runnerNames[instance.Name]; !ok { - pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) - if err != nil { - return errors.Wrap(err, "fetching instance pool info") - } - if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { - continue - } - log.Printf("reaping instance %s due to timeout", instance.Name) - if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { + pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) + if err != nil { + return errors.Wrap(err, "fetching instance pool info") + } + if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { + continue + } + + // There are 2 cases (currently) where we consider a runner as timed out: + // * The runner never joined github within the pool timeout + // * The runner managed to join github, but the setup process failed later and the runner + // never started on the instance. + // + // There are several steps in the user data that sets up the runner: + // * Download and unarchive the runner from github (or used the cached version) + // * Configure runner (connects to github). At this point the runner is seen as offline. + // * Install the service + // * Set SELinux context (if SELinux is enabled) + // * Start the service (if successful, the runner will transition to "online") + // * Get the runner ID + // + // If we fail getting the runner ID after it's started, garm will set the runner status to "failed", + // even though, technically the runner is online and fully functional. This is why we check here for + // both the runner status as reported by GitHub and the runner status as reported by the provider. + // If the runner is "offline" and marked as "failed", it should be safe to reap it. + if runner, ok := runnersByName[instance.Name]; !ok || (runner.GetStatus() == "offline" && instance.RunnerStatus == providerCommon.RunnerFailed) { + log.Printf("reaping timed-out/failed runner %s", instance.Name) + if err := r.ForceDeleteRunner(instance); err != nil { log.Printf("failed to update runner %s status", instance.Name) return errors.Wrap(err, "updating runner") }