Merge pull request #106 from gabriel-samfira/reap-failed-agent

Reap failed agent
This commit is contained in:
Gabriel 2023-06-14 22:19:20 +03:00 committed by GitHub
commit 05168ac994
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 203 additions and 153 deletions

View file

@ -6,10 +6,11 @@ USER_ID=$(shell ((docker --version | grep -q podman) && echo "0" || id -u))
USER_GROUP=$(shell ((docker --version | grep -q podman) && echo "0" || id -g))
ROOTDIR=$(dir $(abspath $(lastword $(MAKEFILE_LIST))))
GOPATH ?= $(shell go env GOPATH)
VERSION ?= $(shell git describe --tags --match='v[0-9]*' --dirty --always)
GO ?= go
default: install
default: build
.PHONY : build-static test install-lint-deps lint go-test fmt fmtcheck verify-vendor verify
build-static:
@ -18,9 +19,12 @@ build-static:
docker run --rm -e USER_ID=$(USER_ID) -e USER_GROUP=$(USER_GROUP) -v $(PWD):/build/garm:z $(IMAGE_TAG) /build-static.sh
@echo Binaries are available in $(PWD)/bin
install:
@$(GO) install -tags osusergo,netgo,sqlite_omit_load_extension ./...
@echo Binaries available in ${GOPATH}
build:
@echo Building garm ${VERSION}
$(shell mkdir -p ./bin)
@$(GO) build -ldflags "-s -w -X main.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm ./cmd/garm
@$(GO) build -ldflags "-s -w -X github.com/cloudbase/garm/cmd/garm-cli/cmd.Version=${VERSION}" -tags osusergo,netgo,sqlite_omit_load_extension -o bin/garm-cli ./cmd/garm-cli
@echo Binaries are available in $(PWD)/bin
test: verify go-test

View file

@ -36,11 +36,11 @@ if [ -z "$METADATA_URL" ];then
echo "no token is available and METADATA_URL is not set"
exit 1
fi
GITHUB_TOKEN=$(curl --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/")
GITHUB_TOKEN=$(curl --retry 5 --retry-max-time 5 --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/")
function call() {
PAYLOAD="$1"
curl --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)"
curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)"
}
function sendStatus() {
@ -63,41 +63,41 @@ function fail() {
# This will echo the version number in the filename. Given a file name like: actions-runner-osx-x64-2.299.1.tar.gz
# this will output: 2.299.1
function getRunnerVersion() {
FILENAME="{{ .FileName }}"
[[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]]
echo $BASH_REMATCH
FILENAME="{{ .FileName }}"
[[ $FILENAME =~ ([0-9]+\.[0-9]+\.[0-9+]) ]]
echo $BASH_REMATCH
}
function getCachedToolsPath() {
CACHED_RUNNER="/opt/cache/actions-runner/latest"
if [ -d "$CACHED_RUNNER" ];then
echo "$CACHED_RUNNER"
return 0
fi
CACHED_RUNNER="/opt/cache/actions-runner/latest"
if [ -d "$CACHED_RUNNER" ];then
echo "$CACHED_RUNNER"
return 0
fi
VERSION=$(getRunnerVersion)
if [ -z "$VERSION" ]; then
return 0
fi
VERSION=$(getRunnerVersion)
if [ -z "$VERSION" ]; then
return 0
fi
CACHED_RUNNER="/opt/cache/actions-runner/$VERSION"
if [ -d "$CACHED_RUNNER" ];then
echo "$CACHED_RUNNER"
return 0
fi
return 0
CACHED_RUNNER="/opt/cache/actions-runner/$VERSION"
if [ -d "$CACHED_RUNNER" ];then
echo "$CACHED_RUNNER"
return 0
fi
return 0
}
function downloadAndExtractRunner() {
sendStatus "downloading tools from {{ .DownloadURL }}"
if [ ! -z "{{ .TempDownloadToken }}" ]; then
sendStatus "downloading tools from {{ .DownloadURL }}"
if [ ! -z "{{ .TempDownloadToken }}" ]; then
TEMP_TOKEN="Authorization: Bearer {{ .TempDownloadToken }}"
fi
curl -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools"
mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder"
sendStatus "extracting runner"
tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner"
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner"
fi
curl --retry 5 --retry-max-time 5 --retry-all-errors --fail -L -H "${TEMP_TOKEN}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools"
mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder"
sendStatus "extracting runner"
tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to extract runner"
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R /home/{{ .RunnerUsername }}/actions-runner/ || fail "failed to change owner"
}
TEMP_TOKEN=""
@ -107,31 +107,59 @@ GH_RUNNER_GROUP="{{.GitHubRunnerGroup}}"
# if it holds a value, it will be part of the command.
RUNNER_GROUP_OPT=""
if [ ! -z $GH_RUNNER_GROUP ];then
RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP"
RUNNER_GROUP_OPT="--runnergroup=$GH_RUNNER_GROUP"
fi
CACHED_RUNNER=$(getCachedToolsPath)
if [ -z "$CACHED_RUNNER" ];then
downloadAndExtractRunner
sendStatus "installing dependencies"
cd /home/{{ .RunnerUsername }}/actions-runner
sudo ./bin/installdependencies.sh || fail "failed to install dependencies"
downloadAndExtractRunner
sendStatus "installing dependencies"
cd /home/{{ .RunnerUsername }}/actions-runner
sudo ./bin/installdependencies.sh || fail "failed to install dependencies"
else
sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner"
cd /home/{{ .RunnerUsername }}/actions-runner
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner"
sendStatus "using cached runner found in $CACHED_RUNNER"
sudo cp -a "$CACHED_RUNNER" "/home/{{ .RunnerUsername }}/actions-runner"
cd /home/{{ .RunnerUsername }}/actions-runner
chown {{ .RunnerUsername }}:{{ .RunnerGroup }} -R "/home/{{ .RunnerUsername }}/actions-runner" || fail "failed to change owner"
fi
sendStatus "configuring runner"
sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral || fail "failed to configure runner"
set +e
attempt=1
while true; do
ERROUT=$(mktemp)
sudo -u {{ .RunnerUsername }} -- ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" $RUNNER_GROUP_OPT --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --ephemeral 2>$ERROUT
if [ $? -eq 0 ]; then
rm $ERROUT || true
sendStatus "runner successfully configured after $attempt attempt(s)"
break
fi
LAST_ERR=$(cat $ERROUT)
echo "$LAST_ERR"
# if the runner is already configured, remove it and try again. In the past configuring a runner
# managed to register it but timed out later, resulting in an error.
sudo -u {{ .RunnerUsername }} -- ./config.sh remove --token "$GITHUB_TOKEN" || true
if [ $attempt -gt 5 ];then
rm $ERROUT || true
fail "failed to configure runner: $LAST_ERR"
fi
sendStatus "failed to configure runner (attempt $attempt): $LAST_ERR (retrying in 5 seconds)"
attempt=$((attempt+1))
rm $ERROUT || true
sleep 5
done
set -e
sendStatus "installing runner service"
./svc.sh install {{ .RunnerUsername }} || fail "failed to install service"
if [ -e "/sys/fs/selinux" ];then
sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context"
sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context"
sudo chcon -h user_u:object_r:bin_t /home/runner/ || fail "failed to change selinux context"
sudo chcon -R -h {{ .RunnerUsername }}:object_r:bin_t /home/runner/* || fail "failed to change selinux context"
fi
sendStatus "starting service"
@ -156,105 +184,105 @@ Param(
$ErrorActionPreference="Stop"
function Invoke-FastWebRequest {
[CmdletBinding()]
Param(
[Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)]
[System.Uri]$Uri,
[Parameter(Position=1)]
[string]$OutFile,
[Hashtable]$Headers=@{},
[switch]$SkipIntegrityCheck=$false
)
PROCESS
{
if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type)
{
$assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http")
}
[CmdletBinding()]
Param(
[Parameter(Mandatory=$True,ValueFromPipeline=$true,Position=0)]
[System.Uri]$Uri,
[Parameter(Position=1)]
[string]$OutFile,
[Hashtable]$Headers=@{},
[switch]$SkipIntegrityCheck=$false
)
PROCESS
{
if(!([System.Management.Automation.PSTypeName]'System.Net.Http.HttpClient').Type)
{
$assembly = [System.Reflection.Assembly]::LoadWithPartialName("System.Net.Http")
}
if(!$OutFile) {
$OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1)
if(!$OutFile) {
throw "The ""OutFile"" parameter needs to be specified"
}
}
if(!$OutFile) {
$OutFile = $Uri.PathAndQuery.Substring($Uri.PathAndQuery.LastIndexOf("/") + 1)
if(!$OutFile) {
throw "The ""OutFile"" parameter needs to be specified"
}
}
$fragment = $Uri.Fragment.Trim('#')
if ($fragment) {
$details = $fragment.Split("=")
$algorithm = $details[0]
$hash = $details[1]
}
$fragment = $Uri.Fragment.Trim('#')
if ($fragment) {
$details = $fragment.Split("=")
$algorithm = $details[0]
$hash = $details[1]
}
if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) {
try {
return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash)
} catch {
Remove-Item $OutFile
}
}
if (!$SkipIntegrityCheck -and $fragment -and (Test-Path $OutFile)) {
try {
return (Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash)
} catch {
Remove-Item $OutFile
}
}
$client = new-object System.Net.Http.HttpClient
foreach ($k in $Headers.Keys){
$client.DefaultRequestHeaders.Add($k, $Headers[$k])
}
$task = $client.GetStreamAsync($Uri)
$response = $task.Result
if($task.IsFaulted) {
$msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status)
if($task.Exception) {
$msg += "Exception details: {0}" -f @($task.Exception)
}
Throw $msg
}
$outStream = New-Object IO.FileStream $OutFile, Create, Write, None
$client = new-object System.Net.Http.HttpClient
foreach ($k in $Headers.Keys){
$client.DefaultRequestHeaders.Add($k, $Headers[$k])
}
$task = $client.GetStreamAsync($Uri)
$response = $task.Result
if($task.IsFaulted) {
$msg = "Request for URL '{0}' is faulted. Task status: {1}." -f @($Uri, $task.Status)
if($task.Exception) {
$msg += "Exception details: {0}" -f @($task.Exception)
}
Throw $msg
}
$outStream = New-Object IO.FileStream $OutFile, Create, Write, None
try {
$totRead = 0
$buffer = New-Object Byte[] 1MB
while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) {
$totRead += $read
$outStream.Write($buffer, 0, $read);
}
}
finally {
$outStream.Close()
}
if(!$SkipIntegrityCheck -and $fragment) {
Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash
}
}
try {
$totRead = 0
$buffer = New-Object Byte[] 1MB
while (($read = $response.Read($buffer, 0, $buffer.Length)) -gt 0) {
$totRead += $read
$outStream.Write($buffer, 0, $read);
}
}
finally {
$outStream.Close()
}
if(!$SkipIntegrityCheck -and $fragment) {
Test-FileIntegrity -File $OutFile -Algorithm $algorithm -ExpectedHash $hash
}
}
}
function Import-Certificate() {
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[string]$CertificatePath,
[parameter(Mandatory=$true)]
[System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine",
[parameter(Mandatory=$true)]
[System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher"
)
PROCESS
{
$store = New-Object System.Security.Cryptography.X509Certificates.X509Store(
$StoreName, $StoreLocation)
$store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite)
$cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
$CertificatePath)
$store.Add($cert)
}
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[string]$CertificatePath,
[parameter(Mandatory=$true)]
[System.Security.Cryptography.X509Certificates.StoreLocation]$StoreLocation="LocalMachine",
[parameter(Mandatory=$true)]
[System.Security.Cryptography.X509Certificates.StoreName]$StoreName="TrustedPublisher"
)
PROCESS
{
$store = New-Object System.Security.Cryptography.X509Certificates.X509Store(
$StoreName, $StoreLocation)
$store.Open([System.Security.Cryptography.X509Certificates.OpenFlags]::ReadWrite)
$cert = New-Object System.Security.Cryptography.X509Certificates.X509Certificate2(
$CertificatePath)
$store.Add($cert)
}
}
function Invoke-APICall() {
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[object]$Payload,
param (
[parameter(Mandatory=$true)]
[object]$Payload,
[parameter(Mandatory=$true)]
[string]$CallbackURL
)
)
PROCESS{
Invoke-WebRequest -UseBasicParsing -Method Post -Headers @{"Accept"="application/json"; "Authorization"="Bearer $Token"} -Uri $CallbackURL -Body (ConvertTo-Json $Payload) | Out-Null
}
@ -262,12 +290,12 @@ function Invoke-APICall() {
function Update-GarmStatus() {
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[string]$Message,
param (
[parameter(Mandatory=$true)]
[string]$Message,
[parameter(Mandatory=$true)]
[string]$CallbackURL
)
)
PROCESS{
$body = @{
"status"="installing"
@ -279,14 +307,14 @@ function Update-GarmStatus() {
function Invoke-GarmSuccess() {
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[string]$Message,
param (
[parameter(Mandatory=$true)]
[int64]$AgentID,
[string]$Message,
[parameter(Mandatory=$true)]
[int64]$AgentID,
[parameter(Mandatory=$true)]
[string]$CallbackURL
)
)
PROCESS{
$body = @{
"status"="idle"
@ -299,12 +327,12 @@ function Invoke-GarmSuccess() {
function Invoke-GarmFailure() {
[CmdletBinding()]
param (
[parameter(Mandatory=$true)]
[string]$Message,
param (
[parameter(Mandatory=$true)]
[string]$Message,
[parameter(Mandatory=$true)]
[string]$CallbackURL
)
)
PROCESS{
$body = @{
"status"="failed"

View file

@ -365,26 +365,44 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
return errors.Wrap(err, "fetching instances from db")
}
runnerNames := map[string]bool{}
runnersByName := map[string]*github.Runner{}
for _, run := range runners {
if !r.isManagedRunner(labelsFromRunner(run)) {
log.Printf("runner %s is not managed by a pool belonging to %s", *run.Name, r.helper.String())
continue
}
runnerNames[*run.Name] = true
runnersByName[*run.Name] = run
}
for _, instance := range dbInstances {
if ok := runnerNames[instance.Name]; !ok {
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching instance pool info")
}
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
continue
}
log.Printf("reaping instance %s due to timeout", instance.Name)
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching instance pool info")
}
if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) {
continue
}
// There are 2 cases (currently) where we consider a runner as timed out:
// * The runner never joined github within the pool timeout
// * The runner managed to join github, but the setup process failed later and the runner
// never started on the instance.
//
// There are several steps in the user data that sets up the runner:
// * Download and unarchive the runner from github (or used the cached version)
// * Configure runner (connects to github). At this point the runner is seen as offline.
// * Install the service
// * Set SELinux context (if SELinux is enabled)
// * Start the service (if successful, the runner will transition to "online")
// * Get the runner ID
//
// If we fail getting the runner ID after it's started, garm will set the runner status to "failed",
// even though, technically the runner is online and fully functional. This is why we check here for
// both the runner status as reported by GitHub and the runner status as reported by the provider.
// If the runner is "offline" and marked as "failed", it should be safe to reap it.
if runner, ok := runnersByName[instance.Name]; !ok || (runner.GetStatus() == "offline" && instance.RunnerStatus == providerCommon.RunnerFailed) {
log.Printf("reaping timed-out/failed runner %s", instance.Name)
if err := r.ForceDeleteRunner(instance); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
return errors.Wrap(err, "updating runner")
}