garm/internal/templates/userdata/github_linux_userdata.tmpl
Gabriel Adrian Samfira 42cfd1b3c6 Add agent mode
This change adds a new "agent mode" to GARM. The agent enables GARM to
set up a persistent websocket connection between the garm server and the
runners it spawns. The goal is to make it easier to keep track of state,
even without subsequent webhooks from the forge.

The Agent will report via websockets when the runner is actually online,
when it started a job and when it finished a job.

Additionally, the agent allows us to enable optional remote shell between
the user and any runner that is spun up using agent mode. The remote shell
is multiplexed over the same persistent websocket connection the agent
sets up with the server (the agent never listens on a port).

Enablement has also been done in the web UI for this functionality.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
2026-02-08 00:27:47 +02:00

278 lines
9.8 KiB
Bash

#!/bin/bash
# Bootstrap script GARM runs on a freshly created Linux instance to install,
# configure and start a GitHub Actions runner. This file is a Go template;
# the template directives are expanded before the script reaches the instance.
set -e
set -o pipefail
# Command tracing is opt-in via EnableBootDebug: with tracing on, every expanded
# command (bearer token included) ends up in the boot log.
{{- if .EnableBootDebug }}
set -x
{{- end }}
# Edit the templates directly in the browser. Intellisense is available for golang templates
# {{ .DownloadURL }}
# Callback endpoint, metadata endpoint and bearer token, filled in by the template.
CALLBACK_URL="{{ .CallbackURL }}"
METADATA_URL="{{ .MetadataURL }}"
BEARER_TOKEN="{{ .CallbackToken }}"
# Debug aid (disabled): uncomment to pause the bootstrap until /tmp/hold is removed.
# touch /tmp/hold
# while true;do
# [ -e /tmp/hold ] && sleep 2 || break
# done
# Directory the runner is unpacked into and run from.
RUN_HOME="/home/{{ .RunnerUsername }}/actions-runner"
# Nothing below can work without the metadata endpoint.
if [ -z "$METADATA_URL" ];then
    echo "no token is available and METADATA_URL is not set"
    exit 1
fi
function call() {
    # POST a JSON payload ($1) to the status callback endpoint.
    # Globals: writes PAYLOAD; appends "/status" to CALLBACK_URL when it is
    #          missing; reads BEARER_TOKEN.
    # A failed request is only logged; the function itself always succeeds.
    PAYLOAD="$1"
    if ! [[ $CALLBACK_URL =~ ^(.*)/status(/)?$ ]]; then
        CALLBACK_URL="${CALLBACK_URL}/status"
    fi
    local curl_status=0
    curl --retry 5 --retry-delay 5 --retry-connrefused --fail -s \
        -X POST -d "${PAYLOAD}" \
        -H 'Accept: application/json' \
        -H "Authorization: Bearer ${BEARER_TOKEN}" \
        "${CALLBACK_URL}" || curl_status=$?
    if [ "$curl_status" -ne 0 ]; then
        echo "failed to call home: exit code ($curl_status)"
    fi
}
function systemInfo() {
    # Report OS name/version and the runner agent ID to the system-info
    # endpoint that sits next to the status callback.
    # Arguments: $1 - agent ID (optional; sent as JSON null when missing/empty)
    # Globals:   reads CALLBACK_URL and BEARER_TOKEN; modifies nothing.
    # Best effort: a failed request is ignored.
    local os_name os_version agent_id base_url sysinfo_url payload
    agent_id=${1:-null}
    # Source os-release in subshells so its variables (NAME, ID, VERSION, ...)
    # do not leak into the rest of the script.
    os_name=$(
        if [ -f "/etc/os-release" ]; then
            . /etc/os-release
        fi
        printf '%s' "${NAME:-}"
    )
    os_version=$(
        if [ -f "/etc/os-release" ]; then
            . /etc/os-release
        fi
        printf '%s' "${VERSION_ID:-}"
    )
    # Strip a trailing /status from a local copy of the callback URL.
    base_url="$CALLBACK_URL"
    if [[ $base_url =~ ^(.*)/status(/)?$ ]]; then
        base_url="${BASH_REMATCH[1]}"
    fi
    sysinfo_url="${base_url}/system-info/"
    payload="{\"os_name\": \"$os_name\", \"os_version\": \"$os_version\", \"agent_id\": $agent_id}"
    curl --retry 5 --retry-delay 5 --retry-connrefused --fail -s -X POST -d "${payload}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${sysinfo_url}" || true
}
function jsonEscape() {
    # Print $1 escaped for use inside a JSON string literal. Messages often
    # carry raw command output, so backslashes, double quotes, newlines,
    # carriage returns and tabs must be escaped to keep the payload valid.
    local text="$1"
    text=${text//\\/\\\\}
    text=${text//\"/\\\"}
    text=${text//$'\n'/\\n}
    text=${text//$'\r'/\\r}
    text=${text//$'\t'/\\t}
    printf '%s' "$text"
}
function sendStatus() {
    # Report progress: status "installing" with message $1.
    local msg
    msg=$(jsonEscape "$1")
    call "{\"status\": \"installing\", \"message\": \"$msg\"}"
}
function success() {
    # Report status "idle" with message $1 and agent ID $2 (JSON null when
    # missing or empty).
    local msg agent_id
    msg=$(jsonEscape "$1")
    agent_id=${2:-null}
    call "{\"status\": \"idle\", \"message\": \"$msg\", \"agent_id\": $agent_id}"
}
function fail() {
    # Report status "failed" with message $1, then abort the script with
    # exit code 1.
    local msg
    msg=$(jsonEscape "$1")
    call "{\"status\": \"failed\", \"message\": \"$msg\"}"
    exit 1
}
# Fetch the runner metadata document; on failure, whatever curl produced is
# reported back through fail. AGENT_MODE is then read from the document's
# agent_mode field (empty when the field is absent).
if ! INSTANCE_METADATA=$(curl --retry 5 --retry-delay 5 --retry-connrefused --fail -s \
        -H 'Accept: application/json' \
        -H "Authorization: Bearer ${BEARER_TOKEN}" \
        "$METADATA_URL/runner-metadata" 2>&1); then
    fail "failed to get instance metadata: $INSTANCE_METADATA"
fi
AGENT_MODE=$(jq -r '.agent_mode // empty' <<< "$INSTANCE_METADATA")
# Agent mode: download garm-agent, write its config file and register a
# systemd unit for it. The unit is enabled and started at the end of the
# bootstrap, not here.
if [ "$AGENT_MODE" == "true" ]; then
    sendStatus "Agent mode is enabled; setting up agent"
    # All three values below are mandatory in agent mode.
    DOWNLOAD_URL=$(echo "$INSTANCE_METADATA" | jq -r '.agent_tools.download_url // empty')
    if [ -z "$DOWNLOAD_URL" ]; then
        fail "agent mode is enabled, but no agent tools found in metadata"
    fi
    AGENT_URL=$(echo "$INSTANCE_METADATA" | jq -r '.metadata_access.agent_url // empty')
    if [ -z "$AGENT_URL" ]; then
        fail "agent mode is enabled, but agent_url was not found in metadata"
    fi
    AGENT_TOKEN=$(echo "$INSTANCE_METADATA" | jq -r '.agent_token // empty')
    if [ -z "$AGENT_TOKEN" ]; then
        fail "agent mode is enabled, but agent_token was not found in metadata"
    fi
    # Remote shell is optional and defaults to false.
    AGENT_SHELL=$(echo "$INSTANCE_METADATA" | jq -r '.agent_shell_enabled // false')
    sendStatus "Downloading agent from $DOWNLOAD_URL"
    sudo curl --retry 5 \
        --retry-delay 5 \
        --retry-connrefused \
        --fail -L \
        -H "Authorization: Bearer ${BEARER_TOKEN}" \
        -o /usr/local/bin/garm-agent "$DOWNLOAD_URL" || fail "failed to download garm-agent"
    sudo chmod +x /usr/local/bin/garm-agent || fail "failed to make garm-agent executable"
    # Log and config directories are owned by the runner user, which is the
    # user the agent service runs as.
    sudo mkdir -p /var/log/garm-agent || fail "failed to create /var/log/garm-agent"
    sudo chown {{ .RunnerUsername }}:{{ .RunnerUsername }} /var/log/garm-agent || fail "failed to chown /var/log/garm-agent"
    sudo mkdir -p /etc/garm-agent || fail "failed to create /etc/garm-agent"
    sudo chown {{ .RunnerUsername }}:{{ .RunnerUsername }} /etc/garm-agent || fail "failed to change owner on /etc/garm-agent"
    sendStatus "Creating config and systemd unit"
    # runner_cmdline points at run.sh inside RUN_HOME so it follows the
    # configured runner user, like work_dir does.
    cat > /etc/garm-agent/garm-agent.toml << EOF
server_url = "$AGENT_URL"
log_file = "/var/log/garm-agent/garm-agent.log"
work_dir = "$RUN_HOME"
enable_shell = $AGENT_SHELL
token = "$AGENT_TOKEN"
runner_cmdline = ["/bin/bash", "-C", "$RUN_HOME/run.sh"]
state_db_path = "/etc/garm-agent/agent-state.db"
EOF
    # MAINPID is escaped so the literal variable reaches the unit file for
    # systemd to expand, instead of being expanded (to nothing) by this shell.
    cat > /tmp/garm-agent.service << EOF
[Unit]
Description=GARM agent
After=multi-user.target
[Service]
Type=simple
ExecStart=/usr/local/bin/garm-agent daemon --config /etc/garm-agent/garm-agent.toml
ExecReload=/bin/kill -HUP \$MAINPID
Restart=always
RestartSec=5s
User={{ .RunnerUsername }}
Environment=TERM=xterm-256color
Environment=LANG=en_US.UTF-8
[Install]
WantedBy=multi-user.target
EOF
    sudo mv /tmp/garm-agent.service /etc/systemd/system/garm-agent.service || fail "failed to create /etc/systemd/system/garm-agent.service"
    sudo chown root:root /etc/systemd/system/garm-agent.service || fail "failed to change owner on /etc/systemd/system/garm-agent.service"
    sendStatus "Reloading systemd unit"
    sudo systemctl daemon-reload || fail "failed to reload systemd"
fi
function downloadAndExtractRunner() {
    # Download the runner archive into the runner user's home, unpack it into
    # RUN_HOME and hand the tree over to the runner user and group.
    # Any failing step is reported through fail, which aborts the script.
    sendStatus "downloading tools from {{ .DownloadURL }}"
    # Only send an Authorization header when a temporary download token was
    # rendered into the template; otherwise pass no -H argument at all.
    local -a auth_header=()
    if [ -n "{{ .TempDownloadToken }}" ]; then
        auth_header=(-H "Authorization: Bearer {{ .TempDownloadToken }}")
    fi
    curl --retry 5 --retry-delay 5 --retry-connrefused --fail -L "${auth_header[@]}" -o "/home/{{ .RunnerUsername }}/{{ .FileName }}" "{{ .DownloadURL }}" || fail "failed to download tools"
    mkdir -p "$RUN_HOME" || fail "failed to create actions-runner folder"
    sendStatus "extracting runner"
    tar xf "/home/{{ .RunnerUsername }}/{{ .FileName }}" -C "$RUN_HOME"/ || fail "failed to extract runner"
    chown -R "{{ .RunnerUsername }}:{{ .RunnerGroup }}" "$RUN_HOME"/ || fail "failed to change owner"
}
# Reuse a runner already present in RUN_HOME; otherwise download it and
# install its dependencies, retrying the dependency installer on failure.
if [ -d "$RUN_HOME" ]; then
    sendStatus "using cached runner found in $RUN_HOME"
    cd "$RUN_HOME"
else
    downloadAndExtractRunner
    sendStatus "installing dependencies"
    cd "$RUN_HOME"
    attempt=1
    # Loop until the installer succeeds; give up once the attempt counter
    # has passed 5.
    until sudo ./bin/installdependencies.sh; do
        if [ $attempt -gt 5 ]; then
            fail "failed to install dependencies after $attempt attempts"
        fi
        sendStatus "failed to install dependencies (attempt $attempt): (retrying in 15 seconds)"
        attempt=$((attempt+1))
        sleep 15
    done
fi
sendStatus "configuring runner"
{{- if .UseJITConfig }}
function getRunnerFile() {
    # Download one metadata resource into a local file.
    # Arguments: $1 - resource path, relative to METADATA_URL
    #            $2 - destination file
    # Returns:   curl's exit status
    local resource="$1"
    local destination="$2"
    curl --retry 5 --retry-delay 5 --retry-connrefused \
        --fail -s -X GET \
        -H 'Accept: application/json' \
        -H "Authorization: Bearer ${BEARER_TOKEN}" \
        "${METADATA_URL}/${resource}" \
        -o "${destination}"
}
# JIT configuration: fetch the runner's credential files and systemd unit
# from the metadata endpoint, install the unit and enable it. Every step
# reports through fail so an error reaches GARM instead of a silent exit.
sendStatus "downloading JIT credentials"
getRunnerFile "credentials/runner" "$RUN_HOME/.runner" || fail "failed to get runner file"
getRunnerFile "credentials/credentials" "$RUN_HOME/.credentials" || fail "failed to get credentials file"
getRunnerFile "credentials/credentials_rsaparams" "$RUN_HOME/.credentials_rsaparams" || fail "failed to get credentials_rsaparams file"
getRunnerFile "system/service-name" "$RUN_HOME/.service" || fail "failed to get service name file"
# The downloaded file holds the bare service name; append the unit suffix.
sed -i 's/$/\.service/' "$RUN_HOME/.service" || fail "failed to update service name file"
SVC_NAME=$(cat "$RUN_HOME/.service") || fail "failed to read service name file"
sendStatus "generating systemd unit file"
# The unit file lands in the current directory first, then moves into place.
getRunnerFile "systemd/unit-file?runAsUser={{ .RunnerUsername }}" "$SVC_NAME" || fail "failed to get service file"
sudo mv "$SVC_NAME" /etc/systemd/system/ || fail "failed to move service file"
sudo chown root:root "/etc/systemd/system/$SVC_NAME" || fail "failed to change owner"
if [ -e "/sys/fs/selinux" ];then
    sudo chcon -h system_u:object_r:systemd_unit_file_t:s0 "/etc/systemd/system/$SVC_NAME" || fail "failed to change selinux context"
fi
sendStatus "enabling runner service"
cp "$RUN_HOME"/bin/runsvc.sh "$RUN_HOME"/ || fail "failed to copy runsvc.sh"
sudo systemctl daemon-reload || fail "failed to reload systemd"
sudo systemctl enable "$SVC_NAME" || fail "failed to enable service"
{{- else}}
# Registration-token configuration: fetch a registration token, then run
# config.sh until it succeeds or the attempt counter has passed 5.
GITHUB_TOKEN=$(curl --retry 5 --retry-delay 5 --retry-connrefused --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}/runner-registration-token/") || fail "failed to get runner registration token"
# errexit is off inside the loop so a failing config.sh can be inspected.
set +e
attempt=1
while true; do
    ERROUT=$(mktemp)
    {{- if .GitHubRunnerGroup }}
    ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" --runnergroup "{{.GitHubRunnerGroup}}" --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --no-default-labels --ephemeral 2>"$ERROUT"
    {{- else}}
    ./config.sh --unattended --url "{{ .RepoURL }}" --token "$GITHUB_TOKEN" --name "{{ .RunnerName }}" --labels "{{ .RunnerLabels }}" --no-default-labels --ephemeral 2>"$ERROUT"
    {{- end}}
    if [ $? -eq 0 ]; then
        rm "$ERROUT" || true
        sendStatus "runner successfully configured after $attempt attempt(s)"
        break
    fi
    LAST_ERR=$(cat "$ERROUT")
    echo "$LAST_ERR"
    # if the runner is already configured, remove it and try again. In the past configuring a runner
    # managed to register it but timed out later, resulting in an error.
    ./config.sh remove --token "$GITHUB_TOKEN" || true
    if [ $attempt -gt 5 ];then
        rm "$ERROUT" || true
        fail "failed to configure runner: $LAST_ERR"
    fi
    sendStatus "failed to configure runner (attempt $attempt): $LAST_ERR (retrying in 5 seconds)"
    attempt=$((attempt+1))
    rm "$ERROUT" || true
    sleep 5
done
set -e
sendStatus "installing runner service"
sudo ./svc.sh install {{ .RunnerUsername }} || fail "failed to install service"
{{- end}}
# Final stage: relabel the runner user's home on SELinux hosts, start the
# runner service (unless agent mode is on, in which case only garm-agent is
# started), send system info and report the final status to GARM.
if [ -e "/sys/fs/selinux" ];then
    sudo chcon -R -h user_u:object_r:bin_t:s0 "/home/{{ .RunnerUsername }}/" || fail "failed to change selinux context"
fi
# Stays empty when no agent ID is read below; it is then reported as JSON null.
AGENT_ID=""
{{- if .UseJITConfig }}
if [ -f "$RUN_HOME/env.sh" ];then
    pushd "$RUN_HOME"
    source env.sh
    popd
fi
if [ "$AGENT_MODE" != "true" ]; then
    sudo systemctl start "$SVC_NAME" || fail "failed to start service"
fi
{{- else}}
if [ "$AGENT_MODE" != "true" ]; then
    sendStatus "starting service"
    sudo ./svc.sh start || fail "failed to start service"
fi
# Keep only the digits of the agentId line in .runner; with pipefail set, a
# missing line or file makes the pipeline fail.
AGENT_ID=$(grep "agentId" "$RUN_HOME"/.runner | tr -d -c 0-9) || fail "failed to get agent ID"
{{- end}}
systemInfo "$AGENT_ID"
if [ "$AGENT_MODE" == "true" ]; then
    sendStatus "Starting garm-agent service"
    # The runner service is not started in agent mode, so a garm-agent that
    # fails to start must not be reported as a successful install.
    sudo systemctl enable --now garm-agent || fail "failed to start garm-agent"
fi
success "runner successfully installed" "$AGENT_ID"