Merge pull request #55 from gabriel-samfira/properly-exit-loop

Ensure loop closes properly and provider update
Authored by Gabriel on 2023-01-08 18:45:45 +02:00; committed by GitHub
commit e28f8e6b9d
4 changed files with 113 additions and 50 deletions


@@ -226,7 +226,7 @@ function CreateInstance() {
OS_VERSION=$(echo "${IMAGE_URN}" | cut -d ':' -f3)
ARCH="amd64"
TAGS="garm-controller-id=${GARM_CONTROLLER_ID} garm-pool-id=${GARM_POOL_ID}"
TAGS="garm_controller_id=${GARM_CONTROLLER_ID} garm_pool_id=${GARM_POOL_ID} os_type=${OS_TYPE} os_name=${OS_NAME} os_version=${OS_VERSION} os_arch=${ARCH}"
set +e
@@ -298,6 +298,37 @@ function StopServer() {
az vm deallocate -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors
}
function GetInstance() {
    local instance_id="${GARM_INSTANCE_ID}"
    info=$(az vm show -d -n "${instance_id}" -g "${instance_id}" -o json --only-show-errors 2>&1)
    # Map the Azure power state to a garm instance status.
    echo "${info}" | jq -r '
        {
            provider_id: .name,
            name: .name,
            os_type: .tags.os_type,
            os_name: .tags.os_name,
            os_version: .tags.os_version,
            os_arch: .tags.os_arch,
            pool_id: .tags.garm_pool_id,
            status: {"VM starting": "pending_create", "VM running": "running", "VM stopping": "stopped", "VM stopped": "stopped", "VM deallocating": "stopped", "VM deallocated": "stopped"}[.powerState]
        }'
}
function ListInstances() {
    INSTANCES=$(az vm list --query "[?tags.garm_pool_id == '${GARM_POOL_ID}']" -o json --only-show-errors 2>&1)
    # Map the Azure provisioning state to a garm instance status.
    echo "${INSTANCES}" | jq -r '
        .[] | {
            provider_id: .name,
            name: .name,
            os_type: .tags.os_type,
            os_name: .tags.os_name,
            os_version: .tags.os_version,
            os_arch: .tags.os_arch,
            pool_id: .tags.garm_pool_id,
            status: {"Creating": "pending_create", "Migrating": "pending_create", "Failed": "error", "Succeeded": "running", "Deleting": "pending_delete"}[.provisioningState]
        }'
}
# Login to Azure
checkValNotNull "${AZURE_SUBSCRIPTION_ID}" "AZURE_SUBSCRIPTION_ID"
checkValNotNull "${AZURE_TENANT_ID}" "AZURE_TENANT_ID"
@@ -317,12 +348,10 @@ case "$GARM_COMMAND" in
DeleteInstance
;;
"GetInstance")
echo "GetInstance not implemented"
exit 1
GetInstance
;;
"ListInstances")
echo "ListInstances not implemented"
exit 1
ListInstances
;;
"StartInstance")
StartInstance


@@ -279,7 +279,7 @@ function CreateInstance() {
OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type')
checkValNotNull "${OS_TYPE}" "os_type" || return $?
DISTRO=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_distro')
checkValNotNull "${OS_TYPE}" "os_distro" || return $?
checkValNotNull "${DISTRO}" "os_distro" || return $?
VERSION=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_version')
checkValNotNull "${VERSION}" "os_version" || return $?
ARCH=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.architecture')
@@ -306,7 +306,8 @@ function CreateInstance() {
set +e
TAGS="--tag garm-controller-id=${GARM_CONTROLLER_ID} --tag garm-pool-id=${GARM_POOL_ID}"
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
PROPERTIES="--property os_type=${OS_TYPE} --property os_name=${DISTRO} --property os_version=${VERSION} --property os_arch=${GH_ARCH} --property pool_id=${GARM_POOL_ID}"
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} ${PROPERTIES} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
if [ $? -ne 0 ];then
openstack volume delete "${INSTANCE_NAME}" || true
exit 1
@@ -394,6 +395,25 @@ function StopServer() {
openstack server stop "${instance_id}"
}
function ListInstances() {
    INSTANCES=$(openstack server list --os-compute-api-version 2.52 --tags "garm-pool-id=${GARM_POOL_ID}" --long -f json)
    # Map the Nova server status to a garm instance status.
    echo "${INSTANCES}" | jq -r '
        .[] | .Properties * {
            provider_id: .ID,
            name: .Name,
            status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.Status]
        }'
}
function GetInstance() {
    INSTANCE=$(openstack server show --os-compute-api-version 2.52 "${GARM_INSTANCE_ID}" -f json)
    echo "${INSTANCE}" | jq -r '.properties * {
        provider_id: .id,
        name: .name,
        status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.status]
    }'
}
case "$GARM_COMMAND" in
"CreateInstance")
CreateInstance
@@ -402,12 +422,10 @@ case "$GARM_COMMAND" in
DeleteInstance
;;
"GetInstance")
echo "GetInstance not implemented"
exit 1
GetInstance
;;
"ListInstances")
echo "ListInstances not implemented"
exit 1
ListInstances
;;
"StartInstance")
StartInstance


@@ -215,7 +215,7 @@ Then you can easily parse it. If you're using ```bash```, you can use the amazin
You will have to parse the bootstrap params, verify that the requested image exists, and gather the operating system and CPU architecture information. Using that information, you will need to select the appropriate tools for the OS/arch combination you are deploying.
Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder.
Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder. Of particular interest are the [cloudconfig folders](../contrib/providers.d/openstack/cloudconfig/), where the instance user data templates are stored. These templates are used to generate the needed automation for the instances to download the github runner agent, send back status updates (including the final github runner agent ID), and download the github runner registration token from garm.
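
As a rough sketch, the parsing step in a ```bash``` provider might look like the following. This assumes the bootstrap params arrive as JSON on standard input and that the field names shown here (```image```, ```os_type```, ```arch```, ```tools```) match the payload; treat them as placeholders and check the bundled providers for the authoritative names.

```bash
# Sketch only: read the bootstrap params from stdin and extract the
# fields needed to choose the right runner tools. Field names here are
# assumptions; verify them against the payload garm actually sends.
BOOTSTRAP_PARAMS=$(cat)

IMAGE=$(echo "${BOOTSTRAP_PARAMS}" | jq -r '.image')
OS_TYPE=$(echo "${BOOTSTRAP_PARAMS}" | jq -r '.os_type')
ARCH=$(echo "${BOOTSTRAP_PARAMS}" | jq -r '.arch')

# Pick the tools entry matching the OS/arch combination being deployed.
TOOLS=$(echo "${BOOTSTRAP_PARAMS}" | jq -r --arg os "${OS_TYPE}" --arg arch "${ARCH}" \
    '.tools[] | select(.os == $os and .architecture == $arch)')
```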
### CreateInstance outputs
@@ -259,8 +259,6 @@ If the target instance does not exist in the provider, this command is expected
## GetInstance
NOTE: This operation is currently not used by ```garm```, but should be implemented.
The ```GetInstance``` command will return details about the instance, as seen by the provider.
Available environment variables:
@@ -275,8 +273,6 @@ On failure, this command is expected to return a non-zero exit code.
## ListInstances
NOTE: This operation is currently not used by ```garm```, but should be implemented.
The ```ListInstances``` command will print to standard output a JSON that is deserializable into an **array** of ```Instance{}```.
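
For example, each element of the array carries the same fields the bundled providers emit from their ```jq``` filters above (the values below are illustrative only):

```bash
# Sketch: a hand-written example of the expected output, piped through
# jq to confirm it parses as an array. The instance values are made up.
cat <<'EOF' | jq -r '.[].name'
[
  {
    "provider_id": "garm-instance-example",
    "name": "garm-instance-example",
    "os_type": "linux",
    "os_name": "ubuntu",
    "os_version": "22.04",
    "os_arch": "amd64",
    "pool_id": "example-pool-id",
    "status": "running"
  }
]
EOF
```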
Available environment variables:
@@ -293,9 +289,7 @@ On failure, a non-zero exit code is expected.
## RemoveAllInstances
NOTE: This operation is currently not used by ```garm```, but should be implemented.
The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```.
The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances.
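
For example, the Azure provider in this commit tags each VM with ```garm_controller_id```, so a minimal sketch of this operation (not the bundled implementation) could query on that tag:

```bash
function RemoveAllInstances() {
    # Sketch only: list every VM carrying this controller's tag.
    NAMES=$(az vm list \
        --query "[?tags.garm_controller_id == '${GARM_CONTROLLER_ID}'].name" \
        -o tsv --only-show-errors)
    # The bundled Azure provider uses the VM name as its resource group
    # name as well, hence -g and -n receive the same value.
    for name in ${NAMES}; do
        az vm delete -g "${name}" -n "${name}" --yes -o none --only-show-errors
    done
}
```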
Available environment variables:


@@ -178,11 +178,21 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
return nil
}
func instanceInList(instanceName string, instances []params.Instance) bool {
for _, val := range instances {
if val.Name == instanceName {
return true
}
}
return false
}
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
poolInstanceCache := map[string][]params.Instance{}
for _, runner := range runners {
if !r.isManagedRunner(labelsFromRunner(runner)) {
log.Printf("runner %s is not managed by a pool belonging to %s", *runner.Name, r.helper.String())
@@ -238,12 +248,17 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner)
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}
// Check if the instance is still on the provider.
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from provider")
var poolInstances []params.Instance
poolInstances, ok = poolInstanceCache[pool.ID]
if !ok {
poolInstances, err = provider.ListInstances(r.ctx, pool.ID)
if err != nil {
return errors.Wrapf(err, "fetching instances for pool %s", pool.ID)
}
poolInstanceCache[pool.ID] = poolInstances
}
if !instanceInList(dbInstance.Name, poolInstances) {
// The runner instance is no longer on the provider, and it appears offline in github.
// It should be safe to force remove it.
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
@@ -470,36 +485,43 @@ func (r *basePoolManager) loop() {
return
}
default:
log.Printf("attempting to start pool manager for %s", r.helper.String())
tools, err := r.helper.FetchTools()
var failureReason string
if err != nil {
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
select {
case <-r.ctx.Done():
// daemon is shutting down.
return
case <-r.quit:
// this worker was stopped.
return
default:
log.Printf("attempting to start pool manager for %s", r.helper.String())
tools, err := r.helper.FetchTools()
var failureReason string
if err != nil {
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
}
continue
}
continue
}
r.mux.Lock()
r.tools = tools
r.mux.Unlock()
r.mux.Lock()
r.tools = tools
r.mux.Unlock()
if err := r.runnerCleanup(); err != nil {
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
if err := r.runnerCleanup(); err != nil {
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.setPoolRunningState(false, failureReason)
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
}
continue
}
continue
r.setPoolRunningState(true, "")
}
r.setPoolRunningState(true, "")
}
}
}