diff --git a/contrib/providers.d/azure/garm-external-provider b/contrib/providers.d/azure/garm-external-provider index 1e3ebff9..8a7b628c 100755 --- a/contrib/providers.d/azure/garm-external-provider +++ b/contrib/providers.d/azure/garm-external-provider @@ -226,7 +226,7 @@ function CreateInstance() { OS_VERSION=$(echo "${IMAGE_URN}" | cut -d ':' -f3) ARCH="amd64" - TAGS="garm-controller-id=${GARM_CONTROLLER_ID} garm-pool-id=${GARM_POOL_ID}" + TAGS="garm_controller_id=${GARM_CONTROLLER_ID} garm_pool_id=${GARM_POOL_ID} os_type=${OS_TYPE} os_name=${OS_NAME} os_version=${OS_VERSION} os_arch=${ARCH}" set +e @@ -298,6 +298,37 @@ function StopServer() { az vm deallocate -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors } +function GetInstance() { + local instance_id="${GARM_INSTANCE_ID}" + info=$(az vm show -d -n $instance_id -g $instance_id -o json --only-show-errors 2>&1) + echo $info | jq -r ' + { + provider_id: .name, + name: .name, + os_type: .tags.os_type, + os_name: .tags.os_name, + os_version: .tags.os_version, + os_arch: .tags.os_arch, + pool_id: .tags.garm_pool_id, + status: {"VM starting": "pending_create", "VM running": "running", "VM stopping": "stopped", "VM stopped": "stopped", "VM deallocating": "stopped", "VM deallocated": "stopped"}[.powerState] + }' +} + +function ListInstances() { + INSTANCES=$(az vm list --query "[?tags.garm_pool_id == '${GARM_POOL_ID}']" -o json --only-show-errors 2>&1) + echo $INSTANCES | jq -r ' + .[] | { + provider_id: .name, + name: .name, + os_type: .tags.os_type, + os_name: .tags.os_name, + os_version: .tags.os_version, + os_arch: .tags.os_arch, + pool_id: .tags.garm_pool_id, + status: {"Creating": "pending_create", "Migrating": "pending_create", "Failed": "error", "Succeeded": "running", "Deleting": "pending_delete"}[.provisioningState] + }' +} + # Login to Azure checkValNotNull "${AZURE_SUBSCRIPTION_ID}" "AZURE_SUBSCRIPTION_ID" checkValNotNull "${AZURE_TENANT_ID}" "AZURE_TENANT_ID" @@ -317,12 +348,10 @@ case 
"$GARM_COMMAND" in DeleteInstance ;; "GetInstance") - echo "GetInstance not implemented" - exit 1 + GetInstance ;; "ListInstances") - echo "ListInstances not implemented" - exit 1 + ListInstances ;; "StartInstance") StartInstance diff --git a/contrib/providers.d/openstack/garm-external-provider b/contrib/providers.d/openstack/garm-external-provider index 910fec9d..7045c5fb 100755 --- a/contrib/providers.d/openstack/garm-external-provider +++ b/contrib/providers.d/openstack/garm-external-provider @@ -279,7 +279,7 @@ function CreateInstance() { OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type') checkValNotNull "${OS_TYPE}" "os_type" || return $? DISTRO=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_distro') - checkValNotNull "${OS_TYPE}" "os_distro" || return $? + checkValNotNull "${DISTRO}" "os_distro" || return $? VERSION=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_version') checkValNotNull "${VERSION}" "os_version" || return $? ARCH=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.architecture') @@ -306,7 +306,8 @@ function CreateInstance() { set +e TAGS="--tag garm-controller-id=${GARM_CONTROLLER_ID} --tag garm-pool-id=${GARM_POOL_ID}" - SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}") + PROPERTIES="--property os_type=${OS_TYPE} --property os_name=${DISTRO} --property os_version=${VERSION} --property os_arch=${GH_ARCH} --property pool_id=${GARM_POOL_ID}" + SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} ${PROPERTIES} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}") if [ $? 
-ne 0 ];then openstack volume delete "${INSTANCE_NAME}" || true exit 1 @@ -394,6 +395,25 @@ function StopServer() { openstack server stop "${instance_id}" } +function ListInstances() { + INSTANCES=$(openstack server list --os-compute-api-version 2.52 --tags garm-pool-id=${GARM_POOL_ID} --long -f json) + echo ${INSTANCES} | jq -r ' + .[] | .Properties * { + provider_id: .ID, + name: .Name, + status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.Status] + }' +} + +function GetInstance() { + INSTANCE=$(openstack server show --os-compute-api-version 2.52 ${GARM_INSTANCE_ID} -f json) + echo ${INSTANCE} | jq -r '.properties * { + provider_id: .id, + name: .name, + status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.status] + }' +} + case "$GARM_COMMAND" in "CreateInstance") CreateInstance @@ -402,12 +422,10 @@ case "$GARM_COMMAND" in DeleteInstance ;; "GetInstance") - echo "GetInstance not implemented" - exit 1 + GetInstance ;; "ListInstances") - echo "ListInstances not implemented" - exit 1 + ListInstances ;; "StartInstance") StartInstance diff --git a/doc/external_provider.md b/doc/external_provider.md index 5d9ba7c2..0d2871d4 100644 --- a/doc/external_provider.md +++ b/doc/external_provider.md @@ -215,7 +215,7 @@ Then you can easily parse it. If you're using ```bash```, you can use the amazin You will have to parse the bootstrap params, verify that the requested image exists, gather operating system information, CPU architecture information and using that information, you will need to select the appropriate tools for the arch/OS combination you are deploying. -Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder. +Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder. 
Of particular interest are the [cloudconfig folders](../contrib/providers.d/openstack/cloudconfig/), where the instance user data templates are stored. These templates are used to generate the needed automation for the instances to download the github runner agent, send back status updates (including the final github runner agent ID), and download the github runner registration token from garm. ### CreateInstance outputs @@ -259,8 +259,6 @@ If the target instance does not exist in the provider, this command is expected ## GetInstance -NOTE: This operation is currently not use by ```garm```, but should be implemented. - The ```GetInstance``` command will return details about the instance, as seen by the provider. Available environment variables: @@ -275,8 +273,6 @@ On failure, this command is expected to return a non-zero exit code. ## ListInstances -NOTE: This operation is currently not use by ```garm```, but should be implemented. - The ```ListInstances``` command will print to standard output, a json that is unserializable into an **array** of ```Instance{}```. Available environment variables: @@ -293,9 +289,7 @@ On failure, a non-zero exit code is expected. ## RemoveAllInstances -NOTE: This operation is currently not use by ```garm```, but should be implemented. - -The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. +The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances. 
Available environment variables: diff --git a/runner/pool/pool.go b/runner/pool/pool.go index bcae2fdd..c6c998a0 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -178,11 +178,21 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error { return nil } +func instanceInList(instanceName string, instances []params.Instance) bool { + for _, val := range instances { + if val.Name == instanceName { + return true + } + } + return false +} + // cleanupOrphanedGithubRunners will forcefully remove any github runners that appear // as offline and for which we no longer have a local instance. // This may happen if someone manually deletes the instance in the provider. We need to // first remove the instance from github, and then from our database. func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) error { + poolInstanceCache := map[string][]params.Instance{} for _, runner := range runners { if !r.isManagedRunner(labelsFromRunner(runner)) { log.Printf("runner %s is not managed by a pool belonging to %s", *runner.Name, r.helper.String()) @@ -238,12 +248,17 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID) } - // Check if the instance is still on the provider. - _, err = provider.GetInstance(r.ctx, dbInstance.Name) - if err != nil { - if !errors.Is(err, runnerErrors.ErrNotFound) { - return errors.Wrap(err, "fetching instance from provider") + var poolInstances []params.Instance + poolInstances, ok = poolInstanceCache[pool.ID] + if !ok { + poolInstances, err = provider.ListInstances(r.ctx, pool.ID) + if err != nil { + return errors.Wrapf(err, "fetching instances for pool %s", pool.ID) } + poolInstanceCache[pool.ID] = poolInstances + } + + if !instanceInList(dbInstance.Name, poolInstances) { // The runner instance is no longer on the provider, and it appears offline in github. 
// It should be safe to force remove it. log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name) @@ -470,36 +485,43 @@ func (r *basePoolManager) loop() { return } default: - log.Printf("attempting to start pool manager for %s", r.helper.String()) - tools, err := r.helper.FetchTools() - var failureReason string - if err != nil { - failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err) - r.setPoolRunningState(false, failureReason) - log.Print(failureReason) - if errors.Is(err, runnerErrors.ErrUnauthorized) { - r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer) - } else { - r.waitForTimeoutOrCanceled(60 * time.Second) + select { + case <-r.ctx.Done(): + // daemon is shutting down. + return + case <-r.quit: + // this worker was stopped. + return + default: + log.Printf("attempting to start pool manager for %s", r.helper.String()) + tools, err := r.helper.FetchTools() + var failureReason string + if err != nil { + failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err) + r.setPoolRunningState(false, failureReason) + log.Print(failureReason) + if errors.Is(err, runnerErrors.ErrUnauthorized) { + r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer) + } else { + r.waitForTimeoutOrCanceled(60 * time.Second) + } + continue } - continue - } - r.mux.Lock() - r.tools = tools - r.mux.Unlock() + r.mux.Lock() + r.tools = tools + r.mux.Unlock() - if err := r.runnerCleanup(); err != nil { - failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err) - r.setPoolRunningState(false, failureReason) - log.Print(failureReason) - if errors.Is(err, runnerErrors.ErrUnauthorized) { - r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer) - } else { - r.waitForTimeoutOrCanceled(60 * time.Second) + if err := r.runnerCleanup(); err != nil { + failureReason = fmt.Sprintf("failed to clean runners for %s: %q", 
r.helper.String(), err) + log.Print(failureReason) + if errors.Is(err, runnerErrors.ErrUnauthorized) { + r.setPoolRunningState(false, failureReason) + r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer) + } + continue } - continue + r.setPoolRunningState(true, "") } - r.setPoolRunningState(true, "") } } }