Ensure loop closes properly and provider update
* Ensure the pool loop exits properly when the pool is not yet in a running state.
* Use ListInstances() when cleaning orphaned runners. This ensures we only run one API call per pool to list instances, instead of running a GetInstance() call for each individual instance we are checking.

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
Parent: bf844a40b3
Commit: b954038624
4 changed files with 113 additions and 50 deletions
@@ -226,7 +226,7 @@ function CreateInstance() {
OS_VERSION=$(echo "${IMAGE_URN}" | cut -d ':' -f3)
ARCH="amd64"

TAGS="garm-controller-id=${GARM_CONTROLLER_ID} garm-pool-id=${GARM_POOL_ID}"
TAGS="garm_controller_id=${GARM_CONTROLLER_ID} garm_pool_id=${GARM_POOL_ID} os_type=${OS_TYPE} os_name=${OS_NAME} os_version=${OS_VERSION} os_arch=${ARCH}"

set +e

@@ -298,6 +298,37 @@ function StopServer() {
az vm deallocate -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors
}

function GetInstance() {
local instance_id="${GARM_INSTANCE_ID}"
info=$(az vm show -d -n $instance_id -g $instance_id -o json --only-show-errors 2>&1)
echo $info | jq -r '
{
provider_id: .name,
name: .name,
os_type: .tags.os_type,
os_name: .tags.os_name,
os_version: .tags.os_version,
os_arch: .tags.os_arch,
pool_id: .tags.garm_pool_id,
status: {"VM starting": "pending_create", "VM running": "running", "VM stopping": "stopped", "VM stopped": "stopped", "VM deallocating": "stopped", "VM deallocated": "stopped"}[.powerState]
}'
}

function ListInstances() {
INSTANCES=$(az vm list --query "[?tags.garm_pool_id == '${GARM_POOL_ID}']" -o json --only-show-errors 2>&1)
echo $INSTANCES | jq -r '
.[] | {
provider_id: .name,
name: .name,
os_type: .tags.os_type,
os_name: .tags.os_name,
os_version: .tags.os_version,
os_arch: .tags.os_arch,
pool_id: .tags.garm_pool_id,
status: {"Creating": "pending_create", "Migrating": "pending_create", "Failed": "error", "Succeeded": "running", "Deleting": "pending_delete"}[.provisioningState]
}'
}

# Login to Azure
checkValNotNull "${AZURE_SUBSCRIPTION_ID}" "AZURE_SUBSCRIPTION_ID"
checkValNotNull "${AZURE_TENANT_ID}" "AZURE_TENANT_ID"

@@ -317,12 +348,10 @@ case "$GARM_COMMAND" in
DeleteInstance
;;
"GetInstance")
echo "GetInstance not implemented"
exit 1
GetInstance
;;
"ListInstances")
echo "ListInstances not implemented"
exit 1
ListInstances
;;
"StartInstance")
StartInstance

@@ -279,7 +279,7 @@ function CreateInstance() {
OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type')
checkValNotNull "${OS_TYPE}" "os_type" || return $?
DISTRO=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_distro')
checkValNotNull "${OS_TYPE}" "os_distro" || return $?
checkValNotNull "${DISTRO}" "os_distro" || return $?
VERSION=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_version')
checkValNotNull "${VERSION}" "os_version" || return $?
ARCH=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.architecture')

@@ -306,7 +306,8 @@ function CreateInstance() {
set +e

TAGS="--tag garm-controller-id=${GARM_CONTROLLER_ID} --tag garm-pool-id=${GARM_POOL_ID}"
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
PROPERTIES="--property os_type=${OS_TYPE} --property os_name=${DISTRO} --property os_version=${VERSION} --property os_arch=${GH_ARCH} --property pool_id=${GARM_POOL_ID}"
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} ${PROPERTIES} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
if [ $? -ne 0 ];then
openstack volume delete "${INSTANCE_NAME}" || true
exit 1

@@ -394,6 +395,25 @@ function StopServer() {
openstack server stop "${instance_id}"
}

function ListInstances() {
INSTANCES=$(openstack server list --os-compute-api-version 2.52 --tags garm-pool-id=${GARM_POOL_ID} --long -f json)
echo ${INSTANCES} | jq -r '
.[] | .Properties * {
provider_id: .ID,
name: .Name,
status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.Status]
}'
}

function GetInstance() {
INSTANCE=$(openstack server show --os-compute-api-version 2.52 ${GARM_INSTANCE_ID} -f json)
echo ${INSTANCE} | jq -r '.properties * {
provider_id: .id,
name: .name,
status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.status]
}'
}

case "$GARM_COMMAND" in
"CreateInstance")
CreateInstance

@@ -402,12 +422,10 @@ case "$GARM_COMMAND" in
DeleteInstance
;;
"GetInstance")
echo "GetInstance not implemented"
exit 1
GetInstance
;;
"ListInstances")
echo "ListInstances not implemented"
exit 1
ListInstances
;;
"StartInstance")
StartInstance

@@ -215,7 +215,7 @@ Then you can easily parse it. If you're using ```bash```, you can use the amazing

You will have to parse the bootstrap params, verify that the requested image exists, gather operating system information, CPU architecture information and using that information, you will need to select the appropriate tools for the arch/OS combination you are deploying.

Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder.
Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder. Of particular interest are the [cloudconfig folders](../contrib/providers.d/openstack/cloudconfig/), where the instance user data templates are stored. These templates are used to generate the needed automation for the instances to download the github runner agent, send back status updates (including the final github runner agent ID), and download the github runner registration token from garm.

### CreateInstance outputs

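To make that step more concrete, here is a minimal bash/jq sketch of such parsing. It assumes the bootstrap params arrive as a JSON document on standard input, and the field names used (`os_type`, `arch`, `tools[].architecture`, `tools[].download_url`) are illustrative placeholders rather than the exact garm schema:

```bash
#!/usr/bin/env bash
# Sketch only: field names are assumptions, not the exact garm bootstrap params schema.
BOOTSTRAP_PARAMS=$(cat -)   # assuming the params are passed as JSON on stdin

OS_TYPE=$(echo "${BOOTSTRAP_PARAMS}" | jq -r '.os_type')
ARCH=$(echo "${BOOTSTRAP_PARAMS}" | jq -r '.arch')

# Select the runner tools entry matching the OS/arch combination being deployed.
DOWNLOAD_URL=$(echo "${BOOTSTRAP_PARAMS}" | jq -r \
    --arg os "${OS_TYPE}" --arg arch "${ARCH}" \
    '.tools[] | select(.os == $os and .architecture == $arch) | .download_url')
```
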
@@ -259,8 +259,6 @@ If the target instance does not exist in the provider, this command is expected

## GetInstance

NOTE: This operation is currently not used by ```garm```, but should be implemented.

The ```GetInstance``` command will return details about the instance, as seen by the provider.

Available environment variables:

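For a quick manual check, the command can be exercised the same way the other provider commands are, by exporting the relevant variables before running the script; the script path below is a placeholder:

```bash
# Hypothetical manual invocation; only variables used elsewhere by these providers are set.
export GARM_COMMAND="GetInstance"
export GARM_INSTANCE_ID="garm-instance-01"   # example instance name

./contrib/providers.d/azure/provider.sh      # placeholder path; use the provider under test
```
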
@@ -275,8 +273,6 @@ On failure, this command is expected to return a non-zero exit code.

## ListInstances

NOTE: This operation is currently not used by ```garm```, but should be implemented.

The ```ListInstances``` command will print to standard output a json that can be deserialized into an **array** of ```Instance{}```.

Available environment variables:

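As an illustration of the expected shape, the Azure and OpenStack providers above produce entries along the following lines; every value in this sample is a made-up example:

```bash
# Example output only; values are illustrative.
cat <<'EOF'
[
  {
    "provider_id": "garm-hvETthVmXeDN",
    "name": "garm-hvETthVmXeDN",
    "os_type": "linux",
    "os_name": "ubuntu",
    "os_version": "22.04",
    "os_arch": "amd64",
    "pool_id": "0871b7e1-aaaa-bbbb-cccc-000000000000",
    "status": "running"
  }
]
EOF
```
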
@@ -293,9 +289,7 @@ On failure, a non-zero exit code is expected.

## RemoveAllInstances

NOTE: This operation is currently not used by ```garm```, but should be implemented.

The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```.
The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances.

Available environment variables:

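A minimal sketch of how an Azure-based provider could implement this, reusing the tag-query pattern shown in the provider above; the deletion command and flags are illustrative, and a real implementation would also clean up associated disks and network resources:

```bash
function RemoveAllInstances() {
    # Find every VM carrying this controller's tag (set by CreateInstance above).
    IDS=$(az vm list --query "[?tags.garm_controller_id == '${GARM_CONTROLLER_ID}'].id" -o tsv --only-show-errors)
    for id in ${IDS}; do
        az vm delete --ids "${id}" --yes --only-show-errors
    done
}
```
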
@@ -178,11 +178,21 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
return nil
}

func instanceInList(instanceName string, instances []params.Instance) bool {
for _, val := range instances {
if val.Name == instanceName {
return true
}
}
return false
}

// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
poolInstanceCache := map[string][]params.Instance{}
for _, runner := range runners {
if !r.isManagedRunner(labelsFromRunner(runner)) {
log.Printf("runner %s is not managed by a pool belonging to %s", *runner.Name, r.helper.String())

@@ -238,12 +248,17 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner)
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}

// Check if the instance is still on the provider.
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from provider")
var poolInstances []params.Instance
poolInstances, ok = poolInstanceCache[pool.ID]
if !ok {
poolInstances, err = provider.ListInstances(r.ctx, pool.ID)
if err != nil {
return errors.Wrapf(err, "fetching instances for pool %s", pool.ID)
}
poolInstanceCache[pool.ID] = poolInstances
}

if !instanceInList(dbInstance.Name, poolInstances) {
// The runner instance is no longer on the provider, and it appears offline in github.
// It should be safe to force remove it.
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)

@@ -470,36 +485,43 @@ func (r *basePoolManager) loop() {
return
}
default:
log.Printf("attempting to start pool manager for %s", r.helper.String())
tools, err := r.helper.FetchTools()
var failureReason string
if err != nil {
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
select {
case <-r.ctx.Done():
// daemon is shutting down.
return
case <-r.quit:
// this worker was stopped.
return
default:
log.Printf("attempting to start pool manager for %s", r.helper.String())
tools, err := r.helper.FetchTools()
var failureReason string
if err != nil {
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
}
continue
}
continue
}
r.mux.Lock()
r.tools = tools
r.mux.Unlock()
r.mux.Lock()
r.tools = tools
r.mux.Unlock()

if err := r.runnerCleanup(); err != nil {
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
r.setPoolRunningState(false, failureReason)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
} else {
r.waitForTimeoutOrCanceled(60 * time.Second)
if err := r.runnerCleanup(); err != nil {
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
log.Print(failureReason)
if errors.Is(err, runnerErrors.ErrUnauthorized) {
r.setPoolRunningState(false, failureReason)
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
}
continue
}
continue
r.setPoolRunningState(true, "")
}
r.setPoolRunningState(true, "")
}
}
}