Add manual runner removal

Runners can now be manually removed using the CLI. Some restrictions apply:

  * A runner must be idle in GitHub. GitHub will not allow us to remove a runner
that is running a workflow.
  * The runner status must be "running"

Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
Gabriel Adrian Samfira 2022-06-29 16:23:01 +00:00
parent 1f419d0abc
commit 5390efbaab
20 changed files with 336 additions and 120 deletions

View file

@ -58,14 +58,14 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
w.WriteHeader(http.StatusBadRequest) w.WriteHeader(http.StatusBadRequest)
json.NewEncoder(w).Encode(params.APIErrorResponse{ json.NewEncoder(w).Encode(params.APIErrorResponse{
Error: "Bad Request", Error: "Bad Request",
Details: "No pool ID specified", Details: "No runner name specified",
}) })
return return
} }
instance, err := a.r.GetInstance(ctx, instanceName) instance, err := a.r.GetInstance(ctx, instanceName)
if err != nil { if err != nil {
log.Printf("listing pools: %s", err) log.Printf("listing instances: %s", err)
handleError(w, err) handleError(w, err)
return return
} }
@ -74,6 +74,29 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
json.NewEncoder(w).Encode(instance) json.NewEncoder(w).Encode(instance)
} }
// DeleteInstanceHandler handles a request to remove a single runner instance.
//
// The runner name is taken from the "instanceName" route variable and handed to
// ForceDeleteRunner. When the variable is missing, a 400 response carrying a
// JSON params.APIErrorResponse is written. When ForceDeleteRunner fails, the
// error is logged and the response is produced by handleError. On success an
// empty 200 response is returned.
func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	instanceName, ok := mux.Vars(r)["instanceName"]
	if !ok {
		// The body below is JSON, so declare it before the status line is sent;
		// headers set after WriteHeader are ignored.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusBadRequest)
		if err := json.NewEncoder(w).Encode(params.APIErrorResponse{
			Error:   "Bad Request",
			Details: "No instance name specified",
		}); err != nil {
			// The status has already been written; all we can do is record it.
			log.Printf("failed to encode response: %s", err)
		}
		return
	}

	if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil {
		log.Printf("removing runner: %s", err)
		handleError(w, err)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
}
func (a *APIController) ListRepoInstancesHandler(w http.ResponseWriter, r *http.Request) { func (a *APIController) ListRepoInstancesHandler(w http.ResponseWriter, r *http.Request) {
ctx := r.Context() ctx := r.Context()
vars := mux.Vars(r) vars := mux.Vars(r)

View file

@ -88,7 +88,6 @@ func (a *APIController) DeletePoolByIDHandler(w http.ResponseWriter, r *http.Req
w.Header().Set("Content-Type", "application/json") w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)
} }
func (a *APIController) UpdatePoolByIDHandler(w http.ResponseWriter, r *http.Request) { func (a *APIController) UpdatePoolByIDHandler(w http.ResponseWriter, r *http.Request) {

View file

@ -78,12 +78,15 @@ func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddl
///////////// /////////////
// Runners // // Runners //
///////////// /////////////
// List runners
apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
// Get instance // Get instance
apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS") apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS")
apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS") apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS")
// Delete runner
apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS")
apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS")
// List runners
apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
///////////////////// /////////////////////
// Repos and pools // // Repos and pools //

View file

@ -41,7 +41,8 @@ function sendStatus() {
function success() { function success() {
MSG="$1" MSG="$1"
call "{\"status\": \"idle\", \"message\": \"$MSG\"}" ID=$2
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
} }
function fail() { function fail() {
@ -72,7 +73,14 @@ sendStatus "installing runner service"
sendStatus "starting service" sendStatus "starting service"
./svc.sh start || fail "failed to start service" ./svc.sh start || fail "failed to start service"
success "runner successfully installed" set +e
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
# Unless pipefail is enabled, the exit status of the pipeline above is that of
# tr, so $? cannot reveal a failed grep. Test for an empty result instead; this
# also keeps an empty value out of the JSON payload built by success().
if [ -z "$AGENT_ID" ];then
fail "failed to get agent ID"
fi
set -e
success "runner successfully installed" $AGENT_ID
` `
type InstallRunnerParams struct { type InstallRunnerParams struct {

View file

@ -151,6 +151,20 @@ func (c *Client) GetInstanceByName(instanceName string) (params.Instance, error)
return response, nil return response, nil
} }
// DeleteRunner asks the API server to remove the runner called instanceName by
// sending a DELETE request to <BaseURL>/api/v1/instances/<instanceName>.
//
// It returns nil when the server answers with a non-error status. If the
// request itself fails, the underlying error is returned wrapped with
// "sending request". If the server answers with an error status, the body is
// decoded with decodeAPIError and its Details field is reported.
func (c *Client) DeleteRunner(instanceName string) error {
	url := fmt.Sprintf("%s/api/v1/instances/%s", c.Config.BaseURL, instanceName)

	resp, err := c.client.R().Delete(url)
	if err != nil {
		// Report the request error directly. Decoding an API error body here
		// would hide the real cause behind a decode failure.
		// NOTE(review): resp may be nil or empty on this path; confirm against
		// the HTTP client in use.
		return errors.Wrap(err, "sending request")
	}

	if resp.IsError() {
		apiErr, decErr := c.decodeAPIError(resp.Body())
		if decErr != nil {
			return errors.Wrap(decErr, "sending request")
		}
		return fmt.Errorf("error deleting runner: %s", apiErr.Details)
	}
	return nil
}
func (c *Client) ListPoolInstances(poolID string) ([]params.Instance, error) { func (c *Client) ListPoolInstances(poolID string) ([]params.Instance, error) {
url := fmt.Sprintf("%s/api/v1/pools/%s/instances", c.Config.BaseURL, poolID) url := fmt.Sprintf("%s/api/v1/pools/%s/instances", c.Config.BaseURL, poolID)

View file

@ -27,6 +27,7 @@ var (
runnerRepository string runnerRepository string
runnerOrganization string runnerOrganization string
runnerAll bool runnerAll bool
forceRemove bool
) )
// runnerCmd represents the runner command // runnerCmd represents the runner command
@ -133,15 +134,54 @@ var runnerShowCmd = &cobra.Command{
}, },
} }
// runnerDeleteCmd is the "delete" sub-command (aliases: remove, rm, del).
// It expects the runner name as the first positional argument and refuses to
// act unless the force-remove flag variable is set. The actual removal is
// delegated to cli.DeleteRunner.
var runnerDeleteCmd = &cobra.Command{
	Use:     "delete",
	Short:   "Remove a runner",
	Aliases: []string{"remove", "rm", "del"},
	Long: `Remove a runner.
This command deletes an existing runner. If it registered in Github
and we recorded an agent ID for it, we will attempt to remove it from
Github first, then mark the runner as pending_delete so it will be
cleaned up by the provider.
NOTE: An active runner cannot be removed from Github. You will have
to either cancel the workflow or wait for it to finish.
`,
	SilenceUsage: true,
	RunE: func(_ *cobra.Command, args []string) error {
		// Preconditions are checked in order; the first one that fails wins.
		switch {
		case needsInit:
			return needsInitError
		case len(args) == 0:
			return fmt.Errorf("requires a runner name")
		case !forceRemove:
			return fmt.Errorf("use --force-remove-runner=true to remove a runner")
		}

		return cli.DeleteRunner(args[0])
	},
}
func init() { func init() {
runnerListCmd.Flags().StringVarP(&runnerRepository, "repo", "r", "", "List all runners from all pools within this repository.") runnerListCmd.Flags().StringVarP(&runnerRepository, "repo", "r", "", "List all runners from all pools within this repository.")
runnerListCmd.Flags().StringVarP(&runnerOrganization, "org", "o", "", "List all runners from all pools withing this organization.") runnerListCmd.Flags().StringVarP(&runnerOrganization, "org", "o", "", "List all runners from all pools withing this organization.")
runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.") runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.")
runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "all") runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "all")
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner")
runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner")
runnerCmd.AddCommand( runnerCmd.AddCommand(
runnerListCmd, runnerListCmd,
runnerShowCmd, runnerShowCmd,
runnerDeleteCmd,
) )
rootCmd.AddCommand(runnerCmd) rootCmd.AddCommand(runnerCmd)

View file

@ -24,7 +24,8 @@ function sendStatus() {
function success() { function success() {
MSG="$1" MSG="$1"
call "{\"status\": \"idle\", \"message\": \"$MSG\"}" ID=$2
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
} }
function fail() { function fail() {
@ -57,5 +58,11 @@ sendStatus "installing runner service"
sendStatus "starting service" sendStatus "starting service"
./svc.sh start || fail "failed to start service" ./svc.sh start || fail "failed to start service"
success "runner successfully installed" set +e
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
# Unless pipefail is enabled, the exit status of the pipeline above is that of
# tr, so $? cannot reveal a failed grep. Test for an empty result instead; this
# also keeps an empty value out of the JSON payload built by success().
if [ -z "$AGENT_ID" ];then
fail "failed to get agent ID"
fi
set -e
success "runner successfully installed" $AGENT_ID

View file

@ -24,7 +24,8 @@ function sendStatus() {
function success() { function success() {
MSG="$1" MSG="$1"
call "{\"status\": \"idle\", \"message\": \"$MSG\"}" ID=$2
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
} }
function fail() { function fail() {
@ -57,5 +58,11 @@ sendStatus "installing runner service"
sendStatus "starting service" sendStatus "starting service"
./svc.sh start || fail "failed to start service" ./svc.sh start || fail "failed to start service"
success "runner successfully installed" set +e
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
# Unless pipefail is enabled, the exit status of the pipeline above is that of
# tr, so $? cannot reveal a failed grep. Test for an empty result instead; this
# also keeps an empty value out of the JSON payload built by success().
if [ -z "$AGENT_ID" ];then
fail "failed to get agent ID"
fi
set -e
success "runner successfully installed" $AGENT_ID

View file

@ -159,6 +159,10 @@ func (s *sqlDatabase) UpdateInstance(ctx context.Context, instanceID string, par
return params.Instance{}, errors.Wrap(err, "updating instance") return params.Instance{}, errors.Wrap(err, "updating instance")
} }
if param.AgentID != 0 {
instance.AgentID = param.AgentID
}
if param.ProviderID != "" { if param.ProviderID != "" {
instance.ProviderID = &param.ProviderID instance.ProviderID = &param.ProviderID
} }

View file

@ -116,6 +116,7 @@ type Instance struct {
ProviderID *string `gorm:"uniqueIndex"` ProviderID *string `gorm:"uniqueIndex"`
Name string `gorm:"uniqueIndex"` Name string `gorm:"uniqueIndex"`
AgentID int64
OSType config.OSType OSType config.OSType
OSArch config.OSArch OSArch config.OSArch
OSName string OSName string

View file

@ -31,6 +31,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance {
ret := params.Instance{ ret := params.Instance{
ID: instance.ID.String(), ID: instance.ID.String(),
ProviderID: id, ProviderID: id,
AgentID: instance.AgentID,
Name: instance.Name, Name: instance.Name,
OSType: instance.OSType, OSType: instance.OSType,
OSName: instance.OSName, OSName: instance.OSName,
@ -42,6 +43,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance {
CallbackURL: instance.CallbackURL, CallbackURL: instance.CallbackURL,
StatusMessages: []params.StatusMessage{}, StatusMessages: []params.StatusMessage{},
CreateAttempt: instance.CreateAttempt, CreateAttempt: instance.CreateAttempt,
UpdatedAt: instance.UpdatedAt,
} }
if len(instance.ProviderFault) > 0 { if len(instance.ProviderFault) > 0 {

View file

@ -29,7 +29,8 @@ var (
// ErrBadRequest is returned is a malformed request is sent // ErrBadRequest is returned is a malformed request is sent
ErrBadRequest = NewBadRequestError("invalid request") ErrBadRequest = NewBadRequestError("invalid request")
// ErrTimeout is returned when a timeout occurs. // ErrTimeout is returned when a timeout occurs.
ErrTimeout = fmt.Errorf("timed out") ErrTimeout = fmt.Errorf("timed out")
ErrUnprocessable = fmt.Errorf("cannot process request")
) )
type baseError struct { type baseError struct {

View file

@ -47,6 +47,8 @@ type Instance struct {
// with the compute instance. We use this to identify the // with the compute instance. We use this to identify the
// instance in the provider. // instance in the provider.
ProviderID string `json:"provider_id,omitempty"` ProviderID string `json:"provider_id,omitempty"`
// AgentID is the github runner agent ID.
AgentID int64 `json:"agent_id"`
// Name is the name associated with an instance. Depending on // Name is the name associated with an instance. Depending on
// the provider, this may or may not be useful in the context of // the provider, this may or may not be useful in the context of
// the provider, but we can use it internally to identify the // the provider, but we can use it internally to identify the
@ -73,8 +75,9 @@ type Instance struct {
StatusMessages []StatusMessage `json:"status_messages,omitempty"` StatusMessages []StatusMessage `json:"status_messages,omitempty"`
// Do not serialize sensitive info. // Do not serialize sensitive info.
CallbackURL string `json:"-"` CallbackURL string `json:"-"`
CreateAttempt int `json:"-"` CreateAttempt int `json:"-"`
UpdatedAt time.Time `json:"updated_at"`
} }
type BootstrapInstance struct { type BootstrapInstance struct {

View file

@ -153,6 +153,7 @@ type UpdateInstanceParams struct {
Status common.InstanceStatus `json:"status,omitempty"` Status common.InstanceStatus `json:"status,omitempty"`
RunnerStatus common.RunnerStatus `json:"runner_status,omitempty"` RunnerStatus common.RunnerStatus `json:"runner_status,omitempty"`
ProviderFault []byte `json:"provider_fault,omitempty"` ProviderFault []byte `json:"provider_fault,omitempty"`
AgentID int64 `json:"-"`
CreateAttempt int `json:"-"` CreateAttempt int `json:"-"`
} }
@ -186,4 +187,5 @@ type UpdateRepositoryParams struct {
type InstanceUpdateMessage struct { type InstanceUpdateMessage struct {
Status common.RunnerStatus `json:"status"` Status common.RunnerStatus `json:"status"`
Message string `json:"message"` Message string `json:"message"`
AgentID *int64 `json:"agent_id"`
} }

View file

@ -26,10 +26,11 @@ const (
) )
type PoolManager interface { type PoolManager interface {
ID() string
WebhookSecret() string WebhookSecret() string
HandleWorkflowJob(job params.WorkflowJob) error HandleWorkflowJob(job params.WorkflowJob) error
RefreshState(param params.UpdatePoolStateParams) error RefreshState(param params.UpdatePoolStateParams) error
ID() string ForceDeleteRunner(runner params.Instance) error
// AddPool(ctx context.Context, pool params.Pool) error // AddPool(ctx context.Context, pool params.Pool) error
// PoolManager lifecycle functions. Start/stop pool. // PoolManager lifecycle functions. Start/stop pool.

View file

@ -24,6 +24,7 @@ import (
"garm/runner/common" "garm/runner/common"
providerCommon "garm/runner/providers/common" providerCommon "garm/runner/providers/common"
"log" "log"
"net/http"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -69,11 +70,6 @@ type basePool struct {
// If we were offline and did not process the webhook, the instance will linger. // If we were offline and did not process the webhook, the instance will linger.
// We need to remove it from the provider and database. // We need to remove it from the provider and database.
func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) error { func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) error {
// runners, err := r.getGithubRunners()
// if err != nil {
// return errors.Wrap(err, "fetching github runners")
// }
dbInstances, err := r.helper.FetchDbInstances() dbInstances, err := r.helper.FetchDbInstances()
if err != nil { if err != nil {
return errors.Wrap(err, "fetching instances from db") return errors.Wrap(err, "fetching instances from db")
@ -85,13 +81,25 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
} }
for _, instance := range dbInstances { for _, instance := range dbInstances {
if providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingCreate || providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingDelete { switch providerCommon.InstanceStatus(instance.Status) {
case providerCommon.InstancePendingCreate,
providerCommon.InstancePendingDelete:
// this instance is in the process of being created or is awaiting deletion. // this instance is in the process of being created or is awaiting deletion.
// Instances in pending_Create did not get a chance to register themselves in, // Instances in pending_create did not get a chance to register themselves in,
// github so we let them be for now. // github so we let them be for now.
continue continue
} }
if ok := runnerNames[instance.Name]; !ok { if ok := runnerNames[instance.Name]; !ok {
// if instance.Status == providerCommon.InstanceRunning {
// if time.Since(instance.UpdatedAt).Minutes() < 20 {
// // Allow up to 20 minutes for instance to finish installing.
// // Anything beyond that is considered a timeout and the instance
// // is marked for deletion.
// // TODO(gabriel-samfira): Make the timeout configurable.
// continue
// }
// }
// Set pending_delete on DB field. Allow consolidate() to remove it. // Set pending_delete on DB field. Allow consolidate() to remove it.
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name) log.Printf("failed to update runner %s status", instance.Name)
@ -102,6 +110,95 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
return nil return nil
} }
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
//
// For every runner in runners whose status is "offline":
//   - no database entry exists: the runner is removed from github;
//   - the database entry is already pending_delete: nothing is done;
//   - the provider no longer has the instance: the runner is removed from
//     github and the database entry is deleted;
//   - the instance exists and is marked running: the mismatch is only logged;
//   - otherwise the instance is started through the provider.
//
// A 404 from github while removing a runner is treated as "already removed".
// Any other failure stops the loop and is returned to the caller.
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
	for _, runner := range runners {
		if runner.GetStatus() != "offline" {
			// Runner is online. Ignore it.
			continue
		}

		if runner.Name == nil || runner.ID == nil {
			// Name and ID are pointers; without both we can neither look the
			// runner up nor remove it, and dereferencing them would panic.
			log.Printf("skipping github runner with missing name or ID")
			continue
		}
		runnerName, runnerID := *runner.Name, *runner.ID

		dbInstance, err := r.store.GetInstanceByName(r.ctx, runnerName)
		if err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				return errors.Wrap(err, "fetching instance from DB")
			}
			// We no longer have a DB entry for this instance, and the runner appears offline in github.
			// Previous forceful removal may have failed?
			log.Printf("Runner %s has no database entry in garm, removing from github", runnerName)
			resp, err := r.helper.RemoveGithubRunner(runnerID)
			if err != nil {
				// Removed in the meantime?
				if resp != nil && resp.StatusCode == http.StatusNotFound {
					continue
				}
				return errors.Wrap(err, "removing runner")
			}
			continue
		}

		if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
			// already marked for deleting, which means the github workflow finished.
			// Let consolidate take care of it.
			continue
		}

		pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
		if err != nil {
			return errors.Wrap(err, "fetching pool")
		}

		// check if the provider still has the instance.
		provider, ok := r.providers[pool.ProviderName]
		if !ok {
			return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
		}

		// Check if the instance is still on the provider.
		_, err = provider.GetInstance(r.ctx, dbInstance.Name)
		if err != nil {
			if !errors.Is(err, runnerErrors.ErrNotFound) {
				return errors.Wrap(err, "fetching instance from provider")
			}
			// The runner instance is no longer on the provider, and it appears offline in github.
			// It should be safe to force remove it.
			log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
			resp, err := r.helper.RemoveGithubRunner(runnerID)
			if err != nil {
				// Removed in the meantime?
				if resp != nil && resp.StatusCode == http.StatusNotFound {
					log.Printf("runner dissapeared from github")
				} else {
					return errors.Wrap(err, "removing runner from github")
				}
			}
			// Remove the database entry for the runner.
			log.Printf("Removing %s from database", dbInstance.Name)
			if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
				return errors.Wrap(err, "removing runner from database")
			}
			continue
		}

		if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
			// instance is running, but github reports runner as offline. Log the event.
			// This scenario requires manual intervention.
			// Perhaps it just came online and github did not yet change it's status?
			log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
			continue
		}

		// The instance exists on the provider but is not marked running: start it.
		if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
			return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
		}
	}
	return nil
}
func (r *basePool) fetchInstance(runnerName string) (params.Instance, error) { func (r *basePool) fetchInstance(runnerName string) (params.Instance, error) {
runner, err := r.store.GetInstanceByName(r.ctx, runnerName) runner, err := r.store.GetInstanceByName(r.ctx, runnerName)
if err != nil { if err != nil {
@ -363,18 +460,18 @@ func (r *basePool) HandleWorkflowJob(job params.WorkflowJob) error {
log.Printf("no runner was assigned. Skipping.") log.Printf("no runner was assigned. Skipping.")
return nil return nil
} }
// update instance workload state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
}
log.Printf("marking instance %s as pending_delete", job.WorkflowJob.RunnerName) log.Printf("marking instance %s as pending_delete", job.WorkflowJob.RunnerName)
if err := r.setInstanceStatus(job.WorkflowJob.RunnerName, providerCommon.InstancePendingDelete, nil); err != nil { if err := r.setInstanceStatus(job.WorkflowJob.RunnerName, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner") return errors.Wrap(err, "updating runner")
} }
// update instance workload state. Set job_id in instance state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner")
}
case "in_progress": case "in_progress":
// update instance workload state. Set job_id in instance state. // update instance workload state.
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerActive); err != nil { if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerActive); err != nil {
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName) log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
return errors.Wrap(err, "updating runner") return errors.Wrap(err, "updating runner")
@ -523,84 +620,6 @@ func (r *basePool) ensureMinIdleRunners() {
wg.Wait() wg.Wait()
} }
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
// as offline and for which we no longer have a local instance.
// This may happen if someone manually deletes the instance in the provider. We need to
// first remove the instance from github, and then from our database.
//
// Only runners whose status is "offline" are considered. Any error other than
// a "not found" lookup result stops the loop and is returned to the caller.
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
for _, runner := range runners {
status := runner.GetStatus()
if status != "offline" {
// Runner is online. Ignore it.
continue
}
// Look the runner up in our database by its github name.
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from DB")
}
// We no longer have a DB entry for this instance, and the runner appears offline in github.
// Previous forceful removal may have failed?
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
// already marked for deleting, which means the github workflow finished.
// Let consolidate take care of it.
continue
}
// The pool record tells us which provider owns the instance.
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
if err != nil {
return errors.Wrap(err, "fetching pool")
}
// check if the provider still has the instance.
provider, ok := r.providers[pool.ProviderName]
if !ok {
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
}
// Check if the instance is still on the provider.
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
if err != nil {
if !errors.Is(err, runnerErrors.ErrNotFound) {
return errors.Wrap(err, "fetching instance from provider")
}
// The runner instance is no longer on the provider, and it appears offline in github.
// It should be safe to force remove it.
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
return errors.Wrap(err, "removing runner from github")
}
// Remove the database entry for the runner.
log.Printf("Removing %s from database", dbInstance.Name)
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
return errors.Wrap(err, "removing runner from database")
}
continue
}
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
// instance is running, but github reports runner as offline. Log the event.
// This scenario requires manual intervention.
// Perhaps it just came online and github did not yet change it's status?
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
continue
}
// The instance exists on the provider but is not marked running: start it.
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
}
}
return nil
}
func (r *basePool) deleteInstanceFromProvider(instance params.Instance) error { func (r *basePool) deleteInstanceFromProvider(instance params.Instance) error {
pool, err := r.helper.GetPoolByID(instance.PoolID) pool, err := r.helper.GetPoolByID(instance.PoolID)
if err != nil { if err != nil {
@ -657,19 +676,51 @@ func (r *basePool) deletePendingInstances() {
if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceDeleting, nil); err != nil { if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceDeleting, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name) log.Printf("failed to update runner %s status", instance.Name)
} }
go func(instance params.Instance) { go func(instance params.Instance) (err error) {
if err := r.deleteInstanceFromProvider(instance); err != nil { defer func(instance params.Instance) {
// failed to remove from provider. Set the status back to pending_delete, which if err != nil {
// will retry the operation. // failed to remove from provider. Set the status back to pending_delete, which
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil { // will retry the operation.
log.Printf("failed to update runner %s status", instance.Name) if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
log.Printf("failed to update runner %s status", instance.Name)
}
} }
}(instance)
err = r.deleteInstanceFromProvider(instance)
if err != nil {
log.Printf("failed to delete instance from provider: %+v", err) log.Printf("failed to delete instance from provider: %+v", err)
} }
return
}(instance) }(instance)
} }
} }
// ForceDeleteRunner removes runner from github, when an agent ID was recorded
// for it, and then marks the instance as pending_delete so that it is cleaned
// up later.
//
// Errors from the github removal are mapped by response status:
//   - 404: the runner is already gone from github; this is logged and the
//     instance is still marked pending_delete;
//   - 422: returned wrapped in runnerErrors.ErrUnprocessable (presumably the
//     runner is still busy with a workflow; github refuses to remove those);
//   - anything else, or no response at all: the error is returned wrapped.
//
// A runner with AgentID 0 skips the github step entirely.
func (r *basePool) ForceDeleteRunner(runner params.Instance) error {
	if runner.AgentID != 0 {
		resp, err := r.helper.RemoveGithubRunner(runner.AgentID)
		if err != nil {
			statusCode := 0
			if resp != nil {
				statusCode = resp.StatusCode
			}

			switch statusCode {
			case http.StatusNotFound:
				// Nothing left to remove on the github side. Failing here
				// would make it impossible to delete the instance manually.
				log.Printf("runner %s (agent ID %d) is no longer registered in github", runner.Name, runner.AgentID)
			case http.StatusUnprocessableEntity:
				return errors.Wrapf(runnerErrors.ErrUnprocessable, "removing runner: %q", err)
			default:
				return errors.Wrap(err, "removing runner")
			}
		}
	}

	if err := r.setInstanceStatus(runner.Name, providerCommon.InstancePendingDelete, nil); err != nil {
		log.Printf("failed to update runner %s status", runner.Name)
		return errors.Wrap(err, "updating runner")
	}
	return nil
}
func (r *basePool) addPendingInstances() { func (r *basePool) addPendingInstances() {
// TODO: filter instances by status. // TODO: filter instances by status.
instances, err := r.helper.FetchDbInstances() instances, err := r.helper.FetchDbInstances()

View file

@ -25,7 +25,7 @@ type poolHelper interface {
GetGithubRunners() ([]*github.Runner, error) GetGithubRunners() ([]*github.Runner, error)
FetchTools() ([]*github.RunnerApplicationDownload, error) FetchTools() ([]*github.RunnerApplicationDownload, error)
FetchDbInstances() ([]params.Instance, error) FetchDbInstances() ([]params.Instance, error)
RemoveGithubRunner(runnerID int64) error RemoveGithubRunner(runnerID int64) (*github.Response, error)
ListPools() ([]params.Pool, error) ListPools() ([]params.Pool, error)
GithubURL() string GithubURL() string
JwtToken() string JwtToken() string

View file

@ -112,9 +112,8 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) {
return r.store.ListOrgInstances(r.ctx, r.id) return r.store.ListOrgInstances(r.ctx, r.id)
} }
func (r *organization) RemoveGithubRunner(runnerID int64) error { func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
_, err := r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID) return r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
return errors.Wrap(err, "removing runner")
} }
func (r *organization) ListPools() ([]params.Pool, error) { func (r *organization) ListPools() ([]params.Pool, error) {

View file

@ -114,9 +114,8 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) {
return r.store.ListRepoInstances(r.ctx, r.id) return r.store.ListRepoInstances(r.ctx, r.id)
} }
func (r *repository) RemoveGithubRunner(runnerID int64) error { func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
_, err := r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID) return r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
return errors.Wrap(err, "removing runner")
} }
func (r *repository) ListPools() ([]params.Pool, error) { func (r *repository) ListPools() ([]params.Pool, error) {

View file

@ -39,6 +39,7 @@ import (
"garm/params" "garm/params"
"garm/runner/common" "garm/runner/common"
"garm/runner/providers" "garm/runner/providers"
providerCommon "garm/runner/providers/common"
"garm/util" "garm/util"
"github.com/pkg/errors" "github.com/pkg/errors"
@ -538,9 +539,60 @@ func (r *Runner) AddInstanceStatusMessage(ctx context.Context, param params.Inst
RunnerStatus: param.Status, RunnerStatus: param.Status,
} }
if param.AgentID != nil {
updateParams.AgentID = *param.AgentID
}
if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil { if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil {
return errors.Wrap(err, "updating runner state") return errors.Wrap(err, "updating runner state")
} }
return nil return nil
} }
// ForceDeleteRunner removes the runner named instanceName on behalf of an
// admin user.
//
// The instance is looked up in the store and must be in the running state.
// The pool manager of the repository or organization that owns the instance's
// pool is then asked to force-delete it.
//
// It returns runnerErrors.ErrUnauthorized when ctx does not belong to an admin,
// a bad request error when the instance is not running, and a wrapped error
// when any lookup or the removal itself fails.
func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error {
	if !auth.IsAdmin(ctx) {
		return runnerErrors.ErrUnauthorized
	}

	instance, err := r.store.GetInstanceByName(ctx, instanceName)
	if err != nil {
		return errors.Wrap(err, "fetching instance")
	}

	if instance.Status != providerCommon.InstanceRunning {
		return runnerErrors.NewBadRequestError("runner must be in %q state", providerCommon.InstanceRunning)
	}

	pool, err := r.store.GetPoolByID(ctx, instance.PoolID)
	if err != nil {
		return errors.Wrap(err, "fetching pool")
	}

	var poolMgr common.PoolManager
	switch {
	case pool.RepoID != "":
		repo, err := r.store.GetRepositoryByID(ctx, pool.RepoID)
		if err != nil {
			return errors.Wrap(err, "fetching repo")
		}
		poolMgr, err = r.findRepoPoolManager(repo.Owner, repo.Name)
		if err != nil {
			return errors.Wrapf(err, "fetching pool manager for repo %s", pool.RepoName)
		}
	case pool.OrgID != "":
		org, err := r.store.GetOrganizationByID(ctx, pool.OrgID)
		if err != nil {
			return errors.Wrap(err, "fetching org")
		}
		poolMgr, err = r.findOrgPoolManager(org.Name)
		if err != nil {
			return errors.Wrapf(err, "fetching pool manager for org %s", pool.OrgName)
		}
	default:
		// Without an owner there is no pool manager to ask; calling a method
		// on the nil interface below would panic.
		return errors.Errorf("pool %s is not associated with a repository or an organization", pool.ID)
	}

	if err := poolMgr.ForceDeleteRunner(instance); err != nil {
		return errors.Wrap(err, "removing runner")
	}
	return nil
}