Add manual runner removal
Runners can now be manually removed using the CLI. Some restrictions apply: * A runner must be idle in github. Github will not allow us to remove a runner that is running a workflow. * The runner status must be "running" Signed-off-by: Gabriel Adrian Samfira <gsamfira@cloudbasesolutions.com>
This commit is contained in:
parent
1f419d0abc
commit
5390efbaab
20 changed files with 336 additions and 120 deletions
|
|
@ -58,14 +58,14 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
|
|||
w.WriteHeader(http.StatusBadRequest)
|
||||
json.NewEncoder(w).Encode(params.APIErrorResponse{
|
||||
Error: "Bad Request",
|
||||
Details: "No pool ID specified",
|
||||
Details: "No runner name specified",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
instance, err := a.r.GetInstance(ctx, instanceName)
|
||||
if err != nil {
|
||||
log.Printf("listing pools: %s", err)
|
||||
log.Printf("listing instances: %s", err)
|
||||
handleError(w, err)
|
||||
return
|
||||
}
|
||||
|
|
@ -74,6 +74,29 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques
|
|||
json.NewEncoder(w).Encode(instance)
|
||||
}
|
||||
|
||||
func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Request) {
|
||||
ctx := r.Context()
|
||||
vars := mux.Vars(r)
|
||||
instanceName, ok := vars["instanceName"]
|
||||
if !ok {
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
json.NewEncoder(w).Encode(params.APIErrorResponse{
|
||||
Error: "Bad Request",
|
||||
Details: "No instance name specified",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if err := a.r.ForceDeleteRunner(ctx, instanceName); err != nil {
|
||||
log.Printf("removing runner: %s", err)
|
||||
handleError(w, err)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
func (a *APIController) ListRepoInstancesHandler(w http.ResponseWriter, r *http.Request) {
|
||||
ctx := r.Context()
|
||||
vars := mux.Vars(r)
|
||||
|
|
|
|||
|
|
@ -88,7 +88,6 @@ func (a *APIController) DeletePoolByIDHandler(w http.ResponseWriter, r *http.Req
|
|||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
}
|
||||
|
||||
func (a *APIController) UpdatePoolByIDHandler(w http.ResponseWriter, r *http.Request) {
|
||||
|
|
|
|||
|
|
@ -78,12 +78,15 @@ func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddl
|
|||
/////////////
|
||||
// Runners //
|
||||
/////////////
|
||||
// List runners
|
||||
apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
|
||||
apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
|
||||
// Get instance
|
||||
apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS")
|
||||
apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.GetInstanceHandler))).Methods("GET", "OPTIONS")
|
||||
// Delete runner
|
||||
apiRouter.Handle("/instances/{instanceName}/", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS")
|
||||
apiRouter.Handle("/instances/{instanceName}", log(os.Stdout, http.HandlerFunc(han.DeleteInstanceHandler))).Methods("DELETE", "OPTIONS")
|
||||
// List runners
|
||||
apiRouter.Handle("/instances/", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
|
||||
apiRouter.Handle("/instances", log(os.Stdout, http.HandlerFunc(han.ListAllInstancesHandler))).Methods("GET", "OPTIONS")
|
||||
|
||||
/////////////////////
|
||||
// Repos and pools //
|
||||
|
|
|
|||
|
|
@ -41,7 +41,8 @@ function sendStatus() {
|
|||
|
||||
function success() {
|
||||
MSG="$1"
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\"}"
|
||||
ID=$2
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
|
||||
}
|
||||
|
||||
function fail() {
|
||||
|
|
@ -72,7 +73,14 @@ sendStatus "installing runner service"
|
|||
sendStatus "starting service"
|
||||
./svc.sh start || fail "failed to start service"
|
||||
|
||||
success "runner successfully installed"
|
||||
set +e
|
||||
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
|
||||
if [ $? -ne 0 ];then
|
||||
fail "failed to get agent ID"
|
||||
fi
|
||||
set -e
|
||||
|
||||
success "runner successfully installed" $AGENT_ID
|
||||
`
|
||||
|
||||
type InstallRunnerParams struct {
|
||||
|
|
|
|||
|
|
@ -151,6 +151,20 @@ func (c *Client) GetInstanceByName(instanceName string) (params.Instance, error)
|
|||
return response, nil
|
||||
}
|
||||
|
||||
func (c *Client) DeleteRunner(instanceName string) error {
|
||||
url := fmt.Sprintf("%s/api/v1/instances/%s", c.Config.BaseURL, instanceName)
|
||||
resp, err := c.client.R().
|
||||
Delete(url)
|
||||
if err != nil || resp.IsError() {
|
||||
apiErr, decErr := c.decodeAPIError(resp.Body())
|
||||
if decErr != nil {
|
||||
return errors.Wrap(decErr, "sending request")
|
||||
}
|
||||
return fmt.Errorf("error deleting runner: %s", apiErr.Details)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) ListPoolInstances(poolID string) ([]params.Instance, error) {
|
||||
url := fmt.Sprintf("%s/api/v1/pools/%s/instances", c.Config.BaseURL, poolID)
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ var (
|
|||
runnerRepository string
|
||||
runnerOrganization string
|
||||
runnerAll bool
|
||||
forceRemove bool
|
||||
)
|
||||
|
||||
// runnerCmd represents the runner command
|
||||
|
|
@ -133,15 +134,54 @@ var runnerShowCmd = &cobra.Command{
|
|||
},
|
||||
}
|
||||
|
||||
var runnerDeleteCmd = &cobra.Command{
|
||||
Use: "delete",
|
||||
Short: "Remove a runner",
|
||||
Aliases: []string{"remove", "rm", "del"},
|
||||
Long: `Remove a runner.
|
||||
|
||||
This command deletes an existing runner. If it registered in Github
|
||||
and we recorded an agent ID for it, we will attempt to remove it from
|
||||
Github first, then mark the runner as pending_delete so it will be
|
||||
cleaned up by the provider.
|
||||
|
||||
NOTE: An active runner cannot be removed from Github. You will have
|
||||
to either cancel the workflow or wait for it to finish.
|
||||
`,
|
||||
SilenceUsage: true,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
if needsInit {
|
||||
return needsInitError
|
||||
}
|
||||
|
||||
if len(args) == 0 {
|
||||
return fmt.Errorf("requires a runner name")
|
||||
}
|
||||
|
||||
if !forceRemove {
|
||||
return fmt.Errorf("use --force-remove-runner=true to remove a runner")
|
||||
}
|
||||
|
||||
if err := cli.DeleteRunner(args[0]); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
runnerListCmd.Flags().StringVarP(&runnerRepository, "repo", "r", "", "List all runners from all pools within this repository.")
|
||||
runnerListCmd.Flags().StringVarP(&runnerOrganization, "org", "o", "", "List all runners from all pools withing this organization.")
|
||||
runnerListCmd.Flags().BoolVarP(&runnerAll, "all", "a", false, "List all runners, regardless of org or repo.")
|
||||
runnerListCmd.MarkFlagsMutuallyExclusive("repo", "org", "all")
|
||||
|
||||
runnerDeleteCmd.Flags().BoolVarP(&forceRemove, "force-remove-runner", "f", false, "Confirm you want to delete a runner")
|
||||
runnerDeleteCmd.MarkFlagsMutuallyExclusive("force-remove-runner")
|
||||
|
||||
runnerCmd.AddCommand(
|
||||
runnerListCmd,
|
||||
runnerShowCmd,
|
||||
runnerDeleteCmd,
|
||||
)
|
||||
|
||||
rootCmd.AddCommand(runnerCmd)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,8 @@ function sendStatus() {
|
|||
|
||||
function success() {
|
||||
MSG="$1"
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\"}"
|
||||
ID=$2
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
|
||||
}
|
||||
|
||||
function fail() {
|
||||
|
|
@ -57,5 +58,11 @@ sendStatus "installing runner service"
|
|||
sendStatus "starting service"
|
||||
./svc.sh start || fail "failed to start service"
|
||||
|
||||
success "runner successfully installed"
|
||||
set +e
|
||||
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
|
||||
if [ $? -ne 0 ];then
|
||||
fail "failed to get agent ID"
|
||||
fi
|
||||
set -e
|
||||
|
||||
success "runner successfully installed" $AGENT_ID
|
||||
|
|
|
|||
|
|
@ -24,7 +24,8 @@ function sendStatus() {
|
|||
|
||||
function success() {
|
||||
MSG="$1"
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\"}"
|
||||
ID=$2
|
||||
call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}"
|
||||
}
|
||||
|
||||
function fail() {
|
||||
|
|
@ -57,5 +58,11 @@ sendStatus "installing runner service"
|
|||
sendStatus "starting service"
|
||||
./svc.sh start || fail "failed to start service"
|
||||
|
||||
success "runner successfully installed"
|
||||
set +e
|
||||
AGENT_ID=$(grep "agentId" /home/{{ .RunnerUsername }}/actions-runner/.runner | tr -d -c 0-9)
|
||||
if [ $? -ne 0 ];then
|
||||
fail "failed to get agent ID"
|
||||
fi
|
||||
set -e
|
||||
|
||||
success "runner successfully installed" $AGENT_ID
|
||||
|
|
|
|||
|
|
@ -159,6 +159,10 @@ func (s *sqlDatabase) UpdateInstance(ctx context.Context, instanceID string, par
|
|||
return params.Instance{}, errors.Wrap(err, "updating instance")
|
||||
}
|
||||
|
||||
if param.AgentID != 0 {
|
||||
instance.AgentID = param.AgentID
|
||||
}
|
||||
|
||||
if param.ProviderID != "" {
|
||||
instance.ProviderID = ¶m.ProviderID
|
||||
}
|
||||
|
|
|
|||
|
|
@ -116,6 +116,7 @@ type Instance struct {
|
|||
|
||||
ProviderID *string `gorm:"uniqueIndex"`
|
||||
Name string `gorm:"uniqueIndex"`
|
||||
AgentID int64
|
||||
OSType config.OSType
|
||||
OSArch config.OSArch
|
||||
OSName string
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance {
|
|||
ret := params.Instance{
|
||||
ID: instance.ID.String(),
|
||||
ProviderID: id,
|
||||
AgentID: instance.AgentID,
|
||||
Name: instance.Name,
|
||||
OSType: instance.OSType,
|
||||
OSName: instance.OSName,
|
||||
|
|
@ -42,6 +43,7 @@ func (s *sqlDatabase) sqlToParamsInstance(instance Instance) params.Instance {
|
|||
CallbackURL: instance.CallbackURL,
|
||||
StatusMessages: []params.StatusMessage{},
|
||||
CreateAttempt: instance.CreateAttempt,
|
||||
UpdatedAt: instance.UpdatedAt,
|
||||
}
|
||||
|
||||
if len(instance.ProviderFault) > 0 {
|
||||
|
|
|
|||
|
|
@ -29,7 +29,8 @@ var (
|
|||
// ErrBadRequest is returned is a malformed request is sent
|
||||
ErrBadRequest = NewBadRequestError("invalid request")
|
||||
// ErrTimeout is returned when a timeout occurs.
|
||||
ErrTimeout = fmt.Errorf("timed out")
|
||||
ErrTimeout = fmt.Errorf("timed out")
|
||||
ErrUnprocessable = fmt.Errorf("cannot process request")
|
||||
)
|
||||
|
||||
type baseError struct {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@ type Instance struct {
|
|||
// with the compute instance. We use this to identify the
|
||||
// instance in the provider.
|
||||
ProviderID string `json:"provider_id,omitempty"`
|
||||
// AgentID is the github runner agent ID.
|
||||
AgentID int64 `json:"agent_id"`
|
||||
// Name is the name associated with an instance. Depending on
|
||||
// the provider, this may or may not be useful in the context of
|
||||
// the provider, but we can use it internally to identify the
|
||||
|
|
@ -73,8 +75,9 @@ type Instance struct {
|
|||
StatusMessages []StatusMessage `json:"status_messages,omitempty"`
|
||||
|
||||
// Do not serialize sensitive info.
|
||||
CallbackURL string `json:"-"`
|
||||
CreateAttempt int `json:"-"`
|
||||
CallbackURL string `json:"-"`
|
||||
CreateAttempt int `json:"-"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
type BootstrapInstance struct {
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ type UpdateInstanceParams struct {
|
|||
Status common.InstanceStatus `json:"status,omitempty"`
|
||||
RunnerStatus common.RunnerStatus `json:"runner_status,omitempty"`
|
||||
ProviderFault []byte `json:"provider_fault,omitempty"`
|
||||
AgentID int64 `json:"-"`
|
||||
CreateAttempt int `json:"-"`
|
||||
}
|
||||
|
||||
|
|
@ -186,4 +187,5 @@ type UpdateRepositoryParams struct {
|
|||
type InstanceUpdateMessage struct {
|
||||
Status common.RunnerStatus `json:"status"`
|
||||
Message string `json:"message"`
|
||||
AgentID *int64 `json:"agent_id"`
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,10 +26,11 @@ const (
|
|||
)
|
||||
|
||||
type PoolManager interface {
|
||||
ID() string
|
||||
WebhookSecret() string
|
||||
HandleWorkflowJob(job params.WorkflowJob) error
|
||||
RefreshState(param params.UpdatePoolStateParams) error
|
||||
ID() string
|
||||
ForceDeleteRunner(runner params.Instance) error
|
||||
// AddPool(ctx context.Context, pool params.Pool) error
|
||||
|
||||
// PoolManager lifecycle functions. Start/stop pool.
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import (
|
|||
"garm/runner/common"
|
||||
providerCommon "garm/runner/providers/common"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
|
@ -69,11 +70,6 @@ type basePool struct {
|
|||
// If we were offline and did not process the webhook, the instance will linger.
|
||||
// We need to remove it from the provider and database.
|
||||
func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) error {
|
||||
// runners, err := r.getGithubRunners()
|
||||
// if err != nil {
|
||||
// return errors.Wrap(err, "fetching github runners")
|
||||
// }
|
||||
|
||||
dbInstances, err := r.helper.FetchDbInstances()
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instances from db")
|
||||
|
|
@ -85,13 +81,25 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
|
|||
}
|
||||
|
||||
for _, instance := range dbInstances {
|
||||
if providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingCreate || providerCommon.InstanceStatus(instance.Status) == providerCommon.InstancePendingDelete {
|
||||
switch providerCommon.InstanceStatus(instance.Status) {
|
||||
case providerCommon.InstancePendingCreate,
|
||||
providerCommon.InstancePendingDelete:
|
||||
// this instance is in the process of being created or is awaiting deletion.
|
||||
// Instances in pending_Create did not get a chance to register themselves in,
|
||||
// Instances in pending_create did not get a chance to register themselves in,
|
||||
// github so we let them be for now.
|
||||
continue
|
||||
}
|
||||
|
||||
if ok := runnerNames[instance.Name]; !ok {
|
||||
// if instance.Status == providerCommon.InstanceRunning {
|
||||
// if time.Since(instance.UpdatedAt).Minutes() < 20 {
|
||||
// // Allow up to 20 minutes for instance to finish installing.
|
||||
// // Anything beyond that is considered a timeout and the instance
|
||||
// // is marked for deletion.
|
||||
// // TODO(gabriel-samfira): Make the timeout configurable.
|
||||
// continue
|
||||
// }
|
||||
// }
|
||||
// Set pending_delete on DB field. Allow consolidate() to remove it.
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
|
|
@ -102,6 +110,95 @@ func (r *basePool) cleanupOrphanedProviderRunners(runners []*github.Runner) erro
|
|||
return nil
|
||||
}
|
||||
|
||||
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
|
||||
// as offline and for which we no longer have a local instance.
|
||||
// This may happen if someone manually deletes the instance in the provider. We need to
|
||||
// first remove the instance from github, and then from our database.
|
||||
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
|
||||
for _, runner := range runners {
|
||||
status := runner.GetStatus()
|
||||
if status != "offline" {
|
||||
// Runner is online. Ignore it.
|
||||
continue
|
||||
}
|
||||
|
||||
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from DB")
|
||||
}
|
||||
// We no longer have a DB entry for this instance, and the runner appears offline in github.
|
||||
// Previous forceful removal may have failed?
|
||||
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
|
||||
resp, err := r.helper.RemoveGithubRunner(*runner.ID)
|
||||
if err != nil {
|
||||
// Removed in the meantime?
|
||||
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
||||
continue
|
||||
}
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
|
||||
// already marked for deleting, which means the github workflow finished.
|
||||
// Let consolidate take care of it.
|
||||
continue
|
||||
}
|
||||
|
||||
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
|
||||
// check if the provider still has the instance.
|
||||
provider, ok := r.providers[pool.ProviderName]
|
||||
if !ok {
|
||||
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
|
||||
}
|
||||
|
||||
// Check if the instance is still on the provider.
|
||||
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from provider")
|
||||
}
|
||||
// The runner instance is no longer on the provider, and it appears offline in github.
|
||||
// It should be safe to force remove it.
|
||||
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
|
||||
resp, err := r.helper.RemoveGithubRunner(*runner.ID)
|
||||
if err != nil {
|
||||
// Removed in the meantime?
|
||||
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
||||
log.Printf("runner dissapeared from github")
|
||||
} else {
|
||||
return errors.Wrap(err, "removing runner from github")
|
||||
}
|
||||
}
|
||||
// Remove the database entry for the runner.
|
||||
log.Printf("Removing %s from database", dbInstance.Name)
|
||||
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
|
||||
return errors.Wrap(err, "removing runner from database")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
|
||||
// instance is running, but github reports runner as offline. Log the event.
|
||||
// This scenario requires manual intervention.
|
||||
// Perhaps it just came online and github did not yet change it's status?
|
||||
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
|
||||
continue
|
||||
}
|
||||
//start the instance
|
||||
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
|
||||
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *basePool) fetchInstance(runnerName string) (params.Instance, error) {
|
||||
runner, err := r.store.GetInstanceByName(r.ctx, runnerName)
|
||||
if err != nil {
|
||||
|
|
@ -363,18 +460,18 @@ func (r *basePool) HandleWorkflowJob(job params.WorkflowJob) error {
|
|||
log.Printf("no runner was assigned. Skipping.")
|
||||
return nil
|
||||
}
|
||||
// update instance workload state.
|
||||
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
|
||||
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
log.Printf("marking instance %s as pending_delete", job.WorkflowJob.RunnerName)
|
||||
if err := r.setInstanceStatus(job.WorkflowJob.RunnerName, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
// update instance workload state. Set job_id in instance state.
|
||||
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerTerminated); err != nil {
|
||||
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
case "in_progress":
|
||||
// update instance workload state. Set job_id in instance state.
|
||||
// update instance workload state.
|
||||
if err := r.setInstanceRunnerStatus(job, providerCommon.RunnerActive); err != nil {
|
||||
log.Printf("failed to update runner %s status", job.WorkflowJob.RunnerName)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
|
|
@ -523,84 +620,6 @@ func (r *basePool) ensureMinIdleRunners() {
|
|||
wg.Wait()
|
||||
}
|
||||
|
||||
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
|
||||
// as offline and for which we no longer have a local instance.
|
||||
// This may happen if someone manually deletes the instance in the provider. We need to
|
||||
// first remove the instance from github, and then from our database.
|
||||
func (r *basePool) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
|
||||
for _, runner := range runners {
|
||||
status := runner.GetStatus()
|
||||
if status != "offline" {
|
||||
// Runner is online. Ignore it.
|
||||
continue
|
||||
}
|
||||
|
||||
dbInstance, err := r.store.GetInstanceByName(r.ctx, *runner.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from DB")
|
||||
}
|
||||
// We no longer have a DB entry for this instance, and the runner appears offline in github.
|
||||
// Previous forceful removal may have failed?
|
||||
log.Printf("Runner %s has no database entry in garm, removing from github", *runner.Name)
|
||||
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstancePendingDelete {
|
||||
// already marked for deleting, which means the github workflow finished.
|
||||
// Let consolidate take care of it.
|
||||
continue
|
||||
}
|
||||
|
||||
pool, err := r.helper.GetPoolByID(dbInstance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
|
||||
// check if the provider still has the instance.
|
||||
provider, ok := r.providers[pool.ProviderName]
|
||||
if !ok {
|
||||
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
|
||||
}
|
||||
|
||||
// Check if the instance is still on the provider.
|
||||
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
|
||||
if err != nil {
|
||||
if !errors.Is(err, runnerErrors.ErrNotFound) {
|
||||
return errors.Wrap(err, "fetching instance from provider")
|
||||
}
|
||||
// The runner instance is no longer on the provider, and it appears offline in github.
|
||||
// It should be safe to force remove it.
|
||||
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
|
||||
if err := r.helper.RemoveGithubRunner(*runner.ID); err != nil {
|
||||
return errors.Wrap(err, "removing runner from github")
|
||||
}
|
||||
// Remove the database entry for the runner.
|
||||
log.Printf("Removing %s from database", dbInstance.Name)
|
||||
if err := r.store.DeleteInstance(r.ctx, dbInstance.PoolID, dbInstance.Name); err != nil {
|
||||
return errors.Wrap(err, "removing runner from database")
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if providerCommon.InstanceStatus(dbInstance.Status) == providerCommon.InstanceRunning {
|
||||
// instance is running, but github reports runner as offline. Log the event.
|
||||
// This scenario requires manual intervention.
|
||||
// Perhaps it just came online and github did not yet change it's status?
|
||||
log.Printf("instance %s is online but github reports runner as offline", dbInstance.Name)
|
||||
continue
|
||||
}
|
||||
//start the instance
|
||||
if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil {
|
||||
return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *basePool) deleteInstanceFromProvider(instance params.Instance) error {
|
||||
pool, err := r.helper.GetPoolByID(instance.PoolID)
|
||||
if err != nil {
|
||||
|
|
@ -657,19 +676,51 @@ func (r *basePool) deletePendingInstances() {
|
|||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstanceDeleting, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
}
|
||||
go func(instance params.Instance) {
|
||||
if err := r.deleteInstanceFromProvider(instance); err != nil {
|
||||
// failed to remove from provider. Set the status back to pending_delete, which
|
||||
// will retry the operation.
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
go func(instance params.Instance) (err error) {
|
||||
defer func(instance params.Instance) {
|
||||
if err != nil {
|
||||
// failed to remove from provider. Set the status back to pending_delete, which
|
||||
// will retry the operation.
|
||||
if err := r.setInstanceStatus(instance.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", instance.Name)
|
||||
}
|
||||
}
|
||||
}(instance)
|
||||
|
||||
err = r.deleteInstanceFromProvider(instance)
|
||||
if err != nil {
|
||||
log.Printf("failed to delete instance from provider: %+v", err)
|
||||
}
|
||||
return
|
||||
}(instance)
|
||||
}
|
||||
}
|
||||
|
||||
func (r *basePool) ForceDeleteRunner(runner params.Instance) error {
|
||||
if runner.AgentID != 0 {
|
||||
resp, err := r.helper.RemoveGithubRunner(runner.AgentID)
|
||||
if err != nil {
|
||||
if resp != nil {
|
||||
switch resp.StatusCode {
|
||||
case http.StatusUnprocessableEntity:
|
||||
return errors.Wrapf(runnerErrors.ErrUnprocessable, "removing runner: %q", err)
|
||||
case http.StatusNotFound:
|
||||
return errors.Wrapf(runnerErrors.ErrNotFound, "removing runner: %q", err)
|
||||
default:
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
}
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
}
|
||||
|
||||
if err := r.setInstanceStatus(runner.Name, providerCommon.InstancePendingDelete, nil); err != nil {
|
||||
log.Printf("failed to update runner %s status", runner.Name)
|
||||
return errors.Wrap(err, "updating runner")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *basePool) addPendingInstances() {
|
||||
// TODO: filter instances by status.
|
||||
instances, err := r.helper.FetchDbInstances()
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ type poolHelper interface {
|
|||
GetGithubRunners() ([]*github.Runner, error)
|
||||
FetchTools() ([]*github.RunnerApplicationDownload, error)
|
||||
FetchDbInstances() ([]params.Instance, error)
|
||||
RemoveGithubRunner(runnerID int64) error
|
||||
RemoveGithubRunner(runnerID int64) (*github.Response, error)
|
||||
ListPools() ([]params.Pool, error)
|
||||
GithubURL() string
|
||||
JwtToken() string
|
||||
|
|
|
|||
|
|
@ -112,9 +112,8 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) {
|
|||
return r.store.ListOrgInstances(r.ctx, r.id)
|
||||
}
|
||||
|
||||
func (r *organization) RemoveGithubRunner(runnerID int64) error {
|
||||
_, err := r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
|
||||
return errors.Wrap(err, "removing runner")
|
||||
func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
|
||||
return r.ghcli.Actions.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
|
||||
}
|
||||
|
||||
func (r *organization) ListPools() ([]params.Pool, error) {
|
||||
|
|
|
|||
|
|
@ -114,9 +114,8 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) {
|
|||
return r.store.ListRepoInstances(r.ctx, r.id)
|
||||
}
|
||||
|
||||
func (r *repository) RemoveGithubRunner(runnerID int64) error {
|
||||
_, err := r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
|
||||
return errors.Wrap(err, "removing runner")
|
||||
func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
|
||||
return r.ghcli.Actions.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
|
||||
}
|
||||
|
||||
func (r *repository) ListPools() ([]params.Pool, error) {
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ import (
|
|||
"garm/params"
|
||||
"garm/runner/common"
|
||||
"garm/runner/providers"
|
||||
providerCommon "garm/runner/providers/common"
|
||||
"garm/util"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
|
|
@ -538,9 +539,60 @@ func (r *Runner) AddInstanceStatusMessage(ctx context.Context, param params.Inst
|
|||
RunnerStatus: param.Status,
|
||||
}
|
||||
|
||||
if param.AgentID != nil {
|
||||
updateParams.AgentID = *param.AgentID
|
||||
}
|
||||
|
||||
if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil {
|
||||
return errors.Wrap(err, "updating runner state")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Runner) ForceDeleteRunner(ctx context.Context, instanceName string) error {
|
||||
if !auth.IsAdmin(ctx) {
|
||||
return runnerErrors.ErrUnauthorized
|
||||
}
|
||||
|
||||
instance, err := r.store.GetInstanceByName(ctx, instanceName)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching instance")
|
||||
}
|
||||
|
||||
if instance.Status != providerCommon.InstanceRunning {
|
||||
return runnerErrors.NewBadRequestError("runner must be in %q state", providerCommon.InstanceRunning)
|
||||
}
|
||||
|
||||
pool, err := r.store.GetPoolByID(ctx, instance.PoolID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching pool")
|
||||
}
|
||||
|
||||
var poolMgr common.PoolManager
|
||||
|
||||
if pool.RepoID != "" {
|
||||
repo, err := r.store.GetRepositoryByID(ctx, pool.RepoID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching repo")
|
||||
}
|
||||
poolMgr, err = r.findRepoPoolManager(repo.Owner, repo.Name)
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "fetching pool manager for repo %s", pool.RepoName)
|
||||
}
|
||||
} else if pool.OrgID != "" {
|
||||
org, err := r.store.GetOrganizationByID(ctx, pool.OrgID)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "fetching org")
|
||||
}
|
||||
poolMgr, err = r.findOrgPoolManager(org.Name)
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "fetching pool manager for org %s", pool.OrgName)
|
||||
}
|
||||
}
|
||||
|
||||
if err := poolMgr.ForceDeleteRunner(instance); err != nil {
|
||||
return errors.Wrap(err, "removing runner")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue