diff --git a/README.md b/README.md index 0430ecbf..9b7f571e 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,34 @@ Check out the [quickstart](/doc/quickstart.md) document for instructions on how Thanks to the efforts of the amazing folks at @mercedes-benz, GARM can now be integrated into k8s via their operator. Check out the [GARM operator](https://github.com/mercedes-benz/garm-operator/) for more details. +## Configuration + +The ```GARM``` configuration is a simple ```toml```. The sample config file in [the testdata folder](/testdata/config.toml) is fairly well commented and should be enough to get you started. The configuration file is split into several sections, each of which is documented in its own page. The sections are: + +* [The default section](/doc/config_default.md) +* [Logging](/doc/config_logging.md) +* [Database](/doc/database.md) +* [Github credentials](/doc/github_credentials.md) +* [Providers](/doc/providers.md) +* [Metrics](/doc/config_metrics.md) +* [JWT authentication](/doc/config_jwt_auth.md) +* [API server](/doc/config_api_server.md) + +## Using GARM + +GARM is designed with simplicity in mind. At least we try to keep it as simple as possible. We're aware that adding a new tool in your workflow can be painful, especially when you already have to deal with so many. The cognitive load for OPS has reached a level where it feels overwhelming at times to even wrap your head around a new tool. As such, we believe that tools should be simple, should take no more than a few hours to understand and set up and if you absolutely need to interact with the tool, it should be as intuitive as possible. + +We've written a short introduction into some of the commands that GARM has and some of the concepts involved in setting up GARM, managing runners and how GitHub does some of the things it does. + +[You can find it here](/doc/using_garm.md). 
+ +Please, feel free to [open an issue](https://github.com/cloudbase/garm/issues/new) if you find the documentation lacking and would like more info. Sometimes we forget the challenges that new users face as we're so close to the code and how it works. Any feedback is welcome and we're always looking to improve the documentation. + ## Supported providers GARM uses providers to create runners in a particular IaaS. The providers are external executables that GARM calls into to create runners. Before you can create runners, you'll need to install at least one provider. -## Installing external providers +### Installing external providers External providers are binaries that GARM calls into to create runners in a particular IaaS. There are several external providers available: @@ -41,21 +64,12 @@ External providers are binaries that GARM calls into to create runners in a part * [Kubernetes](https://github.com/mercedes-benz/garm-provider-k8s) - Thanks to the amazing folks at @mercedes-benz for sharing their awesome provider! * [LXD](https://github.com/cloudbase/garm-provider-lxd) * [Incus](https://github.com/cloudbase/garm-provider-incus) +* [Equinix Metal](https://github.com/cloudbase/garm-provider-equinix) +* [Amazon EC2](https://github.com/cloudbase/garm-provider-aws) +* [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) Follow the instructions in the README of each provider to install them. -## Configuration - -The ```GARM``` configuration is a simple ```toml```. The sample config file in [the testdata folder](/testdata/config.toml) is fairly well commented and should be enough to get you started. The configuration file is split into several sections, each of which is documented in its own page. 
The sections are: - -* [The default section](/doc/config_default.md) -* [Database](/doc/database.md) -* [Github credentials](/doc/github_credentials.md) -* [Providers](/doc/providers.md) -* [Metrics](/doc/config_metrics.md) -* [JWT authentication](/doc/config_jwt_auth.md) -* [API server](/doc/config_api_server.md) - ## Optimizing your runners If you would like to optimize the startup time of new instance, take a look at the [performance considerations](/doc/performance_considerations.md) page. diff --git a/apiserver/controllers/controllers.go b/apiserver/controllers/controllers.go index 8e42f320..d7dd5f44 100644 --- a/apiserver/controllers/controllers.go +++ b/apiserver/controllers/controllers.go @@ -15,9 +15,10 @@ package controllers import ( + "context" "encoding/json" "io" - "log" + "log/slog" "net/http" "strings" @@ -60,7 +61,7 @@ type APIController struct { controllerID string } -func handleError(w http.ResponseWriter, err error) { +func handleError(ctx context.Context, w http.ResponseWriter, err error) { w.Header().Set("Content-Type", "application/json") origErr := errors.Cause(err) apiErr := params.APIErrorResponse{ @@ -90,14 +91,14 @@ func handleError(w http.ResponseWriter, err error) { } if err := json.NewEncoder(w).Encode(apiErr); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } -func (a *APIController) webhookMetricLabelValues(valid, reason string) []string { +func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string { controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext()) if err != nil { - log.Printf("failed to get controller info: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info") // If labels are empty, not attempt will be made to record webhook. 
return []string{} } @@ -107,11 +108,11 @@ func (a *APIController) webhookMetricLabelValues(valid, reason string) []string } } -func (a *APIController) handleWorkflowJobEvent(w http.ResponseWriter, r *http.Request) { +func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) { defer r.Body.Close() body, err := io.ReadAll(r.Body) if err != nil { - handleError(w, gErrors.NewBadRequestError("invalid post body: %s", err)) + handleError(ctx, w, gErrors.NewBadRequestError("invalid post body: %s", err)) return } @@ -124,28 +125,30 @@ func (a *APIController) handleWorkflowJobEvent(w http.ResponseWriter, r *http.Re return } if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil { - log.Printf("failed to record metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric") } }() if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil { if errors.Is(err, gErrors.ErrNotFound) { - labelValues = a.webhookMetricLabelValues("false", "owner_unknown") - log.Printf("got not found error from DispatchWorkflowJob. webhook not meant for us?: %q", err) + labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown") + slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. 
webhook not meant for us?") return } else if strings.Contains(err.Error(), "signature") { // TODO: check error type - labelValues = a.webhookMetricLabelValues("false", "signature_invalid") + labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid") } else { - labelValues = a.webhookMetricLabelValues("false", "unknown") + labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown") } - handleError(w, err) + handleError(ctx, w, err) return } - labelValues = a.webhookMetricLabelValues("true", "") + labelValues = a.webhookMetricLabelValues(ctx, "true", "") } func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + vars := mux.Vars(r) controllerID, ok := vars["controllerID"] // If the webhook URL includes a controller ID, we validate that it's meant for us. We still @@ -154,7 +157,7 @@ func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) { // via garm. We cannot tag a webhook URL on github, so there is no way to determine ownership. // Using a controllerID suffix is a simple way to denote ownership. 
if ok && controllerID != a.controllerID { - log.Printf("ignoring webhook meant for controller %s", util.SanitizeLogEntry(controllerID)) + slog.InfoContext(ctx, "ignoring webhook meant for foreign controller", "req_controller_id", controllerID) return } @@ -163,10 +166,9 @@ func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) { event := runnerParams.Event(headers.Get("X-Github-Event")) switch event { case runnerParams.WorkflowJobEvent: - a.handleWorkflowJobEvent(w, r) + a.handleWorkflowJobEvent(ctx, w, r) default: - log.Printf("ignoring unknown event %s", util.SanitizeLogEntry(string(event))) - return + slog.InfoContext(ctx, "ignoring unknown event", "gh_event", util.SanitizeLogEntry(string(event))) } } @@ -175,19 +177,19 @@ func (a *APIController) WSHandler(writer http.ResponseWriter, req *http.Request) if !auth.IsAdmin(ctx) { writer.WriteHeader(http.StatusForbidden) if _, err := writer.Write([]byte("you need admin level access to view logs")); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if a.hub == nil { - handleError(writer, gErrors.NewBadRequestError("log streamer is disabled")) + handleError(ctx, writer, gErrors.NewBadRequestError("log streamer is disabled")) return } conn, err := a.upgrader.Upgrade(writer, req, nil) if err != nil { - log.Printf("error upgrading to websockets: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error upgrading to websockets") return } @@ -198,11 +200,11 @@ func (a *APIController) WSHandler(writer http.ResponseWriter, req *http.Request) // the client once the token expires. 
client, err := wsWriter.NewClient(conn, a.hub) if err != nil { - log.Printf("failed to create new client: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to create new client") return } if err := a.hub.Register(client); err != nil { - log.Printf("failed to register new client: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to register new client") return } client.Go() @@ -210,6 +212,7 @@ func (a *APIController) WSHandler(writer http.ResponseWriter, req *http.Request) // NotFoundHandler is returned when an invalid URL is acccessed func (a *APIController) NotFoundHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() apiErr := params.APIErrorResponse{ Details: "Resource not found", Error: "Not found", } w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusNotFound) if err := json.NewEncoder(w).Encode(apiErr); err != nil { - log.Printf("failet to write response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to write response") } } @@ -233,19 +236,19 @@ func (a *APIController) MetricsTokenHandler(w http.ResponseWriter, r *http.Reque ctx := r.Context() if !auth.IsAdmin(ctx) { - handleError(w, gErrors.ErrUnauthorized) + handleError(ctx, w, gErrors.ErrUnauthorized) return } token, err := a.auth.GetJWTMetricsToken(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") err = json.NewEncoder(w).Encode(runnerParams.JWTResponse{Token: token}) if err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -266,32 +269,32 @@ func (a *APIController) MetricsTokenHandler(w http.ResponseWriter, r *http.Reque // // LoginHandler returns a jwt token func (a *APIController) LoginHandler(w http.ResponseWriter, r 
*http.Request) { + ctx := r.Context() var loginInfo runnerParams.PasswordLoginParams if err := json.NewDecoder(r.Body).Decode(&loginInfo); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } if err := loginInfo.Validate(); err != nil { - handleError(w, err) + handleError(ctx, w, err) return } - ctx := r.Context() ctx, err := a.auth.AuthenticateUser(ctx, loginInfo) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } tokenString, err := a.auth.GetJWTToken(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(runnerParams.JWTResponse{Token: tokenString}); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -310,28 +313,27 @@ func (a *APIController) LoginHandler(w http.ResponseWriter, r *http.Request) { // 200: User // 400: APIErrorResponse func (a *APIController) FirstRunHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() if a.auth.IsInitialized() { err := gErrors.NewConflictError("already initialized") - handleError(w, err) + handleError(ctx, w, err) return } - ctx := r.Context() - var newUserParams runnerParams.NewUserParams if err := json.NewDecoder(r.Body).Decode(&newUserParams); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } newUser, err := a.auth.InitController(ctx, newUserParams) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(newUser); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -346,13 +348,13 @@ func (a *APIController) ListCredentials(w http.ResponseWriter, r *http.Request) 
ctx := r.Context() creds, err := a.r.ListCredentials(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(creds); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -367,13 +369,13 @@ func (a *APIController) ListProviders(w http.ResponseWriter, r *http.Request) { ctx := r.Context() providers, err := a.r.ListProviders(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(providers); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -388,13 +390,13 @@ func (a *APIController) ListAllJobs(w http.ResponseWriter, r *http.Request) { ctx := r.Context() jobs, err := a.r.ListAllJobs(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(jobs); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -409,12 +411,12 @@ func (a *APIController) ControllerInfoHandler(w http.ResponseWriter, r *http.Req ctx := r.Context() info, err := a.r.GetControllerInfo(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(info); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git a/apiserver/controllers/enterprises.go b/apiserver/controllers/enterprises.go index 6a015df2..39abb223 100644 --- a/apiserver/controllers/enterprises.go +++ 
b/apiserver/controllers/enterprises.go @@ -16,7 +16,7 @@ package controllers import ( "encoding/json" - "log" + "log/slog" "net/http" gErrors "github.com/cloudbase/garm-provider-common/errors" @@ -45,20 +45,20 @@ func (a *APIController) CreateEnterpriseHandler(w http.ResponseWriter, r *http.R var enterpriseData runnerParams.CreateEnterpriseParams if err := json.NewDecoder(r.Body).Decode(&enterpriseData); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } enterprise, err := a.r.CreateEnterprise(ctx, enterpriseData) if err != nil { - log.Printf("error creating enterprise: %+v", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating enterprise") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(enterprise); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -74,14 +74,14 @@ func (a *APIController) ListEnterprisesHandler(w http.ResponseWriter, r *http.Re enterprise, err := a.r.ListEnterprises(ctx) if err != nil { - log.Printf("listing enterprise: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing enterprise") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(enterprise); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -110,21 +110,21 @@ func (a *APIController) GetEnterpriseByIDHandler(w http.ResponseWriter, r *http. 
Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } enterprise, err := a.r.GetEnterpriseByID(ctx, enterpriseID) if err != nil { - log.Printf("fetching enterprise: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching enterprise") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(enterprise); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -152,14 +152,14 @@ func (a *APIController) DeleteEnterpriseHandler(w http.ResponseWriter, r *http.R Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.DeleteEnterprise(ctx, enterpriseID); err != nil { - log.Printf("removing enterprise: %+v", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing enterprise") + handleError(ctx, w, err) return } @@ -198,27 +198,27 @@ func (a *APIController) UpdateEnterpriseHandler(w http.ResponseWriter, r *http.R Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var updatePayload runnerParams.UpdateEntityParams if err := json.NewDecoder(r.Body).Decode(&updatePayload); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } enterprise, err := a.r.UpdateEnterprise(ctx, enterpriseID, updatePayload) if err != nil { - log.Printf("error updating enterprise: %s", err) - handleError(w, 
err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error updating enterprise") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(enterprise); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -253,28 +253,28 @@ func (a *APIController) CreateEnterprisePoolHandler(w http.ResponseWriter, r *ht Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.CreatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.CreateEnterprisePool(ctx, enterpriseID, poolData) if err != nil { - log.Printf("error creating enterprise pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating enterprise pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -302,21 +302,21 @@ func (a *APIController) ListEnterprisePoolsHandler(w http.ResponseWriter, r *htt Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pools, err := a.r.ListEnterprisePools(ctx, enterpriseID) if err != nil { - log.Printf("listing pools: %s", err) - 
handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pools); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -352,21 +352,21 @@ func (a *APIController) GetEnterprisePoolHandler(w http.ResponseWriter, r *http. Error: "Bad Request", Details: "No enterprise or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pool, err := a.r.GetEnterprisePoolByID(ctx, enterpriseID, poolID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -401,14 +401,14 @@ func (a *APIController) DeleteEnterprisePoolHandler(w http.ResponseWriter, r *ht Error: "Bad Request", Details: "No enterprise or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.DeleteEnterprisePool(ctx, enterpriseID, poolID); err != nil { - log.Printf("removing pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing pool") + handleError(ctx, w, err) return } @@ -455,27 +455,27 @@ func (a *APIController) UpdateEnterprisePoolHandler(w http.ResponseWriter, r *ht Error: "Bad Request", Details: "No enterprise or pool ID specified", }); err != nil { - log.Printf("failed to encode 
response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.UpdatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.UpdateEnterprisePool(ctx, enterpriseID, poolID, poolData) if err != nil { - log.Printf("error creating enterprise pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error updating enterprise pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git a/apiserver/controllers/instances.go b/apiserver/controllers/instances.go index dd1ad284..6cf43dd9 100644 --- a/apiserver/controllers/instances.go +++ b/apiserver/controllers/instances.go @@ -16,7 +16,7 @@ package controllers import ( "encoding/json" - "log" + "log/slog" "net/http" "strconv" @@ -51,21 +51,21 @@ func (a *APIController) ListPoolInstancesHandler(w http.ResponseWriter, r *http. 
Error: "Bad Request", Details: "No pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } instances, err := a.r.ListPoolInstances(ctx, poolID) if err != nil { - log.Printf("listing pool instances: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pool instances") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instances); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -93,21 +93,21 @@ func (a *APIController) GetInstanceHandler(w http.ResponseWriter, r *http.Reques Error: "Bad Request", Details: "No runner name specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } instance, err := a.r.GetInstance(ctx, instanceName) if err != nil { - log.Printf("listing instances: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing instances") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instance); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -140,15 +140,15 @@ func (a *APIController) DeleteInstanceHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No instance name specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } forceRemove, _ := strconv.ParseBool(r.URL.Query().Get("forceRemove")) if err := a.r.DeleteRunner(ctx, instanceName, forceRemove); err != 
nil { - log.Printf("removing runner: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing runner") + handleError(ctx, w, err) return } @@ -180,21 +180,21 @@ func (a *APIController) ListRepoInstancesHandler(w http.ResponseWriter, r *http. Error: "Bad Request", Details: "No repo ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } instances, err := a.r.ListRepoInstances(ctx, repoID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instances); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -222,21 +222,21 @@ func (a *APIController) ListOrgInstancesHandler(w http.ResponseWriter, r *http.R Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } instances, err := a.r.ListOrgInstances(ctx, orgID) if err != nil { - log.Printf("listing instances: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing instances") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instances); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -264,21 +264,21 @@ func (a *APIController) ListEnterpriseInstancesHandler(w http.ResponseWriter, r Error: "Bad Request", Details: "No enterprise ID specified", }); err != nil { - log.Printf("failed 
to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } instances, err := a.r.ListEnterpriseInstances(ctx, enterpriseID) if err != nil { - log.Printf("listing instances: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing instances") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instances); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -294,14 +294,14 @@ func (a *APIController) ListAllInstancesHandler(w http.ResponseWriter, r *http.R instances, err := a.r.ListAllInstances(ctx) if err != nil { - log.Printf("listing instances: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing instances") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(instances); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -310,14 +310,34 @@ func (a *APIController) InstanceStatusMessageHandler(w http.ResponseWriter, r *h var updateMessage runnerParams.InstanceUpdateMessage if err := json.NewDecoder(r.Body).Decode(&updateMessage); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } if err := a.r.AddInstanceStatusMessage(ctx, updateMessage); err != nil { - log.Printf("error saving status message: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error saving status message") + handleError(ctx, w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + 
w.WriteHeader(http.StatusOK) +} + +func (a *APIController) InstanceSystemInfoHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + var updateMessage runnerParams.UpdateSystemInfoParams + if err := json.NewDecoder(r.Body).Decode(&updateMessage); err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) + return + } + + if err := a.r.UpdateSystemInfo(ctx, updateMessage); err != nil { + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error updating system info") + handleError(ctx, w, err) return } diff --git a/apiserver/controllers/metadata.go b/apiserver/controllers/metadata.go index 664d46a6..1b7e215d 100644 --- a/apiserver/controllers/metadata.go +++ b/apiserver/controllers/metadata.go @@ -17,7 +17,7 @@ package controllers import ( "encoding/json" "fmt" - "log" + "log/slog" "net/http" "github.com/cloudbase/garm/apiserver/params" @@ -29,14 +29,14 @@ func (a *APIController) InstanceGithubRegistrationTokenHandler(w http.ResponseWr token, err := a.r.GetInstanceGithubRegistrationToken(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) if _, err := w.Write([]byte(token)); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -50,7 +50,7 @@ func (a *APIController) JITCredentialsFileHandler(w http.ResponseWriter, r *http Error: "Not Found", Details: "Not Found", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } @@ -59,8 +59,8 @@ func (a *APIController) JITCredentialsFileHandler(w http.ResponseWriter, r *http data, err := a.r.GetJITConfigFile(ctx, dotFileName) if err != nil { - log.Printf("getting JIT config file: %s", err) - handleError(w, err) + 
slog.With(slog.Any("error", err)).ErrorContext(ctx, "getting JIT config file") + handleError(ctx, w, err) return } @@ -71,7 +71,7 @@ func (a *APIController) JITCredentialsFileHandler(w http.ResponseWriter, r *http w.Header().Set("Content-Length", fmt.Sprintf("%d", len(data))) w.WriteHeader(http.StatusOK) if _, err := w.Write(data); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -80,14 +80,14 @@ func (a *APIController) SystemdServiceNameHandler(w http.ResponseWriter, r *http serviceName, err := a.r.GetRunnerServiceName(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "text/plain") w.WriteHeader(http.StatusOK) if _, err := w.Write([]byte(serviceName)); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -97,14 +97,14 @@ func (a *APIController) SystemdUnitFileHandler(w http.ResponseWriter, r *http.Re data, err := a.r.GenerateSystemdUnitFile(ctx, runAsUser) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "text/plain") w.WriteHeader(http.StatusOK) if _, err := w.Write(data); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -113,12 +113,12 @@ func (a *APIController) RootCertificateBundleHandler(w http.ResponseWriter, r *h bundle, err := a.r.GetRootCertificateBundle(ctx) if err != nil { - handleError(w, err) + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(bundle); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git a/apiserver/controllers/organizations.go 
b/apiserver/controllers/organizations.go index 02f93fc4..8495d351 100644 --- a/apiserver/controllers/organizations.go +++ b/apiserver/controllers/organizations.go @@ -16,7 +16,7 @@ package controllers import ( "encoding/json" - "log" + "log/slog" "net/http" "strconv" @@ -46,20 +46,20 @@ func (a *APIController) CreateOrgHandler(w http.ResponseWriter, r *http.Request) var orgData runnerParams.CreateOrgParams if err := json.NewDecoder(r.Body).Decode(&orgData); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } org, err := a.r.CreateOrganization(ctx, orgData) if err != nil { - log.Printf("error creating organization: %+v", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating organization") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(org); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -75,14 +75,14 @@ func (a *APIController) ListOrgsHandler(w http.ResponseWriter, r *http.Request) orgs, err := a.r.ListOrganizations(ctx) if err != nil { - log.Printf("listing orgs: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing orgs") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(orgs); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -111,21 +111,21 @@ func (a *APIController) GetOrgByIDHandler(w http.ResponseWriter, r *http.Request Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } org, err := 
a.r.GetOrganizationByID(ctx, orgID) if err != nil { - log.Printf("fetching org: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching org") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(org); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -159,7 +159,7 @@ func (a *APIController) DeleteOrgHandler(w http.ResponseWriter, r *http.Request) Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } @@ -167,8 +167,8 @@ func (a *APIController) DeleteOrgHandler(w http.ResponseWriter, r *http.Request) keepWebhook, _ := strconv.ParseBool(r.URL.Query().Get("keepWebhook")) if err := a.r.DeleteOrganization(ctx, orgID, keepWebhook); err != nil { - log.Printf("removing org: %+v", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing org") + handleError(ctx, w, err) return } @@ -208,27 +208,27 @@ func (a *APIController) UpdateOrgHandler(w http.ResponseWriter, r *http.Request) Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var updatePayload runnerParams.UpdateEntityParams if err := json.NewDecoder(r.Body).Decode(&updatePayload); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } org, err := a.r.UpdateOrganization(ctx, orgID, updatePayload) if err != nil { - log.Printf("error updating organization: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error updating organization") + handleError(ctx, w, err) return } 
w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(org); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -263,28 +263,28 @@ func (a *APIController) CreateOrgPoolHandler(w http.ResponseWriter, r *http.Requ Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.CreatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.CreateOrgPool(ctx, orgID, poolData) if err != nil { - log.Printf("error creating organization pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating organization pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -312,21 +312,21 @@ func (a *APIController) ListOrgPoolsHandler(w http.ResponseWriter, r *http.Reque Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pools, err := a.r.ListOrgPools(ctx, orgID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") 
if err := json.NewEncoder(w).Encode(pools); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -361,21 +361,21 @@ func (a *APIController) GetOrgPoolHandler(w http.ResponseWriter, r *http.Request Error: "Bad Request", Details: "No org or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pool, err := a.r.GetOrgPoolByID(ctx, orgID, poolID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -410,14 +410,14 @@ func (a *APIController) DeleteOrgPoolHandler(w http.ResponseWriter, r *http.Requ Error: "Bad Request", Details: "No org or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.DeleteOrgPool(ctx, orgID, poolID); err != nil { - log.Printf("removing pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing pool") + handleError(ctx, w, err) return } @@ -464,28 +464,28 @@ func (a *APIController) UpdateOrgPoolHandler(w http.ResponseWriter, r *http.Requ Error: "Bad Request", Details: "No org or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.UpdatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - 
log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.UpdateOrgPool(ctx, orgID, poolID, poolData) if err != nil { - log.Printf("error creating organization pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating organization pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -521,28 +521,28 @@ func (a *APIController) InstallOrgWebhookHandler(w http.ResponseWriter, r *http. Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var hookParam runnerParams.InstallWebhookParams if err := json.NewDecoder(r.Body).Decode(&hookParam); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } info, err := a.r.InstallOrgWebhook(ctx, orgID, hookParam) if err != nil { - log.Printf("installing webhook: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "installing webhook") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(info); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -570,14 +570,14 @@ func (a *APIController) UninstallOrgWebhookHandler(w http.ResponseWriter, r *htt Error: "Bad 
Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.UninstallOrgWebhook(ctx, orgID); err != nil { - log.Printf("removing webhook: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing webhook") + handleError(ctx, w, err) return } @@ -610,20 +610,20 @@ func (a *APIController) GetOrgWebhookInfoHandler(w http.ResponseWriter, r *http. Error: "Bad Request", Details: "No org ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } info, err := a.r.GetOrgWebhookInfo(ctx, orgID) if err != nil { - log.Printf("getting webhook info: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "getting webhook info") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(info); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git a/apiserver/controllers/pools.go b/apiserver/controllers/pools.go index 34403759..3f7a81b1 100644 --- a/apiserver/controllers/pools.go +++ b/apiserver/controllers/pools.go @@ -16,7 +16,7 @@ package controllers import ( "encoding/json" - "log" + "log/slog" "net/http" gErrors "github.com/cloudbase/garm-provider-common/errors" @@ -39,14 +39,14 @@ func (a *APIController) ListAllPoolsHandler(w http.ResponseWriter, r *http.Reque pools, err := a.r.ListAllPools(ctx) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pools); 
err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -75,15 +75,15 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques Error: "Bad Request", Details: "No pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pool, err := a.r.GetPoolByID(ctx, poolID) if err != nil { - log.Printf("fetching pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching pool") + handleError(ctx, w, err) return } @@ -91,7 +91,7 @@ func (a *APIController) GetPoolByIDHandler(w http.ResponseWriter, r *http.Reques w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -119,14 +119,14 @@ func (a *APIController) DeletePoolByIDHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.DeletePoolByID(ctx, poolID); err != nil { - log.Printf("removing pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing pool") + handleError(ctx, w, err) return } @@ -165,27 +165,27 @@ func (a *APIController) UpdatePoolByIDHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.UpdatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); 
err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.UpdatePoolByID(ctx, poolID, poolData) if err != nil { - log.Printf("fetching pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git a/apiserver/controllers/repositories.go b/apiserver/controllers/repositories.go index ae2a3040..6831dda4 100644 --- a/apiserver/controllers/repositories.go +++ b/apiserver/controllers/repositories.go @@ -16,7 +16,7 @@ package controllers import ( "encoding/json" - "log" + "log/slog" "net/http" "strconv" @@ -46,20 +46,20 @@ func (a *APIController) CreateRepoHandler(w http.ResponseWriter, r *http.Request var repoData runnerParams.CreateRepoParams if err := json.NewDecoder(r.Body).Decode(&repoData); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } repo, err := a.r.CreateRepository(ctx, repoData) if err != nil { - log.Printf("error creating repository: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating repository") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(repo); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -75,14 +75,14 @@ func (a *APIController) ListReposHandler(w http.ResponseWriter, r *http.Request) repos, err := a.r.ListRepositories(ctx) if err != nil { - 
log.Printf("listing repos: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing repositories") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(repos); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -111,21 +111,21 @@ func (a *APIController) GetRepoByIDHandler(w http.ResponseWriter, r *http.Reques Error: "Bad Request", Details: "No repo ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } repo, err := a.r.GetRepositoryByID(ctx, repoID) if err != nil { - log.Printf("fetching repo: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching repository") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(repo); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -159,15 +159,15 @@ func (a *APIController) DeleteRepoHandler(w http.ResponseWriter, r *http.Request Error: "Bad Request", Details: "No repo ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } keepWebhook, _ := strconv.ParseBool(r.URL.Query().Get("keepWebhook")) if err := a.r.DeleteRepository(ctx, repoID, keepWebhook); err != nil { - log.Printf("fetching repo: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "fetching repository") + handleError(ctx, w, err) return } @@ -207,27 +207,27 @@ func (a *APIController) UpdateRepoHandler(w http.ResponseWriter, r *http.Request Error: "Bad Request", Details: "No repo 
ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var updatePayload runnerParams.UpdateEntityParams if err := json.NewDecoder(r.Body).Decode(&updatePayload); err != nil { - handleError(w, gErrors.ErrBadRequest) + handleError(ctx, w, gErrors.ErrBadRequest) return } repo, err := a.r.UpdateRepository(ctx, repoID, updatePayload) if err != nil { - log.Printf("error updating repository: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error updating repository") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(repo); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -262,28 +262,28 @@ func (a *APIController) CreateRepoPoolHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No repo ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.CreatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.CreateRepoPool(ctx, repoID, poolData) if err != nil { - log.Printf("error creating repository pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating repository pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + 
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -311,21 +311,21 @@ func (a *APIController) ListRepoPoolsHandler(w http.ResponseWriter, r *http.Requ Error: "Bad Request", Details: "No repo ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pools, err := a.r.ListRepoPools(ctx, repoID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pools); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -360,21 +360,21 @@ func (a *APIController) GetRepoPoolHandler(w http.ResponseWriter, r *http.Reques Error: "Bad Request", Details: "No repo or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } pool, err := a.r.GetRepoPoolByID(ctx, repoID, poolID) if err != nil { - log.Printf("listing pools: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -409,14 +409,14 @@ func (a *APIController) DeleteRepoPoolHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No repo or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode 
response") } return } if err := a.r.DeleteRepoPool(ctx, repoID, poolID); err != nil { - log.Printf("removing pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing pool") + handleError(ctx, w, err) return } @@ -463,28 +463,28 @@ func (a *APIController) UpdateRepoPoolHandler(w http.ResponseWriter, r *http.Req Error: "Bad Request", Details: "No repo or pool ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var poolData runnerParams.UpdatePoolParams if err := json.NewDecoder(r.Body).Decode(&poolData); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } pool, err := a.r.UpdateRepoPool(ctx, repoID, poolID, poolData) if err != nil { - log.Printf("error creating repository pool: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "error creating repository pool") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(pool); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -520,28 +520,28 @@ func (a *APIController) InstallRepoWebhookHandler(w http.ResponseWriter, r *http Error: "Bad Request", Details: "No repository ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } var hookParam runnerParams.InstallWebhookParams if err := json.NewDecoder(r.Body).Decode(&hookParam); err != nil { - log.Printf("failed to decode: %s", err) - handleError(w, gErrors.ErrBadRequest) + slog.With(slog.Any("error", 
err)).ErrorContext(ctx, "failed to decode") + handleError(ctx, w, gErrors.ErrBadRequest) return } info, err := a.r.InstallRepoWebhook(ctx, repoID, hookParam) if err != nil { - log.Printf("installing webhook: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "installing webhook") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(info); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -569,14 +569,14 @@ func (a *APIController) UninstallRepoWebhookHandler(w http.ResponseWriter, r *ht Error: "Bad Request", Details: "No repository ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } if err := a.r.UninstallRepoWebhook(ctx, repoID); err != nil { - log.Printf("removing webhook: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "removing webhook") + handleError(ctx, w, err) return } @@ -609,20 +609,20 @@ func (a *APIController) GetRepoWebhookInfoHandler(w http.ResponseWriter, r *http Error: "Bad Request", Details: "No repository ID specified", }); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } info, err := a.r.GetRepoWebhookInfo(ctx, repoID) if err != nil { - log.Printf("getting webhook info: %s", err) - handleError(w, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "getting webhook info") + handleError(ctx, w, err) return } w.Header().Set("Content-Type", "application/json") if err := json.NewEncoder(w).Encode(info); err != nil { - log.Printf("failed to encode response: %q", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } diff --git 
a/apiserver/routers/routers.go b/apiserver/routers/routers.go index 6644a4ba..5e6de6cd 100644 --- a/apiserver/routers/routers.go +++ b/apiserver/routers/routers.go @@ -47,14 +47,14 @@ package routers import ( _ "expvar" // Register the expvar handlers - "io" + "log/slog" "net/http" _ "net/http/pprof" // Register the pprof handlers + "github.com/felixge/httpsnoop" "github.com/gorilla/mux" "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/cloudbase/garm-provider-common/util" "github.com/cloudbase/garm/apiserver/controllers" "github.com/cloudbase/garm/auth" ) @@ -82,10 +82,28 @@ func WithDebugServer(parentRouter *mux.Router) *mux.Router { return parentRouter } -func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddleware, initMiddleware, instanceMiddleware auth.Middleware, manageWebhooks bool) *mux.Router { +func requestLogger(h http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // gathers metrics from the upstream handlers + metrics := httpsnoop.CaptureMetrics(h, w, r) + + //prints log and metrics + slog.Info( + "access_log", + slog.String("method", r.Method), + slog.String("uri", r.URL.RequestURI()), + slog.String("user_agent", r.Header.Get("User-Agent")), + slog.String("ip", r.RemoteAddr), + slog.Int("code", metrics.Code), + slog.Int64("bytes", metrics.Written), + slog.Duration("request_time", metrics.Duration), + ) + }) +} + +func NewAPIRouter(han *controllers.APIController, authMiddleware, initMiddleware, instanceMiddleware auth.Middleware, manageWebhooks bool) *mux.Router { router := mux.NewRouter() - logMiddleware := util.NewLoggingMiddleware(logWriter) - router.Use(logMiddleware) + router.Use(requestLogger) // Handles github webhooks webhookRouter := router.PathPrefix("/webhooks").Subrouter() @@ -106,6 +124,8 @@ func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddl callbackRouter := apiSubRouter.PathPrefix("/callbacks").Subrouter() 
callbackRouter.Handle("/status/", http.HandlerFunc(han.InstanceStatusMessageHandler)).Methods("POST", "OPTIONS") callbackRouter.Handle("/status", http.HandlerFunc(han.InstanceStatusMessageHandler)).Methods("POST", "OPTIONS") + callbackRouter.Handle("/system-info/", http.HandlerFunc(han.InstanceSystemInfoHandler)).Methods("POST", "OPTIONS") + callbackRouter.Handle("/system-info", http.HandlerFunc(han.InstanceSystemInfoHandler)).Methods("POST", "OPTIONS") callbackRouter.Use(instanceMiddleware.Middleware) /////////////////// diff --git a/auth/init_required.go b/auth/init_required.go index 6265649e..7dcc655b 100644 --- a/auth/init_required.go +++ b/auth/init_required.go @@ -16,7 +16,7 @@ package auth import ( "encoding/json" - "log" + "log/slog" "net/http" "github.com/cloudbase/garm/apiserver/params" @@ -37,16 +37,17 @@ type initRequired struct { // Middleware implements the middleware interface func (i *initRequired) Middleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() ctrlInfo, err := i.store.ControllerInfo() if err != nil || ctrlInfo.ControllerID.String() == "" { w.Header().Add("Content-Type", "application/json") w.WriteHeader(http.StatusConflict) if err := json.NewEncoder(w).Encode(params.InitializationRequired); err != nil { - log.Printf("failed to encode response: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } return } - ctx := r.Context() + next.ServeHTTP(w, r.WithContext(ctx)) }) } diff --git a/auth/instance_middleware.go b/auth/instance_middleware.go index 4e947099..1562b8b9 100644 --- a/auth/instance_middleware.go +++ b/auth/instance_middleware.go @@ -17,16 +17,18 @@ package auth import ( "context" "fmt" + "log/slog" "net/http" "strings" "time" - runnerErrors "github.com/cloudbase/garm-provider-common/errors" "github.com/cloudbase/garm/config" dbCommon "github.com/cloudbase/garm/database/common" "github.com/cloudbase/garm/params" 
"github.com/cloudbase/garm/runner/common" + runnerErrors "github.com/cloudbase/garm-provider-common/errors" + commonParams "github.com/cloudbase/garm-provider-common/params" jwt "github.com/golang-jwt/jwt/v5" "github.com/pkg/errors" ) @@ -39,7 +41,8 @@ type InstanceJWTClaims struct { // Scope is either repository or organization Scope params.PoolType `json:"scope"` // Entity is the repo or org name - Entity string `json:"entity"` + Entity string `json:"entity"` + CreateAttempt int `json:"create_attempt"` jwt.RegisteredClaims } @@ -56,11 +59,12 @@ func NewInstanceJWTToken(instance params.Instance, secret, entity string, poolTy ExpiresAt: expires, Issuer: "garm", }, - ID: instance.ID, - Name: instance.Name, - PoolID: instance.PoolID, - Scope: poolType, - Entity: entity, + ID: instance.ID, + Name: instance.Name, + PoolID: instance.PoolID, + Scope: poolType, + Entity: entity, + CreateAttempt: instance.CreateAttempt, } token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) tokenString, err := token.SignedString([]byte(secret)) @@ -111,13 +115,13 @@ func (amw *instanceMiddleware) Middleware(next http.Handler) http.Handler { ctx := r.Context() authorizationHeader := r.Header.Get("authorization") if authorizationHeader == "" { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } bearerToken := strings.Split(authorizationHeader, " ") if len(bearerToken) != 2 { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } @@ -130,30 +134,60 @@ func (amw *instanceMiddleware) Middleware(next http.Handler) http.Handler { }) if err != nil { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } if !token.Valid { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } ctx, err = amw.claimsToContext(ctx, claims) if err != nil { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } if InstanceID(ctx) == "" { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } runnerStatus := InstanceRunnerStatus(ctx) if runnerStatus != 
params.RunnerInstalling && runnerStatus != params.RunnerPending { // Instances that have finished installing can no longer authenticate to the API - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) + return + } + + instanceParams, err := InstanceParams(ctx) + if err != nil { + slog.InfoContext( + ctx, "could not find instance params", + "runner_name", InstanceName(ctx)) + invalidAuthResponse(ctx, w) + return + } + + // Token was generated for a previous attempt at creating this instance. + if claims.CreateAttempt != instanceParams.CreateAttempt { + slog.InfoContext( + ctx, "invalid token create attempt", + "runner_name", InstanceName(ctx), + "token_create_attempt", claims.CreateAttempt, + "instance_create_attempt", instanceParams.CreateAttempt) + invalidAuthResponse(ctx, w) + return + } + + // Only allow instances that are in the creating or running state to authenticate. + if instanceParams.Status != commonParams.InstanceCreating && instanceParams.Status != commonParams.InstanceRunning { + slog.InfoContext( + ctx, "invalid instance status", + "runner_name", InstanceName(ctx), + "status", instanceParams.Status) + invalidAuthResponse(ctx, w) return } diff --git a/auth/jwt.go b/auth/jwt.go index fb1833c6..8516a02f 100644 --- a/auth/jwt.go +++ b/auth/jwt.go @@ -18,7 +18,7 @@ import ( "context" "encoding/json" "fmt" - "log" + "log/slog" "net/http" "strings" @@ -73,14 +73,14 @@ func (amw *jwtMiddleware) claimsToContext(ctx context.Context, claims *JWTClaims return ctx, nil } -func invalidAuthResponse(w http.ResponseWriter) { +func invalidAuthResponse(ctx context.Context, w http.ResponseWriter) { w.Header().Add("Content-Type", "application/json") w.WriteHeader(http.StatusUnauthorized) if err := json.NewEncoder(w).Encode( apiParams.APIErrorResponse{ Error: "Authentication failed", }); err != nil { - log.Printf("failed to encode response: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to encode response") } } @@ -91,13 +91,13 @@ func (amw 
*jwtMiddleware) Middleware(next http.Handler) http.Handler { ctx := r.Context() authorizationHeader := r.Header.Get("authorization") if authorizationHeader == "" { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } bearerToken := strings.Split(authorizationHeader, " ") if len(bearerToken) != 2 { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } @@ -110,22 +110,22 @@ func (amw *jwtMiddleware) Middleware(next http.Handler) http.Handler { }) if err != nil { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } if !token.Valid { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } ctx, err = amw.claimsToContext(ctx, claims) if err != nil { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } if !IsEnabled(ctx) { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } diff --git a/auth/metrics.go b/auth/metrics.go index 7494fe2f..3be4577a 100644 --- a/auth/metrics.go +++ b/auth/metrics.go @@ -27,13 +27,13 @@ func (m *MetricsMiddleware) Middleware(next http.Handler) http.Handler { ctx := r.Context() authorizationHeader := r.Header.Get("authorization") if authorizationHeader == "" { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } bearerToken := strings.Split(authorizationHeader, " ") if len(bearerToken) != 2 { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } @@ -46,18 +46,18 @@ func (m *MetricsMiddleware) Middleware(next http.Handler) http.Handler { }) if err != nil { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } if !token.Valid { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } // we fully trust the claims if !claims.ReadMetrics { - invalidAuthResponse(w) + invalidAuthResponse(ctx, w) return } diff --git a/cmd/garm-cli/cmd/enterprise.go b/cmd/garm-cli/cmd/enterprise.go index b9ec7fe2..fe49545a 100644 --- a/cmd/garm-cli/cmd/enterprise.go +++ b/cmd/garm-cli/cmd/enterprise.go @@ -17,7 +17,6 @@ package cmd import ( "fmt" - 
"github.com/cloudbase/garm-provider-common/util" apiClientEnterprises "github.com/cloudbase/garm/client/enterprises" "github.com/cloudbase/garm/params" @@ -26,10 +25,9 @@ import ( ) var ( - enterpriseName string - enterpriseWebhookSecret string - enterpriseCreds string - enterpriseRandomWebhookSecret bool + enterpriseName string + enterpriseWebhookSecret string + enterpriseCreds string ) // enterpriseCmd represents the enterprise command @@ -57,14 +55,6 @@ var enterpriseAddCmd = &cobra.Command{ return errNeedsInitError } - if enterpriseRandomWebhookSecret { - secret, err := util.GetRandomString(32) - if err != nil { - return err - } - enterpriseWebhookSecret = secret - } - newEnterpriseReq := apiClientEnterprises.NewCreateEnterpriseParams() newEnterpriseReq.Body = params.CreateEnterpriseParams{ Name: enterpriseName, @@ -189,8 +179,6 @@ func init() { enterpriseAddCmd.Flags().StringVar(&enterpriseName, "name", "", "The name of the enterprise") enterpriseAddCmd.Flags().StringVar(&enterpriseWebhookSecret, "webhook-secret", "", "The webhook secret for this enterprise") enterpriseAddCmd.Flags().StringVar(&enterpriseCreds, "credentials", "", "Credentials name. 
See credentials list.") - enterpriseAddCmd.Flags().BoolVar(&enterpriseRandomWebhookSecret, "random-webhook-secret", false, "Generate a random webhook secret for this organization.") - enterpriseAddCmd.MarkFlagsMutuallyExclusive("webhook-secret", "random-webhook-secret") enterpriseAddCmd.MarkFlagRequired("credentials") //nolint enterpriseAddCmd.MarkFlagRequired("name") //nolint diff --git a/cmd/garm-cli/cmd/log.go b/cmd/garm-cli/cmd/log.go index 9e6669ce..19708afc 100644 --- a/cmd/garm-cli/cmd/log.go +++ b/cmd/garm-cli/cmd/log.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "log" + "log/slog" "net/http" "net/url" "os" @@ -36,7 +37,7 @@ var logCmd = &cobra.Command{ wsScheme = "wss" } u := url.URL{Scheme: wsScheme, Host: parsedURL.Host, Path: "/api/v1/ws"} - log.Printf("connecting to %s", u.String()) + slog.Debug("connecting", "url", u.String()) header := http.Header{} header.Add("Authorization", fmt.Sprintf("Bearer %s", mgr.Token)) @@ -59,10 +60,10 @@ var logCmd = &cobra.Command{ for { _, message, err := c.ReadMessage() if err != nil { - log.Printf("read: %q", err) + slog.With(slog.Any("error", err)).Error("reading log message") return } - log.Print(util.SanitizeLogEntry(string(message))) + fmt.Println(util.SanitizeLogEntry(string(message))) } }() diff --git a/cmd/garm-cli/cmd/pool.go b/cmd/garm-cli/cmd/pool.go index 3db8f7d8..4820cabd 100644 --- a/cmd/garm-cli/cmd/pool.go +++ b/cmd/garm-cli/cmd/pool.go @@ -87,9 +87,9 @@ Example: garm-cli pool list --org=5493e51f-3170-4ce3-9f05-3fe690fc6ec6 List pools from one enterprise: - garm-cli pool list --org=a8ee4c66-e762-4cbe-a35d-175dba2c9e62 + garm-cli pool list --enterprise=a8ee4c66-e762-4cbe-a35d-175dba2c9e62 - List all pools from all repos and orgs: + List all pools from all repos, orgs and enterprises: garm-cli pool list --all `, @@ -285,7 +285,7 @@ var poolUpdateCmd = &cobra.Command{ Long: `Updates pool characteristics. This command updates the pool characteristics. 
Runners already created prior to updating -the pool, will not be recreated. IF they no longer suit your needs, you will need to +the pool, will not be recreated. If they no longer suit your needs, you will need to explicitly remove them using the runner delete command. `, SilenceUsage: true, diff --git a/cmd/garm/main.go b/cmd/garm/main.go index 5399fee0..4b10fbaa 100644 --- a/cmd/garm/main.go +++ b/cmd/garm/main.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "log" + "log/slog" "net" "net/http" "os" @@ -36,6 +37,7 @@ import ( "github.com/cloudbase/garm/database/common" "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/runner" + garmUtil "github.com/cloudbase/garm/util" "github.com/cloudbase/garm/util/appdefaults" "github.com/cloudbase/garm/websocket" lumberjack "gopkg.in/natefinch/lumberjack.v2" @@ -69,22 +71,8 @@ func maybeInitController(db common.Store) error { return nil } -func main() { - flag.Parse() - if *version { - fmt.Println(Version) - return - } - ctx, stop := signal.NotifyContext(context.Background(), signals...) - defer stop() - fmt.Println(ctx) - - cfg, err := config.NewConfig(*conf) - if err != nil { - log.Fatalf("Fetching config: %+v", err) - } - - logWriter, err := util.GetLoggingWriter(cfg.Default.LogFile) +func setupLogging(ctx context.Context, logCfg config.Logging, hub *websocket.Hub) { + logWriter, err := util.GetLoggingWriter(logCfg.LogFile) if err != nil { log.Fatalf("fetching log writer: %+v", err) } @@ -102,7 +90,7 @@ func main() { // we got a SIGHUP. Rotate log file. if logger, ok := logWriter.(*lumberjack.Logger); ok { if err := logger.Rotate(); err != nil { - log.Printf("failed to rotate log file: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to rotate log file") } } } @@ -112,18 +100,72 @@ func main() { var writers []io.Writer = []io.Writer{ logWriter, } + + if hub != nil { + writers = append(writers, hub) + } + + wr := io.MultiWriter(writers...) 
+ + var logLevel slog.Level + switch logCfg.LogLevel { + case config.LevelDebug: + logLevel = slog.LevelDebug + case config.LevelInfo: + logLevel = slog.LevelInfo + case config.LevelWarn: + logLevel = slog.LevelWarn + case config.LevelError: + logLevel = slog.LevelError + default: + logLevel = slog.LevelInfo + } + + // logger options + opts := slog.HandlerOptions{ + AddSource: logCfg.LogSource, + Level: logLevel, + } + + var han slog.Handler + switch logCfg.LogFormat { + case config.FormatJSON: + han = slog.NewJSONHandler(wr, &opts) + default: + han = slog.NewTextHandler(wr, &opts) + } + + wrapped := garmUtil.ContextHandler{ + Handler: han, + } + slog.SetDefault(slog.New(wrapped)) + +} + +func main() { + flag.Parse() + if *version { + fmt.Println(Version) + return + } + ctx, stop := signal.NotifyContext(context.Background(), signals...) + defer stop() + + cfg, err := config.NewConfig(*conf) + if err != nil { + log.Fatalf("Fetching config: %+v", err) + } + + logCfg := cfg.GetLoggingConfig() var hub *websocket.Hub - if cfg.Default.EnableLogStreamer { + if logCfg.EnableLogStreamer != nil && *logCfg.EnableLogStreamer { hub = websocket.NewHub(ctx) if err := hub.Start(); err != nil { log.Fatal(err) } defer hub.Stop() //nolint - writers = append(writers, hub) } - - multiWriter := io.MultiWriter(writers...) 
- log.SetOutput(multiWriter) + setupLogging(ctx, logCfg, hub) db, err := database.NewDatabase(ctx, cfg.Database) if err != nil { @@ -170,19 +212,19 @@ func main() { log.Fatal(err) } - router := routers.NewAPIRouter(controller, multiWriter, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement) + router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement) if cfg.Metrics.Enable { - log.Printf("registering prometheus metrics collectors") + slog.InfoContext(ctx, "registering prometheus metrics collectors") if err := metrics.RegisterCollectors(runner); err != nil { log.Fatal(err) } - log.Printf("setting up metric routes") + slog.InfoContext(ctx, "setting up metric routes") router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware) } if cfg.Default.DebugServer { - log.Printf("setting up debug routes") + slog.InfoContext(ctx, "setting up debug routes") router = routers.WithDebugServer(router) } @@ -207,11 +249,11 @@ func main() { go func() { if cfg.APIServer.UseTLS { if err := srv.ServeTLS(listener, cfg.APIServer.TLSConfig.CRT, cfg.APIServer.TLSConfig.Key); err != nil { - log.Printf("Listening: %+v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "Listening") } } else { if err := srv.Serve(listener); err != http.ErrServerClosed { - log.Printf("Listening: %+v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "Listening") } } }() @@ -220,12 +262,12 @@ func main() { shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 60*time.Second) defer shutdownCancel() if err := srv.Shutdown(shutdownCtx); err != nil { - log.Printf("graceful api server shutdown failed: %+v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "graceful api server shutdown failed") } - log.Printf("waiting for runner to stop") + slog.With(slog.Any("error", err)).ErrorContext(ctx, "waiting for runner to stop") if err := 
runner.Wait(); err != nil { - log.Printf("failed to shutdown workers: %+v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to shutdown workers") os.Exit(1) } } diff --git a/config/config.go b/config/config.go index 89335306..26505ae3 100644 --- a/config/config.go +++ b/config/config.go @@ -18,7 +18,7 @@ import ( "crypto/tls" "crypto/x509" "fmt" - "log" + "log/slog" "net" "net/url" "os" @@ -33,6 +33,8 @@ import ( ) type DBBackendType string +type LogLevel string +type LogFormat string const ( // MySQLBackend represents the MySQL DB backend @@ -44,6 +46,24 @@ const ( EnvironmentVariablePrefix = "GARM" ) +const ( + // LevelDebug is the debug log level + LevelDebug LogLevel = "debug" + // LevelInfo is the info log level + LevelInfo LogLevel = "info" + // LevelWarn is the warn log level + LevelWarn LogLevel = "warn" + // LevelError is the error log level + LevelError LogLevel = "error" +) + +const ( + // FormatText is the text log format + FormatText LogFormat = "text" + // FormatJSON is the json log format + FormatJSON LogFormat = "json" +) + // NewConfig returns a new Config func NewConfig(cfgFile string) (*Config, error) { var config Config @@ -64,6 +84,7 @@ type Config struct { Providers []Provider `toml:"provider,omitempty" json:"provider,omitempty"` Github []Github `toml:"github,omitempty"` JWTAuth JWTAuth `toml:"jwt_auth" json:"jwt-auth"` + Logging Logging `toml:"logging" json:"logging"` } // Validate validates the config @@ -89,6 +110,10 @@ func (c *Config) Validate() error { return errors.Wrap(err, "validating jwt config") } + if err := c.Logging.Validate(); err != nil { + return errors.Wrap(err, "validating logging config") + } + providerNames := map[string]int{} for _, provider := range c.Providers { @@ -107,6 +132,52 @@ func (c *Config) Validate() error { return nil } +func (c *Config) GetLoggingConfig() Logging { + logging := c.Logging + if logging.LogFormat == "" { + logging.LogFormat = FormatText + } + + if logging.LogLevel == "" { + 
logging.LogLevel = LevelInfo + } + + // maintain backwards compatibility + if logging.LogFile == "" && c.Default.LogFile != "" { + logging.LogFile = c.Default.LogFile + } + if logging.EnableLogStreamer == nil && c.Default.EnableLogStreamer != nil { + logging.EnableLogStreamer = c.Default.EnableLogStreamer + } + + return logging +} + +type Logging struct { + // LogFile is the location of the log file. + LogFile string `toml:"log_file,omitempty" json:"log-file"` + // EnableLogStreamer enables the log streamer over websockets. + EnableLogStreamer *bool `toml:"enable_log_streamer,omitempty" json:"enable-log-streamer,omitempty"` + // LogLevel is the log level. + LogLevel LogLevel `toml:"log_level" json:"log-format"` + // LogFormat is the log format. + LogFormat LogFormat `toml:"log_format" json:"log-level"` + // LogSource enables the log source. + LogSource bool `toml:"log_source" json:"log-source"` +} + +func (l *Logging) Validate() error { + if l.LogLevel != LevelDebug && l.LogLevel != LevelInfo && l.LogLevel != LevelWarn && l.LogLevel != LevelError && l.LogLevel != "" { + return fmt.Errorf("invalid log level: %s", l.LogLevel) + } + + if l.LogFormat != FormatText && l.LogFormat != FormatJSON && l.LogFormat != "" { + return fmt.Errorf("invalid log format: %s", l.LogFormat) + } + + return nil +} + type Default struct { // CallbackURL is the URL where the instances can send back status reports. CallbackURL string `toml:"callback_url" json:"callback-url"` @@ -120,7 +191,7 @@ type Default struct { // LogFile is the location of the log file. 
LogFile string `toml:"log_file,omitempty" json:"log-file"` - EnableLogStreamer bool `toml:"enable_log_streamer"` + EnableLogStreamer *bool `toml:"enable_log_streamer,omitempty" json:"enable-log-streamer,omitempty"` DebugServer bool `toml:"debug_server" json:"debug-server"` } @@ -438,7 +509,7 @@ func (d *timeToLive) ParseDuration() (time.Duration, error) { func (d *timeToLive) Duration() time.Duration { duration, err := d.ParseDuration() if err != nil { - log.Printf("failed to parse duration: %s", err) + slog.With(slog.Any("error", err)).Error("failed to parse duration") return appdefaults.DefaultJWTTTL } // TODO(gabriel-samfira): should we have a minimum TTL? diff --git a/database/sql/jobs.go b/database/sql/jobs.go index 1d050f01..e0ed7ef8 100644 --- a/database/sql/jobs.go +++ b/database/sql/jobs.go @@ -3,7 +3,7 @@ package sql import ( "context" "encoding/json" - "log" + "log/slog" runnerErrors "github.com/cloudbase/garm-provider-common/errors" "github.com/cloudbase/garm/database/common" @@ -53,7 +53,7 @@ func sqlWorkflowJobToParamsJob(job WorkflowJob) (params.Job, error) { return jobParam, nil } -func (s *sqlDatabase) paramsJobToWorkflowJob(job params.Job) (WorkflowJob, error) { +func (s *sqlDatabase) paramsJobToWorkflowJob(ctx context.Context, job params.Job) (WorkflowJob, error) { asJson, err := json.Marshal(job.Labels) if err != nil { return WorkflowJob{}, errors.Wrap(err, "marshaling labels") @@ -83,7 +83,7 @@ func (s *sqlDatabase) paramsJobToWorkflowJob(job params.Job) (WorkflowJob, error if job.RunnerName != "" { instance, err := s.getInstanceByName(s.ctx, job.RunnerName) if err != nil { - log.Printf("failed to get instance by name: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get instance by name") } else { workflofJob.InstanceID = &instance.ID } @@ -218,7 +218,7 @@ func (s *sqlDatabase) CreateOrUpdateJob(ctx context.Context, job params.Job) (pa if err == nil { workflowJob.InstanceID = &instance.ID } else { - log.Printf("failed 
to get instance by name: %v", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get instance by name") } } @@ -237,7 +237,7 @@ func (s *sqlDatabase) CreateOrUpdateJob(ctx context.Context, job params.Job) (pa return params.Job{}, errors.Wrap(err, "saving job") } } else { - workflowJob, err := s.paramsJobToWorkflowJob(job) + workflowJob, err := s.paramsJobToWorkflowJob(ctx, job) if err != nil { return params.Job{}, errors.Wrap(err, "converting job") } diff --git a/database/sql/sql.go b/database/sql/sql.go index 47c6ef02..e513be42 100644 --- a/database/sql/sql.go +++ b/database/sql/sql.go @@ -17,7 +17,7 @@ package sql import ( "context" "fmt" - "log" + "log/slog" "strings" "github.com/pkg/errors" @@ -151,7 +151,7 @@ func (s *sqlDatabase) cascadeMigrationSQLite(model interface{}, name string, jus if model != nil { if err := s.conn.Migrator().AutoMigrate(model); err != nil { if err := s.conn.Exec(fmt.Sprintf(restoreNameTemplate, name, name, name)).Error; err != nil { - log.Printf("failed to restore table %s: %s", name, err) + slog.With(slog.Any("error", err)).Error("failed to restore table", "table", name) } return fmt.Errorf("failed to create table %s: %w", name, err) } @@ -193,13 +193,13 @@ func (s *sqlDatabase) cascadeMigration() error { func (s *sqlDatabase) migrateDB() error { if s.conn.Migrator().HasIndex(&Organization{}, "idx_organizations_name") { if err := s.conn.Migrator().DropIndex(&Organization{}, "idx_organizations_name"); err != nil { - log.Printf("failed to drop index idx_organizations_name: %s", err) + slog.With(slog.Any("error", err)).Error("failed to drop index idx_organizations_name") } } if s.conn.Migrator().HasIndex(&Repository{}, "idx_owner") { if err := s.conn.Migrator().DropIndex(&Repository{}, "idx_owner"); err != nil { - log.Printf("failed to drop index idx_owner: %s", err) + slog.With(slog.Any("error", err)).Error("failed to drop index idx_owner") } } diff --git a/doc/config_logging.md b/doc/config_logging.md new file mode 
100644 index 00000000..ae3b93ed --- /dev/null +++ b/doc/config_logging.md @@ -0,0 +1,35 @@ +# The logging section + +GARM has switched to the `slog` package for logging, adding structured logging. As such, we added a dedicated `logging` section to the config to tweak the logging settings. We moved the `enable_log_streamer` and the `log_file` options from the `default` section to the `logging` section. They are still available in the `default` section for backwards compatibility, but they are deprecated and will be removed in a future release. + +An example of the new `logging` section: + +```toml +[logging] +# Uncomment this line if you'd like to log to a file instead of standard output. +# log_file = "/tmp/runner-manager.log" + +# enable_log_streamer enables streaming the logs over websockets +enable_log_streamer = true +# log_format is the output format of the logs. GARM uses structured logging and can +# output as "text" or "json" +log_format = "text" +# log_level is the logging level GARM will output. Available log levels are: +# * debug +# * info +# * warn +# * error +log_level = "debug" +# log_source will output information about the function that generated the log line. +log_source = false +``` + +By default GARM logs everything to standard output. You can optionally log to file by adding the `log_file` option to the `logging` section. The `enable_log_streamer` option allows you to stream GARM logs directly to your terminal. Set this option to `true`, then you can use the following command to stream logs: + +```bash +garm-cli debug-log +``` + +The `log_format`, `log_level` and `log_source` options allow you to tweak the logging output. The `log_format` option can be set to `text` or `json`. The `log_level` option can be set to `debug`, `info`, `warn` or `error`. The `log_source` option will output information about the function that generated the log line. All these options influence how the structured logging is output. 
+ +This will allow you to ingest GARM logs in a central location such as an ELK stack or similar. \ No newline at end of file diff --git a/doc/providers.md b/doc/providers.md index d1f891f0..6862432a 100644 --- a/doc/providers.md +++ b/doc/providers.md @@ -50,6 +50,9 @@ For non testing purposes, there are two external providers currently available: * [Kubernetes](https://github.com/mercedes-benz/garm-provider-k8s) - Thanks to the amazing folks at @mercedes-benz for sharing their awesome provider! * [LXD](https://github.com/cloudbase/garm-provider-lxd) * [Incus](https://github.com/cloudbase/garm-provider-incus) +* [Equinix Metal](https://github.com/cloudbase/garm-provider-equinix) +* [Amazon EC2](https://github.com/cloudbase/garm-provider-aws) +* [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) Details on how to install and configure them are available in their respective repositories. diff --git a/doc/quickstart.md b/doc/quickstart.md index 9f90e960..062154c5 100644 --- a/doc/quickstart.md +++ b/doc/quickstart.md @@ -101,6 +101,9 @@ This is where you have a decision to make. GARM has a number of providers you ca * [Kubernetes](https://github.com/mercedes-benz/garm-provider-k8s) - Thanks to the amazing folks at @mercedes-benz for sharing their awesome provider! * [LXD](https://github.com/cloudbase/garm-provider-lxd) * [Incus](https://github.com/cloudbase/garm-provider-incus) +* [Equinix Metal](https://github.com/cloudbase/garm-provider-equinix) +* [Amazon EC2](https://github.com/cloudbase/garm-provider-aws) +* [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) All currently available providers are `external`. diff --git a/doc/using_garm.md b/doc/using_garm.md new file mode 100644 index 00000000..6a41777a --- /dev/null +++ b/doc/using_garm.md @@ -0,0 +1,529 @@ +# Using GARM + +This document will walk you through the various commands and options available in GARM. 
It is assumed that you have already installed GARM and have it running. If you haven't, please check out the [quickstart](/doc/quickstart.md) document for instructions on how to install GARM. + +While using the GARM cli, you will most likely spend most of your time listing pools and runners, but we will cover most of the available commands and options. Some of them we'll skip (like the `init` or `profile` subcommands), as they've been covered in the [quickstart](/doc/quickstart.md) document. + +## Table of contents + + +## Listing controller info + +You can list the controller info by running the following command: + +```bash +garm-cli controller-info show ++------------------------+----------------------------------------------------------------------------+ +| FIELD | VALUE | ++------------------------+----------------------------------------------------------------------------+ +| Controller ID | a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | +| Hostname | garm | +| Metadata URL | https://garm.example.com/api/v1/metadata | +| Callback URL | https://garm.example.com/api/v1/callbacks | +| Webhook Base URL | https://garm.example.com/webhooks | +| Controller Webhook URL | https://garm.example.com/webhooks/a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | ++------------------------+----------------------------------------------------------------------------+ +``` + +There are several things of interest in this output. + +* `Controller ID` - This is the unique identifier of the controller. Each GARM installation, on first run will automatically generate a unique controller ID. This is important for several reasons. For one, it allows us to run several GARM controllers on the same repos/orgs/enterprises, without accidentally clashing with each other. Each runner started by a GARM controller, will be tagged with this controller ID in order to easily identify runners that we manage. +* `Hostname` - This is the hostname of the machine where GARM is running. This is purely informative.
+* `Metadata URL` - This URL is configured by the user in the GARM config file, and is the URL that is presented to the runners via userdata when they get set up. Runners will connect to this URL and retrieve information they might need to set themselves up. GARM cannot automatically determine this URL, as it is dependent on the user's network setup. GARM may be hidden behind a load balancer or a reverse proxy, in which case, the URL by which the GARM controller can be accessed may be different than the IP addresses that are locally visible to GARM. +* `Callback URL` - This URL is configured by the user in the GARM config file, and is the URL that is presented to the runners via userdata when they get set up. Runners will connect to this URL and send status updates and system information (OS version, OS name, github runner agent ID, etc) to the controller. +* `Webhook Base URL` - This is the base URL for webhooks. It is configured by the user in the GARM config file. This URL can be called into by GitHub itself when hooks get triggered by a workflow. GARM needs to know when a new job is started in order to schedule the creation of a new runner. Job webhooks sent to this URL will be recorded by GARM and acted upon. While you can configure this URL directly in your GitHub repo settings, it is advised to use the `Controller Webhook URL` instead, as it is unique to each controller, and allows you to potentially install multiple GARM controllers inside the same repo. +* `Controller Webhook URL` - This is the URL that GitHub will call into when a webhook is triggered. This URL is unique to each GARM controller and is the preferred URL to use in order to receive webhooks from GitHub. It serves the same purpose as the `Webhook Base URL`, but is unique to each controller, allowing you to potentially install multiple GARM controllers inside the same repo. + +We will see the `Controller Webhook URL` later when we set up the GitHub repo to send webhooks to GARM.
+ +## Listing configured providers + +GARM uses providers to create runners. These providers are external executables that GARM calls into to create runners in a particular IaaS. + +Once configured (see [provider configuration](/doc/providers.md)), you can list the configured providers by running the following command: + +```bash +ubuntu@garm:~$ garm-cli provider list ++--------------+---------------------------------+----------+ +| NAME | DESCRIPTION | TYPE | ++--------------+---------------------------------+----------+ +| incus | Incus external provider | external | ++--------------+---------------------------------+----------+ +| lxd | LXD external provider | external | ++--------------+---------------------------------+----------+ +| openstack | OpenStack external provider | external | ++--------------+---------------------------------+----------+ +| azure | Azure provider | external | ++--------------+---------------------------------+----------+ +| k8s_external | k8s external provider | external | ++--------------+---------------------------------+----------+ +| Amazon EC2 | Amazon EC2 provider | external | ++--------------+---------------------------------+----------+ +| equinix | Equinix Metal | external | ++--------------+---------------------------------+----------+ +``` + +Each of these providers can be used to set up a runner pool for a repository, organization or enterprise. + +## Listing github credentials + +GARM needs access to your GitHub repositories, organizations or enterprise in order to manage runners. This is done via a [GitHub personal access token](/doc/github_credentials.md). You can configure multiple tokens with access to various repositories, organizations or enterprises, either on GitHub or on GitHub Enterprise Server. + +The credentials sections allow you to override the API URL, Upload URL and base URLs, unlocking the ability to use GARM with GitHub Enterprise Server. 
+ +To list existing credentials, run the following command: + +```bash +ubuntu@garm:~$ garm-cli credentials list ++-------------+------------------------------------+--------------------+-------------------------+-----------------------------+ +| NAME | DESCRIPTION | BASE URL | API URL | UPLOAD URL | ++-------------+------------------------------------+--------------------+-------------------------+-----------------------------+ +| gabriel | github token for user gabriel | https://github.com | https://api.github.com/ | https://uploads.github.com/ | ++-------------+------------------------------------+--------------------+-------------------------+-----------------------------+ +| gabriel_org | github token with org level access | https://github.com | https://api.github.com/ | https://uploads.github.com/ | ++-------------+------------------------------------+--------------------+-------------------------+-----------------------------+ +``` + +These credentials are configured in the GARM config file. You can add, remove or modify them as needed. When using GitHub, you don't need to explicitly set the API URL, Upload URL or base URL, as they are automatically set to the GitHub defaults. When using GitHub Enterprise Server, you will need to set these URLs explicitly. See the [github credentials](/doc/github_credentials.md) section for more details. + +## Adding a new repository + +To add a new repository we need to use credentials that have access to the repository.
We've listed credentials above, so let's add our first repository: + +```bash +ubuntu@garm:~$ garm-cli repository add \ + --name garm \ + --owner gabriel-samfira \ + --credentials gabriel \ + --install-webhook \ + --random-webhook-secret ++----------------------+--------------------------------------+ +| FIELD | VALUE | ++----------------------+--------------------------------------+ +| ID | be3a0673-56af-4395-9ebf-4521fea67567 | +| Owner | gabriel-samfira | +| Name | garm | +| Credentials | gabriel | +| Pool manager running | true | ++----------------------+--------------------------------------+ +``` + +Let's break down the command a bit and explain what happened above. We added a new repository to GARM, that belongs to the user `gabriel-samfira` and is called `garm`. When using GitHub, this translates to `https://github.com/gabriel-samfira/garm`. + +As part of the above command, we used the credentials called `gabriel` to authenticate to GitHub. If those credentials didn't have access to the repository, we would have received an error when adding the repo. + +The other interesting bit about the above command is that we automatically added the `webhook` to the repository and generated a secure random secret to authenticate the webhooks that come in from GitHub for this new repo. Any webhook claiming to be for the `gabriel-samfira/garm` repo, will be validated against the secret that was generated. + +### Managing repository webhooks + +The `webhook` URL that was used, will correspond to the `Controller Webhook URL` that we saw earlier when we listed the controller info.
Let's list it and see what it looks like: + +```bash +ubuntu@garm:~$ garm-cli repository webhook show be3a0673-56af-4395-9ebf-4521fea67567 ++--------------+----------------------------------------------------------------------------+ +| FIELD | VALUE | ++--------------+----------------------------------------------------------------------------+ +| ID | 460257636 | +| URL | https://garm.example.com/webhooks/a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | +| Events | [workflow_job] | +| Active | true | +| Insecure SSL | false | ++--------------+----------------------------------------------------------------------------+ +``` + +We can see that it's active, and the events to which it subscribed. + +The `--install-webhook` and `--random-webhook-secret` options are convenience options that allow you to quickly add a new repository to GARM and have it ready to receive webhooks from GitHub. If you don't want to install the webhook, you can add the repository without it, and then install it later using the `garm-cli repository webhook install` command (which we'll show in a second) or manually add it in the GitHub UI. 
+ +To uninstall a webhook from a repository, you can use the following command: + +```bash +garm-cli repository webhook uninstall be3a0673-56af-4395-9ebf-4521fea67567 +``` + +After which listing the webhook will show that it's inactive: + +```bash +ubuntu@garm:~$ garm-cli repository webhook show be3a0673-56af-4395-9ebf-4521fea67567 +Error: [GET /repositories/{repoID}/webhook][404] GetRepoWebhookInfo default {Error:Not Found Details:hook not found} +``` + +You can always add it back using: + +```bash +ubuntu@garm:~$ garm-cli repository webhook install be3a0673-56af-4395-9ebf-4521fea67567 ++--------------+----------------------------------------------------------------------------+ +| FIELD | VALUE | ++--------------+----------------------------------------------------------------------------+ +| ID | 460258767 | +| URL | https://garm.example.com/webhooks/a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | +| Events | [workflow_job] | +| Active | true | +| Insecure SSL | false | ++--------------+----------------------------------------------------------------------------+ +``` + +To allow GARM to manage webhooks, the PAT you're using must have the `admin:repo_hook` and `admin:org_hook` scopes. Webhook management is not available for enterprises. For enterprises you will have to add the webhook manually. + +To manually add a webhook, see the [webhooks](/doc/webhooks.md) section. 
+ +## Listing repositories + +To list existing repositories, run the following command: + +```bash +ubuntu@garm:~$ garm-cli repository list ++--------------------------------------+-----------------+---------+------------------+------------------+ +| ID | OWNER | NAME | CREDENTIALS NAME | POOL MGR RUNNING | ++--------------------------------------+-----------------+---------+------------------+------------------+ +| be3a0673-56af-4395-9ebf-4521fea67567 | gabriel-samfira | garm | gabriel | true | ++--------------------------------------+-----------------+---------+------------------+------------------+ +``` + +This will list all the repositories that GARM is currently managing. + +## Removing a repository + +To remove a repository, you can use the following command: + +```bash +garm-cli repository delete be3a0673-56af-4395-9ebf-4521fea67567 +``` + +This will remove the repository from GARM, and if a webhook was installed, will also clean up the webhook from the repository. + +Note: GARM will not remove a webhook that points to the `Base Webhook URL`. It will only remove webhooks that are namespaced to the running controller. + +## Adding a new organization + +Adding a new organization is similar to adding a new repository. You need to use credentials that have access to the organization, and you can add the organization to GARM using the following command: + +```bash +ubuntu@garm:~$ garm-cli organization add \ + --credentials gabriel_org \ + --name gsamfira \ + --install-webhook \ + --random-webhook-secret ++----------------------+--------------------------------------+ +| FIELD | VALUE | ++----------------------+--------------------------------------+ +| ID | b50f648d-708f-48ed-8a14-cf58887af9cf | +| Name | gsamfira | +| Credentials | gabriel_org | +| Pool manager running | true | ++----------------------+--------------------------------------+ +``` + +This will add the organization `gsamfira` to GARM, and install a webhook for it. 
The webhook will be validated against the secret that was generated. The only difference between adding an organization and adding a repository is that you use the `organization` subcommand instead of the `repository` subcommand, and the `--name` option represents the `name` of the organization. + +Managing webhooks for organizations is similar to managing webhooks for repositories. You can list, show, install and uninstall webhooks for organizations using the `garm-cli organization webhook` subcommand. We won't go into details here, as it's similar to managing webhooks for repositories. + +All the other operations that exist on repositories, like listing, removing, etc, also exist for organizations and enterprises. Have a look at the help for the `garm-cli organization` subcommand for more details. + +## Adding an enterprise + +Enterprises are a bit special. Currently we don't support managing webhooks for enterprises, mainly because the level of access that would be required to do so seems a bit too much to enable in GARM itself. And considering that you'll probably ever only have one enterprise with multiple organizations and repositories, the effort/risk to benefit ratio makes this feature not worth implementing at the moment. + +To add an enterprise to GARM, you can use the following command: + +```bash +garm-cli enterprise add \ + --credentials gabriel_enterprise \ + --name samfira \ + --webhook-secret SuperSecretWebhookTokenPleaseReplaceMe +``` + +The `name` of the enterprise is the ["slug" of the enterprise](https://docs.github.com/en/enterprise-cloud@latest/admin/managing-your-enterprise-account/creating-an-enterprise-account). + +You will then have to manually add the `Controller Webhook URL` to the enterprise in the GitHub UI. + +All the other operations that exist on repositories, like listing, removing, etc, also exist for organizations and enterprises. Have a look at the help for the `garm-cli enterprise` subcommand for more details. 
+ +At that point the enterprise will be added to GARM and you can start managing runners for it. + +## Creating a runner pool + +Now that we have a repository, organization or enterprise added to GARM, we can create a runner pool for it. A runner pool is a collection of runners of the same type, that are managed by GARM and are used to run workflows for the repository, organization or enterprise. + +You can create multiple pools of runners for the same entity (repository, organization or enterprise), and you can create pools of runners of different types. For example, you can have a pool of runners that are created on AWS, and another pool of runners that are created on Azure, k8s, LXD, etc. For repositories or organizations with complex needs, you can set up a number of pools that cover a wide range of needs, based on cost, capability (GPUs, FPGAs, etc) or sheer raw computing power. You don't have to pick just one and managing all of them is done using the exact same commands, as we'll show below. + +Before we create a pool, we have to decide on which provider we want to use. We've listed the providers above, so let's pick one and create a pool of runners for our repository. For the purpose of this example, we'll use the `incus` provider. We'll show you how to create a pool using this provider, but keep in mind that adding another pool using a different provider is done using the exact same commands. The only difference will be in the `--image`, `--flavor` and `--extra-specs` options that you'll use when creating the pool. + +Out of those three options, only the `--image` and `--flavor` are mandatory. The `--extra-specs` option is optional and is used to pass additional information to the provider when creating the pool. The `--extra-specs` option is provider specific, and you'll have to consult the provider documentation to see what options are available. + +But I digress. 
Let's create a pool of runners using the `incus` provider, for the `gabriel-samfira/garm` repository we created above: + +```bash +garm-cli pool add \ + --enabled=false \ + --repo be3a0673-56af-4395-9ebf-4521fea67567 \ + --image "images:ubuntu/22.04/cloud" \ + --flavor default \ + --provider-name incus \ + --min-idle-runners 1 \ + --tags ubuntu,incus ++--------------------------+----------------------------------------+ +| FIELD | VALUE | ++--------------------------+----------------------------------------+ +| ID | 9daa34aa-a08a-4f29-a782-f54950d8521a | +| Provider Name | incus | +| Image | images:ubuntu/22.04/cloud | +| Flavor | default | +| OS Type | linux | +| OS Architecture | amd64 | +| Max Runners | 5 | +| Min Idle Runners | 1 | +| Runner Bootstrap Timeout | 20 | +| Tags | self-hosted, x64, Linux, ubuntu, incus | +| Belongs to | gabriel-samfira/garm | +| Level | repo | +| Enabled | false | +| Runner Prefix | garm | +| Extra specs | | +| GitHub Runner Group | | ++--------------------------+----------------------------------------+ +``` + +Let's unpack the command a bit and explain what happened above. We added a new pool of runners to GARM, that belongs to the `gabriel-samfira/garm` repository. We used the `incus` provider to create the pool, and we specified the `--image` and `--flavor` options to tell the provider what kind of runners we want to create. On Incus and LXD, the flavor maps to a `profile` and the image maps to an incus or LXD image, as you would normally use when spinning up a new container or VM using the `incus launch` command. + +We also specified the `--min-idle-runners` option to tell GARM to always keep at least 1 runner idle in the pool. This is useful for repositories that have a lot of workflows that run often, and we want to make sure that we always have a runner ready to pick up a job. + +If we review the output of the command, we can see that the pool was created with a maximum number of 5 runners. 
This is just a default we can tweak when creating the pool, or later using the `garm-cli pool update` command. We can also see that the pool was created with a runner bootstrap timeout of 20 minutes. This timeout is important on providers where the instance may take a long time to spin up. For example, on Equinix Metal, some operating systems can take a few minutes to install and reboot. This timeout can be tweaked to a higher value to account for this. + +The pool was created with the `--enabled` flag set to `false`, so the pool won't create any runners yet: + +```bash +ubuntu@garm:~$ garm-cli runner list 9daa34aa-a08a-4f29-a782-f54950d8521a ++----+------+--------+---------------+---------+ +| NR | NAME | STATUS | RUNNER STATUS | POOL ID | ++----+------+--------+---------------+---------+ ++----+------+--------+---------------+---------+ +``` + +## Listing pools + +To list pools created for a repository you can run: + +```bash +ubuntu@garm:~$ garm-cli pool list --repo=be3a0673-56af-4395-9ebf-4521fea67567 ++--------------------------------------+---------------------------+---------+------------------------------------+------------+-------+---------+---------------+ +| ID | IMAGE | FLAVOR | TAGS | BELONGS TO | LEVEL | ENABLED | RUNNER PREFIX | ++--------------------------------------+---------------------------+---------+------------------------------------+------------+-------+---------+---------------+ +| 9daa34aa-a08a-4f29-a782-f54950d8521a | images:ubuntu/22.04/cloud | default | self-hosted x64 Linux ubuntu incus | | | false | garm | ++--------------------------------------+---------------------------+---------+------------------------------------+------------+-------+---------+---------------+ +``` + +If you want to list pools for an organization or enterprise, you can use the `--org` or `--enterprise` options respectively. 
+ +## Showing pool info + +You can get detailed information about a pool by running the following command: + +```bash +ubuntu@garm:~$ garm-cli pool show 9daa34aa-a08a-4f29-a782-f54950d8521a ++--------------------------+----------------------------------------+ +| FIELD | VALUE | ++--------------------------+----------------------------------------+ +| ID | 9daa34aa-a08a-4f29-a782-f54950d8521a | +| Provider Name | incus | +| Image | images:ubuntu/22.04/cloud | +| Flavor | default | +| OS Type | linux | +| OS Architecture | amd64 | +| Max Runners | 5 | +| Min Idle Runners | 1 | +| Runner Bootstrap Timeout | 20 | +| Tags | self-hosted, x64, Linux, ubuntu, incus | +| Belongs to | gabriel-samfira/garm | +| Level | repo | +| Enabled | false | +| Runner Prefix | garm | +| Extra specs | | +| GitHub Runner Group | | ++--------------------------+----------------------------------------+ +``` + +## Deleting a pool + +In order to delete a pool, you must first make sure there are no runners in the pool. To ensure this, we can first disable the pool, to make sure no new runners are created, remove the runners or allow them to be used, then we can delete the pool. 
+ +To disable a pool, you can use the following command: + +```bash +ubuntu@garm:~$ garm-cli pool update 9daa34aa-a08a-4f29-a782-f54950d8521a --enabled=false ++--------------------------+----------------------------------------+ +| FIELD | VALUE | ++--------------------------+----------------------------------------+ +| ID | 9daa34aa-a08a-4f29-a782-f54950d8521a | +| Provider Name | incus | +| Image | images:ubuntu/22.04/cloud | +| Flavor | default | +| OS Type | linux | +| OS Architecture | amd64 | +| Max Runners | 5 | +| Min Idle Runners | 1 | +| Runner Bootstrap Timeout | 20 | +| Tags | self-hosted, x64, Linux, ubuntu, incus | +| Belongs to | gabriel-samfira/garm | +| Level | repo | +| Enabled | false | +| Runner Prefix | garm | +| Extra specs | | +| GitHub Runner Group | | ++--------------------------+----------------------------------------+ +``` + +If there are no runners in the pool, you can then remove it: + +```bash +ubuntu@garm:~$ garm-cli pool delete 9daa34aa-a08a-4f29-a782-f54950d8521a +``` + +## Update a pool + +You can update a pool by using the `garm-cli pool update` command. Nearly every aspect of a pool can be updated after it has been created. 
To demonstrate the command, we can enable the pool we created earlier: + +```bash +ubuntu@garm:~$ garm-cli pool update 9daa34aa-a08a-4f29-a782-f54950d8521a --enabled=true ++--------------------------+----------------------------------------+ +| FIELD | VALUE | ++--------------------------+----------------------------------------+ +| ID | 9daa34aa-a08a-4f29-a782-f54950d8521a | +| Provider Name | incus | +| Image | images:ubuntu/22.04/cloud | +| Flavor | default | +| OS Type | linux | +| OS Architecture | amd64 | +| Max Runners | 5 | +| Min Idle Runners | 1 | +| Runner Bootstrap Timeout | 20 | +| Tags | self-hosted, x64, Linux, ubuntu, incus | +| Belongs to | gabriel-samfira/garm | +| Level | repo | +| Enabled | true | +| Runner Prefix | garm | +| Extra specs | | +| GitHub Runner Group | | ++--------------------------+----------------------------------------+ +``` + +Now that the pool is enabled, GARM will start creating runners for it. We can list the runners in the pool to see if any have been created: + +```bash +ubuntu@garm:~$ garm-cli runner list 9daa34aa-a08a-4f29-a782-f54950d8521a ++----+-------------------+---------+---------------+--------------------------------------+ +| NR | NAME | STATUS | RUNNER STATUS | POOL ID | ++----+-------------------+---------+---------------+--------------------------------------+ +| 1 | garm-BFrp51VoVBCO | running | installing | 9daa34aa-a08a-4f29-a782-f54950d8521a | ++----+-------------------+---------+---------------+--------------------------------------+ +``` + +We can see that a runner has been created and is currently being installed. 
If we check incus, we should also see it there as well: + +```bash +root@incus:~# incus list ++-------------------+---------+----------------------+-----------------------------------------------+-----------+-----------+ +| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | ++-------------------+---------+----------------------+-----------------------------------------------+-----------+-----------+ +| garm-BFrp51VoVBCO | RUNNING | 10.23.120.217 (eth0) | fd42:e6ea:8b6c:6cb9:216:3eff:feaa:fabf (eth0) | CONTAINER | 0 | ++-------------------+---------+----------------------+-----------------------------------------------+-----------+-----------+ +``` + +Awesome! This runner will be able to pick up jobs that match the labels we've set on the pool. + +## Listing runners + +You can list runners for a pool, for a repository, organization or enterprise, or for all of them. To list all runners, you can run: + +```bash +ubuntu@garm:~$ garm-cli runner list --all ++----+---------------------+---------+---------------+--------------------------------------+ +| NR | NAME | STATUS | RUNNER STATUS | POOL ID | ++----+---------------------+---------+---------------+--------------------------------------+ +| 1 | garm-jZWtnxYHR6sG | running | idle | 8ec34c1f-b053-4a5d-80d6-40afdfb389f9 | ++----+---------------------+---------+---------------+--------------------------------------+ +| 2 | garm-2vtBBaT2dgIvFg | running | idle | c03c8101-3ae0-49d7-98b7-298a3689d24c | ++----+---------------------+---------+---------------+--------------------------------------+ +| 3 | garm-Ew7SzN6LVlEC | running | idle | 577627f4-1add-4a45-9c62-3a7cbdec8403 | ++----+---------------------+---------+---------------+--------------------------------------+ +| 4 | garm-BFrp51VoVBCO | running | idle | 9daa34aa-a08a-4f29-a782-f54950d8521a | ++----+---------------------+---------+---------------+--------------------------------------+ +``` + +Have a look at the help command for the flags available to the `list` 
subcommand. + +## Showing runner info + +You can get detailed information about a runner by running the following command: + +```bash +ubuntu@garm:~$ garm-cli runner show garm-BFrp51VoVBCO ++-----------------+------------------------------------------------------------------------------------------------------+ +| FIELD | VALUE | ++-----------------+------------------------------------------------------------------------------------------------------+ +| ID | b332a811-0ebf-474c-9997-780124e22382 | +| Provider ID | garm-BFrp51VoVBCO | +| Name | garm-BFrp51VoVBCO | +| OS Type | linux | +| OS Architecture | amd64 | +| OS Name | Ubuntu | +| OS Version | 22.04 | +| Status | running | +| Runner Status | idle | +| Pool ID | 9daa34aa-a08a-4f29-a782-f54950d8521a | +| Addresses | 10.23.120.217 | +| | fd42:e6ea:8b6c:6cb9:216:3eff:feaa:fabf | +| Status Updates | 2024-02-11T23:39:54: downloading tools from https://github.com/actions/runner/releases/download/v2.3 | +| | 12.0/actions-runner-linux-x64-2.312.0.tar.gz | +| | 2024-02-11T23:40:04: extracting runner | +| | 2024-02-11T23:40:07: installing dependencies | +| | 2024-02-11T23:40:13: configuring runner | +| | 2024-02-11T23:40:13: runner registration token was retrieved | +| | 2024-02-11T23:40:19: runner successfully configured after 1 attempt(s) | +| | 2024-02-11T23:40:20: installing runner service | +| | 2024-02-11T23:40:20: starting service | +| | 2024-02-11T23:40:21: runner successfully installed | ++-----------------+------------------------------------------------------------------------------------------------------+ +``` + +## Deleting a runner + +You can delete a runner by running the following command: + +```bash +garm-cli runner rm garm-BFrp51VoVBCO +``` + +Only idle runners can be removed. If a runner is executing a job, it cannot be removed. However, a runner that is currently running a job, will be removed anyway when that job finishes. 
You can wait for the job to finish or you can cancel the job from the GitHub workflow page. + +In some cases, providers may error out when creating or deleting a runner. This can happen if the provider is misconfigured. To avoid situations in which GARM gets deadlocked trying to remove a runner from a provider that is in error, we can forcefully remove a runner. The `--force` flag will make GARM ignore any error returned by the provider when attempting to delete an instance: + +```bash +garm-cli runner remove --force garm-BFrp51VoVBCO +``` + +Awesome! We've covered all the major parts of using GARM. This is all you need to have your workflows run on your self-hosted runners. Of course, each provider may have its own particularities, config options, extra specs and caveats (all of which should be documented in the provider README), but once added to the GARM config, creating a pool should be the same. + +## The debug-log command + +GARM outputs logs to standard out, log files and optionally to a websocket for easy debugging. This is just a convenience feature that allows you to stream logs to your terminal without having to log into the server. It's disabled by default, but if you enable it, you'll be able to run: + +```bash +ubuntu@garm:~$ garm-cli debug-log +time=2024-02-12T08:36:18.584Z level=INFO msg=access_log method=GET uri=/api/v1/ws user_agent=Go-http-client/1.1 ip=127.0.0.1:47260 code=200 bytes=0 request_time=447.445µs +time=2024-02-12T08:36:31.251Z level=INFO msg=access_log method=GET uri=/api/v1/instances user_agent=Go-http-client/1.1 ip=127.0.0.1:58460 code=200 bytes=1410 request_time=656.184µs +``` + +This will bring a real-time log to your terminal. While this feature should be fairly secure, I encourage you to only expose it within networks you know are secure. This can be done by configuring a reverse proxy in front of GARM that only allows connections to the websocket endpoint from certain locations. 
+ +## Listing recorded jobs + +GARM will record any job that comes in and for which we have a pool configured. If we don't have a pool for a particular job, then that job is ignored. There is no point in recording jobs that we can't do anything about. It would just bloat the database for no reason. + +To view existing jobs, run the following command: + +```bash +garm-cli job list +``` + +If you've just set up GARM and have not yet created a pool or triggered a job, this will be empty. If you've configured everything and still don't receive jobs, you'll need to make sure that your URLs (discussed at the beginning of this article) are correct. GitHub needs to be able to reach the webhook URL that our GARM instance listens on. \ No newline at end of file diff --git a/go.mod b/go.mod index 89a019c7..04ff9ec1 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.21 require ( github.com/BurntSushi/toml v1.3.2 github.com/cloudbase/garm-provider-common v0.1.1 + github.com/felixge/httpsnoop v1.0.4 github.com/go-openapi/errors v0.21.0 github.com/go-openapi/runtime v0.26.2 github.com/go-openapi/strfmt v0.21.10 @@ -41,7 +42,6 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chzyer/readline v1.5.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.3.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/analysis v0.21.5 // indirect diff --git a/metrics/enterprise.go b/metrics/enterprise.go index 53ddfc5e..ddba98aa 100644 --- a/metrics/enterprise.go +++ b/metrics/enterprise.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "strconv" "github.com/cloudbase/garm/auth" @@ -14,7 +14,7 @@ func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hos enterprises, err := c.runner.ListEnterprises(ctx) if err != nil { - log.Printf("listing providers: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing 
providers") return } @@ -28,7 +28,7 @@ func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hos enterprise.ID, // label: id ) if err != nil { - log.Printf("cannot collect enterpriseInfo metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric") continue } ch <- enterpriseInfo @@ -42,7 +42,7 @@ func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hos strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running ) if err != nil { - log.Printf("cannot collect enterprisePoolManagerStatus metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric") continue } ch <- enterprisePoolManagerStatus diff --git a/metrics/health.go b/metrics/health.go index 4cc9e83a..da4b67dc 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "github.com/prometheus/client_golang/prometheus" ) @@ -15,7 +15,7 @@ func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostnam controllerID, ) if err != nil { - log.Printf("error on creating health metric: %s", err) + slog.With(slog.Any("error", err)).Error("error on creating health metric") return } ch <- m diff --git a/metrics/instance.go b/metrics/instance.go index 8bf234c4..d89409ed 100644 --- a/metrics/instance.go +++ b/metrics/instance.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" @@ -14,13 +14,13 @@ func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostn instances, err := c.runner.ListAllInstances(ctx) if err != nil { - log.Printf("cannot collect metrics, listing instances: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect metrics, listing instances") return } pools, err := c.runner.ListAllPools(ctx) if err != nil { - 
log.Printf("listing pools: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") return } @@ -71,7 +71,7 @@ func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostn ) if err != nil { - log.Printf("cannot collect runner metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect runner metric") continue } ch <- m diff --git a/metrics/metrics.go b/metrics/metrics.go index 1032a88f..61d02015 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "github.com/cloudbase/garm/auth" "github.com/cloudbase/garm/params" @@ -192,7 +192,7 @@ func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) { func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) { controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext()) if err != nil { - log.Printf("failed to get controller info: %s", err) + slog.With(slog.Any("error", err)).Error("failed to get controller info") return } diff --git a/metrics/organization.go b/metrics/organization.go index b5c57f08..e5ea8292 100644 --- a/metrics/organization.go +++ b/metrics/organization.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "strconv" "github.com/cloudbase/garm/auth" @@ -14,7 +14,7 @@ func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, h organizations, err := c.runner.ListOrganizations(ctx) if err != nil { - log.Printf("listing providers: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") return } @@ -28,7 +28,7 @@ func (c *GarmCollector) CollectOrganizationMetric(ch chan<- prometheus.Metric, h organization.ID, // label: id ) if err != nil { - log.Printf("cannot collect organizationInfo metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationInfo metric") continue } ch <- organizationInfo @@ -42,7 +42,7 @@ func (c *GarmCollector) 
CollectOrganizationMetric(ch chan<- prometheus.Metric, h strconv.FormatBool(organization.PoolManagerStatus.IsRunning), // label: running ) if err != nil { - log.Printf("cannot collect organizationPoolManagerStatus metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect organizationPoolManagerStatus metric") continue } ch <- organizationPoolManagerStatus diff --git a/metrics/pool.go b/metrics/pool.go index deefd06e..110812bd 100644 --- a/metrics/pool.go +++ b/metrics/pool.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "strconv" "strings" @@ -15,7 +15,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname pools, err := c.runner.ListAllPools(ctx) if err != nil { - log.Printf("listing pools: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing pools") return } @@ -64,7 +64,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname poolNames[pool.ID].Type, // label: pool_type ) if err != nil { - log.Printf("cannot collect poolInfo metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolInfo metric") continue } ch <- poolInfo @@ -77,7 +77,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname strconv.FormatBool(pool.Enabled), // label: enabled ) if err != nil { - log.Printf("cannot collect poolStatus metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolStatus metric") continue } ch <- poolStatus @@ -89,7 +89,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname pool.ID, // label: id ) if err != nil { - log.Printf("cannot collect poolMaxRunners metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMaxRunners metric") continue } ch <- poolMaxRunners @@ -101,7 +101,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname pool.ID, // label: id ) if err 
!= nil { - log.Printf("cannot collect poolMinIdleRunners metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolMinIdleRunners metric") continue } ch <- poolMinIdleRunners @@ -113,7 +113,7 @@ func (c *GarmCollector) CollectPoolMetric(ch chan<- prometheus.Metric, hostname pool.ID, // label: id ) if err != nil { - log.Printf("cannot collect poolBootstrapTimeout metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect poolBootstrapTimeout metric") continue } ch <- poolBootstrapTimeout diff --git a/metrics/provider.go b/metrics/provider.go index e2e903f3..e51f295e 100644 --- a/metrics/provider.go +++ b/metrics/provider.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "github.com/cloudbase/garm/auth" "github.com/prometheus/client_golang/prometheus" @@ -13,7 +13,7 @@ func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostn providers, err := c.runner.ListProviders(ctx) if err != nil { - log.Printf("listing providers: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers") return } @@ -28,7 +28,7 @@ func (c *GarmCollector) CollectProviderMetric(ch chan<- prometheus.Metric, hostn provider.Description, // label: description ) if err != nil { - log.Printf("cannot collect providerInfo metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect providerInfo metric") continue } ch <- providerInfo diff --git a/metrics/repository.go b/metrics/repository.go index 4d0f3d78..b778782e 100644 --- a/metrics/repository.go +++ b/metrics/repository.go @@ -1,7 +1,7 @@ package metrics import ( - "log" + "log/slog" "strconv" "github.com/cloudbase/garm/auth" @@ -14,7 +14,7 @@ func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hos repositories, err := c.runner.ListRepositories(ctx) if err != nil { - log.Printf("listing providers: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing 
providers") return } @@ -29,7 +29,7 @@ func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hos repository.ID, // label: id ) if err != nil { - log.Printf("cannot collect repositoryInfo metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryInfo metric") continue } ch <- repositoryInfo @@ -43,7 +43,7 @@ func (c *GarmCollector) CollectRepositoryMetric(ch chan<- prometheus.Metric, hos strconv.FormatBool(repository.PoolManagerStatus.IsRunning), // label: running ) if err != nil { - log.Printf("cannot collect repositoryPoolManagerStatus metric: %s", err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect repositoryPoolManagerStatus metric") continue } ch <- repositoryPoolManagerStatus diff --git a/params/params.go b/params/params.go index 38b6b056..df37ddb2 100644 --- a/params/params.go +++ b/params/params.go @@ -566,3 +566,9 @@ type HookInfo struct { type CertificateBundle struct { RootCertificates map[string][]byte `json:"root_certificates"` } + +type UpdateSystemInfoParams struct { + OSName string `json:"os_name,omitempty"` + OSVersion string `json:"os_version,omitempty"` + AgentID *int64 `json:"agent_id,omitempty"` +} diff --git a/params/requests.go b/params/requests.go index 74bfeb50..946c8536 100644 --- a/params/requests.go +++ b/params/requests.go @@ -239,5 +239,5 @@ type UpdateEntityParams struct { type InstanceUpdateMessage struct { Status RunnerStatus `json:"status"` Message string `json:"message"` - AgentID *int64 `json:"agent_id"` + AgentID *int64 `json:"agent_id,omitempty"` } diff --git a/runner/common/pool.go b/runner/common/pool.go index caf1da38..47655414 100644 --- a/runner/common/pool.go +++ b/runner/common/pool.go @@ -27,9 +27,9 @@ const ( PoolReapTimeoutInterval = 5 * time.Minute // Temporary tools download token is valid for 1 hour by default. // There is no point in making an API call to get available tools, for every runner - // we spin up. 
We cache the tools for one minute. This should save us a lot of API calls + // we spin up. We cache the tools for 5 minutes. This should save us a lot of API calls // in cases where we have a lot of runners spin up at the same time. - PoolToolUpdateInterval = 1 * time.Minute + PoolToolUpdateInterval = 5 * time.Minute // BackoffTimer is the time we wait before attempting to make another request // to the github API. diff --git a/runner/enterprises.go b/runner/enterprises.go index b86a4284..cf2ca46e 100644 --- a/runner/enterprises.go +++ b/runner/enterprises.go @@ -3,7 +3,7 @@ package runner import ( "context" "fmt" - "log" + "log/slog" "strings" runnerErrors "github.com/cloudbase/garm-provider-common/errors" @@ -47,7 +47,9 @@ func (r *Runner) CreateEnterprise(ctx context.Context, param params.CreateEnterp defer func() { if err != nil { if deleteErr := r.store.DeleteEnterprise(ctx, enterprise.ID); deleteErr != nil { - log.Printf("failed to delete enterprise: %s", deleteErr) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to delete enterprise", + "enterprise_id", enterprise.ID) } } }() @@ -59,7 +61,9 @@ func (r *Runner) CreateEnterprise(ctx context.Context, param params.CreateEnterp } if err := poolMgr.Start(); err != nil { if deleteErr := r.poolManagerCtrl.DeleteEnterprisePoolManager(enterprise); deleteErr != nil { - log.Printf("failed to cleanup pool manager for enterprise %s", enterprise.ID) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to cleanup pool manager for enterprise", + "enterprise_id", enterprise.ID) } return params.Enterprise{}, errors.Wrap(err, "starting enterprise pool manager") } diff --git a/runner/metadata.go b/runner/metadata.go index 7fa8f782..7890434e 100644 --- a/runner/metadata.go +++ b/runner/metadata.go @@ -6,7 +6,7 @@ import ( "encoding/base64" "fmt" "html/template" - "log" + "log/slog" "strings" "github.com/cloudbase/garm-provider-common/defaults" @@ -44,7 +44,6 @@ func 
validateInstanceState(ctx context.Context) (params.Instance, error) { instance, err := auth.InstanceParams(ctx) if err != nil { - log.Printf("failed to get instance params: %s", err) return params.Instance{}, runnerErrors.ErrUnauthorized } return instance, nil @@ -53,13 +52,16 @@ func validateInstanceState(ctx context.Context) (params.Instance, error) { func (r *Runner) GetRunnerServiceName(ctx context.Context) (string, error) { instance, err := validateInstanceState(ctx) if err != nil { - log.Printf("failed to get instance params: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get instance params") return "", runnerErrors.ErrUnauthorized } pool, err := r.store.GetPoolByID(r.ctx, instance.PoolID) if err != nil { - log.Printf("failed to get pool: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get pool", + "pool_id", instance.PoolID) return "", errors.Wrap(err, "fetching pool") } @@ -109,7 +111,8 @@ func (r *Runner) GenerateSystemdUnitFile(ctx context.Context, runAsUser string) func (r *Runner) GetJITConfigFile(ctx context.Context, file string) ([]byte, error) { instance, err := validateInstanceState(ctx) if err != nil { - log.Printf("failed to get instance params: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get instance params") return nil, runnerErrors.ErrUnauthorized } jitConfig := instance.JitConfiguration @@ -142,7 +145,8 @@ func (r *Runner) GetInstanceGithubRegistrationToken(ctx context.Context) (string instance, err := auth.InstanceParams(ctx) if err != nil { - log.Printf("failed to get instance params: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get instance params") return "", runnerErrors.ErrUnauthorized } @@ -175,7 +179,8 @@ func (r *Runner) GetInstanceGithubRegistrationToken(ctx context.Context) (string func (r *Runner) GetRootCertificateBundle(ctx context.Context) (params.CertificateBundle, error) { instance, err := 
auth.InstanceParams(ctx) if err != nil { - log.Printf("failed to get instance params: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get instance params") return params.CertificateBundle{}, runnerErrors.ErrUnauthorized } @@ -186,7 +191,10 @@ func (r *Runner) GetRootCertificateBundle(ctx context.Context) (params.Certifica bundle, err := poolMgr.RootCABundle() if err != nil { - log.Printf("failed to get root CA bundle: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get root CA bundle", + "instance", instance.Name, + "pool_manager", poolMgr.ID()) // The root CA bundle is invalid. Return an empty bundle to the runner and log the event. return params.CertificateBundle{ RootCertificates: make(map[string][]byte), diff --git a/runner/organizations.go b/runner/organizations.go index 8e8c2a70..c86af1fb 100644 --- a/runner/organizations.go +++ b/runner/organizations.go @@ -17,7 +17,7 @@ package runner import ( "context" "fmt" - "log" + "log/slog" "strings" runnerErrors "github.com/cloudbase/garm-provider-common/errors" @@ -60,7 +60,9 @@ func (r *Runner) CreateOrganization(ctx context.Context, param params.CreateOrgP defer func() { if err != nil { if deleteErr := r.store.DeleteOrganization(ctx, org.ID); deleteErr != nil { - log.Printf("failed to delete org: %s", deleteErr) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to delete org", + "org_id", org.ID) } } }() @@ -71,7 +73,9 @@ func (r *Runner) CreateOrganization(ctx context.Context, param params.CreateOrgP } if err := poolMgr.Start(); err != nil { if deleteErr := r.poolManagerCtrl.DeleteOrgPoolManager(org); deleteErr != nil { - log.Printf("failed to cleanup pool manager for org %s", org.ID) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to cleanup pool manager for org", + "org_id", org.ID) } return params.Organization{}, errors.Wrap(err, "starting org pool manager") } @@ -156,7 +160,9 @@ func (r *Runner) 
DeleteOrganization(ctx context.Context, orgID string, keepWebho if err := poolMgr.UninstallWebhook(ctx); err != nil { // TODO(gabriel-samfira): Should we error out here? - log.Printf("failed to uninstall webhook: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to uninstall webhook", + "org_id", org.ID) } } diff --git a/runner/pool/enterprise.go b/runner/pool/enterprise.go index 05911e4a..f03947b5 100644 --- a/runner/pool/enterprise.go +++ b/runner/pool/enterprise.go @@ -5,7 +5,7 @@ import ( "encoding/base64" "encoding/json" "fmt" - "log" + "log/slog" "net/http" "strings" "sync" @@ -25,6 +25,7 @@ import ( var _ poolHelper = &enterprise{} func NewEnterprisePoolManager(ctx context.Context, cfg params.Enterprise, cfgInternal params.Internal, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) { + ctx = util.WithContext(ctx, slog.Any("pool_mgr", cfg.Name), slog.Any("pool_type", params.EnterprisePool)) ghc, ghEnterpriseClient, err := util.GithubClient(ctx, cfgInternal.OAuth2Token, cfgInternal.GithubCredentialsDetails) if err != nil { return nil, errors.Wrap(err, "getting github client") @@ -134,7 +135,9 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par defer func() { if err != nil && runner != nil { _, innerErr := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runner.GetID()) - log.Printf("failed to remove runner: %v", innerErr) + slog.With(slog.Any("error", innerErr)).ErrorContext( + ctx, "failed to remove runner", + "runner_id", runner.GetID(), "enterprise", r.cfg.Name) } }() diff --git a/runner/pool/organization.go b/runner/pool/organization.go index 4a88443d..0f534508 100644 --- a/runner/pool/organization.go +++ b/runner/pool/organization.go @@ -19,7 +19,7 @@ import ( "encoding/base64" "encoding/json" "fmt" - "log" + "log/slog" "net/http" "strings" "sync" @@ -39,6 +39,7 @@ import ( var _ poolHelper = &organization{} func NewOrganizationPoolManager(ctx 
context.Context, cfg params.Organization, cfgInternal params.Internal, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) { + ctx = util.WithContext(ctx, slog.Any("pool_mgr", cfg.Name), slog.Any("pool_type", params.OrganizationPool)) ghc, _, err := util.GithubClient(ctx, cfgInternal.OAuth2Token, cfgInternal.GithubCredentialsDetails) if err != nil { return nil, errors.Wrap(err, "getting github client") @@ -146,7 +147,9 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p defer func() { if err != nil && runner != nil { _, innerErr := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runner.GetID()) - log.Printf("failed to remove runner: %v", innerErr) + slog.With(slog.Any("error", innerErr)).ErrorContext( + ctx, "failed to remove runner", + "runner_id", runner.GetID(), "organization", r.cfg.Name) } }() @@ -369,7 +372,7 @@ func (r *organization) InstallHook(ctx context.Context, req *github.Hook) (param } if _, err := r.ghcli.PingOrgHook(ctx, r.cfg.Name, hook.GetID()); err != nil { - log.Printf("failed to ping hook %d: %v", *hook.ID, err) + slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to ping hook", "hook_id", hook.GetID()) } return hookToParamsHookInfo(hook), nil diff --git a/runner/pool/pool.go b/runner/pool/pool.go index 77fe515e..b15e3677 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -17,7 +17,7 @@ package pool import ( "context" "fmt" - "log" + "log/slog" "math" "net/http" "strconv" @@ -134,23 +134,30 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error { _, err := r.store.GetJobByID(r.ctx, jobParams.ID) if err != nil { if !errors.Is(err, runnerErrors.ErrNotFound) { - r.log("failed to get job %d: %s", jobParams.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to get job", + "job_id", jobParams.ID) return } // This job is new to us. Check if we have a pool that can handle it. 
potentialPools, err := r.store.FindPoolsMatchingAllTags(r.ctx, r.helper.PoolType(), r.helper.ID(), jobParams.Labels) if err != nil { - r.log("failed to find pools matching tags %s: %s; not recording job", strings.Join(jobParams.Labels, ", "), err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to find pools matching tags; not recording job", + "requested_tags", strings.Join(jobParams.Labels, ", ")) return } if len(potentialPools) == 0 { - r.log("no pools matching tags %s; not recording job", strings.Join(jobParams.Labels, ", ")) + slog.WarnContext( + r.ctx, "no pools matching tags; not recording job", + "requested_tags", strings.Join(jobParams.Labels, ", ")) return } } if _, jobErr := r.store.CreateOrUpdateJob(r.ctx, jobParams); jobErr != nil { - r.log("failed to update job %d: %s", jobParams.ID, jobErr) + slog.With(slog.Any("error", jobErr)).ErrorContext( + r.ctx, "failed to update job", "job_id", jobParams.ID) } if triggeredBy != 0 && jobParams.ID != triggeredBy { @@ -159,7 +166,9 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error { // still queued and we don't remove the lock, it will linger until the lock timeout is reached. // That may take a long time, so we break the lock here and allow it to be scheduled again. 
if err := r.store.BreakLockJobIsQueued(r.ctx, triggeredBy); err != nil { - r.log("failed to break lock for job %d: %s", triggeredBy, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to break lock for job", + "job_id", triggeredBy) } } }() @@ -189,15 +198,21 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error { if errors.Is(err, runnerErrors.ErrNotFound) { return nil } - r.log("failed to update runner %s status: %s", util.SanitizeLogEntry(jobParams.RunnerName), err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", util.SanitizeLogEntry(jobParams.RunnerName)) return errors.Wrap(err, "updating runner") } - r.log("marking instance %s as pending_delete", util.SanitizeLogEntry(jobParams.RunnerName)) + slog.DebugContext( + r.ctx, "marking instance as pending_delete", + "runner_name", util.SanitizeLogEntry(jobParams.RunnerName)) if _, err := r.setInstanceStatus(jobParams.RunnerName, commonParams.InstancePendingDelete, nil); err != nil { if errors.Is(err, runnerErrors.ErrNotFound) { return nil } - r.log("failed to update runner %s status: %s", util.SanitizeLogEntry(jobParams.RunnerName), err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", util.SanitizeLogEntry(jobParams.RunnerName)) return errors.Wrap(err, "updating runner") } case "in_progress": @@ -220,7 +235,9 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error { if errors.Is(err, runnerErrors.ErrNotFound) { return nil } - r.log("failed to update runner %s status: %s", util.SanitizeLogEntry(jobParams.RunnerName), err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", util.SanitizeLogEntry(jobParams.RunnerName)) return errors.Wrap(err, "updating runner") } // Set triggeredBy here so we break the lock on any potential queued job. 
@@ -233,7 +250,9 @@ func (r *basePoolManager) HandleWorkflowJob(job params.WorkflowJob) error { return errors.Wrap(err, "getting pool") } if err := r.ensureIdleRunnersForOnePool(pool); err != nil { - r.log("error ensuring idle runners for pool %s: %s", pool.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "error ensuring idle runners for pool", + "pool_id", pool.ID) } } return nil @@ -253,12 +272,16 @@ func jobIdFromLabels(labels []string) int64 { } func (r *basePoolManager) startLoopForFunction(f func() error, interval time.Duration, name string, alwaysRun bool) { - r.log("starting %s loop for %s", name, r.helper.String()) + slog.InfoContext( + r.ctx, "starting loop for entity", + "loop_name", name) ticker := time.NewTicker(interval) r.wg.Add(1) defer func() { - r.log("%s loop exited for pool %s", name, r.helper.String()) + slog.InfoContext( + r.ctx, "pool loop exited", + "loop_name", name) ticker.Stop() r.wg.Done() }() @@ -273,7 +296,9 @@ func (r *basePoolManager) startLoopForFunction(f func() error, interval time.Dur select { case <-ticker.C: if err := f(); err != nil { - r.log("error in loop %s: %q", name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "error in loop", + "loop_name", name) if errors.Is(err, runnerErrors.ErrUnauthorized) { r.setPoolRunningState(false, err.Error()) } @@ -304,7 +329,8 @@ func (r *basePoolManager) updateTools() error { // Update tools cache. 
tools, err := r.helper.FetchTools() if err != nil { - r.log("failed to update tools for repo %s: %s", r.helper.String(), err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update tools for repo") r.setPoolRunningState(false, err.Error()) r.waitForTimeoutOrCanceled(common.BackoffTimer) return fmt.Errorf("failed to update tools for repo %s: %w", r.helper.String(), err) @@ -313,7 +339,7 @@ func (r *basePoolManager) updateTools() error { r.tools = tools r.mux.Unlock() - r.log("successfully updated tools") + slog.DebugContext(r.ctx, "successfully updated tools") r.setPoolRunningState(true, "") return err } @@ -365,7 +391,9 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne runnerNames := map[string]bool{} for _, run := range runners { if !r.isManagedRunner(labelsFromRunner(run)) { - r.log("runner %s is not managed by a pool belonging to %s", *run.Name, r.helper.String()) + slog.DebugContext( + r.ctx, "runner is not managed by a pool we manage", + "runner_name", run.GetName()) continue } runnerNames[*run.Name] = true @@ -374,7 +402,9 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne for _, instance := range dbInstances { lockAcquired := r.keyMux.TryLock(instance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instance.Name) + slog.DebugContext( + r.ctx, "failed to acquire lock for instance", + "runner_name", instance.Name) continue } defer r.keyMux.Unlock(instance.Name, false) @@ -397,21 +427,27 @@ func (r *basePoolManager) cleanupOrphanedProviderRunners(runners []*github.Runne case params.RunnerPending, params.RunnerInstalling: if time.Since(instance.UpdatedAt).Minutes() < float64(pool.RunnerTimeout()) { // runner is still installing. We give it a chance to finish. 
- r.log("runner %s is still installing, give it a chance to finish", instance.Name) + slog.DebugContext( + r.ctx, "runner is still installing, give it a chance to finish", + "runner_name", instance.Name) continue } } if time.Since(instance.UpdatedAt).Minutes() < 5 { // instance was updated recently. We give it a chance to register itself in github. - r.log("instance %s was updated recently, skipping check", instance.Name) + slog.DebugContext( + r.ctx, "instance was updated recently, skipping check", + "runner_name", instance.Name) continue } if ok := runnerNames[instance.Name]; !ok { // Set pending_delete on DB field. Allow consolidate() to remove it. if _, err := r.setInstanceStatus(instance.Name, commonParams.InstancePendingDelete, nil); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner", + "runner_name", instance.Name) return errors.Wrap(err, "updating runner") } } @@ -431,17 +467,23 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error { runnersByName := map[string]*github.Runner{} for _, run := range runners { if !r.isManagedRunner(labelsFromRunner(run)) { - r.log("runner %s is not managed by a pool belonging to %s", *run.Name, r.helper.String()) + slog.DebugContext( + r.ctx, "runner is not managed by a pool we manage", + "runner_name", run.GetName()) continue } runnersByName[*run.Name] = run } for _, instance := range dbInstances { - r.log("attempting to lock instance %s", instance.Name) + slog.DebugContext( + r.ctx, "attempting to lock instance", + "runner_name", instance.Name) lockAcquired := r.keyMux.TryLock(instance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instance.Name) + slog.DebugContext( + r.ctx, "failed to acquire lock for instance", + "runner_name", instance.Name) continue } defer r.keyMux.Unlock(instance.Name, false) @@ -460,9 +502,13 @@ func (r *basePoolManager) 
reapTimedOutRunners(runners []*github.Runner) error { // never started on the instance. // * A JIT config was created, but the runner never joined github. if runner, ok := runnersByName[instance.Name]; !ok || runner.GetStatus() == "offline" { - r.log("reaping timed-out/failed runner %s", instance.Name) + slog.InfoContext( + r.ctx, "reaping timed-out/failed runner", + "runner_name", instance.Name) if err := r.DeleteRunner(instance, false); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", instance.Name) return errors.Wrap(err, "updating runner") } } @@ -488,7 +534,9 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) g, ctx := errgroup.WithContext(r.ctx) for _, runner := range runners { if !r.isManagedRunner(labelsFromRunner(runner)) { - r.log("runner %s is not managed by a pool belonging to %s", *runner.Name, r.helper.String()) + slog.DebugContext( + r.ctx, "runner is not managed by a pool we manage", + "runner_name", runner.GetName()) continue } @@ -505,7 +553,9 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) } // We no longer have a DB entry for this instance, and the runner appears offline in github. // Previous forceful removal may have failed? - r.log("Runner %s has no database entry in garm, removing from github", *runner.Name) + slog.InfoContext( + r.ctx, "Runner has no database entry in garm, removing from github", + "runner_name", runner.GetName()) resp, err := r.helper.RemoveGithubRunner(*runner.ID) if err != nil { // Removed in the meantime? @@ -524,14 +574,18 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) continue case commonParams.InstancePendingCreate, commonParams.InstanceCreating: // instance is still being created. We give it a chance to finish. 
- r.log("instance %s is still being created, give it a chance to finish", dbInstance.Name) + slog.DebugContext( + r.ctx, "instance is still being created, give it a chance to finish", + "runner_name", dbInstance.Name) continue case commonParams.InstanceRunning: // this check is not strictly needed, but can help avoid unnecessary strain on the provider. // At worst, we will have a runner that is offline in github for 5 minutes before we reap it. if time.Since(dbInstance.UpdatedAt).Minutes() < 5 { // instance was updated recently. We give it a chance to register itself in github. - r.log("instance %s was updated recently, skipping check", dbInstance.Name) + slog.DebugContext( + r.ctx, "instance was updated recently, skipping check", + "runner_name", dbInstance.Name) continue } } @@ -550,7 +604,9 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) var poolInstances []commonParams.ProviderInstance poolInstances, ok = poolInstanceCache[pool.ID] if !ok { - r.log("updating instances cache for pool %s", pool.ID) + slog.DebugContext( + r.ctx, "updating instances cache for pool", + "pool_id", pool.ID) poolInstances, err = provider.ListInstances(r.ctx, pool.ID) if err != nil { return errors.Wrapf(err, "fetching instances for pool %s", pool.ID) @@ -560,7 +616,9 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) lockAcquired := r.keyMux.TryLock(dbInstance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", dbInstance.Name) + slog.DebugContext( + r.ctx, "failed to acquire lock for instance", + "runner_name", dbInstance.Name) continue } @@ -575,18 +633,24 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) if !ok { // The runner instance is no longer on the provider, and it appears offline in github. // It should be safe to force remove it. 
- r.log("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name) + slog.InfoContext( + r.ctx, "Runner instance is no longer on the provider, removing from github", + "runner_name", dbInstance.Name) resp, err := r.helper.RemoveGithubRunner(*runner.ID) if err != nil { // Removed in the meantime? if resp != nil && resp.StatusCode == http.StatusNotFound { - r.log("runner dissapeared from github") + slog.DebugContext( + r.ctx, "runner disappeared from github", + "runner_name", dbInstance.Name) } else { return errors.Wrap(err, "removing runner from github") } } // Remove the database entry for the runner. - r.log("Removing %s from database", dbInstance.Name) + slog.InfoContext( + r.ctx, "Removing from database", + "runner_name", dbInstance.Name) if err := r.store.DeleteInstance(ctx, dbInstance.PoolID, dbInstance.Name); err != nil { return errors.Wrap(err, "removing runner from database") } @@ -598,10 +662,14 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) // instance is running, but github reports runner as offline. Log the event. // This scenario may require manual intervention. // Perhaps it just came online and github did not yet change it's status? 
- r.log("instance %s is online but github reports runner as offline", dbInstance.Name) + slog.WarnContext( + r.ctx, "instance is online but github reports runner as offline", + "runner_name", dbInstance.Name) return nil } else { - r.log("instance %s was found in stopped state; starting", dbInstance.Name) + slog.InfoContext( + r.ctx, "instance was found in stopped state; starting", + "runner_name", dbInstance.Name) //start the instance if err := provider.Start(r.ctx, dbInstance.ProviderID); err != nil { return errors.Wrapf(err, "starting instance %s", dbInstance.ProviderID) @@ -708,7 +776,8 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona // Attempt to create JIT config jitConfig, runner, err = r.helper.GetJITConfig(ctx, name, pool, labels) if err != nil { - r.log("failed to get JIT config, falling back to registration token: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to get JIT config, falling back to registration token") } } @@ -739,14 +808,18 @@ func (r *basePoolManager) AddRunner(ctx context.Context, poolID string, aditiona if err != nil { if instance.ID != "" { if err := r.DeleteRunner(instance, false); err != nil { - r.log("failed to cleanup instance: %s", instance.Name) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to cleanup instance", + "runner_name", instance.Name) } } if runner != nil { _, runnerCleanupErr := r.helper.RemoveGithubRunner(runner.GetID()) if err != nil { - r.log("failed to remove runner %d: %s", runner.GetID(), runnerCleanupErr) + slog.With(slog.Any("error", runnerCleanupErr)).ErrorContext( + ctx, "failed to remove runner", + "gh_runner_id", runner.GetID()) } } } @@ -765,7 +838,8 @@ func (r *basePoolManager) Status() params.PoolManagerStatus { } func (r *basePoolManager) waitForTimeoutOrCanceled(timeout time.Duration) { - r.log("sleeping for %.2f minutes", timeout.Minutes()) + slog.DebugContext( + r.ctx, fmt.Sprintf("sleeping for %.2f minutes", 
timeout.Minutes())) select { case <-time.After(timeout): case <-r.ctx.Done(): @@ -842,7 +916,9 @@ func (r *basePoolManager) addInstanceToProvider(instance params.Instance) error if instanceIDToDelete != "" { if err := provider.DeleteInstance(r.ctx, instanceIDToDelete); err != nil { if !errors.Is(err, runnerErrors.ErrNotFound) { - r.log("failed to cleanup instance: %s", instanceIDToDelete) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to cleanup instance", + "provider_id", instanceIDToDelete) } } } @@ -882,23 +958,23 @@ func (r *basePoolManager) getRunnerDetailsFromJob(job params.WorkflowJob) (param } // Runner name was not set in WorkflowJob by github. We can still attempt to // fetch the info we need, using the workflow run ID, from the API. - r.log("runner name not found in workflow job, attempting to fetch from API") + slog.InfoContext( + r.ctx, "runner name not found in workflow job, attempting to fetch from API", + "job_id", job.WorkflowJob.ID) runnerInfo, err = r.helper.GetRunnerInfoFromWorkflow(job) if err != nil { return params.RunnerInfo{}, errors.Wrap(err, "fetching runner name from API") } } - runnerDetails, err := r.store.GetInstanceByName(context.Background(), runnerInfo.Name) + _, err = r.store.GetInstanceByName(context.Background(), runnerInfo.Name) if err != nil { - r.log("could not find runner details for %s", util.SanitizeLogEntry(runnerInfo.Name)) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "could not find runner details", + "runner_name", util.SanitizeLogEntry(runnerInfo.Name)) return params.RunnerInfo{}, errors.Wrap(err, "fetching runner details") } - if _, err := r.helper.GetPoolByID(runnerDetails.PoolID); err != nil { - r.log("runner %s (pool ID: %s) does not belong to any pool we manage: %s", runnerDetails.Name, runnerDetails.PoolID, err) - return params.RunnerInfo{}, errors.Wrap(err, "fetching pool for instance") - } return runnerInfo, nil } @@ -989,9 +1065,13 @@ func (r *basePoolManager) 
updateArgsFromProviderInstance(providerInstance common } func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool) error { - r.log("scaling down pool %s", pool.ID) + slog.DebugContext( + ctx, "scaling down pool", + "pool_id", pool.ID) if !pool.Enabled { - r.log("pool %s is disabled, skipping scale down", pool.ID) + slog.DebugContext( + ctx, "pool is disabled, skipping scale down", + "pool_id", pool.ID) return nil } @@ -1035,13 +1115,18 @@ func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool lockAcquired := r.keyMux.TryLock(instanceToDelete.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instanceToDelete.Name) + slog.DebugContext( + ctx, "failed to acquire lock for instance", + "runner_name", instanceToDelete.Name) continue } defer r.keyMux.Unlock(instanceToDelete.Name, false) g.Go(func() error { - r.log("scaling down idle worker %s from pool %s\n", instanceToDelete.Name, pool.ID) + slog.InfoContext( + ctx, "scaling down idle worker from pool", + "runner_name", instanceToDelete.Name, + "pool_id", pool.ID) if err := r.DeleteRunner(instanceToDelete, false); err != nil { return fmt.Errorf("failed to delete instance %s: %w", instanceToDelete.ID, err) } @@ -1065,7 +1150,9 @@ func (r *basePoolManager) scaleDownOnePool(ctx context.Context, pool params.Pool for _, job := range queued { if time.Since(job.CreatedAt).Minutes() > 10 && pool.HasRequiredLabels(job.Labels) { if err := r.store.DeleteJob(ctx, job.ID); err != nil && !errors.Is(err, runnerErrors.ErrNotFound) { - r.log("failed to delete job %d: %s", job.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to delete job", + "job_id", job.ID) } } } @@ -1109,7 +1196,10 @@ func (r *basePoolManager) ensureIdleRunnersForOnePool(pool params.Pool) error { } if uint(len(existingInstances)) >= pool.MaxRunners { - r.log("max workers (%d) reached for pool %s, skipping idle worker creation", 
pool.MaxRunners, pool.ID) + slog.DebugContext( + r.ctx, "max workers reached for pool, skipping idle worker creation", + "max_runners", pool.MaxRunners, + "pool_id", pool.ID) return nil } @@ -1134,7 +1224,9 @@ func (r *basePoolManager) ensureIdleRunnersForOnePool(pool params.Pool) error { } for i := 0; i < required; i++ { - r.log("adding new idle worker to pool %s", pool.ID) + slog.InfoContext( + r.ctx, "adding new idle worker to pool", + "pool_id", pool.ID) if err := r.AddRunner(r.ctx, pool.ID, nil); err != nil { return fmt.Errorf("failed to add new instance for pool %s: %w", pool.ID, err) } @@ -1146,7 +1238,9 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po if !pool.Enabled { return nil } - r.log("running retry failed instances for pool %s", pool.ID) + slog.DebugContext( + ctx, "running retry failed instances for pool", + "pool_id", pool.ID) existingInstances, err := r.store.ListPoolInstances(r.ctx, pool.ID) if err != nil { @@ -1155,6 +1249,8 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po g, errCtx := errgroup.WithContext(ctx) for _, instance := range existingInstances { + instance := instance + if instance.Status != commonParams.InstanceError { continue } @@ -1162,21 +1258,29 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po continue } - r.log("attempting to retry failed instance %s", instance.Name) + slog.DebugContext( + ctx, "attempting to retry failed instance", + "runner_name", instance.Name) lockAcquired := r.keyMux.TryLock(instance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instance.Name) + slog.DebugContext( + ctx, "failed to acquire lock for instance", + "runner_name", instance.Name) continue } - instance := instance g.Go(func() error { defer r.keyMux.Unlock(instance.Name, false) + slog.DebugContext( + ctx, "attempting to clean up any previous instance", + "runner_name", instance.Name) // NOTE(gabriel-samfira): this is 
done in parallel. If there are many failed instances // this has the potential to create many API requests to the target provider. // TODO(gabriel-samfira): implement request throttling. if err := r.deleteInstanceFromProvider(errCtx, instance); err != nil { - r.log("failed to delete instance %s from provider: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to delete instance from provider", + "runner_name", instance.Name) // Bail here, otherwise we risk creating multiple failing instances, and losing track // of them. If Create instance failed to return a proper provider ID, we rely on the // name to delete the instance. If we don't bail here, and end up with multiple @@ -1186,7 +1290,9 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po // which we would rather avoid. return err } - + slog.DebugContext( + ctx, "cleanup of previously failed instance complete", + "runner_name", instance.Name) // TODO(gabriel-samfira): Incrementing CreateAttempt should be done within a transaction. // It's fairly safe to do here (for now), as there should be no other code path that updates // an instance in this state. @@ -1197,10 +1303,14 @@ func (r *basePoolManager) retryFailedInstancesForOnePool(ctx context.Context, po Status: commonParams.InstancePendingCreate, RunnerStatus: params.RunnerPending, } - r.log("queueing previously failed instance %s for retry", instance.Name) + slog.DebugContext( + ctx, "queueing previously failed instance for retry", + "runner_name", instance.Name) // Set instance to pending create and wait for retry. 
if _, err := r.updateInstance(instance.Name, updateParams); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to update runner status", + "runner_name", instance.Name) } return nil }) @@ -1243,7 +1353,9 @@ func (r *basePoolManager) scaleDown() error { for _, pool := range pools { pool := pool g.Go(func() error { - r.log("running scale down for pool %s", pool.ID) + slog.DebugContext( + ctx, "running scale down for pool", + "pool_id", pool.ID) return r.scaleDownOnePool(ctx, pool) }) } @@ -1291,6 +1403,11 @@ func (r *basePoolManager) deleteInstanceFromProvider(ctx context.Context, instan identifier = instance.Name } + slog.DebugContext( + ctx, "calling delete instance on provider", + "runner_name", instance.Name, + "provider_id", identifier) + if err := provider.DeleteInstance(ctx, identifier); err != nil { return errors.Wrap(err, "removing instance") } @@ -1304,17 +1421,23 @@ func (r *basePoolManager) deletePendingInstances() error { return fmt.Errorf("failed to fetch instances from store: %w", err) } - r.log("removing instances in pending_delete") + slog.DebugContext( + r.ctx, "removing instances in pending_delete") for _, instance := range instances { if instance.Status != commonParams.InstancePendingDelete && instance.Status != commonParams.InstancePendingForceDelete { // not in pending_delete status. Skip. 
continue } - r.log("removing instance %s in pool %s", instance.Name, instance.PoolID) + slog.InfoContext( + r.ctx, "removing instance from pool", + "runner_name", instance.Name, + "pool_id", instance.PoolID) lockAcquired := r.keyMux.TryLock(instance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instance.Name) + slog.InfoContext( + r.ctx, "failed to acquire lock for instance", + "runner_name", instance.Name) continue } @@ -1322,7 +1445,9 @@ func (r *basePoolManager) deletePendingInstances() error { // Set the status to deleting before launching the goroutine that removes // the runner from the provider (which can take a long time). if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceDeleting, nil); err != nil { - r.log("failed to update runner %s status: %q", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", instance.Name) r.keyMux.Unlock(instance.Name, false) continue } @@ -1334,29 +1459,41 @@ func (r *basePoolManager) deletePendingInstances() error { }() defer func(instance params.Instance) { if err != nil { - r.log("failed to remove instance %s: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to remove instance", + "runner_name", instance.Name) // failed to remove from provider. Set status to previous value, which will retry // the operation. 
if _, err := r.setInstanceStatus(instance.Name, currentStatus, nil); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", instance.Name) } } }(instance) - r.log("removing instance %s from provider", instance.Name) + slog.DebugContext( + r.ctx, "removing instance from provider", + "runner_name", instance.Name) err = r.deleteInstanceFromProvider(r.ctx, instance) if err != nil { if currentStatus != commonParams.InstancePendingForceDelete { return fmt.Errorf("failed to remove instance from provider: %w", err) } - log.Printf("failed to remove instance %s from provider (continuing anyway): %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to remove instance from provider (continuing anyway)", + "runner_name", instance.Name) } - r.log("removing instance %s from database", instance.Name) + slog.InfoContext( + r.ctx, "removing instance from database", + "runner_name", instance.Name) if deleteErr := r.store.DeleteInstance(r.ctx, instance.PoolID, instance.Name); deleteErr != nil { return fmt.Errorf("failed to delete instance from database: %w", deleteErr) } deleteMux = true - r.log("instance %s was successfully removed", instance.Name) + slog.InfoContext( + r.ctx, "instance was successfully removed", + "runner_name", instance.Name) return nil }(instance) //nolint } @@ -1376,17 +1513,24 @@ func (r *basePoolManager) addPendingInstances() error { continue } - r.log("attempting to acquire lock for instance %s (create)", instance.Name) + slog.DebugContext( + r.ctx, "attempting to acquire lock for instance", + "runner_name", instance.Name, + "action", "create_pending") lockAcquired := r.keyMux.TryLock(instance.Name) if !lockAcquired { - r.log("failed to acquire lock for instance %s", instance.Name) + slog.DebugContext( + r.ctx, "failed to acquire lock for instance", + "runner_name", instance.Name) continue 
} // Set the instance to "creating" before launching the goroutine. This will ensure that addPendingInstances() // won't attempt to create the runner a second time. if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceCreating, nil); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", instance.Name) r.keyMux.Unlock(instance.Name, false) // We failed to transition the instance to Creating. This means that garm will retry to create this instance // when the loop runs again and we end up with multiple instances. @@ -1395,14 +1539,23 @@ func (r *basePoolManager) addPendingInstances() error { go func(instance params.Instance) { defer r.keyMux.Unlock(instance.Name, false) - r.log("creating instance %s in pool %s", instance.Name, instance.PoolID) + slog.InfoContext( + r.ctx, "creating instance in pool", + "runner_name", instance.Name, + "pool_id", instance.PoolID) if err := r.addInstanceToProvider(instance); err != nil { - r.log("failed to add instance to provider: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to add instance to provider", + "runner_name", instance.Name) errAsBytes := []byte(err.Error()) - if _, err := r.setInstanceStatus(instance.Name, commonParams.InstanceError, errAsBytes); err != nil { - r.log("failed to update runner %s status: %s", instance.Name, err) + if _, statusErr := r.setInstanceStatus(instance.Name, commonParams.InstanceError, errAsBytes); statusErr != nil { + slog.With(slog.Any("error", statusErr)).ErrorContext( + r.ctx, "failed to update runner status", + "runner_name", instance.Name) } - r.log("failed to create instance in provider: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to create instance in provider", + "runner_name", instance.Name) } }(instance) } @@ -1425,7 +1578,8 @@ func (r *basePoolManager) Wait() error { } func 
(r *basePoolManager) runnerCleanup() (err error) { - r.log("running runner cleanup") + slog.DebugContext( + r.ctx, "running runner cleanup") runners, err := r.helper.GetGithubRunners() if err != nil { return fmt.Errorf("failed to fetch github runners: %w", err) @@ -1519,12 +1673,14 @@ func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove bool) return errors.Wrapf(runnerErrors.ErrBadRequest, "removing runner: %q", err) case http.StatusNotFound: // Runner may have been deleted by a finished job, or manually by the user. - r.log("runner with agent id %d was not found in github", runner.AgentID) + slog.DebugContext( + r.ctx, "runner was not found in github", + "agent_id", runner.AgentID) case http.StatusUnauthorized: // Mark the pool as offline from this point forward - failureReason := fmt.Sprintf("failed to remove runner: %q", err) - r.setPoolRunningState(false, failureReason) - log.Print(failureReason) + r.setPoolRunningState(false, fmt.Sprintf("failed to remove runner: %q", err)) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to remove runner") // evaluate the next switch case. 
fallthrough default: @@ -1542,9 +1698,14 @@ func (r *basePoolManager) DeleteRunner(runner params.Instance, forceRemove bool) instanceStatus = commonParams.InstancePendingForceDelete } - r.log("setting instance status for %v to %v", runner.Name, instanceStatus) + slog.InfoContext( + r.ctx, "setting instance status", + "runner_name", runner.Name, + "status", instanceStatus) if _, err := r.setInstanceStatus(runner.Name, instanceStatus, nil); err != nil { - r.log("failed to update runner %s status: %s", runner.Name, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to update runner", + "runner_name", runner.Name) return errors.Wrap(err, "updating runner") } return nil @@ -1581,17 +1742,24 @@ func (r *basePoolManager) consumeQueuedJobs() error { poolsCache := poolsForTags{} - r.log("found %d queued jobs for %s", len(queued), r.helper.String()) + slog.DebugContext( + r.ctx, "found queued jobs", + "job_count", len(queued)) for _, job := range queued { if job.LockedBy != uuid.Nil && job.LockedBy.String() != r.ID() { // Job was handled by us or another entity. - r.log("job %d is locked by %s", job.ID, job.LockedBy.String()) + slog.DebugContext( + r.ctx, "job is locked", + "job_id", job.ID, + "locking_entity", job.LockedBy.String()) continue } if time.Since(job.UpdatedAt) < time.Second*30 { // give the idle runners a chance to pick up the job. - r.log("job %d was updated less than 30 seconds ago. Skipping", job.ID) + slog.DebugContext( + r.ctx, "job was updated less than 30 seconds ago. Skipping", + "job_id", job.ID) continue } @@ -1601,7 +1769,9 @@ func (r *basePoolManager) consumeQueuedJobs() error { // the runner. if err := r.store.UnlockJob(r.ctx, job.ID, r.ID()); err != nil { // TODO: Implament a cache? Should we return here? 
- r.log("failed to unlock job %d: %q", job.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to unlock job", + "job_id", job.ID) continue } } @@ -1611,7 +1781,9 @@ func (r *basePoolManager) consumeQueuedJobs() error { // TODO(gabriel-samfira): create an in-memory state of existing runners that we can easily // check for existing pending or idle runners. If we can't find any, attempt to allocate another // runner. - r.log("job %d is locked by us", job.ID) + slog.DebugContext( + r.ctx, "job is locked by us", + "job_id", job.ID) continue } @@ -1619,20 +1791,23 @@ func (r *basePoolManager) consumeQueuedJobs() error { if !ok { potentialPools, err := r.store.FindPoolsMatchingAllTags(r.ctx, r.helper.PoolType(), r.helper.ID(), job.Labels) if err != nil { - r.log("error finding pools matching labels: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "error finding pools matching labels") continue } poolRR = poolsCache.Add(job.Labels, potentialPools) } if poolRR.Len() == 0 { - r.log("could not find pools with labels %s", strings.Join(job.Labels, ",")) + slog.DebugContext(r.ctx, "could not find pools with labels", "requested_labels", strings.Join(job.Labels, ",")) continue } runnerCreated := false if err := r.store.LockJob(r.ctx, job.ID, r.ID()); err != nil { - r.log("could not lock job %d: %s", job.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "could not lock job", + "job_id", job.ID) continue } @@ -1642,31 +1817,45 @@ func (r *basePoolManager) consumeQueuedJobs() error { for i := 0; i < poolRR.Len(); i++ { pool, err := poolRR.Next() if err != nil { - r.log("could not find a pool to create a runner for job %d: %s", job.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "could not find a pool to create a runner for job", + "job_id", job.ID) break } - r.log("attempting to create a runner in pool %s for job %d", pool.ID, job.ID) + slog.InfoContext( + r.ctx, "attempting to create a runner in 
pool", + "pool_id", pool.ID, + "job_id", job.ID) if err := r.addRunnerToPool(pool, jobLabels); err != nil { - r.log("[PoolRR] could not add runner to pool %s: %s", pool.ID, err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "could not add runner to pool", + "pool_id", pool.ID) continue } - r.log("a new runner was added to pool %s as a response to queued job %d", pool.ID, job.ID) + slog.DebugContext(r.ctx, "a new runner was added as a response to queued job", + "pool_id", pool.ID, + "job_id", job.ID) runnerCreated = true break } if !runnerCreated { - r.log("could not create a runner for job %d; unlocking", job.ID) + slog.WarnContext( + r.ctx, "could not create a runner for job; unlocking", + "job_id", job.ID) if err := r.store.UnlockJob(r.ctx, job.ID, r.ID()); err != nil { - r.log("failed to unlock job: %d", job.ID) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to unlock job", + "job_id", job.ID) return errors.Wrap(err, "unlocking job") } } } if err := r.store.DeleteCompletedJobs(r.ctx); err != nil { - r.log("failed to delete completed jobs: %q", err) + slog.With(slog.Any("error", err)).ErrorContext( + r.ctx, "failed to delete completed jobs") } return nil } diff --git a/runner/pool/repository.go b/runner/pool/repository.go index 9abe387e..0505eece 100644 --- a/runner/pool/repository.go +++ b/runner/pool/repository.go @@ -19,7 +19,7 @@ import ( "encoding/base64" "encoding/json" "fmt" - "log" + "log/slog" "net/http" "strings" "sync" @@ -39,6 +39,7 @@ import ( var _ poolHelper = &repository{} func NewRepositoryPoolManager(ctx context.Context, cfg params.Repository, cfgInternal params.Internal, providers map[string]common.Provider, store dbCommon.Store) (common.PoolManager, error) { + ctx = util.WithContext(ctx, slog.Any("pool_mgr", fmt.Sprintf("%s/%s", cfg.Owner, cfg.Name)), slog.Any("pool_type", params.RepositoryPool)) ghc, _, err := util.GithubClient(ctx, cfgInternal.OAuth2Token, cfgInternal.GithubCredentialsDetails) if err != nil { 
return nil, errors.Wrap(err, "getting github client") @@ -110,7 +111,11 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par defer func() { if err != nil && runner != nil { _, innerErr := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runner.GetID()) - log.Printf("failed to remove runner: %v", innerErr) + slog.With(slog.Any("error", innerErr)).ErrorContext( + ctx, "failed to remove runner", + "runner_id", runner.GetID(), + "repo", r.cfg.Name, + "owner", r.cfg.Owner) } }() @@ -332,7 +337,11 @@ func (r *repository) InstallHook(ctx context.Context, req *github.Hook) (params. } if _, err := r.ghcli.PingRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID()); err != nil { - log.Printf("failed to ping hook %d: %v", hook.GetID(), err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to ping hook", + "hook_id", hook.GetID(), + "repo", r.cfg.Name, + "owner", r.cfg.Owner) } return hookToParamsHookInfo(hook), nil diff --git a/runner/pool/util.go b/runner/pool/util.go index 4a8c09e3..b97bf26f 100644 --- a/runner/pool/util.go +++ b/runner/pool/util.go @@ -1,7 +1,6 @@ package pool import ( - "log" "sort" "strings" "sync" @@ -57,11 +56,3 @@ func (p *poolsForTags) Add(tags []string, pools []params.Pool) *poolRoundRobin { v, _ := p.pools.LoadOrStore(key, poolRR) return v.(*poolRoundRobin) } - -func (r *basePoolManager) log(msg string, args ...interface{}) { - msgArgs := []interface{}{ - r.helper.String(), - } - msgArgs = append(msgArgs, args...) - log.Printf("[Pool mgr %s] "+msg, msgArgs...) 
-} diff --git a/runner/providers/external/external.go b/runner/providers/external/external.go index bf4cd3fc..d1574047 100644 --- a/runner/providers/external/external.go +++ b/runner/providers/external/external.go @@ -4,7 +4,7 @@ import ( "context" "encoding/json" "fmt" - "log" + "log/slog" "os/exec" "github.com/cloudbase/garm-provider-common/execution" @@ -51,7 +51,7 @@ type external struct { environmentVariables []string } -func (e *external) validateResult(inst commonParams.ProviderInstance) error { +func (e *external) validateResult(ctx context.Context, inst commonParams.ProviderInstance) error { if inst.ProviderID == "" { return garmErrors.NewProviderError("missing provider ID") } @@ -60,10 +60,6 @@ func (e *external) validateResult(inst commonParams.ProviderInstance) error { return garmErrors.NewProviderError("missing instance name") } - if inst.OSName == "" || inst.OSArch == "" || inst.OSType == "" { - // we can still function without this info (I think) - log.Printf("WARNING: missing OS information") - } if !IsValidProviderStatus(inst.Status) { return garmErrors.NewProviderError("invalid status returned (%s)", inst.Status) } @@ -96,12 +92,14 @@ func (e *external) CreateInstance(ctx context.Context, bootstrapParams commonPar return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err) } - if err := e.validateResult(param); err != nil { + if err := e.validateResult(ctx, param); err != nil { return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } retAsJs, _ := json.MarshalIndent(param, "", " ") - log.Printf("provider returned: %s", string(retAsJs)) + slog.DebugContext( + ctx, "provider returned", + "output", string(retAsJs)) return param, nil } @@ -148,7 +146,7 @@ func (e *external) GetInstance(ctx context.Context, instance string) (commonPara return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: 
%s", err) } - if err := e.validateResult(param); err != nil { + if err := e.validateResult(ctx, param); err != nil { return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } @@ -177,7 +175,7 @@ func (e *external) ListInstances(ctx context.Context, poolID string) ([]commonPa ret := make([]commonParams.ProviderInstance, len(param)) for idx, inst := range param { - if err := e.validateResult(inst); err != nil { + if err := e.validateResult(ctx, inst); err != nil { return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } ret[idx] = inst diff --git a/runner/providers/providers.go b/runner/providers/providers.go index 5d4e58f1..e9bb48ed 100644 --- a/runner/providers/providers.go +++ b/runner/providers/providers.go @@ -16,7 +16,7 @@ package providers import ( "context" - "log" + "log/slog" "github.com/cloudbase/garm/config" "github.com/cloudbase/garm/params" @@ -31,7 +31,9 @@ import ( func LoadProvidersFromConfig(ctx context.Context, cfg config.Config, controllerID string) (map[string]common.Provider, error) { providers := make(map[string]common.Provider, len(cfg.Providers)) for _, providerCfg := range cfg.Providers { - log.Printf("Loading provider %s", providerCfg.Name) + slog.InfoContext( + ctx, "Loading provider", + "provider", providerCfg.Name) switch providerCfg.ProviderType { case params.ExternalProvider: conf := providerCfg diff --git a/runner/repositories.go b/runner/repositories.go index 54859c4e..143bbd06 100644 --- a/runner/repositories.go +++ b/runner/repositories.go @@ -17,7 +17,7 @@ package runner import ( "context" "fmt" - "log" + "log/slog" "strings" runnerErrors "github.com/cloudbase/garm-provider-common/errors" @@ -60,7 +60,9 @@ func (r *Runner) CreateRepository(ctx context.Context, param params.CreateRepoPa defer func() { if err != nil { if deleteErr := r.store.DeleteRepository(ctx, repo.ID); deleteErr != nil { - log.Printf("failed to delete 
repository: %s", deleteErr) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to delete repository", + "repository_id", repo.ID) } } }() @@ -71,7 +73,9 @@ func (r *Runner) CreateRepository(ctx context.Context, param params.CreateRepoPa } if err := poolMgr.Start(); err != nil { if deleteErr := r.poolManagerCtrl.DeleteRepoPoolManager(repo); deleteErr != nil { - log.Printf("failed to cleanup pool manager for repo %s", repo.ID) + slog.With(slog.Any("error", deleteErr)).ErrorContext( + ctx, "failed to cleanup pool manager for repo", + "repository_id", repo.ID) } return params.Repository{}, errors.Wrap(err, "starting repo pool manager") } @@ -155,7 +159,9 @@ func (r *Runner) DeleteRepository(ctx context.Context, repoID string, keepWebhoo if err := poolMgr.UninstallWebhook(ctx); err != nil { // TODO(gabriel-samfira): Should we error out here? - log.Printf("failed to uninstall webhook: %s", err) + slog.With(slog.Any("error", err)).ErrorContext( + ctx, "failed to uninstall webhook", + "pool_manager_id", poolMgr.ID()) } } diff --git a/runner/runner.go b/runner/runner.go index aae2baf2..55c23ae8 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -23,7 +23,7 @@ import ( "encoding/json" "fmt" "hash" - "log" + "log/slog" "os" "strings" "sync" @@ -449,7 +449,9 @@ func (r *Runner) loadReposOrgsAndEnterprises() error { for _, repo := range repos { repo := repo g.Go(func() error { - log.Printf("creating pool manager for repo %s/%s", repo.Owner, repo.Name) + slog.InfoContext( + r.ctx, "creating pool manager for repo", + "repo_owner", repo.Owner, "repo_name", repo.Name) _, err := r.poolManagerCtrl.CreateRepoPoolManager(r.ctx, repo, r.providers, r.store) return err }) @@ -458,7 +460,7 @@ func (r *Runner) loadReposOrgsAndEnterprises() error { for _, org := range orgs { org := org g.Go(func() error { - log.Printf("creating pool manager for organization %s", org.Name) + slog.InfoContext(r.ctx, "creating pool manager for organization", "org_name", org.Name) _, err 
:= r.poolManagerCtrl.CreateOrgPoolManager(r.ctx, org, r.providers, r.store) return err }) @@ -467,7 +469,7 @@ func (r *Runner) loadReposOrgsAndEnterprises() error { for _, enterprise := range enterprises { enterprise := enterprise g.Go(func() error { - log.Printf("creating pool manager for enterprise %s", enterprise.Name) + slog.InfoContext(r.ctx, "creating pool manager for enterprise", "enterprise_name", enterprise.Name) _, err := r.poolManagerCtrl.CreateEnterprisePoolManager(r.ctx, enterprise, r.providers, r.store) return err }) @@ -630,7 +632,7 @@ func (r *Runner) Wait() error { go func(id string, poolMgr common.PoolManager) { defer wg.Done() if err := poolMgr.Wait(); err != nil { - log.Printf("timed out waiting for pool manager %s to exit", id) + slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id, "pool_mgr_id", poolMgr.ID()) } }(poolId, repo) } @@ -640,7 +642,7 @@ func (r *Runner) Wait() error { go func(id string, poolMgr common.PoolManager) { defer wg.Done() if err := poolMgr.Wait(); err != nil { - log.Printf("timed out waiting for pool manager %s to exit", id) + slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id) } }(poolId, org) } @@ -650,7 +652,7 @@ func (r *Runner) Wait() error { go func(id string, poolMgr common.PoolManager) { defer wg.Done() if err := poolMgr.Wait(); err != nil { - log.Printf("timed out waiting for pool manager %s to exit", id) + slog.With(slog.Any("error", err)).ErrorContext(r.ctx, "timed out waiting for pool manager to exit", "pool_id", id) } }(poolId, enterprise) } @@ -717,12 +719,20 @@ func (r *Runner) DispatchWorkflowJob(hookTargetType, signature string, jobData [ switch HookTargetType(hookTargetType) { case RepoHook: - log.Printf("got hook for repo %s/%s", util.SanitizeLogEntry(job.Repository.Owner.Login), util.SanitizeLogEntry(job.Repository.Name)) + slog.DebugContext( + r.ctx, "got hook for repo", + 
"repo_owner", util.SanitizeLogEntry(job.Repository.Owner.Login), + "repo_name", util.SanitizeLogEntry(job.Repository.Name)) poolManager, err = r.findRepoPoolManager(job.Repository.Owner.Login, job.Repository.Name) case OrganizationHook: - log.Printf("got hook for org %s", util.SanitizeLogEntry(job.Organization.Login)) + slog.DebugContext( + r.ctx, "got hook for organization", + "organization", util.SanitizeLogEntry(job.Organization.Login)) poolManager, err = r.findOrgPoolManager(job.Organization.Login) case EnterpriseHook: + slog.DebugContext( + r.ctx, "got hook for enterprise", + "enterprise", util.SanitizeLogEntry(job.Enterprise.Slug)) poolManager, err = r.findEnterprisePoolManager(job.Enterprise.Slug) default: return runnerErrors.NewBadRequestError("cannot handle hook target type %s", hookTargetType) @@ -852,7 +862,35 @@ func (r *Runner) AddInstanceStatusMessage(ctx context.Context, param params.Inst } if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil { - return errors.Wrap(err, "updating runner state") + return errors.Wrap(err, "updating runner agent ID") + } + + return nil +} + +func (r *Runner) UpdateSystemInfo(ctx context.Context, param params.UpdateSystemInfoParams) error { + instanceID := auth.InstanceID(ctx) + if instanceID == "" { + slog.ErrorContext(ctx, "missing instance ID") + return runnerErrors.ErrUnauthorized + } + + if param.OSName == "" && param.OSVersion == "" && param.AgentID == nil { + // Nothing to update + return nil + } + + updateParams := params.UpdateInstanceParams{ + OSName: param.OSName, + OSVersion: param.OSVersion, + } + + if param.AgentID != nil { + updateParams.AgentID = *param.AgentID + } + + if _, err := r.store.UpdateInstance(r.ctx, instanceID, updateParams); err != nil { + return errors.Wrap(err, "updating runner system info") } return nil @@ -922,7 +960,13 @@ func (r *Runner) DeleteRunner(ctx context.Context, instanceName string, forceDel case commonParams.InstanceRunning, 
commonParams.InstanceError, commonParams.InstancePendingForceDelete, commonParams.InstancePendingDelete: default: - return runnerErrors.NewBadRequestError("runner must be in %q or %q state", commonParams.InstanceRunning, commonParams.InstanceError) + validStates := []string{ + string(commonParams.InstanceRunning), + string(commonParams.InstanceError), + string(commonParams.InstancePendingForceDelete), + string(commonParams.InstancePendingDelete), + } + return runnerErrors.NewBadRequestError("runner must be in one of the following states: %q", strings.Join(validStates, ", ")) } poolMgr, err := r.getPoolManagerFromInstance(ctx, instance) diff --git a/test/integration/e2e/client.go b/test/integration/e2e/client.go index 536420fe..2ab81695 100644 --- a/test/integration/e2e/client.go +++ b/test/integration/e2e/client.go @@ -1,7 +1,7 @@ package e2e import ( - "log" + "log/slog" "net/url" "github.com/cloudbase/garm/client" @@ -32,7 +32,7 @@ func InitClient(baseURL string) { } func FirstRun(adminUsername, adminPassword, adminFullName, adminEmail string) *params.User { - log.Println("First run") + slog.Info("First run") newUser := params.NewUserParams{ Username: adminUsername, Password: adminPassword, @@ -47,7 +47,7 @@ func FirstRun(adminUsername, adminPassword, adminFullName, adminEmail string) *p } func Login(username, password string) { - log.Println("Login") + slog.Info("Login") loginParams := params.PasswordLoginParams{ Username: username, Password: password, diff --git a/test/integration/e2e/e2e.go b/test/integration/e2e/e2e.go index b94c0652..0c0c27e2 100644 --- a/test/integration/e2e/e2e.go +++ b/test/integration/e2e/e2e.go @@ -2,7 +2,7 @@ package e2e import ( "fmt" - "log" + "log/slog" "os" "time" @@ -10,7 +10,7 @@ import ( ) func ListCredentials() params.Credentials { - log.Println("List credentials") + slog.Info("List credentials") credentials, err := listCredentials(cli, authToken) if err != nil { panic(err) @@ -19,7 +19,7 @@ func ListCredentials() 
params.Credentials { } func ListProviders() params.Providers { - log.Println("List providers") + slog.Info("List providers") providers, err := listProviders(cli, authToken) if err != nil { panic(err) @@ -28,7 +28,7 @@ func ListProviders() params.Providers { } func GetMetricsToken() { - log.Println("Get metrics token") + slog.Info("Get metrics token") _, err := getMetricsToken(cli, authToken) if err != nil { panic(err) @@ -36,7 +36,7 @@ func GetMetricsToken() { } func GetControllerInfo() *params.ControllerInfo { - log.Println("Get controller info") + slog.Info("Get controller info") controllerInfo, err := getControllerInfo(cli, authToken) if err != nil { panic(err) @@ -62,7 +62,7 @@ func GracefulCleanup() { if _, err := updatePool(cli, authToken, pool.ID, poolParams); err != nil { panic(err) } - log.Printf("Pool %s disabled", pool.ID) + slog.Info("Pool disabled", "pool_id", pool.ID) } // delete all the instances @@ -75,7 +75,7 @@ func GracefulCleanup() { if err := deleteInstance(cli, authToken, instance.Name, false); err != nil { panic(err) } - log.Printf("Instance %s deletion initiated", instance.Name) + slog.Info("Instance deletion initiated", "instance", instance.Name) } } @@ -91,7 +91,7 @@ func GracefulCleanup() { if err := deletePool(cli, authToken, pool.ID); err != nil { panic(err) } - log.Printf("Pool %s deleted", pool.ID) + slog.Info("Pool deleted", "pool_id", pool.ID) } // delete all the repositories @@ -103,7 +103,7 @@ func GracefulCleanup() { if err := deleteRepo(cli, authToken, repo.ID); err != nil { panic(err) } - log.Printf("Repo %s deleted", repo.ID) + slog.Info("Repo deleted", "repo_id", repo.ID) } // delete all the organizations @@ -115,14 +115,14 @@ func GracefulCleanup() { if err := deleteOrg(cli, authToken, org.ID); err != nil { panic(err) } - log.Printf("Org %s deleted", org.ID) + slog.Info("Org deleted", "org_id", org.ID) } } func appendCtrlInfoToGitHubEnv(controllerInfo *params.ControllerInfo) error { envFile, found := 
os.LookupEnv("GITHUB_ENV") if !found { - log.Printf("GITHUB_ENV not set, skipping appending controller info") + slog.Info("GITHUB_ENV not set, skipping appending controller info") return nil } file, err := os.OpenFile(envFile, os.O_WRONLY|os.O_APPEND, os.ModeAppend) diff --git a/test/integration/e2e/github_client_utils.go b/test/integration/e2e/github_client_utils.go index 8eee927c..81ced630 100644 --- a/test/integration/e2e/github_client_utils.go +++ b/test/integration/e2e/github_client_utils.go @@ -3,14 +3,14 @@ package e2e import ( "context" "fmt" - "log" + "log/slog" "github.com/google/go-github/v57/github" "golang.org/x/oauth2" ) func TriggerWorkflow(ghToken, orgName, repoName, workflowFileName, labelName string) { - log.Printf("Trigger workflow with label %s", labelName) + slog.Info("Trigger workflow", "label", labelName) client := getGithubClient(ghToken) eventReq := github.CreateWorkflowDispatchEventRequest{ @@ -26,7 +26,7 @@ func TriggerWorkflow(ghToken, orgName, repoName, workflowFileName, labelName str } func GhOrgRunnersCleanup(ghToken, orgName, controllerID string) error { - log.Printf("Cleanup Github runners, labelled with controller ID %s, from org %s", controllerID, orgName) + slog.Info("Cleanup Github runners", "controller_id", controllerID, "org_name", orgName) client := getGithubClient(ghToken) ghOrgRunners, _, err := client.Actions.ListOrganizationRunners(context.Background(), orgName, nil) @@ -42,10 +42,10 @@ func GhOrgRunnersCleanup(ghToken, orgName, controllerID string) error { if _, err := client.Actions.RemoveOrganizationRunner(context.Background(), orgName, orgRunner.GetID()); err != nil { // We don't fail if we can't remove a single runner. This // is a best effort to try and remove all the orphan runners. 
- log.Printf("Failed to remove organization runner %s: %v", orgRunner.GetName(), err) + slog.With(slog.Any("error", err)).Error("Failed to remove organization runner", "org_runner", orgRunner.GetName()) break } - log.Printf("Removed organization runner %s", orgRunner.GetName()) + slog.Info("Removed organization runner", "org_runner", orgRunner.GetName()) break } } @@ -55,7 +55,7 @@ func GhOrgRunnersCleanup(ghToken, orgName, controllerID string) error { } func GhRepoRunnersCleanup(ghToken, orgName, repoName, controllerID string) error { - log.Printf("Cleanup Github runners, labelled with controller ID %s, from repo %s/%s", controllerID, orgName, repoName) + slog.Info("Cleanup Github runners", "controller_id", controllerID, "org_name", orgName, "repo_name", repoName) client := getGithubClient(ghToken) ghRepoRunners, _, err := client.Actions.ListRunners(context.Background(), orgName, repoName, nil) @@ -71,10 +71,10 @@ func GhRepoRunnersCleanup(ghToken, orgName, repoName, controllerID string) error if _, err := client.Actions.RemoveRunner(context.Background(), orgName, repoName, repoRunner.GetID()); err != nil { // We don't fail if we can't remove a single runner. This // is a best effort to try and remove all the orphan runners. 
- log.Printf("Failed to remove repository runner %s: %v", repoRunner.GetName(), err) + slog.With(slog.Any("error", err)).Error("Failed to remove repository runner", "runner_name", repoRunner.GetName()) break } - log.Printf("Removed repository runner %s", repoRunner.GetName()) + slog.Info("Removed repository runner", "runner_name", repoRunner.GetName()) break } } @@ -124,7 +124,7 @@ func ValidateRepoWebhookUninstalled(ghToken, url, orgName, repoName string) { } func GhOrgWebhookCleanup(ghToken, webhookURL, orgName string) error { - log.Printf("Cleanup Github webhook with url %s for org %s", webhookURL, orgName) + slog.Info("Cleanup Github webhook", "webhook_url", webhookURL, "org_name", orgName) hook, err := getGhOrgWebhook(webhookURL, ghToken, orgName) if err != nil { return err } @@ -136,14 +136,14 @@ func GhOrgWebhookCleanup(ghToken, webhookURL, orgName string) error { if _, err := client.Organizations.DeleteHook(context.Background(), orgName, hook.GetID()); err != nil { return err } - log.Printf("Github webhook with url %s for org %s was removed", webhookURL, orgName) + slog.Info("Github webhook removed", "webhook_url", webhookURL, "org_name", orgName) } return nil } func GhRepoWebhookCleanup(ghToken, webhookURL, orgName, repoName string) error { - log.Printf("Cleanup Github webhook with url %s for repo %s/%s", webhookURL, orgName, repoName) + slog.Info("Cleanup Github webhook", "webhook_url", webhookURL, "org_name", orgName, "repo_name", repoName) hook, err := getGhRepoWebhook(webhookURL, ghToken, orgName, repoName) if err != nil { @@ -156,7 +156,7 @@ func GhRepoWebhookCleanup(ghToken, webhookURL, orgName, repoName string) error { if _, err := client.Repositories.DeleteHook(context.Background(), orgName, repoName, hook.GetID()); err != nil { return err } - log.Printf("Github webhook with url %s for repo %s/%s was removed", webhookURL, orgName, repoName) + slog.Info("Github webhook removed", "webhook_url", webhookURL, "org_name", orgName, "repo_name", repoName) } 
return nil diff --git a/test/integration/e2e/instances.go b/test/integration/e2e/instances.go index 7b0de20b..1200f3a3 100644 --- a/test/integration/e2e/instances.go +++ b/test/integration/e2e/instances.go @@ -2,7 +2,7 @@ package e2e import ( "fmt" - "log" + "log/slog" "time" commonParams "github.com/cloudbase/garm-provider-common/params" @@ -13,13 +13,13 @@ func waitInstanceStatus(name string, status commonParams.InstanceStatus, runnerS var timeWaited time.Duration = 0 var instance *params.Instance - log.Printf("Waiting for instance %s status to reach status %s and runner status %s", name, status, runnerStatus) + slog.Info("Waiting for instance to reach desired status", "instance", name, "desired_status", status, "desired_runner_status", runnerStatus) for timeWaited < timeout { instance, err := getInstance(cli, authToken, name) if err != nil { return nil, err } - log.Printf("Instance %s status %s and runner status %s", name, instance.Status, instance.RunnerStatus) + slog.Info("Instance status", "instance_name", name, "status", instance.Status, "runner_status", instance.RunnerStatus) if instance.Status == status && instance.RunnerStatus == runnerStatus { return instance, nil } @@ -37,14 +37,14 @@ func DeleteInstance(name string, forceRemove bool) { if err := deleteInstance(cli, authToken, name, forceRemove); err != nil { panic(err) } - log.Printf("Instance %s deletion initiated", name) + slog.Info("Instance deletion initiated", "instance_name", name) } func WaitInstanceToBeRemoved(name string, timeout time.Duration) error { var timeWaited time.Duration = 0 var instance *params.Instance - log.Printf("Waiting for instance %s to be removed", name) + slog.Info("Waiting for instance to be removed", "instance_name", name) for timeWaited < timeout { instances, err := listInstances(cli, authToken) if err != nil { @@ -82,7 +82,7 @@ func WaitPoolInstances(poolID string, status commonParams.InstanceStatus, runner return err } - log.Printf("Waiting for pool %s instances to 
reach status: %s and runner status: %s", poolID, status, runnerStatus) + slog.Info("Waiting for pool instances to reach desired status", "pool_id", poolID, "desired_status", status, "desired_runner_status", runnerStatus) for timeWaited < timeout { poolInstances, err := listPoolInstances(cli, authToken, poolID) if err != nil { @@ -96,7 +96,13 @@ func WaitPoolInstances(poolID string, status commonParams.InstanceStatus, runner } } - log.Printf("Pool %s instance reached status: %s and runner status: %s: %d/%d", poolID, status, runnerStatus, instancesCount, len(poolInstances)) + slog.Info( + "Pool instance reached status", + "pool_id", poolID, + "status", status, + "runner_status", runnerStatus, + "desired_instance_count", instancesCount, + "pool_instance_count", len(poolInstances)) if instancesCount == int(pool.MinIdleRunners) && instancesCount == len(poolInstances) { return nil } diff --git a/test/integration/e2e/jobs.go b/test/integration/e2e/jobs.go index d8abccf2..f3a1f3c6 100644 --- a/test/integration/e2e/jobs.go +++ b/test/integration/e2e/jobs.go @@ -2,7 +2,7 @@ package e2e import ( "fmt" - "log" + "log/slog" "time" commonParams "github.com/cloudbase/garm-provider-common/params" @@ -10,7 +10,7 @@ import ( ) func ValidateJobLifecycle(label string) { - log.Printf("Validate GARM job lifecycle with label %s", label) + slog.Info("Validate GARM job lifecycle", "label", label) // wait for job list to be updated job, err := waitLabelledJob(label, 4*time.Minute) @@ -58,7 +58,7 @@ func waitLabelledJob(label string, timeout time.Duration) (*params.Job, error) { var jobs params.Jobs var err error - log.Printf("Waiting for job with label %s", label) + slog.Info("Waiting for job", "label", label) for timeWaited < timeout { jobs, err = listJobs(cli, authToken) if err != nil { @@ -85,7 +85,7 @@ func waitJobStatus(id int64, status params.JobStatus, timeout time.Duration) (*p var timeWaited time.Duration = 0 var job *params.Job - log.Printf("Waiting for job %d to reach status %s", 
id, status) + slog.Info("Waiting for job to reach status", "job_id", id, "status", status) for timeWaited < timeout { jobs, err := listJobs(cli, authToken) if err != nil { diff --git a/test/integration/e2e/organizations.go b/test/integration/e2e/organizations.go index 490ca784..8f511215 100644 --- a/test/integration/e2e/organizations.go +++ b/test/integration/e2e/organizations.go @@ -1,7 +1,7 @@ package e2e import ( - "log" + "log/slog" "time" commonParams "github.com/cloudbase/garm-provider-common/params" @@ -9,7 +9,7 @@ import ( ) func CreateOrg(orgName, credentialsName, orgWebhookSecret string) *params.Organization { - log.Printf("Create org %s", orgName) + slog.Info("Create org", "org_name", orgName) orgParams := params.CreateOrgParams{ Name: orgName, CredentialsName: credentialsName, @@ -23,7 +23,7 @@ func CreateOrg(orgName, credentialsName, orgWebhookSecret string) *params.Organi } func UpdateOrg(id, credentialsName string) *params.Organization { - log.Printf("Update org %s", id) + slog.Info("Update org", "org_id", id) updateParams := params.UpdateEntityParams{ CredentialsName: credentialsName, } @@ -35,7 +35,7 @@ func UpdateOrg(id, credentialsName string) *params.Organization { } func InstallOrgWebhook(id string) *params.HookInfo { - log.Printf("Install org %s webhook", id) + slog.Info("Install org webhook", "org_id", id) webhookParams := params.InstallWebhookParams{ WebhookEndpointType: params.WebhookEndpointDirect, } @@ -51,14 +51,14 @@ func InstallOrgWebhook(id string) *params.HookInfo { } func UninstallOrgWebhook(id string) { - log.Printf("Uninstall org %s webhook", id) + slog.Info("Uninstall org webhook", "org_id", id) if err := uninstallOrgWebhook(cli, authToken, id); err != nil { panic(err) } } func CreateOrgPool(orgID string, poolParams params.CreatePoolParams) *params.Pool { - log.Printf("Create org %s pool", orgID) + slog.Info("Create org pool", "org_id", orgID) pool, err := createOrgPool(cli, authToken, orgID, poolParams) if err != nil { 
panic(err) @@ -67,7 +67,7 @@ func CreateOrgPool(orgID string, poolParams params.CreatePoolParams) *params.Poo } func GetOrgPool(orgID, orgPoolID string) *params.Pool { - log.Printf("Get org %s pool %s", orgID, orgPoolID) + slog.Info("Get org pool", "org_id", orgID, "pool_id", orgPoolID) pool, err := getOrgPool(cli, authToken, orgID, orgPoolID) if err != nil { panic(err) @@ -76,7 +76,7 @@ func GetOrgPool(orgID, orgPoolID string) *params.Pool { } func UpdateOrgPool(orgID, orgPoolID string, maxRunners, minIdleRunners uint) *params.Pool { - log.Printf("Update org %s pool %s", orgID, orgPoolID) + slog.Info("Update org pool", "org_id", orgID, "pool_id", orgPoolID) poolParams := params.UpdatePoolParams{ MinIdleRunners: &minIdleRunners, MaxRunners: &maxRunners, @@ -89,7 +89,7 @@ func UpdateOrgPool(orgID, orgPoolID string, maxRunners, minIdleRunners uint) *pa } func DeleteOrgPool(orgID, orgPoolID string) { - log.Printf("Delete org %s pool %s", orgID, orgPoolID) + slog.Info("Delete org pool", "org_id", orgID, "pool_id", orgPoolID) if err := deleteOrgPool(cli, authToken, orgID, orgPoolID); err != nil { panic(err) } @@ -111,7 +111,7 @@ func WaitOrgRunningIdleInstances(orgID string, timeout time.Duration) { func dumpOrgInstancesDetails(orgID string) error { // print org details - log.Printf("Dumping org %s details", orgID) + slog.Info("Dumping org details", "org_id", orgID) org, err := getOrg(cli, authToken, orgID) if err != nil { return err @@ -121,7 +121,7 @@ func dumpOrgInstancesDetails(orgID string) error { } // print org instances details - log.Printf("Dumping org %s instances details", orgID) + slog.Info("Dumping org instances details", "org_id", orgID) instances, err := listOrgInstances(cli, authToken, orgID) if err != nil { return err @@ -131,7 +131,7 @@ func dumpOrgInstancesDetails(orgID string) error { if err != nil { return err } - log.Printf("Instance %s info:", instance.Name) + slog.Info("Instance info", "instance_name", instance.Name) if err := 
printJsonResponse(instance); err != nil { return err } diff --git a/test/integration/e2e/pools.go b/test/integration/e2e/pools.go index 34d530f2..f00e07eb 100644 --- a/test/integration/e2e/pools.go +++ b/test/integration/e2e/pools.go @@ -2,7 +2,7 @@ package e2e import ( "fmt" - "log" + "log/slog" "time" "github.com/cloudbase/garm/params" @@ -13,13 +13,13 @@ func waitPoolNoInstances(id string, timeout time.Duration) error { var pool *params.Pool var err error - log.Printf("Wait until pool %s has no instances", id) + slog.Info("Wait until pool has no instances", "pool_id", id) for timeWaited < timeout { pool, err = getPool(cli, authToken, id) if err != nil { return err } - log.Printf("Current pool instances: %d", len(pool.Instances)) + slog.Info("Current pool instances", "instance_count", len(pool.Instances)) if len(pool.Instances) == 0 { return nil } @@ -45,7 +45,7 @@ func dumpPoolInstancesDetails(poolID string) error { if err != nil { return err } - log.Printf("Instance %s details:", instance.Name) + slog.Info("Instance details", "instance_name", instance.Name) if err := printJsonResponse(instanceDetails); err != nil { return err } diff --git a/test/integration/e2e/repositories.go b/test/integration/e2e/repositories.go index 72c10e32..b1e9236d 100644 --- a/test/integration/e2e/repositories.go +++ b/test/integration/e2e/repositories.go @@ -1,7 +1,7 @@ package e2e import ( - "log" + "log/slog" "time" commonParams "github.com/cloudbase/garm-provider-common/params" @@ -9,7 +9,7 @@ import ( ) func CreateRepo(orgName, repoName, credentialsName, repoWebhookSecret string) *params.Repository { - log.Printf("Create repository %s/%s", orgName, repoName) + slog.Info("Create repository", "owner_name", orgName, "repo_name", repoName) createParams := params.CreateRepoParams{ Owner: orgName, Name: repoName, @@ -24,7 +24,7 @@ func CreateRepo(orgName, repoName, credentialsName, repoWebhookSecret string) *p } func UpdateRepo(id, credentialsName string) *params.Repository { - 
log.Printf("Update repo %s", id) + slog.Info("Update repo", "repo_id", id) updateParams := params.UpdateEntityParams{ CredentialsName: credentialsName, } @@ -36,7 +36,7 @@ func UpdateRepo(id, credentialsName string) *params.Repository { } func InstallRepoWebhook(id string) *params.HookInfo { - log.Printf("Install repo %s webhook", id) + slog.Info("Install repo webhook", "repo_id", id) webhookParams := params.InstallWebhookParams{ WebhookEndpointType: params.WebhookEndpointDirect, } @@ -52,14 +52,14 @@ func InstallRepoWebhook(id string) *params.HookInfo { } func UninstallRepoWebhook(id string) { - log.Printf("Uninstall repo %s webhook", id) + slog.Info("Uninstall repo webhook", "repo_id", id) if err := uninstallRepoWebhook(cli, authToken, id); err != nil { panic(err) } } func CreateRepoPool(repoID string, poolParams params.CreatePoolParams) *params.Pool { - log.Printf("Create repo %s pool", repoID) + slog.Info("Create repo pool", "repo_id", repoID) pool, err := createRepoPool(cli, authToken, repoID, poolParams) if err != nil { panic(err) @@ -68,7 +68,7 @@ func CreateRepoPool(repoID string, poolParams params.CreatePoolParams) *params.P } func GetRepoPool(repoID, repoPoolID string) *params.Pool { - log.Printf("Get repo %s pool %s", repoID, repoPoolID) + slog.Info("Get repo pool", "repo_id", repoID, "pool_id", repoPoolID) pool, err := getRepoPool(cli, authToken, repoID, repoPoolID) if err != nil { panic(err) @@ -77,7 +77,7 @@ func GetRepoPool(repoID, repoPoolID string) *params.Pool { } func UpdateRepoPool(repoID, repoPoolID string, maxRunners, minIdleRunners uint) *params.Pool { - log.Printf("Update repo %s pool %s", repoID, repoPoolID) + slog.Info("Update repo pool", "repo_id", repoID, "pool_id", repoPoolID) poolParams := params.UpdatePoolParams{ MinIdleRunners: &minIdleRunners, MaxRunners: &maxRunners, @@ -90,14 +90,14 @@ func UpdateRepoPool(repoID, repoPoolID string, maxRunners, minIdleRunners uint) } func DeleteRepoPool(repoID, repoPoolID string) { - 
log.Printf("Delete repo %s pool %s", repoID, repoPoolID) + slog.Info("Delete repo pool", "repo_id", repoID, "pool_id", repoPoolID) if err := deleteRepoPool(cli, authToken, repoID, repoPoolID); err != nil { panic(err) } } func DisableRepoPool(repoID, repoPoolID string) { - log.Printf("Disable repo %s pool %s", repoID, repoPoolID) + slog.Info("Disable repo pool", "repo_id", repoID, "pool_id", repoPoolID) enabled := false poolParams := params.UpdatePoolParams{Enabled: &enabled} if _, err := updateRepoPool(cli, authToken, repoID, repoPoolID, poolParams); err != nil { @@ -121,7 +121,7 @@ func WaitRepoRunningIdleInstances(repoID string, timeout time.Duration) { func dumpRepoInstancesDetails(repoID string) error { // print repo details - log.Printf("Dumping repo %s details", repoID) + slog.Info("Dumping repo details", "repo_id", repoID) repo, err := getRepo(cli, authToken, repoID) if err != nil { return err @@ -131,7 +131,7 @@ func dumpRepoInstancesDetails(repoID string) error { } // print repo instances details - log.Printf("Dumping repo %s instances details", repoID) + slog.Info("Dumping repo instances details", "repo_id", repoID) instances, err := listRepoInstances(cli, authToken, repoID) if err != nil { return err @@ -141,7 +141,7 @@ func dumpRepoInstancesDetails(repoID string) error { if err != nil { return err } - log.Printf("Instance %s info:", instance.Name) + slog.Info("Instance info", "instance_name", instance.Name) if err := printJsonResponse(instance); err != nil { return err } diff --git a/test/integration/e2e/utils.go b/test/integration/e2e/utils.go index d849b797..ed8c2d62 100644 --- a/test/integration/e2e/utils.go +++ b/test/integration/e2e/utils.go @@ -2,7 +2,7 @@ package e2e import ( "encoding/json" - "log" + "log/slog" ) func printJsonResponse(resp interface{}) error { @@ -10,6 +10,6 @@ func printJsonResponse(resp interface{}) error { if err != nil { return err } - log.Println(string(b)) + slog.Info(string(b)) return nil } diff --git 
a/test/integration/gh_cleanup/main.go b/test/integration/gh_cleanup/main.go index 25773684..b841652e 100644 --- a/test/integration/gh_cleanup/main.go +++ b/test/integration/gh_cleanup/main.go @@ -2,7 +2,7 @@ package main import ( "fmt" - "log" + "log/slog" "os" "github.com/cloudbase/garm/test/integration/e2e" @@ -21,7 +21,7 @@ func main() { _ = e2e.GhOrgRunnersCleanup(ghToken, orgName, controllerID) _ = e2e.GhRepoRunnersCleanup(ghToken, orgName, repoName, controllerID) } else { - log.Println("Env variable GARM_CONTROLLER_ID is not set, skipping GitHub runners cleanup") + slog.Warn("Env variable GARM_CONTROLLER_ID is not set, skipping GitHub runners cleanup") } baseURL, baseUrlFound := os.LookupEnv("GARM_BASE_URL") @@ -30,6 +30,6 @@ func main() { _ = e2e.GhOrgWebhookCleanup(ghToken, webhookURL, orgName) _ = e2e.GhRepoWebhookCleanup(ghToken, webhookURL, orgName, repoName) } else { - log.Println("Env variables GARM_CONTROLLER_ID & GARM_BASE_URL are not set, skipping webhooks cleanup") + slog.Warn("Env variables GARM_CONTROLLER_ID & GARM_BASE_URL are not set, skipping webhooks cleanup") } } diff --git a/test/integration/main.go b/test/integration/main.go index 04b96861..06f22edc 100644 --- a/test/integration/main.go +++ b/test/integration/main.go @@ -2,7 +2,7 @@ package main import ( "fmt" - "log" + "log/slog" "os" "time" @@ -120,19 +120,19 @@ func main() { _ = e2e.UpdateRepoPool(repo.ID, repoPool2.ID, repoPoolParams2.MaxRunners, 1) err := e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstanceRunning, params.RunnerPending, 1*time.Minute) if err != nil { - log.Printf("Failed to wait for instance to be running: %v", err) + slog.With(slog.Any("error", err)).Error("Failed to wait for instance to be running") } repoPool2 = e2e.GetRepoPool(repo.ID, repoPool2.ID) e2e.DisableRepoPool(repo.ID, repoPool2.ID) e2e.DeleteInstance(repoPool2.Instances[0].Name, false) err = e2e.WaitPoolInstances(repoPool2.ID, commonParams.InstancePendingDelete, params.RunnerPending, 1*time.Minute) 
if err != nil { - log.Printf("Failed to wait for instance to be running: %v", err) + slog.With(slog.Any("error", err)).Error("Failed to wait for instance to reach pending_delete status") } e2e.DeleteInstance(repoPool2.Instances[0].Name, true) // delete instance with forceRemove err = e2e.WaitInstanceToBeRemoved(repoPool2.Instances[0].Name, 1*time.Minute) if err != nil { - log.Printf("Failed to wait for instance to be removed: %v", err) + slog.With(slog.Any("error", err)).Error("Failed to wait for instance to be removed") } e2e.DeleteRepoPool(repo.ID, repoPool2.ID) diff --git a/testdata/config.toml b/testdata/config.toml index 34465cb7..b9905553 100644 --- a/testdata/config.toml +++ b/testdata/config.toml @@ -35,15 +35,36 @@ webhook_url = "https://garm.example.com/webhooks" # webhooks for repositories or organizations. enable_webhook_management = true +# DEPRECATED: Use the [logging] section to set this option. # Uncomment this line if you'd like to log to a file instead of standard output. # log_file = "/tmp/runner-manager.log" +# DEPRECATED: Use the [logging] section to set this option. # Enable streaming logs via web sockets. Use garm-cli debug-log. enable_log_streamer = false # Enable the golang debug server. See the documentation in the "doc" folder for more information. debug_server = false + +[logging] +# Uncomment this line if you'd like to log to a file instead of standard output. +# log_file = "/tmp/runner-manager.log" + +# enable_log_streamer enables streaming the logs over websockets +enable_log_streamer = true +# log_format is the output format of the logs. GARM uses structured logging and can +# output as "text" or "json" +log_format = "text" +# log_level is the logging level GARM will output. Available log levels are: +# * debug +# * info +# * warn +# * error +log_level = "debug" +# log_source will output information about the function that generated the log line. +log_source = false + [metrics] # Toggle metrics. 
If set to false, the API endpoint for metrics collection will # be disabled. diff --git a/util/logging.go b/util/logging.go new file mode 100644 index 00000000..ac35863b --- /dev/null +++ b/util/logging.go @@ -0,0 +1,30 @@ +package util + +import ( + "context" + "log/slog" +) + +type slogContextKey string + +const ( + slogCtxFields slogContextKey = "slog_ctx_fields" +) + +type ContextHandler struct { + slog.Handler +} + +func (h ContextHandler) Handle(ctx context.Context, r slog.Record) error { + attrs, ok := ctx.Value(slogCtxFields).([]slog.Attr) + if ok { + for _, v := range attrs { + r.AddAttrs(v) + } + } + return h.Handler.Handle(ctx, r) +} + +func WithContext(ctx context.Context, attrs ...slog.Attr) context.Context { + return context.WithValue(ctx, slogCtxFields, attrs) +} diff --git a/websocket/client.go b/websocket/client.go index 28429a3a..ef420460 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -1,7 +1,7 @@ package websocket import ( - "log" + "log/slog" "time" "github.com/google/uuid" @@ -56,7 +56,7 @@ func (c *Client) clientReader() { }() c.conn.SetReadLimit(maxMessageSize) if err := c.conn.SetReadDeadline(time.Now().Add(pongWait)); err != nil { - log.Printf("failed to set read deadline: %s", err) + slog.With(slog.Any("error", err)).Error("failed to set read deadline") } c.conn.SetPongHandler(func(string) error { if err := c.conn.SetReadDeadline(time.Now().Add(pongWait)); err != nil { @@ -86,23 +86,23 @@ func (c *Client) clientWriter() { select { case message, ok := <-c.send: if err := c.conn.SetWriteDeadline(time.Now().Add(writeWait)); err != nil { - log.Printf("failed to set write deadline: %s", err) + slog.With(slog.Any("error", err)).Error("failed to set write deadline") } if !ok { // The hub closed the channel. 
if err := c.conn.WriteMessage(websocket.CloseMessage, []byte{}); err != nil { - log.Printf("failed to write message: %s", err) + slog.With(slog.Any("error", err)).Error("failed to write message") } return } if err := c.conn.WriteMessage(websocket.TextMessage, message); err != nil { - log.Printf("error sending message: %v", err) + slog.With(slog.Any("error", err)).Error("error sending message") return } case <-ticker.C: if err := c.conn.SetWriteDeadline(time.Now().Add(writeWait)); err != nil { - log.Printf("failed to set write deadline: %s", err) + slog.With(slog.Any("error", err)).Error("failed to set write deadline") } if err := c.conn.WriteMessage(websocket.PingMessage, nil); err != nil { return diff --git a/websocket/websocket.go b/websocket/websocket.go index a482fdcd..1286222f 100644 --- a/websocket/websocket.go +++ b/websocket/websocket.go @@ -3,6 +3,7 @@ package websocket import ( "context" "fmt" + "sync" "time" ) @@ -33,41 +34,64 @@ type Hub struct { // Unregister requests from clients. 
unregister chan *Client + + mux sync.Mutex + once sync.Once } func (h *Hub) run() { + defer func() { + close(h.closed) + }() for { select { case <-h.quit: - close(h.closed) return case <-h.ctx.Done(): - close(h.closed) return case client := <-h.register: if client != nil { + h.mux.Lock() h.clients[client.id] = client + h.mux.Unlock() } case client := <-h.unregister: if client != nil { + h.mux.Lock() if _, ok := h.clients[client.id]; ok { - delete(h.clients, client.id) + client.conn.Close() close(client.send) + delete(h.clients, client.id) } + h.mux.Unlock() } case message := <-h.broadcast: + staleClients := []string{} for id, client := range h.clients { if client == nil { + staleClients = append(staleClients, id) continue } select { case client.send <- message: case <-time.After(5 * time.Second): - close(client.send) - delete(h.clients, id) + staleClients = append(staleClients, id) } } + if len(staleClients) > 0 { + h.mux.Lock() + for _, id := range staleClients { + if client, ok := h.clients[id]; ok { + if client != nil { + client.conn.Close() + close(client.send) + } + delete(h.clients, id) + } + } + h.mux.Unlock() + } } } } @@ -78,13 +102,15 @@ func (h *Hub) Register(client *Client) error { } func (h *Hub) Write(msg []byte) (int, error) { + tmp := make([]byte, len(msg)) + copy(tmp, msg) + select { case <-time.After(5 * time.Second): return 0, fmt.Errorf("timed out sending message to client") - case h.broadcast <- msg: - + case h.broadcast <- tmp: } - return len(msg), nil + return len(tmp), nil } func (h *Hub) Start() error { @@ -92,19 +118,23 @@ func (h *Hub) Start() error { return nil } +func (h *Hub) Close() error { + h.once.Do(func() { + close(h.quit) + }) + return nil +} + func (h *Hub) Stop() error { - close(h.quit) + h.Close() + return h.Wait() +} + +func (h *Hub) Wait() error { select { case <-h.closed: - return nil case <-time.After(60 * time.Second): return fmt.Errorf("timed out waiting for hub stop") } -} - -func (h *Hub) Wait() { - select { - case 
<-h.closed: - case <-time.After(60 * time.Second): - } + return nil }