metrics: fix review findings

This commit is contained in:
Michael Kuhnt 2023-01-26 14:02:53 +01:00
parent ee659f509f
commit 6a032bfaa2
No known key found for this signature in database
GPG key ID: 088DC1E2EDC5A631
12 changed files with 141 additions and 115 deletions

View file

@ -16,11 +16,9 @@ package controllers
import (
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"strings"
"garm/apiserver/params"
@ -36,12 +34,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
)
func NewAPIController(r *runner.Runner, auth *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
id, err := r.GetControllerID()
if err != nil {
return nil, errors.Wrap(err, "getting controller ID")
}
func NewAPIController(r *runner.Runner, auth *auth.Authenticator, hub *wsWriter.Hub, controllerInfo runnerParams.ControllerInfo) (*APIController, error) {
return &APIController{
r: r,
auth: auth,
@ -50,17 +43,16 @@ func NewAPIController(r *runner.Runner, auth *auth.Authenticator, hub *wsWriter.
ReadBufferSize: 1024,
WriteBufferSize: 16384,
},
id: id.String(),
controllerInfo: controllerInfo,
}, nil
}
type APIController struct {
r *runner.Runner
auth *auth.Authenticator
hub *wsWriter.Hub
upgrader websocket.Upgrader
// holds this controller's id
id string
r *runner.Runner
auth *auth.Authenticator
hub *wsWriter.Hub
upgrader websocket.Upgrader
controllerInfo runnerParams.ControllerInfo
}
func handleError(w http.ResponseWriter, err error) {
@ -97,22 +89,6 @@ func handleError(w http.ResponseWriter, err error) {
}
}
// GetControllerInfo reports the identity of this garm instance as a
// (hostname, controller ID) pair, intended for debugging and monitoring.
// If the hostname lookup fails, the error is logged and both results are
// returned empty.
func (a *APIController) GetControllerInfo() (hostname, controllerId string) {
	// The hostname is outside of our control and is not fixed, so it is
	// looked up on every call instead of being cached.
	name, lookupErr := os.Hostname()
	if lookupErr != nil {
		log.Printf("error getting hostname: %q", lookupErr)
		return "", ""
	}
	return name, a.id
}
// metric to count total webhooks received
// at this point the webhook is not yet authenticated and
// we don't know if it's meant for us or not
@ -139,23 +115,26 @@ func (a *APIController) handleWorkflowJobEvent(w http.ResponseWriter, r *http.Re
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")
hostname, controllerId := a.GetControllerInfo()
controllerInfo, err := a.r.GetControllerInfo(r.Context())
if err != nil {
log.Printf("failed to get controller info for metics labels: %q", err)
}
if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
webhooksReceived.WithLabelValues("false", "owner_unknown", hostname, controllerId).Inc()
webhooksReceived.WithLabelValues("false", "owner_unknown", controllerInfo.Hostname, controllerInfo.ControllerID.String()).Inc()
log.Printf("got not found error from DispatchWorkflowJob. webhook not meant for us?: %q", err)
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
webhooksReceived.WithLabelValues("false", "signature_invalid", hostname, controllerId).Inc()
webhooksReceived.WithLabelValues("false", "signature_invalid", controllerInfo.Hostname, controllerInfo.ControllerID.String()).Inc()
} else {
webhooksReceived.WithLabelValues("false", "unknown", hostname, controllerId).Inc()
webhooksReceived.WithLabelValues("false", "unknown", controllerInfo.Hostname, controllerInfo.ControllerID.String()).Inc()
}
handleError(w, err)
return
}
webhooksReceived.WithLabelValues("true", "", hostname, controllerId).Inc()
webhooksReceived.WithLabelValues("true", "", controllerInfo.Hostname, controllerInfo.ControllerID.String()).Inc()
}
func (a *APIController) CatchAll(w http.ResponseWriter, r *http.Request) {
@ -225,14 +204,21 @@ func (a *APIController) NotFoundHandler(w http.ResponseWriter, r *http.Request)
// MetricsTokenHandler issues a JWT that can be used to read metrics.
//
// The request context must belong to an admin; otherwise the request is
// rejected via handleError with gErrors.ErrUnauthorized. On success the
// response has Content-Type application/json, status 200, and a body that
// is the JSON encoding of runnerParams.JWTResponse carrying the token.
func (a *APIController) MetricsTokenHandler(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	if !auth.IsAdmin(ctx) {
		handleError(w, gErrors.ErrUnauthorized)
		return
	}

	token, err := a.auth.GetJWTMetricsToken(ctx)
	if err != nil {
		handleError(w, err)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)

	// Write the body exactly once, through the JSON encoder. Emitting a
	// hand-formatted string in addition to the encoded struct would put two
	// documents in the response, and string formatting does not escape the
	// token for JSON.
	if err := json.NewEncoder(w).Encode(runnerParams.JWTResponse{Token: token}); err != nil {
		// Headers are already sent, so the failure can only be logged.
		log.Printf("failed to encode response: %q", err)
	}
}
// LoginHandler returns a jwt token

View file

@ -4,45 +4,77 @@ import (
"log"
"garm/auth"
"garm/runner"
"github.com/prometheus/client_golang/prometheus"
)
type GarmCollector struct {
healthMetric *prometheus.Desc
instanceMetric *prometheus.Desc
apiController *APIController
runner *runner.Runner
}
func NewGarmCollector(a *APIController) *GarmCollector {
func NewGarmCollector(r *runner.Runner) *GarmCollector {
return &GarmCollector{
apiController: a,
runner: r,
instanceMetric: prometheus.NewDesc(
"garm_runner_status",
"Status of the runner",
[]string{"name", "status", "runner_status", "pool", "pool_type", "hostname", "controller_id"}, nil,
)}
[]string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "hostname", "controller_id"}, nil,
),
healthMetric: prometheus.NewDesc(
"garm_health",
"Health of the runner",
[]string{"hostname", "controller_id"}, nil,
),
}
}
// Describe sends the descriptor of every metric exposed by this collector
// to ch: first the instance metric, then the health metric.
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
	for _, desc := range []*prometheus.Desc{c.instanceMetric, c.healthMetric} {
		ch <- desc
	}
}
// Collect gathers all garm metrics and sends them to ch.
//
// The controller info is fetched once with an admin context and its
// hostname and controller ID are passed as label values to both the
// instance and the health metric. If fetching the controller info fails,
// the error is logged and collection continues with whatever values were
// returned.
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
	controllerInfo, err := c.runner.GetControllerInfo(auth.GetAdminContext())
	if err != nil {
		log.Printf("error on fetching controllerInfo: %s", err)
		// continue anyway
	}

	// Compute the label values once so both metrics carry identical labels.
	hostname := controllerInfo.Hostname
	controllerID := controllerInfo.ControllerID.String()

	// Each sub-collector is invoked exactly once per scrape; a second call
	// would emit the same instance series twice.
	c.CollectInstanceMetric(ch, hostname, controllerID)
	c.CollectHealthMetric(ch, hostname, controllerID)
}
// CollectHealthMetric emits the health metric as a gauge with the constant
// value 1, labelled with the given hostname and controller ID. If the
// metric cannot be constructed the error is logged and nothing is sent.
func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
	// Label order must match the descriptor: hostname, then controller ID.
	labelValues := []string{hostname, controllerID}

	healthGauge, err := prometheus.NewConstMetric(c.healthMetric, prometheus.GaugeValue, 1, labelValues...)
	if err == nil {
		ch <- healthGauge
		return
	}
	log.Printf("error on creating health metric: %s", err)
}
// CollectInstanceMetric collects the metrics for the runner instances
// reflecting the statuses and the pool they belong to.
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric) {
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()
instances, err := c.apiController.r.ListAllInstances(ctx)
instances, err := c.runner.ListAllInstances(ctx)
if err != nil {
log.Printf("cannot collect metrics, listing instances: %s", err)
return
}
pools, err := c.apiController.r.ListAllPools(ctx)
pools, err := c.runner.ListAllPools(ctx)
if err != nil {
log.Printf("listing pools: %s", err)
// continue anyway
@ -58,23 +90,21 @@ func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric) {
if pool.EnterpriseName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.EnterpriseName,
Type: "enterprise",
Type: string(pool.PoolType()),
}
} else if pool.OrgName != "" {
poolNames[pool.ID] = poolInfo{
Name: pool.OrgName,
Type: "organization",
Type: string(pool.PoolType()),
}
} else {
poolNames[pool.ID] = poolInfo{
Name: pool.RepoName,
Type: "repository",
Type: string(pool.PoolType()),
}
}
}
hostname, controllerID := c.apiController.GetControllerInfo()
for _, instance := range instances {
m, err := prometheus.NewConstMetric(
@ -86,6 +116,7 @@ func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric) {
string(instance.RunnerStatus),
poolNames[instance.PoolID].Name,
poolNames[instance.PoolID].Type,
instance.PoolID,
hostname,
controllerID,
)

View file

@ -19,17 +19,28 @@ import (
"net/http"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus/promhttp"
"garm/apiserver/controllers"
"garm/auth"
"garm/config"
"garm/util"
)
func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddleware, initMiddleware, instanceMiddleware auth.Middleware) *mux.Router {
func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, cfg *config.Config, authMiddleware, initMiddleware, instanceMiddleware, metricsMiddlerware auth.Middleware) *mux.Router {
router := mux.NewRouter()
logMiddleware := util.NewLoggingMiddleware(logWriter)
router.Use(logMiddleware)
if cfg.Metrics.Enable {
metricsRouter := router.PathPrefix("/metrics").Subrouter()
if !cfg.Metrics.DisableAuth {
metricsRouter.Use(metricsMiddlerware.Middleware)
}
metricsRouter.Handle("/", promhttp.Handler()).Methods("GET", "OPTIONS")
metricsRouter.Handle("", promhttp.Handler()).Methods("GET", "OPTIONS")
}
// Handles github webhooks
webhookRouter := router.PathPrefix("/webhooks").Subrouter()
webhookRouter.PathPrefix("/").Handler(http.HandlerFunc(han.CatchAll))

View file

@ -74,6 +74,11 @@ func (a *Authenticator) GetJWTToken(ctx context.Context) (string, error) {
// GetJWTMetricsToken returns a JWT token that can be used to read metrics.
// This token is not tied to a user, no user is stored in the db.
func (a *Authenticator) GetJWTMetricsToken(ctx context.Context) (string, error) {
if !IsAdmin(ctx) {
return "", runnerErrors.ErrUnauthorized
}
tokenID, err := util.GetRandomString(16)
if err != nil {
return "", errors.Wrap(err, "generating random string")
@ -88,7 +93,6 @@ func (a *Authenticator) GetJWTMetricsToken(ctx context.Context) (string, error)
// TODO: make this configurable
Issuer: "garm",
},
UserID: "metrics",
TokenID: tokenID,
IsAdmin: false,
ReadMetrics: true,

View file

@ -24,8 +24,9 @@ import (
type contextFlags string
const (
isAdminKey contextFlags = "is_admin"
fullNameKey contextFlags = "full_name"
isAdminKey contextFlags = "is_admin"
fullNameKey contextFlags = "full_name"
readMetricsKey contextFlags = "read_metrics"
// UserIDFlag is the User ID flag we set in the context
UserIDFlag contextFlags = "user_id"
isEnabledFlag contextFlags = "is_enabled"

View file

@ -1,6 +1,7 @@
package auth
import (
"context"
"fmt"
"garm/config"
"net/http"
@ -13,10 +14,10 @@ type MetricsMiddleware struct {
cfg config.JWTAuth
}
func NewMetricsMiddleware(cfg config.JWTAuth) *MetricsMiddleware {
func NewMetricsMiddleware(cfg config.JWTAuth) (*MetricsMiddleware, error) {
return &MetricsMiddleware{
cfg: cfg,
}
}, nil
}
func (m *MetricsMiddleware) Middleware(next http.Handler) http.Handler {
@ -59,6 +60,9 @@ func (m *MetricsMiddleware) Middleware(next http.Handler) http.Handler {
return
}
ctx = context.WithValue(ctx, isAdminKey, false)
ctx = context.WithValue(ctx, readMetricsKey, true)
next.ServeHTTP(w, r.WithContext(ctx))
})
}

View file

@ -40,7 +40,6 @@ import (
"github.com/gorilla/mux"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
@ -112,6 +111,11 @@ func main() {
log.Fatalf("failed to create controller: %+v", err)
}
controllerInfo, err := db.ControllerInfo()
if err != nil {
log.Fatal(err)
}
// If there are many repos/pools, this may take a long time.
// TODO: start pool managers in the background and log errors.
if err := runner.Start(); err != nil {
@ -119,7 +123,7 @@ func main() {
}
authenticator := auth.NewAuthenticator(cfg.JWTAuth, db)
controller, err := controllers.NewAPIController(runner, authenticator, hub)
controller, err := controllers.NewAPIController(runner, authenticator, hub, controllerInfo)
if err != nil {
log.Fatalf("failed to create controller: %+v", err)
}
@ -139,7 +143,16 @@ func main() {
log.Fatal(err)
}
router := routers.NewAPIRouter(controller, multiWriter, jwtMiddleware, initMiddleware, instanceMiddleware)
metricsMiddleware, err := auth.NewMetricsMiddleware(cfg.JWTAuth)
if err != nil {
log.Fatal(err)
}
err = prometheus.Register(controllers.NewGarmCollector(runner))
if err != nil {
log.Println("failed to register garm collector in prometheus", err)
}
router := routers.NewAPIRouter(controller, multiWriter, cfg, jwtMiddleware, initMiddleware, instanceMiddleware, metricsMiddleware)
corsMw := mux.CORSMethodMiddleware(router)
router.Use(corsMw)
@ -170,28 +183,6 @@ func main() {
}
}()
if !cfg.APIServer.MetricsConfig.Disabled {
go func() {
metricsMiddleware := auth.NewMetricsMiddleware(cfg.JWTAuth)
r := mux.NewRouter()
r.Handle("/metrics", promhttp.Handler())
if !cfg.APIServer.MetricsConfig.NoAuth {
r.Use(metricsMiddleware.Middleware)
}
err := prometheus.Register(controllers.NewGarmCollector(controller))
if err != nil {
log.Printf("failed to register prometheus collector: %+v", err)
}
if err := http.ListenAndServe(cfg.APIServer.MetricsBindAddress(), r); err != nil {
log.Printf("metrics server failed: %+v", err)
}
}()
}
<-ctx.Done()
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 60*time.Second)
defer shutdownCancel()

View file

@ -119,6 +119,7 @@ func NewConfig(cfgFile string) (*Config, error) {
type Config struct {
Default Default `toml:"default" json:"default"`
APIServer APIServer `toml:"apiserver,omitempty" json:"apiserver,omitempty"`
Metrics Metrics `toml:"metrics,omitempty" json:"metrics,omitempty"`
Database Database `toml:"database,omitempty" json:"database,omitempty"`
Providers []Provider `toml:"provider,omitempty" json:"provider,omitempty"`
Github []Github `toml:"github,omitempty"`
@ -476,30 +477,19 @@ func (t *TLSConfig) Validate() error {
return nil
}
type MetricsConfig struct {
Port int `toml:"port" json:"port"`
Disabled bool `toml:"disabled" json:"disabled"`
NoAuth bool `toml:"no_auth" json:"no-auth"`
}
// MetricsBindAddress returns a host:port string.
func (a *APIServer) MetricsBindAddress() string {
metricsPort := a.MetricsConfig.Port
if metricsPort == 0 {
metricsPort = 8081
}
return fmt.Sprintf("%s:%d", a.Bind, metricsPort)
// Metrics holds the configuration for the metrics endpoint served by the
// API router.
type Metrics struct {
	// DisableAuth, when true, makes the router skip attaching the metrics
	// authentication middleware to the /metrics routes.
	DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
	// Enable controls whether the /metrics routes are registered at all.
	Enable bool `toml:"enable" json:"enable"`
}
// APIServer holds configuration for the API server
// worker
type APIServer struct {
Bind string `toml:"bind" json:"bind"`
Port int `toml:"port" json:"port"`
UseTLS bool `toml:"use_tls" json:"use-tls"`
TLSConfig TLSConfig `toml:"tls" json:"tls"`
CORSOrigins []string `toml:"cors_origins" json:"cors-origins"`
MetricsConfig MetricsConfig `toml:"metrics" json:"metrics"`
Bind string `toml:"bind" json:"bind"`
Port int `toml:"port" json:"port"`
UseTLS bool `toml:"use_tls" json:"use-tls"`
TLSConfig TLSConfig `toml:"tls" json:"tls"`
CORSOrigins []string `toml:"cors_origins" json:"cors-origins"`
}
func (a *APIServer) APITLSConfig() (*tls.Config, error) {

2
go.mod
View file

@ -18,6 +18,7 @@ require (
github.com/manifoldco/promptui v0.9.0
github.com/nbutton23/zxcvbn-go v0.0.0-20210217022336-fa2cb2858354
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.14.0
github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b
github.com/spf13/cobra v1.4.1-0.20220504202302-9e88759b19cd
github.com/stretchr/testify v1.8.0
@ -59,7 +60,6 @@ require (
github.com/pborman/uuid v1.2.1 // indirect
github.com/pkg/sftp v1.13.4 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect

6
go.sum
View file

@ -416,8 +416,6 @@ golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4Iltr
golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A=
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 h1:RerP+noqYHUQ8CMRcPlC2nvTa4dcBIjegkuWdcUDuqg=
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A=
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b h1:clP8eMhB30EHdc0bd2Twtq6kgU7yl5ub2cQLSdrv1Dg=
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@ -472,8 +470,6 @@ golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f h1:rlezHXNlxYWvBCzNses9Dlc7nGFaNMJeqLolcmQSSZY=
golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
@ -613,8 +609,6 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w=
google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/DATA-DOG/go-sqlmock.v1 v1.3.0 h1:FVCohIoYO7IJoDDVpV2pdq7SgrMH6wHnuTyrdrxJNoY=

View file

@ -275,6 +275,7 @@ type JWTResponse struct {
// ControllerInfo identifies a garm controller instance. Its fields are used
// as label values on the exported metrics.
type ControllerInfo struct {
	// ControllerID is the UUID of this controller.
	ControllerID uuid.UUID `json:"controller_id"`
	// Hostname is the host name of the machine running the controller; the
	// runner fills it from os.Hostname() when it is empty.
	Hostname string `json:"hostname"`
}
type GithubCredentials struct {

View file

@ -24,6 +24,7 @@ import (
"fmt"
"hash"
"log"
"os"
"strings"
"sync"
"time"
@ -41,7 +42,6 @@ import (
"garm/util"
"github.com/pkg/errors"
uuid "github.com/satori/go.uuid"
)
func NewRunner(ctx context.Context, cfg config.Config) (*Runner, error) {
@ -264,14 +264,27 @@ type Runner struct {
providers map[string]common.Provider
credentials map[string]config.Github
controllerInfo params.ControllerInfo
}
func (r *Runner) GetControllerID() (uuid.UUID, error) {
info, err := r.store.ControllerInfo()
if err != nil {
return uuid.Nil, errors.Wrap(err, "fetching controller info")
func (r *Runner) GetControllerInfo(ctx context.Context) (params.ControllerInfo, error) {
if r.controllerInfo == (params.ControllerInfo{}) {
var err error
r.controllerInfo, err = r.store.ControllerInfo()
if err != nil {
return params.ControllerInfo{}, errors.Wrap(err, "fetching controller info")
}
}
return info.ControllerID, nil
if r.controllerInfo.Hostname == "" {
var err error
r.controllerInfo.Hostname, err = os.Hostname()
if err != nil {
// this returns a partial controller info, but it's better than nothing
return r.controllerInfo, errors.Wrap(err, "fetching hostname")
}
}
return r.controllerInfo, nil
}
func (r *Runner) ListCredentials(ctx context.Context) ([]params.GithubCredentials, error) {