diff --git a/.goreleaser.yaml b/.goreleaser.yaml index de03c63..661607b 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -14,7 +14,6 @@ builds: - CGO_ENABLED=0 goos: - linux - - darwin goarch: - amd64 - arm64 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f9d6972 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,94 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build and Development Commands + +```bash +# Build +make build # Build the collector binary +go build -o collector ./cmd/collector +go build -o receiver ./cmd/receiver + +# Test +make test # Run all tests +go test -v ./... # Run all tests with verbose output +go test -v ./internal/collector/... # Run tests for a specific package +make test-coverage # Run tests with coverage report + +# Code Quality +make fmt # Format code +make vet # Run go vet +make lint # Run golangci-lint (v2.6.2) +make all # Format, vet, lint, and build + +# Git Hooks +make install-hooks # Install pre-commit and commit-msg hooks +``` + +## Architecture Overview + +This is a Go metrics collector designed for CI/CD environments with shared PID namespaces. It consists of two binaries: + +### Collector (`cmd/collector`) +Runs alongside CI workloads, periodically reads `/proc` filesystem, and pushes a summary to the receiver on shutdown (SIGINT/SIGTERM). + +**Data Flow:** +1. `metrics.Aggregator` reads `/proc` to collect CPU/memory for all processes +2. `collector.Collector` orchestrates collection at intervals and writes to output +3. `summary.Accumulator` tracks samples across the run, computing peak/avg/percentiles +4. On shutdown, `summary.PushClient` sends the summary to the receiver HTTP endpoint + +### Receiver (`cmd/receiver`) +HTTP service that stores metric summaries in SQLite (via GORM) and provides a query API. + +**Key Endpoints:** +- `POST /api/v1/metrics` - Receive metrics from collectors +- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` - Query stored metrics + +### Internal Packages + +| Package | Purpose | +|---------|---------| +| `internal/collector` | Orchestrates collection loop, handles shutdown | +| `internal/metrics` | Aggregates system/process metrics from /proc | +| `internal/proc` | Low-level /proc parsing (stat, status, cgroup) | +| `internal/cgroup` | Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP env vars | +| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | +| `internal/receiver` | HTTP handlers and SQLite store | +| `internal/output` | Metrics output formatting (JSON/text) | + +### Container Metrics + +The collector groups processes by container using cgroup paths. Configuration via environment variables: +- `CGROUP_PROCESS_MAP`: JSON mapping process names to container names (e.g., `{"node":"runner"}`) +- `CGROUP_LIMITS`: JSON with CPU/memory limits per container for percentage calculations + +CPU values in container metrics are reported as **cores** (not percentage), enabling direct comparison with Kubernetes resource limits. 
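+
+As a quick sanity check of the two variables, a minimal in-module sketch (the `internal/` packages are importable only from inside this repository; the env values below are illustrative):
+
+```go
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup"
+)
+
+func main() {
+	// Illustrative values; in CI these are injected by the pod spec.
+	os.Setenv("CGROUP_PROCESS_MAP", `{"node":"runner"}`)
+	os.Setenv("CGROUP_LIMITS", `{"runner":{"cpu":"2","memory":"1Gi"}}`)
+
+	mapping, err := cgroup.ParseProcessMappingEnv()
+	if err != nil {
+		panic(err)
+	}
+	limits, err := cgroup.ParseCgroupLimitsEnv()
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(mapping["node"])              // runner
+	fmt.Println(limits["runner"].CPUCores)    // 2
+	fmt.Println(limits["runner"].MemoryBytes) // 1073741824
+}
+```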
+
+## Commit Message Convention
+
+Uses conventional commits enforced by git hook:
+```
+<type>(<scope>)?: <description>
+```
+
+Types: `feat`, `fix`, `chore`, `docs`, `style`, `refactor`, `perf`, `test`, `build`, `ci`
+
+Examples:
+- `feat: add user authentication`
+- `fix(collector): handle nil cgroup paths`
+- `feat!: breaking change in API`
+
+## Running Locally
+
+```bash
+# Run receiver
+./receiver --addr=:8080 --db=metrics.db
+
+# Run collector with push endpoint
+./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics
+
+# Docker Compose stress test
+docker compose -f test/docker/docker-compose-stress.yaml up -d
+```
diff --git a/Dockerfile b/Dockerfile
index 64a266f..75f7b7f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,37 @@
-FROM golang:1.25-alpine AS builder
+FROM golang:1.25-alpine AS builder-base
 
 WORKDIR /app
 
-COPY go.mod ./
+COPY go.mod go.sum ./
 RUN go mod download
 
 COPY . .
 
+# Collector build (no CGO needed)
+FROM builder-base AS builder-collector
+
 RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /resource-collector ./cmd/collector
 
-FROM alpine:3.19
+# Receiver build (CGO needed for SQLite)
+FROM builder-base AS builder-receiver
 
-COPY --from=builder /resource-collector /usr/local/bin/resource-collector
+RUN apk add --no-cache gcc musl-dev
+RUN CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o /metrics-receiver ./cmd/receiver
+
+# Collector image
+FROM alpine:3.19 AS collector
+
+COPY --from=builder-collector /resource-collector /usr/local/bin/resource-collector
 
 ENTRYPOINT ["/usr/local/bin/resource-collector"]
+
+# Receiver image
+FROM alpine:3.19 AS receiver
+
+RUN apk add --no-cache sqlite-libs
+
+COPY --from=builder-receiver /metrics-receiver /usr/local/bin/metrics-receiver
+
+EXPOSE 8080
+
+ENTRYPOINT ["/usr/local/bin/metrics-receiver"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..abd58e9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,243 @@
+# Forgejo Runner Resource Collector
+
+A lightweight metrics collector for CI/CD workloads in shared PID namespace environments. Reads `/proc` to collect CPU and memory metrics, groups them by container/cgroup, and pushes run summaries to a receiver service for storage and querying.
+
+## Architecture
+
+The system has two independent binaries:
+
+```
+┌─────────────────────────────────────────────┐       ┌──────────────────────────┐
+│ CI/CD Pod (shared PID namespace)            │       │ Receiver Service         │
+│                                             │       │                          │
+│ ┌───────────┐ ┌────────┐ ┌───────────┐      │       │  POST /api/v1/metrics    │
+│ │ collector │ │ runner │ │ sidecar   │      │       │     │                    │
+│ │           │ │        │ │           │      │       │     ▼                    │
+│ │ reads     │ │        │ │           │      │ push  │  ┌────────────┐          │
+│ │ /proc for │ │        │ │           │      │──────▶│  │ SQLite     │          │
+│ │ all PIDs  │ │        │ │           │      │       │  └────────────┘          │
+│ └───────────┘ └────────┘ └───────────┘      │       │     │                    │
+│                                             │       │     ▼                    │
+└─────────────────────────────────────────────┘       │  GET /api/v1/metrics/... │
+                                                      └──────────────────────────┘
+```
+
+### Collector
+
+Runs as a sidecar alongside CI workloads. On a configurable interval, it reads `/proc` to collect CPU and memory for all visible processes, groups them by container using cgroup paths, and accumulates samples. On shutdown (SIGINT/SIGTERM), it computes run-level statistics (peak, avg, percentiles) and pushes a single summary to the receiver.
+
+```bash
+./collector --interval=2s --top=10 --push-endpoint=http://receiver:8080/api/v1/metrics
+```
+
+**Flags:** `--interval`, `--proc-path`, `--log-level`, `--log-format`, `--top`, `--push-endpoint`, `--push-token`
+
+**Environment variables:**
+
+| Variable                  | Description                           | Example             |
+| ------------------------- | ------------------------------------- | ------------------- |
+| `GITHUB_REPOSITORY_OWNER` | Organization name                     | `my-org`            |
+| `GITHUB_REPOSITORY`       | Full repository path                  | `my-org/my-repo`    |
+| `GITHUB_WORKFLOW`         | Workflow filename                     | `ci.yml`            |
+| `GITHUB_JOB`              | Job name                              | `build`             |
+| `GITHUB_RUN_ID`           | Unique run identifier                 | `run-123`           |
+| `COLLECTOR_PUSH_TOKEN`    | Bearer token for push endpoint auth   | —                   |
+| `CGROUP_PROCESS_MAP`      | JSON: process name → container name   | `{"node":"runner"}` |
+| `CGROUP_LIMITS`           | JSON: per-container CPU/memory limits | See below           |
+
+**CGROUP_LIMITS example:**
+
+```json
+{
+  "runner": { "cpu": "2", "memory": "1Gi" },
+  "sidecar": { "cpu": "500m", "memory": "256Mi" }
+}
+```
+
+CPU supports Kubernetes notation (`"2"` = 2 cores, `"500m"` = 0.5 cores). Memory supports `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal).
+
+### Receiver
+
+HTTP service that stores metric summaries in SQLite (via GORM) and exposes a query API.
+
+```bash
+./receiver --addr=:8080 --db=metrics.db --read-token=my-secret-token --hmac-key=my-hmac-key
+```
+
+**Flags:**
+
+| Flag           | Environment Variable  | Description                                                 | Default      |
+| -------------- | --------------------- | ----------------------------------------------------------- | ------------ |
+| `--addr`       | —                     | HTTP listen address                                         | `:8080`      |
+| `--db`         | —                     | SQLite database path                                        | `metrics.db` |
+| `--read-token` | `RECEIVER_READ_TOKEN` | Pre-shared token for read/admin endpoints (required)        | —            |
+| `--hmac-key`   | `RECEIVER_HMAC_KEY`   | Secret key for push token generation/validation (required)  | —            |
+
+**Endpoints:**
+
+- `POST /api/v1/metrics` — receive and store a metric summary (requires scoped push token)
+- `POST /api/v1/token` — generate a scoped push token (requires read token auth)
+- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics (requires read token auth)
+
+**Authentication:**
+
+All endpoints require authentication:
+
+- The GET endpoint requires a Bearer token matching the read token
+- The POST metrics endpoint requires a scoped push token (generated via `POST /api/v1/token`)
+- The token endpoint itself requires the read token
+
+**Token flow:**
+
+```bash
+# 1. Admin generates a scoped push token using the read token
+curl -X POST http://localhost:8080/api/v1/token \
+  -H "Authorization: Bearer my-secret-token" \
+  -H "Content-Type: application/json" \
+  -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}'
+# → {"token":"<scoped-token>"}
+
+# 2. Collector uses the scoped token to push metrics
+./collector --push-endpoint=http://localhost:8080/api/v1/metrics \
+  --push-token=<scoped-token>
+
+# 3. Query metrics with the read token
+curl http://localhost:8080/api/v1/metrics/repo/my-org/my-repo/ci.yml/build \
+  -H "Authorization: Bearer my-secret-token" #gitleaks:allow
+```
+
+Push tokens are HMAC-SHA256 digests derived from `--hmac-key` and the scope (org/repo/workflow/job). They are stateless — no database storage is needed. The HMAC key is separate from the read token so that compromising a push token does not expose the admin credential.
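+
+Conceptually, a scoped token is a keyed hash over the scope fields. A minimal sketch of the idea: `deriveScopedToken` is a hypothetical stand-in for `receiver.GenerateScopedToken`; the exact message encoding and output format may differ.
+
+```go
+package main
+
+import (
+	"crypto/hmac"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+)
+
+// deriveScopedToken keys an HMAC-SHA256 with the receiver's --hmac-key and
+// feeds it the scope; the receiver can recompute the same digest on every
+// push, so nothing needs to be stored.
+func deriveScopedToken(hmacKey, org, repo, workflow, job string) string {
+	mac := hmac.New(sha256.New, []byte(hmacKey))
+	fmt.Fprintf(mac, "%s/%s/%s/%s", org, repo, workflow, job)
+	return hex.EncodeToString(mac.Sum(nil))
+}
+
+func main() {
+	tok := deriveScopedToken("my-hmac-key", "my-org", "my-repo", "ci.yml", "build")
+	fmt.Println(tok) // compare with hmac.Equal on the receiver side
+}
+```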
+
+## How Metrics Are Collected
+
+The collector reads `/proc/[pid]/stat` for every visible process to get CPU ticks (`utime` + `stime`) and `/proc/[pid]/status` for memory (RSS). CPU usage is a rate, so each interval's reading is compared with the previous one and the tick delta yields the usage.
+
+Processes are grouped into containers by reading `/proc/[pid]/cgroup` and matching cgroup paths against the `CGROUP_PROCESS_MAP`. This is necessary because in shared PID namespace pods, `/proc/stat` only shows host-level aggregates — per-container metrics must be built up from individual process data.
+
+Container CPU is reported in **cores** (not percentage) for direct comparison with Kubernetes resource limits. System-level CPU is reported as a percentage (0-100%).
+
+Over the course of a run, the `summary.Accumulator` tracks every sample and on shutdown computes:
+
+| Stat                       | Description                    |
+| -------------------------- | ------------------------------ |
+| `peak`                     | Maximum observed value         |
+| `p99`, `p95`, `p75`, `p50` | Percentiles across all samples |
+| `avg`                      | Arithmetic mean                |
+
+These stats are computed for CPU, memory, and per-container metrics.
+
+## API Response
+
+```
+GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build
+```
+
+```json
+[
+  {
+    "id": 1,
+    "organization": "my-org",
+    "repository": "my-org/my-repo",
+    "workflow": "ci.yml",
+    "job": "build",
+    "run_id": "run-123",
+    "received_at": "2026-02-06T14:30:23.056Z",
+    "payload": {
+      "start_time": "2026-02-06T14:30:02.185Z",
+      "end_time": "2026-02-06T14:30:22.190Z",
+      "duration_seconds": 20.0,
+      "sample_count": 11,
+      "cpu_total_percent": { "peak": ..., "avg": ..., "p50": ... },
+      "mem_used_bytes": { "peak": ..., "avg": ... },
+      "containers": [
+        {
+          "name": "runner",
+          "cpu_cores": { "peak": 2.007, "avg": 1.5, "p50": 1.817, "p95": 2.004 },
+          "memory_bytes": { "peak": 18567168, "avg": 18567168 }
+        }
+      ],
+      "top_cpu_processes": [ ... ],
+      "top_mem_processes": [ ... ]
+    }
+  }
+]
+```
+
+**CPU metric distinction:**
+
+- `cpu_total_percent` — system-wide, 0-100%
+- `cpu_cores` (containers) — cores used (e.g. `2.0` = two full cores)
+- `peak_cpu_percent` (processes) — per-process, where 100% = 1 core
+
+All memory values are in **bytes**.
+
+## Running
+
+### Docker Compose
+
+```bash
+# Start the receiver (builds image if needed):
+docker compose -f test/docker/docker-compose-stress.yaml up -d --build receiver
+
+# Generate a scoped push token for the collector:
+PUSH_TOKEN=$(curl -s -X POST http://localhost:9080/api/v1/token \
+  -H "Authorization: Bearer dummyreadtoken" \
+  -H "Content-Type: application/json" \
+  -d '{"organization":"test-org","repository":"test-org/stress-test","workflow":"stress-test-workflow","job":"heavy-workload"}' \
+  | jq -r .token)
+
+# Start the collector and stress workloads with the push token:
+COLLECTOR_PUSH_TOKEN=$PUSH_TOKEN \
+  docker compose -f test/docker/docker-compose-stress.yaml up -d --build collector
+
+# ... Wait for data collection ...
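+
+# (Optional) tail the collector output while it samples:
+docker compose -f test/docker/docker-compose-stress.yaml logs -f collector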
+
+# Trigger shutdown summary:
+docker compose -f test/docker/docker-compose-stress.yaml stop collector
+
+# Query results with the read token:
+curl -H "Authorization: Bearer dummyreadtoken" \
+  http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload
+```
+
+### Local
+
+```bash
+go build -o collector ./cmd/collector
+go build -o receiver ./cmd/receiver
+
+# Start receiver with both keys:
+./receiver --addr=:8080 --db=metrics.db \
+  --read-token=my-secret-token --hmac-key=my-hmac-key
+
+# Generate a scoped push token:
+PUSH_TOKEN=$(curl -s -X POST http://localhost:8080/api/v1/token \
+  -H "Authorization: Bearer my-secret-token" \
+  -H "Content-Type: application/json" \
+  -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}' \
+  | jq -r .token)
+
+# Run collector with the push token:
+./collector --interval=2s --top=10 \
+  --push-endpoint=http://localhost:8080/api/v1/metrics \
+  --push-token=$PUSH_TOKEN
+```
+
+## Internal Packages
+
+| Package              | Purpose                                                              |
+| -------------------- | -------------------------------------------------------------------- |
+| `internal/proc`      | Low-level `/proc` parsing (stat, status, cgroup)                     |
+| `internal/metrics`   | Aggregates process metrics from `/proc` into system/container views  |
+| `internal/cgroup`    | Parses `CGROUP_PROCESS_MAP` and `CGROUP_LIMITS` env vars             |
+| `internal/collector` | Orchestrates the collection loop and shutdown                        |
+| `internal/summary`   | Accumulates samples, computes stats, pushes to receiver              |
+| `internal/receiver`  | HTTP handlers and SQLite store                                       |
+| `internal/output`    | Metrics output formatting (JSON/text)                                |
+
+## Background
+
+Technical reference on the Linux primitives this project builds on:
+
+- [Identifying process cgroups by PID](docs/background/identify-process-cgroup-by-pid.md) — how to read `/proc/<pid>/cgroup` to determine which container a process belongs to
+- [/proc/stat behavior in containers](docs/background/proc-stat-in-containers.md) — why `/proc/stat` shows host-level data in containers, and how to aggregate per-process stats from `/proc/[pid]/stat` instead, including CPU tick conversion and cgroup limit handling
diff --git a/cmd/collector/main.go b/cmd/collector/main.go
index 8009586..7a88a85 100644
--- a/cmd/collector/main.go
+++ b/cmd/collector/main.go
@@ -12,6 +12,7 @@ import (
 
 	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/collector"
 	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output"
+	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary"
 )
 
 const (
@@ -29,6 +30,8 @@ func main() {
 	logLevel := flag.String("log-level", defaultLogLevel, "Log level: debug, info, warn, error")
 	logFormat := flag.String("log-format", defaultLogFormat, "Output format: json, text")
 	topN := flag.Int("top", defaultTopN, "Number of top processes to include")
+	pushEndpoint := flag.String("push-endpoint", "", "HTTP endpoint to push metrics to (e.g., http://localhost:8080/api/v1/metrics)")
+	pushToken := flag.String("push-token", os.Getenv("COLLECTOR_PUSH_TOKEN"), "Bearer token for push endpoint authentication (or set COLLECTOR_PUSH_TOKEN)")
 	flag.Parse()
 
 	// Setup structured logging for application logs
@@ -53,6 +56,25 @@ func main() {
 		TopN:     *topN,
 	}, metricsWriter, appLogger)
 
+	// Attach summary writer to emit run summary on shutdown
+	summaryWriter := summary.NewSummaryWriter(os.Stdout, *logFormat)
+	c.SetSummaryWriter(summaryWriter)
+
+	// Setup push client if endpoint is 
configured
+	if *pushEndpoint != "" {
+		pushClient := summary.NewPushClient(*pushEndpoint, *pushToken)
+		c.SetPushClient(pushClient)
+		execCtx := pushClient.ExecutionContext()
+		appLogger.Info("push client configured",
+			slog.String("endpoint", *pushEndpoint),
+			slog.String("organization", execCtx.Organization),
+			slog.String("repository", execCtx.Repository),
+			slog.String("workflow", execCtx.Workflow),
+			slog.String("job", execCtx.Job),
+			slog.String("run_id", execCtx.RunID),
+		)
+	}
+
 	// Setup signal handling for graceful shutdown
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
diff --git a/cmd/receiver/main.go b/cmd/receiver/main.go
new file mode 100644
index 0000000..1379b53
--- /dev/null
+++ b/cmd/receiver/main.go
@@ -0,0 +1,79 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver"
+)
+
+const (
+	defaultAddr   = ":8080"
+	defaultDBPath = "metrics.db"
+)
+
+func main() {
+	addr := flag.String("addr", defaultAddr, "HTTP listen address")
+	dbPath := flag.String("db", defaultDBPath, "SQLite database path")
+	readToken := flag.String("read-token", os.Getenv("RECEIVER_READ_TOKEN"), "Pre-shared token for read endpoints (or set RECEIVER_READ_TOKEN)")
+	hmacKey := flag.String("hmac-key", os.Getenv("RECEIVER_HMAC_KEY"), "Secret key for push token generation/validation (or set RECEIVER_HMAC_KEY)")
+	flag.Parse()
+
+	logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{
+		Level: slog.LevelInfo,
+	}))
+
+	store, err := receiver.NewStore(*dbPath)
+	if err != nil {
+		logger.Error("failed to open database", slog.String("error", err.Error()))
+		os.Exit(1)
+	}
+	defer func() { _ = store.Close() }()
+
+	handler := receiver.NewHandler(store, logger, *readToken, *hmacKey)
+	mux := http.NewServeMux()
+	handler.RegisterRoutes(mux)
+
+	server := &http.Server{
+		Addr:         *addr,
+		Handler:      mux,
+		ReadTimeout:  10 * time.Second,
+		WriteTimeout: 10 * time.Second,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+	go func() {
+		sig := <-sigChan
+		logger.Info("received signal, shutting down", slog.String("signal", sig.String()))
+		cancel()
+
+		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer shutdownCancel()
+		_ = server.Shutdown(shutdownCtx)
+	}()
+
+	logger.Info("starting metrics receiver",
+		slog.String("addr", *addr),
+		slog.String("db", *dbPath),
+	)
+
+	if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+
+	<-ctx.Done()
+	logger.Info("receiver stopped gracefully")
+}
diff --git a/docs/background/identify-process-cgroup-by-pid.md b/docs/background/identify-process-cgroup-by-pid.md
new file mode 100644
index 0000000..7d4734d
--- /dev/null
+++ b/docs/background/identify-process-cgroup-by-pid.md
@@ -0,0 +1,38 @@
+# Identifying a Process's Cgroup by PID
+
+Read `/proc/<pid>/cgroup` to find which cgroup (and therefore which container) a process belongs to.
+
+## /proc/PID/cgroup
+
+```bash
+cat /proc/<pid>/cgroup
+```
+
+Shows all cgroup controllers the process belongs to:
+```
+12:blkio:/user.slice
+11:memory:/user.slice/user-1000.slice
+...
+0::/user.slice/user-1000.slice/session-1.scope
+```
+
+On cgroup v2, the path after `::` is the cgroup path under `/sys/fs/cgroup/`.
+
+## Other Methods
+
+```bash
+# ps format options
+ps -o pid,cgroup -p <pid>
+
+# systemd systems
+systemd-cgls --unit <unit>
+systemd-cgls           # whole tree
+```
+
+## Quick One-Liners
+
+```bash
+cat /proc/self/cgroup  # current shell
+cat /proc/$$/cgroup    # also current shell
+cat /proc/1234/cgroup  # specific PID
+```
diff --git a/docs/background/proc-stat-in-containers.md b/docs/background/proc-stat-in-containers.md
new file mode 100644
index 0000000..9884b2a
--- /dev/null
+++ b/docs/background/proc-stat-in-containers.md
@@ -0,0 +1,238 @@
+# /proc/stat behavior in containerised environments
+
+`/proc/stat` in containers shows **host-level** statistics, not container-specific data. To get container-aware CPU metrics when processes span multiple cgroups (e.g., sidecars sharing a PID namespace), aggregate `/proc/[pid]/stat` for all visible processes and use cgroup limits from `/sys/fs/cgroup` for normalization.
+
+## Why /proc/stat is wrong in containers
+
+`/proc/stat` reports host-wide values (CPU times, context switches, boot time, process count) because `/proc` is mounted from the host kernel, which has no namespace awareness for these metrics.
+
+This means:
+- Tools reading `/proc/stat` (top, htop, etc.) show **host** CPU usage, not container usage
+- Cgroup CPU limits (e.g., 2 CPUs) are not reflected — all host CPUs are visible
+- In shared environments, containers see each other's aggregate impact
+
+### Alternatives
+
+| Approach | Description |
+|----------|-------------|
+| **cgroups** | Read `/sys/fs/cgroup/cpu/` for container-specific CPU accounting |
+| **LXCFS** | FUSE filesystem providing container-aware `/proc` files |
+| **Container runtimes** | Some (like Kata) use VMs with isolated kernels |
+| **Metrics APIs** | Docker/Kubernetes APIs instead of `/proc/stat` |
+
+```bash
+# cgroups v2:
+cat /sys/fs/cgroup/cpu.stat
+
+# cgroups v1:
+cat /sys/fs/cgroup/cpu/cpuacct.usage
+```
+
+## Aggregating per-Process CPU from /proc/[pid]/stat
+
+When cgroup-level reads aren't an option (sidecars sharing PID namespace with different cgroups), aggregate individual process stats:
+
+```bash
+# Fields 14 (utime) and 15 (stime) in /proc/[pid]/stat.
+# NOTE: $14/$15 assume the comm field contains no spaces; the scripts further
+# down strip everything up to ") " first, which is the robust way to parse stat.
+for pid in /proc/[0-9]*; do
+  awk '{print $14 + $15}' "$pid/stat" 2>/dev/null
+done | awk '{sum += $1} END {print sum}'
+```
+
+### Caveats
+
+1. **Race conditions** — processes can spawn/die between reads
+2. **Short-lived processes** — missed if they start and exit between samples
+3. **Zombie/exited processes** — their CPU time may not be captured
+4. **Overhead** — scanning all PIDs repeatedly is expensive
+5. **Namespace visibility** — you only see processes in your PID namespace (which is what you want)
+6. **Children accounting** — when a process exits, its CPU time is added to the parent's `cutime`/`cstime`, risking double-counting
+
+Cgroups handle these edge cases natively, but **cannot be used when sidecars share the PID namespace with different cgroups** — in that case, per-process aggregation is the best option.
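+
+The collector in this repository performs this aggregation in Go. A self-contained sketch of the idea (not the repository's actual implementation): it splits each stat line after the last `)`, so a `comm` containing spaces cannot shift the fields, and it deliberately ignores `cutime`/`cstime`:
+
+```go
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// totalTicks sums utime+stime (fields 14 and 15 of /proc/[pid]/stat) across
+// all visible processes.
+func totalTicks(procPath string) (uint64, error) {
+	dirs, err := filepath.Glob(filepath.Join(procPath, "[0-9]*"))
+	if err != nil {
+		return 0, err
+	}
+	var sum uint64
+	for _, dir := range dirs {
+		data, err := os.ReadFile(filepath.Join(dir, "stat"))
+		if err != nil {
+			continue // process exited between readdir and read
+		}
+		// Everything before the last ')' is "pid (comm"; the fields after it
+		// are at fixed positions, starting with the state character.
+		rest := string(data)
+		if i := strings.LastIndexByte(rest, ')'); i >= 0 {
+			rest = rest[i+1:]
+		}
+		f := strings.Fields(rest)
+		if len(f) < 13 {
+			continue
+		}
+		utime, _ := strconv.ParseUint(f[11], 10, 64) // field 14 overall
+		stime, _ := strconv.ParseUint(f[12], 10, 64) // field 15 overall
+		sum += utime + stime
+	}
+	return sum, nil
+}
+
+func main() {
+	ticks, err := totalTicks("/proc")
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println("total CPU ticks:", ticks)
+}
+```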
+ +## Parent/Child Process Relationships + +Field 4 in `/proc/[pid]/stat` is the PPID (parent process ID): + +```bash +awk '{print $4}' /proc/1234/stat # PPID from stat +grep PPid /proc/1234/status # more readable +``` + +### Building a Process Tree + +```bash +#!/bin/bash +declare -A parent_of children_of + +for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + pid="${stat#/proc/}"; pid="${pid%/stat}" + rest="${line##*) }"; read -ra fields <<< "$rest" + ppid="${fields[1]}" # 4th field overall = index 1 after state + parent_of[$pid]=$ppid + children_of[$ppid]+="$pid " + fi +done + +print_tree() { + local pid=$1 indent=$2 + echo "${indent}${pid}" + for child in ${children_of[$pid]}; do print_tree "$child" " $indent"; done +} +print_tree 1 "" +``` + +### Avoiding Double-Counting with cutime/cstime + +Only sum `utime` + `stime` per process. The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on — those children no longer exist in `/proc`, so their time is only accessible via the parent. + +```bash +#!/bin/bash +declare -A utime stime + +for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + pid="${stat#/proc/}"; pid="${pid%/stat}" + rest="${line##*) }"; read -ra f <<< "$rest" + utime[$pid]="${f[11]}"; stime[$pid]="${f[12]}" + # cutime=${f[13]} cstime=${f[14]} — don't sum these + fi +done + +total=0 +for pid in "${!utime[@]}"; do ((total += utime[$pid] + stime[$pid])); done +echo "Total CPU ticks: $total" +echo "Seconds: $(echo "scale=2; $total / $(getconf CLK_TCK)" | bc)" +``` + +## Converting Ticks to CPU Percentages + +CPU percentage is a rate — you need **two samples** over a time interval. + +``` +CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100 +``` + +- `delta_ticks` = difference in (utime + stime) between samples +- `CLK_TCK` = ticks per second (usually 100, get via `getconf CLK_TCK`) +- `num_cpus` = number of CPUs (omit for per-core percentage) + +| Style | Formula | Example | +|-------|---------|---------| +| **Normalized** (0-100%) | `delta / (elapsed * CLK_TCK * num_cpus) * 100` | 50% = half of total capacity | +| **Cores-style** (0-N*100%) | `delta / (elapsed * CLK_TCK) * 100` | 200% = 2 full cores busy | + +### Sampling Script + +```bash +#!/bin/bash +CLK_TCK=$(getconf CLK_TCK) +NUM_CPUS=$(nproc) + +get_total_ticks() { + local total=0 + for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + rest="${line##*) }"; read -ra f <<< "$rest" + ((total += f[11] + f[12])) + fi + done + echo "$total" +} + +ticks1=$(get_total_ticks); time1=$(date +%s.%N) +sleep 1 +ticks2=$(get_total_ticks); time2=$(date +%s.%N) + +delta_ticks=$((ticks2 - ticks1)) +elapsed=$(echo "$time2 - $time1" | bc) + +pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK * $NUM_CPUS)) * 100" | bc) +echo "CPU usage: ${pct}% of ${NUM_CPUS} CPUs" + +cores_pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK)) * 100" | bc) +echo "CPU usage: ${cores_pct}% (cores-style)" +``` + +## Respecting Cgroup CPU Limits + +The above calculations use `nproc`, which returns the **host** CPU count. If a container is limited to 2 CPUs on an 8-CPU host, `nproc` returns 8 and the percentage is misleading. 
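+
+Tying the last two sections together, a small Go helper that converts a tick delta into both percentage styles; it takes the CPU count as a parameter so a cgroup-aware value (derived as shown in the next subsection) can be plugged in instead of the host count. The 100 Hz tick rate is the common Linux value; portable code should query `sysconf(_SC_CLK_TCK)`:
+
+```go
+package main
+
+import "fmt"
+
+const clkTck = 100.0 // assumed; usually correct on Linux
+
+// cpuPercent converts a (utime+stime) tick delta over an elapsed interval
+// into normalized (0-100% of capacity) and cores-style (100% = 1 core) form.
+func cpuPercent(deltaTicks uint64, elapsedSec, numCPUs float64) (normalized, coresStyle float64) {
+	coresStyle = float64(deltaTicks) / (elapsedSec * clkTck) * 100
+	normalized = coresStyle / numCPUs
+	return
+}
+
+func main() {
+	// 400 ticks over 2 s on an 8-CPU host: 2 full cores busy.
+	norm, cores := cpuPercent(400, 2.0, 8)
+	fmt.Printf("normalized: %.1f%%, cores-style: %.1f%%\n", norm, cores) // 25.0%, 200.0%
+}
+```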
+
+### Reading Effective CPU Limit
+
+```bash
+#!/bin/bash
+get_effective_cpus() {
+  # cgroups v2
+  if [[ -f /sys/fs/cgroup/cpu.max ]]; then
+    read quota period < /sys/fs/cgroup/cpu.max
+    [[ "$quota" != "max" ]] && echo "scale=2; $quota / $period" | bc && return
+  fi
+  # cgroups v1
+  if [[ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ]]; then
+    quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us)
+    period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us)
+    [[ "$quota" != "-1" ]] && echo "scale=2; $quota / $period" | bc && return
+  fi
+  nproc  # fallback
+}
+```
+
+Also check cpuset limits (`cpuset.cpus.effective` for v2, `cpuset/cpuset.cpus` for v1) which restrict which physical CPUs are available.
+
+### Shared PID Namespace Complication
+
+When sidecars share a PID namespace but have different cgroups, there's no single "correct" CPU limit for normalization. Options:
+
+1. **Use host CPU count** — percentage of total host capacity
+2. **Sum the limits** — if you know each sidecar's cgroup, sum their quotas
+3. **Report in cores** — skip normalization, show `1.5 cores used` instead of percentage
+
+## Reading Cgroup Limits for Other Containers
+
+Every process exposes its cgroup membership via `/proc/<pid>/cgroup`. If the cgroup filesystem is mounted, you can read any container's limits:
+
+```bash
+get_cgroup_cpu_limit() {
+  local pid=$1
+  # cgroups v2
+  cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null)
+  if [[ -n "$cgroup_path" ]]; then
+    limit_file="/sys/fs/cgroup${cgroup_path}/cpu.max"
+    if [[ -r "$limit_file" ]]; then
+      read quota period < "$limit_file"
+      [[ "$quota" == "max" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc
+      return
+    fi
+  fi
+  # cgroups v1 — take only the first matching line; grep can match several
+  # controller lines (cpu, cpuacct, cpuset), and a multi-line path would
+  # break the file paths below
+  cgroup_path=$(grep -oP 'cpu.*:\K.*' /proc/$pid/cgroup 2>/dev/null | head -n1)
+  if [[ -n "$cgroup_path" ]]; then
+    quota_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_quota_us"
+    period_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_period_us"
+    if [[ -r "$quota_file" ]]; then
+      quota=$(cat "$quota_file"); period=$(cat "$period_file")
+      [[ "$quota" == "-1" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc
+      return
+    fi
+  fi
+  echo "unknown"
+}
+```
+
+### Mount Visibility
+
+| Scenario | Can Read Other Cgroups? |
+|----------|------------------------|
+| Host system | Yes |
+| Privileged container | Yes |
+| `/sys/fs/cgroup` mounted read-only from host | Yes (common in Kubernetes) |
+| Only own cgroup subtree mounted | No |
+
+### Fallbacks When Cgroups Aren't Accessible
+
+1. **Mount the cgroup fs** — volume mount `/sys/fs/cgroup:ro`
+2. **Use a sidecar with access** — one privileged container does monitoring
+3. **Accept "unknown" limits** — report raw ticks/cores instead of percentages
+4. 
**Kubernetes Downward API** — inject limits as env vars (own container only) diff --git a/go.mod b/go.mod index 3b4c218..7ef0cd0 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,15 @@ module edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector go 1.25.6 + +require ( + gorm.io/driver/sqlite v1.6.0 + gorm.io/gorm v1.31.1 +) + +require ( + github.com/jinzhu/inflection v1.0.0 // indirect + github.com/jinzhu/now v1.1.5 // indirect + github.com/mattn/go-sqlite3 v1.14.22 // indirect + golang.org/x/text v0.20.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..330dd09 --- /dev/null +++ b/go.sum @@ -0,0 +1,12 @@ +github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= +github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= +github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +gorm.io/driver/sqlite v1.6.0 h1:WHRRrIiulaPiPFmDcod6prc4l2VGVWHz80KspNsxSfQ= +gorm.io/driver/sqlite v1.6.0/go.mod h1:AO9V1qIQddBESngQUKWL9yoH93HIeA1X6V633rBwyT8= +gorm.io/gorm v1.31.1 h1:7CA8FTFz/gRfgqgpeKIBcervUn3xSyPUmr6B2WXJ7kg= +gorm.io/gorm v1.31.1/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs= diff --git a/internal/cgroup/config.go b/internal/cgroup/config.go new file mode 100644 index 0000000..536f61b --- /dev/null +++ b/internal/cgroup/config.go @@ -0,0 +1,84 @@ +// ABOUTME: Configuration types and parsing for cgroup limits and process mapping. +// ABOUTME: Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP environment variables. +package cgroup + +import ( + "encoding/json" + "fmt" + "os" +) + +// CgroupLimit holds the resource limits for a container/cgroup +type CgroupLimit struct { + CPUCores float64 // CPU limit in cores (e.g., 0.5 for "500m", 2.0 for "2") + MemoryBytes uint64 // Memory limit in bytes +} + +// CgroupLimits maps container names to their resource limits +type CgroupLimits map[string]CgroupLimit + +// ProcessMapping maps process names to container names (for cgroup path discovery) +type ProcessMapping map[string]string + +// CgroupPathMapping maps cgroup paths to container names (built at runtime) +type CgroupPathMapping map[string]string + +// rawLimitEntry is the JSON structure for each entry in CGROUP_LIMITS +type rawLimitEntry struct { + CPU string `json:"cpu"` + Memory string `json:"memory"` +} + +// ParseCgroupLimitsEnv parses the CGROUP_LIMITS environment variable. 
+// Expected format: {"container-name": {"cpu": "500m", "memory": "1Gi"}, ...} +func ParseCgroupLimitsEnv() (CgroupLimits, error) { + raw := os.Getenv("CGROUP_LIMITS") + if raw == "" { + return nil, nil // No limits configured + } + + var parsed map[string]rawLimitEntry + if err := json.Unmarshal([]byte(raw), &parsed); err != nil { + return nil, fmt.Errorf("parsing CGROUP_LIMITS: %w", err) + } + + limits := make(CgroupLimits) + for name, entry := range parsed { + var limit CgroupLimit + var err error + + if entry.CPU != "" { + limit.CPUCores, err = ParseCPU(entry.CPU) + if err != nil { + return nil, fmt.Errorf("parsing CPU for %q: %w", name, err) + } + } + + if entry.Memory != "" { + limit.MemoryBytes, err = ParseMemory(entry.Memory) + if err != nil { + return nil, fmt.Errorf("parsing memory for %q: %w", name, err) + } + } + + limits[name] = limit + } + + return limits, nil +} + +// ParseProcessMappingEnv parses the CGROUP_PROCESS_MAP environment variable. +// Expected format: {"process-name": "container-name", ...} +func ParseProcessMappingEnv() (ProcessMapping, error) { + raw := os.Getenv("CGROUP_PROCESS_MAP") + if raw == "" { + return nil, nil // No mapping configured + } + + var parsed map[string]string + if err := json.Unmarshal([]byte(raw), &parsed); err != nil { + return nil, fmt.Errorf("parsing CGROUP_PROCESS_MAP: %w", err) + } + + return ProcessMapping(parsed), nil +} diff --git a/internal/cgroup/parse.go b/internal/cgroup/parse.go new file mode 100644 index 0000000..e7d1a92 --- /dev/null +++ b/internal/cgroup/parse.go @@ -0,0 +1,96 @@ +// ABOUTME: Parses Kubernetes-style resource notation for CPU and memory. +// ABOUTME: CPU: "500m" = 0.5 cores, "2" = 2 cores. +// ABOUTME: Memory: "1Gi" = 1 GiB, "512Mi" = 512 MiB, "1G" = 1 GB. +package cgroup + +import ( + "fmt" + "strconv" + "strings" +) + +// ParseCPU parses Kubernetes CPU notation to cores. +// Examples: "500m" => 0.5, "2" => 2.0, "100m" => 0.1, "2000m" => 2.0 +func ParseCPU(value string) (float64, error) { + value = strings.TrimSpace(value) + if value == "" { + return 0, fmt.Errorf("empty CPU value") + } + + // Handle millicores suffix + if strings.HasSuffix(value, "m") { + millis, err := strconv.ParseFloat(strings.TrimSuffix(value, "m"), 64) + if err != nil { + return 0, fmt.Errorf("parsing millicores: %w", err) + } + return millis / 1000.0, nil + } + + // Plain number means cores + cores, err := strconv.ParseFloat(value, 64) + if err != nil { + return 0, fmt.Errorf("parsing cores: %w", err) + } + + return cores, nil +} + +// ParseMemory parses Kubernetes memory notation to bytes. 
+// Supports: +// - Binary suffixes: Ki, Mi, Gi, Ti (powers of 1024) +// - Decimal suffixes: K, M, G, T (powers of 1000) +// - Plain numbers: bytes +func ParseMemory(value string) (uint64, error) { + value = strings.TrimSpace(value) + if value == "" { + return 0, fmt.Errorf("empty memory value") + } + + // Binary suffixes (powers of 1024) + binarySuffixes := map[string]uint64{ + "Ki": 1024, + "Mi": 1024 * 1024, + "Gi": 1024 * 1024 * 1024, + "Ti": 1024 * 1024 * 1024 * 1024, + } + + // Decimal suffixes (powers of 1000) + decimalSuffixes := map[string]uint64{ + "K": 1000, + "M": 1000 * 1000, + "G": 1000 * 1000 * 1000, + "T": 1000 * 1000 * 1000 * 1000, + } + + // Try binary suffixes first (2-char) + for suffix, multiplier := range binarySuffixes { + if strings.HasSuffix(value, suffix) { + numStr := strings.TrimSuffix(value, suffix) + num, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("parsing memory value: %w", err) + } + return uint64(num * float64(multiplier)), nil + } + } + + // Try decimal suffixes (1-char) + for suffix, multiplier := range decimalSuffixes { + if strings.HasSuffix(value, suffix) { + numStr := strings.TrimSuffix(value, suffix) + num, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("parsing memory value: %w", err) + } + return uint64(num * float64(multiplier)), nil + } + } + + // Plain number (bytes) + bytes, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return 0, fmt.Errorf("parsing bytes: %w", err) + } + + return bytes, nil +} diff --git a/internal/cgroup/parse_test.go b/internal/cgroup/parse_test.go new file mode 100644 index 0000000..2451163 --- /dev/null +++ b/internal/cgroup/parse_test.go @@ -0,0 +1,84 @@ +package cgroup + +import ( + "testing" +) + +func TestParseCPU(t *testing.T) { + tests := []struct { + name string + input string + want float64 + wantErr bool + }{ + {"millicores 500m", "500m", 0.5, false}, + {"millicores 100m", "100m", 0.1, false}, + {"millicores 2000m", "2000m", 2.0, false}, + {"millicores 1m", "1m", 0.001, false}, + {"cores integer", "2", 2.0, false}, + {"cores decimal", "1.5", 1.5, false}, + {"cores with spaces", " 2 ", 2.0, false}, + {"empty string", "", 0, true}, + {"invalid format", "abc", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseCPU(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("ParseCPU() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got != tt.want { + t.Errorf("ParseCPU() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestParseMemory(t *testing.T) { + tests := []struct { + name string + input string + want uint64 + wantErr bool + }{ + // Binary suffixes (powers of 1024) + {"Ki", "1Ki", 1024, false}, + {"Mi", "1Mi", 1024 * 1024, false}, + {"Gi", "1Gi", 1024 * 1024 * 1024, false}, + {"Ti", "1Ti", 1024 * 1024 * 1024 * 1024, false}, + {"512Mi", "512Mi", 512 * 1024 * 1024, false}, + {"2Gi", "2Gi", 2 * 1024 * 1024 * 1024, false}, + + // Decimal suffixes (powers of 1000) + {"K", "1K", 1000, false}, + {"M", "1M", 1000000, false}, + {"G", "1G", 1000000000, false}, + {"T", "1T", 1000000000000, false}, + + // Plain bytes + {"bytes", "1024", 1024, false}, + {"large bytes", "1073741824", 1073741824, false}, + + // With spaces + {"with spaces", " 1Gi ", 1024 * 1024 * 1024, false}, + + // Error cases + {"empty", "", 0, true}, + {"invalid", "abc", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseMemory(tt.input) + if (err 
!= nil) != tt.wantErr { + t.Errorf("ParseMemory() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got != tt.want { + t.Errorf("ParseMemory() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/internal/collector/collector.go b/internal/collector/collector.go index cbc5683..393f1ae 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -8,6 +8,7 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) // Config holds the collector configuration @@ -19,22 +20,36 @@ type Config struct { // Collector orchestrates metric collection type Collector struct { - config Config - aggregator *metrics.Aggregator - writer output.Writer - logger *slog.Logger + config Config + aggregator *metrics.Aggregator + writer output.Writer + logger *slog.Logger + accumulator *summary.Accumulator + summaryWriter *summary.SummaryWriter + pushClient *summary.PushClient } // New creates a new collector func New(cfg Config, writer output.Writer, logger *slog.Logger) *Collector { return &Collector{ - config: cfg, - aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), - writer: writer, - logger: logger, + config: cfg, + aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), + writer: writer, + logger: logger, + accumulator: summary.NewAccumulator(cfg.TopN), } } +// SetSummaryWriter attaches a summary writer for emitting run summaries on shutdown +func (c *Collector) SetSummaryWriter(w *summary.SummaryWriter) { + c.summaryWriter = w +} + +// SetPushClient attaches a push client for sending summaries to the receiver +func (c *Collector) SetPushClient(p *summary.PushClient) { + c.pushClient = p +} + // Run starts the collector loop and blocks until context is cancelled func (c *Collector) Run(ctx context.Context) error { c.logger.Info("collector starting", @@ -55,6 +70,7 @@ func (c *Collector) Run(ctx context.Context) error { select { case <-ctx.Done(): c.logger.Info("collector stopping") + c.emitSummary(context.Background()) // Use fresh context for shutdown tasks return ctx.Err() case <-ticker.C: if err := c.collect(); err != nil { @@ -75,9 +91,37 @@ func (c *Collector) collect() error { return fmt.Errorf("writing metrics: %w", err) } + c.accumulator.Add(m) + return nil } +// emitSummary computes and writes the run summary if a writer is configured +func (c *Collector) emitSummary(ctx context.Context) { + s := c.accumulator.Summarize() + if s == nil { + c.logger.Info("no samples collected, skipping run summary") + return + } + + c.logger.Info("emitting run summary", + slog.Int("sample_count", s.SampleCount), + slog.Float64("duration_seconds", s.DurationSeconds), + ) + + if c.summaryWriter != nil { + c.summaryWriter.Write(s) + } + + if c.pushClient != nil { + if err := c.pushClient.Push(ctx, s); err != nil { + c.logger.Error("failed to push metrics", slog.String("error", err.Error())) + } else { + c.logger.Info("metrics pushed successfully") + } + } +} + // CollectOnce performs a single collection and returns the metrics func (c *Collector) CollectOnce() (*metrics.SystemMetrics, error) { return c.aggregator.Collect() diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go new file mode 100644 index 0000000..9251a51 --- /dev/null +++ b/internal/collector/collector_test.go @@ -0,0 +1,98 @@ +// ABOUTME: Tests for the 
collector's summary integration. +// ABOUTME: Validates that run summaries are emitted on shutdown and handles missing writer gracefully. +package collector + +import ( + "bytes" + "context" + "log/slog" + "strings" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestCollector_EmitsSummaryOnShutdown(t *testing.T) { + // Use testdata/proc as the proc filesystem + procPath := "testdata/proc" + + // Metrics output (regular collection output) + var metricsOut bytes.Buffer + metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ + Output: &metricsOut, + Format: output.LogFormatJSON, + Level: slog.LevelInfo, + }) + + // Summary output + var summaryOut bytes.Buffer + sw := summary.NewSummaryWriter(&summaryOut, "json") + + // Silence app logs + appLogger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + + c := New(Config{ + ProcPath: procPath, + Interval: 50 * time.Millisecond, + TopN: 5, + }, metricsWriter, appLogger) + + c.SetSummaryWriter(sw) + + // Run collector briefly then cancel + ctx, cancel := context.WithCancel(context.Background()) + go func() { + // Let at least 2 collection cycles run + time.Sleep(150 * time.Millisecond) + cancel() + }() + + _ = c.Run(ctx) + + // Verify summary was emitted + summaryOutput := summaryOut.String() + if !strings.Contains(summaryOutput, "run_summary") { + t.Errorf("expected 'run_summary' in output, got: %s", summaryOutput) + } + if !strings.Contains(summaryOutput, "duration_seconds") { + t.Errorf("expected 'duration_seconds' in output, got: %s", summaryOutput) + } + if !strings.Contains(summaryOutput, "sample_count") { + t.Errorf("expected 'sample_count' in output, got: %s", summaryOutput) + } +} + +func TestCollector_NoSummaryWithoutWriter(t *testing.T) { + procPath := "testdata/proc" + + var metricsOut bytes.Buffer + metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ + Output: &metricsOut, + Format: output.LogFormatJSON, + Level: slog.LevelInfo, + }) + + appLogger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + + c := New(Config{ + ProcPath: procPath, + Interval: 50 * time.Millisecond, + TopN: 5, + }, metricsWriter, appLogger) + + // Deliberately do NOT set a summary writer + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(100 * time.Millisecond) + cancel() + }() + + // Should not panic + err := c.Run(ctx) + if err != context.Canceled { + t.Errorf("expected context.Canceled, got: %v", err) + } +} diff --git a/internal/collector/testdata/proc/1/stat b/internal/collector/testdata/proc/1/stat new file mode 100644 index 0000000..01d6595 --- /dev/null +++ b/internal/collector/testdata/proc/1/stat @@ -0,0 +1 @@ +1 (init) S 0 1 1 0 -1 4194560 1000 0 0 0 100 50 0 0 20 0 1 0 1 10000000 500 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/internal/collector/testdata/proc/1/status b/internal/collector/testdata/proc/1/status new file mode 100644 index 0000000..0b4d9e3 --- /dev/null +++ b/internal/collector/testdata/proc/1/status @@ -0,0 +1,14 @@ +Name: init +Uid: 0 0 0 0 +Gid: 0 0 0 0 +VmPeak: 10000 kB +VmSize: 10000 kB +VmRSS: 5000 kB +VmData: 3000 kB +VmStk: 200 kB +VmExe: 100 kB +VmLib: 1000 kB +RssAnon: 3000 kB +RssFile: 1500 kB +RssShmem: 500 kB +Threads: 1 diff --git a/internal/collector/testdata/proc/cpuinfo b/internal/collector/testdata/proc/cpuinfo new file mode 100644 index 0000000..9703a4e --- /dev/null 
+++ b/internal/collector/testdata/proc/cpuinfo @@ -0,0 +1 @@ +processor : 0 diff --git a/internal/collector/testdata/proc/meminfo b/internal/collector/testdata/proc/meminfo new file mode 100644 index 0000000..2993ff4 --- /dev/null +++ b/internal/collector/testdata/proc/meminfo @@ -0,0 +1,5 @@ +MemTotal: 16348500 kB +MemFree: 8000000 kB +MemAvailable: 12000000 kB +Buffers: 500000 kB +Cached: 3000000 kB diff --git a/internal/collector/testdata/proc/stat b/internal/collector/testdata/proc/stat new file mode 100644 index 0000000..513d56a --- /dev/null +++ b/internal/collector/testdata/proc/stat @@ -0,0 +1 @@ +cpu 10000 500 3000 80000 200 50 30 0 0 0 diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go new file mode 100644 index 0000000..685d2b6 --- /dev/null +++ b/internal/integration/integration_test.go @@ -0,0 +1,386 @@ +// ABOUTME: Integration tests for collector and receiver interaction. +// ABOUTME: Tests that the push client can successfully send metrics to the receiver. +package integration + +import ( + "bytes" + "context" + "encoding/json" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "path/filepath" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +const ( + testReadToken = "integration-test-token" + testHMACKey = "integration-hmac-key" +) + +// setupTestReceiver creates a test receiver with SQLite storage, auth, and HTTP server +func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := receiver.NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), testReadToken, testHMACKey) + mux := http.NewServeMux() + handler.RegisterRoutes(mux) + + server := httptest.NewServer(mux) + + cleanup := func() { + server.Close() + _ = store.Close() + } + + return store, server, cleanup +} + +// generatePushToken generates a scoped push token for an execution context +func generatePushToken(exec summary.ExecutionContext) string { + return receiver.GenerateScopedToken(testHMACKey, exec.Organization, exec.Repository, exec.Workflow, exec.Job) +} + +func TestPushClientToReceiver(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Test execution context + testCtx := summary.ExecutionContext{ + Organization: "integration-org", + Repository: "integration-repo", + Workflow: "test.yml", + Job: "integration-test", + RunID: "run-integration-123", + } + + // Create a test summary + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-time.Minute), + EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 10, + CPUTotal: summary.StatSummary{Peak: 85.5, Avg: 42.3, P95: 78.0}, + MemUsedBytes: summary.StatSummary{Peak: 4294967296, Avg: 2147483648, P95: 3865470566}, + MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + TopCPUProcesses: []summary.ProcessPeak{ + {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, + }, + TopMemProcesses: []summary.ProcessPeak{ + {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, + }, + } + + // Build payload matching what push client sends + payload := struct { + Execution summary.ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` 
+ }{ + Execution: testCtx, + Summary: *testSummary, + } + + body, err := json.Marshal(payload) + if err != nil { + t.Fatalf("Marshal() error = %v", err) + } + + // Send via HTTP client with scoped push token + pushToken := generatePushToken(testCtx) + req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) + if err != nil { + t.Fatalf("NewRequest() error = %v", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("Do() error = %v", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusCreated { + t.Errorf("status = %d, want %d", resp.StatusCode, http.StatusCreated) + } + + // Verify metrics were stored + metrics, err := store.GetMetricsByWorkflowJob("integration-org", "integration-repo", "test.yml", "integration-test") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + + m := metrics[0] + if m.Organization != testCtx.Organization { + t.Errorf("Organization = %q, want %q", m.Organization, testCtx.Organization) + } + if m.Repository != testCtx.Repository { + t.Errorf("Repository = %q, want %q", m.Repository, testCtx.Repository) + } + if m.Workflow != testCtx.Workflow { + t.Errorf("Workflow = %q, want %q", m.Workflow, testCtx.Workflow) + } + if m.Job != testCtx.Job { + t.Errorf("Job = %q, want %q", m.Job, testCtx.Job) + } + if m.RunID != testCtx.RunID { + t.Errorf("RunID = %q, want %q", m.RunID, testCtx.RunID) + } + + // Verify payload was stored correctly + var storedSummary summary.RunSummary + if err := json.Unmarshal([]byte(m.Payload), &storedSummary); err != nil { + t.Fatalf("Unmarshal payload error = %v", err) + } + + if storedSummary.SampleCount != testSummary.SampleCount { + t.Errorf("SampleCount = %d, want %d", storedSummary.SampleCount, testSummary.SampleCount) + } + if storedSummary.CPUTotal.Peak != testSummary.CPUTotal.Peak { + t.Errorf("CPUTotal.Peak = %f, want %f", storedSummary.CPUTotal.Peak, testSummary.CPUTotal.Peak) + } +} + +func TestPushClientIntegration(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Set environment variables for the push client + t.Setenv("GITHUB_REPOSITORY_OWNER", "push-client-org") + t.Setenv("GITHUB_REPOSITORY", "push-client-repo") + t.Setenv("GITHUB_WORKFLOW", "push-test.yml") + t.Setenv("GITHUB_JOB", "push-job") + t.Setenv("GITHUB_RUN_ID", "push-run-456") + + // Generate scoped push token + pushToken := receiver.GenerateScopedToken(testHMACKey, "push-client-org", "push-client-repo", "push-test.yml", "push-job") + + // Create push client with token - it reads execution context from env vars + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", pushToken) + + // Verify execution context was read from env + ctx := pushClient.ExecutionContext() + if ctx.Organization != "push-client-org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "push-client-org") + } + + // Create and push a summary + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-30 * time.Second), + EndTime: time.Now(), + DurationSeconds: 30.0, + SampleCount: 6, + CPUTotal: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + MemUsedBytes: summary.StatSummary{Peak: 1000000, Avg: 500000, P95: 900000}, + MemUsedPercent: summary.StatSummary{Peak: 10.0, Avg: 5.0, P95: 9.0}, + } + + // Push 
the summary + err := pushClient.Push(context.Background(), testSummary) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + + // Verify it was stored + metrics, err := store.GetMetricsByWorkflowJob("push-client-org", "push-client-repo", "push-test.yml", "push-job") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + + if metrics[0].RunID != "push-run-456" { + t.Errorf("RunID = %q, want %q", metrics[0].RunID, "push-run-456") + } +} + +func TestMultiplePushes(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Simulate multiple workflow runs pushing metrics via direct HTTP POST + runs := []summary.ExecutionContext{ + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-2"}, + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "test", RunID: "run-1"}, + {Organization: "org-a", Repository: "repo-2", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, + } + + for _, execCtx := range runs { + payload := struct { + Execution summary.ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` + }{ + Execution: execCtx, + Summary: summary.RunSummary{ + SampleCount: 5, + CPUTotal: summary.StatSummary{Peak: 50.0}, + }, + } + + body, err := json.Marshal(payload) + if err != nil { + t.Fatalf("Marshal() error = %v", err) + } + + pushToken := generatePushToken(execCtx) + req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) + if err != nil { + t.Fatalf("NewRequest() error = %v for run %+v", err, execCtx) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("Do() error = %v for run %+v", err, execCtx) + } + _ = resp.Body.Close() + + if resp.StatusCode != http.StatusCreated { + t.Fatalf("status = %d, want %d for run %+v", resp.StatusCode, http.StatusCreated, execCtx) + } + } + + // Verify filtering works correctly + metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 2 { + t.Errorf("got %d metrics for org-a/repo-1/ci.yml/build, want 2", len(metrics)) + } + + metrics, err = store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "test") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 1 { + t.Errorf("got %d metrics for org-a/repo-1/ci.yml/test, want 1", len(metrics)) + } +} + +func TestPushClientWithTokenIntegration(t *testing.T) { + readToken := "integration-read-secret" + hmacKey := "integration-hmac-secret" + store, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) + defer cleanup() + + // Generate a scoped token via the API + tokenReqBody, _ := json.Marshal(map[string]string{ + "organization": "token-org", + "repository": "token-repo", + "workflow": "ci.yml", + "job": "build", + }) + tokenReq, _ := http.NewRequest(http.MethodPost, server.URL+"/api/v1/token", bytes.NewReader(tokenReqBody)) + tokenReq.Header.Set("Authorization", "Bearer "+readToken) + tokenReq.Header.Set("Content-Type", "application/json") + + tokenResp, err := http.DefaultClient.Do(tokenReq) + if err != nil { + t.Fatalf("token 
request error: %v", err) + } + defer func() { _ = tokenResp.Body.Close() }() + + if tokenResp.StatusCode != http.StatusOK { + t.Fatalf("token request status = %d, want %d", tokenResp.StatusCode, http.StatusOK) + } + + var tokenBody struct { + Token string `json:"token"` + } + if err := json.NewDecoder(tokenResp.Body).Decode(&tokenBody); err != nil { + t.Fatalf("decode token response: %v", err) + } + + // Use the scoped token to push metrics + t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") + t.Setenv("GITHUB_REPOSITORY", "token-repo") + t.Setenv("GITHUB_WORKFLOW", "ci.yml") + t.Setenv("GITHUB_JOB", "build") + t.Setenv("GITHUB_RUN_ID", "token-run-1") + + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", tokenBody.Token) + + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-10 * time.Second), + EndTime: time.Now(), + DurationSeconds: 10.0, + SampleCount: 2, + } + + if err := pushClient.Push(context.Background(), testSummary); err != nil { + t.Fatalf("Push() error = %v", err) + } + + // Verify stored + metrics, err := store.GetMetricsByWorkflowJob("token-org", "token-repo", "ci.yml", "build") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + if metrics[0].RunID != "token-run-1" { + t.Errorf("RunID = %q, want %q", metrics[0].RunID, "token-run-1") + } +} + +func TestPushClientWithWrongTokenIntegration(t *testing.T) { + readToken := "integration-read-secret" + hmacKey := "integration-hmac-secret" + _, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) + defer cleanup() + + t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") + t.Setenv("GITHUB_REPOSITORY", "token-repo") + t.Setenv("GITHUB_WORKFLOW", "ci.yml") + t.Setenv("GITHUB_JOB", "build") + t.Setenv("GITHUB_RUN_ID", "token-run-2") + + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", "wrong-token") + + err := pushClient.Push(context.Background(), &summary.RunSummary{SampleCount: 1}) + if err == nil { + t.Error("Push() with wrong token should fail") + } +} + +func setupTestReceiverWithToken(t *testing.T, readToken, hmacKey string) (*receiver.Store, *httptest.Server, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := receiver.NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), readToken, hmacKey) + mux := http.NewServeMux() + handler.RegisterRoutes(mux) + + server := httptest.NewServer(mux) + + cleanup := func() { + server.Close() + _ = store.Close() + } + + return store, server, cleanup +} diff --git a/internal/metrics/aggregator.go b/internal/metrics/aggregator.go index 1e33646..2e7c18e 100644 --- a/internal/metrics/aggregator.go +++ b/internal/metrics/aggregator.go @@ -4,23 +4,37 @@ import ( "sort" "time" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/proc" ) // Aggregator collects and aggregates metrics from processes type Aggregator struct { - procPath string - topN int - prevCPU *CPUSnapshot - prevProcCPU map[int]*ProcessCPUSnapshot + procPath string + topN int + prevCPU *CPUSnapshot + prevProcCPU map[int]*ProcessCPUSnapshot + cgroupLimits cgroup.CgroupLimits // container name -> limits + processMapping cgroup.ProcessMapping // process name -> container name + cgroupPathMapping cgroup.CgroupPathMapping // cgroup 
path -> container name (built at runtime) + prevCgroupCPU map[string]uint64 // container name -> previous total ticks + prevCgroupTime time.Time // previous collection time for cgroup CPU calc } // NewAggregator creates a new metrics aggregator func NewAggregator(procPath string, topN int) *Aggregator { + // Parse cgroup configuration from environment + limits, _ := cgroup.ParseCgroupLimitsEnv() + processMap, _ := cgroup.ParseProcessMappingEnv() + return &Aggregator{ - procPath: procPath, - topN: topN, - prevProcCPU: make(map[int]*ProcessCPUSnapshot), + procPath: procPath, + topN: topN, + prevProcCPU: make(map[int]*ProcessCPUSnapshot), + cgroupLimits: limits, + processMapping: processMap, + cgroupPathMapping: make(cgroup.CgroupPathMapping), + prevCgroupCPU: make(map[string]uint64), } } @@ -77,6 +91,12 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { return float64(p.MemRSS) }) + // Discover cgroup path mappings from known processes + a.discoverCgroupMappings(processes) + + // Calculate per-cgroup metrics + cgroupMetrics := a.calculateCgroupMetrics(processes, processMetrics, now) + return &SystemMetrics{ Timestamp: now, TotalProcesses: len(processes), @@ -84,6 +104,7 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { Memory: memMetrics, TopCPU: topCPU, TopMemory: topMemory, + Cgroups: cgroupMetrics, }, nil } @@ -158,6 +179,11 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now state = "?" } + cgroupPath := "" + if p.Cgroup != nil { + cgroupPath = p.Cgroup.Path + } + metrics = append(metrics, ProcessMetrics{ PID: pid, Name: p.Status.Name, @@ -166,6 +192,7 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now MemVirtual: p.Status.VmSize, Threads: p.Status.Threads, State: state, + CgroupPath: cgroupPath, }) } @@ -223,3 +250,152 @@ func (a *Aggregator) getTopByMetric(metrics []ProcessMetrics, getValue func(Proc return sorted[:n] } + +// discoverCgroupMappings discovers cgroup path to container name mappings +// by looking for processes that match the configured process mapping. +func (a *Aggregator) discoverCgroupMappings(processes []*proc.ProcessInfo) { + if len(a.processMapping) == 0 { + return + } + + for _, p := range processes { + if p.Cgroup == nil || p.Cgroup.Path == "" { + continue + } + + // Check if this process name is in our mapping + if containerName, ok := a.processMapping[p.Status.Name]; ok { + // Map this cgroup path to the container name + a.cgroupPathMapping[p.Cgroup.Path] = containerName + } + } +} + +// resolveContainerName returns the container name for a cgroup path, +// or the raw path if no mapping exists. +func (a *Aggregator) resolveContainerName(cgroupPath string) string { + if name, ok := a.cgroupPathMapping[cgroupPath]; ok { + return name + } + // Use raw path as fallback + if cgroupPath == "" { + return "" + } + return cgroupPath +} + +// calculateCgroupMetrics computes metrics grouped by container/cgroup. 
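+// CPU is derived from the summed tick deltas of the group's processes:
+//
+//	usedCores = deltaTicks / (elapsedSeconds * CLK_TCK)
+//
+// e.g. (illustrative numbers) 150 delta ticks over 2s at CLK_TCK=100 ≈ 0.75 cores.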
+func (a *Aggregator) calculateCgroupMetrics( + processes []*proc.ProcessInfo, + processMetrics []ProcessMetrics, + now time.Time, +) map[string]*CgroupMetrics { + // Build lookup from PID to ProcessMetrics + pmByPID := make(map[int]ProcessMetrics) + for _, pm := range processMetrics { + pmByPID[pm.PID] = pm + } + + // Group processes by container name + type cgroupData struct { + cgroupPath string + procs []*proc.ProcessInfo + metrics []ProcessMetrics + } + containerGroups := make(map[string]*cgroupData) + + for _, p := range processes { + cgroupPath := "" + if p.Cgroup != nil { + cgroupPath = p.Cgroup.Path + } + + containerName := a.resolveContainerName(cgroupPath) + + if _, ok := containerGroups[containerName]; !ok { + containerGroups[containerName] = &cgroupData{ + cgroupPath: cgroupPath, + } + } + + containerGroups[containerName].procs = append(containerGroups[containerName].procs, p) + + if pm, ok := pmByPID[p.Stat.PID]; ok { + containerGroups[containerName].metrics = append(containerGroups[containerName].metrics, pm) + } + } + + // Calculate elapsed time since last collection + elapsed := 0.0 + if !a.prevCgroupTime.IsZero() { + elapsed = now.Sub(a.prevCgroupTime).Seconds() + } + a.prevCgroupTime = now + + // Calculate metrics for each container + result := make(map[string]*CgroupMetrics) + + for containerName, data := range containerGroups { + // Sum CPU ticks (utime + stime only, not cutime/cstime) + var totalTicks uint64 + var totalRSS uint64 + + for _, p := range data.procs { + totalTicks += p.Stat.TotalTime() + totalRSS += p.Status.VmRSS + } + + // Calculate CPU cores used from delta + usedCores := 0.0 + hasDelta := false + if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 { + // Guard against underflow: if processes exited and new ones started, + // totalTicks could be less than prev. In that case, skip this sample. 
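+			// hasDelta stays false when we skip, so downstream consumers (e.g. the
+			// summary accumulator) drop the sample instead of recording a bogus value.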
+ if totalTicks >= prev { + deltaTicks := totalTicks - prev + // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) + usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) + hasDelta = true + } + } + a.prevCgroupCPU[containerName] = totalTicks + + // Calculate percentages against limits if available + cpuPercent := 0.0 + memPercent := 0.0 + var limitCores float64 + var limitBytes uint64 + + if limit, ok := a.cgroupLimits[containerName]; ok { + limitCores = limit.CPUCores + limitBytes = limit.MemoryBytes + if limit.CPUCores > 0 { + cpuPercent = (usedCores / limit.CPUCores) * 100 + } + if limit.MemoryBytes > 0 { + memPercent = (float64(totalRSS) / float64(limit.MemoryBytes)) * 100 + } + } + + result[containerName] = &CgroupMetrics{ + Name: containerName, + CgroupPath: data.cgroupPath, + CPU: CgroupCPUMetrics{ + TotalTicks: totalTicks, + UsedCores: usedCores, + UsedPercent: cpuPercent, + LimitCores: limitCores, + HasDelta: hasDelta, + }, + Memory: CgroupMemoryMetrics{ + TotalRSSBytes: totalRSS, + UsedPercent: memPercent, + LimitBytes: limitBytes, + }, + Processes: data.metrics, + NumProcs: len(data.procs), + } + } + + return result +} diff --git a/internal/metrics/types.go b/internal/metrics/types.go index 00be63f..9aa9cb2 100644 --- a/internal/metrics/types.go +++ b/internal/metrics/types.go @@ -11,6 +11,7 @@ type ProcessMetrics struct { MemVirtual uint64 `json:"mem_virtual_bytes"` Threads int `json:"threads"` State string `json:"state"` + CgroupPath string `json:"cgroup_path,omitempty"` } // CPUMetrics holds aggregated CPU metrics @@ -35,12 +36,39 @@ type MemoryMetrics struct { // SystemMetrics holds a complete snapshot of system metrics type SystemMetrics struct { - Timestamp time.Time `json:"timestamp"` - TotalProcesses int `json:"total_processes"` - CPU CPUMetrics `json:"cpu"` - Memory MemoryMetrics `json:"memory"` - TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` - TopMemory []ProcessMetrics `json:"top_memory,omitempty"` + Timestamp time.Time `json:"timestamp"` + TotalProcesses int `json:"total_processes"` + CPU CPUMetrics `json:"cpu"` + Memory MemoryMetrics `json:"memory"` + TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` + TopMemory []ProcessMetrics `json:"top_memory,omitempty"` + Cgroups map[string]*CgroupMetrics `json:"cgroups,omitempty"` +} + +// CgroupCPUMetrics holds CPU metrics for a single cgroup/container +type CgroupCPUMetrics struct { + TotalTicks uint64 `json:"total_ticks"` + UsedCores float64 `json:"used_cores"` + UsedPercent float64 `json:"used_percent,omitempty"` + LimitCores float64 `json:"limit_cores,omitempty"` + HasDelta bool `json:"-"` // true when a valid delta could be computed +} + +// CgroupMemoryMetrics holds memory metrics for a single cgroup/container +type CgroupMemoryMetrics struct { + TotalRSSBytes uint64 `json:"total_rss_bytes"` + UsedPercent float64 `json:"used_percent,omitempty"` + LimitBytes uint64 `json:"limit_bytes,omitempty"` +} + +// CgroupMetrics holds all metrics for a single cgroup/container +type CgroupMetrics struct { + Name string `json:"name"` + CgroupPath string `json:"cgroup_path"` + CPU CgroupCPUMetrics `json:"cpu"` + Memory CgroupMemoryMetrics `json:"memory"` + Processes []ProcessMetrics `json:"processes"` + NumProcs int `json:"num_processes"` } // CPUSnapshot holds CPU timing data for calculating percentages between intervals diff --git a/internal/output/logger.go b/internal/output/logger.go index 0ce5ddd..a5933fd 100644 --- a/internal/output/logger.go +++ b/internal/output/logger.go @@ -1,6 
+1,7 @@ package output import ( + "context" "io" "log/slog" "os" @@ -53,29 +54,44 @@ func NewLoggerWriter(cfg LoggerConfig) *LoggerWriter { } } +// topCPUEntry is a lightweight struct for JSON serialization of top CPU processes +type topCPUEntry struct { + PID int `json:"pid"` + Name string `json:"name"` + CPUPercent float64 `json:"cpu_percent"` +} + +// topMemEntry is a lightweight struct for JSON serialization of top memory processes +type topMemEntry struct { + PID int `json:"pid"` + Name string `json:"name"` + RSSBytes uint64 `json:"rss_bytes"` +} + // Write outputs the metrics using structured logging func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { - // Build top CPU process attrs - topCPUAttrs := make([]any, 0, len(m.TopCPU)) + // Build top CPU process entries + topCPU := make([]topCPUEntry, 0, len(m.TopCPU)) for _, p := range m.TopCPU { - topCPUAttrs = append(topCPUAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Float64("cpu_percent", p.CPUPercent), - )) + topCPU = append(topCPU, topCPUEntry{ + PID: p.PID, + Name: p.Name, + CPUPercent: p.CPUPercent, + }) } - // Build top memory process attrs - topMemAttrs := make([]any, 0, len(m.TopMemory)) + // Build top memory process entries + topMem := make([]topMemEntry, 0, len(m.TopMemory)) for _, p := range m.TopMemory { - topMemAttrs = append(topMemAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Uint64("rss_bytes", p.MemRSS), - )) + topMem = append(topMem, topMemEntry{ + PID: p.PID, + Name: p.Name, + RSSBytes: p.MemRSS, + }) } - w.logger.Info("metrics_collected", + // Build base attributes + attrs := []slog.Attr{ slog.Time("collection_time", m.Timestamp), slog.Int("total_processes", m.TotalProcesses), slog.Group("cpu", @@ -94,9 +110,16 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { slog.Uint64("total_rss_bytes", m.Memory.TotalRSSBytes), slog.Float64("rss_percent", m.Memory.RSSPercent), ), - slog.Any("top_cpu", topCPUAttrs), - slog.Any("top_memory", topMemAttrs), - ) + slog.Any("top_cpu", topCPU), + slog.Any("top_memory", topMem), + } + + // Add cgroups if present + if len(m.Cgroups) > 0 { + attrs = append(attrs, slog.Any("cgroups", m.Cgroups)) + } + + w.logger.LogAttrs(context.Background(), slog.LevelInfo, "metrics_collected", attrs...) return nil } diff --git a/internal/proc/cgroup.go b/internal/proc/cgroup.go new file mode 100644 index 0000000..6e1597f --- /dev/null +++ b/internal/proc/cgroup.go @@ -0,0 +1,59 @@ +// ABOUTME: Reads cgroup information from /proc/[pid]/cgroup. +// ABOUTME: Supports both cgroup v1 and v2 formats. 
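+//
+// Example /proc/[pid]/cgroup lines (abridged):
+//
+//	0::/kubepods/pod-abc/container-123   (cgroup v2, unified)
+//	11:memory:/docker/abc123             (cgroup v1, memory controller)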
+package proc + +import ( + "bufio" + "fmt" + "os" + "strings" +) + +// CgroupInfo holds cgroup information for a process +type CgroupInfo struct { + Path string // The cgroup path (unified for v2, or from memory controller for v1) +} + +// ReadCgroup reads /proc/[pid]/cgroup and extracts the cgroup path +func ReadCgroup(procPath string, pid int) (*CgroupInfo, error) { + path := fmt.Sprintf("%s/%d/cgroup", procPath, pid) + + file, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("opening cgroup file: %w", err) + } + defer func() { _ = file.Close() }() + + var cgroupPath string + scanner := bufio.NewScanner(file) + + for scanner.Scan() { + line := scanner.Text() + + // Try cgroup v2 first (unified hierarchy) + // Format: 0::/path + if path, found := strings.CutPrefix(line, "0::"); found { + cgroupPath = path + break + } + + // Fall back to cgroup v1 - look for memory controller + // Format: X:memory:/path or X:memory,other:/path + parts := strings.SplitN(line, ":", 3) + if len(parts) == 3 { + controllers := parts[1] + if strings.Contains(controllers, "memory") { + cgroupPath = parts[2] + // Don't break - prefer v2 if found later + } + } + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scanning cgroup file: %w", err) + } + + return &CgroupInfo{ + Path: cgroupPath, + }, nil +} diff --git a/internal/proc/cgroup_test.go b/internal/proc/cgroup_test.go new file mode 100644 index 0000000..e82b6c8 --- /dev/null +++ b/internal/proc/cgroup_test.go @@ -0,0 +1,97 @@ +package proc + +import ( + "os" + "path/filepath" + "testing" +) + +func TestReadCgroup(t *testing.T) { + tests := []struct { + name string + cgroupFile string + wantPath string + wantErr bool + }{ + { + name: "cgroup v2 unified", + cgroupFile: `0::/kubepods/pod-abc/container-123 +`, + wantPath: "/kubepods/pod-abc/container-123", + wantErr: false, + }, + { + name: "cgroup v2 with trailing newline", + cgroupFile: `0::/docker/abc123def456 +`, + wantPath: "/docker/abc123def456", + wantErr: false, + }, + { + name: "cgroup v1 multiple controllers", + cgroupFile: `12:blkio:/user.slice +11:memory:/docker/abc123 +10:cpu,cpuacct:/docker/abc123 +9:pids:/docker/abc123 +`, + wantPath: "/docker/abc123", + wantErr: false, + }, + { + name: "cgroup v2 preferred over v1", + cgroupFile: `11:memory:/docker/old-path +0::/kubepods/new-path +`, + wantPath: "/kubepods/new-path", + wantErr: false, + }, + { + name: "empty file", + cgroupFile: "", + wantPath: "", + wantErr: false, + }, + { + name: "root cgroup", + cgroupFile: `0::/ +`, + wantPath: "/", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a temp directory structure mimicking /proc + tmpDir := t.TempDir() + procDir := filepath.Join(tmpDir, "proc") + pidDir := filepath.Join(procDir, "1234") + + if err := os.MkdirAll(pidDir, 0755); err != nil { + t.Fatalf("Failed to create pid dir: %v", err) + } + + if err := os.WriteFile(filepath.Join(pidDir, "cgroup"), []byte(tt.cgroupFile), 0644); err != nil { + t.Fatalf("Failed to write cgroup file: %v", err) + } + + got, err := ReadCgroup(procDir, 1234) + if (err != nil) != tt.wantErr { + t.Errorf("ReadCgroup() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got.Path != tt.wantPath { + t.Errorf("ReadCgroup() path = %q, want %q", got.Path, tt.wantPath) + } + }) + } +} + +func TestReadCgroup_FileNotFound(t *testing.T) { + tmpDir := t.TempDir() + + _, err := ReadCgroup(tmpDir, 1234) + if err == nil { + t.Error("ReadCgroup() expected error for 
missing file, got nil") + } +} diff --git a/internal/proc/process.go b/internal/proc/process.go index fced8bd..9d1ef64 100644 --- a/internal/proc/process.go +++ b/internal/proc/process.go @@ -128,13 +128,14 @@ func ReadSystemCPU(procPath string) (user, nice, system, idle, iowait, irq, soft return 0, 0, 0, 0, 0, 0, 0, fmt.Errorf("cpu line not found in /proc/stat") } -// ProcessInfo combines stat and status information for a process +// ProcessInfo combines stat, status, and cgroup information for a process type ProcessInfo struct { Stat *ProcStat Status *ProcStatus + Cgroup *CgroupInfo } -// ReadProcess reads both stat and status for a single process +// ReadProcess reads stat, status, and cgroup for a single process func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { stat, err := ReadStat(procPath, pid) if err != nil { @@ -146,9 +147,13 @@ func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { return nil, err } + // Read cgroup info (non-fatal if it fails) + cgroup, _ := ReadCgroup(procPath, pid) + return &ProcessInfo{ Stat: stat, Status: status, + Cgroup: cgroup, }, nil } diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go new file mode 100644 index 0000000..d847f62 --- /dev/null +++ b/internal/receiver/handler.go @@ -0,0 +1,196 @@ +// ABOUTME: HTTP handlers for the metrics receiver service. +// ABOUTME: Provides endpoints for receiving and querying metrics. +package receiver + +import ( + "crypto/subtle" + "encoding/json" + "log/slog" + "net/http" + "strings" +) + +// Handler handles HTTP requests for the metrics receiver +type Handler struct { + store *Store + logger *slog.Logger + readToken string // Pre-shared token for read endpoint authentication + hmacKey string // Separate key for HMAC-based push token generation/validation +} + +// NewHandler creates a new HTTP handler with the given store. +// readToken authenticates read endpoints and the token generation endpoint. +// hmacKey is the secret used to derive scoped push tokens. +func NewHandler(store *Store, logger *slog.Logger, readToken, hmacKey string) *Handler { + return &Handler{store: store, logger: logger, readToken: readToken, hmacKey: hmacKey} +} + +// RegisterRoutes registers all HTTP routes on the given mux +func (h *Handler) RegisterRoutes(mux *http.ServeMux) { + mux.HandleFunc("POST /api/v1/metrics", h.handleReceiveMetrics) + mux.HandleFunc("POST /api/v1/token", h.handleGenerateToken) + mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}", h.handleGetByWorkflowJob) + mux.HandleFunc("GET /health", h.handleHealth) +} + +// validateReadToken checks the Authorization header for a valid Bearer token. 
+func (h *Handler) validateReadToken(w http.ResponseWriter, r *http.Request) bool { + if h.readToken == "" { + h.logger.Warn("no read-token configured, rejecting request", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + h.logger.Warn("missing authorization header", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + const bearerPrefix = "Bearer " + if !strings.HasPrefix(authHeader, bearerPrefix) { + h.logger.Warn("invalid authorization format", slog.String("path", r.URL.Path)) + http.Error(w, "invalid authorization format", http.StatusUnauthorized) + return false + } + + token := strings.TrimPrefix(authHeader, bearerPrefix) + if subtle.ConstantTimeCompare([]byte(token), []byte(h.readToken)) != 1 { + h.logger.Warn("invalid token", slog.String("path", r.URL.Path)) + http.Error(w, "invalid token", http.StatusUnauthorized) + return false + } + + return true +} + +func (h *Handler) handleGenerateToken(w http.ResponseWriter, r *http.Request) { + if h.hmacKey == "" { + http.Error(w, "token generation requires a configured HMAC key", http.StatusBadRequest) + return + } + + if !h.validateReadToken(w, r) { + return + } + + var req TokenRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid JSON body", http.StatusBadRequest) + return + } + + if req.Organization == "" || req.Repository == "" || req.Workflow == "" || req.Job == "" { + http.Error(w, "organization, repository, workflow, and job are required", http.StatusBadRequest) + return + } + + token := GenerateScopedToken(h.hmacKey, req.Organization, req.Repository, req.Workflow, req.Job) + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(TokenResponse{Token: token}) +} + +// validatePushToken checks push authentication via scoped HMAC token. 
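+// The Bearer token must match the HMAC derived from the payload's execution
+// scope (org/repo/workflow/job), so a token issued for one job cannot be
+// replayed against another.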
+func (h *Handler) validatePushToken(w http.ResponseWriter, r *http.Request, exec ExecutionContext) bool { + if h.hmacKey == "" { + h.logger.Warn("no HMAC key configured, rejecting push", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + h.logger.Warn("missing push authorization", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + const bearerPrefix = "Bearer " + if !strings.HasPrefix(authHeader, bearerPrefix) { + h.logger.Warn("invalid push authorization format", slog.String("path", r.URL.Path)) + http.Error(w, "invalid authorization format", http.StatusUnauthorized) + return false + } + + token := strings.TrimPrefix(authHeader, bearerPrefix) + if !ValidateScopedToken(h.hmacKey, token, exec.Organization, exec.Repository, exec.Workflow, exec.Job) { + h.logger.Warn("invalid push token", slog.String("path", r.URL.Path)) + http.Error(w, "invalid token", http.StatusUnauthorized) + return false + } + + return true +} + +func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { + var payload MetricsPayload + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + h.logger.Error("failed to decode payload", slog.String("error", err.Error())) + http.Error(w, "invalid JSON payload", http.StatusBadRequest) + return + } + + if payload.Execution.RunID == "" { + http.Error(w, "run_id is required", http.StatusBadRequest) + return + } + + if !h.validatePushToken(w, r, payload.Execution) { + return + } + + id, err := h.store.SaveMetric(&payload) + if err != nil { + h.logger.Error("failed to save metric", slog.String("error", err.Error())) + http.Error(w, "failed to save metric", http.StatusInternalServerError) + return + } + + h.logger.Info("metric saved", + slog.Uint64("id", uint64(id)), + slog.String("run_id", payload.Execution.RunID), + slog.String("repository", payload.Execution.Repository), + ) + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(map[string]any{"id": id, "status": "created"}) +} + +func (h *Handler) handleGetByWorkflowJob(w http.ResponseWriter, r *http.Request) { + if !h.validateReadToken(w, r) { + return + } + + org := r.PathValue("org") + repo := r.PathValue("repo") + workflow := r.PathValue("workflow") + job := r.PathValue("job") + if org == "" || repo == "" || workflow == "" || job == "" { + http.Error(w, "org, repo, workflow and job are required", http.StatusBadRequest) + return + } + + metrics, err := h.store.GetMetricsByWorkflowJob(org, repo, workflow, job) + if err != nil { + h.logger.Error("failed to get metrics", slog.String("error", err.Error())) + http.Error(w, "failed to get metrics", http.StatusInternalServerError) + return + } + + // Convert to response type with Payload as JSON object + response := make([]MetricResponse, len(metrics)) + for i, m := range metrics { + response[i] = m.ToResponse() + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(response) +} + +func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) +} diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go new file mode 100644 index 0000000..cea58f0 --- /dev/null +++ 
b/internal/receiver/handler_test.go @@ -0,0 +1,473 @@ +package receiver + +import ( + "bytes" + "encoding/json" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "path/filepath" + "testing" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestHandler_ReceiveMetrics(t *testing.T) { + const readToken = "test-token" + h, cleanup := newTestHandlerWithToken(t, readToken) + defer cleanup() + + exec := ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-123", + } + pushToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) + + payload := MetricsPayload{ + Execution: exec, + Summary: summary.RunSummary{ + DurationSeconds: 60.0, + SampleCount: 12, + }, + } + + body, _ := json.Marshal(payload) + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusCreated { + t.Errorf("status = %d, want %d", rec.Code, http.StatusCreated) + } + + var resp map[string]any + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp["status"] != "created" { + t.Errorf("response status = %v, want %q", resp["status"], "created") + } + if resp["id"] == nil || resp["id"].(float64) == 0 { + t.Error("response id is missing or zero") + } +} + +func TestHandler_ReceiveMetrics_InvalidJSON(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader([]byte("not json"))) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_ReceiveMetrics_MissingRunID(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + payload := MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + // RunID is missing + }, + Summary: summary.RunSummary{}, + } + + body, _ := json.Marshal(payload) + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_GetByWorkflowJob(t *testing.T) { + const readToken = "test-token" + h, cleanup := newTestHandlerWithToken(t, readToken) + defer cleanup() + + // Save metrics for different workflow/job combinations + payloads := []*MetricsPayload{ + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r1"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r2"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "test", RunID: "r3"}}, + } + for _, p := range payloads { + if _, err := h.store.SaveMetric(p); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + } + + req := httptest.NewRequest(http.MethodGet, 
"/api/v1/metrics/repo/org-x/repo-y/ci.yml/build", nil) + req.Header.Set("Authorization", "Bearer "+readToken) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var metrics []MetricResponse + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 2 { + t.Errorf("got %d metrics, want 2", len(metrics)) + } +} + +func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { + const readToken = "test-token" + h, cleanup := newTestHandlerWithToken(t, readToken) + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/workflow/job", nil) + req.Header.Set("Authorization", "Bearer "+readToken) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var metrics []MetricResponse + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 0 { + t.Errorf("got %d metrics, want 0", len(metrics)) + } +} + +func TestHandler_GetByWorkflowJob_WithToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + // Save a metric + payload := &MetricsPayload{ + Execution: ExecutionContext{Organization: "org", Repository: "repo", Workflow: "ci.yml", Job: "build", RunID: "r1"}, + } + if _, err := h.store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + + tests := []struct { + name string + authHeader string + wantCode int + }{ + {"no auth header", "", http.StatusUnauthorized}, + {"wrong format", "Basic dXNlcjpwYXNz", http.StatusUnauthorized}, + {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, + {"valid token", "Bearer secret-token", http.StatusOK}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) + if tt.authHeader != "" { + req.Header.Set("Authorization", tt.authHeader) + } + rec := httptest.NewRecorder() + mux.ServeHTTP(rec, req) + + if rec.Code != tt.wantCode { + t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) + } + }) + } +} + +func TestHandler_Health(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/health", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp map[string]string + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp["status"] != "ok" { + t.Errorf("status = %q, want %q", resp["status"], "ok") + } +} + +func TestHandler_GenerateToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + req.Header.Set("Authorization", "Bearer secret-token") + 
req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp TokenResponse + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp.Token == "" { + t.Error("expected non-empty token") + } + if len(resp.Token) != 64 { + t.Errorf("token length = %d, want 64", len(resp.Token)) + } +} + +func TestHandler_GenerateToken_NoAuth(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + +func TestHandler_GenerateToken_MissingFields(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + // Missing job field + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + req.Header.Set("Authorization", "Bearer secret-token") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_GenerateToken_NoReadToken(t *testing.T) { + h, cleanup := newTestHandler(t) // no readToken configured + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_ReceiveMetrics_WithPushToken(t *testing.T) { + readToken := "secret-token" + h, cleanup := newTestHandlerWithToken(t, readToken) + defer cleanup() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + + exec := ExecutionContext{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-1", + } + + validToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) + wrongScopeToken := GenerateScopedToken(readToken, "other-org", "repo", "ci.yml", "build") + + tests := []struct { + name string + authHeader string + wantCode int + }{ + {"no auth", "", http.StatusUnauthorized}, + {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, + {"wrong scope", "Bearer " + wrongScopeToken, http.StatusUnauthorized}, + {"valid token", "Bearer " + validToken, http.StatusCreated}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + payload := MetricsPayload{ + Execution: exec, + Summary: summary.RunSummary{SampleCount: 1}, + 
} + body, _ := json.Marshal(payload) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + if tt.authHeader != "" { + req.Header.Set("Authorization", tt.authHeader) + } + rec := httptest.NewRecorder() + mux.ServeHTTP(rec, req) + + if rec.Code != tt.wantCode { + t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) + } + }) + } +} + +func TestHandler_ReceiveMetrics_RejectsWhenNoReadToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured + defer cleanup() + + payload := MetricsPayload{ + Execution: ExecutionContext{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-1", + }, + Summary: summary.RunSummary{SampleCount: 1}, + } + body, _ := json.Marshal(payload) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + +func TestHandler_GetByWorkflowJob_RejectsWhenNoReadToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + +func newTestHandler(t *testing.T) (*Handler, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + handler := NewHandler(store, logger, "", "") // no auth — endpoints will reject + + return handler, func() { _ = store.Close() } +} + +func newTestHandlerWithToken(t *testing.T, readToken string) (*Handler, func()) { + t.Helper() + return newTestHandlerWithKeys(t, readToken, readToken) +} + +func newTestHandlerWithKeys(t *testing.T, readToken, hmacKey string) (*Handler, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + handler := NewHandler(store, logger, readToken, hmacKey) + + return handler, func() { _ = store.Close() } +} diff --git a/internal/receiver/store.go b/internal/receiver/store.go new file mode 100644 index 0000000..1b934de --- /dev/null +++ b/internal/receiver/store.go @@ -0,0 +1,113 @@ +// ABOUTME: SQLite storage layer for metrics receiver using GORM. +// ABOUTME: Handles database initialization and metric storage/retrieval. 
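+//
+// Each received run is stored as one row; the RunSummary is kept verbatim as
+// a JSON blob in the Payload column.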
+package receiver + +import ( + "encoding/json" + "fmt" + "time" + + "gorm.io/driver/sqlite" + "gorm.io/gorm" + "gorm.io/gorm/logger" +) + +// Metric represents a stored metric record in the database +type Metric struct { + ID uint `gorm:"primaryKey"` + Organization string `gorm:"index:idx_org_repo;not null"` + Repository string `gorm:"index:idx_org_repo;not null"` + Workflow string `gorm:"not null"` + Job string `gorm:"not null"` + RunID string `gorm:"index;not null"` + ReceivedAt time.Time `gorm:"index;not null"` + Payload string `gorm:"type:text;not null"` // JSON-encoded RunSummary +} + +// MetricResponse is the API response type with Payload as embedded JSON object +type MetricResponse struct { + ID uint `json:"id"` + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` + ReceivedAt time.Time `json:"received_at"` + Payload json.RawMessage `json:"payload"` +} + +// ToResponse converts a Metric to a MetricResponse with Payload as JSON object +func (m *Metric) ToResponse() MetricResponse { + return MetricResponse{ + ID: m.ID, + Organization: m.Organization, + Repository: m.Repository, + Workflow: m.Workflow, + Job: m.Job, + RunID: m.RunID, + ReceivedAt: m.ReceivedAt, + Payload: json.RawMessage(m.Payload), + } +} + +// Store handles SQLite storage for metrics using GORM +type Store struct { + db *gorm.DB +} + +// NewStore creates a new SQLite store at the given path +func NewStore(dbPath string) (*Store, error) { + db, err := gorm.Open(sqlite.Open(dbPath), &gorm.Config{ + Logger: logger.Default.LogMode(logger.Silent), + }) + if err != nil { + return nil, fmt.Errorf("opening database: %w", err) + } + + if err := db.AutoMigrate(&Metric{}); err != nil { + return nil, fmt.Errorf("migrating schema: %w", err) + } + + return &Store{db: db}, nil +} + +// SaveMetric stores a metrics payload in the database +func (s *Store) SaveMetric(payload *MetricsPayload) (uint, error) { + summaryJSON, err := json.Marshal(payload.Summary) + if err != nil { + return 0, fmt.Errorf("marshaling summary: %w", err) + } + + metric := Metric{ + Organization: payload.Execution.Organization, + Repository: payload.Execution.Repository, + Workflow: payload.Execution.Workflow, + Job: payload.Execution.Job, + RunID: payload.Execution.RunID, + ReceivedAt: time.Now().UTC(), + Payload: string(summaryJSON), + } + + result := s.db.Create(&metric) + if result.Error != nil { + return 0, fmt.Errorf("inserting metric: %w", result.Error) + } + + return metric.ID, nil +} + +// GetMetricsByWorkflowJob retrieves all metrics for a specific workflow and job +func (s *Store) GetMetricsByWorkflowJob(org, repo, workflow, job string) ([]Metric, error) { + var metrics []Metric + result := s.db.Where("organization = ? AND repository = ? AND workflow = ? 
AND job = ?", org, repo, workflow, job).Order("received_at DESC").Find(&metrics) + return metrics, result.Error +} + +// Close closes the database connection +func (s *Store) Close() error { + sqlDB, err := s.db.DB() + if err != nil { + return err + } + return sqlDB.Close() +} diff --git a/internal/receiver/store_test.go b/internal/receiver/store_test.go new file mode 100644 index 0000000..44c4f17 --- /dev/null +++ b/internal/receiver/store_test.go @@ -0,0 +1,178 @@ +package receiver + +import ( + "os" + "path/filepath" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestNewStore(t *testing.T) { + dbPath := filepath.Join(t.TempDir(), "test.db") + + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + defer func() { _ = store.Close() }() + + if _, err := os.Stat(dbPath); os.IsNotExist(err) { + t.Error("database file was not created") + } +} + +func TestStore_SaveMetric(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-123", + }, + Summary: summary.RunSummary{ + StartTime: time.Now().Add(-time.Minute), + EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 12, + CPUTotal: summary.StatSummary{Peak: 80.5, Avg: 45.2, P95: 75.0}, + MemUsedBytes: summary.StatSummary{Peak: 1024000, Avg: 512000, P95: 900000}, + MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + }, + } + + id, err := store.SaveMetric(payload) + if err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + if id == 0 { + t.Error("SaveMetric() returned id = 0, want non-zero") + } +} + +func TestStore_GetMetricsByWorkflowJob(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + // Save metrics for different workflow/job combinations + payloads := []struct { + org string + repo string + workflow string + job string + }{ + {"org-a", "repo-1", "ci.yml", "build"}, + {"org-a", "repo-1", "ci.yml", "build"}, + {"org-a", "repo-1", "ci.yml", "test"}, + {"org-a", "repo-1", "deploy.yml", "build"}, + } + + for i, p := range payloads { + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: p.org, + Repository: p.repo, + Workflow: p.workflow, + Job: p.job, + RunID: "run-" + string(rune('a'+i)), + }, + Summary: summary.RunSummary{}, + } + if _, err := store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + } + + metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 2 { + t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 2", len(metrics)) + } + + for _, m := range metrics { + if m.Organization != "org-a" || m.Repository != "repo-1" || m.Workflow != "ci.yml" || m.Job != "build" { + t.Errorf("GetMetricsByWorkflowJob() returned metric with org=%q repo=%q workflow=%q job=%q, want org-a/repo-1/ci.yml/build", + m.Organization, m.Repository, m.Workflow, m.Job) + } + } +} + +func TestStore_GetMetricsByWorkflowJob_NotFound(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + metrics, err := store.GetMetricsByWorkflowJob("nonexistent", "repo", "workflow", "job") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 0 { + 
t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 0", len(metrics)) + } +} + +func TestStore_SaveMetric_PreservesPayload(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + original := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "build.yml", + Job: "test", + RunID: "run-preserve", + }, + Summary: summary.RunSummary{ + DurationSeconds: 123.45, + SampleCount: 50, + CPUTotal: summary.StatSummary{Peak: 99.9, Avg: 55.5, P95: 88.8}, + }, + } + + _, err := store.SaveMetric(original) + if err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + metrics, err := store.GetMetricsByWorkflowJob("test-org", "test-repo", "build.yml", "test") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 1 { + t.Fatalf("GetMetricsByWorkflowJob() returned %d metrics, want 1", len(metrics)) + } + + m := metrics[0] + if m.Organization != original.Execution.Organization { + t.Errorf("Organization = %q, want %q", m.Organization, original.Execution.Organization) + } + if m.Repository != original.Execution.Repository { + t.Errorf("Repository = %q, want %q", m.Repository, original.Execution.Repository) + } + if m.Workflow != original.Execution.Workflow { + t.Errorf("Workflow = %q, want %q", m.Workflow, original.Execution.Workflow) + } + if m.Job != original.Execution.Job { + t.Errorf("Job = %q, want %q", m.Job, original.Execution.Job) + } + if m.Payload == "" { + t.Error("Payload is empty") + } +} + +func newTestStore(t *testing.T) *Store { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + return store +} diff --git a/internal/receiver/token.go b/internal/receiver/token.go new file mode 100644 index 0000000..087546c --- /dev/null +++ b/internal/receiver/token.go @@ -0,0 +1,25 @@ +// ABOUTME: HMAC-SHA256 token generation and validation for scoped push authentication. +// ABOUTME: Tokens are derived from a key + scope, enabling stateless validation without DB storage. +package receiver + +import ( + "crypto/hmac" + "crypto/sha256" + "crypto/subtle" + "encoding/hex" +) + +// GenerateScopedToken computes an HMAC-SHA256 token scoped to a specific org/repo/workflow/job. +// The canonical input is "v1\x00\x00\x00\x00". +func GenerateScopedToken(key, org, repo, workflow, job string) string { + mac := hmac.New(sha256.New, []byte(key)) + mac.Write([]byte("v1\x00" + org + "\x00" + repo + "\x00" + workflow + "\x00" + job)) + return hex.EncodeToString(mac.Sum(nil)) +} + +// ValidateScopedToken checks whether a token matches the expected HMAC for the given scope. +// Uses constant-time comparison to prevent timing attacks. 
+func ValidateScopedToken(key, token, org, repo, workflow, job string) bool { + expected := GenerateScopedToken(key, org, repo, workflow, job) + return subtle.ConstantTimeCompare([]byte(token), []byte(expected)) == 1 +} diff --git a/internal/receiver/token_test.go b/internal/receiver/token_test.go new file mode 100644 index 0000000..2140ecd --- /dev/null +++ b/internal/receiver/token_test.go @@ -0,0 +1,78 @@ +package receiver + +import ( + "encoding/hex" + "testing" +) + +func TestGenerateScopedToken_Deterministic(t *testing.T) { + token1 := GenerateScopedToken("key", "org", "repo", "wf", "job") + token2 := GenerateScopedToken("key", "org", "repo", "wf", "job") + if token1 != token2 { + t.Errorf("tokens differ: %q vs %q", token1, token2) + } +} + +func TestGenerateScopedToken_ScopePinning(t *testing.T) { + base := GenerateScopedToken("key", "org", "repo", "wf", "job") + + variants := []struct { + name string + org string + repo string + wf string + job string + }{ + {"different org", "other-org", "repo", "wf", "job"}, + {"different repo", "org", "other-repo", "wf", "job"}, + {"different workflow", "org", "repo", "other-wf", "job"}, + {"different job", "org", "repo", "wf", "other-job"}, + } + + for _, v := range variants { + t.Run(v.name, func(t *testing.T) { + token := GenerateScopedToken("key", v.org, v.repo, v.wf, v.job) + if token == base { + t.Errorf("token for %s should differ from base", v.name) + } + }) + } +} + +func TestGenerateScopedToken_DifferentKeys(t *testing.T) { + token1 := GenerateScopedToken("key-a", "org", "repo", "wf", "job") + token2 := GenerateScopedToken("key-b", "org", "repo", "wf", "job") + if token1 == token2 { + t.Error("different keys should produce different tokens") + } +} + +func TestGenerateScopedToken_ValidHex(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if len(token) != 64 { + t.Errorf("token length = %d, want 64", len(token)) + } + if _, err := hex.DecodeString(token); err != nil { + t.Errorf("token is not valid hex: %v", err) + } +} + +func TestValidateScopedToken_Correct(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if !ValidateScopedToken("key", token, "org", "repo", "wf", "job") { + t.Error("ValidateScopedToken should accept correct token") + } +} + +func TestValidateScopedToken_WrongToken(t *testing.T) { + if ValidateScopedToken("key", "deadbeef", "org", "repo", "wf", "job") { + t.Error("ValidateScopedToken should reject wrong token") + } +} + +func TestValidateScopedToken_WrongScope(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if ValidateScopedToken("key", token, "org", "repo", "wf", "other-job") { + t.Error("ValidateScopedToken should reject token for different scope") + } +} diff --git a/internal/receiver/types.go b/internal/receiver/types.go new file mode 100644 index 0000000..dbc56e0 --- /dev/null +++ b/internal/receiver/types.go @@ -0,0 +1,45 @@ +// ABOUTME: Data types for the metrics receiver service. +// ABOUTME: Defines MetricsPayload combining execution metadata with run summary. 
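+//
+// On the wire a payload looks roughly like (summary fields abridged,
+// values illustrative):
+//
+//	{"execution":{"organization":"acme","repository":"api","workflow":"ci.yml",
+//	 "job":"build","run_id":"42"},"run_summary":{...}}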
+package receiver + +import "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" + +// ExecutionContext holds GitHub Actions style identifiers for a workflow run +type ExecutionContext struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` +} + +// MetricsPayload is the complete payload sent to the receiver +type MetricsPayload struct { + Execution ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` +} + +// StoredMetric represents a metric record as stored in the database +type StoredMetric struct { + ID int64 + Organization string + Repository string + Workflow string + Job string + RunID string + ReceivedAt string + Payload string // JSON-encoded RunSummary +} + +// TokenRequest is the request body for POST /api/v1/token +type TokenRequest struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` +} + +// TokenResponse is the response body for POST /api/v1/token +type TokenResponse struct { + Token string `json:"token"` +} diff --git a/internal/summary/accumulator.go b/internal/summary/accumulator.go new file mode 100644 index 0000000..6ffa8cd --- /dev/null +++ b/internal/summary/accumulator.go @@ -0,0 +1,184 @@ +// ABOUTME: Accumulates system metrics samples across a collection run. +// ABOUTME: Computes peak, average, and P95 statistics for CPU and memory on demand. +package summary + +import ( + "fmt" + "sort" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" +) + +// containerAccumulator tracks metrics for a single container +type containerAccumulator struct { + cpuCoresValues []float64 + memoryBytesValues []float64 +} + +// Accumulator collects metric samples and computes run-level statistics +type Accumulator struct { + topN int + cpuValues []float64 + memBytesValues []float64 + memPctValues []float64 + processPeaks map[string]*ProcessPeak + containers map[string]*containerAccumulator + startTime time.Time + endTime time.Time + sampleCount int +} + +// NewAccumulator creates an accumulator that tracks the top N processes +func NewAccumulator(topN int) *Accumulator { + return &Accumulator{ + topN: topN, + processPeaks: make(map[string]*ProcessPeak), + containers: make(map[string]*containerAccumulator), + } +} + +// Add records a single metrics sample +func (a *Accumulator) Add(m *metrics.SystemMetrics) { + a.sampleCount++ + if a.sampleCount == 1 { + a.startTime = m.Timestamp + } + a.endTime = m.Timestamp + + a.cpuValues = append(a.cpuValues, m.CPU.TotalPercent) + a.memBytesValues = append(a.memBytesValues, float64(m.Memory.UsedBytes)) + a.memPctValues = append(a.memPctValues, m.Memory.UsedPercent) + + for _, p := range m.TopCPU { + a.updateProcessPeak(p) + } + for _, p := range m.TopMemory { + a.updateProcessPeak(p) + } + + // Track per-container metrics + for name, cgroup := range m.Cgroups { + ca, ok := a.containers[name] + if !ok { + ca = &containerAccumulator{} + a.containers[name] = ca + } + // Only record CPU when a valid delta was computed (skip first sample and underflow) + if cgroup.CPU.HasDelta { + ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores) + } + ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes)) + } +} + +// Summarize computes and returns the run summary, or nil if no samples were 
added +func (a *Accumulator) Summarize() *RunSummary { + if a.sampleCount == 0 { + return nil + } + + return &RunSummary{ + StartTime: a.startTime, + EndTime: a.endTime, + DurationSeconds: a.endTime.Sub(a.startTime).Seconds(), + SampleCount: a.sampleCount, + CPUTotal: computeStats(a.cpuValues), + MemUsedBytes: computeStats(a.memBytesValues), + MemUsedPercent: computeStats(a.memPctValues), + TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }), + TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }), + Containers: a.containerSummaries(), + } +} + +// containerSummaries computes summaries for all tracked containers +func (a *Accumulator) containerSummaries() []ContainerSummary { + summaries := make([]ContainerSummary, 0, len(a.containers)) + for name, ca := range a.containers { + summaries = append(summaries, ContainerSummary{ + Name: name, + CPUCores: computeStats(ca.cpuCoresValues), + MemoryBytes: computeStats(ca.memoryBytesValues), + }) + } + // Sort by name for consistent output + sort.Slice(summaries, func(i, j int) bool { + return summaries[i].Name < summaries[j].Name + }) + return summaries +} + +// SampleCount returns the number of samples added +func (a *Accumulator) SampleCount() int { + return a.sampleCount +} + +// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values +func computeStats(values []float64) StatSummary { + n := len(values) + if n == 0 { + return StatSummary{} + } + + sorted := make([]float64, n) + copy(sorted, values) + sort.Float64s(sorted) + + var sum float64 + for _, v := range sorted { + sum += v + } + + return StatSummary{ + Peak: sorted[n-1], + P99: sorted[percentileIndex(n, 0.99)], + P95: sorted[percentileIndex(n, 0.95)], + P75: sorted[percentileIndex(n, 0.75)], + P50: sorted[percentileIndex(n, 0.50)], + Avg: sum / float64(n), + } +} + +// percentileIndex returns the index for the given percentile (0.0-1.0) +func percentileIndex(n int, percentile float64) int { + return int(float64(n-1) * percentile) +} + +// updateProcessPeak merges a process observation into the peak tracking map +func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) { + key := fmt.Sprintf("%d:%s", p.PID, p.Name) + existing, ok := a.processPeaks[key] + if !ok { + a.processPeaks[key] = &ProcessPeak{ + PID: p.PID, + Name: p.Name, + PeakCPU: p.CPUPercent, + PeakMem: p.MemRSS, + } + return + } + if p.CPUPercent > existing.PeakCPU { + existing.PeakCPU = p.CPUPercent + } + if p.MemRSS > existing.PeakMem { + existing.PeakMem = p.MemRSS + } +} + +// topProcesses returns the top N processes sorted by the given key function (descending) +func (a *Accumulator) topProcesses(keyFn func(*ProcessPeak) float64) []ProcessPeak { + all := make([]ProcessPeak, 0, len(a.processPeaks)) + for _, p := range a.processPeaks { + all = append(all, *p) + } + + sort.Slice(all, func(i, j int) bool { + return keyFn(&all[i]) > keyFn(&all[j]) + }) + + if len(all) > a.topN { + all = all[:a.topN] + } + return all +} diff --git a/internal/summary/accumulator_test.go b/internal/summary/accumulator_test.go new file mode 100644 index 0000000..cdda597 --- /dev/null +++ b/internal/summary/accumulator_test.go @@ -0,0 +1,598 @@ +// ABOUTME: Tests for the summary accumulator that tracks metrics across a run. +// ABOUTME: Validates stats computation (peak/avg/P95), process peak tracking, and edge cases. 
+package summary + +import ( + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" +) + +func TestAccumulator_NoSamples(t *testing.T) { + acc := NewAccumulator(5) + result := acc.Summarize() + if result != nil { + t.Errorf("expected nil summary for no samples, got %+v", result) + } +} + +func TestAccumulator_SingleSample(t *testing.T) { + acc := NewAccumulator(5) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 42.5}, + Memory: metrics.MemoryMetrics{ + UsedBytes: 1000, + UsedPercent: 50.0, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // With a single sample, peak=avg=p95 + if s.CPUTotal.Peak != 42.5 { + t.Errorf("CPU peak: got %f, want 42.5", s.CPUTotal.Peak) + } + if s.CPUTotal.Avg != 42.5 { + t.Errorf("CPU avg: got %f, want 42.5", s.CPUTotal.Avg) + } + if s.CPUTotal.P95 != 42.5 { + t.Errorf("CPU p95: got %f, want 42.5", s.CPUTotal.P95) + } + if s.MemUsedBytes.Peak != 1000 { + t.Errorf("MemUsedBytes peak: got %f, want 1000", s.MemUsedBytes.Peak) + } + if s.MemUsedPercent.Peak != 50.0 { + t.Errorf("MemUsedPercent peak: got %f, want 50.0", s.MemUsedPercent.Peak) + } +} + +func TestAccumulator_Stats(t *testing.T) { + acc := NewAccumulator(5) + cpuValues := []float64{10, 20, 30, 40, 50} + for i, v := range cpuValues { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: v}, + Memory: metrics.MemoryMetrics{ + UsedBytes: uint64(v * 100), + UsedPercent: v, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Peak = max = 50 + if s.CPUTotal.Peak != 50 { + t.Errorf("CPU peak: got %f, want 50", s.CPUTotal.Peak) + } + // Avg = (10+20+30+40+50)/5 = 30 + if s.CPUTotal.Avg != 30 { + t.Errorf("CPU avg: got %f, want 30", s.CPUTotal.Avg) + } + // P95: sorted=[10,20,30,40,50], index=int(4*0.95)=int(3.8)=3, value=40 + if s.CPUTotal.P95 != 40 { + t.Errorf("CPU p95: got %f, want 40", s.CPUTotal.P95) + } +} + +func TestAccumulator_P95_LargerDataset(t *testing.T) { + acc := NewAccumulator(5) + // 20 values: 1, 2, 3, ..., 20 + for i := 1; i <= 20; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, + Memory: metrics.MemoryMetrics{}, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // P95: sorted=[1..20], index=int(19*0.95)=int(18.05)=18, value=19 + if s.CPUTotal.P95 != 19 { + t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) + } + // Avg = (1+2+...+20)/20 = 210/20 = 10.5 + if s.CPUTotal.Avg != 10.5 { + t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) + } +} + +func TestAccumulator_MemoryStats(t *testing.T) { + acc := NewAccumulator(5) + memBytes := []uint64{1000, 2000, 3000, 4000, 5000} + memPercent := []float64{10, 20, 30, 40, 50} + + for i := range memBytes { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 0}, + Memory: metrics.MemoryMetrics{ + UsedBytes: memBytes[i], + UsedPercent: memPercent[i], + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // MemUsedBytes: peak=5000, avg=3000, p95=4000 + if s.MemUsedBytes.Peak != 5000 { + t.Errorf("MemUsedBytes peak: got %f, want 5000", s.MemUsedBytes.Peak) + } + if 
s.MemUsedBytes.Avg != 3000 { + t.Errorf("MemUsedBytes avg: got %f, want 3000", s.MemUsedBytes.Avg) + } + if s.MemUsedBytes.P95 != 4000 { + t.Errorf("MemUsedBytes p95: got %f, want 4000", s.MemUsedBytes.P95) + } + + // MemUsedPercent: peak=50, avg=30, p95=40 + if s.MemUsedPercent.Peak != 50 { + t.Errorf("MemUsedPercent peak: got %f, want 50", s.MemUsedPercent.Peak) + } + if s.MemUsedPercent.Avg != 30 { + t.Errorf("MemUsedPercent avg: got %f, want 30", s.MemUsedPercent.Avg) + } + if s.MemUsedPercent.P95 != 40 { + t.Errorf("MemUsedPercent p95: got %f, want 40", s.MemUsedPercent.P95) + } +} + +func TestAccumulator_ProcessPeaks(t *testing.T) { + acc := NewAccumulator(5) + + // Same PID across two samples; peaks should be retained + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, + }, + }) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 20, MemRSS: 50}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 5, MemRSS: 200}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should find PID 1 with peak CPU=20, peak mem=200 + found := false + for _, p := range s.TopCPUProcesses { + if p.PID == 1 { + found = true + if p.PeakCPU != 20 { + t.Errorf("PeakCPU: got %f, want 20", p.PeakCPU) + } + if p.PeakMem != 200 { + t.Errorf("PeakMem: got %d, want 200", p.PeakMem) + } + } + } + if !found { + t.Error("PID 1 not found in TopCPUProcesses") + } +} + +func TestAccumulator_ProcessPeaks_TopN(t *testing.T) { + acc := NewAccumulator(2) // Only top 2 + + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, + {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, + {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, + {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, + {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // TopCPUProcesses should have at most 2 entries, sorted by PeakCPU descending + if len(s.TopCPUProcesses) != 2 { + t.Fatalf("TopCPUProcesses length: got %d, want 2", len(s.TopCPUProcesses)) + } + if s.TopCPUProcesses[0].PeakCPU != 90 { + t.Errorf("TopCPU[0] PeakCPU: got %f, want 90", s.TopCPUProcesses[0].PeakCPU) + } + if s.TopCPUProcesses[1].PeakCPU != 50 { + t.Errorf("TopCPU[1] PeakCPU: got %f, want 50", s.TopCPUProcesses[1].PeakCPU) + } + + // TopMemProcesses should have at most 2 entries, sorted by PeakMem descending + if len(s.TopMemProcesses) != 2 { + t.Fatalf("TopMemProcesses length: got %d, want 2", len(s.TopMemProcesses)) + } + if s.TopMemProcesses[0].PeakMem != 900 { + t.Errorf("TopMem[0] PeakMem: got %d, want 900", s.TopMemProcesses[0].PeakMem) + } + if s.TopMemProcesses[1].PeakMem != 500 { + t.Errorf("TopMem[1] PeakMem: got %d, want 500", s.TopMemProcesses[1].PeakMem) + } 
+} + +func TestAccumulator_ProcessPeaks_Dedup(t *testing.T) { + acc := NewAccumulator(5) + + // A process appears in both TopCPU and TopMemory + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "proc", CPUPercent: 80, MemRSS: 100}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "proc", CPUPercent: 30, MemRSS: 500}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // The internal process map should have merged the peaks + // PeakCPU should be 80 (from TopCPU), PeakMem should be 500 (from TopMemory) + for _, p := range s.TopCPUProcesses { + if p.PID == 1 { + if p.PeakCPU != 80 { + t.Errorf("PeakCPU: got %f, want 80", p.PeakCPU) + } + if p.PeakMem != 500 { + t.Errorf("PeakMem: got %d, want 500", p.PeakMem) + } + } + } +} + +func TestAccumulator_SampleCount(t *testing.T) { + acc := NewAccumulator(5) + if acc.SampleCount() != 0 { + t.Errorf("initial SampleCount: got %d, want 0", acc.SampleCount()) + } + + for i := 0; i < 3; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + } + + if acc.SampleCount() != 3 { + t.Errorf("SampleCount after 3 adds: got %d, want 3", acc.SampleCount()) + } +} + +func TestAccumulator_Duration(t *testing.T) { + acc := NewAccumulator(5) + start := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + end := time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC) // 60 seconds later + + acc.Add(&metrics.SystemMetrics{ + Timestamp: start, + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + acc.Add(&metrics.SystemMetrics{ + Timestamp: end, + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if !s.StartTime.Equal(start) { + t.Errorf("StartTime: got %v, want %v", s.StartTime, start) + } + if s.DurationSeconds != 60 { + t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds) + } +} + +func TestAccumulator_AllPercentiles(t *testing.T) { + acc := NewAccumulator(5) + // 20 values: 1, 2, 3, ..., 20 + for i := 1; i <= 20; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, + Memory: metrics.MemoryMetrics{}, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Peak = 20 + if s.CPUTotal.Peak != 20 { + t.Errorf("CPU peak: got %f, want 20", s.CPUTotal.Peak) + } + // P99: index=int(19*0.99)=int(18.81)=18, value=19 + if s.CPUTotal.P99 != 19 { + t.Errorf("CPU p99: got %f, want 19", s.CPUTotal.P99) + } + // P95: index=int(19*0.95)=int(18.05)=18, value=19 + if s.CPUTotal.P95 != 19 { + t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) + } + // P75: index=int(19*0.75)=int(14.25)=14, value=15 + if s.CPUTotal.P75 != 15 { + t.Errorf("CPU p75: got %f, want 15", s.CPUTotal.P75) + } + // P50: index=int(19*0.50)=int(9.5)=9, value=10 + if s.CPUTotal.P50 != 10 { + t.Errorf("CPU p50: got %f, want 10", s.CPUTotal.P50) + } + // Avg = (1+2+...+20)/20 = 210/20 = 10.5 + if s.CPUTotal.Avg != 10.5 { + t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) + } +} + +func TestAccumulator_ContainerMetrics(t *testing.T) { + acc := NewAccumulator(5) + + // Add samples with container metrics (HasDelta=true to indicate valid CPU 
measurements) + for i := 1; i <= 5; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i * 10)}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{ + TotalRSSBytes: uint64(i * 1000), + }, + }, + "container-b": { + Name: "container-b", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i * 2), HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{ + TotalRSSBytes: uint64(i * 2000), + }, + }, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should have 2 containers + if len(s.Containers) != 2 { + t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) + } + + // Containers should be sorted by name + if s.Containers[0].Name != "container-a" { + t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name) + } + if s.Containers[1].Name != "container-b" { + t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name) + } + + // Container A: CPU cores [1,2,3,4,5], peak=5, avg=3 + containerA := s.Containers[0] + if containerA.CPUCores.Peak != 5 { + t.Errorf("container-a CPUCores.Peak: got %f, want 5", containerA.CPUCores.Peak) + } + if containerA.CPUCores.Avg != 3 { + t.Errorf("container-a CPUCores.Avg: got %f, want 3", containerA.CPUCores.Avg) + } + // Memory bytes [1000,2000,3000,4000,5000], peak=5000, avg=3000 + if containerA.MemoryBytes.Peak != 5000 { + t.Errorf("container-a MemoryBytes.Peak: got %f, want 5000", containerA.MemoryBytes.Peak) + } + if containerA.MemoryBytes.Avg != 3000 { + t.Errorf("container-a MemoryBytes.Avg: got %f, want 3000", containerA.MemoryBytes.Avg) + } + + // Container B: CPU cores [2,4,6,8,10], peak=10, avg=6 + containerB := s.Containers[1] + if containerB.CPUCores.Peak != 10 { + t.Errorf("container-b CPUCores.Peak: got %f, want 10", containerB.CPUCores.Peak) + } + if containerB.CPUCores.Avg != 6 { + t.Errorf("container-b CPUCores.Avg: got %f, want 6", containerB.CPUCores.Avg) + } +} + +func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) { + acc := NewAccumulator(5) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 50}, + Memory: metrics.MemoryMetrics{}, + Cgroups: nil, // No containers + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if len(s.Containers) != 0 { + t.Errorf("Containers length: got %d, want 0", len(s.Containers)) + } +} + +func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { + acc := NewAccumulator(5) + + // First sample: only container-a + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: metrics.CgroupCPUMetrics{UsedCores: 1, HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, + }, + }, + }) + + // Second sample: both containers + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: 
metrics.CgroupCPUMetrics{UsedCores: 2, HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000}, + }, + "container-b": { + Name: "container-b", + CPU: metrics.CgroupCPUMetrics{UsedCores: 5, HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000}, + }, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should have 2 containers + if len(s.Containers) != 2 { + t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) + } + + // Container A: 2 samples [1,2] + containerA := s.Containers[0] + if containerA.CPUCores.Peak != 2 { + t.Errorf("container-a CPUCores.Peak: got %f, want 2", containerA.CPUCores.Peak) + } + if containerA.CPUCores.Avg != 1.5 { + t.Errorf("container-a CPUCores.Avg: got %f, want 1.5", containerA.CPUCores.Avg) + } + + // Container B: 1 sample [5] + containerB := s.Containers[1] + if containerB.CPUCores.Peak != 5 { + t.Errorf("container-b CPUCores.Peak: got %f, want 5", containerB.CPUCores.Peak) + } + if containerB.CPUCores.Avg != 5 { + t.Errorf("container-b CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg) + } +} + +func TestAccumulator_ContainerMetrics_InvalidDeltaExcluded(t *testing.T) { + acc := NewAccumulator(5) + + // Sample 1: no valid CPU delta (first sample / underflow) — should be excluded from CPU stats + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "runner": { + Name: "runner", + CPU: metrics.CgroupCPUMetrics{UsedCores: 0, HasDelta: false}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, + }, + }, + }) + + // Samples 2-4: valid deltas + for i := 2; i <= 4; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "runner": { + Name: "runner", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: uint64(i * 1000)}, + }, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if len(s.Containers) != 1 { + t.Fatalf("Containers length: got %d, want 1", len(s.Containers)) + } + + runner := s.Containers[0] + // CPU should only include samples 2,3,4 (values 2,3,4) — NOT the invalid zero + // Peak=4, Avg=3, P50=3 + if runner.CPUCores.Peak != 4 { + t.Errorf("CPUCores.Peak: got %f, want 4", runner.CPUCores.Peak) + } + if runner.CPUCores.Avg != 3 { + t.Errorf("CPUCores.Avg: got %f, want 3", runner.CPUCores.Avg) + } + if runner.CPUCores.P50 != 3 { + t.Errorf("CPUCores.P50: got %f, want 3", runner.CPUCores.P50) + } + + // Memory should include all 4 samples (memory is always valid) + // Values: 1000, 2000, 3000, 4000 + if runner.MemoryBytes.Peak != 4000 { + t.Errorf("MemoryBytes.Peak: got %f, want 4000", runner.MemoryBytes.Peak) + } + if runner.MemoryBytes.Avg != 2500 { + t.Errorf("MemoryBytes.Avg: got %f, want 2500", runner.MemoryBytes.Avg) + } +} diff --git a/internal/summary/push.go b/internal/summary/push.go new file mode 100644 index 0000000..b6383db --- /dev/null +++ b/internal/summary/push.go @@ -0,0 +1,112 @@ +// ABOUTME: HTTP client for pushing run summaries to the metrics receiver. +// ABOUTME: Reads execution context from GitHub Actions style environment variables. 
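+// For illustration, the JSON body produced by Push has roughly this shape
+// (field names come from the struct tags below; the values are made up):
+//
+//	{"execution":{"organization":"acme","repository":"acme/app","workflow":"ci.yml","job":"build","run_id":"42"},
+//	 "run_summary":{"start_time":"2025-01-01T00:00:00Z","sample_count":12,"cpu_total_percent":{"peak":95.5,"avg":42.0},...}}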
+package summary + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "time" +) + +// ExecutionContext holds GitHub Actions style identifiers for a workflow run +type ExecutionContext struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` +} + +// MetricsPayload is the complete payload sent to the receiver +type MetricsPayload struct { + Execution ExecutionContext `json:"execution"` + Summary RunSummary `json:"run_summary"` +} + +// PushClient sends metrics to the receiver service +type PushClient struct { + endpoint string + token string + client *http.Client + ctx ExecutionContext +} + +// NewPushClient creates a new push client configured from environment variables. +// If token is non-empty, it is sent as a Bearer token on each push request. +func NewPushClient(endpoint, token string) *PushClient { + return &PushClient{ + endpoint: endpoint, + token: token, + client: &http.Client{ + Timeout: 30 * time.Second, + }, + ctx: ExecutionContextFromEnv(), + } +} + +// ExecutionContextFromEnv reads execution context from GitHub Actions environment variables +func ExecutionContextFromEnv() ExecutionContext { + return ExecutionContext{ + Organization: getEnvWithFallback("GITHUB_REPOSITORY_OWNER", "GITEA_REPO_OWNER"), + Repository: getEnvWithFallback("GITHUB_REPOSITORY", "GITEA_REPO"), + Workflow: getEnvWithFallback("GITHUB_WORKFLOW", "GITEA_WORKFLOW"), + Job: getEnvWithFallback("GITHUB_JOB", "GITEA_JOB"), + RunID: getEnvWithFallback("GITHUB_RUN_ID", "GITEA_RUN_ID"), + } +} + +func getEnvWithFallback(keys ...string) string { + for _, key := range keys { + if val := os.Getenv(key); val != "" { + return val + } + } + return "" +} + +// Push sends the run summary to the receiver +func (p *PushClient) Push(ctx context.Context, summary *RunSummary) error { + if summary == nil { + return nil + } + + payload := MetricsPayload{ + Execution: p.ctx, + Summary: *summary, + } + + body, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshaling payload: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.endpoint, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("creating request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if p.token != "" { + req.Header.Set("Authorization", "Bearer "+p.token) + } + + resp, err := p.client.Do(req) + if err != nil { + return fmt.Errorf("sending request: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + return nil +} + +// ExecutionContext returns the current execution context +func (p *PushClient) ExecutionContext() ExecutionContext { + return p.ctx +} diff --git a/internal/summary/push_test.go b/internal/summary/push_test.go new file mode 100644 index 0000000..552ae68 --- /dev/null +++ b/internal/summary/push_test.go @@ -0,0 +1,202 @@ +package summary + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestPushClient_Push(t *testing.T) { + var received MetricsPayload + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if ct := r.Header.Get("Content-Type"); ct != "application/json" { + t.Errorf("expected 
Content-Type application/json, got %s", ct) + } + if err := json.NewDecoder(r.Body).Decode(&received); err != nil { + t.Errorf("failed to decode body: %v", err) + } + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL, "") + client.ctx = ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "12345", + } + + summary := &RunSummary{ + StartTime: time.Now().Add(-time.Minute), + EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 10, + CPUTotal: StatSummary{Peak: 80.0, Avg: 50.0, P95: 75.0}, + } + + err := client.Push(context.Background(), summary) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + + if received.Execution.Organization != "test-org" { + t.Errorf("Organization = %q, want %q", received.Execution.Organization, "test-org") + } + if received.Execution.RunID != "12345" { + t.Errorf("RunID = %q, want %q", received.Execution.RunID, "12345") + } + if received.Summary.SampleCount != 10 { + t.Errorf("SampleCount = %d, want %d", received.Summary.SampleCount, 10) + } +} + +func TestPushClient_Push_NilSummary(t *testing.T) { + client := NewPushClient("http://localhost:9999", "") + err := client.Push(context.Background(), nil) + if err != nil { + t.Errorf("Push(nil) error = %v, want nil", err) + } +} + +func TestPushClient_Push_ServerError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + client := NewPushClient(server.URL, "") + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err == nil { + t.Error("Push() expected error for 500 response, got nil") + } +} + +func TestPushClient_Push_ConnectionError(t *testing.T) { + client := NewPushClient("http://localhost:1", "") // Invalid port + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err == nil { + t.Error("Push() expected error for connection failure, got nil") + } +} + +func TestExecutionContextFromEnv(t *testing.T) { + // t.Setenv restores the original values automatically when the test ends, + // so no manual save/restore is needed here. + t.Setenv("GITHUB_REPOSITORY_OWNER", "my-org") + t.Setenv("GITHUB_REPOSITORY", "my-org/my-repo") + t.Setenv("GITHUB_WORKFLOW", "CI") + t.Setenv("GITHUB_JOB", "test") + t.Setenv("GITHUB_RUN_ID", "999") + + ctx := ExecutionContextFromEnv() + + if ctx.Organization != "my-org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "my-org") + } + if ctx.Repository != "my-org/my-repo" { + t.Errorf("Repository = %q, want %q", ctx.Repository, "my-org/my-repo") + } + if ctx.Workflow != "CI" { + t.Errorf("Workflow = %q, want %q", ctx.Workflow, "CI") + } + if ctx.Job != "test" { + t.Errorf("Job = %q, want %q", ctx.Job, "test") + } + if ctx.RunID != "999" { + t.Errorf("RunID = %q, want %q", ctx.RunID, "999") + } +} + +func TestExecutionContextFromEnv_GiteaFallback(t *testing.T) { + t.Setenv("GITHUB_RUN_ID", "") + t.Setenv("GITEA_RUN_ID", "gitea-123") + + ctx := ExecutionContextFromEnv() + + if ctx.RunID != "gitea-123" { + t.Errorf("RunID = %q, want %q (Gitea fallback)", ctx.RunID,
"gitea-123") + } +} + +func TestPushClient_Push_WithToken(t *testing.T) { + var gotAuth string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL, "my-token") + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + if gotAuth != "Bearer my-token" { + t.Errorf("Authorization = %q, want %q", gotAuth, "Bearer my-token") + } +} + +func TestPushClient_Push_WithoutToken(t *testing.T) { + var gotAuth string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL, "") + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + if gotAuth != "" { + t.Errorf("Authorization = %q, want empty", gotAuth) + } +} + +func TestPushClient_ExecutionContext(t *testing.T) { + client := NewPushClient("http://example.com", "") + client.ctx = ExecutionContext{ + Organization: "org", + Repository: "repo", + RunID: "run", + } + + ctx := client.ExecutionContext() + if ctx.Organization != "org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "org") + } +} diff --git a/internal/summary/types.go b/internal/summary/types.go new file mode 100644 index 0000000..dad0b01 --- /dev/null +++ b/internal/summary/types.go @@ -0,0 +1,44 @@ +// ABOUTME: Data types for run-level summary statistics. +// ABOUTME: Defines StatSummary, ProcessPeak, and RunSummary used to report metrics on shutdown. +package summary + +import "time" + +// StatSummary holds peak, percentiles, and average for a metric across the run +type StatSummary struct { + Peak float64 `json:"peak"` + P99 float64 `json:"p99"` + P95 float64 `json:"p95"` + P75 float64 `json:"p75"` + P50 float64 `json:"p50"` + Avg float64 `json:"avg"` +} + +// ProcessPeak holds the peak CPU and memory observed for a single process +type ProcessPeak struct { + PID int `json:"pid"` + Name string `json:"name"` + PeakCPU float64 `json:"peak_cpu_percent"` + PeakMem uint64 `json:"peak_mem_rss_bytes"` +} + +// ContainerSummary holds statistics for a single container across the run +type ContainerSummary struct { + Name string `json:"name"` + CPUCores StatSummary `json:"cpu_cores"` + MemoryBytes StatSummary `json:"memory_bytes"` +} + +// RunSummary holds the complete summary of a collection run +type RunSummary struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + DurationSeconds float64 `json:"duration_seconds"` + SampleCount int `json:"sample_count"` + CPUTotal StatSummary `json:"cpu_total_percent"` + MemUsedBytes StatSummary `json:"mem_used_bytes"` + MemUsedPercent StatSummary `json:"mem_used_percent"` + TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"` + TopMemProcesses []ProcessPeak `json:"top_mem_processes"` + Containers []ContainerSummary `json:"containers"` +} diff --git a/internal/summary/writer.go b/internal/summary/writer.go new file mode 100644 index 0000000..30b392b --- /dev/null +++ b/internal/summary/writer.go @@ -0,0 +1,61 @@ +// ABOUTME: Emits a RunSummary as a structured log entry via slog. 
+// ABOUTME: Follows the same slog pattern as internal/output/logger.go for consistency. +package summary + +import ( + "io" + "log/slog" +) + +// SummaryWriter outputs a RunSummary using structured logging +type SummaryWriter struct { + logger *slog.Logger +} + +// NewSummaryWriter creates a writer that emits summaries to the given output in the given format +func NewSummaryWriter(output io.Writer, format string) *SummaryWriter { + opts := &slog.HandlerOptions{Level: slog.LevelInfo} + + var handler slog.Handler + switch format { + case "text": + handler = slog.NewTextHandler(output, opts) + default: + handler = slog.NewJSONHandler(output, opts) + } + + return &SummaryWriter{ + logger: slog.New(handler), + } +} + +// Write emits the run summary as a single structured log entry +func (w *SummaryWriter) Write(s *RunSummary) { + if s == nil { + return + } + + w.logger.Info("run_summary", + slog.Time("start_time", s.StartTime), + slog.Time("end_time", s.EndTime), + slog.Float64("duration_seconds", s.DurationSeconds), + slog.Int("sample_count", s.SampleCount), + slog.Group("cpu_total_percent", + slog.Float64("peak", s.CPUTotal.Peak), + slog.Float64("avg", s.CPUTotal.Avg), + slog.Float64("p95", s.CPUTotal.P95), + ), + slog.Group("mem_used_bytes", + slog.Float64("peak", s.MemUsedBytes.Peak), + slog.Float64("avg", s.MemUsedBytes.Avg), + slog.Float64("p95", s.MemUsedBytes.P95), + ), + slog.Group("mem_used_percent", + slog.Float64("peak", s.MemUsedPercent.Peak), + slog.Float64("avg", s.MemUsedPercent.Avg), + slog.Float64("p95", s.MemUsedPercent.P95), + ), + slog.Any("top_cpu_processes", s.TopCPUProcesses), + slog.Any("top_mem_processes", s.TopMemProcesses), + ) +} diff --git a/internal/summary/writer_test.go b/internal/summary/writer_test.go new file mode 100644 index 0000000..d787ec6 --- /dev/null +++ b/internal/summary/writer_test.go @@ -0,0 +1,93 @@ +// ABOUTME: Tests for the summary writer that emits run summaries via slog. +// ABOUTME: Validates JSON output, text output, and nil summary handling. 
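+// For reference, a JSON-format Write emits a single log line roughly like the
+// following (timestamp and attribute order are illustrative; slog groups become
+// nested objects). The tests below assert on substrings of such a line:
+//
+//	{"time":"...","level":"INFO","msg":"run_summary","duration_seconds":60,
+//	 "cpu_total_percent":{"peak":95.5,"avg":42,"p95":88},...}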
+package summary + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestSummaryWriter_JSON(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "json") + + s := &RunSummary{ + StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), + DurationSeconds: 60, + SampleCount: 12, + CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, + MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, + MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, + TopCPUProcesses: []ProcessPeak{ + {PID: 1, Name: "busy", PeakCPU: 95.5, PeakMem: 1000}, + }, + TopMemProcesses: []ProcessPeak{ + {PID: 2, Name: "hungry", PeakCPU: 10.0, PeakMem: 8000000}, + }, + } + + w.Write(s) + + output := buf.String() + if !strings.Contains(output, "run_summary") { + t.Errorf("output should contain 'run_summary', got: %s", output) + } + if !strings.Contains(output, "duration_seconds") { + t.Errorf("output should contain 'duration_seconds', got: %s", output) + } + if !strings.Contains(output, "sample_count") { + t.Errorf("output should contain 'sample_count', got: %s", output) + } + if !strings.Contains(output, "cpu_total_percent") { + t.Errorf("output should contain 'cpu_total_percent', got: %s", output) + } + if !strings.Contains(output, "mem_used_bytes") { + t.Errorf("output should contain 'mem_used_bytes', got: %s", output) + } + if !strings.Contains(output, "top_cpu_processes") { + t.Errorf("output should contain 'top_cpu_processes', got: %s", output) + } + if !strings.Contains(output, "top_mem_processes") { + t.Errorf("output should contain 'top_mem_processes', got: %s", output) + } +} + +func TestSummaryWriter_Text(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "text") + + s := &RunSummary{ + StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), + DurationSeconds: 60, + SampleCount: 12, + CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, + MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, + MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, + } + + w.Write(s) + + output := buf.String() + if !strings.Contains(output, "run_summary") { + t.Errorf("output should contain 'run_summary', got: %s", output) + } + if !strings.Contains(output, "duration_seconds") { + t.Errorf("output should contain 'duration_seconds', got: %s", output) + } +} + +func TestSummaryWriter_NilSummary(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "json") + + // Should not panic and should not write anything + w.Write(nil) + + if buf.Len() != 0 { + t.Errorf("expected no output for nil summary, got: %s", buf.String()) + } +} diff --git a/test/docker/docker-compose-stress.yaml b/test/docker/docker-compose-stress.yaml new file mode 100644 index 0000000..d4a0be0 --- /dev/null +++ b/test/docker/docker-compose-stress.yaml @@ -0,0 +1,131 @@ +# Docker Compose stress test with receiver +# See README.md "Docker Compose" section for the full token workflow. +# +# This test: +# 1. Starts the metrics receiver (with read-token and hmac-key) +# 2. You generate a scoped push token via POST /api/v1/token +# 3. Start the collector with COLLECTOR_PUSH_TOKEN set +# 4. Runs heavy CPU/memory workloads in multiple containers with shared PID namespace +# 5. 
Collector gathers metrics and pushes summary to receiver on shutdown +# +# To trigger the push, stop the collector gracefully: +# docker compose -f test/docker/docker-compose-stress.yaml stop collector + +services: + # Metrics receiver - stores summaries in SQLite + receiver: + build: + context: ../.. + dockerfile: Dockerfile + target: receiver + ports: + - "9080:8080" + environment: + - DB_PATH=/data/metrics.db + - RECEIVER_READ_TOKEN=dummyreadtoken + - RECEIVER_HMAC_KEY=dummyhmackey + volumes: + - receiver-data:/data + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"] + interval: 5s + timeout: 3s + retries: 3 + + # Heavy CPU workload - uses stress-ng (owns the PID namespace) + cpu-stress: + image: alexeiled/stress-ng:latest + command: + - --cpu + - "3" + - --timeout + - "300s" + - --metrics-brief + deploy: + resources: + limits: + cpus: "2.0" + memory: 128M + # This container owns the PID namespace + + # Memory-intensive workload - shares PID namespace with cpu-stress + mem-stress: + image: alexeiled/stress-ng:latest + command: + - --vm + - "2" + - --vm-bytes + - "64M" + - --timeout + - "300s" + - --metrics-brief + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + pid: "service:cpu-stress" + depends_on: + - cpu-stress + + # IO workload - continuous disk writes + io-stress: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "IO stress started" + # 'dd' will be our identifiable process + while true; do + dd if=/dev/zero of=/tmp/testfile bs=1M count=100 2>/dev/null + rm -f /tmp/testfile + done + deploy: + resources: + limits: + cpus: "0.5" + memory: 128M + pid: "service:cpu-stress" + depends_on: + - cpu-stress + + # Resource collector - pushes to receiver on shutdown + collector: + build: + context: ../.. + dockerfile: Dockerfile + target: collector + command: + - --interval=2s + - --top=10 + - --log-format=json + - --push-endpoint=http://receiver:8080/api/v1/metrics + environment: + # Push token — pass via COLLECTOR_PUSH_TOKEN from host env + COLLECTOR_PUSH_TOKEN: "${COLLECTOR_PUSH_TOKEN}" + # Execution context for the receiver + GITHUB_REPOSITORY_OWNER: "test-org" + GITHUB_REPOSITORY: "test-org/stress-test" + GITHUB_WORKFLOW: "stress-test-workflow" + GITHUB_JOB: "heavy-workload" + GITHUB_RUN_ID: "stress-run-001" + # Cgroup configuration + # stress-ng-cpu is the worker process name for CPU stress + # stress-ng-vm is the worker process name for memory stress + CGROUP_PROCESS_MAP: '{"stress-ng-cpu":"cpu-stress","stress-ng-vm":"mem-stress","dd":"io-stress","resource-collec":"collector"}' + CGROUP_LIMITS: '{"cpu-stress":{"cpu":"1","memory":"128Mi"},"mem-stress":{"cpu":"500m","memory":"256Mi"},"io-stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"200m","memory":"64Mi"}}' + deploy: + resources: + limits: + cpus: "0.2" + memory: 64M + pid: "service:cpu-stress" + depends_on: + receiver: + condition: service_healthy + cpu-stress: + condition: service_started + +volumes: + receiver-data: diff --git a/test/docker/docker-compose.yaml b/test/docker/docker-compose.yaml new file mode 100644 index 0000000..87a9a9a --- /dev/null +++ b/test/docker/docker-compose.yaml @@ -0,0 +1,81 @@ +# Docker Compose test setup for cgroup grouping verification +# Run with: docker compose -f test/docker/docker-compose.yaml up +# +# NOTE: Docker Compose doesn't have a direct equivalent to K8s shareProcessNamespace. +# Options: +# 1. pid: "host" - sees ALL host processes (not container-specific) +# 2. 
pid: "service:" - chains PID namespace to another service +# +# For proper testing, use Kubernetes or run containers manually with --pid=container: + +services: + # Simulate a runner workload (this will be the "root" of the shared PID namespace) + # Uses 'cat' reading from a fifo as a unique identifiable process + runner: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Runner started (PID 1 in namespace)" + mkfifo /tmp/runner_fifo + # 'cat' will be our identifiable runner process (blocks on fifo) + cat /tmp/runner_fifo & + CAT_PID=$! + # Generate CPU load with dd + while true; do + dd if=/dev/zero of=/dev/null bs=1M count=50 2>/dev/null + done + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + # This container owns the PID namespace + + # Simulate a sidecar service - shares PID namespace with runner + sidecar: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Sidecar started" + # List processes to verify shared namespace + ps aux + while true; do + sleep 10 + done + deploy: + resources: + limits: + cpus: "0.1" + memory: 128M + pid: "service:runner" # Share PID namespace with runner + depends_on: + - runner + + # Resource collector - shares PID namespace with runner + collector: + build: + context: ../.. + dockerfile: Dockerfile + target: collector + command: + - --interval=3s + - --top=5 + - --log-format=json + environment: + # Map unique process names to container names + # 'cat' runs only in runner, 'sleep' runs only in sidecar + CGROUP_PROCESS_MAP: '{"cat":"runner","sleep":"sidecar","resource-collec":"collector"}' + CGROUP_LIMITS: '{"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' + deploy: + resources: + limits: + cpus: "0.1" + memory: 64M + pid: "service:runner" # Share PID namespace with runner + depends_on: + - runner + - sidecar diff --git a/test/k8s/test-cgroup-grouping.yaml b/test/k8s/test-cgroup-grouping.yaml new file mode 100644 index 0000000..e46545b --- /dev/null +++ b/test/k8s/test-cgroup-grouping.yaml @@ -0,0 +1,148 @@ +# Test manifest to verify cgroup grouping behavior +# This pod runs multiple containers with different resource limits +# and a collector sidecar that groups metrics by cgroup/container +apiVersion: v1 +kind: Pod +metadata: + name: test-cgroup-grouping + labels: + app: test-cgroup-grouping +spec: + # Share PID namespace so collector can see all processes + shareProcessNamespace: true + + containers: + # Main workload container - simulates a runner + - name: runner + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Runner container started" + # Simulate some CPU work + while true; do + dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null + sleep 1 + done + resources: + requests: + cpu: "100m" + memory: "64Mi" + limits: + cpu: "500m" + memory: "256Mi" + + # Sidecar container - simulates nginx or another service + - name: sidecar + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Sidecar container started" + # Simulate some lighter work + while true; do + sleep 5 + done + resources: + requests: + cpu: "50m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "128Mi" + + # Resource collector sidecar + - name: collector + image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image + args: + - --interval=5s + - --top=3 + env: + # Map process names to container names + # "sh" is the main process in busybox containers + # You may need to adjust based on actual process 
names + - name: CGROUP_PROCESS_MAP + value: | + {"sh":"runner","sleep":"sidecar","resource-collec":"collector"} + # Define limits for each container (must match names in CGROUP_PROCESS_MAP) + - name: CGROUP_LIMITS + value: | + {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}} + resources: + requests: + cpu: "50m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + # Mount proc read-only for process discovery + volumeMounts: + - name: proc + mountPath: /proc + readOnly: true + + volumes: + - name: proc + hostPath: + path: /proc + type: Directory + + restartPolicy: Never +--- +# Alternative: a simpler stress-based Pod (wrap it in a Deployment for longer-running tests) +apiVersion: v1 +kind: Pod +metadata: + name: test-cgroup-simple + labels: + app: test-cgroup-simple +spec: + shareProcessNamespace: true + + containers: + # Stress container to generate CPU/memory load + - name: stress + image: progrium/stress:latest + args: + - --cpu + - "1" + - --vm + - "1" + - --vm-bytes + - "64M" + - --timeout + - "300s" + resources: + limits: + cpu: "500m" + memory: "128Mi" + + # Collector + - name: collector + image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image + args: + - --interval=2s + - --top=5 + env: + - name: CGROUP_PROCESS_MAP + value: '{"stress":"stress","resource-collec":"collector"}' + - name: CGROUP_LIMITS + value: '{"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' + resources: + limits: + cpu: "100m" + memory: "64Mi" + volumeMounts: + - name: proc + mountPath: /proc + readOnly: true + + volumes: + - name: proc + hostPath: + path: /proc + type: Directory + + restartPolicy: Never diff --git a/test/local-test.sh b/test/local-test.sh new file mode 100755 index 0000000..d56f477 --- /dev/null +++ b/test/local-test.sh +#!/bin/bash +# Local test script to verify cgroup grouping +# Run from project root: ./test/local-test.sh + +set -e + +echo "Building collector..." +mkdir -p bin +go build -o bin/collector ./cmd/collector + +echo "" +echo "Testing cgroup parsing on current system..." +echo "Current process cgroup:" +cat /proc/self/cgroup 2>/dev/null || echo "Cannot read /proc/self/cgroup (expected on macOS)" + +echo "" +echo "Running collector for 10 seconds with cgroup grouping..." +echo "Press Ctrl+C to stop early" +echo "" + +# Set up test environment variables +# Map common process names to container names +export CGROUP_PROCESS_MAP='{"bash":"shell","collector":"collector","zsh":"shell"}' +export CGROUP_LIMITS='{"shell":{"cpu":"2","memory":"4Gi"},"collector":{"cpu":"1","memory":"1Gi"}}' + +# Run collector +timeout 10 ./bin/collector \ + --interval=2s \ + --top=5 \ + --log-format=json \ + 2>/dev/null || true + +echo "" +echo "Test complete!" +echo "" +echo "Note: On macOS, cgroup paths will be empty since cgroups are a Linux feature." +echo "To test properly, run in a Linux container or VM."
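For reference, a minimal sketch of the push-token workflow that the stress compose above assumes. The `POST /api/v1/token` path comes from the comments in that file; the auth header, request body, and response field name are assumptions and may not match the receiver's actual API.

```bash
# Hypothetical sketch — request/response fields are assumed, not confirmed.
# 1. Start the receiver (exposed on host port 9080)
docker compose -f test/docker/docker-compose-stress.yaml up -d receiver

# 2. Mint a scoped push token (payload and auth are assumptions)
export COLLECTOR_PUSH_TOKEN=$(curl -s -X POST http://localhost:9080/api/v1/token \
  -H "Authorization: Bearer dummyreadtoken" \
  -H "Content-Type: application/json" \
  -d '{"org":"test-org","repo":"test-org/stress-test"}' | jq -r '.token')

# 3. Start the rest of the stack; the collector reads COLLECTOR_PUSH_TOKEN from the host env
docker compose -f test/docker/docker-compose-stress.yaml up -d

# 4. Trigger the summary push by stopping the collector gracefully
docker compose -f test/docker/docker-compose-stress.yaml stop collector
```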