diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 661607b..de03c63 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -14,6 +14,7 @@ builds: - CGO_ENABLED=0 goos: - linux + - darwin goarch: - amd64 - arm64 diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index f9d6972..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,94 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Build and Development Commands - -```bash -# Build -make build # Build the collector binary -go build -o collector ./cmd/collector -go build -o receiver ./cmd/receiver - -# Test -make test # Run all tests -go test -v ./... # Run all tests with verbose output -go test -v ./internal/collector/... # Run tests for a specific package -make test-coverage # Run tests with coverage report - -# Code Quality -make fmt # Format code -make vet # Run go vet -make lint # Run golangci-lint (v2.6.2) -make all # Format, vet, lint, and build - -# Git Hooks -make install-hooks # Install pre-commit and commit-msg hooks -``` - -## Architecture Overview - -This is a Go metrics collector designed for CI/CD environments with shared PID namespaces. It consists of two binaries: - -### Collector (`cmd/collector`) -Runs alongside CI workloads, periodically reads `/proc` filesystem, and pushes a summary to the receiver on shutdown (SIGINT/SIGTERM). - -**Data Flow:** -1. `metrics.Aggregator` reads `/proc` to collect CPU/memory for all processes -2. `collector.Collector` orchestrates collection at intervals and writes to output -3. `summary.Accumulator` tracks samples across the run, computing peak/avg/percentiles -4. On shutdown, `summary.PushClient` sends the summary to the receiver HTTP endpoint - -### Receiver (`cmd/receiver`) -HTTP service that stores metric summaries in SQLite (via GORM) and provides a query API. - -**Key Endpoints:** -- `POST /api/v1/metrics` - Receive metrics from collectors -- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` - Query stored metrics - -### Internal Packages - -| Package | Purpose | -|---------|---------| -| `internal/collector` | Orchestrates collection loop, handles shutdown | -| `internal/metrics` | Aggregates system/process metrics from /proc | -| `internal/proc` | Low-level /proc parsing (stat, status, cgroup) | -| `internal/cgroup` | Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP env vars | -| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | -| `internal/receiver` | HTTP handlers and SQLite store | -| `internal/output` | Metrics output formatting (JSON/text) | - -### Container Metrics - -The collector groups processes by container using cgroup paths. Configuration via environment variables: -- `CGROUP_PROCESS_MAP`: JSON mapping process names to container names (e.g., `{"node":"runner"}`) -- `CGROUP_LIMITS`: JSON with CPU/memory limits per container for percentage calculations - -CPU values in container metrics are reported as **cores** (not percentage), enabling direct comparison with Kubernetes resource limits. - -## Commit Message Convention - -Uses conventional commits enforced by git hook: -``` -()?: -``` - -Types: `feat`, `fix`, `chore`, `docs`, `style`, `refactor`, `perf`, `test`, `build`, `ci` - -Examples: -- `feat: add user authentication` -- `fix(collector): handle nil cgroup paths` -- `feat!: breaking change in API` - -## Running Locally - -```bash -# Run receiver -./receiver --addr=:8080 --db=metrics.db - -# Run collector with push endpoint -./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics - -# Docker Compose stress test -docker compose -f test/docker/docker-compose-stress.yaml up -d -``` diff --git a/Dockerfile b/Dockerfile index 75f7b7f..64a266f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,37 +1,16 @@ -FROM golang:1.25-alpine AS builder-base +FROM golang:1.25-alpine AS builder WORKDIR /app -COPY go.mod go.sum ./ +COPY go.mod ./ RUN go mod download COPY . . -# Collector build (no CGO needed) -FROM builder-base AS builder-collector - RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /resource-collector ./cmd/collector -# Receiver build (CGO needed for SQLite) -FROM builder-base AS builder-receiver +FROM alpine:3.19 -RUN apk add --no-cache gcc musl-dev -RUN CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o /metrics-receiver ./cmd/receiver - -# Collector image -FROM alpine:3.19 AS collector - -COPY --from=builder-collector /resource-collector /usr/local/bin/resource-collector +COPY --from=builder /resource-collector /usr/local/bin/resource-collector ENTRYPOINT ["/usr/local/bin/resource-collector"] - -# Receiver image -FROM alpine:3.19 AS receiver - -RUN apk add --no-cache sqlite-libs - -COPY --from=builder-receiver /metrics-receiver /usr/local/bin/metrics-receiver - -EXPOSE 8080 - -ENTRYPOINT ["/usr/local/bin/metrics-receiver"] diff --git a/README.md b/README.md deleted file mode 100644 index abd58e9..0000000 --- a/README.md +++ /dev/null @@ -1,243 +0,0 @@ -# Forgejo Runner Resource Collector - -A lightweight metrics collector for CI/CD workloads in shared PID namespace environments. Reads `/proc` to collect CPU and memory metrics, groups them by container/cgroup, and pushes run summaries to a receiver service for storage and querying. - -## Architecture - -The system has two independent binaries: - -``` -┌─────────────────────────────────────────────┐ ┌──────────────────────────┐ -│ CI/CD Pod (shared PID namespace) │ │ Receiver Service │ -│ │ │ │ -│ ┌───────────┐ ┌────────┐ ┌───────────┐ │ │ POST /api/v1/metrics │ -│ │ collector │ │ runner │ │ sidecar │ │ │ │ │ -│ │ │ │ │ │ │ │ │ ▼ │ -│ │ reads │ │ │ │ │ │ push │ ┌────────────┐ │ -│ │ /proc for │ │ │ │ │ │──────▶│ │ SQLite │ │ -│ │ all PIDs │ │ │ │ │ │ │ └────────────┘ │ -│ └───────────┘ └────────┘ └───────────┘ │ │ │ │ -│ │ │ ▼ │ -└─────────────────────────────────────────────┘ │ GET /api/v1/metrics/... │ - └──────────────────────────┘ -``` - -### Collector - -Runs as a sidecar alongside CI workloads. On a configurable interval, it reads `/proc` to collect CPU and memory for all visible processes, groups them by container using cgroup paths, and accumulates samples. On shutdown (SIGINT/SIGTERM), it computes run-level statistics (peak, avg, percentiles) and pushes a single summary to the receiver. - -```bash -./collector --interval=2s --top=10 --push-endpoint=http://receiver:8080/api/v1/metrics -``` - -**Flags:** `--interval`, `--proc-path`, `--log-level`, `--log-format`, `--top`, `--push-endpoint`, `--push-token` - -**Environment variables:** - -| Variable | Description | Example | -| ------------------------- | ------------------------------------- | ------------------- | -| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | -| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | -| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | -| `GITHUB_JOB` | Job name | `build` | -| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | -| `COLLECTOR_PUSH_TOKEN` | Bearer token for push endpoint auth | — | -| `CGROUP_PROCESS_MAP` | JSON: process name → container name | `{"node":"runner"}` | -| `CGROUP_LIMITS` | JSON: per-container CPU/memory limits | See below | - -**CGROUP_LIMITS example:** - -```json -{ - "runner": { "cpu": "2", "memory": "1Gi" }, - "sidecar": { "cpu": "500m", "memory": "256Mi" } -} -``` - -CPU supports Kubernetes notation (`"2"` = 2 cores, `"500m"` = 0.5 cores). Memory supports `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal). - -### Receiver - -HTTP service that stores metric summaries in SQLite (via GORM) and exposes a query API. - -```bash -./receiver --addr=:8080 --db=metrics.db --read-token=my-secret-token --hmac-key=my-hmac-key -``` - -**Flags:** - -| Flag | Environment Variable | Description | Default | -| -------------- | --------------------- | ----------------------------------------------------- | ------------ | -| `--addr` | — | HTTP listen address | `:8080` | -| `--db` | — | SQLite database path | `metrics.db` | -| `--read-token` | `RECEIVER_READ_TOKEN` | Pre-shared token for read/admin endpoints (required) | — | -| `--hmac-key` | `RECEIVER_HMAC_KEY` | Secret key for push token generation/validation (required) | — | - -**Endpoints:** - -- `POST /api/v1/metrics` — receive and store a metric summary (requires scoped push token) -- `POST /api/v1/token` — generate a scoped push token (requires read token auth) -- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics (requires read token auth) - -**Authentication:** - -All metrics endpoints require authentication via `--read-token`: - -- The GET endpoint requires a Bearer token matching the read token -- The POST metrics endpoint requires a scoped push token (generated via `POST /api/v1/token`) -- The token endpoint itself requires the read token - -**Token flow:** - -```bash -# 1. Admin generates a scoped push token using the read token -curl -X POST http://localhost:8080/api/v1/token \ - -H "Authorization: Bearer my-secret-token" \ - -H "Content-Type: application/json" \ - -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}' -# → {"token":""} - -# 2. Collector uses the scoped token to push metrics -./collector --push-endpoint=http://localhost:8080/api/v1/metrics \ - --push-token= - -# 3. Query metrics with the read token -curl -H "Authorization: Bearer my-secret-token" \ #gitleaks:allow - http://localhost:8080/api/v1/metrics/repo/my-org/my-repo/ci.yml/build -``` - -Push tokens are HMAC-SHA256 digests derived from `--hmac-key` and the scope (org/repo/workflow/job). They are stateless — no database storage is needed. The HMAC key is separate from the read token so that compromising a push token does not expose the admin credential. - -## How Metrics Are Collected - -The collector reads `/proc/[pid]/stat` for every visible process to get CPU ticks (`utime` + `stime`) and `/proc/[pid]/status` for memory (RSS). It takes two samples per interval and computes the delta to derive CPU usage rates. - -Processes are grouped into containers by reading `/proc/[pid]/cgroup` and matching cgroup paths against the `CGROUP_PROCESS_MAP`. This is necessary because in shared PID namespace pods, `/proc/stat` only shows host-level aggregates — per-container metrics must be built up from individual process data. - -Container CPU is reported in **cores** (not percentage) for direct comparison with Kubernetes resource limits. System-level CPU is reported as a percentage (0-100%). - -Over the course of a run, the `summary.Accumulator` tracks every sample and on shutdown computes: - -| Stat | Description | -| -------------------------- | ------------------------------ | -| `peak` | Maximum observed value | -| `p99`, `p95`, `p75`, `p50` | Percentiles across all samples | -| `avg` | Arithmetic mean | - -These stats are computed for CPU, memory, and per-container metrics. - -## API Response - -``` -GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build -``` - -```json -[ - { - "id": 1, - "organization": "my-org", - "repository": "my-org/my-repo", - "workflow": "ci.yml", - "job": "build", - "run_id": "run-123", - "received_at": "2026-02-06T14:30:23.056Z", - "payload": { - "start_time": "2026-02-06T14:30:02.185Z", - "end_time": "2026-02-06T14:30:22.190Z", - "duration_seconds": 20.0, - "sample_count": 11, - "cpu_total_percent": { "peak": ..., "avg": ..., "p50": ... }, - "mem_used_bytes": { "peak": ..., "avg": ... }, - "containers": [ - { - "name": "runner", - "cpu_cores": { "peak": 2.007, "avg": 1.5, "p50": 1.817, "p95": 2.004 }, - "memory_bytes": { "peak": 18567168, "avg": 18567168 } - } - ], - "top_cpu_processes": [ ... ], - "top_mem_processes": [ ... ] - } - } -] -``` - -**CPU metric distinction:** - -- `cpu_total_percent` — system-wide, 0-100% -- `cpu_cores` (containers) — cores used (e.g. `2.0` = two full cores) -- `peak_cpu_percent` (processes) — per-process, where 100% = 1 core - -All memory values are in **bytes**. - -## Running - -### Docker Compose - -```bash -# Start the receiver (builds image if needed): -docker compose -f test/docker/docker-compose-stress.yaml up -d --build receiver - -# Generate a scoped push token for the collector: -PUSH_TOKEN=$(curl -s -X POST http://localhost:9080/api/v1/token \ - -H "Authorization: Bearer dummyreadtoken" \ - -H "Content-Type: application/json" \ - -d '{"organization":"test-org","repository":"test-org/stress-test","workflow":"stress-test-workflow","job":"heavy-workload"}' \ - | jq -r .token) - -# Start the collector and stress workloads with the push token: -COLLECTOR_PUSH_TOKEN=$PUSH_TOKEN \ - docker compose -f test/docker/docker-compose-stress.yaml up -d --build collector - -# ... Wait for data collection ... - -# Trigger shutdown summary: -docker compose -f test/docker/docker-compose-stress.yaml stop collector - -# Query results with the read token: -curl -H "Authorization: Bearer dummyreadtoken" \ - http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload -``` - -### Local - -```bash -go build -o collector ./cmd/collector -go build -o receiver ./cmd/receiver - -# Start receiver with both keys: -./receiver --addr=:8080 --db=metrics.db \ - --read-token=my-secret-token --hmac-key=my-hmac-key - -# Generate a scoped push token: -PUSH_TOKEN=$(curl -s -X POST http://localhost:8080/api/v1/token \ - -H "Authorization: Bearer my-secret-token" \ - -H "Content-Type: application/json" \ - -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}' \ - | jq -r .token) - -# Run collector with the push token: -./collector --interval=2s --top=10 \ - --push-endpoint=http://localhost:8080/api/v1/metrics \ - --push-token=$PUSH_TOKEN -``` - -## Internal Packages - -| Package | Purpose | -| -------------------- | ------------------------------------------------------------------- | -| `internal/proc` | Low-level `/proc` parsing (stat, status, cgroup) | -| `internal/metrics` | Aggregates process metrics from `/proc` into system/container views | -| `internal/cgroup` | Parses `CGROUP_PROCESS_MAP` and `CGROUP_LIMITS` env vars | -| `internal/collector` | Orchestrates the collection loop and shutdown | -| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | -| `internal/receiver` | HTTP handlers and SQLite store | -| `internal/output` | Metrics output formatting (JSON/text) | - -## Background - -Technical reference on the Linux primitives this project builds on: - -- [Identifying process cgroups by PID](docs/background/identify-process-cgroup-by-pid.md) — how to read `/proc//cgroup` to determine which container a process belongs to -- [/proc/stat behavior in containers](docs/background/proc-stat-in-containers.md) — why `/proc/stat` shows host-level data in containers, and how to aggregate per-process stats from `/proc/[pid]/stat` instead, including CPU tick conversion and cgroup limit handling diff --git a/cmd/collector/main.go b/cmd/collector/main.go index 7a88a85..8009586 100644 --- a/cmd/collector/main.go +++ b/cmd/collector/main.go @@ -12,7 +12,6 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/collector" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) const ( @@ -30,8 +29,6 @@ func main() { logLevel := flag.String("log-level", defaultLogLevel, "Log level: debug, info, warn, error") logFormat := flag.String("log-format", defaultLogFormat, "Output format: json, text") topN := flag.Int("top", defaultTopN, "Number of top processes to include") - pushEndpoint := flag.String("push-endpoint", "", "HTTP endpoint to push metrics to (e.g., http://localhost:8080/api/v1/metrics)") - pushToken := flag.String("push-token", os.Getenv("COLLECTOR_PUSH_TOKEN"), "Bearer token for push endpoint authentication (or set COLLECTOR_PUSH_TOKEN)") flag.Parse() // Setup structured logging for application logs @@ -56,25 +53,6 @@ func main() { TopN: *topN, }, metricsWriter, appLogger) - // Attach summary writer to emit run summary on shutdown - summaryWriter := summary.NewSummaryWriter(os.Stdout, *logFormat) - c.SetSummaryWriter(summaryWriter) - - // Setup push client if endpoint is configured - if *pushEndpoint != "" { - pushClient := summary.NewPushClient(*pushEndpoint, *pushToken) - c.SetPushClient(pushClient) - execCtx := pushClient.ExecutionContext() - appLogger.Info("push client configured", - slog.String("endpoint", *pushEndpoint), - slog.String("organization", execCtx.Organization), - slog.String("repository", execCtx.Repository), - slog.String("workflow", execCtx.Workflow), - slog.String("job", execCtx.Job), - slog.String("run_id", execCtx.RunID), - ) - } - // Setup signal handling for graceful shutdown ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/cmd/receiver/main.go b/cmd/receiver/main.go deleted file mode 100644 index 1379b53..0000000 --- a/cmd/receiver/main.go +++ /dev/null @@ -1,79 +0,0 @@ -package main - -import ( - "context" - "flag" - "fmt" - "log/slog" - "net/http" - "os" - "os/signal" - "syscall" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver" -) - -const ( - defaultAddr = ":8080" - defaultDBPath = "metrics.db" -) - -func main() { - addr := flag.String("addr", defaultAddr, "HTTP listen address") - dbPath := flag.String("db", defaultDBPath, "SQLite database path") - readToken := flag.String("read-token", os.Getenv("RECEIVER_READ_TOKEN"), "Pre-shared token for read endpoints (or set RECEIVER_READ_TOKEN)") - hmacKey := flag.String("hmac-key", os.Getenv("RECEIVER_HMAC_KEY"), "Secret key for push token generation/validation (or set RECEIVER_HMAC_KEY)") - flag.Parse() - - logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ - Level: slog.LevelInfo, - })) - - store, err := receiver.NewStore(*dbPath) - if err != nil { - logger.Error("failed to open database", slog.String("error", err.Error())) - os.Exit(1) - } - defer func() { _ = store.Close() }() - - handler := receiver.NewHandler(store, logger, *readToken, *hmacKey) - mux := http.NewServeMux() - handler.RegisterRoutes(mux) - - server := &http.Server{ - Addr: *addr, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) - - go func() { - sig := <-sigChan - logger.Info("received signal, shutting down", slog.String("signal", sig.String())) - cancel() - - shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) - defer shutdownCancel() - _ = server.Shutdown(shutdownCtx) - }() - - logger.Info("starting metrics receiver", - slog.String("addr", *addr), - slog.String("db", *dbPath), - ) - - if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { - fmt.Fprintf(os.Stderr, "error: %v\n", err) - os.Exit(1) - } - - <-ctx.Done() - logger.Info("receiver stopped gracefully") -} diff --git a/docs/background/identify-process-cgroup-by-pid.md b/docs/background/identify-process-cgroup-by-pid.md deleted file mode 100644 index 7d4734d..0000000 --- a/docs/background/identify-process-cgroup-by-pid.md +++ /dev/null @@ -1,38 +0,0 @@ -# Identifying a Process's Cgroup by PID - -Read `/proc//cgroup` to find which cgroup (and therefore which container) a process belongs to. - -## /proc/PID/cgroup - -```bash -cat /proc//cgroup -``` - -Shows all cgroup controllers the process belongs to: -``` -12:blkio:/user.slice -11:memory:/user.slice/user-1000.slice -... -0::/user.slice/user-1000.slice/session-1.scope -``` - -On cgroup v2, the path after `::` is the cgroup path under `/sys/fs/cgroup/`. - -## Other Methods - -```bash -# ps format options -ps -o pid,cgroup -p - -# systemd systems -systemd-cgls --unit -systemd-cgls # whole tree -``` - -## Quick One-Liners - -```bash -cat /proc/self/cgroup # current shell -cat /proc/$$/cgroup # also current shell -cat /proc/1234/cgroup # specific PID -``` diff --git a/docs/background/proc-stat-in-containers.md b/docs/background/proc-stat-in-containers.md deleted file mode 100644 index 9884b2a..0000000 --- a/docs/background/proc-stat-in-containers.md +++ /dev/null @@ -1,238 +0,0 @@ -# /proc/stat behavior in containerised environments - -`/proc/stat` in containers shows **host-level** statistics, not container-specific data. To get container-aware CPU metrics when processes span multiple cgroups (e.g., sidecars sharing a PID namespace), aggregate `/proc/[pid]/stat` for all visible processes and use cgroup limits from `/sys/fs/cgroup` for normalization. - -## Why /proc/stat is wrong in containers - -`/proc/stat` reports host-wide values (CPU times, context switches, boot time, process count) because `/proc` is mounted from the host kernel, which has no namespace awareness for these metrics. - -This means: -- Tools reading `/proc/stat` (top, htop, etc.) show **host** CPU usage, not container usage -- Cgroup CPU limits (e.g., 2 CPUs) are not reflected — all host CPUs are visible -- In shared environments, containers see each other's aggregate impact - -### Alternatives - -| Approach | Description | -|----------|-------------| -| **cgroups** | Read `/sys/fs/cgroup/cpu/` for container-specific CPU accounting | -| **LXCFS** | FUSE filesystem providing container-aware `/proc` files | -| **Container runtimes** | Some (like Kata) use VMs with isolated kernels | -| **Metrics APIs** | Docker/Kubernetes APIs instead of `/proc/stat` | - -```bash -# cgroups v2: -cat /sys/fs/cgroup/cpu.stat - -# cgroups v1: -cat /sys/fs/cgroup/cpu/cpuacct.usage -``` - -## Aggregating per-Process CPU from /proc/[pid]/stat - -When cgroup-level reads aren't an option (sidecars sharing PID namespace with different cgroups), aggregate individual process stats: - -```bash -# Fields 14 (utime) and 15 (stime) in /proc/[pid]/stat -for pid in /proc/[0-9]*; do - awk '{print $14 + $15}' "$pid/stat" 2>/dev/null -done | awk '{sum += $1} END {print sum}' -``` - -### Caveats - -1. **Race conditions** — processes can spawn/die between reads -2. **Short-lived processes** — missed if they start and exit between samples -3. **Zombie/exited processes** — their CPU time may not be captured -4. **Overhead** — scanning all PIDs repeatedly is expensive -5. **Namespace visibility** — you only see processes in your PID namespace (which is what you want) -6. **Children accounting** — when a process exits, its CPU time is added to the parent's `cutime`/`cstime`, risking double-counting - -Cgroups handle these edge cases natively, but **cannot be used when sidecars share the PID namespace with different cgroups** — in that case, per-process aggregation is the best option. - -## Parent/Child Process Relationships - -Field 4 in `/proc/[pid]/stat` is the PPID (parent process ID): - -```bash -awk '{print $4}' /proc/1234/stat # PPID from stat -grep PPid /proc/1234/status # more readable -``` - -### Building a Process Tree - -```bash -#!/bin/bash -declare -A parent_of children_of - -for stat in /proc/[0-9]*/stat; do - if read -r line < "$stat" 2>/dev/null; then - pid="${stat#/proc/}"; pid="${pid%/stat}" - rest="${line##*) }"; read -ra fields <<< "$rest" - ppid="${fields[1]}" # 4th field overall = index 1 after state - parent_of[$pid]=$ppid - children_of[$ppid]+="$pid " - fi -done - -print_tree() { - local pid=$1 indent=$2 - echo "${indent}${pid}" - for child in ${children_of[$pid]}; do print_tree "$child" " $indent"; done -} -print_tree 1 "" -``` - -### Avoiding Double-Counting with cutime/cstime - -Only sum `utime` + `stime` per process. The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on — those children no longer exist in `/proc`, so their time is only accessible via the parent. - -```bash -#!/bin/bash -declare -A utime stime - -for stat in /proc/[0-9]*/stat; do - if read -r line < "$stat" 2>/dev/null; then - pid="${stat#/proc/}"; pid="${pid%/stat}" - rest="${line##*) }"; read -ra f <<< "$rest" - utime[$pid]="${f[11]}"; stime[$pid]="${f[12]}" - # cutime=${f[13]} cstime=${f[14]} — don't sum these - fi -done - -total=0 -for pid in "${!utime[@]}"; do ((total += utime[$pid] + stime[$pid])); done -echo "Total CPU ticks: $total" -echo "Seconds: $(echo "scale=2; $total / $(getconf CLK_TCK)" | bc)" -``` - -## Converting Ticks to CPU Percentages - -CPU percentage is a rate — you need **two samples** over a time interval. - -``` -CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100 -``` - -- `delta_ticks` = difference in (utime + stime) between samples -- `CLK_TCK` = ticks per second (usually 100, get via `getconf CLK_TCK`) -- `num_cpus` = number of CPUs (omit for per-core percentage) - -| Style | Formula | Example | -|-------|---------|---------| -| **Normalized** (0-100%) | `delta / (elapsed * CLK_TCK * num_cpus) * 100` | 50% = half of total capacity | -| **Cores-style** (0-N*100%) | `delta / (elapsed * CLK_TCK) * 100` | 200% = 2 full cores busy | - -### Sampling Script - -```bash -#!/bin/bash -CLK_TCK=$(getconf CLK_TCK) -NUM_CPUS=$(nproc) - -get_total_ticks() { - local total=0 - for stat in /proc/[0-9]*/stat; do - if read -r line < "$stat" 2>/dev/null; then - rest="${line##*) }"; read -ra f <<< "$rest" - ((total += f[11] + f[12])) - fi - done - echo "$total" -} - -ticks1=$(get_total_ticks); time1=$(date +%s.%N) -sleep 1 -ticks2=$(get_total_ticks); time2=$(date +%s.%N) - -delta_ticks=$((ticks2 - ticks1)) -elapsed=$(echo "$time2 - $time1" | bc) - -pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK * $NUM_CPUS)) * 100" | bc) -echo "CPU usage: ${pct}% of ${NUM_CPUS} CPUs" - -cores_pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK)) * 100" | bc) -echo "CPU usage: ${cores_pct}% (cores-style)" -``` - -## Respecting Cgroup CPU Limits - -The above calculations use `nproc`, which returns the **host** CPU count. If a container is limited to 2 CPUs on an 8-CPU host, `nproc` returns 8 and the percentage is misleading. - -### Reading Effective CPU Limit - -```bash -#!/bin/bash -get_effective_cpus() { - # cgroups v2 - if [[ -f /sys/fs/cgroup/cpu.max ]]; then - read quota period < /sys/fs/cgroup/cpu.max - [[ "$quota" != "max" ]] && echo "scale=2; $quota / $period" | bc && return - fi - # cgroups v1 - if [[ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ]]; then - quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us) - period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us) - [[ "$quota" != "-1" ]] && echo "scale=2; $quota / $period" | bc && return - fi - nproc # fallback -} -``` - -Also check cpuset limits (`cpuset.cpus.effective` for v2, `cpuset/cpuset.cpus` for v1) which restrict which physical CPUs are available. - -### Shared PID Namespace Complication - -When sidecars share a PID namespace but have different cgroups, there's no single "correct" CPU limit for normalization. Options: - -1. **Use host CPU count** — percentage of total host capacity -2. **Sum the limits** — if you know each sidecar's cgroup, sum their quotas -3. **Report in cores** — skip normalization, show `1.5 cores used` instead of percentage - -## Reading Cgroup Limits for Other Containers - -Every process exposes its cgroup membership via `/proc//cgroup`. If the cgroup filesystem is mounted, you can read any container's limits: - -```bash -get_cgroup_cpu_limit() { - local pid=$1 - # cgroups v2 - cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) - if [[ -n "$cgroup_path" ]]; then - limit_file="/sys/fs/cgroup${cgroup_path}/cpu.max" - if [[ -r "$limit_file" ]]; then - read quota period < "$limit_file" - [[ "$quota" == "max" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc - return - fi - fi - # cgroups v1 - cgroup_path=$(grep -oP 'cpu.*:\K.*' /proc/$pid/cgroup 2>/dev/null) - if [[ -n "$cgroup_path" ]]; then - quota_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_quota_us" - period_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_period_us" - if [[ -r "$quota_file" ]]; then - quota=$(cat "$quota_file"); period=$(cat "$period_file") - [[ "$quota" == "-1" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc - return - fi - fi - echo "unknown" -} -``` - -### Mount Visibility - -| Scenario | Can Read Other Cgroups? | -|----------|------------------------| -| Host system | Yes | -| Privileged container | Yes | -| `/sys/fs/cgroup` mounted read-only from host | Yes (common in Kubernetes) | -| Only own cgroup subtree mounted | No | - -### Fallbacks When Cgroups Aren't Accessible - -1. **Mount the cgroup fs** — volume mount `/sys/fs/cgroup:ro` -2. **Use a sidecar with access** — one privileged container does monitoring -3. **Accept "unknown" limits** — report raw ticks/cores instead of percentages -4. **Kubernetes Downward API** — inject limits as env vars (own container only) diff --git a/go.mod b/go.mod index 7ef0cd0..3b4c218 100644 --- a/go.mod +++ b/go.mod @@ -1,15 +1,3 @@ module edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector go 1.25.6 - -require ( - gorm.io/driver/sqlite v1.6.0 - gorm.io/gorm v1.31.1 -) - -require ( - github.com/jinzhu/inflection v1.0.0 // indirect - github.com/jinzhu/now v1.1.5 // indirect - github.com/mattn/go-sqlite3 v1.14.22 // indirect - golang.org/x/text v0.20.0 // indirect -) diff --git a/go.sum b/go.sum deleted file mode 100644 index 330dd09..0000000 --- a/go.sum +++ /dev/null @@ -1,12 +0,0 @@ -github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= -github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= -github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= -github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= -github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= -github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= -golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= -gorm.io/driver/sqlite v1.6.0 h1:WHRRrIiulaPiPFmDcod6prc4l2VGVWHz80KspNsxSfQ= -gorm.io/driver/sqlite v1.6.0/go.mod h1:AO9V1qIQddBESngQUKWL9yoH93HIeA1X6V633rBwyT8= -gorm.io/gorm v1.31.1 h1:7CA8FTFz/gRfgqgpeKIBcervUn3xSyPUmr6B2WXJ7kg= -gorm.io/gorm v1.31.1/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs= diff --git a/internal/cgroup/config.go b/internal/cgroup/config.go deleted file mode 100644 index 536f61b..0000000 --- a/internal/cgroup/config.go +++ /dev/null @@ -1,84 +0,0 @@ -// ABOUTME: Configuration types and parsing for cgroup limits and process mapping. -// ABOUTME: Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP environment variables. -package cgroup - -import ( - "encoding/json" - "fmt" - "os" -) - -// CgroupLimit holds the resource limits for a container/cgroup -type CgroupLimit struct { - CPUCores float64 // CPU limit in cores (e.g., 0.5 for "500m", 2.0 for "2") - MemoryBytes uint64 // Memory limit in bytes -} - -// CgroupLimits maps container names to their resource limits -type CgroupLimits map[string]CgroupLimit - -// ProcessMapping maps process names to container names (for cgroup path discovery) -type ProcessMapping map[string]string - -// CgroupPathMapping maps cgroup paths to container names (built at runtime) -type CgroupPathMapping map[string]string - -// rawLimitEntry is the JSON structure for each entry in CGROUP_LIMITS -type rawLimitEntry struct { - CPU string `json:"cpu"` - Memory string `json:"memory"` -} - -// ParseCgroupLimitsEnv parses the CGROUP_LIMITS environment variable. -// Expected format: {"container-name": {"cpu": "500m", "memory": "1Gi"}, ...} -func ParseCgroupLimitsEnv() (CgroupLimits, error) { - raw := os.Getenv("CGROUP_LIMITS") - if raw == "" { - return nil, nil // No limits configured - } - - var parsed map[string]rawLimitEntry - if err := json.Unmarshal([]byte(raw), &parsed); err != nil { - return nil, fmt.Errorf("parsing CGROUP_LIMITS: %w", err) - } - - limits := make(CgroupLimits) - for name, entry := range parsed { - var limit CgroupLimit - var err error - - if entry.CPU != "" { - limit.CPUCores, err = ParseCPU(entry.CPU) - if err != nil { - return nil, fmt.Errorf("parsing CPU for %q: %w", name, err) - } - } - - if entry.Memory != "" { - limit.MemoryBytes, err = ParseMemory(entry.Memory) - if err != nil { - return nil, fmt.Errorf("parsing memory for %q: %w", name, err) - } - } - - limits[name] = limit - } - - return limits, nil -} - -// ParseProcessMappingEnv parses the CGROUP_PROCESS_MAP environment variable. -// Expected format: {"process-name": "container-name", ...} -func ParseProcessMappingEnv() (ProcessMapping, error) { - raw := os.Getenv("CGROUP_PROCESS_MAP") - if raw == "" { - return nil, nil // No mapping configured - } - - var parsed map[string]string - if err := json.Unmarshal([]byte(raw), &parsed); err != nil { - return nil, fmt.Errorf("parsing CGROUP_PROCESS_MAP: %w", err) - } - - return ProcessMapping(parsed), nil -} diff --git a/internal/cgroup/parse.go b/internal/cgroup/parse.go deleted file mode 100644 index e7d1a92..0000000 --- a/internal/cgroup/parse.go +++ /dev/null @@ -1,96 +0,0 @@ -// ABOUTME: Parses Kubernetes-style resource notation for CPU and memory. -// ABOUTME: CPU: "500m" = 0.5 cores, "2" = 2 cores. -// ABOUTME: Memory: "1Gi" = 1 GiB, "512Mi" = 512 MiB, "1G" = 1 GB. -package cgroup - -import ( - "fmt" - "strconv" - "strings" -) - -// ParseCPU parses Kubernetes CPU notation to cores. -// Examples: "500m" => 0.5, "2" => 2.0, "100m" => 0.1, "2000m" => 2.0 -func ParseCPU(value string) (float64, error) { - value = strings.TrimSpace(value) - if value == "" { - return 0, fmt.Errorf("empty CPU value") - } - - // Handle millicores suffix - if strings.HasSuffix(value, "m") { - millis, err := strconv.ParseFloat(strings.TrimSuffix(value, "m"), 64) - if err != nil { - return 0, fmt.Errorf("parsing millicores: %w", err) - } - return millis / 1000.0, nil - } - - // Plain number means cores - cores, err := strconv.ParseFloat(value, 64) - if err != nil { - return 0, fmt.Errorf("parsing cores: %w", err) - } - - return cores, nil -} - -// ParseMemory parses Kubernetes memory notation to bytes. -// Supports: -// - Binary suffixes: Ki, Mi, Gi, Ti (powers of 1024) -// - Decimal suffixes: K, M, G, T (powers of 1000) -// - Plain numbers: bytes -func ParseMemory(value string) (uint64, error) { - value = strings.TrimSpace(value) - if value == "" { - return 0, fmt.Errorf("empty memory value") - } - - // Binary suffixes (powers of 1024) - binarySuffixes := map[string]uint64{ - "Ki": 1024, - "Mi": 1024 * 1024, - "Gi": 1024 * 1024 * 1024, - "Ti": 1024 * 1024 * 1024 * 1024, - } - - // Decimal suffixes (powers of 1000) - decimalSuffixes := map[string]uint64{ - "K": 1000, - "M": 1000 * 1000, - "G": 1000 * 1000 * 1000, - "T": 1000 * 1000 * 1000 * 1000, - } - - // Try binary suffixes first (2-char) - for suffix, multiplier := range binarySuffixes { - if strings.HasSuffix(value, suffix) { - numStr := strings.TrimSuffix(value, suffix) - num, err := strconv.ParseFloat(numStr, 64) - if err != nil { - return 0, fmt.Errorf("parsing memory value: %w", err) - } - return uint64(num * float64(multiplier)), nil - } - } - - // Try decimal suffixes (1-char) - for suffix, multiplier := range decimalSuffixes { - if strings.HasSuffix(value, suffix) { - numStr := strings.TrimSuffix(value, suffix) - num, err := strconv.ParseFloat(numStr, 64) - if err != nil { - return 0, fmt.Errorf("parsing memory value: %w", err) - } - return uint64(num * float64(multiplier)), nil - } - } - - // Plain number (bytes) - bytes, err := strconv.ParseUint(value, 10, 64) - if err != nil { - return 0, fmt.Errorf("parsing bytes: %w", err) - } - - return bytes, nil -} diff --git a/internal/cgroup/parse_test.go b/internal/cgroup/parse_test.go deleted file mode 100644 index 2451163..0000000 --- a/internal/cgroup/parse_test.go +++ /dev/null @@ -1,84 +0,0 @@ -package cgroup - -import ( - "testing" -) - -func TestParseCPU(t *testing.T) { - tests := []struct { - name string - input string - want float64 - wantErr bool - }{ - {"millicores 500m", "500m", 0.5, false}, - {"millicores 100m", "100m", 0.1, false}, - {"millicores 2000m", "2000m", 2.0, false}, - {"millicores 1m", "1m", 0.001, false}, - {"cores integer", "2", 2.0, false}, - {"cores decimal", "1.5", 1.5, false}, - {"cores with spaces", " 2 ", 2.0, false}, - {"empty string", "", 0, true}, - {"invalid format", "abc", 0, true}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseCPU(tt.input) - if (err != nil) != tt.wantErr { - t.Errorf("ParseCPU() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !tt.wantErr && got != tt.want { - t.Errorf("ParseCPU() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestParseMemory(t *testing.T) { - tests := []struct { - name string - input string - want uint64 - wantErr bool - }{ - // Binary suffixes (powers of 1024) - {"Ki", "1Ki", 1024, false}, - {"Mi", "1Mi", 1024 * 1024, false}, - {"Gi", "1Gi", 1024 * 1024 * 1024, false}, - {"Ti", "1Ti", 1024 * 1024 * 1024 * 1024, false}, - {"512Mi", "512Mi", 512 * 1024 * 1024, false}, - {"2Gi", "2Gi", 2 * 1024 * 1024 * 1024, false}, - - // Decimal suffixes (powers of 1000) - {"K", "1K", 1000, false}, - {"M", "1M", 1000000, false}, - {"G", "1G", 1000000000, false}, - {"T", "1T", 1000000000000, false}, - - // Plain bytes - {"bytes", "1024", 1024, false}, - {"large bytes", "1073741824", 1073741824, false}, - - // With spaces - {"with spaces", " 1Gi ", 1024 * 1024 * 1024, false}, - - // Error cases - {"empty", "", 0, true}, - {"invalid", "abc", 0, true}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseMemory(tt.input) - if (err != nil) != tt.wantErr { - t.Errorf("ParseMemory() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !tt.wantErr && got != tt.want { - t.Errorf("ParseMemory() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/internal/collector/collector.go b/internal/collector/collector.go index 393f1ae..cbc5683 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -8,7 +8,6 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) // Config holds the collector configuration @@ -20,36 +19,22 @@ type Config struct { // Collector orchestrates metric collection type Collector struct { - config Config - aggregator *metrics.Aggregator - writer output.Writer - logger *slog.Logger - accumulator *summary.Accumulator - summaryWriter *summary.SummaryWriter - pushClient *summary.PushClient + config Config + aggregator *metrics.Aggregator + writer output.Writer + logger *slog.Logger } // New creates a new collector func New(cfg Config, writer output.Writer, logger *slog.Logger) *Collector { return &Collector{ - config: cfg, - aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), - writer: writer, - logger: logger, - accumulator: summary.NewAccumulator(cfg.TopN), + config: cfg, + aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), + writer: writer, + logger: logger, } } -// SetSummaryWriter attaches a summary writer for emitting run summaries on shutdown -func (c *Collector) SetSummaryWriter(w *summary.SummaryWriter) { - c.summaryWriter = w -} - -// SetPushClient attaches a push client for sending summaries to the receiver -func (c *Collector) SetPushClient(p *summary.PushClient) { - c.pushClient = p -} - // Run starts the collector loop and blocks until context is cancelled func (c *Collector) Run(ctx context.Context) error { c.logger.Info("collector starting", @@ -70,7 +55,6 @@ func (c *Collector) Run(ctx context.Context) error { select { case <-ctx.Done(): c.logger.Info("collector stopping") - c.emitSummary(context.Background()) // Use fresh context for shutdown tasks return ctx.Err() case <-ticker.C: if err := c.collect(); err != nil { @@ -91,37 +75,9 @@ func (c *Collector) collect() error { return fmt.Errorf("writing metrics: %w", err) } - c.accumulator.Add(m) - return nil } -// emitSummary computes and writes the run summary if a writer is configured -func (c *Collector) emitSummary(ctx context.Context) { - s := c.accumulator.Summarize() - if s == nil { - c.logger.Info("no samples collected, skipping run summary") - return - } - - c.logger.Info("emitting run summary", - slog.Int("sample_count", s.SampleCount), - slog.Float64("duration_seconds", s.DurationSeconds), - ) - - if c.summaryWriter != nil { - c.summaryWriter.Write(s) - } - - if c.pushClient != nil { - if err := c.pushClient.Push(ctx, s); err != nil { - c.logger.Error("failed to push metrics", slog.String("error", err.Error())) - } else { - c.logger.Info("metrics pushed successfully") - } - } -} - // CollectOnce performs a single collection and returns the metrics func (c *Collector) CollectOnce() (*metrics.SystemMetrics, error) { return c.aggregator.Collect() diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go deleted file mode 100644 index 9251a51..0000000 --- a/internal/collector/collector_test.go +++ /dev/null @@ -1,98 +0,0 @@ -// ABOUTME: Tests for the collector's summary integration. -// ABOUTME: Validates that run summaries are emitted on shutdown and handles missing writer gracefully. -package collector - -import ( - "bytes" - "context" - "log/slog" - "strings" - "testing" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" -) - -func TestCollector_EmitsSummaryOnShutdown(t *testing.T) { - // Use testdata/proc as the proc filesystem - procPath := "testdata/proc" - - // Metrics output (regular collection output) - var metricsOut bytes.Buffer - metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ - Output: &metricsOut, - Format: output.LogFormatJSON, - Level: slog.LevelInfo, - }) - - // Summary output - var summaryOut bytes.Buffer - sw := summary.NewSummaryWriter(&summaryOut, "json") - - // Silence app logs - appLogger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) - - c := New(Config{ - ProcPath: procPath, - Interval: 50 * time.Millisecond, - TopN: 5, - }, metricsWriter, appLogger) - - c.SetSummaryWriter(sw) - - // Run collector briefly then cancel - ctx, cancel := context.WithCancel(context.Background()) - go func() { - // Let at least 2 collection cycles run - time.Sleep(150 * time.Millisecond) - cancel() - }() - - _ = c.Run(ctx) - - // Verify summary was emitted - summaryOutput := summaryOut.String() - if !strings.Contains(summaryOutput, "run_summary") { - t.Errorf("expected 'run_summary' in output, got: %s", summaryOutput) - } - if !strings.Contains(summaryOutput, "duration_seconds") { - t.Errorf("expected 'duration_seconds' in output, got: %s", summaryOutput) - } - if !strings.Contains(summaryOutput, "sample_count") { - t.Errorf("expected 'sample_count' in output, got: %s", summaryOutput) - } -} - -func TestCollector_NoSummaryWithoutWriter(t *testing.T) { - procPath := "testdata/proc" - - var metricsOut bytes.Buffer - metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ - Output: &metricsOut, - Format: output.LogFormatJSON, - Level: slog.LevelInfo, - }) - - appLogger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) - - c := New(Config{ - ProcPath: procPath, - Interval: 50 * time.Millisecond, - TopN: 5, - }, metricsWriter, appLogger) - - // Deliberately do NOT set a summary writer - - ctx, cancel := context.WithCancel(context.Background()) - go func() { - time.Sleep(100 * time.Millisecond) - cancel() - }() - - // Should not panic - err := c.Run(ctx) - if err != context.Canceled { - t.Errorf("expected context.Canceled, got: %v", err) - } -} diff --git a/internal/collector/testdata/proc/1/stat b/internal/collector/testdata/proc/1/stat deleted file mode 100644 index 01d6595..0000000 --- a/internal/collector/testdata/proc/1/stat +++ /dev/null @@ -1 +0,0 @@ -1 (init) S 0 1 1 0 -1 4194560 1000 0 0 0 100 50 0 0 20 0 1 0 1 10000000 500 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/internal/collector/testdata/proc/1/status b/internal/collector/testdata/proc/1/status deleted file mode 100644 index 0b4d9e3..0000000 --- a/internal/collector/testdata/proc/1/status +++ /dev/null @@ -1,14 +0,0 @@ -Name: init -Uid: 0 0 0 0 -Gid: 0 0 0 0 -VmPeak: 10000 kB -VmSize: 10000 kB -VmRSS: 5000 kB -VmData: 3000 kB -VmStk: 200 kB -VmExe: 100 kB -VmLib: 1000 kB -RssAnon: 3000 kB -RssFile: 1500 kB -RssShmem: 500 kB -Threads: 1 diff --git a/internal/collector/testdata/proc/cpuinfo b/internal/collector/testdata/proc/cpuinfo deleted file mode 100644 index 9703a4e..0000000 --- a/internal/collector/testdata/proc/cpuinfo +++ /dev/null @@ -1 +0,0 @@ -processor : 0 diff --git a/internal/collector/testdata/proc/meminfo b/internal/collector/testdata/proc/meminfo deleted file mode 100644 index 2993ff4..0000000 --- a/internal/collector/testdata/proc/meminfo +++ /dev/null @@ -1,5 +0,0 @@ -MemTotal: 16348500 kB -MemFree: 8000000 kB -MemAvailable: 12000000 kB -Buffers: 500000 kB -Cached: 3000000 kB diff --git a/internal/collector/testdata/proc/stat b/internal/collector/testdata/proc/stat deleted file mode 100644 index 513d56a..0000000 --- a/internal/collector/testdata/proc/stat +++ /dev/null @@ -1 +0,0 @@ -cpu 10000 500 3000 80000 200 50 30 0 0 0 diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go deleted file mode 100644 index 685d2b6..0000000 --- a/internal/integration/integration_test.go +++ /dev/null @@ -1,386 +0,0 @@ -// ABOUTME: Integration tests for collector and receiver interaction. -// ABOUTME: Tests that the push client can successfully send metrics to the receiver. -package integration - -import ( - "bytes" - "context" - "encoding/json" - "io" - "log/slog" - "net/http" - "net/http/httptest" - "path/filepath" - "testing" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver" - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" -) - -const ( - testReadToken = "integration-test-token" - testHMACKey = "integration-hmac-key" -) - -// setupTestReceiver creates a test receiver with SQLite storage, auth, and HTTP server -func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) { - t.Helper() - dbPath := filepath.Join(t.TempDir(), "test.db") - store, err := receiver.NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - - handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), testReadToken, testHMACKey) - mux := http.NewServeMux() - handler.RegisterRoutes(mux) - - server := httptest.NewServer(mux) - - cleanup := func() { - server.Close() - _ = store.Close() - } - - return store, server, cleanup -} - -// generatePushToken generates a scoped push token for an execution context -func generatePushToken(exec summary.ExecutionContext) string { - return receiver.GenerateScopedToken(testHMACKey, exec.Organization, exec.Repository, exec.Workflow, exec.Job) -} - -func TestPushClientToReceiver(t *testing.T) { - store, server, cleanup := setupTestReceiver(t) - defer cleanup() - - // Test execution context - testCtx := summary.ExecutionContext{ - Organization: "integration-org", - Repository: "integration-repo", - Workflow: "test.yml", - Job: "integration-test", - RunID: "run-integration-123", - } - - // Create a test summary - testSummary := &summary.RunSummary{ - StartTime: time.Now().Add(-time.Minute), - EndTime: time.Now(), - DurationSeconds: 60.0, - SampleCount: 10, - CPUTotal: summary.StatSummary{Peak: 85.5, Avg: 42.3, P95: 78.0}, - MemUsedBytes: summary.StatSummary{Peak: 4294967296, Avg: 2147483648, P95: 3865470566}, - MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, - TopCPUProcesses: []summary.ProcessPeak{ - {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, - }, - TopMemProcesses: []summary.ProcessPeak{ - {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, - }, - } - - // Build payload matching what push client sends - payload := struct { - Execution summary.ExecutionContext `json:"execution"` - Summary summary.RunSummary `json:"run_summary"` - }{ - Execution: testCtx, - Summary: *testSummary, - } - - body, err := json.Marshal(payload) - if err != nil { - t.Fatalf("Marshal() error = %v", err) - } - - // Send via HTTP client with scoped push token - pushToken := generatePushToken(testCtx) - req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) - if err != nil { - t.Fatalf("NewRequest() error = %v", err) - } - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+pushToken) - - resp, err := http.DefaultClient.Do(req) - if err != nil { - t.Fatalf("Do() error = %v", err) - } - defer func() { _ = resp.Body.Close() }() - - if resp.StatusCode != http.StatusCreated { - t.Errorf("status = %d, want %d", resp.StatusCode, http.StatusCreated) - } - - // Verify metrics were stored - metrics, err := store.GetMetricsByWorkflowJob("integration-org", "integration-repo", "test.yml", "integration-test") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - - if len(metrics) != 1 { - t.Fatalf("got %d metrics, want 1", len(metrics)) - } - - m := metrics[0] - if m.Organization != testCtx.Organization { - t.Errorf("Organization = %q, want %q", m.Organization, testCtx.Organization) - } - if m.Repository != testCtx.Repository { - t.Errorf("Repository = %q, want %q", m.Repository, testCtx.Repository) - } - if m.Workflow != testCtx.Workflow { - t.Errorf("Workflow = %q, want %q", m.Workflow, testCtx.Workflow) - } - if m.Job != testCtx.Job { - t.Errorf("Job = %q, want %q", m.Job, testCtx.Job) - } - if m.RunID != testCtx.RunID { - t.Errorf("RunID = %q, want %q", m.RunID, testCtx.RunID) - } - - // Verify payload was stored correctly - var storedSummary summary.RunSummary - if err := json.Unmarshal([]byte(m.Payload), &storedSummary); err != nil { - t.Fatalf("Unmarshal payload error = %v", err) - } - - if storedSummary.SampleCount != testSummary.SampleCount { - t.Errorf("SampleCount = %d, want %d", storedSummary.SampleCount, testSummary.SampleCount) - } - if storedSummary.CPUTotal.Peak != testSummary.CPUTotal.Peak { - t.Errorf("CPUTotal.Peak = %f, want %f", storedSummary.CPUTotal.Peak, testSummary.CPUTotal.Peak) - } -} - -func TestPushClientIntegration(t *testing.T) { - store, server, cleanup := setupTestReceiver(t) - defer cleanup() - - // Set environment variables for the push client - t.Setenv("GITHUB_REPOSITORY_OWNER", "push-client-org") - t.Setenv("GITHUB_REPOSITORY", "push-client-repo") - t.Setenv("GITHUB_WORKFLOW", "push-test.yml") - t.Setenv("GITHUB_JOB", "push-job") - t.Setenv("GITHUB_RUN_ID", "push-run-456") - - // Generate scoped push token - pushToken := receiver.GenerateScopedToken(testHMACKey, "push-client-org", "push-client-repo", "push-test.yml", "push-job") - - // Create push client with token - it reads execution context from env vars - pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", pushToken) - - // Verify execution context was read from env - ctx := pushClient.ExecutionContext() - if ctx.Organization != "push-client-org" { - t.Errorf("Organization = %q, want %q", ctx.Organization, "push-client-org") - } - - // Create and push a summary - testSummary := &summary.RunSummary{ - StartTime: time.Now().Add(-30 * time.Second), - EndTime: time.Now(), - DurationSeconds: 30.0, - SampleCount: 6, - CPUTotal: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, - MemUsedBytes: summary.StatSummary{Peak: 1000000, Avg: 500000, P95: 900000}, - MemUsedPercent: summary.StatSummary{Peak: 10.0, Avg: 5.0, P95: 9.0}, - } - - // Push the summary - err := pushClient.Push(context.Background(), testSummary) - if err != nil { - t.Fatalf("Push() error = %v", err) - } - - // Verify it was stored - metrics, err := store.GetMetricsByWorkflowJob("push-client-org", "push-client-repo", "push-test.yml", "push-job") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - - if len(metrics) != 1 { - t.Fatalf("got %d metrics, want 1", len(metrics)) - } - - if metrics[0].RunID != "push-run-456" { - t.Errorf("RunID = %q, want %q", metrics[0].RunID, "push-run-456") - } -} - -func TestMultiplePushes(t *testing.T) { - store, server, cleanup := setupTestReceiver(t) - defer cleanup() - - // Simulate multiple workflow runs pushing metrics via direct HTTP POST - runs := []summary.ExecutionContext{ - {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, - {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-2"}, - {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "test", RunID: "run-1"}, - {Organization: "org-a", Repository: "repo-2", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, - } - - for _, execCtx := range runs { - payload := struct { - Execution summary.ExecutionContext `json:"execution"` - Summary summary.RunSummary `json:"run_summary"` - }{ - Execution: execCtx, - Summary: summary.RunSummary{ - SampleCount: 5, - CPUTotal: summary.StatSummary{Peak: 50.0}, - }, - } - - body, err := json.Marshal(payload) - if err != nil { - t.Fatalf("Marshal() error = %v", err) - } - - pushToken := generatePushToken(execCtx) - req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) - if err != nil { - t.Fatalf("NewRequest() error = %v for run %+v", err, execCtx) - } - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+pushToken) - - resp, err := http.DefaultClient.Do(req) - if err != nil { - t.Fatalf("Do() error = %v for run %+v", err, execCtx) - } - _ = resp.Body.Close() - - if resp.StatusCode != http.StatusCreated { - t.Fatalf("status = %d, want %d for run %+v", resp.StatusCode, http.StatusCreated, execCtx) - } - } - - // Verify filtering works correctly - metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 2 { - t.Errorf("got %d metrics for org-a/repo-1/ci.yml/build, want 2", len(metrics)) - } - - metrics, err = store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "test") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 1 { - t.Errorf("got %d metrics for org-a/repo-1/ci.yml/test, want 1", len(metrics)) - } -} - -func TestPushClientWithTokenIntegration(t *testing.T) { - readToken := "integration-read-secret" - hmacKey := "integration-hmac-secret" - store, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) - defer cleanup() - - // Generate a scoped token via the API - tokenReqBody, _ := json.Marshal(map[string]string{ - "organization": "token-org", - "repository": "token-repo", - "workflow": "ci.yml", - "job": "build", - }) - tokenReq, _ := http.NewRequest(http.MethodPost, server.URL+"/api/v1/token", bytes.NewReader(tokenReqBody)) - tokenReq.Header.Set("Authorization", "Bearer "+readToken) - tokenReq.Header.Set("Content-Type", "application/json") - - tokenResp, err := http.DefaultClient.Do(tokenReq) - if err != nil { - t.Fatalf("token request error: %v", err) - } - defer func() { _ = tokenResp.Body.Close() }() - - if tokenResp.StatusCode != http.StatusOK { - t.Fatalf("token request status = %d, want %d", tokenResp.StatusCode, http.StatusOK) - } - - var tokenBody struct { - Token string `json:"token"` - } - if err := json.NewDecoder(tokenResp.Body).Decode(&tokenBody); err != nil { - t.Fatalf("decode token response: %v", err) - } - - // Use the scoped token to push metrics - t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") - t.Setenv("GITHUB_REPOSITORY", "token-repo") - t.Setenv("GITHUB_WORKFLOW", "ci.yml") - t.Setenv("GITHUB_JOB", "build") - t.Setenv("GITHUB_RUN_ID", "token-run-1") - - pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", tokenBody.Token) - - testSummary := &summary.RunSummary{ - StartTime: time.Now().Add(-10 * time.Second), - EndTime: time.Now(), - DurationSeconds: 10.0, - SampleCount: 2, - } - - if err := pushClient.Push(context.Background(), testSummary); err != nil { - t.Fatalf("Push() error = %v", err) - } - - // Verify stored - metrics, err := store.GetMetricsByWorkflowJob("token-org", "token-repo", "ci.yml", "build") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 1 { - t.Fatalf("got %d metrics, want 1", len(metrics)) - } - if metrics[0].RunID != "token-run-1" { - t.Errorf("RunID = %q, want %q", metrics[0].RunID, "token-run-1") - } -} - -func TestPushClientWithWrongTokenIntegration(t *testing.T) { - readToken := "integration-read-secret" - hmacKey := "integration-hmac-secret" - _, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) - defer cleanup() - - t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") - t.Setenv("GITHUB_REPOSITORY", "token-repo") - t.Setenv("GITHUB_WORKFLOW", "ci.yml") - t.Setenv("GITHUB_JOB", "build") - t.Setenv("GITHUB_RUN_ID", "token-run-2") - - pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", "wrong-token") - - err := pushClient.Push(context.Background(), &summary.RunSummary{SampleCount: 1}) - if err == nil { - t.Error("Push() with wrong token should fail") - } -} - -func setupTestReceiverWithToken(t *testing.T, readToken, hmacKey string) (*receiver.Store, *httptest.Server, func()) { - t.Helper() - dbPath := filepath.Join(t.TempDir(), "test.db") - store, err := receiver.NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - - handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), readToken, hmacKey) - mux := http.NewServeMux() - handler.RegisterRoutes(mux) - - server := httptest.NewServer(mux) - - cleanup := func() { - server.Close() - _ = store.Close() - } - - return store, server, cleanup -} diff --git a/internal/metrics/aggregator.go b/internal/metrics/aggregator.go index 2e7c18e..1e33646 100644 --- a/internal/metrics/aggregator.go +++ b/internal/metrics/aggregator.go @@ -4,37 +4,23 @@ import ( "sort" "time" - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/proc" ) // Aggregator collects and aggregates metrics from processes type Aggregator struct { - procPath string - topN int - prevCPU *CPUSnapshot - prevProcCPU map[int]*ProcessCPUSnapshot - cgroupLimits cgroup.CgroupLimits // container name -> limits - processMapping cgroup.ProcessMapping // process name -> container name - cgroupPathMapping cgroup.CgroupPathMapping // cgroup path -> container name (built at runtime) - prevCgroupCPU map[string]uint64 // container name -> previous total ticks - prevCgroupTime time.Time // previous collection time for cgroup CPU calc + procPath string + topN int + prevCPU *CPUSnapshot + prevProcCPU map[int]*ProcessCPUSnapshot } // NewAggregator creates a new metrics aggregator func NewAggregator(procPath string, topN int) *Aggregator { - // Parse cgroup configuration from environment - limits, _ := cgroup.ParseCgroupLimitsEnv() - processMap, _ := cgroup.ParseProcessMappingEnv() - return &Aggregator{ - procPath: procPath, - topN: topN, - prevProcCPU: make(map[int]*ProcessCPUSnapshot), - cgroupLimits: limits, - processMapping: processMap, - cgroupPathMapping: make(cgroup.CgroupPathMapping), - prevCgroupCPU: make(map[string]uint64), + procPath: procPath, + topN: topN, + prevProcCPU: make(map[int]*ProcessCPUSnapshot), } } @@ -91,12 +77,6 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { return float64(p.MemRSS) }) - // Discover cgroup path mappings from known processes - a.discoverCgroupMappings(processes) - - // Calculate per-cgroup metrics - cgroupMetrics := a.calculateCgroupMetrics(processes, processMetrics, now) - return &SystemMetrics{ Timestamp: now, TotalProcesses: len(processes), @@ -104,7 +84,6 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { Memory: memMetrics, TopCPU: topCPU, TopMemory: topMemory, - Cgroups: cgroupMetrics, }, nil } @@ -179,11 +158,6 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now state = "?" } - cgroupPath := "" - if p.Cgroup != nil { - cgroupPath = p.Cgroup.Path - } - metrics = append(metrics, ProcessMetrics{ PID: pid, Name: p.Status.Name, @@ -192,7 +166,6 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now MemVirtual: p.Status.VmSize, Threads: p.Status.Threads, State: state, - CgroupPath: cgroupPath, }) } @@ -250,152 +223,3 @@ func (a *Aggregator) getTopByMetric(metrics []ProcessMetrics, getValue func(Proc return sorted[:n] } - -// discoverCgroupMappings discovers cgroup path to container name mappings -// by looking for processes that match the configured process mapping. -func (a *Aggregator) discoverCgroupMappings(processes []*proc.ProcessInfo) { - if len(a.processMapping) == 0 { - return - } - - for _, p := range processes { - if p.Cgroup == nil || p.Cgroup.Path == "" { - continue - } - - // Check if this process name is in our mapping - if containerName, ok := a.processMapping[p.Status.Name]; ok { - // Map this cgroup path to the container name - a.cgroupPathMapping[p.Cgroup.Path] = containerName - } - } -} - -// resolveContainerName returns the container name for a cgroup path, -// or the raw path if no mapping exists. -func (a *Aggregator) resolveContainerName(cgroupPath string) string { - if name, ok := a.cgroupPathMapping[cgroupPath]; ok { - return name - } - // Use raw path as fallback - if cgroupPath == "" { - return "" - } - return cgroupPath -} - -// calculateCgroupMetrics computes metrics grouped by container/cgroup. -func (a *Aggregator) calculateCgroupMetrics( - processes []*proc.ProcessInfo, - processMetrics []ProcessMetrics, - now time.Time, -) map[string]*CgroupMetrics { - // Build lookup from PID to ProcessMetrics - pmByPID := make(map[int]ProcessMetrics) - for _, pm := range processMetrics { - pmByPID[pm.PID] = pm - } - - // Group processes by container name - type cgroupData struct { - cgroupPath string - procs []*proc.ProcessInfo - metrics []ProcessMetrics - } - containerGroups := make(map[string]*cgroupData) - - for _, p := range processes { - cgroupPath := "" - if p.Cgroup != nil { - cgroupPath = p.Cgroup.Path - } - - containerName := a.resolveContainerName(cgroupPath) - - if _, ok := containerGroups[containerName]; !ok { - containerGroups[containerName] = &cgroupData{ - cgroupPath: cgroupPath, - } - } - - containerGroups[containerName].procs = append(containerGroups[containerName].procs, p) - - if pm, ok := pmByPID[p.Stat.PID]; ok { - containerGroups[containerName].metrics = append(containerGroups[containerName].metrics, pm) - } - } - - // Calculate elapsed time since last collection - elapsed := 0.0 - if !a.prevCgroupTime.IsZero() { - elapsed = now.Sub(a.prevCgroupTime).Seconds() - } - a.prevCgroupTime = now - - // Calculate metrics for each container - result := make(map[string]*CgroupMetrics) - - for containerName, data := range containerGroups { - // Sum CPU ticks (utime + stime only, not cutime/cstime) - var totalTicks uint64 - var totalRSS uint64 - - for _, p := range data.procs { - totalTicks += p.Stat.TotalTime() - totalRSS += p.Status.VmRSS - } - - // Calculate CPU cores used from delta - usedCores := 0.0 - hasDelta := false - if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 { - // Guard against underflow: if processes exited and new ones started, - // totalTicks could be less than prev. In that case, skip this sample. - if totalTicks >= prev { - deltaTicks := totalTicks - prev - // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) - usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) - hasDelta = true - } - } - a.prevCgroupCPU[containerName] = totalTicks - - // Calculate percentages against limits if available - cpuPercent := 0.0 - memPercent := 0.0 - var limitCores float64 - var limitBytes uint64 - - if limit, ok := a.cgroupLimits[containerName]; ok { - limitCores = limit.CPUCores - limitBytes = limit.MemoryBytes - if limit.CPUCores > 0 { - cpuPercent = (usedCores / limit.CPUCores) * 100 - } - if limit.MemoryBytes > 0 { - memPercent = (float64(totalRSS) / float64(limit.MemoryBytes)) * 100 - } - } - - result[containerName] = &CgroupMetrics{ - Name: containerName, - CgroupPath: data.cgroupPath, - CPU: CgroupCPUMetrics{ - TotalTicks: totalTicks, - UsedCores: usedCores, - UsedPercent: cpuPercent, - LimitCores: limitCores, - HasDelta: hasDelta, - }, - Memory: CgroupMemoryMetrics{ - TotalRSSBytes: totalRSS, - UsedPercent: memPercent, - LimitBytes: limitBytes, - }, - Processes: data.metrics, - NumProcs: len(data.procs), - } - } - - return result -} diff --git a/internal/metrics/types.go b/internal/metrics/types.go index 9aa9cb2..00be63f 100644 --- a/internal/metrics/types.go +++ b/internal/metrics/types.go @@ -11,7 +11,6 @@ type ProcessMetrics struct { MemVirtual uint64 `json:"mem_virtual_bytes"` Threads int `json:"threads"` State string `json:"state"` - CgroupPath string `json:"cgroup_path,omitempty"` } // CPUMetrics holds aggregated CPU metrics @@ -36,39 +35,12 @@ type MemoryMetrics struct { // SystemMetrics holds a complete snapshot of system metrics type SystemMetrics struct { - Timestamp time.Time `json:"timestamp"` - TotalProcesses int `json:"total_processes"` - CPU CPUMetrics `json:"cpu"` - Memory MemoryMetrics `json:"memory"` - TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` - TopMemory []ProcessMetrics `json:"top_memory,omitempty"` - Cgroups map[string]*CgroupMetrics `json:"cgroups,omitempty"` -} - -// CgroupCPUMetrics holds CPU metrics for a single cgroup/container -type CgroupCPUMetrics struct { - TotalTicks uint64 `json:"total_ticks"` - UsedCores float64 `json:"used_cores"` - UsedPercent float64 `json:"used_percent,omitempty"` - LimitCores float64 `json:"limit_cores,omitempty"` - HasDelta bool `json:"-"` // true when a valid delta could be computed -} - -// CgroupMemoryMetrics holds memory metrics for a single cgroup/container -type CgroupMemoryMetrics struct { - TotalRSSBytes uint64 `json:"total_rss_bytes"` - UsedPercent float64 `json:"used_percent,omitempty"` - LimitBytes uint64 `json:"limit_bytes,omitempty"` -} - -// CgroupMetrics holds all metrics for a single cgroup/container -type CgroupMetrics struct { - Name string `json:"name"` - CgroupPath string `json:"cgroup_path"` - CPU CgroupCPUMetrics `json:"cpu"` - Memory CgroupMemoryMetrics `json:"memory"` - Processes []ProcessMetrics `json:"processes"` - NumProcs int `json:"num_processes"` + Timestamp time.Time `json:"timestamp"` + TotalProcesses int `json:"total_processes"` + CPU CPUMetrics `json:"cpu"` + Memory MemoryMetrics `json:"memory"` + TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` + TopMemory []ProcessMetrics `json:"top_memory,omitempty"` } // CPUSnapshot holds CPU timing data for calculating percentages between intervals diff --git a/internal/output/logger.go b/internal/output/logger.go index a5933fd..0ce5ddd 100644 --- a/internal/output/logger.go +++ b/internal/output/logger.go @@ -1,7 +1,6 @@ package output import ( - "context" "io" "log/slog" "os" @@ -54,44 +53,29 @@ func NewLoggerWriter(cfg LoggerConfig) *LoggerWriter { } } -// topCPUEntry is a lightweight struct for JSON serialization of top CPU processes -type topCPUEntry struct { - PID int `json:"pid"` - Name string `json:"name"` - CPUPercent float64 `json:"cpu_percent"` -} - -// topMemEntry is a lightweight struct for JSON serialization of top memory processes -type topMemEntry struct { - PID int `json:"pid"` - Name string `json:"name"` - RSSBytes uint64 `json:"rss_bytes"` -} - // Write outputs the metrics using structured logging func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { - // Build top CPU process entries - topCPU := make([]topCPUEntry, 0, len(m.TopCPU)) + // Build top CPU process attrs + topCPUAttrs := make([]any, 0, len(m.TopCPU)) for _, p := range m.TopCPU { - topCPU = append(topCPU, topCPUEntry{ - PID: p.PID, - Name: p.Name, - CPUPercent: p.CPUPercent, - }) + topCPUAttrs = append(topCPUAttrs, slog.Group("", + slog.Int("pid", p.PID), + slog.String("name", p.Name), + slog.Float64("cpu_percent", p.CPUPercent), + )) } - // Build top memory process entries - topMem := make([]topMemEntry, 0, len(m.TopMemory)) + // Build top memory process attrs + topMemAttrs := make([]any, 0, len(m.TopMemory)) for _, p := range m.TopMemory { - topMem = append(topMem, topMemEntry{ - PID: p.PID, - Name: p.Name, - RSSBytes: p.MemRSS, - }) + topMemAttrs = append(topMemAttrs, slog.Group("", + slog.Int("pid", p.PID), + slog.String("name", p.Name), + slog.Uint64("rss_bytes", p.MemRSS), + )) } - // Build base attributes - attrs := []slog.Attr{ + w.logger.Info("metrics_collected", slog.Time("collection_time", m.Timestamp), slog.Int("total_processes", m.TotalProcesses), slog.Group("cpu", @@ -110,16 +94,9 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { slog.Uint64("total_rss_bytes", m.Memory.TotalRSSBytes), slog.Float64("rss_percent", m.Memory.RSSPercent), ), - slog.Any("top_cpu", topCPU), - slog.Any("top_memory", topMem), - } - - // Add cgroups if present - if len(m.Cgroups) > 0 { - attrs = append(attrs, slog.Any("cgroups", m.Cgroups)) - } - - w.logger.LogAttrs(context.Background(), slog.LevelInfo, "metrics_collected", attrs...) + slog.Any("top_cpu", topCPUAttrs), + slog.Any("top_memory", topMemAttrs), + ) return nil } diff --git a/internal/proc/cgroup.go b/internal/proc/cgroup.go deleted file mode 100644 index 6e1597f..0000000 --- a/internal/proc/cgroup.go +++ /dev/null @@ -1,59 +0,0 @@ -// ABOUTME: Reads cgroup information from /proc/[pid]/cgroup. -// ABOUTME: Supports both cgroup v1 and v2 formats. -package proc - -import ( - "bufio" - "fmt" - "os" - "strings" -) - -// CgroupInfo holds cgroup information for a process -type CgroupInfo struct { - Path string // The cgroup path (unified for v2, or from memory controller for v1) -} - -// ReadCgroup reads /proc/[pid]/cgroup and extracts the cgroup path -func ReadCgroup(procPath string, pid int) (*CgroupInfo, error) { - path := fmt.Sprintf("%s/%d/cgroup", procPath, pid) - - file, err := os.Open(path) - if err != nil { - return nil, fmt.Errorf("opening cgroup file: %w", err) - } - defer func() { _ = file.Close() }() - - var cgroupPath string - scanner := bufio.NewScanner(file) - - for scanner.Scan() { - line := scanner.Text() - - // Try cgroup v2 first (unified hierarchy) - // Format: 0::/path - if path, found := strings.CutPrefix(line, "0::"); found { - cgroupPath = path - break - } - - // Fall back to cgroup v1 - look for memory controller - // Format: X:memory:/path or X:memory,other:/path - parts := strings.SplitN(line, ":", 3) - if len(parts) == 3 { - controllers := parts[1] - if strings.Contains(controllers, "memory") { - cgroupPath = parts[2] - // Don't break - prefer v2 if found later - } - } - } - - if err := scanner.Err(); err != nil { - return nil, fmt.Errorf("scanning cgroup file: %w", err) - } - - return &CgroupInfo{ - Path: cgroupPath, - }, nil -} diff --git a/internal/proc/cgroup_test.go b/internal/proc/cgroup_test.go deleted file mode 100644 index e82b6c8..0000000 --- a/internal/proc/cgroup_test.go +++ /dev/null @@ -1,97 +0,0 @@ -package proc - -import ( - "os" - "path/filepath" - "testing" -) - -func TestReadCgroup(t *testing.T) { - tests := []struct { - name string - cgroupFile string - wantPath string - wantErr bool - }{ - { - name: "cgroup v2 unified", - cgroupFile: `0::/kubepods/pod-abc/container-123 -`, - wantPath: "/kubepods/pod-abc/container-123", - wantErr: false, - }, - { - name: "cgroup v2 with trailing newline", - cgroupFile: `0::/docker/abc123def456 -`, - wantPath: "/docker/abc123def456", - wantErr: false, - }, - { - name: "cgroup v1 multiple controllers", - cgroupFile: `12:blkio:/user.slice -11:memory:/docker/abc123 -10:cpu,cpuacct:/docker/abc123 -9:pids:/docker/abc123 -`, - wantPath: "/docker/abc123", - wantErr: false, - }, - { - name: "cgroup v2 preferred over v1", - cgroupFile: `11:memory:/docker/old-path -0::/kubepods/new-path -`, - wantPath: "/kubepods/new-path", - wantErr: false, - }, - { - name: "empty file", - cgroupFile: "", - wantPath: "", - wantErr: false, - }, - { - name: "root cgroup", - cgroupFile: `0::/ -`, - wantPath: "/", - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create a temp directory structure mimicking /proc - tmpDir := t.TempDir() - procDir := filepath.Join(tmpDir, "proc") - pidDir := filepath.Join(procDir, "1234") - - if err := os.MkdirAll(pidDir, 0755); err != nil { - t.Fatalf("Failed to create pid dir: %v", err) - } - - if err := os.WriteFile(filepath.Join(pidDir, "cgroup"), []byte(tt.cgroupFile), 0644); err != nil { - t.Fatalf("Failed to write cgroup file: %v", err) - } - - got, err := ReadCgroup(procDir, 1234) - if (err != nil) != tt.wantErr { - t.Errorf("ReadCgroup() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !tt.wantErr && got.Path != tt.wantPath { - t.Errorf("ReadCgroup() path = %q, want %q", got.Path, tt.wantPath) - } - }) - } -} - -func TestReadCgroup_FileNotFound(t *testing.T) { - tmpDir := t.TempDir() - - _, err := ReadCgroup(tmpDir, 1234) - if err == nil { - t.Error("ReadCgroup() expected error for missing file, got nil") - } -} diff --git a/internal/proc/process.go b/internal/proc/process.go index 9d1ef64..fced8bd 100644 --- a/internal/proc/process.go +++ b/internal/proc/process.go @@ -128,14 +128,13 @@ func ReadSystemCPU(procPath string) (user, nice, system, idle, iowait, irq, soft return 0, 0, 0, 0, 0, 0, 0, fmt.Errorf("cpu line not found in /proc/stat") } -// ProcessInfo combines stat, status, and cgroup information for a process +// ProcessInfo combines stat and status information for a process type ProcessInfo struct { Stat *ProcStat Status *ProcStatus - Cgroup *CgroupInfo } -// ReadProcess reads stat, status, and cgroup for a single process +// ReadProcess reads both stat and status for a single process func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { stat, err := ReadStat(procPath, pid) if err != nil { @@ -147,13 +146,9 @@ func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { return nil, err } - // Read cgroup info (non-fatal if it fails) - cgroup, _ := ReadCgroup(procPath, pid) - return &ProcessInfo{ Stat: stat, Status: status, - Cgroup: cgroup, }, nil } diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go deleted file mode 100644 index d847f62..0000000 --- a/internal/receiver/handler.go +++ /dev/null @@ -1,196 +0,0 @@ -// ABOUTME: HTTP handlers for the metrics receiver service. -// ABOUTME: Provides endpoints for receiving and querying metrics. -package receiver - -import ( - "crypto/subtle" - "encoding/json" - "log/slog" - "net/http" - "strings" -) - -// Handler handles HTTP requests for the metrics receiver -type Handler struct { - store *Store - logger *slog.Logger - readToken string // Pre-shared token for read endpoint authentication - hmacKey string // Separate key for HMAC-based push token generation/validation -} - -// NewHandler creates a new HTTP handler with the given store. -// readToken authenticates read endpoints and the token generation endpoint. -// hmacKey is the secret used to derive scoped push tokens. -func NewHandler(store *Store, logger *slog.Logger, readToken, hmacKey string) *Handler { - return &Handler{store: store, logger: logger, readToken: readToken, hmacKey: hmacKey} -} - -// RegisterRoutes registers all HTTP routes on the given mux -func (h *Handler) RegisterRoutes(mux *http.ServeMux) { - mux.HandleFunc("POST /api/v1/metrics", h.handleReceiveMetrics) - mux.HandleFunc("POST /api/v1/token", h.handleGenerateToken) - mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}", h.handleGetByWorkflowJob) - mux.HandleFunc("GET /health", h.handleHealth) -} - -// validateReadToken checks the Authorization header for a valid Bearer token. -func (h *Handler) validateReadToken(w http.ResponseWriter, r *http.Request) bool { - if h.readToken == "" { - h.logger.Warn("no read-token configured, rejecting request", slog.String("path", r.URL.Path)) - http.Error(w, "authorization required", http.StatusUnauthorized) - return false - } - - authHeader := r.Header.Get("Authorization") - if authHeader == "" { - h.logger.Warn("missing authorization header", slog.String("path", r.URL.Path)) - http.Error(w, "authorization required", http.StatusUnauthorized) - return false - } - - const bearerPrefix = "Bearer " - if !strings.HasPrefix(authHeader, bearerPrefix) { - h.logger.Warn("invalid authorization format", slog.String("path", r.URL.Path)) - http.Error(w, "invalid authorization format", http.StatusUnauthorized) - return false - } - - token := strings.TrimPrefix(authHeader, bearerPrefix) - if subtle.ConstantTimeCompare([]byte(token), []byte(h.readToken)) != 1 { - h.logger.Warn("invalid token", slog.String("path", r.URL.Path)) - http.Error(w, "invalid token", http.StatusUnauthorized) - return false - } - - return true -} - -func (h *Handler) handleGenerateToken(w http.ResponseWriter, r *http.Request) { - if h.hmacKey == "" { - http.Error(w, "token generation requires a configured HMAC key", http.StatusBadRequest) - return - } - - if !h.validateReadToken(w, r) { - return - } - - var req TokenRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, "invalid JSON body", http.StatusBadRequest) - return - } - - if req.Organization == "" || req.Repository == "" || req.Workflow == "" || req.Job == "" { - http.Error(w, "organization, repository, workflow, and job are required", http.StatusBadRequest) - return - } - - token := GenerateScopedToken(h.hmacKey, req.Organization, req.Repository, req.Workflow, req.Job) - - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(TokenResponse{Token: token}) -} - -// validatePushToken checks push authentication via scoped HMAC token. -func (h *Handler) validatePushToken(w http.ResponseWriter, r *http.Request, exec ExecutionContext) bool { - if h.hmacKey == "" { - h.logger.Warn("no HMAC key configured, rejecting push", slog.String("path", r.URL.Path)) - http.Error(w, "authorization required", http.StatusUnauthorized) - return false - } - - authHeader := r.Header.Get("Authorization") - if authHeader == "" { - h.logger.Warn("missing push authorization", slog.String("path", r.URL.Path)) - http.Error(w, "authorization required", http.StatusUnauthorized) - return false - } - - const bearerPrefix = "Bearer " - if !strings.HasPrefix(authHeader, bearerPrefix) { - h.logger.Warn("invalid push authorization format", slog.String("path", r.URL.Path)) - http.Error(w, "invalid authorization format", http.StatusUnauthorized) - return false - } - - token := strings.TrimPrefix(authHeader, bearerPrefix) - if !ValidateScopedToken(h.hmacKey, token, exec.Organization, exec.Repository, exec.Workflow, exec.Job) { - h.logger.Warn("invalid push token", slog.String("path", r.URL.Path)) - http.Error(w, "invalid token", http.StatusUnauthorized) - return false - } - - return true -} - -func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { - var payload MetricsPayload - if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { - h.logger.Error("failed to decode payload", slog.String("error", err.Error())) - http.Error(w, "invalid JSON payload", http.StatusBadRequest) - return - } - - if payload.Execution.RunID == "" { - http.Error(w, "run_id is required", http.StatusBadRequest) - return - } - - if !h.validatePushToken(w, r, payload.Execution) { - return - } - - id, err := h.store.SaveMetric(&payload) - if err != nil { - h.logger.Error("failed to save metric", slog.String("error", err.Error())) - http.Error(w, "failed to save metric", http.StatusInternalServerError) - return - } - - h.logger.Info("metric saved", - slog.Uint64("id", uint64(id)), - slog.String("run_id", payload.Execution.RunID), - slog.String("repository", payload.Execution.Repository), - ) - - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusCreated) - _ = json.NewEncoder(w).Encode(map[string]any{"id": id, "status": "created"}) -} - -func (h *Handler) handleGetByWorkflowJob(w http.ResponseWriter, r *http.Request) { - if !h.validateReadToken(w, r) { - return - } - - org := r.PathValue("org") - repo := r.PathValue("repo") - workflow := r.PathValue("workflow") - job := r.PathValue("job") - if org == "" || repo == "" || workflow == "" || job == "" { - http.Error(w, "org, repo, workflow and job are required", http.StatusBadRequest) - return - } - - metrics, err := h.store.GetMetricsByWorkflowJob(org, repo, workflow, job) - if err != nil { - h.logger.Error("failed to get metrics", slog.String("error", err.Error())) - http.Error(w, "failed to get metrics", http.StatusInternalServerError) - return - } - - // Convert to response type with Payload as JSON object - response := make([]MetricResponse, len(metrics)) - for i, m := range metrics { - response[i] = m.ToResponse() - } - - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(response) -} - -func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) -} diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go deleted file mode 100644 index cea58f0..0000000 --- a/internal/receiver/handler_test.go +++ /dev/null @@ -1,473 +0,0 @@ -package receiver - -import ( - "bytes" - "encoding/json" - "io" - "log/slog" - "net/http" - "net/http/httptest" - "path/filepath" - "testing" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" -) - -func TestHandler_ReceiveMetrics(t *testing.T) { - const readToken = "test-token" - h, cleanup := newTestHandlerWithToken(t, readToken) - defer cleanup() - - exec := ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-123", - } - pushToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) - - payload := MetricsPayload{ - Execution: exec, - Summary: summary.RunSummary{ - DurationSeconds: 60.0, - SampleCount: 12, - }, - } - - body, _ := json.Marshal(payload) - req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+pushToken) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusCreated { - t.Errorf("status = %d, want %d", rec.Code, http.StatusCreated) - } - - var resp map[string]any - if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if resp["status"] != "created" { - t.Errorf("response status = %v, want %q", resp["status"], "created") - } - if resp["id"] == nil || resp["id"].(float64) == 0 { - t.Error("response id is missing or zero") - } -} - -func TestHandler_ReceiveMetrics_InvalidJSON(t *testing.T) { - h, cleanup := newTestHandler(t) - defer cleanup() - - req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader([]byte("not json"))) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusBadRequest { - t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) - } -} - -func TestHandler_ReceiveMetrics_MissingRunID(t *testing.T) { - h, cleanup := newTestHandler(t) - defer cleanup() - - payload := MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - // RunID is missing - }, - Summary: summary.RunSummary{}, - } - - body, _ := json.Marshal(payload) - req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusBadRequest { - t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) - } -} - -func TestHandler_GetByWorkflowJob(t *testing.T) { - const readToken = "test-token" - h, cleanup := newTestHandlerWithToken(t, readToken) - defer cleanup() - - // Save metrics for different workflow/job combinations - payloads := []*MetricsPayload{ - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r1"}}, - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r2"}}, - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "test", RunID: "r3"}}, - } - for _, p := range payloads { - if _, err := h.store.SaveMetric(p); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - } - - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org-x/repo-y/ci.yml/build", nil) - req.Header.Set("Authorization", "Bearer "+readToken) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var metrics []MetricResponse - if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if len(metrics) != 2 { - t.Errorf("got %d metrics, want 2", len(metrics)) - } -} - -func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { - const readToken = "test-token" - h, cleanup := newTestHandlerWithToken(t, readToken) - defer cleanup() - - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/workflow/job", nil) - req.Header.Set("Authorization", "Bearer "+readToken) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var metrics []MetricResponse - if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if len(metrics) != 0 { - t.Errorf("got %d metrics, want 0", len(metrics)) - } -} - -func TestHandler_GetByWorkflowJob_WithToken(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "secret-token") - defer cleanup() - - // Save a metric - payload := &MetricsPayload{ - Execution: ExecutionContext{Organization: "org", Repository: "repo", Workflow: "ci.yml", Job: "build", RunID: "r1"}, - } - if _, err := h.store.SaveMetric(payload); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - - tests := []struct { - name string - authHeader string - wantCode int - }{ - {"no auth header", "", http.StatusUnauthorized}, - {"wrong format", "Basic dXNlcjpwYXNz", http.StatusUnauthorized}, - {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, - {"valid token", "Bearer secret-token", http.StatusOK}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) - if tt.authHeader != "" { - req.Header.Set("Authorization", tt.authHeader) - } - rec := httptest.NewRecorder() - mux.ServeHTTP(rec, req) - - if rec.Code != tt.wantCode { - t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) - } - }) - } -} - -func TestHandler_Health(t *testing.T) { - h, cleanup := newTestHandler(t) - defer cleanup() - - req := httptest.NewRequest(http.MethodGet, "/health", nil) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var resp map[string]string - if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if resp["status"] != "ok" { - t.Errorf("status = %q, want %q", resp["status"], "ok") - } -} - -func TestHandler_GenerateToken(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "secret-token") - defer cleanup() - - body, _ := json.Marshal(TokenRequest{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - Job: "build", - }) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) - req.Header.Set("Authorization", "Bearer secret-token") - req.Header.Set("Content-Type", "application/json") - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var resp TokenResponse - if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if resp.Token == "" { - t.Error("expected non-empty token") - } - if len(resp.Token) != 64 { - t.Errorf("token length = %d, want 64", len(resp.Token)) - } -} - -func TestHandler_GenerateToken_NoAuth(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "secret-token") - defer cleanup() - - body, _ := json.Marshal(TokenRequest{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - Job: "build", - }) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusUnauthorized { - t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) - } -} - -func TestHandler_GenerateToken_MissingFields(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "secret-token") - defer cleanup() - - // Missing job field - body, _ := json.Marshal(TokenRequest{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - }) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) - req.Header.Set("Authorization", "Bearer secret-token") - req.Header.Set("Content-Type", "application/json") - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusBadRequest { - t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) - } -} - -func TestHandler_GenerateToken_NoReadToken(t *testing.T) { - h, cleanup := newTestHandler(t) // no readToken configured - defer cleanup() - - body, _ := json.Marshal(TokenRequest{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - Job: "build", - }) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusBadRequest { - t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) - } -} - -func TestHandler_ReceiveMetrics_WithPushToken(t *testing.T) { - readToken := "secret-token" - h, cleanup := newTestHandlerWithToken(t, readToken) - defer cleanup() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - - exec := ExecutionContext{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-1", - } - - validToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) - wrongScopeToken := GenerateScopedToken(readToken, "other-org", "repo", "ci.yml", "build") - - tests := []struct { - name string - authHeader string - wantCode int - }{ - {"no auth", "", http.StatusUnauthorized}, - {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, - {"wrong scope", "Bearer " + wrongScopeToken, http.StatusUnauthorized}, - {"valid token", "Bearer " + validToken, http.StatusCreated}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - payload := MetricsPayload{ - Execution: exec, - Summary: summary.RunSummary{SampleCount: 1}, - } - body, _ := json.Marshal(payload) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - if tt.authHeader != "" { - req.Header.Set("Authorization", tt.authHeader) - } - rec := httptest.NewRecorder() - mux.ServeHTTP(rec, req) - - if rec.Code != tt.wantCode { - t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) - } - }) - } -} - -func TestHandler_ReceiveMetrics_RejectsWhenNoReadToken(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured - defer cleanup() - - payload := MetricsPayload{ - Execution: ExecutionContext{ - Organization: "org", - Repository: "repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-1", - }, - Summary: summary.RunSummary{SampleCount: 1}, - } - body, _ := json.Marshal(payload) - - req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusUnauthorized { - t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) - } -} - -func TestHandler_GetByWorkflowJob_RejectsWhenNoReadToken(t *testing.T) { - h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured - defer cleanup() - - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusUnauthorized { - t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) - } -} - -func newTestHandler(t *testing.T) (*Handler, func()) { - t.Helper() - dbPath := filepath.Join(t.TempDir(), "test.db") - store, err := NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - handler := NewHandler(store, logger, "", "") // no auth — endpoints will reject - - return handler, func() { _ = store.Close() } -} - -func newTestHandlerWithToken(t *testing.T, readToken string) (*Handler, func()) { - t.Helper() - return newTestHandlerWithKeys(t, readToken, readToken) -} - -func newTestHandlerWithKeys(t *testing.T, readToken, hmacKey string) (*Handler, func()) { - t.Helper() - dbPath := filepath.Join(t.TempDir(), "test.db") - store, err := NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - - logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - handler := NewHandler(store, logger, readToken, hmacKey) - - return handler, func() { _ = store.Close() } -} diff --git a/internal/receiver/store.go b/internal/receiver/store.go deleted file mode 100644 index 1b934de..0000000 --- a/internal/receiver/store.go +++ /dev/null @@ -1,113 +0,0 @@ -// ABOUTME: SQLite storage layer for metrics receiver using GORM. -// ABOUTME: Handles database initialization and metric storage/retrieval. -package receiver - -import ( - "encoding/json" - "fmt" - "time" - - "gorm.io/driver/sqlite" - "gorm.io/gorm" - "gorm.io/gorm/logger" -) - -// Metric represents a stored metric record in the database -type Metric struct { - ID uint `gorm:"primaryKey"` - Organization string `gorm:"index:idx_org_repo;not null"` - Repository string `gorm:"index:idx_org_repo;not null"` - Workflow string `gorm:"not null"` - Job string `gorm:"not null"` - RunID string `gorm:"index;not null"` - ReceivedAt time.Time `gorm:"index;not null"` - Payload string `gorm:"type:text;not null"` // JSON-encoded RunSummary -} - -// MetricResponse is the API response type with Payload as embedded JSON object -type MetricResponse struct { - ID uint `json:"id"` - Organization string `json:"organization"` - Repository string `json:"repository"` - Workflow string `json:"workflow"` - Job string `json:"job"` - RunID string `json:"run_id"` - ReceivedAt time.Time `json:"received_at"` - Payload json.RawMessage `json:"payload"` -} - -// ToResponse converts a Metric to a MetricResponse with Payload as JSON object -func (m *Metric) ToResponse() MetricResponse { - return MetricResponse{ - ID: m.ID, - Organization: m.Organization, - Repository: m.Repository, - Workflow: m.Workflow, - Job: m.Job, - RunID: m.RunID, - ReceivedAt: m.ReceivedAt, - Payload: json.RawMessage(m.Payload), - } -} - -// Store handles SQLite storage for metrics using GORM -type Store struct { - db *gorm.DB -} - -// NewStore creates a new SQLite store at the given path -func NewStore(dbPath string) (*Store, error) { - db, err := gorm.Open(sqlite.Open(dbPath), &gorm.Config{ - Logger: logger.Default.LogMode(logger.Silent), - }) - if err != nil { - return nil, fmt.Errorf("opening database: %w", err) - } - - if err := db.AutoMigrate(&Metric{}); err != nil { - return nil, fmt.Errorf("migrating schema: %w", err) - } - - return &Store{db: db}, nil -} - -// SaveMetric stores a metrics payload in the database -func (s *Store) SaveMetric(payload *MetricsPayload) (uint, error) { - summaryJSON, err := json.Marshal(payload.Summary) - if err != nil { - return 0, fmt.Errorf("marshaling summary: %w", err) - } - - metric := Metric{ - Organization: payload.Execution.Organization, - Repository: payload.Execution.Repository, - Workflow: payload.Execution.Workflow, - Job: payload.Execution.Job, - RunID: payload.Execution.RunID, - ReceivedAt: time.Now().UTC(), - Payload: string(summaryJSON), - } - - result := s.db.Create(&metric) - if result.Error != nil { - return 0, fmt.Errorf("inserting metric: %w", result.Error) - } - - return metric.ID, nil -} - -// GetMetricsByWorkflowJob retrieves all metrics for a specific workflow and job -func (s *Store) GetMetricsByWorkflowJob(org, repo, workflow, job string) ([]Metric, error) { - var metrics []Metric - result := s.db.Where("organization = ? AND repository = ? AND workflow = ? AND job = ?", org, repo, workflow, job).Order("received_at DESC").Find(&metrics) - return metrics, result.Error -} - -// Close closes the database connection -func (s *Store) Close() error { - sqlDB, err := s.db.DB() - if err != nil { - return err - } - return sqlDB.Close() -} diff --git a/internal/receiver/store_test.go b/internal/receiver/store_test.go deleted file mode 100644 index 44c4f17..0000000 --- a/internal/receiver/store_test.go +++ /dev/null @@ -1,178 +0,0 @@ -package receiver - -import ( - "os" - "path/filepath" - "testing" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" -) - -func TestNewStore(t *testing.T) { - dbPath := filepath.Join(t.TempDir(), "test.db") - - store, err := NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - defer func() { _ = store.Close() }() - - if _, err := os.Stat(dbPath); os.IsNotExist(err) { - t.Error("database file was not created") - } -} - -func TestStore_SaveMetric(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - payload := &MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-123", - }, - Summary: summary.RunSummary{ - StartTime: time.Now().Add(-time.Minute), - EndTime: time.Now(), - DurationSeconds: 60.0, - SampleCount: 12, - CPUTotal: summary.StatSummary{Peak: 80.5, Avg: 45.2, P95: 75.0}, - MemUsedBytes: summary.StatSummary{Peak: 1024000, Avg: 512000, P95: 900000}, - MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, - }, - } - - id, err := store.SaveMetric(payload) - if err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - if id == 0 { - t.Error("SaveMetric() returned id = 0, want non-zero") - } -} - -func TestStore_GetMetricsByWorkflowJob(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - // Save metrics for different workflow/job combinations - payloads := []struct { - org string - repo string - workflow string - job string - }{ - {"org-a", "repo-1", "ci.yml", "build"}, - {"org-a", "repo-1", "ci.yml", "build"}, - {"org-a", "repo-1", "ci.yml", "test"}, - {"org-a", "repo-1", "deploy.yml", "build"}, - } - - for i, p := range payloads { - payload := &MetricsPayload{ - Execution: ExecutionContext{ - Organization: p.org, - Repository: p.repo, - Workflow: p.workflow, - Job: p.job, - RunID: "run-" + string(rune('a'+i)), - }, - Summary: summary.RunSummary{}, - } - if _, err := store.SaveMetric(payload); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - } - - metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 2 { - t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 2", len(metrics)) - } - - for _, m := range metrics { - if m.Organization != "org-a" || m.Repository != "repo-1" || m.Workflow != "ci.yml" || m.Job != "build" { - t.Errorf("GetMetricsByWorkflowJob() returned metric with org=%q repo=%q workflow=%q job=%q, want org-a/repo-1/ci.yml/build", - m.Organization, m.Repository, m.Workflow, m.Job) - } - } -} - -func TestStore_GetMetricsByWorkflowJob_NotFound(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - metrics, err := store.GetMetricsByWorkflowJob("nonexistent", "repo", "workflow", "job") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 0 { - t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 0", len(metrics)) - } -} - -func TestStore_SaveMetric_PreservesPayload(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - original := &MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "build.yml", - Job: "test", - RunID: "run-preserve", - }, - Summary: summary.RunSummary{ - DurationSeconds: 123.45, - SampleCount: 50, - CPUTotal: summary.StatSummary{Peak: 99.9, Avg: 55.5, P95: 88.8}, - }, - } - - _, err := store.SaveMetric(original) - if err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - - metrics, err := store.GetMetricsByWorkflowJob("test-org", "test-repo", "build.yml", "test") - if err != nil { - t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) - } - if len(metrics) != 1 { - t.Fatalf("GetMetricsByWorkflowJob() returned %d metrics, want 1", len(metrics)) - } - - m := metrics[0] - if m.Organization != original.Execution.Organization { - t.Errorf("Organization = %q, want %q", m.Organization, original.Execution.Organization) - } - if m.Repository != original.Execution.Repository { - t.Errorf("Repository = %q, want %q", m.Repository, original.Execution.Repository) - } - if m.Workflow != original.Execution.Workflow { - t.Errorf("Workflow = %q, want %q", m.Workflow, original.Execution.Workflow) - } - if m.Job != original.Execution.Job { - t.Errorf("Job = %q, want %q", m.Job, original.Execution.Job) - } - if m.Payload == "" { - t.Error("Payload is empty") - } -} - -func newTestStore(t *testing.T) *Store { - t.Helper() - dbPath := filepath.Join(t.TempDir(), "test.db") - store, err := NewStore(dbPath) - if err != nil { - t.Fatalf("NewStore() error = %v", err) - } - return store -} diff --git a/internal/receiver/token.go b/internal/receiver/token.go deleted file mode 100644 index 087546c..0000000 --- a/internal/receiver/token.go +++ /dev/null @@ -1,25 +0,0 @@ -// ABOUTME: HMAC-SHA256 token generation and validation for scoped push authentication. -// ABOUTME: Tokens are derived from a key + scope, enabling stateless validation without DB storage. -package receiver - -import ( - "crypto/hmac" - "crypto/sha256" - "crypto/subtle" - "encoding/hex" -) - -// GenerateScopedToken computes an HMAC-SHA256 token scoped to a specific org/repo/workflow/job. -// The canonical input is "v1\x00\x00\x00\x00". -func GenerateScopedToken(key, org, repo, workflow, job string) string { - mac := hmac.New(sha256.New, []byte(key)) - mac.Write([]byte("v1\x00" + org + "\x00" + repo + "\x00" + workflow + "\x00" + job)) - return hex.EncodeToString(mac.Sum(nil)) -} - -// ValidateScopedToken checks whether a token matches the expected HMAC for the given scope. -// Uses constant-time comparison to prevent timing attacks. -func ValidateScopedToken(key, token, org, repo, workflow, job string) bool { - expected := GenerateScopedToken(key, org, repo, workflow, job) - return subtle.ConstantTimeCompare([]byte(token), []byte(expected)) == 1 -} diff --git a/internal/receiver/token_test.go b/internal/receiver/token_test.go deleted file mode 100644 index 2140ecd..0000000 --- a/internal/receiver/token_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package receiver - -import ( - "encoding/hex" - "testing" -) - -func TestGenerateScopedToken_Deterministic(t *testing.T) { - token1 := GenerateScopedToken("key", "org", "repo", "wf", "job") - token2 := GenerateScopedToken("key", "org", "repo", "wf", "job") - if token1 != token2 { - t.Errorf("tokens differ: %q vs %q", token1, token2) - } -} - -func TestGenerateScopedToken_ScopePinning(t *testing.T) { - base := GenerateScopedToken("key", "org", "repo", "wf", "job") - - variants := []struct { - name string - org string - repo string - wf string - job string - }{ - {"different org", "other-org", "repo", "wf", "job"}, - {"different repo", "org", "other-repo", "wf", "job"}, - {"different workflow", "org", "repo", "other-wf", "job"}, - {"different job", "org", "repo", "wf", "other-job"}, - } - - for _, v := range variants { - t.Run(v.name, func(t *testing.T) { - token := GenerateScopedToken("key", v.org, v.repo, v.wf, v.job) - if token == base { - t.Errorf("token for %s should differ from base", v.name) - } - }) - } -} - -func TestGenerateScopedToken_DifferentKeys(t *testing.T) { - token1 := GenerateScopedToken("key-a", "org", "repo", "wf", "job") - token2 := GenerateScopedToken("key-b", "org", "repo", "wf", "job") - if token1 == token2 { - t.Error("different keys should produce different tokens") - } -} - -func TestGenerateScopedToken_ValidHex(t *testing.T) { - token := GenerateScopedToken("key", "org", "repo", "wf", "job") - if len(token) != 64 { - t.Errorf("token length = %d, want 64", len(token)) - } - if _, err := hex.DecodeString(token); err != nil { - t.Errorf("token is not valid hex: %v", err) - } -} - -func TestValidateScopedToken_Correct(t *testing.T) { - token := GenerateScopedToken("key", "org", "repo", "wf", "job") - if !ValidateScopedToken("key", token, "org", "repo", "wf", "job") { - t.Error("ValidateScopedToken should accept correct token") - } -} - -func TestValidateScopedToken_WrongToken(t *testing.T) { - if ValidateScopedToken("key", "deadbeef", "org", "repo", "wf", "job") { - t.Error("ValidateScopedToken should reject wrong token") - } -} - -func TestValidateScopedToken_WrongScope(t *testing.T) { - token := GenerateScopedToken("key", "org", "repo", "wf", "job") - if ValidateScopedToken("key", token, "org", "repo", "wf", "other-job") { - t.Error("ValidateScopedToken should reject token for different scope") - } -} diff --git a/internal/receiver/types.go b/internal/receiver/types.go deleted file mode 100644 index dbc56e0..0000000 --- a/internal/receiver/types.go +++ /dev/null @@ -1,45 +0,0 @@ -// ABOUTME: Data types for the metrics receiver service. -// ABOUTME: Defines MetricsPayload combining execution metadata with run summary. -package receiver - -import "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" - -// ExecutionContext holds GitHub Actions style identifiers for a workflow run -type ExecutionContext struct { - Organization string `json:"organization"` - Repository string `json:"repository"` - Workflow string `json:"workflow"` - Job string `json:"job"` - RunID string `json:"run_id"` -} - -// MetricsPayload is the complete payload sent to the receiver -type MetricsPayload struct { - Execution ExecutionContext `json:"execution"` - Summary summary.RunSummary `json:"run_summary"` -} - -// StoredMetric represents a metric record as stored in the database -type StoredMetric struct { - ID int64 - Organization string - Repository string - Workflow string - Job string - RunID string - ReceivedAt string - Payload string // JSON-encoded RunSummary -} - -// TokenRequest is the request body for POST /api/v1/token -type TokenRequest struct { - Organization string `json:"organization"` - Repository string `json:"repository"` - Workflow string `json:"workflow"` - Job string `json:"job"` -} - -// TokenResponse is the response body for POST /api/v1/token -type TokenResponse struct { - Token string `json:"token"` -} diff --git a/internal/summary/accumulator.go b/internal/summary/accumulator.go deleted file mode 100644 index 6ffa8cd..0000000 --- a/internal/summary/accumulator.go +++ /dev/null @@ -1,184 +0,0 @@ -// ABOUTME: Accumulates system metrics samples across a collection run. -// ABOUTME: Computes peak, average, and P95 statistics for CPU and memory on demand. -package summary - -import ( - "fmt" - "sort" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" -) - -// containerAccumulator tracks metrics for a single container -type containerAccumulator struct { - cpuCoresValues []float64 - memoryBytesValues []float64 -} - -// Accumulator collects metric samples and computes run-level statistics -type Accumulator struct { - topN int - cpuValues []float64 - memBytesValues []float64 - memPctValues []float64 - processPeaks map[string]*ProcessPeak - containers map[string]*containerAccumulator - startTime time.Time - endTime time.Time - sampleCount int -} - -// NewAccumulator creates an accumulator that tracks the top N processes -func NewAccumulator(topN int) *Accumulator { - return &Accumulator{ - topN: topN, - processPeaks: make(map[string]*ProcessPeak), - containers: make(map[string]*containerAccumulator), - } -} - -// Add records a single metrics sample -func (a *Accumulator) Add(m *metrics.SystemMetrics) { - a.sampleCount++ - if a.sampleCount == 1 { - a.startTime = m.Timestamp - } - a.endTime = m.Timestamp - - a.cpuValues = append(a.cpuValues, m.CPU.TotalPercent) - a.memBytesValues = append(a.memBytesValues, float64(m.Memory.UsedBytes)) - a.memPctValues = append(a.memPctValues, m.Memory.UsedPercent) - - for _, p := range m.TopCPU { - a.updateProcessPeak(p) - } - for _, p := range m.TopMemory { - a.updateProcessPeak(p) - } - - // Track per-container metrics - for name, cgroup := range m.Cgroups { - ca, ok := a.containers[name] - if !ok { - ca = &containerAccumulator{} - a.containers[name] = ca - } - // Only record CPU when a valid delta was computed (skip first sample and underflow) - if cgroup.CPU.HasDelta { - ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores) - } - ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes)) - } -} - -// Summarize computes and returns the run summary, or nil if no samples were added -func (a *Accumulator) Summarize() *RunSummary { - if a.sampleCount == 0 { - return nil - } - - return &RunSummary{ - StartTime: a.startTime, - EndTime: a.endTime, - DurationSeconds: a.endTime.Sub(a.startTime).Seconds(), - SampleCount: a.sampleCount, - CPUTotal: computeStats(a.cpuValues), - MemUsedBytes: computeStats(a.memBytesValues), - MemUsedPercent: computeStats(a.memPctValues), - TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }), - TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }), - Containers: a.containerSummaries(), - } -} - -// containerSummaries computes summaries for all tracked containers -func (a *Accumulator) containerSummaries() []ContainerSummary { - summaries := make([]ContainerSummary, 0, len(a.containers)) - for name, ca := range a.containers { - summaries = append(summaries, ContainerSummary{ - Name: name, - CPUCores: computeStats(ca.cpuCoresValues), - MemoryBytes: computeStats(ca.memoryBytesValues), - }) - } - // Sort by name for consistent output - sort.Slice(summaries, func(i, j int) bool { - return summaries[i].Name < summaries[j].Name - }) - return summaries -} - -// SampleCount returns the number of samples added -func (a *Accumulator) SampleCount() int { - return a.sampleCount -} - -// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values -func computeStats(values []float64) StatSummary { - n := len(values) - if n == 0 { - return StatSummary{} - } - - sorted := make([]float64, n) - copy(sorted, values) - sort.Float64s(sorted) - - var sum float64 - for _, v := range sorted { - sum += v - } - - return StatSummary{ - Peak: sorted[n-1], - P99: sorted[percentileIndex(n, 0.99)], - P95: sorted[percentileIndex(n, 0.95)], - P75: sorted[percentileIndex(n, 0.75)], - P50: sorted[percentileIndex(n, 0.50)], - Avg: sum / float64(n), - } -} - -// percentileIndex returns the index for the given percentile (0.0-1.0) -func percentileIndex(n int, percentile float64) int { - return int(float64(n-1) * percentile) -} - -// updateProcessPeak merges a process observation into the peak tracking map -func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) { - key := fmt.Sprintf("%d:%s", p.PID, p.Name) - existing, ok := a.processPeaks[key] - if !ok { - a.processPeaks[key] = &ProcessPeak{ - PID: p.PID, - Name: p.Name, - PeakCPU: p.CPUPercent, - PeakMem: p.MemRSS, - } - return - } - if p.CPUPercent > existing.PeakCPU { - existing.PeakCPU = p.CPUPercent - } - if p.MemRSS > existing.PeakMem { - existing.PeakMem = p.MemRSS - } -} - -// topProcesses returns the top N processes sorted by the given key function (descending) -func (a *Accumulator) topProcesses(keyFn func(*ProcessPeak) float64) []ProcessPeak { - all := make([]ProcessPeak, 0, len(a.processPeaks)) - for _, p := range a.processPeaks { - all = append(all, *p) - } - - sort.Slice(all, func(i, j int) bool { - return keyFn(&all[i]) > keyFn(&all[j]) - }) - - if len(all) > a.topN { - all = all[:a.topN] - } - return all -} diff --git a/internal/summary/accumulator_test.go b/internal/summary/accumulator_test.go deleted file mode 100644 index cdda597..0000000 --- a/internal/summary/accumulator_test.go +++ /dev/null @@ -1,598 +0,0 @@ -// ABOUTME: Tests for the summary accumulator that tracks metrics across a run. -// ABOUTME: Validates stats computation (peak/avg/P95), process peak tracking, and edge cases. -package summary - -import ( - "testing" - "time" - - "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" -) - -func TestAccumulator_NoSamples(t *testing.T) { - acc := NewAccumulator(5) - result := acc.Summarize() - if result != nil { - t.Errorf("expected nil summary for no samples, got %+v", result) - } -} - -func TestAccumulator_SingleSample(t *testing.T) { - acc := NewAccumulator(5) - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: 42.5}, - Memory: metrics.MemoryMetrics{ - UsedBytes: 1000, - UsedPercent: 50.0, - }, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // With a single sample, peak=avg=p95 - if s.CPUTotal.Peak != 42.5 { - t.Errorf("CPU peak: got %f, want 42.5", s.CPUTotal.Peak) - } - if s.CPUTotal.Avg != 42.5 { - t.Errorf("CPU avg: got %f, want 42.5", s.CPUTotal.Avg) - } - if s.CPUTotal.P95 != 42.5 { - t.Errorf("CPU p95: got %f, want 42.5", s.CPUTotal.P95) - } - if s.MemUsedBytes.Peak != 1000 { - t.Errorf("MemUsedBytes peak: got %f, want 1000", s.MemUsedBytes.Peak) - } - if s.MemUsedPercent.Peak != 50.0 { - t.Errorf("MemUsedPercent peak: got %f, want 50.0", s.MemUsedPercent.Peak) - } -} - -func TestAccumulator_Stats(t *testing.T) { - acc := NewAccumulator(5) - cpuValues := []float64{10, 20, 30, 40, 50} - for i, v := range cpuValues { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: v}, - Memory: metrics.MemoryMetrics{ - UsedBytes: uint64(v * 100), - UsedPercent: v, - }, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // Peak = max = 50 - if s.CPUTotal.Peak != 50 { - t.Errorf("CPU peak: got %f, want 50", s.CPUTotal.Peak) - } - // Avg = (10+20+30+40+50)/5 = 30 - if s.CPUTotal.Avg != 30 { - t.Errorf("CPU avg: got %f, want 30", s.CPUTotal.Avg) - } - // P95: sorted=[10,20,30,40,50], index=int(4*0.95)=int(3.8)=3, value=40 - if s.CPUTotal.P95 != 40 { - t.Errorf("CPU p95: got %f, want 40", s.CPUTotal.P95) - } -} - -func TestAccumulator_P95_LargerDataset(t *testing.T) { - acc := NewAccumulator(5) - // 20 values: 1, 2, 3, ..., 20 - for i := 1; i <= 20; i++ { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, - Memory: metrics.MemoryMetrics{}, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // P95: sorted=[1..20], index=int(19*0.95)=int(18.05)=18, value=19 - if s.CPUTotal.P95 != 19 { - t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) - } - // Avg = (1+2+...+20)/20 = 210/20 = 10.5 - if s.CPUTotal.Avg != 10.5 { - t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) - } -} - -func TestAccumulator_MemoryStats(t *testing.T) { - acc := NewAccumulator(5) - memBytes := []uint64{1000, 2000, 3000, 4000, 5000} - memPercent := []float64{10, 20, 30, 40, 50} - - for i := range memBytes { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: 0}, - Memory: metrics.MemoryMetrics{ - UsedBytes: memBytes[i], - UsedPercent: memPercent[i], - }, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // MemUsedBytes: peak=5000, avg=3000, p95=4000 - if s.MemUsedBytes.Peak != 5000 { - t.Errorf("MemUsedBytes peak: got %f, want 5000", s.MemUsedBytes.Peak) - } - if s.MemUsedBytes.Avg != 3000 { - t.Errorf("MemUsedBytes avg: got %f, want 3000", s.MemUsedBytes.Avg) - } - if s.MemUsedBytes.P95 != 4000 { - t.Errorf("MemUsedBytes p95: got %f, want 4000", s.MemUsedBytes.P95) - } - - // MemUsedPercent: peak=50, avg=30, p95=40 - if s.MemUsedPercent.Peak != 50 { - t.Errorf("MemUsedPercent peak: got %f, want 50", s.MemUsedPercent.Peak) - } - if s.MemUsedPercent.Avg != 30 { - t.Errorf("MemUsedPercent avg: got %f, want 30", s.MemUsedPercent.Avg) - } - if s.MemUsedPercent.P95 != 40 { - t.Errorf("MemUsedPercent p95: got %f, want 40", s.MemUsedPercent.P95) - } -} - -func TestAccumulator_ProcessPeaks(t *testing.T) { - acc := NewAccumulator(5) - - // Same PID across two samples; peaks should be retained - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - TopCPU: []metrics.ProcessMetrics{ - {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, - }, - TopMemory: []metrics.ProcessMetrics{ - {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, - }, - }) - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - TopCPU: []metrics.ProcessMetrics{ - {PID: 1, Name: "a", CPUPercent: 20, MemRSS: 50}, - }, - TopMemory: []metrics.ProcessMetrics{ - {PID: 1, Name: "a", CPUPercent: 5, MemRSS: 200}, - }, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // Should find PID 1 with peak CPU=20, peak mem=200 - found := false - for _, p := range s.TopCPUProcesses { - if p.PID == 1 { - found = true - if p.PeakCPU != 20 { - t.Errorf("PeakCPU: got %f, want 20", p.PeakCPU) - } - if p.PeakMem != 200 { - t.Errorf("PeakMem: got %d, want 200", p.PeakMem) - } - } - } - if !found { - t.Error("PID 1 not found in TopCPUProcesses") - } -} - -func TestAccumulator_ProcessPeaks_TopN(t *testing.T) { - acc := NewAccumulator(2) // Only top 2 - - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - TopCPU: []metrics.ProcessMetrics{ - {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, - {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, - {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, - }, - TopMemory: []metrics.ProcessMetrics{ - {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, - {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, - {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, - }, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // TopCPUProcesses should have at most 2 entries, sorted by PeakCPU descending - if len(s.TopCPUProcesses) != 2 { - t.Fatalf("TopCPUProcesses length: got %d, want 2", len(s.TopCPUProcesses)) - } - if s.TopCPUProcesses[0].PeakCPU != 90 { - t.Errorf("TopCPU[0] PeakCPU: got %f, want 90", s.TopCPUProcesses[0].PeakCPU) - } - if s.TopCPUProcesses[1].PeakCPU != 50 { - t.Errorf("TopCPU[1] PeakCPU: got %f, want 50", s.TopCPUProcesses[1].PeakCPU) - } - - // TopMemProcesses should have at most 2 entries, sorted by PeakMem descending - if len(s.TopMemProcesses) != 2 { - t.Fatalf("TopMemProcesses length: got %d, want 2", len(s.TopMemProcesses)) - } - if s.TopMemProcesses[0].PeakMem != 900 { - t.Errorf("TopMem[0] PeakMem: got %d, want 900", s.TopMemProcesses[0].PeakMem) - } - if s.TopMemProcesses[1].PeakMem != 500 { - t.Errorf("TopMem[1] PeakMem: got %d, want 500", s.TopMemProcesses[1].PeakMem) - } -} - -func TestAccumulator_ProcessPeaks_Dedup(t *testing.T) { - acc := NewAccumulator(5) - - // A process appears in both TopCPU and TopMemory - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - TopCPU: []metrics.ProcessMetrics{ - {PID: 1, Name: "proc", CPUPercent: 80, MemRSS: 100}, - }, - TopMemory: []metrics.ProcessMetrics{ - {PID: 1, Name: "proc", CPUPercent: 30, MemRSS: 500}, - }, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // The internal process map should have merged the peaks - // PeakCPU should be 80 (from TopCPU), PeakMem should be 500 (from TopMemory) - for _, p := range s.TopCPUProcesses { - if p.PID == 1 { - if p.PeakCPU != 80 { - t.Errorf("PeakCPU: got %f, want 80", p.PeakCPU) - } - if p.PeakMem != 500 { - t.Errorf("PeakMem: got %d, want 500", p.PeakMem) - } - } - } -} - -func TestAccumulator_SampleCount(t *testing.T) { - acc := NewAccumulator(5) - if acc.SampleCount() != 0 { - t.Errorf("initial SampleCount: got %d, want 0", acc.SampleCount()) - } - - for i := 0; i < 3; i++ { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - }) - } - - if acc.SampleCount() != 3 { - t.Errorf("SampleCount after 3 adds: got %d, want 3", acc.SampleCount()) - } -} - -func TestAccumulator_Duration(t *testing.T) { - acc := NewAccumulator(5) - start := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) - end := time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC) // 60 seconds later - - acc.Add(&metrics.SystemMetrics{ - Timestamp: start, - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - }) - acc.Add(&metrics.SystemMetrics{ - Timestamp: end, - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - if !s.StartTime.Equal(start) { - t.Errorf("StartTime: got %v, want %v", s.StartTime, start) - } - if s.DurationSeconds != 60 { - t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds) - } -} - -func TestAccumulator_AllPercentiles(t *testing.T) { - acc := NewAccumulator(5) - // 20 values: 1, 2, 3, ..., 20 - for i := 1; i <= 20; i++ { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, - Memory: metrics.MemoryMetrics{}, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // Peak = 20 - if s.CPUTotal.Peak != 20 { - t.Errorf("CPU peak: got %f, want 20", s.CPUTotal.Peak) - } - // P99: index=int(19*0.99)=int(18.81)=18, value=19 - if s.CPUTotal.P99 != 19 { - t.Errorf("CPU p99: got %f, want 19", s.CPUTotal.P99) - } - // P95: index=int(19*0.95)=int(18.05)=18, value=19 - if s.CPUTotal.P95 != 19 { - t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) - } - // P75: index=int(19*0.75)=int(14.25)=14, value=15 - if s.CPUTotal.P75 != 15 { - t.Errorf("CPU p75: got %f, want 15", s.CPUTotal.P75) - } - // P50: index=int(19*0.50)=int(9.5)=9, value=10 - if s.CPUTotal.P50 != 10 { - t.Errorf("CPU p50: got %f, want 10", s.CPUTotal.P50) - } - // Avg = (1+2+...+20)/20 = 210/20 = 10.5 - if s.CPUTotal.Avg != 10.5 { - t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) - } -} - -func TestAccumulator_ContainerMetrics(t *testing.T) { - acc := NewAccumulator(5) - - // Add samples with container metrics (HasDelta=true to indicate valid CPU measurements) - for i := 1; i <= 5; i++ { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: float64(i * 10)}, - Memory: metrics.MemoryMetrics{}, - Cgroups: map[string]*metrics.CgroupMetrics{ - "container-a": { - Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{ - TotalRSSBytes: uint64(i * 1000), - }, - }, - "container-b": { - Name: "container-b", - CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i * 2), HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{ - TotalRSSBytes: uint64(i * 2000), - }, - }, - }, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // Should have 2 containers - if len(s.Containers) != 2 { - t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) - } - - // Containers should be sorted by name - if s.Containers[0].Name != "container-a" { - t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name) - } - if s.Containers[1].Name != "container-b" { - t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name) - } - - // Container A: CPU cores [1,2,3,4,5], peak=5, avg=3 - containerA := s.Containers[0] - if containerA.CPUCores.Peak != 5 { - t.Errorf("container-a CPUCores.Peak: got %f, want 5", containerA.CPUCores.Peak) - } - if containerA.CPUCores.Avg != 3 { - t.Errorf("container-a CPUCores.Avg: got %f, want 3", containerA.CPUCores.Avg) - } - // Memory bytes [1000,2000,3000,4000,5000], peak=5000, avg=3000 - if containerA.MemoryBytes.Peak != 5000 { - t.Errorf("container-a MemoryBytes.Peak: got %f, want 5000", containerA.MemoryBytes.Peak) - } - if containerA.MemoryBytes.Avg != 3000 { - t.Errorf("container-a MemoryBytes.Avg: got %f, want 3000", containerA.MemoryBytes.Avg) - } - - // Container B: CPU cores [2,4,6,8,10], peak=10, avg=6 - containerB := s.Containers[1] - if containerB.CPUCores.Peak != 10 { - t.Errorf("container-b CPUCores.Peak: got %f, want 10", containerB.CPUCores.Peak) - } - if containerB.CPUCores.Avg != 6 { - t.Errorf("container-b CPUCores.Avg: got %f, want 6", containerB.CPUCores.Avg) - } -} - -func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) { - acc := NewAccumulator(5) - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - CPU: metrics.CPUMetrics{TotalPercent: 50}, - Memory: metrics.MemoryMetrics{}, - Cgroups: nil, // No containers - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - if len(s.Containers) != 0 { - t.Errorf("Containers length: got %d, want 0", len(s.Containers)) - } -} - -func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { - acc := NewAccumulator(5) - - // First sample: only container-a - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - Cgroups: map[string]*metrics.CgroupMetrics{ - "container-a": { - Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: 1, HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, - }, - }, - }) - - // Second sample: both containers - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - Cgroups: map[string]*metrics.CgroupMetrics{ - "container-a": { - Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: 2, HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000}, - }, - "container-b": { - Name: "container-b", - CPU: metrics.CgroupCPUMetrics{UsedCores: 5, HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000}, - }, - }, - }) - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - // Should have 2 containers - if len(s.Containers) != 2 { - t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) - } - - // Container A: 2 samples [1,2] - containerA := s.Containers[0] - if containerA.CPUCores.Peak != 2 { - t.Errorf("container-a CPUCores.Peak: got %f, want 2", containerA.CPUCores.Peak) - } - if containerA.CPUCores.Avg != 1.5 { - t.Errorf("container-a CPUCores.Avg: got %f, want 1.5", containerA.CPUCores.Avg) - } - - // Container B: 1 sample [5] - containerB := s.Containers[1] - if containerB.CPUCores.Peak != 5 { - t.Errorf("container-b CPUCores.Peak: got %f, want 5", containerB.CPUCores.Peak) - } - if containerB.CPUCores.Avg != 5 { - t.Errorf("container-b CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg) - } -} - -func TestAccumulator_ContainerMetrics_InvalidDeltaExcluded(t *testing.T) { - acc := NewAccumulator(5) - - // Sample 1: no valid CPU delta (first sample / underflow) — should be excluded from CPU stats - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - Cgroups: map[string]*metrics.CgroupMetrics{ - "runner": { - Name: "runner", - CPU: metrics.CgroupCPUMetrics{UsedCores: 0, HasDelta: false}, - Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, - }, - }, - }) - - // Samples 2-4: valid deltas - for i := 2; i <= 4; i++ { - acc.Add(&metrics.SystemMetrics{ - Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), - CPU: metrics.CPUMetrics{}, - Memory: metrics.MemoryMetrics{}, - Cgroups: map[string]*metrics.CgroupMetrics{ - "runner": { - Name: "runner", - CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, - Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: uint64(i * 1000)}, - }, - }, - }) - } - - s := acc.Summarize() - if s == nil { - t.Fatal("expected non-nil summary") - } - - if len(s.Containers) != 1 { - t.Fatalf("Containers length: got %d, want 1", len(s.Containers)) - } - - runner := s.Containers[0] - // CPU should only include samples 2,3,4 (values 2,3,4) — NOT the invalid zero - // Peak=4, Avg=3, P50=3 - if runner.CPUCores.Peak != 4 { - t.Errorf("CPUCores.Peak: got %f, want 4", runner.CPUCores.Peak) - } - if runner.CPUCores.Avg != 3 { - t.Errorf("CPUCores.Avg: got %f, want 3", runner.CPUCores.Avg) - } - if runner.CPUCores.P50 != 3 { - t.Errorf("CPUCores.P50: got %f, want 3", runner.CPUCores.P50) - } - - // Memory should include all 4 samples (memory is always valid) - // Values: 1000, 2000, 3000, 4000 - if runner.MemoryBytes.Peak != 4000 { - t.Errorf("MemoryBytes.Peak: got %f, want 4000", runner.MemoryBytes.Peak) - } - if runner.MemoryBytes.Avg != 2500 { - t.Errorf("MemoryBytes.Avg: got %f, want 2500", runner.MemoryBytes.Avg) - } -} diff --git a/internal/summary/push.go b/internal/summary/push.go deleted file mode 100644 index b6383db..0000000 --- a/internal/summary/push.go +++ /dev/null @@ -1,112 +0,0 @@ -// ABOUTME: HTTP client for pushing run summaries to the metrics receiver. -// ABOUTME: Reads execution context from GitHub Actions style environment variables. -package summary - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "os" - "time" -) - -// ExecutionContext holds GitHub Actions style identifiers for a workflow run -type ExecutionContext struct { - Organization string `json:"organization"` - Repository string `json:"repository"` - Workflow string `json:"workflow"` - Job string `json:"job"` - RunID string `json:"run_id"` -} - -// MetricsPayload is the complete payload sent to the receiver -type MetricsPayload struct { - Execution ExecutionContext `json:"execution"` - Summary RunSummary `json:"run_summary"` -} - -// PushClient sends metrics to the receiver service -type PushClient struct { - endpoint string - token string - client *http.Client - ctx ExecutionContext -} - -// NewPushClient creates a new push client configured from environment variables. -// If token is non-empty, it is sent as a Bearer token on each push request. -func NewPushClient(endpoint, token string) *PushClient { - return &PushClient{ - endpoint: endpoint, - token: token, - client: &http.Client{ - Timeout: 30 * time.Second, - }, - ctx: ExecutionContextFromEnv(), - } -} - -// ExecutionContextFromEnv reads execution context from GitHub Actions environment variables -func ExecutionContextFromEnv() ExecutionContext { - return ExecutionContext{ - Organization: getEnvWithFallback("GITHUB_REPOSITORY_OWNER", "GITEA_REPO_OWNER"), - Repository: getEnvWithFallback("GITHUB_REPOSITORY", "GITEA_REPO"), - Workflow: getEnvWithFallback("GITHUB_WORKFLOW", "GITEA_WORKFLOW"), - Job: getEnvWithFallback("GITHUB_JOB", "GITEA_JOB"), - RunID: getEnvWithFallback("GITHUB_RUN_ID", "GITEA_RUN_ID"), - } -} - -func getEnvWithFallback(keys ...string) string { - for _, key := range keys { - if val := os.Getenv(key); val != "" { - return val - } - } - return "" -} - -// Push sends the run summary to the receiver -func (p *PushClient) Push(ctx context.Context, summary *RunSummary) error { - if summary == nil { - return nil - } - - payload := MetricsPayload{ - Execution: p.ctx, - Summary: *summary, - } - - body, err := json.Marshal(payload) - if err != nil { - return fmt.Errorf("marshaling payload: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.endpoint, bytes.NewReader(body)) - if err != nil { - return fmt.Errorf("creating request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - if p.token != "" { - req.Header.Set("Authorization", "Bearer "+p.token) - } - - resp, err := p.client.Do(req) - if err != nil { - return fmt.Errorf("sending request: %w", err) - } - defer func() { _ = resp.Body.Close() }() - - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - - return nil -} - -// ExecutionContext returns the current execution context -func (p *PushClient) ExecutionContext() ExecutionContext { - return p.ctx -} diff --git a/internal/summary/push_test.go b/internal/summary/push_test.go deleted file mode 100644 index 552ae68..0000000 --- a/internal/summary/push_test.go +++ /dev/null @@ -1,202 +0,0 @@ -package summary - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "testing" - "time" -) - -func TestPushClient_Push(t *testing.T) { - var received MetricsPayload - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodPost { - t.Errorf("expected POST, got %s", r.Method) - } - if ct := r.Header.Get("Content-Type"); ct != "application/json" { - t.Errorf("expected Content-Type application/json, got %s", ct) - } - if err := json.NewDecoder(r.Body).Decode(&received); err != nil { - t.Errorf("failed to decode body: %v", err) - } - w.WriteHeader(http.StatusCreated) - })) - defer server.Close() - - client := NewPushClient(server.URL, "") - client.ctx = ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "12345", - } - - summary := &RunSummary{ - StartTime: time.Now().Add(-time.Minute), - EndTime: time.Now(), - DurationSeconds: 60.0, - SampleCount: 10, - CPUTotal: StatSummary{Peak: 80.0, Avg: 50.0, P95: 75.0}, - } - - err := client.Push(context.Background(), summary) - if err != nil { - t.Fatalf("Push() error = %v", err) - } - - if received.Execution.Organization != "test-org" { - t.Errorf("Organization = %q, want %q", received.Execution.Organization, "test-org") - } - if received.Execution.RunID != "12345" { - t.Errorf("RunID = %q, want %q", received.Execution.RunID, "12345") - } - if received.Summary.SampleCount != 10 { - t.Errorf("SampleCount = %d, want %d", received.Summary.SampleCount, 10) - } -} - -func TestPushClient_Push_NilSummary(t *testing.T) { - client := NewPushClient("http://localhost:9999", "") - err := client.Push(context.Background(), nil) - if err != nil { - t.Errorf("Push(nil) error = %v, want nil", err) - } -} - -func TestPushClient_Push_ServerError(t *testing.T) { - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(http.StatusInternalServerError) - })) - defer server.Close() - - client := NewPushClient(server.URL, "") - client.ctx = ExecutionContext{RunID: "test"} - - err := client.Push(context.Background(), &RunSummary{}) - if err == nil { - t.Error("Push() expected error for 500 response, got nil") - } -} - -func TestPushClient_Push_ConnectionError(t *testing.T) { - client := NewPushClient("http://localhost:1", "") // Invalid port - client.ctx = ExecutionContext{RunID: "test"} - - err := client.Push(context.Background(), &RunSummary{}) - if err == nil { - t.Error("Push() expected error for connection failure, got nil") - } -} - -func TestExecutionContextFromEnv(t *testing.T) { - // Save and restore env - origVars := map[string]string{ - "GITHUB_REPOSITORY_OWNER": "", - "GITHUB_REPOSITORY": "", - "GITHUB_WORKFLOW": "", - "GITHUB_JOB": "", - "GITHUB_RUN_ID": "", - } - for k := range origVars { - origVars[k] = getEnvWithFallback(k) - } - defer func() { - for k, v := range origVars { - if v == "" { - t.Setenv(k, "") - } - } - }() - - t.Setenv("GITHUB_REPOSITORY_OWNER", "my-org") - t.Setenv("GITHUB_REPOSITORY", "my-org/my-repo") - t.Setenv("GITHUB_WORKFLOW", "CI") - t.Setenv("GITHUB_JOB", "test") - t.Setenv("GITHUB_RUN_ID", "999") - - ctx := ExecutionContextFromEnv() - - if ctx.Organization != "my-org" { - t.Errorf("Organization = %q, want %q", ctx.Organization, "my-org") - } - if ctx.Repository != "my-org/my-repo" { - t.Errorf("Repository = %q, want %q", ctx.Repository, "my-org/my-repo") - } - if ctx.Workflow != "CI" { - t.Errorf("Workflow = %q, want %q", ctx.Workflow, "CI") - } - if ctx.Job != "test" { - t.Errorf("Job = %q, want %q", ctx.Job, "test") - } - if ctx.RunID != "999" { - t.Errorf("RunID = %q, want %q", ctx.RunID, "999") - } -} - -func TestExecutionContextFromEnv_GiteaFallback(t *testing.T) { - t.Setenv("GITHUB_RUN_ID", "") - t.Setenv("GITEA_RUN_ID", "gitea-123") - - ctx := ExecutionContextFromEnv() - - if ctx.RunID != "gitea-123" { - t.Errorf("RunID = %q, want %q (Gitea fallback)", ctx.RunID, "gitea-123") - } -} - -func TestPushClient_Push_WithToken(t *testing.T) { - var gotAuth string - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - gotAuth = r.Header.Get("Authorization") - w.WriteHeader(http.StatusCreated) - })) - defer server.Close() - - client := NewPushClient(server.URL, "my-token") - client.ctx = ExecutionContext{RunID: "test"} - - err := client.Push(context.Background(), &RunSummary{}) - if err != nil { - t.Fatalf("Push() error = %v", err) - } - if gotAuth != "Bearer my-token" { - t.Errorf("Authorization = %q, want %q", gotAuth, "Bearer my-token") - } -} - -func TestPushClient_Push_WithoutToken(t *testing.T) { - var gotAuth string - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - gotAuth = r.Header.Get("Authorization") - w.WriteHeader(http.StatusCreated) - })) - defer server.Close() - - client := NewPushClient(server.URL, "") - client.ctx = ExecutionContext{RunID: "test"} - - err := client.Push(context.Background(), &RunSummary{}) - if err != nil { - t.Fatalf("Push() error = %v", err) - } - if gotAuth != "" { - t.Errorf("Authorization = %q, want empty", gotAuth) - } -} - -func TestPushClient_ExecutionContext(t *testing.T) { - client := NewPushClient("http://example.com", "") - client.ctx = ExecutionContext{ - Organization: "org", - Repository: "repo", - RunID: "run", - } - - ctx := client.ExecutionContext() - if ctx.Organization != "org" { - t.Errorf("Organization = %q, want %q", ctx.Organization, "org") - } -} diff --git a/internal/summary/types.go b/internal/summary/types.go deleted file mode 100644 index dad0b01..0000000 --- a/internal/summary/types.go +++ /dev/null @@ -1,44 +0,0 @@ -// ABOUTME: Data types for run-level summary statistics. -// ABOUTME: Defines StatSummary, ProcessPeak, and RunSummary used to report metrics on shutdown. -package summary - -import "time" - -// StatSummary holds peak, percentiles, and average for a metric across the run -type StatSummary struct { - Peak float64 `json:"peak"` - P99 float64 `json:"p99"` - P95 float64 `json:"p95"` - P75 float64 `json:"p75"` - P50 float64 `json:"p50"` - Avg float64 `json:"avg"` -} - -// ProcessPeak holds the peak CPU and memory observed for a single process -type ProcessPeak struct { - PID int `json:"pid"` - Name string `json:"name"` - PeakCPU float64 `json:"peak_cpu_percent"` - PeakMem uint64 `json:"peak_mem_rss_bytes"` -} - -// ContainerSummary holds statistics for a single container across the run -type ContainerSummary struct { - Name string `json:"name"` - CPUCores StatSummary `json:"cpu_cores"` - MemoryBytes StatSummary `json:"memory_bytes"` -} - -// RunSummary holds the complete summary of a collection run -type RunSummary struct { - StartTime time.Time `json:"start_time"` - EndTime time.Time `json:"end_time"` - DurationSeconds float64 `json:"duration_seconds"` - SampleCount int `json:"sample_count"` - CPUTotal StatSummary `json:"cpu_total_percent"` - MemUsedBytes StatSummary `json:"mem_used_bytes"` - MemUsedPercent StatSummary `json:"mem_used_percent"` - TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"` - TopMemProcesses []ProcessPeak `json:"top_mem_processes"` - Containers []ContainerSummary `json:"containers"` -} diff --git a/internal/summary/writer.go b/internal/summary/writer.go deleted file mode 100644 index 30b392b..0000000 --- a/internal/summary/writer.go +++ /dev/null @@ -1,61 +0,0 @@ -// ABOUTME: Emits a RunSummary as a structured log entry via slog. -// ABOUTME: Follows the same slog pattern as internal/output/logger.go for consistency. -package summary - -import ( - "io" - "log/slog" -) - -// SummaryWriter outputs a RunSummary using structured logging -type SummaryWriter struct { - logger *slog.Logger -} - -// NewSummaryWriter creates a writer that emits summaries to the given output in the given format -func NewSummaryWriter(output io.Writer, format string) *SummaryWriter { - opts := &slog.HandlerOptions{Level: slog.LevelInfo} - - var handler slog.Handler - switch format { - case "text": - handler = slog.NewTextHandler(output, opts) - default: - handler = slog.NewJSONHandler(output, opts) - } - - return &SummaryWriter{ - logger: slog.New(handler), - } -} - -// Write emits the run summary as a single structured log entry -func (w *SummaryWriter) Write(s *RunSummary) { - if s == nil { - return - } - - w.logger.Info("run_summary", - slog.Time("start_time", s.StartTime), - slog.Time("end_time", s.EndTime), - slog.Float64("duration_seconds", s.DurationSeconds), - slog.Int("sample_count", s.SampleCount), - slog.Group("cpu_total_percent", - slog.Float64("peak", s.CPUTotal.Peak), - slog.Float64("avg", s.CPUTotal.Avg), - slog.Float64("p95", s.CPUTotal.P95), - ), - slog.Group("mem_used_bytes", - slog.Float64("peak", s.MemUsedBytes.Peak), - slog.Float64("avg", s.MemUsedBytes.Avg), - slog.Float64("p95", s.MemUsedBytes.P95), - ), - slog.Group("mem_used_percent", - slog.Float64("peak", s.MemUsedPercent.Peak), - slog.Float64("avg", s.MemUsedPercent.Avg), - slog.Float64("p95", s.MemUsedPercent.P95), - ), - slog.Any("top_cpu_processes", s.TopCPUProcesses), - slog.Any("top_mem_processes", s.TopMemProcesses), - ) -} diff --git a/internal/summary/writer_test.go b/internal/summary/writer_test.go deleted file mode 100644 index d787ec6..0000000 --- a/internal/summary/writer_test.go +++ /dev/null @@ -1,93 +0,0 @@ -// ABOUTME: Tests for the summary writer that emits run summaries via slog. -// ABOUTME: Validates JSON output, text output, and nil summary handling. -package summary - -import ( - "bytes" - "strings" - "testing" - "time" -) - -func TestSummaryWriter_JSON(t *testing.T) { - var buf bytes.Buffer - w := NewSummaryWriter(&buf, "json") - - s := &RunSummary{ - StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), - DurationSeconds: 60, - SampleCount: 12, - CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, - MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, - MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, - TopCPUProcesses: []ProcessPeak{ - {PID: 1, Name: "busy", PeakCPU: 95.5, PeakMem: 1000}, - }, - TopMemProcesses: []ProcessPeak{ - {PID: 2, Name: "hungry", PeakCPU: 10.0, PeakMem: 8000000}, - }, - } - - w.Write(s) - - output := buf.String() - if !strings.Contains(output, "run_summary") { - t.Errorf("output should contain 'run_summary', got: %s", output) - } - if !strings.Contains(output, "duration_seconds") { - t.Errorf("output should contain 'duration_seconds', got: %s", output) - } - if !strings.Contains(output, "sample_count") { - t.Errorf("output should contain 'sample_count', got: %s", output) - } - if !strings.Contains(output, "cpu_total_percent") { - t.Errorf("output should contain 'cpu_total_percent', got: %s", output) - } - if !strings.Contains(output, "mem_used_bytes") { - t.Errorf("output should contain 'mem_used_bytes', got: %s", output) - } - if !strings.Contains(output, "top_cpu_processes") { - t.Errorf("output should contain 'top_cpu_processes', got: %s", output) - } - if !strings.Contains(output, "top_mem_processes") { - t.Errorf("output should contain 'top_mem_processes', got: %s", output) - } -} - -func TestSummaryWriter_Text(t *testing.T) { - var buf bytes.Buffer - w := NewSummaryWriter(&buf, "text") - - s := &RunSummary{ - StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), - EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), - DurationSeconds: 60, - SampleCount: 12, - CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, - MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, - MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, - } - - w.Write(s) - - output := buf.String() - if !strings.Contains(output, "run_summary") { - t.Errorf("output should contain 'run_summary', got: %s", output) - } - if !strings.Contains(output, "duration_seconds") { - t.Errorf("output should contain 'duration_seconds', got: %s", output) - } -} - -func TestSummaryWriter_NilSummary(t *testing.T) { - var buf bytes.Buffer - w := NewSummaryWriter(&buf, "json") - - // Should not panic and should not write anything - w.Write(nil) - - if buf.Len() != 0 { - t.Errorf("expected no output for nil summary, got: %s", buf.String()) - } -} diff --git a/test/docker/docker-compose-stress.yaml b/test/docker/docker-compose-stress.yaml deleted file mode 100644 index d4a0be0..0000000 --- a/test/docker/docker-compose-stress.yaml +++ /dev/null @@ -1,131 +0,0 @@ -# Docker Compose stress test with receiver -# See README.md "Docker Compose" section for the full token workflow. -# -# This test: -# 1. Starts the metrics receiver (with read-token and hmac-key) -# 2. You generate a scoped push token via POST /api/v1/token -# 3. Start the collector with COLLECTOR_PUSH_TOKEN set -# 4. Runs heavy CPU/memory workloads in multiple containers with shared PID namespace -# 5. Collector gathers metrics and pushes summary to receiver on shutdown -# -# To trigger the push, stop the collector gracefully: -# docker compose -f test/docker/docker-compose-stress.yaml stop collector - -services: - # Metrics receiver - stores summaries in SQLite - receiver: - build: - context: ../.. - dockerfile: Dockerfile - target: receiver - ports: - - "9080:8080" - environment: - - DB_PATH=/data/metrics.db - - RECEIVER_READ_TOKEN=dummyreadtoken - - RECEIVER_HMAC_KEY=dummyhmackey - volumes: - - receiver-data:/data - healthcheck: - test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"] - interval: 5s - timeout: 3s - retries: 3 - - # Heavy CPU workload - uses stress-ng (owns the PID namespace) - cpu-stress: - image: alexeiled/stress-ng:latest - command: - - --cpu - - "3" - - --timeout - - "300s" - - --metrics-brief - deploy: - resources: - limits: - cpus: "2.0" - memory: 128M - # This container owns the PID namespace - - # Memory-intensive workload - shares PID namespace with cpu-stress - mem-stress: - image: alexeiled/stress-ng:latest - command: - - --vm - - "2" - - --vm-bytes - - "64M" - - --timeout - - "300s" - - --metrics-brief - deploy: - resources: - limits: - cpus: "0.5" - memory: 256M - pid: "service:cpu-stress" - depends_on: - - cpu-stress - - # IO workload - continuous disk writes - io-stress: - image: busybox:latest - command: - - /bin/sh - - -c - - | - echo "IO stress started" - # 'dd' will be our identifiable process - while true; do - dd if=/dev/zero of=/tmp/testfile bs=1M count=100 2>/dev/null - rm -f /tmp/testfile - done - deploy: - resources: - limits: - cpus: "0.5" - memory: 128M - pid: "service:cpu-stress" - depends_on: - - cpu-stress - - # Resource collector - pushes to receiver on shutdown - collector: - build: - context: ../.. - dockerfile: Dockerfile - target: collector - command: - - --interval=2s - - --top=10 - - --log-format=json - - --push-endpoint=http://receiver:8080/api/v1/metrics - environment: - # Push token — pass via COLLECTOR_PUSH_TOKEN from host env - COLLECTOR_PUSH_TOKEN: "${COLLECTOR_PUSH_TOKEN}" - # Execution context for the receiver - GITHUB_REPOSITORY_OWNER: "test-org" - GITHUB_REPOSITORY: "test-org/stress-test" - GITHUB_WORKFLOW: "stress-test-workflow" - GITHUB_JOB: "heavy-workload" - GITHUB_RUN_ID: "stress-run-001" - # Cgroup configuration - # stress-ng-cpu is the worker process name for CPU stress - # stress-ng-vm is the worker process name for memory stress - CGROUP_PROCESS_MAP: '{"stress-ng-cpu":"cpu-stress","stress-ng-vm":"mem-stress","dd":"io-stress","resource-collec":"collector"}' - CGROUP_LIMITS: '{"cpu-stress":{"cpu":"1","memory":"128Mi"},"mem-stress":{"cpu":"500m","memory":"256Mi"},"io-stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"200m","memory":"64Mi"}}' - deploy: - resources: - limits: - cpus: "0.2" - memory: 64M - pid: "service:cpu-stress" - depends_on: - receiver: - condition: service_healthy - cpu-stress: - condition: service_started - -volumes: - receiver-data: diff --git a/test/docker/docker-compose.yaml b/test/docker/docker-compose.yaml deleted file mode 100644 index 87a9a9a..0000000 --- a/test/docker/docker-compose.yaml +++ /dev/null @@ -1,81 +0,0 @@ -# Docker Compose test setup for cgroup grouping verification -# Run with: docker compose -f test/docker/docker-compose.yaml up -# -# NOTE: Docker Compose doesn't have a direct equivalent to K8s shareProcessNamespace. -# Options: -# 1. pid: "host" - sees ALL host processes (not container-specific) -# 2. pid: "service:" - chains PID namespace to another service -# -# For proper testing, use Kubernetes or run containers manually with --pid=container: - -services: - # Simulate a runner workload (this will be the "root" of the shared PID namespace) - # Uses 'cat' reading from a fifo as a unique identifiable process - runner: - image: busybox:latest - command: - - /bin/sh - - -c - - | - echo "Runner started (PID 1 in namespace)" - mkfifo /tmp/runner_fifo - # 'cat' will be our identifiable runner process (blocks on fifo) - cat /tmp/runner_fifo & - CAT_PID=$! - # Generate CPU load with dd - while true; do - dd if=/dev/zero of=/dev/null bs=1M count=50 2>/dev/null - done - deploy: - resources: - limits: - cpus: "0.5" - memory: 256M - # This container owns the PID namespace - - # Simulate a sidecar service - shares PID namespace with runner - sidecar: - image: busybox:latest - command: - - /bin/sh - - -c - - | - echo "Sidecar started" - # List processes to verify shared namespace - ps aux - while true; do - sleep 10 - done - deploy: - resources: - limits: - cpus: "0.1" - memory: 128M - pid: "service:runner" # Share PID namespace with runner - depends_on: - - runner - - # Resource collector - shares PID namespace with runner - collector: - build: - context: ../.. - dockerfile: Dockerfile - target: collector - command: - - --interval=3s - - --top=5 - - --log-format=json - environment: - # Map unique process names to container names - # 'cat' runs only in runner, 'sleep' runs only in sidecar - CGROUP_PROCESS_MAP: '{"cat":"runner","sleep":"sidecar","resource-collec":"collector"}' - CGROUP_LIMITS: '{"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' - deploy: - resources: - limits: - cpus: "0.1" - memory: 64M - pid: "service:runner" # Share PID namespace with runner - depends_on: - - runner - - sidecar diff --git a/test/k8s/test-cgroup-grouping.yaml b/test/k8s/test-cgroup-grouping.yaml deleted file mode 100644 index e46545b..0000000 --- a/test/k8s/test-cgroup-grouping.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Test manifest to verify cgroup grouping behavior -# This pod runs multiple containers with different resource limits -# and a collector sidecar that groups metrics by cgroup/container -apiVersion: v1 -kind: Pod -metadata: - name: test-cgroup-grouping - labels: - app: test-cgroup-grouping -spec: - # Share PID namespace so collector can see all processes - shareProcessNamespace: true - - containers: - # Main workload container - simulates a runner - - name: runner - image: busybox:latest - command: - - /bin/sh - - -c - - | - echo "Runner container started" - # Simulate some CPU work - while true; do - dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null - sleep 1 - done - resources: - requests: - cpu: "100m" - memory: "64Mi" - limits: - cpu: "500m" - memory: "256Mi" - - # Sidecar container - simulates nginx or another service - - name: sidecar - image: busybox:latest - command: - - /bin/sh - - -c - - | - echo "Sidecar container started" - # Simulate some lighter work - while true; do - sleep 5 - done - resources: - requests: - cpu: "50m" - memory: "32Mi" - limits: - cpu: "100m" - memory: "128Mi" - - # Resource collector sidecar - - name: collector - image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image - args: - - --interval=5s - - --top=3 - env: - # Map process names to container names - # "sh" is the main process in busybox containers - # You may need to adjust based on actual process names - - name: CGROUP_PROCESS_MAP - value: | - {"sh":"runner","sleep":"sidecar","collector":"collector"} - # Define limits for each container (must match names in CGROUP_PROCESS_MAP) - - name: CGROUP_LIMITS - value: | - {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}} - resources: - requests: - cpu: "50m" - memory: "32Mi" - limits: - cpu: "100m" - memory: "64Mi" - # Mount proc read-only for process discovery - volumeMounts: - - name: proc - mountPath: /proc - readOnly: true - - volumes: - - name: proc - hostPath: - path: /proc - type: Directory - - restartPolicy: Never ---- -# Alternative: Using a Deployment for longer-running tests -apiVersion: v1 -kind: Pod -metadata: - name: test-cgroup-simple - labels: - app: test-cgroup-simple -spec: - shareProcessNamespace: true - - containers: - # Stress container to generate CPU/memory load - - name: stress - image: progrium/stress:latest - args: - - --cpu - - "1" - - --vm - - "1" - - --vm-bytes - - "64M" - - --timeout - - "300s" - resources: - limits: - cpu: "500m" - memory: "128Mi" - - # Collector - - name: collector - image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image - args: - - --interval=2s - - --top=5 - env: - - name: CGROUP_PROCESS_MAP - value: '{"stress":"stress","collector":"collector"}' - - name: CGROUP_LIMITS - value: '{"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' - resources: - limits: - cpu: "100m" - memory: "64Mi" - volumeMounts: - - name: proc - mountPath: /proc - readOnly: true - - volumes: - - name: proc - hostPath: - path: /proc - type: Directory - - restartPolicy: Never diff --git a/test/local-test.sh b/test/local-test.sh deleted file mode 100755 index d56f477..0000000 --- a/test/local-test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Local test script to verify cgroup grouping -# Run from project root: ./test/local-test.sh - -set -e - -echo "Building collector..." -go build -o bin/collector ./cmd/collector - -echo "" -echo "Testing cgroup parsing on current system..." -echo "Current process cgroup:" -cat /proc/self/cgroup 2>/dev/null || echo "Cannot read /proc/self/cgroup (expected on macOS)" - -echo "" -echo "Running collector for 10 seconds with cgroup grouping..." -echo "Press Ctrl+C to stop early" -echo "" - -# Set up test environment variables -# Map common process names to container names -export CGROUP_PROCESS_MAP='{"bash":"shell","collector":"collector","zsh":"shell"}' -export CGROUP_LIMITS='{"shell":{"cpu":"2","memory":"4Gi"},"collector":{"cpu":"1","memory":"1Gi"}}' - -# Run collector -timeout 10 ./bin/collector \ - --interval=2s \ - --top=5 \ - --log-format=json \ - 2>/dev/null || true - -echo "" -echo "Test complete!" -echo "" -echo "Note: On macOS, cgroup paths will be empty since cgroups are a Linux feature." -echo "To test properly, run in a Linux container or VM."