From 54269e8a0e98ca6e71240e45600d9ba7bc6db590 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Wed, 4 Feb 2026 15:10:07 +0100 Subject: [PATCH 01/21] chore: remove darwin from build targets --- .goreleaser.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.goreleaser.yaml b/.goreleaser.yaml index de03c63..661607b 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -14,7 +14,6 @@ builds: - CGO_ENABLED=0 goos: - linux - - darwin goarch: - amd64 - arm64 From 7201a527d839100523097b49dab7dc3068dbf56a Mon Sep 17 00:00:00 2001 From: Waldemar Kindler Date: Wed, 4 Feb 2026 16:21:17 +0100 Subject: [PATCH 02/21] feat(collector): Summaries metrics at the end of the process --- cmd/collector/main.go | 5 + internal/collector/collector.go | 47 ++- internal/collector/collector_test.go | 98 +++++++ internal/collector/testdata/proc/1/stat | 1 + internal/collector/testdata/proc/1/status | 14 + internal/collector/testdata/proc/cpuinfo | 1 + internal/collector/testdata/proc/meminfo | 5 + internal/collector/testdata/proc/stat | 1 + internal/summary/accumulator.go | 138 +++++++++ internal/summary/accumulator_test.go | 335 ++++++++++++++++++++++ internal/summary/types.go | 33 +++ internal/summary/writer.go | 81 ++++++ internal/summary/writer_test.go | 93 ++++++ 13 files changed, 844 insertions(+), 8 deletions(-) create mode 100644 internal/collector/collector_test.go create mode 100644 internal/collector/testdata/proc/1/stat create mode 100644 internal/collector/testdata/proc/1/status create mode 100644 internal/collector/testdata/proc/cpuinfo create mode 100644 internal/collector/testdata/proc/meminfo create mode 100644 internal/collector/testdata/proc/stat create mode 100644 internal/summary/accumulator.go create mode 100644 internal/summary/accumulator_test.go create mode 100644 internal/summary/types.go create mode 100644 internal/summary/writer.go create mode 100644 internal/summary/writer_test.go diff --git a/cmd/collector/main.go b/cmd/collector/main.go index 
8009586..b34561b 100644 --- a/cmd/collector/main.go +++ b/cmd/collector/main.go @@ -12,6 +12,7 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/collector" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) const ( @@ -53,6 +54,10 @@ func main() { TopN: *topN, }, metricsWriter, appLogger) + // Attach summary writer to emit run summary on shutdown + summaryWriter := summary.NewSummaryWriter(os.Stdout, *logFormat) + c.SetSummaryWriter(summaryWriter) + // Setup signal handling for graceful shutdown ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/internal/collector/collector.go b/internal/collector/collector.go index cbc5683..134ab3a 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -8,6 +8,7 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) // Config holds the collector configuration @@ -19,22 +20,30 @@ type Config struct { // Collector orchestrates metric collection type Collector struct { - config Config - aggregator *metrics.Aggregator - writer output.Writer - logger *slog.Logger + config Config + aggregator *metrics.Aggregator + writer output.Writer + logger *slog.Logger + accumulator *summary.Accumulator + summaryWriter *summary.SummaryWriter } // New creates a new collector func New(cfg Config, writer output.Writer, logger *slog.Logger) *Collector { return &Collector{ - config: cfg, - aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), - writer: writer, - logger: logger, + config: cfg, + aggregator: metrics.NewAggregator(cfg.ProcPath, cfg.TopN), + writer: writer, + logger: logger, + accumulator: summary.NewAccumulator(cfg.TopN), } } +// 
SetSummaryWriter attaches a summary writer for emitting run summaries on shutdown +func (c *Collector) SetSummaryWriter(w *summary.SummaryWriter) { + c.summaryWriter = w +} + // Run starts the collector loop and blocks until context is cancelled func (c *Collector) Run(ctx context.Context) error { c.logger.Info("collector starting", @@ -55,6 +64,7 @@ func (c *Collector) Run(ctx context.Context) error { select { case <-ctx.Done(): c.logger.Info("collector stopping") + c.emitSummary() return ctx.Err() case <-ticker.C: if err := c.collect(); err != nil { @@ -75,9 +85,30 @@ func (c *Collector) collect() error { return fmt.Errorf("writing metrics: %w", err) } + c.accumulator.Add(m) + return nil } +// emitSummary computes and writes the run summary if a writer is configured +func (c *Collector) emitSummary() { + if c.summaryWriter == nil { + return + } + + s := c.accumulator.Summarize() + if s == nil { + c.logger.Info("no samples collected, skipping run summary") + return + } + + c.logger.Info("emitting run summary", + slog.Int("sample_count", s.SampleCount), + slog.Float64("duration_seconds", s.DurationSeconds), + ) + c.summaryWriter.Write(s) +} + // CollectOnce performs a single collection and returns the metrics func (c *Collector) CollectOnce() (*metrics.SystemMetrics, error) { return c.aggregator.Collect() diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go new file mode 100644 index 0000000..9251a51 --- /dev/null +++ b/internal/collector/collector_test.go @@ -0,0 +1,98 @@ +// ABOUTME: Tests for the collector's summary integration. +// ABOUTME: Validates that run summaries are emitted on shutdown and handles missing writer gracefully. 
+package collector + +import ( + "bytes" + "context" + "log/slog" + "strings" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/output" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestCollector_EmitsSummaryOnShutdown(t *testing.T) { + // Use testdata/proc as the proc filesystem + procPath := "testdata/proc" + + // Metrics output (regular collection output) + var metricsOut bytes.Buffer + metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ + Output: &metricsOut, + Format: output.LogFormatJSON, + Level: slog.LevelInfo, + }) + + // Summary output + var summaryOut bytes.Buffer + sw := summary.NewSummaryWriter(&summaryOut, "json") + + // Silence app logs + appLogger := slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + + c := New(Config{ + ProcPath: procPath, + Interval: 50 * time.Millisecond, + TopN: 5, + }, metricsWriter, appLogger) + + c.SetSummaryWriter(sw) + + // Run collector briefly then cancel + ctx, cancel := context.WithCancel(context.Background()) + go func() { + // Let at least 2 collection cycles run + time.Sleep(150 * time.Millisecond) + cancel() + }() + + _ = c.Run(ctx) + + // Verify summary was emitted + summaryOutput := summaryOut.String() + if !strings.Contains(summaryOutput, "run_summary") { + t.Errorf("expected 'run_summary' in output, got: %s", summaryOutput) + } + if !strings.Contains(summaryOutput, "duration_seconds") { + t.Errorf("expected 'duration_seconds' in output, got: %s", summaryOutput) + } + if !strings.Contains(summaryOutput, "sample_count") { + t.Errorf("expected 'sample_count' in output, got: %s", summaryOutput) + } +} + +func TestCollector_NoSummaryWithoutWriter(t *testing.T) { + procPath := "testdata/proc" + + var metricsOut bytes.Buffer + metricsWriter := output.NewLoggerWriter(output.LoggerConfig{ + Output: &metricsOut, + Format: output.LogFormatJSON, + Level: slog.LevelInfo, + }) + + appLogger := 
slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)) + + c := New(Config{ + ProcPath: procPath, + Interval: 50 * time.Millisecond, + TopN: 5, + }, metricsWriter, appLogger) + + // Deliberately do NOT set a summary writer + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(100 * time.Millisecond) + cancel() + }() + + // Should not panic + err := c.Run(ctx) + if err != context.Canceled { + t.Errorf("expected context.Canceled, got: %v", err) + } +} diff --git a/internal/collector/testdata/proc/1/stat b/internal/collector/testdata/proc/1/stat new file mode 100644 index 0000000..01d6595 --- /dev/null +++ b/internal/collector/testdata/proc/1/stat @@ -0,0 +1 @@ +1 (init) S 0 1 1 0 -1 4194560 1000 0 0 0 100 50 0 0 20 0 1 0 1 10000000 500 18446744073709551615 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/internal/collector/testdata/proc/1/status b/internal/collector/testdata/proc/1/status new file mode 100644 index 0000000..0b4d9e3 --- /dev/null +++ b/internal/collector/testdata/proc/1/status @@ -0,0 +1,14 @@ +Name: init +Uid: 0 0 0 0 +Gid: 0 0 0 0 +VmPeak: 10000 kB +VmSize: 10000 kB +VmRSS: 5000 kB +VmData: 3000 kB +VmStk: 200 kB +VmExe: 100 kB +VmLib: 1000 kB +RssAnon: 3000 kB +RssFile: 1500 kB +RssShmem: 500 kB +Threads: 1 diff --git a/internal/collector/testdata/proc/cpuinfo b/internal/collector/testdata/proc/cpuinfo new file mode 100644 index 0000000..9703a4e --- /dev/null +++ b/internal/collector/testdata/proc/cpuinfo @@ -0,0 +1 @@ +processor : 0 diff --git a/internal/collector/testdata/proc/meminfo b/internal/collector/testdata/proc/meminfo new file mode 100644 index 0000000..2993ff4 --- /dev/null +++ b/internal/collector/testdata/proc/meminfo @@ -0,0 +1,5 @@ +MemTotal: 16348500 kB +MemFree: 8000000 kB +MemAvailable: 12000000 kB +Buffers: 500000 kB +Cached: 3000000 kB diff --git a/internal/collector/testdata/proc/stat b/internal/collector/testdata/proc/stat new file mode 100644 index 0000000..513d56a --- 
/dev/null +++ b/internal/collector/testdata/proc/stat @@ -0,0 +1 @@ +cpu 10000 500 3000 80000 200 50 30 0 0 0 diff --git a/internal/summary/accumulator.go b/internal/summary/accumulator.go new file mode 100644 index 0000000..bc3b1de --- /dev/null +++ b/internal/summary/accumulator.go @@ -0,0 +1,138 @@ +// ABOUTME: Accumulates system metrics samples across a collection run. +// ABOUTME: Computes peak, average, and P95 statistics for CPU and memory on demand. +package summary + +import ( + "fmt" + "sort" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" +) + +// Accumulator collects metric samples and computes run-level statistics +type Accumulator struct { + topN int + cpuValues []float64 + memBytesValues []float64 + memPctValues []float64 + processPeaks map[string]*ProcessPeak + startTime time.Time + endTime time.Time + sampleCount int +} + +// NewAccumulator creates an accumulator that tracks the top N processes +func NewAccumulator(topN int) *Accumulator { + return &Accumulator{ + topN: topN, + processPeaks: make(map[string]*ProcessPeak), + } +} + +// Add records a single metrics sample +func (a *Accumulator) Add(m *metrics.SystemMetrics) { + a.sampleCount++ + if a.sampleCount == 1 { + a.startTime = m.Timestamp + } + a.endTime = m.Timestamp + + a.cpuValues = append(a.cpuValues, m.CPU.TotalPercent) + a.memBytesValues = append(a.memBytesValues, float64(m.Memory.UsedBytes)) + a.memPctValues = append(a.memPctValues, m.Memory.UsedPercent) + + for _, p := range m.TopCPU { + a.updateProcessPeak(p) + } + for _, p := range m.TopMemory { + a.updateProcessPeak(p) + } +} + +// Summarize computes and returns the run summary, or nil if no samples were added +func (a *Accumulator) Summarize() *RunSummary { + if a.sampleCount == 0 { + return nil + } + + return &RunSummary{ + StartTime: a.startTime, + EndTime: a.endTime, + DurationSeconds: a.endTime.Sub(a.startTime).Seconds(), + SampleCount: a.sampleCount, + CPUTotal: 
computeStats(a.cpuValues), + MemUsedBytes: computeStats(a.memBytesValues), + MemUsedPercent: computeStats(a.memPctValues), + TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }), + TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }), + } +} + +// SampleCount returns the number of samples added +func (a *Accumulator) SampleCount() int { + return a.sampleCount +} + +// computeStats calculates peak, average, and P95 from a sorted copy of the values +func computeStats(values []float64) StatSummary { + n := len(values) + if n == 0 { + return StatSummary{} + } + + sorted := make([]float64, n) + copy(sorted, values) + sort.Float64s(sorted) + + var sum float64 + for _, v := range sorted { + sum += v + } + + p95Index := int(float64(n-1) * 0.95) + + return StatSummary{ + Peak: sorted[n-1], + Avg: sum / float64(n), + P95: sorted[p95Index], + } +} + +// updateProcessPeak merges a process observation into the peak tracking map +func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) { + key := fmt.Sprintf("%d:%s", p.PID, p.Name) + existing, ok := a.processPeaks[key] + if !ok { + a.processPeaks[key] = &ProcessPeak{ + PID: p.PID, + Name: p.Name, + PeakCPU: p.CPUPercent, + PeakMem: p.MemRSS, + } + return + } + if p.CPUPercent > existing.PeakCPU { + existing.PeakCPU = p.CPUPercent + } + if p.MemRSS > existing.PeakMem { + existing.PeakMem = p.MemRSS + } +} + +// topProcesses returns the top N processes sorted by the given key function (descending) +func (a *Accumulator) topProcesses(keyFn func(*ProcessPeak) float64) []ProcessPeak { + all := make([]ProcessPeak, 0, len(a.processPeaks)) + for _, p := range a.processPeaks { + all = append(all, *p) + } + + sort.Slice(all, func(i, j int) bool { + return keyFn(&all[i]) > keyFn(&all[j]) + }) + + if len(all) > a.topN { + all = all[:a.topN] + } + return all +} diff --git a/internal/summary/accumulator_test.go b/internal/summary/accumulator_test.go new file mode 
100644 index 0000000..76f6d47 --- /dev/null +++ b/internal/summary/accumulator_test.go @@ -0,0 +1,335 @@ +// ABOUTME: Tests for the summary accumulator that tracks metrics across a run. +// ABOUTME: Validates stats computation (peak/avg/P95), process peak tracking, and edge cases. +package summary + +import ( + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" +) + +func TestAccumulator_NoSamples(t *testing.T) { + acc := NewAccumulator(5) + result := acc.Summarize() + if result != nil { + t.Errorf("expected nil summary for no samples, got %+v", result) + } +} + +func TestAccumulator_SingleSample(t *testing.T) { + acc := NewAccumulator(5) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 42.5}, + Memory: metrics.MemoryMetrics{ + UsedBytes: 1000, + UsedPercent: 50.0, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // With a single sample, peak=avg=p95 + if s.CPUTotal.Peak != 42.5 { + t.Errorf("CPU peak: got %f, want 42.5", s.CPUTotal.Peak) + } + if s.CPUTotal.Avg != 42.5 { + t.Errorf("CPU avg: got %f, want 42.5", s.CPUTotal.Avg) + } + if s.CPUTotal.P95 != 42.5 { + t.Errorf("CPU p95: got %f, want 42.5", s.CPUTotal.P95) + } + if s.MemUsedBytes.Peak != 1000 { + t.Errorf("MemUsedBytes peak: got %f, want 1000", s.MemUsedBytes.Peak) + } + if s.MemUsedPercent.Peak != 50.0 { + t.Errorf("MemUsedPercent peak: got %f, want 50.0", s.MemUsedPercent.Peak) + } +} + +func TestAccumulator_Stats(t *testing.T) { + acc := NewAccumulator(5) + cpuValues := []float64{10, 20, 30, 40, 50} + for i, v := range cpuValues { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: v}, + Memory: metrics.MemoryMetrics{ + UsedBytes: uint64(v * 100), + UsedPercent: v, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected 
non-nil summary") + } + + // Peak = max = 50 + if s.CPUTotal.Peak != 50 { + t.Errorf("CPU peak: got %f, want 50", s.CPUTotal.Peak) + } + // Avg = (10+20+30+40+50)/5 = 30 + if s.CPUTotal.Avg != 30 { + t.Errorf("CPU avg: got %f, want 30", s.CPUTotal.Avg) + } + // P95: sorted=[10,20,30,40,50], index=int(4*0.95)=int(3.8)=3, value=40 + if s.CPUTotal.P95 != 40 { + t.Errorf("CPU p95: got %f, want 40", s.CPUTotal.P95) + } +} + +func TestAccumulator_P95_LargerDataset(t *testing.T) { + acc := NewAccumulator(5) + // 20 values: 1, 2, 3, ..., 20 + for i := 1; i <= 20; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, + Memory: metrics.MemoryMetrics{}, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // P95: sorted=[1..20], index=int(19*0.95)=int(18.05)=18, value=19 + if s.CPUTotal.P95 != 19 { + t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) + } + // Avg = (1+2+...+20)/20 = 210/20 = 10.5 + if s.CPUTotal.Avg != 10.5 { + t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) + } +} + +func TestAccumulator_MemoryStats(t *testing.T) { + acc := NewAccumulator(5) + memBytes := []uint64{1000, 2000, 3000, 4000, 5000} + memPercent := []float64{10, 20, 30, 40, 50} + + for i := range memBytes { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 0}, + Memory: metrics.MemoryMetrics{ + UsedBytes: memBytes[i], + UsedPercent: memPercent[i], + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // MemUsedBytes: peak=5000, avg=3000, p95=4000 + if s.MemUsedBytes.Peak != 5000 { + t.Errorf("MemUsedBytes peak: got %f, want 5000", s.MemUsedBytes.Peak) + } + if s.MemUsedBytes.Avg != 3000 { + t.Errorf("MemUsedBytes avg: got %f, want 3000", s.MemUsedBytes.Avg) + } + if s.MemUsedBytes.P95 != 4000 { + t.Errorf("MemUsedBytes p95: 
got %f, want 4000", s.MemUsedBytes.P95) + } + + // MemUsedPercent: peak=50, avg=30, p95=40 + if s.MemUsedPercent.Peak != 50 { + t.Errorf("MemUsedPercent peak: got %f, want 50", s.MemUsedPercent.Peak) + } + if s.MemUsedPercent.Avg != 30 { + t.Errorf("MemUsedPercent avg: got %f, want 30", s.MemUsedPercent.Avg) + } + if s.MemUsedPercent.P95 != 40 { + t.Errorf("MemUsedPercent p95: got %f, want 40", s.MemUsedPercent.P95) + } +} + +func TestAccumulator_ProcessPeaks(t *testing.T) { + acc := NewAccumulator(5) + + // Same PID across two samples; peaks should be retained + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 10, MemRSS: 100}, + }, + }) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 20, MemRSS: 50}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "a", CPUPercent: 5, MemRSS: 200}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should find PID 1 with peak CPU=20, peak mem=200 + found := false + for _, p := range s.TopCPUProcesses { + if p.PID == 1 { + found = true + if p.PeakCPU != 20 { + t.Errorf("PeakCPU: got %f, want 20", p.PeakCPU) + } + if p.PeakMem != 200 { + t.Errorf("PeakMem: got %d, want 200", p.PeakMem) + } + } + } + if !found { + t.Error("PID 1 not found in TopCPUProcesses") + } +} + +func TestAccumulator_ProcessPeaks_TopN(t *testing.T) { + acc := NewAccumulator(2) // Only top 2 + + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: 
[]metrics.ProcessMetrics{ + {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, + {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, + {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "low", CPUPercent: 10, MemRSS: 100}, + {PID: 2, Name: "mid", CPUPercent: 50, MemRSS: 500}, + {PID: 3, Name: "high", CPUPercent: 90, MemRSS: 900}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // TopCPUProcesses should have at most 2 entries, sorted by PeakCPU descending + if len(s.TopCPUProcesses) != 2 { + t.Fatalf("TopCPUProcesses length: got %d, want 2", len(s.TopCPUProcesses)) + } + if s.TopCPUProcesses[0].PeakCPU != 90 { + t.Errorf("TopCPU[0] PeakCPU: got %f, want 90", s.TopCPUProcesses[0].PeakCPU) + } + if s.TopCPUProcesses[1].PeakCPU != 50 { + t.Errorf("TopCPU[1] PeakCPU: got %f, want 50", s.TopCPUProcesses[1].PeakCPU) + } + + // TopMemProcesses should have at most 2 entries, sorted by PeakMem descending + if len(s.TopMemProcesses) != 2 { + t.Fatalf("TopMemProcesses length: got %d, want 2", len(s.TopMemProcesses)) + } + if s.TopMemProcesses[0].PeakMem != 900 { + t.Errorf("TopMem[0] PeakMem: got %d, want 900", s.TopMemProcesses[0].PeakMem) + } + if s.TopMemProcesses[1].PeakMem != 500 { + t.Errorf("TopMem[1] PeakMem: got %d, want 500", s.TopMemProcesses[1].PeakMem) + } +} + +func TestAccumulator_ProcessPeaks_Dedup(t *testing.T) { + acc := NewAccumulator(5) + + // A process appears in both TopCPU and TopMemory + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + TopCPU: []metrics.ProcessMetrics{ + {PID: 1, Name: "proc", CPUPercent: 80, MemRSS: 100}, + }, + TopMemory: []metrics.ProcessMetrics{ + {PID: 1, Name: "proc", CPUPercent: 30, MemRSS: 500}, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // The internal 
process map should have merged the peaks + // PeakCPU should be 80 (from TopCPU), PeakMem should be 500 (from TopMemory) + for _, p := range s.TopCPUProcesses { + if p.PID == 1 { + if p.PeakCPU != 80 { + t.Errorf("PeakCPU: got %f, want 80", p.PeakCPU) + } + if p.PeakMem != 500 { + t.Errorf("PeakMem: got %d, want 500", p.PeakMem) + } + } + } +} + +func TestAccumulator_SampleCount(t *testing.T) { + acc := NewAccumulator(5) + if acc.SampleCount() != 0 { + t.Errorf("initial SampleCount: got %d, want 0", acc.SampleCount()) + } + + for i := 0; i < 3; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + } + + if acc.SampleCount() != 3 { + t.Errorf("SampleCount after 3 adds: got %d, want 3", acc.SampleCount()) + } +} + +func TestAccumulator_Duration(t *testing.T) { + acc := NewAccumulator(5) + start := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + end := time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC) // 60 seconds later + + acc.Add(&metrics.SystemMetrics{ + Timestamp: start, + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + acc.Add(&metrics.SystemMetrics{ + Timestamp: end, + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if !s.StartTime.Equal(start) { + t.Errorf("StartTime: got %v, want %v", s.StartTime, start) + } + if s.DurationSeconds != 60 { + t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds) + } +} diff --git a/internal/summary/types.go b/internal/summary/types.go new file mode 100644 index 0000000..ab5ecea --- /dev/null +++ b/internal/summary/types.go @@ -0,0 +1,33 @@ +// ABOUTME: Data types for run-level summary statistics. +// ABOUTME: Defines StatSummary, ProcessPeak, and RunSummary used to report metrics on shutdown. 
+package summary + +import "time" + +// StatSummary holds peak, average, and P95 for a metric across the run +type StatSummary struct { + Peak float64 `json:"peak"` + Avg float64 `json:"avg"` + P95 float64 `json:"p95"` +} + +// ProcessPeak holds the peak CPU and memory observed for a single process +type ProcessPeak struct { + PID int `json:"pid"` + Name string `json:"name"` + PeakCPU float64 `json:"peak_cpu_percent"` + PeakMem uint64 `json:"peak_mem_rss_bytes"` +} + +// RunSummary holds the complete summary of a collection run +type RunSummary struct { + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + DurationSeconds float64 `json:"duration_seconds"` + SampleCount int `json:"sample_count"` + CPUTotal StatSummary `json:"cpu_total_percent"` + MemUsedBytes StatSummary `json:"mem_used_bytes"` + MemUsedPercent StatSummary `json:"mem_used_percent"` + TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"` + TopMemProcesses []ProcessPeak `json:"top_mem_processes"` +} diff --git a/internal/summary/writer.go b/internal/summary/writer.go new file mode 100644 index 0000000..f6781e1 --- /dev/null +++ b/internal/summary/writer.go @@ -0,0 +1,81 @@ +// ABOUTME: Emits a RunSummary as a structured log entry via slog. +// ABOUTME: Follows the same slog pattern as internal/output/logger.go for consistency. 
+package summary + +import ( + "io" + "log/slog" +) + +// SummaryWriter outputs a RunSummary using structured logging +type SummaryWriter struct { + logger *slog.Logger +} + +// NewSummaryWriter creates a writer that emits summaries to the given output in the given format +func NewSummaryWriter(output io.Writer, format string) *SummaryWriter { + opts := &slog.HandlerOptions{Level: slog.LevelInfo} + + var handler slog.Handler + switch format { + case "text": + handler = slog.NewTextHandler(output, opts) + default: + handler = slog.NewJSONHandler(output, opts) + } + + return &SummaryWriter{ + logger: slog.New(handler), + } +} + +// Write emits the run summary as a single structured log entry +func (w *SummaryWriter) Write(s *RunSummary) { + if s == nil { + return + } + + topCPUAttrs := make([]any, 0, len(s.TopCPUProcesses)) + for _, p := range s.TopCPUProcesses { + topCPUAttrs = append(topCPUAttrs, slog.Group("", + slog.Int("pid", p.PID), + slog.String("name", p.Name), + slog.Float64("peak_cpu_percent", p.PeakCPU), + slog.Uint64("peak_mem_rss_bytes", p.PeakMem), + )) + } + + topMemAttrs := make([]any, 0, len(s.TopMemProcesses)) + for _, p := range s.TopMemProcesses { + topMemAttrs = append(topMemAttrs, slog.Group("", + slog.Int("pid", p.PID), + slog.String("name", p.Name), + slog.Float64("peak_cpu_percent", p.PeakCPU), + slog.Uint64("peak_mem_rss_bytes", p.PeakMem), + )) + } + + w.logger.Info("run_summary", + slog.Time("start_time", s.StartTime), + slog.Time("end_time", s.EndTime), + slog.Float64("duration_seconds", s.DurationSeconds), + slog.Int("sample_count", s.SampleCount), + slog.Group("cpu_total_percent", + slog.Float64("peak", s.CPUTotal.Peak), + slog.Float64("avg", s.CPUTotal.Avg), + slog.Float64("p95", s.CPUTotal.P95), + ), + slog.Group("mem_used_bytes", + slog.Float64("peak", s.MemUsedBytes.Peak), + slog.Float64("avg", s.MemUsedBytes.Avg), + slog.Float64("p95", s.MemUsedBytes.P95), + ), + slog.Group("mem_used_percent", + slog.Float64("peak", 
s.MemUsedPercent.Peak), + slog.Float64("avg", s.MemUsedPercent.Avg), + slog.Float64("p95", s.MemUsedPercent.P95), + ), + slog.Any("top_cpu_processes", topCPUAttrs), + slog.Any("top_mem_processes", topMemAttrs), + ) +} diff --git a/internal/summary/writer_test.go b/internal/summary/writer_test.go new file mode 100644 index 0000000..d787ec6 --- /dev/null +++ b/internal/summary/writer_test.go @@ -0,0 +1,93 @@ +// ABOUTME: Tests for the summary writer that emits run summaries via slog. +// ABOUTME: Validates JSON output, text output, and nil summary handling. +package summary + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestSummaryWriter_JSON(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "json") + + s := &RunSummary{ + StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), + DurationSeconds: 60, + SampleCount: 12, + CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, + MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, + MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, + TopCPUProcesses: []ProcessPeak{ + {PID: 1, Name: "busy", PeakCPU: 95.5, PeakMem: 1000}, + }, + TopMemProcesses: []ProcessPeak{ + {PID: 2, Name: "hungry", PeakCPU: 10.0, PeakMem: 8000000}, + }, + } + + w.Write(s) + + output := buf.String() + if !strings.Contains(output, "run_summary") { + t.Errorf("output should contain 'run_summary', got: %s", output) + } + if !strings.Contains(output, "duration_seconds") { + t.Errorf("output should contain 'duration_seconds', got: %s", output) + } + if !strings.Contains(output, "sample_count") { + t.Errorf("output should contain 'sample_count', got: %s", output) + } + if !strings.Contains(output, "cpu_total_percent") { + t.Errorf("output should contain 'cpu_total_percent', got: %s", output) + } + if !strings.Contains(output, "mem_used_bytes") { + t.Errorf("output should contain 'mem_used_bytes', got: %s", output) + } + if 
!strings.Contains(output, "top_cpu_processes") { + t.Errorf("output should contain 'top_cpu_processes', got: %s", output) + } + if !strings.Contains(output, "top_mem_processes") { + t.Errorf("output should contain 'top_mem_processes', got: %s", output) + } +} + +func TestSummaryWriter_Text(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "text") + + s := &RunSummary{ + StartTime: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + EndTime: time.Date(2025, 1, 1, 0, 1, 0, 0, time.UTC), + DurationSeconds: 60, + SampleCount: 12, + CPUTotal: StatSummary{Peak: 95.5, Avg: 42.0, P95: 88.0}, + MemUsedBytes: StatSummary{Peak: 8000000, Avg: 4000000, P95: 7500000}, + MemUsedPercent: StatSummary{Peak: 80.0, Avg: 40.0, P95: 75.0}, + } + + w.Write(s) + + output := buf.String() + if !strings.Contains(output, "run_summary") { + t.Errorf("output should contain 'run_summary', got: %s", output) + } + if !strings.Contains(output, "duration_seconds") { + t.Errorf("output should contain 'duration_seconds', got: %s", output) + } +} + +func TestSummaryWriter_NilSummary(t *testing.T) { + var buf bytes.Buffer + w := NewSummaryWriter(&buf, "json") + + // Should not panic and should not write anything + w.Write(nil) + + if buf.Len() != 0 { + t.Errorf("expected no output for nil summary, got: %s", buf.String()) + } +} From c5c872a3731397f6921ff5b87f704fc76a0274cf Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 11:00:34 +0100 Subject: [PATCH 03/21] fix(output): correct JSON serialization of top process metrics slog.Any() does not properly serialize slices of slog.Group() attributes, resulting in broken output like {"Key":"","Value":{}}. Fixed by passing structs with JSON tags directly to slog.Any() instead. 
Co-Authored-By: Claude Opus 4.5 --- internal/output/logger.go | 46 +++++++++++++++++++++++++------------- internal/summary/writer.go | 24 ++------------------ 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/internal/output/logger.go b/internal/output/logger.go index 0ce5ddd..4c57794 100644 --- a/internal/output/logger.go +++ b/internal/output/logger.go @@ -53,26 +53,40 @@ func NewLoggerWriter(cfg LoggerConfig) *LoggerWriter { } } +// topCPUEntry is a lightweight struct for JSON serialization of top CPU processes +type topCPUEntry struct { + PID int `json:"pid"` + Name string `json:"name"` + CPUPercent float64 `json:"cpu_percent"` +} + +// topMemEntry is a lightweight struct for JSON serialization of top memory processes +type topMemEntry struct { + PID int `json:"pid"` + Name string `json:"name"` + RSSBytes uint64 `json:"rss_bytes"` +} + // Write outputs the metrics using structured logging func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { - // Build top CPU process attrs - topCPUAttrs := make([]any, 0, len(m.TopCPU)) + // Build top CPU process entries + topCPU := make([]topCPUEntry, 0, len(m.TopCPU)) for _, p := range m.TopCPU { - topCPUAttrs = append(topCPUAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Float64("cpu_percent", p.CPUPercent), - )) + topCPU = append(topCPU, topCPUEntry{ + PID: p.PID, + Name: p.Name, + CPUPercent: p.CPUPercent, + }) } - // Build top memory process attrs - topMemAttrs := make([]any, 0, len(m.TopMemory)) + // Build top memory process entries + topMem := make([]topMemEntry, 0, len(m.TopMemory)) for _, p := range m.TopMemory { - topMemAttrs = append(topMemAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Uint64("rss_bytes", p.MemRSS), - )) + topMem = append(topMem, topMemEntry{ + PID: p.PID, + Name: p.Name, + RSSBytes: p.MemRSS, + }) } w.logger.Info("metrics_collected", @@ -94,8 +108,8 @@ func (w *LoggerWriter) Write(m 
*metrics.SystemMetrics) error { slog.Uint64("total_rss_bytes", m.Memory.TotalRSSBytes), slog.Float64("rss_percent", m.Memory.RSSPercent), ), - slog.Any("top_cpu", topCPUAttrs), - slog.Any("top_memory", topMemAttrs), + slog.Any("top_cpu", topCPU), + slog.Any("top_memory", topMem), ) return nil diff --git a/internal/summary/writer.go b/internal/summary/writer.go index f6781e1..30b392b 100644 --- a/internal/summary/writer.go +++ b/internal/summary/writer.go @@ -35,26 +35,6 @@ func (w *SummaryWriter) Write(s *RunSummary) { return } - topCPUAttrs := make([]any, 0, len(s.TopCPUProcesses)) - for _, p := range s.TopCPUProcesses { - topCPUAttrs = append(topCPUAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Float64("peak_cpu_percent", p.PeakCPU), - slog.Uint64("peak_mem_rss_bytes", p.PeakMem), - )) - } - - topMemAttrs := make([]any, 0, len(s.TopMemProcesses)) - for _, p := range s.TopMemProcesses { - topMemAttrs = append(topMemAttrs, slog.Group("", - slog.Int("pid", p.PID), - slog.String("name", p.Name), - slog.Float64("peak_cpu_percent", p.PeakCPU), - slog.Uint64("peak_mem_rss_bytes", p.PeakMem), - )) - } - w.logger.Info("run_summary", slog.Time("start_time", s.StartTime), slog.Time("end_time", s.EndTime), @@ -75,7 +55,7 @@ func (w *SummaryWriter) Write(s *RunSummary) { slog.Float64("avg", s.MemUsedPercent.Avg), slog.Float64("p95", s.MemUsedPercent.P95), ), - slog.Any("top_cpu_processes", topCPUAttrs), - slog.Any("top_mem_processes", topMemAttrs), + slog.Any("top_cpu_processes", s.TopCPUProcesses), + slog.Any("top_mem_processes", s.TopMemProcesses), ) } From c309bd810d04493c34290fedddc6e54ecbd1fa84 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 11:40:03 +0100 Subject: [PATCH 04/21] feat(receiver): add HTTP metrics receiver with SQLite storage Add a new receiver application under cmd/receiver that accepts metrics via HTTP POST and stores them in SQLite using GORM. 
The receiver expects GitHub Actions style execution context (org, repo, workflow, job, run_id). Co-Authored-By: Claude Opus 4.5 --- cmd/receiver/main.go | 77 ++++++++++ go.mod | 12 ++ go.sum | 12 ++ internal/receiver/handler.go | 101 +++++++++++++ internal/receiver/handler_test.go | 239 ++++++++++++++++++++++++++++++ internal/receiver/store.go | 94 ++++++++++++ internal/receiver/store_test.go | 219 +++++++++++++++++++++++++++ internal/receiver/types.go | 32 ++++ 8 files changed, 786 insertions(+) create mode 100644 cmd/receiver/main.go create mode 100644 go.sum create mode 100644 internal/receiver/handler.go create mode 100644 internal/receiver/handler_test.go create mode 100644 internal/receiver/store.go create mode 100644 internal/receiver/store_test.go create mode 100644 internal/receiver/types.go diff --git a/cmd/receiver/main.go b/cmd/receiver/main.go new file mode 100644 index 0000000..a5fda16 --- /dev/null +++ b/cmd/receiver/main.go @@ -0,0 +1,77 @@ +package main + +import ( + "context" + "flag" + "fmt" + "log/slog" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver" +) + +const ( + defaultAddr = ":8080" + defaultDBPath = "metrics.db" +) + +func main() { + addr := flag.String("addr", defaultAddr, "HTTP listen address") + dbPath := flag.String("db", defaultDBPath, "SQLite database path") + flag.Parse() + + logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + + store, err := receiver.NewStore(*dbPath) + if err != nil { + logger.Error("failed to open database", slog.String("error", err.Error())) + os.Exit(1) + } + defer func() { _ = store.Close() }() + + handler := receiver.NewHandler(store, logger) + mux := http.NewServeMux() + handler.RegisterRoutes(mux) + + server := &http.Server{ + Addr: *addr, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + } + + ctx, cancel := 
context.WithCancel(context.Background()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + go func() { + sig := <-sigChan + logger.Info("received signal, shutting down", slog.String("signal", sig.String())) + cancel() + + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + _ = server.Shutdown(shutdownCtx) + }() + + logger.Info("starting metrics receiver", + slog.String("addr", *addr), + slog.String("db", *dbPath), + ) + + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } + + <-ctx.Done() + logger.Info("receiver stopped gracefully") +} diff --git a/go.mod b/go.mod index 3b4c218..7ef0cd0 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,15 @@ module edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector go 1.25.6 + +require ( + gorm.io/driver/sqlite v1.6.0 + gorm.io/gorm v1.31.1 +) + +require ( + github.com/jinzhu/inflection v1.0.0 // indirect + github.com/jinzhu/now v1.1.5 // indirect + github.com/mattn/go-sqlite3 v1.14.22 // indirect + golang.org/x/text v0.20.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..330dd09 --- /dev/null +++ b/go.sum @@ -0,0 +1,12 @@ +github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= +github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= +github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod 
h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= +gorm.io/driver/sqlite v1.6.0 h1:WHRRrIiulaPiPFmDcod6prc4l2VGVWHz80KspNsxSfQ= +gorm.io/driver/sqlite v1.6.0/go.mod h1:AO9V1qIQddBESngQUKWL9yoH93HIeA1X6V633rBwyT8= +gorm.io/gorm v1.31.1 h1:7CA8FTFz/gRfgqgpeKIBcervUn3xSyPUmr6B2WXJ7kg= +gorm.io/gorm v1.31.1/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs= diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go new file mode 100644 index 0000000..ff4edd0 --- /dev/null +++ b/internal/receiver/handler.go @@ -0,0 +1,101 @@ +// ABOUTME: HTTP handlers for the metrics receiver service. +// ABOUTME: Provides endpoints for receiving and querying metrics. +package receiver + +import ( + "encoding/json" + "log/slog" + "net/http" +) + +// Handler handles HTTP requests for the metrics receiver +type Handler struct { + store *Store + logger *slog.Logger +} + +// NewHandler creates a new HTTP handler with the given store +func NewHandler(store *Store, logger *slog.Logger) *Handler { + return &Handler{store: store, logger: logger} +} + +// RegisterRoutes registers all HTTP routes on the given mux +func (h *Handler) RegisterRoutes(mux *http.ServeMux) { + mux.HandleFunc("POST /api/v1/metrics", h.handleReceiveMetrics) + mux.HandleFunc("GET /api/v1/metrics/run/{runID}", h.handleGetByRunID) + mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}", h.handleGetByRepository) + mux.HandleFunc("GET /health", h.handleHealth) +} + +func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { + var payload MetricsPayload + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + h.logger.Error("failed to decode payload", slog.String("error", err.Error())) + http.Error(w, "invalid JSON payload", http.StatusBadRequest) + return + } + + if payload.Execution.RunID == "" { + http.Error(w, "run_id is required", http.StatusBadRequest) + return + } + + id, err := h.store.SaveMetric(&payload) + if err != nil { + h.logger.Error("failed to save metric", 
slog.String("error", err.Error())) + http.Error(w, "failed to save metric", http.StatusInternalServerError) + return + } + + h.logger.Info("metric saved", + slog.Uint64("id", uint64(id)), + slog.String("run_id", payload.Execution.RunID), + slog.String("repository", payload.Execution.Repository), + ) + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(map[string]any{"id": id, "status": "created"}) +} + +func (h *Handler) handleGetByRunID(w http.ResponseWriter, r *http.Request) { + runID := r.PathValue("runID") + if runID == "" { + http.Error(w, "run_id is required", http.StatusBadRequest) + return + } + + metrics, err := h.store.GetMetricsByRunID(runID) + if err != nil { + h.logger.Error("failed to get metrics", slog.String("error", err.Error())) + http.Error(w, "failed to get metrics", http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(metrics) +} + +func (h *Handler) handleGetByRepository(w http.ResponseWriter, r *http.Request) { + org := r.PathValue("org") + repo := r.PathValue("repo") + if org == "" || repo == "" { + http.Error(w, "org and repo are required", http.StatusBadRequest) + return + } + + metrics, err := h.store.GetMetricsByRepository(org, repo) + if err != nil { + h.logger.Error("failed to get metrics", slog.String("error", err.Error())) + http.Error(w, "failed to get metrics", http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(metrics) +} + +func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) +} diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go new file mode 100644 index 0000000..a1845d2 --- /dev/null +++ b/internal/receiver/handler_test.go @@ -0,0 
+1,239 @@ +package receiver + +import ( + "bytes" + "encoding/json" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "path/filepath" + "testing" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestHandler_ReceiveMetrics(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + payload := MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-123", + }, + Summary: summary.RunSummary{ + DurationSeconds: 60.0, + SampleCount: 12, + }, + } + + body, _ := json.Marshal(payload) + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusCreated { + t.Errorf("status = %d, want %d", rec.Code, http.StatusCreated) + } + + var resp map[string]any + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp["status"] != "created" { + t.Errorf("response status = %v, want %q", resp["status"], "created") + } + if resp["id"] == nil || resp["id"].(float64) == 0 { + t.Error("response id is missing or zero") + } +} + +func TestHandler_ReceiveMetrics_InvalidJSON(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader([]byte("not json"))) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_ReceiveMetrics_MissingRunID(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + payload := MetricsPayload{ + Execution: ExecutionContext{ + 
Organization: "test-org", + Repository: "test-repo", + // RunID is missing + }, + Summary: summary.RunSummary{}, + } + + body, _ := json.Marshal(payload) + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_GetByRunID(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + // First, save a metric + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-get-test", + }, + Summary: summary.RunSummary{SampleCount: 5}, + } + if _, err := h.store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/run/run-get-test", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var metrics []Metric + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 1 { + t.Errorf("got %d metrics, want 1", len(metrics)) + } + if metrics[0].RunID != "run-get-test" { + t.Errorf("RunID = %q, want %q", metrics[0].RunID, "run-get-test") + } +} + +func TestHandler_GetByRunID_NotFound(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/run/nonexistent", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var 
metrics []Metric + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 0 { + t.Errorf("got %d metrics, want 0", len(metrics)) + } +} + +func TestHandler_GetByRepository(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + // Save metrics for different repos + payloads := []*MetricsPayload{ + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", RunID: "r1"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", RunID: "r2"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-z", RunID: "r3"}}, + } + for _, p := range payloads { + if _, err := h.store.SaveMetric(p); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org-x/repo-y", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var metrics []Metric + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 2 { + t.Errorf("got %d metrics, want 2", len(metrics)) + } +} + +func TestHandler_Health(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/health", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp map[string]string + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp["status"] != "ok" { + t.Errorf("status = %q, want %q", resp["status"], "ok") + } +} + +func newTestHandler(t *testing.T) (*Handler, 
func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + handler := NewHandler(store, logger) + + return handler, func() { _ = store.Close() } +} diff --git a/internal/receiver/store.go b/internal/receiver/store.go new file mode 100644 index 0000000..85112a5 --- /dev/null +++ b/internal/receiver/store.go @@ -0,0 +1,94 @@ +// ABOUTME: SQLite storage layer for metrics receiver using GORM. +// ABOUTME: Handles database initialization and metric storage/retrieval. +package receiver + +import ( + "encoding/json" + "fmt" + "time" + + "gorm.io/driver/sqlite" + "gorm.io/gorm" + "gorm.io/gorm/logger" +) + +// Metric represents a stored metric record in the database +type Metric struct { + ID uint `gorm:"primaryKey"` + Organization string `gorm:"index:idx_org_repo;not null"` + Repository string `gorm:"index:idx_org_repo;not null"` + Workflow string `gorm:"not null"` + Job string `gorm:"not null"` + RunID string `gorm:"index;not null"` + ReceivedAt time.Time `gorm:"index;not null"` + Payload string `gorm:"type:text;not null"` // JSON-encoded RunSummary +} + +// Store handles SQLite storage for metrics using GORM +type Store struct { + db *gorm.DB +} + +// NewStore creates a new SQLite store at the given path +func NewStore(dbPath string) (*Store, error) { + db, err := gorm.Open(sqlite.Open(dbPath), &gorm.Config{ + Logger: logger.Default.LogMode(logger.Silent), + }) + if err != nil { + return nil, fmt.Errorf("opening database: %w", err) + } + + if err := db.AutoMigrate(&Metric{}); err != nil { + return nil, fmt.Errorf("migrating schema: %w", err) + } + + return &Store{db: db}, nil +} + +// SaveMetric stores a metrics payload in the database +func (s *Store) SaveMetric(payload *MetricsPayload) (uint, error) { + summaryJSON, err := json.Marshal(payload.Summary) + if err != nil { + return 0, 
fmt.Errorf("marshaling summary: %w", err) + } + + metric := Metric{ + Organization: payload.Execution.Organization, + Repository: payload.Execution.Repository, + Workflow: payload.Execution.Workflow, + Job: payload.Execution.Job, + RunID: payload.Execution.RunID, + ReceivedAt: time.Now().UTC(), + Payload: string(summaryJSON), + } + + result := s.db.Create(&metric) + if result.Error != nil { + return 0, fmt.Errorf("inserting metric: %w", result.Error) + } + + return metric.ID, nil +} + +// GetMetricsByRunID retrieves all metrics for a specific run +func (s *Store) GetMetricsByRunID(runID string) ([]Metric, error) { + var metrics []Metric + result := s.db.Where("run_id = ?", runID).Order("received_at DESC").Find(&metrics) + return metrics, result.Error +} + +// GetMetricsByRepository retrieves all metrics for a specific repository +func (s *Store) GetMetricsByRepository(org, repo string) ([]Metric, error) { + var metrics []Metric + result := s.db.Where("organization = ? AND repository = ?", org, repo).Order("received_at DESC").Find(&metrics) + return metrics, result.Error +} + +// Close closes the database connection +func (s *Store) Close() error { + sqlDB, err := s.db.DB() + if err != nil { + return err + } + return sqlDB.Close() +} diff --git a/internal/receiver/store_test.go b/internal/receiver/store_test.go new file mode 100644 index 0000000..8302535 --- /dev/null +++ b/internal/receiver/store_test.go @@ -0,0 +1,219 @@ +package receiver + +import ( + "os" + "path/filepath" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +func TestNewStore(t *testing.T) { + dbPath := filepath.Join(t.TempDir(), "test.db") + + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + defer func() { _ = store.Close() }() + + if _, err := os.Stat(dbPath); os.IsNotExist(err) { + t.Error("database file was not created") + } +} + +func TestStore_SaveMetric(t *testing.T) { + store := 
newTestStore(t) + defer func() { _ = store.Close() }() + + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-123", + }, + Summary: summary.RunSummary{ + StartTime: time.Now().Add(-time.Minute), + EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 12, + CPUTotal: summary.StatSummary{Peak: 80.5, Avg: 45.2, P95: 75.0}, + MemUsedBytes: summary.StatSummary{Peak: 1024000, Avg: 512000, P95: 900000}, + MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + }, + } + + id, err := store.SaveMetric(payload) + if err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + if id == 0 { + t.Error("SaveMetric() returned id = 0, want non-zero") + } +} + +func TestStore_GetMetricsByRunID(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + // Save two metrics with same run ID + for i := 0; i < 2; i++ { + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-456", + }, + Summary: summary.RunSummary{SampleCount: i + 1}, + } + if _, err := store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + } + + // Save one with different run ID + otherPayload := &MetricsPayload{ + Execution: ExecutionContext{RunID: "run-789"}, + Summary: summary.RunSummary{}, + } + if _, err := store.SaveMetric(otherPayload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + metrics, err := store.GetMetricsByRunID("run-456") + if err != nil { + t.Fatalf("GetMetricsByRunID() error = %v", err) + } + if len(metrics) != 2 { + t.Errorf("GetMetricsByRunID() returned %d metrics, want 2", len(metrics)) + } + + for _, m := range metrics { + if m.RunID != "run-456" { + t.Errorf("GetMetricsByRunID() returned metric with RunID = %q, want %q", m.RunID, "run-456") + } + } +} + +func 
TestStore_GetMetricsByRunID_NotFound(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + metrics, err := store.GetMetricsByRunID("nonexistent") + if err != nil { + t.Fatalf("GetMetricsByRunID() error = %v", err) + } + if len(metrics) != 0 { + t.Errorf("GetMetricsByRunID() returned %d metrics, want 0", len(metrics)) + } +} + +func TestStore_GetMetricsByRepository(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + // Save metrics for different repos + repos := []struct { + org string + repo string + }{ + {"org-a", "repo-1"}, + {"org-a", "repo-1"}, + {"org-a", "repo-2"}, + {"org-b", "repo-1"}, + } + + for i, r := range repos { + payload := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: r.org, + Repository: r.repo, + RunID: "run-" + string(rune('a'+i)), + }, + Summary: summary.RunSummary{}, + } + if _, err := store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + } + + metrics, err := store.GetMetricsByRepository("org-a", "repo-1") + if err != nil { + t.Fatalf("GetMetricsByRepository() error = %v", err) + } + if len(metrics) != 2 { + t.Errorf("GetMetricsByRepository() returned %d metrics, want 2", len(metrics)) + } + + for _, m := range metrics { + if m.Organization != "org-a" || m.Repository != "repo-1" { + t.Errorf("GetMetricsByRepository() returned metric with org=%q repo=%q, want org-a/repo-1", + m.Organization, m.Repository) + } + } +} + +func TestStore_SaveMetric_PreservesPayload(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + original := &MetricsPayload{ + Execution: ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "build.yml", + Job: "test", + RunID: "run-preserve", + }, + Summary: summary.RunSummary{ + DurationSeconds: 123.45, + SampleCount: 50, + CPUTotal: summary.StatSummary{Peak: 99.9, Avg: 55.5, P95: 88.8}, + }, + } + + _, err := store.SaveMetric(original) + if err 
!= nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + metrics, err := store.GetMetricsByRunID("run-preserve") + if err != nil { + t.Fatalf("GetMetricsByRunID() error = %v", err) + } + if len(metrics) != 1 { + t.Fatalf("GetMetricsByRunID() returned %d metrics, want 1", len(metrics)) + } + + m := metrics[0] + if m.Organization != original.Execution.Organization { + t.Errorf("Organization = %q, want %q", m.Organization, original.Execution.Organization) + } + if m.Repository != original.Execution.Repository { + t.Errorf("Repository = %q, want %q", m.Repository, original.Execution.Repository) + } + if m.Workflow != original.Execution.Workflow { + t.Errorf("Workflow = %q, want %q", m.Workflow, original.Execution.Workflow) + } + if m.Job != original.Execution.Job { + t.Errorf("Job = %q, want %q", m.Job, original.Execution.Job) + } + if m.Payload == "" { + t.Error("Payload is empty") + } +} + +func newTestStore(t *testing.T) *Store { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + return store +} diff --git a/internal/receiver/types.go b/internal/receiver/types.go new file mode 100644 index 0000000..e11a98d --- /dev/null +++ b/internal/receiver/types.go @@ -0,0 +1,32 @@ +// ABOUTME: Data types for the metrics receiver service. +// ABOUTME: Defines MetricsPayload combining execution metadata with run summary. 
+package receiver + +import "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" + +// ExecutionContext holds GitHub Actions style identifiers for a workflow run +type ExecutionContext struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` +} + +// MetricsPayload is the complete payload sent to the receiver +type MetricsPayload struct { + Execution ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` +} + +// StoredMetric represents a metric record as stored in the database +type StoredMetric struct { + ID int64 + Organization string + Repository string + Workflow string + Job string + RunID string + ReceivedAt string + Payload string // JSON-encoded RunSummary +} From cfe583fbc4db6e9e91cddc4cfdbfb53d68304e83 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 11:44:20 +0100 Subject: [PATCH 05/21] feat(collector): add HTTP push for metrics to receiver Add push client that sends run summary to a configurable HTTP endpoint on shutdown. Execution context is read from GitHub Actions style environment variables (with Gitea fallbacks). 
New flag: -push-endpoint Co-Authored-By: Claude Opus 4.5 --- cmd/collector/main.go | 16 ++++ internal/collector/collector.go | 27 ++++-- internal/summary/push.go | 106 +++++++++++++++++++++ internal/summary/push_test.go | 162 ++++++++++++++++++++++++++++++++ 4 files changed, 304 insertions(+), 7 deletions(-) create mode 100644 internal/summary/push.go create mode 100644 internal/summary/push_test.go diff --git a/cmd/collector/main.go b/cmd/collector/main.go index b34561b..ec06b9b 100644 --- a/cmd/collector/main.go +++ b/cmd/collector/main.go @@ -30,6 +30,7 @@ func main() { logLevel := flag.String("log-level", defaultLogLevel, "Log level: debug, info, warn, error") logFormat := flag.String("log-format", defaultLogFormat, "Output format: json, text") topN := flag.Int("top", defaultTopN, "Number of top processes to include") + pushEndpoint := flag.String("push-endpoint", "", "HTTP endpoint to push metrics to (e.g., http://localhost:8080/api/v1/metrics)") flag.Parse() // Setup structured logging for application logs @@ -58,6 +59,21 @@ func main() { summaryWriter := summary.NewSummaryWriter(os.Stdout, *logFormat) c.SetSummaryWriter(summaryWriter) + // Setup push client if endpoint is configured + if *pushEndpoint != "" { + pushClient := summary.NewPushClient(*pushEndpoint) + c.SetPushClient(pushClient) + execCtx := pushClient.ExecutionContext() + appLogger.Info("push client configured", + slog.String("endpoint", *pushEndpoint), + slog.String("organization", execCtx.Organization), + slog.String("repository", execCtx.Repository), + slog.String("workflow", execCtx.Workflow), + slog.String("job", execCtx.Job), + slog.String("run_id", execCtx.RunID), + ) + } + // Setup signal handling for graceful shutdown ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/internal/collector/collector.go b/internal/collector/collector.go index 134ab3a..393f1ae 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -26,6 +26,7 @@ 
type Collector struct { logger *slog.Logger accumulator *summary.Accumulator summaryWriter *summary.SummaryWriter + pushClient *summary.PushClient } // New creates a new collector @@ -44,6 +45,11 @@ func (c *Collector) SetSummaryWriter(w *summary.SummaryWriter) { c.summaryWriter = w } +// SetPushClient attaches a push client for sending summaries to the receiver +func (c *Collector) SetPushClient(p *summary.PushClient) { + c.pushClient = p +} + // Run starts the collector loop and blocks until context is cancelled func (c *Collector) Run(ctx context.Context) error { c.logger.Info("collector starting", @@ -64,7 +70,7 @@ func (c *Collector) Run(ctx context.Context) error { select { case <-ctx.Done(): c.logger.Info("collector stopping") - c.emitSummary() + c.emitSummary(context.Background()) // Use fresh context for shutdown tasks return ctx.Err() case <-ticker.C: if err := c.collect(); err != nil { @@ -91,11 +97,7 @@ func (c *Collector) collect() error { } // emitSummary computes and writes the run summary if a writer is configured -func (c *Collector) emitSummary() { - if c.summaryWriter == nil { - return - } - +func (c *Collector) emitSummary(ctx context.Context) { s := c.accumulator.Summarize() if s == nil { c.logger.Info("no samples collected, skipping run summary") @@ -106,7 +108,18 @@ func (c *Collector) emitSummary() { slog.Int("sample_count", s.SampleCount), slog.Float64("duration_seconds", s.DurationSeconds), ) - c.summaryWriter.Write(s) + + if c.summaryWriter != nil { + c.summaryWriter.Write(s) + } + + if c.pushClient != nil { + if err := c.pushClient.Push(ctx, s); err != nil { + c.logger.Error("failed to push metrics", slog.String("error", err.Error())) + } else { + c.logger.Info("metrics pushed successfully") + } + } } // CollectOnce performs a single collection and returns the metrics diff --git a/internal/summary/push.go b/internal/summary/push.go new file mode 100644 index 0000000..d01fb5c --- /dev/null +++ b/internal/summary/push.go @@ -0,0 +1,106 @@ 
+// ABOUTME: HTTP client for pushing run summaries to the metrics receiver. +// ABOUTME: Reads execution context from GitHub Actions style environment variables. +package summary + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "time" +) + +// ExecutionContext holds GitHub Actions style identifiers for a workflow run +type ExecutionContext struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` +} + +// MetricsPayload is the complete payload sent to the receiver +type MetricsPayload struct { + Execution ExecutionContext `json:"execution"` + Summary RunSummary `json:"run_summary"` +} + +// PushClient sends metrics to the receiver service +type PushClient struct { + endpoint string + client *http.Client + ctx ExecutionContext +} + +// NewPushClient creates a new push client configured from environment variables +func NewPushClient(endpoint string) *PushClient { + return &PushClient{ + endpoint: endpoint, + client: &http.Client{ + Timeout: 30 * time.Second, + }, + ctx: ExecutionContextFromEnv(), + } +} + +// ExecutionContextFromEnv reads execution context from GitHub Actions environment variables +func ExecutionContextFromEnv() ExecutionContext { + return ExecutionContext{ + Organization: getEnvWithFallback("GITHUB_REPOSITORY_OWNER", "GITEA_REPO_OWNER"), + Repository: getEnvWithFallback("GITHUB_REPOSITORY", "GITEA_REPO"), + Workflow: getEnvWithFallback("GITHUB_WORKFLOW", "GITEA_WORKFLOW"), + Job: getEnvWithFallback("GITHUB_JOB", "GITEA_JOB"), + RunID: getEnvWithFallback("GITHUB_RUN_ID", "GITEA_RUN_ID"), + } +} + +func getEnvWithFallback(keys ...string) string { + for _, key := range keys { + if val := os.Getenv(key); val != "" { + return val + } + } + return "" +} + +// Push sends the run summary to the receiver +func (p *PushClient) Push(ctx context.Context, summary *RunSummary) error { + if summary == nil 
{ + return nil + } + + payload := MetricsPayload{ + Execution: p.ctx, + Summary: *summary, + } + + body, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshaling payload: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.endpoint, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("creating request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := p.client.Do(req) + if err != nil { + return fmt.Errorf("sending request: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + return nil +} + +// ExecutionContext returns the current execution context +func (p *PushClient) ExecutionContext() ExecutionContext { + return p.ctx +} diff --git a/internal/summary/push_test.go b/internal/summary/push_test.go new file mode 100644 index 0000000..58aea9e --- /dev/null +++ b/internal/summary/push_test.go @@ -0,0 +1,162 @@ +package summary + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestPushClient_Push(t *testing.T) { + var received MetricsPayload + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if ct := r.Header.Get("Content-Type"); ct != "application/json" { + t.Errorf("expected Content-Type application/json, got %s", ct) + } + if err := json.NewDecoder(r.Body).Decode(&received); err != nil { + t.Errorf("failed to decode body: %v", err) + } + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL) + client.ctx = ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "12345", + } + + summary := &RunSummary{ + StartTime: time.Now().Add(-time.Minute), 
+ EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 10, + CPUTotal: StatSummary{Peak: 80.0, Avg: 50.0, P95: 75.0}, + } + + err := client.Push(context.Background(), summary) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + + if received.Execution.Organization != "test-org" { + t.Errorf("Organization = %q, want %q", received.Execution.Organization, "test-org") + } + if received.Execution.RunID != "12345" { + t.Errorf("RunID = %q, want %q", received.Execution.RunID, "12345") + } + if received.Summary.SampleCount != 10 { + t.Errorf("SampleCount = %d, want %d", received.Summary.SampleCount, 10) + } +} + +func TestPushClient_Push_NilSummary(t *testing.T) { + client := NewPushClient("http://localhost:9999") + err := client.Push(context.Background(), nil) + if err != nil { + t.Errorf("Push(nil) error = %v, want nil", err) + } +} + +func TestPushClient_Push_ServerError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + client := NewPushClient(server.URL) + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err == nil { + t.Error("Push() expected error for 500 response, got nil") + } +} + +func TestPushClient_Push_ConnectionError(t *testing.T) { + client := NewPushClient("http://localhost:1") // Invalid port + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err == nil { + t.Error("Push() expected error for connection failure, got nil") + } +} + +func TestExecutionContextFromEnv(t *testing.T) { + // Save and restore env + origVars := map[string]string{ + "GITHUB_REPOSITORY_OWNER": "", + "GITHUB_REPOSITORY": "", + "GITHUB_WORKFLOW": "", + "GITHUB_JOB": "", + "GITHUB_RUN_ID": "", + } + for k := range origVars { + origVars[k] = getEnvWithFallback(k) + } + defer func() { + for k, v := range origVars 
{ + if v == "" { + t.Setenv(k, "") + } + } + }() + + t.Setenv("GITHUB_REPOSITORY_OWNER", "my-org") + t.Setenv("GITHUB_REPOSITORY", "my-org/my-repo") + t.Setenv("GITHUB_WORKFLOW", "CI") + t.Setenv("GITHUB_JOB", "test") + t.Setenv("GITHUB_RUN_ID", "999") + + ctx := ExecutionContextFromEnv() + + if ctx.Organization != "my-org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "my-org") + } + if ctx.Repository != "my-org/my-repo" { + t.Errorf("Repository = %q, want %q", ctx.Repository, "my-org/my-repo") + } + if ctx.Workflow != "CI" { + t.Errorf("Workflow = %q, want %q", ctx.Workflow, "CI") + } + if ctx.Job != "test" { + t.Errorf("Job = %q, want %q", ctx.Job, "test") + } + if ctx.RunID != "999" { + t.Errorf("RunID = %q, want %q", ctx.RunID, "999") + } +} + +func TestExecutionContextFromEnv_GiteaFallback(t *testing.T) { + t.Setenv("GITHUB_RUN_ID", "") + t.Setenv("GITEA_RUN_ID", "gitea-123") + + ctx := ExecutionContextFromEnv() + + if ctx.RunID != "gitea-123" { + t.Errorf("RunID = %q, want %q (Gitea fallback)", ctx.RunID, "gitea-123") + } +} + +func TestPushClient_ExecutionContext(t *testing.T) { + client := NewPushClient("http://example.com") + client.ctx = ExecutionContext{ + Organization: "org", + Repository: "repo", + RunID: "run", + } + + ctx := client.ExecutionContext() + if ctx.Organization != "org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "org") + } +} From d99cd1dd564e2869c83025b9897bace65a713bc2 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 11:54:20 +0100 Subject: [PATCH 06/21] feat(docker): multi-stage build for collector and receiver Add multi-stage Dockerfile that can build both images: - `docker build --target collector` for the collector - `docker build --target receiver` for the receiver (with CGO for SQLite) Co-Authored-By: Claude Opus 4.5 --- Dockerfile | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 64a266f..75f7b7f 100644 
--- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,37 @@ -FROM golang:1.25-alpine AS builder +FROM golang:1.25-alpine AS builder-base WORKDIR /app -COPY go.mod ./ +COPY go.mod go.sum ./ RUN go mod download COPY . . +# Collector build (no CGO needed) +FROM builder-base AS builder-collector + RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /resource-collector ./cmd/collector -FROM alpine:3.19 +# Receiver build (CGO needed for SQLite) +FROM builder-base AS builder-receiver -COPY --from=builder /resource-collector /usr/local/bin/resource-collector +RUN apk add --no-cache gcc musl-dev +RUN CGO_ENABLED=1 GOOS=linux go build -ldflags="-s -w" -o /metrics-receiver ./cmd/receiver + +# Collector image +FROM alpine:3.19 AS collector + +COPY --from=builder-collector /resource-collector /usr/local/bin/resource-collector ENTRYPOINT ["/usr/local/bin/resource-collector"] + +# Receiver image +FROM alpine:3.19 AS receiver + +RUN apk add --no-cache sqlite-libs + +COPY --from=builder-receiver /metrics-receiver /usr/local/bin/metrics-receiver + +EXPOSE 8080 + +ENTRYPOINT ["/usr/local/bin/metrics-receiver"] From 7da7dc138f53c3aa1d526d58618c1510f66d99ee Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 12:00:22 +0100 Subject: [PATCH 07/21] refactor(receiver): change query endpoint to filter by workflow and job Replace /api/v1/metrics/run/{runID} and /api/v1/metrics/repo/{org}/{repo} with /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job} for more precise filtering by workflow and job name. 
Co-Authored-By: Claude Opus 4.5 --- internal/receiver/handler.go | 31 ++------ internal/receiver/handler_test.go | 102 ++++++++------------------ internal/receiver/store.go | 13 +--- internal/receiver/store_test.go | 117 ++++++++++-------------------- 4 files changed, 78 insertions(+), 185 deletions(-) diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go index ff4edd0..a00e414 100644 --- a/internal/receiver/handler.go +++ b/internal/receiver/handler.go @@ -22,8 +22,7 @@ func NewHandler(store *Store, logger *slog.Logger) *Handler { // RegisterRoutes registers all HTTP routes on the given mux func (h *Handler) RegisterRoutes(mux *http.ServeMux) { mux.HandleFunc("POST /api/v1/metrics", h.handleReceiveMetrics) - mux.HandleFunc("GET /api/v1/metrics/run/{runID}", h.handleGetByRunID) - mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}", h.handleGetByRepository) + mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}", h.handleGetByWorkflowJob) mux.HandleFunc("GET /health", h.handleHealth) } @@ -58,33 +57,17 @@ func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { _ = json.NewEncoder(w).Encode(map[string]any{"id": id, "status": "created"}) } -func (h *Handler) handleGetByRunID(w http.ResponseWriter, r *http.Request) { - runID := r.PathValue("runID") - if runID == "" { - http.Error(w, "run_id is required", http.StatusBadRequest) - return - } - - metrics, err := h.store.GetMetricsByRunID(runID) - if err != nil { - h.logger.Error("failed to get metrics", slog.String("error", err.Error())) - http.Error(w, "failed to get metrics", http.StatusInternalServerError) - return - } - - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(metrics) -} - -func (h *Handler) handleGetByRepository(w http.ResponseWriter, r *http.Request) { +func (h *Handler) handleGetByWorkflowJob(w http.ResponseWriter, r *http.Request) { org := r.PathValue("org") repo := r.PathValue("repo") - if org == "" || 
repo == "" { - http.Error(w, "org and repo are required", http.StatusBadRequest) + workflow := r.PathValue("workflow") + job := r.PathValue("job") + if org == "" || repo == "" || workflow == "" || job == "" { + http.Error(w, "org, repo, workflow and job are required", http.StatusBadRequest) return } - metrics, err := h.store.GetMetricsByRepository(org, repo) + metrics, err := h.store.GetMetricsByWorkflowJob(org, repo, workflow, job) if err != nil { h.logger.Error("failed to get metrics", slog.String("error", err.Error())) http.Error(w, "failed to get metrics", http.StatusInternalServerError) diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go index a1845d2..fdd3894 100644 --- a/internal/receiver/handler_test.go +++ b/internal/receiver/handler_test.go @@ -98,81 +98,15 @@ func TestHandler_ReceiveMetrics_MissingRunID(t *testing.T) { } } -func TestHandler_GetByRunID(t *testing.T) { +func TestHandler_GetByWorkflowJob(t *testing.T) { h, cleanup := newTestHandler(t) defer cleanup() - // First, save a metric - payload := &MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-get-test", - }, - Summary: summary.RunSummary{SampleCount: 5}, - } - if _, err := h.store.SaveMetric(payload); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/run/run-get-test", nil) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var metrics []Metric - if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if len(metrics) != 1 { - t.Errorf("got %d metrics, want 1", len(metrics)) - } - if metrics[0].RunID != "run-get-test" { - t.Errorf("RunID = %q, want %q", 
metrics[0].RunID, "run-get-test") - } -} - -func TestHandler_GetByRunID_NotFound(t *testing.T) { - h, cleanup := newTestHandler(t) - defer cleanup() - - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/run/nonexistent", nil) - rec := httptest.NewRecorder() - - mux := http.NewServeMux() - h.RegisterRoutes(mux) - mux.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) - } - - var metrics []Metric - if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - if len(metrics) != 0 { - t.Errorf("got %d metrics, want 0", len(metrics)) - } -} - -func TestHandler_GetByRepository(t *testing.T) { - h, cleanup := newTestHandler(t) - defer cleanup() - - // Save metrics for different repos + // Save metrics for different workflow/job combinations payloads := []*MetricsPayload{ - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", RunID: "r1"}}, - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", RunID: "r2"}}, - {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-z", RunID: "r3"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r1"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "build", RunID: "r2"}}, + {Execution: ExecutionContext{Organization: "org-x", Repository: "repo-y", Workflow: "ci.yml", Job: "test", RunID: "r3"}}, } for _, p := range payloads { if _, err := h.store.SaveMetric(p); err != nil { @@ -180,7 +114,7 @@ func TestHandler_GetByRepository(t *testing.T) { } } - req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org-x/repo-y", nil) + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org-x/repo-y/ci.yml/build", nil) rec := httptest.NewRecorder() mux := http.NewServeMux() @@ -200,6 +134,30 @@ func 
TestHandler_GetByRepository(t *testing.T) { } } +func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { + h, cleanup := newTestHandler(t) + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/workflow/job", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var metrics []Metric + if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(metrics) != 0 { + t.Errorf("got %d metrics, want 0", len(metrics)) + } +} + func TestHandler_Health(t *testing.T) { h, cleanup := newTestHandler(t) defer cleanup() diff --git a/internal/receiver/store.go b/internal/receiver/store.go index 85112a5..4ae970a 100644 --- a/internal/receiver/store.go +++ b/internal/receiver/store.go @@ -70,17 +70,10 @@ func (s *Store) SaveMetric(payload *MetricsPayload) (uint, error) { return metric.ID, nil } -// GetMetricsByRunID retrieves all metrics for a specific run -func (s *Store) GetMetricsByRunID(runID string) ([]Metric, error) { +// GetMetricsByWorkflowJob retrieves all metrics for a specific workflow and job +func (s *Store) GetMetricsByWorkflowJob(org, repo, workflow, job string) ([]Metric, error) { var metrics []Metric - result := s.db.Where("run_id = ?", runID).Order("received_at DESC").Find(&metrics) - return metrics, result.Error -} - -// GetMetricsByRepository retrieves all metrics for a specific repository -func (s *Store) GetMetricsByRepository(org, repo string) ([]Metric, error) { - var metrics []Metric - result := s.db.Where("organization = ? AND repository = ?", org, repo).Order("received_at DESC").Find(&metrics) + result := s.db.Where("organization = ? AND repository = ? AND workflow = ? 
AND job = ?", org, repo, workflow, job).Order("received_at DESC").Find(&metrics) return metrics, result.Error } diff --git a/internal/receiver/store_test.go b/internal/receiver/store_test.go index 8302535..44c4f17 100644 --- a/internal/receiver/store_test.go +++ b/internal/receiver/store_test.go @@ -55,84 +55,30 @@ func TestStore_SaveMetric(t *testing.T) { } } -func TestStore_GetMetricsByRunID(t *testing.T) { +func TestStore_GetMetricsByWorkflowJob(t *testing.T) { store := newTestStore(t) defer func() { _ = store.Close() }() - // Save two metrics with same run ID - for i := 0; i < 2; i++ { - payload := &MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-456", - }, - Summary: summary.RunSummary{SampleCount: i + 1}, - } - if _, err := store.SaveMetric(payload); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - } - - // Save one with different run ID - otherPayload := &MetricsPayload{ - Execution: ExecutionContext{RunID: "run-789"}, - Summary: summary.RunSummary{}, - } - if _, err := store.SaveMetric(otherPayload); err != nil { - t.Fatalf("SaveMetric() error = %v", err) - } - - metrics, err := store.GetMetricsByRunID("run-456") - if err != nil { - t.Fatalf("GetMetricsByRunID() error = %v", err) - } - if len(metrics) != 2 { - t.Errorf("GetMetricsByRunID() returned %d metrics, want 2", len(metrics)) - } - - for _, m := range metrics { - if m.RunID != "run-456" { - t.Errorf("GetMetricsByRunID() returned metric with RunID = %q, want %q", m.RunID, "run-456") - } - } -} - -func TestStore_GetMetricsByRunID_NotFound(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - metrics, err := store.GetMetricsByRunID("nonexistent") - if err != nil { - t.Fatalf("GetMetricsByRunID() error = %v", err) - } - if len(metrics) != 0 { - t.Errorf("GetMetricsByRunID() returned %d metrics, want 0", len(metrics)) - } -} - -func 
TestStore_GetMetricsByRepository(t *testing.T) { - store := newTestStore(t) - defer func() { _ = store.Close() }() - - // Save metrics for different repos - repos := []struct { - org string - repo string + // Save metrics for different workflow/job combinations + payloads := []struct { + org string + repo string + workflow string + job string }{ - {"org-a", "repo-1"}, - {"org-a", "repo-1"}, - {"org-a", "repo-2"}, - {"org-b", "repo-1"}, + {"org-a", "repo-1", "ci.yml", "build"}, + {"org-a", "repo-1", "ci.yml", "build"}, + {"org-a", "repo-1", "ci.yml", "test"}, + {"org-a", "repo-1", "deploy.yml", "build"}, } - for i, r := range repos { + for i, p := range payloads { payload := &MetricsPayload{ Execution: ExecutionContext{ - Organization: r.org, - Repository: r.repo, + Organization: p.org, + Repository: p.repo, + Workflow: p.workflow, + Job: p.job, RunID: "run-" + string(rune('a'+i)), }, Summary: summary.RunSummary{}, @@ -142,22 +88,35 @@ func TestStore_GetMetricsByRepository(t *testing.T) { } } - metrics, err := store.GetMetricsByRepository("org-a", "repo-1") + metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") if err != nil { - t.Fatalf("GetMetricsByRepository() error = %v", err) + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) } if len(metrics) != 2 { - t.Errorf("GetMetricsByRepository() returned %d metrics, want 2", len(metrics)) + t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 2", len(metrics)) } for _, m := range metrics { - if m.Organization != "org-a" || m.Repository != "repo-1" { - t.Errorf("GetMetricsByRepository() returned metric with org=%q repo=%q, want org-a/repo-1", - m.Organization, m.Repository) + if m.Organization != "org-a" || m.Repository != "repo-1" || m.Workflow != "ci.yml" || m.Job != "build" { + t.Errorf("GetMetricsByWorkflowJob() returned metric with org=%q repo=%q workflow=%q job=%q, want org-a/repo-1/ci.yml/build", + m.Organization, m.Repository, m.Workflow, m.Job) } } } +func 
TestStore_GetMetricsByWorkflowJob_NotFound(t *testing.T) { + store := newTestStore(t) + defer func() { _ = store.Close() }() + + metrics, err := store.GetMetricsByWorkflowJob("nonexistent", "repo", "workflow", "job") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 0 { + t.Errorf("GetMetricsByWorkflowJob() returned %d metrics, want 0", len(metrics)) + } +} + func TestStore_SaveMetric_PreservesPayload(t *testing.T) { store := newTestStore(t) defer func() { _ = store.Close() }() @@ -182,12 +141,12 @@ func TestStore_SaveMetric_PreservesPayload(t *testing.T) { t.Fatalf("SaveMetric() error = %v", err) } - metrics, err := store.GetMetricsByRunID("run-preserve") + metrics, err := store.GetMetricsByWorkflowJob("test-org", "test-repo", "build.yml", "test") if err != nil { - t.Fatalf("GetMetricsByRunID() error = %v", err) + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) } if len(metrics) != 1 { - t.Fatalf("GetMetricsByRunID() returned %d metrics, want 1", len(metrics)) + t.Fatalf("GetMetricsByWorkflowJob() returned %d metrics, want 1", len(metrics)) } m := metrics[0] From 0bf7dfee380b6e19d1339006b8a12599a646a6de Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 12:12:36 +0100 Subject: [PATCH 08/21] test: add integration tests for collector-receiver interaction Add integration tests that verify the push client can successfully send metrics to the receiver and they are stored correctly in SQLite. 
Tests: - TestPushClientToReceiver: Direct HTTP POST verification - TestPushClientIntegration: Full PushClient with env vars - TestMultiplePushes: Multiple pushes and filtering (parallel-safe) Co-Authored-By: Claude Opus 4.5 --- internal/integration/integration_test.go | 249 +++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 internal/integration/integration_test.go diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go new file mode 100644 index 0000000..a7932c3 --- /dev/null +++ b/internal/integration/integration_test.go @@ -0,0 +1,249 @@ +// ABOUTME: Integration tests for collector and receiver interaction. +// ABOUTME: Tests that the push client can successfully send metrics to the receiver. +package integration + +import ( + "bytes" + "context" + "encoding/json" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "path/filepath" + "testing" + "time" + + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/receiver" + "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" +) + +// setupTestReceiver creates a test receiver with SQLite storage and HTTP server +func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := receiver.NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil))) + mux := http.NewServeMux() + handler.RegisterRoutes(mux) + + server := httptest.NewServer(mux) + + cleanup := func() { + server.Close() + _ = store.Close() + } + + return store, server, cleanup +} + +func TestPushClientToReceiver(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Test execution context + testCtx := summary.ExecutionContext{ + Organization: "integration-org", + Repository: "integration-repo", + Workflow: 
"test.yml", + Job: "integration-test", + RunID: "run-integration-123", + } + + // Create a test summary + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-time.Minute), + EndTime: time.Now(), + DurationSeconds: 60.0, + SampleCount: 10, + CPUTotal: summary.StatSummary{Peak: 85.5, Avg: 42.3, P95: 78.0}, + MemUsedBytes: summary.StatSummary{Peak: 4294967296, Avg: 2147483648, P95: 3865470566}, + MemUsedPercent: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + TopCPUProcesses: []summary.ProcessPeak{ + {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, + }, + TopMemProcesses: []summary.ProcessPeak{ + {PID: 1234, Name: "test-process", PeakCPU: 45.0, PeakMem: 1073741824}, + }, + } + + // Build payload matching what push client sends + payload := struct { + Execution summary.ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` + }{ + Execution: testCtx, + Summary: *testSummary, + } + + body, err := json.Marshal(payload) + if err != nil { + t.Fatalf("Marshal() error = %v", err) + } + + // Send via HTTP client + resp, err := http.Post(server.URL+"/api/v1/metrics", "application/json", bytes.NewReader(body)) + if err != nil { + t.Fatalf("Post() error = %v", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusCreated { + t.Errorf("status = %d, want %d", resp.StatusCode, http.StatusCreated) + } + + // Verify metrics were stored + metrics, err := store.GetMetricsByWorkflowJob("integration-org", "integration-repo", "test.yml", "integration-test") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + + m := metrics[0] + if m.Organization != testCtx.Organization { + t.Errorf("Organization = %q, want %q", m.Organization, testCtx.Organization) + } + if m.Repository != testCtx.Repository { + t.Errorf("Repository = %q, want %q", m.Repository, testCtx.Repository) + } + if 
m.Workflow != testCtx.Workflow { + t.Errorf("Workflow = %q, want %q", m.Workflow, testCtx.Workflow) + } + if m.Job != testCtx.Job { + t.Errorf("Job = %q, want %q", m.Job, testCtx.Job) + } + if m.RunID != testCtx.RunID { + t.Errorf("RunID = %q, want %q", m.RunID, testCtx.RunID) + } + + // Verify payload was stored correctly + var storedSummary summary.RunSummary + if err := json.Unmarshal([]byte(m.Payload), &storedSummary); err != nil { + t.Fatalf("Unmarshal payload error = %v", err) + } + + if storedSummary.SampleCount != testSummary.SampleCount { + t.Errorf("SampleCount = %d, want %d", storedSummary.SampleCount, testSummary.SampleCount) + } + if storedSummary.CPUTotal.Peak != testSummary.CPUTotal.Peak { + t.Errorf("CPUTotal.Peak = %f, want %f", storedSummary.CPUTotal.Peak, testSummary.CPUTotal.Peak) + } +} + +func TestPushClientIntegration(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Set environment variables for the push client + t.Setenv("GITHUB_REPOSITORY_OWNER", "push-client-org") + t.Setenv("GITHUB_REPOSITORY", "push-client-repo") + t.Setenv("GITHUB_WORKFLOW", "push-test.yml") + t.Setenv("GITHUB_JOB", "push-job") + t.Setenv("GITHUB_RUN_ID", "push-run-456") + + // Create push client - it reads from env vars + pushClient := summary.NewPushClient(server.URL + "/api/v1/metrics") + + // Verify execution context was read from env + ctx := pushClient.ExecutionContext() + if ctx.Organization != "push-client-org" { + t.Errorf("Organization = %q, want %q", ctx.Organization, "push-client-org") + } + + // Create and push a summary + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-30 * time.Second), + EndTime: time.Now(), + DurationSeconds: 30.0, + SampleCount: 6, + CPUTotal: summary.StatSummary{Peak: 50.0, Avg: 25.0, P95: 45.0}, + MemUsedBytes: summary.StatSummary{Peak: 1000000, Avg: 500000, P95: 900000}, + MemUsedPercent: summary.StatSummary{Peak: 10.0, Avg: 5.0, P95: 9.0}, + } + + // Push the summary + err 
:= pushClient.Push(context.Background(), testSummary) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + + // Verify it was stored + metrics, err := store.GetMetricsByWorkflowJob("push-client-org", "push-client-repo", "push-test.yml", "push-job") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + + if metrics[0].RunID != "push-run-456" { + t.Errorf("RunID = %q, want %q", metrics[0].RunID, "push-run-456") + } +} + +func TestMultiplePushes(t *testing.T) { + store, server, cleanup := setupTestReceiver(t) + defer cleanup() + + // Simulate multiple workflow runs pushing metrics via direct HTTP POST + // This avoids env var manipulation which could cause issues with parallel tests + runs := []summary.ExecutionContext{ + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-2"}, + {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "test", RunID: "run-1"}, + {Organization: "org-a", Repository: "repo-2", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, + } + + for _, execCtx := range runs { + payload := struct { + Execution summary.ExecutionContext `json:"execution"` + Summary summary.RunSummary `json:"run_summary"` + }{ + Execution: execCtx, + Summary: summary.RunSummary{ + SampleCount: 5, + CPUTotal: summary.StatSummary{Peak: 50.0}, + }, + } + + body, err := json.Marshal(payload) + if err != nil { + t.Fatalf("Marshal() error = %v", err) + } + + resp, err := http.Post(server.URL+"/api/v1/metrics", "application/json", bytes.NewReader(body)) + if err != nil { + t.Fatalf("Post() error = %v for run %+v", err, execCtx) + } + _ = resp.Body.Close() + + if resp.StatusCode != http.StatusCreated { + t.Fatalf("status = %d, want %d for run %+v", resp.StatusCode, http.StatusCreated, execCtx) + } + } + + // 
Verify filtering works correctly + metrics, err := store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "build") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 2 { + t.Errorf("got %d metrics for org-a/repo-1/ci.yml/build, want 2", len(metrics)) + } + + metrics, err = store.GetMetricsByWorkflowJob("org-a", "repo-1", "ci.yml", "test") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 1 { + t.Errorf("got %d metrics for org-a/repo-1/ci.yml/test, want 1", len(metrics)) + } +} From 5e470c33a54d76c527b863eb7ce17e2e026948d7 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 14:50:36 +0100 Subject: [PATCH 09/21] feat(collector): group CPU and memory metrics by cgroup Add cgroup-based process grouping to the resource collector. Processes are grouped by their cgroup path, with container names resolved via configurable process-to-container mapping. New features: - Read cgroup info from /proc/[pid]/cgroup (supports v1 and v2) - Parse K8s resource notation (500m, 1Gi, etc.) 
for CPU/memory limits - Group metrics by container using CGROUP_PROCESS_MAP env var - Calculate usage percentages against limits from CGROUP_LIMITS env var - Output cgroup metrics with CPU cores used, memory RSS, and percentages Environment variables: - CGROUP_PROCESS_MAP: Map process names to container names for discovery - CGROUP_LIMITS: Define CPU/memory limits per container in K8s notation Co-Authored-By: Claude Opus 4.5 --- identify-process-cgroup-by-pid.md | 62 ++++ internal/cgroup/config.go | 84 +++++ internal/cgroup/parse.go | 96 +++++ internal/cgroup/parse_test.go | 84 +++++ internal/metrics/aggregator.go | 183 +++++++++- internal/metrics/types.go | 39 +- internal/output/logger.go | 13 +- internal/proc/cgroup.go | 59 +++ internal/proc/cgroup_test.go | 97 +++++ internal/proc/process.go | 9 +- proc-stat-in-containers.md | 560 +++++++++++++++++++++++++++++ test/docker/docker-compose.yaml | 81 +++++ test/k8s/test-cgroup-grouping.yaml | 148 ++++++++ test/local-test.sh | 36 ++ 14 files changed, 1534 insertions(+), 17 deletions(-) create mode 100644 identify-process-cgroup-by-pid.md create mode 100644 internal/cgroup/config.go create mode 100644 internal/cgroup/parse.go create mode 100644 internal/cgroup/parse_test.go create mode 100644 internal/proc/cgroup.go create mode 100644 internal/proc/cgroup_test.go create mode 100644 proc-stat-in-containers.md create mode 100644 test/docker/docker-compose.yaml create mode 100644 test/k8s/test-cgroup-grouping.yaml create mode 100755 test/local-test.sh diff --git a/identify-process-cgroup-by-pid.md b/identify-process-cgroup-by-pid.md new file mode 100644 index 0000000..062abba --- /dev/null +++ b/identify-process-cgroup-by-pid.md @@ -0,0 +1,62 @@ +# Identify Process Cgroup by PID + +## TL;DR + +You can identify a process's cgroup by reading `/proc/<PID>/cgroup`. + +## Outcome + +Multiple methods to find which cgroup a process belongs to using its PID.
+ +## Conversation + +**Question:** Can I identify the process's cgroup by having its PID? + +**Answer:** Yes, here are the main methods: + +### Method 1: Read `/proc/<PID>/cgroup` + +```bash +cat /proc/<PID>/cgroup +``` + +This shows all cgroup controllers the process belongs to. For example: +``` +12:blkio:/user.slice +11:memory:/user.slice/user-1000.slice +... +0::/user.slice/user-1000.slice/session-1.scope +``` + +### Method 2: Using `ps` with format options + +```bash +ps -o pid,cgroup -p <PID> +``` + +### Method 3: Using `systemd-cgls` + +If you're on a systemd system: +```bash +systemd-cgls --unit <UNIT> +# or to see the whole tree +systemd-cgls +``` + +### Method 4: Check cgroup v2 unified hierarchy + +On cgroup v2 systems, check: +```bash +cat /proc/<PID>/cgroup +# Output will look like: 0::/user.slice/user-1000.slice/session-1.scope +``` + +The path after `::` is the cgroup path under `/sys/fs/cgroup/`. + +### Quick one-liner + +```bash +cat /proc/self/cgroup # For current shell +cat /proc/$$/cgroup # Also current shell +cat /proc/1234/cgroup # For PID 1234 +``` diff --git a/internal/cgroup/config.go b/internal/cgroup/config.go new file mode 100644 index 0000000..536f61b --- /dev/null +++ b/internal/cgroup/config.go @@ -0,0 +1,84 @@ +// ABOUTME: Configuration types and parsing for cgroup limits and process mapping. +// ABOUTME: Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP environment variables.
+package cgroup + +import ( + "encoding/json" + "fmt" + "os" +) + +// CgroupLimit holds the resource limits for a container/cgroup +type CgroupLimit struct { + CPUCores float64 // CPU limit in cores (e.g., 0.5 for "500m", 2.0 for "2") + MemoryBytes uint64 // Memory limit in bytes +} + +// CgroupLimits maps container names to their resource limits +type CgroupLimits map[string]CgroupLimit + +// ProcessMapping maps process names to container names (for cgroup path discovery) +type ProcessMapping map[string]string + +// CgroupPathMapping maps cgroup paths to container names (built at runtime) +type CgroupPathMapping map[string]string + +// rawLimitEntry is the JSON structure for each entry in CGROUP_LIMITS +type rawLimitEntry struct { + CPU string `json:"cpu"` + Memory string `json:"memory"` +} + +// ParseCgroupLimitsEnv parses the CGROUP_LIMITS environment variable. +// Expected format: {"container-name": {"cpu": "500m", "memory": "1Gi"}, ...} +func ParseCgroupLimitsEnv() (CgroupLimits, error) { + raw := os.Getenv("CGROUP_LIMITS") + if raw == "" { + return nil, nil // No limits configured + } + + var parsed map[string]rawLimitEntry + if err := json.Unmarshal([]byte(raw), &parsed); err != nil { + return nil, fmt.Errorf("parsing CGROUP_LIMITS: %w", err) + } + + limits := make(CgroupLimits) + for name, entry := range parsed { + var limit CgroupLimit + var err error + + if entry.CPU != "" { + limit.CPUCores, err = ParseCPU(entry.CPU) + if err != nil { + return nil, fmt.Errorf("parsing CPU for %q: %w", name, err) + } + } + + if entry.Memory != "" { + limit.MemoryBytes, err = ParseMemory(entry.Memory) + if err != nil { + return nil, fmt.Errorf("parsing memory for %q: %w", name, err) + } + } + + limits[name] = limit + } + + return limits, nil +} + +// ParseProcessMappingEnv parses the CGROUP_PROCESS_MAP environment variable. 
+// Expected format: {"process-name": "container-name", ...} +func ParseProcessMappingEnv() (ProcessMapping, error) { + raw := os.Getenv("CGROUP_PROCESS_MAP") + if raw == "" { + return nil, nil // No mapping configured + } + + var parsed map[string]string + if err := json.Unmarshal([]byte(raw), &parsed); err != nil { + return nil, fmt.Errorf("parsing CGROUP_PROCESS_MAP: %w", err) + } + + return ProcessMapping(parsed), nil +} diff --git a/internal/cgroup/parse.go b/internal/cgroup/parse.go new file mode 100644 index 0000000..e7d1a92 --- /dev/null +++ b/internal/cgroup/parse.go @@ -0,0 +1,96 @@ +// ABOUTME: Parses Kubernetes-style resource notation for CPU and memory. +// ABOUTME: CPU: "500m" = 0.5 cores, "2" = 2 cores. +// ABOUTME: Memory: "1Gi" = 1 GiB, "512Mi" = 512 MiB, "1G" = 1 GB. +package cgroup + +import ( + "fmt" + "strconv" + "strings" +) + +// ParseCPU parses Kubernetes CPU notation to cores. +// Examples: "500m" => 0.5, "2" => 2.0, "100m" => 0.1, "2000m" => 2.0 +func ParseCPU(value string) (float64, error) { + value = strings.TrimSpace(value) + if value == "" { + return 0, fmt.Errorf("empty CPU value") + } + + // Handle millicores suffix + if strings.HasSuffix(value, "m") { + millis, err := strconv.ParseFloat(strings.TrimSuffix(value, "m"), 64) + if err != nil { + return 0, fmt.Errorf("parsing millicores: %w", err) + } + return millis / 1000.0, nil + } + + // Plain number means cores + cores, err := strconv.ParseFloat(value, 64) + if err != nil { + return 0, fmt.Errorf("parsing cores: %w", err) + } + + return cores, nil +} + +// ParseMemory parses Kubernetes memory notation to bytes. 
+// Supports: +// - Binary suffixes: Ki, Mi, Gi, Ti (powers of 1024) +// - Decimal suffixes: K, M, G, T (powers of 1000) +// - Plain numbers: bytes +func ParseMemory(value string) (uint64, error) { + value = strings.TrimSpace(value) + if value == "" { + return 0, fmt.Errorf("empty memory value") + } + + // Binary suffixes (powers of 1024) + binarySuffixes := map[string]uint64{ + "Ki": 1024, + "Mi": 1024 * 1024, + "Gi": 1024 * 1024 * 1024, + "Ti": 1024 * 1024 * 1024 * 1024, + } + + // Decimal suffixes (powers of 1000) + decimalSuffixes := map[string]uint64{ + "K": 1000, + "M": 1000 * 1000, + "G": 1000 * 1000 * 1000, + "T": 1000 * 1000 * 1000 * 1000, + } + + // Try binary suffixes first (2-char) + for suffix, multiplier := range binarySuffixes { + if strings.HasSuffix(value, suffix) { + numStr := strings.TrimSuffix(value, suffix) + num, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("parsing memory value: %w", err) + } + return uint64(num * float64(multiplier)), nil + } + } + + // Try decimal suffixes (1-char) + for suffix, multiplier := range decimalSuffixes { + if strings.HasSuffix(value, suffix) { + numStr := strings.TrimSuffix(value, suffix) + num, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("parsing memory value: %w", err) + } + return uint64(num * float64(multiplier)), nil + } + } + + // Plain number (bytes) + bytes, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return 0, fmt.Errorf("parsing bytes: %w", err) + } + + return bytes, nil +} diff --git a/internal/cgroup/parse_test.go b/internal/cgroup/parse_test.go new file mode 100644 index 0000000..2451163 --- /dev/null +++ b/internal/cgroup/parse_test.go @@ -0,0 +1,84 @@ +package cgroup + +import ( + "testing" +) + +func TestParseCPU(t *testing.T) { + tests := []struct { + name string + input string + want float64 + wantErr bool + }{ + {"millicores 500m", "500m", 0.5, false}, + {"millicores 100m", "100m", 0.1, false}, + {"millicores 
2000m", "2000m", 2.0, false}, + {"millicores 1m", "1m", 0.001, false}, + {"cores integer", "2", 2.0, false}, + {"cores decimal", "1.5", 1.5, false}, + {"cores with spaces", " 2 ", 2.0, false}, + {"empty string", "", 0, true}, + {"invalid format", "abc", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseCPU(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("ParseCPU() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got != tt.want { + t.Errorf("ParseCPU() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestParseMemory(t *testing.T) { + tests := []struct { + name string + input string + want uint64 + wantErr bool + }{ + // Binary suffixes (powers of 1024) + {"Ki", "1Ki", 1024, false}, + {"Mi", "1Mi", 1024 * 1024, false}, + {"Gi", "1Gi", 1024 * 1024 * 1024, false}, + {"Ti", "1Ti", 1024 * 1024 * 1024 * 1024, false}, + {"512Mi", "512Mi", 512 * 1024 * 1024, false}, + {"2Gi", "2Gi", 2 * 1024 * 1024 * 1024, false}, + + // Decimal suffixes (powers of 1000) + {"K", "1K", 1000, false}, + {"M", "1M", 1000000, false}, + {"G", "1G", 1000000000, false}, + {"T", "1T", 1000000000000, false}, + + // Plain bytes + {"bytes", "1024", 1024, false}, + {"large bytes", "1073741824", 1073741824, false}, + + // With spaces + {"with spaces", " 1Gi ", 1024 * 1024 * 1024, false}, + + // Error cases + {"empty", "", 0, true}, + {"invalid", "abc", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ParseMemory(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("ParseMemory() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got != tt.want { + t.Errorf("ParseMemory() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/internal/metrics/aggregator.go b/internal/metrics/aggregator.go index 1e33646..78cfc94 100644 --- a/internal/metrics/aggregator.go +++ b/internal/metrics/aggregator.go @@ -4,23 +4,37 @@ import ( "sort" "time" + 
"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup" "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/proc" ) // Aggregator collects and aggregates metrics from processes type Aggregator struct { - procPath string - topN int - prevCPU *CPUSnapshot - prevProcCPU map[int]*ProcessCPUSnapshot + procPath string + topN int + prevCPU *CPUSnapshot + prevProcCPU map[int]*ProcessCPUSnapshot + cgroupLimits cgroup.CgroupLimits // container name -> limits + processMapping cgroup.ProcessMapping // process name -> container name + cgroupPathMapping cgroup.CgroupPathMapping // cgroup path -> container name (built at runtime) + prevCgroupCPU map[string]uint64 // container name -> previous total ticks + prevCgroupTime time.Time // previous collection time for cgroup CPU calc } // NewAggregator creates a new metrics aggregator func NewAggregator(procPath string, topN int) *Aggregator { + // Parse cgroup configuration from environment + limits, _ := cgroup.ParseCgroupLimitsEnv() + processMap, _ := cgroup.ParseProcessMappingEnv() + return &Aggregator{ - procPath: procPath, - topN: topN, - prevProcCPU: make(map[int]*ProcessCPUSnapshot), + procPath: procPath, + topN: topN, + prevProcCPU: make(map[int]*ProcessCPUSnapshot), + cgroupLimits: limits, + processMapping: processMap, + cgroupPathMapping: make(cgroup.CgroupPathMapping), + prevCgroupCPU: make(map[string]uint64), } } @@ -77,6 +91,12 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { return float64(p.MemRSS) }) + // Discover cgroup path mappings from known processes + a.discoverCgroupMappings(processes) + + // Calculate per-cgroup metrics + cgroupMetrics := a.calculateCgroupMetrics(processes, processMetrics, now) + return &SystemMetrics{ Timestamp: now, TotalProcesses: len(processes), @@ -84,6 +104,7 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) { Memory: memMetrics, TopCPU: topCPU, TopMemory: topMemory, + Cgroups: cgroupMetrics, }, nil } @@ -158,6 +179,11 @@ func 
(a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now state = "?" } + cgroupPath := "" + if p.Cgroup != nil { + cgroupPath = p.Cgroup.Path + } + metrics = append(metrics, ProcessMetrics{ PID: pid, Name: p.Status.Name, @@ -166,6 +192,7 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now MemVirtual: p.Status.VmSize, Threads: p.Status.Threads, State: state, + CgroupPath: cgroupPath, }) } @@ -223,3 +250,145 @@ func (a *Aggregator) getTopByMetric(metrics []ProcessMetrics, getValue func(Proc return sorted[:n] } + +// discoverCgroupMappings discovers cgroup path to container name mappings +// by looking for processes that match the configured process mapping. +func (a *Aggregator) discoverCgroupMappings(processes []*proc.ProcessInfo) { + if len(a.processMapping) == 0 { + return + } + + for _, p := range processes { + if p.Cgroup == nil || p.Cgroup.Path == "" { + continue + } + + // Check if this process name is in our mapping + if containerName, ok := a.processMapping[p.Status.Name]; ok { + // Map this cgroup path to the container name + a.cgroupPathMapping[p.Cgroup.Path] = containerName + } + } +} + +// resolveContainerName returns the container name for a cgroup path, +// or the raw path if no mapping exists. +func (a *Aggregator) resolveContainerName(cgroupPath string) string { + if name, ok := a.cgroupPathMapping[cgroupPath]; ok { + return name + } + // Use raw path as fallback + if cgroupPath == "" { + return "" + } + return cgroupPath +} + +// calculateCgroupMetrics computes metrics grouped by container/cgroup. 
+func (a *Aggregator) calculateCgroupMetrics( + processes []*proc.ProcessInfo, + processMetrics []ProcessMetrics, + now time.Time, +) map[string]*CgroupMetrics { + // Build lookup from PID to ProcessMetrics + pmByPID := make(map[int]ProcessMetrics) + for _, pm := range processMetrics { + pmByPID[pm.PID] = pm + } + + // Group processes by container name + type cgroupData struct { + cgroupPath string + procs []*proc.ProcessInfo + metrics []ProcessMetrics + } + containerGroups := make(map[string]*cgroupData) + + for _, p := range processes { + cgroupPath := "" + if p.Cgroup != nil { + cgroupPath = p.Cgroup.Path + } + + containerName := a.resolveContainerName(cgroupPath) + + if _, ok := containerGroups[containerName]; !ok { + containerGroups[containerName] = &cgroupData{ + cgroupPath: cgroupPath, + } + } + + containerGroups[containerName].procs = append(containerGroups[containerName].procs, p) + + if pm, ok := pmByPID[p.Stat.PID]; ok { + containerGroups[containerName].metrics = append(containerGroups[containerName].metrics, pm) + } + } + + // Calculate elapsed time since last collection + elapsed := 0.0 + if !a.prevCgroupTime.IsZero() { + elapsed = now.Sub(a.prevCgroupTime).Seconds() + } + a.prevCgroupTime = now + + // Calculate metrics for each container + result := make(map[string]*CgroupMetrics) + + for containerName, data := range containerGroups { + // Sum CPU ticks (utime + stime only, not cutime/cstime) + var totalTicks uint64 + var totalRSS uint64 + + for _, p := range data.procs { + totalTicks += p.Stat.TotalTime() + totalRSS += p.Status.VmRSS + } + + // Calculate CPU cores used from delta + usedCores := 0.0 + if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 { + deltaTicks := totalTicks - prev + // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) + usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) + } + a.prevCgroupCPU[containerName] = totalTicks + + // Calculate percentages against limits if 
available + cpuPercent := 0.0 + memPercent := 0.0 + var limitCores float64 + var limitBytes uint64 + + if limit, ok := a.cgroupLimits[containerName]; ok { + limitCores = limit.CPUCores + limitBytes = limit.MemoryBytes + if limit.CPUCores > 0 { + cpuPercent = (usedCores / limit.CPUCores) * 100 + } + if limit.MemoryBytes > 0 { + memPercent = (float64(totalRSS) / float64(limit.MemoryBytes)) * 100 + } + } + + result[containerName] = &CgroupMetrics{ + Name: containerName, + CgroupPath: data.cgroupPath, + CPU: CgroupCPUMetrics{ + TotalTicks: totalTicks, + UsedCores: usedCores, + UsedPercent: cpuPercent, + LimitCores: limitCores, + }, + Memory: CgroupMemoryMetrics{ + TotalRSSBytes: totalRSS, + UsedPercent: memPercent, + LimitBytes: limitBytes, + }, + Processes: data.metrics, + NumProcs: len(data.procs), + } + } + + return result +} diff --git a/internal/metrics/types.go b/internal/metrics/types.go index 00be63f..b88d4c8 100644 --- a/internal/metrics/types.go +++ b/internal/metrics/types.go @@ -11,6 +11,7 @@ type ProcessMetrics struct { MemVirtual uint64 `json:"mem_virtual_bytes"` Threads int `json:"threads"` State string `json:"state"` + CgroupPath string `json:"cgroup_path,omitempty"` } // CPUMetrics holds aggregated CPU metrics @@ -35,12 +36,38 @@ type MemoryMetrics struct { // SystemMetrics holds a complete snapshot of system metrics type SystemMetrics struct { - Timestamp time.Time `json:"timestamp"` - TotalProcesses int `json:"total_processes"` - CPU CPUMetrics `json:"cpu"` - Memory MemoryMetrics `json:"memory"` - TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` - TopMemory []ProcessMetrics `json:"top_memory,omitempty"` + Timestamp time.Time `json:"timestamp"` + TotalProcesses int `json:"total_processes"` + CPU CPUMetrics `json:"cpu"` + Memory MemoryMetrics `json:"memory"` + TopCPU []ProcessMetrics `json:"top_cpu,omitempty"` + TopMemory []ProcessMetrics `json:"top_memory,omitempty"` + Cgroups map[string]*CgroupMetrics `json:"cgroups,omitempty"` +} + +// 
CgroupCPUMetrics holds CPU metrics for a single cgroup/container +type CgroupCPUMetrics struct { + TotalTicks uint64 `json:"total_ticks"` + UsedCores float64 `json:"used_cores"` + UsedPercent float64 `json:"used_percent,omitempty"` + LimitCores float64 `json:"limit_cores,omitempty"` +} + +// CgroupMemoryMetrics holds memory metrics for a single cgroup/container +type CgroupMemoryMetrics struct { + TotalRSSBytes uint64 `json:"total_rss_bytes"` + UsedPercent float64 `json:"used_percent,omitempty"` + LimitBytes uint64 `json:"limit_bytes,omitempty"` +} + +// CgroupMetrics holds all metrics for a single cgroup/container +type CgroupMetrics struct { + Name string `json:"name"` + CgroupPath string `json:"cgroup_path"` + CPU CgroupCPUMetrics `json:"cpu"` + Memory CgroupMemoryMetrics `json:"memory"` + Processes []ProcessMetrics `json:"processes"` + NumProcs int `json:"num_processes"` } // CPUSnapshot holds CPU timing data for calculating percentages between intervals diff --git a/internal/output/logger.go b/internal/output/logger.go index 4c57794..a5933fd 100644 --- a/internal/output/logger.go +++ b/internal/output/logger.go @@ -1,6 +1,7 @@ package output import ( + "context" "io" "log/slog" "os" @@ -89,7 +90,8 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { }) } - w.logger.Info("metrics_collected", + // Build base attributes + attrs := []slog.Attr{ slog.Time("collection_time", m.Timestamp), slog.Int("total_processes", m.TotalProcesses), slog.Group("cpu", @@ -110,7 +112,14 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error { ), slog.Any("top_cpu", topCPU), slog.Any("top_memory", topMem), - ) + } + + // Add cgroups if present + if len(m.Cgroups) > 0 { + attrs = append(attrs, slog.Any("cgroups", m.Cgroups)) + } + + w.logger.LogAttrs(context.Background(), slog.LevelInfo, "metrics_collected", attrs...) 
return nil } diff --git a/internal/proc/cgroup.go b/internal/proc/cgroup.go new file mode 100644 index 0000000..6e1597f --- /dev/null +++ b/internal/proc/cgroup.go @@ -0,0 +1,59 @@ +// ABOUTME: Reads cgroup information from /proc/[pid]/cgroup. +// ABOUTME: Supports both cgroup v1 and v2 formats. +package proc + +import ( + "bufio" + "fmt" + "os" + "strings" +) + +// CgroupInfo holds cgroup information for a process +type CgroupInfo struct { + Path string // The cgroup path (unified for v2, or from memory controller for v1) +} + +// ReadCgroup reads /proc/[pid]/cgroup and extracts the cgroup path +func ReadCgroup(procPath string, pid int) (*CgroupInfo, error) { + path := fmt.Sprintf("%s/%d/cgroup", procPath, pid) + + file, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("opening cgroup file: %w", err) + } + defer func() { _ = file.Close() }() + + var cgroupPath string + scanner := bufio.NewScanner(file) + + for scanner.Scan() { + line := scanner.Text() + + // Try cgroup v2 first (unified hierarchy) + // Format: 0::/path + if path, found := strings.CutPrefix(line, "0::"); found { + cgroupPath = path + break + } + + // Fall back to cgroup v1 - look for memory controller + // Format: X:memory:/path or X:memory,other:/path + parts := strings.SplitN(line, ":", 3) + if len(parts) == 3 { + controllers := parts[1] + if strings.Contains(controllers, "memory") { + cgroupPath = parts[2] + // Don't break - prefer v2 if found later + } + } + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("scanning cgroup file: %w", err) + } + + return &CgroupInfo{ + Path: cgroupPath, + }, nil +} diff --git a/internal/proc/cgroup_test.go b/internal/proc/cgroup_test.go new file mode 100644 index 0000000..e82b6c8 --- /dev/null +++ b/internal/proc/cgroup_test.go @@ -0,0 +1,97 @@ +package proc + +import ( + "os" + "path/filepath" + "testing" +) + +func TestReadCgroup(t *testing.T) { + tests := []struct { + name string + cgroupFile string + wantPath string + 
wantErr bool + }{ + { + name: "cgroup v2 unified", + cgroupFile: `0::/kubepods/pod-abc/container-123 +`, + wantPath: "/kubepods/pod-abc/container-123", + wantErr: false, + }, + { + name: "cgroup v2 with trailing newline", + cgroupFile: `0::/docker/abc123def456 +`, + wantPath: "/docker/abc123def456", + wantErr: false, + }, + { + name: "cgroup v1 multiple controllers", + cgroupFile: `12:blkio:/user.slice +11:memory:/docker/abc123 +10:cpu,cpuacct:/docker/abc123 +9:pids:/docker/abc123 +`, + wantPath: "/docker/abc123", + wantErr: false, + }, + { + name: "cgroup v2 preferred over v1", + cgroupFile: `11:memory:/docker/old-path +0::/kubepods/new-path +`, + wantPath: "/kubepods/new-path", + wantErr: false, + }, + { + name: "empty file", + cgroupFile: "", + wantPath: "", + wantErr: false, + }, + { + name: "root cgroup", + cgroupFile: `0::/ +`, + wantPath: "/", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a temp directory structure mimicking /proc + tmpDir := t.TempDir() + procDir := filepath.Join(tmpDir, "proc") + pidDir := filepath.Join(procDir, "1234") + + if err := os.MkdirAll(pidDir, 0755); err != nil { + t.Fatalf("Failed to create pid dir: %v", err) + } + + if err := os.WriteFile(filepath.Join(pidDir, "cgroup"), []byte(tt.cgroupFile), 0644); err != nil { + t.Fatalf("Failed to write cgroup file: %v", err) + } + + got, err := ReadCgroup(procDir, 1234) + if (err != nil) != tt.wantErr { + t.Errorf("ReadCgroup() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && got.Path != tt.wantPath { + t.Errorf("ReadCgroup() path = %q, want %q", got.Path, tt.wantPath) + } + }) + } +} + +func TestReadCgroup_FileNotFound(t *testing.T) { + tmpDir := t.TempDir() + + _, err := ReadCgroup(tmpDir, 1234) + if err == nil { + t.Error("ReadCgroup() expected error for missing file, got nil") + } +} diff --git a/internal/proc/process.go b/internal/proc/process.go index fced8bd..9d1ef64 100644 --- 
a/internal/proc/process.go +++ b/internal/proc/process.go @@ -128,13 +128,14 @@ func ReadSystemCPU(procPath string) (user, nice, system, idle, iowait, irq, soft return 0, 0, 0, 0, 0, 0, 0, fmt.Errorf("cpu line not found in /proc/stat") } -// ProcessInfo combines stat and status information for a process +// ProcessInfo combines stat, status, and cgroup information for a process type ProcessInfo struct { Stat *ProcStat Status *ProcStatus + Cgroup *CgroupInfo } -// ReadProcess reads both stat and status for a single process +// ReadProcess reads stat, status, and cgroup for a single process func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { stat, err := ReadStat(procPath, pid) if err != nil { @@ -146,9 +147,13 @@ func ReadProcess(procPath string, pid int) (*ProcessInfo, error) { return nil, err } + // Read cgroup info (non-fatal if it fails) + cgroup, _ := ReadCgroup(procPath, pid) + return &ProcessInfo{ Stat: stat, Status: status, + Cgroup: cgroup, }, nil } diff --git a/proc-stat-in-containers.md b/proc-stat-in-containers.md new file mode 100644 index 0000000..7c33575 --- /dev/null +++ b/proc-stat-in-containers.md @@ -0,0 +1,560 @@ +# /proc/stat Behavior in Containerized Environments + +## TL;DR + +`/proc/stat` in containers shows host-level statistics, not container-specific data. To get accurate container CPU metrics when cgroups aren't an option (e.g., sidecars sharing PID namespace with different cgroups), aggregate `/proc/[pid]/stat` for all visible processes and calculate percentages using cgroup limits from `/sys/fs/cgroup`. + +## Outcome + +- Explained why `/proc/stat` doesn't provide container-aware metrics +- Provided methods to aggregate process CPU usage from `/proc/[pid]/stat` +- Showed how to determine parent/child process relationships +- Demonstrated converting raw ticks to CPU percentages +- Covered how to read cgroup CPU limits for other containers + +## Conversation + +### How does /proc/stat behave in containerized environments? 
+ +By default, `/proc/stat` inside a container shows **host system statistics**, not container-specific data. This includes: + +- **CPU times** (user, system, idle, etc.) - reflects all CPUs on the host +- **Context switches** - host-wide count +- **Boot time** - host's boot time +- **Processes created** - host-wide total + +#### Why This Happens + +The `/proc` filesystem is mounted from the host kernel. Containers share the host kernel, and `/proc/stat` is a kernel-level interface that doesn't have namespace awareness for most of its metrics. + +#### Practical Implications + +1. **CPU usage calculations are wrong** - Tools like `top`, `htop`, or custom monitoring that read `/proc/stat` will show host CPU usage, not container usage +2. **Resource limits aren't reflected** - Even if you limit a container to 2 CPUs via cgroups, `/proc/stat` shows all host CPUs +3. **Multi-tenant confusion** - In shared environments, containers see each other's aggregate impact + +#### Solutions + +| Approach | Description | +|----------|-------------| +| **cgroups** | Read from `/sys/fs/cgroup/cpu/` for container-specific CPU accounting | +| **LXCFS** | A FUSE filesystem that provides container-aware `/proc` files | +| **Container runtimes** | Some runtimes (like Kata) use VMs with isolated kernels | +| **Metrics APIs** | Use Docker/Kubernetes APIs instead of `/proc/stat` | + +#### Example: Getting Real Container CPU + +```bash +# Instead of /proc/stat, use cgroups v2: +cat /sys/fs/cgroup/cpu.stat + +# Or cgroups v1: +cat /sys/fs/cgroup/cpu/cpuacct.usage +``` + +--- + +### Can I aggregate process usage to get the usage of the container? + +Yes, you can aggregate process usage, but there are caveats. + +#### Aggregating from /proc/[pid]/stat + +```bash +# Fields 14 (utime) and 15 (stime) in /proc/[pid]/stat +for pid in /proc/[0-9]*; do + awk '{print $14 + $15}' "$pid/stat" 2>/dev/null +done | awk '{sum += $1} END {print sum}' +``` + +#### Problems with This Approach + +1. 
**Race conditions** - Processes can spawn/die between reads, causing inconsistent totals +2. **Short-lived processes** - Processes that start and exit between samples are missed entirely +3. **Zombie/exited processes** - Their CPU time may not be captured +4. **Overhead** - Scanning all PIDs repeatedly is expensive +5. **Namespace visibility** - You only see processes in your PID namespace, but that's actually what you want in a container +6. **Children accounting** - When a process exits, its CPU time gets added to the parent's `cutime`/`cstime` fields, which can cause double-counting if not handled carefully + +#### Better Alternative: cgroups + +Cgroups track CPU usage at the group level and handle all these edge cases: + +```bash +# cgroups v2 (modern) +cat /sys/fs/cgroup/cpu.stat +# Shows: usage_usec, user_usec, system_usec + +# cgroups v1 +cat /sys/fs/cgroup/cpu/cpuacct.usage # total nanoseconds +cat /sys/fs/cgroup/cpu/cpuacct.stat # user/system jiffies +``` + +**However**, cgroups cannot be used when sidecars share the PID namespace but have different cgroups. In that case, aggregating `/proc/[pid]/stat` is the best option. + +--- + +### How do I determine child/parent processes? 
+ +#### From /proc/[pid]/stat + +Field 4 is the **PPID** (parent process ID): + +```bash +# Get PPID for a specific process +awk '{print $4}' /proc/1234/stat + +# List all processes with their parents +for stat in /proc/[0-9]*/stat; do + pid="${stat#/proc/}" + pid="${pid%/stat}" + ppid=$(awk '{print $4}' "$stat" 2>/dev/null) && \ + echo "$pid -> parent: $ppid" +done +``` + +#### From /proc/[pid]/status (more readable) + +```bash +grep PPid /proc/1234/status +# PPid: 1 +``` + +#### Building a Process Tree + +```bash +#!/bin/bash +declare -A parent_of +declare -A children_of + +for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + pid="${stat#/proc/}" + pid="${pid%/stat}" + + # Extract PPID (field 4, but handle comm with spaces) + rest="${line##*) }" + read -ra fields <<< "$rest" + ppid="${fields[1]}" # 4th field overall = index 1 after state + + parent_of[$pid]=$ppid + children_of[$ppid]+="$pid " + fi +done + +# Print tree from PID 1 +print_tree() { + local pid=$1 + local indent=$2 + echo "${indent}${pid}" + for child in ${children_of[$pid]}; do + print_tree "$child" " $indent" + done +} + +print_tree 1 "" +``` + +#### For CPU Aggregation: Handling cutime/cstime + +To properly handle `cutime`/`cstime` without double-counting: + +```bash +#!/bin/bash +declare -A parent_of +declare -A utime stime + +# First pass: collect all data +for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + pid="${stat#/proc/}" + pid="${pid%/stat}" + rest="${line##*) }" + read -ra f <<< "$rest" + + parent_of[$pid]="${f[1]}" + utime[$pid]="${f[11]}" + stime[$pid]="${f[12]}" + # cutime=${f[13]} cstime=${f[14]} - don't sum these + fi +done + +# Sum only utime/stime (not cutime/cstime) +total=0 +for pid in "${!utime[@]}"; do + ((total += utime[$pid] + stime[$pid])) +done + +echo "Total CPU ticks: $total" +echo "Seconds: $(echo "scale=2; $total / $(getconf CLK_TCK)" | bc)" +``` + +**Key insight:** Only sum `utime` + `stime` for each process. 
The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on—those children no longer exist in `/proc`, so their time is only accessible via the parent's `cutime`/`cstime`. + +--- + +### How do I convert utime/stime to percentages? + +You need **two samples** over a time interval. CPU percentage is a rate, not an absolute value. + +#### The Formula + +``` +CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100 +``` + +- `delta_ticks` = difference in (utime + stime) between samples +- `CLK_TCK` = ticks per second (usually 100, get via `getconf CLK_TCK`) +- `num_cpus` = number of CPUs (omit for single-CPU percentage) + +#### Two Common Percentage Styles + +| Style | Formula | Example | +|-------|---------|---------| +| **Normalized** (0-100%) | `delta / (elapsed * CLK_TCK * num_cpus) * 100` | 50% = half of total capacity | +| **Cores-style** (0-N*100%) | `delta / (elapsed * CLK_TCK) * 100` | 200% = 2 full cores busy | + +#### Practical Script + +```bash +#!/bin/bash + +CLK_TCK=$(getconf CLK_TCK) +NUM_CPUS=$(nproc) + +get_total_ticks() { + local total=0 + for stat in /proc/[0-9]*/stat; do + if read -r line < "$stat" 2>/dev/null; then + rest="${line##*) }" + read -ra f <<< "$rest" + ((total += f[11] + f[12])) # utime + stime + fi + done + echo "$total" +} + +# First sample +ticks1=$(get_total_ticks) +time1=$(date +%s.%N) + +# Wait +sleep 1 + +# Second sample +ticks2=$(get_total_ticks) +time2=$(date +%s.%N) + +# Calculate +delta_ticks=$((ticks2 - ticks1)) +elapsed=$(echo "$time2 - $time1" | bc) + +# Percentage of total CPU capacity (all cores) +pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK * $NUM_CPUS)) * 100" | bc) +echo "CPU usage: ${pct}% of ${NUM_CPUS} CPUs" + +# Percentage as "CPU cores used" (like top's 200% for 2 full cores) +cores_pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK)) * 100" | bc) +echo "CPU usage: ${cores_pct}% (cores-style)" +``` + +#### Continuous Monitoring + 
+```bash +#!/bin/bash +CLK_TCK=$(getconf CLK_TCK) +NUM_CPUS=$(nproc) +INTERVAL=1 + +get_total_ticks() { + local total=0 + for stat in /proc/[0-9]*/stat; do + read -r line < "$stat" 2>/dev/null || continue + rest="${line##*) }" + read -ra f <<< "$rest" + ((total += f[11] + f[12])) + done + echo "$total" +} + +prev_ticks=$(get_total_ticks) +prev_time=$(date +%s.%N) + +while true; do + sleep "$INTERVAL" + + curr_ticks=$(get_total_ticks) + curr_time=$(date +%s.%N) + + delta=$((curr_ticks - prev_ticks)) + elapsed=$(echo "$curr_time - $prev_time" | bc) + + pct=$(echo "scale=1; $delta / ($elapsed * $CLK_TCK * $NUM_CPUS) * 100" | bc) + printf "\rCPU: %5.1f%%" "$pct" + + prev_ticks=$curr_ticks + prev_time=$curr_time +done +``` + +--- + +### Does this calculation respect cgroup limits? + +No, it doesn't. The calculation uses `nproc` which typically returns **host CPU count**, not your cgroup limit. + +#### The Problem + +If your container is limited to 2 CPUs on an 8-CPU host: +- `nproc` returns 8 +- Your calculation shows 25% when you're actually at 100% of your limit + +#### Getting Effective CPU Limit + +**cgroups v2:** + +```bash +# cpu.max contains: $quota $period (in microseconds) +# "max 100000" means unlimited +read quota period < /sys/fs/cgroup/cpu.max +if [[ "$quota" == "max" ]]; then + effective_cpus=$(nproc) +else + effective_cpus=$(echo "scale=2; $quota / $period" | bc) +fi +echo "Effective CPUs: $effective_cpus" +``` + +**cgroups v1:** + +```bash +quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us) +period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us) + +if [[ "$quota" == "-1" ]]; then + effective_cpus=$(nproc) +else + effective_cpus=$(echo "scale=2; $quota / $period" | bc) +fi +``` + +**Also Check cpuset Limits:** + +```bash +# cgroups v2 +cpuset=$(cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null) + +# cgroups v1 +cpuset=$(cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null) + +# Parse "0-3,5,7" format to count CPUs +count_cpus() { + local count=0 + IFS=',' read 
-ra ranges <<< "$1" + for range in "${ranges[@]}"; do + if [[ "$range" == *-* ]]; then + start="${range%-*}" + end="${range#*-}" + ((count += end - start + 1)) + else + ((count++)) + fi + done + echo "$count" +} +``` + +#### Updated Script Respecting Limits + +```bash +#!/bin/bash +CLK_TCK=$(getconf CLK_TCK) + +get_effective_cpus() { + # Try cgroups v2 first + if [[ -f /sys/fs/cgroup/cpu.max ]]; then + read quota period < /sys/fs/cgroup/cpu.max + if [[ "$quota" != "max" ]]; then + echo "scale=2; $quota / $period" | bc + return + fi + fi + + # Try cgroups v1 + if [[ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ]]; then + quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us) + period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us) + if [[ "$quota" != "-1" ]]; then + echo "scale=2; $quota / $period" | bc + return + fi + fi + + # Fallback to nproc + nproc +} + +EFFECTIVE_CPUS=$(get_effective_cpus) +echo "Effective CPU limit: $EFFECTIVE_CPUS" + +# ... rest of your sampling logic using $EFFECTIVE_CPUS +``` + +#### Shared PID Namespace Consideration + +When sidecars share PID namespace but have different cgroups: +- Each container may have different CPU limits +- You're aggregating processes across those limits +- There's no single "correct" limit to use + +**Options:** +1. **Use host CPU count** — percentage of total host capacity +2. **Sum the limits** — if you know each sidecar's cgroup, sum their quotas +3. **Report in cores** — skip normalization, just show `1.5 cores used` instead of percentage + +--- + +### Can I get the cgroup limit for another cgroup? + +Yes, if you have visibility into the cgroup filesystem. + +#### 1. Find a Process's Cgroup + +Every process exposes its cgroup membership: + +```bash +# Get cgroup for any PID you can see +cat /proc/1234/cgroup + +# cgroups v2 output: +# 0::/kubepods/pod123/container456 + +# cgroups v1 output: +# 12:cpu,cpuacct:/docker/abc123 +# 11:memory:/docker/abc123 +# ... +``` + +#### 2. 
Read That Cgroup's Limits + +If the cgroup filesystem is mounted and accessible: + +```bash +#!/bin/bash + +get_cgroup_cpu_limit() { + local pid=$1 + + # Get cgroup path for this PID + cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) # v2 + + if [[ -n "$cgroup_path" ]]; then + # cgroups v2 + limit_file="/sys/fs/cgroup${cgroup_path}/cpu.max" + if [[ -r "$limit_file" ]]; then + read quota period < "$limit_file" + if [[ "$quota" == "max" ]]; then + echo "unlimited" + else + echo "scale=2; $quota / $period" | bc + fi + return + fi + fi + + # Try cgroups v1 + cgroup_path=$(grep -oP 'cpu.*:\K.*' /proc/$pid/cgroup 2>/dev/null) + if [[ -n "$cgroup_path" ]]; then + quota_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_quota_us" + period_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_period_us" + if [[ -r "$quota_file" ]]; then + quota=$(cat "$quota_file") + period=$(cat "$period_file") + if [[ "$quota" == "-1" ]]; then + echo "unlimited" + else + echo "scale=2; $quota / $period" | bc + fi + return + fi + fi + + echo "unknown" +} + +# Example: get limit for PID 1234 +get_cgroup_cpu_limit 1234 +``` + +#### 3. Mount Visibility Requirements + +| Scenario | Can Read Other Cgroups? | +|----------|------------------------| +| Host system | Yes | +| Privileged container | Yes | +| `/sys/fs/cgroup` mounted read-only from host | Yes (common in Kubernetes) | +| Only own cgroup subtree mounted | No | + +Check what's visible: + +```bash +mount | grep cgroup +ls /sys/fs/cgroup/ +``` + +#### 4. 
Full Solution: Aggregate by Cgroup + +```bash +#!/bin/bash +CLK_TCK=$(getconf CLK_TCK) + +declare -A cgroup_ticks +declare -A cgroup_limit + +for stat in /proc/[0-9]*/stat; do + pid="${stat#/proc/}" + pid="${pid%/stat}" + + # Get cgroup for this process + cg=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) + [[ -z "$cg" ]] && continue + + # Get CPU ticks + if read -r line < "$stat" 2>/dev/null; then + rest="${line##*) }" + read -ra f <<< "$rest" + ticks=$((f[11] + f[12])) + + ((cgroup_ticks[$cg] += ticks)) + + # Cache the limit (only look up once per cgroup) + if [[ -z "${cgroup_limit[$cg]}" ]]; then + limit_file="/sys/fs/cgroup${cg}/cpu.max" + if [[ -r "$limit_file" ]]; then + read quota period < "$limit_file" + if [[ "$quota" == "max" ]]; then + cgroup_limit[$cg]="unlimited" + else + cgroup_limit[$cg]=$(echo "scale=2; $quota / $period" | bc) + fi + else + cgroup_limit[$cg]="unknown" + fi + fi + fi +done + +echo "Ticks by cgroup:" +for cg in "${!cgroup_ticks[@]}"; do + echo " $cg: ${cgroup_ticks[$cg]} ticks (limit: ${cgroup_limit[$cg]} CPUs)" +done +``` + +#### If You Can't Access Other Cgroups + +Fallback options: + +1. **Mount the cgroup fs** — add volume mount for `/sys/fs/cgroup:ro` +2. **Use a sidecar with access** — one privileged container does the monitoring +3. **Accept "unknown" limits** — report raw ticks/cores instead of percentages +4. **Kubernetes Downward API** — inject limits as env vars (only for your own container though) diff --git a/test/docker/docker-compose.yaml b/test/docker/docker-compose.yaml new file mode 100644 index 0000000..87a9a9a --- /dev/null +++ b/test/docker/docker-compose.yaml @@ -0,0 +1,81 @@ +# Docker Compose test setup for cgroup grouping verification +# Run with: docker compose -f test/docker/docker-compose.yaml up +# +# NOTE: Docker Compose doesn't have a direct equivalent to K8s shareProcessNamespace. +# Options: +# 1. pid: "host" - sees ALL host processes (not container-specific) +# 2. 
pid: "service:" - chains PID namespace to another service +# +# For proper testing, use Kubernetes or run containers manually with --pid=container: + +services: + # Simulate a runner workload (this will be the "root" of the shared PID namespace) + # Uses 'cat' reading from a fifo as a unique identifiable process + runner: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Runner started (PID 1 in namespace)" + mkfifo /tmp/runner_fifo + # 'cat' will be our identifiable runner process (blocks on fifo) + cat /tmp/runner_fifo & + CAT_PID=$! + # Generate CPU load with dd + while true; do + dd if=/dev/zero of=/dev/null bs=1M count=50 2>/dev/null + done + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + # This container owns the PID namespace + + # Simulate a sidecar service - shares PID namespace with runner + sidecar: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Sidecar started" + # List processes to verify shared namespace + ps aux + while true; do + sleep 10 + done + deploy: + resources: + limits: + cpus: "0.1" + memory: 128M + pid: "service:runner" # Share PID namespace with runner + depends_on: + - runner + + # Resource collector - shares PID namespace with runner + collector: + build: + context: ../.. 
+ dockerfile: Dockerfile + target: collector + command: + - --interval=3s + - --top=5 + - --log-format=json + environment: + # Map unique process names to container names + # 'cat' runs only in runner, 'sleep' runs only in sidecar + CGROUP_PROCESS_MAP: '{"cat":"runner","sleep":"sidecar","resource-collec":"collector"}' + CGROUP_LIMITS: '{"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' + deploy: + resources: + limits: + cpus: "0.1" + memory: 64M + pid: "service:runner" # Share PID namespace with runner + depends_on: + - runner + - sidecar diff --git a/test/k8s/test-cgroup-grouping.yaml b/test/k8s/test-cgroup-grouping.yaml new file mode 100644 index 0000000..e46545b --- /dev/null +++ b/test/k8s/test-cgroup-grouping.yaml @@ -0,0 +1,148 @@ +# Test manifest to verify cgroup grouping behavior +# This pod runs multiple containers with different resource limits +# and a collector sidecar that groups metrics by cgroup/container +apiVersion: v1 +kind: Pod +metadata: + name: test-cgroup-grouping + labels: + app: test-cgroup-grouping +spec: + # Share PID namespace so collector can see all processes + shareProcessNamespace: true + + containers: + # Main workload container - simulates a runner + - name: runner + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Runner container started" + # Simulate some CPU work + while true; do + dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null + sleep 1 + done + resources: + requests: + cpu: "100m" + memory: "64Mi" + limits: + cpu: "500m" + memory: "256Mi" + + # Sidecar container - simulates nginx or another service + - name: sidecar + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Sidecar container started" + # Simulate some lighter work + while true; do + sleep 5 + done + resources: + requests: + cpu: "50m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "128Mi" + + # Resource collector sidecar + - name: collector + 
image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image + args: + - --interval=5s + - --top=3 + env: + # Map process names to container names + # "sh" is the main process in busybox containers + # You may need to adjust based on actual process names + - name: CGROUP_PROCESS_MAP + value: | + {"sh":"runner","sleep":"sidecar","collector":"collector"} + # Define limits for each container (must match names in CGROUP_PROCESS_MAP) + - name: CGROUP_LIMITS + value: | + {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}} + resources: + requests: + cpu: "50m" + memory: "32Mi" + limits: + cpu: "100m" + memory: "64Mi" + # Mount proc read-only for process discovery + volumeMounts: + - name: proc + mountPath: /proc + readOnly: true + + volumes: + - name: proc + hostPath: + path: /proc + type: Directory + + restartPolicy: Never +--- +# Alternative: Using a Deployment for longer-running tests +apiVersion: v1 +kind: Pod +metadata: + name: test-cgroup-simple + labels: + app: test-cgroup-simple +spec: + shareProcessNamespace: true + + containers: + # Stress container to generate CPU/memory load + - name: stress + image: progrium/stress:latest + args: + - --cpu + - "1" + - --vm + - "1" + - --vm-bytes + - "64M" + - --timeout + - "300s" + resources: + limits: + cpu: "500m" + memory: "128Mi" + + # Collector + - name: collector + image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image + args: + - --interval=2s + - --top=5 + env: + - name: CGROUP_PROCESS_MAP + value: '{"stress":"stress","collector":"collector"}' + - name: CGROUP_LIMITS + value: '{"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}' + resources: + limits: + cpu: "100m" + memory: "64Mi" + volumeMounts: + - name: proc + mountPath: /proc + readOnly: true + + volumes: + - name: proc + hostPath: + path: /proc + type: Directory + + restartPolicy: Never 
diff --git a/test/local-test.sh b/test/local-test.sh new file mode 100755 index 0000000..d56f477 --- /dev/null +++ b/test/local-test.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Local test script to verify cgroup grouping +# Run from project root: ./test/local-test.sh + +set -e + +echo "Building collector..." +go build -o bin/collector ./cmd/collector + +echo "" +echo "Testing cgroup parsing on current system..." +echo "Current process cgroup:" +cat /proc/self/cgroup 2>/dev/null || echo "Cannot read /proc/self/cgroup (expected on macOS)" + +echo "" +echo "Running collector for 10 seconds with cgroup grouping..." +echo "Press Ctrl+C to stop early" +echo "" + +# Set up test environment variables +# Map common process names to container names +export CGROUP_PROCESS_MAP='{"bash":"shell","collector":"collector","zsh":"shell"}' +export CGROUP_LIMITS='{"shell":{"cpu":"2","memory":"4Gi"},"collector":{"cpu":"1","memory":"1Gi"}}' + +# Run collector +timeout 10 ./bin/collector \ + --interval=2s \ + --top=5 \ + --log-format=json \ + 2>/dev/null || true + +echo "" +echo "Test complete!" +echo "" +echo "Note: On macOS, cgroup paths will be empty since cgroups are a Linux feature." +echo "To test properly, run in a Linux container or VM." 
From 6770cfcea75ed5f4469f5e11eab98e89a8e643aa Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:01:01 +0100 Subject: [PATCH 10/21] feat(summary): add per-container metrics with extended percentiles - Extend StatSummary with p99, p75, p50 percentiles (in addition to peak, p95, avg) - Add ContainerSummary type for per-container CPU cores and memory bytes stats - Track container metrics from Cgroups map in Accumulator - Include containers array in RunSummary sent to receiver Co-Authored-By: Claude Opus 4.5 --- internal/summary/accumulator.go | 51 ++++++- internal/summary/accumulator_test.go | 198 +++++++++++++++++++++++++++ internal/summary/types.go | 33 +++-- 3 files changed, 267 insertions(+), 15 deletions(-) diff --git a/internal/summary/accumulator.go b/internal/summary/accumulator.go index bc3b1de..c8f10f1 100644 --- a/internal/summary/accumulator.go +++ b/internal/summary/accumulator.go @@ -10,6 +10,12 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics" ) +// containerAccumulator tracks metrics for a single container +type containerAccumulator struct { + cpuCoresValues []float64 + memoryBytesValues []float64 +} + // Accumulator collects metric samples and computes run-level statistics type Accumulator struct { topN int @@ -17,6 +23,7 @@ type Accumulator struct { memBytesValues []float64 memPctValues []float64 processPeaks map[string]*ProcessPeak + containers map[string]*containerAccumulator startTime time.Time endTime time.Time sampleCount int @@ -27,6 +34,7 @@ func NewAccumulator(topN int) *Accumulator { return &Accumulator{ topN: topN, processPeaks: make(map[string]*ProcessPeak), + containers: make(map[string]*containerAccumulator), } } @@ -48,6 +56,17 @@ func (a *Accumulator) Add(m *metrics.SystemMetrics) { for _, p := range m.TopMemory { a.updateProcessPeak(p) } + + // Track per-container metrics + for name, cgroup := range m.Cgroups { + ca, ok := a.containers[name] + if !ok { + ca = 
&containerAccumulator{} + a.containers[name] = ca + } + ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores) + ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes)) + } } // Summarize computes and returns the run summary, or nil if no samples were added @@ -66,15 +85,33 @@ func (a *Accumulator) Summarize() *RunSummary { MemUsedPercent: computeStats(a.memPctValues), TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }), TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }), + Containers: a.containerSummaries(), } } +// containerSummaries computes summaries for all tracked containers +func (a *Accumulator) containerSummaries() []ContainerSummary { + summaries := make([]ContainerSummary, 0, len(a.containers)) + for name, ca := range a.containers { + summaries = append(summaries, ContainerSummary{ + Name: name, + CPUCores: computeStats(ca.cpuCoresValues), + MemoryBytes: computeStats(ca.memoryBytesValues), + }) + } + // Sort by name for consistent output + sort.Slice(summaries, func(i, j int) bool { + return summaries[i].Name < summaries[j].Name + }) + return summaries +} + // SampleCount returns the number of samples added func (a *Accumulator) SampleCount() int { return a.sampleCount } -// computeStats calculates peak, average, and P95 from a sorted copy of the values +// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values func computeStats(values []float64) StatSummary { n := len(values) if n == 0 { @@ -90,15 +127,21 @@ func computeStats(values []float64) StatSummary { sum += v } - p95Index := int(float64(n-1) * 0.95) - return StatSummary{ Peak: sorted[n-1], + P99: sorted[percentileIndex(n, 0.99)], + P95: sorted[percentileIndex(n, 0.95)], + P75: sorted[percentileIndex(n, 0.75)], + P50: sorted[percentileIndex(n, 0.50)], Avg: sum / float64(n), - P95: sorted[p95Index], } } +// percentileIndex 
returns the index for the given percentile (0.0-1.0) +func percentileIndex(n int, percentile float64) int { + return int(float64(n-1) * percentile) +} + // updateProcessPeak merges a process observation into the peak tracking map func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) { key := fmt.Sprintf("%d:%s", p.PID, p.Name) diff --git a/internal/summary/accumulator_test.go b/internal/summary/accumulator_test.go index 76f6d47..203fe53 100644 --- a/internal/summary/accumulator_test.go +++ b/internal/summary/accumulator_test.go @@ -333,3 +333,201 @@ func TestAccumulator_Duration(t *testing.T) { t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds) } } + +func TestAccumulator_AllPercentiles(t *testing.T) { + acc := NewAccumulator(5) + // 20 values: 1, 2, 3, ..., 20 + for i := 1; i <= 20; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i)}, + Memory: metrics.MemoryMetrics{}, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Peak = 20 + if s.CPUTotal.Peak != 20 { + t.Errorf("CPU peak: got %f, want 20", s.CPUTotal.Peak) + } + // P99: index=int(19*0.99)=int(18.81)=18, value=19 + if s.CPUTotal.P99 != 19 { + t.Errorf("CPU p99: got %f, want 19", s.CPUTotal.P99) + } + // P95: index=int(19*0.95)=int(18.05)=18, value=19 + if s.CPUTotal.P95 != 19 { + t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95) + } + // P75: index=int(19*0.75)=int(14.25)=14, value=15 + if s.CPUTotal.P75 != 15 { + t.Errorf("CPU p75: got %f, want 15", s.CPUTotal.P75) + } + // P50: index=int(19*0.50)=int(9.5)=9, value=10 + if s.CPUTotal.P50 != 10 { + t.Errorf("CPU p50: got %f, want 10", s.CPUTotal.P50) + } + // Avg = (1+2+...+20)/20 = 210/20 = 10.5 + if s.CPUTotal.Avg != 10.5 { + t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg) + } +} + +func TestAccumulator_ContainerMetrics(t *testing.T) { + acc := NewAccumulator(5) + + // Add 
samples with container metrics + for i := 1; i <= 5; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: float64(i * 10)}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i)}, + Memory: metrics.CgroupMemoryMetrics{ + TotalRSSBytes: uint64(i * 1000), + }, + }, + "container-b": { + Name: "container-b", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i * 2)}, + Memory: metrics.CgroupMemoryMetrics{ + TotalRSSBytes: uint64(i * 2000), + }, + }, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should have 2 containers + if len(s.Containers) != 2 { + t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) + } + + // Containers should be sorted by name + if s.Containers[0].Name != "container-a" { + t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name) + } + if s.Containers[1].Name != "container-b" { + t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name) + } + + // Container A: CPU cores [1,2,3,4,5], peak=5, avg=3 + containerA := s.Containers[0] + if containerA.CPUCores.Peak != 5 { + t.Errorf("container-a CPUCores.Peak: got %f, want 5", containerA.CPUCores.Peak) + } + if containerA.CPUCores.Avg != 3 { + t.Errorf("container-a CPUCores.Avg: got %f, want 3", containerA.CPUCores.Avg) + } + // Memory bytes [1000,2000,3000,4000,5000], peak=5000, avg=3000 + if containerA.MemoryBytes.Peak != 5000 { + t.Errorf("container-a MemoryBytes.Peak: got %f, want 5000", containerA.MemoryBytes.Peak) + } + if containerA.MemoryBytes.Avg != 3000 { + t.Errorf("container-a MemoryBytes.Avg: got %f, want 3000", containerA.MemoryBytes.Avg) + } + + // Container B: CPU cores [2,4,6,8,10], peak=10, avg=6 + containerB := s.Containers[1] + if containerB.CPUCores.Peak != 10 { + 
t.Errorf("container-b CPUCores.Peak: got %f, want 10", containerB.CPUCores.Peak) + } + if containerB.CPUCores.Avg != 6 { + t.Errorf("container-b CPUCores.Avg: got %f, want 6", containerB.CPUCores.Avg) + } +} + +func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) { + acc := NewAccumulator(5) + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC), + CPU: metrics.CPUMetrics{TotalPercent: 50}, + Memory: metrics.MemoryMetrics{}, + Cgroups: nil, // No containers + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if len(s.Containers) != 0 { + t.Errorf("Containers length: got %d, want 0", len(s.Containers)) + } +} + +func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { + acc := NewAccumulator(5) + + // First sample: only container-a + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: metrics.CgroupCPUMetrics{UsedCores: 1}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, + }, + }, + }) + + // Second sample: both containers + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "container-a": { + Name: "container-a", + CPU: metrics.CgroupCPUMetrics{UsedCores: 2}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000}, + }, + "container-b": { + Name: "container-b", + CPU: metrics.CgroupCPUMetrics{UsedCores: 5}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000}, + }, + }, + }) + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + // Should have 2 containers + if len(s.Containers) != 2 { + t.Fatalf("Containers length: got %d, want 2", len(s.Containers)) + } + + // Container 
A: 2 samples [1,2] + containerA := s.Containers[0] + if containerA.CPUCores.Peak != 2 { + t.Errorf("container-a CPUCores.Peak: got %f, want 2", containerA.CPUCores.Peak) + } + if containerA.CPUCores.Avg != 1.5 { + t.Errorf("container-a CPUCores.Avg: got %f, want 1.5", containerA.CPUCores.Avg) + } + + // Container B: 1 sample [5] + containerB := s.Containers[1] + if containerB.CPUCores.Peak != 5 { + t.Errorf("container-b CPUCores.Peak: got %f, want 5", containerB.CPUCores.Peak) + } + if containerB.CPUCores.Avg != 5 { + t.Errorf("container-b CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg) + } +} diff --git a/internal/summary/types.go b/internal/summary/types.go index ab5ecea..dad0b01 100644 --- a/internal/summary/types.go +++ b/internal/summary/types.go @@ -4,11 +4,14 @@ package summary import "time" -// StatSummary holds peak, average, and P95 for a metric across the run +// StatSummary holds peak, percentiles, and average for a metric across the run type StatSummary struct { Peak float64 `json:"peak"` - Avg float64 `json:"avg"` + P99 float64 `json:"p99"` P95 float64 `json:"p95"` + P75 float64 `json:"p75"` + P50 float64 `json:"p50"` + Avg float64 `json:"avg"` } // ProcessPeak holds the peak CPU and memory observed for a single process @@ -19,15 +22,23 @@ type ProcessPeak struct { PeakMem uint64 `json:"peak_mem_rss_bytes"` } +// ContainerSummary holds statistics for a single container across the run +type ContainerSummary struct { + Name string `json:"name"` + CPUCores StatSummary `json:"cpu_cores"` + MemoryBytes StatSummary `json:"memory_bytes"` +} + // RunSummary holds the complete summary of a collection run type RunSummary struct { - StartTime time.Time `json:"start_time"` - EndTime time.Time `json:"end_time"` - DurationSeconds float64 `json:"duration_seconds"` - SampleCount int `json:"sample_count"` - CPUTotal StatSummary `json:"cpu_total_percent"` - MemUsedBytes StatSummary `json:"mem_used_bytes"` - MemUsedPercent StatSummary `json:"mem_used_percent"` - 
TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"` - TopMemProcesses []ProcessPeak `json:"top_mem_processes"` + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + DurationSeconds float64 `json:"duration_seconds"` + SampleCount int `json:"sample_count"` + CPUTotal StatSummary `json:"cpu_total_percent"` + MemUsedBytes StatSummary `json:"mem_used_bytes"` + MemUsedPercent StatSummary `json:"mem_used_percent"` + TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"` + TopMemProcesses []ProcessPeak `json:"top_mem_processes"` + Containers []ContainerSummary `json:"containers"` } From 5b983692c88acb950bd203e8de5ca89b8e9b8810 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:11:22 +0100 Subject: [PATCH 11/21] test: add stress test with receiver integration Docker Compose setup that: - Runs metrics receiver with SQLite storage - Spawns CPU and memory stress workloads using stress-ng - Uses shared PID namespace (pid: service:cpu-stress) for proper isolation - Collector gathers metrics and pushes summary on shutdown Known issue: Container CPU summary may show overflow values on first sample due to delta calculation - to be fixed in accumulator. Co-Authored-By: Claude Opus 4.5 --- test/docker/docker-compose-stress.yaml | 125 +++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 test/docker/docker-compose-stress.yaml diff --git a/test/docker/docker-compose-stress.yaml b/test/docker/docker-compose-stress.yaml new file mode 100644 index 0000000..89ce7e6 --- /dev/null +++ b/test/docker/docker-compose-stress.yaml @@ -0,0 +1,125 @@ +# Docker Compose stress test with receiver +# Run with: docker compose -f test/docker/docker-compose-stress.yaml up +# +# This test: +# 1. Starts the metrics receiver +# 2. Runs heavy CPU/memory workloads in multiple containers with shared PID namespace +# 3. 
Collector gathers metrics and pushes summary to receiver on shutdown +# +# To trigger the push, stop the collector gracefully: +# docker compose -f test/docker/docker-compose-stress.yaml stop collector + +services: + # Metrics receiver - stores summaries in SQLite + receiver: + build: + context: ../.. + dockerfile: Dockerfile + target: receiver + ports: + - "9080:8080" + environment: + - DB_PATH=/data/metrics.db + volumes: + - receiver-data:/data + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"] + interval: 5s + timeout: 3s + retries: 3 + + # Heavy CPU workload - uses stress-ng (owns the PID namespace) + cpu-stress: + image: alexeiled/stress-ng:latest + command: + - --cpu + - "2" + - --timeout + - "300s" + - --metrics-brief + deploy: + resources: + limits: + cpus: "1.0" + memory: 128M + # This container owns the PID namespace + + # Memory-intensive workload - shares PID namespace with cpu-stress + mem-stress: + image: alexeiled/stress-ng:latest + command: + - --vm + - "2" + - --vm-bytes + - "64M" + - --timeout + - "300s" + - --metrics-brief + deploy: + resources: + limits: + cpus: "0.5" + memory: 256M + pid: "service:cpu-stress" + depends_on: + - cpu-stress + + # IO workload - continuous disk writes + io-stress: + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "IO stress started" + # 'dd' will be our identifiable process + while true; do + dd if=/dev/zero of=/tmp/testfile bs=1M count=100 2>/dev/null + rm -f /tmp/testfile + done + deploy: + resources: + limits: + cpus: "0.5" + memory: 128M + pid: "service:cpu-stress" + depends_on: + - cpu-stress + + # Resource collector - pushes to receiver on shutdown + collector: + build: + context: ../.. 
+ dockerfile: Dockerfile + target: collector + command: + - --interval=2s + - --top=10 + - --log-format=json + - --push-endpoint=http://receiver:8080/api/v1/metrics + environment: + # Execution context for the receiver + GITHUB_REPOSITORY_OWNER: "test-org" + GITHUB_REPOSITORY: "test-org/stress-test" + GITHUB_WORKFLOW: "stress-test-workflow" + GITHUB_JOB: "heavy-workload" + GITHUB_RUN_ID: "stress-run-001" + # Cgroup configuration + # stress-ng-cpu is the worker process name for CPU stress + # stress-ng-vm is the worker process name for memory stress + CGROUP_PROCESS_MAP: '{"stress-ng-cpu":"cpu-stress","stress-ng-vm":"mem-stress","dd":"io-stress","resource-collec":"collector"}' + CGROUP_LIMITS: '{"cpu-stress":{"cpu":"1","memory":"128Mi"},"mem-stress":{"cpu":"500m","memory":"256Mi"},"io-stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"200m","memory":"64Mi"}}' + deploy: + resources: + limits: + cpus: "0.2" + memory: 64M + pid: "service:cpu-stress" + depends_on: + receiver: + condition: service_healthy + cpu-stress: + condition: service_started + +volumes: + receiver-data: From 0af8c28bc23ef9c38ee472aa7e0be8b6f5a631d2 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:15:30 +0100 Subject: [PATCH 12/21] fix(aggregator): prevent CPU cores overflow when processes restart Guard against unsigned integer underflow in cgroup CPU calculation. When processes exit and new ones start, totalTicks can be less than the previous value, causing the subtraction to wrap around to a huge positive number. Now checks totalTicks >= prev before calculating delta, treating process churn as 0 CPU usage for that sample. 
Co-Authored-By: Claude Opus 4.5 --- internal/metrics/aggregator.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/internal/metrics/aggregator.go b/internal/metrics/aggregator.go index 78cfc94..d8a22b9 100644 --- a/internal/metrics/aggregator.go +++ b/internal/metrics/aggregator.go @@ -348,9 +348,14 @@ func (a *Aggregator) calculateCgroupMetrics( // Calculate CPU cores used from delta usedCores := 0.0 if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 { - deltaTicks := totalTicks - prev - // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) - usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) + // Guard against underflow: if processes exited and new ones started, + // totalTicks could be less than prev. In that case, skip this sample. + if totalTicks >= prev { + deltaTicks := totalTicks - prev + // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) + usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) + } + // If totalTicks < prev, usedCores stays 0 for this sample } a.prevCgroupCPU[containerName] = totalTicks From eb01c1c842d16619d063d7182d382a171e34da27 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:22:27 +0100 Subject: [PATCH 13/21] fix(receiver): return Payload as JSON object instead of string Changed the API response to embed Payload as a JSON object using json.RawMessage instead of returning it as a JSON-encoded string inside the JSON response. Added MetricResponse type with ToResponse converter method. 
Co-Authored-By: Claude Opus 4.5 --- internal/receiver/handler.go | 8 +++++++- internal/receiver/handler_test.go | 4 ++-- internal/receiver/store.go | 26 ++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go index a00e414..66905f9 100644 --- a/internal/receiver/handler.go +++ b/internal/receiver/handler.go @@ -74,8 +74,14 @@ func (h *Handler) handleGetByWorkflowJob(w http.ResponseWriter, r *http.Request) return } + // Convert to response type with Payload as JSON object + response := make([]MetricResponse, len(metrics)) + for i, m := range metrics { + response[i] = m.ToResponse() + } + w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(metrics) + _ = json.NewEncoder(w).Encode(response) } func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request) { diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go index fdd3894..df589df 100644 --- a/internal/receiver/handler_test.go +++ b/internal/receiver/handler_test.go @@ -125,7 +125,7 @@ func TestHandler_GetByWorkflowJob(t *testing.T) { t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) } - var metrics []Metric + var metrics []MetricResponse if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { t.Fatalf("failed to decode response: %v", err) } @@ -149,7 +149,7 @@ func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { t.Errorf("status = %d, want %d", rec.Code, http.StatusOK) } - var metrics []Metric + var metrics []MetricResponse if err := json.NewDecoder(rec.Body).Decode(&metrics); err != nil { t.Fatalf("failed to decode response: %v", err) } diff --git a/internal/receiver/store.go b/internal/receiver/store.go index 4ae970a..1b934de 100644 --- a/internal/receiver/store.go +++ b/internal/receiver/store.go @@ -24,6 +24,32 @@ type Metric struct { Payload string `gorm:"type:text;not null"` // JSON-encoded RunSummary } +// MetricResponse is 
the API response type with Payload as embedded JSON object +type MetricResponse struct { + ID uint `json:"id"` + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` + RunID string `json:"run_id"` + ReceivedAt time.Time `json:"received_at"` + Payload json.RawMessage `json:"payload"` +} + +// ToResponse converts a Metric to a MetricResponse with Payload as JSON object +func (m *Metric) ToResponse() MetricResponse { + return MetricResponse{ + ID: m.ID, + Organization: m.Organization, + Repository: m.Repository, + Workflow: m.Workflow, + Job: m.Job, + RunID: m.RunID, + ReceivedAt: m.ReceivedAt, + Payload: json.RawMessage(m.Payload), + } +} + // Store handles SQLite storage for metrics using GORM type Store struct { db *gorm.DB From d624d46822aa1462d01ef07103f93779aea3d1f6 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:32:55 +0100 Subject: [PATCH 14/21] test(stress): increase cpu-stress to 3 workers with 2.0 CPU limit Co-Authored-By: Claude Opus 4.5 --- test/docker/docker-compose-stress.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/docker/docker-compose-stress.yaml b/test/docker/docker-compose-stress.yaml index 89ce7e6..0d560a7 100644 --- a/test/docker/docker-compose-stress.yaml +++ b/test/docker/docker-compose-stress.yaml @@ -33,14 +33,14 @@ services: image: alexeiled/stress-ng:latest command: - --cpu - - "2" + - "3" - --timeout - "300s" - --metrics-brief deploy: resources: limits: - cpus: "1.0" + cpus: "2.0" memory: 128M # This container owns the PID namespace From 52f1b8b64d50383ec6108b3446b9934b8b6cbd68 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:38:22 +0100 Subject: [PATCH 15/21] docs: add README with API documentation Document receiver API endpoints and response format. 
Clarify that container cpu_cores values are in number of cores (not percentage), while system/process CPU values are percentages. Co-Authored-By: Claude Opus 4.5 --- README.md | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..e5ceea1 --- /dev/null +++ b/README.md @@ -0,0 +1,188 @@ +# Forgejo Runner Resource Collector + +A lightweight resource metrics collector designed to run alongside CI/CD workloads in shared PID namespace environments. It collects CPU and memory metrics, groups them by container/cgroup, and pushes summaries to a receiver service. + +## Components + +- **Collector**: Gathers system and per-process metrics at regular intervals, computes run-level statistics, and pushes a summary on shutdown. +- **Receiver**: HTTP service that stores metric summaries in SQLite and provides a query API. + +## Receiver API + +### POST `/api/v1/metrics` + +Receives metric summaries from collectors. + +### GET `/api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` + +Retrieves all stored metrics for a specific workflow and job. + +**Example request:** +``` +GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build +``` + +**Example response:** +```json +[ + { + "id": 1, + "organization": "my-org", + "repository": "my-org/my-repo", + "workflow": "ci.yml", + "job": "build", + "run_id": "run-123", + "received_at": "2026-02-06T14:30:23.056Z", + "payload": { + "start_time": "2026-02-06T14:30:02.185Z", + "end_time": "2026-02-06T14:30:22.190Z", + "duration_seconds": 20.0, + "sample_count": 11, + "cpu_total_percent": { ... }, + "mem_used_bytes": { ... }, + "mem_used_percent": { ... }, + "top_cpu_processes": [ ... ], + "top_mem_processes": [ ... 
], + "containers": [ + { + "name": "runner", + "cpu_cores": { + "peak": 2.007, + "p99": 2.005, + "p95": 2.004, + "p75": 1.997, + "p50": 1.817, + "avg": 1.5 + }, + "memory_bytes": { + "peak": 18567168, + "p99": 18567168, + "p95": 18567168, + "p75": 18567168, + "p50": 18567168, + "avg": 18567168 + } + } + ] + } + } +] +``` + +## Understanding the Metrics + +### CPU Metrics + +There are two different CPU metric formats in the response: + +#### 1. System and Process CPU: Percentage (`cpu_total_percent`, `peak_cpu_percent`) + +These values represent **CPU utilization as a percentage** of total available CPU time. + +- `cpu_total_percent`: Overall system CPU usage (0-100%) +- `peak_cpu_percent` (in process lists): Per-process CPU usage where 100% = 1 full CPU core + +#### 2. Container CPU: Cores (`cpu_cores`) + +**Important:** The `cpu_cores` field in container metrics represents **CPU usage in number of cores**, not percentage. + +| Value | Meaning | +|-------|---------| +| `0.5` | Half a CPU core | +| `1.0` | One full CPU core | +| `2.0` | Two CPU cores | +| `2.5` | Two and a half CPU cores | + +This allows direct comparison with Kubernetes resource limits (e.g., `cpu: "2"` or `cpu: "500m"`). + +**Example interpretation:** +```json +{ + "name": "runner", + "cpu_cores": { + "peak": 2.007, + "avg": 1.5 + } +} +``` +This means the "runner" container used a peak of ~2 CPU cores and averaged 1.5 CPU cores during the run. 
+ +### Memory Metrics + +All memory values are in **bytes**: + +- `mem_used_bytes`: System memory usage +- `memory_bytes` (in containers): Container RSS memory usage +- `peak_mem_rss_bytes` (in processes): Process RSS memory + +### Statistical Fields + +Each metric includes percentile statistics across all samples: + +| Field | Description | +|-------|-------------| +| `peak` | Maximum value observed | +| `p99` | 99th percentile | +| `p95` | 95th percentile | +| `p75` | 75th percentile | +| `p50` | Median (50th percentile) | +| `avg` | Arithmetic mean | + +## Configuration + +### Collector Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | +| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | +| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | +| `GITHUB_JOB` | Job name | `build` | +| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | +| `CGROUP_PROCESS_MAP` | JSON mapping process names to container names | `{"node":"runner"}` | +| `CGROUP_LIMITS` | JSON with CPU/memory limits per container | See below | + +**CGROUP_LIMITS example:** +```json +{ + "runner": {"cpu": "2", "memory": "1Gi"}, + "sidecar": {"cpu": "500m", "memory": "256Mi"} +} +``` + +CPU values support Kubernetes notation: `"2"` = 2 cores, `"500m"` = 0.5 cores. + +Memory values support: `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal). + +### Receiver Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `DB_PATH` | SQLite database path | `metrics.db` | +| `LISTEN_ADDR` | HTTP listen address | `:8080` | + +## Running + +### Docker Compose (stress test example) + +```bash +docker compose -f test/docker/docker-compose-stress.yaml up -d +# Wait for metrics collection... 
+docker compose -f test/docker/docker-compose-stress.yaml stop collector +# Query results +curl http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload +``` + +### Local Development + +```bash +# Build +go build -o collector ./cmd/collector +go build -o receiver ./cmd/receiver + +# Run receiver +./receiver --listen=:8080 --db=metrics.db + +# Run collector +./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics +``` From fd02242d5e32dbbcd4b732f403c6f3a2e0c7f6e3 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:40:38 +0100 Subject: [PATCH 16/21] docs: move background documentation to docs/background Co-Authored-By: Claude Opus 4.5 --- .../background/identify-process-cgroup-by-pid.md | 0 .../background/proc-stat-in-containers.md | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename identify-process-cgroup-by-pid.md => docs/background/identify-process-cgroup-by-pid.md (100%) rename proc-stat-in-containers.md => docs/background/proc-stat-in-containers.md (100%) diff --git a/identify-process-cgroup-by-pid.md b/docs/background/identify-process-cgroup-by-pid.md similarity index 100% rename from identify-process-cgroup-by-pid.md rename to docs/background/identify-process-cgroup-by-pid.md diff --git a/proc-stat-in-containers.md b/docs/background/proc-stat-in-containers.md similarity index 100% rename from proc-stat-in-containers.md rename to docs/background/proc-stat-in-containers.md From addab99e5dfd7920b4bac6d3779f01b7d8e4237e Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Fri, 6 Feb 2026 15:42:43 +0100 Subject: [PATCH 17/21] docs: add CLAUDE.md with development guidance Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f9d6972 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,94 
@@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build and Development Commands + +```bash +# Build +make build # Build the collector binary +go build -o collector ./cmd/collector +go build -o receiver ./cmd/receiver + +# Test +make test # Run all tests +go test -v ./... # Run all tests with verbose output +go test -v ./internal/collector/... # Run tests for a specific package +make test-coverage # Run tests with coverage report + +# Code Quality +make fmt # Format code +make vet # Run go vet +make lint # Run golangci-lint (v2.6.2) +make all # Format, vet, lint, and build + +# Git Hooks +make install-hooks # Install pre-commit and commit-msg hooks +``` + +## Architecture Overview + +This is a Go metrics collector designed for CI/CD environments with shared PID namespaces. It consists of two binaries: + +### Collector (`cmd/collector`) +Runs alongside CI workloads, periodically reads `/proc` filesystem, and pushes a summary to the receiver on shutdown (SIGINT/SIGTERM). + +**Data Flow:** +1. `metrics.Aggregator` reads `/proc` to collect CPU/memory for all processes +2. `collector.Collector` orchestrates collection at intervals and writes to output +3. `summary.Accumulator` tracks samples across the run, computing peak/avg/percentiles +4. On shutdown, `summary.PushClient` sends the summary to the receiver HTTP endpoint + +### Receiver (`cmd/receiver`) +HTTP service that stores metric summaries in SQLite (via GORM) and provides a query API. 
+ +**Key Endpoints:** +- `POST /api/v1/metrics` - Receive metrics from collectors +- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` - Query stored metrics + +### Internal Packages + +| Package | Purpose | +|---------|---------| +| `internal/collector` | Orchestrates collection loop, handles shutdown | +| `internal/metrics` | Aggregates system/process metrics from /proc | +| `internal/proc` | Low-level /proc parsing (stat, status, cgroup) | +| `internal/cgroup` | Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP env vars | +| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | +| `internal/receiver` | HTTP handlers and SQLite store | +| `internal/output` | Metrics output formatting (JSON/text) | + +### Container Metrics + +The collector groups processes by container using cgroup paths. Configuration via environment variables: +- `CGROUP_PROCESS_MAP`: JSON mapping process names to container names (e.g., `{"node":"runner"}`) +- `CGROUP_LIMITS`: JSON with CPU/memory limits per container for percentage calculations + +CPU values in container metrics are reported as **cores** (not percentage), enabling direct comparison with Kubernetes resource limits. 
+ +## Commit Message Convention + +Uses conventional commits enforced by git hook: +``` +<type>(<scope>)?: <subject> +``` + +Types: `feat`, `fix`, `chore`, `docs`, `style`, `refactor`, `perf`, `test`, `build`, `ci` + +Examples: +- `feat: add user authentication` +- `fix(collector): handle nil cgroup paths` +- `feat!: breaking change in API` + +## Running Locally + +```bash +# Run receiver +./receiver --addr=:8080 --db=metrics.db + +# Run collector with push endpoint +./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics + +# Docker Compose stress test +docker compose -f test/docker/docker-compose-stress.yaml up -d +``` From 2a4c64bfb0a7ca5585f097da549c6061418897a0 Mon Sep 17 00:00:00 2001 From: Martin McCaffery Date: Mon, 9 Feb 2026 17:24:57 +0100 Subject: [PATCH 18/21] Update and extend documentation --- README.md | 251 +++++---- .../identify-process-cgroup-by-pid.md | 52 +- docs/background/proc-stat-in-containers.md | 480 +++--------------- 3 files changed, 212 insertions(+), 571 deletions(-) diff --git a/README.md b/README.md index e5ceea1..196adf2 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,100 @@ # Forgejo Runner Resource Collector -A lightweight resource metrics collector designed to run alongside CI/CD workloads in shared PID namespace environments. It collects CPU and memory metrics, groups them by container/cgroup, and pushes summaries to a receiver service. +A lightweight metrics collector for CI/CD workloads in shared PID namespace environments. Reads `/proc` to collect CPU and memory metrics, groups them by container/cgroup, and pushes run summaries to a receiver service for storage and querying. -## Components +## Architecture -- **Collector**: Gathers system and per-process metrics at regular intervals, computes run-level statistics, and pushes a summary on shutdown. -- **Receiver**: HTTP service that stores metric summaries in SQLite and provides a query API.
+The system has two independent binaries: -## Receiver API +``` +┌─────────────────────────────────────────────┐ ┌──────────────────────────┐ +│ CI/CD Pod (shared PID namespace) │ │ Receiver Service │ +│ │ │ │ +│ ┌───────────┐ ┌────────┐ ┌───────────┐ │ │ POST /api/v1/metrics │ +│ │ collector │ │ runner │ │ sidecar │ │ │ │ │ +│ │ │ │ │ │ │ │ │ ▼ │ +│ │ reads │ │ │ │ │ │ push │ ┌────────────┐ │ +│ │ /proc for │ │ │ │ │ │──────▶│ │ SQLite │ │ +│ │ all PIDs │ │ │ │ │ │ │ └────────────┘ │ +│ └───────────┘ └────────┘ └───────────┘ │ │ │ │ +│ │ │ ▼ │ +└─────────────────────────────────────────────┘ │ GET /api/v1/metrics/... │ + └──────────────────────────┘ +``` -### POST `/api/v1/metrics` +### Collector -Receives metric summaries from collectors. +Runs as a sidecar alongside CI workloads. On a configurable interval, it reads `/proc` to collect CPU and memory for all visible processes, groups them by container using cgroup paths, and accumulates samples. On shutdown (SIGINT/SIGTERM), it computes run-level statistics (peak, avg, percentiles) and pushes a single summary to the receiver. -### GET `/api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` +```bash +./collector --interval=2s --top=10 --push-endpoint=http://receiver:8080/api/v1/metrics +``` -Retrieves all stored metrics for a specific workflow and job. 
+**Flags:** `--interval`, `--proc-path`, `--log-level`, `--log-format`, `--top`, `--push-endpoint` + +**Environment variables:** + +| Variable | Description | Example | +|----------|-------------|---------| +| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | +| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | +| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | +| `GITHUB_JOB` | Job name | `build` | +| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | +| `CGROUP_PROCESS_MAP` | JSON: process name → container name | `{"node":"runner"}` | +| `CGROUP_LIMITS` | JSON: per-container CPU/memory limits | See below | + +**CGROUP_LIMITS example:** +```json +{ + "runner": {"cpu": "2", "memory": "1Gi"}, + "sidecar": {"cpu": "500m", "memory": "256Mi"} +} +``` +CPU supports Kubernetes notation (`"2"` = 2 cores, `"500m"` = 0.5 cores). Memory supports `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal). + +### Receiver + +HTTP service that stores metric summaries in SQLite (via GORM) and exposes a query API. + +```bash +./receiver --addr=:8080 --db=metrics.db +``` + +| Variable | Description | Default | +|----------|-------------|---------| +| `DB_PATH` | SQLite database path | `metrics.db` | +| `LISTEN_ADDR` | HTTP listen address | `:8080` | + +**Endpoints:** + +- `POST /api/v1/metrics` — receive and store a metric summary +- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics + +## How Metrics Are Collected + +The collector reads `/proc/[pid]/stat` for every visible process to get CPU ticks (`utime` + `stime`) and `/proc/[pid]/status` for memory (RSS). It takes two samples per interval and computes the delta to derive CPU usage rates. + +Processes are grouped into containers by reading `/proc/[pid]/cgroup` and matching cgroup paths against the `CGROUP_PROCESS_MAP`. 
This is necessary because in shared PID namespace pods, `/proc/stat` only shows host-level aggregates — per-container metrics must be built up from individual process data. + +Container CPU is reported in **cores** (not percentage) for direct comparison with Kubernetes resource limits. System-level CPU is reported as a percentage (0-100%). + +Over the course of a run, the `summary.Accumulator` tracks every sample and on shutdown computes: + +| Stat | Description | +|------|-------------| +| `peak` | Maximum observed value | +| `p99`, `p95`, `p75`, `p50` | Percentiles across all samples | +| `avg` | Arithmetic mean | + +These stats are computed for CPU, memory, and per-container metrics. + +## API Response -**Example request:** ``` GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build ``` -**Example response:** ```json [ { @@ -38,151 +110,66 @@ GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build "end_time": "2026-02-06T14:30:22.190Z", "duration_seconds": 20.0, "sample_count": 11, - "cpu_total_percent": { ... }, - "mem_used_bytes": { ... }, - "mem_used_percent": { ... }, - "top_cpu_processes": [ ... ], - "top_mem_processes": [ ... ], + "cpu_total_percent": { "peak": ..., "avg": ..., "p50": ... }, + "mem_used_bytes": { "peak": ..., "avg": ... }, "containers": [ { "name": "runner", - "cpu_cores": { - "peak": 2.007, - "p99": 2.005, - "p95": 2.004, - "p75": 1.997, - "p50": 1.817, - "avg": 1.5 - }, - "memory_bytes": { - "peak": 18567168, - "p99": 18567168, - "p95": 18567168, - "p75": 18567168, - "p50": 18567168, - "avg": 18567168 - } + "cpu_cores": { "peak": 2.007, "avg": 1.5, "p50": 1.817, "p95": 2.004 }, + "memory_bytes": { "peak": 18567168, "avg": 18567168 } } - ] + ], + "top_cpu_processes": [ ... ], + "top_mem_processes": [ ... ] } } ] ``` -## Understanding the Metrics +**CPU metric distinction:** +- `cpu_total_percent` — system-wide, 0-100% +- `cpu_cores` (containers) — cores used (e.g. 
`2.0` = two full cores) +- `peak_cpu_percent` (processes) — per-process, where 100% = 1 core -### CPU Metrics - -There are two different CPU metric formats in the response: - -#### 1. System and Process CPU: Percentage (`cpu_total_percent`, `peak_cpu_percent`) - -These values represent **CPU utilization as a percentage** of total available CPU time. - -- `cpu_total_percent`: Overall system CPU usage (0-100%) -- `peak_cpu_percent` (in process lists): Per-process CPU usage where 100% = 1 full CPU core - -#### 2. Container CPU: Cores (`cpu_cores`) - -**Important:** The `cpu_cores` field in container metrics represents **CPU usage in number of cores**, not percentage. - -| Value | Meaning | -|-------|---------| -| `0.5` | Half a CPU core | -| `1.0` | One full CPU core | -| `2.0` | Two CPU cores | -| `2.5` | Two and a half CPU cores | - -This allows direct comparison with Kubernetes resource limits (e.g., `cpu: "2"` or `cpu: "500m"`). - -**Example interpretation:** -```json -{ - "name": "runner", - "cpu_cores": { - "peak": 2.007, - "avg": 1.5 - } -} -``` -This means the "runner" container used a peak of ~2 CPU cores and averaged 1.5 CPU cores during the run. 
- -### Memory Metrics - -All memory values are in **bytes**: - -- `mem_used_bytes`: System memory usage -- `memory_bytes` (in containers): Container RSS memory usage -- `peak_mem_rss_bytes` (in processes): Process RSS memory - -### Statistical Fields - -Each metric includes percentile statistics across all samples: - -| Field | Description | -|-------|-------------| -| `peak` | Maximum value observed | -| `p99` | 99th percentile | -| `p95` | 95th percentile | -| `p75` | 75th percentile | -| `p50` | Median (50th percentile) | -| `avg` | Arithmetic mean | - -## Configuration - -### Collector Environment Variables - -| Variable | Description | Example | -|----------|-------------|---------| -| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | -| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | -| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | -| `GITHUB_JOB` | Job name | `build` | -| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | -| `CGROUP_PROCESS_MAP` | JSON mapping process names to container names | `{"node":"runner"}` | -| `CGROUP_LIMITS` | JSON with CPU/memory limits per container | See below | - -**CGROUP_LIMITS example:** -```json -{ - "runner": {"cpu": "2", "memory": "1Gi"}, - "sidecar": {"cpu": "500m", "memory": "256Mi"} -} -``` - -CPU values support Kubernetes notation: `"2"` = 2 cores, `"500m"` = 0.5 cores. - -Memory values support: `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal). - -### Receiver Environment Variables - -| Variable | Description | Default | -|----------|-------------|---------| -| `DB_PATH` | SQLite database path | `metrics.db` | -| `LISTEN_ADDR` | HTTP listen address | `:8080` | +All memory values are in **bytes**. ## Running -### Docker Compose (stress test example) +### Docker Compose ```bash docker compose -f test/docker/docker-compose-stress.yaml up -d -# Wait for metrics collection... 
+# Wait for collection, then trigger shutdown summary: docker compose -f test/docker/docker-compose-stress.yaml stop collector -# Query results +# Query results: curl http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload ``` -### Local Development +### Local ```bash -# Build go build -o collector ./cmd/collector go build -o receiver ./cmd/receiver -# Run receiver -./receiver --listen=:8080 --db=metrics.db - -# Run collector +./receiver --addr=:8080 --db=metrics.db ./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics ``` + +## Internal Packages + +| Package | Purpose | +|---------|---------| +| `internal/proc` | Low-level `/proc` parsing (stat, status, cgroup) | +| `internal/metrics` | Aggregates process metrics from `/proc` into system/container views | +| `internal/cgroup` | Parses `CGROUP_PROCESS_MAP` and `CGROUP_LIMITS` env vars | +| `internal/collector` | Orchestrates the collection loop and shutdown | +| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | +| `internal/receiver` | HTTP handlers and SQLite store | +| `internal/output` | Metrics output formatting (JSON/text) | + +## Background + +Technical reference on the Linux primitives this project builds on: + +- [Identifying process cgroups by PID](docs/background/identify-process-cgroup-by-pid.md) — how to read `/proc//cgroup` to determine which container a process belongs to +- [/proc/stat behavior in containers](docs/background/proc-stat-in-containers.md) — why `/proc/stat` shows host-level data in containers, and how to aggregate per-process stats from `/proc/[pid]/stat` instead, including CPU tick conversion and cgroup limit handling diff --git a/docs/background/identify-process-cgroup-by-pid.md b/docs/background/identify-process-cgroup-by-pid.md index 062abba..7d4734d 100644 --- a/docs/background/identify-process-cgroup-by-pid.md +++ b/docs/background/identify-process-cgroup-by-pid.md 
@@ -1,26 +1,14 @@ -# Identify Process Cgroup by PID +# Identifying a Process's Cgroup by PID -## TL;DR +Read `/proc//cgroup` to find which cgroup (and therefore which container) a process belongs to. -You can identify a process's cgroup by reading `/proc//cgroup`. - -## Outcome - -Multiple methods to find which cgroup a process belongs to using its PID. - -## Conversation - -**Question:** Can I identify the process's cgroup by having its PID? - -**Answer:** Yes, here are the main methods: - -### Method 1: Read `/proc//cgroup` +## /proc/PID/cgroup ```bash cat /proc//cgroup ``` -This shows all cgroup controllers the process belongs to. For example: +Shows all cgroup controllers the process belongs to: ``` 12:blkio:/user.slice 11:memory:/user.slice/user-1000.slice @@ -28,35 +16,23 @@ This shows all cgroup controllers the process belongs to. For example: 0::/user.slice/user-1000.slice/session-1.scope ``` -### Method 2: Using `ps` with format options +On cgroup v2, the path after `::` is the cgroup path under `/sys/fs/cgroup/`. + +## Other Methods ```bash +# ps format options ps -o pid,cgroup -p -``` -### Method 3: Using `systemd-cgls` - -If you're on a systemd system: -```bash +# systemd systems systemd-cgls --unit -# or to see the whole tree -systemd-cgls +systemd-cgls # whole tree ``` -### Method 4: Check cgroup v2 unified hierarchy - -On cgroup v2 systems, check: -```bash -cat /proc//cgroup -# Output will look like: 0::/user.slice/user-1000.slice/session-1.scope -``` - -The path after `::` is the cgroup path under `/sys/fs/cgroup/`. 
- -### Quick one-liner +## Quick One-Liners ```bash -cat /proc/self/cgroup # For current shell -cat /proc/$$/cgroup # Also current shell -cat /proc/1234/cgroup # For PID 1234 +cat /proc/self/cgroup # current shell +cat /proc/$$/cgroup # also current shell +cat /proc/1234/cgroup # specific PID ``` diff --git a/docs/background/proc-stat-in-containers.md b/docs/background/proc-stat-in-containers.md index 7c33575..9884b2a 100644 --- a/docs/background/proc-stat-in-containers.md +++ b/docs/background/proc-stat-in-containers.md @@ -1,64 +1,36 @@ -# /proc/stat Behavior in Containerized Environments +# /proc/stat behavior in containerised environments -## TL;DR +`/proc/stat` in containers shows **host-level** statistics, not container-specific data. To get container-aware CPU metrics when processes span multiple cgroups (e.g., sidecars sharing a PID namespace), aggregate `/proc/[pid]/stat` for all visible processes and use cgroup limits from `/sys/fs/cgroup` for normalization. -`/proc/stat` in containers shows host-level statistics, not container-specific data. To get accurate container CPU metrics when cgroups aren't an option (e.g., sidecars sharing PID namespace with different cgroups), aggregate `/proc/[pid]/stat` for all visible processes and calculate percentages using cgroup limits from `/sys/fs/cgroup`. +## Why /proc/stat is wrong in containers -## Outcome +`/proc/stat` reports host-wide values (CPU times, context switches, boot time, process count) because `/proc` is mounted from the host kernel, which has no namespace awareness for these metrics. -- Explained why `/proc/stat` doesn't provide container-aware metrics -- Provided methods to aggregate process CPU usage from `/proc/[pid]/stat` -- Showed how to determine parent/child process relationships -- Demonstrated converting raw ticks to CPU percentages -- Covered how to read cgroup CPU limits for other containers +This means: +- Tools reading `/proc/stat` (top, htop, etc.) 
show **host** CPU usage, not container usage +- Cgroup CPU limits (e.g., 2 CPUs) are not reflected — all host CPUs are visible +- In shared environments, containers see each other's aggregate impact -## Conversation - -### How does /proc/stat behave in containerized environments? - -By default, `/proc/stat` inside a container shows **host system statistics**, not container-specific data. This includes: - -- **CPU times** (user, system, idle, etc.) - reflects all CPUs on the host -- **Context switches** - host-wide count -- **Boot time** - host's boot time -- **Processes created** - host-wide total - -#### Why This Happens - -The `/proc` filesystem is mounted from the host kernel. Containers share the host kernel, and `/proc/stat` is a kernel-level interface that doesn't have namespace awareness for most of its metrics. - -#### Practical Implications - -1. **CPU usage calculations are wrong** - Tools like `top`, `htop`, or custom monitoring that read `/proc/stat` will show host CPU usage, not container usage -2. **Resource limits aren't reflected** - Even if you limit a container to 2 CPUs via cgroups, `/proc/stat` shows all host CPUs -3. 
**Multi-tenant confusion** - In shared environments, containers see each other's aggregate impact - -#### Solutions +### Alternatives | Approach | Description | |----------|-------------| -| **cgroups** | Read from `/sys/fs/cgroup/cpu/` for container-specific CPU accounting | -| **LXCFS** | A FUSE filesystem that provides container-aware `/proc` files | -| **Container runtimes** | Some runtimes (like Kata) use VMs with isolated kernels | -| **Metrics APIs** | Use Docker/Kubernetes APIs instead of `/proc/stat` | - -#### Example: Getting Real Container CPU +| **cgroups** | Read `/sys/fs/cgroup/cpu/` for container-specific CPU accounting | +| **LXCFS** | FUSE filesystem providing container-aware `/proc` files | +| **Container runtimes** | Some (like Kata) use VMs with isolated kernels | +| **Metrics APIs** | Docker/Kubernetes APIs instead of `/proc/stat` | ```bash -# Instead of /proc/stat, use cgroups v2: +# cgroups v2: cat /sys/fs/cgroup/cpu.stat -# Or cgroups v1: +# cgroups v1: cat /sys/fs/cgroup/cpu/cpuacct.usage ``` ---- +## Aggregating per-Process CPU from /proc/[pid]/stat -### Can I aggregate process usage to get the usage of the container? - -Yes, you can aggregate process usage, but there are caveats. - -#### Aggregating from /proc/[pid]/stat +When cgroup-level reads aren't an option (sidecars sharing PID namespace with different cgroups), aggregate individual process stats: ```bash # Fields 14 (utime) and 15 (stime) in /proc/[pid]/stat @@ -67,137 +39,76 @@ for pid in /proc/[0-9]*; do done | awk '{sum += $1} END {print sum}' ``` -#### Problems with This Approach +### Caveats -1. **Race conditions** - Processes can spawn/die between reads, causing inconsistent totals -2. **Short-lived processes** - Processes that start and exit between samples are missed entirely -3. **Zombie/exited processes** - Their CPU time may not be captured -4. **Overhead** - Scanning all PIDs repeatedly is expensive -5. 
**Namespace visibility** - You only see processes in your PID namespace, but that's actually what you want in a container -6. **Children accounting** - When a process exits, its CPU time gets added to the parent's `cutime`/`cstime` fields, which can cause double-counting if not handled carefully +1. **Race conditions** — processes can spawn/die between reads +2. **Short-lived processes** — missed if they start and exit between samples +3. **Zombie/exited processes** — their CPU time may not be captured +4. **Overhead** — scanning all PIDs repeatedly is expensive +5. **Namespace visibility** — you only see processes in your PID namespace (which is what you want) +6. **Children accounting** — when a process exits, its CPU time is added to the parent's `cutime`/`cstime`, risking double-counting -#### Better Alternative: cgroups +Cgroups handle these edge cases natively, but **cannot be used when sidecars share the PID namespace with different cgroups** — in that case, per-process aggregation is the best option. -Cgroups track CPU usage at the group level and handle all these edge cases: +## Parent/Child Process Relationships + +Field 4 in `/proc/[pid]/stat` is the PPID (parent process ID): ```bash -# cgroups v2 (modern) -cat /sys/fs/cgroup/cpu.stat -# Shows: usage_usec, user_usec, system_usec - -# cgroups v1 -cat /sys/fs/cgroup/cpu/cpuacct.usage # total nanoseconds -cat /sys/fs/cgroup/cpu/cpuacct.stat # user/system jiffies +awk '{print $4}' /proc/1234/stat # PPID from stat +grep PPid /proc/1234/status # more readable ``` -**However**, cgroups cannot be used when sidecars share the PID namespace but have different cgroups. In that case, aggregating `/proc/[pid]/stat` is the best option. - ---- - -### How do I determine child/parent processes? 
- -#### From /proc/[pid]/stat - -Field 4 is the **PPID** (parent process ID): - -```bash -# Get PPID for a specific process -awk '{print $4}' /proc/1234/stat - -# List all processes with their parents -for stat in /proc/[0-9]*/stat; do - pid="${stat#/proc/}" - pid="${pid%/stat}" - ppid=$(awk '{print $4}' "$stat" 2>/dev/null) && \ - echo "$pid -> parent: $ppid" -done -``` - -#### From /proc/[pid]/status (more readable) - -```bash -grep PPid /proc/1234/status -# PPid: 1 -``` - -#### Building a Process Tree +### Building a Process Tree ```bash #!/bin/bash -declare -A parent_of -declare -A children_of +declare -A parent_of children_of for stat in /proc/[0-9]*/stat; do if read -r line < "$stat" 2>/dev/null; then - pid="${stat#/proc/}" - pid="${pid%/stat}" - - # Extract PPID (field 4, but handle comm with spaces) - rest="${line##*) }" - read -ra fields <<< "$rest" + pid="${stat#/proc/}"; pid="${pid%/stat}" + rest="${line##*) }"; read -ra fields <<< "$rest" ppid="${fields[1]}" # 4th field overall = index 1 after state - parent_of[$pid]=$ppid children_of[$ppid]+="$pid " fi done -# Print tree from PID 1 print_tree() { - local pid=$1 - local indent=$2 + local pid=$1 indent=$2 echo "${indent}${pid}" - for child in ${children_of[$pid]}; do - print_tree "$child" " $indent" - done + for child in ${children_of[$pid]}; do print_tree "$child" " $indent"; done } - print_tree 1 "" ``` -#### For CPU Aggregation: Handling cutime/cstime +### Avoiding Double-Counting with cutime/cstime -To properly handle `cutime`/`cstime` without double-counting: +Only sum `utime` + `stime` per process. The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on — those children no longer exist in `/proc`, so their time is only accessible via the parent. 
```bash #!/bin/bash -declare -A parent_of declare -A utime stime -# First pass: collect all data for stat in /proc/[0-9]*/stat; do if read -r line < "$stat" 2>/dev/null; then - pid="${stat#/proc/}" - pid="${pid%/stat}" - rest="${line##*) }" - read -ra f <<< "$rest" - - parent_of[$pid]="${f[1]}" - utime[$pid]="${f[11]}" - stime[$pid]="${f[12]}" - # cutime=${f[13]} cstime=${f[14]} - don't sum these + pid="${stat#/proc/}"; pid="${pid%/stat}" + rest="${line##*) }"; read -ra f <<< "$rest" + utime[$pid]="${f[11]}"; stime[$pid]="${f[12]}" + # cutime=${f[13]} cstime=${f[14]} — don't sum these fi done -# Sum only utime/stime (not cutime/cstime) total=0 -for pid in "${!utime[@]}"; do - ((total += utime[$pid] + stime[$pid])) -done - +for pid in "${!utime[@]}"; do ((total += utime[$pid] + stime[$pid])); done echo "Total CPU ticks: $total" echo "Seconds: $(echo "scale=2; $total / $(getconf CLK_TCK)" | bc)" ``` -**Key insight:** Only sum `utime` + `stime` for each process. The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on—those children no longer exist in `/proc`, so their time is only accessible via the parent's `cutime`/`cstime`. +## Converting Ticks to CPU Percentages ---- - -### How do I convert utime/stime to percentages? - -You need **two samples** over a time interval. CPU percentage is a rate, not an absolute value. - -#### The Formula +CPU percentage is a rate — you need **two samples** over a time interval. 
``` CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100 @@ -205,20 +116,17 @@ CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100 - `delta_ticks` = difference in (utime + stime) between samples - `CLK_TCK` = ticks per second (usually 100, get via `getconf CLK_TCK`) -- `num_cpus` = number of CPUs (omit for single-CPU percentage) - -#### Two Common Percentage Styles +- `num_cpus` = number of CPUs (omit for per-core percentage) | Style | Formula | Example | |-------|---------|---------| | **Normalized** (0-100%) | `delta / (elapsed * CLK_TCK * num_cpus) * 100` | 50% = half of total capacity | | **Cores-style** (0-N*100%) | `delta / (elapsed * CLK_TCK) * 100` | 200% = 2 full cores busy | -#### Practical Script +### Sampling Script ```bash #!/bin/bash - CLK_TCK=$(getconf CLK_TCK) NUM_CPUS=$(nproc) @@ -226,267 +134,94 @@ get_total_ticks() { local total=0 for stat in /proc/[0-9]*/stat; do if read -r line < "$stat" 2>/dev/null; then - rest="${line##*) }" - read -ra f <<< "$rest" - ((total += f[11] + f[12])) # utime + stime + rest="${line##*) }"; read -ra f <<< "$rest" + ((total += f[11] + f[12])) fi done echo "$total" } -# First sample -ticks1=$(get_total_ticks) -time1=$(date +%s.%N) - -# Wait +ticks1=$(get_total_ticks); time1=$(date +%s.%N) sleep 1 +ticks2=$(get_total_ticks); time2=$(date +%s.%N) -# Second sample -ticks2=$(get_total_ticks) -time2=$(date +%s.%N) - -# Calculate delta_ticks=$((ticks2 - ticks1)) elapsed=$(echo "$time2 - $time1" | bc) -# Percentage of total CPU capacity (all cores) pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK * $NUM_CPUS)) * 100" | bc) echo "CPU usage: ${pct}% of ${NUM_CPUS} CPUs" -# Percentage as "CPU cores used" (like top's 200% for 2 full cores) cores_pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK)) * 100" | bc) echo "CPU usage: ${cores_pct}% (cores-style)" ``` -#### Continuous Monitoring +## Respecting Cgroup CPU Limits + +The above calculations use `nproc`, which returns the 
**host** CPU count. If a container is limited to 2 CPUs on an 8-CPU host, `nproc` returns 8 and the percentage is misleading. + +### Reading Effective CPU Limit ```bash #!/bin/bash -CLK_TCK=$(getconf CLK_TCK) -NUM_CPUS=$(nproc) -INTERVAL=1 - -get_total_ticks() { - local total=0 - for stat in /proc/[0-9]*/stat; do - read -r line < "$stat" 2>/dev/null || continue - rest="${line##*) }" - read -ra f <<< "$rest" - ((total += f[11] + f[12])) - done - echo "$total" -} - -prev_ticks=$(get_total_ticks) -prev_time=$(date +%s.%N) - -while true; do - sleep "$INTERVAL" - - curr_ticks=$(get_total_ticks) - curr_time=$(date +%s.%N) - - delta=$((curr_ticks - prev_ticks)) - elapsed=$(echo "$curr_time - $prev_time" | bc) - - pct=$(echo "scale=1; $delta / ($elapsed * $CLK_TCK * $NUM_CPUS) * 100" | bc) - printf "\rCPU: %5.1f%%" "$pct" - - prev_ticks=$curr_ticks - prev_time=$curr_time -done -``` - ---- - -### Does this calculation respect cgroup limits? - -No, it doesn't. The calculation uses `nproc` which typically returns **host CPU count**, not your cgroup limit. 
- -#### The Problem - -If your container is limited to 2 CPUs on an 8-CPU host: -- `nproc` returns 8 -- Your calculation shows 25% when you're actually at 100% of your limit - -#### Getting Effective CPU Limit - -**cgroups v2:** - -```bash -# cpu.max contains: $quota $period (in microseconds) -# "max 100000" means unlimited -read quota period < /sys/fs/cgroup/cpu.max -if [[ "$quota" == "max" ]]; then - effective_cpus=$(nproc) -else - effective_cpus=$(echo "scale=2; $quota / $period" | bc) -fi -echo "Effective CPUs: $effective_cpus" -``` - -**cgroups v1:** - -```bash -quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us) -period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us) - -if [[ "$quota" == "-1" ]]; then - effective_cpus=$(nproc) -else - effective_cpus=$(echo "scale=2; $quota / $period" | bc) -fi -``` - -**Also Check cpuset Limits:** - -```bash -# cgroups v2 -cpuset=$(cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null) - -# cgroups v1 -cpuset=$(cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null) - -# Parse "0-3,5,7" format to count CPUs -count_cpus() { - local count=0 - IFS=',' read -ra ranges <<< "$1" - for range in "${ranges[@]}"; do - if [[ "$range" == *-* ]]; then - start="${range%-*}" - end="${range#*-}" - ((count += end - start + 1)) - else - ((count++)) - fi - done - echo "$count" -} -``` - -#### Updated Script Respecting Limits - -```bash -#!/bin/bash -CLK_TCK=$(getconf CLK_TCK) - get_effective_cpus() { - # Try cgroups v2 first + # cgroups v2 if [[ -f /sys/fs/cgroup/cpu.max ]]; then read quota period < /sys/fs/cgroup/cpu.max - if [[ "$quota" != "max" ]]; then - echo "scale=2; $quota / $period" | bc - return - fi + [[ "$quota" != "max" ]] && echo "scale=2; $quota / $period" | bc && return fi - - # Try cgroups v1 + # cgroups v1 if [[ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ]]; then quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us) period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us) - if [[ "$quota" != "-1" ]]; then - echo "scale=2; $quota / $period" | bc - return - 
fi + [[ "$quota" != "-1" ]] && echo "scale=2; $quota / $period" | bc && return fi - - # Fallback to nproc - nproc + nproc # fallback } - -EFFECTIVE_CPUS=$(get_effective_cpus) -echo "Effective CPU limit: $EFFECTIVE_CPUS" - -# ... rest of your sampling logic using $EFFECTIVE_CPUS ``` -#### Shared PID Namespace Consideration +Also check cpuset limits (`cpuset.cpus.effective` for v2, `cpuset/cpuset.cpus` for v1) which restrict which physical CPUs are available. -When sidecars share PID namespace but have different cgroups: -- Each container may have different CPU limits -- You're aggregating processes across those limits -- There's no single "correct" limit to use +### Shared PID Namespace Complication + +When sidecars share a PID namespace but have different cgroups, there's no single "correct" CPU limit for normalization. Options: -**Options:** 1. **Use host CPU count** — percentage of total host capacity 2. **Sum the limits** — if you know each sidecar's cgroup, sum their quotas -3. **Report in cores** — skip normalization, just show `1.5 cores used` instead of percentage +3. **Report in cores** — skip normalization, show `1.5 cores used` instead of percentage ---- +## Reading Cgroup Limits for Other Containers -### Can I get the cgroup limit for another cgroup? - -Yes, if you have visibility into the cgroup filesystem. - -#### 1. Find a Process's Cgroup - -Every process exposes its cgroup membership: +Every process exposes its cgroup membership via `/proc//cgroup`. If the cgroup filesystem is mounted, you can read any container's limits: ```bash -# Get cgroup for any PID you can see -cat /proc/1234/cgroup - -# cgroups v2 output: -# 0::/kubepods/pod123/container456 - -# cgroups v1 output: -# 12:cpu,cpuacct:/docker/abc123 -# 11:memory:/docker/abc123 -# ... -``` - -#### 2. 
Read That Cgroup's Limits - -If the cgroup filesystem is mounted and accessible: - -```bash -#!/bin/bash - get_cgroup_cpu_limit() { local pid=$1 - - # Get cgroup path for this PID - cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) # v2 - + # cgroups v2 + cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) if [[ -n "$cgroup_path" ]]; then - # cgroups v2 limit_file="/sys/fs/cgroup${cgroup_path}/cpu.max" if [[ -r "$limit_file" ]]; then read quota period < "$limit_file" - if [[ "$quota" == "max" ]]; then - echo "unlimited" - else - echo "scale=2; $quota / $period" | bc - fi + [[ "$quota" == "max" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc return fi fi - - # Try cgroups v1 + # cgroups v1 cgroup_path=$(grep -oP 'cpu.*:\K.*' /proc/$pid/cgroup 2>/dev/null) if [[ -n "$cgroup_path" ]]; then quota_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_quota_us" period_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_period_us" if [[ -r "$quota_file" ]]; then - quota=$(cat "$quota_file") - period=$(cat "$period_file") - if [[ "$quota" == "-1" ]]; then - echo "unlimited" - else - echo "scale=2; $quota / $period" | bc - fi + quota=$(cat "$quota_file"); period=$(cat "$period_file") + [[ "$quota" == "-1" ]] && echo "unlimited" || echo "scale=2; $quota / $period" | bc return fi fi - echo "unknown" } - -# Example: get limit for PID 1234 -get_cgroup_cpu_limit 1234 ``` -#### 3. Mount Visibility Requirements +### Mount Visibility | Scenario | Can Read Other Cgroups? | |----------|------------------------| @@ -495,66 +230,9 @@ get_cgroup_cpu_limit 1234 | `/sys/fs/cgroup` mounted read-only from host | Yes (common in Kubernetes) | | Only own cgroup subtree mounted | No | -Check what's visible: +### Fallbacks When Cgroups Aren't Accessible -```bash -mount | grep cgroup -ls /sys/fs/cgroup/ -``` - -#### 4. 
Full Solution: Aggregate by Cgroup - -```bash -#!/bin/bash -CLK_TCK=$(getconf CLK_TCK) - -declare -A cgroup_ticks -declare -A cgroup_limit - -for stat in /proc/[0-9]*/stat; do - pid="${stat#/proc/}" - pid="${pid%/stat}" - - # Get cgroup for this process - cg=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null) - [[ -z "$cg" ]] && continue - - # Get CPU ticks - if read -r line < "$stat" 2>/dev/null; then - rest="${line##*) }" - read -ra f <<< "$rest" - ticks=$((f[11] + f[12])) - - ((cgroup_ticks[$cg] += ticks)) - - # Cache the limit (only look up once per cgroup) - if [[ -z "${cgroup_limit[$cg]}" ]]; then - limit_file="/sys/fs/cgroup${cg}/cpu.max" - if [[ -r "$limit_file" ]]; then - read quota period < "$limit_file" - if [[ "$quota" == "max" ]]; then - cgroup_limit[$cg]="unlimited" - else - cgroup_limit[$cg]=$(echo "scale=2; $quota / $period" | bc) - fi - else - cgroup_limit[$cg]="unknown" - fi - fi - fi -done - -echo "Ticks by cgroup:" -for cg in "${!cgroup_ticks[@]}"; do - echo " $cg: ${cgroup_ticks[$cg]} ticks (limit: ${cgroup_limit[$cg]} CPUs)" -done -``` - -#### If You Can't Access Other Cgroups - -Fallback options: - -1. **Mount the cgroup fs** — add volume mount for `/sys/fs/cgroup:ro` -2. **Use a sidecar with access** — one privileged container does the monitoring +1. **Mount the cgroup fs** — volume mount `/sys/fs/cgroup:ro` +2. **Use a sidecar with access** — one privileged container does monitoring 3. **Accept "unknown" limits** — report raw ticks/cores instead of percentages -4. **Kubernetes Downward API** — inject limits as env vars (only for your own container though) +4. 
**Kubernetes Downward API** — inject limits as env vars (own container only) From 90c89583a0dc22e802741bb5cb23e65ef7ecdb7a Mon Sep 17 00:00:00 2001 From: Martin McCaffery Date: Mon, 9 Feb 2026 17:37:53 +0100 Subject: [PATCH 19/21] Ignore data with no delta (first datapoint or underflow) --- internal/metrics/aggregator.go | 4 +- internal/metrics/types.go | 1 + internal/summary/accumulator.go | 5 +- internal/summary/accumulator_test.go | 77 +++++++++++++++++++++++++--- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/internal/metrics/aggregator.go b/internal/metrics/aggregator.go index d8a22b9..2e7c18e 100644 --- a/internal/metrics/aggregator.go +++ b/internal/metrics/aggregator.go @@ -347,6 +347,7 @@ func (a *Aggregator) calculateCgroupMetrics( // Calculate CPU cores used from delta usedCores := 0.0 + hasDelta := false if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 { // Guard against underflow: if processes exited and new ones started, // totalTicks could be less than prev. In that case, skip this sample. 
@@ -354,8 +355,8 @@ func (a *Aggregator) calculateCgroupMetrics( deltaTicks := totalTicks - prev // Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK) usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks)) + hasDelta = true } - // If totalTicks < prev, usedCores stays 0 for this sample } a.prevCgroupCPU[containerName] = totalTicks @@ -384,6 +385,7 @@ func (a *Aggregator) calculateCgroupMetrics( UsedCores: usedCores, UsedPercent: cpuPercent, LimitCores: limitCores, + HasDelta: hasDelta, }, Memory: CgroupMemoryMetrics{ TotalRSSBytes: totalRSS, diff --git a/internal/metrics/types.go b/internal/metrics/types.go index b88d4c8..9aa9cb2 100644 --- a/internal/metrics/types.go +++ b/internal/metrics/types.go @@ -51,6 +51,7 @@ type CgroupCPUMetrics struct { UsedCores float64 `json:"used_cores"` UsedPercent float64 `json:"used_percent,omitempty"` LimitCores float64 `json:"limit_cores,omitempty"` + HasDelta bool `json:"-"` // true when a valid delta could be computed } // CgroupMemoryMetrics holds memory metrics for a single cgroup/container diff --git a/internal/summary/accumulator.go b/internal/summary/accumulator.go index c8f10f1..6ffa8cd 100644 --- a/internal/summary/accumulator.go +++ b/internal/summary/accumulator.go @@ -64,7 +64,10 @@ func (a *Accumulator) Add(m *metrics.SystemMetrics) { ca = &containerAccumulator{} a.containers[name] = ca } - ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores) + // Only record CPU when a valid delta was computed (skip first sample and underflow) + if cgroup.CPU.HasDelta { + ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores) + } ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes)) } } diff --git a/internal/summary/accumulator_test.go b/internal/summary/accumulator_test.go index 203fe53..cdda597 100644 --- a/internal/summary/accumulator_test.go +++ b/internal/summary/accumulator_test.go @@ -379,7 +379,7 @@ func 
TestAccumulator_AllPercentiles(t *testing.T) { func TestAccumulator_ContainerMetrics(t *testing.T) { acc := NewAccumulator(5) - // Add samples with container metrics + // Add samples with container metrics (HasDelta=true to indicate valid CPU measurements) for i := 1; i <= 5; i++ { acc.Add(&metrics.SystemMetrics{ Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), @@ -388,14 +388,14 @@ func TestAccumulator_ContainerMetrics(t *testing.T) { Cgroups: map[string]*metrics.CgroupMetrics{ "container-a": { Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i)}, + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, Memory: metrics.CgroupMemoryMetrics{ TotalRSSBytes: uint64(i * 1000), }, }, "container-b": { Name: "container-b", - CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i * 2)}, + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i * 2), HasDelta: true}, Memory: metrics.CgroupMemoryMetrics{ TotalRSSBytes: uint64(i * 2000), }, @@ -478,7 +478,7 @@ func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { Cgroups: map[string]*metrics.CgroupMetrics{ "container-a": { Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: 1}, + CPU: metrics.CgroupCPUMetrics{UsedCores: 1, HasDelta: true}, Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, }, }, @@ -492,12 +492,12 @@ func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { Cgroups: map[string]*metrics.CgroupMetrics{ "container-a": { Name: "container-a", - CPU: metrics.CgroupCPUMetrics{UsedCores: 2}, + CPU: metrics.CgroupCPUMetrics{UsedCores: 2, HasDelta: true}, Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000}, }, "container-b": { Name: "container-b", - CPU: metrics.CgroupCPUMetrics{UsedCores: 5}, + CPU: metrics.CgroupCPUMetrics{UsedCores: 5, HasDelta: true}, Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000}, }, }, @@ -531,3 +531,68 @@ func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) { t.Errorf("container-b 
CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg) } } + +func TestAccumulator_ContainerMetrics_InvalidDeltaExcluded(t *testing.T) { + acc := NewAccumulator(5) + + // Sample 1: no valid CPU delta (first sample / underflow) — should be excluded from CPU stats + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "runner": { + Name: "runner", + CPU: metrics.CgroupCPUMetrics{UsedCores: 0, HasDelta: false}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000}, + }, + }, + }) + + // Samples 2-4: valid deltas + for i := 2; i <= 4; i++ { + acc.Add(&metrics.SystemMetrics{ + Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC), + CPU: metrics.CPUMetrics{}, + Memory: metrics.MemoryMetrics{}, + Cgroups: map[string]*metrics.CgroupMetrics{ + "runner": { + Name: "runner", + CPU: metrics.CgroupCPUMetrics{UsedCores: float64(i), HasDelta: true}, + Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: uint64(i * 1000)}, + }, + }, + }) + } + + s := acc.Summarize() + if s == nil { + t.Fatal("expected non-nil summary") + } + + if len(s.Containers) != 1 { + t.Fatalf("Containers length: got %d, want 1", len(s.Containers)) + } + + runner := s.Containers[0] + // CPU should only include samples 2,3,4 (values 2,3,4) — NOT the invalid zero + // Peak=4, Avg=3, P50=3 + if runner.CPUCores.Peak != 4 { + t.Errorf("CPUCores.Peak: got %f, want 4", runner.CPUCores.Peak) + } + if runner.CPUCores.Avg != 3 { + t.Errorf("CPUCores.Avg: got %f, want 3", runner.CPUCores.Avg) + } + if runner.CPUCores.P50 != 3 { + t.Errorf("CPUCores.P50: got %f, want 3", runner.CPUCores.P50) + } + + // Memory should include all 4 samples (memory is always valid) + // Values: 1000, 2000, 3000, 4000 + if runner.MemoryBytes.Peak != 4000 { + t.Errorf("MemoryBytes.Peak: got %f, want 4000", runner.MemoryBytes.Peak) + } + if runner.MemoryBytes.Avg != 2500 { + 
t.Errorf("MemoryBytes.Avg: got %f, want 2500", runner.MemoryBytes.Avg) + } +} From 042ce77ddc07df823d416362ba9ddf4a6ba7f028 Mon Sep 17 00:00:00 2001 From: Manuel Ganter Date: Tue, 10 Feb 2026 12:02:15 +0100 Subject: [PATCH 20/21] feat: added pre-shared-key for read endpoints --- README.md | 77 +++++++++++++++--------- cmd/receiver/main.go | 3 +- internal/integration/integration_test.go | 2 +- internal/receiver/handler.go | 50 +++++++++++++-- internal/receiver/handler_test.go | 58 +++++++++++++++++- 5 files changed, 152 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 196adf2..b7c98d3 100644 --- a/README.md +++ b/README.md @@ -34,23 +34,25 @@ Runs as a sidecar alongside CI workloads. On a configurable interval, it reads ` **Environment variables:** -| Variable | Description | Example | -|----------|-------------|---------| -| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | -| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | -| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | -| `GITHUB_JOB` | Job name | `build` | -| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | -| `CGROUP_PROCESS_MAP` | JSON: process name → container name | `{"node":"runner"}` | -| `CGROUP_LIMITS` | JSON: per-container CPU/memory limits | See below | +| Variable | Description | Example | +| ------------------------- | ------------------------------------- | ------------------- | +| `GITHUB_REPOSITORY_OWNER` | Organization name | `my-org` | +| `GITHUB_REPOSITORY` | Full repository path | `my-org/my-repo` | +| `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | +| `GITHUB_JOB` | Job name | `build` | +| `GITHUB_RUN_ID` | Unique run identifier | `run-123` | +| `CGROUP_PROCESS_MAP` | JSON: process name → container name | `{"node":"runner"}` | +| `CGROUP_LIMITS` | JSON: per-container CPU/memory limits | See below | **CGROUP_LIMITS example:** + ```json { - "runner": {"cpu": "2", "memory": "1Gi"}, - "sidecar": {"cpu": "500m", "memory": "256Mi"} + 
  "runner": { "cpu": "2", "memory": "1Gi" },
+  "sidecar": { "cpu": "500m", "memory": "256Mi" }
 }
 ```
+
 CPU supports Kubernetes notation (`"2"` = 2 cores, `"500m"` = 0.5 cores). Memory supports `Ki`, `Mi`, `Gi`, `Ti` (binary) or `K`, `M`, `G`, `T` (decimal).
 
 ### Receiver
@@ -58,18 +60,32 @@
 HTTP service that stores metric summaries in SQLite (via GORM) and exposes a query API.
 
 ```bash
-./receiver --addr=:8080 --db=metrics.db
+./receiver --addr=:8080 --db=metrics.db --read-token=my-secret-token
 ```
 
-| Variable | Description | Default |
-|----------|-------------|---------|
-| `DB_PATH` | SQLite database path | `metrics.db` |
-| `LISTEN_ADDR` | HTTP listen address | `:8080` |
+**Flags:**
+
+| Flag           | Environment Variable  | Description                                    | Default      |
+| -------------- | --------------------- | ---------------------------------------------- | ------------ |
+| `--addr`       | —                     | HTTP listen address                            | `:8080`      |
+| `--db`         | —                     | SQLite database path                           | `metrics.db` |
+| `--read-token` | `RECEIVER_READ_TOKEN` | Pre-shared token for read endpoints (optional) | —            |
 
 **Endpoints:**
 - `POST /api/v1/metrics` — receive and store a metric summary
-- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics
+- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics (protected if `--read-token` is set)
+
+**Authentication:**
+
+When `--read-token` is configured, the GET endpoint requires a Bearer token:
+
+```bash
+curl http://localhost:8080/api/v1/metrics/repo/org/repo/workflow/job \
+  -H "Authorization: Bearer my-secret-token" #gitleaks:allow
+```
+
+If no token is configured, the endpoint remains open.
## How Metrics Are Collected @@ -81,11 +97,11 @@ Container CPU is reported in **cores** (not percentage) for direct comparison wi Over the course of a run, the `summary.Accumulator` tracks every sample and on shutdown computes: -| Stat | Description | -|------|-------------| -| `peak` | Maximum observed value | +| Stat | Description | +| -------------------------- | ------------------------------ | +| `peak` | Maximum observed value | | `p99`, `p95`, `p75`, `p50` | Percentiles across all samples | -| `avg` | Arithmetic mean | +| `avg` | Arithmetic mean | These stats are computed for CPU, memory, and per-container metrics. @@ -127,6 +143,7 @@ GET /api/v1/metrics/repo/my-org/my-repo/ci.yml/build ``` **CPU metric distinction:** + - `cpu_total_percent` — system-wide, 0-100% - `cpu_cores` (containers) — cores used (e.g. `2.0` = two full cores) - `peak_cpu_percent` (processes) — per-process, where 100% = 1 core @@ -157,15 +174,15 @@ go build -o receiver ./cmd/receiver ## Internal Packages -| Package | Purpose | -|---------|---------| -| `internal/proc` | Low-level `/proc` parsing (stat, status, cgroup) | -| `internal/metrics` | Aggregates process metrics from `/proc` into system/container views | -| `internal/cgroup` | Parses `CGROUP_PROCESS_MAP` and `CGROUP_LIMITS` env vars | -| `internal/collector` | Orchestrates the collection loop and shutdown | -| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | -| `internal/receiver` | HTTP handlers and SQLite store | -| `internal/output` | Metrics output formatting (JSON/text) | +| Package | Purpose | +| -------------------- | ------------------------------------------------------------------- | +| `internal/proc` | Low-level `/proc` parsing (stat, status, cgroup) | +| `internal/metrics` | Aggregates process metrics from `/proc` into system/container views | +| `internal/cgroup` | Parses `CGROUP_PROCESS_MAP` and `CGROUP_LIMITS` env vars | +| `internal/collector` | Orchestrates the collection loop and 
shutdown | +| `internal/summary` | Accumulates samples, computes stats, pushes to receiver | +| `internal/receiver` | HTTP handlers and SQLite store | +| `internal/output` | Metrics output formatting (JSON/text) | ## Background diff --git a/cmd/receiver/main.go b/cmd/receiver/main.go index a5fda16..21a59d4 100644 --- a/cmd/receiver/main.go +++ b/cmd/receiver/main.go @@ -22,6 +22,7 @@ const ( func main() { addr := flag.String("addr", defaultAddr, "HTTP listen address") dbPath := flag.String("db", defaultDBPath, "SQLite database path") + readToken := flag.String("read-token", os.Getenv("RECEIVER_READ_TOKEN"), "Pre-shared token for read endpoints (or set RECEIVER_READ_TOKEN)") flag.Parse() logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ @@ -35,7 +36,7 @@ func main() { } defer func() { _ = store.Close() }() - handler := receiver.NewHandler(store, logger) + handler := receiver.NewHandler(store, logger, *readToken) mux := http.NewServeMux() handler.RegisterRoutes(mux) diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index a7932c3..c9dc9a2 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -27,7 +27,7 @@ func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) t.Fatalf("NewStore() error = %v", err) } - handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil))) + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), "") mux := http.NewServeMux() handler.RegisterRoutes(mux) diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go index 66905f9..ef4bfae 100644 --- a/internal/receiver/handler.go +++ b/internal/receiver/handler.go @@ -3,20 +3,25 @@ package receiver import ( + "crypto/subtle" "encoding/json" "log/slog" "net/http" + "strings" ) // Handler handles HTTP requests for the metrics receiver type Handler struct { - store *Store - logger *slog.Logger 
+ store *Store + logger *slog.Logger + readToken string // Pre-shared token for read endpoints (empty = no auth required) } -// NewHandler creates a new HTTP handler with the given store -func NewHandler(store *Store, logger *slog.Logger) *Handler { - return &Handler{store: store, logger: logger} +// NewHandler creates a new HTTP handler with the given store. +// If readToken is non-empty, the GET metrics endpoint will require +// Bearer token authentication. +func NewHandler(store *Store, logger *slog.Logger, readToken string) *Handler { + return &Handler{store: store, logger: logger, readToken: readToken} } // RegisterRoutes registers all HTTP routes on the given mux @@ -26,6 +31,37 @@ func (h *Handler) RegisterRoutes(mux *http.ServeMux) { mux.HandleFunc("GET /health", h.handleHealth) } +// validateReadToken checks the Authorization header for a valid Bearer token. +// Returns true if authentication is disabled (empty readToken) or if the token matches. +func (h *Handler) validateReadToken(w http.ResponseWriter, r *http.Request) bool { + if h.readToken == "" { + return true + } + + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + h.logger.Warn("missing authorization header", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + const bearerPrefix = "Bearer " + if !strings.HasPrefix(authHeader, bearerPrefix) { + h.logger.Warn("invalid authorization format", slog.String("path", r.URL.Path)) + http.Error(w, "invalid authorization format", http.StatusUnauthorized) + return false + } + + token := strings.TrimPrefix(authHeader, bearerPrefix) + if subtle.ConstantTimeCompare([]byte(token), []byte(h.readToken)) != 1 { + h.logger.Warn("invalid token", slog.String("path", r.URL.Path)) + http.Error(w, "invalid token", http.StatusUnauthorized) + return false + } + + return true +} + func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { var payload 
MetricsPayload if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { @@ -58,6 +94,10 @@ func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { } func (h *Handler) handleGetByWorkflowJob(w http.ResponseWriter, r *http.Request) { + if !h.validateReadToken(w, r) { + return + } + org := r.PathValue("org") repo := r.PathValue("repo") workflow := r.PathValue("workflow") diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go index df589df..373e1ae 100644 --- a/internal/receiver/handler_test.go +++ b/internal/receiver/handler_test.go @@ -158,6 +158,48 @@ func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { } } +func TestHandler_GetByWorkflowJob_WithToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + // Save a metric + payload := &MetricsPayload{ + Execution: ExecutionContext{Organization: "org", Repository: "repo", Workflow: "ci.yml", Job: "build", RunID: "r1"}, + } + if _, err := h.store.SaveMetric(payload); err != nil { + t.Fatalf("SaveMetric() error = %v", err) + } + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + + tests := []struct { + name string + authHeader string + wantCode int + }{ + {"no auth header", "", http.StatusUnauthorized}, + {"wrong format", "Basic dXNlcjpwYXNz", http.StatusUnauthorized}, + {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, + {"valid token", "Bearer secret-token", http.StatusOK}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) + if tt.authHeader != "" { + req.Header.Set("Authorization", tt.authHeader) + } + rec := httptest.NewRecorder() + mux.ServeHTTP(rec, req) + + if rec.Code != tt.wantCode { + t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) + } + }) + } +} + func TestHandler_Health(t *testing.T) { h, cleanup := newTestHandler(t) defer cleanup() @@ -191,7 +233,21 @@ func 
newTestHandler(t *testing.T) (*Handler, func()) { } logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - handler := NewHandler(store, logger) + handler := NewHandler(store, logger, "") // no auth token for basic tests + + return handler, func() { _ = store.Close() } +} + +func newTestHandlerWithToken(t *testing.T, token string) (*Handler, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + handler := NewHandler(store, logger, token) return handler, func() { _ = store.Close() } } From d0dd209bc9fab39853ef8b27632a50049c57e796 Mon Sep 17 00:00:00 2001 From: Martin McCaffery Date: Wed, 11 Feb 2026 13:58:42 +0100 Subject: [PATCH 21/21] Add token-based authentication for receiver --- README.md | 87 ++++++-- cmd/collector/main.go | 3 +- cmd/receiver/main.go | 3 +- internal/integration/integration_test.go | 157 ++++++++++++++- internal/receiver/handler.go | 80 +++++++- internal/receiver/handler_test.go | 246 +++++++++++++++++++++-- internal/receiver/token.go | 25 +++ internal/receiver/token_test.go | 78 +++++++ internal/receiver/types.go | 13 ++ internal/summary/push.go | 10 +- internal/summary/push_test.go | 50 ++++- test/docker/docker-compose-stress.yaml | 14 +- 12 files changed, 705 insertions(+), 61 deletions(-) create mode 100644 internal/receiver/token.go create mode 100644 internal/receiver/token_test.go diff --git a/README.md b/README.md index b7c98d3..abd58e9 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Runs as a sidecar alongside CI workloads. 
On a configurable interval, it reads ` ./collector --interval=2s --top=10 --push-endpoint=http://receiver:8080/api/v1/metrics ``` -**Flags:** `--interval`, `--proc-path`, `--log-level`, `--log-format`, `--top`, `--push-endpoint` +**Flags:** `--interval`, `--proc-path`, `--log-level`, `--log-format`, `--top`, `--push-endpoint`, `--push-token` **Environment variables:** @@ -41,6 +41,7 @@ Runs as a sidecar alongside CI workloads. On a configurable interval, it reads ` | `GITHUB_WORKFLOW` | Workflow filename | `ci.yml` | | `GITHUB_JOB` | Job name | `build` | | `GITHUB_RUN_ID` | Unique run identifier | `run-123` | +| `COLLECTOR_PUSH_TOKEN` | Bearer token for push endpoint auth | — | | `CGROUP_PROCESS_MAP` | JSON: process name → container name | `{"node":"runner"}` | | `CGROUP_LIMITS` | JSON: per-container CPU/memory limits | See below | @@ -60,32 +61,52 @@ CPU supports Kubernetes notation (`"2"` = 2 cores, `"500m"` = 0.5 cores). Memory HTTP service that stores metric summaries in SQLite (via GORM) and exposes a query API. 
```bash -./receiver --addr=:8080 --db=metrics.db --read-token=my-secret-token +./receiver --addr=:8080 --db=metrics.db --read-token=my-secret-token --hmac-key=my-hmac-key ``` **Flags:** -| Flag | Environment Variable | Description | Default | -| -------------- | --------------------- | ---------------------------------------------- | ------------ | -| `--addr` | — | HTTP listen address | `:8080` | -| `--db` | — | SQLite database path | `metrics.db` | -| `--read-token` | `RECEIVER_READ_TOKEN` | Pre-shared token for read endpoints (optional) | — | +| Flag | Environment Variable | Description | Default | +| -------------- | --------------------- | ----------------------------------------------------- | ------------ | +| `--addr` | — | HTTP listen address | `:8080` | +| `--db` | — | SQLite database path | `metrics.db` | +| `--read-token` | `RECEIVER_READ_TOKEN` | Pre-shared token for read/admin endpoints (required) | — | +| `--hmac-key` | `RECEIVER_HMAC_KEY` | Secret key for push token generation/validation (required) | — | **Endpoints:** -- `POST /api/v1/metrics` — receive and store a metric summary -- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics (protected if `--read-token` is set) +- `POST /api/v1/metrics` — receive and store a metric summary (requires scoped push token) +- `POST /api/v1/token` — generate a scoped push token (requires read token auth) +- `GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}` — query stored metrics (requires read token auth) **Authentication:** -When `--read-token` is configured, the GET endpoint requires a Bearer token: +All metrics endpoints require authentication via `--read-token`: + +- The GET endpoint requires a Bearer token matching the read token +- The POST metrics endpoint requires a scoped push token (generated via `POST /api/v1/token`) +- The token endpoint itself requires the read token + +**Token flow:** ```bash +# 1. 
Admin generates a scoped push token using the read token +curl -X POST http://localhost:8080/api/v1/token \ + -H "Authorization: Bearer my-secret-token" \ + -H "Content-Type: application/json" \ + -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}' +# → {"token":"<hex-encoded-token>"} + +# 2. Collector uses the scoped token to push metrics +./collector --push-endpoint=http://localhost:8080/api/v1/metrics \ + --push-token=<token-from-step-1> + +# 3. Query metrics with the read token curl -H "Authorization: Bearer my-secret-token" \ #gitleaks:allow - http://localhost:8080/api/v1/metrics/repo/org/repo/workflow/job + http://localhost:8080/api/v1/metrics/repo/my-org/my-repo/ci.yml/build ``` -If no token is configured, the endpoint remains open. +Push tokens are HMAC-SHA256 digests derived from `--hmac-key` and the scope (org/repo/workflow/job). They are stateless — no database storage is needed. The HMAC key is separate from the read token so that compromising a push token does not expose the admin credential. ## How Metrics Are Collected @@ -155,11 +176,28 @@ All memory values are in **bytes**. ### Docker Compose ```bash -docker compose -f test/docker/docker-compose-stress.yaml up -d -# Wait for collection, then trigger shutdown summary: +# Start the receiver (builds image if needed): +docker compose -f test/docker/docker-compose-stress.yaml up -d --build receiver + +# Generate a scoped push token for the collector: +PUSH_TOKEN=$(curl -s -X POST http://localhost:9080/api/v1/token \ + -H "Authorization: Bearer dummyreadtoken" \ + -H "Content-Type: application/json" \ + -d '{"organization":"test-org","repository":"test-org/stress-test","workflow":"stress-test-workflow","job":"heavy-workload"}' \ + | jq -r .token) + +# Start the collector and stress workloads with the push token: +COLLECTOR_PUSH_TOKEN=$PUSH_TOKEN \ + docker compose -f test/docker/docker-compose-stress.yaml up -d --build collector + +# ... Wait for data collection ... 
+ +# Trigger shutdown summary: docker compose -f test/docker/docker-compose-stress.yaml stop collector -# Query results: -curl http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload + +# Query results with the read token: +curl -H "Authorization: Bearer dummyreadtoken" \ + http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/stress-test-workflow/heavy-workload ``` ### Local @@ -168,8 +206,21 @@ curl http://localhost:9080/api/v1/metrics/repo/test-org/test-org%2Fstress-test/s go build -o collector ./cmd/collector go build -o receiver ./cmd/receiver -./receiver --addr=:8080 --db=metrics.db -./collector --interval=2s --top=10 --push-endpoint=http://localhost:8080/api/v1/metrics +# Start receiver with both keys: +./receiver --addr=:8080 --db=metrics.db \ + --read-token=my-secret-token --hmac-key=my-hmac-key + +# Generate a scoped push token: +PUSH_TOKEN=$(curl -s -X POST http://localhost:8080/api/v1/token \ + -H "Authorization: Bearer my-secret-token" \ + -H "Content-Type: application/json" \ + -d '{"organization":"my-org","repository":"my-repo","workflow":"ci.yml","job":"build"}' \ + | jq -r .token) + +# Run collector with the push token: +./collector --interval=2s --top=10 \ + --push-endpoint=http://localhost:8080/api/v1/metrics \ + --push-token=$PUSH_TOKEN ``` ## Internal Packages diff --git a/cmd/collector/main.go b/cmd/collector/main.go index ec06b9b..7a88a85 100644 --- a/cmd/collector/main.go +++ b/cmd/collector/main.go @@ -31,6 +31,7 @@ func main() { logFormat := flag.String("log-format", defaultLogFormat, "Output format: json, text") topN := flag.Int("top", defaultTopN, "Number of top processes to include") pushEndpoint := flag.String("push-endpoint", "", "HTTP endpoint to push metrics to (e.g., http://localhost:8080/api/v1/metrics)") + pushToken := flag.String("push-token", os.Getenv("COLLECTOR_PUSH_TOKEN"), "Bearer token for push endpoint authentication (or set COLLECTOR_PUSH_TOKEN)") 
flag.Parse() // Setup structured logging for application logs @@ -61,7 +62,7 @@ func main() { // Setup push client if endpoint is configured if *pushEndpoint != "" { - pushClient := summary.NewPushClient(*pushEndpoint) + pushClient := summary.NewPushClient(*pushEndpoint, *pushToken) c.SetPushClient(pushClient) execCtx := pushClient.ExecutionContext() appLogger.Info("push client configured", diff --git a/cmd/receiver/main.go b/cmd/receiver/main.go index 21a59d4..1379b53 100644 --- a/cmd/receiver/main.go +++ b/cmd/receiver/main.go @@ -23,6 +23,7 @@ func main() { addr := flag.String("addr", defaultAddr, "HTTP listen address") dbPath := flag.String("db", defaultDBPath, "SQLite database path") readToken := flag.String("read-token", os.Getenv("RECEIVER_READ_TOKEN"), "Pre-shared token for read endpoints (or set RECEIVER_READ_TOKEN)") + hmacKey := flag.String("hmac-key", os.Getenv("RECEIVER_HMAC_KEY"), "Secret key for push token generation/validation (or set RECEIVER_HMAC_KEY)") flag.Parse() logger := slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ @@ -36,7 +37,7 @@ func main() { } defer func() { _ = store.Close() }() - handler := receiver.NewHandler(store, logger, *readToken) + handler := receiver.NewHandler(store, logger, *readToken, *hmacKey) mux := http.NewServeMux() handler.RegisterRoutes(mux) diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index c9dc9a2..685d2b6 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -18,7 +18,12 @@ import ( "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary" ) -// setupTestReceiver creates a test receiver with SQLite storage and HTTP server +const ( + testReadToken = "integration-test-token" + testHMACKey = "integration-hmac-key" +) + +// setupTestReceiver creates a test receiver with SQLite storage, auth, and HTTP server func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, 
func()) { t.Helper() dbPath := filepath.Join(t.TempDir(), "test.db") @@ -27,7 +32,7 @@ func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) t.Fatalf("NewStore() error = %v", err) } - handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), "") + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), testReadToken, testHMACKey) mux := http.NewServeMux() handler.RegisterRoutes(mux) @@ -41,6 +46,11 @@ func setupTestReceiver(t *testing.T) (*receiver.Store, *httptest.Server, func()) return store, server, cleanup } +// generatePushToken generates a scoped push token for an execution context +func generatePushToken(exec summary.ExecutionContext) string { + return receiver.GenerateScopedToken(testHMACKey, exec.Organization, exec.Repository, exec.Workflow, exec.Job) +} + func TestPushClientToReceiver(t *testing.T) { store, server, cleanup := setupTestReceiver(t) defer cleanup() @@ -85,10 +95,18 @@ func TestPushClientToReceiver(t *testing.T) { t.Fatalf("Marshal() error = %v", err) } - // Send via HTTP client - resp, err := http.Post(server.URL+"/api/v1/metrics", "application/json", bytes.NewReader(body)) + // Send via HTTP client with scoped push token + pushToken := generatePushToken(testCtx) + req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) if err != nil { - t.Fatalf("Post() error = %v", err) + t.Fatalf("NewRequest() error = %v", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("Do() error = %v", err) } defer func() { _ = resp.Body.Close() }() @@ -148,8 +166,11 @@ func TestPushClientIntegration(t *testing.T) { t.Setenv("GITHUB_JOB", "push-job") t.Setenv("GITHUB_RUN_ID", "push-run-456") - // Create push client - it reads from env vars - pushClient := summary.NewPushClient(server.URL + 
"/api/v1/metrics") + // Generate scoped push token + pushToken := receiver.GenerateScopedToken(testHMACKey, "push-client-org", "push-client-repo", "push-test.yml", "push-job") + + // Create push client with token - it reads execution context from env vars + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", pushToken) // Verify execution context was read from env ctx := pushClient.ExecutionContext() @@ -194,7 +215,6 @@ func TestMultiplePushes(t *testing.T) { defer cleanup() // Simulate multiple workflow runs pushing metrics via direct HTTP POST - // This avoids env var manipulation which could cause issues with parallel tests runs := []summary.ExecutionContext{ {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-1"}, {Organization: "org-a", Repository: "repo-1", Workflow: "ci.yml", Job: "build", RunID: "run-2"}, @@ -219,9 +239,17 @@ func TestMultiplePushes(t *testing.T) { t.Fatalf("Marshal() error = %v", err) } - resp, err := http.Post(server.URL+"/api/v1/metrics", "application/json", bytes.NewReader(body)) + pushToken := generatePushToken(execCtx) + req, err := http.NewRequest(http.MethodPost, server.URL+"/api/v1/metrics", bytes.NewReader(body)) if err != nil { - t.Fatalf("Post() error = %v for run %+v", err, execCtx) + t.Fatalf("NewRequest() error = %v for run %+v", err, execCtx) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("Do() error = %v for run %+v", err, execCtx) } _ = resp.Body.Close() @@ -247,3 +275,112 @@ func TestMultiplePushes(t *testing.T) { t.Errorf("got %d metrics for org-a/repo-1/ci.yml/test, want 1", len(metrics)) } } + +func TestPushClientWithTokenIntegration(t *testing.T) { + readToken := "integration-read-secret" + hmacKey := "integration-hmac-secret" + store, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) + defer cleanup() + + // 
Generate a scoped token via the API + tokenReqBody, _ := json.Marshal(map[string]string{ + "organization": "token-org", + "repository": "token-repo", + "workflow": "ci.yml", + "job": "build", + }) + tokenReq, _ := http.NewRequest(http.MethodPost, server.URL+"/api/v1/token", bytes.NewReader(tokenReqBody)) + tokenReq.Header.Set("Authorization", "Bearer "+readToken) + tokenReq.Header.Set("Content-Type", "application/json") + + tokenResp, err := http.DefaultClient.Do(tokenReq) + if err != nil { + t.Fatalf("token request error: %v", err) + } + defer func() { _ = tokenResp.Body.Close() }() + + if tokenResp.StatusCode != http.StatusOK { + t.Fatalf("token request status = %d, want %d", tokenResp.StatusCode, http.StatusOK) + } + + var tokenBody struct { + Token string `json:"token"` + } + if err := json.NewDecoder(tokenResp.Body).Decode(&tokenBody); err != nil { + t.Fatalf("decode token response: %v", err) + } + + // Use the scoped token to push metrics + t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") + t.Setenv("GITHUB_REPOSITORY", "token-repo") + t.Setenv("GITHUB_WORKFLOW", "ci.yml") + t.Setenv("GITHUB_JOB", "build") + t.Setenv("GITHUB_RUN_ID", "token-run-1") + + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", tokenBody.Token) + + testSummary := &summary.RunSummary{ + StartTime: time.Now().Add(-10 * time.Second), + EndTime: time.Now(), + DurationSeconds: 10.0, + SampleCount: 2, + } + + if err := pushClient.Push(context.Background(), testSummary); err != nil { + t.Fatalf("Push() error = %v", err) + } + + // Verify stored + metrics, err := store.GetMetricsByWorkflowJob("token-org", "token-repo", "ci.yml", "build") + if err != nil { + t.Fatalf("GetMetricsByWorkflowJob() error = %v", err) + } + if len(metrics) != 1 { + t.Fatalf("got %d metrics, want 1", len(metrics)) + } + if metrics[0].RunID != "token-run-1" { + t.Errorf("RunID = %q, want %q", metrics[0].RunID, "token-run-1") + } +} + +func TestPushClientWithWrongTokenIntegration(t *testing.T) { + 
readToken := "integration-read-secret" + hmacKey := "integration-hmac-secret" + _, server, cleanup := setupTestReceiverWithToken(t, readToken, hmacKey) + defer cleanup() + + t.Setenv("GITHUB_REPOSITORY_OWNER", "token-org") + t.Setenv("GITHUB_REPOSITORY", "token-repo") + t.Setenv("GITHUB_WORKFLOW", "ci.yml") + t.Setenv("GITHUB_JOB", "build") + t.Setenv("GITHUB_RUN_ID", "token-run-2") + + pushClient := summary.NewPushClient(server.URL+"/api/v1/metrics", "wrong-token") + + err := pushClient.Push(context.Background(), &summary.RunSummary{SampleCount: 1}) + if err == nil { + t.Error("Push() with wrong token should fail") + } +} + +func setupTestReceiverWithToken(t *testing.T, readToken, hmacKey string) (*receiver.Store, *httptest.Server, func()) { + t.Helper() + dbPath := filepath.Join(t.TempDir(), "test.db") + store, err := receiver.NewStore(dbPath) + if err != nil { + t.Fatalf("NewStore() error = %v", err) + } + + handler := receiver.NewHandler(store, slog.New(slog.NewTextHandler(io.Discard, nil)), readToken, hmacKey) + mux := http.NewServeMux() + handler.RegisterRoutes(mux) + + server := httptest.NewServer(mux) + + cleanup := func() { + server.Close() + _ = store.Close() + } + + return store, server, cleanup +} diff --git a/internal/receiver/handler.go b/internal/receiver/handler.go index ef4bfae..d847f62 100644 --- a/internal/receiver/handler.go +++ b/internal/receiver/handler.go @@ -14,28 +14,31 @@ import ( type Handler struct { store *Store logger *slog.Logger - readToken string // Pre-shared token for read endpoints (empty = no auth required) + readToken string // Pre-shared token for read endpoint authentication + hmacKey string // Separate key for HMAC-based push token generation/validation } // NewHandler creates a new HTTP handler with the given store. -// If readToken is non-empty, the GET metrics endpoint will require -// Bearer token authentication. 
-func NewHandler(store *Store, logger *slog.Logger, readToken string) *Handler { - return &Handler{store: store, logger: logger, readToken: readToken} +// readToken authenticates read endpoints and the token generation endpoint. +// hmacKey is the secret used to derive scoped push tokens. +func NewHandler(store *Store, logger *slog.Logger, readToken, hmacKey string) *Handler { + return &Handler{store: store, logger: logger, readToken: readToken, hmacKey: hmacKey} } // RegisterRoutes registers all HTTP routes on the given mux func (h *Handler) RegisterRoutes(mux *http.ServeMux) { mux.HandleFunc("POST /api/v1/metrics", h.handleReceiveMetrics) + mux.HandleFunc("POST /api/v1/token", h.handleGenerateToken) mux.HandleFunc("GET /api/v1/metrics/repo/{org}/{repo}/{workflow}/{job}", h.handleGetByWorkflowJob) mux.HandleFunc("GET /health", h.handleHealth) } // validateReadToken checks the Authorization header for a valid Bearer token. -// Returns true if authentication is disabled (empty readToken) or if the token matches. 
func (h *Handler) validateReadToken(w http.ResponseWriter, r *http.Request) bool { if h.readToken == "" { - return true + h.logger.Warn("no read-token configured, rejecting request", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false } authHeader := r.Header.Get("Authorization") @@ -62,6 +65,65 @@ func (h *Handler) validateReadToken(w http.ResponseWriter, r *http.Request) bool return true } +func (h *Handler) handleGenerateToken(w http.ResponseWriter, r *http.Request) { + if h.hmacKey == "" { + http.Error(w, "token generation requires a configured HMAC key", http.StatusBadRequest) + return + } + + if !h.validateReadToken(w, r) { + return + } + + var req TokenRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid JSON body", http.StatusBadRequest) + return + } + + if req.Organization == "" || req.Repository == "" || req.Workflow == "" || req.Job == "" { + http.Error(w, "organization, repository, workflow, and job are required", http.StatusBadRequest) + return + } + + token := GenerateScopedToken(h.hmacKey, req.Organization, req.Repository, req.Workflow, req.Job) + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(TokenResponse{Token: token}) +} + +// validatePushToken checks push authentication via scoped HMAC token. 
+func (h *Handler) validatePushToken(w http.ResponseWriter, r *http.Request, exec ExecutionContext) bool { + if h.hmacKey == "" { + h.logger.Warn("no HMAC key configured, rejecting push", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + h.logger.Warn("missing push authorization", slog.String("path", r.URL.Path)) + http.Error(w, "authorization required", http.StatusUnauthorized) + return false + } + + const bearerPrefix = "Bearer " + if !strings.HasPrefix(authHeader, bearerPrefix) { + h.logger.Warn("invalid push authorization format", slog.String("path", r.URL.Path)) + http.Error(w, "invalid authorization format", http.StatusUnauthorized) + return false + } + + token := strings.TrimPrefix(authHeader, bearerPrefix) + if !ValidateScopedToken(h.hmacKey, token, exec.Organization, exec.Repository, exec.Workflow, exec.Job) { + h.logger.Warn("invalid push token", slog.String("path", r.URL.Path)) + http.Error(w, "invalid token", http.StatusUnauthorized) + return false + } + + return true +} + func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { var payload MetricsPayload if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { @@ -75,6 +137,10 @@ func (h *Handler) handleReceiveMetrics(w http.ResponseWriter, r *http.Request) { return } + if !h.validatePushToken(w, r, payload.Execution) { + return + } + id, err := h.store.SaveMetric(&payload) if err != nil { h.logger.Error("failed to save metric", slog.String("error", err.Error())) diff --git a/internal/receiver/handler_test.go b/internal/receiver/handler_test.go index 373e1ae..cea58f0 100644 --- a/internal/receiver/handler_test.go +++ b/internal/receiver/handler_test.go @@ -14,17 +14,21 @@ import ( ) func TestHandler_ReceiveMetrics(t *testing.T) { - h, cleanup := newTestHandler(t) + const readToken = "test-token" + h, cleanup := 
newTestHandlerWithToken(t, readToken) defer cleanup() + exec := ExecutionContext{ + Organization: "test-org", + Repository: "test-repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-123", + } + pushToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) + payload := MetricsPayload{ - Execution: ExecutionContext{ - Organization: "test-org", - Repository: "test-repo", - Workflow: "ci.yml", - Job: "build", - RunID: "run-123", - }, + Execution: exec, Summary: summary.RunSummary{ DurationSeconds: 60.0, SampleCount: 12, @@ -34,6 +38,7 @@ func TestHandler_ReceiveMetrics(t *testing.T) { body, _ := json.Marshal(payload) req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+pushToken) rec := httptest.NewRecorder() mux := http.NewServeMux() @@ -99,7 +104,8 @@ func TestHandler_ReceiveMetrics_MissingRunID(t *testing.T) { } func TestHandler_GetByWorkflowJob(t *testing.T) { - h, cleanup := newTestHandler(t) + const readToken = "test-token" + h, cleanup := newTestHandlerWithToken(t, readToken) defer cleanup() // Save metrics for different workflow/job combinations @@ -115,6 +121,7 @@ func TestHandler_GetByWorkflowJob(t *testing.T) { } req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org-x/repo-y/ci.yml/build", nil) + req.Header.Set("Authorization", "Bearer "+readToken) rec := httptest.NewRecorder() mux := http.NewServeMux() @@ -135,10 +142,12 @@ func TestHandler_GetByWorkflowJob(t *testing.T) { } func TestHandler_GetByWorkflowJob_NotFound(t *testing.T) { - h, cleanup := newTestHandler(t) + const readToken = "test-token" + h, cleanup := newTestHandlerWithToken(t, readToken) defer cleanup() req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/workflow/job", nil) + req.Header.Set("Authorization", "Bearer "+readToken) rec := httptest.NewRecorder() mux := http.NewServeMux() 
@@ -224,6 +233,212 @@ func TestHandler_Health(t *testing.T) { } } +func TestHandler_GenerateToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + req.Header.Set("Authorization", "Bearer secret-token") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp TokenResponse + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if resp.Token == "" { + t.Error("expected non-empty token") + } + if len(resp.Token) != 64 { + t.Errorf("token length = %d, want 64", len(resp.Token)) + } +} + +func TestHandler_GenerateToken_NoAuth(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + +func TestHandler_GenerateToken_MissingFields(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "secret-token") + defer cleanup() + + // Missing job field + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + 
req.Header.Set("Authorization", "Bearer secret-token") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_GenerateToken_NoReadToken(t *testing.T) { + h, cleanup := newTestHandler(t) // no readToken configured + defer cleanup() + + body, _ := json.Marshal(TokenRequest{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/token", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusBadRequest { + t.Errorf("status = %d, want %d", rec.Code, http.StatusBadRequest) + } +} + +func TestHandler_ReceiveMetrics_WithPushToken(t *testing.T) { + readToken := "secret-token" + h, cleanup := newTestHandlerWithToken(t, readToken) + defer cleanup() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + + exec := ExecutionContext{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-1", + } + + validToken := GenerateScopedToken(readToken, exec.Organization, exec.Repository, exec.Workflow, exec.Job) + wrongScopeToken := GenerateScopedToken(readToken, "other-org", "repo", "ci.yml", "build") + + tests := []struct { + name string + authHeader string + wantCode int + }{ + {"no auth", "", http.StatusUnauthorized}, + {"wrong token", "Bearer wrong-token", http.StatusUnauthorized}, + {"wrong scope", "Bearer " + wrongScopeToken, http.StatusUnauthorized}, + {"valid token", "Bearer " + validToken, http.StatusCreated}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + payload := MetricsPayload{ + Execution: exec, + Summary: 
summary.RunSummary{SampleCount: 1}, + } + body, _ := json.Marshal(payload) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + if tt.authHeader != "" { + req.Header.Set("Authorization", tt.authHeader) + } + rec := httptest.NewRecorder() + mux.ServeHTTP(rec, req) + + if rec.Code != tt.wantCode { + t.Errorf("status = %d, want %d", rec.Code, tt.wantCode) + } + }) + } +} + +func TestHandler_ReceiveMetrics_RejectsWhenNoReadToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured + defer cleanup() + + payload := MetricsPayload{ + Execution: ExecutionContext{ + Organization: "org", + Repository: "repo", + Workflow: "ci.yml", + Job: "build", + RunID: "run-1", + }, + Summary: summary.RunSummary{SampleCount: 1}, + } + body, _ := json.Marshal(payload) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + +func TestHandler_GetByWorkflowJob_RejectsWhenNoReadToken(t *testing.T) { + h, cleanup := newTestHandlerWithToken(t, "") // no readToken configured + defer cleanup() + + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/repo/org/repo/ci.yml/build", nil) + rec := httptest.NewRecorder() + + mux := http.NewServeMux() + h.RegisterRoutes(mux) + mux.ServeHTTP(rec, req) + + if rec.Code != http.StatusUnauthorized { + t.Errorf("status = %d, want %d", rec.Code, http.StatusUnauthorized) + } +} + func newTestHandler(t *testing.T) (*Handler, func()) { t.Helper() dbPath := filepath.Join(t.TempDir(), "test.db") @@ -233,12 +448,17 @@ func newTestHandler(t *testing.T) (*Handler, func()) { } logger := 
slog.New(slog.NewTextHandler(io.Discard, nil)) - handler := NewHandler(store, logger, "") // no auth token for basic tests + handler := NewHandler(store, logger, "", "") // no auth — endpoints will reject return handler, func() { _ = store.Close() } } -func newTestHandlerWithToken(t *testing.T, token string) (*Handler, func()) { +func newTestHandlerWithToken(t *testing.T, readToken string) (*Handler, func()) { + t.Helper() + return newTestHandlerWithKeys(t, readToken, readToken) +} + +func newTestHandlerWithKeys(t *testing.T, readToken, hmacKey string) (*Handler, func()) { t.Helper() dbPath := filepath.Join(t.TempDir(), "test.db") store, err := NewStore(dbPath) @@ -247,7 +467,7 @@ func newTestHandlerWithToken(t *testing.T, token string) (*Handler, func()) { } logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - handler := NewHandler(store, logger, token) + handler := NewHandler(store, logger, readToken, hmacKey) return handler, func() { _ = store.Close() } } diff --git a/internal/receiver/token.go b/internal/receiver/token.go new file mode 100644 index 0000000..087546c --- /dev/null +++ b/internal/receiver/token.go @@ -0,0 +1,25 @@ +// ABOUTME: HMAC-SHA256 token generation and validation for scoped push authentication. +// ABOUTME: Tokens are derived from a key + scope, enabling stateless validation without DB storage. +package receiver + +import ( + "crypto/hmac" + "crypto/sha256" + "crypto/subtle" + "encoding/hex" +) + +// GenerateScopedToken computes an HMAC-SHA256 token scoped to a specific org/repo/workflow/job. +// The canonical input is "v1\x00<org>\x00<repo>\x00<workflow>\x00<job>". +func GenerateScopedToken(key, org, repo, workflow, job string) string { + mac := hmac.New(sha256.New, []byte(key)) + mac.Write([]byte("v1\x00" + org + "\x00" + repo + "\x00" + workflow + "\x00" + job)) + return hex.EncodeToString(mac.Sum(nil)) +} + +// ValidateScopedToken checks whether a token matches the expected HMAC for the given scope.
+// Uses constant-time comparison to prevent timing attacks. +func ValidateScopedToken(key, token, org, repo, workflow, job string) bool { + expected := GenerateScopedToken(key, org, repo, workflow, job) + return subtle.ConstantTimeCompare([]byte(token), []byte(expected)) == 1 +} diff --git a/internal/receiver/token_test.go b/internal/receiver/token_test.go new file mode 100644 index 0000000..2140ecd --- /dev/null +++ b/internal/receiver/token_test.go @@ -0,0 +1,78 @@ +package receiver + +import ( + "encoding/hex" + "testing" +) + +func TestGenerateScopedToken_Deterministic(t *testing.T) { + token1 := GenerateScopedToken("key", "org", "repo", "wf", "job") + token2 := GenerateScopedToken("key", "org", "repo", "wf", "job") + if token1 != token2 { + t.Errorf("tokens differ: %q vs %q", token1, token2) + } +} + +func TestGenerateScopedToken_ScopePinning(t *testing.T) { + base := GenerateScopedToken("key", "org", "repo", "wf", "job") + + variants := []struct { + name string + org string + repo string + wf string + job string + }{ + {"different org", "other-org", "repo", "wf", "job"}, + {"different repo", "org", "other-repo", "wf", "job"}, + {"different workflow", "org", "repo", "other-wf", "job"}, + {"different job", "org", "repo", "wf", "other-job"}, + } + + for _, v := range variants { + t.Run(v.name, func(t *testing.T) { + token := GenerateScopedToken("key", v.org, v.repo, v.wf, v.job) + if token == base { + t.Errorf("token for %s should differ from base", v.name) + } + }) + } +} + +func TestGenerateScopedToken_DifferentKeys(t *testing.T) { + token1 := GenerateScopedToken("key-a", "org", "repo", "wf", "job") + token2 := GenerateScopedToken("key-b", "org", "repo", "wf", "job") + if token1 == token2 { + t.Error("different keys should produce different tokens") + } +} + +func TestGenerateScopedToken_ValidHex(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if len(token) != 64 { + t.Errorf("token length = %d, want 64", len(token)) + } + 
if _, err := hex.DecodeString(token); err != nil { + t.Errorf("token is not valid hex: %v", err) + } +} + +func TestValidateScopedToken_Correct(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if !ValidateScopedToken("key", token, "org", "repo", "wf", "job") { + t.Error("ValidateScopedToken should accept correct token") + } +} + +func TestValidateScopedToken_WrongToken(t *testing.T) { + if ValidateScopedToken("key", "deadbeef", "org", "repo", "wf", "job") { + t.Error("ValidateScopedToken should reject wrong token") + } +} + +func TestValidateScopedToken_WrongScope(t *testing.T) { + token := GenerateScopedToken("key", "org", "repo", "wf", "job") + if ValidateScopedToken("key", token, "org", "repo", "wf", "other-job") { + t.Error("ValidateScopedToken should reject token for different scope") + } +} diff --git a/internal/receiver/types.go b/internal/receiver/types.go index e11a98d..dbc56e0 100644 --- a/internal/receiver/types.go +++ b/internal/receiver/types.go @@ -30,3 +30,16 @@ type StoredMetric struct { ReceivedAt string Payload string // JSON-encoded RunSummary } + +// TokenRequest is the request body for POST /api/v1/token +type TokenRequest struct { + Organization string `json:"organization"` + Repository string `json:"repository"` + Workflow string `json:"workflow"` + Job string `json:"job"` +} + +// TokenResponse is the response body for POST /api/v1/token +type TokenResponse struct { + Token string `json:"token"` +} diff --git a/internal/summary/push.go b/internal/summary/push.go index d01fb5c..b6383db 100644 --- a/internal/summary/push.go +++ b/internal/summary/push.go @@ -30,14 +30,17 @@ type MetricsPayload struct { // PushClient sends metrics to the receiver service type PushClient struct { endpoint string + token string client *http.Client ctx ExecutionContext } -// NewPushClient creates a new push client configured from environment variables -func NewPushClient(endpoint string) *PushClient { +// NewPushClient creates a 
new push client configured from environment variables. +// If token is non-empty, it is sent as a Bearer token on each push request. +func NewPushClient(endpoint, token string) *PushClient { return &PushClient{ endpoint: endpoint, + token: token, client: &http.Client{ Timeout: 30 * time.Second, }, @@ -86,6 +89,9 @@ func (p *PushClient) Push(ctx context.Context, summary *RunSummary) error { return fmt.Errorf("creating request: %w", err) } req.Header.Set("Content-Type", "application/json") + if p.token != "" { + req.Header.Set("Authorization", "Bearer "+p.token) + } resp, err := p.client.Do(req) if err != nil { diff --git a/internal/summary/push_test.go b/internal/summary/push_test.go index 58aea9e..552ae68 100644 --- a/internal/summary/push_test.go +++ b/internal/summary/push_test.go @@ -25,7 +25,7 @@ func TestPushClient_Push(t *testing.T) { })) defer server.Close() - client := NewPushClient(server.URL) + client := NewPushClient(server.URL, "") client.ctx = ExecutionContext{ Organization: "test-org", Repository: "test-repo", @@ -59,7 +59,7 @@ func TestPushClient_Push(t *testing.T) { } func TestPushClient_Push_NilSummary(t *testing.T) { - client := NewPushClient("http://localhost:9999") + client := NewPushClient("http://localhost:9999", "") err := client.Push(context.Background(), nil) if err != nil { t.Errorf("Push(nil) error = %v, want nil", err) @@ -72,7 +72,7 @@ func TestPushClient_Push_ServerError(t *testing.T) { })) defer server.Close() - client := NewPushClient(server.URL) + client := NewPushClient(server.URL, "") client.ctx = ExecutionContext{RunID: "test"} err := client.Push(context.Background(), &RunSummary{}) @@ -82,7 +82,7 @@ func TestPushClient_Push_ServerError(t *testing.T) { } func TestPushClient_Push_ConnectionError(t *testing.T) { - client := NewPushClient("http://localhost:1") // Invalid port + client := NewPushClient("http://localhost:1", "") // Invalid port client.ctx = ExecutionContext{RunID: "test"} err := client.Push(context.Background(), 
&RunSummary{}) @@ -147,8 +147,48 @@ func TestExecutionContextFromEnv_GiteaFallback(t *testing.T) { } } +func TestPushClient_Push_WithToken(t *testing.T) { + var gotAuth string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL, "my-token") + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + if gotAuth != "Bearer my-token" { + t.Errorf("Authorization = %q, want %q", gotAuth, "Bearer my-token") + } +} + +func TestPushClient_Push_WithoutToken(t *testing.T) { + var gotAuth string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotAuth = r.Header.Get("Authorization") + w.WriteHeader(http.StatusCreated) + })) + defer server.Close() + + client := NewPushClient(server.URL, "") + client.ctx = ExecutionContext{RunID: "test"} + + err := client.Push(context.Background(), &RunSummary{}) + if err != nil { + t.Fatalf("Push() error = %v", err) + } + if gotAuth != "" { + t.Errorf("Authorization = %q, want empty", gotAuth) + } +} + func TestPushClient_ExecutionContext(t *testing.T) { - client := NewPushClient("http://example.com") + client := NewPushClient("http://example.com", "") client.ctx = ExecutionContext{ Organization: "org", Repository: "repo", diff --git a/test/docker/docker-compose-stress.yaml b/test/docker/docker-compose-stress.yaml index 0d560a7..d4a0be0 100644 --- a/test/docker/docker-compose-stress.yaml +++ b/test/docker/docker-compose-stress.yaml @@ -1,10 +1,12 @@ # Docker Compose stress test with receiver -# Run with: docker compose -f test/docker/docker-compose-stress.yaml up +# See README.md "Docker Compose" section for the full token workflow. # # This test: -# 1. Starts the metrics receiver -# 2. 
Runs heavy CPU/memory workloads in multiple containers with shared PID namespace -# 3. Collector gathers metrics and pushes summary to receiver on shutdown +# 1. Starts the metrics receiver (with read-token and hmac-key) +# 2. Generate a scoped push token via POST /api/v1/token (manual step) +# 3. Start the collector with COLLECTOR_PUSH_TOKEN set (manual step) +# 4. Runs heavy CPU/memory workloads in multiple containers with shared PID namespace +# 5. Collector gathers metrics and pushes summary to receiver on shutdown # # To trigger the push, stop the collector gracefully: # docker compose -f test/docker/docker-compose-stress.yaml stop collector @@ -20,6 +22,8 @@ services: - "9080:8080" environment: - DB_PATH=/data/metrics.db + - RECEIVER_READ_TOKEN=dummyreadtoken + - RECEIVER_HMAC_KEY=dummyhmackey volumes: - receiver-data:/data healthcheck: @@ -98,6 +102,8 @@ services: - --log-format=json - --push-endpoint=http://receiver:8080/api/v1/metrics environment: + # Push token — pass via COLLECTOR_PUSH_TOKEN from host env + COLLECTOR_PUSH_TOKEN: "${COLLECTOR_PUSH_TOKEN}" # Execution context for the receiver GITHUB_REPOSITORY_OWNER: "test-org" GITHUB_REPOSITORY: "test-org/stress-test"