feat(summary): add per-container metrics with extended percentiles
All checks were successful
ci / build (push) Successful in 34s
- Extend StatSummary with p99, p75, p50 percentiles (in addition to peak, p95, avg)
- Add ContainerSummary type for per-container CPU cores and memory bytes stats
- Track container metrics from Cgroups map in Accumulator
- Include containers array in RunSummary sent to receiver

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent 5e470c33a5
commit 6770cfcea7

3 changed files with 267 additions and 15 deletions
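For context, this is roughly how the extended API fits together (a minimal sketch, not part of the commit; it assumes the accumulator lives in an internal/summary package alongside the internal/metrics package shown below, and the sample values are invented):

// Sketch only: wiring the new per-container summaries end to end.
// Assumes a cmd/ package inside the same module; import paths and
// sample values are illustrative, not part of this commit.
package main

import (
    "fmt"
    "time"

    "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics"
    "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary"
)

func main() {
    acc := summary.NewAccumulator(5) // track top-5 process peaks

    // Feed two fake samples, each carrying one container's cgroup reading.
    for i := 1; i <= 2; i++ {
        acc.Add(&metrics.SystemMetrics{
            Timestamp: time.Now(),
            CPU:       metrics.CPUMetrics{TotalPercent: float64(i * 10)},
            Cgroups: map[string]*metrics.CgroupMetrics{
                "job": {
                    Name:   "job",
                    CPU:    metrics.CgroupCPUMetrics{UsedCores: float64(i)},
                    Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: uint64(i) * 1024 * 1024},
                },
            },
        })
    }

    // Summarize returns nil when no samples were added.
    if s := acc.Summarize(); s != nil {
        for _, c := range s.Containers { // sorted by name
            fmt.Printf("%s: peak %.2f cores, p95 %.2f cores, peak %.0f MiB\n",
                c.Name, c.CPUCores.Peak, c.CPUCores.P95, c.MemoryBytes.Peak/(1024*1024))
        }
    }
}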
@@ -10,6 +10,12 @@ import (
 	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics"
 )
 
+// containerAccumulator tracks metrics for a single container
+type containerAccumulator struct {
+	cpuCoresValues    []float64
+	memoryBytesValues []float64
+}
+
 // Accumulator collects metric samples and computes run-level statistics
 type Accumulator struct {
 	topN int
@@ -17,6 +23,7 @@ type Accumulator struct {
 	memBytesValues []float64
 	memPctValues   []float64
 	processPeaks   map[string]*ProcessPeak
+	containers     map[string]*containerAccumulator
 	startTime      time.Time
 	endTime        time.Time
 	sampleCount    int
@@ -27,6 +34,7 @@ func NewAccumulator(topN int) *Accumulator {
 	return &Accumulator{
 		topN:         topN,
 		processPeaks: make(map[string]*ProcessPeak),
+		containers:   make(map[string]*containerAccumulator),
 	}
 }
 
@@ -48,6 +56,17 @@ func (a *Accumulator) Add(m *metrics.SystemMetrics) {
 	for _, p := range m.TopMemory {
 		a.updateProcessPeak(p)
 	}
+
+	// Track per-container metrics
+	for name, cgroup := range m.Cgroups {
+		ca, ok := a.containers[name]
+		if !ok {
+			ca = &containerAccumulator{}
+			a.containers[name] = ca
+		}
+		ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores)
+		ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes))
+	}
 }
 
 // Summarize computes and returns the run summary, or nil if no samples were added
@@ -66,15 +85,33 @@ func (a *Accumulator) Summarize() *RunSummary {
 		MemUsedPercent:  computeStats(a.memPctValues),
 		TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }),
 		TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }),
+		Containers:      a.containerSummaries(),
 	}
 }
 
+// containerSummaries computes summaries for all tracked containers
+func (a *Accumulator) containerSummaries() []ContainerSummary {
+	summaries := make([]ContainerSummary, 0, len(a.containers))
+	for name, ca := range a.containers {
+		summaries = append(summaries, ContainerSummary{
+			Name:        name,
+			CPUCores:    computeStats(ca.cpuCoresValues),
+			MemoryBytes: computeStats(ca.memoryBytesValues),
+		})
+	}
+	// Sort by name for consistent output
+	sort.Slice(summaries, func(i, j int) bool {
+		return summaries[i].Name < summaries[j].Name
+	})
+	return summaries
+}
+
 // SampleCount returns the number of samples added
 func (a *Accumulator) SampleCount() int {
 	return a.sampleCount
 }
 
-// computeStats calculates peak, average, and P95 from a sorted copy of the values
+// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values
 func computeStats(values []float64) StatSummary {
 	n := len(values)
 	if n == 0 {
@@ -90,15 +127,21 @@ func computeStats(values []float64) StatSummary {
 		sum += v
 	}
 
-	p95Index := int(float64(n-1) * 0.95)
-
 	return StatSummary{
 		Peak: sorted[n-1],
+		P99:  sorted[percentileIndex(n, 0.99)],
+		P95:  sorted[percentileIndex(n, 0.95)],
+		P75:  sorted[percentileIndex(n, 0.75)],
+		P50:  sorted[percentileIndex(n, 0.50)],
 		Avg:  sum / float64(n),
-		P95:  sorted[p95Index],
 	}
 }
 
+// percentileIndex returns the index for the given percentile (0.0-1.0)
+func percentileIndex(n int, percentile float64) int {
+	return int(float64(n-1) * percentile)
+}
+
 // updateProcessPeak merges a process observation into the peak tracking map
 func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) {
 	key := fmt.Sprintf("%d:%s", p.PID, p.Name)
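For intuition, percentileIndex implements a lower nearest-rank pick: it floors (n-1)*p and indexes the sorted slice, so it never interpolates upward. A standalone sketch of that arithmetic (illustrative, not code from this commit):

package main

import "fmt"

func main() {
    // Same truncating rule as percentileIndex above: int() floors the
    // non-negative product, selecting a lower nearest-rank index.
    n := 20 // pretend the sorted samples are 1..20
    for _, p := range []float64{0.99, 0.95, 0.75, 0.50} {
        idx := int(float64(n-1) * p)
        fmt.Printf("p%.0f -> index %d -> value %d\n", 100*p, idx, idx+1)
    }
}

Running it reproduces the expectations asserted in the tests below: for n=20, p99 and p95 both land on index 18 (value 19), p75 on index 14 (value 15), and p50 on index 9 (value 10).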
@@ -333,3 +333,201 @@ func TestAccumulator_Duration(t *testing.T) {
 		t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds)
 	}
 }
+
+func TestAccumulator_AllPercentiles(t *testing.T) {
+	acc := NewAccumulator(5)
+	// 20 values: 1, 2, 3, ..., 20
+	for i := 1; i <= 20; i++ {
+		acc.Add(&metrics.SystemMetrics{
+			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
+			CPU:       metrics.CPUMetrics{TotalPercent: float64(i)},
+			Memory:    metrics.MemoryMetrics{},
+		})
+	}
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Peak = 20
+	if s.CPUTotal.Peak != 20 {
+		t.Errorf("CPU peak: got %f, want 20", s.CPUTotal.Peak)
+	}
+	// P99: index=int(19*0.99)=int(18.81)=18, value=19
+	if s.CPUTotal.P99 != 19 {
+		t.Errorf("CPU p99: got %f, want 19", s.CPUTotal.P99)
+	}
+	// P95: index=int(19*0.95)=int(18.05)=18, value=19
+	if s.CPUTotal.P95 != 19 {
+		t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95)
+	}
+	// P75: index=int(19*0.75)=int(14.25)=14, value=15
+	if s.CPUTotal.P75 != 15 {
+		t.Errorf("CPU p75: got %f, want 15", s.CPUTotal.P75)
+	}
+	// P50: index=int(19*0.50)=int(9.5)=9, value=10
+	if s.CPUTotal.P50 != 10 {
+		t.Errorf("CPU p50: got %f, want 10", s.CPUTotal.P50)
+	}
+	// Avg = (1+2+...+20)/20 = 210/20 = 10.5
+	if s.CPUTotal.Avg != 10.5 {
+		t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg)
+	}
+}
+
+func TestAccumulator_ContainerMetrics(t *testing.T) {
+	acc := NewAccumulator(5)
+
+	// Add samples with container metrics
+	for i := 1; i <= 5; i++ {
+		acc.Add(&metrics.SystemMetrics{
+			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
+			CPU:       metrics.CPUMetrics{TotalPercent: float64(i * 10)},
+			Memory:    metrics.MemoryMetrics{},
+			Cgroups: map[string]*metrics.CgroupMetrics{
+				"container-a": {
+					Name: "container-a",
+					CPU:  metrics.CgroupCPUMetrics{UsedCores: float64(i)},
+					Memory: metrics.CgroupMemoryMetrics{
+						TotalRSSBytes: uint64(i * 1000),
+					},
+				},
+				"container-b": {
+					Name: "container-b",
+					CPU:  metrics.CgroupCPUMetrics{UsedCores: float64(i * 2)},
+					Memory: metrics.CgroupMemoryMetrics{
+						TotalRSSBytes: uint64(i * 2000),
+					},
+				},
+			},
+		})
+	}
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Should have 2 containers
+	if len(s.Containers) != 2 {
+		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
+	}
+
+	// Containers should be sorted by name
+	if s.Containers[0].Name != "container-a" {
+		t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name)
+	}
+	if s.Containers[1].Name != "container-b" {
+		t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name)
+	}
+
+	// Container A: CPU cores [1,2,3,4,5], peak=5, avg=3
+	containerA := s.Containers[0]
+	if containerA.CPUCores.Peak != 5 {
+		t.Errorf("container-a CPUCores.Peak: got %f, want 5", containerA.CPUCores.Peak)
+	}
+	if containerA.CPUCores.Avg != 3 {
+		t.Errorf("container-a CPUCores.Avg: got %f, want 3", containerA.CPUCores.Avg)
+	}
+	// Memory bytes [1000,2000,3000,4000,5000], peak=5000, avg=3000
+	if containerA.MemoryBytes.Peak != 5000 {
+		t.Errorf("container-a MemoryBytes.Peak: got %f, want 5000", containerA.MemoryBytes.Peak)
+	}
+	if containerA.MemoryBytes.Avg != 3000 {
+		t.Errorf("container-a MemoryBytes.Avg: got %f, want 3000", containerA.MemoryBytes.Avg)
+	}
+
+	// Container B: CPU cores [2,4,6,8,10], peak=10, avg=6
+	containerB := s.Containers[1]
+	if containerB.CPUCores.Peak != 10 {
+		t.Errorf("container-b CPUCores.Peak: got %f, want 10", containerB.CPUCores.Peak)
+	}
+	if containerB.CPUCores.Avg != 6 {
+		t.Errorf("container-b CPUCores.Avg: got %f, want 6", containerB.CPUCores.Avg)
+	}
+}
+
+func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) {
+	acc := NewAccumulator(5)
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{TotalPercent: 50},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups:   nil, // No containers
+	})
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	if len(s.Containers) != 0 {
+		t.Errorf("Containers length: got %d, want 0", len(s.Containers))
+	}
+}
+
+func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) {
+	acc := NewAccumulator(5)
+
+	// First sample: only container-a
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups: map[string]*metrics.CgroupMetrics{
+			"container-a": {
+				Name:   "container-a",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 1},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000},
+			},
+		},
+	})
+
+	// Second sample: both containers
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups: map[string]*metrics.CgroupMetrics{
+			"container-a": {
+				Name:   "container-a",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 2},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000},
+			},
+			"container-b": {
+				Name:   "container-b",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 5},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000},
+			},
+		},
+	})
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Should have 2 containers
+	if len(s.Containers) != 2 {
+		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
+	}
+
+	// Container A: 2 samples [1,2]
+	containerA := s.Containers[0]
+	if containerA.CPUCores.Peak != 2 {
+		t.Errorf("container-a CPUCores.Peak: got %f, want 2", containerA.CPUCores.Peak)
+	}
+	if containerA.CPUCores.Avg != 1.5 {
+		t.Errorf("container-a CPUCores.Avg: got %f, want 1.5", containerA.CPUCores.Avg)
+	}
+
+	// Container B: 1 sample [5]
+	containerB := s.Containers[1]
+	if containerB.CPUCores.Peak != 5 {
+		t.Errorf("container-b CPUCores.Peak: got %f, want 5", containerB.CPUCores.Peak)
+	}
+	if containerB.CPUCores.Avg != 5 {
+		t.Errorf("container-b CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg)
+	}
+}
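One consequence the percentile tests pin down: with the truncating index rule, p50 over the 20-sample series selects sorted[9] = 10, slightly below the interpolated median of (10+11)/2 = 10.5, so consumers comparing against a classic median definition should expect the lower-rounded value.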
@@ -4,11 +4,14 @@ package summary
 
 import "time"
 
-// StatSummary holds peak, average, and P95 for a metric across the run
+// StatSummary holds peak, percentiles, and average for a metric across the run
 type StatSummary struct {
 	Peak float64 `json:"peak"`
-	Avg  float64 `json:"avg"`
+	P99  float64 `json:"p99"`
 	P95  float64 `json:"p95"`
+	P75  float64 `json:"p75"`
+	P50  float64 `json:"p50"`
+	Avg  float64 `json:"avg"`
 }
 
 // ProcessPeak holds the peak CPU and memory observed for a single process

@@ -19,15 +22,23 @@ type ProcessPeak struct {
 	PeakMem uint64 `json:"peak_mem_rss_bytes"`
 }
 
+// ContainerSummary holds statistics for a single container across the run
+type ContainerSummary struct {
+	Name        string      `json:"name"`
+	CPUCores    StatSummary `json:"cpu_cores"`
+	MemoryBytes StatSummary `json:"memory_bytes"`
+}
+
 // RunSummary holds the complete summary of a collection run
 type RunSummary struct {
-	StartTime       time.Time     `json:"start_time"`
-	EndTime         time.Time     `json:"end_time"`
-	DurationSeconds float64       `json:"duration_seconds"`
-	SampleCount     int           `json:"sample_count"`
-	CPUTotal        StatSummary   `json:"cpu_total_percent"`
-	MemUsedBytes    StatSummary   `json:"mem_used_bytes"`
-	MemUsedPercent  StatSummary   `json:"mem_used_percent"`
-	TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"`
-	TopMemProcesses []ProcessPeak `json:"top_mem_processes"`
+	StartTime       time.Time          `json:"start_time"`
+	EndTime         time.Time          `json:"end_time"`
+	DurationSeconds float64            `json:"duration_seconds"`
+	SampleCount     int                `json:"sample_count"`
+	CPUTotal        StatSummary        `json:"cpu_total_percent"`
+	MemUsedBytes    StatSummary        `json:"mem_used_bytes"`
+	MemUsedPercent  StatSummary        `json:"mem_used_percent"`
+	TopCPUProcesses []ProcessPeak      `json:"top_cpu_processes"`
+	TopMemProcesses []ProcessPeak      `json:"top_mem_processes"`
+	Containers      []ContainerSummary `json:"containers"`
 }
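For receivers, the JSON shape implied by these tags would look roughly like this hand-written example, which mirrors the TestAccumulator_ContainerMetrics scenario above (assumes the top-process slices marshal as empty arrays; the zero memory stats come from the empty MemoryMetrics in that test):

{
  "start_time": "2025-01-01T00:00:01Z",
  "end_time": "2025-01-01T00:00:05Z",
  "duration_seconds": 4,
  "sample_count": 5,
  "cpu_total_percent": { "peak": 50, "p99": 40, "p95": 40, "p75": 40, "p50": 30, "avg": 30 },
  "mem_used_bytes": { "peak": 0, "p99": 0, "p95": 0, "p75": 0, "p50": 0, "avg": 0 },
  "mem_used_percent": { "peak": 0, "p99": 0, "p95": 0, "p75": 0, "p50": 0, "avg": 0 },
  "top_cpu_processes": [],
  "top_mem_processes": [],
  "containers": [
    {
      "name": "container-a",
      "cpu_cores": { "peak": 5, "p99": 4, "p95": 4, "p75": 4, "p50": 3, "avg": 3 },
      "memory_bytes": { "peak": 5000, "p99": 4000, "p95": 4000, "p75": 4000, "p50": 3000, "avg": 3000 }
    },
    {
      "name": "container-b",
      "cpu_cores": { "peak": 10, "p99": 8, "p95": 8, "p75": 8, "p50": 6, "avg": 6 },
      "memory_bytes": { "peak": 10000, "p99": 8000, "p95": 8000, "p75": 8000, "p50": 6000, "avg": 6000 }
    }
  ]
}

Note that with only five samples per container, p99, p95, and p75 all truncate to index 3, so they report the second-highest value rather than the peak.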