feat(summary): add per-container metrics with extended percentiles
All checks were successful
ci / build (push) Successful in 34s

- Extend StatSummary with p99, p75, p50 percentiles (in addition to peak, p95, avg)
- Add ContainerSummary type for per-container CPU cores and memory bytes stats
- Track container metrics from Cgroups map in Accumulator
- Include containers array in RunSummary sent to receiver

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Manuel Ganter 2026-02-06 15:01:01 +01:00
parent 5e470c33a5
commit 6770cfcea7
No known key found for this signature in database
3 changed files with 267 additions and 15 deletions

View file

@ -10,6 +10,12 @@ import (
"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics"
)
// containerAccumulator tracks metrics for a single container.
// One instance is kept per container name in Accumulator.containers;
// each Add call appends one sample to each slice.
type containerAccumulator struct {
	cpuCoresValues    []float64 // per-sample CPU usage in cores (from cgroup.CPU.UsedCores)
	memoryBytesValues []float64 // per-sample RSS in bytes (from cgroup.Memory.TotalRSSBytes)
}
// Accumulator collects metric samples and computes run-level statistics
type Accumulator struct {
topN int
@ -17,6 +23,7 @@ type Accumulator struct {
memBytesValues []float64
memPctValues []float64
processPeaks map[string]*ProcessPeak
containers map[string]*containerAccumulator
startTime time.Time
endTime time.Time
sampleCount int
@ -27,6 +34,7 @@ func NewAccumulator(topN int) *Accumulator {
return &Accumulator{
topN: topN,
processPeaks: make(map[string]*ProcessPeak),
containers: make(map[string]*containerAccumulator),
}
}
@ -48,6 +56,17 @@ func (a *Accumulator) Add(m *metrics.SystemMetrics) {
for _, p := range m.TopMemory {
a.updateProcessPeak(p)
}
// Track per-container metrics
for name, cgroup := range m.Cgroups {
ca, ok := a.containers[name]
if !ok {
ca = &containerAccumulator{}
a.containers[name] = ca
}
ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores)
ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes))
}
}
// Summarize computes and returns the run summary, or nil if no samples were added
@ -66,15 +85,33 @@ func (a *Accumulator) Summarize() *RunSummary {
MemUsedPercent: computeStats(a.memPctValues),
TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }),
TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }),
Containers: a.containerSummaries(),
}
}
// containerSummaries computes a ContainerSummary for every tracked
// container, sorted by container name for deterministic output.
func (a *Accumulator) containerSummaries() []ContainerSummary {
	// Sort the names up front so the result is built already ordered.
	names := make([]string, 0, len(a.containers))
	for name := range a.containers {
		names = append(names, name)
	}
	sort.Strings(names)

	out := make([]ContainerSummary, 0, len(names))
	for _, name := range names {
		ca := a.containers[name]
		out = append(out, ContainerSummary{
			Name:        name,
			CPUCores:    computeStats(ca.cpuCoresValues),
			MemoryBytes: computeStats(ca.memoryBytesValues),
		})
	}
	return out
}
// SampleCount returns the number of samples added via Add so far.
func (a *Accumulator) SampleCount() int {
	return a.sampleCount
}
// computeStats calculates peak, average, and P95 from a sorted copy of the values
// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values
func computeStats(values []float64) StatSummary {
n := len(values)
if n == 0 {
@ -90,15 +127,21 @@ func computeStats(values []float64) StatSummary {
sum += v
}
p95Index := int(float64(n-1) * 0.95)
return StatSummary{
Peak: sorted[n-1],
P99: sorted[percentileIndex(n, 0.99)],
P95: sorted[percentileIndex(n, 0.95)],
P75: sorted[percentileIndex(n, 0.75)],
P50: sorted[percentileIndex(n, 0.50)],
Avg: sum / float64(n),
P95: sorted[p95Index],
}
}
// percentileIndex returns the slice index for the given percentile
// (0.0-1.0) on a sorted slice of length n, using the truncating
// nearest-rank method int((n-1)*p) that the accumulator tests pin down.
// The result is clamped to [0, n-1] so out-of-range inputs (n <= 0, or
// a percentile outside [0, 1]) can never produce an invalid index.
func percentileIndex(n int, percentile float64) int {
	if n <= 0 {
		return 0
	}
	idx := int(float64(n-1) * percentile)
	if idx < 0 {
		return 0
	}
	if idx > n-1 {
		return n - 1
	}
	return idx
}
// updateProcessPeak merges a process observation into the peak tracking map
func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) {
key := fmt.Sprintf("%d:%s", p.PID, p.Name)

View file

@ -333,3 +333,201 @@ func TestAccumulator_Duration(t *testing.T) {
t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds)
}
}
// TestAccumulator_AllPercentiles feeds the CPU series 1..20 and checks
// peak, p99, p95, p75, p50 and avg against the truncating nearest-rank
// method (index = int((n-1)*p)).
func TestAccumulator_AllPercentiles(t *testing.T) {
	acc := NewAccumulator(5)
	// 20 values: 1, 2, 3, ..., 20
	for i := 1; i <= 20; i++ {
		acc.Add(&metrics.SystemMetrics{
			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
			CPU:       metrics.CPUMetrics{TotalPercent: float64(i)},
			Memory:    metrics.MemoryMetrics{},
		})
	}
	s := acc.Summarize()
	if s == nil {
		t.Fatal("expected non-nil summary")
	}
	// Expected: peak=20; p99 idx=int(19*0.99)=18 -> 19; p95 idx=18 -> 19;
	// p75 idx=int(19*0.75)=14 -> 15; p50 idx=int(19*0.50)=9 -> 10;
	// avg = (1+...+20)/20 = 10.5.
	checks := []struct {
		got, want float64
		format    string
	}{
		{s.CPUTotal.Peak, 20, "CPU peak: got %f, want 20"},
		{s.CPUTotal.P99, 19, "CPU p99: got %f, want 19"},
		{s.CPUTotal.P95, 19, "CPU p95: got %f, want 19"},
		{s.CPUTotal.P75, 15, "CPU p75: got %f, want 15"},
		{s.CPUTotal.P50, 10, "CPU p50: got %f, want 10"},
		{s.CPUTotal.Avg, 10.5, "CPU avg: got %f, want 10.5"},
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf(c.format, c.got)
		}
	}
}
// TestAccumulator_ContainerMetrics adds five samples for two containers
// and verifies the per-container stats come back sorted by name with the
// correct peaks and averages.
func TestAccumulator_ContainerMetrics(t *testing.T) {
	acc := NewAccumulator(5)
	// cg builds one container's metrics for a sample.
	cg := func(name string, cores float64, rss uint64) *metrics.CgroupMetrics {
		return &metrics.CgroupMetrics{
			Name:   name,
			CPU:    metrics.CgroupCPUMetrics{UsedCores: cores},
			Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: rss},
		}
	}
	// Add samples with container metrics
	for i := 1; i <= 5; i++ {
		acc.Add(&metrics.SystemMetrics{
			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
			CPU:       metrics.CPUMetrics{TotalPercent: float64(i * 10)},
			Memory:    metrics.MemoryMetrics{},
			Cgroups: map[string]*metrics.CgroupMetrics{
				"container-a": cg("container-a", float64(i), uint64(i*1000)),
				"container-b": cg("container-b", float64(i*2), uint64(i*2000)),
			},
		})
	}
	s := acc.Summarize()
	if s == nil {
		t.Fatal("expected non-nil summary")
	}
	// Should have 2 containers
	if len(s.Containers) != 2 {
		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
	}
	// Containers should be sorted by name
	if s.Containers[0].Name != "container-a" {
		t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name)
	}
	if s.Containers[1].Name != "container-b" {
		t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name)
	}
	containerA, containerB := s.Containers[0], s.Containers[1]
	// A: cores [1..5] peak=5 avg=3, bytes [1000..5000] peak=5000 avg=3000.
	// B: cores [2,4,6,8,10] peak=10 avg=6.
	checks := []struct {
		got, want float64
		format    string
	}{
		{containerA.CPUCores.Peak, 5, "container-a CPUCores.Peak: got %f, want 5"},
		{containerA.CPUCores.Avg, 3, "container-a CPUCores.Avg: got %f, want 3"},
		{containerA.MemoryBytes.Peak, 5000, "container-a MemoryBytes.Peak: got %f, want 5000"},
		{containerA.MemoryBytes.Avg, 3000, "container-a MemoryBytes.Avg: got %f, want 3000"},
		{containerB.CPUCores.Peak, 10, "container-b CPUCores.Peak: got %f, want 10"},
		{containerB.CPUCores.Avg, 6, "container-b CPUCores.Avg: got %f, want 6"},
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf(c.format, c.got)
		}
	}
}
// TestAccumulator_ContainerMetrics_NoContainers checks that a sample
// with a nil Cgroups map yields an empty Containers slice.
func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) {
	acc := NewAccumulator(5)
	sample := &metrics.SystemMetrics{
		Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC),
		CPU:       metrics.CPUMetrics{TotalPercent: 50},
		Memory:    metrics.MemoryMetrics{},
		Cgroups:   nil, // No containers
	}
	acc.Add(sample)
	s := acc.Summarize()
	switch {
	case s == nil:
		t.Fatal("expected non-nil summary")
	case len(s.Containers) != 0:
		t.Errorf("Containers length: got %d, want 0", len(s.Containers))
	}
}
// TestAccumulator_ContainerMetrics_PartialSamples verifies that a
// container appearing in only some samples is summarized over just the
// samples it appeared in.
func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) {
	acc := NewAccumulator(5)
	// cg builds one container's metrics for a sample.
	cg := func(name string, cores float64, rss uint64) *metrics.CgroupMetrics {
		return &metrics.CgroupMetrics{
			Name:   name,
			CPU:    metrics.CgroupCPUMetrics{UsedCores: cores},
			Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: rss},
		}
	}
	// First sample: only container-a
	acc.Add(&metrics.SystemMetrics{
		Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC),
		CPU:       metrics.CPUMetrics{},
		Memory:    metrics.MemoryMetrics{},
		Cgroups: map[string]*metrics.CgroupMetrics{
			"container-a": cg("container-a", 1, 1000),
		},
	})
	// Second sample: both containers
	acc.Add(&metrics.SystemMetrics{
		Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC),
		CPU:       metrics.CPUMetrics{},
		Memory:    metrics.MemoryMetrics{},
		Cgroups: map[string]*metrics.CgroupMetrics{
			"container-a": cg("container-a", 2, 2000),
			"container-b": cg("container-b", 5, 5000),
		},
	})
	s := acc.Summarize()
	if s == nil {
		t.Fatal("expected non-nil summary")
	}
	// Should have 2 containers
	if len(s.Containers) != 2 {
		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
	}
	checks := []struct {
		got, want float64
		format    string
	}{
		// Container A: 2 samples [1,2]
		{s.Containers[0].CPUCores.Peak, 2, "container-a CPUCores.Peak: got %f, want 2"},
		{s.Containers[0].CPUCores.Avg, 1.5, "container-a CPUCores.Avg: got %f, want 1.5"},
		// Container B: 1 sample [5]
		{s.Containers[1].CPUCores.Peak, 5, "container-b CPUCores.Peak: got %f, want 5"},
		{s.Containers[1].CPUCores.Avg, 5, "container-b CPUCores.Avg: got %f, want 5"},
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf(c.format, c.got)
		}
	}
}

View file

@ -4,11 +4,14 @@ package summary
import "time"
// StatSummary holds peak, average, and P95 for a metric across the run
// StatSummary holds peak, percentiles, and average for a metric across the run
type StatSummary struct {
Peak float64 `json:"peak"`
Avg float64 `json:"avg"`
P99 float64 `json:"p99"`
P95 float64 `json:"p95"`
P75 float64 `json:"p75"`
P50 float64 `json:"p50"`
Avg float64 `json:"avg"`
}
// ProcessPeak holds the peak CPU and memory observed for a single process
@ -19,15 +22,23 @@ type ProcessPeak struct {
PeakMem uint64 `json:"peak_mem_rss_bytes"`
}
// ContainerSummary holds statistics for a single container across the run.
// Built by Accumulator from per-sample cgroup metrics; presumably Name is
// the cgroup key used when collecting — confirm against the collector.
type ContainerSummary struct {
	Name        string      `json:"name"`         // container name (key of the Cgroups map)
	CPUCores    StatSummary `json:"cpu_cores"`    // CPU usage stats, in cores
	MemoryBytes StatSummary `json:"memory_bytes"` // RSS memory stats, in bytes
}
// RunSummary holds the complete summary of a collection run, sent to the
// receiver as JSON.
// NOTE: the diff rendering had interleaved the pre-change field lines
// here, duplicating every field; this is the intended post-change struct
// with each field declared exactly once, including the new Containers.
type RunSummary struct {
	StartTime       time.Time          `json:"start_time"`
	EndTime         time.Time          `json:"end_time"`
	DurationSeconds float64            `json:"duration_seconds"`
	SampleCount     int                `json:"sample_count"`
	CPUTotal        StatSummary        `json:"cpu_total_percent"`
	MemUsedBytes    StatSummary        `json:"mem_used_bytes"`
	MemUsedPercent  StatSummary        `json:"mem_used_percent"`
	TopCPUProcesses []ProcessPeak      `json:"top_cpu_processes"`
	TopMemProcesses []ProcessPeak      `json:"top_mem_processes"`
	Containers      []ContainerSummary `json:"containers"`
}