feat(summary): add per-container metrics with extended percentiles
All checks were successful
ci / build (push) Successful in 34s
- Extend StatSummary with p99, p75, p50 percentiles (in addition to peak, p95, avg)
- Add ContainerSummary type for per-container CPU cores and memory bytes stats
- Track container metrics from Cgroups map in Accumulator
- Include containers array in RunSummary sent to receiver

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent 5e470c33a5
commit 6770cfcea7

3 changed files with 267 additions and 15 deletions
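For context, this is roughly how the extended API fits together (a minimal sketch, not part of the commit; it assumes the accumulator lives in an internal/summary package alongside the internal/metrics package shown below, and the sample values are invented):

// Sketch only: wiring the new per-container summaries end to end.
// Assumes a cmd/ package inside the same module; import paths and
// sample values are illustrative, not part of this commit.
package main

import (
    "fmt"
    "time"

    "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics"
    "edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/summary"
)

func main() {
    acc := summary.NewAccumulator(5) // track top-5 process peaks

    // Feed two fake samples, each carrying one container's cgroup reading.
    for i := 1; i <= 2; i++ {
        acc.Add(&metrics.SystemMetrics{
            Timestamp: time.Now(),
            CPU:       metrics.CPUMetrics{TotalPercent: float64(i * 10)},
            Cgroups: map[string]*metrics.CgroupMetrics{
                "job": {
                    Name:   "job",
                    CPU:    metrics.CgroupCPUMetrics{UsedCores: float64(i)},
                    Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: uint64(i) * 1024 * 1024},
                },
            },
        })
    }

    // Summarize returns nil when no samples were added.
    if s := acc.Summarize(); s != nil {
        for _, c := range s.Containers { // sorted by name
            fmt.Printf("%s: peak %.2f cores, p95 %.2f cores, peak %.0f MiB\n",
                c.Name, c.CPUCores.Peak, c.CPUCores.P95, c.MemoryBytes.Peak/(1024*1024))
        }
    }
}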
@@ -10,6 +10,12 @@ import (
 	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/metrics"
 )
 
+// containerAccumulator tracks metrics for a single container
+type containerAccumulator struct {
+	cpuCoresValues    []float64
+	memoryBytesValues []float64
+}
+
 // Accumulator collects metric samples and computes run-level statistics
 type Accumulator struct {
 	topN int
@@ -17,6 +23,7 @@ type Accumulator struct {
 	memBytesValues []float64
 	memPctValues   []float64
 	processPeaks   map[string]*ProcessPeak
+	containers     map[string]*containerAccumulator
 	startTime      time.Time
 	endTime        time.Time
 	sampleCount    int
@@ -27,6 +34,7 @@ func NewAccumulator(topN int) *Accumulator {
 	return &Accumulator{
 		topN:         topN,
 		processPeaks: make(map[string]*ProcessPeak),
+		containers:   make(map[string]*containerAccumulator),
 	}
 }
 
@@ -48,6 +56,17 @@ func (a *Accumulator) Add(m *metrics.SystemMetrics) {
 	for _, p := range m.TopMemory {
 		a.updateProcessPeak(p)
 	}
+
+	// Track per-container metrics
+	for name, cgroup := range m.Cgroups {
+		ca, ok := a.containers[name]
+		if !ok {
+			ca = &containerAccumulator{}
+			a.containers[name] = ca
+		}
+		ca.cpuCoresValues = append(ca.cpuCoresValues, cgroup.CPU.UsedCores)
+		ca.memoryBytesValues = append(ca.memoryBytesValues, float64(cgroup.Memory.TotalRSSBytes))
+	}
 }
 
 // Summarize computes and returns the run summary, or nil if no samples were added
@@ -66,15 +85,33 @@ func (a *Accumulator) Summarize() *RunSummary {
 		MemUsedPercent:  computeStats(a.memPctValues),
 		TopCPUProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return p.PeakCPU }),
 		TopMemProcesses: a.topProcesses(func(p *ProcessPeak) float64 { return float64(p.PeakMem) }),
+		Containers:      a.containerSummaries(),
 	}
 }
 
+// containerSummaries computes summaries for all tracked containers
+func (a *Accumulator) containerSummaries() []ContainerSummary {
+	summaries := make([]ContainerSummary, 0, len(a.containers))
+	for name, ca := range a.containers {
+		summaries = append(summaries, ContainerSummary{
+			Name:        name,
+			CPUCores:    computeStats(ca.cpuCoresValues),
+			MemoryBytes: computeStats(ca.memoryBytesValues),
+		})
+	}
+	// Sort by name for consistent output
+	sort.Slice(summaries, func(i, j int) bool {
+		return summaries[i].Name < summaries[j].Name
+	})
+	return summaries
+}
+
 // SampleCount returns the number of samples added
 func (a *Accumulator) SampleCount() int {
 	return a.sampleCount
 }
 
-// computeStats calculates peak, average, and P95 from a sorted copy of the values
+// computeStats calculates peak, percentiles (p99, p95, p75, p50), and average from a sorted copy of the values
 func computeStats(values []float64) StatSummary {
 	n := len(values)
 	if n == 0 {
@@ -90,15 +127,21 @@ func computeStats(values []float64) StatSummary {
 		sum += v
 	}
 
-	p95Index := int(float64(n-1) * 0.95)
-
 	return StatSummary{
 		Peak: sorted[n-1],
+		P99:  sorted[percentileIndex(n, 0.99)],
+		P95:  sorted[percentileIndex(n, 0.95)],
+		P75:  sorted[percentileIndex(n, 0.75)],
+		P50:  sorted[percentileIndex(n, 0.50)],
 		Avg:  sum / float64(n),
-		P95:  sorted[p95Index],
 	}
 }
 
+// percentileIndex returns the index for the given percentile (0.0-1.0)
+func percentileIndex(n int, percentile float64) int {
+	return int(float64(n-1) * percentile)
+}
+
 // updateProcessPeak merges a process observation into the peak tracking map
 func (a *Accumulator) updateProcessPeak(p metrics.ProcessMetrics) {
 	key := fmt.Sprintf("%d:%s", p.PID, p.Name)
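For intuition, percentileIndex implements a lower nearest-rank pick: it floors (n-1)*p and indexes the sorted slice, so it never interpolates upward. A standalone sketch of that arithmetic (illustrative, not code from this commit):

package main

import "fmt"

func main() {
    // Same truncating rule as percentileIndex above: int() floors the
    // non-negative product, selecting a lower nearest-rank index.
    n := 20 // pretend the sorted samples are 1..20
    for _, p := range []float64{0.99, 0.95, 0.75, 0.50} {
        idx := int(float64(n-1) * p)
        fmt.Printf("p%.0f -> index %d -> value %d\n", 100*p, idx, idx+1)
    }
}

Running it reproduces the expectations asserted in the tests below: for n=20, p99 and p95 both land on index 18 (value 19), p75 on index 14 (value 15), and p50 on index 9 (value 10).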
@@ -333,3 +333,201 @@ func TestAccumulator_Duration(t *testing.T) {
 		t.Errorf("DurationSeconds: got %f, want 60", s.DurationSeconds)
 	}
 }
+
+func TestAccumulator_AllPercentiles(t *testing.T) {
+	acc := NewAccumulator(5)
+	// 20 values: 1, 2, 3, ..., 20
+	for i := 1; i <= 20; i++ {
+		acc.Add(&metrics.SystemMetrics{
+			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
+			CPU:       metrics.CPUMetrics{TotalPercent: float64(i)},
+			Memory:    metrics.MemoryMetrics{},
+		})
+	}
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Peak = 20
+	if s.CPUTotal.Peak != 20 {
+		t.Errorf("CPU peak: got %f, want 20", s.CPUTotal.Peak)
+	}
+	// P99: index=int(19*0.99)=int(18.81)=18, value=19
+	if s.CPUTotal.P99 != 19 {
+		t.Errorf("CPU p99: got %f, want 19", s.CPUTotal.P99)
+	}
+	// P95: index=int(19*0.95)=int(18.05)=18, value=19
+	if s.CPUTotal.P95 != 19 {
+		t.Errorf("CPU p95: got %f, want 19", s.CPUTotal.P95)
+	}
+	// P75: index=int(19*0.75)=int(14.25)=14, value=15
+	if s.CPUTotal.P75 != 15 {
+		t.Errorf("CPU p75: got %f, want 15", s.CPUTotal.P75)
+	}
+	// P50: index=int(19*0.50)=int(9.5)=9, value=10
+	if s.CPUTotal.P50 != 10 {
+		t.Errorf("CPU p50: got %f, want 10", s.CPUTotal.P50)
+	}
+	// Avg = (1+2+...+20)/20 = 210/20 = 10.5
+	if s.CPUTotal.Avg != 10.5 {
+		t.Errorf("CPU avg: got %f, want 10.5", s.CPUTotal.Avg)
+	}
+}
+
+func TestAccumulator_ContainerMetrics(t *testing.T) {
+	acc := NewAccumulator(5)
+
+	// Add samples with container metrics
+	for i := 1; i <= 5; i++ {
+		acc.Add(&metrics.SystemMetrics{
+			Timestamp: time.Date(2025, 1, 1, 0, 0, i, 0, time.UTC),
+			CPU:       metrics.CPUMetrics{TotalPercent: float64(i * 10)},
+			Memory:    metrics.MemoryMetrics{},
+			Cgroups: map[string]*metrics.CgroupMetrics{
+				"container-a": {
+					Name: "container-a",
+					CPU:  metrics.CgroupCPUMetrics{UsedCores: float64(i)},
+					Memory: metrics.CgroupMemoryMetrics{
+						TotalRSSBytes: uint64(i * 1000),
+					},
+				},
+				"container-b": {
+					Name: "container-b",
+					CPU:  metrics.CgroupCPUMetrics{UsedCores: float64(i * 2)},
+					Memory: metrics.CgroupMemoryMetrics{
+						TotalRSSBytes: uint64(i * 2000),
+					},
+				},
+			},
+		})
+	}
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Should have 2 containers
+	if len(s.Containers) != 2 {
+		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
+	}
+
+	// Containers should be sorted by name
+	if s.Containers[0].Name != "container-a" {
+		t.Errorf("Containers[0].Name: got %s, want container-a", s.Containers[0].Name)
+	}
+	if s.Containers[1].Name != "container-b" {
+		t.Errorf("Containers[1].Name: got %s, want container-b", s.Containers[1].Name)
+	}
+
+	// Container A: CPU cores [1,2,3,4,5], peak=5, avg=3
+	containerA := s.Containers[0]
+	if containerA.CPUCores.Peak != 5 {
+		t.Errorf("container-a CPUCores.Peak: got %f, want 5", containerA.CPUCores.Peak)
+	}
+	if containerA.CPUCores.Avg != 3 {
+		t.Errorf("container-a CPUCores.Avg: got %f, want 3", containerA.CPUCores.Avg)
+	}
+	// Memory bytes [1000,2000,3000,4000,5000], peak=5000, avg=3000
+	if containerA.MemoryBytes.Peak != 5000 {
+		t.Errorf("container-a MemoryBytes.Peak: got %f, want 5000", containerA.MemoryBytes.Peak)
+	}
+	if containerA.MemoryBytes.Avg != 3000 {
+		t.Errorf("container-a MemoryBytes.Avg: got %f, want 3000", containerA.MemoryBytes.Avg)
+	}
+
+	// Container B: CPU cores [2,4,6,8,10], peak=10, avg=6
+	containerB := s.Containers[1]
+	if containerB.CPUCores.Peak != 10 {
+		t.Errorf("container-b CPUCores.Peak: got %f, want 10", containerB.CPUCores.Peak)
+	}
+	if containerB.CPUCores.Avg != 6 {
+		t.Errorf("container-b CPUCores.Avg: got %f, want 6", containerB.CPUCores.Avg)
+	}
+}
+
+func TestAccumulator_ContainerMetrics_NoContainers(t *testing.T) {
+	acc := NewAccumulator(5)
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{TotalPercent: 50},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups:   nil, // No containers
+	})
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	if len(s.Containers) != 0 {
+		t.Errorf("Containers length: got %d, want 0", len(s.Containers))
+	}
+}
+
+func TestAccumulator_ContainerMetrics_PartialSamples(t *testing.T) {
+	acc := NewAccumulator(5)
+
+	// First sample: only container-a
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 1, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups: map[string]*metrics.CgroupMetrics{
+			"container-a": {
+				Name:   "container-a",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 1},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 1000},
+			},
+		},
+	})
+
+	// Second sample: both containers
+	acc.Add(&metrics.SystemMetrics{
+		Timestamp: time.Date(2025, 1, 1, 0, 0, 2, 0, time.UTC),
+		CPU:       metrics.CPUMetrics{},
+		Memory:    metrics.MemoryMetrics{},
+		Cgroups: map[string]*metrics.CgroupMetrics{
+			"container-a": {
+				Name:   "container-a",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 2},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 2000},
+			},
+			"container-b": {
+				Name:   "container-b",
+				CPU:    metrics.CgroupCPUMetrics{UsedCores: 5},
+				Memory: metrics.CgroupMemoryMetrics{TotalRSSBytes: 5000},
+			},
+		},
+	})
+
+	s := acc.Summarize()
+	if s == nil {
+		t.Fatal("expected non-nil summary")
+	}
+
+	// Should have 2 containers
+	if len(s.Containers) != 2 {
+		t.Fatalf("Containers length: got %d, want 2", len(s.Containers))
+	}
+
+	// Container A: 2 samples [1,2]
+	containerA := s.Containers[0]
+	if containerA.CPUCores.Peak != 2 {
+		t.Errorf("container-a CPUCores.Peak: got %f, want 2", containerA.CPUCores.Peak)
+	}
+	if containerA.CPUCores.Avg != 1.5 {
+		t.Errorf("container-a CPUCores.Avg: got %f, want 1.5", containerA.CPUCores.Avg)
+	}
+
+	// Container B: 1 sample [5]
+	containerB := s.Containers[1]
+	if containerB.CPUCores.Peak != 5 {
+		t.Errorf("container-b CPUCores.Peak: got %f, want 5", containerB.CPUCores.Peak)
+	}
+	if containerB.CPUCores.Avg != 5 {
+		t.Errorf("container-b CPUCores.Avg: got %f, want 5", containerB.CPUCores.Avg)
+	}
+}
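One consequence the percentile tests pin down: with the truncating index rule, p50 over the 20-sample series selects sorted[9] = 10, slightly below the interpolated median of (10+11)/2 = 10.5, so consumers comparing against a classic median definition should expect the lower-rounded value.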
@@ -4,11 +4,14 @@ package summary
 
 import "time"
 
-// StatSummary holds peak, average, and P95 for a metric across the run
+// StatSummary holds peak, percentiles, and average for a metric across the run
 type StatSummary struct {
 	Peak float64 `json:"peak"`
-	Avg  float64 `json:"avg"`
+	P99  float64 `json:"p99"`
 	P95  float64 `json:"p95"`
+	P75  float64 `json:"p75"`
+	P50  float64 `json:"p50"`
+	Avg  float64 `json:"avg"`
 }
 
 // ProcessPeak holds the peak CPU and memory observed for a single process

@@ -19,15 +22,23 @@ type ProcessPeak struct {
 	PeakMem uint64 `json:"peak_mem_rss_bytes"`
 }
 
+// ContainerSummary holds statistics for a single container across the run
+type ContainerSummary struct {
+	Name        string      `json:"name"`
+	CPUCores    StatSummary `json:"cpu_cores"`
+	MemoryBytes StatSummary `json:"memory_bytes"`
+}
+
 // RunSummary holds the complete summary of a collection run
 type RunSummary struct {
-	StartTime       time.Time     `json:"start_time"`
-	EndTime         time.Time     `json:"end_time"`
-	DurationSeconds float64       `json:"duration_seconds"`
-	SampleCount     int           `json:"sample_count"`
-	CPUTotal        StatSummary   `json:"cpu_total_percent"`
-	MemUsedBytes    StatSummary   `json:"mem_used_bytes"`
-	MemUsedPercent  StatSummary   `json:"mem_used_percent"`
-	TopCPUProcesses []ProcessPeak `json:"top_cpu_processes"`
-	TopMemProcesses []ProcessPeak `json:"top_mem_processes"`
+	StartTime       time.Time          `json:"start_time"`
+	EndTime         time.Time          `json:"end_time"`
+	DurationSeconds float64            `json:"duration_seconds"`
+	SampleCount     int                `json:"sample_count"`
+	CPUTotal        StatSummary        `json:"cpu_total_percent"`
+	MemUsedBytes    StatSummary        `json:"mem_used_bytes"`
+	MemUsedPercent  StatSummary        `json:"mem_used_percent"`
+	TopCPUProcesses []ProcessPeak      `json:"top_cpu_processes"`
+	TopMemProcesses []ProcessPeak      `json:"top_mem_processes"`
+	Containers      []ContainerSummary `json:"containers"`
 }
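For receivers, the JSON shape implied by these tags would look roughly like this hand-written example, which mirrors the TestAccumulator_ContainerMetrics scenario above (assumes the top-process slices marshal as empty arrays; the zero memory stats come from the empty MemoryMetrics in that test):

{
  "start_time": "2025-01-01T00:00:01Z",
  "end_time": "2025-01-01T00:00:05Z",
  "duration_seconds": 4,
  "sample_count": 5,
  "cpu_total_percent": { "peak": 50, "p99": 40, "p95": 40, "p75": 40, "p50": 30, "avg": 30 },
  "mem_used_bytes": { "peak": 0, "p99": 0, "p95": 0, "p75": 0, "p50": 0, "avg": 0 },
  "mem_used_percent": { "peak": 0, "p99": 0, "p95": 0, "p75": 0, "p50": 0, "avg": 0 },
  "top_cpu_processes": [],
  "top_mem_processes": [],
  "containers": [
    {
      "name": "container-a",
      "cpu_cores": { "peak": 5, "p99": 4, "p95": 4, "p75": 4, "p50": 3, "avg": 3 },
      "memory_bytes": { "peak": 5000, "p99": 4000, "p95": 4000, "p75": 4000, "p50": 3000, "avg": 3000 }
    },
    {
      "name": "container-b",
      "cpu_cores": { "peak": 10, "p99": 8, "p95": 8, "p75": 8, "p50": 6, "avg": 6 },
      "memory_bytes": { "peak": 10000, "p99": 8000, "p95": 8000, "p75": 8000, "p50": 6000, "avg": 6000 }
    }
  ]
}

Note that with only five samples per container, p99, p95, and p75 all truncate to index 3, so they report the second-highest value rather than the peak.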