feat(collector): group CPU and memory metrics by cgroup
All checks were successful
ci / build (push) Successful in 30s
Add cgroup-based process grouping to the resource collector. Processes are
grouped by their cgroup path, with container names resolved via configurable
process-to-container mapping.

New features:
- Read cgroup info from /proc/[pid]/cgroup (supports v1 and v2)
- Parse K8s resource notation (500m, 1Gi, etc.) for CPU/memory limits
- Group metrics by container using CGROUP_PROCESS_MAP env var
- Calculate usage percentages against limits from CGROUP_LIMITS env var
- Output cgroup metrics with CPU cores used, memory RSS, and percentages

Environment variables:
- CGROUP_PROCESS_MAP: Map process names to container names for discovery
- CGROUP_LIMITS: Define CPU/memory limits per container in K8s notation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent 0bf7dfee38
commit 5e470c33a5
14 changed files with 1534 additions and 17 deletions
identify-process-cgroup-by-pid.md (new file, 62 lines)
@@ -0,0 +1,62 @@
# Identify Process Cgroup by PID

## TL;DR

You can identify a process's cgroup by reading `/proc/<PID>/cgroup`.

## Outcome

Multiple methods to find which cgroup a process belongs to using its PID.

## Conversation

**Question:** Can I identify the process's cgroup by having its PID?

**Answer:** Yes, here are the main methods:

### Method 1: Read `/proc/<PID>/cgroup`

```bash
cat /proc/<PID>/cgroup
```

This shows all cgroup controllers the process belongs to. For example:

```
12:blkio:/user.slice
11:memory:/user.slice/user-1000.slice
...
0::/user.slice/user-1000.slice/session-1.scope
```

### Method 2: Using `ps` with format options

```bash
ps -o pid,cgroup -p <PID>
```

### Method 3: Using `systemd-cgls`

If you're on a systemd system:

```bash
systemd-cgls --unit <unit-name>
# or to see the whole tree
systemd-cgls
```

### Method 4: Check cgroup v2 unified hierarchy

On cgroup v2 systems, check:

```bash
cat /proc/<PID>/cgroup
# Output will look like: 0::/user.slice/user-1000.slice/session-1.scope
```

The path after `::` is the cgroup path under `/sys/fs/cgroup/`.

### Quick one-liner

```bash
cat /proc/self/cgroup   # For current shell
cat /proc/$$/cgroup     # Also current shell
cat /proc/1234/cgroup   # For PID 1234
```
internal/cgroup/config.go (new file, 84 lines)
@@ -0,0 +1,84 @@
// ABOUTME: Configuration types and parsing for cgroup limits and process mapping.
// ABOUTME: Parses CGROUP_LIMITS and CGROUP_PROCESS_MAP environment variables.
package cgroup

import (
	"encoding/json"
	"fmt"
	"os"
)

// CgroupLimit holds the resource limits for a container/cgroup
type CgroupLimit struct {
	CPUCores    float64 // CPU limit in cores (e.g., 0.5 for "500m", 2.0 for "2")
	MemoryBytes uint64  // Memory limit in bytes
}

// CgroupLimits maps container names to their resource limits
type CgroupLimits map[string]CgroupLimit

// ProcessMapping maps process names to container names (for cgroup path discovery)
type ProcessMapping map[string]string

// CgroupPathMapping maps cgroup paths to container names (built at runtime)
type CgroupPathMapping map[string]string

// rawLimitEntry is the JSON structure for each entry in CGROUP_LIMITS
type rawLimitEntry struct {
	CPU    string `json:"cpu"`
	Memory string `json:"memory"`
}

// ParseCgroupLimitsEnv parses the CGROUP_LIMITS environment variable.
// Expected format: {"container-name": {"cpu": "500m", "memory": "1Gi"}, ...}
func ParseCgroupLimitsEnv() (CgroupLimits, error) {
	raw := os.Getenv("CGROUP_LIMITS")
	if raw == "" {
		return nil, nil // No limits configured
	}

	var parsed map[string]rawLimitEntry
	if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
		return nil, fmt.Errorf("parsing CGROUP_LIMITS: %w", err)
	}

	limits := make(CgroupLimits)
	for name, entry := range parsed {
		var limit CgroupLimit
		var err error

		if entry.CPU != "" {
			limit.CPUCores, err = ParseCPU(entry.CPU)
			if err != nil {
				return nil, fmt.Errorf("parsing CPU for %q: %w", name, err)
			}
		}

		if entry.Memory != "" {
			limit.MemoryBytes, err = ParseMemory(entry.Memory)
			if err != nil {
				return nil, fmt.Errorf("parsing memory for %q: %w", name, err)
			}
		}

		limits[name] = limit
	}

	return limits, nil
}

// ParseProcessMappingEnv parses the CGROUP_PROCESS_MAP environment variable.
// Expected format: {"process-name": "container-name", ...}
func ParseProcessMappingEnv() (ProcessMapping, error) {
	raw := os.Getenv("CGROUP_PROCESS_MAP")
	if raw == "" {
		return nil, nil // No mapping configured
	}

	var parsed map[string]string
	if err := json.Unmarshal([]byte(raw), &parsed); err != nil {
		return nil, fmt.Errorf("parsing CGROUP_PROCESS_MAP: %w", err)
	}

	return ProcessMapping(parsed), nil
}
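For reference, a minimal usage sketch of the two parsers above (not part of this commit; it assumes code running inside this module, since the package is `internal`, and the container names and quantities are illustrative):

```go
package main

import (
	"fmt"
	"os"

	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup"
)

func main() {
	// Illustrative values in the documented JSON formats.
	os.Setenv("CGROUP_PROCESS_MAP", `{"cat":"runner","sleep":"sidecar"}`)
	os.Setenv("CGROUP_LIMITS", `{"runner":{"cpu":"500m","memory":"256Mi"}}`)

	limits, err := cgroup.ParseCgroupLimitsEnv()
	if err != nil {
		panic(err)
	}
	mapping, _ := cgroup.ParseProcessMappingEnv()

	fmt.Println(limits["runner"].CPUCores)    // 0.5
	fmt.Println(limits["runner"].MemoryBytes) // 268435456 (256Mi)
	fmt.Println(mapping["cat"])               // runner
}
```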
internal/cgroup/parse.go (new file, 96 lines)
@@ -0,0 +1,96 @@
// ABOUTME: Parses Kubernetes-style resource notation for CPU and memory.
// ABOUTME: CPU: "500m" = 0.5 cores, "2" = 2 cores.
// ABOUTME: Memory: "1Gi" = 1 GiB, "512Mi" = 512 MiB, "1G" = 1 GB.
package cgroup

import (
	"fmt"
	"strconv"
	"strings"
)

// ParseCPU parses Kubernetes CPU notation to cores.
// Examples: "500m" => 0.5, "2" => 2.0, "100m" => 0.1, "2000m" => 2.0
func ParseCPU(value string) (float64, error) {
	value = strings.TrimSpace(value)
	if value == "" {
		return 0, fmt.Errorf("empty CPU value")
	}

	// Handle millicores suffix
	if strings.HasSuffix(value, "m") {
		millis, err := strconv.ParseFloat(strings.TrimSuffix(value, "m"), 64)
		if err != nil {
			return 0, fmt.Errorf("parsing millicores: %w", err)
		}
		return millis / 1000.0, nil
	}

	// Plain number means cores
	cores, err := strconv.ParseFloat(value, 64)
	if err != nil {
		return 0, fmt.Errorf("parsing cores: %w", err)
	}

	return cores, nil
}

// ParseMemory parses Kubernetes memory notation to bytes.
// Supports:
// - Binary suffixes: Ki, Mi, Gi, Ti (powers of 1024)
// - Decimal suffixes: K, M, G, T (powers of 1000)
// - Plain numbers: bytes
func ParseMemory(value string) (uint64, error) {
	value = strings.TrimSpace(value)
	if value == "" {
		return 0, fmt.Errorf("empty memory value")
	}

	// Binary suffixes (powers of 1024)
	binarySuffixes := map[string]uint64{
		"Ki": 1024,
		"Mi": 1024 * 1024,
		"Gi": 1024 * 1024 * 1024,
		"Ti": 1024 * 1024 * 1024 * 1024,
	}

	// Decimal suffixes (powers of 1000)
	decimalSuffixes := map[string]uint64{
		"K": 1000,
		"M": 1000 * 1000,
		"G": 1000 * 1000 * 1000,
		"T": 1000 * 1000 * 1000 * 1000,
	}

	// Try binary suffixes first (2-char)
	for suffix, multiplier := range binarySuffixes {
		if strings.HasSuffix(value, suffix) {
			numStr := strings.TrimSuffix(value, suffix)
			num, err := strconv.ParseFloat(numStr, 64)
			if err != nil {
				return 0, fmt.Errorf("parsing memory value: %w", err)
			}
			return uint64(num * float64(multiplier)), nil
		}
	}

	// Try decimal suffixes (1-char)
	for suffix, multiplier := range decimalSuffixes {
		if strings.HasSuffix(value, suffix) {
			numStr := strings.TrimSuffix(value, suffix)
			num, err := strconv.ParseFloat(numStr, 64)
			if err != nil {
				return 0, fmt.Errorf("parsing memory value: %w", err)
			}
			return uint64(num * float64(multiplier)), nil
		}
	}

	// Plain number (bytes)
	bytes, err := strconv.ParseUint(value, 10, 64)
	if err != nil {
		return 0, fmt.Errorf("parsing bytes: %w", err)
	}

	return bytes, nil
}
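A couple of illustrative calls (not part of the commit; the expected values follow directly from the rules above, and the import again assumes code inside this module):

```go
package main

import (
	"fmt"

	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup"
)

func main() {
	cores, _ := cgroup.ParseCPU("500m") // 0.5
	gi, _ := cgroup.ParseMemory("1Gi")  // 1073741824 (1024^3)
	g, _ := cgroup.ParseMemory("1G")    // 1000000000 (1000^3)
	fmt.Println(cores, gi, g)
}
```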
internal/cgroup/parse_test.go (new file, 84 lines)
@@ -0,0 +1,84 @@
package cgroup

import (
	"testing"
)

func TestParseCPU(t *testing.T) {
	tests := []struct {
		name    string
		input   string
		want    float64
		wantErr bool
	}{
		{"millicores 500m", "500m", 0.5, false},
		{"millicores 100m", "100m", 0.1, false},
		{"millicores 2000m", "2000m", 2.0, false},
		{"millicores 1m", "1m", 0.001, false},
		{"cores integer", "2", 2.0, false},
		{"cores decimal", "1.5", 1.5, false},
		{"cores with spaces", " 2 ", 2.0, false},
		{"empty string", "", 0, true},
		{"invalid format", "abc", 0, true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := ParseCPU(tt.input)
			if (err != nil) != tt.wantErr {
				t.Errorf("ParseCPU() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr && got != tt.want {
				t.Errorf("ParseCPU() = %v, want %v", got, tt.want)
			}
		})
	}
}

func TestParseMemory(t *testing.T) {
	tests := []struct {
		name    string
		input   string
		want    uint64
		wantErr bool
	}{
		// Binary suffixes (powers of 1024)
		{"Ki", "1Ki", 1024, false},
		{"Mi", "1Mi", 1024 * 1024, false},
		{"Gi", "1Gi", 1024 * 1024 * 1024, false},
		{"Ti", "1Ti", 1024 * 1024 * 1024 * 1024, false},
		{"512Mi", "512Mi", 512 * 1024 * 1024, false},
		{"2Gi", "2Gi", 2 * 1024 * 1024 * 1024, false},

		// Decimal suffixes (powers of 1000)
		{"K", "1K", 1000, false},
		{"M", "1M", 1000000, false},
		{"G", "1G", 1000000000, false},
		{"T", "1T", 1000000000000, false},

		// Plain bytes
		{"bytes", "1024", 1024, false},
		{"large bytes", "1073741824", 1073741824, false},

		// With spaces
		{"with spaces", " 1Gi ", 1024 * 1024 * 1024, false},

		// Error cases
		{"empty", "", 0, true},
		{"invalid", "abc", 0, true},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := ParseMemory(tt.input)
			if (err != nil) != tt.wantErr {
				t.Errorf("ParseMemory() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr && got != tt.want {
				t.Errorf("ParseMemory() = %v, want %v", got, tt.want)
			}
		})
	}
}
@@ -4,23 +4,37 @@ import (
	"sort"
	"time"

	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/cgroup"
	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/proc"
)

// Aggregator collects and aggregates metrics from processes
type Aggregator struct {
	procPath    string
	topN        int
	prevCPU     *CPUSnapshot
	prevProcCPU map[int]*ProcessCPUSnapshot
	procPath          string
	topN              int
	prevCPU           *CPUSnapshot
	prevProcCPU       map[int]*ProcessCPUSnapshot
	cgroupLimits      cgroup.CgroupLimits      // container name -> limits
	processMapping    cgroup.ProcessMapping    // process name -> container name
	cgroupPathMapping cgroup.CgroupPathMapping // cgroup path -> container name (built at runtime)
	prevCgroupCPU     map[string]uint64        // container name -> previous total ticks
	prevCgroupTime    time.Time                // previous collection time for cgroup CPU calc
}

// NewAggregator creates a new metrics aggregator
func NewAggregator(procPath string, topN int) *Aggregator {
	// Parse cgroup configuration from environment
	limits, _ := cgroup.ParseCgroupLimitsEnv()
	processMap, _ := cgroup.ParseProcessMappingEnv()

	return &Aggregator{
		procPath:    procPath,
		topN:        topN,
		prevProcCPU: make(map[int]*ProcessCPUSnapshot),
		procPath:          procPath,
		topN:              topN,
		prevProcCPU:       make(map[int]*ProcessCPUSnapshot),
		cgroupLimits:      limits,
		processMapping:    processMap,
		cgroupPathMapping: make(cgroup.CgroupPathMapping),
		prevCgroupCPU:     make(map[string]uint64),
	}
}

@@ -77,6 +91,12 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) {
		return float64(p.MemRSS)
	})

	// Discover cgroup path mappings from known processes
	a.discoverCgroupMappings(processes)

	// Calculate per-cgroup metrics
	cgroupMetrics := a.calculateCgroupMetrics(processes, processMetrics, now)

	return &SystemMetrics{
		Timestamp:      now,
		TotalProcesses: len(processes),

@@ -84,6 +104,7 @@ func (a *Aggregator) Collect() (*SystemMetrics, error) {
		Memory:         memMetrics,
		TopCPU:         topCPU,
		TopMemory:      topMemory,
		Cgroups:        cgroupMetrics,
	}, nil
}

@@ -158,6 +179,11 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now
			state = "?"
		}

		cgroupPath := ""
		if p.Cgroup != nil {
			cgroupPath = p.Cgroup.Path
		}

		metrics = append(metrics, ProcessMetrics{
			PID:  pid,
			Name: p.Status.Name,

@@ -166,6 +192,7 @@ func (a *Aggregator) calculateProcessMetrics(processes []*proc.ProcessInfo, now
			MemVirtual: p.Status.VmSize,
			Threads:    p.Status.Threads,
			State:      state,
			CgroupPath: cgroupPath,
		})
	}

@@ -223,3 +250,145 @@ func (a *Aggregator) getTopByMetric(metrics []ProcessMetrics, getValue func(Proc
	return sorted[:n]
}

// discoverCgroupMappings discovers cgroup path to container name mappings
// by looking for processes that match the configured process mapping.
func (a *Aggregator) discoverCgroupMappings(processes []*proc.ProcessInfo) {
	if len(a.processMapping) == 0 {
		return
	}

	for _, p := range processes {
		if p.Cgroup == nil || p.Cgroup.Path == "" {
			continue
		}

		// Check if this process name is in our mapping
		if containerName, ok := a.processMapping[p.Status.Name]; ok {
			// Map this cgroup path to the container name
			a.cgroupPathMapping[p.Cgroup.Path] = containerName
		}
	}
}

// resolveContainerName returns the container name for a cgroup path,
// or the raw path if no mapping exists.
func (a *Aggregator) resolveContainerName(cgroupPath string) string {
	if name, ok := a.cgroupPathMapping[cgroupPath]; ok {
		return name
	}
	// Use raw path as fallback
	if cgroupPath == "" {
		return "<unknown>"
	}
	return cgroupPath
}

// calculateCgroupMetrics computes metrics grouped by container/cgroup.
func (a *Aggregator) calculateCgroupMetrics(
	processes []*proc.ProcessInfo,
	processMetrics []ProcessMetrics,
	now time.Time,
) map[string]*CgroupMetrics {
	// Build lookup from PID to ProcessMetrics
	pmByPID := make(map[int]ProcessMetrics)
	for _, pm := range processMetrics {
		pmByPID[pm.PID] = pm
	}

	// Group processes by container name
	type cgroupData struct {
		cgroupPath string
		procs      []*proc.ProcessInfo
		metrics    []ProcessMetrics
	}
	containerGroups := make(map[string]*cgroupData)

	for _, p := range processes {
		cgroupPath := ""
		if p.Cgroup != nil {
			cgroupPath = p.Cgroup.Path
		}

		containerName := a.resolveContainerName(cgroupPath)

		if _, ok := containerGroups[containerName]; !ok {
			containerGroups[containerName] = &cgroupData{
				cgroupPath: cgroupPath,
			}
		}

		containerGroups[containerName].procs = append(containerGroups[containerName].procs, p)

		if pm, ok := pmByPID[p.Stat.PID]; ok {
			containerGroups[containerName].metrics = append(containerGroups[containerName].metrics, pm)
		}
	}

	// Calculate elapsed time since last collection
	elapsed := 0.0
	if !a.prevCgroupTime.IsZero() {
		elapsed = now.Sub(a.prevCgroupTime).Seconds()
	}
	a.prevCgroupTime = now

	// Calculate metrics for each container
	result := make(map[string]*CgroupMetrics)

	for containerName, data := range containerGroups {
		// Sum CPU ticks (utime + stime only, not cutime/cstime)
		var totalTicks uint64
		var totalRSS uint64

		for _, p := range data.procs {
			totalTicks += p.Stat.TotalTime()
			totalRSS += p.Status.VmRSS
		}

		// Calculate CPU cores used from delta
		usedCores := 0.0
		if prev, ok := a.prevCgroupCPU[containerName]; ok && elapsed > 0 {
			deltaTicks := totalTicks - prev
			// Convert ticks to cores: deltaTicks / (elapsed_seconds * CLK_TCK)
			usedCores = float64(deltaTicks) / (elapsed * float64(proc.DefaultClockTicks))
		}
		a.prevCgroupCPU[containerName] = totalTicks

		// Calculate percentages against limits if available
		cpuPercent := 0.0
		memPercent := 0.0
		var limitCores float64
		var limitBytes uint64

		if limit, ok := a.cgroupLimits[containerName]; ok {
			limitCores = limit.CPUCores
			limitBytes = limit.MemoryBytes
			if limit.CPUCores > 0 {
				cpuPercent = (usedCores / limit.CPUCores) * 100
			}
			if limit.MemoryBytes > 0 {
				memPercent = (float64(totalRSS) / float64(limit.MemoryBytes)) * 100
			}
		}

		result[containerName] = &CgroupMetrics{
			Name:       containerName,
			CgroupPath: data.cgroupPath,
			CPU: CgroupCPUMetrics{
				TotalTicks:  totalTicks,
				UsedCores:   usedCores,
				UsedPercent: cpuPercent,
				LimitCores:  limitCores,
			},
			Memory: CgroupMemoryMetrics{
				TotalRSSBytes: totalRSS,
				UsedPercent:   memPercent,
				LimitBytes:    limitBytes,
			},
			Processes: data.metrics,
			NumProcs:  len(data.procs),
		}
	}

	return result
}
@@ -11,6 +11,7 @@ type ProcessMetrics struct {
	MemVirtual uint64 `json:"mem_virtual_bytes"`
	Threads    int    `json:"threads"`
	State      string `json:"state"`
	CgroupPath string `json:"cgroup_path,omitempty"`
}

// CPUMetrics holds aggregated CPU metrics

@@ -35,12 +36,38 @@ type MemoryMetrics struct {

// SystemMetrics holds a complete snapshot of system metrics
type SystemMetrics struct {
	Timestamp      time.Time        `json:"timestamp"`
	TotalProcesses int              `json:"total_processes"`
	CPU            CPUMetrics       `json:"cpu"`
	Memory         MemoryMetrics    `json:"memory"`
	TopCPU         []ProcessMetrics `json:"top_cpu,omitempty"`
	TopMemory      []ProcessMetrics `json:"top_memory,omitempty"`
	Timestamp      time.Time                 `json:"timestamp"`
	TotalProcesses int                       `json:"total_processes"`
	CPU            CPUMetrics                `json:"cpu"`
	Memory         MemoryMetrics             `json:"memory"`
	TopCPU         []ProcessMetrics          `json:"top_cpu,omitempty"`
	TopMemory      []ProcessMetrics          `json:"top_memory,omitempty"`
	Cgroups        map[string]*CgroupMetrics `json:"cgroups,omitempty"`
}

// CgroupCPUMetrics holds CPU metrics for a single cgroup/container
type CgroupCPUMetrics struct {
	TotalTicks  uint64  `json:"total_ticks"`
	UsedCores   float64 `json:"used_cores"`
	UsedPercent float64 `json:"used_percent,omitempty"`
	LimitCores  float64 `json:"limit_cores,omitempty"`
}

// CgroupMemoryMetrics holds memory metrics for a single cgroup/container
type CgroupMemoryMetrics struct {
	TotalRSSBytes uint64  `json:"total_rss_bytes"`
	UsedPercent   float64 `json:"used_percent,omitempty"`
	LimitBytes    uint64  `json:"limit_bytes,omitempty"`
}

// CgroupMetrics holds all metrics for a single cgroup/container
type CgroupMetrics struct {
	Name       string              `json:"name"`
	CgroupPath string              `json:"cgroup_path"`
	CPU        CgroupCPUMetrics    `json:"cpu"`
	Memory     CgroupMemoryMetrics `json:"memory"`
	Processes  []ProcessMetrics    `json:"processes"`
	NumProcs   int                 `json:"num_processes"`
}

// CPUSnapshot holds CPU timing data for calculating percentages between intervals
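For a rough sense of what one entry of the new `cgroups` map serializes to, here is a small sketch (illustrative values only; it assumes these structs live in a package named `metrics`, as the `metrics.SystemMetrics` reference in the logger hunk below suggests):

```go
package metrics

import (
	"encoding/json"
	"fmt"
)

// exampleCgroupJSON prints the wire format produced by the json tags above.
// The numbers are made up: 0.42 of a 0.5-core limit is 84%, and 128Mi of RSS
// against a 256Mi limit is 50%.
func exampleCgroupJSON() {
	m := CgroupMetrics{
		Name:       "runner",
		CgroupPath: "/kubepods/pod-abc/container-123",
		CPU: CgroupCPUMetrics{
			TotalTicks:  12345,
			UsedCores:   0.42,
			UsedPercent: 84.0,
			LimitCores:  0.5,
		},
		Memory: CgroupMemoryMetrics{
			TotalRSSBytes: 134217728,
			UsedPercent:   50.0,
			LimitBytes:    268435456,
		},
		NumProcs: 3,
	}
	out, _ := json.MarshalIndent(m, "", "  ")
	fmt.Println(string(out))
}
```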
@@ -1,6 +1,7 @@
package output

import (
	"context"
	"io"
	"log/slog"
	"os"

@@ -89,7 +90,8 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error {
		})
	}

	w.logger.Info("metrics_collected",
	// Build base attributes
	attrs := []slog.Attr{
		slog.Time("collection_time", m.Timestamp),
		slog.Int("total_processes", m.TotalProcesses),
		slog.Group("cpu",

@@ -110,7 +112,14 @@ func (w *LoggerWriter) Write(m *metrics.SystemMetrics) error {
		),
		slog.Any("top_cpu", topCPU),
		slog.Any("top_memory", topMem),
	)
	}

	// Add cgroups if present
	if len(m.Cgroups) > 0 {
		attrs = append(attrs, slog.Any("cgroups", m.Cgroups))
	}

	w.logger.LogAttrs(context.Background(), slog.LevelInfo, "metrics_collected", attrs...)

	return nil
}
internal/proc/cgroup.go (new file, 59 lines)
@@ -0,0 +1,59 @@
// ABOUTME: Reads cgroup information from /proc/[pid]/cgroup.
// ABOUTME: Supports both cgroup v1 and v2 formats.
package proc

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// CgroupInfo holds cgroup information for a process
type CgroupInfo struct {
	Path string // The cgroup path (unified for v2, or from memory controller for v1)
}

// ReadCgroup reads /proc/[pid]/cgroup and extracts the cgroup path
func ReadCgroup(procPath string, pid int) (*CgroupInfo, error) {
	path := fmt.Sprintf("%s/%d/cgroup", procPath, pid)

	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening cgroup file: %w", err)
	}
	defer func() { _ = file.Close() }()

	var cgroupPath string
	scanner := bufio.NewScanner(file)

	for scanner.Scan() {
		line := scanner.Text()

		// Try cgroup v2 first (unified hierarchy)
		// Format: 0::/path
		if path, found := strings.CutPrefix(line, "0::"); found {
			cgroupPath = path
			break
		}

		// Fall back to cgroup v1 - look for memory controller
		// Format: X:memory:/path or X:memory,other:/path
		parts := strings.SplitN(line, ":", 3)
		if len(parts) == 3 {
			controllers := parts[1]
			if strings.Contains(controllers, "memory") {
				cgroupPath = parts[2]
				// Don't break - prefer v2 if found later
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("scanning cgroup file: %w", err)
	}

	return &CgroupInfo{
		Path: cgroupPath,
	}, nil
}
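A minimal call site for `ReadCgroup` (not part of the commit; it assumes Linux, the standard `/proc` mount, and code living inside this module since the package is `internal`):

```go
package main

import (
	"fmt"
	"os"

	"edp.buildth.ing/DevFW-CICD/forgejo-runner-resource-collector/internal/proc"
)

func main() {
	// Resolve the cgroup path of this process itself.
	info, err := proc.ReadCgroup("/proc", os.Getpid())
	if err != nil {
		fmt.Fprintln(os.Stderr, "read cgroup:", err)
		os.Exit(1)
	}
	fmt.Println("cgroup path:", info.Path) // e.g. /user.slice/user-1000.slice/session-1.scope
}
```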
internal/proc/cgroup_test.go (new file, 97 lines)
@@ -0,0 +1,97 @@
package proc

import (
	"os"
	"path/filepath"
	"testing"
)

func TestReadCgroup(t *testing.T) {
	tests := []struct {
		name       string
		cgroupFile string
		wantPath   string
		wantErr    bool
	}{
		{
			name: "cgroup v2 unified",
			cgroupFile: `0::/kubepods/pod-abc/container-123
`,
			wantPath: "/kubepods/pod-abc/container-123",
			wantErr:  false,
		},
		{
			name: "cgroup v2 with trailing newline",
			cgroupFile: `0::/docker/abc123def456
`,
			wantPath: "/docker/abc123def456",
			wantErr:  false,
		},
		{
			name: "cgroup v1 multiple controllers",
			cgroupFile: `12:blkio:/user.slice
11:memory:/docker/abc123
10:cpu,cpuacct:/docker/abc123
9:pids:/docker/abc123
`,
			wantPath: "/docker/abc123",
			wantErr:  false,
		},
		{
			name: "cgroup v2 preferred over v1",
			cgroupFile: `11:memory:/docker/old-path
0::/kubepods/new-path
`,
			wantPath: "/kubepods/new-path",
			wantErr:  false,
		},
		{
			name:       "empty file",
			cgroupFile: "",
			wantPath:   "",
			wantErr:    false,
		},
		{
			name: "root cgroup",
			cgroupFile: `0::/
`,
			wantPath: "/",
			wantErr:  false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Create a temp directory structure mimicking /proc
			tmpDir := t.TempDir()
			procDir := filepath.Join(tmpDir, "proc")
			pidDir := filepath.Join(procDir, "1234")

			if err := os.MkdirAll(pidDir, 0755); err != nil {
				t.Fatalf("Failed to create pid dir: %v", err)
			}

			if err := os.WriteFile(filepath.Join(pidDir, "cgroup"), []byte(tt.cgroupFile), 0644); err != nil {
				t.Fatalf("Failed to write cgroup file: %v", err)
			}

			got, err := ReadCgroup(procDir, 1234)
			if (err != nil) != tt.wantErr {
				t.Errorf("ReadCgroup() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr && got.Path != tt.wantPath {
				t.Errorf("ReadCgroup() path = %q, want %q", got.Path, tt.wantPath)
			}
		})
	}
}

func TestReadCgroup_FileNotFound(t *testing.T) {
	tmpDir := t.TempDir()

	_, err := ReadCgroup(tmpDir, 1234)
	if err == nil {
		t.Error("ReadCgroup() expected error for missing file, got nil")
	}
}
@@ -128,13 +128,14 @@ func ReadSystemCPU(procPath string) (user, nice, system, idle, iowait, irq, soft
	return 0, 0, 0, 0, 0, 0, 0, fmt.Errorf("cpu line not found in /proc/stat")
}

// ProcessInfo combines stat and status information for a process
// ProcessInfo combines stat, status, and cgroup information for a process
type ProcessInfo struct {
	Stat   *ProcStat
	Status *ProcStatus
	Cgroup *CgroupInfo
}

// ReadProcess reads both stat and status for a single process
// ReadProcess reads stat, status, and cgroup for a single process
func ReadProcess(procPath string, pid int) (*ProcessInfo, error) {
	stat, err := ReadStat(procPath, pid)
	if err != nil {

@@ -146,9 +147,13 @@ func ReadProcess(procPath string, pid int) (*ProcessInfo, error) {
		return nil, err
	}

	// Read cgroup info (non-fatal if it fails)
	cgroup, _ := ReadCgroup(procPath, pid)

	return &ProcessInfo{
		Stat:   stat,
		Status: status,
		Cgroup: cgroup,
	}, nil
}
proc-stat-in-containers.md (new file, 560 lines)
@@ -0,0 +1,560 @@
# /proc/stat Behavior in Containerized Environments

## TL;DR

`/proc/stat` in containers shows host-level statistics, not container-specific data. To get accurate container CPU metrics when cgroups aren't an option (e.g., sidecars sharing a PID namespace with different cgroups), aggregate `/proc/[pid]/stat` for all visible processes and calculate percentages using cgroup limits from `/sys/fs/cgroup`.

## Outcome

- Explained why `/proc/stat` doesn't provide container-aware metrics
- Provided methods to aggregate process CPU usage from `/proc/[pid]/stat`
- Showed how to determine parent/child process relationships
- Demonstrated converting raw ticks to CPU percentages
- Covered how to read cgroup CPU limits for other containers

## Conversation

### How does /proc/stat behave in containerized environments?

By default, `/proc/stat` inside a container shows **host system statistics**, not container-specific data. This includes:

- **CPU times** (user, system, idle, etc.) - reflects all CPUs on the host
- **Context switches** - host-wide count
- **Boot time** - host's boot time
- **Processes created** - host-wide total

#### Why This Happens

The `/proc` filesystem is mounted from the host kernel. Containers share the host kernel, and `/proc/stat` is a kernel-level interface that doesn't have namespace awareness for most of its metrics.

#### Practical Implications

1. **CPU usage calculations are wrong** - Tools like `top`, `htop`, or custom monitoring that read `/proc/stat` will show host CPU usage, not container usage
2. **Resource limits aren't reflected** - Even if you limit a container to 2 CPUs via cgroups, `/proc/stat` shows all host CPUs
3. **Multi-tenant confusion** - In shared environments, containers see each other's aggregate impact

#### Solutions

| Approach | Description |
|----------|-------------|
| **cgroups** | Read from `/sys/fs/cgroup/cpu/` for container-specific CPU accounting |
| **LXCFS** | A FUSE filesystem that provides container-aware `/proc` files |
| **Container runtimes** | Some runtimes (like Kata) use VMs with isolated kernels |
| **Metrics APIs** | Use Docker/Kubernetes APIs instead of `/proc/stat` |

#### Example: Getting Real Container CPU

```bash
# Instead of /proc/stat, use cgroups v2:
cat /sys/fs/cgroup/cpu.stat

# Or cgroups v1:
cat /sys/fs/cgroup/cpu/cpuacct.usage
```

---

### Can I aggregate process usage to get the usage of the container?

Yes, you can aggregate process usage, but there are caveats.

#### Aggregating from /proc/[pid]/stat

```bash
# Fields 14 (utime) and 15 (stime) in /proc/[pid]/stat
for pid in /proc/[0-9]*; do
  awk '{print $14 + $15}' "$pid/stat" 2>/dev/null
done | awk '{sum += $1} END {print sum}'
```

#### Problems with This Approach

1. **Race conditions** - Processes can spawn/die between reads, causing inconsistent totals
2. **Short-lived processes** - Processes that start and exit between samples are missed entirely
3. **Zombie/exited processes** - Their CPU time may not be captured
4. **Overhead** - Scanning all PIDs repeatedly is expensive
5. **Namespace visibility** - You only see processes in your PID namespace, but that's actually what you want in a container
6. **Children accounting** - When a process exits, its CPU time gets added to the parent's `cutime`/`cstime` fields, which can cause double-counting if not handled carefully

#### Better Alternative: cgroups

Cgroups track CPU usage at the group level and handle all these edge cases:

```bash
# cgroups v2 (modern)
cat /sys/fs/cgroup/cpu.stat
# Shows: usage_usec, user_usec, system_usec

# cgroups v1
cat /sys/fs/cgroup/cpu/cpuacct.usage  # total nanoseconds
cat /sys/fs/cgroup/cpu/cpuacct.stat   # user/system jiffies
```

**However**, cgroups cannot be used when sidecars share the PID namespace but have different cgroups. In that case, aggregating `/proc/[pid]/stat` is the best option.

---

### How do I determine child/parent processes?

#### From /proc/[pid]/stat

Field 4 is the **PPID** (parent process ID):

```bash
# Get PPID for a specific process
awk '{print $4}' /proc/1234/stat

# List all processes with their parents
for stat in /proc/[0-9]*/stat; do
  pid="${stat#/proc/}"
  pid="${pid%/stat}"
  ppid=$(awk '{print $4}' "$stat" 2>/dev/null) && \
    echo "$pid -> parent: $ppid"
done
```

#### From /proc/[pid]/status (more readable)

```bash
grep PPid /proc/1234/status
# PPid: 1
```

#### Building a Process Tree

```bash
#!/bin/bash
declare -A parent_of
declare -A children_of

for stat in /proc/[0-9]*/stat; do
  if read -r line < "$stat" 2>/dev/null; then
    pid="${stat#/proc/}"
    pid="${pid%/stat}"

    # Extract PPID (field 4, but handle comm with spaces)
    rest="${line##*) }"
    read -ra fields <<< "$rest"
    ppid="${fields[1]}"  # 4th field overall = index 1 after state

    parent_of[$pid]=$ppid
    children_of[$ppid]+="$pid "
  fi
done

# Print tree from PID 1
print_tree() {
  local pid=$1
  local indent=$2
  echo "${indent}${pid}"
  for child in ${children_of[$pid]}; do
    print_tree "$child" "  $indent"
  done
}

print_tree 1 ""
```

#### For CPU Aggregation: Handling cutime/cstime

To properly handle `cutime`/`cstime` without double-counting:

```bash
#!/bin/bash
declare -A parent_of
declare -A utime stime

# First pass: collect all data
for stat in /proc/[0-9]*/stat; do
  if read -r line < "$stat" 2>/dev/null; then
    pid="${stat#/proc/}"
    pid="${pid%/stat}"
    rest="${line##*) }"
    read -ra f <<< "$rest"

    parent_of[$pid]="${f[1]}"
    utime[$pid]="${f[11]}"
    stime[$pid]="${f[12]}"
    # cutime=${f[13]} cstime=${f[14]} - don't sum these
  fi
done

# Sum only utime/stime (not cutime/cstime)
total=0
for pid in "${!utime[@]}"; do
  ((total += utime[$pid] + stime[$pid]))
done

echo "Total CPU ticks: $total"
echo "Seconds: $(echo "scale=2; $total / $(getconf CLK_TCK)" | bc)"
```

**Key insight:** Only sum `utime` + `stime` for each process. The `cutime`/`cstime` fields are cumulative from children that have already exited and been `wait()`ed on - those children no longer exist in `/proc`, so their time is only accessible via the parent's `cutime`/`cstime`.

---

### How do I convert utime/stime to percentages?

You need **two samples** over a time interval. CPU percentage is a rate, not an absolute value.

#### The Formula

```
CPU % = (delta_ticks / (elapsed_seconds * CLK_TCK * num_cpus)) * 100
```

- `delta_ticks` = difference in (utime + stime) between samples
- `CLK_TCK` = ticks per second (usually 100, get via `getconf CLK_TCK`)
- `num_cpus` = number of CPUs (omit for single-CPU percentage)
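As an aside, the same formula sketched in Go (an illustrative helper, not from this repo; the collector's aggregator applies the same idea when it converts tick deltas to cores):

```go
package main

import "fmt"

// cpuPercent converts a delta of (utime+stime) ticks over an interval into a
// percentage of total capacity: delta / (elapsed * CLK_TCK * numCPUs) * 100.
func cpuPercent(deltaTicks uint64, elapsedSeconds float64, clkTck, numCPUs int) float64 {
	if elapsedSeconds <= 0 || clkTck <= 0 || numCPUs <= 0 {
		return 0
	}
	return float64(deltaTicks) / (elapsedSeconds * float64(clkTck) * float64(numCPUs)) * 100
}

func main() {
	// Example: 150 ticks in 1s at CLK_TCK=100 on 2 CPUs = 75% of total capacity
	// (i.e. 1.5 cores' worth of work).
	fmt.Printf("%.1f%%\n", cpuPercent(150, 1.0, 100, 2))
}
```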
#### Two Common Percentage Styles

| Style | Formula | Example |
|-------|---------|---------|
| **Normalized** (0-100%) | `delta / (elapsed * CLK_TCK * num_cpus) * 100` | 50% = half of total capacity |
| **Cores-style** (0-N*100%) | `delta / (elapsed * CLK_TCK) * 100` | 200% = 2 full cores busy |

#### Practical Script

```bash
#!/bin/bash

CLK_TCK=$(getconf CLK_TCK)
NUM_CPUS=$(nproc)

get_total_ticks() {
  local total=0
  for stat in /proc/[0-9]*/stat; do
    if read -r line < "$stat" 2>/dev/null; then
      rest="${line##*) }"
      read -ra f <<< "$rest"
      ((total += f[11] + f[12]))  # utime + stime
    fi
  done
  echo "$total"
}

# First sample
ticks1=$(get_total_ticks)
time1=$(date +%s.%N)

# Wait
sleep 1

# Second sample
ticks2=$(get_total_ticks)
time2=$(date +%s.%N)

# Calculate
delta_ticks=$((ticks2 - ticks1))
elapsed=$(echo "$time2 - $time1" | bc)

# Percentage of total CPU capacity (all cores)
pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK * $NUM_CPUS)) * 100" | bc)
echo "CPU usage: ${pct}% of ${NUM_CPUS} CPUs"

# Percentage as "CPU cores used" (like top's 200% for 2 full cores)
cores_pct=$(echo "scale=2; ($delta_ticks / ($elapsed * $CLK_TCK)) * 100" | bc)
echo "CPU usage: ${cores_pct}% (cores-style)"
```

#### Continuous Monitoring

```bash
#!/bin/bash
CLK_TCK=$(getconf CLK_TCK)
NUM_CPUS=$(nproc)
INTERVAL=1

get_total_ticks() {
  local total=0
  for stat in /proc/[0-9]*/stat; do
    read -r line < "$stat" 2>/dev/null || continue
    rest="${line##*) }"
    read -ra f <<< "$rest"
    ((total += f[11] + f[12]))
  done
  echo "$total"
}

prev_ticks=$(get_total_ticks)
prev_time=$(date +%s.%N)

while true; do
  sleep "$INTERVAL"

  curr_ticks=$(get_total_ticks)
  curr_time=$(date +%s.%N)

  delta=$((curr_ticks - prev_ticks))
  elapsed=$(echo "$curr_time - $prev_time" | bc)

  pct=$(echo "scale=1; $delta / ($elapsed * $CLK_TCK * $NUM_CPUS) * 100" | bc)
  printf "\rCPU: %5.1f%%" "$pct"

  prev_ticks=$curr_ticks
  prev_time=$curr_time
done
```

---

### Does this calculation respect cgroup limits?

No, it doesn't. The calculation uses `nproc`, which typically returns the **host CPU count**, not your cgroup limit.

#### The Problem

If your container is limited to 2 CPUs on an 8-CPU host:
- `nproc` returns 8
- Your calculation shows 25% when you're actually at 100% of your limit

#### Getting Effective CPU Limit

**cgroups v2:**

```bash
# cpu.max contains: $quota $period (in microseconds)
# "max 100000" means unlimited
read quota period < /sys/fs/cgroup/cpu.max
if [[ "$quota" == "max" ]]; then
  effective_cpus=$(nproc)
else
  effective_cpus=$(echo "scale=2; $quota / $period" | bc)
fi
echo "Effective CPUs: $effective_cpus"
```

**cgroups v1:**

```bash
quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us)
period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us)

if [[ "$quota" == "-1" ]]; then
  effective_cpus=$(nproc)
else
  effective_cpus=$(echo "scale=2; $quota / $period" | bc)
fi
```

**Also Check cpuset Limits:**

```bash
# cgroups v2
cpuset=$(cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null)

# cgroups v1
cpuset=$(cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null)

# Parse "0-3,5,7" format to count CPUs
count_cpus() {
  local count=0
  IFS=',' read -ra ranges <<< "$1"
  for range in "${ranges[@]}"; do
    if [[ "$range" == *-* ]]; then
      start="${range%-*}"
      end="${range#*-}"
      ((count += end - start + 1))
    else
      ((count++))
    fi
  done
  echo "$count"
}
```

#### Updated Script Respecting Limits

```bash
#!/bin/bash
CLK_TCK=$(getconf CLK_TCK)

get_effective_cpus() {
  # Try cgroups v2 first
  if [[ -f /sys/fs/cgroup/cpu.max ]]; then
    read quota period < /sys/fs/cgroup/cpu.max
    if [[ "$quota" != "max" ]]; then
      echo "scale=2; $quota / $period" | bc
      return
    fi
  fi

  # Try cgroups v1
  if [[ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ]]; then
    quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us)
    period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us)
    if [[ "$quota" != "-1" ]]; then
      echo "scale=2; $quota / $period" | bc
      return
    fi
  fi

  # Fallback to nproc
  nproc
}

EFFECTIVE_CPUS=$(get_effective_cpus)
echo "Effective CPU limit: $EFFECTIVE_CPUS"

# ... rest of your sampling logic using $EFFECTIVE_CPUS
```

#### Shared PID Namespace Consideration

When sidecars share the PID namespace but have different cgroups:
- Each container may have different CPU limits
- You're aggregating processes across those limits
- There's no single "correct" limit to use

**Options:**
1. **Use host CPU count** - percentage of total host capacity
2. **Sum the limits** - if you know each sidecar's cgroup, sum their quotas
3. **Report in cores** - skip normalization, just show `1.5 cores used` instead of percentage

---

### Can I get the cgroup limit for another cgroup?

Yes, if you have visibility into the cgroup filesystem.

#### 1. Find a Process's Cgroup

Every process exposes its cgroup membership:

```bash
# Get cgroup for any PID you can see
cat /proc/1234/cgroup

# cgroups v2 output:
# 0::/kubepods/pod123/container456

# cgroups v1 output:
# 12:cpu,cpuacct:/docker/abc123
# 11:memory:/docker/abc123
# ...
```

#### 2. Read That Cgroup's Limits

If the cgroup filesystem is mounted and accessible:

```bash
#!/bin/bash

get_cgroup_cpu_limit() {
  local pid=$1

  # Get cgroup path for this PID
  cgroup_path=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null)  # v2

  if [[ -n "$cgroup_path" ]]; then
    # cgroups v2
    limit_file="/sys/fs/cgroup${cgroup_path}/cpu.max"
    if [[ -r "$limit_file" ]]; then
      read quota period < "$limit_file"
      if [[ "$quota" == "max" ]]; then
        echo "unlimited"
      else
        echo "scale=2; $quota / $period" | bc
      fi
      return
    fi
  fi

  # Try cgroups v1
  cgroup_path=$(grep -oP 'cpu.*:\K.*' /proc/$pid/cgroup 2>/dev/null)
  if [[ -n "$cgroup_path" ]]; then
    quota_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_quota_us"
    period_file="/sys/fs/cgroup/cpu${cgroup_path}/cpu.cfs_period_us"
    if [[ -r "$quota_file" ]]; then
      quota=$(cat "$quota_file")
      period=$(cat "$period_file")
      if [[ "$quota" == "-1" ]]; then
        echo "unlimited"
      else
        echo "scale=2; $quota / $period" | bc
      fi
      return
    fi
  fi

  echo "unknown"
}

# Example: get limit for PID 1234
get_cgroup_cpu_limit 1234
```

#### 3. Mount Visibility Requirements

| Scenario | Can Read Other Cgroups? |
|----------|------------------------|
| Host system | Yes |
| Privileged container | Yes |
| `/sys/fs/cgroup` mounted read-only from host | Yes (common in Kubernetes) |
| Only own cgroup subtree mounted | No |

Check what's visible:

```bash
mount | grep cgroup
ls /sys/fs/cgroup/
```

#### 4. Full Solution: Aggregate by Cgroup

```bash
#!/bin/bash
CLK_TCK=$(getconf CLK_TCK)

declare -A cgroup_ticks
declare -A cgroup_limit

for stat in /proc/[0-9]*/stat; do
  pid="${stat#/proc/}"
  pid="${pid%/stat}"

  # Get cgroup for this process
  cg=$(grep -oP '0::\K.*' /proc/$pid/cgroup 2>/dev/null)
  [[ -z "$cg" ]] && continue

  # Get CPU ticks
  if read -r line < "$stat" 2>/dev/null; then
    rest="${line##*) }"
    read -ra f <<< "$rest"
    ticks=$((f[11] + f[12]))

    ((cgroup_ticks[$cg] += ticks))

    # Cache the limit (only look up once per cgroup)
    if [[ -z "${cgroup_limit[$cg]}" ]]; then
      limit_file="/sys/fs/cgroup${cg}/cpu.max"
      if [[ -r "$limit_file" ]]; then
        read quota period < "$limit_file"
        if [[ "$quota" == "max" ]]; then
          cgroup_limit[$cg]="unlimited"
        else
          cgroup_limit[$cg]=$(echo "scale=2; $quota / $period" | bc)
        fi
      else
        cgroup_limit[$cg]="unknown"
      fi
    fi
  fi
done

echo "Ticks by cgroup:"
for cg in "${!cgroup_ticks[@]}"; do
  echo "  $cg: ${cgroup_ticks[$cg]} ticks (limit: ${cgroup_limit[$cg]} CPUs)"
done
```

#### If You Can't Access Other Cgroups

Fallback options:

1. **Mount the cgroup fs** - add volume mount for `/sys/fs/cgroup:ro`
2. **Use a sidecar with access** - one privileged container does the monitoring
3. **Accept "unknown" limits** - report raw ticks/cores instead of percentages
4. **Kubernetes Downward API** - inject limits as env vars (only for your own container though)
test/docker/docker-compose.yaml (new file, 81 lines)
@@ -0,0 +1,81 @@
# Docker Compose test setup for cgroup grouping verification
# Run with: docker compose -f test/docker/docker-compose.yaml up
#
# NOTE: Docker Compose doesn't have a direct equivalent to K8s shareProcessNamespace.
# Options:
#   1. pid: "host"           - sees ALL host processes (not container-specific)
#   2. pid: "service:<name>" - chains PID namespace to another service
#
# For proper testing, use Kubernetes or run containers manually with --pid=container:<id>

services:
  # Simulate a runner workload (this will be the "root" of the shared PID namespace)
  # Uses 'cat' reading from a fifo as a unique identifiable process
  runner:
    image: busybox:latest
    command:
      - /bin/sh
      - -c
      - |
        echo "Runner started (PID 1 in namespace)"
        mkfifo /tmp/runner_fifo
        # 'cat' will be our identifiable runner process (blocks on fifo)
        cat /tmp/runner_fifo &
        CAT_PID=$!
        # Generate CPU load with dd
        while true; do
          dd if=/dev/zero of=/dev/null bs=1M count=50 2>/dev/null
        done
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 256M
    # This container owns the PID namespace

  # Simulate a sidecar service - shares PID namespace with runner
  sidecar:
    image: busybox:latest
    command:
      - /bin/sh
      - -c
      - |
        echo "Sidecar started"
        # List processes to verify shared namespace
        ps aux
        while true; do
          sleep 10
        done
    deploy:
      resources:
        limits:
          cpus: "0.1"
          memory: 128M
    pid: "service:runner"  # Share PID namespace with runner
    depends_on:
      - runner

  # Resource collector - shares PID namespace with runner
  collector:
    build:
      context: ../..
      dockerfile: Dockerfile
      target: collector
    command:
      - --interval=3s
      - --top=5
      - --log-format=json
    environment:
      # Map unique process names to container names
      # 'cat' runs only in runner, 'sleep' runs only in sidecar
      CGROUP_PROCESS_MAP: '{"cat":"runner","sleep":"sidecar","resource-collec":"collector"}'
      CGROUP_LIMITS: '{"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}'
    deploy:
      resources:
        limits:
          cpus: "0.1"
          memory: 64M
    pid: "service:runner"  # Share PID namespace with runner
    depends_on:
      - runner
      - sidecar
test/k8s/test-cgroup-grouping.yaml (new file, 148 lines)
@@ -0,0 +1,148 @@
# Test manifest to verify cgroup grouping behavior
# This pod runs multiple containers with different resource limits
# and a collector sidecar that groups metrics by cgroup/container
apiVersion: v1
kind: Pod
metadata:
  name: test-cgroup-grouping
  labels:
    app: test-cgroup-grouping
spec:
  # Share PID namespace so collector can see all processes
  shareProcessNamespace: true

  containers:
    # Main workload container - simulates a runner
    - name: runner
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Runner container started"
          # Simulate some CPU work
          while true; do
            dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null
            sleep 1
          done
      resources:
        requests:
          cpu: "100m"
          memory: "64Mi"
        limits:
          cpu: "500m"
          memory: "256Mi"

    # Sidecar container - simulates nginx or another service
    - name: sidecar
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Sidecar container started"
          # Simulate some lighter work
          while true; do
            sleep 5
          done
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "128Mi"

    # Resource collector sidecar
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image
      args:
        - --interval=5s
        - --top=3
      env:
        # Map process names to container names
        # "sh" is the main process in busybox containers
        # You may need to adjust based on actual process names
        - name: CGROUP_PROCESS_MAP
          value: |
            {"sh":"runner","sleep":"sidecar","collector":"collector"}
        # Define limits for each container (must match names in CGROUP_PROCESS_MAP)
        - name: CGROUP_LIMITS
          value: |
            {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "64Mi"
      # Mount proc read-only for process discovery
      volumeMounts:
        - name: proc
          mountPath: /proc
          readOnly: true

  volumes:
    - name: proc
      hostPath:
        path: /proc
        type: Directory

  restartPolicy: Never
---
# Alternative: a second, simpler stress-based Pod for longer-running tests
apiVersion: v1
kind: Pod
metadata:
  name: test-cgroup-simple
  labels:
    app: test-cgroup-simple
spec:
  shareProcessNamespace: true

  containers:
    # Stress container to generate CPU/memory load
    - name: stress
      image: progrium/stress:latest
      args:
        - --cpu
        - "1"
        - --vm
        - "1"
        - --vm-bytes
        - "64M"
        - --timeout
        - "300s"
      resources:
        limits:
          cpu: "500m"
          memory: "128Mi"

    # Collector
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest # Replace with your image
      args:
        - --interval=2s
        - --top=5
      env:
        - name: CGROUP_PROCESS_MAP
          value: '{"stress":"stress","collector":"collector"}'
        - name: CGROUP_LIMITS
          value: '{"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}'
      resources:
        limits:
          cpu: "100m"
          memory: "64Mi"
      volumeMounts:
        - name: proc
          mountPath: /proc
          readOnly: true

  volumes:
    - name: proc
      hostPath:
        path: /proc
        type: Directory

  restartPolicy: Never
test/local-test.sh (new executable file, 36 lines)
@@ -0,0 +1,36 @@
#!/bin/bash
# Local test script to verify cgroup grouping
# Run from project root: ./test/local-test.sh

set -e

echo "Building collector..."
go build -o bin/collector ./cmd/collector

echo ""
echo "Testing cgroup parsing on current system..."
echo "Current process cgroup:"
cat /proc/self/cgroup 2>/dev/null || echo "Cannot read /proc/self/cgroup (expected on macOS)"

echo ""
echo "Running collector for 10 seconds with cgroup grouping..."
echo "Press Ctrl+C to stop early"
echo ""

# Set up test environment variables
# Map common process names to container names
export CGROUP_PROCESS_MAP='{"bash":"shell","collector":"collector","zsh":"shell"}'
export CGROUP_LIMITS='{"shell":{"cpu":"2","memory":"4Gi"},"collector":{"cpu":"1","memory":"1Gi"}}'

# Run collector
timeout 10 ./bin/collector \
  --interval=2s \
  --top=5 \
  --log-format=json \
  2>/dev/null || true

echo ""
echo "Test complete!"
echo ""
echo "Note: On macOS, cgroup paths will be empty since cgroups are a Linux feature."
echo "To test properly, run in a Linux container or VM."