forgejo-runner-optimiser/test/k8s/test-cgroup-grouping.yaml
Manuel Ganter 5e470c33a5
All checks were successful
ci / build (push) Successful in 30s
feat(collector): group CPU and memory metrics by cgroup
Add cgroup-based process grouping to the resource collector. Processes are
grouped by their cgroup path, with container names resolved via configurable
process-to-container mapping.

New features:
- Read cgroup info from /proc/[pid]/cgroup (supports v1 and v2)
- Parse K8s resource notation (500m, 1Gi, etc.) for CPU/memory limits
- Group metrics by container using CGROUP_PROCESS_MAP env var
- Calculate usage percentages against limits from CGROUP_LIMITS env var
- Output cgroup metrics with CPU cores used, memory RSS, and percentages

Environment variables:
- CGROUP_PROCESS_MAP: Map process names to container names for discovery
- CGROUP_LIMITS: Define CPU/memory limits per container in K8s notation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-06 14:50:36 +01:00

148 lines
3.7 KiB
YAML

# Test manifest to verify cgroup grouping behavior.
# This pod runs multiple containers with different resource limits
# and a collector sidecar that groups metrics by cgroup/container.
apiVersion: v1
kind: Pod
metadata:
  name: test-cgroup-grouping
  labels:
    app: test-cgroup-grouping
spec:
  # Share PID namespace so collector can see all processes
  shareProcessNamespace: true
  containers:
    # Main workload container - simulates a runner
    - name: runner
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Runner container started"
          # Simulate some CPU work
          while true; do
            dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null
            sleep 1
          done
      resources:
        requests:
          cpu: "100m"
          memory: "64Mi"
        limits:
          cpu: "500m"
          memory: "256Mi"
    # Sidecar container - simulates nginx or another service
    - name: sidecar
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Sidecar container started"
          # Simulate some lighter work
          while true; do
            sleep 5
          done
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "128Mi"
    # Resource collector sidecar
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest  # Replace with your image
      args:
        - --interval=5s
        - --top=3
      env:
        # Map process names to container names.
        # "sh" is the main process in busybox containers.
        # You may need to adjust based on actual process names.
        # NOTE(review): the runner and sidecar scripts above both run under
        # /bin/sh and both invoke sleep, so "sh" and "sleep" are not unique
        # to one container - confirm how the collector resolves duplicates.
        - name: CGROUP_PROCESS_MAP
          value: |
            {"sh":"runner","sleep":"sidecar","collector":"collector"}
        # Define limits for each container (must match names in
        # CGROUP_PROCESS_MAP). The values mirror the resources.limits
        # declared on the three containers of this pod.
        - name: CGROUP_LIMITS
          value: |
            {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "64Mi"
      # Mount proc read-only for process discovery
      volumeMounts:
        - name: proc
          mountPath: /proc
          readOnly: true
  volumes:
    # NOTE(review): this is a hostPath mount of the node's /proc placed at
    # the container's /proc - confirm that is intended in combination with
    # shareProcessNamespace above.
    - name: proc
      hostPath:
        path: /proc
        type: Directory
  # Containers are not restarted once they exit.
  restartPolicy: Never
---
# Alternative: a minimal Pod pairing a single stress workload with the
# collector. The stress container is given a 300s timeout and the pod uses
# restartPolicy Never, so nothing is restarted once a container exits.
apiVersion: v1
kind: Pod
metadata:
  name: test-cgroup-simple
  labels:
    app: test-cgroup-simple
spec:
  # Share PID namespace between the stress and collector containers
  shareProcessNamespace: true
  containers:
    # Stress container to generate CPU/memory load
    - name: stress
      image: progrium/stress:latest
      args:
        - --cpu
        - "1"
        - --vm
        - "1"
        - --vm-bytes
        - "64M"
        - --timeout
        - "300s"
      resources:
        limits:
          cpu: "500m"
          memory: "128Mi"
    # Collector
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest  # Replace with your image
      args:
        - --interval=2s
        - --top=5
      env:
        # Process name -> container name mapping used for discovery
        - name: CGROUP_PROCESS_MAP
          value: '{"stress":"stress","collector":"collector"}'
        # Per-container limits; values mirror resources.limits in this pod
        - name: CGROUP_LIMITS
          value: '{"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}'
      resources:
        limits:
          cpu: "100m"
          memory: "64Mi"
      # Mount proc read-only
      volumeMounts:
        - name: proc
          mountPath: /proc
          readOnly: true
  volumes:
    # NOTE(review): hostPath mount of the node's /proc - confirm this is
    # intended in combination with shareProcessNamespace above.
    - name: proc
      hostPath:
        path: /proc
        type: Directory
  restartPolicy: Never