# forgejo-runner-optimiser/test/k8s/test-cgroup-grouping.yaml

# Test manifest to verify cgroup grouping behavior
# This pod runs multiple containers with different resource limits
# and a collector sidecar that groups metrics by cgroup/container
apiVersion: v1
kind: Pod
metadata:
  name: test-cgroup-grouping
  labels:
    app: test-cgroup-grouping
spec:
  # Share PID namespace so every container's processes are visible to the
  # other containers in this pod (including the collector).
  shareProcessNamespace: true

  containers:
    # Main workload container - simulates a runner.
    # Process names produced here: sh (the loop), dd (CPU burst), sleep (pause).
    - name: runner
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Runner container started"
          # Simulate some CPU work
          while true; do
            dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null
            sleep 1
          done
      resources:
        requests:
          cpu: "100m"
          memory: "64Mi"
        limits:
          cpu: "500m"
          memory: "256Mi"

    # Sidecar container - simulates nginx or another mostly idle service.
    # It must not keep running "sh" or "sleep": the runner container already
    # uses both names, and the collector's map below is keyed by process name.
    - name: sidecar
      image: busybox:latest
      command:
        - /bin/sh
        - -c
        - |
          echo "Sidecar container started"
          # exec replaces the shell, so the only long-lived process of this
          # container is named "tail" and cannot be confused with the runner.
          exec tail -f /dev/null
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "128Mi"

    # Resource collector sidecar
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest  # Replace with your image
      args:
        - --interval=5s
        - --top=3
      env:
        # Map process names to container names.
        # Every name must belong to exactly one container:
        #   runner  -> sh, dd, sleep
        #   sidecar -> tail
        # NOTE(review): "collector" is assumed to be the process name of the
        # collector binary - adjust if the image names it differently.
        # "|-" strips the trailing newline so the value is exactly the JSON text.
        - name: CGROUP_PROCESS_MAP
          value: |-
            {"sh":"runner","dd":"runner","sleep":"runner","tail":"sidecar","collector":"collector"}
        # Define limits for each container (must match names in CGROUP_PROCESS_MAP).
        # The values mirror resources.limits of the three containers in this pod.
        - name: CGROUP_LIMITS
          value: |-
            {"runner":{"cpu":"500m","memory":"256Mi"},"sidecar":{"cpu":"100m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}
      resources:
        requests:
          cpu: "50m"
          memory: "32Mi"
        limits:
          cpu: "100m"
          memory: "64Mi"
      # Mount proc read-only for process discovery
      volumeMounts:
        - name: proc
          mountPath: /proc
          readOnly: true

  volumes:
    # hostPath volume: this is the node's /proc, not a pod-local one.
    - name: proc
      hostPath:
        path: /proc
        type: Directory

  # Containers that exit are not restarted.
  restartPolicy: Never
---
# Alternative: a simpler Pod that uses a dedicated stress container to generate load
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: test-cgroup-simple
  name: test-cgroup-simple
spec:
  # Containers that exit are not restarted.
  restartPolicy: Never
  # One PID namespace for all containers of the pod.
  shareProcessNamespace: true

  volumes:
    # hostPath volume: the node's /proc, consumed by the collector container.
    - name: proc
      hostPath:
        type: Directory
        path: /proc

  containers:
    # Load generator: one CPU worker and one VM worker of 64M.
    # NOTE(review): progrium/stress is an old image - confirm it still pulls
    # on the container runtime used for these tests.
    - name: stress
      image: progrium/stress:latest
      resources:
        limits:
          memory: 128Mi
          cpu: 500m
      args:
        - '--cpu'
        - '1'
        - '--vm'
        - '1'
        - '--vm-bytes'
        - 64M
        - '--timeout'
        - 300s

    # Collector: the limits passed in CGROUP_LIMITS repeat the resources.limits
    # of the two containers in this pod.
    - name: collector
      image: ghcr.io/your-org/forgejo-runner-resource-collector:latest  # Replace with your image
      resources:
        limits:
          memory: 64Mi
          cpu: 100m
      volumeMounts:
        - mountPath: /proc
          name: proc
          readOnly: true
      env:
        # Process name -> container name
        - name: CGROUP_PROCESS_MAP
          value: |-
            {"stress":"stress","collector":"collector"}
        # Container name -> cpu/memory limits
        - name: CGROUP_LIMITS
          value: |-
            {"stress":{"cpu":"500m","memory":"128Mi"},"collector":{"cpu":"100m","memory":"64Mi"}}
      args:
        - '--interval=2s'
        - '--top=5'