Alerts


/etc/prometheus/rules/up.yml > Alert rules
HostHighCpuLoad (0 active)
# Fires when an instance's idle CPU share, averaged over its CPUs for the last
# minute, stays below 50% (more than half of the CPU time is busy) for 15s.
alert: HostHighCpuLoad
# Aggregated per instance so that {{ $labels.instance }} is populated in the
# annotations; a bare avg() would drop every label. rate() is used rather than
# irate() because the Prometheus docs recommend rate() for alerting rules.
expr: >-
  avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m]) * 100) < 50
for: 15s
labels:
  severity: warning
annotations:
  # $value is the idle percentage computed by expr, not a duration.
  description: >-
    {{ $labels.instance }} has a low average CPU idle share
    (current value: {{ $value }}%)
  summary: High usage on {{ $labels.instance }}
HostOutOfDiskSpace (0 active)
# Fires when a filesystem that is not mounted read-only has had less than 10%
# of its bytes available for 2 minutes.
alert: HostOutOfDiskSpace
# The "and on(...)" clause keeps only series whose matching
# node_filesystem_readonly sample equals 0.
expr: >-
  (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
  and on(instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  description: 'Disk is almost full (< 10% left)'
  summary: 'Host out of disk space (instance {{ $labels.instance }})'
  # Exposes the evaluated percentage of available space as its own annotation.
  value: "{{ $value }}"
HostOutOfMemory (0 active)
# Fires when available memory has been below 25% of total memory for 15s.
alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 25
for: 15s
labels:
  severity: warning
annotations:
  # Literal block scalar: the line breaks are real newlines. The previous
  # single-quoted form emitted the two characters "\n" verbatim, because YAML
  # single-quoted scalars do not process backslash escapes.
  description: |-
    Node memory is filling up (< 25% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  title: Host out of memory (instance {{ $labels.instance }})
InstanceDown (0 active)
# Fires when a scrape target has reported up == 0 for 15s.
alert: InstanceDown
expr: up == 0
for: 15s
labels:
  severity: critical
annotations:
  # Keep the duration in this text in sync with "for:" above; it previously
  # claimed "1 minute" while the rule fires after 15 seconds.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 15 seconds.
  title: Instance {{ $labels.instance }} down