Commit 1026b1b5 authored by Leonard Penzer
More than expected

- scrape prometheus through exporter-exporter
- Install json-exporter
- Install alert-rules
- Reload prometheus where sufficient (no restart)
parent 1a1b0224
Showing with 1064 additions and 6 deletions
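The first bullet of the commit message — scraping Prometheus itself through exporter-exporter — condenses to the following request shape (a sketch distilled from the prometheus.yml.j2 template further down; port 9998, the module name and the client-certificate paths are all taken from there):

scrape_configs:
  - job_name: 'prometheus'
    scheme: https                 # exporter-exporter terminates TLS on :9998
    metrics_path: /proxy          # exporter-exporter's proxy endpoint
    params:
      module: ['prometheus']      # forwarded to localhost:9090 per the module config below
    tls_config:
      ca_file: /etc/prometheus/ssl/ca_cert.pem
      cert_file: /etc/prometheus/ssl/client.cert.pem
      key_file: /etc/prometheus/ssl/client.key.pem
    static_configs:
      - targets: ['prometheus01.vm.freifunk-stuttgart.de:9998']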
---
role_prometheus:
hosts:
prometheus01.vm.freifunk-stuttgart.de:
prometheus02.vm.freifunk-stuttgart.de:
@@ -284,6 +284,14 @@
dest: /etc/prometheus/exporter-exporter.yml
content: |
modules:
prometheus:
method: http
http:
port: 9090
alertmanager:
method: http
http:
port: 9093
node:
method: http
http:
...
# Sample configuration.
# See https://prometheus.io/docs/alerting/configuration/ for documentation.
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25'
smtp_from: 'alertmanager@freifunk-stuttgart.de'
# The directory from which notification templates are read.
templates:
- '/etc/prometheus/alertmanager_templates/*.tmpl'
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'cluster', 'service', 'severity']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This ensures that multiple alerts for the same group that start firing
# shortly after one another are batched together in the first notification.
group_wait: 30s
# When the first notification has been sent, wait 'group_interval' to send a
# batch of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to resend it.
repeat_interval: 24h
# A default receiver
receiver: 'null'
routes:
### leonard monitoring ###
- receiver: 'leonard_healthchecks'
repeat_interval: 5m
continue: false
#group_wait: 1s
#group_interval: 1m
matchers:
- alertname = SelfMonitoringAlwaysFiring
- severity = info
### leonard ###
- receiver: 'leonard_pushover'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 24h
continue: true
matchers:
- severity = info
### nrb ###
- receiver: 'nrb'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'nrb'
repeat_interval: 24h
continue: true
matchers:
- severity =~ "info"
# Inhibition rules allow muting a set of alerts while another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
# Apply inhibition only if alertname, cluster and service match.
equal: ['alertname', 'cluster', 'service']
receivers:
#- name: 'ffs-gw-admins'
# email_configs:
# - to: 'gw-admins@freifunk-stuttgart.de'
# webhook_configs:
# - url: 'http://localhost:9199/alert'
- name: 'leonard_healthchecks'
email_configs:
- to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
- name: 'leonard_selfhosted'
email_configs:
- to: 'leonard@selfhosted.de'
send_resolved: true
- name: 'null'
email_configs: [] # no mail sent
- name: leonard_pushover
pushover_configs:
- token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
priority: 0
send_resolved: true
- name: 'nrb'
email_configs:
- to: 'ffs-alerts@nicoboehr.de'
send_resolved: true
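A small Ansible sketch (not part of this commit) for validating the file above before it is rolled out; it assumes amtool from the prometheus-alertmanager package is on the PATH and uses the destination path of the Copy task further down:

- name: Check alertmanager.yml syntax
  command: amtool check-config /etc/prometheus/alertmanager.yml
  changed_when: false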
groups:
- name: Selfmonitoring
rules:
- alert: 'SelfMonitoringAlwaysFiring'
expr: minute() >= 0
for: 1s
labels:
severity: info
application: leonard_healthchecks
groups:
- name: lowpref
rules:
- alert: LowGatewayPreference
expr: gw_loadbalancing_pref{segment="1"} < 10
for: 1d
labels:
severity: page
annotations:
summary: |
{{ .Labels.gateway }} has low gateway preference ({{ .Value }})
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0'
for: 10m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
groups:
- name: up_success
rules:
- alert: UP_FAILED
expr: up{ignore_down!="1"} < 1
for: 15m
labels:
severity: warning
application: prometheus
annotations:
summary: "Scrapes not functional"
- name: reload_success
rules:
- alert: PROMETHEUS_RELOAD_FAILED
expr: prometheus_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of prometheus config failed"
- alert: ALERTMANAGER_RELOAD_FAILED
expr: alertmanager_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of alertmanager config failed"
- name: probe_success
rules:
- alert: PROBE_FAILED_TCP
expr: probe_success{} < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Blackbox probe failed"
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 20m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Many containers
# - alert: HostMemoryIsUnderutilized
# expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host Memory is underutilized (instance {{ $labels.instance }})
# description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostUnusualNetworkThroughputIn
# expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual network throughput in (instance {{ $labels.instance }})
# description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
#
# - alert: HostUnusualNetworkThroughputOut
# expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual network throughput out (instance {{ $labels.instance }})
# description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# All hosts during backup
# - alert: HostUnusualDiskReadRate
# expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual disk read rate (instance {{ $labels.instance }})
# description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pve01 and backup01, maybe pbs as well at some point?
# - alert: HostUnusualDiskWriteRate
# expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host unusual disk write rate (instance {{ $labels.instance }})
# description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total{nodename!="gw05n02"}[1m]) / rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Can occur on all containers during backups
# - alert: HostHighCpuLoad
# expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 10m
# labels:
# severity: warning
# annotations:
# summary: Host high CPU load (instance {{ $labels.instance }})
# description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# nas/nextcloud vm
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pve01 and backup01
# - alert: HostCpuHighIowait
# expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host CPU high iowait (instance {{ $labels.instance }})
# description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
for: 15m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# All hosts during backup
# - alert: HostContextSwitchingHigh
# expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
#/
#(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
#'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching high (instance {{ $labels.instance }})
# description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CpuTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip=~"pci0000:00_0000:00:18_3"} > 98)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip!="pci0000:00_0000:00:18_3"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total{device!~"^g09n03abbtesta|^g09n03amobrtra|^g09n03bbbtestb"}[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m])) / node_network_speed_bytes{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: 'smartctl_device_temperature > 60'
for: 2m
labels:
severity: warning
annotations:
summary: Smart device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: 'smartctl_device_temperature > 80'
for: 2m
labels:
severity: critical
annotations:
summary: Smart device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart critical warning (instance {{ $labels.instance }})
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart media errors (instance {{ $labels.instance }})
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartNvmeWearoutIndicator
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
for: 15m
labels:
severity: critical
annotations:
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
modules:
gwpref:
metrics:
- name: gw_loadbalancing_pref
help: "Current Preference. Range -inf to 100, where 100 is most willing to accept more nodes."
path: '{ .segments.1.preference }'
labels:
segment: '1'
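For illustration only, a hypothetical minimal gwstatus.json that the JSONPath '{ .segments.1.preference }' above would resolve against (the real file on the gateways may contain more fields, and the value 42 is made up):

{"segments": {"1": {"preference": 42}}}

For such a document, json_exporter would expose gw_loadbalancing_pref{segment="1"} 42 for the probed target.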
[Unit]
Description=Prometheus Json Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
StandardError=syslog
Restart=on-failure
KillSignal=SIGQUIT
ExecStart=/opt/json-exporter/json_exporter --config.file /opt/json-exporter/config.yml
[Install]
WantedBy=multi-user.target
@@ -2,4 +2,7 @@
service:
name: prometheus
state: restarted
- name: Reload prometheus
service:
name: prometheus
state: reloaded
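The new reload handler relies on the service unit providing an ExecReload (the Debian package's unit typically sends SIGHUP). An alternative sketch, not used in this commit, would call Prometheus' lifecycle API instead; it assumes Prometheus runs with --web.enable-lifecycle:

- name: Reload prometheus via lifecycle API
  uri:
    url: http://localhost:9090/-/reload
    method: POST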
@@ -8,26 +8,49 @@
msg: "This role must only be run on prometheus hosts"
when: not is_prometheus | default(false)
- name: Create prometheus.yml file
template:
src: prometheus.yml.j2
dest: /etc/prometheus/prometheus.yml
notify:
- Reload prometheus
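A possible follow-up check, not part of this commit: run promtool (shipped with the prometheus package) against the rendered config. Because the Reload prometheus handler only fires at the end of the play, such a task still runs before the reload:

- name: Check prometheus.yml syntax
  command: promtool check config /etc/prometheus/prometheus.yml
  changed_when: false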
- name: Create node_exporter_targets.yml file
template:
src: node_exporter_targets.yml.j2
dest: /etc/prometheus/node_exporter_targets.yml
notify:
- Reload prometheus
- name: Create kea_exporter_targets.yml file
template:
src: kea_exporter_targets.yml.j2
dest: /etc/prometheus/kea_exporter_targets.yml
notify:
- Reload prometheus
- name: Create bird_exporter_targets.yml file
template:
src: bird_exporter_targets.yml.j2
dest: /etc/prometheus/bird_exporter_targets.yml
notify:
- Reload prometheus
- name: Copy all alerting rules
copy:
src: alerts/
dest: /etc/prometheus/alerts
mode: preserve
owner: root
group: root
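The directory copy above is not validated in place; the rule files could be checked separately. A sketch, assuming all rule files keep the .yml suffix used in this commit:

- name: Validate alert rules
  shell: promtool check rules /etc/prometheus/alerts/*.yml
  changed_when: false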
- name: Copy alertmanager.yml
copy:
src: alertmanager.yml
dest: /etc/prometheus/
mode: preserve
owner: root
group: root
- name: Check if client-cert exists
stat:
@@ -155,5 +178,58 @@
- prometheus
- prometheus-alertmanager
- prometheus-blackbox-exporter
- yamllint
state: present
install_recommends: false
notify:
- Reload prometheus
- name: Create target directory
file:
path: /opt/json-exporter
state: directory
mode: '0755'
- name: Download json_exporter
get_url:
url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz
dest: /opt/json-exporter/json_exporter.tar.gz
mode: '0644'
- name: Unpack json_exporter
unarchive:
src: /opt/json-exporter/json_exporter.tar.gz
dest: /opt/json-exporter
remote_src: yes
- name: Move binary into /opt/json-exporter
command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter
args:
creates: /opt/json-exporter/json_exporter
- name: Make json_exporter executable
file:
path: /opt/json-exporter/json_exporter
mode: '0755'
- name: Copy config.yml to /opt/json-exporter/
copy:
src: files/config.yml
dest: /opt/json-exporter/config.yml
mode: '0644'
- name: Copy systemd service file for json_exporter
copy:
src: files/json-exporter.service
dest: /etc/systemd/system/json-exporter.service
mode: '0644'
- name: Reload systemd to pick up new unit files
systemd:
daemon_reload: yes
- name: Enable and start json_exporter
systemd:
name: json-exporter
enabled: yes
state: started
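A quick smoke test, again only a sketch: ask the freshly started json_exporter to probe one gateway's status file through its /probe endpoint (port 7979 matches the scrape config further down; the gateway URL here is purely illustrative):

- name: Verify json_exporter answers probe requests
  uri:
    url: "http://localhost:7979/probe?module=gwpref&target=http://gw01.example.org/data/gwstatus.json"
    return_content: yes
  register: gwpref_probe
  failed_when: "'gw_loadbalancing_pref' not in gwpref_probe.content"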
---
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}'
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules once and periodically evaluate them
# according to the global 'evaluation_interval'.
rule_files:
- 'alerts/*.yml'
scrape_configs:
- job_name: 'prometheus'
static_configs:
{% for host in groups['role_prometheus'] %}
- targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
scheme: https
metrics_path: /proxy
params:
module:
- prometheus
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'alertmanager'
static_configs:
- targets: ['localhost:9093']
- job_name: 'collectd'
scrape_interval: 60s
static_configs:
- targets:
- 'yanic01.vm.freifunk-stuttgart.de:9998'
labels:
instance: "10.0.3.236:9104"
scheme: https
metrics_path: /proxy
params:
module:
- respondd
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'bird'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/bird_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- bird
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: monitor01_blackbox
scrape_interval: 15s
metrics_path: /proxy
scheme: https
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
params:
module:
- blackbox
- icmp
static_configs:
- targets:
- 10.190.0.93
- 10.190.176.93
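# Classic blackbox relabelling: the listed IP becomes the probe target and the
# instance label, while the actual scrape goes to the exporter-exporter on
# monitor01, which proxies to the local blackbox exporter.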
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: monitor01.vm.freifunk-stuttgart.de:9998
- job_name: 'node'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/node_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- node
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'kea'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/kea_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- kea
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
# Re-activate when fastd-exporter is installed on gws
# - job_name: bb_fastd
# scrape_interval: 15s
# file_sd_configs:
# - files:
# - 'target-fastd.json'
- job_name: json_gwpref
metrics_path: /probe
params:
module: [gwpref]
static_configs:
{% for host in groups['role_gw'] %}
- targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
## Location of the json exporter's real <hostname>:<port>
replacement: localhost:7979
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job="blackbox"}'
- '{job="blackbox-5g"}'
- '{job="blackbox-starlink"}'
- '{job="zyxel"}'
- '{job="node"}'
- '{job="snmp"}'
- '{job="unifi"}'
static_configs:
- targets:
- '10.191.255.172:9090'
labels:
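# exempts this federation target from the UP_FAILED alert, which only matches up{ignore_down!="1"}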
ignore_down: "1"