Commit 1026b1b5 authored by Leonard Penzer
More than expected

- scrape prometheus through exporter-exporter
- Install json-exporter
- Install alert-rules
- Reload prometheus where sufficient (no restart)
parent 1a1b0224
Showing with 1064 additions and 6 deletions
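The first bullet of the commit message — scraping Prometheus itself through exporter-exporter — condenses to the following request shape (a sketch distilled from the prometheus.yml.j2 template further down; port 9998, the module name and the client-certificate paths are all taken from there):

scrape_configs:
  - job_name: 'prometheus'
    scheme: https                 # exporter-exporter terminates TLS on :9998
    metrics_path: /proxy          # exporter-exporter's proxy endpoint
    params:
      module: ['prometheus']      # forwarded to localhost:9090 per the module config below
    tls_config:
      ca_file: /etc/prometheus/ssl/ca_cert.pem
      cert_file: /etc/prometheus/ssl/client.cert.pem
      key_file: /etc/prometheus/ssl/client.key.pem
    static_configs:
      - targets: ['prometheus01.vm.freifunk-stuttgart.de:9998']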
---
role_prometheus:
hosts:
prometheus01.vm.freifunk-stuttgart.de:
prometheus02.vm.freifunk-stuttgart.de:
@@ -284,6 +284,14 @@
dest: /etc/prometheus/exporter-exporter.yml
content: |
modules:
prometheus:
method: http
http:
port: 9090
alertmanager:
method: http
http:
port: 9093
node:
method: http
http:
...
# Sample configuration.
# See https://prometheus.io/docs/alerting/configuration/ for documentation.
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25'
smtp_from: 'alertmanager@freifunk-stuttgart.de'
# The directory from which notification templates are read.
templates:
- '/etc/prometheus/alertmanager_templates/*.tmpl'
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'cluster', 'service', 'severity']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This ensures that multiple alerts for the same group that start firing
# shortly after one another are batched together in the first notification.
group_wait: 30s
# When the first notification has been sent, wait 'group_interval' to send a
# batch of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to resend it.
repeat_interval: 24h
# A default receiver
receiver: 'null'
routes:
### leonard monitoring ###
- receiver: 'leonard_healthchecks'
repeat_interval: 5m
continue: false
#group_wait: 1s
#group_interval: 1m
matchers:
- alertname = SelfMonitoringAlwaysFiring
- severity = info
### leonard ###
- receiver: 'leonard_pushover'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 24h
continue: true
matchers:
- severity = info
### nrb ###
- receiver: 'nrb'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'nrb'
repeat_interval: 24h
continue: true
matchers:
- severity =~ "info"
# Inhibition rules allow muting a set of alerts while another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
# Apply inhibition only if alertname, cluster and service match.
equal: ['alertname', 'cluster', 'service']
receivers:
#- name: 'ffs-gw-admins'
# email_configs:
# - to: 'gw-admins@freifunk-stuttgart.de'
# webhook_configs:
# - url: 'http://localhost:9199/alert'
- name: 'leonard_healthchecks'
email_configs:
- to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
- name: 'leonard_selfhosted'
email_configs:
- to: 'leonard@selfhosted.de'
send_resolved: true
- name: 'null'
email_configs: [] # no mail sent
- name: leonard_pushover
pushover_configs:
- token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
priority: 0
send_resolved: true
- name: 'nrb'
email_configs:
- to: 'ffs-alerts@nicoboehr.de'
send_resolved: true
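A small Ansible sketch (not part of this commit) for validating the file above before it is rolled out; it assumes amtool from the prometheus-alertmanager package is on the PATH and uses the destination path of the Copy task further down:

- name: Check alertmanager.yml syntax
  command: amtool check-config /etc/prometheus/alertmanager.yml
  changed_when: false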
groups:
- name: Selfmonitoring
rules:
- alert: 'SelfMonitoringAlwaysFiring'
expr: minute() >= 0
for: 1s
labels:
severity: info
application: leonard_healthchecks
groups:
- name: lowpref
rules:
- alert: LowGatewayPreference
expr: gw_loadbalancing_pref{segment="1"} < 10
for: 1d
labels:
severity: page
annotations:
summary: |
{{ .Labels.gateway }} has low gateway preference ({{ .Value }})
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0'
for: 10m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
groups:
- name: up_success
rules:
- alert: UP_FAILED
expr: up{ignore_down!="1"} < 1
for: 15m
labels:
severity: warning
application: prometheus
annotations:
summary: "Scrapes not functional"
- name: reload_success
rules:
- alert: PROMETHEUS_RELOAD_FAILED
expr: prometheus_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of prometheus config failed"
- alert: ALERTMANAGER_RELOAD_FAILED
expr: alertmanager_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of alertmanager config failed"
- name: probe_success
rules:
- alert: PROBE_FAILED_TCP
expr: probe_success{} < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Blackbox probe failed"
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 20m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Many containers
# - alert: HostMemoryIsUnderutilized
# expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host Memory is underutilized (instance {{ $labels.instance }})
# description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: HostUnusualNetworkThroughputIn
# expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual network throughput in (instance {{ $labels.instance }})
# description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
#
# - alert: HostUnusualNetworkThroughputOut
# expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual network throughput out (instance {{ $labels.instance }})
# description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# All hosts during backup
# - alert: HostUnusualDiskReadRate
# expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 5m
# labels:
# severity: warning
# annotations:
# summary: Host unusual disk read rate (instance {{ $labels.instance }})
# description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pve01 and backup01, maybe pbs as well at some point?
# - alert: HostUnusualDiskWriteRate
# expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Host unusual disk write rate (instance {{ $labels.instance }})
# description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total{nodename!="gw05n02"}[1m]) / rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Can occur on all containers during backups
# - alert: HostHighCpuLoad
# expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 10m
# labels:
# severity: warning
# annotations:
# summary: Host high CPU load (instance {{ $labels.instance }})
# description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# nas/nextcloud vm
# - alert: HostCpuIsUnderutilized
# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# pve01 and backup01
# - alert: HostCpuHighIowait
# expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host CPU high iowait (instance {{ $labels.instance }})
# description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
for: 15m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# All hosts during backup
# - alert: HostContextSwitchingHigh
# expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
#/
#(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
#'
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching high (instance {{ $labels.instance }})
# description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CpuTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip=~"pci0000:00_0000:00:18_3"} > 98)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip!="pci0000:00_0000:00:18_3"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total{device!~"^g09n03abbtesta|^g09n03amobrtra|^g09n03bbbtestb"}[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m])) / node_network_speed_bytes{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: 'smartctl_device_temperature > 60'
for: 2m
labels:
severity: warning
annotations:
summary: Smart device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: 'smartctl_device_temperature > 80'
for: 2m
labels:
severity: critical
annotations:
summary: Smart device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart critical warning (instance {{ $labels.instance }})
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart media errors (instance {{ $labels.instance }})
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartNvmeWearoutIndicator
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
for: 15m
labels:
severity: critical
annotations:
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
modules:
gwpref:
metrics:
- name: gw_loadbalancing_pref
help: "Current Preference. Range -inf to 100, where 100 is most willing to accept more nodes."
path: '{ .segments.1.preference }'
labels:
segment: '1'
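For illustration only, a hypothetical minimal gwstatus.json that the JSONPath '{ .segments.1.preference }' above would resolve against (the real file on the gateways may contain more fields, and the value 42 is made up):

{"segments": {"1": {"preference": 42}}}

For such a document, json_exporter would expose gw_loadbalancing_pref{segment="1"} 42 for the probed target.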
[Unit]
Description=Prometheus Json Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
StandardError=syslog
Restart=on-failure
KillSignal=SIGQUIT
ExecStart=/opt/json-exporter/json_exporter --config.file /opt/json-exporter/config.yml
[Install]
WantedBy=multi-user.target
@@ -2,4 +2,7 @@
service:
name: prometheus
state: restarted
- name: Reload prometheus
service:
name: prometheus
state: reloaded
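The new reload handler relies on the service unit providing an ExecReload (the Debian package's unit typically sends SIGHUP). An alternative sketch, not used in this commit, would call Prometheus' lifecycle API instead; it assumes Prometheus runs with --web.enable-lifecycle:

- name: Reload prometheus via lifecycle API
  uri:
    url: http://localhost:9090/-/reload
    method: POST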
@@ -8,26 +8,49 @@
msg: "This role must only be run on prometheus hosts"
when: not is_prometheus | default(false)
- name: Create prometheus.yml file
template:
src: prometheus.yml.j2
dest: /etc/prometheus/prometheus.yml
notify:
- Reload prometheus
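A possible follow-up check, not part of this commit: run promtool (shipped with the prometheus package) against the rendered config. Because the Reload prometheus handler only fires at the end of the play, such a task still runs before the reload:

- name: Check prometheus.yml syntax
  command: promtool check config /etc/prometheus/prometheus.yml
  changed_when: false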
- name: Create node_exporter_targets.yml file
template:
src: node_exporter_targets.yml.j2
dest: /etc/prometheus/node_exporter_targets.yml
notify:
- Reload prometheus
- name: Create kea_exporter_targets.yml file
template:
src: kea_exporter_targets.yml.j2
dest: /etc/prometheus/kea_exporter_targets.yml
notify:
- Reload prometheus
- name: Create bird_exporter_targets.yml file
template:
src: bird_exporter_targets.yml.j2
dest: /etc/prometheus/bird_exporter_targets.yml
notify:
- Reload prometheus
- name: Copy all alerting rules
copy:
src: alerts/
dest: /etc/prometheus/alerts
mode: preserve
owner: root
group: root
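The directory copy above is not validated in place; the rule files could be checked separately. A sketch, assuming all rule files keep the .yml suffix used in this commit:

- name: Validate alert rules
  shell: promtool check rules /etc/prometheus/alerts/*.yml
  changed_when: false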
- name: Copy alertmanager.yml
copy:
src: alertmanager.yml
dest: /etc/prometheus/
mode: preserve
owner: root
group: root
- name: Check if client-cert exists
stat:
@@ -155,5 +178,58 @@
- prometheus
- prometheus-alertmanager
- prometheus-blackbox-exporter
- yamllint
state: present
install_recommends: false
notify:
- Reload prometheus
- name: Create target directory
file:
path: /opt/json-exporter
state: directory
mode: '0755'
- name: Download json_exporter
get_url:
url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz
dest: /opt/json-exporter/json_exporter.tar.gz
mode: '0644'
- name: Unpack json_exporter
unarchive:
src: /opt/json-exporter/json_exporter.tar.gz
dest: /opt/json-exporter
remote_src: yes
- name: Move binary into /opt/json-exporter
command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter
args:
creates: /opt/json-exporter/json_exporter
- name: Make json_exporter executable
file:
path: /opt/json-exporter/json_exporter
mode: '0755'
- name: Copy config.yml to /opt/json-exporter/
copy:
src: files/config.yml
dest: /opt/json-exporter/config.yml
mode: '0644'
- name: Copy systemd service file for json_exporter
copy:
src: files/json-exporter.service
dest: /etc/systemd/system/json-exporter.service
mode: '0644'
- name: Reload systemd to pick up new unit files
systemd:
daemon_reload: yes
- name: Enable and start json_exporter
systemd:
name: json-exporter
enabled: yes
state: started
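A quick smoke test, again only a sketch: ask the freshly started json_exporter to probe one gateway's status file through its /probe endpoint (port 7979 matches the scrape config further down; the gateway URL here is purely illustrative):

- name: Verify json_exporter answers probe requests
  uri:
    url: "http://localhost:7979/probe?module=gwpref&target=http://gw01.example.org/data/gwstatus.json"
    return_content: yes
  register: gwpref_probe
  failed_when: "'gw_loadbalancing_pref' not in gwpref_probe.content"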
---
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}'
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules once and periodically evaluate them
# according to the global 'evaluation_interval'.
rule_files:
- 'alerts/*.yml'
scrape_configs:
- job_name: 'prometheus'
static_configs:
{% for host in groups['role_prometheus'] %}
- targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
scheme: https
metrics_path: /proxy
params:
module:
- prometheus
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'alertmanager'
static_configs:
- targets: ['localhost:9093']
- job_name: 'collectd'
scrape_interval: 60s
static_configs:
- targets:
- 'yanic01.vm.freifunk-stuttgart.de:9998'
labels:
instance: "10.0.3.236:9104"
scheme: https
metrics_path: /proxy
params:
module:
- respondd
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'bird'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/bird_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- bird
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: monitor01_blackbox
scrape_interval: 15s
metrics_path: /proxy
scheme: https
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
params:
module:
- blackbox
- icmp
static_configs:
- targets:
- 10.190.0.93
- 10.190.176.93
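# Classic blackbox relabelling: the listed IP becomes the probe target and the
# instance label, while the actual scrape goes to the exporter-exporter on
# monitor01, which proxies to the local blackbox exporter.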
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: monitor01.vm.freifunk-stuttgart.de:9998
- job_name: 'node'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/node_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- node
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'kea'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/kea_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- kea
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
# Re-activate when fastd-exporter is installed on gws
# - job_name: bb_fastd
# scrape_interval: 15s
# file_sd_configs:
# - files:
# - 'target-fastd.json'
- job_name: json_gwpref
metrics_path: /probe
params:
module: [gwpref]
static_configs:
{% for host in groups['role_gw'] %}
- targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
## Location of the json exporter's real <hostname>:<port>
replacement: localhost:7979
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job="blackbox"}'
- '{job="blackbox-5g"}'
- '{job="blackbox-starlink"}'
- '{job="zyxel"}'
- '{job="node"}'
- '{job="snmp"}'
- '{job="unifi"}'
static_configs:
- targets:
- '10.191.255.172:9090'
labels:
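# exempts this federation target from the UP_FAILED alert, which only matches up{ignore_down!="1"}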
ignore_down: "1"