diff --git a/inventory/role_prometheus b/inventory/role_prometheus new file mode 100644 index 0000000000000000000000000000000000000000..e1da387b189afbfbd699eaaf448a70c1cd6a7f2b --- /dev/null +++ b/inventory/role_prometheus @@ -0,0 +1,5 @@ +--- +role_prometheus: + hosts: + prometheus01.vm.freifunk-stuttgart.de: + prometheus02.vm.freifunk-stuttgart.de: diff --git a/roles/prometheus-exporters/tasks/main.yml b/roles/prometheus-exporters/tasks/main.yml index 75acd65fa0aa91ea6aa2f43ad5cd5ef70904b574..8c665fd0c8c2495d711af55f9ca027c7e68e3e6a 100644 --- a/roles/prometheus-exporters/tasks/main.yml +++ b/roles/prometheus-exporters/tasks/main.yml @@ -284,6 +284,14 @@ dest: /etc/prometheus/exporter-exporter.yml content: | modules: + prometheus: + method: http + http: + port: 9090 + alertmanager: + method: http + http: + port: 9093 node: method: http http: diff --git a/roles/prometheus/files/alertmanager.yml b/roles/prometheus/files/alertmanager.yml new file mode 100644 index 0000000000000000000000000000000000000000..0cba29e1bbb2a1b1a423a4ef9617c633263b5541 --- /dev/null +++ b/roles/prometheus/files/alertmanager.yml @@ -0,0 +1,113 @@ +# Sample configuration. +# See https://prometheus.io/docs/alerting/configuration/ for documentation. + +global: + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'localhost:25' + smtp_from: 'alertmanager@freifunk-stuttgart.de' + +# The directory from which notification templates are read. +templates: +- '/etc/prometheus/alertmanager_templates/*.tmpl' + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + group_by: ['alertname', 'cluster', 'service', 'severity'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This ensures that multiple alerts for the same group that start firing + # shortly after one another are batched together in the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to resend it. + repeat_interval: 24h + + # A default receiver + receiver: 'null' + + routes: +### leonard monitoring ### + - receiver: 'leonard_healthchecks' + repeat_interval: 5m + continue: false + #group_wait: 1s + #group_interval: 1m + matchers: + - alertname = SelfMonitoringAlwaysFiring + - severity = info +### leonard ### + - receiver: 'leonard_pushover' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'leonard_selfhosted' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'leonard_selfhosted' + repeat_interval: 24h + continue: true + matchers: + - severity = info +### nrb ### + - receiver: 'nrb' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'nrb' + repeat_interval: 24h + continue: true + matchers: + - severity =~ "info" + + +# Inhibition rules allow muting a set of alerts while another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. 
+inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + # Apply inhibition if the alertname is the same. + equal: ['alertname', 'cluster', 'service'] + + +receivers: +#- name: 'ffs-gw-admins' +# email_configs: +# - to: 'gw-admins@freifunk-stuttgart.de' +# webhook_configs: +# - url: 'http://localhost:9199/alert' +- name: 'leonard_healthchecks' + email_configs: + - to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de' +- name: 'leonard_selfhosted' + email_configs: + - to: 'leonard@selfhosted.de' + send_resolved: true +- name: 'null' + email_configs: [] # Kein Versand +- name: leonard_pushover + pushover_configs: + - token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW + user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd + priority: 0 + send_resolved: true +- name: 'nrb' + email_configs: + - to: 'ffs-alerts@nicoboehr.de' + send_resolved: true diff --git a/roles/prometheus/files/alerts/alert_healthchecks.yml b/roles/prometheus/files/alerts/alert_healthchecks.yml new file mode 100644 index 0000000000000000000000000000000000000000..7741c9aec8f42189d59aad2d144a0d1a66e17db0 --- /dev/null +++ b/roles/prometheus/files/alerts/alert_healthchecks.yml @@ -0,0 +1,9 @@ +groups: +- name: Selfmonitoring + rules: + - alert: 'SelfMonitoringAlwaysFiring' + expr: minute() >= 0 + for: 1s + labels: + severity: info + application: leonard_healthchecks diff --git a/roles/prometheus/files/alerts/alert_loadbalancing.yml b/roles/prometheus/files/alerts/alert_loadbalancing.yml new file mode 100644 index 0000000000000000000000000000000000000000..1fa7242241b7a8f6624d4cd5f59918c00338f80f --- /dev/null +++ b/roles/prometheus/files/alerts/alert_loadbalancing.yml @@ -0,0 +1,11 @@ +groups: +- name: lowpref + rules: + - alert: LowGatewayPreference + expr: gw_loadbalancing_pref{segment="1"} < 10 + for: 1d + labels: + severity: page + annotations: + summary: | + {{ .Labels.gateway }} has low gateway preference ({{ .Value }}) diff --git a/roles/prometheus/files/alerts/blackbox-exporter.yml b/roles/prometheus/files/alerts/blackbox-exporter.yml new file mode 100644 index 0000000000000000000000000000000000000000..b83797ae6e178eee2dc832c0f6133d0220153e28 --- /dev/null +++ b/roles/prometheus/files/alerts/blackbox-exporter.yml @@ -0,0 +1,68 @@ +groups: + +- name: BlackboxExporter + + rules: + + - alert: BlackboxProbeFailed + expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0' + for: 10m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ $labels.instance }}) + description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxConfigurationReloadFailure + expr: 'blackbox_exporter_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox configuration reload failure (instance {{ $labels.instance }}) + description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' + for: 0m + labels: + 
severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateExpired + expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) + description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowHttp + expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) + description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowPing + expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ $labels.instance }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/roles/prometheus/files/alerts/general.yml b/roles/prometheus/files/alerts/general.yml new file mode 100644 index 0000000000000000000000000000000000000000..267589c0c80d58e1b490afe8ef6e00e4355d86f3 --- /dev/null +++ b/roles/prometheus/files/alerts/general.yml @@ -0,0 +1,39 @@ +groups: +- name: up_success + rules: + - alert: UP_FAILED + expr: up{ignore_down!="1"} < 1 + for: 15m + labels: + severity: warning + application: prometheus + annotations: + summary: "Scrapes not functional" +- name: reload_success + rules: + - alert: PROMETHEUS_RELOAD_FAILED + expr: prometheus_config_last_reload_successful < 1 + for: 1m + labels: + severity: warning + application: prometheus + annotations: + summary: "Reload of prometheus config failed" + - alert: ALERTMANAGER_RELOAD_FAILED + expr: alertmanager_config_last_reload_successful < 1 + for: 1m + labels: + severity: warning + application: prometheus + annotations: + summary: "Reload of alertmanager config failed" +- name: probe_success + rules: + - alert: PROBE_FAILED_TCP + expr: probe_success{} < 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Blackbox probe failed" + diff --git a/roles/prometheus/files/alerts/node-exporter.yml b/roles/prometheus/files/alerts/node-exporter.yml new file mode 100644 index 0000000000000000000000000000000000000000..00a83285a152c14bc54e40f27cc50c6d47266840 --- /dev/null +++ b/roles/prometheus/files/alerts/node-exporter.yml @@ -0,0 +1,366 @@ +groups: + +- name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 20m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Many containers + # - alert: HostMemoryIsUnderutilized + # expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + # for: 1w + # labels: + # severity: info + # annotations: + # summary: Host Memory is underutilized (instance {{ $labels.instance }}) + # description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # - alert: HostUnusualNetworkThroughputIn + # expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + # for: 5m + # labels: + # severity: warning + # annotations: + # summary: Host unusual network throughput in (instance {{ $labels.instance }}) + # description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # + # - alert: HostUnusualNetworkThroughputOut + # expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + # for: 5m + # labels: + # severity: warning + # annotations: + # summary: Host unusual network throughput out (instance {{ $labels.instance }}) + # description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# All Hosts durcing Backup +# - alert: HostUnusualDiskReadRate +# expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 5m +# labels: +# severity: warning +# annotations: +# summary: Host unusual disk read rate (instance {{ $labels.instance }}) +# description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# pve01 und backup01, vielleicht auch mal pbs? 
+# - alert: HostUnusualDiskWriteRate +# expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 2m +# labels: +# severity: warning +# annotations: +# summary: Host unusual disk write rate (instance {{ $labels.instance }}) +# description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskWillFillIn24Hours + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesWillFillIn24Hours + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + 
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total{nodename!="gw05n02"}[1m]) / rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# Kann beim backup auf allen containern vorkommen +# - alert: HostHighCpuLoad +# expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 10m +# labels: +# severity: warning +# annotations: +# summary: Host high CPU load (instance {{ $labels.instance }}) +# description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# nas/nextcloud vm +# - alert: HostCpuIsUnderutilized +# expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 1w +# labels: +# severity: info +# annotations: +# summary: Host CPU is underutilized (instance {{ $labels.instance }}) +# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# pve01 und backup01 +# - alert: HostCpuHighIowait +# expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' +# for: 0m +# labels: +# severity: warning +# annotations: +# summary: Host CPU high iowait (instance {{ $labels.instance }}) +# description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}' + for: 15m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. 
Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + +# All hosts during backup +# - alert: HostContextSwitchingHigh +# expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) +#/ +#(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 +#' +# for: 0m +# labels: +# severity: warning +# annotations: +# summary: Host context switching high (instance {{ $labels.instance }}) +# description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: CpuTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip=~"pci0000:00_0000:00:18_3"} > 98)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip!="pci0000:00_0000:00:18_3"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidArrayGotInactive + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. 
The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: '(rate(node_network_transmit_errs_total{device!~"^g09n03abbtesta|^g09n03amobrtra|^g09n03bbbtestb"}[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last 
two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkInterfaceSaturated + expr: '((rate(node_network_receive_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m])) / node_network_speed_bytes{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. 
Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/roles/prometheus/files/alerts/smartctl-exporter.yml b/roles/prometheus/files/alerts/smartctl-exporter.yml new file mode 100644 index 0000000000000000000000000000000000000000..1946c38e1c7570198c7dba73f96a69349dcc8fd2 --- /dev/null +++ b/roles/prometheus/files/alerts/smartctl-exporter.yml @@ -0,0 +1,50 @@ +groups: + +- name: SmartctlExporter + + rules: + + - alert: SmartDeviceTemperatureWarning + expr: 'smartctl_device_temperature > 60' + for: 2m + labels: + severity: warning + annotations: + summary: Smart device temperature warning (instance {{ $labels.instance }}) + description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartDeviceTemperatureCritical + expr: 'smartctl_device_temperature > 80' + for: 2m + labels: + severity: critical + annotations: + summary: Smart device temperature critical (instance {{ $labels.instance }}) + description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartCriticalWarning + expr: 'smartctl_device_critical_warning > 0' + for: 15m + labels: + severity: critical + annotations: + summary: Smart critical warning (instance {{ $labels.instance }}) + description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartMediaErrors + expr: 'smartctl_device_media_errors > 0' + for: 15m + labels: + severity: critical + annotations: + summary: Smart media errors (instance {{ $labels.instance }}) + description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartNvmeWearoutIndicator + expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}' + for: 15m + labels: + severity: critical + annotations: + summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }}) + description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/roles/prometheus/files/config.yml b/roles/prometheus/files/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..62c43955665d5f46735a16b4680405bf54232742 --- /dev/null +++ b/roles/prometheus/files/config.yml @@ -0,0 +1,9 @@ +--- +modules: + gwpref: + metrics: + - name: gw_loadbalancing_pref + help: "Current Preference. Range -inf to 100, where 100 is most willing to accept more nodes." 
+ path: '{ .segments.1.preference }' + labels: + segment: '1' diff --git a/roles/prometheus/files/json-exporter.service b/roles/prometheus/files/json-exporter.service new file mode 100644 index 0000000000000000000000000000000000000000..85e09ee2a85e1248fff4510ce8877d3ec739a19a --- /dev/null +++ b/roles/prometheus/files/json-exporter.service @@ -0,0 +1,15 @@ +[Unit] +Description=Prometheus Json Exporter +Wants=network-online.target +After=network-online.target + +[Service] +User=prometheus +Group=prometheus +StandardError=syslog +Restart=on-failure +KillSignal=SIGQUIT +ExecStart=/opt/json-exporter/json_exporter --config.file /opt/json-exporter/config.yml + +[Install] +WantedBy=multi-user.target diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml index 8a8df0dec666c436ca5457fde020a254318f905f..a05d19c2be74f612fde5d88f3a3dcf5be631da61 100644 --- a/roles/prometheus/handlers/main.yml +++ b/roles/prometheus/handlers/main.yml @@ -2,4 +2,7 @@ service: name: prometheus state: restarted - +- name: Reload prometheus + service: + name: prometheus + state: reloaded diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 55d4a2298f0e76d9ea7f5a296b9f0dc0a0698314..49ce70ff13a8b607cfcb1ad7b26c8680736ba956 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -8,26 +8,49 @@ msg: "This role must only be run on prometheus hosts" when: not is_prometheus | default(false) +- name: Create prometheus.yml file + template: + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + notify: + - Reload prometheus + - name: Create node_exporter_targets.yml file template: src: node_exporter_targets.yml.j2 dest: /etc/prometheus/node_exporter_targets.yml notify: - - Restart prometheus + - Reload prometheus - name: Create kea_exporter_targets.yml file template: src: kea_exporter_targets.yml.j2 dest: /etc/prometheus/kea_exporter_targets.yml notify: - - Restart prometheus + - Reload prometheus - name: Create bird_exporter_targets.yml file template: src: bird_exporter_targets.yml.j2 dest: /etc/prometheus/bird_exporter_targets.yml notify: - - Restart prometheus + - Reload prometheus + +- name: Copy all alerting rules + copy: + src: alerts/ + dest: /etc/prometheus/alerts + mode: preserve + owner: root + group: root + +- name: Copy alertmanager.yml + copy: + src: alertmanager.yml + dest: /etc/prometheus/ + mode: preserve + owner: root + group: root - name: Check if client-cert exists stat: @@ -130,7 +153,7 @@ file: path: "{{ item }}" state: absent - loop: + loop: - /dev/shm/ca_cert.pem - /dev/shm/ca_key.pem - /tmp/{{ inventory_hostname }}_ext.cnf @@ -138,7 +161,7 @@ - name: Copy cert to prometheus host when: not client_cert_exists.stat.exists - copy: + copy: src: /tmp/{{ inventory_hostname }}.cert.pem dest: /etc/prometheus/ssl/client.cert.pem mode: '0644' @@ -155,5 +178,58 @@ - prometheus - prometheus-alertmanager - prometheus-blackbox-exporter + - yamllint state: present install_recommends: false + notify: + - Reload prometheus + +- name: Create target directory + file: + path: /opt/json-exporter + state: directory + mode: '0755' + +- name: Download json_exporter + get_url: + url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz + dest: /opt/json-exporter/json_exporter.tar.gz + mode: '0644' + +- name: Unpack json_exporter + unarchive: + src: /opt/json-exporter/json_exporter.tar.gz + dest: /opt/json-exporter + remote_src: yes + +- 
name: Move binary to /opt/json-exporter + command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter + args: + creates: /opt/json-exporter/json_exporter + +- name: Make json_exporter executable + file: + path: /opt/json-exporter/json_exporter + mode: '0755' + +- name: Copy config.yml to /opt/json-exporter/ + copy: + src: files/config.yml + dest: /opt/json-exporter/config.yml + mode: '0644' + +- name: Copy systemd service file for json_exporter + copy: + src: files/json-exporter.service + dest: /etc/systemd/system/json-exporter.service + mode: '0644' + +- name: Reload systemd to pick up new unit files + systemd: + daemon_reload: yes + +- name: Enable and start json_exporter + systemd: + name: json-exporter + enabled: yes + state: started diff --git a/roles/prometheus/templates/alertmanager.yml.j2 b/roles/prometheus/templates/alertmanager.yml.j2 new file mode 100644 index 0000000000000000000000000000000000000000..0cba29e1bbb2a1b1a423a4ef9617c633263b5541 --- /dev/null +++ b/roles/prometheus/templates/alertmanager.yml.j2 @@ -0,0 +1,113 @@ +# Sample configuration. +# See https://prometheus.io/docs/alerting/configuration/ for documentation. + +global: + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'localhost:25' + smtp_from: 'alertmanager@freifunk-stuttgart.de' + +# The directory from which notification templates are read. +templates: +- '/etc/prometheus/alertmanager_templates/*.tmpl' + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + group_by: ['alertname', 'cluster', 'service', 'severity'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This ensures that multiple alerts for the same group that start firing + # shortly after one another are batched together in the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to resend it. + repeat_interval: 24h + + # A default receiver + receiver: 'null' + + routes: +### leonard monitoring ### + - receiver: 'leonard_healthchecks' + repeat_interval: 5m + continue: false + #group_wait: 1s + #group_interval: 1m + matchers: + - alertname = SelfMonitoringAlwaysFiring + - severity = info +### leonard ### + - receiver: 'leonard_pushover' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'leonard_selfhosted' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'leonard_selfhosted' + repeat_interval: 24h + continue: true + matchers: + - severity = info +### nrb ### + - receiver: 'nrb' + repeat_interval: 4h + continue: true + matchers: + - severity =~ "warning|critical" + - receiver: 'nrb' + repeat_interval: 24h + continue: true + matchers: + - severity =~ "info" + + +# Inhibition rules allow muting a set of alerts while another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. 
+inhibit_rules: +- source_match: + severity: 'critical' + target_match: + severity: 'warning' + # Apply inhibition if the alertname is the same. + equal: ['alertname', 'cluster', 'service'] + + +receivers: +#- name: 'ffs-gw-admins' +# email_configs: +# - to: 'gw-admins@freifunk-stuttgart.de' +# webhook_configs: +# - url: 'http://localhost:9199/alert' +- name: 'leonard_healthchecks' + email_configs: + - to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de' +- name: 'leonard_selfhosted' + email_configs: + - to: 'leonard@selfhosted.de' + send_resolved: true +- name: 'null' + email_configs: [] # Kein Versand +- name: leonard_pushover + pushover_configs: + - token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW + user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd + priority: 0 + send_resolved: true +- name: 'nrb' + email_configs: + - to: 'ffs-alerts@nicoboehr.de' + send_resolved: true diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 new file mode 100644 index 0000000000000000000000000000000000000000..2700986f3f20e7c3e19f9a236176da9d6341ddb9 --- /dev/null +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,172 @@ +--- +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}' + +alerting: + alertmanagers: + - static_configs: + - targets: ['localhost:9093'] + +# Load rules once and periodically evaluate them +# according to the global 'evaluation_interval'. +rule_files: + - 'alerts/*.yml' + +scrape_configs: + - job_name: 'prometheus' + static_configs: +{% for host in groups['role_prometheus'] %} + - targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998'] + labels: + instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}' +{% endfor %} + scheme: https + metrics_path: /proxy + params: + module: + - prometheus + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + + - job_name: 'alertmanager' + static_configs: + - targets: ['localhost:9093'] + + - job_name: 'collectd' + scrape_interval: 60s + static_configs: + - targets: + - 'yanic01.vm.freifunk-stuttgart.de:9998' + labels: + instance: "10.0.3.236:9104" + scheme: https + metrics_path: /proxy + params: + module: + - respondd + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + + - job_name: 'bird' + scrape_interval: 15s + file_sd_configs: + - files: + - /etc/prometheus/bird_exporter_targets.yml + scheme: https + metrics_path: /proxy + params: + module: + - bird + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + + - job_name: monitor01_blackbox + scrape_interval: 15s + metrics_path: /proxy + scheme: https + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + params: + module: + - blackbox + - icmp + static_configs: + - targets: + - 10.190.0.93 + - 10.190.176.93 + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - 
source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: monitor01.vm.freifunk-stuttgart.de:9998 + - job_name: 'node' + scrape_interval: 15s + file_sd_configs: + - files: + - /etc/prometheus/node_exporter_targets.yml + scheme: https + metrics_path: /proxy + params: + module: + - node + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + + - job_name: 'kea' + scrape_interval: 15s + file_sd_configs: + - files: + - /etc/prometheus/kea_exporter_targets.yml + scheme: https + metrics_path: /proxy + params: + module: + - kea + tls_config: + ca_file: /etc/prometheus/ssl/ca_cert.pem + cert_file: /etc/prometheus/ssl/client.cert.pem + key_file: /etc/prometheus/ssl/client.key.pem + insecure_skip_verify: false # only true for debugging + + # Re-activate when fastd-exporter is installed on gws + # - job_name: bb_fastd + # scrape_interval: 15s + # file_sd_configs: + # - files: + # - 'target-fastd.json' + + - job_name: json_gwpref + metrics_path: /probe + params: + module: [gwpref] + static_configs: +{% for host in groups['role_gw'] %} + - targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json'] + labels: + instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}' +{% endfor %} + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + ## Location of the json exporter's real <hostname>:<port> + replacement: localhost:7979 + + - job_name: 'federate' + scrape_interval: 15s + honor_labels: true + metrics_path: '/federate' + params: + 'match[]': + - '{job="blackbox"}' + - '{job="blackbox-5g"}' + - '{job="blackbox-starlink"}' + - '{job="zyxel"}' + - '{job="node"}' + - '{job="snmp"}' + - '{job="unifi"}' + static_configs: + - targets: + - '10.191.255.172:9090' + labels: + ignore_down: "1"
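
Possible follow-up to the role above (not part of this change): the tasks template prometheus.yml and copy the alert rules and alertmanager.yml before notifying the Reload prometheus handler, so a validation step could catch syntax errors at deploy time instead of at reload. A minimal sketch, assuming promtool and amtool are available from the prometheus and prometheus-alertmanager packages this role installs:

# Hypothetical validation tasks, not present in the role above.
- name: Validate Prometheus config and referenced rule files
  command: promtool check config /etc/prometheus/prometheus.yml
  changed_when: false

- name: Validate Alertmanager config
  command: amtool check-config /etc/prometheus/alertmanager.yml
  changed_when: false

promtool check config also checks every file matched by rule_files, so a broken file under /etc/prometheus/alerts/ would fail the play rather than only triggering the PROMETHEUS_RELOAD_FAILED alert later.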