Commit 1026b1b5 authored by Leonard Penzer
More than expected

- scrape prometheus through exporter-exporter
- Install json-exporter
- Install alert-rules
- Reload prometheus where sufficient (no restart)
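The first point follows one pattern throughout this commit: exporter-exporter exposes each local exporter as a named module behind a single TLS port (9998 here), and Prometheus selects the module via the /proxy endpoint. A minimal sketch of that pairing, using only names and ports that appear in the files below:

# exporter-exporter.yml on the scraped host: expose the local Prometheus as module "prometheus"
modules:
  prometheus:
    method: http
    http:
      port: 9090

# prometheus.yml on the scraping host: fetch that module through the proxy on port 9998
scrape_configs:
  - job_name: prometheus
    scheme: https
    metrics_path: /proxy
    params:
      module: [prometheus]
    static_configs:
      - targets: ['prometheus01.vm.freifunk-stuttgart.de:9998']
    tls_config:
      ca_file: /etc/prometheus/ssl/ca_cert.pem
      cert_file: /etc/prometheus/ssl/client.cert.pem
      key_file: /etc/prometheus/ssl/client.key.pem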
parent 1a1b0224
Showing changed files with 1064 additions and 6 deletions
---
role_prometheus:
hosts:
prometheus01.vm.freifunk-stuttgart.de:
prometheus02.vm.freifunk-stuttgart.de:
@@ -284,6 +284,14 @@
dest: /etc/prometheus/exporter-exporter.yml
content: |
modules:
prometheus:
method: http
http:
port: 9090
alertmanager:
method: http
http:
port: 9093
node:
method: http
http:
...
---
# Sample configuration.
# See https://prometheus.io/docs/alerting/configuration/ for documentation.
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25'
smtp_from: 'alertmanager@freifunk-stuttgart.de'
# The directory from which notification templates are read.
templates:
- '/etc/prometheus/alertmanager_templates/*.tmpl'
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'cluster', 'service', 'severity']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to resend it.
repeat_interval: 24h
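# Worked example with the values above: an alert firing at 12:00:00 opens a new
# group and is mailed after group_wait at ~12:00:30; an alert joining the same
# group at 12:00:40 goes out with the next batch no earlier than group_interval
# later (~12:05:30); a group that keeps firing unchanged is re-notified once per
# repeat_interval, i.e. every 24h.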
# A default receiver
receiver: 'null'
routes:
### leonard monitoring ###
- receiver: 'leonard_healthchecks'
repeat_interval: 5m
continue: false
#group_wait: 1s
#group_interval: 1m
matchers:
- alertname = SelfMonitoringAlwaysFiring
- severity = info
### leonard ###
- receiver: 'leonard_pushover'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 24h
continue: true
matchers:
- severity = info
### nrb ###
- receiver: 'nrb'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'nrb'
repeat_interval: 24h
continue: true
matchers:
- severity =~ "info"
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
# Apply inhibition if the alertname is the same.
equal: ['alertname', 'cluster', 'service']
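# Example: while an alert is firing with severity="critical", the warning-level
# alert carrying the same alertname/cluster/service labels is suppressed, so
# only the critical notification goes out.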
receivers:
#- name: 'ffs-gw-admins'
# email_configs:
# - to: 'gw-admins@freifunk-stuttgart.de'
# webhook_configs:
# - url: 'http://localhost:9199/alert'
- name: 'leonard_healthchecks'
email_configs:
- to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
- name: 'leonard_selfhosted'
email_configs:
- to: 'leonard@selfhosted.de'
send_resolved: true
- name: 'null'
email_configs: [] # no mail is sent
- name: leonard_pushover
pushover_configs:
- token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
priority: 0
send_resolved: true
- name: 'nrb'
email_configs:
- to: 'ffs-alerts@nicoboehr.de'
send_resolved: true
---
groups:
- name: Selfmonitoring
rules:
- alert: 'SelfMonitoringAlwaysFiring'
expr: minute() >= 0
for: 1s
labels:
severity: info
application: leonard_healthchecks
---
groups:
- name: lowpref
rules:
- alert: LowGatewayPreference
expr: gw_loadbalancing_pref{segment="1"} < 10
for: 1d
labels:
severity: page
annotations:
summary: |
{{ .Labels.gateway }} has low gateway preference ({{ .Value }})
---
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0'
for: 10m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
groups:
- name: up_success
rules:
- alert: UP_FAILED
expr: up{ignore_down!="1"} < 1
for: 15m
labels:
severity: warning
application: prometheus
annotations:
summary: "Scrapes not functional"
- name: reload_success
rules:
- alert: PROMETHEUS_RELOAD_FAILED
expr: prometheus_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of prometheus config failed"
- alert: ALERTMANAGER_RELOAD_FAILED
expr: alertmanager_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of alertmanager config failed"
- name: probe_success
rules:
- alert: PROBE_FAILED_TCP
expr: probe_success{} < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Blackbox probe failed"
---
groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: 'smartctl_device_temperature > 60'
for: 2m
labels:
severity: warning
annotations:
summary: Smart device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: 'smartctl_device_temperature > 80'
for: 2m
labels:
severity: critical
annotations:
summary: Smart device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart critical warning (instance {{ $labels.instance }})
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart media errors (instance {{ $labels.instance }})
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartNvmeWearoutIndicator
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
for: 15m
labels:
severity: critical
annotations:
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
modules:
gwpref:
metrics:
- name: gw_loadbalancing_pref
help: "Current preference; ranges from -inf to 100, where 100 means most willing to accept more nodes."
path: '{ .segments.1.preference }'
labels:
segment: '1'
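For reference, the JSONPath above implies that the gateway status document (served as /data/gwstatus.json, see the json_gwpref job further down) is shaped roughly like this; the shape is inferred from the path and the value is made up:

{
  "segments": {
    "1": {
      "preference": 42
    }
  }
}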
[Unit]
Description=Prometheus Json Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
StandardError=syslog
Restart=on-failure
KillSignal=SIGQUIT
ExecStart=/opt/json-exporter/json_exporter --config.file /opt/json-exporter/config.yml
[Install]
WantedBy=multi-user.target
@@ -2,4 +2,7 @@
service:
name: prometheus
state: restarted
- name: Reload prometheus
service:
name: prometheus
state: reloaded
@@ -8,26 +8,49 @@
msg: "This role must only be run on prometheus hosts"
when: not is_prometheus | default(false)
- name: Create prometheus.yml file
template:
src: prometheus.yml.j2
dest: /etc/prometheus/prometheus.yml
notify:
- Reload prometheus
- name: Create node_exporter_targets.yml file
template:
src: node_exporter_targets.yml.j2
dest: /etc/prometheus/node_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Create kea_exporter_targets.yml file
template:
src: kea_exporter_targets.yml.j2
dest: /etc/prometheus/kea_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Create bird_exporter_targets.yml file
template:
src: bird_exporter_targets.yml.j2
dest: /etc/prometheus/bird_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Copy all alerting rules
copy:
src: alerts/
dest: /etc/prometheus/alerts
mode: preserve
owner: root
group: root
- name: Copy alertmanager.yml
copy:
src: alertmanager.yml
dest: /etc/prometheus/
mode: preserve
owner: root
group: root
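# Not part of this commit: rule and alertmanager syntax could be validated right
# after copying, before the reload handler runs. A sketch, assuming the promtool
# and amtool binaries shipped with the Debian packages installed below:
- name: Validate alerting rules (sketch)
  shell: promtool check rules /etc/prometheus/alerts/*.yml
  changed_when: false
- name: Validate alertmanager config (sketch)
  command: amtool check-config /etc/prometheus/alertmanager.yml
  changed_when: false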
- name: Check if client-cert exists
stat:
@@ -155,5 +178,58 @@
- prometheus
- prometheus-alertmanager
- prometheus-blackbox-exporter
- yamllint
state: present
install_recommends: false
notify:
- Reload prometheus
- name: Create json_exporter target directory
file:
path: /opt/json-exporter
state: directory
mode: '0755'
- name: Download json_exporter
get_url:
url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz
dest: /opt/json-exporter/json_exporter.tar.gz
mode: '0644'
- name: Unpack json_exporter
unarchive:
src: /opt/json-exporter/json_exporter.tar.gz
dest: /opt/json-exporter
remote_src: yes
- name: Move binary to /opt/json-exporter
command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter
args:
creates: /opt/json-exporter/json_exporter
- name: Make json_exporter executable
file:
path: /opt/json-exporter/json_exporter
mode: '0755'
- name: Copy config.yml to /opt/json-exporter/
copy:
src: files/config.yml
dest: /opt/json-exporter/config.yml
mode: '0644'
- name: Copy systemd service file for json_exporter
copy:
src: files/json-exporter.service
dest: /etc/systemd/system/json-exporter.service
mode: '0644'
- name: Reload systemd to pick up new unit files
systemd:
daemon_reload: yes
- name: Enable and start json_exporter
systemd:
name: json-exporter
enabled: yes
state: started
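# Not part of this commit: a quick end-to-end check that the exporter answers the
# same /probe request Prometheus will send; the gateway URL below is only an
# illustrative placeholder.
- name: Smoke-test json_exporter (sketch)
  uri:
    url: "http://localhost:7979/probe?module=gwpref&target=http://gw01.example.org/data/gwstatus.json"
    return_content: yes
  register: gwpref_probe
  failed_when: "'gw_loadbalancing_pref' not in gwpref_probe.content"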
---
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}'
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules once and periodically evaluate them
# according to the global 'evaluation_interval'.
rule_files:
- 'alerts/*.yml'
scrape_configs:
- job_name: 'prometheus'
static_configs:
{% for host in groups['role_prometheus'] %}
- targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
scheme: https
metrics_path: /proxy
params:
module:
- prometheus
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'alertmanager'
static_configs:
- targets: ['localhost:9093']
- job_name: 'collectd'
scrape_interval: 60s
static_configs:
- targets:
- 'yanic01.vm.freifunk-stuttgart.de:9998'
labels:
instance: "10.0.3.236:9104"
scheme: https
metrics_path: /proxy
params:
module:
- respondd
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'bird'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/bird_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- bird
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: monitor01_blackbox
scrape_interval: 15s
metrics_path: /proxy
scheme: https
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
params:
module:
- blackbox
- icmp
static_configs:
- targets:
- 10.190.0.93
- 10.190.176.93
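# Blackbox-style indirection: the address listed above becomes the ?target=
# query parameter and is kept as the instance label, while the scrape itself is
# sent to the exporter-exporter on monitor01 (the module params select blackbox
# and its icmp probe behind it).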
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: monitor01.vm.freifunk-stuttgart.de:9998
- job_name: 'node'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/node_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- node
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'kea'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/kea_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- kea
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
# Re-activate when fastd-exporter is installed on gws
# - job_name: bb_fastd
# scrape_interval: 15s
# file_sd_configs:
# - files:
# - 'target-fastd.json'
- job_name: json_gwpref
metrics_path: /probe
params:
module: [gwpref]
static_configs:
{% for host in groups['role_gw'] %}
- targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
## Location of the json exporter's real <hostname>:<port>
replacement: localhost:7979
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job="blackbox"}'
- '{job="blackbox-5g"}'
- '{job="blackbox-starlink"}'
- '{job="zyxel"}'
- '{job="node"}'
- '{job="snmp"}'
- '{job="unifi"}'
static_configs:
- targets:
- '10.191.255.172:9090'
labels:
ignore_down: "1"
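# Note: the ignore_down="1" label pairs with the UP_FAILED rule above
# (up{ignore_down!="1"} < 1), so an unreachable federation source does not
# trigger that alert.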