Commit 1026b1b5 authored by Leonard Penzer
More than expected

- scrape prometheus through exporter-exporter
- Install json-exporter
- Install alert-rules
- Reload prometheus where sufficient (no restart)
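The first point follows one pattern throughout this commit: exporter-exporter exposes each local exporter as a named module behind a single TLS port (9998 here), and Prometheus selects the module via the /proxy endpoint. A minimal sketch of that pairing, using only names and ports that appear in the files below:

# exporter-exporter.yml on the scraped host: expose the local Prometheus as module "prometheus"
modules:
  prometheus:
    method: http
    http:
      port: 9090

# prometheus.yml on the scraping host: fetch that module through the proxy on port 9998
scrape_configs:
  - job_name: prometheus
    scheme: https
    metrics_path: /proxy
    params:
      module: [prometheus]
    static_configs:
      - targets: ['prometheus01.vm.freifunk-stuttgart.de:9998']
    tls_config:
      ca_file: /etc/prometheus/ssl/ca_cert.pem
      cert_file: /etc/prometheus/ssl/client.cert.pem
      key_file: /etc/prometheus/ssl/client.key.pem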
parent 1a1b0224
Showing changed files with 1064 additions and 6 deletions
---
role_prometheus:
hosts:
prometheus01.vm.freifunk-stuttgart.de:
prometheus02.vm.freifunk-stuttgart.de:
@@ -284,6 +284,14 @@
dest: /etc/prometheus/exporter-exporter.yml
content: |
modules:
prometheus:
method: http
http:
port: 9090
alertmanager:
method: http
http:
port: 9093
node:
method: http
http:
...
---
# Sample configuration.
# See https://prometheus.io/docs/alerting/configuration/ for documentation.
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25'
smtp_from: 'alertmanager@freifunk-stuttgart.de'
# The directory from which notification templates are read.
templates:
- '/etc/prometheus/alertmanager_templates/*.tmpl'
# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'cluster', 'service', 'severity']
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to resend it.
repeat_interval: 24h
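# Worked example with the values above: an alert firing at 12:00:00 opens a new
# group and is mailed after group_wait at ~12:00:30; an alert joining the same
# group at 12:00:40 goes out with the next batch no earlier than group_interval
# later (~12:05:30); a group that keeps firing unchanged is re-notified once per
# repeat_interval, i.e. every 24h.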
# A default receiver
receiver: 'null'
routes:
### leonard monitoring ###
- receiver: 'leonard_healthchecks'
repeat_interval: 5m
continue: false
#group_wait: 1s
#group_interval: 1m
matchers:
- alertname = SelfMonitoringAlwaysFiring
- severity = info
### leonard ###
- receiver: 'leonard_pushover'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'leonard_selfhosted'
repeat_interval: 24h
continue: true
matchers:
- severity = info
### nrb ###
- receiver: 'nrb'
repeat_interval: 4h
continue: true
matchers:
- severity =~ "warning|critical"
- receiver: 'nrb'
repeat_interval: 24h
continue: true
matchers:
- severity =~ "info"
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
# Apply inhibition if the alertname is the same.
equal: ['alertname', 'cluster', 'service']
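# Example: while an alert is firing with severity="critical", the warning-level
# alert carrying the same alertname/cluster/service labels is suppressed, so
# only the critical notification goes out.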
receivers:
#- name: 'ffs-gw-admins'
# email_configs:
# - to: 'gw-admins@freifunk-stuttgart.de'
# webhook_configs:
# - url: 'http://localhost:9199/alert'
- name: 'leonard_healthchecks'
email_configs:
- to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
- name: 'leonard_selfhosted'
email_configs:
- to: 'leonard@selfhosted.de'
send_resolved: true
- name: 'null'
email_configs: [] # no mail is sent
- name: leonard_pushover
pushover_configs:
- token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
priority: 0
send_resolved: true
- name: 'nrb'
email_configs:
- to: 'ffs-alerts@nicoboehr.de'
send_resolved: true
---
groups:
- name: Selfmonitoring
rules:
- alert: 'SelfMonitoringAlwaysFiring'
expr: minute() >= 0
for: 1s
labels:
severity: info
application: leonard_healthchecks
---
groups:
- name: lowpref
rules:
- alert: LowGatewayPreference
expr: gw_loadbalancing_pref{segment="1"} < 10
for: 1d
labels:
severity: page
annotations:
summary: |
{{ .Labels.gateway }} has low gateway preference ({{ .Value }})
---
groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0'
for: 10m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxConfigurationReloadFailure
expr: 'blackbox_exporter_config_last_reload_successful != 1'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
groups:
- name: up_success
rules:
- alert: UP_FAILED
expr: up{ignore_down!="1"} < 1
for: 15m
labels:
severity: warning
application: prometheus
annotations:
summary: "Scrapes not functional"
- name: reload_success
rules:
- alert: PROMETHEUS_RELOAD_FAILED
expr: prometheus_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of prometheus config failed"
- alert: ALERTMANAGER_RELOAD_FAILED
expr: alertmanager_config_last_reload_successful < 1
for: 1m
labels:
severity: warning
application: prometheus
annotations:
summary: "Reload of alertmanager config failed"
- name: probe_success
rules:
- alert: PROBE_FAILED_TCP
expr: probe_success{} < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Blackbox probe failed"
---
groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: 'smartctl_device_temperature > 60'
for: 2m
labels:
severity: warning
annotations:
summary: Smart device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: 'smartctl_device_temperature > 80'
for: 2m
labels:
severity: critical
annotations:
summary: Smart device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart critical warning (instance {{ $labels.instance }})
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Smart media errors (instance {{ $labels.instance }})
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartNvmeWearoutIndicator
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
for: 15m
labels:
severity: critical
annotations:
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
modules:
gwpref:
metrics:
- name: gw_loadbalancing_pref
help: "Current preference; ranges from -inf to 100, where 100 means most willing to accept more nodes."
path: '{ .segments.1.preference }'
labels:
segment: '1'
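For reference, the JSONPath above implies that the gateway status document (served as /data/gwstatus.json, see the json_gwpref job further down) is shaped roughly like this; the shape is inferred from the path and the value is made up:

{
  "segments": {
    "1": {
      "preference": 42
    }
  }
}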
[Unit]
Description=Prometheus Json Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
StandardError=syslog
Restart=on-failure
KillSignal=SIGQUIT
ExecStart=/opt/json-exporter/json_exporter --config.file /opt/json-exporter/config.yml
[Install]
WantedBy=multi-user.target
@@ -2,4 +2,7 @@
service:
name: prometheus
state: restarted
- name: Reload prometheus
service:
name: prometheus
state: reloaded
@@ -8,26 +8,49 @@
msg: "This role must only be run on prometheus hosts"
when: not is_prometheus | default(false)
- name: Create prometheus.yml file
template:
src: prometheus.yml.j2
dest: /etc/prometheus/prometheus.yml
notify:
- Reload prometheus
- name: Create node_exporter_targets.yml file
template:
src: node_exporter_targets.yml.j2
dest: /etc/prometheus/node_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Create kea_exporter_targets.yml file
template:
src: kea_exporter_targets.yml.j2
dest: /etc/prometheus/kea_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Create bird_exporter_targets.yml file
template:
src: bird_exporter_targets.yml.j2
dest: /etc/prometheus/bird_exporter_targets.yml
notify:
- Restart prometheus
- Reload prometheus
- name: Copy all alerting rules
copy:
src: alerts/
dest: /etc/prometheus/alerts
mode: preserve
owner: root
group: root
- name: Copy alertmanager.yml
copy:
src: alertmanager.yml
dest: /etc/prometheus/
mode: preserve
owner: root
group: root
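# Not part of this commit: rule and alertmanager syntax could be validated right
# after copying, before the reload handler runs. A sketch, assuming the promtool
# and amtool binaries shipped with the Debian packages installed below:
- name: Validate alerting rules (sketch)
  shell: promtool check rules /etc/prometheus/alerts/*.yml
  changed_when: false
- name: Validate alertmanager config (sketch)
  command: amtool check-config /etc/prometheus/alertmanager.yml
  changed_when: false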
- name: Check if client-cert exists
stat:
@@ -155,5 +178,58 @@
- prometheus
- prometheus-alertmanager
- prometheus-blackbox-exporter
- yamllint
state: present
install_recommends: false
notify:
- Reload prometheus
- name: Create json_exporter target directory
file:
path: /opt/json-exporter
state: directory
mode: '0755'
- name: Download json_exporter
get_url:
url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz
dest: /opt/json-exporter/json_exporter.tar.gz
mode: '0644'
- name: Unpack json_exporter
unarchive:
src: /opt/json-exporter/json_exporter.tar.gz
dest: /opt/json-exporter
remote_src: yes
- name: Move binary to /opt/json-exporter
command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter
args:
creates: /opt/json-exporter/json_exporter
- name: Make json_exporter executable
file:
path: /opt/json-exporter/json_exporter
mode: '0755'
- name: Copy config.yml to /opt/json-exporter/
copy:
src: files/config.yml
dest: /opt/json-exporter/config.yml
mode: '0644'
- name: Copy systemd service file for json_exporter
copy:
src: files/json-exporter.service
dest: /etc/systemd/system/json-exporter.service
mode: '0644'
- name: Reload systemd to pick up new unit files
systemd:
daemon_reload: yes
- name: Enable and start json_exporter
systemd:
name: json-exporter
enabled: yes
state: started
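# Not part of this commit: a quick end-to-end check that the exporter answers the
# same /probe request Prometheus will send; the gateway URL below is only an
# illustrative placeholder.
- name: Smoke-test json_exporter (sketch)
  uri:
    url: "http://localhost:7979/probe?module=gwpref&target=http://gw01.example.org/data/gwstatus.json"
    return_content: yes
  register: gwpref_probe
  failed_when: "'gw_loadbalancing_pref' not in gwpref_probe.content"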
---
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}'
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
# Load rules once and periodically evaluate them
# according to the global 'evaluation_interval'.
rule_files:
- 'alerts/*.yml'
scrape_configs:
- job_name: 'prometheus'
static_configs:
{% for host in groups['role_prometheus'] %}
- targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
scheme: https
metrics_path: /proxy
params:
module:
- prometheus
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'alertmanager'
static_configs:
- targets: ['localhost:9093']
- job_name: 'collectd'
scrape_interval: 60s
static_configs:
- targets:
- 'yanic01.vm.freifunk-stuttgart.de:9998'
labels:
instance: "10.0.3.236:9104"
scheme: https
metrics_path: /proxy
params:
module:
- respondd
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'bird'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/bird_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- bird
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: monitor01_blackbox
scrape_interval: 15s
metrics_path: /proxy
scheme: https
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
params:
module:
- blackbox
- icmp
static_configs:
- targets:
- 10.190.0.93
- 10.190.176.93
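# Blackbox-style indirection: the address listed above becomes the ?target=
# query parameter and is kept as the instance label, while the scrape itself is
# sent to the exporter-exporter on monitor01 (the module params select blackbox
# and its icmp probe behind it).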
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: monitor01.vm.freifunk-stuttgart.de:9998
- job_name: 'node'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/node_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- node
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
- job_name: 'kea'
scrape_interval: 15s
file_sd_configs:
- files:
- /etc/prometheus/kea_exporter_targets.yml
scheme: https
metrics_path: /proxy
params:
module:
- kea
tls_config:
ca_file: /etc/prometheus/ssl/ca_cert.pem
cert_file: /etc/prometheus/ssl/client.cert.pem
key_file: /etc/prometheus/ssl/client.key.pem
insecure_skip_verify: false # only true for debugging
# Re-activate when fastd-exporter is installed on gws
# - job_name: bb_fastd
# scrape_interval: 15s
# file_sd_configs:
# - files:
# - 'target-fastd.json'
- job_name: json_gwpref
metrics_path: /probe
params:
module: [gwpref]
static_configs:
{% for host in groups['role_gw'] %}
- targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json']
labels:
instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
{% endfor %}
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
## Location of the json exporter's real <hostname>:<port>
replacement: localhost:7979
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job="blackbox"}'
- '{job="blackbox-5g"}'
- '{job="blackbox-starlink"}'
- '{job="zyxel"}'
- '{job="node"}'
- '{job="snmp"}'
- '{job="unifi"}'
static_configs:
- targets:
- '10.191.255.172:9090'
labels:
ignore_down: "1"
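# Note: the ignore_down="1" label pairs with the UP_FAILED rule above
# (up{ignore_down!="1"} < 1), so an unreachable federation source does not
# trigger that alert.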