From 5020e66a58ffb920815b463f59fd232f6545727d Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 8 Jun 2025 09:58:35 +0200
Subject: [PATCH 01/13] Update Inventory

- add gw01n03 to external_vm
- add firmware to ffspveguests
- add gw05n01 and gw05n02 to external_vm
- fix dhcp4 (use dhcp04.vm)
- remove wiki-testing - id is used by pbs01
- remove dns03.vm.freifunk-stuttgart.de from ffspveguests
- add prometheus01 - selfnet-vm to external_vm
- update gws to use vm-entries
---
 inventory/external_vm  | 6 ++++++
 inventory/ffspveguests | 4 +---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/inventory/external_vm b/inventory/external_vm
index dc466f1..63d6dc8 100644
--- a/inventory/external_vm
+++ b/inventory/external_vm
@@ -7,3 +7,9 @@ external_vm:
     dhcp03.freifunk-stuttgart.de:
       ansible_ssh_port: 44353
     dns02.as208772.net:
+    gw01n03.vm.freifunk-stuttgart.de:
+      ansible_ssh_port: 44353
+    gw05n01.vm.freifunk-stuttgart.de:
+    gw05n02.vm.freifunk-stuttgart.de:
+    prometheus01.vm.freifunk-stuttgart.de:
+      ansible_ssh_host: freifunk.ext.selfnet.de
diff --git a/inventory/ffspveguests b/inventory/ffspveguests
index 3cca7f9..f94fb9b 100644
--- a/inventory/ffspveguests
+++ b/inventory/ffspveguests
@@ -16,7 +16,6 @@ ffspveguests:
       ffspve_id: 8194
     ripe-atlas01.vm.freifunk-stuttgart.de:
       ffspve_id: 8187
-    dhcp04.vm.freifunk-stuttgart.de:
     revproxy-05.vm.freifunk-stuttgart.de:
     openslides.vm.freifunk-stuttgart.de:
     glrunner-ffs05.vm.freifunk-stuttgart.de:
@@ -31,7 +30,6 @@ ffspveguests:
     nodealarm01.vm.freifunk-stuttgart.de:
     prometheus02.vm.freifunk-stuttgart.de:
     prometheus03.vm.freifunk-stuttgart.de:
-    wiki-testing.vm.freifunk-stuttgart.de:
     mailexpand.vm.freifunk-stuttgart.de:
     pad.vm.freifunk-stuttgart.de:
     revproxy-03.vm.freifunk-stuttgart.de:
@@ -64,10 +62,10 @@ ffspveguests:
     vaultwarden.vm.freifunk-stuttgart.de:
     nextbox.vm.freifunk-stuttgart.de:
     ffs13r.vm.freifunk-stuttgart.de:
-    dns03.vm.freifunk-stuttgart.de:
     ffs10.vm.freifunk-stuttgart.de:
     dhcp02.vm.freifunk-stuttgart.de:
     jumphost01.vm.freifunk-stuttgart.de:
     nrb-backbonetest2.vm.freifunk-stuttgart.de:
       ansible_ssh_host: 2a01:4f8:172:feff:be24:11ff:fe8b:8979
       ansible_ssh_user: root
+    firmware.vm.freifunk-stuttgart.de:
--
GitLab

From 6b5a0cd13bf3e1d90691d04409429e4ea30afa99 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 15 Jun 2025 21:43:30 +0200
Subject: [PATCH 02/13] Inventory: Add gws to role_gw in inventory

---
 inventory/role_gw | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 inventory/role_gw

diff --git a/inventory/role_gw b/inventory/role_gw
new file mode 100644
index 0000000..667c2b8
--- /dev/null
+++ b/inventory/role_gw
@@ -0,0 +1,9 @@
+---
+role_gw:
+  hosts:
+    gw01n03.vm.freifunk-stuttgart.de:
+      ansible_ssh_port: 44353
+    gw04n06.vm.freifunk-stuttgart.de:
+    gw05n02.vm.freifunk-stuttgart.de:
+    gw09n03.vm.freifunk-stuttgart.de:
+    gw09n04.vm.freifunk-stuttgart.de:
--
GitLab

From 3329bb8c39d9c7223e09fdb8a83a33f5efa56f62 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sat, 7 Jun 2025 18:12:43 +0200
Subject: [PATCH 03/13] Initial prometheus-exporters role

- open nft firewall for prometheus-exporter-exporter
---
 group_vars/all/prometheus_ca_vault           | 270 ++++++++++++++
 roles/prometheus-exporters/handlers/main.yml |  16 +
 roles/prometheus-exporters/tasks/main.yml    | 354 +++++++++++++++++++
 3 files changed, 640 insertions(+)
 create mode 100644 group_vars/all/prometheus_ca_vault
 create mode 100644 roles/prometheus-exporters/handlers/main.yml
 create mode 100644
roles/prometheus-exporters/tasks/main.yml diff --git a/group_vars/all/prometheus_ca_vault b/group_vars/all/prometheus_ca_vault new file mode 100644 index 0000000..0f8f183 --- /dev/null +++ b/group_vars/all/prometheus_ca_vault @@ -0,0 +1,270 @@ +$ANSIBLE_VAULT;1.2;AES256;ansible-ffs +30636138643961623138666666663630653034653639323234336630633332393632396339396562 +6135313732663438653030666461376463366137303162380a643262613235336236666335633030 +37336139646262333532653134653737343964353530383066323664353465313530356336643861 +3839356565373565380a626662663265323962333839373339663361363262616263656337353139 +61646236623836633832376663363331616531623739313333333035393333356437613939626161 +31356330393964663565353762363733356562646664363065323835633036383434353739653433 +39346264386134353364323733383664303034616562346661326135646464643238316461313064 +36356365376263646134363637336336306436616534663737386533376132623434626462363763 +39613966656434613830373231313263306433613234666434366166343333313064326262383264 +66613731643633636134366230663562393263626237353865333934613538353532623237386331 +31373638386663323361356436383733303631323962303766663463656666306231363030633735 +35306366653037313931623233326133393032613464373531383366376434353133346566383166 +31366130363464343466336337666465396436356531626664383961623131663435313632613866 +39363236343737663834656130643965653236623862353663376365613334663363383338663463 +62306564353232663631393534663061316362643531656530636165666666396133333865626538 +33623563393866623563373333336666383732326537323363336134326632623363623666623162 +65346539323331663236383336623863343135376635626633353139623434663331656635386662 +36376634663863623762643862396339326635653933383739396465333061386166616339623430 +66326565303936393239386339356363663130656431396566663366386664353730393733336366 +64363665313631373038633365373334636261653465353964656139343962323465633835303831 +31313562663530316435646362613465356266376266616430376132663530356139633330366637 +30343931346333303230363065636432613036393262333666343530326537633162646339666536 +63663065386265383336333732306236633134393061343833343233326439313361643930613863 +66346335653666313335356564616530303663383966663035356565356666313561366633346539 +33356564373535313062303437323637393835616233666130633434643966343937393738386132 +39356538353965376139303438616463393061626238326436376634636335376566333761653461 +38663662633434386439666636323164643433626362626134646264396364316238636664633434 +62643966313934353263346361303033353966313139643238633335646164356337393334623766 +34373566366662373634663933343561343433343165343135663766393963333731643032326235 +37616663336438306536386463653865323135383637643162383964633638336632356166633033 +37643165346437303730306136363636643231393134666662326138336532363132393433613533 +37343430346335653365633834383164353232323634613437643938353530303239643831386564 +37636464316464396233323538633930613264633366353233623965376363616334623032343761 +32646435356364633332313562663265346465306635393337383364653366356239613239613264 +39353864373866363339636339333639656334636433393139353662323362616531343738626136 +35386335636539376330326366656338373232363434633063353136303930386234326139613535 +33633731343166636534633533393935353831363166363334376230663537313362646166376235 +61336335323838366539626631343531383561313531646534666236313036346562333334373262 +64383563386337653938663965623637646165633330656134393662643732653466363866306237 
+62633731663631373263373564663637343630373830646336616132636632646135623537343865 +64666433346538666366613439643639663266633237353130316433353664663766343339336561 +63366334343733646631663531303263376164643765306261323434636338306132363634373032 +37363233373964613166386537633536346662393661333238633539333933623465663135336365 +37393134353038656365363230663462343132366135363336363964393534353632336461383939 +33306637633730623566303533653761613131363936653632656361363265643535313330356335 +63306132623734666234613365346336303565623630623865653832386465303736633130383632 +66336432633534343663343234346239613733623965376634613265383465633464616562633661 +33663231616133613931333330313663313531313137663730623462333139396330646531333231 +35366364316235616532633233363339303862613934306435376536613735326133306261613866 +30653666623039643035346531623130366238633066613631346432343364316237383532656132 +62653262343736356330386238343566656362633461633334373261343464373065656464373361 +63623665393339383431356634326566646237356565356238613233373831646534373065363862 +32373131626461316134353962323331386331366463343636643261373361383062303230393064 +37666434666533626162643635663634623932623330656162363138663833393433373533323364 +64643064633334643331373231623033633035326232336362313462623431326237366563323430 +32393136653436653134633333613930373433653337303464363962373731646535613966663531 +63303833343338343162393239623535646537636339333539326632636162383733623636373039 +37303766383832313738346266383232313265333261336465653961666261666539623230393964 +36646336346539313462356339343465383261343638613261333261366461303837323566656565 +35373461313662323431663235353030326538393261633437616666643137346364396661616166 +34316336333731373538353061646664353632626235373366656532653837343633363337363534 +65346330323731633437303964353838333165386664616134663535393538363639313032393435 +31343765643338633438386364303739366332333866373436653639396562303562666564653035 +62313236383039616366666533353835306131363233643131613734666135376533383333343363 +38663838316163653737643630636565633865303334363938316435376565323933336163326432 +63663033653931343366616435636565383033643661336366616535656539343062666632653631 +30373564643165653264313133373763393864616662396631333365303638376562356366303736 +63306631343165373565653330656234623437643863356265383166646434656632323266653632 +32363838306539636666623466353637306237373832353532353732643266646335366438643765 +37303739386433383039373533383334613137326235376137663062633237303239363638636134 +61633764643534316163633530613334323339646338313563626665343237666134636432376237 +36623536643639343836376165376431636139386561313034363363373664303364333162613866 +63623764643166383961646335326431623735326235653738353037346366356539626136323532 +38396633353136393136316630666631623465653537396366313938356661626461353965666334 +34643664373830613132643738646562373162373461313835656131653165653562343436363538 +37616139333265396338363635363532393333346438616661666639626137373038633537313465 +31336665653631666539386264396531346561363932363234346564343430343437396661643637 +38356439653037643436353938373138636634643932613163376266386138613439323635333539 +61326238393361333234373033316537613535663730386338616436643463376337376132383437 +34663835343831646664396161336536613066373135643636363539616565613832643462356335 +38656534666463356234373839313930646465336330356364663338663833333430383339366562 +31626131326465313039333737353437306230353034393536326364663862613732613631633031 
+31356335656338373739386535383066666235383836356261626230623931636564663166376262 +37326239663038333366333135306238666365356635326430366539396232366263663066653363 +30366361323565336361306338666361613737326663366263643937393933666262646265353464 +62306133376235616231326465373032636563356133633466383763666232643931356661633630 +36353339323261623831356262363662313734363636386262323265653861303433643137383136 +33386432323361333937623139313363646463303638366266393733653433333436343863393833 +65393437373761636631616465363330383666626466613061376262663665316130633432363861 +61386239333536333731326636653965316334393436303139306138663338353562306463376364 +34626165613636373435326532653237616230373034383563616564653137633361653666653034 +39313038306639646132393465356334646266323064383134383631653834613863313931333235 +63653533616230383138326566353034363561303263616530366130383430633164366331393534 +32623661663865663862353339613761363466613433313763373464643263383634386330376232 +33376366646137616136343066353261643338356634313138303030336431356332653531333037 +32323063626237623532363834666233313934656333653964636465393631376333643130393562 +64623937633961353364306466383264646633343134626336356432323161333534623837336664 +30636666373930393239303031643032613064616633366638346234383931616663336662396336 +64303637313132303232353035313934316635383863373135343830343232646530623735656133 +32666365316439633636653332623839303934343563633639363030316563373531306261396638 +36613438373734646133653234626631363631306434656366636531363335373461363236326531 +66393932343337623162633437313939666334633665356438363137656534366231613830386638 +39313531666162366539613366623631326331613434633162303466613366313137386338626239 +30383534623835626332656564633639316137323866336335636338616361363534353835333465 +63643732383962656532613738383034656265373033353064353038363565656662626638373161 +36336564663733666461623331323338653932353632313439636662353664653739636335306363 +34313032326364363038353061616134363161643466653166373430326439346664383831313232 +65653464386466386232666635393735313030356266633133656166366536356665343031396562 +61653539306332646538376135343861636334383736313363393366363534643761623832363862 +30633930623764626263643263636331633764616330636431336163323166663965386530636665 +38366161613239643635383139663864366232633633336363356662343262336364376135613062 +61336262646336626133613734353939666533393531373732333132383263626566346234366438 +65663466376363623033303463343435626366633836366233323538386633323739313966383431 +32643764613537316139356132646436353661653432346562303263396531653132336537646362 +63356462346436343833346234353933383766386166363132323938646563646239303535383863 +66656334656562643636386531376438336639663362303536343264313430643962326164396134 +38666330316236353539306338393831333763626437643935303436653566383333323431666438 +63353737623933313939623633636430303636636464663031316263343038316161396161366432 +34636334333964613761396233636136656630643561623937636336323439336236656461616533 +31373566363038333133323866313335666139373961373131646463633831323063356532363533 +62343036343737653737336431323235623164303934353630623535333430326636663531386464 +38616136373264626534376234376431666532353636653731363964333538383639633034623262 +31333031323839373663633238356436656236383933616561356439323265306631653232643633 +37316139343831313933633335376438373663633837323565373939643533333832353232323765 +61306361666238663861343361623837626235663833393539323962343032343538356430633635 
+33613465363965376133306339303061356438376662393432356664363536303732346434636533 +38363064313431333530333233373134333064636538363262316138323139393037626636613237 +30633663653237316231646363336163633032663338633534373061386263383061396336363161 +66363334656636346566363536613537666330333235353837303231666262356166376332613036 +35313834393331643664363133396635646633393935623236653738396565663633373733396662 +37363234316463326333653136343133393530363562326136616166386438383838313362386665 +65646262663830613135306562613130616434393563386464396338653130616366653966366239 +32393133616432346666366534333333366336633133366537613864656266626337633563653837 +34313935656637633632383035616430326533393464666363346636316235363333353633323733 +36373137626638336139373631303733363838383132373365353964666437666639646365393961 +36366339346166656364383236396461613233353331643034636666663732306234643662363338 +62636566376662356363356434623863656435376637366561633633616237346431343365663361 +33303736366133633936633233623430363265326232383033313165663039303764373464376336 +61653733643231323862323738333936323433383730373139633333636335653632316466386338 +34303139313838663861636264333966376633313661373637376138383865613764383166393537 +63386137363464356561383935366438373337383031633030373232623466333438353364353933 +34653563383731663231666365626434333235346637333462363334353365363733323936623565 +31313639383065396133333134346335323162313736663565656330353366313434373236306633 +34646264636533316533383833393466356630326330306464366537623632376363303861363364 +61393461386637363230653662373163316663313338383965626234383138633837336265393661 +32653034333764613537633931373366313632323335333365303930386165313662306132666430 +39346364653936356463643663633862326539373037363330323731373661353233396465376539 +30386262353638623930313763643234386262393237663731383533336466346133353766306463 +38626638333439353862373931633065656438663839626563386161366132366236323861616535 +37666361313238383637356633326162306233326462333938363233613431343538373239636230 +66363765613761646466376333303831636230346538386339373165333163396266643562343064 +32663762653339323339313166336132366537303535633038633436313361326438616637346162 +62313732326334653330326565363366623531616562633238336330663839303163393162346565 +61616431663637646534663033366261313934373239313335333563373962346563323537373163 +64633963626235323564353536393931393764363934613265336437643064346463386638323233 +32663238343833346663656135353864373639333364343066396235613961356662393665363630 +38336136386138316535326236333565663136653563303835353964303433636163643866643461 +64396333656630633933393538303863376462363565306564353766663763616166633332393338 +34653465363439323536383163366364306362313865633465303437653935643764326532363362 +62343266333464373963653265323463376363623261346364333132653432353937383632303538 +30373037323236373862643430666239613863646531653962653437333063646165393033656332 +34656536303239353661626339643761623262653835356533343037633937383934306233613734 +39626565393433346562643239306330623662623163643136303366656630363930396539343434 +31393033613139613162393339366132633230656464333839663737643164383761303730316538 +62623934366233626564333066326432366239383330353732646431383533336436663864653132 +37616562316234353834346134653634363738313966643831623766313032313762346230613030 +35383439326466396462346638646165666263363231363435653132326562616562636430643365 +38626439396237363830643831333037353931663064653731316534343661663935633433313737 
+61653839316632386564313236663362313761623065663466656535303164643465343434363462 +64656334633266616335366361623630373335386663373830306166646439323636663266613037 +66633338326135653338663837613830633937633236353039383865363662313738393462623239 +65363938323038386465656535386431383864346130333961333436343262636232323033396435 +36303065333365613035636465343865333732363461653963643736623464356561373164313031 +33643037323063366364616363633534623830383866313033396138313335383661346666396537 +38613530653732336535343934326537343566393231333462313663623437343538636665386261 +39633732366138383363663634303532636633363333396532343736613734623962333865656265 +32386431373463626435386633663432323933373631353664336162326564363534643661383434 +64333133323961306633373362323863666233643663333535626531623032323932396536383161 +61646565623564336365343739623331356661393762326439383138353037393965303531313932 +62363464636564643939383661653166303163626435653538646365313064336664613336616138 +34653631663536303432663731333165396663333564363338326530353265656262636138653534 +62346437383636353834356561346663363832666339303864373435653331613437363939613239 +66393539356531303234613835303938353461373762306563356532653038386461383737313737 +39666361613762373134646161353539386665656635666537656266383035656530393730643665 +30326438383066363338303866333464333339376631633531393161353230353032356435663338 +30303834366465623861643130653064643464303366396666646237643237373962636530663863 +63313130396466386562363435376461666164313337373838646536616166623536663135343263 +64323261633133373366306665643638613930616331343064366532313131636362373431343639 +66393830666133656365393730343931636439626162616163393131633634396536343030626336 +30663266623432643234366337653838643630383736643330666436386335336431653662363964 +36323737303232356466373137323734316238383165316631343639393862343765313935646166 +61613734646539363638613865386534306238383464323639333066326565656665623937666162 +34316537393736356436646665313533303838393666653465386162346336306462303438313735 +37656338363335336232333638653038363030383533353032343133363662633531353338663438 +38326664643236336136343266653462386464343363616637393530393166323639343261356635 +64343438306630633538376235313637306664643236383532646633343635353931663561623566 +35616135316466656662336130633963386266336261353631306536376438323235366434346530 +36616161636561316661633662386562653030643264303533386463366132306434343735313139 +62623135323338366461353034623836626330643932303337323266636430643530656337323031 +62653335623565346333653630356565393738356231383366353639653064356230663432383736 +32663463616563643061323639393063343132623435643339363935346437653233663665666162 +37316263663139653930656537376434323464623030366538333161623434623061343462623261 +65313161393537353833313538643138656135626133373634313139333131393864643331333462 +33616166653661396439666366376333393239323931316438333430313463343834643466616265 +39623637323761646566396536386635393239653562363837393532356330393532663361323736 +64653264663064323239373932643565306561313038356461656162376466366335346136333233 +32313233316638353564646162613135646335373765356332636234663038343031646137393736 +62613937323664313463633637393966613230303365666563363363346262633537313837633035 +65376630613235616264626565313931643130363434316438393235343663646162393264666339 +30656137393563613933363639626663643936363039316239623261623438313061343634323037 +34656563613135363463636534366639343863326134376662386138653066353036653835633334 
+37393032313539613338613438306434383763313336636137303964633766383537346665646139 +37343638376361623035633832633932316265626131636564323866393933636264643363626331 +64663131336130663538663464373738373461393237383866613931313662386233316261373563 +65353865353662633630336432373065643564613137393966613465363165373463613831306630 +32356665643266616566393539396230666661373964393737353433643735363535306364386336 +63353761643230663732383263643865323635353261666262656232343930373039363865366438 +36643564313737336234396531313837396332373834396464356238303739663732626537336337 +33313065363133333661616433353734343430636331376332336432646563653863366466623864 +36663564616232313731336363646536656263633564383165383965653936376464333663333465 +64356462343837343933383837303939393435626538626262303561346338613362366134636437 +39306466643563333361303163346564656631363462366164663630356338306138636266353633 +64363831353330376135386237333338396330333234633362663262323832633961353832303032 +31343637393432666661653531303134323364363264636539633565323232363739323937333838 +64346631393436373937636166393334303632316636396563643630653933366233663663633632 +34313037653038313437343663663432353234353032343439373865666462666664323261303933 +39343832363435336165613631396362303462323261376466386631636533303765363632626334 +35346461653464633734353265663238646164653466306436643565353566636138333565353438 +35316430646434643961353831326535396262646466396332656339306436633034626664666137 +34643664393136333935623963633631336233656430373665396136613938396332633065333137 +62653939633063616533646430636463336264376634393035623137626637636166353337363833 +61653233373330396232313939336135613862666135306235636135633035386262613339356333 +37356235383731396466636135613831643239373237346463353563646131333033363339343231 +31666162326632663032626130656565653334323633636565653263373336323438316430616436 +62663665343966646231616132653235373836616661333539313339343361653666613639326538 +38323463393333376265653936326632313037346261653934343634326530663338383264306134 +30316137383538353035653536663864356433643234343931323230656633656363376330346234 +39353362623533643861336264663662653863393435396336653334303830653466336236303530 +32356234313130613739346632373865633364323634303739353434383636643563333963383639 +38323039396230623838653234336339396164333933656463313064333262333863316266653732 +32623664353364656565316538373861336166643064646337646364333766653032373064646133 +34633663393935306134386536333363646233653033663139353838396135366131386366333539 +64303035356236326563613937616364346265616664646232663063326261333830356434306536 +63643533306465623866633230653430656236636538643038656139643265346265343835636130 +36393566623830313062303733393032353061326132353031373831386630626531626463643432 +36383232316639343033656132656663666134383962373561653539306466343435376436393666 +37353439633362373131373932656266343761616665646430373539373162616235376239333930 +34306138326263383666643065306564303062393031633563366662656336336163626166356330 +61663164353534303432353061386639633531303863306238336463666236313032306636616135 +37323864613030643937643634653738313861326538313334316231363236623936613337383465 +33353063623136336465643736356338343066306230303866633836653432386365366439396434 +31383533653838373266643436306237306266303261353964306134393033323366643937383138 +35633363383261316638623135393465363664363938323737643430313761303763383533303463 +61636536386165326432646266343764346130623735613439333633613662343735383737376261 
+30663537623664633464383235633630653463353432353536643437333137383538343233303039 +36663333623736363635613134616330623564306434373361366264356337623162623363343736 +32616337626564336661626432646165663233643166356133666263366339386435363664303937 +39346439386639643066633063373939363737323862316138373738343330363163613062393233 +39376135353765313133636238623035383235336639663932353861653165636631353463623838 +30646633636539646633623632303761643338363438663663356337616235623766323930346433 +66333736313239306266373130616236313366663537333135656230373234326531333135366236 +38376165336166613563613337353935613632313762356530353465313136663337396630303836 +39336662646634316339653130363332613636613536343366343639363464653532663763353961 +63643935333530626534656666303465303939313032613363363463313366653937343438356630 +66313664336631363834376439643061396136376437663333393833663034343165363034336632 +66646239376134346432343766323564383534613837393165643562363538343562346564303661 +36396433633737396533353138653831643562303538346235313037353362376538626635363332 +37343435316264616239643635613263366531323363336565633261616534643366333264646364 +3731 diff --git a/roles/prometheus-exporters/handlers/main.yml b/roles/prometheus-exporters/handlers/main.yml new file mode 100644 index 0000000..7ff6bd3 --- /dev/null +++ b/roles/prometheus-exporters/handlers/main.yml @@ -0,0 +1,16 @@ +- name: Restart prometheus-node-exporter + systemd: + name: prometheus-node-exporter + state: restarted + enabled: true + +- name: Restart prometheus-exporter-exporter + systemd: + name: prometheus-exporter-exporter + state: restarted + enabled: true + +- name: Restart nftables + systemd: + name: nftables.service + state: restarted diff --git a/roles/prometheus-exporters/tasks/main.yml b/roles/prometheus-exporters/tasks/main.yml new file mode 100644 index 0000000..4f3af55 --- /dev/null +++ b/roles/prometheus-exporters/tasks/main.yml @@ -0,0 +1,354 @@ +--- +# This does not respect hosts that still use ip6tables +# like monitor01 and monitor02 +- name: Prüfe, ob nftables enabled + shell: systemctl is-enabled nftables.service + register: nft_enabled + failed_when: false + +- name: Prüfe, ob nftables active + shell: systemctl is-active nftables.service + register: nft_active + failed_when: false + +- name: Schreibe Prometheus-IPs nach /etc/nftables.conf + when: nft_enabled.stdout == 'enabled' and nft_active.stdout == 'active' + ansible.builtin.blockinfile: + path: /etc/nftables.conf + marker: "# {mark} ANSIBLE MANAGED PROMETHEUS IPS" + insertafter: "^flush ruleset" + block: | + define prometheus_exporter_ips = { + {% for ip in prometheus_ips %} + {{ ip }}, + {% endfor %} + } + notify: Restart nftables + +- name: Füge Prometheus-Access-Block in chain input ein + when: nft_enabled.stdout == 'enabled' and nft_active.stdout == 'active' + ansible.builtin.blockinfile: + path: /etc/nftables.conf + marker: "# {mark} ANSIBLE MANAGED PROMETHEUS EXPORTER" + insertafter: '^\s*chain input \{' + block: | + ip6 saddr $prometheus_exporter_ips ct state new tcp dport { 9998 } accept; + notify: Restart nftables + + +- name: Set Prometheus Node Exporter options for containers Debian 12 and later + copy: + content: | + ARGS="--no-collector.cpufreq --no-collector.thermal_zone \ + --no-collector.hwmon --no-collector.diskstats \ + --no-collector.vmstat --no-collector.mdadm \ + --web.listen-address=[::1]:9100 \ + --web.listen-address=127.0.0.1:9100" + dest: /etc/default/prometheus-node-exporter + owner: root + group: root + mode: '0644' + notify: Restart 
+- name: Set Prometheus Node Exporter options for containers Debian 12 and later
+  copy:
+    content: |
+      ARGS="--no-collector.cpufreq --no-collector.thermal_zone \
+      --no-collector.hwmon --no-collector.diskstats \
+      --no-collector.vmstat --no-collector.mdadm \
+      --web.listen-address=[::1]:9100 \
+      --web.listen-address=127.0.0.1:9100"
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-node-exporter
+  when:
+    - ansible_facts['virtualization_type'] | default('') == "lxc"
+    - ansible_facts['virtualization_role'] | default('') == "guest"
+    - ansible_facts['distribution'] | default('') == "Debian"
+    - ansible_facts['distribution_version'] | default(0) | int >= 12
+
+- name: Set Prometheus Node Exporter options for containers Debian 11
+  copy:
+    content: |
+      ARGS="--no-collector.cpufreq --no-collector.thermal_zone \
+      --no-collector.hwmon --no-collector.diskstats \
+      --no-collector.vmstat --no-collector.mdadm \
+      --web.listen-address=127.0.0.1:9100"
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-node-exporter
+  when:
+    - ansible_facts['virtualization_type'] | default('') == "lxc"
+    - ansible_facts['virtualization_role'] | default('') == "guest"
+    - ansible_facts['distribution'] | default('') == "Debian"
+    - ansible_facts['distribution_version'] | default(0) | int < 12
+
+- name: Set Prometheus Node Exporter options for VMs Debian >= 12
+  copy:
+    content: |
+      ARGS="--no-collector.thermal_zone --no-collector.hwmon \
+      --no-collector.diskstats --no-collector.mdadm \
+      --web.listen-address=[::1]:9100 \
+      --web.listen-address=127.0.0.1:9100"
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-node-exporter
+  when:
+    - ansible_facts['virtualization_type'] | default('') == "kvm"
+    - ansible_facts['virtualization_role'] | default('') == "guest"
+    - ansible_facts['distribution'] | default('') == "Debian"
+    - ansible_facts['distribution_version'] | default(0) | int >= 12
+
+- name: Set Prometheus Node Exporter options for VMs Debian 11
+  copy:
+    content: |
+      ARGS="--no-collector.thermal_zone --no-collector.hwmon \
+      --no-collector.diskstats --no-collector.mdadm \
+      --web.listen-address=127.0.0.1:9100"
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-node-exporter
+  when:
+    - ansible_facts['virtualization_type'] | default('') == "kvm"
+    - ansible_facts['virtualization_role'] | default('') == "guest"
+    - ansible_facts['distribution'] | default('') == "Debian"
+    - ansible_facts['distribution_version'] | default(0) | int < 12
+
+- name: Set Prometheus Node Exporter options for hosts
+  copy:
+    content: |
+      ARGS="--web.listen-address=[::1]:9100 --web.listen-address=127.0.0.1:9100"
+    dest: /etc/default/prometheus-node-exporter
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-node-exporter
+  when:
+    - ansible_facts['virtualization_role'] | default('') == "host"
+    - ansible_facts['distribution'] | default('') == "Debian"
+    - ansible_facts['distribution_version'] | default('') != "11"
+
+- name: Installing prometheus exporter packages
+  apt:
+    name:
+      - openssl
+      - prometheus-exporter-exporter
+      - prometheus-node-exporter
+      - prometheus-node-exporter-collectors
+    state: present
+    install_recommends: false
+
+- name: Ensure /etc/prometheus/ssl exists
+  file:
+    path: /etc/prometheus/ssl
+    state: directory
+    owner: root
+    group: root
+    mode: '0755'
+
+- name: Copy ca_cert.pem into place
+  copy:
+    dest: /etc/prometheus/ssl/ca_cert.pem
+    content: "{{ ca_cert_pem }}"
+    owner: root
+    group: root
+    mode: '0644'
+
+- name: Check if exporter_key exists
+  stat:
+    path: "/etc/prometheus/ssl/exporter.key.pem"
+  register: exporter_key
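Note: every node-exporter variant above deliberately binds to loopback only;
the sole externally reachable endpoint is exporter-exporter on port 9998,
which the nftables block at the top of this role opens. A manual spot check
on a host (not part of the role):

    curl -s http://127.0.0.1:9100/metrics | head -n 5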
+- name: create key
+  command: >
+    openssl genrsa -out /etc/prometheus/ssl/exporter.key.pem 2048
+  args:
+    creates: /etc/prometheus/ssl/exporter.key.pem
+  notify: Restart prometheus-exporter-exporter
+  when: not exporter_key.stat.exists
+
+- name: Change file ownership, group and permissions
+  ansible.builtin.file:
+    path: /etc/prometheus/ssl/exporter.key.pem
+    owner: prometheus
+    group: prometheus
+    mode: '0600'
+
+- name: create csr config
+  when: not exporter_key.stat.exists
+  copy:
+    dest: /etc/prometheus/ssl/csr_config.cnf
+    content: |
+      [req]
+      distinguished_name = dn
+      req_extensions = req_ext
+      prompt = no
+
+      [dn]
+      CN = {{ inventory_hostname }}
+
+      [req_ext]
+      subjectAltName = @alt_names
+
+      [alt_names]
+      DNS.1 = {{ inventory_hostname }}
+
+- name: create csr config on control node
+  when: not exporter_key.stat.exists
+  delegate_to: localhost
+  copy:
+    dest: /tmp/{{ inventory_hostname }}.csr_config.cnf
+    content: |
+      [req]
+      distinguished_name = dn
+      req_extensions = req_ext
+      prompt = no
+
+      [dn]
+      CN = {{ inventory_hostname }}
+
+      [req_ext]
+      subjectAltName = @alt_names
+
+      [alt_names]
+      DNS.1 = {{ inventory_hostname }}
+
+- name: create csr
+  when: not exporter_key.stat.exists
+  command: >
+    openssl req -new -key /etc/prometheus/ssl/exporter.key.pem
+    -out /etc/prometheus/ssl/exporter.csr.pem
+    -config /etc/prometheus/ssl/csr_config.cnf
+
+- name: Fetch CSR from exporter
+  when: not exporter_key.stat.exists
+  fetch:
+    src: /etc/prometheus/ssl/exporter.csr.pem
+    dest: /tmp/{{ inventory_hostname }}.csr.pem
+    flat: true
+    mode: '0600'
+
+- name: Write CA private key to RAM-backed file
+  when: not exporter_key.stat.exists
+  delegate_to: localhost
+  copy:
+    dest: /dev/shm/ca_key.pem
+    content: "{{ ca_key_pem }}"
+    mode: '0600'
+  no_log: true
+
+- name: Write ca_cert.pem to RAM-backed file
+  when: not exporter_key.stat.exists
+  delegate_to: localhost
+  copy:
+    dest: /dev/shm/ca_cert.pem
+    content: "{{ ca_cert_pem }}"
+    mode: '0600'
+  no_log: false
+
+- name: sign cert
+  when: not exporter_key.stat.exists
+  delegate_to: localhost
+  shell: >
+    openssl x509 -req -in /tmp/{{ inventory_hostname }}.csr.pem
+    -CA /dev/shm/ca_cert.pem -CAkey /dev/shm/ca_key.pem
+    -CAcreateserial -out /tmp/{{ inventory_hostname }}.cert.pem -days 8250
+    -extensions req_ext -extfile /tmp/{{ inventory_hostname }}.csr_config.cnf
+
+- name: Delete RAM-backed temp files
+  delegate_to: localhost
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /dev/shm/ca_cert.pem
+    - /dev/shm/ca_key.pem
+    - /tmp/{{ inventory_hostname }}.csr_config.cnf
+    - /tmp/{{ inventory_hostname }}.csr.pem
+
+- name: Copy cert to exporter
+  when: not exporter_key.stat.exists
+  copy:
+    src: /tmp/{{ inventory_hostname }}.cert.pem
+    dest: /etc/prometheus/ssl/exporter.cert.pem
+    mode: '0644'
+  notify: Restart prometheus-exporter-exporter
+
+- name: Delete files in /tmp
+  delegate_to: localhost
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /tmp/{{ inventory_hostname }}.cert.pem
+
+- name: Delete temp files on the remote host
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /etc/prometheus/ssl/csr_config.cnf
+    - /etc/prometheus/ssl/exporter.csr.pem
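Note: the resulting chain can be checked by hand after a run; both commands
use only paths created above:

    openssl verify -CAfile /etc/prometheus/ssl/ca_cert.pem \
        /etc/prometheus/ssl/exporter.cert.pem
    openssl x509 -in /etc/prometheus/ssl/exporter.cert.pem -noout \
        -subject -ext subjectAltName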
+- name: Configure prometheus-exporter-exporter config
+  copy:
+    dest: /etc/prometheus/exporter-exporter.yml
+    content: |
+      modules:
+        prometheus:
+          method: http
+          http:
+            port: 9090
+        alertmanager:
+          method: http
+          http:
+            port: 9093
+        node:
+          method: http
+          http:
+            port: 9100
+        respondd:
+          method: http
+          http:
+            port: 9104
+        blackbox:
+          method: http
+          http:
+            port: 9115
+            path: '/probe'
+        bind:
+          method: http
+          http:
+            port: 9119
+        postfix:
+          method: http
+          http:
+            port: 9154
+        process:
+          method: http
+          http:
+            port: 9256
+        pve:
+          method: http
+          http:
+            port: 9221
+            path: /pve
+        bird:
+          method: http
+          http:
+            port: 9324
+        kea:
+          method: http
+          http:
+            port: 9547
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-exporter-exporter
+
+- name: Configure prometheus-exporter-exporter params
+  copy:
+    dest: /etc/default/prometheus-exporter-exporter
+    content: |
+      ARGS="-web.listen-address= -web.tls.listen-address=:9998 \
+      -web.tls.cert=/etc/prometheus/ssl/exporter.cert.pem \
+      -web.tls.key=/etc/prometheus/ssl/exporter.key.pem \
+      -web.tls.ca=/etc/prometheus/ssl/ca_cert.pem -web.tls.verify"
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart prometheus-exporter-exporter
--
GitLab
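Note: none of these patches shows the scrape side of prometheus.yml. A sketch
of a job that would consume the exporter-exporter endpoint from patch 03 and
the client certificate created by the role in the next patch — the job name
and target file path are illustrative assumptions; only port 9998, the
/proxy?module=... dispatch endpoint of exporter_exporter, and the TLS file
locations come from the roles:

    scrape_configs:
      - job_name: 'node'
        scheme: https
        metrics_path: /proxy
        params:
          module: [node]
        tls_config:
          ca_file: /etc/prometheus/ssl/ca_cert.pem
          cert_file: /etc/prometheus/ssl/client.cert.pem
          key_file: /etc/prometheus/ssl/client.key.pem
        file_sd_configs:
          - files:
              - /etc/prometheus/node_exporter_targets.yml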
From e1d6eeb3dae2928fe42e32795f900562e22376f7 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sat, 7 Jun 2025 21:02:10 +0200
Subject: [PATCH 04/13] Add prometheus client tls cert config role

---
 roles/prometheus/handlers/main.yml         |   5 +
 roles/prometheus/tasks/main.yml            | 137 ++++++++++++++
 .../templates/node_exporter_targets.yml.j2 |   4 +
 3 files changed, 146 insertions(+)
 create mode 100644 roles/prometheus/handlers/main.yml
 create mode 100644 roles/prometheus/tasks/main.yml
 create mode 100644 roles/prometheus/templates/node_exporter_targets.yml.j2

diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..8a8df0d
--- /dev/null
+++ b/roles/prometheus/handlers/main.yml
@@ -0,0 +1,5 @@
+- name: Restart prometheus
+  service:
+    name: prometheus
+    state: restarted
+
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..cf95926
--- /dev/null
+++ b/roles/prometheus/tasks/main.yml
@@ -0,0 +1,137 @@
+---
+- name: Check if /etc/prometheus/prometheus.yml exists
+  stat:
+    path: "/etc/prometheus/prometheus.yml"
+  register: is_prometheus
+
+- name: Fail if host is not prometheus host
+  fail:
+    msg: "This role must only be run on prometheus hosts"
+  when: not is_prometheus | default(false)
+
+  #- name: Create node_exporter_targets.yml file
+  #  template:
+  #    src: node_exporter_targets.yml.j2
+  #    dest: /etc/prometheus/node_exporter_targets.yml
+  #  notify:
+  #    - Restart prometheus
+
+- name: Check if client-cert exists
+  stat:
+    path: "/etc/prometheus/ssl/client.cert.pem"
+  register: client_cert_exists
+
+- name: Ensure /etc/prometheus/ssl exists
+  when: not client_cert_exists.stat.exists
+  file:
+    path: /etc/prometheus/ssl
+    state: directory
+    owner: root
+    group: root
+    mode: '0755'
+
+- name: Copy ca_cert.pem into place
+  copy:
+    dest: /etc/prometheus/ssl/ca_cert.pem
+    content: "{{ ca_cert_pem }}"
+    owner: root
+    group: root
+    mode: '0644'
+
+- name: create key
+  when: not client_cert_exists.stat.exists
+  command: >
+    openssl genrsa -out /etc/prometheus/ssl/client.key.pem 2048
+  args:
+    creates: /etc/prometheus/ssl/client.key.pem
+
+- name: Change file ownership, group and permissions
+  ansible.builtin.file:
+    path: /etc/prometheus/ssl/client.key.pem
+    owner: prometheus
+    group: prometheus
+    mode: '0600'
+
+- name: create csr
+  when: not client_cert_exists.stat.exists
+  command: >
+    openssl req -new -key /etc/prometheus/ssl/client.key.pem -out /tmp/client.csr.pem
+    -subj "/CN={{ inventory_hostname }}"
+  args:
+    creates: /tmp/client.csr.pem
+
+- name: Fetch CSR from prometheus host
+  when: not client_cert_exists.stat.exists
+  fetch:
+    src: /tmp/client.csr.pem
+    dest: /tmp/{{ inventory_hostname }}.csr.pem
+    flat: true
+    mode: '0600'
+
+- name: Delete csr.pem
+  file:
+    path: /tmp/client.csr.pem
+    state: absent
+
+- name: create csr conf
+  when: not client_cert_exists.stat.exists
+  delegate_to: localhost
+  copy:
+    content: |
+      extendedKeyUsage = clientAuth
+      subjectAltName = @alt_names
+      [alt_names]
+      DNS.1 = {{ inventory_hostname }}
+    dest: /tmp/{{ inventory_hostname }}_ext.cnf
+
+- name: Write CA private key to RAM-backed file
+  when: not client_cert_exists.stat.exists
+  delegate_to: localhost
+  copy:
+    dest: /dev/shm/ca_key.pem
+    content: "{{ ca_key_pem }}"
+    mode: '0600'
+  no_log: true
+
+- name: Write ca_cert.pem to RAM-backed file
+  when: not client_cert_exists.stat.exists
+  delegate_to: localhost
+  copy:
+    dest: /dev/shm/ca_cert.pem
+    content: "{{ ca_cert_pem }}"
+    mode: '0600'
+  no_log: false
+
+- name: create client cert
+  when: not client_cert_exists.stat.exists
+  delegate_to: localhost
+  command: >
+    openssl x509 -req -in /tmp/{{ inventory_hostname }}.csr.pem -CA /dev/shm/ca_cert.pem -CAkey /dev/shm/ca_key.pem
+    -CAcreateserial -out /tmp/{{ inventory_hostname }}.cert.pem -days 3650 -sha256
+    -extfile /tmp/{{ inventory_hostname }}_ext.cnf
+  args:
+    creates: /tmp/{{ inventory_hostname }}.cert.pem
+
+- name: Delete CA material and ext-cnf
+  delegate_to: localhost
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /dev/shm/ca_cert.pem
+    - /dev/shm/ca_key.pem
+    - /tmp/{{ inventory_hostname }}_ext.cnf
+    - /tmp/{{ inventory_hostname }}.csr.pem
+
+- name: Copy cert to prometheus host
+  when: not client_cert_exists.stat.exists
+  copy:
+    src: /tmp/{{ inventory_hostname }}.cert.pem
+    dest: /etc/prometheus/ssl/client.cert.pem
+    mode: '0644'
+
+- name: Delete client cert on control node
+  delegate_to: localhost
+  file:
+    path: /tmp/{{ inventory_hostname }}.cert.pem
+    state: absent
diff --git a/roles/prometheus/templates/node_exporter_targets.yml.j2 b/roles/prometheus/templates/node_exporter_targets.yml.j2
new file mode 100644
index 0000000..45f6755
--- /dev/null
+++ b/roles/prometheus/templates/node_exporter_targets.yml.j2
@@ -0,0 +1,4 @@
+- targets:
+  {% for container in groups['ffspveguests'] %}
+  - '{{ hostvars[container].ansible_host | default(container) }}.vm.freifunk-stuttgart.de:9998'
+  {% endfor %}
--
GitLab

From 362a31c7c89ead1d443ce3bbbbe7f026cadf1da5 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 15 Jun 2025 17:15:00 +0200
Subject: [PATCH 05/13] Exclude hosts from scrape rules if
 prometheus_exporters_ignore is set

---
 host_vars/bbnrb1/prometheus-exporters.yml  |  2 ++
 .../prometheus-exporters.yml               |  2 ++
 .../prometheus-exporters.yml               |  2 ++
 .../prometheus-exporters.yml               |  2 ++
 .../prometheus-exporters.yml               |  2 ++
 inventory/role_core.yml                    |  5 +++
 roles/prometheus/tasks/main.yml            | 36 +++++++++++++++----
 .../templates/bird_exporter_targets.yml.j2 |  6 ++++
 .../templates/kea_exporter_targets.yml.j2  |  6 ++++
 .../templates/node_exporter_targets.yml.j2 | 10 ++++--
 10 files changed, 64 insertions(+), 9 deletions(-)
 create mode 100644 host_vars/bbnrb1/prometheus-exporters.yml
 create mode 100644 host_vars/mailgw01.vm.freifunk-stuttgart.de/prometheus-exporters.yml
 create mode 100644 host_vars/mailgw02.vm.freifunk-stuttgart.de/prometheus-exporters.yml
 create mode 100644 host_vars/nrb-backbonetest2.vm.freifunk-stuttgart.de/prometheus-exporters.yml
 create mode 100644 host_vars/openslides.vm.freifunk-stuttgart.de/prometheus-exporters.yml
 create mode 100644 inventory/role_core.yml
 create mode 100644 roles/prometheus/templates/bird_exporter_targets.yml.j2
 create mode 100644
roles/prometheus/templates/kea_exporter_targets.yml.j2 diff --git a/host_vars/bbnrb1/prometheus-exporters.yml b/host_vars/bbnrb1/prometheus-exporters.yml new file mode 100644 index 0000000..f56ca1f --- /dev/null +++ b/host_vars/bbnrb1/prometheus-exporters.yml @@ -0,0 +1,2 @@ +--- +prometheus_exporters_ignore: true diff --git a/host_vars/mailgw01.vm.freifunk-stuttgart.de/prometheus-exporters.yml b/host_vars/mailgw01.vm.freifunk-stuttgart.de/prometheus-exporters.yml new file mode 100644 index 0000000..f56ca1f --- /dev/null +++ b/host_vars/mailgw01.vm.freifunk-stuttgart.de/prometheus-exporters.yml @@ -0,0 +1,2 @@ +--- +prometheus_exporters_ignore: true diff --git a/host_vars/mailgw02.vm.freifunk-stuttgart.de/prometheus-exporters.yml b/host_vars/mailgw02.vm.freifunk-stuttgart.de/prometheus-exporters.yml new file mode 100644 index 0000000..f56ca1f --- /dev/null +++ b/host_vars/mailgw02.vm.freifunk-stuttgart.de/prometheus-exporters.yml @@ -0,0 +1,2 @@ +--- +prometheus_exporters_ignore: true diff --git a/host_vars/nrb-backbonetest2.vm.freifunk-stuttgart.de/prometheus-exporters.yml b/host_vars/nrb-backbonetest2.vm.freifunk-stuttgart.de/prometheus-exporters.yml new file mode 100644 index 0000000..f56ca1f --- /dev/null +++ b/host_vars/nrb-backbonetest2.vm.freifunk-stuttgart.de/prometheus-exporters.yml @@ -0,0 +1,2 @@ +--- +prometheus_exporters_ignore: true diff --git a/host_vars/openslides.vm.freifunk-stuttgart.de/prometheus-exporters.yml b/host_vars/openslides.vm.freifunk-stuttgart.de/prometheus-exporters.yml new file mode 100644 index 0000000..f56ca1f --- /dev/null +++ b/host_vars/openslides.vm.freifunk-stuttgart.de/prometheus-exporters.yml @@ -0,0 +1,2 @@ +--- +prometheus_exporters_ignore: true diff --git a/inventory/role_core.yml b/inventory/role_core.yml new file mode 100644 index 0000000..e895327 --- /dev/null +++ b/inventory/role_core.yml @@ -0,0 +1,5 @@ +--- +role_core: + hosts: + core01-z10a.vm.freifunk-stuttgart.de: + core02-z10a.vm.freifunk-stuttgart.de: diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index cf95926..1be87d7 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -9,12 +9,26 @@ msg: "This role must only be run on prometheus hosts" when: not is_prometheus | default(false) - #- name: Create node_exporter_targets.yml file - # template: - # src: node_exporter_targets.yml.j2 - # dest: /etc/prometheus/node_exporter_targets.yml - # notify: - # - Restart prometheus +- name: Create node_exporter_targets.yml file + template: + src: node_exporter_targets.yml.j2 + dest: /etc/prometheus/node_exporter_targets.yml + notify: + - Restart prometheus + +- name: Create kea_exporter_targets.yml file + template: + src: kea_exporter_targets.yml.j2 + dest: /etc/prometheus/kea_exporter_targets.yml + notify: + - Restart prometheus + +- name: Create bird_exporter_targets.yml file + template: + src: bird_exporter_targets.yml.j2 + dest: /etc/prometheus/bird_exporter_targets.yml + notify: + - Restart prometheus - name: Check if client-cert exists stat: @@ -135,3 +149,13 @@ file: path: /tmp/{{ inventory_hostname }}.cert.pem state: absent + +- name: Installing prometheus exporter packages + apt: + name: + - prometheus + - prometheus-alertmanager + - prometheus-blackbox-exporter + - yamllint + state: present + install_recommends: false diff --git a/roles/prometheus/templates/bird_exporter_targets.yml.j2 b/roles/prometheus/templates/bird_exporter_targets.yml.j2 new file mode 100644 index 0000000..4db77e9 --- /dev/null +++ 
b/roles/prometheus/templates/bird_exporter_targets.yml.j2 @@ -0,0 +1,6 @@ +{% for container in groups['role_core'] %} +- targets: + - '{{ hostvars[container].ansible_host | default(container) }}:9998' + labels: + instance: {{ (hostvars[container].ansible_host | default(container)).split('.')[0] }} +{% endfor %} diff --git a/roles/prometheus/templates/kea_exporter_targets.yml.j2 b/roles/prometheus/templates/kea_exporter_targets.yml.j2 new file mode 100644 index 0000000..471c336 --- /dev/null +++ b/roles/prometheus/templates/kea_exporter_targets.yml.j2 @@ -0,0 +1,6 @@ +{% for container in groups['role_dhcpserver'] %} +- targets: + - '{{ hostvars[container].ansible_host | default(container) }}:9998' + labels: + instance: {{ (hostvars[container].ansible_host | default(container)).split('.')[0] }} +{% endfor %} diff --git a/roles/prometheus/templates/node_exporter_targets.yml.j2 b/roles/prometheus/templates/node_exporter_targets.yml.j2 index 45f6755..b2f629b 100644 --- a/roles/prometheus/templates/node_exporter_targets.yml.j2 +++ b/roles/prometheus/templates/node_exporter_targets.yml.j2 @@ -1,4 +1,8 @@ +{% for host in groups['all'] %} +{% if not hostvars[host].prometheus_exporters_ignore | default(false) %} - targets: - {% for container in groups['ffspveguests'] %} - - '{{ hostvars[container].ansible_host | default(container) }}.vm.freifunk-stuttgart.de:9998' - {% endfor %} + - '{{ hostvars[host].ansible_host | default(host) }}:9998' + labels: + instance: {{ (hostvars[host].ansible_host | default(host)).split('.')[0] }} +{% endif %} +{% endfor %} -- GitLab From f72f5f51bda8951fc1c6d8f31febae5059e438eb Mon Sep 17 00:00:00 2001 From: Leonard Penzer <leonard@penzer.de> Date: Sun, 15 Jun 2025 21:44:26 +0200 Subject: [PATCH 06/13] Add prometheus IPs to group_vars --- group_vars/all/promeheus_hosts | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 group_vars/all/promeheus_hosts diff --git a/group_vars/all/promeheus_hosts b/group_vars/all/promeheus_hosts new file mode 100644 index 0000000..315c1e7 --- /dev/null +++ b/group_vars/all/promeheus_hosts @@ -0,0 +1,4 @@ +--- +prometheus_ips: + - 2a0f:d607:e:2::137 + - 2001:7c7:2110::21 -- GitLab From 0d60888ebf9f541b52ea12682ecc4c6486c0a8b4 Mon Sep 17 00:00:00 2001 From: Leonard Penzer <leonard@penzer.de> Date: Sun, 15 Jun 2025 21:45:52 +0200 Subject: [PATCH 07/13] make prometheus01 a revproxy --- host_vars/prometheus01.vm.freifunk-stuttgart.de/revproxy | 7 +++++++ host_vars/prometheus01.vm.freifunk-stuttgart.de/ssh | 2 ++ inventory/role_revproxy | 1 + 3 files changed, 10 insertions(+) create mode 100644 host_vars/prometheus01.vm.freifunk-stuttgart.de/revproxy create mode 100644 host_vars/prometheus01.vm.freifunk-stuttgart.de/ssh diff --git a/host_vars/prometheus01.vm.freifunk-stuttgart.de/revproxy b/host_vars/prometheus01.vm.freifunk-stuttgart.de/revproxy new file mode 100644 index 0000000..3dfde3e --- /dev/null +++ b/host_vars/prometheus01.vm.freifunk-stuttgart.de/revproxy @@ -0,0 +1,7 @@ +--- +letsencrypt_account_email: "hostmaster@freifunk-stuttgart.de" +letsencrypt_acme_directory: "https://acme-v02.api.letsencrypt.org/directory" +vhosts: + - domain: prometheus01.vm.freifunk-stuttgart.de + backend: "http://127.0.0.1:9090/" + type: proxy diff --git a/host_vars/prometheus01.vm.freifunk-stuttgart.de/ssh b/host_vars/prometheus01.vm.freifunk-stuttgart.de/ssh new file mode 100644 index 0000000..a74af17 --- /dev/null +++ b/host_vars/prometheus01.vm.freifunk-stuttgart.de/ssh @@ -0,0 +1,2 @@ +--- +ssh_disable_password_login: yes diff --git 
a/inventory/role_revproxy b/inventory/role_revproxy
index d7b6d87..2a9d66b 100644
--- a/inventory/role_revproxy
+++ b/inventory/role_revproxy
@@ -4,3 +4,4 @@ role_revproxy:
     revproxy-05.vm.freifunk-stuttgart.de:
     revproxy-03.vm.freifunk-stuttgart.de:
     revproxy-z10a.vm.freifunk-stuttgart.de:
+    prometheus01.vm.freifunk-stuttgart.de:
--
GitLab

From 990a099e95dc93d90dab9a0d24ea6225b5ddba85 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:25:20 +0200
Subject: [PATCH 08/13] Add inventory/role_prometheus

---
 inventory/role_prometheus | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 inventory/role_prometheus

diff --git a/inventory/role_prometheus b/inventory/role_prometheus
new file mode 100644
index 0000000..e1da387
--- /dev/null
+++ b/inventory/role_prometheus
@@ -0,0 +1,5 @@
+---
+role_prometheus:
+  hosts:
+    prometheus01.vm.freifunk-stuttgart.de:
+    prometheus02.vm.freifunk-stuttgart.de:
--
GitLab
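Note: the routing tree in the alertmanager.yml added in the next patch can be
exercised offline with amtool (shipped with prometheus-alertmanager); a
hedged example, assuming the file has been deployed to the usual Debian
config path:

    amtool config routes test --config.file=/etc/prometheus/alertmanager.yml \
        severity=warning
    amtool config routes test --config.file=/etc/prometheus/alertmanager.yml \
        severity=info alertname=SelfMonitoringAlwaysFiring

The first invocation should resolve to the warning/critical receivers
(leonard_pushover, leonard_selfhosted, nrb), the second to
leonard_healthchecks.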
From 87852e3b1d4684ce86be4f3d8d498e55fe266af2 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:28:06 +0200
Subject: [PATCH 09/13] Add prometheus-alertmanager

---
 roles/prometheus/files/alertmanager.yml       | 113 ++++++
 .../files/alerts/alert_healthchecks.yml       |   9 +
 .../files/alerts/alert_loadbalancing.yml      |  11 +
 .../files/alerts/blackbox-exporter.yml        |  68 ++++
 roles/prometheus/files/alerts/general.yml     |  39 ++
 .../prometheus/files/alerts/node-exporter.yml | 366 ++++++++++++++++++
 .../files/alerts/smartctl-exporter.yml        |  50 +++
 .../prometheus/templates/alertmanager.yml.j2  | 113 ++++++
 8 files changed, 769 insertions(+)
 create mode 100644 roles/prometheus/files/alertmanager.yml
 create mode 100644 roles/prometheus/files/alerts/alert_healthchecks.yml
 create mode 100644 roles/prometheus/files/alerts/alert_loadbalancing.yml
 create mode 100644 roles/prometheus/files/alerts/blackbox-exporter.yml
 create mode 100644 roles/prometheus/files/alerts/general.yml
 create mode 100644 roles/prometheus/files/alerts/node-exporter.yml
 create mode 100644 roles/prometheus/files/alerts/smartctl-exporter.yml
 create mode 100644 roles/prometheus/templates/alertmanager.yml.j2

diff --git a/roles/prometheus/files/alertmanager.yml b/roles/prometheus/files/alertmanager.yml
new file mode 100644
index 0000000..0cba29e
--- /dev/null
+++ b/roles/prometheus/files/alertmanager.yml
@@ -0,0 +1,113 @@
+# Sample configuration.
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@freifunk-stuttgart.de'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service', 'severity']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This way ensures that you get multiple alerts for the same group that start
+  # firing shortly after another are batched together on the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to resend it.
+  repeat_interval: 24h
+
+  # A default receiver
+  receiver: 'null'
+
+  routes:
+### leonard monitoring ###
+    - receiver: 'leonard_healthchecks'
+      repeat_interval: 5m
+      continue: false
+      #group_wait: 1s
+      #group_interval: 1m
+      matchers:
+        - alertname = SelfMonitoringAlwaysFiring
+        - severity = info
+### leonard ###
+    - receiver: 'leonard_pushover'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'leonard_selfhosted'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'leonard_selfhosted'
+      repeat_interval: 24h
+      continue: true
+      matchers:
+        - severity = info
+### nrb ###
+    - receiver: 'nrb'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'nrb'
+      repeat_interval: 24h
+      continue: true
+      matchers:
+        - severity =~ "info"
+
+
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+#- name: 'ffs-gw-admins'
+#  email_configs:
+#    - to: 'gw-admins@freifunk-stuttgart.de'
+#  webhook_configs:
+#    - url: 'http://localhost:9199/alert'
+- name: 'leonard_healthchecks'
+  email_configs:
+    - to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
+- name: 'leonard_selfhosted'
+  email_configs:
+    - to: 'leonard@selfhosted.de'
+      send_resolved: true
+- name: 'null'
+  email_configs: []  # no mail sent
+- name: leonard_pushover
+  pushover_configs:
+    - token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
+      user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
+      priority: 0
+      send_resolved: true
+- name: 'nrb'
+  email_configs:
+    - to: 'ffs-alerts@nicoboehr.de'
+      send_resolved: true
diff --git a/roles/prometheus/files/alerts/alert_healthchecks.yml b/roles/prometheus/files/alerts/alert_healthchecks.yml
new file mode 100644
index 0000000..7741c9a
--- /dev/null
+++ b/roles/prometheus/files/alerts/alert_healthchecks.yml
@@ -0,0 +1,9 @@
+groups:
+- name: Selfmonitoring
+  rules:
+    - alert: 'SelfMonitoringAlwaysFiring'
+      expr: minute() >= 0
+      for: 1s
+      labels:
+        severity: info
+        application: leonard_healthchecks
diff --git a/roles/prometheus/files/alerts/alert_loadbalancing.yml b/roles/prometheus/files/alerts/alert_loadbalancing.yml
new file mode 100644
index 0000000..1fa7242
--- /dev/null
+++ b/roles/prometheus/files/alerts/alert_loadbalancing.yml
@@ -0,0 +1,11 @@
+groups:
+- name: lowpref
+  rules:
+    - alert: LowGatewayPreference
+      expr: gw_loadbalancing_pref{segment="1"} < 10
+      for: 1d
+      labels:
+        severity: page
+      annotations:
+        summary: |
+          {{ .Labels.gateway }} has low gateway preference ({{ .Value }})
diff --git a/roles/prometheus/files/alerts/blackbox-exporter.yml b/roles/prometheus/files/alerts/blackbox-exporter.yml
new file mode 100644
index 0000000..b83797a
--- /dev/null
+++ b/roles/prometheus/files/alerts/blackbox-exporter.yml
@@ -0,0 +1,68 @@
+groups:
+
+- name: BlackboxExporter
+
+  rules:
+
+    - alert: BlackboxProbeFailed
+      expr: 'probe_success{job!~"node_pve01|blackbox_tls_pve01"} == 0'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Blackbox probe failed (instance {{ $labels.instance }})
+        description: "Probe failed\n VALUE = {{ $value
}}\n LABELS = {{ $labels }}" + + - alert: BlackboxConfigurationReloadFailure + expr: 'blackbox_exporter_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox configuration reload failure (instance {{ $labels.instance }}) + description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateExpired + expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) + description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowHttp + expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) + description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowPing + expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ $labels.instance }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/roles/prometheus/files/alerts/general.yml b/roles/prometheus/files/alerts/general.yml new file mode 100644 index 0000000..267589c --- /dev/null +++ b/roles/prometheus/files/alerts/general.yml @@ -0,0 +1,39 @@ +groups: +- name: up_success + rules: + - alert: UP_FAILED + expr: up{ignore_down!="1"} < 1 + for: 15m + labels: + severity: warning + application: prometheus + annotations: + summary: "Scrapes not functional" +- name: reload_success + rules: + - alert: PROMETHEUS_RELOAD_FAILED + expr: prometheus_config_last_reload_successful < 1 + for: 1m + labels: + severity: warning + application: prometheus + annotations: + summary: "Reload of prometheus config failed" + - alert: ALERTMANAGER_RELOAD_FAILED + expr: alertmanager_config_last_reload_successful < 1 + for: 1m + labels: + severity: warning + application: prometheus + annotations: + summary: "Reload of alertmanager config failed" +- name: probe_success + rules: + - alert: PROBE_FAILED_TCP + expr: probe_success{} < 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Blackbox probe failed" + diff --git a/roles/prometheus/files/alerts/node-exporter.yml b/roles/prometheus/files/alerts/node-exporter.yml new file mode 100644 index 0000000..fe3e8d7 --- /dev/null +++ b/roles/prometheus/files/alerts/node-exporter.yml @@ -0,0 +1,366 @@ +groups: + +- name: 
+
+  rules:
+
+    - alert: HostOutOfMemory
+      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of memory (instance {{ $labels.instance }})
+        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostMemoryUnderMemoryPressure
+      expr: '(rate(node_vmstat_pgmajfault[1m]) > 2000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 30m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host memory under memory pressure (instance {{ $labels.instance }})
+        description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Many containers
+    # - alert: HostMemoryIsUnderutilized
+    #   expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    #   for: 1w
+    #   labels:
+    #     severity: info
+    #   annotations:
+    #     summary: Host Memory is underutilized (instance {{ $labels.instance }})
+    #     description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # - alert: HostUnusualNetworkThroughputIn
+    #   expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    #   for: 5m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Host unusual network throughput in (instance {{ $labels.instance }})
+    #     description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    #
+    # - alert: HostUnusualNetworkThroughputOut
+    #   expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    #   for: 5m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Host unusual network throughput out (instance {{ $labels.instance }})
+    #     description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# All Hosts during Backup
+# - alert: HostUnusualDiskReadRate
+#   expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+#   for: 5m
+#   labels:
+#     severity: warning
+#   annotations:
+#     summary: Host unusual disk read rate (instance {{ $labels.instance }})
+#     description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# pve01 and backup01, maybe also pbs at some point?
+# - alert: HostUnusualDiskWriteRate
+#   expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+#   for: 2m
+#   labels:
+#     severity: warning
+#   annotations:
+#     summary: Host unusual disk write rate (instance {{ $labels.instance }})
+#     description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOutOfDiskSpace
+      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of disk space (instance {{ $labels.instance }})
+        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostDiskWillFillIn24Hours
+      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+        description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOutOfInodes
+      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of inodes (instance {{ $labels.instance }})
+        description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostFilesystemDeviceError
+      expr: 'node_filesystem_device_error == 1'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Host filesystem device error (instance {{ $labels.instance }})
+        description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostInodesWillFillIn24Hours
+      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+        description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskReadLatency
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host unusual disk read latency (instance {{ $labels.instance }})
+        description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskWriteLatency
+      expr: '(rate(node_disk_write_time_seconds_total{nodename!="gw05n02"}[1m]) / rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{nodename!="gw05n02"}[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host unusual disk write latency (instance {{ $labels.instance }})
+        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# Can occur on all containers during backup
+# - alert: HostHighCpuLoad
+#   expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+#   for: 10m
+#   labels:
+#     severity: warning
+#   annotations:
+#     summary: Host high CPU load (instance {{ $labels.instance }})
+#     description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# nas/nextcloud vm
+# - alert: HostCpuIsUnderutilized
+#   expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+#   for: 1w
+#   labels:
+#     severity: info
+#   annotations:
+#     summary: Host CPU is underutilized (instance {{ $labels.instance }})
+#     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostCpuStealNoisyNeighbor
+      expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+        description: "CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# pve01 and backup01
+# - alert: HostCpuHighIowait
+#   expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+#   for: 0m
+#   labels:
+#     severity: warning
+#   annotations:
+#     summary: Host CPU high iowait (instance {{ $labels.instance }})
+#     description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskIo
+      expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename!="gw05n02"}'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host unusual disk IO (instance {{ $labels.instance }})
+        description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# All hosts during backup
+# - alert: HostContextSwitchingHigh
+#   expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
+#/
+#(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+#'
+#   for: 0m
+#   labels:
+#     severity: warning
+#   annotations:
+#     summary: Host context switching high (instance {{ $labels.instance }})
+#     description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSwapIsFillingUp
+      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host swap is filling up (instance {{ $labels.instance }})
+        description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSystemdServiceCrashed
+      expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host systemd service crashed (instance {{ $labels.instance }})
+        description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: CpuTooHot
+      expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip=~"pci0000:00_0000:00:18_3"} > 98)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host CPU too hot (instance {{ $labels.instance }})
+        description: "CPU temperature too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostPhysicalComponentTooHot
+      expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl",chip!="pci0000:00_0000:00:18_3"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host physical component too hot (instance {{ $labels.instance }})
+        description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNodeOvertemperatureAlarm
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostRaidArrayGotInactive
+      expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Host RAID array got inactive (instance {{ $labels.instance }})
+        description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostRaidDiskFailure
+      expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host RAID disk failure (instance {{ $labels.instance }})
+        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostKernelVersionDeviations
+      expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 6h
+      labels:
+        severity: warning
+      annotations:
+        summary: Host kernel version deviations (instance {{ $labels.instance }})
+        description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOomKillDetected
+      expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host OOM kill detected (instance {{ $labels.instance }})
+        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostEdacCorrectableErrorsDetected
+      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostEdacUncorrectableErrorsDetected
+      expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkReceiveErrors
+      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host Network Receive Errors (instance {{ $labels.instance }})
+        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkTransmitErrors
+      expr: '(rate(node_network_transmit_errs_total{device!~"^g09n03abbtesta|^g09n03amobrtra|^g09n03bbbtestb"}[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkInterfaceSaturated
+      expr: '((rate(node_network_receive_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"}[1m])) / node_network_speed_bytes{device!~"^bb.*|^tap.*|^vnet.*|^veth.*|^tun.*|^vp.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host Network Interface Saturated (instance {{ $labels.instance }})
+        description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkBondDegraded
+      expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+        description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostConntrackLimit
+      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host conntrack limit (instance {{ $labels.instance }})
+        description: "The number of conntrack entries is approaching the limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostClockSkew
+      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host clock skew (instance {{ $labels.instance }})
+        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostClockNotSynchronising
+      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host clock not synchronising (instance {{ $labels.instance }})
+        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostRequiresReboot
+      expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      for: 4h
+      labels:
+        severity: info
+      annotations:
+        summary: Host requires reboot (instance {{ $labels.instance }})
+        description: "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/roles/prometheus/files/alerts/smartctl-exporter.yml b/roles/prometheus/files/alerts/smartctl-exporter.yml
new file mode 100644
index 0000000..1946c38
--- /dev/null
+++ b/roles/prometheus/files/alerts/smartctl-exporter.yml
@@ -0,0 +1,50 @@
+groups:
+
+- name: SmartctlExporter
+
+  rules:
+
+    - alert: SmartDeviceTemperatureWarning
+      expr: 'smartctl_device_temperature > 60'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Smart device temperature warning (instance {{ $labels.instance }})
+        description: "Device temperature warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartDeviceTemperatureCritical
+      expr: 'smartctl_device_temperature > 80'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart device temperature critical (instance {{ $labels.instance }})
+        description: "Device temperature critical (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartCriticalWarning
+      expr: 'smartctl_device_critical_warning > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart critical warning (instance {{ $labels.instance }})
+        description: "Device has a critical warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartMediaErrors
+      expr: 'smartctl_device_media_errors > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart media errors (instance {{ $labels.instance }})
+        description: "Device has media errors (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartNvmeWearoutIndicator
+      expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
+        description: "NVMe device is wearing out (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/roles/prometheus/templates/alertmanager.yml.j2 b/roles/prometheus/templates/alertmanager.yml.j2
new file mode 100644
index 0000000..0cba29e
--- /dev/null
+++ b/roles/prometheus/templates/alertmanager.yml.j2
@@ -0,0 +1,113 @@
+# Sample configuration.
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'localhost:25'
+  smtp_from: 'alertmanager@freifunk-stuttgart.de'
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service', 'severity']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to resend it.
+  repeat_interval: 24h
+
+  # A default receiver
+  receiver: 'null'
+
+  routes:
+### leonard monitoring ###
+    - receiver: 'leonard_healthchecks'
+      repeat_interval: 5m
+      continue: false
+      #group_wait: 1s
+      #group_interval: 1m
+      matchers:
+        - alertname = SelfMonitoringAlwaysFiring
+        - severity = info
+### leonard ###
+    - receiver: 'leonard_pushover'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'leonard_selfhosted'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'leonard_selfhosted'
+      repeat_interval: 24h
+      continue: true
+      matchers:
+        - severity = info
+### nrb ###
+    - receiver: 'nrb'
+      repeat_interval: 4h
+      continue: true
+      matchers:
+        - severity =~ "warning|critical"
+    - receiver: 'nrb'
+      repeat_interval: 24h
+      continue: true
+      matchers:
+        - severity =~ "info"
+
+
+# Inhibition rules allow muting a set of alerts while another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition if the alertname is the same.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+#- name: 'ffs-gw-admins'
+#  email_configs:
+#    - to: 'gw-admins@freifunk-stuttgart.de'
+#  webhook_configs:
+#    - url: 'http://localhost:9199/alert'
+- name: 'leonard_healthchecks'
+  email_configs:
+    - to: 'f133a6c2-eea4-4723-ae0e-45859fa34471@healthchecks.selfhosted.de'
+- name: 'leonard_selfhosted'
+  email_configs:
+    - to: 'leonard@selfhosted.de'
+      send_resolved: true
+- name: 'null'
+  email_configs: [] # no mail is sent
+- name: leonard_pushover
+  pushover_configs:
+    - token: aRd3o4cy1sEoPqXaoDnzHZsMgLLdWW
+      user_key: ueyxtapXg7Mw84vjsgQKLGZQkheNHd
+      priority: 0
+      send_resolved: true
+- name: 'nrb'
+  email_configs:
+    - to: 'ffs-alerts@nicoboehr.de'
+      send_resolved: true
--
GitLab


From f4074fe8bc48d4bb016f7d6e236af3f7bc9e620b Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:50:14 +0200
Subject: [PATCH 10/13] Copy alertmanager.yml and alerting rules

---
 roles/prometheus/tasks/main.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 1be87d7..677d739 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -30,6 +30,22 @@
   notify:
     - Restart prometheus
 
+- name: Copy all alerting rules
+  copy:
+    src: alerts/
+    dest: /etc/prometheus/alerts
+    mode: preserve
+    owner: root
+    group: root
+
+- name: Copy alertmanager.yml
+  copy:
+    src: alertmanager.yml
+    dest: /etc/prometheus/
+    mode: preserve
+    owner: root
+    group: root
+
 - name: Check if client-cert exists
   stat:
     path: "/etc/prometheus/ssl/client.cert.pem"
--
GitLab


From a881f696c02f7ba286d671b8d0f305456cf87689 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:38:49 +0200
Subject: [PATCH 11/13] Add prometheus reload handler and use it where appropriate

---
 roles/prometheus/handlers/main.yml | 5 ++++-
 roles/prometheus/tasks/main.yml | 10 +++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
index 8a8df0d..a05d19c 100644
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@@ -2,4 +2,7 @@
   service:
     name: prometheus
     state: restarted
-
+- name: Reload prometheus
+  service:
+    name: prometheus
+    state: reloaded
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 677d739..75f0d17 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -14,21 +14,21 @@
     src: node_exporter_targets.yml.j2
     dest: /etc/prometheus/node_exporter_targets.yml
   notify:
-    - Restart prometheus
+    - Reload prometheus
 
 - name: Create kea_exporter_targets.yml file
   template:
     src: kea_exporter_targets.yml.j2
     dest: /etc/prometheus/kea_exporter_targets.yml
   notify:
-    - Restart prometheus
+    - Reload prometheus
 
 - name: Create bird_exporter_targets.yml file
   template:
     src: bird_exporter_targets.yml.j2
     dest: /etc/prometheus/bird_exporter_targets.yml
   notify:
-    - Restart prometheus
+    - Reload prometheus
 
 - name: Copy all alerting rules
   copy:
     src: alerts/
     dest: /etc/prometheus/alerts
     mode: preserve
     owner: root
     group: root
+  notify:
+    - Reload prometheus
 
 - name: Copy alertmanager.yml
   copy:
     src: alertmanager.yml
     dest: /etc/prometheus/
     mode: preserve
     owner: root
     group: root
@@ -175,3 +177,5 @@
     - yamllint
   state: present
   install_recommends: false
+  notify:
+    - Reload prometheus
--
GitLab


From d382bb60dee6befd7ec3115ef71e4cace2a77056 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:30:17 +0200
Subject: [PATCH 12/13] Install json-exporter on prometheus-hosts

---
 roles/prometheus/files/json-exporter.service | 16 +++++++
 roles/prometheus/files/json-exporter.yml | 9 ++++
 roles/prometheus/tasks/main.yml | 50 ++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 roles/prometheus/files/json-exporter.service
 create mode 100644 roles/prometheus/files/json-exporter.yml

diff --git a/roles/prometheus/files/json-exporter.service b/roles/prometheus/files/json-exporter.service
new file mode 100644
index 0000000..74716db
--- /dev/null
+++ b/roles/prometheus/files/json-exporter.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=Prometheus Json Exporter
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Restart=always
+User=prometheus
+Group=prometheus
+StandardError=syslog
+Restart=on-failure
+KillSignal=SIGQUIT
+ExecStart=/opt/json-exporter/json_exporter --web.listen-address=localhost:7979 --config.file /etc/prometheus/json-exporter.yml
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/prometheus/files/json-exporter.yml b/roles/prometheus/files/json-exporter.yml
new file mode 100644
index 0000000..62c4395
--- /dev/null
+++ b/roles/prometheus/files/json-exporter.yml
@@ -0,0 +1,9 @@
+---
+modules:
+  gwpref:
+    metrics:
+      - name: gw_loadbalancing_pref
+        help: "Current Preference. Range -inf to 100, where 100 is most willing to accept more nodes."
+        path: '{ .segments.1.preference }'
+        labels:
+          segment: '1'
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 75f0d17..28f7ef5 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -179,3 +179,53 @@
   install_recommends: false
   notify:
     - Reload prometheus
+
+- name: Create target directory
+  file:
+    path: /opt/json-exporter
+    state: directory
+    mode: '0755'
+
+- name: Download json_exporter
+  get_url:
+    url: https://github.com/prometheus-community/json_exporter/releases/download/v0.7.0/json_exporter-0.7.0.linux-amd64.tar.gz
+    dest: /opt/json-exporter/json_exporter.tar.gz
+    mode: '0644'
+
+- name: Unpack json_exporter
+  unarchive:
+    src: /opt/json-exporter/json_exporter.tar.gz
+    dest: /opt/json-exporter
+    remote_src: yes
+
+- name: Move the binary into /opt/json-exporter
+  command: mv /opt/json-exporter/json_exporter-0.7.0.linux-amd64/json_exporter /opt/json-exporter/json_exporter
+  args:
+    creates: /opt/json-exporter/json_exporter
+
+- name: Make json_exporter executable
+  file:
+    path: /opt/json-exporter/json_exporter
+    mode: '0755'
+
+- name: Copy json-exporter.yml to /etc/prometheus
+  copy:
+    src: files/json-exporter.yml
+    dest: /etc/prometheus/json-exporter.yml
+    mode: '0644'
+
+- name: Copy the systemd service file for json_exporter
+  copy:
+    src: files/json-exporter.service
+    dest: /etc/systemd/system/json-exporter.service
+    mode: '0644'
+
+- name: Reload systemd so it picks up new unit files
+  systemd:
+    daemon_reload: yes
+
+- name: Enable and start json_exporter
+  systemd:
+    name: json-exporter
+    enabled: yes
+    state: started
--
GitLab


From 666ac0cd441528f9db17229992a21c6dd119ab49 Mon Sep 17 00:00:00 2001
From: Leonard Penzer <leonard@penzer.de>
Date: Sun, 13 Jul 2025 15:41:58 +0200
Subject: [PATCH 13/13] Generate prometheus.yml from template

---
 roles/prometheus/tasks/main.yml | 7 +
 roles/prometheus/templates/prometheus.yml.j2 | 172 +++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 roles/prometheus/templates/prometheus.yml.j2

diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 28f7ef5..dbc59fb 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -9,6 +9,13 @@
     msg: "This role must only be run on prometheus hosts"
   when: not is_prometheus | default(false)
 
+- name: Create prometheus.yml file
+  template:
+    src: prometheus.yml.j2
+    dest: /etc/prometheus/prometheus.yml
+  notify:
+    - Reload prometheus
+
 - name: Create node_exporter_targets.yml file
   template:
     src: node_exporter_targets.yml.j2
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
new file mode 100644
index 0000000..2700986
--- /dev/null
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -0,0 +1,172 @@
+---
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    monitor: '{{ ( inventory_hostname | default(host)).split('.')[0] }}'
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them
+# according to the global 'evaluation_interval'.
+rule_files:
+  - 'alerts/*.yml'
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+{% for host in groups['role_prometheus'] %}
+      - targets: ['{{ hostvars[host].ansible_host | default(host) }}:9998']
+        labels:
+          instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
+{% endfor %}
+    scheme: https
+    metrics_path: /proxy
+    params:
+      module:
+        - prometheus
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+
+  - job_name: 'alertmanager'
+    static_configs:
+      - targets: ['localhost:9093']
+
+  - job_name: 'collectd'
+    scrape_interval: 60s
+    static_configs:
+      - targets:
+          - 'yanic01.vm.freifunk-stuttgart.de:9998'
+        labels:
+          instance: "10.0.3.236:9104"
+    scheme: https
+    metrics_path: /proxy
+    params:
+      module:
+        - respondd
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+
+  - job_name: 'bird'
+    scrape_interval: 15s
+    file_sd_configs:
+      - files:
+          - /etc/prometheus/bird_exporter_targets.yml
+    scheme: https
+    metrics_path: /proxy
+    params:
+      module:
+        - bird
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+
+  - job_name: monitor01_blackbox
+    scrape_interval: 15s
+    metrics_path: /proxy
+    scheme: https
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+    params:
+      module:
+        - blackbox
+        - icmp
+    static_configs:
+      - targets:
+          - 10.190.0.93
+          - 10.190.176.93
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: monitor01.vm.freifunk-stuttgart.de:9998
+  - job_name: 'node'
+    scrape_interval: 15s
+    file_sd_configs:
+      - files:
+          - /etc/prometheus/node_exporter_targets.yml
+    scheme: https
+    metrics_path: /proxy
+    params:
+      module:
+        - node
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+
+  - job_name: 'kea'
+    scrape_interval: 15s
+    file_sd_configs:
+      - files:
+          - /etc/prometheus/kea_exporter_targets.yml
+    scheme: https
+    metrics_path: /proxy
+    params:
+      module:
+        - kea
+    tls_config:
+      ca_file: /etc/prometheus/ssl/ca_cert.pem
+      cert_file: /etc/prometheus/ssl/client.cert.pem
+      key_file: /etc/prometheus/ssl/client.key.pem
+      insecure_skip_verify: false # only true for debugging
+
+  # Re-activate when fastd-exporter is installed on gws
+  # - job_name: bb_fastd
+  #   scrape_interval: 15s
+  #   file_sd_configs:
+  #     - files:
+  #       - 'target-fastd.json'
+
+  - job_name: json_gwpref
+    metrics_path: /probe
+    params:
+      module: [gwpref]
+    static_configs:
+{% for host in groups['role_gw'] %}
+      - targets: ['http://{{ hostvars[host].ansible_host | default(host) }}/data/gwstatus.json']
+        labels:
+          instance: '{{ (hostvars[host].ansible_host | default(host)).split('.')[0] }}'
+{% endfor %}
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - target_label: __address__
+        ## Location of the json exporter's real <hostname>:<port>
+        replacement: localhost:7979
+
+  - job_name: 'federate'
+    scrape_interval: 15s
+    honor_labels: true
+    metrics_path: '/federate'
+    params:
+      'match[]':
+        - '{job="blackbox"}'
+        - '{job="blackbox-5g"}'
+        - '{job="blackbox-starlink"}'
+        - '{job="zyxel"}'
+        - '{job="node"}'
+        - '{job="snmp"}'
+        - '{job="unifi"}'
+    static_configs:
+      - targets:
+          - '10.191.255.172:9090'
+        labels:
+          ignore_down: "1"
--
GitLab
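
Reviewer's note on the gwpref module (not part of the patch series itself): json_exporter resolves the JSONPath '{ .segments.1.preference }' against whatever document a gateway serves at /data/gwstatus.json. That document is not included in this series, so the payload shape below is an assumption for illustration only; the port, module name, target URL pattern and metric name are taken from the files above.

# Assumed (hypothetical) gateway payload:
#   { "segments": { "1": { "preference": 42 } } }
# Probe the locally running json_exporter the same way the json_gwpref job does:
curl 'http://localhost:7979/probe?module=gwpref&target=http://gw01n03.vm.freifunk-stuttgart.de/data/gwstatus.json'
# Expected sample output line, which is what alert_loadbalancing.yml evaluates:
#   gw_loadbalancing_pref{segment="1"} 42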
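
A related how-to for anyone applying the series: the copied rule files and the alertmanager routing tree can be validated before the Reload prometheus handler ever fires. A minimal sketch, assuming promtool and amtool are on PATH and run from the repository root; the path roles/prometheus/files/alertmanager.yml is inferred from the "Copy alertmanager.yml" task, and the label set passed to the routing test is an example, not taken from the repo:

# Syntax-check every alerting rule file shipped by the prometheus role
promtool check rules roles/prometheus/files/alerts/*.yml

# Validate the alertmanager configuration
amtool check-config roles/prometheus/files/alertmanager.yml

# Trace which receivers an example alert would be routed to
amtool config routes test --config.file=roles/prometheus/files/alertmanager.yml severity=warning alertname=HostOutOfMemory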