From 43ee006f1577cb312d3faf6e46c3d4d39b99d7c7 Mon Sep 17 00:00:00 2001 From: Homelab Cursor Date: Tue, 24 Mar 2026 13:30:58 +0100 Subject: [PATCH] monitoring: error-rate alerts, silence proxmox filter, periodic loop - loki_client.py: check_error_rate() mit host-spezifischen Schwellen (rss-manager:15, wordpress:10, default:25) - monitor.py: Error-Rate-Check in check_all(), Silence-Check filtert gestoppte Container via Proxmox-Status - telegram_bot.py: periodischer _monitor_loop alle 10 Min - Schliesst #30 und #31 --- .../__pycache__/monitor.cpython-311.pyc | Bin 21058 -> 22061 bytes .../__pycache__/loki_client.cpython-311.pyc | Bin 11029 -> 12767 bytes homelab-ai-bot/core/loki_client.py | 27 ++++++++++++++++++ homelab-ai-bot/monitor.py | 24 ++++++++++++++-- homelab-ai-bot/telegram_bot.py | 12 ++++++++ 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc b/homelab-ai-bot/__pycache__/monitor.cpython-311.pyc index a167b0ca1960cb8383eab16395366706700d1d54..aa7192d3716e834aa1775f686138724c1a9130b1 100644 GIT binary patch delta 5155 zcmZ`-Yj6|S72d0tCEJo^`6*epCBKES!8T8a7#kbR%Qj%%2?(JTSRhO8-8JU1irqqk zp`O8T=|EnMlMo((Hffzq(xlUA_>pE(LQUc{?oOs9lTJHnI;~R3kWwb?IV;JwAlX@c zd-mLO@45Hf^SG<4=g3d5kn#&gV=@QV>xZxUw^wuAXZRAolooiowew{8xK_s}KW+H= zHy#x+upR2wR5{s4X^6V1NA!(eP?wX;s#!D?5@&UVx_tr9&Z?c%;}5bAH5FP143c+B zdy8bT+uC`ggjMS7q>wGw)sZqL=}I;`LBOSv4Lu$~=nGL^pf=#pBmmjCW?jeTwvNs2 zMOTvAQawFgs54xzsGrnaSFn#G#++mdG(Iqqo1QP+;5m(1?u;_E) zYy&5v;bYgH>^LQ5^Yv3Y`G@#LQr4A+&ud_Q@QYsI94TkdErgRw{U$V!OZBL$`v(pf|YZnT*U!$&}!YyQV!peFw3RDNpy)+ z;3T@lvTt($1IMfRra)fUsyyvso0t`!2ggaa5j#t-%w)rjCU!kFO`9iKNAlyG)UaK$ z!?X^E^QC+?V#rz5n&@HS0;z!R;nh-sly?vU->cfM+N+FCb^P)ewKb>$Pn-$rR|E>W z3&MqjlL|))+1r&_=>>`7jW|GcR{F|BZrFiFv96LFuwARAe7CrjAfOuHF8 zY3bDJ_7t;M((_uUb4uc^eEOC4`_wok=P;pR@O5XRuLGBgtJvE!t?DAF2n{j6!_S?E zL~11m%vdGN1O4wwRYNRUuni?*X%A{flXPZ>QdQ91DEYoZw z+%cC_ER`*S`3yZX=z=(34{=^RQXTKdDl*ekib1A`??5SbHq%}J`&vR<`3;b9N*+iF z<+z8=jiP62jZ#VC2Ed9oVY)~vkMbcX?Yf{Dq7paPNj5Wk)+Q&$`dzYE)(N6V>=ops z-r(L~s4pmM-CZF;gtU7)>n(dDNKxEu$j_zlkclsv_#L#6?ai(x+u4)Zm5}A%$!@iy zoJ$>(>yuT1fS?AJ!!ZEDuAG^~%QoiB2H~SQpOoOv(j@@xm%wWH(g`p*I5Hk#M{}K| zn_bFX2%SF5t?w@Y_9Y@Gb^C+8qEG0>Bo%q~^s5ev!yQ!BJ7ldl)Eg8J<~x@8x&l7x z@CO}rvmI^lsOa=|_4xt;A4nO*F6tAyLIM6Rad{_GJq)6QA13l!X93P|U7W;iYYh{g zlgK_bd0EHSSglr)XmMV(S8?E%3$c~mX-Q`WO*STj{!^-<<(-^MOB-Q0+KjLa0nJY_ zgFsEi)yvvIsLw~?O);~|D^54XUBe>5CwfEOJq-u#-Y%bauiFy{R4)pIyq+X1swrTz1-UChb((4YPCH>>SxP zX0C-hlEfKJgS&=SM08wwU6iL7rPl#be8IMz)G}RO9c#|p#;)Y;ARWwTx01W8-0smU zVWOd>Y}lTw!{ZaJfLUAF>Q)1Jjoq-%17`=rGH3qW^2eaLono*nQ5bg*(mmZfqN^rv zvXT5M=zBfCg4D1(`Bf>H{HO!L#%2{PpQ%_0C+j>APof|vdqY7HPM1C^P(KddgrGRI;@?VSFGt=AW^+_J*CV+R zK-OYr5#*F9ErP7_QMSQRO`c#!99v6&h+?W;J#v!Z3-YL8EA~E&@I3$lm&Ay(n36K%t+uA*(Dqd-XR?kP%P!I}W97N0zIZ_R(I;T5B15NS?Uin38HnGYpWVg-N{?cYB`N;HILT1p4631#nUV#WZ!$6srUkwR5^$ z^$xlhhuVqIh@et}?GLq&%)pEhOUFuR#G;M%0RO@DkdN4*x`o+M6`i6V3X5Yo`ZuW9 z!QQJoWlyl9|64}5$)24vU$LVnpN`{X6#ISrt@=_vMS(veIN9TKU#P}hPY)nquULYl z6UcZA;Qqvj8~0cC*t}Ly_|Cjg|6P*o%A{C9>Ej6ay+@BCV2nl6#&M)xLcp>}hY(I8 zoIrR2;VA&wgr9$2PmuTX@IfljC$Z&62tPr46e7?oD4>+Y z>&P}Jxj}GyMfx**kL3!bsMA+b_B8}W^{?2S`DN*EqL@Yqg@W`lHoe8F7vx*f?_}ey z5bgGeZtCj^QIY-)cvnaH1)Uo5276=CmNXQi&&ArFN@ zlEAhAI|I%ind~1;MY)ecmxJc}4@T6&(w6L>ja8Q32ADpfwAh5au&uj8^Z>mB0!Emz z@N4$^60ga6cI#MDIouOzd4m$mZCax*wl;L(&@N+i$8h=4x}ZVod4v}bYzSyP zWv+5|)9R(`*1MZKI##!IY-)FRE^Y2;Z|S6pYhOe$+$~p1)OnOmJ* zt|ZRU{8hgo@Mq81>gxVaP=S70kXwNfwGhUnGPh_f;{YyQ^zWT1rAj~%Xp+_4!fpYR zKy=B`zkM>`B<7Bl-pQ$CfOs?fgcms$Zo6o9LMK`8c6Y;(!R@9i+1z$V3WifOCK%h% zzGAl$WB)^@1ECUO5aATUF$6rU&=7#E3Hady;>JIU}0bc_xfnIQP!e7{yj$c|PEZLEa zWSc6&0gQ3k5gT)^IbL#z5X?R delta 4245 zcmZ`*3v5%@8NSD_ykp|nP8{3ubrRw@34tUKNXQ@|PY4MOftApu#?8He!LieGZwQa; zLI+!0hC1M|L0ed}mNGg*w~|g;X{D}Jw@Mr9f1ZF9-rL_>AeK&0@v9ssX z_Aajn?z6hkX9Sjr)3ge;^`oP;&+2CBy8T&Y2f8;5Z0c+6?TAq;+m=3*z9*aG9x8vx z$G)ttnEE)~tv8p;Zct9Nhs&p&*$SznZ30p91id3U!R;ZmQCM_@Q}Wum3&6cJI}@`g z7Fxwtn;oQvEzWVUin=`ZN^ZWmSXdGwv3wb?nP~0Y(D|4(gOmM94#(}XC27Sf)XwcF z)Cu)U{^ZgN2G|Y$@fNcwwq5q*NLa>}o4o}MplO)Xe1a2}qZF&Pxbp@|Ik_S!54#nx zb7r6IPK_0EEsNF`8H7fmi7m456@#467|VSi_t?33KTgKEz4}3JoX7?9)^o6ZInE5b zVu!oc$82bak(G)~XihoLcYw1hg_8}O)hV2ob7>dArd!mw_yZnw56H3@4H)Yga&PEZZ|wuRjz?wFf!ihH7%_1D?*T&e3#c-X5Mw!HRK zZmbxs;pk9`!6TiDQ|OwzZ^Fzz(!%~^b1i_7XiHujoK(Udv^cxwb4ru5Lf&Ne*S7H} zp5uhZAY?tMAq!k8uVt;)LPLyKcr?WdVQq-J07-WV6HjX~ue>rz!`=&%w7fn^0oTX+ ztp%h4&0Budw}Xq7C>}8J*;uJOFi#9KR%=Qr?p7$Q6EaeF15@-QC)a=bq)zB%UaO5% zCES~0IwMNa~Ub2zhvfT~!ti$es+9}#!a;=7SD0&%6SQ&IG5#zOyr^T$fpoR>w-33)p z2u~MuxlqopPsoj{x^P(11B>Smf$$dv)#N_rD_jD?TMGZ`Lx<8a?@s!gLa5`yGv-`+}p%tR0A`X(OS?m@G<+@b>ISLHIPcgHyPnHSGBnmCWrj zlQK41d?V;NTp1D7b-xz3lQkXRj=!oN5!Es;9tn zuLOjIVRnd{;dFVeWImp`|EAS;y5nTWner38r+P2Dr>za}yJ@qZ-h6WNnT8WXr-t6D zTs>{W-;9y7L@KGeVVSnzg80_ZH3(W9bs^QwyroqO&>?g)fNI}=^){sH9b zwvVc5k{A(SJvxZO2LL2|jF`8qge@sEcv})Lf5z}mad>wi7!K1{V7?6nFQswZXUyB2 zKeew+)%E5S-pDDxNT+k^uIJR<$f-Njc{43%hBF%b$#LI={USNv4A`_kQ zQ29g)n@=@-*f?XFT2wJ+SoO!|R`dF7-L-6UuhV$V+0bh@{@HE>`pwEVeSZyiy(Y7N zrT%&oL0sMG-{R+P`ZKpQ=x;6~TNC-&P(qFn2qG&RM)YI`f_rWy-~kR zbGd^puJ2ihs$d~GIyx!}nmTUu2fcq9D z@-nMiRu0<@E^9TdL%H6BZa%xVEPI~*9pymVl>Cl48>$xapt(P{Q``wL!dKKS-RW=Q z={8(0h@h1%9YnE%9cj3yQ~(}?uU2I+jWGmk z2iU>oCtWGV41L22(`?g<<+H}@cGGLP9jej40NkmObsgt4;MvN?r)x%Wd_MweCCVwC zM#dDtTt!7r+$3+a`lc?}^gvVeph`0RR%*n_g9s3<96f|EmyL&U_%wnE;Rgsu5gtJ} ziEs))&BgED;b24vVLg#3!O7ue*=a>o@EGf2^Bgr3JK?e12fAW8G}ke`e+=NsPv z#+4~Y^MHZ;nvJv!nziVB92F2gVsEwl0|NZj*6ZzCVC*;2_i*lggbx5zgD^HSs$B=- z<0`U01W0kyQ^-a5gqwPM%^|Y-ZlL`JdJg$NLwFKl0svl+ze6YCw_-d~)J&)zP{sng z#eMV^us`S6PulC8JdndEmTY<;@PGR%d%vAkO9mlJ;r4J;Cu9$BLQaCt1=^gTq0$L? z1DxN&E_alWd)S8^d-Hw>E6{%eB#*#GJ8hXC1LI3%KF5CCIm~oj8%#V5>G`R|fBS1{wUR>W70ey&ueGWe3-NZ#m9TO#gUD#$OV;4;lMW$uvABN4Iak<3X|X zR1q&*)w5~|4`Tit%_kIZsnuL+KHq;VOZ~*gB*B) z-UjYRnE-wrghWsd(p@m9}K9bKtL;zib+S~n-brJnqanqUF};( z_$lLtD+E5KetTEVJvVTJ^m)lyi0f(B#$3|Q7SBOkz;Du0YTl+=B76>#s(wV;Az|{y zFAUkx+V3!CfmSUG^gp+NN1^lpehx$e0lJpCH}Y8@05cx7$5{8qwFlqF@qZ9J2$cvA zBK!#9`v@Y!b^z574#D@taU5%BeMe&oA;{>*k%=zS0-R#B#jk?5pm-=eN+dWU1_G)f zAnlX1;!~>{f^^3o)i_E+5jp;5@TiM9{6%{6NK_aLi>s&$7Jwfoy~kPyUNq0-WfbXV zIDl!+K2ylDHytet6Jj~O^o;&g;|z!NH{iYO>ZY+0GqFwRXE^9C#ygFk2W6~(^GyRD Fb!&K&MROAHYAL0%{!2k=k>ymD=Lz760S{ zP4oeso0hAAM3%8+Q4&mK>5`c}ENI-rUY3@0lQbdIhb8)8(#^RnJS;osqDFn#*K>dO zyWctA`SX3>y}f?$x8SN<#l?CG%h37z;SH|jDhEKxHML+(vReQ(Nx51hKyz}$mAvN$ zwaG^=ur~Rx%e)qHq8=8Ax_$iChdX?gxWf+X zV3lYPbxyLJDzI8C!n0`mwa|-4I3V3a3BD8p>uF1~AH`ylKNLG5Mgm8DM^PXgP2JHf z6@z8vvl16rPR*9=p}~RFviT<(bSMvO_sT!Qv!Y&t!x(WIgXfWh7(PyYW#<4mu6*NI z&<$XVKrbtooC1iZzI8sde&R`KEg?m%ht3}XN9JRad5Rc1A)xkpdc+%n3^xxVOAp3zZA&G ztkSD3_d<4xzKKILOTwheuvQ|Qc_~nPfs(a~vCIy1<7I0=fl^cEXLXW^qLh&`w^n8( z3s&cKHq&r{tdk1{7~+YXGCf_SJT_WK8DfQnQ<|Ij470D#0jSHjWs+(Axz=qVF*v}7 z`}sihEFX%+C0;xe6h#>4!&00V5sD!n3P>U^g^(Bz#UhYD9hO4jC^m>#jl$UJdbAU- zd(P|`=H(IIQ?ofNsE(Z;{xI4_x=j8^Y#{6p42Jzf4gQm-M08f5kq39>7%6;0d@IJ} zZFnDF&$Af2DL_q7GIb=Vtar6JAXqu%9YJDo?@N(%N{g$?4r!=4_D)D4a4PdgyTY-n zlo6M16sjf8n!OCPu|GgJ{zo;yHxz1pgAFRGwlidKHh1B}<&?YQsd+c+;eP z+LJL?XU)|a&YR`DGbIaLZJMin!BWM>>+IFQgg7ovwok*EZPSM{wuY>&VXh);Ys?s% zvc{&2p*d@4ex!O}C*1P7SF51acx~h(sHdn>O3YO+&85bwOPExrYdZwCI-*p{)hXD#hxg^NYTs~;>B zm8Xl!7b^{>E1j1*ryJ+0=fmmJeYj-|U0Fj{+R*jSVo^z=6Rl1?uAHG;mw{ejSMtNg zB@`Z9sb7+S9CJJtjtV-o9zUQf#KK0xQ8A9}#Gq!Ok!YuspS>P-606^*UU<7e@jx96OaTUGTq8wfLeIu+rM1Uav}y# zMZ`^L3O|zXP&|}+R`UexET=5CMT;%ZYMgYQ!bnrrd;w*1q$yKgYXqh|g)tdaZ!?W^ Q%11G8A=i~s-t delta 534 zcmcbgJT;7OIWI340}yzw+LyUkaw4AuquWGvEk>n@L7I%p6KhnNRZ@f}*2zvhD95Ne z@qsL(+Qfgd!fGk}saz=>sqASiDKagrQNk%eIdNTCAX6koJw-m1CzV|iWGWb>D5P=& z6|<$Wr6>Ybh^8eV`2>6Y$|e+ ziP3BFf3bry;Xq@GB0xkWkSJyX5()}M5+H%7&3DB485uJszmd4ao(UAuWU88cR8o^M zd-HqAhs?^!AVmUT@ggq}TOUZEnBX({v+R7vyvdX0oEVcQUz1Z~ESUUPPM@)Ov#$Iv zCdTN=8cMc!T`;*(sQ{?|ladlpzo>E|W9{T}7gB)fEB1}L;(PU>`CDslgbN=Qm-9Sd(5Fk^Nv8WD2 ziA_GCXYP25H7zGUv7|@_D0GV@ClhG0HIP~43?f`XL@_r_um6IPQDX7}Lwf*iMT9{B diff --git a/homelab-ai-bot/core/loki_client.py b/homelab-ai-bot/core/loki_client.py index 7ce9d416..83e867ab 100644 --- a/homelab-ai-bot/core/loki_client.py +++ b/homelab-ai-bot/core/loki_client.py @@ -156,6 +156,33 @@ def check_service_restarts(minutes: int = 35) -> list[dict]: return restarts + +ERROR_RATE_THRESHOLDS = { + "rss-manager": 15, + "wordpress-v2": 10, +} +ERROR_RATE_DEFAULT = 25 + + +def check_error_rate(minutes: int = 30) -> list[dict]: + """Check if any host exceeds its error-rate threshold within the window.""" + all_hosts = get_labels() + alerts = [] + now = datetime.now(timezone.utc) + for host in all_hosts: + q = f'count_over_time({{host="{host}"}} |~ "(?i)error" [{minutes}m])' + data = _query("/loki/api/v1/query", {"query": q, "time": _ns(now)}) + count = sum( + int(float(r.get("value", [None, "0"])[1])) + for r in data.get("data", {}).get("result", []) + if len(r.get("value", [])) > 1 + ) + threshold = ERROR_RATE_THRESHOLDS.get(host, ERROR_RATE_DEFAULT) + if count > threshold: + alerts.append({"host": host, "count": count, "threshold": threshold}) + return alerts + + def format_logs(entries: list[dict], max_lines: int = 30) -> str: """Format log entries for human/LLM consumption.""" if not entries: diff --git a/homelab-ai-bot/monitor.py b/homelab-ai-bot/monitor.py index 4d1c2e0d..6c2fe53c 100644 --- a/homelab-ai-bot/monitor.py +++ b/homelab-ai-bot/monitor.py @@ -20,6 +20,7 @@ ALERT_COOLDOWN_SECONDS = { "restart": 900, "memory_expiry": 43200, "default": 3600, + "error_rate": 1800, } @@ -110,11 +111,26 @@ def check_all() -> list[str]: if hosts: alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}") + error_rates = loki_client.check_error_rate(minutes=30) + for er in error_rates: + alerts.append( + f"🔴 {er['host']}: {er['count']} Fehler in 30 Min (Schwelle: {er['threshold']})" + ) + + running_names = { + ct.get("name", "").lower() + for ct in containers + if "error" not in ct and ct.get("status") == "running" + } + silent = loki_client.check_silence(minutes=35) if silent and "error" not in silent[0]: - names = [s["host"] for s in silent - if s.get("host") not in IGNORED_HOSTS - and s.get("host") not in SILENCE_IGNORED_HOSTS] + names = [ + s["host"] for s in silent + if s.get("host") not in IGNORED_HOSTS + and s.get("host") not in SILENCE_IGNORED_HOSTS + and s["host"].lower() in running_names + ] if names: alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") @@ -257,6 +273,8 @@ def _alert_category(alert_text: str) -> str: return "ram" if "panic" in alert_text.lower() or "fatal" in alert_text.lower(): return "panic" + if "Fehler in 30 Min" in alert_text: + return "error_rate" if "Keine Logs" in alert_text: return "silence" if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text: diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py index 1a1c2efd..ff7605ac 100644 --- a/homelab-ai-bot/telegram_bot.py +++ b/homelab-ai-bot/telegram_bot.py @@ -998,10 +998,22 @@ def main(): except Exception: log.exception("Fehler im Forecast-Loop") + async def _monitor_loop(application): + """Periodischer Monitoring-Check alle 10 Minuten.""" + await asyncio.sleep(60) + while True: + try: + monitor.run_check_and_alert() + except Exception: + log.exception("Fehler im Monitor-Loop") + await asyncio.sleep(600) + async def post_init(application): await application.bot.set_my_commands(BOT_COMMANDS) log.info("Kommandomenü registriert") asyncio.create_task(_watchdog_loop()) + asyncio.create_task(_monitor_loop(application)) + log.info("Monitor-Loop aktiv (alle 10 Min)") if application.job_queue is None: asyncio.create_task(_filmtipp_loop(application)) asyncio.create_task(_forecast_loop(application))