monitoring: limit 200, HTTP-Health-Check, Service-Restart-Erkennung

This commit is contained in:
root 2026-03-11 21:14:06 +07:00
parent afdf94af70
commit 15296da783
2 changed files with 45 additions and 1 deletions

View file

@ -46,7 +46,7 @@ def query_logs(query: str, hours: float = 1, limit: int = 100) -> list[dict]:
return entries return entries
def get_errors(container: str = None, hours: float = 1, limit: int = 50) -> list[dict]: def get_errors(container: str = None, hours: float = 1, limit: int = 200) -> list[dict]:
"""Get error-level logs, optionally filtered by container hostname.""" """Get error-level logs, optionally filtered by container hostname."""
if container: if container:
q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"' q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
@ -109,6 +109,26 @@ def get_health(container: str, hours: float = 24) -> dict:
} }
# Services monitored for restarts. Each entry is a pair of
# (value of the Loki ``host`` label, service name expected in the log line).
WATCHED_SERVICES = [
    ("rss-manager", "rss-manager"),
    ("wordpress-v2", "wordpress"),
    ("fuenfvoracht", "fuenfvoracht"),
    ("homelab-ai-bot", "hausmeister"),
]
def check_service_restarts(minutes: int = 35) -> list[dict]:
    """Find watched services that restarted within the given time window.

    For every ``(host, service_name)`` pair in ``WATCHED_SERVICES`` a Loki
    query is issued (systemd journal via Loki) that matches log lines
    containing "Started", "Restarting" or "restarted" followed by the
    service name, case-insensitively.

    Args:
        minutes: Size of the look-back window in minutes.

    Returns:
        One dict per service with at least one matching entry, with the keys
        ``host``, ``service`` and ``count``. Each query is limited to 5
        entries, so ``count`` never exceeds 5.
    """
    found = []
    for host, service_name in WATCHED_SERVICES:
        selector = f'{{host="{host}"}} |~ "(?i)(Started|Restarting|restarted).*{service_name}"'
        # Entries carrying an "error" key are not counted — presumably this is
        # how query_logs reports a failed query; verify against query_logs.
        hits = sum(
            1
            for entry in query_logs(selector, hours=minutes / 60, limit=5)
            if "error" not in entry
        )
        if not hits:
            continue
        found.append({"host": host, "service": service_name, "count": hits})
    return found
def format_logs(entries: list[dict], max_lines: int = 30) -> str: def format_logs(entries: list[dict], max_lines: int = 30) -> str:
"""Format log entries for human/LLM consumption.""" """Format log entries for human/LLM consumption."""
if not entries: if not entries:

View file

@ -16,6 +16,8 @@ ALERT_COOLDOWN_SECONDS = {
"ram": 1800, "ram": 1800,
"panic": 3600, "panic": 3600,
"silence": 3600, "silence": 3600,
"http": 1800,
"restart": 900,
"default": 3600, "default": 3600,
} }
@ -36,6 +38,12 @@ def _get_passwords(cfg):
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115] CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]
# HTTP endpoints probed by the health check.
# "name" is the label shown in alert messages, "url" is the address requested.
HTTP_HEALTH_CHECKS = [
    {"name": "arakavanews.com", "url": "https://arakavanews.com"},
    {"name": "matomo.orbitalo.net", "url": "https://matomo.orbitalo.net"},
    {"name": "grafana.orbitalo.net", "url": "https://grafana.orbitalo.net"},
]
EXPECTED_STOPPED = { EXPECTED_STOPPED = {
(101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster) (101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
(101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster) (101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
@ -99,6 +107,18 @@ def check_all() -> list[str]:
if names: if names:
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
for check in HTTP_HEALTH_CHECKS:
try:
r = requests.get(check["url"], timeout=10, allow_redirects=True)
if r.status_code >= 400:
alerts.append(f"🔴 {check['name']} antwortet mit HTTP {r.status_code}")
except requests.RequestException as e:
alerts.append(f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}")
restarts = loki_client.check_service_restarts(minutes=35)
for r in restarts:
alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)")
try: try:
mail_client.init(cfg) mail_client.init(cfg)
important = mail_client.get_important_mails(hours=1) important = mail_client.get_important_mails(hours=1)
@ -195,6 +215,10 @@ def _alert_category(alert_text: str) -> str:
return "panic" return "panic"
if "Keine Logs" in alert_text: if "Keine Logs" in alert_text:
return "silence" return "silence"
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
return "http"
if "Service-Neustart" in alert_text:
return "restart"
return "default" return "default"