monitoring: limit 200, HTTP-Health-Check, Service-Restart-Erkennung
This commit is contained in:
parent
afdf94af70
commit
15296da783
2 changed files with 45 additions and 1 deletion
|
|
@ -46,7 +46,7 @@ def query_logs(query: str, hours: float = 1, limit: int = 100) -> list[dict]:
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def get_errors(container: str = None, hours: float = 1, limit: int = 50) -> list[dict]:
|
def get_errors(container: str = None, hours: float = 1, limit: int = 200) -> list[dict]:
|
||||||
"""Get error-level logs, optionally filtered by container hostname."""
|
"""Get error-level logs, optionally filtered by container hostname."""
|
||||||
if container:
|
if container:
|
||||||
q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
|
q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
|
||||||
|
|
@ -109,6 +109,26 @@ def get_health(container: str, hours: float = 24) -> dict:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Each entry is a (host, service_name) pair: `host` goes into the Loki
# {host="..."} stream selector and `service_name` is the text searched for
# in start/restart log lines.
WATCHED_SERVICES = [
    ("rss-manager", "rss-manager"),
    ("wordpress-v2", "wordpress"),
    ("fuenfvoracht", "fuenfvoracht"),
    ("homelab-ai-bot", "hausmeister"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def check_service_restarts(minutes: int = 35) -> list[dict]:
    """Find watched services that (re)started within the given time window.

    For every (host, service_name) pair in WATCHED_SERVICES a Loki query
    (systemd journal lines) looks for "Started" / "Restarting" / "restarted"
    messages that mention the service name.

    Args:
        minutes: Look-back window in minutes; converted to hours for
            ``query_logs``.

    Returns:
        One dict per service with at least one matching entry, with the keys
        ``host``, ``service`` and ``count``. The query asks for at most five
        entries, so ``count`` does not exceed that limit.
    """
    window_hours = minutes / 60
    found = []
    for host, service_name in WATCHED_SERVICES:
        q = f'{{host="{host}"}} |~ "(?i)(Started|Restarting|restarted).*{service_name}"'
        hits = query_logs(q, hours=window_hours, limit=5)
        # Entries carrying an "error" key are not counted — presumably that is
        # how query_logs reports a failed query; confirm against query_logs.
        match_count = sum(1 for entry in hits if "error" not in entry)
        if match_count:
            found.append(
                {"host": host, "service": service_name, "count": match_count}
            )
    return found
|
||||||
|
|
||||||
|
|
||||||
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
|
def format_logs(entries: list[dict], max_lines: int = 30) -> str:
|
||||||
"""Format log entries for human/LLM consumption."""
|
"""Format log entries for human/LLM consumption."""
|
||||||
if not entries:
|
if not entries:
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ ALERT_COOLDOWN_SECONDS = {
|
||||||
"ram": 1800,
|
"ram": 1800,
|
||||||
"panic": 3600,
|
"panic": 3600,
|
||||||
"silence": 3600,
|
"silence": 3600,
|
||||||
|
"http": 1800,
|
||||||
|
"restart": 900,
|
||||||
"default": 3600,
|
"default": 3600,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -36,6 +38,12 @@ def _get_passwords(cfg):
|
||||||
|
|
||||||
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]
|
CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115]
|
||||||
|
|
||||||
|
# Endpoints probed by the HTTP health check: "url" is the address requested,
# "name" is the label used in the resulting alert text.
HTTP_HEALTH_CHECKS = [
    {
        "name": "arakavanews.com",
        "url": "https://arakavanews.com",
    },
    {
        "name": "matomo.orbitalo.net",
        "url": "https://matomo.orbitalo.net",
    },
    {
        "name": "grafana.orbitalo.net",
        "url": "https://grafana.orbitalo.net",
    },
]
|
||||||
|
|
||||||
EXPECTED_STOPPED = {
|
EXPECTED_STOPPED = {
|
||||||
(101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
(101, "pp1"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
||||||
(101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
(101, "pp2"), # yt-desktop-standby — Reserve, absichtlich gestoppt (pp-cluster)
|
||||||
|
|
@ -99,6 +107,18 @@ def check_all() -> list[str]:
|
||||||
if names:
|
if names:
|
||||||
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
|
alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}")
|
||||||
|
|
||||||
|
for check in HTTP_HEALTH_CHECKS:
|
||||||
|
try:
|
||||||
|
r = requests.get(check["url"], timeout=10, allow_redirects=True)
|
||||||
|
if r.status_code >= 400:
|
||||||
|
alerts.append(f"🔴 {check['name']} antwortet mit HTTP {r.status_code}")
|
||||||
|
except requests.RequestException as e:
|
||||||
|
alerts.append(f"🔴 {check['name']} nicht erreichbar: {str(e)[:80]}")
|
||||||
|
|
||||||
|
restarts = loki_client.check_service_restarts(minutes=35)
|
||||||
|
for r in restarts:
|
||||||
|
alerts.append(f"🔄 Service-Neustart: {r['service']} auf {r['host']} ({r['count']}x in 35 Min)")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
mail_client.init(cfg)
|
mail_client.init(cfg)
|
||||||
important = mail_client.get_important_mails(hours=1)
|
important = mail_client.get_important_mails(hours=1)
|
||||||
|
|
@ -195,6 +215,10 @@ def _alert_category(alert_text: str) -> str:
|
||||||
return "panic"
|
return "panic"
|
||||||
if "Keine Logs" in alert_text:
|
if "Keine Logs" in alert_text:
|
||||||
return "silence"
|
return "silence"
|
||||||
|
if "antwortet mit HTTP" in alert_text or "nicht erreichbar" in alert_text:
|
||||||
|
return "http"
|
||||||
|
if "Service-Neustart" in alert_text:
|
||||||
|
return "restart"
|
||||||
return "default"
|
return "default"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue