feat: count_errors Tool, gather_errors mit Anzahl+Host, Tool-Descriptions geschärft
This commit is contained in:
parent
fc07039709
commit
8e74da7530
3 changed files with 78 additions and 4 deletions
|
|
@ -43,9 +43,41 @@ def gather_status() -> str:
|
||||||
|
|
||||||
|
|
||||||
def gather_errors(hours: float = 2) -> str:
    """Summarise recent Loki errors: total count, top hosts and sample lines.

    Args:
        hours: Look-back window in hours; fractional values are allowed.

    Returns:
        A multi-line, human-readable report. If the count query reports an
        error, a single ``Loki-Fehler: ...`` line is returned instead.
    """
    result = loki_client.count_errors(hours=hours)
    if "error" in result:
        return f"Loki-Fehler: {result['error']}"

    count = result["count"]
    per_host = result.get("per_host", {})
    # ``:g`` keeps fractional windows visible (0.5 -> "0.5h"); the former
    # ``:.0f`` rounded them to a misleading "0h".
    lines = [f"Fehler ({hours:g}h): {count} Einträge"]

    if per_host:
        # Show only the five hosts with the most matching entries.
        top = sorted(per_host.items(), key=lambda item: item[1], reverse=True)[:5]
        lines.append("Top-Hosts:")
        lines.extend(f" {host}: {n}x" for host, n in top)

    if count > 0:
        examples = loki_client.get_errors(hours=hours, limit=5)
        # Skip entries that contain an "error" key (presumably failure markers
        # emitted by the client rather than real log lines; confirm).
        real = [e for e in examples if "error" not in e]
        if real:
            lines.append("Letzte Beispiele:")
            # At most three samples, each cut to 120 characters.
            lines.extend(
                f" [{e.get('host','?')}] {e.get('line','')[:120]}" for e in real[:3]
            )

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def gather_error_count(hours: float = 24) -> str:
    """Report only the number of Loki error entries, for counting questions.

    Args:
        hours: Look-back window in hours; fractional values are allowed.

    Returns:
        A headline with the total count followed by up to eight per-host
        lines. If the count query reports an error, a single
        ``Loki-Fehler: ...`` line is returned instead.
    """
    result = loki_client.count_errors(hours=hours)
    if "error" in result:
        return f"Loki-Fehler: {result['error']}"

    count = result["count"]
    per_host = result.get("per_host", {})
    # ``:g`` keeps fractional windows visible (0.5 -> "0.5"); the former
    # ``:.0f`` rounded them to a misleading "0".
    lines = [f"{count} Fehler-Einträge in den letzten {hours:g} Stunden"]

    if per_host:
        # Hosts ordered by descending entry count, limited to the top eight.
        top = sorted(per_host.items(), key=lambda item: item[1], reverse=True)[:8]
        lines.extend(f" {host}: {n}x" for host, n in top)

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def gather_container_status(query: str) -> str:
|
def gather_container_status(query: str) -> str:
|
||||||
|
|
@ -255,6 +287,7 @@ def get_tool_handlers() -> dict:
|
||||||
"get_all_containers": lambda: gather_status(),
|
"get_all_containers": lambda: gather_status(),
|
||||||
"get_container_detail": lambda query: gather_container_status(query),
|
"get_container_detail": lambda query: gather_container_status(query),
|
||||||
"get_errors": lambda hours=2: gather_errors(hours=hours),
|
"get_errors": lambda hours=2: gather_errors(hours=hours),
|
||||||
|
"count_errors": lambda hours=24: gather_error_count(hours=hours),
|
||||||
"get_container_logs": lambda container, hours=1: gather_logs(container, hours=hours),
|
"get_container_logs": lambda container, hours=1: gather_logs(container, hours=hours),
|
||||||
"get_silent_hosts": lambda: gather_silence(),
|
"get_silent_hosts": lambda: gather_silence(),
|
||||||
"get_server_metrics": lambda host=None: _tool_get_server_metrics(host),
|
"get_server_metrics": lambda host=None: _tool_get_server_metrics(host),
|
||||||
|
|
|
||||||
|
|
@ -117,6 +117,33 @@ WATCHED_SERVICES = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def count_errors(hours: float = 24) -> dict:
    """Count error-like log entries over the last ``hours`` hours via Loki.

    This runs a *log* query against ``/loki/api/v1/query_range`` and counts
    the entries that come back; it is not a metric query. The count can
    therefore never exceed the request ``limit``.

    Args:
        hours: Look-back window in hours, ending now (UTC).

    Returns:
        ``{"error": ..., "count": 0}`` when the query helper reports an error.
        Otherwise a dict with ``count`` (total entries), ``hours``,
        ``per_host`` (entries per ``host`` stream label, ``"unknown"`` when the
        label is missing) and ``truncated`` (True when ``count`` reached the
        request limit and is only a lower bound).
    """
    limit = 5000
    now = datetime.now(timezone.utc)
    start = now - timedelta(hours=hours)
    q = '{job=~".+"} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"'
    data = _query("/loki/api/v1/query_range", {
        "query": q,
        "start": _ns(start),
        "end": _ns(now),
        "limit": limit,
        "direction": "backward",
    })
    if "error" in data:
        return {"error": data["error"], "count": 0}

    # Single pass: tally entries per host, then derive the total from it.
    per_host: dict[str, int] = {}
    for stream in data.get("data", {}).get("result", []):
        host = stream.get("stream", {}).get("host", "unknown")
        per_host[host] = per_host.get(host, 0) + len(stream.get("values", []))
    total = sum(per_host.values())

    return {
        "count": total,
        "hours": hours,
        "per_host": per_host,
        # Reaching the limit means more matching entries may exist.
        "truncated": total >= limit,
    }
|
||||||
|
|
||||||
|
|
||||||
def check_service_restarts(minutes: int = 35) -> list[dict]:
|
def check_service_restarts(minutes: int = 35) -> list[dict]:
|
||||||
"""Findet Services die innerhalb des Zeitfensters neu gestartet haben (systemd journal via Loki)."""
|
"""Findet Services die innerhalb des Zeitfensters neu gestartet haben (systemd journal via Loki)."""
|
||||||
restarts = []
|
restarts = []
|
||||||
|
|
|
||||||
|
|
@ -44,11 +44,25 @@ TOOLS = [
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "count_errors",
|
||||||
|
"description": "Zählt Fehler-Logs aus Loki und gibt ANZAHL pro Host zurück. Nutze dieses Tool wenn nach der ANZAHL von Fehlern gefragt wird (z.B. 'wieviele Fehler', 'wie oft', 'Fehleranzahl').",
|
||||||
|
"parameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"hours": {"type": "number", "description": "Zeitraum in Stunden (z.B. 24 = heute, 72 = 3 Tage, 168 = 1 Woche)", "default": 24}
|
||||||
|
},
|
||||||
|
"required": [],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "get_errors",
|
"name": "get_errors",
|
||||||
"description": "Aktuelle Fehler-Logs aus Loki (alle Container)",
|
"description": "Zeigt Fehler-Logs aus Loki mit Beispielen (Inhalt der Fehlermeldungen). Nutze dieses Tool wenn nach dem INHALT oder DETAILS von Fehlern gefragt wird.",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue