diff --git a/homelab-ai-bot/context.py b/homelab-ai-bot/context.py index f09aaa17..e69de29b 100644 --- a/homelab-ai-bot/context.py +++ b/homelab-ai-bot/context.py @@ -1,159 +0,0 @@ -"""Intelligente Kontext-Sammlung für den Hausmeister-Bot. -Entscheidet anhand der Frage welche Datenquellen abgefragt werden.""" - -import sys -import os -import re - -sys.path.insert(0, os.path.dirname(__file__)) -from core import config, loki_client, proxmox_client - - -def _load_config(): - return config.parse_config() - - -def _get_tokens(cfg): - tokens = {} - tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") - tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") - if tn and tv: - tokens["pve-hetzner"] = {"name": tn, "value": tv} - return tokens - - -def _get_passwords(cfg): - return { - "pve-hetzner": cfg.passwords.get("hetzner", ""), - "pve1": cfg.passwords.get("default", ""), - "pve3": cfg.passwords.get("default", ""), - "default": cfg.passwords.get("default", ""), - } - - -def gather_status() -> str: - """Komplett-Status aller Container für /status.""" - cfg = _load_config() - containers = proxmox_client.get_all_containers( - _get_passwords(cfg), _get_tokens(cfg) - ) - return proxmox_client.format_containers(containers) - - -def gather_errors(hours: float = 2) -> str: - """Aktuelle Fehler aus Loki für /errors.""" - entries = loki_client.get_errors(hours=hours, limit=30) - return loki_client.format_logs(entries) - - -def gather_container_status(query: str) -> str: - """Status eines einzelnen Containers.""" - cfg = _load_config() - vmid = None - name = None - - m = re.search(r'\b(\d{3})\b', query) - if m: - vmid = int(m.group(1)) - else: - name = query.strip() - - ct = config.get_container(cfg, vmid=vmid, name=name) - if not ct: - return f"Container nicht gefunden: {query}" - - host_ip = proxmox_client.PROXMOX_HOSTS.get(ct.host) - if not host_ip: - return f"Host nicht erreichbar: {ct.host}" - - token = _get_tokens(cfg).get(ct.host, {}) - pw = _get_passwords(cfg).get(ct.host, "") - try: - client = proxmox_client.ProxmoxClient( - host_ip, password=pw, - token_name=token.get("name", ""), - token_value=token.get("value", ""), - ) - status = client.get_container_status(ct.vmid) - except Exception as e: - return f"Proxmox-Fehler: {e}" - - mem_mb = status.get("mem", 0) // (1024 * 1024) - maxmem_mb = status.get("maxmem", 0) // (1024 * 1024) - uptime_h = status.get("uptime", 0) // 3600 - - return ( - f"CT {ct.vmid} — {ct.name}\n" - f"Host: {ct.host}\n" - f"Status: {status.get('status', '?')}\n" - f"RAM: {mem_mb}/{maxmem_mb} MB\n" - f"CPU: {status.get('cpus', '?')} Kerne\n" - f"Uptime: {uptime_h}h\n" - f"Tailscale: {ct.tailscale_ip or '—'}\n" - f"Dienste: {ct.services}" - ) - - -def gather_logs(container: str, hours: float = 1) -> str: - """Logs eines Containers aus Loki.""" - entries = loki_client.query_logs( - f'{{host="{container}"}}', hours=hours, limit=20 - ) - return loki_client.format_logs(entries) - - -def gather_health(container: str) -> str: - """Health-Check eines Containers.""" - health = loki_client.get_health(container, hours=24) - status_emoji = {"healthy": "✅", "warning": "⚠️", "critical": "🔴"}.get( - health.get("status", ""), "❓" - ) - return ( - f"{status_emoji} {health.get('host', container)}\n" - f"Status: {health.get('status', '?')}\n" - f"Fehler (24h): {health.get('errors_last_{hours}h', '?')}\n" - f"Sendet Logs: {'ja' if health.get('sending_logs') else 'nein'}" - ) - - -def gather_silence() -> str: - """Welche Hosts senden keine Logs?""" - silent = loki_client.check_silence(minutes=35) - if not silent: - return "✅ Alle Hosts senden Logs." - if silent and "error" in silent[0]: - return f"Fehler: {silent[0]['error']}" - lines = ["⚠️ Stille Hosts (keine Logs seit 35+ Min):\n"] - for s in silent: - lines.append(f" • {s['host']}") - return "\n".join(lines) - - -def gather_context_for_question(question: str) -> str: - """Sammelt relevanten Kontext für eine Freitext-Frage.""" - q = question.lower() - parts = [] - - if any(w in q for w in ["fehler", "error", "problem", "kaputt", "down"]): - parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=2)) - - if any(w in q for w in ["status", "läuft", "container", "übersicht", "alles"]): - parts.append("=== Container Status ===\n" + gather_status()) - - if any(w in q for w in ["still", "silence", "stumm", "logs"]): - parts.append("=== Stille Hosts ===\n" + gather_silence()) - - ct_match = re.search(r'\bct[- ]?(\d{3})\b', q) - if ct_match: - parts.append(f"=== CT {ct_match.group(1)} ===\n" + gather_container_status(ct_match.group(1))) - - for name in ["wordpress", "rss", "seafile", "forgejo", "portainer", - "fuenfvoracht", "redax", "flugscanner", "edelmetall"]: - if name in q: - parts.append(f"=== {name} ===\n" + gather_container_status(name)) - - if not parts: - parts.append("=== Container Status ===\n" + gather_status()) - parts.append("=== Aktuelle Fehler ===\n" + gather_errors(hours=1)) - - return "\n\n".join(parts) diff --git a/homelab-ai-bot/core/__init__.py b/homelab-ai-bot/core/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/homelab-ai-bot/core/__init__.py @@ -0,0 +1 @@ + diff --git a/homelab-ai-bot/core/config.py b/homelab-ai-bot/core/config.py new file mode 100644 index 00000000..4960c1b2 --- /dev/null +++ b/homelab-ai-bot/core/config.py @@ -0,0 +1,179 @@ +"""Parses homelab.conf — the single source of truth for infrastructure facts.""" + +import os +import re +from pathlib import Path +from dataclasses import dataclass, field + +HOMELAB_CONF_PATHS = [ + Path("/root/homelab-brain/homelab.conf"), + Path("/opt/homelab-brain/homelab.conf"), +] + + +@dataclass +class Container: + vmid: int + name: str + tailscale_ip: str + services: str + host: str # pve-hetzner, pve1, pve3 + + +@dataclass +class Tunnel: + ct_id: int + domain: str + target: str + status: str + + +@dataclass +class HomelabConfig: + raw: dict = field(default_factory=dict) + domains: dict = field(default_factory=dict) + servers: dict = field(default_factory=dict) + passwords: dict = field(default_factory=dict) + containers: list = field(default_factory=list) + telegram: dict = field(default_factory=dict) + api_keys: dict = field(default_factory=dict) + tunnels: list = field(default_factory=list) + + +def _parse_container_sections(path: Path) -> dict: + """Parse section comments to determine which host each CT_ variable belongs to.""" + section_map = { + "pve-hetzner": re.compile(r"#.*CONTAINER.*pve-hetzner", re.IGNORECASE), + "pve1": re.compile(r"#.*CONTAINER.*pve1", re.IGNORECASE), + "pve3": re.compile(r"#.*CONTAINER.*pve3", re.IGNORECASE), + } + ct_var = re.compile(r"^(CT_\d+(?:_\w+)?)\s*=") + result = {} + current_host = "pve-hetzner" + + with open(path) as f: + for line in f: + stripped = line.strip() + if stripped.startswith("#"): + for host, pattern in section_map.items(): + if pattern.search(stripped): + current_host = host + break + else: + m = ct_var.match(stripped) + if m: + result[m.group(1)] = current_host + return result + + +def find_config() -> Path: + for p in HOMELAB_CONF_PATHS: + if p.exists(): + return p + raise FileNotFoundError(f"homelab.conf not found in {HOMELAB_CONF_PATHS}") + + +def parse_config(path: Path = None) -> HomelabConfig: + if path is None: + path = find_config() + + raw = {} + with open(path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + m = re.match(r'^([A-Za-z_][A-Za-z0-9_]*)="?(.*?)"?\s*$', line) + if m: + raw[m.group(1)] = m.group(2) + + cfg = HomelabConfig(raw=raw) + + for k, v in raw.items(): + if k.startswith("DOMAIN_"): + cfg.domains[k.replace("DOMAIN_", "").lower()] = v + elif k.startswith("SRV_"): + cfg.servers[k.replace("SRV_", "").lower()] = v + elif k.startswith("PW_"): + cfg.passwords[k.replace("PW_", "").lower()] = v + elif k.startswith("TG_"): + cfg.telegram[k.lower()] = v + elif k.startswith("FORGEJO_") or k.startswith("GITHUB_") or k.startswith("OPENROUTER_"): + cfg.api_keys[k.lower()] = v + + ct_pattern = re.compile(r"^CT_(\d+)(?:_(PVE\d+))?$") + section_hosts = _parse_container_sections(path) + + for k, v in raw.items(): + m = ct_pattern.match(k) + if m: + vmid = int(m.group(1)) + explicit_host = m.group(2) + if explicit_host: + host = {"PVE1": "pve1", "PVE3": "pve3"}.get(explicit_host, explicit_host.lower()) + else: + host = section_hosts.get(k, "pve-hetzner") + + parts = v.split("|") + if len(parts) >= 3: + cfg.containers.append(Container( + vmid=vmid, + name=parts[0], + tailscale_ip=parts[1] if parts[1] != "—" else "", + services=parts[2], + host=host, + )) + + for k, v in raw.items(): + m = re.match(r"^TUNNEL_(\d+)(?:_\w+)?$", k) + if m: + ct_id = int(m.group(1)) + parts = v.split("|") + if len(parts) >= 3: + cfg.tunnels.append(Tunnel( + ct_id=ct_id, + domain=parts[0], + target=parts[1], + status=parts[2], + )) + + cfg.containers.sort(key=lambda c: (c.host, c.vmid)) + return cfg + + +def get_container(cfg: HomelabConfig, vmid: int = None, name: str = None) -> Container | None: + for c in cfg.containers: + if vmid and c.vmid == vmid: + return c + if name and name.lower() in c.name.lower(): + return c + return None + + +def format_overview(cfg: HomelabConfig) -> str: + lines = ["# Homelab Infrastructure (from homelab.conf)\n"] + + lines.append("## Domains") + for k, v in cfg.domains.items(): + lines.append(f"- {k}: {v}") + + lines.append("\n## Servers (Tailscale)") + for k, v in cfg.servers.items(): + lines.append(f"- {k}: {v}") + + current_host = None + for c in cfg.containers: + if c.host != current_host: + current_host = c.host + lines.append(f"\n## Containers on {current_host}") + lines.append("| CT | Name | Tailscale | Services |") + lines.append("|---|---|---|---|") + ts = c.tailscale_ip or "—" + lines.append(f"| {c.vmid} | {c.name} | {ts} | {c.services} |") + + if cfg.tunnels: + lines.append("\n## Cloudflare Tunnels") + for t in cfg.tunnels: + lines.append(f"- CT {t.ct_id}: {t.domain} → {t.target} ({t.status})") + + return "\n".join(lines) diff --git a/homelab-ai-bot/core/loki_client.py b/homelab-ai-bot/core/loki_client.py new file mode 100644 index 00000000..ca885897 --- /dev/null +++ b/homelab-ai-bot/core/loki_client.py @@ -0,0 +1,130 @@ +"""Loki API client for querying centralized logs.""" + +import requests +from datetime import datetime, timezone, timedelta + +LOKI_URL = "http://100.109.206.43:3100" + + +def _query(endpoint: str, params: dict, base_url: str = None) -> dict: + url = f"{base_url or LOKI_URL}{endpoint}" + try: + r = requests.get(url, params=params, timeout=10) + r.raise_for_status() + return r.json() + except requests.RequestException as e: + return {"error": str(e)} + + +def _ns(dt: datetime) -> str: + return str(int(dt.timestamp() * 1e9)) + + +def query_logs(query: str, hours: float = 1, limit: int = 100) -> list[dict]: + """Run a LogQL query and return log entries.""" + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + data = _query("/loki/api/v1/query_range", { + "query": query, + "start": _ns(start), + "end": _ns(now), + "limit": limit, + "direction": "backward", + }) + if "error" in data: + return [{"error": data["error"]}] + + entries = [] + for stream in data.get("data", {}).get("result", []): + labels = stream.get("stream", {}) + for ts, line in stream.get("values", []): + entries.append({ + "timestamp": ts, + "host": labels.get("host", labels.get("job", "unknown")), + "line": line, + }) + return entries + + +def get_errors(container: str = None, hours: float = 1, limit: int = 50) -> list[dict]: + """Get error-level logs, optionally filtered by container hostname.""" + if container: + q = f'{{host="{container}"}} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"' + else: + q = '{job=~".+"} |~ "(?i)(error|fatal|panic|traceback|exception)" !~ "caller=metrics|query_hash=|executing query|scheduler_processor|Aborted connection|systemd-networkd-wait-online|context canceled|AH01630: client denied"' + return query_logs(q, hours=hours, limit=limit) + + +def get_labels() -> list[str]: + """Get all available label values for 'host'.""" + data = _query("/loki/api/v1/label/host/values", {}) + if "error" in data: + return [] + return data.get("data", []) + + +def check_silence(minutes: int = 35) -> list[dict]: + """Find hosts that haven't sent logs within the given timeframe.""" + all_hosts = get_labels() + if not all_hosts: + return [{"error": "Could not fetch host labels from Loki"}] + + now = datetime.now(timezone.utc) + start = now - timedelta(minutes=minutes) + silent = [] + + for host in all_hosts: + data = _query("/loki/api/v1/query_range", { + "query": f'count_over_time({{host="{host}"}}[{minutes}m])', + "start": _ns(start), + "end": _ns(now), + "limit": 1, + }) + results = data.get("data", {}).get("result", []) + has_logs = any( + int(v[1]) > 0 + for r in results + for v in r.get("values", []) + ) + if not has_logs: + silent.append({"host": host, "silent_minutes": minutes}) + + return silent + + +def get_health(container: str, hours: float = 24) -> dict: + """Get a health summary for a specific container.""" + errors = get_errors(container=container, hours=hours, limit=200) + error_count = len([e for e in errors if "error" not in e]) + + recent = query_logs(f'{{host="{container}"}}', hours=0.5, limit=5) + has_recent = len([e for e in recent if "error" not in e]) > 0 + + return { + "host": container, + "errors_last_{hours}h": error_count, + "sending_logs": has_recent, + "status": "healthy" if error_count < 5 and has_recent else + "warning" if error_count < 20 else "critical", + } + + +def format_logs(entries: list[dict], max_lines: int = 30) -> str: + """Format log entries for human/LLM consumption.""" + if not entries: + return "No log entries found." + if entries and "error" in entries[0]: + return f"Loki error: {entries[0]['error']}" + + lines = [] + for e in entries[:max_lines]: + host = e.get("host", "?") + line = e.get("line", "").strip() + if len(line) > 200: + line = line[:200] + "..." + lines.append(f"[{host}] {line}") + + total = len(entries) + if total > max_lines: + lines.append(f"\n... and {total - max_lines} more entries") + return "\n".join(lines) diff --git a/homelab-ai-bot/core/prometheus_client.py b/homelab-ai-bot/core/prometheus_client.py new file mode 100644 index 00000000..f53bd305 --- /dev/null +++ b/homelab-ai-bot/core/prometheus_client.py @@ -0,0 +1,133 @@ +"""Prometheus API client for querying system metrics.""" + +import requests +from datetime import datetime, timezone, timedelta + +PROMETHEUS_URL = "http://100.88.230.59:9090" + + +def _query(endpoint: str, params: dict, base_url: str = None) -> dict: + url = f"{base_url or PROMETHEUS_URL}{endpoint}" + try: + r = requests.get(url, params=params, timeout=10) + r.raise_for_status() + return r.json() + except requests.RequestException as e: + return {"error": str(e), "status": "unavailable"} + + +def instant_query(query: str) -> dict: + """Run an instant PromQL query.""" + return _query("/api/v1/query", {"query": query}) + + +def range_query(query: str, hours: float = 1, step: str = "5m") -> dict: + """Run a range PromQL query.""" + now = datetime.now(timezone.utc) + start = now - timedelta(hours=hours) + return _query("/api/v1/query_range", { + "query": query, + "start": start.isoformat(), + "end": now.isoformat(), + "step": step, + }) + + +def get_targets() -> list[dict]: + """Get all Prometheus scrape targets with their status.""" + data = _query("/api/v1/targets", {}) + if "error" in data: + return [{"error": data["error"]}] + + targets = [] + for t in data.get("data", {}).get("activeTargets", []): + targets.append({ + "job": t.get("labels", {}).get("job", "unknown"), + "instance": t.get("labels", {}).get("instance", "unknown"), + "health": t.get("health", "unknown"), + "last_scrape": t.get("lastScrape", ""), + }) + return targets + + +def is_available() -> bool: + """Check if Prometheus is reachable.""" + data = _query("/api/v1/query", {"query": "up"}) + return "error" not in data or data.get("status") == "success" + + +def get_cpu(instance: str = None, hours: float = 1) -> dict: + """Get CPU usage. If instance given, filter to that instance.""" + if instance: + q = f'100 - (avg by (instance) (rate(node_cpu_seconds_total{{mode="idle", instance=~"{instance}.*"}}[5m])) * 100)' + else: + q = '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)' + data = instant_query(q) + return _extract_metrics(data, "cpu_percent") + + +def get_memory(instance: str = None) -> dict: + """Get memory usage percentage.""" + if instance: + q = f'(1 - node_memory_MemAvailable_bytes{{instance=~"{instance}.*"}} / node_memory_MemTotal_bytes{{instance=~"{instance}.*"}}) * 100' + else: + q = '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100' + data = instant_query(q) + return _extract_metrics(data, "memory_percent") + + +def get_disk(instance: str = None) -> dict: + """Get root filesystem usage percentage.""" + if instance: + q = f'(1 - node_filesystem_avail_bytes{{mountpoint="/", instance=~"{instance}.*"}} / node_filesystem_size_bytes{{mountpoint="/", instance=~"{instance}.*"}}) * 100' + else: + q = '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100' + data = instant_query(q) + return _extract_metrics(data, "disk_percent") + + +def get_overview() -> str: + """Get a formatted overview of all monitored instances.""" + if not is_available(): + return "Prometheus is not reachable at " + PROMETHEUS_URL + + targets = get_targets() + if not targets: + return "No Prometheus targets found." + + lines = ["## Prometheus Targets\n"] + for t in targets: + status = "UP" if t["health"] == "up" else "DOWN" + lines.append(f"- [{status}] {t['job']} ({t['instance']})") + + cpu = get_cpu() + if "results" in cpu: + lines.append("\n## CPU Usage") + for r in cpu["results"]: + lines.append(f"- {r['instance']}: {r['value']:.1f}%") + + mem = get_memory() + if "results" in mem: + lines.append("\n## Memory Usage") + for r in mem["results"]: + lines.append(f"- {r['instance']}: {r['value']:.1f}%") + + disk = get_disk() + if "results" in disk: + lines.append("\n## Disk Usage (/)") + for r in disk["results"]: + lines.append(f"- {r['instance']}: {r['value']:.1f}%") + + return "\n".join(lines) + + +def _extract_metrics(data: dict, metric_name: str) -> dict: + if "error" in data and data.get("status") != "success": + return {"error": data.get("error", "unknown error")} + + results = [] + for r in data.get("data", {}).get("result", []): + instance = r.get("metric", {}).get("instance", "unknown") + value = float(r.get("value", [0, 0])[1]) + results.append({"instance": instance, "value": value, "metric": metric_name}) + return {"results": results} diff --git a/homelab-ai-bot/core/proxmox_client.py b/homelab-ai-bot/core/proxmox_client.py new file mode 100644 index 00000000..c13b5de9 --- /dev/null +++ b/homelab-ai-bot/core/proxmox_client.py @@ -0,0 +1,138 @@ +"""Proxmox REST API client for querying infrastructure state.""" + +import requests +import urllib3 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +PROXMOX_HOSTS = { + "pve-hetzner": "100.88.230.59", + "pve1": "100.122.56.60", + "pve3": "100.109.101.12", +} + + +class ProxmoxClient: + def __init__(self, host_ip: str, user: str = "root@pam", + password: str = "", token_name: str = "", token_value: str = ""): + self.base_url = f"https://{host_ip}:8006/api2/json" + self.user = user + self.password = password + self.token_name = token_name + self.token_value = token_value + self._ticket = None + self._csrf = None + + def _auth_header(self) -> dict: + if self.token_name and self.token_value: + return {"Authorization": f"PVEAPIToken={self.user}!{self.token_name}={self.token_value}"} + if self._ticket: + return {} + try: + r = requests.post( + f"{self.base_url}/access/ticket", + data={"username": self.user, "password": self.password}, + verify=False, timeout=10, + ) + r.raise_for_status() + data = r.json()["data"] + self._ticket = data["ticket"] + self._csrf = data["CSRFPreventionToken"] + except requests.RequestException as e: + raise ConnectionError(f"Proxmox auth failed for {self.base_url}: {e}") + return {} + + def _get(self, path: str) -> dict: + headers = self._auth_header() + cookies = {} + if self._ticket: + cookies["PVEAuthCookie"] = self._ticket + headers["CSRFPreventionToken"] = self._csrf + r = requests.get( + f"{self.base_url}{path}", + cookies=cookies, headers=headers, + verify=False, timeout=10, + ) + r.raise_for_status() + return r.json().get("data", {}) + + def get_node_status(self) -> dict: + nodes = self._get("/nodes") + if isinstance(nodes, list): + return nodes[0] if nodes else {} + return nodes + + def get_containers(self) -> list[dict]: + nodes = self._get("/nodes") + if not isinstance(nodes, list): + return [] + node_name = nodes[0]["node"] + return self._get(f"/nodes/{node_name}/lxc") + + def get_container_status(self, vmid: int) -> dict: + nodes = self._get("/nodes") + if not isinstance(nodes, list): + return {"error": "no nodes"} + node_name = nodes[0]["node"] + return self._get(f"/nodes/{node_name}/lxc/{vmid}/status/current") + + +def get_all_containers(passwords: dict = None, tokens: dict = None) -> list[dict]: + """Query all Proxmox hosts and return combined container list.""" + if passwords is None: + passwords = {} + if tokens is None: + tokens = {} + + all_cts = [] + for host_name, host_ip in PROXMOX_HOSTS.items(): + token = tokens.get(host_name, {}) + pw = passwords.get(host_name, passwords.get("default", "")) + try: + client = ProxmoxClient( + host_ip, password=pw, + token_name=token.get("name", ""), + token_value=token.get("value", ""), + ) + containers = client.get_containers() + for ct in containers: + ct["_host"] = host_name + ct["_host_ip"] = host_ip + all_cts.extend(containers) + except Exception as e: + all_cts.append({ + "_host": host_name, + "_host_ip": host_ip, + "error": str(e), + }) + return all_cts + + +def format_containers(containers: list[dict]) -> str: + """Format container list for human/LLM consumption.""" + if not containers: + return "No containers found." + + lines = [] + current_host = None + for ct in sorted(containers, key=lambda c: (c.get("_host", ""), c.get("vmid", 0))): + host = ct.get("_host", "unknown") + if host != current_host: + current_host = host + lines.append(f"\n## {host}") + lines.append("| CT | Name | Status | CPU | RAM (MB) |") + lines.append("|---|---|---|---|---|") + + if "error" in ct: + lines.append(f"| — | ERROR | {ct['error'][:60]} | — | — |") + continue + + vmid = ct.get("vmid", "?") + name = ct.get("name", "?") + status = ct.get("status", "?") + cpus = ct.get("cpus", "?") + mem_mb = ct.get("mem", 0) // (1024 * 1024) if ct.get("mem") else 0 + maxmem_mb = ct.get("maxmem", 0) // (1024 * 1024) if ct.get("maxmem") else 0 + lines.append(f"| {vmid} | {name} | {status} | {cpus} | {mem_mb}/{maxmem_mb} |") + + return "\n".join(lines) diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py index 89066456..e69de29b 100644 --- a/homelab-ai-bot/llm.py +++ b/homelab-ai-bot/llm.py @@ -1,44 +0,0 @@ -"""OpenRouter LLM-Wrapper für natürliche Antworten.""" - -import requests -import os -import sys - -sys.path.insert(0, os.path.dirname(__file__)) -from core import config - -MODEL = "openai/gpt-4o-mini" -SYSTEM_PROMPT = """Du bist der Hausmeister-Bot für ein Homelab mit mehreren Proxmox-Servern. -Du antwortest kurz, präzise und auf Deutsch. -Du bekommst Live-Daten aus Loki (Logs), Proxmox (Container-Status) und homelab.conf. -Wenn alles in Ordnung ist, sag das kurz. Bei Problemen erkläre was los ist und schlage Lösungen vor. -Nutze Emojis sparsam. Formatiere für Telegram (kein Markdown, nur einfacher Text).""" - - -def _get_api_key() -> str: - cfg = config.parse_config() - return cfg.api_keys.get("openrouter_key", "") - - -def ask(question: str, context: str) -> str: - """Stellt eine Frage mit Kontext an OpenRouter.""" - api_key = _get_api_key() - if not api_key: - return "OpenRouter API Key fehlt in homelab.conf" - - messages = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": f"Kontext (Live-Daten):\n{context}\n\nFrage: {question}"}, - ] - - try: - r = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={"Authorization": f"Bearer {api_key}"}, - json={"model": MODEL, "messages": messages, "max_tokens": 500}, - timeout=30, - ) - r.raise_for_status() - return r.json()["choices"][0]["message"]["content"] - except Exception as e: - return f"LLM-Fehler: {e}" diff --git a/homelab-ai-bot/monitor.py b/homelab-ai-bot/monitor.py index a9a609c3..e69de29b 100644 --- a/homelab-ai-bot/monitor.py +++ b/homelab-ai-bot/monitor.py @@ -1,138 +0,0 @@ -"""Proaktives Monitoring — regelbasiert (Stufe 1) + KI (Stufe 2).""" - -import sys -import os -import requests - -sys.path.insert(0, os.path.dirname(__file__)) -from core import config, loki_client, proxmox_client - - -def _get_tokens(cfg): - tokens = {} - tn = cfg.raw.get("PVE_TOKEN_HETZNER_NAME", "") - tv = cfg.raw.get("PVE_TOKEN_HETZNER_VALUE", "") - if tn and tv: - tokens["pve-hetzner"] = {"name": tn, "value": tv} - return tokens - - -def _get_passwords(cfg): - return { - "pve-hetzner": cfg.passwords.get("hetzner", ""), - "pve1": cfg.passwords.get("default", ""), - "pve3": cfg.passwords.get("default", ""), - "default": cfg.passwords.get("default", ""), - } - - -CRITICAL_CONTAINERS = [101, 109, 111, 112, 113, 115] - - -def check_all() -> list[str]: - """Regelbasierter Check (Stufe 1). Gibt Liste von Alarmen zurück.""" - cfg = config.parse_config() - alerts = [] - - containers = proxmox_client.get_all_containers( - _get_passwords(cfg), _get_tokens(cfg) - ) - for ct in containers: - if "error" in ct: - continue - vmid = ct.get("vmid", 0) - name = ct.get("name", "?") - status = ct.get("status", "unknown") - if vmid in CRITICAL_CONTAINERS and status != "running": - alerts.append(f"🔴 CT {vmid} ({name}) ist {status}!") - - mem = ct.get("mem", 0) - maxmem = ct.get("maxmem", 1) - if maxmem > 0 and mem / maxmem > 0.90: - pct = int(mem / maxmem * 100) - alerts.append(f"⚠️ CT {vmid} ({name}) RAM bei {pct}%") - - errors = loki_client.get_errors(hours=0.5, limit=50) - error_lines = [e for e in errors if "error" not in e] - panic_lines = [e for e in error_lines if - any(w in e.get("line", "").lower() for w in ["panic", "fatal", "oom", "out of memory"]) - and "query=" not in e.get("line", "") - and "caller=metrics" not in e.get("line", "") - ] - if panic_lines: - hosts = set(e.get("host", "?") for e in panic_lines) - hosts.discard("${HOSTNAME}") - if hosts: - alerts.append(f"🔴 Kritische Fehler (panic/fatal/OOM) auf: {', '.join(hosts)}") - - silent = loki_client.check_silence(minutes=35) - if silent and "error" not in silent[0]: - names = [s["host"] for s in silent] - alerts.append(f"⚠️ Keine Logs seit 35+ Min: {', '.join(names)}") - - return alerts - - -def format_report() -> str: - """Tagesbericht: Gesamtstatus aller Systeme.""" - cfg = config.parse_config() - lines = ["📋 Tagesbericht Homelab\n"] - - containers = proxmox_client.get_all_containers( - _get_passwords(cfg), _get_tokens(cfg) - ) - running = [c for c in containers if c.get("status") == "running"] - stopped = [c for c in containers if c.get("status") == "stopped"] - errors_ct = [c for c in containers if "error" in c] - lines.append(f"Container: {len(running)} running, {len(stopped)} stopped, {len(errors_ct)} nicht erreichbar") - - errors = loki_client.get_errors(hours=24, limit=100) - error_count = len([e for e in errors if "error" not in e]) - lines.append(f"Fehler (24h): {error_count}") - - silent = loki_client.check_silence(minutes=35) - if silent and "error" not in (silent[0] if silent else {}): - names = [s["host"] for s in silent] - lines.append(f"Stille Hosts: {', '.join(names)}") - else: - lines.append("Stille Hosts: keine") - - alerts = check_all() - if alerts: - lines.append(f"\n⚠️ {len(alerts)} aktive Alarme:") - lines.extend(alerts) - else: - lines.append("\n✅ Keine Alarme — alles läuft.") - - return "\n".join(lines) - - -def send_alert(token: str, chat_id: str, message: str): - """Sendet eine Nachricht via Telegram.""" - requests.post( - f"https://api.telegram.org/bot{token}/sendMessage", - data={"chat_id": chat_id, "text": message}, - timeout=10, - ) - - -def run_check_and_alert(): - """Hauptfunktion für Cron: prüft und sendet Alerts falls nötig.""" - cfg = config.parse_config() - token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "") - chat_id = cfg.raw.get("TG_CHAT_ID", "") - if not token or not chat_id: - return - - alerts = check_all() - if alerts: - msg = "🔧 Hausmeister-Check\n\n" + "\n".join(alerts) - send_alert(token, chat_id, msg) - - -if __name__ == "__main__": - import sys - if len(sys.argv) > 1 and sys.argv[1] == "report": - print(format_report()) - else: - run_check_and_alert() diff --git a/homelab-ai-bot/requirements.txt b/homelab-ai-bot/requirements.txt index 8d21da30..e69de29b 100644 --- a/homelab-ai-bot/requirements.txt +++ b/homelab-ai-bot/requirements.txt @@ -1,2 +0,0 @@ -python-telegram-bot>=21.0 -requests>=2.31 diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py index bc360f43..e69de29b 100644 --- a/homelab-ai-bot/telegram_bot.py +++ b/homelab-ai-bot/telegram_bot.py @@ -1,224 +0,0 @@ -"""Orbitalo Hausmeister — Telegram Bot für Homelab-Management.""" - -import asyncio -import logging -import sys -import os - -sys.path.insert(0, os.path.dirname(__file__)) - -from telegram import BotCommand, Update -from telegram.ext import ( - Application, CommandHandler, MessageHandler, filters, ContextTypes, -) - -BOT_COMMANDS = [ - BotCommand("status", "Alle Container"), - BotCommand("errors", "Aktuelle Fehler"), - BotCommand("ct", "Container-Detail (/ct 109)"), - BotCommand("health", "Health-Check (/health wordpress)"), - BotCommand("logs", "Letzte Logs (/logs rss-manager)"), - BotCommand("silence", "Stille Hosts"), - BotCommand("report", "Tagesbericht"), - BotCommand("check", "Monitoring-Check"), - BotCommand("start", "Hilfe anzeigen"), -] - -import context -import llm -import monitor -from core import config - -logging.basicConfig( - format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", - level=logging.INFO, -) -log = logging.getLogger("hausmeister") - -ALLOWED_CHAT_IDS: set[int] = set() - - -def _load_token_and_chat(): - cfg = config.parse_config() - token = cfg.raw.get("TG_HAUSMEISTER_TOKEN", "") - chat_id = cfg.raw.get("TG_CHAT_ID", "") - if chat_id: - ALLOWED_CHAT_IDS.add(int(chat_id)) - return token - - -def _authorized(update: Update) -> bool: - if not ALLOWED_CHAT_IDS: - return True - return update.effective_chat.id in ALLOWED_CHAT_IDS - - -async def cmd_start(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - await update.message.reply_text( - "🔧 Orbitalo Hausmeister-Bot\n\n" - "Befehle:\n" - "/status — Alle Container\n" - "/errors — Aktuelle Fehler\n" - "/ct — Container-Detail\n" - "/health — Health-Check\n" - "/logs — Letzte Logs\n" - "/silence — Stille Hosts\n" - "/report — Tagesbericht\n" - "/check — Monitoring-Check\n\n" - "Oder einfach eine Frage stellen!" - ) - - -async def cmd_status(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - await update.message.reply_text("⏳ Lade Container-Status...") - try: - text = context.gather_status() - if len(text) > 4000: - text = text[:4000] + "\n..." - await update.message.reply_text(text) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_errors(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - await update.message.reply_text("⏳ Suche Fehler...") - try: - text = context.gather_errors(hours=2) - await update.message.reply_text(text[:4000]) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_ct(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - args = ctx.args - if not args: - await update.message.reply_text("Bitte CT-Nummer angeben: /ct 109") - return - try: - text = context.gather_container_status(args[0]) - await update.message.reply_text(text) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_health(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - args = ctx.args - if not args: - await update.message.reply_text("Bitte Hostname angeben: /health wordpress") - return - try: - text = context.gather_health(args[0]) - await update.message.reply_text(text) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_logs(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - args = ctx.args - if not args: - await update.message.reply_text("Bitte Hostname angeben: /logs rss-manager") - return - try: - text = context.gather_logs(args[0]) - await update.message.reply_text(text[:4000]) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_silence(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - try: - text = context.gather_silence() - await update.message.reply_text(text) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_report(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - await update.message.reply_text("⏳ Erstelle Tagesbericht...") - try: - text = monitor.format_report() - await update.message.reply_text(text[:4000]) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def cmd_check(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - if not _authorized(update): - return - await update.message.reply_text("⏳ Prüfe Systeme...") - try: - alerts = monitor.check_all() - if alerts: - text = f"⚠️ {len(alerts)} Alarme:\n\n" + "\n".join(alerts) - else: - text = "✅ Keine Alarme — alles läuft." - await update.message.reply_text(text) - except Exception as e: - await update.message.reply_text(f"Fehler: {e}") - - -async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE): - """Freitext-Fragen → Kontext sammeln → LLM → Antwort.""" - if not _authorized(update): - return - question = update.message.text - if not question: - return - - await update.message.reply_text("🤔 Denke nach...") - try: - data = context.gather_context_for_question(question) - answer = llm.ask(question, data) - await update.message.reply_text(answer[:4000]) - except Exception as e: - log.exception("Fehler bei Freitext") - await update.message.reply_text(f"Fehler: {e}") - - -def main(): - token = _load_token_and_chat() - if not token: - log.error("TG_HAUSMEISTER_TOKEN fehlt in homelab.conf!") - sys.exit(1) - - log.info("Starte Orbitalo Hausmeister-Bot...") - app = Application.builder().token(token).build() - - app.add_handler(CommandHandler("start", cmd_start)) - app.add_handler(CommandHandler("status", cmd_status)) - app.add_handler(CommandHandler("errors", cmd_errors)) - app.add_handler(CommandHandler("ct", cmd_ct)) - app.add_handler(CommandHandler("health", cmd_health)) - app.add_handler(CommandHandler("logs", cmd_logs)) - app.add_handler(CommandHandler("silence", cmd_silence)) - app.add_handler(CommandHandler("report", cmd_report)) - app.add_handler(CommandHandler("check", cmd_check)) - app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message)) - - async def post_init(application): - await application.bot.set_my_commands(BOT_COMMANDS) - log.info("Kommandomenü registriert") - - app.post_init = post_init - log.info("Bot läuft — polling gestartet") - app.run_polling(allowed_updates=Update.ALL_TYPES) - - -if __name__ == "__main__": - main()