fix(llm): Text-Routing auf qwen3:30b-a3b, Timeout-Fallback auf qwen2.5:14b

Vorher lief normaler Text ueber das grosse Vision-Modell qwen3-vl:32b,
was bei Tool-Calls (z.B. get_temperaturen) regelmaessig in Read-Timeouts
nach 120s resultierte.

Aenderungen:
- Text-Modell: qwen3:30b-a3b (MoE, schneller fuer reinen Text)
- Vision-Modell: bleibt qwen3-vl:32b (nur fuer Fotos/Dokumente)
- Fallback bei Timeout: qwen2.5:14b (einmaliger Retry)
- Ollama-Timeout modellabhaengig (180s Text, 240s Vision, 90s Fallback)
- Alle lokalen Modelle werden korrekt als Ollama erkannt
This commit is contained in:
Homelab Cursor 2026-03-21 02:00:36 +01:00
parent 4c2177baba
commit db61aaedae

View file

@@ -18,9 +18,11 @@ log = logging.getLogger('llm')
OLLAMA_BASE = "http://100.84.255.83:11434"
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
MODEL = "qwen3-vl:32b"
VISION_MODEL = "qwen3-vl:32b"
MODEL = os.environ.get("HAUSMEISTER_TEXT_MODEL", "qwen3:30b-a3b")
VISION_MODEL = os.environ.get("HAUSMEISTER_VISION_MODEL", "qwen3-vl:32b")
FALLBACK_MODEL = os.environ.get("HAUSMEISTER_TEXT_FALLBACK_MODEL", "qwen2.5:14b")
MAX_TOOL_ROUNDS = 3
OLLAMA_MODELS = {MODEL, VISION_MODEL, FALLBACK_MODEL}
import datetime as _dt
_TODAY = _dt.date.today()
@@ -186,10 +188,34 @@ def _get_api_key() -> str:
return cfg.api_keys.get("openrouter_key", "")
def _ollama_timeout_for(model: str) -> int:
    """Return the per-request timeout in seconds for an Ollama call.

    The vision model gets the largest budget (240s), the lightweight text
    fallback the smallest (90s); every other model uses the default text
    budget of 180s.
    """
    # FALLBACK first, VISION second: if both constants ever pointed at the
    # same model name, VISION's 240s would win, matching the original
    # first-match-wins if-chain.
    budgets = {FALLBACK_MODEL: 90, VISION_MODEL: 240}
    return budgets.get(model, 180)
def _add_no_think(messages: list) -> None:
for msg in reversed(messages):
if msg.get("role") != "user":
continue
content = msg.get("content", "")
if isinstance(content, str) and "/no_think" not in content:
msg["content"] = content + " /no_think"
elif isinstance(content, list):
for item in content:
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
item["text"] = item["text"] + " /no_think"
break
break
def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
model: str = None, max_tokens: int = 4000) -> dict:
model: str = None, max_tokens: int = 4000,
allow_fallback: bool = True) -> dict:
chosen = model or MODEL
use_ollama = (chosen == MODEL)
use_ollama = chosen in OLLAMA_MODELS
log.info("LLM-Call: model=%s ollama=%s max_tokens=%d", chosen, use_ollama, max_tokens)
payload = {
@@ -204,26 +230,34 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
if use_ollama:
url = f"{OLLAMA_BASE}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
timeout = 120
for msg in reversed(payload.get("messages", [])):
if msg.get("role") == "user":
content = msg.get("content", "")
if isinstance(content, str) and "/no_think" not in content:
msg["content"] = content + " /no_think"
elif isinstance(content, list):
for item in content:
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
item["text"] = item["text"] + " /no_think"
break
break
timeout = _ollama_timeout_for(chosen)
_add_no_think(payload.get("messages", []))
else:
url = f"{OPENROUTER_BASE}/chat/completions"
headers = {"Authorization": f"Bearer {api_key}"}
timeout = 90
try:
r = requests.post(url, headers=headers, json=payload, timeout=timeout)
r.raise_for_status()
return r.json()
except requests.exceptions.ReadTimeout:
if use_ollama and allow_fallback and chosen == MODEL and FALLBACK_MODEL and FALLBACK_MODEL != chosen:
log.warning(
"Ollama timeout for %s after %ss, retrying with fallback model %s",
chosen,
timeout,
FALLBACK_MODEL,
)
return _call_openrouter(
messages,
api_key,
use_tools=use_tools,
model=FALLBACK_MODEL,
max_tokens=max_tokens,
allow_fallback=False,
)
raise
def ask(question: str, context: str) -> str: