From db61aaedaeef410d1bfb4dd8abd5fc685190240f Mon Sep 17 00:00:00 2001
From: Homelab Cursor <homelab@orbitalo.net>
Date: Sat, 21 Mar 2026 02:00:36 +0100
Subject: [PATCH] fix(llm): Text-Routing auf qwen3:30b-a3b, Timeout-Fallback
 auf qwen2.5:14b

Vorher lief normaler Text ueber das grosse Vision-Modell qwen3-vl:32b,
was bei Tool-Calls (z.B. get_temperaturen) regelmaessig in Read-Timeouts
nach 120s resultierte.

Aenderungen:
- Text-Modell: qwen3:30b-a3b (MoE, schneller fuer reinen Text)
- Vision-Modell: bleibt qwen3-vl:32b (nur fuer Fotos/Dokumente)
- Fallback bei Read-Timeout des Text-Modells: qwen2.5:14b (einmaliger Retry, nicht fuer Vision)
- Ollama-Timeout modellabhaengig (180s Text, 240s Vision, 90s Fallback)
- Alle lokalen Modelle werden korrekt als Ollama erkannt
---
 homelab-ai-bot/llm.py | 72 +++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py
index 9c9918de..fa4de9c0 100644
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@@ -18,9 +18,11 @@ log = logging.getLogger('llm')
 OLLAMA_BASE = "http://100.84.255.83:11434"
 OPENROUTER_BASE = "https://openrouter.ai/api/v1"
 
-MODEL = "qwen3-vl:32b"
-VISION_MODEL = "qwen3-vl:32b"
+MODEL = os.environ.get("HAUSMEISTER_TEXT_MODEL", "qwen3:30b-a3b")
+VISION_MODEL = os.environ.get("HAUSMEISTER_VISION_MODEL", "qwen3-vl:32b")
+FALLBACK_MODEL = os.environ.get("HAUSMEISTER_TEXT_FALLBACK_MODEL", "qwen2.5:14b")
 MAX_TOOL_ROUNDS = 3
+OLLAMA_MODELS = {MODEL, VISION_MODEL, FALLBACK_MODEL}
 
 import datetime as _dt
 _TODAY = _dt.date.today()
@@ -186,10 +188,34 @@ def _get_api_key() -> str:
     return cfg.api_keys.get("openrouter_key", "")
 
 
+def _ollama_timeout_for(model: str) -> int:  # request timeout in seconds for `model`
+    if model == VISION_MODEL:  # checked first: wins if env overrides make two model names equal
+        return 240
+    if model == FALLBACK_MODEL:
+        return 90
+    return 180  # default: applies to MODEL and to any other model name
+
+
+def _add_no_think(messages: list) -> None:  # appends " /no_think" in place; returns nothing
+    for msg in reversed(messages):  # walk backwards to reach the most recent user message
+        if msg.get("role") != "user":
+            continue
+        content = msg.get("content", "")
+        if isinstance(content, str) and "/no_think" not in content:  # skip if already tagged
+            msg["content"] = content + " /no_think"
+        elif isinstance(content, list):  # multi-part content; assumes each part is a dict - TODO confirm
+            for item in content:  # NOTE(review): if the first text part is already tagged, a later one gets tagged
+                if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
+                    item["text"] = item["text"] + " /no_think"
+                    break  # tag at most one text part per call
+        break  # only the latest user message is ever considered
+
+
 def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
-                     model: str = None, max_tokens: int = 4000) -> dict:
+                     model: str = None, max_tokens: int = 4000,
+                     allow_fallback: bool = True) -> dict:
     chosen = model or MODEL
-    use_ollama = (chosen == MODEL)
+    use_ollama = chosen in OLLAMA_MODELS
     log.info("LLM-Call: model=%s ollama=%s max_tokens=%d", chosen, use_ollama, max_tokens)
 
     payload = {
@@ -204,26 +230,34 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
     if use_ollama:
         url = f"{OLLAMA_BASE}/v1/chat/completions"
         headers = {"Content-Type": "application/json"}
-        timeout = 120
-        for msg in reversed(payload.get("messages", [])):
-            if msg.get("role") == "user":
-                content = msg.get("content", "")
-                if isinstance(content, str) and "/no_think" not in content:
-                    msg["content"] = content + " /no_think"
-                elif isinstance(content, list):
-                    for item in content:
-                        if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
-                            item["text"] = item["text"] + " /no_think"
-                            break
-                break
+        timeout = _ollama_timeout_for(chosen)
+        _add_no_think(payload.get("messages", []))
     else:
         url = f"{OPENROUTER_BASE}/chat/completions"
         headers = {"Authorization": f"Bearer {api_key}"}
         timeout = 90
 
-    r = requests.post(url, headers=headers, json=payload, timeout=timeout)
-    r.raise_for_status()
-    return r.json()
+    try:
+        r = requests.post(url, headers=headers, json=payload, timeout=timeout)
+        r.raise_for_status()
+        return r.json()
+    except requests.exceptions.ReadTimeout:
+        if use_ollama and allow_fallback and chosen == MODEL and FALLBACK_MODEL and FALLBACK_MODEL != chosen:
+            log.warning(
+                "Ollama timeout for %s after %ss, retrying with fallback model %s",
+                chosen,
+                timeout,
+                FALLBACK_MODEL,
+            )
+            return _call_openrouter(
+                messages,
+                api_key,
+                use_tools=use_tools,
+                model=FALLBACK_MODEL,
+                max_tokens=max_tokens,
+                allow_fallback=False,
+            )
+        raise
 
 
 def ask(question: str, context: str) -> str: