From db61aaedaeef410d1bfb4dd8abd5fc685190240f Mon Sep 17 00:00:00 2001 From: Homelab Cursor Date: Sat, 21 Mar 2026 02:00:36 +0100 Subject: [PATCH] fix(llm): Text-Routing auf qwen3:30b-a3b, Timeout-Fallback auf qwen2.5:14b Vorher lief normaler Text ueber das grosse Vision-Modell qwen3-vl:32b, was bei Tool-Calls (z.B. get_temperaturen) regelmaessig in Read-Timeouts nach 120s resultierte. Aenderungen: - Text-Modell: qwen3:30b-a3b (MoE, schneller fuer reinen Text) - Vision-Modell: bleibt qwen3-vl:32b (nur fuer Fotos/Dokumente) - Fallback bei Timeout: qwen2.5:14b (einmaliger Retry) - Ollama-Timeout modellabhaengig (180s Text, 240s Vision, 90s Fallback) - Alle lokalen Modelle werden korrekt als Ollama erkannt --- homelab-ai-bot/llm.py | 72 +++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/homelab-ai-bot/llm.py b/homelab-ai-bot/llm.py index 9c9918de..fa4de9c0 100644 --- a/homelab-ai-bot/llm.py +++ b/homelab-ai-bot/llm.py @@ -18,9 +18,11 @@ log = logging.getLogger('llm') OLLAMA_BASE = "http://100.84.255.83:11434" OPENROUTER_BASE = "https://openrouter.ai/api/v1" -MODEL = "qwen3-vl:32b" -VISION_MODEL = "qwen3-vl:32b" +MODEL = os.environ.get("HAUSMEISTER_TEXT_MODEL", "qwen3:30b-a3b") +VISION_MODEL = os.environ.get("HAUSMEISTER_VISION_MODEL", "qwen3-vl:32b") +FALLBACK_MODEL = os.environ.get("HAUSMEISTER_TEXT_FALLBACK_MODEL", "qwen2.5:14b") MAX_TOOL_ROUNDS = 3 +OLLAMA_MODELS = {MODEL, VISION_MODEL, FALLBACK_MODEL} import datetime as _dt _TODAY = _dt.date.today() @@ -186,10 +188,34 @@ def _get_api_key() -> str: return cfg.api_keys.get("openrouter_key", "") +def _ollama_timeout_for(model: str) -> int: + if model == VISION_MODEL: + return 240 + if model == FALLBACK_MODEL: + return 90 + return 180 + + +def _add_no_think(messages: list) -> None: + for msg in reversed(messages): + if msg.get("role") != "user": + continue + content = msg.get("content", "") + if isinstance(content, str) and "/no_think" not in content: + msg["content"] = content + " /no_think" + elif isinstance(content, list): + for item in content: + if item.get("type") == "text" and "/no_think" not in item.get("text", ""): + item["text"] = item["text"] + " /no_think" + break + break + + def _call_openrouter(messages: list, api_key: str, use_tools: bool = True, - model: str = None, max_tokens: int = 4000) -> dict: + model: str = None, max_tokens: int = 4000, + allow_fallback: bool = True) -> dict: chosen = model or MODEL - use_ollama = (chosen == MODEL) + use_ollama = chosen in OLLAMA_MODELS log.info("LLM-Call: model=%s ollama=%s max_tokens=%d", chosen, use_ollama, max_tokens) payload = { @@ -204,26 +230,34 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True, if use_ollama: url = f"{OLLAMA_BASE}/v1/chat/completions" headers = {"Content-Type": "application/json"} - timeout = 120 - for msg in reversed(payload.get("messages", [])): - if msg.get("role") == "user": - content = msg.get("content", "") - if isinstance(content, str) and "/no_think" not in content: - msg["content"] = content + " /no_think" - elif isinstance(content, list): - for item in content: - if item.get("type") == "text" and "/no_think" not in item.get("text", ""): - item["text"] = item["text"] + " /no_think" - break - break + timeout = _ollama_timeout_for(chosen) + _add_no_think(payload.get("messages", [])) else: url = f"{OPENROUTER_BASE}/chat/completions" headers = {"Authorization": f"Bearer {api_key}"} timeout = 90 - r = requests.post(url, headers=headers, json=payload, timeout=timeout) - r.raise_for_status() - return r.json() + try: + r = requests.post(url, headers=headers, json=payload, timeout=timeout) + r.raise_for_status() + return r.json() + except requests.exceptions.ReadTimeout: + if use_ollama and allow_fallback and chosen == MODEL and FALLBACK_MODEL and FALLBACK_MODEL != chosen: + log.warning( + "Ollama timeout for %s after %ss, retrying with fallback model %s", + chosen, + timeout, + FALLBACK_MODEL, + ) + return _call_openrouter( + messages, + api_key, + use_tools=use_tools, + model=FALLBACK_MODEL, + max_tokens=max_tokens, + allow_fallback=False, + ) + raise def ask(question: str, context: str) -> str: