fix(llm): Text-Routing auf qwen3:30b-a3b, Timeout-Fallback auf qwen2.5:14b

Vorher lief normaler Text ueber das grosse Vision-Modell qwen3-vl:32b,
was bei Tool-Calls (z.B. get_temperaturen) regelmaessig in Read-Timeouts
nach 120s resultierte.

Aenderungen:
- Text-Modell: qwen3:30b-a3b (MoE, schneller fuer reinen Text)
- Vision-Modell: bleibt qwen3-vl:32b (nur fuer Fotos/Dokumente)
- Fallback bei Timeout: qwen2.5:14b (einmaliger Retry)
- Ollama-Timeout modellabhaengig (180s Text, 240s Vision, 90s Fallback)
- Alle lokalen Modelle werden korrekt als Ollama erkannt
This commit is contained in:
Homelab Cursor 2026-03-21 02:00:36 +01:00
parent 4c2177baba
commit db61aaedae

View file

@@ -18,9 +18,11 @@ log = logging.getLogger('llm')
OLLAMA_BASE = "http://100.84.255.83:11434"
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
MODEL = "qwen3-vl:32b"
VISION_MODEL = "qwen3-vl:32b"
MODEL = os.environ.get("HAUSMEISTER_TEXT_MODEL", "qwen3:30b-a3b")
VISION_MODEL = os.environ.get("HAUSMEISTER_VISION_MODEL", "qwen3-vl:32b")
FALLBACK_MODEL = os.environ.get("HAUSMEISTER_TEXT_FALLBACK_MODEL", "qwen2.5:14b")
MAX_TOOL_ROUNDS = 3
OLLAMA_MODELS = {MODEL, VISION_MODEL, FALLBACK_MODEL}
import datetime as _dt
_TODAY = _dt.date.today()
@@ -186,10 +188,34 @@ def _get_api_key() -> str:
return cfg.api_keys.get("openrouter_key", "")
def _ollama_timeout_for(model: str) -> int:
    """Return the per-request timeout in seconds for an Ollama call.

    The vision model gets the largest budget (240s), the lightweight text
    fallback the smallest (90s); every other model uses the default text
    budget of 180s.
    """
    # FALLBACK first, VISION second: if both constants ever pointed at the
    # same model name, VISION's 240s would win, matching the original
    # first-match-wins if-chain.
    budgets = {FALLBACK_MODEL: 90, VISION_MODEL: 240}
    return budgets.get(model, 180)
def _add_no_think(messages: list) -> None:
for msg in reversed(messages):
if msg.get("role") != "user":
continue
content = msg.get("content", "")
if isinstance(content, str) and "/no_think" not in content:
msg["content"] = content + " /no_think"
elif isinstance(content, list):
for item in content:
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
item["text"] = item["text"] + " /no_think"
break
break
def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
model: str = None, max_tokens: int = 4000) -> dict:
model: str = None, max_tokens: int = 4000,
allow_fallback: bool = True) -> dict:
chosen = model or MODEL
use_ollama = (chosen == MODEL)
use_ollama = chosen in OLLAMA_MODELS
log.info("LLM-Call: model=%s ollama=%s max_tokens=%d", chosen, use_ollama, max_tokens)
payload = {
@@ -204,26 +230,34 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
if use_ollama:
url = f"{OLLAMA_BASE}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
timeout = 120
for msg in reversed(payload.get("messages", [])):
if msg.get("role") == "user":
content = msg.get("content", "")
if isinstance(content, str) and "/no_think" not in content:
msg["content"] = content + " /no_think"
elif isinstance(content, list):
for item in content:
if item.get("type") == "text" and "/no_think" not in item.get("text", ""):
item["text"] = item["text"] + " /no_think"
break
break
timeout = _ollama_timeout_for(chosen)
_add_no_think(payload.get("messages", []))
else:
url = f"{OPENROUTER_BASE}/chat/completions"
headers = {"Authorization": f"Bearer {api_key}"}
timeout = 90
try:
r = requests.post(url, headers=headers, json=payload, timeout=timeout)
r.raise_for_status()
return r.json()
except requests.exceptions.ReadTimeout:
if use_ollama and allow_fallback and chosen == MODEL and FALLBACK_MODEL and FALLBACK_MODEL != chosen:
log.warning(
"Ollama timeout for %s after %ss, retrying with fallback model %s",
chosen,
timeout,
FALLBACK_MODEL,
)
return _call_openrouter(
messages,
api_key,
use_tools=use_tools,
model=FALLBACK_MODEL,
max_tokens=max_tokens,
allow_fallback=False,
)
raise
def ask(question: str, context: str) -> str: