Vision: gpt-4o statt gpt-4o-mini fuer Bilderkennung, detail=high, 1200 Tokens

2026-03-16 09:24:04 +07:00 · 2026-03-16 09:24:04 +07:00 · 345d3e45ab
commit 345d3e45ab
parent c9f1985266
1 changed file with 19 additions and 9 deletions
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@@ -13,6 +13,7 @@ sys.path.insert(0, os.path.dirname(__file__))
 from core import config

 MODEL = "openai/gpt-4o-mini"
+VISION_MODEL = "openai/gpt-4o"
 MAX_TOOL_ROUNDS = 3

 SYSTEM_PROMPT = """Du bist der Hausmeister-Bot fuer ein Homelab. Deutsch, kurz, direkt, operativ.
@@ -375,11 +376,12 @@ def _get_api_key() -> str:
    return cfg.api_keys.get("openrouter_key", "")


-def _call_openrouter(messages: list, api_key: str, use_tools: bool = True) -> dict:
+def _call_openrouter(messages: list, api_key: str, use_tools: bool = True,
+                     model: str = None, max_tokens: int = 600) -> dict:
    payload = {
-        "model": MODEL,
+        "model": model or MODEL,
        "messages": messages,
-        "max_tokens": 600,
+        "max_tokens": max_tokens,
    }
    if use_tools:
        payload["tools"] = TOOLS
@@ -389,7 +391,7 @@ def _call_openrouter(messages: list, api_key: str, use_tools: bool = True) -> di
        "https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
-        timeout=60,
+        timeout=90,
    )
    r.raise_for_status()
    return r.json()
@@ -519,10 +521,16 @@ def ask_with_image(image_base64: str, caption: str, tool_handlers: dict, session
    except Exception:
        memory_block = ""

-    prompt_text = caption if caption else "Was siehst du auf diesem Bild? Beschreibe was du erkennst."
+    default_prompt = (
+        "Lies dieses Bild/Dokument VOLLSTAENDIG und GENAU. "
+        "Extrahiere ALLE sichtbaren Texte, Zahlen, Daten, Namen. "
+        "Strukturiere die Informationen uebersichtlich. "
+        "Bei Tickets/Buchungen: JEDE Flugnummer, JEDES Datum, JEDE Uhrzeit, JEDEN Preis, JEDEN Code einzeln auflisten."
+    )
+    prompt_text = caption if caption else default_prompt
    user_content = [
        {"type": "text", "text": prompt_text},
-        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}},
+        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}", "detail": "high"}},
    ]

    messages = [
@@ -545,7 +553,8 @@ def ask_with_image(image_base64: str, caption: str, tool_handlers: dict, session

    try:
        for _round in range(MAX_TOOL_ROUNDS):
-            data = _call_openrouter(messages, api_key, use_tools=True)
+            data = _call_openrouter(messages, api_key, use_tools=True,
+                                    model=VISION_MODEL, max_tokens=1200)
            choice = data["choices"][0]
            msg = choice["message"]

@@ -577,8 +586,9 @@ def ask_with_image(image_base64: str, caption: str, tool_handlers: dict, session
                    "content": str(result)[:3000],
                })

-        data = _call_openrouter(messages, api_key, use_tools=False)
+        data = _call_openrouter(messages, api_key, use_tools=False,
+                               model=VISION_MODEL, max_tokens=1200)
        return data["choices"][0]["message"]["content"]

    except Exception as e:
-        return f"LLM-Fehler: {e}"
+        return f"Vision-LLM-Fehler: {e}"