Vision: intelligente Bilderkennung + PDF-Support + Dokument-Handler

2026-03-16 09:11:29 +07:00 · 2026-03-16 09:11:29 +07:00 · aed9e6d28a
commit aed9e6d28a
parent 89f2c03fa0
2 changed files with 102 additions and 0 deletions
--- a/homelab-ai-bot/llm.py
+++ b/homelab-ai-bot/llm.py
@ -63,6 +63,15 @@ SESSION-RUECKBLICK:
 - Optional kurz erwaehnen was sonst noch Thema war.
 - session_search nur fuer Stichwort-Suche in ALTEN Sessions (nicht aktuelle).
 BILDERKENNUNG:
 Wenn der User ein Bild oder PDF schickt:
 - Beschreibe STRUKTURIERT was du siehst.
 - Bei Flugplaenen/Buchungen: Extrahiere ALLE Daten (Flugnummer, Datum, Abflug/Ankunft Uhrzeit, Airports mit IATA-Codes, Preis, Buchungscode, Airline, Sitzplatz, Gepaeck).
 - Bei Screenshots von Fehlern/Logs: Identifiziere das Problem, ordne es einem Container/Service zu, schlage Loesung vor.
 - Bei Rechnungen/Dokumenten: Extrahiere Betrag, Datum, Absender, Faelligkeit.
 - WICHTIG: Speichere erkannte Reiseplaene, Termine, Buchungen IMMER via memory_suggest (memory_type="plan", mit expires_at).
 - Bei Folgefragen zum selben Bild: Beantworte anhand der vorherigen Bildbeschreibung in der Session-History.
 TOOLS:
 Nutze Tools fuer Live-Daten. Wenn alles OK: kurz sagen. Bei Problemen: erklaeren + Loesung."""
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@ -435,6 +435,98 @@ async def handle_photo(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
        await update.message.reply_text(f"Fehler bei Bildanalyse: {e}")
 def _extract_pdf_text(pdf_bytes: bytes, max_pages: int = 10) -> str:
    """Extract plain text from a PDF using PyPDF2.

    Only the first ``max_pages`` pages are read. Every page that yields
    non-blank text contributes one section headed by ``--- Seite N ---``
    (N is the 1-based page number); sections are joined by a blank line.

    Args:
        pdf_bytes: Raw content of the PDF file.
        max_pages: Maximum number of leading pages to read (default 10).

    Returns:
        The concatenated page sections, or an empty string when no page
        produced text or the PDF could not be opened at all. Errors are
        logged as warnings and never raised to the caller.
    """
    try:
        # Imported lazily so a missing PyPDF2 installation ends up in the
        # handler below instead of breaking the module import.
        import io as _io
        from PyPDF2 import PdfReader

        reader = PdfReader(_io.BytesIO(pdf_bytes))
        sections = []
        for page_no, page in enumerate(reader.pages[:max_pages], start=1):
            try:
                text = page.extract_text()
            except Exception as page_err:
                # A single unreadable page must not throw away the text that
                # was (or will be) extracted from the remaining pages.
                log.warning(
                    "PDF-Extraktion Seite %d fehlgeschlagen: %s", page_no, page_err
                )
                continue
            stripped = text.strip() if text else ""
            if stripped:
                sections.append(f"--- Seite {page_no} ---\n{stripped}")
        return "\n\n".join(sections)
    except Exception as e:
        # Best effort: the caller treats "" as "no extractable text".
        log.warning("PDF-Extraktion fehlgeschlagen: %s", e)
        return ""
 async def handle_document(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Handle a document message: analyse images and PDFs sent as files.

    Routing is based on the document's MIME type:

    * ``image/*``: the file is downloaded, base64-encoded and passed to
      ``llm.ask_with_image`` together with the caption.
    * ``application/pdf``: text is extracted via ``_extract_pdf_text`` and
      passed to ``llm.ask_with_tools``; if no text can be extracted the user
      is told to send a screenshot instead.
    * anything else: the user is informed that the type is unsupported.

    When Telegram delivers no MIME type, the type is guessed from the file
    name so that such uploads are not rejected outright.

    Args:
        update: Incoming Telegram update; expected to carry a document.
        ctx: Handler context; its ``bot`` is used to fetch the file.

    Returns:
        None. Results and errors are reported to the chat via replies;
        exceptions during analysis are logged and not propagated.
    """
    if not _authorized(update):
        return
    doc = update.message.document
    if not doc:
        return

    mime = doc.mime_type or ""
    if not mime and doc.file_name:
        # The MIME type is optional and set by the sender; fall back to the
        # file extension instead of rejecting the upload as type ''.
        import mimetypes
        mime = mimetypes.guess_type(doc.file_name)[0] or ""

    caption = update.message.caption or ""
    channel_key = str(update.effective_chat.id)
    session_id = memory_client.get_or_create_session(channel_key, source="telegram")

    if mime.startswith("image/"):
        await update.message.reply_text("🔍 Analysiere Bild...")
        try:
            import base64
            tg_file = await ctx.bot.get_file(doc.file_id)
            image_data = await tg_file.download_as_bytearray()
            image_base64 = base64.b64encode(bytes(image_data)).decode("utf-8")

            # Reset the module-level suggestion state and tag the source
            # before the tool handlers are built for this request.
            context.last_suggest_result = {"type": None}
            context.set_source_type("telegram_photo")
            handlers = context.get_tool_handlers(session_id=session_id)
            answer = llm.ask_with_image(image_base64, caption, handlers, session_id=session_id)

            if session_id:
                user_msg = f"[Bild-Datei] {caption}" if caption else "[Bild-Datei gesendet]"
                memory_client.log_message(session_id, "user", user_msg)
                memory_client.log_message(session_id, "assistant", answer)

            # Reply is capped at 4000 characters (presumably to stay within
            # Telegram's message length limit).
            await update.message.reply_text(answer[:4000], reply_markup=KEYBOARD)
        except Exception as e:
            log.exception("Fehler bei Bild-Dokument")
            await update.message.reply_text(f"Fehler bei Bildanalyse: {e}")

    elif mime == "application/pdf":
        await update.message.reply_text("📄 Lese PDF...")
        try:
            tg_file = await ctx.bot.get_file(doc.file_id)
            pdf_data = await tg_file.download_as_bytearray()
            pdf_text = _extract_pdf_text(bytes(pdf_data))

            if not pdf_text:
                await update.message.reply_text(
                    "PDF enthält keinen extrahierbaren Text (evtl. gescannt/Bild-PDF).\n"
                    "Tipp: Sende einen Screenshot des PDFs als Foto — dann kann ich es per Bilderkennung lesen."
                )
                return

            question = caption if caption else "Analysiere dieses Dokument. Was sind die wichtigsten Informationen?"
            # Only the first 6000 characters of the extracted text are sent.
            full_prompt = f"{question}\n\n--- PDF-INHALT ---\n{pdf_text[:6000]}"

            context.last_suggest_result = {"type": None}
            context.set_source_type("telegram_pdf")
            handlers = context.get_tool_handlers(session_id=session_id)
            answer = llm.ask_with_tools(full_prompt, handlers, session_id=session_id)

            if session_id:
                pdf_label = f"[PDF: {doc.file_name or 'dokument.pdf'}]"
                user_msg = f"{pdf_label} {caption}" if caption else pdf_label
                memory_client.log_message(session_id, "user", user_msg)
                memory_client.log_message(session_id, "assistant", answer)

            await update.message.reply_text(answer[:4000], reply_markup=KEYBOARD)
        except Exception as e:
            log.exception("Fehler bei PDF-Analyse")
            await update.message.reply_text(f"Fehler bei PDF: {e}")

    else:
        await update.message.reply_text(
            f"Dateityp '{mime}' wird nicht unterstuetzt.\n"
            "Unterstuetzt: Bilder (JPG/PNG) und PDFs."
        )
 async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
    """Button-Presses und Freitext-Fragen verarbeiten."""
    if not _authorized(update):
@ -505,6 +597,7 @@ def main():
    app.add_handler(CommandHandler("memory", cmd_memory))
    app.add_handler(MessageHandler(filters.VOICE, handle_voice))
    app.add_handler(MessageHandler(filters.PHOTO, handle_photo))
    app.add_handler(MessageHandler(filters.Document.ALL, handle_document))
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
    async def post_init(application):