From 74e205fe98dd5015d50c01e87ad93670b352a998 Mon Sep 17 00:00:00 2001
From: root
Date: Sun, 15 Mar 2026 15:35:36 +0700
Subject: [PATCH] Voice-Modul: Whisper STT + TTS Onyx fuer Hausmeister-Bot

---
Review notes (this section is ignored by git-am and is not part of the commit):
- handle_voice: the STT/TTS HTTP calls now run in the default executor instead
  of blocking the event loop; updates without update.message are ignored.
- voice.synthesize: blank text no longer triggers an API request.
- homelab.conf: the live OPENAI_API_KEY was removed from this patch. The key
  that was posted here must be treated as compromised: revoke it and put the
  replacement only into the deployed, untracked configuration.
- Whitespace of the context lines was restored from the code structure; check
  it against the target files before applying (or use
  git apply --ignore-whitespace).

 homelab-ai-bot/telegram_bot.py | 57 ++++++++++++++++++++
 homelab-ai-bot/voice.py        | 92 ++++++++++++++++++++++++++++++++
 homelab.conf                   |  1 +
 3 files changed, 150 insertions(+)
 create mode 100644 homelab-ai-bot/voice.py

diff --git a/homelab-ai-bot/telegram_bot.py b/homelab-ai-bot/telegram_bot.py
index 81f9db08..a5339a81 100644
--- a/homelab-ai-bot/telegram_bot.py
+++ b/homelab-ai-bot/telegram_bot.py
@@ -80,6 +80,7 @@ import requests as _req
 import llm
 import memory_client
 import monitor
+import voice
 from core import config
 
 logging.basicConfig(
@@ -383,6 +384,61 @@ async def handle_memory_callback(update: Update, ctx: ContextTypes.DEFAULT_TYPE)
         await query.edit_message_text("Fehler beim Loeschen.")
 
 
+async def handle_voice(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
+    """Handle a voice message: Whisper STT -> LLM -> answer as text plus TTS audio.
+
+    The STT/TTS helpers in ``voice`` are synchronous HTTP calls (up to 30 s
+    each), so they run in the default executor instead of blocking the event
+    loop. Updates without ``update.message`` (e.g. edited messages) are ignored.
+    """
+    import asyncio as _asyncio
+    import io as _io
+
+    if not _authorized(update):
+        return
+    message = update.message
+    if message is None or not message.voice:
+        return
+
+    await message.reply_text("🎙 Höre zu...")
+    try:
+        loop = _asyncio.get_running_loop()
+        tg_file = await ctx.bot.get_file(message.voice.file_id)
+        audio_data = await tg_file.download_as_bytearray()
+
+        text = await loop.run_in_executor(None, voice.transcribe, bytes(audio_data))
+        if not text:
+            await message.reply_text("Konnte die Nachricht nicht verstehen.")
+            return
+
+        log.info("Voice transkribiert: %s", text[:100])
+        await message.reply_text(f"🗣 \"{text}\"\n\n🤔 Denke nach...")
+
+        channel_key = str(update.effective_chat.id)
+        session_id = memory_client.get_or_create_session(channel_key, source="telegram")
+
+        # NOTE(review): the LLM call below is still synchronous; it stays on the
+        # event-loop thread because it follows a write to
+        # context.last_suggest_result -- confirm thread-safety before offloading.
+        context.last_suggest_result = {"type": None, "candidate_id": None}
+        handlers = context.get_tool_handlers(session_id=session_id)
+        answer = llm.ask_with_tools(text, handlers, session_id=session_id)
+        if session_id:
+            memory_client.log_message(session_id, "user", text)
+            memory_client.log_message(session_id, "assistant", answer)
+
+        await message.reply_text(answer[:4000], reply_markup=KEYBOARD)
+
+        audio_out = await loop.run_in_executor(None, voice.synthesize, answer[:4000])
+        if audio_out:
+            await message.reply_voice(voice=_io.BytesIO(audio_out))
+        else:
+            log.warning("TTS fehlgeschlagen — nur Text gesendet")
+    except Exception as e:
+        log.exception("Fehler bei Voice-Nachricht")
+        await message.reply_text(f"Fehler: {e}")
+
+
 async def handle_message(update: Update, ctx: ContextTypes.DEFAULT_TYPE):
     """Button-Presses und Freitext-Fragen verarbeiten."""
     if not _authorized(update):
@@ -517,6 +573,7 @@ def main():
     app.add_handler(CommandHandler("feeds", cmd_feeds))
     app.add_handler(CommandHandler("memory", cmd_memory))
     app.add_handler(CallbackQueryHandler(handle_memory_callback, pattern=r"^mem_"))
+    app.add_handler(MessageHandler(filters.VOICE, handle_voice))
     app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
 
     async def post_init(application):
diff --git a/homelab-ai-bot/voice.py b/homelab-ai-bot/voice.py
new file mode 100644
index 00000000..e2e219b8
--- /dev/null
+++ b/homelab-ai-bot/voice.py
@@ -0,0 +1,92 @@
+"""Speech input (Whisper) and speech output (TTS) for the Hausmeister bot.
+
+Talks to the OpenAI API directly (not via OpenRouter).
+"""
+
+import io
+import logging
+from typing import Optional
+
+import requests
+
+from core import config
+
+log = logging.getLogger("voice")
+
+_api_key: Optional[str] = None
+
+TTS_MODEL = "tts-1"
+TTS_VOICE = "onyx"
+WHISPER_MODEL = "whisper-1"
+
+
+def _get_key() -> str:
+    """Return the OpenAI API key from the parsed config, or an empty string.
+
+    A non-empty key is cached at module level; a missing key is logged and
+    looked up again on the next call.
+    """
+    global _api_key
+    if not _api_key:
+        cfg = config.parse_config()
+        _api_key = cfg.raw.get("OPENAI_API_KEY", "")
+        if not _api_key:
+            log.error("OPENAI_API_KEY fehlt in homelab.conf")
+    return _api_key
+
+
+def transcribe(audio_bytes: bytes, filename: str = "voice.ogg") -> Optional[str]:
+    """Transcribe audio via the Whisper API.
+
+    Returns the stripped transcript (possibly an empty string), or None if the
+    key is missing, the request raises or the API answers with an error status.
+    """
+    key = _get_key()
+    if not key:
+        return None
+    try:
+        r = requests.post(
+            "https://api.openai.com/v1/audio/transcriptions",
+            headers={"Authorization": f"Bearer {key}"},
+            files={"file": (filename, io.BytesIO(audio_bytes), "audio/ogg")},
+            data={"model": WHISPER_MODEL, "language": "de"},
+            timeout=30,
+        )
+        if r.ok:
+            text = r.json().get("text", "").strip()
+            log.info("Whisper: %s", text[:80])
+            return text
+        log.warning("Whisper Fehler: %s %s", r.status_code, r.text[:200])
+    except Exception as e:
+        log.warning("Whisper Exception: %s", e)
+    return None
+
+
+def synthesize(text: str) -> Optional[bytes]:
+    """Generate speech via the TTS API.
+
+    Returns the raw response body (audio in the requested "opus" format), or
+    None if the text is blank, the key is missing or the request fails.
+    """
+    if not text or not text.strip():
+        # Nothing to speak: skip the API round trip.
+        return None
+    key = _get_key()
+    if not key:
+        return None
+    # Hard cap on the input length; presumably keeps it below the API limit.
+    text = text[:4000]
+    try:
+        r = requests.post(
+            "https://api.openai.com/v1/audio/speech",
+            headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
+            json={"model": TTS_MODEL, "input": text, "voice": TTS_VOICE, "response_format": "opus"},
+            timeout=30,
+        )
+        if r.ok:
+            log.info("TTS: %d bytes fuer %d Zeichen", len(r.content), len(text))
+            return r.content
+        log.warning("TTS Fehler: %s %s", r.status_code, r.text[:200])
+    except Exception as e:
+        log.warning("TTS Exception: %s", e)
+    return None
diff --git a/homelab.conf b/homelab.conf
index 7e3eade5..760682cc 100644
--- a/homelab.conf
+++ b/homelab.conf
@@ -176,5 +176,6 @@ FORGEJO_TOKEN="b874766bdf357bd4c32fa4369d0c588fc6193336"
 FORGEJO_SYNC_TOKEN="5402da0447b0eb6aede721a8748a08974ddc5c42"
 GITHUB_PAT="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"
 OPENROUTER_KEY="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"
+OPENAI_API_KEY=""
 MEMORY_API_TOKEN="Ai8eeQibV6Z1RWc7oNPim4PXB4vILU1nRW2-XgRcX2M"
 MEMORY_API_URL="http://100.121.192.94:8400"