feat(rag): Cross-Encoder Reranking via CT123 bge-reranker-v2-m3
This commit is contained in:
parent
d718c6d2c2
commit
da0d1cd16c
1 changed files with 72 additions and 44 deletions
|
|
@ -20,6 +20,11 @@ KB_ID = "dc24edda27a311f19fe7fb811de6f016"
|
||||||
OLLAMA_EMBED_URL = "http://100.84.255.83:11434/api/embeddings"
|
OLLAMA_EMBED_URL = "http://100.84.255.83:11434/api/embeddings"
|
||||||
EMBED_MODEL = "nomic-embed-text"
|
EMBED_MODEL = "nomic-embed-text"
|
||||||
|
|
||||||
|
# Cross-Encoder Reranking (CT 123, pve-hetzner LAN)
|
||||||
|
RERANKER_URL = "http://10.10.10.123:8099"
|
||||||
|
RERANK_CANDIDATES = 30
|
||||||
|
RERANK_TIMEOUT = 120
|
||||||
|
|
||||||
MIN_TOP_K = 5
|
MIN_TOP_K = 5
|
||||||
# Breite Übersichten: mehr ES-Runden, mehr distinct Treffer (pro vollem Pfad docnm_kwd)
|
# Breite Übersichten: mehr ES-Runden, mehr distinct Treffer (pro vollem Pfad docnm_kwd)
|
||||||
MAX_TOP_K_NORMAL = 25
|
MAX_TOP_K_NORMAL = 25
|
||||||
|
|
@ -68,7 +73,6 @@ Du hast Zugriff auf eine private Wissensbasis mit >21.000 Dokumenten (Vertraege,
|
||||||
WANN rag_search AUFRUFEN — IMMER bei diesen Fragen:
|
WANN rag_search AUFRUFEN — IMMER bei diesen Fragen:
|
||||||
- "habe ich..." / "gibt es..." / "wo ist..." / "finde..." / "zeig mir..." + Dokument/Vertrag/Versicherung/Bescheid
|
- "habe ich..." / "gibt es..." / "wo ist..." / "finde..." / "zeig mir..." + Dokument/Vertrag/Versicherung/Bescheid
|
||||||
- Jede Frage nach persoenlichen Unterlagen, Vertraegen, Versicherungen, Rechnungen, Bescheiden
|
- Jede Frage nach persoenlichen Unterlagen, Vertraegen, Versicherungen, Rechnungen, Bescheiden
|
||||||
- Wohnungen, Immobilien, Grundstuecke, Mietobjekte, Auslands-Objekte (z.B. Kambodscha): immer rag_search — auch wenn das Gedaechtnis schon einen Wohnort nennt
|
|
||||||
- AUCH wenn du glaubst die Antwort zu kennen — das Gedaechtnis ist NICHT die Wissensbasis!
|
- AUCH wenn du glaubst die Antwort zu kennen — das Gedaechtnis ist NICHT die Wissensbasis!
|
||||||
- AUCH wenn das Thema im Gedaechtnis steht — trotzdem rag_search aufrufen fuer vollstaendige Antwort
|
- AUCH wenn das Thema im Gedaechtnis steht — trotzdem rag_search aufrufen fuer vollstaendige Antwort
|
||||||
|
|
||||||
|
|
@ -76,7 +80,6 @@ WANN NICHT: Nur bei reinen Homelab/IT-Fragen, Smalltalk, oder wenn der User expl
|
||||||
|
|
||||||
SUCHANFRAGE: Kurze Keywords, KEINE ganzen Saetze. Beispiele:
|
SUCHANFRAGE: Kurze Keywords, KEINE ganzen Saetze. Beispiele:
|
||||||
- "Familienbuch" / "Grundsteuer Erklaerung" / "Haftpflicht" / "Kindergeld" / "Mietvertrag" / "Arbeitsvertrag" / "Reisepass"
|
- "Familienbuch" / "Grundsteuer Erklaerung" / "Haftpflicht" / "Kindergeld" / "Mietvertrag" / "Arbeitsvertrag" / "Reisepass"
|
||||||
- "Wohnung Kambodscha" / "Immobilie" / "Condo"
|
|
||||||
|
|
||||||
ERGEBNISSE AUSWERTEN:
|
ERGEBNISSE AUSWERTEN:
|
||||||
- Bei breiten Fragen ("welche Versicherungen", Jahreskosten, Listen): top_k=15-25, ALLE Treffer aus der Tool-Antwort abarbeiten
|
- Bei breiten Fragen ("welche Versicherungen", Jahreskosten, Listen): top_k=15-25, ALLE Treffer aus der Tool-Antwort abarbeiten
|
||||||
|
|
@ -176,7 +179,6 @@ def _es_hybrid_search(query: str, es_size: int) -> dict:
|
||||||
{"match": {"content_de": {"query": query, "boost": 2.0}}},
|
{"match": {"content_de": {"query": query, "boost": 2.0}}},
|
||||||
{"match": {"content_ltks": {"query": query.lower(), "boost": 0.4}}},
|
{"match": {"content_ltks": {"query": query.lower(), "boost": 0.4}}},
|
||||||
{"match": {"docnm_kwd": {"query": query, "boost": 3.0}}},
|
{"match": {"docnm_kwd": {"query": query, "boost": 3.0}}},
|
||||||
{"match": {"docnm_search": {"query": query, "boost": 5.0}}},
|
|
||||||
],
|
],
|
||||||
"minimum_should_match": 0,
|
"minimum_should_match": 0,
|
||||||
}
|
}
|
||||||
|
|
@ -204,6 +206,55 @@ def _es_hybrid_search(query: str, es_size: int) -> dict:
|
||||||
return {"_error": str(e)}
|
return {"_error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def _snippet_for_rerank(src: dict) -> str:
|
||||||
|
raw = src.get("content_with_weight") or src.get("content_de") or ""
|
||||||
|
return raw[:4000]
|
||||||
|
|
||||||
|
|
||||||
|
def _rerank_hits(query: str, hits: list) -> tuple[list, bool]:
|
||||||
|
"""Sortiert die ersten RERANK_CANDIDATES Treffer per Cross-Encoder neu."""
|
||||||
|
if not hits or not RERANKER_URL:
|
||||||
|
return hits, False
|
||||||
|
to_score = hits[:RERANK_CANDIDATES]
|
||||||
|
docs = []
|
||||||
|
for h in to_score:
|
||||||
|
src = h.get("_source") or {}
|
||||||
|
docs.append(_snippet_for_rerank(src))
|
||||||
|
if not any((d or "").strip() for d in docs):
|
||||||
|
return hits, False
|
||||||
|
body = json.dumps({"query": query, "documents": docs}).encode()
|
||||||
|
url = f"{RERANKER_URL.rstrip('/')}/rerank"
|
||||||
|
req = urllib.request.Request(
|
||||||
|
url,
|
||||||
|
data=body,
|
||||||
|
method="POST",
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(req, timeout=RERANK_TIMEOUT) as resp:
|
||||||
|
data = json.load(resp)
|
||||||
|
scores = data.get("scores") or []
|
||||||
|
if len(scores) != len(to_score):
|
||||||
|
log.warning(
|
||||||
|
"rerank score count mismatch: %s vs %s",
|
||||||
|
len(scores),
|
||||||
|
len(to_score),
|
||||||
|
)
|
||||||
|
return hits, False
|
||||||
|
indexed = list(zip(scores, range(len(to_score))))
|
||||||
|
indexed.sort(key=lambda x: x[0], reverse=True)
|
||||||
|
new_order: list = []
|
||||||
|
for sc, idx in indexed:
|
||||||
|
h = to_score[idx]
|
||||||
|
h["_rerank_score"] = float(sc)
|
||||||
|
new_order.append(h)
|
||||||
|
rest = hits[RERANK_CANDIDATES:]
|
||||||
|
return new_order + rest, True
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("rerank failed: %s", e)
|
||||||
|
return hits, False
|
||||||
|
|
||||||
|
|
||||||
def _is_wide_recall_query(q: str) -> bool:
|
def _is_wide_recall_query(q: str) -> bool:
|
||||||
"""Übersichts-/Listen-/Kostenfragen: mehrfach suchen und mergen."""
|
"""Übersichts-/Listen-/Kostenfragen: mehrfach suchen und mergen."""
|
||||||
ql = (q or "").lower()
|
ql = (q or "").lower()
|
||||||
|
|
@ -261,18 +312,7 @@ def _is_wide_recall_query(q: str) -> bool:
|
||||||
"jaehrlich",
|
"jaehrlich",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if costish and broad:
|
return costish and broad
|
||||||
return True
|
|
||||||
|
|
||||||
# Immobilien / Wohnungen / Kambodscha
|
|
||||||
if any(x in ql for x in ("wohnung", "immobilie", "condo", "apartment", "grundstück", "grundstueck")):
|
|
||||||
if any(x in ql for x in ("welche", "alle", "liste", "habe ich", "übersicht", "uebersicht", "wie viele")):
|
|
||||||
return True
|
|
||||||
if any(x in ql for x in ("kambodscha", "cambodia", "takeo", "phnom", "sihanouk")):
|
|
||||||
if any(x in ql for x in ("welche", "alle", "wohnung", "immobilie", "haus", "condo", "apartment", "mietvertrag")):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# Zusatzanfragen decken Sparten + Gesellschaften ab (Recall)
|
# Zusatzanfragen decken Sparten + Gesellschaften ab (Recall)
|
||||||
|
|
@ -296,23 +336,6 @@ _WIDE_SUBQUERIES = [
|
||||||
"Kfz Versicherungsschein Beitrag jährlich",
|
"Kfz Versicherungsschein Beitrag jährlich",
|
||||||
]
|
]
|
||||||
|
|
||||||
_WIDE_SUBQUERIES_IMMOBILIEN = [
|
|
||||||
"Arakawa Wohnung Mietvertrag",
|
|
||||||
"Arakawa Wohnung D1603",
|
|
||||||
"Arakawa Wohnung G2010",
|
|
||||||
"Arakawa Wohnung-2",
|
|
||||||
"Kambodscha Arakawa Kaufvertrag",
|
|
||||||
"Kambodscha Arakawa Vollmacht",
|
|
||||||
"Kambodscha Arakawa Überweisung",
|
|
||||||
"Wohnung Mietvertrag Kambodscha",
|
|
||||||
"Condo Apartment Cambodia",
|
|
||||||
"Hard Title Wohnung",
|
|
||||||
"Wohnungen Kurtzübersicht",
|
|
||||||
"Mietvertrag Ramirez Antonio",
|
|
||||||
"Mietvertrag Cheng Qiu",
|
|
||||||
"Kambodscha Rechnungen Strom Miete",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_hits_from_queries(
|
def _merge_hits_from_queries(
|
||||||
queries: list[str],
|
queries: list[str],
|
||||||
|
|
@ -364,16 +387,9 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
||||||
es_size = min(ES_SIZE_CAP, max(top_k * 10, 70))
|
es_size = min(ES_SIZE_CAP, max(top_k * 10, 70))
|
||||||
|
|
||||||
if wide:
|
if wide:
|
||||||
ql = qstrip.lower()
|
|
||||||
_immo_wide = any(
|
|
||||||
x in ql
|
|
||||||
for x in ("wohnung", "immobilie", "condo", "apartment", "grundstück", "grundstueck",
|
|
||||||
"kambodscha", "cambodia", "arakawa")
|
|
||||||
)
|
|
||||||
_sq_pool = _WIDE_SUBQUERIES_IMMOBILIEN if _immo_wide else _WIDE_SUBQUERIES
|
|
||||||
subqs = [qstrip]
|
subqs = [qstrip]
|
||||||
for sq in _sq_pool:
|
for sq in _WIDE_SUBQUERIES:
|
||||||
if sq.lower() not in ql:
|
if sq.lower() not in qstrip.lower():
|
||||||
subqs.append(sq)
|
subqs.append(sq)
|
||||||
pool_cap = max(top_k * 5, 120)
|
pool_cap = max(top_k * 5, 120)
|
||||||
hits, err = _merge_hits_from_queries(
|
hits, err = _merge_hits_from_queries(
|
||||||
|
|
@ -400,6 +416,8 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
||||||
if not hits:
|
if not hits:
|
||||||
return f"Keine Ergebnisse fuer '{qstrip}' in der Wissensbasis gefunden."
|
return f"Keine Ergebnisse fuer '{qstrip}' in der Wissensbasis gefunden."
|
||||||
|
|
||||||
|
hits, reranked = _rerank_hits(qstrip, hits)
|
||||||
|
|
||||||
seen_docs: set[str] = set()
|
seen_docs: set[str] = set()
|
||||||
lines: list[str] = []
|
lines: list[str] = []
|
||||||
count = 0
|
count = 0
|
||||||
|
|
@ -417,7 +435,12 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
||||||
continue
|
continue
|
||||||
seen_docs.add(dk)
|
seen_docs.add(dk)
|
||||||
|
|
||||||
score = h.get("_score") or 0.0
|
if "_rerank_score" in h:
|
||||||
|
score = float(h["_rerank_score"])
|
||||||
|
score_label = "Rerank"
|
||||||
|
else:
|
||||||
|
score = float(h.get("_score") or 0.0)
|
||||||
|
score_label = "ES"
|
||||||
raw = src.get("content_with_weight") or src.get("content_de") or ""
|
raw = src.get("content_with_weight") or src.get("content_de") or ""
|
||||||
content = raw[:snip_len].strip()
|
content = raw[:snip_len].strip()
|
||||||
ocr = _ocr_note(raw)
|
ocr = _ocr_note(raw)
|
||||||
|
|
@ -425,7 +448,9 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
||||||
filename = doc_name.rsplit("__", 1)[-1] if "__" in doc_name else doc_name
|
filename = doc_name.rsplit("__", 1)[-1] if "__" in doc_name else doc_name
|
||||||
folder_line = f" Ordner: {folder}" if folder else ""
|
folder_line = f" Ordner: {folder}" if folder else ""
|
||||||
|
|
||||||
lines.append(f"---\n**{count + 1}. {filename}** (Score: {score:.1f}){ocr}")
|
lines.append(
|
||||||
|
f"---\n**{count + 1}. {filename}** ({score_label}: {score:.3f}){ocr}"
|
||||||
|
)
|
||||||
if folder_line:
|
if folder_line:
|
||||||
lines.append(folder_line)
|
lines.append(folder_line)
|
||||||
if content:
|
if content:
|
||||||
|
|
@ -435,7 +460,10 @@ def handle_rag_search(query: str, top_k: int = 8, **kw):
|
||||||
if count == 0:
|
if count == 0:
|
||||||
return f"Keine Dokumente fuer '{qstrip}' gefunden."
|
return f"Keine Dokumente fuer '{qstrip}' gefunden."
|
||||||
|
|
||||||
lines.insert(0, header)
|
hdr = header.rstrip() + (
|
||||||
|
" _(Cross-Encoder reranked)_" if reranked else ""
|
||||||
|
) + "\n"
|
||||||
|
lines.insert(0, hdr)
|
||||||
tail = (
|
tail = (
|
||||||
"\n---\n(Ende der Ergebnisse. Nur diese Dokumente in dieser Runde. "
|
"\n---\n(Ende der Ergebnisse. Nur diese Dokumente in dieser Runde. "
|
||||||
+ (
|
+ (
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue