feat: KI-Plausibilitätsprüfung für jeden Preis
- Jeder Preis wird nach dem Scan von der KI einzeln geprüft - plausibel/verdächtig/ungeprüft Status in DB (prices.plausibel) - Fallback auf Regelwerk wenn KI nicht erreichbar - Dashboard: Farbcodierung (grün=PE bestätigt, rot=verdächtig) - Nur plausible Preise in Stats/Vergleich/KI-Auswertung - Scraper: Fallback ohne Gepäck komplett entfernt - Scraper: Mindestpreis 700€ für PE (filtert Economy raus) - Scraper: Kabinen-Verifikation auf jeder Seite Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
28235c9eda
commit
4dde05ff01
5 changed files with 233 additions and 74 deletions
|
|
@ -90,6 +90,8 @@ def init_db():
|
||||||
for col_sql in [
|
for col_sql in [
|
||||||
"ALTER TABLE prices ADD COLUMN booking_url TEXT",
|
"ALTER TABLE prices ADD COLUMN booking_url TEXT",
|
||||||
"ALTER TABLE prices ADD COLUMN screenshot_id INTEGER",
|
"ALTER TABLE prices ADD COLUMN screenshot_id INTEGER",
|
||||||
|
"ALTER TABLE prices ADD COLUMN plausibel INTEGER",
|
||||||
|
"ALTER TABLE prices ADD COLUMN plausi_grund TEXT DEFAULT ''",
|
||||||
]:
|
]:
|
||||||
try:
|
try:
|
||||||
c.execute(col_sql)
|
c.execute(col_sql)
|
||||||
|
|
|
||||||
133
hub/src/ki.py
133
hub/src/ki.py
|
|
@ -10,6 +10,138 @@ client = OpenAI(
|
||||||
|
|
||||||
MODEL = os.environ.get("AI_MODEL", "openai/gpt-4o-mini")
|
MODEL = os.environ.get("AI_MODEL", "openai/gpt-4o-mini")
|
||||||
|
|
||||||
|
PLAUSI_PROMPT = """Du bist ein Flugpreis-Experte. Pruefe jeden der folgenden Preise auf Plausibilitaet.
|
||||||
|
|
||||||
|
KONTEXT:
|
||||||
|
- Strecke: Roundtrip Frankfurt (FRA) → Phnom Penh Techo (KTI), ca. 2 Monate Aufenthalt
|
||||||
|
- Kabinenklasse: PREMIUM ECONOMY (nicht Economy!)
|
||||||
|
- Gepaeck: 1 grosser Koffer + Handgepaeck MUSS inklusive sein
|
||||||
|
- Bevorzugte Airlines: China Southern (CZ), Cathay Pacific (CX), Singapore Airlines (SQ), Thai Airways (TG), Vietnam Airlines (VN)
|
||||||
|
|
||||||
|
PREISREFERENZ fuer Premium Economy Roundtrip FRA-KTI mit Gepaeck:
|
||||||
|
- Sehr guenstig: 900-1200 EUR (seltene Deals, plausibel wenn bekannte Airline)
|
||||||
|
- Normal: 1200-1800 EUR
|
||||||
|
- Teuer: 1800-2500 EUR
|
||||||
|
- Ueber 2500 EUR: zu teuer oder Business Class
|
||||||
|
- UNTER 700 EUR: fast sicher ECONOMY, nicht Premium Economy!
|
||||||
|
- 700-900 EUR: sehr verdaechtig, wahrscheinlich Economy oder ohne Gepaeck
|
||||||
|
|
||||||
|
PRUEFREGELN:
|
||||||
|
1. Preis unter 700 EUR → NICHT PLAUSIBEL (Economy ohne Gepaeck)
|
||||||
|
2. Preis 700-900 EUR → VERDAECHTIG (pruefen ob Economy oder ohne Gepaeck)
|
||||||
|
3. Preis 900-2500 EUR mit bekannter Airline → PLAUSIBEL
|
||||||
|
4. Preis ueber 2500 EUR → VERDAECHTIG (eventuell Business Class)
|
||||||
|
5. Scanner "kayak_multicity" (HKG Stopover): Preise 100-200 EUR hoeher als Direkt ist normal
|
||||||
|
6. Wenn ein Scanner deutlich guenstigere Preise zeigt als alle anderen: VERDAECHTIG
|
||||||
|
|
||||||
|
PREISE ZU PRUEFEN:
|
||||||
|
{preise_liste}
|
||||||
|
|
||||||
|
Antworte NUR mit gueltigem JSON-Array. Fuer jeden Preis:
|
||||||
|
{{"id": <price_id>, "plausibel": true/false, "grund": "<kurze Begruendung auf Deutsch>"}}
|
||||||
|
|
||||||
|
Beispiel:
|
||||||
|
[
|
||||||
|
{{"id": 123, "plausibel": true, "grund": "1350 EUR fuer CX PE Roundtrip ist marktgerecht"}},
|
||||||
|
{{"id": 124, "plausibel": false, "grund": "436 EUR ist Economy-Preis, nicht PE mit Gepaeck"}}
|
||||||
|
]"""
|
||||||
|
|
||||||
|
|
||||||
|
def plausibilitaetspruefung(von="FRA", nach="KTI"):
    """Check today's not-yet-reviewed prices for plausibility via the AI model.

    Selects all rows in ``prices`` for the given route that were scraped
    today and whose ``plausibel`` column is still NULL, sends them to the
    model in one prompt and stores the verdict (``plausibel`` = 1/0 plus a
    short reason in ``plausi_grund``) for each row.

    Safety nets:
      * Only IDs that were part of the queried batch are updated, so a
        hallucinated or mistyped ID in the model answer cannot overwrite an
        unrelated row.
      * Prices the model did not mention are judged by the rule-based
        fallback instead of silently staying unchecked.
      * Any failure while talking to the model or parsing its answer falls
        back to the rule-based check for the whole batch.

    Args:
        von: Origin airport code used to filter the ``prices`` table.
        nach: Destination airport code used to filter the ``prices`` table.

    Returns:
        None. Results are written to the database and summarised via ``log``.
    """
    log("KI-Plausibilitätsprüfung gestartet")
    conn = get_conn()

    # Everything after get_conn() sits inside try/finally so the connection
    # is released even if the query or the prompt formatting fails.
    try:
        ungepruefte = conn.execute("""
            SELECT id, scanner, node, preis, airline, abflug
            FROM prices
            WHERE von=? AND nach=?
              AND plausibel IS NULL
              AND date(scraped_at) = date('now')
            ORDER BY preis ASC
        """, (von, nach)).fetchall()

        if not ungepruefte:
            log("Keine ungeprüften Preise — Plausibilitätsprüfung übersprungen")
            return

        try:
            preise_liste = "\n".join(
                f" ID {p['id']}: {p['preis']:.0f} EUR — Scanner: {p['scanner']} — "
                f"Node: {p['node']} — Airline: {p['airline'] or 'k.A.'} — Abflug: {p['abflug']}"
                for p in ungepruefte
            )
            prompt = PLAUSI_PROMPT.format(preise_liste=preise_liste)

            response = client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2000,
                temperature=0.1,
            )
            # content may be None (e.g. refusal); treat it as an empty answer
            # so it surfaces as a JSON error below.
            antwort = (response.choices[0].message.content or "").strip()

            # Extract JSON from the answer (the model sometimes wraps it in
            # a Markdown code fence).
            if "```" in antwort:
                antwort = antwort.split("```")[1]
                if antwort.startswith("json"):
                    antwort = antwort[4:]

            ergebnisse = json.loads(antwort)
            if not isinstance(ergebnisse, list):
                raise ValueError("KI-Antwort ist kein JSON-Array")

            # IDs still waiting for a verdict; shrinks as answers are applied.
            offene_ids = {p["id"] for p in ungepruefte}
            plausibel_count = 0
            verdaechtig_count = 0

            for eintrag in ergebnisse:
                if not isinstance(eintrag, dict):
                    continue
                try:
                    pid = int(eintrag.get("id"))
                except (TypeError, ValueError):
                    continue
                # Ignore unknown IDs and duplicate verdicts for the same ID.
                if pid not in offene_ids:
                    continue
                offene_ids.discard(pid)

                ist_plausibel = 1 if eintrag.get("plausibel") else 0
                grund = str(eintrag.get("grund") or "")[:200]

                conn.execute(
                    "UPDATE prices SET plausibel=?, plausi_grund=? WHERE id=?",
                    (ist_plausibel, grund, pid)
                )
                if ist_plausibel:
                    plausibel_count += 1
                else:
                    verdaechtig_count += 1

            # Prices the model skipped must not stay NULL: judge them with
            # the rule set instead.
            fehlende = [p for p in ungepruefte if p["id"] in offene_ids]
            if fehlende:
                log(f"KI-Antwort unvollständig: {len(fehlende)} Preise ohne Bewertung")
                _regelbasierte_plausi(conn, fehlende)

            conn.commit()
            log(f"Plausibilitätsprüfung: {plausibel_count} plausibel, "
                f"{verdaechtig_count} verdächtig von {len(ungepruefte)} Preisen")

        except json.JSONDecodeError as e:
            log(f"KI-Plausi JSON-Fehler: {e} — Antwort: {antwort[:200]}", "ERROR")
            # Fallback: mark rule-based
            _regelbasierte_plausi(conn, ungepruefte)
        except Exception as e:
            log(f"KI-Plausi Fehler: {e}", "ERROR")
            _regelbasierte_plausi(conn, ungepruefte)
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _regelbasierte_plausi(conn, preise):
    """Rule-based fallback used when the AI check is unavailable.

    Classifies each price purely by its amount and stores the verdict:
    below 700, below 900 and above 3000 are flagged as not plausible
    (``plausibel=0``); everything else is accepted (``plausibel=1``).
    All updates are committed once at the end.

    Args:
        conn: Open database connection providing ``execute`` and ``commit``.
        preise: Iterable of rows supporting ``row["preis"]`` and ``row["id"]``.
    """
    log("Regelbasierte Plausibilitätsprüfung als Fallback")

    sql_verdaechtig = "UPDATE prices SET plausibel=0, plausi_grund=? WHERE id=?"
    sql_plausibel = "UPDATE prices SET plausibel=1, plausi_grund=? WHERE id=?"

    for zeile in preise:
        betrag = zeile["preis"]

        # Pick statement and reason first, then issue a single execute call.
        if betrag < 700:
            sql, grund = sql_verdaechtig, "Preis unter 700€ — sehr wahrscheinlich Economy"
        elif betrag < 900:
            sql, grund = sql_verdaechtig, "Preis 700-900€ — verdächtig, wahrscheinlich Economy oder ohne Gepäck"
        elif betrag > 3000:
            sql, grund = sql_verdaechtig, "Preis über 3000€ — möglicherweise Business Class"
        else:
            sql, grund = sql_plausibel, "Preis im erwarteten PE-Bereich"

        conn.execute(sql, (grund, zeile["id"]))

    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def get_prompt():
|
def get_prompt():
|
||||||
conn = get_conn()
|
conn = get_conn()
|
||||||
|
|
@ -29,6 +161,7 @@ def auswerten(von="FRA", nach="KTI"):
|
||||||
FROM prices
|
FROM prices
|
||||||
WHERE von=? AND nach=?
|
WHERE von=? AND nach=?
|
||||||
AND date(scraped_at) = date('now')
|
AND date(scraped_at) = date('now')
|
||||||
|
AND (plausibel = 1 OR plausibel IS NULL)
|
||||||
ORDER BY preis ASC
|
ORDER BY preis ASC
|
||||||
""", (von, nach)).fetchall()
|
""", (von, nach)).fetchall()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ import requests
|
||||||
import schedule
|
import schedule
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from db import init_db, get_conn, log
|
from db import init_db, get_conn, log
|
||||||
from ki import auswerten
|
from ki import auswerten, plausibilitaetspruefung
|
||||||
|
|
||||||
# Verhindert dass zwei Läufe gleichzeitig laufen
|
# Verhindert dass zwei Läufe gleichzeitig laufen
|
||||||
_scan_lock = threading.Lock()
|
_scan_lock = threading.Lock()
|
||||||
|
|
@ -184,10 +184,15 @@ def scraping_lauf(label="Standard", flex_tage_liste=None):
|
||||||
log(f"Scraping [{label}] fertig — {online}/{len(nodes)} Nodes | "
|
log(f"Scraping [{label}] fertig — {online}/{len(nodes)} Nodes | "
|
||||||
f"{fehler} Fehler | {dauer}s Laufzeit")
|
f"{fehler} Fehler | {dauer}s Laufzeit")
|
||||||
|
|
||||||
|
try:
|
||||||
|
plausibilitaetspruefung()
|
||||||
|
except Exception as e:
|
||||||
|
log(f"KI-Plausi-Fehler: {e}", "ERROR")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
auswerten()
|
auswerten()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log(f"KI-Fehler: {e}", "ERROR")
|
log(f"KI-Auswertung-Fehler: {e}", "ERROR")
|
||||||
|
|
||||||
log(f"=== Lauf [{label}] beendet ===")
|
log(f"=== Lauf [{label}] beendet ===")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -89,8 +89,8 @@ OVERVIEW_HTML = BASE_HTML.replace("{% block content %}{% endblock %}", """
|
||||||
<div class="grid-3" style="margin-bottom:1.5rem">
|
<div class="grid-3" style="margin-bottom:1.5rem">
|
||||||
<div class="card stat-box">
|
<div class="card stat-box">
|
||||||
<div class="value" id="min-preis">—</div>
|
<div class="value" id="min-preis">—</div>
|
||||||
<div class="label">Günstigster Preis heute (EUR)</div>
|
<div class="label">Günstigster PE-Preis heute (EUR)</div>
|
||||||
<div id="min-preis-warnung" style="display:none;margin-top:0.4rem;font-size:0.75rem;color:#fbbf24">⚠ unter 1.000 € — bitte prüfen</div>
|
<div id="min-preis-warnung" style="display:none;margin-top:0.4rem;font-size:0.75rem;color:#34d399">✓ KI-geprüft: nur plausible PE-Preise</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="card stat-box">
|
<div class="card stat-box">
|
||||||
<div class="value" id="avg-preis">—</div>
|
<div class="value" id="avg-preis">—</div>
|
||||||
|
|
@ -199,10 +199,8 @@ async function ladeUebersicht() {
|
||||||
|
|
||||||
const minHeute = stats.min_heute;
|
const minHeute = stats.min_heute;
|
||||||
document.getElementById('min-preis').textContent = minHeute ? Math.round(minHeute) : '—';
|
document.getElementById('min-preis').textContent = minHeute ? Math.round(minHeute) : '—';
|
||||||
document.getElementById('min-preis').style.color = (minHeute && minHeute < PLAUSI_GRENZE) ? '#fbbf24' : '#38bdf8';
|
document.getElementById('min-preis').style.color = '#38bdf8';
|
||||||
if (minHeute && minHeute < PLAUSI_GRENZE) {
|
document.getElementById('min-preis-warnung').style.display = minHeute ? 'block' : 'none';
|
||||||
document.getElementById('min-preis-warnung').style.display = 'block';
|
|
||||||
}
|
|
||||||
document.getElementById('avg-preis').textContent = stats.avg_30d ? Math.round(stats.avg_30d) : '—';
|
document.getElementById('avg-preis').textContent = stats.avg_30d ? Math.round(stats.avg_30d) : '—';
|
||||||
document.getElementById('node-count').textContent = nodes.filter(n=>n.status==='online').length;
|
document.getElementById('node-count').textContent = nodes.filter(n=>n.status==='online').length;
|
||||||
|
|
||||||
|
|
@ -261,26 +259,38 @@ async function ladeUebersicht() {
|
||||||
const HOTEL_HKG = 150; // geschätzte Hotel-Kosten HKG in EUR
|
const HOTEL_HKG = 150; // geschätzte Hotel-Kosten HKG in EUR
|
||||||
tbody.innerHTML = preise.map(p => {
|
tbody.innerHTML = preise.map(p => {
|
||||||
const isMulticity = p.scanner === 'kayak_multicity';
|
const isMulticity = p.scanner === 'kayak_multicity';
|
||||||
const warn = p.preis < PLAUSI_GRENZE;
|
// KI-Plausibilitätsstatus: 1=plausibel, 0=verdächtig, -1/null=ungeprüft
|
||||||
const plausi = warn
|
const ps = p.plausi_status !== undefined ? p.plausi_status : (p.plausibel !== undefined ? p.plausibel : -1);
|
||||||
? '<span style="background:#451a03;color:#fbbf24;padding:0.15rem 0.5rem;border-radius:4px;font-size:0.75rem">⚠ bitte prüfen</span>'
|
const grund = p.plausi_info || p.plausi_grund || '';
|
||||||
: '<span style="background:#064e3b;color:#34d399;padding:0.15rem 0.5rem;border-radius:4px;font-size:0.75rem">✓ plausibel</span>';
|
let plausi;
|
||||||
|
if (ps === 1) {
|
||||||
|
plausi = `<span title="${grund}" style="background:#064e3b;color:#34d399;padding:0.15rem 0.5rem;border-radius:4px;font-size:0.75rem;cursor:help">✓ PE bestätigt</span>`;
|
||||||
|
} else if (ps === 0) {
|
||||||
|
plausi = `<span title="${grund}" style="background:#7f1d1d;color:#fca5a5;padding:0.15rem 0.5rem;border-radius:4px;font-size:0.75rem;cursor:help">✗ ${grund.substring(0,40) || 'verdächtig'}</span>`;
|
||||||
|
} else {
|
||||||
|
plausi = '<span style="background:#451a03;color:#fbbf24;padding:0.15rem 0.5rem;border-radius:4px;font-size:0.75rem">⏳ prüfe...</span>';
|
||||||
|
}
|
||||||
const buchBtn = p.booking_url
|
const buchBtn = p.booking_url
|
||||||
? `<a href="${p.booking_url}" target="_blank" class="btn btn-sm" style="text-decoration:none">Öffnen ↗</a>`
|
? `<a href="${p.booking_url}" target="_blank" class="btn btn-sm" style="text-decoration:none">Öffnen ↗</a>`
|
||||||
: '—';
|
: '—';
|
||||||
const scannerLabel = isMulticity
|
const scannerLabel = isMulticity
|
||||||
? `<strong style="color:#818cf8">🇭🇰 HKG Stopover</strong><br><span style="font-size:0.72rem;color:#64748b">+~${HOTEL_HKG}€ Hotel</span>`
|
? `<strong style="color:#818cf8">🇭🇰 HKG Stopover</strong><br><span style="font-size:0.72rem;color:#64748b">+~${HOTEL_HKG}€ Hotel</span>`
|
||||||
: p.scanner;
|
: p.scanner;
|
||||||
|
const verdaechtig = (ps === 0);
|
||||||
|
const preisFarbe = verdaechtig ? '#ef4444' : (isMulticity ? '#a78bfa' : '#34d399');
|
||||||
const gesamtHtml = isMulticity
|
const gesamtHtml = isMulticity
|
||||||
? `<strong style="color:${warn?'#fbbf24':'#a78bfa'}">${p.preis} €</strong><br><span style="font-size:0.75rem;color:#64748b">∑ ~${Math.round(p.preis)+HOTEL_HKG} € inkl. Hotel</span>`
|
? `<strong style="color:${preisFarbe}">${p.preis} €</strong><br><span style="font-size:0.75rem;color:#64748b">∑ ~${Math.round(p.preis)+HOTEL_HKG} € inkl. Hotel</span>`
|
||||||
: `<strong style="color:${warn?'#fbbf24':'#34d399'}">${p.preis} €</strong>`;
|
: `<strong style="color:${preisFarbe}">${p.preis} €</strong>`;
|
||||||
const ssBtn = p.screenshot_id
|
const ssBtn = p.screenshot_id
|
||||||
? `<button onclick="zeigeScreenshot(${p.screenshot_id},'${p.scanner} · ${p.node} · ${p.abflug||''}')"
|
? `<button onclick="zeigeScreenshot(${p.screenshot_id},'${p.scanner} · ${p.node} · ${p.abflug||''}')"
|
||||||
style="background:#1e3a5f;border:1px solid #2563eb;color:#93c5fd;padding:0.2rem 0.5rem;border-radius:5px;cursor:pointer;font-size:0.8rem">
|
style="background:#1e3a5f;border:1px solid #2563eb;color:#93c5fd;padding:0.2rem 0.5rem;border-radius:5px;cursor:pointer;font-size:0.8rem">
|
||||||
📷
|
📷
|
||||||
</button>`
|
</button>`
|
||||||
: '<span style="color:#334155;font-size:0.75rem">—</span>';
|
: '<span style="color:#334155;font-size:0.75rem">—</span>';
|
||||||
return `<tr${isMulticity?' style="background:rgba(99,102,241,0.06);border-left:3px solid #6366f1"':''}>
|
const rowStyle = verdaechtig
|
||||||
|
? ' style="background:rgba(239,68,68,0.08);border-left:3px solid #ef4444;opacity:0.7"'
|
||||||
|
: (isMulticity ? ' style="background:rgba(99,102,241,0.06);border-left:3px solid #6366f1"' : '');
|
||||||
|
return `<tr${rowStyle}>
|
||||||
<td>${scannerLabel}</td>
|
<td>${scannerLabel}</td>
|
||||||
<td style="font-size:0.8rem;color:#64748b">${p.node}</td>
|
<td style="font-size:0.8rem;color:#64748b">${p.node}</td>
|
||||||
<td>${gesamtHtml}</td>
|
<td>${gesamtHtml}</td>
|
||||||
|
|
@ -290,7 +300,7 @@ async function ladeUebersicht() {
|
||||||
<td>${buchBtn}</td>
|
<td>${buchBtn}</td>
|
||||||
<td>${ssBtn}</td>
|
<td>${ssBtn}</td>
|
||||||
</tr>`;
|
</tr>`;
|
||||||
}).join('') || '<tr><td colspan="7" style="color:#475569;text-align:center">Noch keine Daten heute</td></tr>';
|
}).join('') || '<tr><td colspan="8" style="color:#475569;text-align:center">Noch keine Daten heute</td></tr>';
|
||||||
|
|
||||||
const ntbody = document.getElementById('nodes-tbody');
|
const ntbody = document.getElementById('nodes-tbody');
|
||||||
ntbody.innerHTML = nodes.map(n => `
|
ntbody.innerHTML = nodes.map(n => `
|
||||||
|
|
@ -369,10 +379,10 @@ ladeUebersicht();
|
||||||
def api_stats():
|
def api_stats():
|
||||||
conn = get_conn()
|
conn = get_conn()
|
||||||
min_heute = conn.execute(
|
min_heute = conn.execute(
|
||||||
"SELECT MIN(preis) as v FROM prices WHERE date(scraped_at)=date('now')"
|
"SELECT MIN(preis) as v FROM prices WHERE date(scraped_at)=date('now') AND (plausibel=1 OR plausibel IS NULL)"
|
||||||
).fetchone()["v"]
|
).fetchone()["v"]
|
||||||
avg_30d = conn.execute(
|
avg_30d = conn.execute(
|
||||||
"SELECT AVG(preis) as v FROM prices WHERE scraped_at >= datetime('now','-30 days')"
|
"SELECT AVG(preis) as v FROM prices WHERE scraped_at >= datetime('now','-30 days') AND (plausibel=1 OR plausibel IS NULL)"
|
||||||
).fetchone()["v"]
|
).fetchone()["v"]
|
||||||
conn.close()
|
conn.close()
|
||||||
return jsonify({"min_heute": min_heute, "avg_30d": avg_30d})
|
return jsonify({"min_heute": min_heute, "avg_30d": avg_30d})
|
||||||
|
|
@ -399,17 +409,20 @@ def api_preise_heute():
|
||||||
# Neuester Scan-Lauf: ab MAX(scraped_at) - 20 Minuten
|
# Neuester Scan-Lauf: ab MAX(scraped_at) - 20 Minuten
|
||||||
# Damit werden immer die Preise des letzten Laufs gezeigt — alle mit Screenshot
|
# Damit werden immer die Preise des letzten Laufs gezeigt — alle mit Screenshot
|
||||||
rows = conn.execute("""
|
rows = conn.execute("""
|
||||||
SELECT * FROM prices
|
SELECT *, COALESCE(plausibel, -1) as plausi_status,
|
||||||
|
COALESCE(plausi_grund, '') as plausi_info
|
||||||
|
FROM prices
|
||||||
WHERE scraped_at >= datetime(
|
WHERE scraped_at >= datetime(
|
||||||
(SELECT MAX(scraped_at) FROM prices WHERE date(scraped_at) = date('now')),
|
(SELECT MAX(scraped_at) FROM prices WHERE date(scraped_at) = date('now')),
|
||||||
'-20 minutes'
|
'-20 minutes'
|
||||||
)
|
)
|
||||||
ORDER BY preis ASC LIMIT 100
|
ORDER BY preis ASC LIMIT 100
|
||||||
""").fetchall()
|
""").fetchall()
|
||||||
# Fallback: ganzer Tag (z.B. erster Lauf des Tages noch nicht abgeschlossen)
|
|
||||||
if not rows:
|
if not rows:
|
||||||
rows = conn.execute("""
|
rows = conn.execute("""
|
||||||
SELECT * FROM prices
|
SELECT *, COALESCE(plausibel, -1) as plausi_status,
|
||||||
|
COALESCE(plausi_grund, '') as plausi_info
|
||||||
|
FROM prices
|
||||||
WHERE date(scraped_at) = date('now')
|
WHERE date(scraped_at) = date('now')
|
||||||
ORDER BY preis ASC LIMIT 100
|
ORDER BY preis ASC LIMIT 100
|
||||||
""").fetchall()
|
""").fetchall()
|
||||||
|
|
@ -422,7 +435,7 @@ def api_preise_vergleich():
|
||||||
"""Pro Scanner: günstigster Preis je Node + Delta zum Vortag."""
|
"""Pro Scanner: günstigster Preis je Node + Delta zum Vortag."""
|
||||||
conn = get_conn()
|
conn = get_conn()
|
||||||
|
|
||||||
# Heute: günstigster Preis pro Scanner+Node — aus letztem Scan-Lauf
|
# Heute: günstigster PLAUSIBLER Preis pro Scanner+Node
|
||||||
heute = conn.execute("""
|
heute = conn.execute("""
|
||||||
SELECT scanner, node, MIN(preis) as preis, booking_url, abflug, ankunft
|
SELECT scanner, node, MIN(preis) as preis, booking_url, abflug, ankunft
|
||||||
FROM prices
|
FROM prices
|
||||||
|
|
@ -430,6 +443,7 @@ def api_preise_vergleich():
|
||||||
(SELECT MAX(scraped_at) FROM prices WHERE date(scraped_at) = date('now')),
|
(SELECT MAX(scraped_at) FROM prices WHERE date(scraped_at) = date('now')),
|
||||||
'-20 minutes'
|
'-20 minutes'
|
||||||
)
|
)
|
||||||
|
AND (plausibel = 1 OR plausibel IS NULL)
|
||||||
GROUP BY scanner, node
|
GROUP BY scanner, node
|
||||||
ORDER BY scanner, preis
|
ORDER BY scanner, preis
|
||||||
""").fetchall()
|
""").fetchall()
|
||||||
|
|
@ -438,6 +452,7 @@ def api_preise_vergleich():
|
||||||
SELECT scanner, node, MIN(preis) as preis, booking_url, abflug, ankunft
|
SELECT scanner, node, MIN(preis) as preis, booking_url, abflug, ankunft
|
||||||
FROM prices
|
FROM prices
|
||||||
WHERE date(scraped_at) = date('now')
|
WHERE date(scraped_at) = date('now')
|
||||||
|
AND (plausibel = 1 OR plausibel IS NULL)
|
||||||
GROUP BY scanner, node
|
GROUP BY scanner, node
|
||||||
ORDER BY scanner, preis
|
ORDER BY scanner, preis
|
||||||
""").fetchall()
|
""").fetchall()
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,11 @@ from seleniumbase import SB
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
# ── Qualitätsschwellen ────────────────────────────────────────────────────────
|
||||||
|
# PE Roundtrip FRA→KTI mit Gepäck: realistisch ab ~800€
|
||||||
|
MIN_PREIS_PE_ROUNDTRIP = 700
|
||||||
|
MAX_PREIS_PE_ROUNDTRIP = 12000
|
||||||
|
|
||||||
|
|
||||||
def _scrape_disabled(*args, **kwargs):
|
def _scrape_disabled(*args, **kwargs):
|
||||||
"""Deaktivierter Scanner — gibt leere Ergebnisse zurück."""
|
"""Deaktivierter Scanner — gibt leere Ergebnisse zurück."""
|
||||||
|
|
@ -9,6 +14,33 @@ def _scrape_disabled(*args, **kwargs):
|
||||||
return [], ""
|
return [], ""
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_results(results, scanner_name, kabine="premium_economy"):
    """Quality gate: drop prices outside the accepted Premium Economy range.

    For ``kabine == "premium_economy"`` only entries whose ``"preis"`` lies
    within ``MIN_PREIS_PE_ROUNDTRIP``..``MAX_PREIS_PE_ROUNDTRIP`` (inclusive)
    are kept, and the number of removed entries is printed. For any other
    cabin value the input is returned unchanged.

    Args:
        results: List of result dicts, each with a ``"preis"`` key.
        scanner_name: Label used in the printed QC message.
        kabine: Requested cabin class.

    Returns:
        The filtered list (a new list in the Premium Economy case).
    """
    if kabine != "premium_economy":
        return results

    kept = []
    dropped = 0
    for entry in results:
        if MIN_PREIS_PE_ROUNDTRIP <= entry["preis"] <= MAX_PREIS_PE_ROUNDTRIP:
            kept.append(entry)
        else:
            dropped += 1

    if dropped:
        print(f"[QC/{scanner_name}] {dropped} Preise außerhalb {MIN_PREIS_PE_ROUNDTRIP}-{MAX_PREIS_PE_ROUNDTRIP}€ entfernt (vermutlich Economy oder Fehler)")
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def _check_cabin_on_page(body, title, kabine="premium_economy"):
    """Heuristically check whether the page text contradicts the requested cabin.

    Only the title plus the first 3000 characters of the body are inspected,
    case-insensitively.

    Args:
        body: Visible page text; ``None`` is treated as empty.
        title: Page title; ``None`` is treated as empty.
        kabine: Requested cabin class. Only ``"premium_economy"`` is checked;
            every other value is accepted unconditionally.

    Returns:
        ``False`` only when the inspected text mentions "economy" but never
        "premium" (a warning is printed in that case). ``True`` when a
        Premium Economy keyword is found, and also when the text is
        inconclusive, i.e. mentions neither.
    """
    if kabine != "premium_economy":
        return True

    # None-safe: a missing title/body must not crash the scraper run.
    text = f"{title or ''} {(body or '')[:3000]}".lower()

    pe_keywords = (
        "premium economy", "premium eco", "premiumeconomy",
        "premium_economy", "kabine: premium", "cabin: premium",
        "prem eco", "w class",
    )
    if any(kw in text for kw in pe_keywords):
        return True

    if "economy" in text and "premium" not in text:
        print("[QC] WARNUNG: Seite zeigt 'Economy' OHNE 'Premium' — möglicherweise falsche Kabine!")
        return False

    return True
|
||||||
|
|
||||||
|
|
||||||
def scrape(scanner, von, nach, tage=30, aufenthalt_tage=60,
|
def scrape(scanner, von, nach, tage=30, aufenthalt_tage=60,
|
||||||
trip_type="roundtrip", kabine="premium_economy",
|
trip_type="roundtrip", kabine="premium_economy",
|
||||||
gepaeck="1koffer+handgepaeck", airline_filter="",
|
gepaeck="1koffer+handgepaeck", airline_filter="",
|
||||||
|
|
@ -476,9 +508,6 @@ def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
booking_url = _booking_url_kayak(von, nach, abflug, rueck, kc, bags,
|
booking_url = _booking_url_kayak(von, nach, abflug, rueck, kc, bags,
|
||||||
layover_min, layover_max, airline_filter,
|
layover_min, layover_max, airline_filter,
|
||||||
max_flugzeit_h, max_stops)
|
max_flugzeit_h, max_stops)
|
||||||
booking_url_raw = _booking_url_kayak(von, nach, abflug, rueck, kc, 0,
|
|
||||||
layover_min, layover_max, airline_filter,
|
|
||||||
max_flugzeit_h, max_stops)
|
|
||||||
airline_label = f" [{airline_filter}]" if airline_filter else ""
|
airline_label = f" [{airline_filter}]" if airline_filter else ""
|
||||||
print(f"[KY{airline_label}] URL: {booking_url}")
|
print(f"[KY{airline_label}] URL: {booking_url}")
|
||||||
|
|
||||||
|
|
@ -517,19 +546,12 @@ def scrape_kayak(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
r["airline"] = airline_filter or ""
|
r["airline"] = airline_filter or ""
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
# Kein Ergebnis mit Bags-Filter → Fallback ohne Filter
|
# Kabinen-Verifikation: prüfe ob "Premium Economy" in der Seite steht
|
||||||
if not results and bags > 0:
|
pe_confirmed = _check_cabin_on_page(body, title, "premium_economy")
|
||||||
print(f"[KY] Kein Ergebnis mit Filtern — Fallback ohne Bags-Filter")
|
if not pe_confirmed:
|
||||||
sb.open(booking_url_raw)
|
print(f"[KY{airline_label}] WARNUNG: Premium Economy nicht auf Seite bestätigt!")
|
||||||
sb.sleep(12)
|
|
||||||
body2 = sb.get_text("body")
|
|
||||||
for r in _preise_aus_body(body2, "kayak", abflug):
|
|
||||||
r["ankunft"] = rueck
|
|
||||||
r["booking_url"] = booking_url_raw
|
|
||||||
r["airline"] = airline_filter or ""
|
|
||||||
results.append(r)
|
|
||||||
|
|
||||||
airline_label = f" [{airline_filter}]" if airline_filter else ""
|
results = _validate_results(results, f"kayak{airline_label}", "premium_economy")
|
||||||
print(f"[KY{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[KY{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
@ -595,6 +617,11 @@ def scrape_trip(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
r["booking_url"] = booking_url
|
r["booking_url"] = booking_url
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
|
pe_confirmed = _check_cabin_on_page(body, title, "premium_economy")
|
||||||
|
if not pe_confirmed:
|
||||||
|
print("[TR] WARNUNG: Premium Economy nicht auf Seite bestätigt!")
|
||||||
|
|
||||||
|
results = _validate_results(results, "trip", "premium_economy")
|
||||||
print(f"[TR] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[TR] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
@ -637,8 +664,6 @@ def scrape_kayak_multicity(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
|
|
||||||
booking_url = _booking_url_kayak_multicity(von, nach, via, abflug, via_datum, rueck,
|
booking_url = _booking_url_kayak_multicity(von, nach, via, abflug, via_datum, rueck,
|
||||||
kc, bags, airline_filter)
|
kc, bags, airline_filter)
|
||||||
booking_url_raw = _booking_url_kayak_multicity(von, nach, via, abflug, via_datum, rueck,
|
|
||||||
kc, 0, airline_filter)
|
|
||||||
|
|
||||||
print(f"[MC{airline_label}] Multi-City via {via}: {abflug} → +1T → {rueck}")
|
print(f"[MC{airline_label}] Multi-City via {via}: {abflug} → +1T → {rueck}")
|
||||||
print(f"[MC{airline_label}] URL: {booking_url}")
|
print(f"[MC{airline_label}] URL: {booking_url}")
|
||||||
|
|
@ -683,19 +708,7 @@ def scrape_kayak_multicity(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
r["airline"] = airline_filter or via
|
r["airline"] = airline_filter or via
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
# Fallback ohne Bags-Filter
|
results = _validate_results(results, f"multicity{airline_label}", "premium_economy")
|
||||||
if not results and bags > 0:
|
|
||||||
print(f"[MC] Kein Ergebnis mit Bags — Fallback ohne Bags-Filter")
|
|
||||||
sb.open(booking_url_raw)
|
|
||||||
sb.sleep(12)
|
|
||||||
body2 = sb.get_text("body")
|
|
||||||
for r in _preise_aus_body(body2, "kayak_multicity", abflug):
|
|
||||||
if r["preis"] > 600:
|
|
||||||
r["ankunft"] = rueck
|
|
||||||
r["booking_url"] = booking_url_raw
|
|
||||||
r["airline"] = airline_filter or via
|
|
||||||
results.append(r)
|
|
||||||
|
|
||||||
print(f"[MC{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[MC{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
@ -715,9 +728,6 @@ def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
booking_url = _booking_url_momondo(von, nach, abflug, rueck, kc, bags,
|
booking_url = _booking_url_momondo(von, nach, abflug, rueck, kc, bags,
|
||||||
layover_min, layover_max, airline_filter,
|
layover_min, layover_max, airline_filter,
|
||||||
max_flugzeit_h, max_stops)
|
max_flugzeit_h, max_stops)
|
||||||
booking_url_raw = _booking_url_momondo(von, nach, abflug, rueck, kc, 0,
|
|
||||||
layover_min, layover_max, airline_filter,
|
|
||||||
max_flugzeit_h, max_stops)
|
|
||||||
airline_label = f" [{airline_filter}]" if airline_filter else ""
|
airline_label = f" [{airline_filter}]" if airline_filter else ""
|
||||||
print(f"[MO{airline_label}] URL: {booking_url}")
|
print(f"[MO{airline_label}] URL: {booking_url}")
|
||||||
|
|
||||||
|
|
@ -772,18 +782,11 @@ def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
r["airline"] = airline_filter or ""
|
r["airline"] = airline_filter or ""
|
||||||
results.append(r)
|
results.append(r)
|
||||||
|
|
||||||
# Fallback ohne Bags-Filter
|
pe_confirmed = _check_cabin_on_page(body, title, "premium_economy")
|
||||||
if not results and bags > 0:
|
if not pe_confirmed:
|
||||||
print(f"[MO] Kein Ergebnis — Fallback ohne Bags-Filter")
|
print(f"[MO{airline_label}] WARNUNG: Premium Economy nicht auf Seite bestätigt!")
|
||||||
sb.open(booking_url_raw)
|
|
||||||
sb.sleep(12)
|
|
||||||
body2 = sb.get_text("body")
|
|
||||||
for r in _preise_aus_body(body2, "momondo", abflug):
|
|
||||||
r["ankunft"] = rueck
|
|
||||||
r["booking_url"] = booking_url_raw
|
|
||||||
r["airline"] = airline_filter or ""
|
|
||||||
results.append(r)
|
|
||||||
|
|
||||||
|
results = _validate_results(results, f"momondo{airline_label}", "premium_economy")
|
||||||
print(f"[MO{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[MO{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
@ -929,6 +932,7 @@ def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
pass
|
pass
|
||||||
|
|
||||||
results.sort(key=lambda x: x["preis"])
|
results.sort(key=lambda x: x["preis"])
|
||||||
|
results = _validate_results(results, "traveloka", "premium_economy")
|
||||||
print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue