feat: Roundtrip FRA→KTI (Phnom Penh Techo Airport), Premium Economy, HAN als Umstieg

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Cursor 2026-02-25 14:37:47 +07:00
parent 190281e2bb
commit 0a9e65b71a
4 changed files with 129 additions and 121 deletions

View file

@ -33,6 +33,9 @@ def init_db():
von TEXT NOT NULL,
nach TEXT NOT NULL,
tage INTEGER DEFAULT 30,
aufenthalt_tage INTEGER DEFAULT 14,
trip_type TEXT DEFAULT 'roundtrip',
kabine TEXT DEFAULT 'premium_economy',
intervall TEXT DEFAULT 'daily',
aktiv INTEGER DEFAULT 1,
created_at TEXT DEFAULT (datetime('now'))
@ -119,10 +122,13 @@ TREND: [STEIGEND / FALLEND / STABIL]'
('flugscanner-mu', '100.75.182.15', 'Muldenstein DE')
""")
# Standard-Job: Frankfurt → Hanoi, Premium Economy
# Standard-Jobs: Frankfurt → Phnom Penh (KTI), Premium Economy, Roundtrip
# HAN ist Umstieg — wird automatisch als günstigste Verbindung gefunden
c.execute("""
INSERT OR IGNORE INTO jobs (scanner, von, nach, tage, intervall) VALUES
('google_flights', 'FRA', 'HAN', 30, 'daily')
INSERT OR IGNORE INTO jobs (scanner, von, nach, tage, aufenthalt_tage, trip_type, kabine, intervall) VALUES
('google_flights', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily'),
('kayak', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily'),
('trip', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily')
""")
conn.commit()

View file

@ -52,7 +52,10 @@ def dispatch_job(node, job):
"scanner": job["scanner"],
"von": job["von"],
"nach": job["nach"],
"tage": job["tage"]
"tage": job["tage"],
"aufenthalt_tage": job.get("aufenthalt_tage", 14),
"trip_type": job.get("trip_type", "roundtrip"),
"kabine": job.get("kabine", "premium_economy"),
}
log(f"Job an {node['name']} ({node['tailscale_ip']}): {payload}")
try:

View file

@ -14,15 +14,18 @@ def status():
@app.route("/job", methods=["POST"])
def job():
data = request.json
scanner = data.get("scanner", "google_flights")
von = data.get("von", "FRA")
nach = data.get("nach", "PNH")
tage = data.get("tage", 30)
scanner = data.get("scanner", "google_flights")
von = data.get("von", "FRA")
nach = data.get("nach", "KTI")
tage = data.get("tage", 30)
aufenthalt = data.get("aufenthalt_tage", 14)
trip_type = data.get("trip_type", "roundtrip")
kabine = data.get("kabine", "premium_economy")
print(f"[{NODE_NAME}] Job: {scanner} {von}{nach} ({tage} Tage)")
print(f"[{NODE_NAME}] Job: {scanner} {von}{nach} ({trip_type}, {kabine}, +{tage}Tage/{aufenthalt}Tage)")
try:
results = scrape(scanner, von, nach, tage)
results = scrape(scanner, von, nach, tage, aufenthalt, trip_type, kabine)
print(f"[{NODE_NAME}] {len(results)} Preise gefunden")
return jsonify({
"results": results,

View file

@ -3,7 +3,8 @@ from datetime import datetime, timedelta
import re
def scrape(scanner, von, nach, tage=30):
def scrape(scanner, von, nach, tage=30, aufenthalt_tage=14,
trip_type="roundtrip", kabine="premium_economy"):
dispatcher = {
"google_flights": scrape_google_flights,
"kayak": scrape_kayak,
@ -13,19 +14,26 @@ def scrape(scanner, von, nach, tage=30):
fn = dispatcher.get(scanner)
if not fn:
raise ValueError(f"Unbekannter Scanner: {scanner}")
return fn(von, nach, tage)
return fn(von, nach, tage, aufenthalt_tage, trip_type, kabine)
# ── Cabin-class codes ──────────────────────────────────────────────────────────
# Maps the internal cabin name used in jobs ("premium_economy", ...) to each
# provider's URL parameter value: Google Flights (sc:), Kayak (cabin=) and
# trip.com (class=). Google and Kayak share lowercase letters; trip.com uses
# IATA-style booking-class letters.
KABINE_GOOGLE = {"economy": "e", "premium_economy": "w", "business": "b", "first": "f"}
KABINE_KAYAK = {"economy": "e", "premium_economy": "w", "business": "b", "first": "f"}
KABINE_TRIP = {"economy": "Y", "premium_economy": "W", "business": "C", "first": "F"}
def _parse_preis(text):
if not text:
return None
text = text.replace('\xa0', ' ').replace('\u202f', ' ').replace(',', '.')
for p in [r'(\d{3,4})\s?€', r'\s?(\d{3,4})', r'EUR\s?(\d{3,4})', r'(\d{3,4})\s?EUR']:
text = text.replace('\xa0', ' ').replace('\u202f', ' ')
for p in [r'(\d{1,2}[.,]\d{3})\s?€', r'(\d{3,5})\s?€',
r'\s?(\d{3,5})', r'EUR\s?(\d{3,5})', r'(\d{3,5})\s?EUR']:
m = re.search(p, text)
if m:
try:
v = float(m.group(1).replace('.', ''))
if 200 < v < 8000:
v = float(m.group(1).replace('.', '').replace(',', ''))
if 200 < v < 15000:
return round(v, 2)
except ValueError:
pass
@ -35,12 +43,11 @@ def _parse_preis(text):
def _preise_aus_body(body, scanner, abflug):
results = []
seen = set()
# Alle Zahlen 300-6000 die neben einem € stehen
for m in re.finditer(r'(\d[\d\s\.]{1,5})\s?€|€\s?(\d[\d\s\.]{1,5})', body):
raw = (m.group(1) or m.group(2)).replace(' ', '').replace('.', '')
try:
v = float(raw)
if 300 < v < 6000 and v not in seen:
if 300 < v < 12000 and v not in seen:
seen.add(v)
results.append({
"scanner": scanner, "preis": v, "waehrung": "EUR",
@ -52,98 +59,85 @@ def _preise_aus_body(body, scanner, abflug):
return results[:10]
def scrape_google_flights(von, nach, tage=30):
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
# sc:w = Premium Economy
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w")
def _consent_google(sb):
"""Google Consent-Seite (DSGVO) behandeln."""
if "consent" in sb.get_current_url() or "Bevor Sie" in sb.get_title():
print("[CONSENT] Google Consent erkannt")
for sel in ['form[action*="save"] button', 'button[jsname="tHlp8d"]',
'.lssxud button', 'button[aria-label*="kzeptieren"]']:
try:
sb.click(sel, timeout=3)
sb.sleep(4)
print(f"[CONSENT] Geklickt: {sel}")
return True
except Exception:
pass
return False
log = []
def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=14,
trip_type="roundtrip", kabine="premium_economy"):
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
kc = KABINE_GOOGLE.get(kabine, "w")
if trip_type == "roundtrip":
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck};c:EUR;e:1;sd:1;t:r;sc:{kc}")
else:
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:{kc}")
print(f"[GF] URL: {url[:100]}")
results = []
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
sb.open(url)
sb.sleep(7)
current_url = sb.get_current_url()
title = sb.get_title()
log.append(f"[GF] title={title[:60]} url={current_url[:80]}")
# Consent-Seite behandeln
if "consent" in current_url or "Bevor Sie" in title:
log.append("[GF] Consent-Seite erkannt")
for sel in ['form[action*="save"] button', 'button[jsname="tHlp8d"]',
'.lssxud button', 'button[aria-label*="akzeptieren"]']:
try:
sb.click(sel, timeout=3)
sb.sleep(4)
log.append(f"[GF] Consent geklickt: {sel}")
break
except Exception:
continue
# Zurück zur Such-URL
if _consent_google(sb):
sb.open(url)
sb.sleep(8)
log.append(f"[GF] Nach Consent-Redirect: {sb.get_title()[:60]}")
# Checken ob wir auf der Suchergebnisseite sind
title2 = sb.get_title()
if "Günstige Flüge" in title2 or "Google Flüge" in title2:
log.append("[GF] WARNUNG: Auf Homepage gelandet, versuche Formular")
# Formular direkt ausfüllen
try:
# Von-Feld leeren und FRA eingeben
sb.click('input[placeholder*="Von"]', timeout=4)
sb.triple_click('input[placeholder*="Von"]')
sb.type('input[placeholder*="Von"]', von)
sb.sleep(2)
# Ersten Autocomplete-Eintrag nehmen
sb.click('li[data-code="' + von + '"]', timeout=3)
sb.sleep(1)
# Nach-Feld
sb.click('input[placeholder*="Wohin"]', timeout=3)
sb.type('input[placeholder*="Wohin"]', nach)
sb.sleep(2)
sb.click('li[data-code="' + nach + '"]', timeout=3)
sb.sleep(1)
log.append("[GF] Formular ausgefüllt, suche...")
sb.sleep(5)
except Exception as e:
log.append(f"[GF] Formular-Fehler: {e}")
title = sb.get_title()
print(f"[GF] Title: {title[:80]}")
# Preise extrahieren
body = sb.get_text("body")
log.append(f"[GF] Body-Länge: {len(body)} Zeichen")
print(f"[GF] Body-Länge: {len(body)}")
# Aria-Labels mit €
try:
elems = sb.find_elements('[aria-label*=""]', timeout=3)
log.append(f"[GF] aria-label €-Elemente: {len(elems)}")
for elem in elems[:15]:
for elem in elems[:20]:
label = elem.get_attribute('aria-label') or elem.text
p = _parse_preis(label)
if p:
results.append({"scanner": "google_flights", "preis": p,
"waehrung": "EUR", "airline": "", "abflug": abflug, "ankunft": ""})
"waehrung": "EUR", "airline": "",
"abflug": abflug, "ankunft": rueck if trip_type == "roundtrip" else ""})
except Exception:
pass
# Fallback: Regex über Body
# Fallback Regex
if not results:
results = _preise_aus_body(body, "google_flights", abflug)
log.append(f"[GF] Regex-Fallback: {len(results)} Preise")
log.append(f"[GF] Ergebnis: {[r['preis'] for r in results]}")
print('\n'.join(log))
print(f"[GF] Ergebnis: {[r['preis'] for r in results[:5]]}")
return results[:10]
def scrape_kayak(von, nach, tage=30):
def scrape_kayak(von, nach, tage=30, aufenthalt_tage=14,
trip_type="roundtrip", kabine="premium_economy"):
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&cabin=w&currency=EUR"
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
kc = KABINE_KAYAK.get(kabine, "w")
log = []
if trip_type == "roundtrip":
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}/{rueck}?sort=price_a&cabin={kc}&currency=EUR"
else:
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&cabin={kc}&currency=EUR"
print(f"[KY] URL: {url}")
results = []
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
@ -152,54 +146,57 @@ def scrape_kayak(von, nach, tage=30):
title = sb.get_title()
body = sb.get_text("body")
log.append(f"[KY] title={title[:60]}")
log.append(f"[KY] body-länge={len(body)}")
log.append(f"[KY] body-500={body[:500]}")
print(f"[KY] Title: {title[:80]}")
print(f"[KY] Body-500: {body[:300]}")
# CSS-Selektoren Kayak
for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
'span[class*="price"]', '.Iqt3', 'div.nrc6-price']:
'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price']:
try:
elems = sb.find_elements(sel, timeout=2)
if elems:
log.append(f"[KY] Selector '{sel}': {len(elems)} Elemente")
for e in elems[:10]:
for e in elems[:15]:
p = _parse_preis(e.text)
if p:
results.append({"scanner": "kayak", "preis": p,
"waehrung": "EUR", "airline": "",
"abflug": abflug, "ankunft": ""})
"abflug": abflug,
"ankunft": rueck if trip_type == "roundtrip" else ""})
if results:
break
except Exception:
pass
# Fallback Regex
if not results:
results = _preise_aus_body(body, "kayak", abflug)
log.append(f"[KY] Ergebnis: {[r['preis'] for r in results]}")
print('\n'.join(log))
print(f"[KY] Ergebnis: {[r['preis'] for r in results[:5]]}")
return results[:10]
def scrape_skyscanner(von, nach, tage=30):
"""Skyscanner hat starken Bot-Schutz — für jetzt übersprungen."""
print("[SS] Skyscanner übersprungen (Bot-Detection)")
return []
def scrape_trip(von, nach, tage=30):
"""trip.com — gut von Asia-IPs, oft günstigere Asien-Preise."""
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
def scrape_trip(von, nach, tage=30, aufenthalt_tage=14,
trip_type="roundtrip", kabine="premium_economy"):
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y%m%d")
abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
# W = Premium Economy bei trip.com
url = (f"https://www.trip.com/flights/{von.lower()}-to-{nach.lower()}/"
f"tickets-{von.lower()}-{nach.lower()}/"
f"?DDate1={abflug}&class=W&curr=EUR")
rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
kc = KABINE_TRIP.get(kabine, "W")
log = []
# trip.com Stadtname-Mapping für URL
stadtname = {"FRA": "frankfurt", "HAN": "hanoi", "KTI": "phnom-penh",
"PNH": "phnom-penh", "BKK": "bangkok", "SGN": "ho-chi-minh-city"}
von_name = stadtname.get(von, von.lower())
nach_name = stadtname.get(nach, nach.lower())
if trip_type == "roundtrip":
url = (f"https://www.trip.com/flights/{von_name}-to-{nach_name}/"
f"tickets-{von.lower()}-{nach.lower()}/"
f"?DDate1={abflug}&DDate2={rueck}&class={kc}&curr=EUR")
else:
url = (f"https://www.trip.com/flights/{von_name}-to-{nach_name}/"
f"tickets-{von.lower()}-{nach.lower()}/"
f"?DDate1={abflug}&class={kc}&curr=EUR")
print(f"[TR] URL: {url}")
results = []
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
@ -208,12 +205,11 @@ def scrape_trip(von, nach, tage=30):
title = sb.get_title()
body = sb.get_text("body")
log.append(f"[TR] title={title[:60]}")
log.append(f"[TR] body-länge={len(body)}")
print(f"[TR] Title: {title[:80]}")
# Cookie-Banner
for sel in ['button[id*="accept"]', 'button[class*="accept"]',
'button[aria-label*="Accept"]', '.cookie-accept']:
'button[aria-label*="Accept"]', '.cookie-accept', '#onetrust-accept-btn-handler']:
try:
sb.click(sel, timeout=2)
sb.sleep(2)
@ -221,32 +217,32 @@ def scrape_trip(von, nach, tage=30):
except Exception:
pass
# CSS-Selektoren trip.com
for sel in ['.price-box .price', '.flight-price', 'span[class*="price"]',
'div[class*="price-num"]', '.price-content span', 'em[class*="price"]']:
'div[class*="price-num"]', 'em[class*="price"]', '.c-price']:
try:
elems = sb.find_elements(sel, timeout=2)
if elems:
log.append(f"[TR] Selector '{sel}': {len(elems)} Elemente")
for e in elems[:10]:
p = _parse_preis(e.text)
if p:
results.append({
"scanner": "trip", "preis": p,
"waehrung": "EUR", "airline": "",
"abflug": abflug_iso, "ankunft": ""
})
results.append({"scanner": "trip", "preis": p,
"waehrung": "EUR", "airline": "",
"abflug": abflug_iso,
"ankunft": rueck_iso if trip_type == "roundtrip" else ""})
if results:
break
except Exception:
pass
# Fallback Regex
if not results:
results = _preise_aus_body(body, "trip", abflug_iso)
log.append(f"[TR] Regex-Fallback: {len(results)} Preise")
log.append(f"[TR] Ergebnis: {[r['preis'] for r in results]}")
print('\n'.join(log))
print(f"[TR] Ergebnis: {[r['preis'] for r in results[:5]]}")
return results[:10]
def scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=14,
                      trip_type="roundtrip", kabine="premium_economy"):
    """Disabled provider: Skyscanner's bot detection blocks headless
    scraping, so this scanner is a no-op that reports zero prices.
    The signature mirrors the other scrape_* providers for dispatching.
    """
    print("[SS] Skyscanner übersprungen (Bot-Detection)")
    results = []
    return results