feat: Momondo + Wego + Traveloka Scanner, Google Flights deaktiviert
- worker.py: scrape_momondo() — gleiche Firma wie Kayak, andere Preise
- worker.py: scrape_wego() — asiatische Flugsuchmaschine
- worker.py: scrape_traveloka() — größte SE-Asien Reiseplattform
- worker.py: Google Flights + Skyscanner auf _scrape_disabled gesetzt
- db.py: Dispatcher um momondo/wego/traveloka erweitert

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f8797f3e08
commit
207485fb8f
1 changed files with 279 additions and 5 deletions
|
|
@ -3,6 +3,12 @@ from datetime import datetime, timedelta
|
|||
import re
|
||||
|
||||
|
||||
def _scrape_disabled(*args, **kwargs):
|
||||
"""Deaktivierter Scanner — gibt leere Ergebnisse zurück."""
|
||||
print("[SKIP] Scanner deaktiviert")
|
||||
return [], ""
|
||||
|
||||
|
||||
def scrape(scanner, von, nach, tage=30, aufenthalt_tage=60,
|
||||
trip_type="roundtrip", kabine="premium_economy",
|
||||
gepaeck="1koffer+handgepaeck", airline_filter="",
|
||||
|
|
@ -15,10 +21,13 @@ def scrape(scanner, von, nach, tage=30, aufenthalt_tage=60,
|
|||
screenshot_b64 = JPEG Full-Page Screenshot als base64-String (leer wenn Fehler)
|
||||
"""
|
||||
dispatcher = {
|
||||
"google_flights": scrape_google_flights,
|
||||
"google_flights": _scrape_disabled,
|
||||
"kayak": scrape_kayak,
|
||||
"kayak_multicity": scrape_kayak_multicity,
|
||||
"skyscanner": scrape_skyscanner,
|
||||
"momondo": scrape_momondo,
|
||||
"wego": scrape_wego,
|
||||
"traveloka": scrape_traveloka,
|
||||
"skyscanner": _scrape_disabled,
|
||||
"trip": scrape_trip,
|
||||
}
|
||||
fn = dispatcher.get(scanner)
|
||||
|
|
@ -90,6 +99,28 @@ def _booking_url_kayak(von, nach, abflug, rueck, kc, bags=1,
|
|||
return f"{base}?sort=price_a&cabin={kc}¤cy=EUR{fs}"
|
||||
|
||||
|
||||
def _booking_url_momondo(von, nach, abflug, rueck, kc, bags=1,
|
||||
layover_min=120, layover_max=300, airline="",
|
||||
max_flugzeit_h=22, max_stops=2):
|
||||
"""Momondo URL — gleiche Struktur wie Kayak (Booking Holdings), andere Domain."""
|
||||
filters = []
|
||||
if bags:
|
||||
filters.append(f"bfc%3D{bags}")
|
||||
if layover_min and layover_max:
|
||||
filters.append(f"ctr%3D{layover_min}%2C{layover_max}")
|
||||
if max_flugzeit_h:
|
||||
filters.append(f"duration%3D-{max_flugzeit_h * 60}")
|
||||
if max_stops is not None and max_stops < 10:
|
||||
filters.append(f"s%3D{max_stops}")
|
||||
if airline:
|
||||
filters.append(f"airlines%3D{airline}")
|
||||
fs = ("&fs=" + "%3B".join(filters)) if filters else ""
|
||||
base = f"https://www.momondo.de/flightsearch/{von}-{nach}/{abflug}"
|
||||
if rueck:
|
||||
return f"{base}/{rueck}?sort=price_a&cabin={kc}¤cy=EUR{fs}"
|
||||
return f"{base}?sort=price_a&cabin={kc}¤cy=EUR{fs}"
|
||||
|
||||
|
||||
def _booking_url_trip(von, nach, abflug_fmt, rueck_fmt, kc, von_name, nach_name):
|
||||
if rueck_fmt:
|
||||
return (f"https://www.trip.com/flights/{von_name}-to-{nach_name}/"
|
||||
|
|
@ -214,10 +245,33 @@ def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=60,
|
|||
print(f"[GF] Suche: {von_name}→{nach_name} {abflug_de}")
|
||||
|
||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||
sb.open("https://www.google.com/travel/flights?hl=de&curr=EUR")
|
||||
sb.sleep(5)
|
||||
# ── Strategie 1: Direkte URL mit Datums-Parametern ─────────────────
|
||||
# Google Flights verarbeitet den Hash-Fragment erst nach JS-Ausführung
|
||||
direct_url = (
|
||||
f"https://www.google.com/travel/flights?hl=de&curr=EUR"
|
||||
f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck}"
|
||||
f";c:EUR;e:1;sd:1;t:r;sc:w"
|
||||
) if rueck else (
|
||||
f"https://www.google.com/travel/flights?hl=de&curr=EUR"
|
||||
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w"
|
||||
)
|
||||
sb.open(direct_url)
|
||||
sb.sleep(8)
|
||||
_consent_google(sb)
|
||||
sb.sleep(2)
|
||||
sb.sleep(3)
|
||||
title_direct = sb.get_title()
|
||||
print(f"[GF] URL-Ansatz: {title_direct[:60]}")
|
||||
|
||||
# Wenn direkte URL Ergebnisse liefert (Titel enthält Städtenamen)
|
||||
url_erfolgreich = any(kw in title_direct for kw in
|
||||
[von, nach, "FRA", "KTI", "Frankfurt", "Phnom", "Flüge"])
|
||||
if not url_erfolgreich:
|
||||
# ── Strategie 2: Startseite + Formular befüllen ─────────────────
|
||||
print("[GF] Direktlink kein Ergebnis — wechsle zu Formular-Ansatz")
|
||||
sb.open("https://www.google.com/travel/flights?hl=de&curr=EUR")
|
||||
sb.sleep(5)
|
||||
_consent_google(sb)
|
||||
sb.sleep(2)
|
||||
|
||||
# ── 1. Kabine auf "Premium Economy" setzen ──────────────────────────
|
||||
try:
|
||||
|
|
@ -647,6 +701,226 @@ def scrape_kayak_multicity(von, nach, tage=30, aufenthalt_tage=60,
|
|||
return results[:10], screenshot_b64
|
||||
|
||||
|
||||
def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
                   trip_type="roundtrip", kabine="premium_economy",
                   gepaeck="1koffer+handgepaeck", airline_filter="",
                   layover_min=120, layover_max=300,
                   max_flugzeit_h=22, max_stops=2):
    """Scrape Momondo — same company as Kayak, but often different prices.

    Builds a filtered Momondo deep-link, loads it in an undetected-chromedriver
    browser session, and harvests prices first via CSS selectors, then from the
    raw page text as a fallback.

    Args:
        von: Origin IATA code.
        nach: Destination IATA code.
        tage: Days from today until departure.
        aufenthalt_tage: Length of stay in days (return = tage + aufenthalt_tage).
        trip_type: "roundtrip" sets a return date; anything else means one-way.
        kabine: Cabin key mapped through KABINE_KAYAK (default code "w").
        gepaeck: Baggage spec; substring "koffer" enables the 1-bag filter.
        airline_filter: Optional airline code passed into the URL filter.
        layover_min / layover_max: Layover window in minutes.
        max_flugzeit_h: Maximum total flight duration in hours.
        max_stops: Maximum number of stops.

    Returns:
        tuple: (results, screenshot_b64) — at most 10 result dicts with keys
        scanner/preis/waehrung/airline/abflug/ankunft/booking_url, plus a
        base64 JPEG screenshot string ("" on failure).
    """
    abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
        if trip_type == "roundtrip" else ""
    kc = KABINE_KAYAK.get(kabine, "w")
    bags = 1 if "koffer" in gepaeck else 0
    # Primary URL with the bag filter applied...
    booking_url = _booking_url_momondo(von, nach, abflug, rueck, kc, bags,
                                       layover_min, layover_max, airline_filter,
                                       max_flugzeit_h, max_stops)
    # ...and a fallback URL without bags, used when the filtered search is empty.
    booking_url_raw = _booking_url_momondo(von, nach, abflug, rueck, kc, 0,
                                           layover_min, layover_max, airline_filter,
                                           max_flugzeit_h, max_stops)
    airline_label = f" [{airline_filter}]" if airline_filter else ""
    print(f"[MO{airline_label}] URL: {booking_url}")

    results = []
    screenshot_b64 = ""

    # uc=True: undetected-chromedriver mode to evade bot detection;
    # sandbox/dev-shm flags are required when running inside containers.
    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(booking_url)
        sb.sleep(15)  # long fixed wait: results load asynchronously

        title = sb.get_title()
        body = sb.get_text("body")
        print(f"[MO] Title: {title[:80]}")

        # Try a list of known price selectors (Kayak/Momondo class names churn
        # frequently); stop at the first selector that yields any parsed price.
        for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
                    'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price',
                    '[class*="resultPrice"]', '.lowest-price']:
            try:
                elems = sb.find_elements(sel)
                if elems:
                    for e in elems[:15]:  # cap work per selector
                        p = _parse_preis(e.text)
                        if p:
                            results.append({"scanner": "momondo", "preis": p,
                                            "waehrung": "EUR",
                                            "airline": airline_filter or "",
                                            "abflug": abflug, "ankunft": rueck,
                                            "booking_url": booking_url})
                    if results:
                        break
            except Exception:
                # Selector not present / stale element — try the next one.
                pass

        # Fallback 1: regex-mine prices out of the whole page text.
        if not results:
            for r in _preise_aus_body(body, "momondo", abflug):
                r["ankunft"] = rueck
                r["booking_url"] = booking_url
                r["airline"] = airline_filter or ""
                results.append(r)

        # Fallback 2: retry without the bag filter (search may be over-filtered).
        if not results and bags > 0:
            print(f"[MO] Kein Ergebnis — Fallback ohne Bags-Filter")
            sb.open(booking_url_raw)
            sb.sleep(12)
            body2 = sb.get_text("body")
            for r in _preise_aus_body(body2, "momondo", abflug):
                r["ankunft"] = rueck
                r["booking_url"] = booking_url_raw
                r["airline"] = airline_filter or ""
                results.append(r)

        print(f"[MO{airline_label}] Ergebnis: {[r['preis'] for r in results[:5]]}")
        screenshot_b64 = _take_screenshot(sb)
    return results[:10], screenshot_b64
|
||||
|
||||
|
||||
def scrape_wego(von, nach, tage=30, aufenthalt_tage=60,
                trip_type="roundtrip", kabine="premium_economy",
                gepaeck="1koffer+handgepaeck", airline_filter="",
                layover_min=120, layover_max=300,
                max_flugzeit_h=22, max_stops=2):
    """Scrape Wego — Asian flight metasearch engine, popular in Southeast Asia.

    Builds a Wego search URL, loads it in an undetected-chromedriver session
    and extracts prices via CSS selectors, falling back to regex-mining the
    page text. Note: the layover/duration/stops/baggage/airline parameters are
    accepted for dispatcher compatibility but are NOT encoded into the Wego URL.

    Args:
        von: Origin IATA code (lower-cased for the URL).
        nach: Destination IATA code (lower-cased for the URL).
        tage: Days from today until departure.
        aufenthalt_tage: Length of stay in days.
        trip_type: "roundtrip" adds an inbound date; anything else is one-way.
        kabine: Cabin key mapped to Wego's camelCase cabin codes.
        gepaeck, airline_filter, layover_min, layover_max,
        max_flugzeit_h, max_stops: Unused here (see note above).

    Returns:
        tuple: (results, screenshot_b64) — at most 10 result dicts plus a
        base64 JPEG screenshot string ("" on failure).
    """
    abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
        if trip_type == "roundtrip" else ""

    KABINE_WEGO = {"economy": "economy", "premium_economy": "premiumEconomy",
                   "business": "business", "first": "first"}
    kc = KABINE_WEGO.get(kabine, "premiumEconomy")

    # BUGFIX: the currency parameter was mojibake ("¤cy=EUR" — HTML-entity
    # rendering of "&curren"); restored to the intended "&currency=EUR".
    # The inbound date is the only difference between round-trip and one-way.
    inbound = f"&inbound_date={rueck}" if rueck else ""
    booking_url = (f"https://www.wego.com/flights/searches/new?"
                   f"origin={von.lower()}&destination={nach.lower()}"
                   f"&outbound_date={abflug}{inbound}"
                   f"&cabin={kc}&adults=1&children=0&infants=0"
                   f"&currency=EUR&sort=price")

    print(f"[WG] URL: {booking_url}")
    results = []
    screenshot_b64 = ""

    # uc=True: undetected-chromedriver mode; container-safe Chrome flags.
    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(booking_url)
        sb.sleep(18)  # Wego renders results client-side; generous fixed wait

        title = sb.get_title()
        body = sb.get_text("body")
        print(f"[WG] Title: {title[:80]} | Body: {len(body)} chars")

        # Try candidate price selectors in order; stop at the first that
        # yields at least one parseable price.
        for sel in ['[class*="price"]', '[data-testid*="price"]',
                    '.flight-price', 'span[class*="Price"]',
                    '.fare-price', '[class*="FarePrice"]']:
            try:
                elems = sb.find_elements(sel)
                if elems:
                    for e in elems[:15]:  # cap work per selector
                        p = _parse_preis(e.text)
                        if p:
                            results.append({"scanner": "wego", "preis": p,
                                            "waehrung": "EUR", "airline": "",
                                            "abflug": abflug, "ankunft": rueck,
                                            "booking_url": booking_url})
                    if results:
                        break
            except Exception:
                # Selector absent or element went stale — try the next one.
                pass

        # Fallback: regex-mine prices out of the whole page text.
        if not results:
            for r in _preise_aus_body(body, "wego", abflug):
                r["ankunft"] = rueck
                r["booking_url"] = booking_url
                results.append(r)

        print(f"[WG] Ergebnis: {[r['preis'] for r in results[:5]]}")
        screenshot_b64 = _take_screenshot(sb)
    return results[:10], screenshot_b64
|
||||
|
||||
|
||||
def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
                     trip_type="roundtrip", kabine="premium_economy",
                     gepaeck="1koffer+handgepaeck", airline_filter="",
                     layover_min=120, layover_max=300,
                     max_flugzeit_h=22, max_stops=2):
    """Scrape Traveloka — Southeast Asia's largest travel platform.

    Builds a Traveloka full-search URL (dates in DD-MM-YYYY format), dismisses
    the cookie banner, then extracts prices via CSS selectors with a raw
    page-text fallback. Result dates are reported in ISO format. Note: the
    layover/duration/stops/baggage/airline parameters are accepted for
    dispatcher compatibility but are NOT encoded into the URL.

    Args:
        von: Origin IATA code.
        nach: Destination IATA code.
        tage: Days from today until departure.
        aufenthalt_tage: Length of stay in days.
        trip_type: "roundtrip" adds a return date; anything else is one-way.
        kabine: Cabin key mapped to Traveloka's UPPER_SNAKE cabin codes.
        gepaeck, airline_filter, layover_min, layover_max,
        max_flugzeit_h, max_stops: Unused here (see note above).

    Returns:
        tuple: (results, screenshot_b64) — at most 10 result dicts (with ISO
        dates) plus a base64 JPEG screenshot string ("" on failure).
    """
    # Traveloka's URL wants DD-MM-YYYY; keep ISO copies for the result dicts
    # so output stays consistent with the other scanners.
    abflug = (datetime.now() + timedelta(days=tage)).strftime("%d-%m-%Y")
    rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%d-%m-%Y") \
        if trip_type == "roundtrip" else ""
    abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
        if trip_type == "roundtrip" else ""

    # Traveloka URL parameters
    KABINE_TV = {"economy": "ECONOMY", "premium_economy": "PREMIUM_ECONOMY",
                 "business": "BUSINESS", "first": "FIRST_CLASS"}
    kc = KABINE_TV.get(kabine, "PREMIUM_ECONOMY")

    # ap = airport pair, dt = date(s), ps = passengers (adults.children.infants),
    # sc = seat class — presumably Traveloka's fullsearch schema; TODO confirm.
    if rueck:
        booking_url = (f"https://www.traveloka.com/en-en/flight/fullsearch?"
                       f"ap={von}.{nach}&dt={abflug}.{rueck}"
                       f"&ps=1.0.0&sc={kc}")
    else:
        booking_url = (f"https://www.traveloka.com/en-en/flight/fullsearch?"
                       f"ap={von}.{nach}&dt={abflug}"
                       f"&ps=1.0.0&sc={kc}")

    print(f"[TV] URL: {booking_url}")
    results = []
    screenshot_b64 = ""

    # uc=True: undetected-chromedriver mode; container-safe Chrome flags.
    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(booking_url)
        sb.sleep(18)  # results render client-side; generous fixed wait

        title = sb.get_title()
        body = sb.get_text("body")
        print(f"[TV] Title: {title[:80]} | Body: {len(body)} chars")

        # Click the cookie/consent banner (first matching accept button wins).
        for sel in ['button[data-testid*="accept"]', 'button[id*="accept"]',
                    '#onetrust-accept-btn-handler', 'button[class*="accept"]']:
            try:
                sb.find_element(sel, timeout=2).click()
                sb.sleep(2)
                break
            except Exception:
                # Banner absent or selector mismatch — try the next candidate.
                pass

        # Try candidate price selectors in order; stop at the first that
        # yields at least one parseable price.
        for sel in ['[class*="price"]', '[data-testid*="price"]',
                    '.flight-price', 'span[class*="Price"]',
                    'div[class*="farePrice"]', '[class*="totalPrice"]']:
            try:
                elems = sb.find_elements(sel)
                if elems:
                    for e in elems[:15]:  # cap work per selector
                        p = _parse_preis(e.text)
                        if p:
                            results.append({"scanner": "traveloka", "preis": p,
                                            "waehrung": "EUR", "airline": "",
                                            "abflug": abflug_iso, "ankunft": rueck_iso,
                                            "booking_url": booking_url})
                    if results:
                        break
            except Exception:
                # Selector absent or element went stale — try the next one.
                pass

        # Fallback: regex-mine prices out of the whole page text.
        if not results:
            for r in _preise_aus_body(body, "traveloka", abflug_iso):
                r["ankunft"] = rueck_iso
                r["booking_url"] = booking_url
                results.append(r)

        print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}")
        screenshot_b64 = _take_screenshot(sb)
    return results[:10], screenshot_b64
|
||||
|
||||
|
||||
def scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=60,
|
||||
trip_type="roundtrip", kabine="premium_economy",
|
||||
gepaeck="1koffer+handgepaeck", airline_filter="",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue