fix: Momondo Consent+URL, Wego URL-Format, Traveloka USD→EUR Parsing
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
207485fb8f
commit
f1720ac629
1 changed files with 59 additions and 46 deletions
|
|
@ -115,7 +115,7 @@ def _booking_url_momondo(von, nach, abflug, rueck, kc, bags=1,
|
||||||
if airline:
|
if airline:
|
||||||
filters.append(f"airlines%3D{airline}")
|
filters.append(f"airlines%3D{airline}")
|
||||||
fs = ("&fs=" + "%3B".join(filters)) if filters else ""
|
fs = ("&fs=" + "%3B".join(filters)) if filters else ""
|
||||||
base = f"https://www.momondo.de/flightsearch/{von}-{nach}/{abflug}"
|
base = f"https://www.momondo.de/flight-search/{von}-{nach}/{abflug}"
|
||||||
if rueck:
|
if rueck:
|
||||||
return f"{base}/{rueck}?sort=price_a&cabin={kc}&currency=EUR{fs}"
|
return f"{base}/{rueck}?sort=price_a&cabin={kc}&currency=EUR{fs}"
|
||||||
return f"{base}?sort=price_a&cabin={kc}&currency=EUR{fs}"
|
return f"{base}?sort=price_a&cabin={kc}&currency=EUR{fs}"
|
||||||
|
|
@ -726,11 +726,25 @@ def scrape_momondo(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
|
|
||||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||||
sb.open(booking_url)
|
sb.open(booking_url)
|
||||||
sb.sleep(15)
|
sb.sleep(8)
|
||||||
|
|
||||||
|
# Momondo Cookie-Consent wegklicken
|
||||||
|
for sel in ['button[class*="accept"]', '.RxNS-button-content',
|
||||||
|
'#onetrust-accept-btn-handler', 'button[title*="akzeptieren"]',
|
||||||
|
'button[title*="Alle akzeptieren"]', '.evidon-banner-acceptbutton']:
|
||||||
|
try:
|
||||||
|
sb.find_element(sel, timeout=2).click()
|
||||||
|
print(f"[MO] Consent geklickt: {sel}")
|
||||||
|
sb.sleep(3)
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Nach Consent: Seite muss neu laden / Ergebnisse warten
|
||||||
|
sb.sleep(12)
|
||||||
title = sb.get_title()
|
title = sb.get_title()
|
||||||
body = sb.get_text("body")
|
body = sb.get_text("body")
|
||||||
print(f"[MO] Title: {title[:80]}")
|
print(f"[MO] Title: {title[:80]} | Body: {len(body)} chars")
|
||||||
|
|
||||||
for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
|
for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
|
||||||
'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price',
|
'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price',
|
||||||
|
|
@ -789,18 +803,18 @@ def scrape_wego(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
"business": "business", "first": "first"}
|
"business": "business", "first": "first"}
|
||||||
kc = KABINE_WEGO.get(kabine, "premiumEconomy")
|
kc = KABINE_WEGO.get(kabine, "premiumEconomy")
|
||||||
|
|
||||||
|
stadtname_wego = {"FRA": "frankfurt", "KTI": "phnom-penh", "HAN": "hanoi",
|
||||||
|
"BKK": "bangkok", "SGN": "ho-chi-minh-city", "HKG": "hong-kong"}
|
||||||
|
von_slug = stadtname_wego.get(von, von.lower())
|
||||||
|
nach_slug = stadtname_wego.get(nach, nach.lower())
|
||||||
if rueck:
|
if rueck:
|
||||||
booking_url = (f"https://www.wego.com/flights/searches/new?"
|
booking_url = (f"https://www.wego.com/flights/{von.lower()}/{nach.lower()}"
|
||||||
f"origin={von.lower()}&destination={nach.lower()}"
|
f"/{abflug}/{rueck}"
|
||||||
f"&outbound_date={abflug}&inbound_date={rueck}"
|
f"?cabin_class={kc}&adults_count=1&sort=price&currency_code=EUR")
|
||||||
f"&cabin={kc}&adults=1&children=0&infants=0"
|
|
||||||
f"&currency=EUR&sort=price")
|
|
||||||
else:
|
else:
|
||||||
booking_url = (f"https://www.wego.com/flights/searches/new?"
|
booking_url = (f"https://www.wego.com/flights/{von.lower()}/{nach.lower()}"
|
||||||
f"origin={von.lower()}&destination={nach.lower()}"
|
f"/{abflug}"
|
||||||
f"&outbound_date={abflug}"
|
f"?cabin_class={kc}&adults_count=1&sort=price&currency_code=EUR")
|
||||||
f"&cabin={kc}&adults=1&children=0&infants=0"
|
|
||||||
f"&currency=EUR&sort=price")
|
|
||||||
|
|
||||||
print(f"[WG] URL: {booking_url}")
|
print(f"[WG] URL: {booking_url}")
|
||||||
results = []
|
results = []
|
||||||
|
|
@ -843,12 +857,30 @@ def scrape_wego(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_preis_usd(text):
|
||||||
|
"""Parst USD-Preise aus Text wie 'USD 1,388.60' und wandelt grob in EUR um."""
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
# USD-Format: 1,388.60 (Komma als Tausender, Punkt als Dezimal)
|
||||||
|
for p in [r'USD\s?([\d,]+\.?\d*)', r'\$\s?([\d,]+\.?\d*)']:
|
||||||
|
m = re.search(p, text)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
v = float(m.group(1).replace(',', ''))
|
||||||
|
eur = round(v * 0.92, 2) # grobe USD→EUR Umrechnung
|
||||||
|
if 200 < eur < 15000:
|
||||||
|
return eur
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
|
def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
trip_type="roundtrip", kabine="premium_economy",
|
trip_type="roundtrip", kabine="premium_economy",
|
||||||
gepaeck="1koffer+handgepaeck", airline_filter="",
|
gepaeck="1koffer+handgepaeck", airline_filter="",
|
||||||
layover_min=120, layover_max=300,
|
layover_min=120, layover_max=300,
|
||||||
max_flugzeit_h=22, max_stops=2):
|
max_flugzeit_h=22, max_stops=2):
|
||||||
"""Traveloka — größte Reiseplattform Südostasiens."""
|
"""Traveloka — größte Reiseplattform Südostasiens. Preise in USD, werden in EUR umgerechnet."""
|
||||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%d-%m-%Y")
|
abflug = (datetime.now() + timedelta(days=tage)).strftime("%d-%m-%Y")
|
||||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%d-%m-%Y") \
|
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%d-%m-%Y") \
|
||||||
if trip_type == "roundtrip" else ""
|
if trip_type == "roundtrip" else ""
|
||||||
|
|
@ -856,7 +888,6 @@ def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
|
rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d") \
|
||||||
if trip_type == "roundtrip" else ""
|
if trip_type == "roundtrip" else ""
|
||||||
|
|
||||||
# Traveloka URL-Parameter
|
|
||||||
KABINE_TV = {"economy": "ECONOMY", "premium_economy": "PREMIUM_ECONOMY",
|
KABINE_TV = {"economy": "ECONOMY", "premium_economy": "PREMIUM_ECONOMY",
|
||||||
"business": "BUSINESS", "first": "FIRST_CLASS"}
|
"business": "BUSINESS", "first": "FIRST_CLASS"}
|
||||||
kc = KABINE_TV.get(kabine, "PREMIUM_ECONOMY")
|
kc = KABINE_TV.get(kabine, "PREMIUM_ECONOMY")
|
||||||
|
|
@ -882,40 +913,22 @@ def scrape_traveloka(von, nach, tage=30, aufenthalt_tage=60,
|
||||||
body = sb.get_text("body")
|
body = sb.get_text("body")
|
||||||
print(f"[TV] Title: {title[:80]} | Body: {len(body)} chars")
|
print(f"[TV] Title: {title[:80]} | Body: {len(body)} chars")
|
||||||
|
|
||||||
# Cookie/Consent Banner klicken
|
# Preise aus dem Body-Text extrahieren (USD → EUR)
|
||||||
for sel in ['button[data-testid*="accept"]', 'button[id*="accept"]',
|
seen = set()
|
||||||
'#onetrust-accept-btn-handler', 'button[class*="accept"]']:
|
for m in re.finditer(r'USD\s?([\d,]+\.?\d*)', body):
|
||||||
try:
|
try:
|
||||||
sb.find_element(sel, timeout=2).click()
|
usd = float(m.group(1).replace(',', ''))
|
||||||
sb.sleep(2)
|
eur = round(usd * 0.92)
|
||||||
break
|
if 400 < eur < 12000 and eur not in seen:
|
||||||
except Exception:
|
seen.add(eur)
|
||||||
|
results.append({"scanner": "traveloka", "preis": eur,
|
||||||
|
"waehrung": "EUR", "airline": "",
|
||||||
|
"abflug": abflug_iso, "ankunft": rueck_iso,
|
||||||
|
"booking_url": booking_url})
|
||||||
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
for sel in ['[class*="price"]', '[data-testid*="price"]',
|
results.sort(key=lambda x: x["preis"])
|
||||||
'.flight-price', 'span[class*="Price"]',
|
|
||||||
'div[class*="farePrice"]', '[class*="totalPrice"]']:
|
|
||||||
try:
|
|
||||||
elems = sb.find_elements(sel)
|
|
||||||
if elems:
|
|
||||||
for e in elems[:15]:
|
|
||||||
p = _parse_preis(e.text)
|
|
||||||
if p:
|
|
||||||
results.append({"scanner": "traveloka", "preis": p,
|
|
||||||
"waehrung": "EUR", "airline": "",
|
|
||||||
"abflug": abflug_iso, "ankunft": rueck_iso,
|
|
||||||
"booking_url": booking_url})
|
|
||||||
if results:
|
|
||||||
break
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not results:
|
|
||||||
for r in _preise_aus_body(body, "traveloka", abflug_iso):
|
|
||||||
r["ankunft"] = rueck_iso
|
|
||||||
r["booking_url"] = booking_url
|
|
||||||
results.append(r)
|
|
||||||
|
|
||||||
print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
print(f"[TV] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||||
screenshot_b64 = _take_screenshot(sb)
|
screenshot_b64 = _take_screenshot(sb)
|
||||||
return results[:10], screenshot_b64
|
return results[:10], screenshot_b64
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue