feat: Roundtrip FRA→KTI (Phnom Penh Techo Airport), Premium Economy, HAN als Umstieg
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
190281e2bb
commit
0a9e65b71a
4 changed files with 129 additions and 121 deletions
|
|
@ -33,6 +33,9 @@ def init_db():
|
|||
von TEXT NOT NULL,
|
||||
nach TEXT NOT NULL,
|
||||
tage INTEGER DEFAULT 30,
|
||||
aufenthalt_tage INTEGER DEFAULT 14,
|
||||
trip_type TEXT DEFAULT 'roundtrip',
|
||||
kabine TEXT DEFAULT 'premium_economy',
|
||||
intervall TEXT DEFAULT 'daily',
|
||||
aktiv INTEGER DEFAULT 1,
|
||||
created_at TEXT DEFAULT (datetime('now'))
|
||||
|
|
@ -119,10 +122,13 @@ TREND: [STEIGEND / FALLEND / STABIL]'
|
|||
('flugscanner-mu', '100.75.182.15', 'Muldenstein DE')
|
||||
""")
|
||||
|
||||
# Standard-Job: Frankfurt → Hanoi, Premium Economy
|
||||
# Standard-Jobs: Frankfurt → Phnom Penh (KTI), Premium Economy, Roundtrip
|
||||
# HAN ist Umstieg — wird automatisch als günstigste Verbindung gefunden
|
||||
c.execute("""
|
||||
INSERT OR IGNORE INTO jobs (scanner, von, nach, tage, intervall) VALUES
|
||||
('google_flights', 'FRA', 'HAN', 30, 'daily')
|
||||
INSERT OR IGNORE INTO jobs (scanner, von, nach, tage, aufenthalt_tage, trip_type, kabine, intervall) VALUES
|
||||
('google_flights', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily'),
|
||||
('kayak', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily'),
|
||||
('trip', 'FRA', 'KTI', 30, 14, 'roundtrip', 'premium_economy', 'daily')
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
|
|
|
|||
|
|
@ -52,7 +52,10 @@ def dispatch_job(node, job):
|
|||
"scanner": job["scanner"],
|
||||
"von": job["von"],
|
||||
"nach": job["nach"],
|
||||
"tage": job["tage"]
|
||||
"tage": job["tage"],
|
||||
"aufenthalt_tage": job.get("aufenthalt_tage", 14),
|
||||
"trip_type": job.get("trip_type", "roundtrip"),
|
||||
"kabine": job.get("kabine", "premium_economy"),
|
||||
}
|
||||
log(f"Job an {node['name']} ({node['tailscale_ip']}): {payload}")
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -14,15 +14,18 @@ def status():
|
|||
@app.route("/job", methods=["POST"])
|
||||
def job():
|
||||
data = request.json
|
||||
scanner = data.get("scanner", "google_flights")
|
||||
von = data.get("von", "FRA")
|
||||
nach = data.get("nach", "PNH")
|
||||
tage = data.get("tage", 30)
|
||||
scanner = data.get("scanner", "google_flights")
|
||||
von = data.get("von", "FRA")
|
||||
nach = data.get("nach", "KTI")
|
||||
tage = data.get("tage", 30)
|
||||
aufenthalt = data.get("aufenthalt_tage", 14)
|
||||
trip_type = data.get("trip_type", "roundtrip")
|
||||
kabine = data.get("kabine", "premium_economy")
|
||||
|
||||
print(f"[{NODE_NAME}] Job: {scanner} {von}→{nach} ({tage} Tage)")
|
||||
print(f"[{NODE_NAME}] Job: {scanner} {von}→{nach} ({trip_type}, {kabine}, +{tage}Tage/{aufenthalt}Tage)")
|
||||
|
||||
try:
|
||||
results = scrape(scanner, von, nach, tage)
|
||||
results = scrape(scanner, von, nach, tage, aufenthalt, trip_type, kabine)
|
||||
print(f"[{NODE_NAME}] {len(results)} Preise gefunden")
|
||||
return jsonify({
|
||||
"results": results,
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@ from datetime import datetime, timedelta
|
|||
import re
|
||||
|
||||
|
||||
def scrape(scanner, von, nach, tage=30):
|
||||
def scrape(scanner, von, nach, tage=30, aufenthalt_tage=14,
|
||||
trip_type="roundtrip", kabine="premium_economy"):
|
||||
dispatcher = {
|
||||
"google_flights": scrape_google_flights,
|
||||
"kayak": scrape_kayak,
|
||||
|
|
@ -13,19 +14,26 @@ def scrape(scanner, von, nach, tage=30):
|
|||
fn = dispatcher.get(scanner)
|
||||
if not fn:
|
||||
raise ValueError(f"Unbekannter Scanner: {scanner}")
|
||||
return fn(von, nach, tage)
|
||||
return fn(von, nach, tage, aufenthalt_tage, trip_type, kabine)
|
||||
|
||||
|
||||
# ── Kabinen-Codes ──────────────────────────────────────────────────────────────
|
||||
KABINE_GOOGLE = {"economy": "e", "premium_economy": "w", "business": "b", "first": "f"}
|
||||
KABINE_KAYAK = {"economy": "e", "premium_economy": "w", "business": "b", "first": "f"}
|
||||
KABINE_TRIP = {"economy": "Y", "premium_economy": "W", "business": "C", "first": "F"}
|
||||
|
||||
|
||||
def _parse_preis(text):
|
||||
if not text:
|
||||
return None
|
||||
text = text.replace('\xa0', ' ').replace('\u202f', ' ').replace(',', '.')
|
||||
for p in [r'(\d{3,4})\s?€', r'€\s?(\d{3,4})', r'EUR\s?(\d{3,4})', r'(\d{3,4})\s?EUR']:
|
||||
text = text.replace('\xa0', ' ').replace('\u202f', ' ')
|
||||
for p in [r'(\d{1,2}[.,]\d{3})\s?€', r'(\d{3,5})\s?€',
|
||||
r'€\s?(\d{3,5})', r'EUR\s?(\d{3,5})', r'(\d{3,5})\s?EUR']:
|
||||
m = re.search(p, text)
|
||||
if m:
|
||||
try:
|
||||
v = float(m.group(1).replace('.', ''))
|
||||
if 200 < v < 8000:
|
||||
v = float(m.group(1).replace('.', '').replace(',', ''))
|
||||
if 200 < v < 15000:
|
||||
return round(v, 2)
|
||||
except ValueError:
|
||||
pass
|
||||
|
|
@ -35,12 +43,11 @@ def _parse_preis(text):
|
|||
def _preise_aus_body(body, scanner, abflug):
|
||||
results = []
|
||||
seen = set()
|
||||
# Alle Zahlen 300-6000 die neben einem € stehen
|
||||
for m in re.finditer(r'(\d[\d\s\.]{1,5})\s?€|€\s?(\d[\d\s\.]{1,5})', body):
|
||||
raw = (m.group(1) or m.group(2)).replace(' ', '').replace('.', '')
|
||||
try:
|
||||
v = float(raw)
|
||||
if 300 < v < 6000 and v not in seen:
|
||||
if 300 < v < 12000 and v not in seen:
|
||||
seen.add(v)
|
||||
results.append({
|
||||
"scanner": scanner, "preis": v, "waehrung": "EUR",
|
||||
|
|
@ -52,98 +59,85 @@ def _preise_aus_body(body, scanner, abflug):
|
|||
return results[:10]
|
||||
|
||||
|
||||
def scrape_google_flights(von, nach, tage=30):
|
||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||
# sc:w = Premium Economy
|
||||
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
||||
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:w")
|
||||
def _consent_google(sb):
|
||||
"""Google Consent-Seite (DSGVO) behandeln."""
|
||||
if "consent" in sb.get_current_url() or "Bevor Sie" in sb.get_title():
|
||||
print("[CONSENT] Google Consent erkannt")
|
||||
for sel in ['form[action*="save"] button', 'button[jsname="tHlp8d"]',
|
||||
'.lssxud button', 'button[aria-label*="kzeptieren"]']:
|
||||
try:
|
||||
sb.click(sel, timeout=3)
|
||||
sb.sleep(4)
|
||||
print(f"[CONSENT] Geklickt: {sel}")
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
log = []
|
||||
|
||||
def scrape_google_flights(von, nach, tage=30, aufenthalt_tage=14,
|
||||
trip_type="roundtrip", kabine="premium_economy"):
|
||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
|
||||
kc = KABINE_GOOGLE.get(kabine, "w")
|
||||
|
||||
if trip_type == "roundtrip":
|
||||
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
||||
f"#flt={von}.{nach}.{abflug}*{nach}.{von}.{rueck};c:EUR;e:1;sd:1;t:r;sc:{kc}")
|
||||
else:
|
||||
url = (f"https://www.google.com/travel/flights/search?hl=de&curr=EUR"
|
||||
f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f;sc:{kc}")
|
||||
|
||||
print(f"[GF] URL: {url[:100]}")
|
||||
results = []
|
||||
|
||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||
sb.open(url)
|
||||
sb.sleep(7)
|
||||
|
||||
current_url = sb.get_current_url()
|
||||
title = sb.get_title()
|
||||
log.append(f"[GF] title={title[:60]} url={current_url[:80]}")
|
||||
|
||||
# Consent-Seite behandeln
|
||||
if "consent" in current_url or "Bevor Sie" in title:
|
||||
log.append("[GF] Consent-Seite erkannt")
|
||||
for sel in ['form[action*="save"] button', 'button[jsname="tHlp8d"]',
|
||||
'.lssxud button', 'button[aria-label*="akzeptieren"]']:
|
||||
try:
|
||||
sb.click(sel, timeout=3)
|
||||
sb.sleep(4)
|
||||
log.append(f"[GF] Consent geklickt: {sel}")
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
# Zurück zur Such-URL
|
||||
if _consent_google(sb):
|
||||
sb.open(url)
|
||||
sb.sleep(8)
|
||||
log.append(f"[GF] Nach Consent-Redirect: {sb.get_title()[:60]}")
|
||||
|
||||
# Checken ob wir auf der Suchergebnisseite sind
|
||||
title2 = sb.get_title()
|
||||
if "Günstige Flüge" in title2 or "Google Flüge" in title2:
|
||||
log.append("[GF] WARNUNG: Auf Homepage gelandet, versuche Formular")
|
||||
# Formular direkt ausfüllen
|
||||
try:
|
||||
# Von-Feld leeren und FRA eingeben
|
||||
sb.click('input[placeholder*="Von"]', timeout=4)
|
||||
sb.triple_click('input[placeholder*="Von"]')
|
||||
sb.type('input[placeholder*="Von"]', von)
|
||||
sb.sleep(2)
|
||||
# Ersten Autocomplete-Eintrag nehmen
|
||||
sb.click('li[data-code="' + von + '"]', timeout=3)
|
||||
sb.sleep(1)
|
||||
# Nach-Feld
|
||||
sb.click('input[placeholder*="Wohin"]', timeout=3)
|
||||
sb.type('input[placeholder*="Wohin"]', nach)
|
||||
sb.sleep(2)
|
||||
sb.click('li[data-code="' + nach + '"]', timeout=3)
|
||||
sb.sleep(1)
|
||||
log.append("[GF] Formular ausgefüllt, suche...")
|
||||
sb.sleep(5)
|
||||
except Exception as e:
|
||||
log.append(f"[GF] Formular-Fehler: {e}")
|
||||
title = sb.get_title()
|
||||
print(f"[GF] Title: {title[:80]}")
|
||||
|
||||
# Preise extrahieren
|
||||
body = sb.get_text("body")
|
||||
log.append(f"[GF] Body-Länge: {len(body)} Zeichen")
|
||||
print(f"[GF] Body-Länge: {len(body)}")
|
||||
|
||||
# Aria-Labels mit €
|
||||
try:
|
||||
elems = sb.find_elements('[aria-label*="€"]', timeout=3)
|
||||
log.append(f"[GF] aria-label €-Elemente: {len(elems)}")
|
||||
for elem in elems[:15]:
|
||||
for elem in elems[:20]:
|
||||
label = elem.get_attribute('aria-label') or elem.text
|
||||
p = _parse_preis(label)
|
||||
if p:
|
||||
results.append({"scanner": "google_flights", "preis": p,
|
||||
"waehrung": "EUR", "airline": "", "abflug": abflug, "ankunft": ""})
|
||||
"waehrung": "EUR", "airline": "",
|
||||
"abflug": abflug, "ankunft": rueck if trip_type == "roundtrip" else ""})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback: Regex über Body
|
||||
# Fallback Regex
|
||||
if not results:
|
||||
results = _preise_aus_body(body, "google_flights", abflug)
|
||||
log.append(f"[GF] Regex-Fallback: {len(results)} Preise")
|
||||
|
||||
log.append(f"[GF] Ergebnis: {[r['preis'] for r in results]}")
|
||||
|
||||
print('\n'.join(log))
|
||||
print(f"[GF] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||
return results[:10]
|
||||
|
||||
|
||||
def scrape_kayak(von, nach, tage=30):
|
||||
def scrape_kayak(von, nach, tage=30, aufenthalt_tage=14,
|
||||
trip_type="roundtrip", kabine="premium_economy"):
|
||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&cabin=w&currency=EUR"
|
||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
|
||||
kc = KABINE_KAYAK.get(kabine, "w")
|
||||
|
||||
log = []
|
||||
if trip_type == "roundtrip":
|
||||
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}/{rueck}?sort=price_a&cabin={kc}&currency=EUR"
|
||||
else:
|
||||
url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&cabin={kc}&currency=EUR"
|
||||
|
||||
print(f"[KY] URL: {url}")
|
||||
results = []
|
||||
|
||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||
|
|
@ -152,54 +146,57 @@ def scrape_kayak(von, nach, tage=30):
|
|||
|
||||
title = sb.get_title()
|
||||
body = sb.get_text("body")
|
||||
log.append(f"[KY] title={title[:60]}")
|
||||
log.append(f"[KY] body-länge={len(body)}")
|
||||
log.append(f"[KY] body-500={body[:500]}")
|
||||
print(f"[KY] Title: {title[:80]}")
|
||||
print(f"[KY] Body-500: {body[:300]}")
|
||||
|
||||
# CSS-Selektoren Kayak
|
||||
for sel in ['.price-text', '.f8F1-price-text', 'div[class*="price"] span',
|
||||
'span[class*="price"]', '.Iqt3', 'div.nrc6-price']:
|
||||
'span[class*="price"]', '.Iqt3', 'div.nrc6-price', '.price']:
|
||||
try:
|
||||
elems = sb.find_elements(sel, timeout=2)
|
||||
if elems:
|
||||
log.append(f"[KY] Selector '{sel}': {len(elems)} Elemente")
|
||||
for e in elems[:10]:
|
||||
for e in elems[:15]:
|
||||
p = _parse_preis(e.text)
|
||||
if p:
|
||||
results.append({"scanner": "kayak", "preis": p,
|
||||
"waehrung": "EUR", "airline": "",
|
||||
"abflug": abflug, "ankunft": ""})
|
||||
"abflug": abflug,
|
||||
"ankunft": rueck if trip_type == "roundtrip" else ""})
|
||||
if results:
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback Regex
|
||||
if not results:
|
||||
results = _preise_aus_body(body, "kayak", abflug)
|
||||
|
||||
log.append(f"[KY] Ergebnis: {[r['preis'] for r in results]}")
|
||||
|
||||
print('\n'.join(log))
|
||||
print(f"[KY] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||
return results[:10]
|
||||
|
||||
|
||||
def scrape_skyscanner(von, nach, tage=30):
|
||||
"""Skyscanner hat starken Bot-Schutz — für jetzt übersprungen."""
|
||||
print("[SS] Skyscanner übersprungen (Bot-Detection)")
|
||||
return []
|
||||
|
||||
|
||||
def scrape_trip(von, nach, tage=30):
|
||||
"""trip.com — gut von Asia-IPs, oft günstigere Asien-Preise."""
|
||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
|
||||
def scrape_trip(von, nach, tage=30, aufenthalt_tage=14,
|
||||
trip_type="roundtrip", kabine="premium_economy"):
|
||||
abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y%m%d")
|
||||
rueck = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y%m%d")
|
||||
abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
|
||||
# W = Premium Economy bei trip.com
|
||||
url = (f"https://www.trip.com/flights/{von.lower()}-to-{nach.lower()}/"
|
||||
f"tickets-{von.lower()}-{nach.lower()}/"
|
||||
f"?DDate1={abflug}&class=W&curr=EUR")
|
||||
rueck_iso = (datetime.now() + timedelta(days=tage + aufenthalt_tage)).strftime("%Y-%m-%d")
|
||||
kc = KABINE_TRIP.get(kabine, "W")
|
||||
|
||||
log = []
|
||||
# trip.com Stadtname-Mapping für URL
|
||||
stadtname = {"FRA": "frankfurt", "HAN": "hanoi", "KTI": "phnom-penh",
|
||||
"PNH": "phnom-penh", "BKK": "bangkok", "SGN": "ho-chi-minh-city"}
|
||||
von_name = stadtname.get(von, von.lower())
|
||||
nach_name = stadtname.get(nach, nach.lower())
|
||||
|
||||
if trip_type == "roundtrip":
|
||||
url = (f"https://www.trip.com/flights/{von_name}-to-{nach_name}/"
|
||||
f"tickets-{von.lower()}-{nach.lower()}/"
|
||||
f"?DDate1={abflug}&DDate2={rueck}&class={kc}&curr=EUR")
|
||||
else:
|
||||
url = (f"https://www.trip.com/flights/{von_name}-to-{nach_name}/"
|
||||
f"tickets-{von.lower()}-{nach.lower()}/"
|
||||
f"?DDate1={abflug}&class={kc}&curr=EUR")
|
||||
|
||||
print(f"[TR] URL: {url}")
|
||||
results = []
|
||||
|
||||
with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
|
||||
|
|
@ -208,12 +205,11 @@ def scrape_trip(von, nach, tage=30):
|
|||
|
||||
title = sb.get_title()
|
||||
body = sb.get_text("body")
|
||||
log.append(f"[TR] title={title[:60]}")
|
||||
log.append(f"[TR] body-länge={len(body)}")
|
||||
print(f"[TR] Title: {title[:80]}")
|
||||
|
||||
# Cookie-Banner
|
||||
for sel in ['button[id*="accept"]', 'button[class*="accept"]',
|
||||
'button[aria-label*="Accept"]', '.cookie-accept']:
|
||||
'button[aria-label*="Accept"]', '.cookie-accept', '#onetrust-accept-btn-handler']:
|
||||
try:
|
||||
sb.click(sel, timeout=2)
|
||||
sb.sleep(2)
|
||||
|
|
@ -221,32 +217,32 @@ def scrape_trip(von, nach, tage=30):
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# CSS-Selektoren trip.com
|
||||
for sel in ['.price-box .price', '.flight-price', 'span[class*="price"]',
|
||||
'div[class*="price-num"]', '.price-content span', 'em[class*="price"]']:
|
||||
'div[class*="price-num"]', 'em[class*="price"]', '.c-price']:
|
||||
try:
|
||||
elems = sb.find_elements(sel, timeout=2)
|
||||
if elems:
|
||||
log.append(f"[TR] Selector '{sel}': {len(elems)} Elemente")
|
||||
for e in elems[:10]:
|
||||
p = _parse_preis(e.text)
|
||||
if p:
|
||||
results.append({
|
||||
"scanner": "trip", "preis": p,
|
||||
"waehrung": "EUR", "airline": "",
|
||||
"abflug": abflug_iso, "ankunft": ""
|
||||
})
|
||||
results.append({"scanner": "trip", "preis": p,
|
||||
"waehrung": "EUR", "airline": "",
|
||||
"abflug": abflug_iso,
|
||||
"ankunft": rueck_iso if trip_type == "roundtrip" else ""})
|
||||
if results:
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback Regex
|
||||
if not results:
|
||||
results = _preise_aus_body(body, "trip", abflug_iso)
|
||||
log.append(f"[TR] Regex-Fallback: {len(results)} Preise")
|
||||
|
||||
log.append(f"[TR] Ergebnis: {[r['preis'] for r in results]}")
|
||||
|
||||
print('\n'.join(log))
|
||||
print(f"[TR] Ergebnis: {[r['preis'] for r in results[:5]]}")
|
||||
return results[:10]
|
||||
|
||||
|
||||
def scrape_skyscanner(von, nach, tage=30, aufenthalt_tage=14,
|
||||
trip_type="roundtrip", kabine="premium_economy"):
|
||||
"""Skyscanner hat starken Bot-Schutz — übersprungen."""
|
||||
print("[SS] Skyscanner übersprungen (Bot-Detection)")
|
||||
return []
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue