fix: robusteres Scraping mit mehreren Selektoren + Regex-Fallback
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f0a295a408
commit
7adc9529b7
1 changed files with 194 additions and 94 deletions
|
|
@ -15,70 +15,105 @@ def scrape(scanner, von, nach, tage=30):
|
|||
return fn(von, nach, tage)
|
||||
|
||||
|
||||
def parse_preis(text):
    """Extract a plausible price (float) from a price string.

    Handles both German grouping ("1.234" -> 1234, "578,50" -> 578.5)
    and plain formats ("578.50"). Returns the first candidate in the
    plausible range (50, 10000), rounded to 2 decimals, or None.
    """
    if not text:
        return None
    for raw in re.findall(r'[\d.,]+', text):
        token = raw.strip('.,')
        if not token:
            continue
        # The LAST '.' or ',' is the decimal separator only when it is
        # followed by 1-2 digits; otherwise all separators are grouping.
        sep = max(token.rfind('.'), token.rfind(','))
        frac_len = len(token) - sep - 1
        try:
            if sep != -1 and frac_len in (1, 2):
                int_part = token[:sep].replace('.', '').replace(',', '')
                v = float(int_part + '.' + token[sep + 1:])
            else:
                v = float(token.replace('.', '').replace(',', ''))
        except ValueError:
            continue
        if 50 < v < 10000:
            return round(v, 2)
    return None
|
||||
|
||||
|
||||
def parse_preise_aus_text(text, scanner, abflug):
    """Extract EUR prices from raw page text via regex fallback.

    Returns up to 10 cheapest unique prices as result dicts tagged with
    *scanner* and *abflug* (departure date string).
    """
    results = []
    # Patterns: 1.234 € / €578 / 578 € / EUR 578 / 578 EUR
    patterns = [
        r'(\d{1,2}[.,]\d{3})\s*[€]',   # 1.234 €
        r'[€]\s*(\d{3,4})',            # €578
        r'(\d{3,4})\s*[€]',            # 578 €
        r'EUR\s*(\d{3,4})',            # EUR 578
        r'(\d{3,4})\s*EUR',            # 578 EUR
    ]
    seen = set()
    for pattern in patterns:
        for m in re.findall(pattern, text):
            clean = m.replace('.', '').replace(',', '')
            try:
                preis = float(clean)
            except ValueError:
                continue
            # Plausibility window plus dedup across patterns.
            if 100 < preis < 5000 and preis not in seen:
                seen.add(preis)
                results.append({
                    "scanner": scanner,
                    "preis": preis,
                    "waehrung": "EUR",
                    "airline": "",
                    "abflug": abflug,
                    "ankunft": ""
                })
    # At most the 10 cheapest.
    results.sort(key=lambda x: x["preis"])
    return results[:10]
|
||||
|
||||
# Fallback: Alle Preis-Texte auf der Seite
|
||||
|
||||
def scrape_google_flights(von, nach, tage=30):
    """Scrape Google Flights prices for a one-way trip *tage* days ahead.

    Tries specific price-element selectors first, then aria-labels
    containing '€', then a regex sweep over the page text. Returns up
    to 10 cheapest unique result dicts.
    """
    abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    # Direct Google Flights search URL (German locale, EUR).
    url = (
        f"https://www.google.com/travel/flights/search"
        f"?hl=de&curr=EUR"
        f"#flt={von}.{nach}.{abflug};c:EUR;e:1;sd:1;t:f"
    )

    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(url)
        sb.sleep(6)

        # Dismiss cookie banner (DE/EN label variants).
        for selector in ['button[aria-label*="Alle"]', 'button[aria-label*="Accept"]',
                         'button[aria-label*="Zustimmen"]', 'button.VfPpkd-LgbsSe']:
            try:
                sb.click(selector, timeout=2)
                sb.sleep(1)
                break
            except Exception:
                continue

        # Give prices a little longer to load.
        sb.sleep(4)

        results = []

        # Attempt 1: specific price elements.
        selectors = [
            'span[data-gs*="price"]',
            '[data-price]',
            'div.YMlIz',  # frequently-used Google Flights price container
            'div.U3gSDe',
            'span.nE0Jnd',
            'div[jsname="MkNb9"] span',
            'li[data-gs] span',
        ]
        for sel in selectors:
            try:
                elems = sb.find_elements(sel, timeout=2)
                for elem in elems[:15]:
                    preis = _parse_single(elem.text)
                    if preis:
                        results.append({
                            "scanner": "google_flights",
                            "preis": preis,
                            "waehrung": "EUR",
                            "airline": "",
                            "abflug": abflug,
                            "ankunft": ""
                        })
                if results:
                    break
            except Exception:
                continue

        # Attempt 2: aria-labels carrying price text.
        if not results:
            try:
                elems = sb.find_elements('[aria-label*="€"]', timeout=3)
                for elem in elems[:20]:
                    label = elem.get_attribute('aria-label') or elem.text
                    preis = _parse_single(label)
                    if preis:
                        results.append({
                            "scanner": "google_flights",
                            "preis": preis,
                            "waehrung": "EUR",
                            "airline": "",
                            "abflug": abflug,
                            "ankunft": ""
                        })
            except Exception:
                pass

        # Attempt 3: regex over the whole page text.
        if not results:
            try:
                body = sb.get_text("body", timeout=5)
                results = parse_preise_aus_text(body, "google_flights", abflug)
            except Exception:
                pass

        # Deduplicate by price, keep the 10 cheapest.
        seen = set()
        unique = []
        for r in sorted(results, key=lambda x: x["preis"]):
            if r["preis"] not in seen:
                seen.add(r["preis"])
                unique.append(r)

        return unique[:10]
|
||||
|
||||
|
||||
def _parse_single(text):
|
||||
if not text:
|
||||
return None
|
||||
text = text.replace('\xa0', ' ').replace('\u202f', ' ')
|
||||
patterns = [
|
||||
r'(\d{1,2}[.,]\d{3})\s*[€]',
|
||||
r'[€]\s*(\d{3,4})',
|
||||
r'(\d{3,4})\s*[€]',
|
||||
r'EUR\s*(\d{3,4})',
|
||||
r'(\d{3,4})\s*EUR',
|
||||
]
|
||||
for p in patterns:
|
||||
m = re.search(p, text)
|
||||
if m:
|
||||
clean = m.group(1).replace('.', '').replace(',', '')
|
||||
try:
|
||||
v = float(clean)
|
||||
if 100 < v < 5000:
|
||||
return round(v, 2)
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def scrape_kayak(von, nach, tage=30):
    """Scrape Kayak.de one-way prices *tage* days ahead.

    Tries several price selectors, then falls back to a regex sweep of
    the page text. Returns up to 10 result dicts.
    """
    abflug = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    # NOTE: '&currency' had been mangled to the '¤' mojibake
    # ("¤cy=EUR") by HTML-entity decoding — restored here.
    url = f"https://www.kayak.de/flights/{von}-{nach}/{abflug}?sort=price_a&currency=EUR"
    results = []

    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(url)
        sb.sleep(7)

        # Kayak price selectors (class names rotate frequently).
        for sel in ['.price-text', '.f8F1-price-text', 'span[class*="price"]',
                    'div[class*="price"] span', '.Iqt3']:
            try:
                elems = sb.find_elements(sel, timeout=3)
                for elem in elems[:10]:
                    preis = _parse_single(elem.text)
                    if preis:
                        results.append({
                            "scanner": "kayak",
                            "preis": preis,
                            "waehrung": "EUR",
                            "airline": "",
                            "abflug": abflug,
                            "ankunft": ""
                        })
                if results:
                    break
            except Exception:
                continue

        # Fallback: regex over the page text.
        if not results:
            try:
                body = sb.get_text("body", timeout=5)
                results = parse_preise_aus_text(body, "kayak", abflug)
            except Exception:
                pass

        return results[:10]
|
||||
|
||||
|
||||
def scrape_skyscanner(von, nach, tage=30):
    """Scrape Skyscanner.de one-way prices *tage* days ahead.

    The URL needs the compact YYMMDD date; result dicts carry the ISO
    date. Falls back to a regex sweep of the page text. Returns up to
    10 result dicts.
    """
    abflug_fmt = (datetime.now() + timedelta(days=tage)).strftime("%y%m%d")
    abflug_iso = (datetime.now() + timedelta(days=tage)).strftime("%Y-%m-%d")
    url = f"https://www.skyscanner.de/transport/flights/{von.lower()}/{nach.lower()}/{abflug_fmt}/?currency=EUR"
    results = []

    with SB(uc=True, headless=True, chromium_arg="--no-sandbox --disable-dev-shm-usage") as sb:
        sb.open(url)
        sb.sleep(7)

        for sel in ['[data-testid="price-label"]', 'span[class*="Price"]',
                    'div[class*="price"]', 'span[class*="price"]']:
            try:
                elems = sb.find_elements(sel, timeout=3)
                for elem in elems[:10]:
                    preis = _parse_single(elem.text)
                    if preis:
                        results.append({
                            "scanner": "skyscanner",
                            "preis": preis,
                            "waehrung": "EUR",
                            "airline": "",
                            "abflug": abflug_iso,
                            "ankunft": ""
                        })
                if results:
                    break
            except Exception:
                continue

        # Fallback: regex over the page text.
        if not results:
            try:
                body = sb.get_text("body", timeout=5)
                results = parse_preise_aus_text(body, "skyscanner", abflug_iso)
            except Exception:
                pass

        return results[:10]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue