This commit is contained in:
Guillem Hernandez Sola
2026-05-19 10:34:19 +02:00
parent 63265c9322
commit dea94476a9

View File

@@ -675,38 +675,26 @@ def _click_retry_button(page):
# --- TikTok Scraping --- # --- TikTok Scraping ---
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
"""
Scrape recent TikTok videos from a public profile using Playwright.
No login required for public profiles.
Strategy:
1. Navigate to profile and wait for page to settle.
2. Dismiss top RGPD banner + cookie modal.
3. Reload page so TikTok renders the grid cleanly.
4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
a. Dismiss any banners that reappeared.
b. Wait for grid selector (15 s soft-fail).
c. Click "Actualizar" retry button if grid shows error.
d. Wait for grid to repopulate.
e. Check for video links — break immediately if found.
f. If not found and attempts remain → full page reload.
5. Scroll to load more videos.
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
"""
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
# playwright-stealth — optional but strongly recommended # playwright-stealth 2.x uses Stealth class; 1.x uses stealth_sync
try: try:
from playwright_stealth import stealth_sync from playwright_stealth import Stealth
USE_STEALTH = True USE_STEALTH = "v2"
logging.info("🥷 playwright-stealth available — stealth mode ON") _stealth = Stealth()
logging.info("🥷 playwright-stealth 2.x available — stealth mode ON")
except ImportError: except ImportError:
USE_STEALTH = False try:
logging.warning( from playwright_stealth import stealth_sync
"⚠️ playwright-stealth not installed — running without stealth. " USE_STEALTH = "v1"
"Run: pip install playwright-stealth" logging.info("🥷 playwright-stealth 1.x available — stealth mode ON (legacy)")
) except ImportError:
USE_STEALTH = False
logging.warning(
"⚠️ playwright-stealth not installed — running without stealth. "
"Run: pip install playwright-stealth"
)
with sync_playwright() as p: with sync_playwright() as p:
browser = p.chromium.launch( browser = p.chromium.launch(
@@ -750,9 +738,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
page = context.new_page() page = context.new_page()
if USE_STEALTH: # Apply stealth patches — version-aware
if USE_STEALTH == "v2":
_stealth.apply_stealth_sync(page)
logging.info("🥷 Stealth patches applied (2.x).")
elif USE_STEALTH == "v1":
stealth_sync(page) stealth_sync(page)
logging.info("🥷 Stealth patches applied.") logging.info("🥷 Stealth patches applied (1.x legacy).")
page.add_init_script(""" page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
@@ -784,16 +776,11 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
_dismiss_banners(page) _dismiss_banners(page)
# ── 3. Initial reload for clean grid render ────────────────── # ── 3. Initial reload for clean grid render ──────────────────
# TikTok serves "Hubo un problema" when the page first loaded
# while banners were present. A reload after dismissal gives
# TikTok a clean cookie state so the grid renders correctly.
logging.info("🔄 Reloading page after banner dismissal for clean grid render...") logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
page.reload(wait_until="domcontentloaded", timeout=40000) page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 4. Multi-attempt loop ──────────────────────────────────── # ── 4. Multi-attempt loop ────────────────────────────────────
# Each attempt: dismiss banners → wait for grid → click retry
# → check for links. Reload between attempts if still empty.
video_links = [] video_links = []
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1): for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
@@ -818,18 +805,15 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
# 4c. Click "Actualizar" if grid shows error state # 4c. Click "Actualizar" if grid shows error state
if _click_retry_button(page): if _click_retry_button(page):
# 4d. Wait for grid to repopulate after retry click
logging.info("⏳ Waiting for grid to reload after retry click...") logging.info("⏳ Waiting for grid to reload after retry click...")
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=15000) page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("✅ Grid reloaded after retry.") logging.info("✅ Grid reloaded after retry.")
except Exception: except Exception:
logging.warning( logging.warning("⚠️ Grid still not visible after retry click.")
"⚠️ Grid still not visible after retry click."
)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# 4e. Check if video links appeared # 4d. Check if video links appeared
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Attempt {attempt}: found {len(video_links)} video links." f"📊 Attempt {attempt}: found {len(video_links)} video links."
@@ -841,7 +825,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
) )
break break
# 4f. No links yet — full reload before next attempt # 4e. No links yet — full reload before next attempt
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS: if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
logging.info( logging.info(
f"🔄 No videos found — reloading page " f"🔄 No videos found — reloading page "
@@ -857,7 +841,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
time.sleep(TIKTOK_SCROLL_PAUSE_S) time.sleep(TIKTOK_SCROLL_PAUSE_S)
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
# Re-collect after scrolling to include lazy-loaded cards
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Found {len(video_links)} video links after scroll. " f"📊 Found {len(video_links)} video links after scroll. "
@@ -869,12 +852,12 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
take_error_screenshot(page, "tiktok_no_video_links") take_error_screenshot(page, "tiktok_no_video_links")
logging.error( logging.error(
"❌ No video links found after all attempts. " "❌ No video links found after all attempts. "
"Install playwright-stealth: pip install playwright-stealth" "Check screenshot for current TikTok block state."
) )
browser.close() browser.close()
return [] return []
# ── 7. Parse video links into ScrapedTikTok objects ────────── # ── 7. Parse video links ──────────────────────────────────────
seen_urls = set() seen_urls = set()
for link in video_links: for link in video_links:
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
@@ -896,7 +879,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
continue continue
seen_urls.add(canonical) seen_urls.add(canonical)
# Caption
caption = "" caption = ""
try: try:
card = link.locator("..").first card = link.locator("..").first
@@ -910,7 +892,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
except Exception: except Exception:
pass pass
# Thumbnail
thumbnail_url = None thumbnail_url = None
try: try:
img = link.locator("img").first img = link.locator("img").first
@@ -943,7 +924,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.") logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
return tiktoks return tiktoks
# --- Video URL extraction --- # --- Video URL extraction ---
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None): def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None):
""" """