TikTok example 5

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 10:12:27 +02:00
parent 1cf7a334c0
commit 185efaca5f

View File

@@ -67,6 +67,7 @@ TIKTOK_PAGE_LOAD_WAIT_S = 5.0
TIKTOK_SCROLL_PAUSE_S = 2.5 TIKTOK_SCROLL_PAUSE_S = 2.5
TIKTOK_MAX_SCROLLS = 8 TIKTOK_MAX_SCROLLS = 8
TIKTOK_BANNER_WAIT_S = 3.0 TIKTOK_BANNER_WAIT_S = 3.0
TIKTOK_MAX_LOAD_ATTEMPTS = 3 # grid-load attempts before giving up (page is reloaded between attempts)
DYNAMIC_ALT_MAX_LENGTH = 150 DYNAMIC_ALT_MAX_LENGTH = 150
TRUNCATE_MIN_PREFIX_CHARS = 20 TRUNCATE_MIN_PREFIX_CHARS = 20
@@ -617,7 +618,6 @@ def _dismiss_banners(page):
Dismiss all TikTok banners in the correct order: Dismiss all TikTok banners in the correct order:
1. Top RGPD/info banner ("Entendido") 1. Top RGPD/info banner ("Entendido")
2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.) 2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)
Returns True if at least one banner was dismissed. Returns True if at least one banner was dismissed.
""" """
any_dismissed = False any_dismissed = False
@@ -657,7 +657,8 @@ def _dismiss_banners(page):
def _click_retry_button(page): def _click_retry_button(page):
""" """
Click the "Actualizar" / "Refresh" button that TikTok shows inside Click the "Actualizar" / "Refresh" button that TikTok shows inside
the video grid when it renders an error state. Returns True if clicked. the video grid when it renders an error state.
Returns True if clicked.
""" """
for selector in RETRY_BUTTON_SELECTORS: for selector in RETRY_BUTTON_SELECTORS:
try: try:
@@ -680,13 +681,17 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
Strategy: Strategy:
1. Navigate to profile and wait for page to settle. 1. Navigate to profile and wait for page to settle.
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas"). 2. Dismiss top RGPD banner + cookie modal.
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema"). 3. Reload page so TikTok renders the grid cleanly.
4. Dismiss any banners that reappear after reload. 4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
5. Wait for video grid selector (30 s, soft-fail). a. Dismiss any banners that reappeared.
6. Click "Actualizar" retry button if TikTok shows grid error state. b. Wait for grid selector (15 s soft-fail).
7. Scroll to load more videos. c. Click "Actualizar" retry button if grid shows error.
8. Collect all a[href*="/video/"] links. d. Wait for grid to repopulate.
e. Check for video links — break immediately if found.
f. If not found and attempts remain → full page reload.
5. Scroll to load more videos.
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
""" """
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
@@ -775,65 +780,101 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 2. Dismiss banners (first pass) ────────────────────────── # ── 2. First banner dismissal ────────────────────────────────
_dismiss_banners(page) _dismiss_banners(page)
# ── 3. Reload for clean grid render ────────────────────────── # ── 3. Initial reload for clean grid render ──────────────────
# TikTok serves "Hubo un problema" when the page first loaded # TikTok serves "Hubo un problema" when the page first loaded
# while banners were blocking. A reload after dismissal gives # while banners were present. A reload after dismissal gives
# TikTok a clean cookie state so the grid renders correctly. # TikTok a clean cookie state so the grid renders correctly.
logging.info("🔄 Reloading page after banner dismissal for clean grid render...") logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
page.reload(wait_until="domcontentloaded", timeout=40000) page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 4. Dismiss banners (second pass, post-reload) ──────────── # ── 4. Multi-attempt loop ────────────────────────────────────
# Each attempt: dismiss banners → wait for grid → click retry
# → check for links. Reload between attempts if still empty.
video_links = []
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
logging.info(
f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..."
)
# 4a. Dismiss any banners that reappeared
_dismiss_banners(page) _dismiss_banners(page)
# ── 5. Wait for video grid ─────────────────────────────────── # 4b. Wait for grid selector (soft-fail)
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=30000) page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("TikTok video grid detected.") logging.info(f"Grid selector found on attempt {attempt}.")
except Exception: except Exception:
logging.warning( logging.warning(
"⚠️ Grid selector timed out after 30s — continuing anyway." f"⚠️ Grid selector timed out on attempt {attempt}."
)
take_error_screenshot(
page, f"tiktok_grid_timeout_attempt{attempt}"
) )
take_error_screenshot(page, "tiktok_grid_timeout")
# ── 6. Click "Actualizar" if grid shows error state ────────── # 4c. Click "Actualizar" if grid shows error state
# Even when the grid DOM node exists, TikTok may render an
# error card inside it. Clicking the retry button triggers a
# client-side reload of the video feed without a full page
# reload, which often resolves the empty grid.
if _click_retry_button(page): if _click_retry_button(page):
# 4d. Wait for grid to repopulate after retry click
logging.info("⏳ Waiting for grid to reload after retry click...") logging.info("⏳ Waiting for grid to reload after retry click...")
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=15000) page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("✅ Grid reloaded after retry.") logging.info("✅ Grid reloaded after retry.")
except Exception: except Exception:
logging.warning("⚠️ Grid still not visible after retry click.") logging.warning(
"⚠️ Grid still not visible after retry click."
)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 7. Scroll to load more videos ──────────────────────────── # 4e. Check if video links appeared
video_links = page.locator('a[href*="/video/"]').all()
logging.info(
f"📊 Attempt {attempt}: found {len(video_links)} video links."
)
if video_links:
logging.info(
f"✅ Got video links on attempt {attempt} — proceeding."
)
break
# 4f. No links yet — full reload before next attempt
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
logging.info(
f"🔄 No videos found — reloading page "
f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..."
)
page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 5. Scroll to load more videos ────────────────────────────
if video_links:
for scroll_i in range(TIKTOK_MAX_SCROLLS): for scroll_i in range(TIKTOK_MAX_SCROLLS):
page.evaluate("window.scrollBy(0, window.innerHeight * 2)") page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
time.sleep(TIKTOK_SCROLL_PAUSE_S) time.sleep(TIKTOK_SCROLL_PAUSE_S)
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
# ── 8. Collect video links ─────────────────────────────────── # Re-collect after scrolling to include lazy-loaded cards
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Found {len(video_links)} video links. " f"📊 Found {len(video_links)} video links after scroll. "
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..." f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
) )
# ── 6. Bail out if still nothing ─────────────────────────────
if not video_links: if not video_links:
take_error_screenshot(page, "tiktok_no_video_links") take_error_screenshot(page, "tiktok_no_video_links")
logging.error( logging.error(
"❌ No video links found after scroll. " "❌ No video links found after all attempts. "
"TikTok may still be blocking — check screenshot." "Install playwright-stealth: pip install playwright-stealth"
) )
browser.close() browser.close()
return [] return []
# ── 7. Parse video links into ScrapedTikTok objects ──────────
seen_urls = set() seen_urls = set()
for link in video_links: for link in video_links:
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: