From 1cf7a334c002da593cd6462b86201ffcfdfb0205 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 09:56:31 +0200 Subject: [PATCH] Tiktok example 5 --- tiktok2bsky.py | 88 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 522c26e..7b0b04f 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -106,6 +106,15 @@ GRID_SELECTORS = ( '[class*="DivVideoFeedV2"]' ) +# --- Grid error retry button selectors --- +RETRY_BUTTON_SELECTORS = [ + 'button:has-text("Actualizar")', + 'button:has-text("Refresh")', + 'button:has-text("Retry")', + 'button:has-text("Reintentar")', + '[data-e2e="retry-button"]', +] + # --- Logging Setup --- logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", @@ -602,7 +611,7 @@ def make_rich(content): return text_builder -# --- TikTok Scraping --- +# --- Banner helpers --- def _dismiss_banners(page): """ Dismiss all TikTok banners in the correct order: @@ -645,18 +654,39 @@ def _dismiss_banners(page): return any_dismissed +def _click_retry_button(page): + """ + Click the "Actualizar" / "Refresh" button that TikTok shows inside + the video grid when it renders an error state. Returns True if clicked. + """ + for selector in RETRY_BUTTON_SELECTORS: + try: + btn = page.locator(selector).first + if btn.is_visible(timeout=2000): + btn.click() + logging.info(f"🔁 Clicked grid retry button: {selector}") + time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) + return True + except Exception: + pass + return False + + +# --- TikTok Scraping --- def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: """ Scrape recent TikTok videos from a public profile using Playwright. No login required for public profiles. - Banner-handling strategy (fixes applied): - 1. Dismiss top RGPD info banner ("Entendido") first. - 2. Dismiss cookie consent modal ("Permitir todas" / etc.) second. - 3. Reload the page after all banners are dismissed so TikTok - renders the video grid cleanly (avoids "Hubo un problema"). - 4. playwright-stealth applied before navigation when available. - 5. Broader grid selector list + 30 s timeout + soft-fail on timeout. + Strategy: + 1. Navigate to profile and wait for page to settle. + 2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas"). + 3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema"). + 4. Dismiss any banners that reappear after reload. + 5. Wait for video grid selector (30 s, soft-fail). + 6. Click "Actualizar" retry button if TikTok shows grid error state. + 7. Scroll to load more videos. + 8. Collect all a[href*="/video/"] links. """ tiktoks = [] profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" @@ -740,43 +770,55 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> """) try: - # ── Initial navigation ─────────────────────────────────────── + # ── 1. Initial navigation ──────────────────────────────────── logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # ── Dismiss all banners ────────────────────────────────────── + # ── 2. Dismiss banners (first pass) ────────────────────────── _dismiss_banners(page) - # ── Reload for a clean grid render ─────────────────────────── - # TikTok renders "Hubo un problema" when the page first loaded - # with banners present. A fresh reload after banner dismissal - # gives TikTok a clean state and the grid renders correctly. + # ── 3. Reload for clean grid render ────────────────────────── + # TikTok serves "Hubo un problema" when the page first loaded + # while banners were blocking. A reload after dismissal gives + # TikTok a clean cookie state so the grid renders correctly. logging.info("🔄 Reloading page after banner dismissal for clean grid render...") page.reload(wait_until="domcontentloaded", timeout=40000) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # ── Dismiss any banners that reappear after reload ─────────── + # ── 4. Dismiss banners (second pass, post-reload) ──────────── _dismiss_banners(page) - # ── Wait for video grid ────────────────────────────────────── + # ── 5. Wait for video grid ─────────────────────────────────── try: page.wait_for_selector(GRID_SELECTORS, timeout=30000) logging.info("✅ TikTok video grid detected.") except Exception: logging.warning( - "⚠️ Grid selector timed out after 30s — " - "attempting scroll anyway (grid may be partially loaded)" + "⚠️ Grid selector timed out after 30s — continuing anyway." ) take_error_screenshot(page, "tiktok_grid_timeout") - # ── Scroll to load more videos ─────────────────────────────── + # ── 6. Click "Actualizar" if grid shows error state ────────── + # Even when the grid DOM node exists, TikTok may render an + # error card inside it. Clicking the retry button triggers a + # client-side reload of the video feed without a full page + # reload, which often resolves the empty grid. + if _click_retry_button(page): + logging.info("⏳ Waiting for grid to reload after retry click...") + try: + page.wait_for_selector(GRID_SELECTORS, timeout=15000) + logging.info("✅ Grid reloaded after retry.") + except Exception: + logging.warning("⚠️ Grid still not visible after retry click.") + + # ── 7. Scroll to load more videos ──────────────────────────── for scroll_i in range(TIKTOK_MAX_SCROLLS): page.evaluate("window.scrollBy(0, window.innerHeight * 2)") time.sleep(TIKTOK_SCROLL_PAUSE_S) logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") - # ── Collect video links ────────────────────────────────────── + # ── 8. Collect video links ─────────────────────────────────── video_links = page.locator('a[href*="/video/"]').all() logging.info( f"📊 Found {len(video_links)} video links. " @@ -970,9 +1012,9 @@ def _probe_video_duration(file_path): def download_and_crop_video(video_url: str, output_path: str): - temp_input = output_path.replace(".mp4", "_source.mp4") - temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") - temp_output = output_path.replace(".mp4", "_compressed.mp4") + temp_input = output_path.replace(".mp4", "_source.mp4") + temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") + temp_output = output_path.replace(".mp4", "_compressed.mp4") try: logging.info(f"⬇️ Downloading TikTok video: {video_url}")