diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 7b0b04f..8281795 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -63,10 +63,11 @@ FFPROBE_TIMEOUT_SECONDS = 15 DEFAULT_BSKY_BASE_URL = "https://bsky.social" SESSION_FILE_PERMISSIONS = 0o600 -TIKTOK_PAGE_LOAD_WAIT_S = 5.0 -TIKTOK_SCROLL_PAUSE_S = 2.5 -TIKTOK_MAX_SCROLLS = 8 -TIKTOK_BANNER_WAIT_S = 3.0 +TIKTOK_PAGE_LOAD_WAIT_S = 5.0 +TIKTOK_SCROLL_PAUSE_S = 2.5 +TIKTOK_MAX_SCROLLS = 8 +TIKTOK_BANNER_WAIT_S = 3.0 +TIKTOK_MAX_LOAD_ATTEMPTS = 3 # full reload attempts before giving up DYNAMIC_ALT_MAX_LENGTH = 150 TRUNCATE_MIN_PREFIX_CHARS = 20 @@ -405,7 +406,7 @@ def remember_posted_video(state, candidate, bsky_uri=None): def candidate_matches_state(candidate, state): - canonical_url = candidate["canonical_post_url"] + canonical_url = candidate["canonical_post_url"] text_media_key = candidate["text_media_key"] normalized_text = candidate["normalized_text"] posted = state.get("posted_videos", {}) @@ -617,7 +618,6 @@ def _dismiss_banners(page): Dismiss all TikTok banners in the correct order: 1. Top RGPD/info banner ("Entendido") 2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.) - Returns True if at least one banner was dismissed. """ any_dismissed = False @@ -657,7 +657,8 @@ def _dismiss_banners(page): def _click_retry_button(page): """ Click the "Actualizar" / "Refresh" button that TikTok shows inside - the video grid when it renders an error state. Returns True if clicked. + the video grid when it renders an error state. + Returns True if clicked. """ for selector in RETRY_BUTTON_SELECTORS: try: @@ -679,14 +680,18 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> No login required for public profiles. Strategy: - 1. Navigate to profile and wait for page to settle. - 2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas"). - 3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema"). - 4. Dismiss any banners that reappear after reload. - 5. Wait for video grid selector (30 s, soft-fail). - 6. Click "Actualizar" retry button if TikTok shows grid error state. - 7. Scroll to load more videos. - 8. Collect all a[href*="/video/"] links. + 1. Navigate to profile and wait for page to settle. + 2. Dismiss top RGPD banner + cookie modal. + 3. Reload page so TikTok renders the grid cleanly. + 4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS): + a. Dismiss any banners that reappeared. + b. Wait for grid selector (15 s soft-fail). + c. Click "Actualizar" retry button if grid shows error. + d. Wait for grid to repopulate. + e. Check for video links — break immediately if found. + f. If not found and attempts remain → full page reload. + 5. Scroll to load more videos. + 6. Parse all a[href*="/video/"] links into ScrapedTikTok objects. """ tiktoks = [] profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" @@ -775,65 +780,101 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # ── 2. Dismiss banners (first pass) ────────────────────────── + # ── 2. First banner dismissal ──────────────────────────────── _dismiss_banners(page) - # ── 3. Reload for clean grid render ────────────────────────── + # ── 3. Initial reload for clean grid render ────────────────── # TikTok serves "Hubo un problema" when the page first loaded - # while banners were blocking. A reload after dismissal gives + # while banners were present. A reload after dismissal gives # TikTok a clean cookie state so the grid renders correctly. logging.info("🔄 Reloading page after banner dismissal for clean grid render...") page.reload(wait_until="domcontentloaded", timeout=40000) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # ── 4. Dismiss banners (second pass, post-reload) ──────────── - _dismiss_banners(page) + # ── 4. Multi-attempt loop ──────────────────────────────────── + # Each attempt: dismiss banners → wait for grid → click retry + # → check for links. Reload between attempts if still empty. + video_links = [] - # ── 5. Wait for video grid ─────────────────────────────────── - try: - page.wait_for_selector(GRID_SELECTORS, timeout=30000) - logging.info("✅ TikTok video grid detected.") - except Exception: - logging.warning( - "⚠️ Grid selector timed out after 30s — continuing anyway." + for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1): + logging.info( + f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..." ) - take_error_screenshot(page, "tiktok_grid_timeout") - # ── 6. Click "Actualizar" if grid shows error state ────────── - # Even when the grid DOM node exists, TikTok may render an - # error card inside it. Clicking the retry button triggers a - # client-side reload of the video feed without a full page - # reload, which often resolves the empty grid. - if _click_retry_button(page): - logging.info("⏳ Waiting for grid to reload after retry click...") + # 4a. Dismiss any banners that reappeared + _dismiss_banners(page) + + # 4b. Wait for grid selector (soft-fail) try: page.wait_for_selector(GRID_SELECTORS, timeout=15000) - logging.info("✅ Grid reloaded after retry.") + logging.info(f"✅ Grid selector found on attempt {attempt}.") except Exception: - logging.warning("⚠️ Grid still not visible after retry click.") + logging.warning( + f"⚠️ Grid selector timed out on attempt {attempt}." + ) + take_error_screenshot( + page, f"tiktok_grid_timeout_attempt{attempt}" + ) - # ── 7. Scroll to load more videos ──────────────────────────── - for scroll_i in range(TIKTOK_MAX_SCROLLS): - page.evaluate("window.scrollBy(0, window.innerHeight * 2)") - time.sleep(TIKTOK_SCROLL_PAUSE_S) - logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") + # 4c. Click "Actualizar" if grid shows error state + if _click_retry_button(page): + # 4d. Wait for grid to repopulate after retry click + logging.info("⏳ Waiting for grid to reload after retry click...") + try: + page.wait_for_selector(GRID_SELECTORS, timeout=15000) + logging.info("✅ Grid reloaded after retry.") + except Exception: + logging.warning( + "⚠️ Grid still not visible after retry click." + ) + time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # ── 8. Collect video links ─────────────────────────────────── - video_links = page.locator('a[href*="/video/"]').all() - logging.info( - f"📊 Found {len(video_links)} video links. " - f"Parsing up to {SCRAPE_VIDEO_LIMIT}..." - ) + # 4e. Check if video links appeared + video_links = page.locator('a[href*="/video/"]').all() + logging.info( + f"📊 Attempt {attempt}: found {len(video_links)} video links." + ) + if video_links: + logging.info( + f"✅ Got video links on attempt {attempt} — proceeding." + ) + break + + # 4f. No links yet — full reload before next attempt + if attempt < TIKTOK_MAX_LOAD_ATTEMPTS: + logging.info( + f"🔄 No videos found — reloading page " + f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..." + ) + page.reload(wait_until="domcontentloaded", timeout=40000) + time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) + + # ── 5. Scroll to load more videos ──────────────────────────── + if video_links: + for scroll_i in range(TIKTOK_MAX_SCROLLS): + page.evaluate("window.scrollBy(0, window.innerHeight * 2)") + time.sleep(TIKTOK_SCROLL_PAUSE_S) + logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") + + # Re-collect after scrolling to include lazy-loaded cards + video_links = page.locator('a[href*="/video/"]').all() + logging.info( + f"📊 Found {len(video_links)} video links after scroll. " + f"Parsing up to {SCRAPE_VIDEO_LIMIT}..." + ) + + # ── 6. Bail out if still nothing ───────────────────────────── if not video_links: take_error_screenshot(page, "tiktok_no_video_links") logging.error( - "❌ No video links found after scroll. " - "TikTok may still be blocking — check screenshot." + "❌ No video links found after all attempts. " + "Install playwright-stealth: pip install playwright-stealth" ) browser.close() return [] + # ── 7. Parse video links into ScrapedTikTok objects ────────── seen_urls = set() for link in video_links: if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: