diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 8281795..c9cfe72 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -675,38 +675,26 @@ def _click_retry_button(page): # --- TikTok Scraping --- def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: - """ - Scrape recent TikTok videos from a public profile using Playwright. - No login required for public profiles. - - Strategy: - 1. Navigate to profile and wait for page to settle. - 2. Dismiss top RGPD banner + cookie modal. - 3. Reload page so TikTok renders the grid cleanly. - 4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS): - a. Dismiss any banners that reappeared. - b. Wait for grid selector (15 s soft-fail). - c. Click "Actualizar" retry button if grid shows error. - d. Wait for grid to repopulate. - e. Check for video links — break immediately if found. - f. If not found and attempts remain → full page reload. - 5. Scroll to load more videos. - 6. Parse all a[href*="/video/"] links into ScrapedTikTok objects. - """ tiktoks = [] profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" - # playwright-stealth — optional but strongly recommended + # playwright-stealth 2.x uses Stealth class; 1.x uses stealth_sync try: - from playwright_stealth import stealth_sync - USE_STEALTH = True - logging.info("🥷 playwright-stealth available — stealth mode ON") + from playwright_stealth import Stealth + USE_STEALTH = "v2" + _stealth = Stealth() + logging.info("🥷 playwright-stealth 2.x available — stealth mode ON") except ImportError: - USE_STEALTH = False - logging.warning( - "⚠️ playwright-stealth not installed — running without stealth. " - "Run: pip install playwright-stealth" - ) + try: + from playwright_stealth import stealth_sync + USE_STEALTH = "v1" + logging.info("🥷 playwright-stealth 1.x available — stealth mode ON (legacy)") + except ImportError: + USE_STEALTH = False + logging.warning( + "⚠️ playwright-stealth not installed — running without stealth. " + "Run: pip install playwright-stealth" + ) with sync_playwright() as p: browser = p.chromium.launch( @@ -750,9 +738,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> page = context.new_page() - if USE_STEALTH: + # Apply stealth patches — version-aware + if USE_STEALTH == "v2": + _stealth.apply_stealth_sync(page) + logging.info("🥷 Stealth patches applied (2.x).") + elif USE_STEALTH == "v1": stealth_sync(page) - logging.info("🥷 Stealth patches applied.") + logging.info("🥷 Stealth patches applied (1.x legacy).") page.add_init_script(""" Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); @@ -784,16 +776,11 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> _dismiss_banners(page) # ── 3. Initial reload for clean grid render ────────────────── - # TikTok serves "Hubo un problema" when the page first loaded - # while banners were present. A reload after dismissal gives - # TikTok a clean cookie state so the grid renders correctly. logging.info("🔄 Reloading page after banner dismissal for clean grid render...") page.reload(wait_until="domcontentloaded", timeout=40000) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) # ── 4. Multi-attempt loop ──────────────────────────────────── - # Each attempt: dismiss banners → wait for grid → click retry - # → check for links. Reload between attempts if still empty. video_links = [] for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1): @@ -818,18 +805,15 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> # 4c. Click "Actualizar" if grid shows error state if _click_retry_button(page): - # 4d. Wait for grid to repopulate after retry click logging.info("⏳ Waiting for grid to reload after retry click...") try: page.wait_for_selector(GRID_SELECTORS, timeout=15000) logging.info("✅ Grid reloaded after retry.") except Exception: - logging.warning( - "⚠️ Grid still not visible after retry click." - ) + logging.warning("⚠️ Grid still not visible after retry click.") time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # 4e. Check if video links appeared + # 4d. Check if video links appeared video_links = page.locator('a[href*="/video/"]').all() logging.info( f"📊 Attempt {attempt}: found {len(video_links)} video links." @@ -841,7 +825,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> ) break - # 4f. No links yet — full reload before next attempt + # 4e. No links yet — full reload before next attempt if attempt < TIKTOK_MAX_LOAD_ATTEMPTS: logging.info( f"🔄 No videos found — reloading page " @@ -857,7 +841,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> time.sleep(TIKTOK_SCROLL_PAUSE_S) logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") - # Re-collect after scrolling to include lazy-loaded cards video_links = page.locator('a[href*="/video/"]').all() logging.info( f"📊 Found {len(video_links)} video links after scroll. " @@ -869,12 +852,12 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> take_error_screenshot(page, "tiktok_no_video_links") logging.error( "❌ No video links found after all attempts. " - "Install playwright-stealth: pip install playwright-stealth" + "Check screenshot for current TikTok block state." ) browser.close() return [] - # ── 7. Parse video links into ScrapedTikTok objects ────────── + # ── 7. Parse video links ────────────────────────────────────── seen_urls = set() for link in video_links: if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: @@ -896,7 +879,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> continue seen_urls.add(canonical) - # Caption caption = "" try: card = link.locator("..").first @@ -910,7 +892,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> except Exception: pass - # Thumbnail thumbnail_url = None try: img = link.locator("img").first @@ -943,7 +924,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.") return tiktoks - # --- Video URL extraction --- def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None): """