diff --git a/tiktok2bsky.py b/tiktok2bsky.py index e94eeb2..522c26e 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -63,18 +63,28 @@ FFPROBE_TIMEOUT_SECONDS = 15 DEFAULT_BSKY_BASE_URL = "https://bsky.social" SESSION_FILE_PERMISSIONS = 0o600 -TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # increased from 3.0 +TIKTOK_PAGE_LOAD_WAIT_S = 5.0 TIKTOK_SCROLL_PAUSE_S = 2.5 -TIKTOK_MAX_SCROLLS = 8 # increased from 5 -TIKTOK_BANNER_WAIT_S = 3.0 # wait after dismissing cookie banner +TIKTOK_MAX_SCROLLS = 8 +TIKTOK_BANNER_WAIT_S = 3.0 DYNAMIC_ALT_MAX_LENGTH = 150 TRUNCATE_MIN_PREFIX_CHARS = 20 ORPHAN_DIGIT_MAX_DIGITS = 3 -# --- Cookie banner selectors (Spanish + English) --- +# --- Top info/RGPD banner selectors (dismissed first) --- +TOP_BANNER_SELECTORS = [ + 'button:has-text("Entendido")', + 'button:has-text("Got it")', + 'button:has-text("Understood")', + '[data-e2e="top-banner-close"]', + '[class*="BannerContainer"] button', + '[class*="DivBannerContainer"] button', +] + +# --- Cookie consent banner selectors (dismissed second) --- GDPR_SELECTORS = [ - 'button:has-text("Permitir todas")', # ← exact text shown on screen + 'button:has-text("Permitir todas")', 'button:has-text("Rechazar cookies opcionales")', 'button:has-text("Entendido")', 'button:has-text("Aceptar todo")', @@ -136,7 +146,6 @@ class ScrapedMedia: class ScrapedTikTok: - """Mirrors ScrapedTweet from twitter2bsky.py.""" def __init__(self, created_on, text, video_url, post_url=None, thumbnail_url=None): self.created_on = created_on self.text = text @@ -562,7 +571,6 @@ def build_dynamic_alt(text): def make_rich(content): - """Build a Bluesky TextBuilder with hashtag and URL facets.""" text_builder = client_utils.TextBuilder() content = clean_post_text(content) lines = content.splitlines() @@ -595,25 +603,65 @@ def make_rich(content): # --- TikTok Scraping --- +def _dismiss_banners(page): + """ + Dismiss all TikTok banners in the correct order: + 1. Top RGPD/info banner ("Entendido") + 2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.) + + Returns True if at least one banner was dismissed. + """ + any_dismissed = False + + # ── Step 1: Top RGPD info banner ──────────────────────────────────── + for selector in TOP_BANNER_SELECTORS: + try: + btn = page.locator(selector).first + if btn.is_visible(timeout=2000): + btn.click() + logging.info(f"✅ Dismissed top banner: {selector}") + time.sleep(1) + any_dismissed = True + break + except Exception: + pass + + # ── Step 2: Cookie consent modal ──────────────────────────────────── + for selector in GDPR_SELECTORS: + try: + btn = page.locator(selector).first + if btn.is_visible(timeout=3000): + btn.click() + logging.info(f"✅ Dismissed cookie banner: {selector}") + time.sleep(TIKTOK_BANNER_WAIT_S) + any_dismissed = True + break + except Exception: + pass + + if not any_dismissed: + logging.info("ℹ️ No banners found — continuing.") + + return any_dismissed + + def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: """ Scrape recent TikTok videos from a public profile using Playwright. No login required for public profiles. - Fixes applied: - 1. Aggressive GDPR/cookie banner dismissal — Spanish + English, - waits TIKTOK_BANNER_WAIT_S after click for grid to render. - 2. Stealth headers: Windows Chrome UA, Europe/Madrid timezone, - es-ES locale, sec-ch-ua headers, navigator.webdriver hidden. - 3. playwright-stealth applied before navigation (graceful fallback - if not installed). - 4. Broader grid selector list + 30s timeout + continues with scroll - even if selector times out instead of hard-failing. + Banner-handling strategy (fixes applied): + 1. Dismiss top RGPD info banner ("Entendido") first. + 2. Dismiss cookie consent modal ("Permitir todas" / etc.) second. + 3. Reload the page after all banners are dismissed so TikTok + renders the video grid cleanly (avoids "Hubo un problema"). + 4. playwright-stealth applied before navigation when available. + 5. Broader grid selector list + 30 s timeout + soft-fail on timeout. """ tiktoks = [] profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" - # FIX 3 — playwright-stealth (optional but strongly recommended) + # playwright-stealth — optional but strongly recommended try: from playwright_stealth import stealth_sync USE_STEALTH = True @@ -638,7 +686,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> ], ) - # FIX 2 — Fake a real Windows Chrome with Spanish locale + Madrid timezone context = browser.new_context( user_agent=( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " @@ -668,12 +715,10 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> page = context.new_page() - # FIX 3 — Apply playwright-stealth before any navigation if USE_STEALTH: stealth_sync(page) logging.info("🥷 Stealth patches applied.") - # FIX 2 — Hide webdriver flag + fake plugins/languages page.add_init_script(""" Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); Object.defineProperty(navigator, 'plugins', { @@ -695,30 +740,26 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> """) try: + # ── Initial navigation ─────────────────────────────────────── logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) - - # FIX 1 — Wait for page to settle before looking for banner time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # FIX 1 — Dismiss cookie/consent banner BEFORE waiting for grid - banner_dismissed = False - for selector in GDPR_SELECTORS: - try: - btn = page.locator(selector).first - if btn.is_visible(timeout=3000): - btn.click() - logging.info(f"✅ Dismissed cookie banner: {selector}") - time.sleep(TIKTOK_BANNER_WAIT_S) # wait for grid to render - banner_dismissed = True - break - except Exception: - pass + # ── Dismiss all banners ────────────────────────────────────── + _dismiss_banners(page) - if not banner_dismissed: - logging.info("ℹ️ No cookie banner found — continuing.") + # ── Reload for a clean grid render ─────────────────────────── + # TikTok renders "Hubo un problema" when the page first loaded + # with banners present. A fresh reload after banner dismissal + # gives TikTok a clean state and the grid renders correctly. + logging.info("🔄 Reloading page after banner dismissal for clean grid render...") + page.reload(wait_until="domcontentloaded", timeout=40000) + time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # FIX 4 — Broader selector + longer timeout (30s) + soft fail + # ── Dismiss any banners that reappear after reload ─────────── + _dismiss_banners(page) + + # ── Wait for video grid ────────────────────────────────────── try: page.wait_for_selector(GRID_SELECTORS, timeout=30000) logging.info("✅ TikTok video grid detected.") @@ -729,13 +770,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> ) take_error_screenshot(page, "tiktok_grid_timeout") - # Scroll to load more videos + # ── Scroll to load more videos ─────────────────────────────── for scroll_i in range(TIKTOK_MAX_SCROLLS): page.evaluate("window.scrollBy(0, window.innerHeight * 2)") time.sleep(TIKTOK_SCROLL_PAUSE_S) logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") - # Collect video links + # ── Collect video links ────────────────────────────────────── video_links = page.locator('a[href*="/video/"]').all() logging.info( f"📊 Found {len(video_links)} video links. " @@ -772,7 +813,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> continue seen_urls.add(canonical) - # Try to get caption from the card + # Caption caption = "" try: card = link.locator("..").first @@ -795,11 +836,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> except Exception: pass - created_on = arrow.utcnow().isoformat() - tiktoks.append( ScrapedTikTok( - created_on=created_on, + created_on=arrow.utcnow().isoformat(), text=caption, video_url=canonical, post_url=canonical, @@ -849,12 +888,10 @@ def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = No if ".m4s" in url_l or "/aud/" in url_l or "mp4a" in url_l: return - if ".m3u8" in url_l or "mpegurl" in content_type: if best_m3u8_url is None: best_m3u8_url = url return - if ".mp4" in url_l or "video/mp4" in content_type: if best_mp4_url is None: best_mp4_url = url @@ -933,9 +970,9 @@ def _probe_video_duration(file_path): def download_and_crop_video(video_url: str, output_path: str): - temp_input = output_path.replace(".mp4", "_source.mp4") - temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") - temp_output = output_path.replace(".mp4", "_compressed.mp4") + temp_input = output_path.replace(".mp4", "_source.mp4") + temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") + temp_output = output_path.replace(".mp4", "_compressed.mp4") try: logging.info(f"⬇️ Downloading TikTok video: {video_url}")