diff --git a/jenkins/jijantesFCTikTok b/jenkins/jijantesFCTikTok index 01f1be8..a90d207 100644 --- a/jenkins/jijantesFCTikTok +++ b/jenkins/jijantesFCTikTok @@ -37,6 +37,7 @@ pipeline { "${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \ atproto \ playwright \ + playwright-stealth \ httpx \ arrow \ python-dotenv \ diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 6958c84..2df98fb 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -24,12 +24,16 @@ import grapheme # --- Configuration --- LOG_PATH = "tiktok2bsky.log" STATE_PATH = "tiktok2bsky_state.json" -SCRAPE_VIDEO_LIMIT = 15 # TikTok loads fewer items per scroll than Twitter DEDUPE_BSKY_LIMIT = 30 VIDEO_MAX_AGE_DAYS = 3 BSKY_TEXT_MAX_LENGTH = 300 DEFAULT_BSKY_LANGS = ["ca"] +TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # was 3.0 — increased for slower grid render +TIKTOK_MAX_SCROLLS = 8 # was 5 — more scrolls = more videos discovered +SCRAPE_VIDEO_LIMIT = 30 # was 15 + + VIDEO_MAX_DURATION_SECONDS = 179 MAX_VIDEO_UPLOAD_SIZE_MB = 45 @@ -64,7 +68,6 @@ DEFAULT_BSKY_BASE_URL = "https://bsky.social" SESSION_FILE_PERMISSIONS = 0o600 TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load -TIKTOK_MAX_SCROLLS = 5 # how many times to scroll down the profile TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads DYNAMIC_ALT_MAX_LENGTH = 150 TRUNCATE_MIN_PREFIX_CHARS = 20 @@ -531,16 +534,32 @@ def make_rich(content): return text_builder +# --- TikTok Scraping --- # --- TikTok Scraping --- def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list: """ Scrape recent TikTok videos from a public profile using Playwright. No login required for public profiles. Returns a list of ScrapedTikTok objects. + + Fixes applied: + 1. Aggressive GDPR/consent banner dismissal (Spanish + English) + 2. Stealth headers: timezone, locale, sec-ch-ua, webdriver flag hidden + 3. playwright-stealth applied before navigation + 4. Broader + longer grid selector wait (30s, more selectors) """ tiktoks = [] profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" + # playwright-stealth is optional but strongly recommended + try: + from playwright_stealth import stealth_sync + USE_STEALTH = True + logging.info("🥷 playwright-stealth available — stealth mode ON") + except ImportError: + USE_STEALTH = False + logging.warning("⚠️ playwright-stealth not installed — running without stealth") + with sync_playwright() as p: browser = p.chromium.launch( headless=True, @@ -548,59 +567,112 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> "--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-setuid-sandbox", + "--disable-dev-shm-usage", + "--disable-gpu", + "--window-size=1366,768", ], ) + + # FIX 2 — Fake a real Windows Chrome browser with Spanish locale + Madrid timezone context = browser.new_context( user_agent=( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/145.0.7632.6 Safari/537.36" + "Chrome/124.0.0.0 Safari/537.36" ), - viewport={"width": 1920, "height": 1080}, - locale=locale, - # TikTok checks these headers — set them explicitly + viewport={"width": 1366, "height": 768}, + locale="es-ES", + timezone_id="Europe/Madrid", extra_http_headers={ - "Accept-Language": f"{locale},en;q=0.9", + "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", + "Accept": ( + "text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/avif,image/webp,*/*;q=0.8" + ), "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", + "Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"Windows"', }, ) + page = context.new_page() + # FIX 3 — Apply playwright-stealth before any navigation + if USE_STEALTH: + stealth_sync(page) + logging.info("🥷 Stealth patches applied.") + + # FIX 2 — Hide webdriver flag + fake plugins/languages via init script + page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'plugins', { + get: () => [ + { name: 'Chrome PDF Plugin' }, + { name: 'Chrome PDF Viewer' }, + { name: 'Native Client' } + ] + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['es-ES', 'es', 'en'] + }); + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: {} + }; + """) + try: logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) - time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - # Dismiss cookie/consent banners if present - for selector in [ + # FIX 1 — Wait longer for initial page render (was 3.0s) + time.sleep(TIKTOK_PAGE_LOAD_WAIT_S + 2) + + # FIX 1 — Aggressive GDPR/consent banner dismissal (Spanish + English) + GDPR_SELECTORS = [ + 'button:has-text("Entendido")', + 'button:has-text("Aceptar todo")', 'button:has-text("Accept all")', + 'button:has-text("Got it")', 'button:has-text("Decline optional")', '[data-e2e="cookie-banner-accept"]', - ]: + '[id*="accept"]', + '[class*="accept-btn"]', + ] + for selector in GDPR_SELECTORS: try: btn = page.locator(selector).first - if btn.is_visible(timeout=2000): + if btn.is_visible(timeout=3000): btn.click() - time.sleep(1) + logging.info(f"✅ Dismissed banner: {selector}") + time.sleep(2) break except Exception: pass - # Wait for video grid to appear + # FIX 4 — Broader selector list + longer timeout (30s, was 20s) + GRID_SELECTORS = ( + '[data-e2e="user-post-item"], ' + '[class*="DivItemContainerV2"], ' + 'a[href*="/video/"], ' + '[class*="video-feed"], ' + 'div[class*="VideoFeed"], ' + '[class*="DivVideoFeedV2"]' + ) try: - page.wait_for_selector( - '[data-e2e="user-post-item"], ' - '[class*="DivItemContainerV2"], ' - 'a[href*="/video/"]', - timeout=20000, - ) + page.wait_for_selector(GRID_SELECTORS, timeout=30000) + logging.info("✅ TikTok video grid detected.") except Exception: - take_error_screenshot(page, "tiktok_profile_load_failed") - logging.error("❌ TikTok video grid did not appear.") - browser.close() - return [] + # FIX 4 — Don't give up immediately: try scrolling anyway + logging.warning( + "⚠️ Grid selector timed out — attempting scroll anyway " + "(grid may still be partially loaded)" + ) # Scroll to load more videos for scroll_i in range(TIKTOK_MAX_SCROLLS): @@ -610,7 +682,16 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> # Collect video links video_links = page.locator('a[href*="/video/"]').all() - logging.info(f"📊 Found {len(video_links)} video links. Parsing up to {SCRAPE_VIDEO_LIMIT}...") + logging.info( + f"📊 Found {len(video_links)} video links. " + f"Parsing up to {SCRAPE_VIDEO_LIMIT}..." + ) + + if not video_links: + take_error_screenshot(page, "tiktok_no_video_links") + logging.error("❌ No video links found after scroll. TikTok may be blocking.") + browser.close() + return [] seen_urls = set() for link in video_links: @@ -633,10 +714,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> continue seen_urls.add(canonical) - # Try to get caption from the card itself (avoids opening each video) + # Try to get caption from the card itself caption = "" try: - # The caption is often in a sibling/child element card = link.locator("..").first caption_el = card.locator( '[data-e2e="video-desc"], ' @@ -657,15 +737,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> except Exception: pass - # TikTok doesn't expose post timestamps in the grid — - # use now as a conservative estimate; dedup prevents re-posting created_on = arrow.utcnow().isoformat() tiktoks.append( ScrapedTikTok( created_on=created_on, text=caption, - video_url=canonical, # placeholder; real URL resolved later + video_url=canonical, post_url=canonical, thumbnail_url=thumbnail_url, ) @@ -685,7 +763,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.") return tiktoks - # --- Video extraction --- def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None: """