Tiktok example 4

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 09:43:53 +02:00
parent 23e87c17b7
commit 01303de2d3

View File

@@ -63,18 +63,28 @@ FFPROBE_TIMEOUT_SECONDS = 15
DEFAULT_BSKY_BASE_URL = "https://bsky.social" DEFAULT_BSKY_BASE_URL = "https://bsky.social"
SESSION_FILE_PERMISSIONS = 0o600 SESSION_FILE_PERMISSIONS = 0o600
TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # increased from 3.0 TIKTOK_PAGE_LOAD_WAIT_S = 5.0
TIKTOK_SCROLL_PAUSE_S = 2.5 TIKTOK_SCROLL_PAUSE_S = 2.5
TIKTOK_MAX_SCROLLS = 8 # increased from 5 TIKTOK_MAX_SCROLLS = 8
TIKTOK_BANNER_WAIT_S = 3.0 # wait after dismissing cookie banner TIKTOK_BANNER_WAIT_S = 3.0
DYNAMIC_ALT_MAX_LENGTH = 150 DYNAMIC_ALT_MAX_LENGTH = 150
TRUNCATE_MIN_PREFIX_CHARS = 20 TRUNCATE_MIN_PREFIX_CHARS = 20
ORPHAN_DIGIT_MAX_DIGITS = 3 ORPHAN_DIGIT_MAX_DIGITS = 3
# --- Cookie banner selectors (Spanish + English) --- # --- Top info/RGPD banner selectors (dismissed first) ---
# Selectors tried, in order, for the top info/RGPD banner: first the
# dismiss-button captions (Spanish, then English), then structural fallbacks
# that do not depend on the button text.
TOP_BANNER_SELECTORS = [
    f'button:has-text("{caption}")'
    for caption in ("Entendido", "Got it", "Understood")
] + [
    '[data-e2e="top-banner-close"]',
    '[class*="BannerContainer"] button',
    '[class*="DivBannerContainer"] button',
]
# --- Cookie consent banner selectors (dismissed second) ---
GDPR_SELECTORS = [ GDPR_SELECTORS = [
'button:has-text("Permitir todas")', # ← exact text shown on screen 'button:has-text("Permitir todas")',
'button:has-text("Rechazar cookies opcionales")', 'button:has-text("Rechazar cookies opcionales")',
'button:has-text("Entendido")', 'button:has-text("Entendido")',
'button:has-text("Aceptar todo")', 'button:has-text("Aceptar todo")',
@@ -136,7 +146,6 @@ class ScrapedMedia:
class ScrapedTikTok: class ScrapedTikTok:
"""Mirrors ScrapedTweet from twitter2bsky.py."""
def __init__(self, created_on, text, video_url, post_url=None, thumbnail_url=None): def __init__(self, created_on, text, video_url, post_url=None, thumbnail_url=None):
self.created_on = created_on self.created_on = created_on
self.text = text self.text = text
@@ -562,7 +571,6 @@ def build_dynamic_alt(text):
def make_rich(content): def make_rich(content):
"""Build a Bluesky TextBuilder with hashtag and URL facets."""
text_builder = client_utils.TextBuilder() text_builder = client_utils.TextBuilder()
content = clean_post_text(content) content = clean_post_text(content)
lines = content.splitlines() lines = content.splitlines()
@@ -595,25 +603,65 @@ def make_rich(content):
# --- TikTok Scraping --- # --- TikTok Scraping ---
def _click_first_visible(page, selectors, label, settle_seconds, visible_timeout_ms):
    """
    Click the first selector in ``selectors`` whose first match is visible.

    Selectors are tried in order and the search stops at the first
    successful click. Any exception raised while probing or clicking a
    selector is logged at debug level and the next selector is tried, so
    this helper is best-effort and never raises for a failing selector.

    Args:
        page: Object exposing ``locator(selector)``; presumably a Playwright
            sync ``Page`` — confirm against the caller.
        selectors: Iterable of selector strings, tried in order.
        label: Human-readable banner name used in log messages.
        settle_seconds: Seconds to sleep after a successful click.
        visible_timeout_ms: Value forwarded as ``timeout`` to ``is_visible``.

    Returns:
        True if an element was clicked, False otherwise.
    """
    for selector in selectors:
        try:
            btn = page.locator(selector).first
            # NOTE(review): recent Playwright releases document the
            # ``timeout`` argument of ``Locator.is_visible`` as ignored
            # (the check returns immediately). If that holds for the
            # installed version, this is an instant probe — confirm.
            if btn.is_visible(timeout=visible_timeout_ms):
                btn.click()
                logging.info(f"✅ Dismissed {label}: {selector}")
                # Give the page time to react before touching it again.
                time.sleep(settle_seconds)
                return True
        except Exception as exc:
            # Best-effort: a bad selector or failed click must not abort the
            # scrape, but keep a trace so selector rot can be diagnosed.
            logging.debug(f"Selector failed for {label} ({selector}): {exc}")
    return False


def _dismiss_banners(page):
    """
    Dismiss all TikTok banners in the correct order:
      1. Top RGPD/info banner ("Entendido")
      2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)

    Both steps are always attempted; step 2 runs whether or not step 1
    found anything. At most one element is clicked per step.

    Args:
        page: Page object the banners are looked up on.

    Returns:
        True if at least one banner was dismissed, False otherwise.
    """
    # Step 1: top RGPD info banner (1 s pause after the click).
    top_dismissed = _click_first_visible(
        page,
        TOP_BANNER_SELECTORS,
        label="top banner",
        settle_seconds=1,
        visible_timeout_ms=2000,
    )

    # Step 2: cookie consent modal (pauses TIKTOK_BANNER_WAIT_S after the click).
    cookie_dismissed = _click_first_visible(
        page,
        GDPR_SELECTORS,
        label="cookie banner",
        settle_seconds=TIKTOK_BANNER_WAIT_S,
        visible_timeout_ms=3000,
    )

    any_dismissed = top_dismissed or cookie_dismissed
    if not any_dismissed:
        logging.info(" No banners found — continuing.")
    return any_dismissed
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
""" """
Scrape recent TikTok videos from a public profile using Playwright. Scrape recent TikTok videos from a public profile using Playwright.
No login required for public profiles. No login required for public profiles.
Fixes applied: Banner-handling strategy (fixes applied):
1. Aggressive GDPR/cookie banner dismissal — Spanish + English, 1. Dismiss top RGPD info banner ("Entendido") first.
waits TIKTOK_BANNER_WAIT_S after click for grid to render. 2. Dismiss cookie consent modal ("Permitir todas" / etc.) second.
2. Stealth headers: Windows Chrome UA, Europe/Madrid timezone, 3. Reload the page after all banners are dismissed so TikTok
es-ES locale, sec-ch-ua headers, navigator.webdriver hidden. renders the video grid cleanly (avoids "Hubo un problema").
3. playwright-stealth applied before navigation (graceful fallback 4. playwright-stealth applied before navigation when available.
if not installed). 5. Broader grid selector list + 30 s timeout + soft-fail on timeout.
4. Broader grid selector list + 30s timeout + continues with scroll
even if selector times out instead of hard-failing.
""" """
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
# FIX 3 — playwright-stealth (optional but strongly recommended) # playwright-stealth optional but strongly recommended
try: try:
from playwright_stealth import stealth_sync from playwright_stealth import stealth_sync
USE_STEALTH = True USE_STEALTH = True
@@ -638,7 +686,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
], ],
) )
# FIX 2 — Fake a real Windows Chrome with Spanish locale + Madrid timezone
context = browser.new_context( context = browser.new_context(
user_agent=( user_agent=(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -668,12 +715,10 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
page = context.new_page() page = context.new_page()
# FIX 3 — Apply playwright-stealth before any navigation
if USE_STEALTH: if USE_STEALTH:
stealth_sync(page) stealth_sync(page)
logging.info("🥷 Stealth patches applied.") logging.info("🥷 Stealth patches applied.")
# FIX 2 — Hide webdriver flag + fake plugins/languages
page.add_init_script(""" page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', { Object.defineProperty(navigator, 'plugins', {
@@ -695,30 +740,26 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
""") """)
try: try:
# ── Initial navigation ───────────────────────────────────────
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
# FIX 1 — Wait for page to settle before looking for banner
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# FIX 1 — Dismiss cookie/consent banner BEFORE waiting for grid # ── Dismiss all banners ──────────────────────────────────────
banner_dismissed = False _dismiss_banners(page)
for selector in GDPR_SELECTORS:
try:
btn = page.locator(selector).first
if btn.is_visible(timeout=3000):
btn.click()
logging.info(f"✅ Dismissed cookie banner: {selector}")
time.sleep(TIKTOK_BANNER_WAIT_S) # wait for grid to render
banner_dismissed = True
break
except Exception:
pass
if not banner_dismissed: # ── Reload for a clean grid render ───────────────────────────
logging.info(" No cookie banner found — continuing.") # TikTok renders "Hubo un problema" when the page first loaded
# with banners present. A fresh reload after banner dismissal
# gives TikTok a clean state and the grid renders correctly.
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# FIX 4 — Broader selector + longer timeout (30s) + soft fail # ── Dismiss any banners that reappear after reload ───────────
_dismiss_banners(page)
# ── Wait for video grid ──────────────────────────────────────
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=30000) page.wait_for_selector(GRID_SELECTORS, timeout=30000)
logging.info("✅ TikTok video grid detected.") logging.info("✅ TikTok video grid detected.")
@@ -729,13 +770,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
) )
take_error_screenshot(page, "tiktok_grid_timeout") take_error_screenshot(page, "tiktok_grid_timeout")
# Scroll to load more videos # ── Scroll to load more videos ───────────────────────────────
for scroll_i in range(TIKTOK_MAX_SCROLLS): for scroll_i in range(TIKTOK_MAX_SCROLLS):
page.evaluate("window.scrollBy(0, window.innerHeight * 2)") page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
time.sleep(TIKTOK_SCROLL_PAUSE_S) time.sleep(TIKTOK_SCROLL_PAUSE_S)
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
# Collect video links # ── Collect video links ──────────────────────────────────────
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Found {len(video_links)} video links. " f"📊 Found {len(video_links)} video links. "
@@ -772,7 +813,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
continue continue
seen_urls.add(canonical) seen_urls.add(canonical)
# Try to get caption from the card # Caption
caption = "" caption = ""
try: try:
card = link.locator("..").first card = link.locator("..").first
@@ -795,11 +836,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
except Exception: except Exception:
pass pass
created_on = arrow.utcnow().isoformat()
tiktoks.append( tiktoks.append(
ScrapedTikTok( ScrapedTikTok(
created_on=created_on, created_on=arrow.utcnow().isoformat(),
text=caption, text=caption,
video_url=canonical, video_url=canonical,
post_url=canonical, post_url=canonical,
@@ -849,12 +888,10 @@ def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = No
if ".m4s" in url_l or "/aud/" in url_l or "mp4a" in url_l: if ".m4s" in url_l or "/aud/" in url_l or "mp4a" in url_l:
return return
if ".m3u8" in url_l or "mpegurl" in content_type: if ".m3u8" in url_l or "mpegurl" in content_type:
if best_m3u8_url is None: if best_m3u8_url is None:
best_m3u8_url = url best_m3u8_url = url
return return
if ".mp4" in url_l or "video/mp4" in content_type: if ".mp4" in url_l or "video/mp4" in content_type:
if best_mp4_url is None: if best_mp4_url is None:
best_mp4_url = url best_mp4_url = url