TikTok example 5

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 10:12:27 +02:00
parent 1cf7a334c0
commit 185efaca5f

View File

@@ -63,10 +63,11 @@ FFPROBE_TIMEOUT_SECONDS = 15
DEFAULT_BSKY_BASE_URL = "https://bsky.social" DEFAULT_BSKY_BASE_URL = "https://bsky.social"
SESSION_FILE_PERMISSIONS = 0o600 SESSION_FILE_PERMISSIONS = 0o600
TIKTOK_PAGE_LOAD_WAIT_S = 5.0 TIKTOK_PAGE_LOAD_WAIT_S = 5.0
TIKTOK_SCROLL_PAUSE_S = 2.5 TIKTOK_SCROLL_PAUSE_S = 2.5
TIKTOK_MAX_SCROLLS = 8 TIKTOK_MAX_SCROLLS = 8
TIKTOK_BANNER_WAIT_S = 3.0 TIKTOK_BANNER_WAIT_S = 3.0
TIKTOK_MAX_LOAD_ATTEMPTS = 3 # full reload attempts before giving up
DYNAMIC_ALT_MAX_LENGTH = 150 DYNAMIC_ALT_MAX_LENGTH = 150
TRUNCATE_MIN_PREFIX_CHARS = 20 TRUNCATE_MIN_PREFIX_CHARS = 20
@@ -405,7 +406,7 @@ def remember_posted_video(state, candidate, bsky_uri=None):
def candidate_matches_state(candidate, state): def candidate_matches_state(candidate, state):
canonical_url = candidate["canonical_post_url"] canonical_url = candidate["canonical_post_url"]
text_media_key = candidate["text_media_key"] text_media_key = candidate["text_media_key"]
normalized_text = candidate["normalized_text"] normalized_text = candidate["normalized_text"]
posted = state.get("posted_videos", {}) posted = state.get("posted_videos", {})
@@ -617,7 +618,6 @@ def _dismiss_banners(page):
Dismiss all TikTok banners in the correct order: Dismiss all TikTok banners in the correct order:
1. Top RGPD/info banner ("Entendido") 1. Top RGPD/info banner ("Entendido")
2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.) 2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)
Returns True if at least one banner was dismissed. Returns True if at least one banner was dismissed.
""" """
any_dismissed = False any_dismissed = False
@@ -657,7 +657,8 @@ def _dismiss_banners(page):
def _click_retry_button(page): def _click_retry_button(page):
""" """
Click the "Actualizar" / "Refresh" button that TikTok shows inside Click the "Actualizar" / "Refresh" button that TikTok shows inside
the video grid when it renders an error state. Returns True if clicked. the video grid when it renders an error state.
Returns True if clicked.
""" """
for selector in RETRY_BUTTON_SELECTORS: for selector in RETRY_BUTTON_SELECTORS:
try: try:
@@ -679,14 +680,18 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
No login required for public profiles. No login required for public profiles.
Strategy: Strategy:
1. Navigate to profile and wait for page to settle. 1. Navigate to profile and wait for page to settle.
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas"). 2. Dismiss top RGPD banner + cookie modal.
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema"). 3. Reload page so TikTok renders the grid cleanly.
4. Dismiss any banners that reappear after reload. 4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
5. Wait for video grid selector (30 s, soft-fail). a. Dismiss any banners that reappeared.
6. Click "Actualizar" retry button if TikTok shows grid error state. b. Wait for grid selector (15 s soft-fail).
7. Scroll to load more videos. c. Click "Actualizar" retry button if grid shows error.
8. Collect all a[href*="/video/"] links. d. Wait for grid to repopulate.
e. Check for video links — break immediately if found.
f. If not found and attempts remain → full page reload.
5. Scroll to load more videos.
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
""" """
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
@@ -775,65 +780,101 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 2. Dismiss banners (first pass) ────────────────────────── # ── 2. First banner dismissal ────────────────────────────────
_dismiss_banners(page) _dismiss_banners(page)
# ── 3. Reload for clean grid render ────────────────────────── # ── 3. Initial reload for clean grid render ──────────────────
# TikTok serves "Hubo un problema" when the page first loaded # TikTok serves "Hubo un problema" when the page first loaded
# while banners were blocking. A reload after dismissal gives # while banners were present. A reload after dismissal gives
# TikTok a clean cookie state so the grid renders correctly. # TikTok a clean cookie state so the grid renders correctly.
logging.info("🔄 Reloading page after banner dismissal for clean grid render...") logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
page.reload(wait_until="domcontentloaded", timeout=40000) page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 4. Dismiss banners (second pass, post-reload) ──────────── # ── 4. Multi-attempt loop ────────────────────────────────────
_dismiss_banners(page) # Each attempt: dismiss banners → wait for grid → click retry
# → check for links. Reload between attempts if still empty.
video_links = []
# ── 5. Wait for video grid ─────────────────────────────────── for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
try: logging.info(
page.wait_for_selector(GRID_SELECTORS, timeout=30000) f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..."
logging.info("✅ TikTok video grid detected.")
except Exception:
logging.warning(
"⚠️ Grid selector timed out after 30s — continuing anyway."
) )
take_error_screenshot(page, "tiktok_grid_timeout")
# ── 6. Click "Actualizar" if grid shows error state ────────── # 4a. Dismiss any banners that reappeared
# Even when the grid DOM node exists, TikTok may render an _dismiss_banners(page)
# error card inside it. Clicking the retry button triggers a
# client-side reload of the video feed without a full page # 4b. Wait for grid selector (soft-fail)
# reload, which often resolves the empty grid.
if _click_retry_button(page):
logging.info("⏳ Waiting for grid to reload after retry click...")
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=15000) page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("✅ Grid reloaded after retry.") logging.info(f"✅ Grid selector found on attempt {attempt}.")
except Exception: except Exception:
logging.warning("⚠️ Grid still not visible after retry click.") logging.warning(
f"⚠️ Grid selector timed out on attempt {attempt}."
)
take_error_screenshot(
page, f"tiktok_grid_timeout_attempt{attempt}"
)
# ── 7. Scroll to load more videos ──────────────────────────── # 4c. Click "Actualizar" if grid shows error state
for scroll_i in range(TIKTOK_MAX_SCROLLS): if _click_retry_button(page):
page.evaluate("window.scrollBy(0, window.innerHeight * 2)") # 4d. Wait for grid to repopulate after retry click
time.sleep(TIKTOK_SCROLL_PAUSE_S) logging.info("⏳ Waiting for grid to reload after retry click...")
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") try:
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("✅ Grid reloaded after retry.")
except Exception:
logging.warning(
"⚠️ Grid still not visible after retry click."
)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 8. Collect video links ─────────────────────────────────── # 4e. Check if video links appeared
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Found {len(video_links)} video links. " f"📊 Attempt {attempt}: found {len(video_links)} video links."
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..." )
)
if video_links:
logging.info(
f"✅ Got video links on attempt {attempt} — proceeding."
)
break
# 4f. No links yet — full reload before next attempt
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
logging.info(
f"🔄 No videos found — reloading page "
f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..."
)
page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── 5. Scroll to load more videos ────────────────────────────
if video_links:
for scroll_i in range(TIKTOK_MAX_SCROLLS):
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
time.sleep(TIKTOK_SCROLL_PAUSE_S)
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
# Re-collect after scrolling to include lazy-loaded cards
video_links = page.locator('a[href*="/video/"]').all()
logging.info(
f"📊 Found {len(video_links)} video links after scroll. "
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
)
# ── 6. Bail out if still nothing ─────────────────────────────
if not video_links: if not video_links:
take_error_screenshot(page, "tiktok_no_video_links") take_error_screenshot(page, "tiktok_no_video_links")
logging.error( logging.error(
"❌ No video links found after scroll. " "❌ No video links found after all attempts. "
"TikTok may still be blocking — check screenshot." "Install playwright-stealth: pip install playwright-stealth"
) )
browser.close() browser.close()
return [] return []
# ── 7. Parse video links into ScrapedTikTok objects ──────────
seen_urls = set() seen_urls = set()
for link in video_links: for link in video_links:
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: