Tiktok example 5

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 09:56:31 +02:00
parent 01303de2d3
commit 1cf7a334c0

View File

@@ -106,6 +106,15 @@ GRID_SELECTORS = (
'[class*="DivVideoFeedV2"]' '[class*="DivVideoFeedV2"]'
) )
# --- Grid error retry button selectors ---
# Candidate selectors for the retry/refresh button that can appear inside the
# video grid when it renders an error state. _click_retry_button() walks this
# list in order and clicks the first selector whose element is visible, so
# put the most likely match first.
# The text-based entries cover Spanish ("Actualizar", "Reintentar") and
# English ("Refresh", "Retry") button labels; the last entry matches on a
# data-e2e attribute instead of visible text.
RETRY_BUTTON_SELECTORS = [
'button:has-text("Actualizar")',
'button:has-text("Refresh")',
'button:has-text("Retry")',
'button:has-text("Reintentar")',
'[data-e2e="retry-button"]',
]
# --- Logging Setup --- # --- Logging Setup ---
logging.basicConfig( logging.basicConfig(
format="%(asctime)s [%(levelname)s] %(message)s", format="%(asctime)s [%(levelname)s] %(message)s",
@@ -602,7 +611,7 @@ def make_rich(content):
return text_builder return text_builder
# --- TikTok Scraping --- # --- Banner helpers ---
def _dismiss_banners(page): def _dismiss_banners(page):
""" """
Dismiss all TikTok banners in the correct order: Dismiss all TikTok banners in the correct order:
@@ -645,18 +654,39 @@ def _dismiss_banners(page):
return any_dismissed return any_dismissed
def _click_retry_button(page):
    """
    Click the "Actualizar" / "Refresh" button that TikTok shows inside
    the video grid when it renders an error state.

    Each selector in RETRY_BUTTON_SELECTORS is tried in order. The first
    one whose element is visible is clicked, after which the function
    sleeps TIKTOK_PAGE_LOAD_WAIT_S seconds to give the feed time to reload.

    This is a best-effort helper: any exception raised while probing or
    clicking a selector is logged at DEBUG level and the next selector is
    tried. Nothing is propagated to the caller.

    Args:
        page: Page object exposing ``locator()`` (a Playwright page in the
            visible caller).

    Returns:
        True if a retry button was clicked, False if none was found or
        every attempt failed.
    """
    for selector in RETRY_BUTTON_SELECTORS:
        try:
            btn = page.locator(selector).first
            # NOTE(review): current Playwright docs describe the `timeout`
            # argument of Locator.is_visible() as ignored (the check returns
            # immediately). If a real 2 s wait was intended, this should
            # become btn.wait_for(state="visible", timeout=2000) — confirm.
            if btn.is_visible(timeout=2000):
                btn.click()
                logging.info(f"🔁 Clicked grid retry button: {selector}")
                time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
                return True
        except Exception as exc:
            # Keep going with the remaining selectors, but leave a trace so
            # selector breakage or page errors are diagnosable.
            logging.debug(
                f"Retry button probe failed for {selector!r}: {exc}"
            )
    return False
# --- TikTok Scraping ---
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list: def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
""" """
Scrape recent TikTok videos from a public profile using Playwright. Scrape recent TikTok videos from a public profile using Playwright.
No login required for public profiles. No login required for public profiles.
Banner-handling strategy (fixes applied): Strategy:
1. Dismiss top RGPD info banner ("Entendido") first. 1. Navigate to profile and wait for page to settle.
2. Dismiss cookie consent modal ("Permitir todas" / etc.) second. 2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas").
3. Reload the page after all banners are dismissed so TikTok 3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema").
renders the video grid cleanly (avoids "Hubo un problema"). 4. Dismiss any banners that reappear after reload.
4. playwright-stealth applied before navigation when available. 5. Wait for video grid selector (30 s, soft-fail).
5. Broader grid selector list + 30 s timeout + soft-fail on timeout. 6. Click "Actualizar" retry button if TikTok shows grid error state.
7. Scroll to load more videos.
8. Collect all a[href*="/video/"] links.
""" """
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
@@ -740,43 +770,55 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
""") """)
try: try:
# ── Initial navigation ─────────────────────────────────────── # ── 1. Initial navigation ────────────────────────────────────
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── Dismiss all banners ────────────────────────────────────── # ── 2. Dismiss banners (first pass) ──────────────────────────
_dismiss_banners(page) _dismiss_banners(page)
# ── Reload for a clean grid render ────────────────────────── # ── 3. Reload for clean grid render ──────────────────────────
# TikTok renders "Hubo un problema" when the page first loaded # TikTok serves "Hubo un problema" when the page first loaded
# with banners present. A fresh reload after banner dismissal # while banners were blocking. A reload after dismissal gives
# gives TikTok a clean state and the grid renders correctly. # TikTok a clean cookie state so the grid renders correctly.
logging.info("🔄 Reloading page after banner dismissal for clean grid render...") logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
page.reload(wait_until="domcontentloaded", timeout=40000) page.reload(wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# ── Dismiss any banners that reappear after reload ─────────── # ── 4. Dismiss banners (second pass, post-reload) ────────────
_dismiss_banners(page) _dismiss_banners(page)
# ── Wait for video grid ────────────────────────────────────── # ── 5. Wait for video grid ───────────────────────────────────
try: try:
page.wait_for_selector(GRID_SELECTORS, timeout=30000) page.wait_for_selector(GRID_SELECTORS, timeout=30000)
logging.info("✅ TikTok video grid detected.") logging.info("✅ TikTok video grid detected.")
except Exception: except Exception:
logging.warning( logging.warning(
"⚠️ Grid selector timed out after 30s — " "⚠️ Grid selector timed out after 30s — continuing anyway."
"attempting scroll anyway (grid may be partially loaded)"
) )
take_error_screenshot(page, "tiktok_grid_timeout") take_error_screenshot(page, "tiktok_grid_timeout")
# ── Scroll to load more videos ─────────────────────────────── # ── 6. Click "Actualizar" if grid shows error state ──────────
# Even when the grid DOM node exists, TikTok may render an
# error card inside it. Clicking the retry button triggers a
# client-side reload of the video feed without a full page
# reload, which often resolves the empty grid.
if _click_retry_button(page):
logging.info("⏳ Waiting for grid to reload after retry click...")
try:
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
logging.info("✅ Grid reloaded after retry.")
except Exception:
logging.warning("⚠️ Grid still not visible after retry click.")
# ── 7. Scroll to load more videos ────────────────────────────
for scroll_i in range(TIKTOK_MAX_SCROLLS): for scroll_i in range(TIKTOK_MAX_SCROLLS):
page.evaluate("window.scrollBy(0, window.innerHeight * 2)") page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
time.sleep(TIKTOK_SCROLL_PAUSE_S) time.sleep(TIKTOK_SCROLL_PAUSE_S)
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}") logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
# ── Collect video links ────────────────────────────────────── # ── 8. Collect video links ───────────────────────────────────
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info( logging.info(
f"📊 Found {len(video_links)} video links. " f"📊 Found {len(video_links)} video links. "