Tiktok example 5
This commit is contained in:
@@ -106,6 +106,15 @@ GRID_SELECTORS = (
|
||||
'[class*="DivVideoFeedV2"]'
|
||||
)
|
||||
|
||||
# --- Grid error retry button selectors ---
|
||||
# Selectors for the retry control TikTok renders inside the video grid when
# it shows an error state. Order matters: they are tried first to last.
# Localized button labels come first, the data-e2e hook is the final fallback.
RETRY_BUTTON_SELECTORS = [
    f'button:has-text("{label}")'
    for label in ("Actualizar", "Refresh", "Retry", "Reintentar")
] + ['[data-e2e="retry-button"]']
|
||||
|
||||
# --- Logging Setup ---
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
@@ -602,7 +611,7 @@ def make_rich(content):
|
||||
return text_builder
|
||||
|
||||
|
||||
# --- TikTok Scraping ---
|
||||
# --- Banner helpers ---
|
||||
def _dismiss_banners(page):
|
||||
"""
|
||||
Dismiss all TikTok banners in the correct order:
|
||||
@@ -645,18 +654,39 @@ def _dismiss_banners(page):
|
||||
return any_dismissed
|
||||
|
||||
|
||||
def _click_retry_button(page):
    """
    Click the "Actualizar" / "Refresh" button that TikTok shows inside
    the video grid when it renders an error state.

    Each selector in RETRY_BUTTON_SELECTORS is tried in order. The first
    match that reports itself visible is clicked, after which the function
    sleeps for TIKTOK_PAGE_LOAD_WAIT_S seconds and returns.

    This is a best-effort helper: a failure on one selector is logged at
    debug level and the next selector is tried; nothing is raised.

    Args:
        page: Object exposing ``locator(selector)`` (presumably a
            Playwright ``Page`` -- confirm against the caller).

    Returns:
        True if a retry button was clicked, False otherwise.
    """
    for selector in RETRY_BUTTON_SELECTORS:
        try:
            btn = page.locator(selector).first
            # NOTE(review): current Playwright docs describe the ``timeout``
            # argument of ``Locator.is_visible`` as deprecated/ignored (the
            # check returns immediately). Confirm whether an actual wait
            # (e.g. ``wait_for(state="visible")``) was intended here.
            if btn.is_visible(timeout=2000):
                btn.click()
                logging.info(f"🔁 Clicked grid retry button: {selector}")
                # Give the feed time to re-render before the caller goes on.
                time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
                return True
        except Exception as exc:
            # Keep going with the next selector, but leave a trace so a
            # broken selector or closed page is diagnosable from the logs.
            logging.debug("Retry button selector %r failed: %s", selector, exc)
    return False
|
||||
|
||||
|
||||
# --- TikTok Scraping ---
|
||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
||||
"""
|
||||
Scrape recent TikTok videos from a public profile using Playwright.
|
||||
No login required for public profiles.
|
||||
|
||||
Banner-handling strategy (fixes applied):
|
||||
1. Dismiss top RGPD info banner ("Entendido") first.
|
||||
2. Dismiss cookie consent modal ("Permitir todas" / etc.) second.
|
||||
3. Reload the page after all banners are dismissed so TikTok
|
||||
renders the video grid cleanly (avoids "Hubo un problema").
|
||||
4. playwright-stealth applied before navigation when available.
|
||||
5. Broader grid selector list + 30 s timeout + soft-fail on timeout.
|
||||
Strategy:
|
||||
1. Navigate to profile and wait for page to settle.
|
||||
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas").
|
||||
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema").
|
||||
4. Dismiss any banners that reappear after reload.
|
||||
5. Wait for video grid selector (30 s, soft-fail).
|
||||
6. Click "Actualizar" retry button if TikTok shows grid error state.
|
||||
7. Scroll to load more videos.
|
||||
8. Collect all a[href*="/video/"] links.
|
||||
"""
|
||||
tiktoks = []
|
||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||
@@ -740,43 +770,55 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
""")
|
||||
|
||||
try:
|
||||
# ── Initial navigation ───────────────────────────────────────
|
||||
# ── 1. Initial navigation ────────────────────────────────────
|
||||
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── Dismiss all banners ──────────────────────────────────────
|
||||
# ── 2. Dismiss banners (first pass) ──────────────────────────
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── Reload for a clean grid render ───────────────────────────
|
||||
# TikTok renders "Hubo un problema" when the page first loaded
|
||||
# with banners present. A fresh reload after banner dismissal
|
||||
# gives TikTok a clean state and the grid renders correctly.
|
||||
# ── 3. Reload for clean grid render ──────────────────────────
|
||||
# TikTok serves "Hubo un problema" when the page first loaded
|
||||
# while banners were blocking. A reload after dismissal gives
|
||||
# TikTok a clean cookie state so the grid renders correctly.
|
||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── Dismiss any banners that reappear after reload ───────────
|
||||
# ── 4. Dismiss banners (second pass, post-reload) ────────────
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── Wait for video grid ──────────────────────────────────────
|
||||
# ── 5. Wait for video grid ───────────────────────────────────
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||
logging.info("✅ TikTok video grid detected.")
|
||||
except Exception:
|
||||
logging.warning(
|
||||
"⚠️ Grid selector timed out after 30s — "
|
||||
"attempting scroll anyway (grid may be partially loaded)"
|
||||
"⚠️ Grid selector timed out after 30s — continuing anyway."
|
||||
)
|
||||
take_error_screenshot(page, "tiktok_grid_timeout")
|
||||
|
||||
# ── Scroll to load more videos ───────────────────────────────
|
||||
# ── 6. Click "Actualizar" if grid shows error state ──────────
|
||||
# Even when the grid DOM node exists, TikTok may render an
|
||||
# error card inside it. Clicking the retry button triggers a
|
||||
# client-side reload of the video feed without a full page
|
||||
# reload, which often resolves the empty grid.
|
||||
if _click_retry_button(page):
|
||||
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||
logging.info("✅ Grid reloaded after retry.")
|
||||
except Exception:
|
||||
logging.warning("⚠️ Grid still not visible after retry click.")
|
||||
|
||||
# ── 7. Scroll to load more videos ────────────────────────────
|
||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||
|
||||
# ── Collect video links ──────────────────────────────────────
|
||||
# ── 8. Collect video links ───────────────────────────────────
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Found {len(video_links)} video links. "
|
||||
|
||||
Reference in New Issue
Block a user