TikTok example 5
This commit is contained in:
@@ -67,6 +67,7 @@ TIKTOK_PAGE_LOAD_WAIT_S = 5.0
|
||||
TIKTOK_SCROLL_PAUSE_S = 2.5
|
||||
TIKTOK_MAX_SCROLLS = 8
|
||||
TIKTOK_BANNER_WAIT_S = 3.0
|
||||
TIKTOK_MAX_LOAD_ATTEMPTS = 3 # full reload attempts before giving up
|
||||
|
||||
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||
@@ -617,7 +618,6 @@ def _dismiss_banners(page):
|
||||
Dismiss all TikTok banners in the correct order:
|
||||
1. Top RGPD/info banner ("Entendido")
|
||||
2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)
|
||||
|
||||
Returns True if at least one banner was dismissed.
|
||||
"""
|
||||
any_dismissed = False
|
||||
@@ -657,7 +657,8 @@ def _dismiss_banners(page):
|
||||
def _click_retry_button(page):
|
||||
"""
|
||||
Click the "Actualizar" / "Refresh" button that TikTok shows inside
|
||||
the video grid when it renders an error state. Returns True if clicked.
|
||||
the video grid when it renders an error state.
|
||||
Returns True if clicked.
|
||||
"""
|
||||
for selector in RETRY_BUTTON_SELECTORS:
|
||||
try:
|
||||
@@ -680,13 +681,17 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
|
||||
Strategy:
|
||||
1. Navigate to profile and wait for page to settle.
|
||||
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas").
|
||||
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema").
|
||||
4. Dismiss any banners that reappear after reload.
|
||||
5. Wait for video grid selector (30 s, soft-fail).
|
||||
6. Click "Actualizar" retry button if TikTok shows grid error state.
|
||||
7. Scroll to load more videos.
|
||||
8. Collect all a[href*="/video/"] links.
|
||||
2. Dismiss top RGPD banner + cookie modal.
|
||||
3. Reload page so TikTok renders the grid cleanly.
|
||||
4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
|
||||
a. Dismiss any banners that reappeared.
|
||||
b. Wait for grid selector (15 s, soft-fail).
|
||||
c. Click "Actualizar" retry button if grid shows error.
|
||||
d. Wait for grid to repopulate.
|
||||
e. Check for video links — break immediately if found.
|
||||
f. If not found and attempts remain → full page reload.
|
||||
5. Scroll to load more videos.
|
||||
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
|
||||
"""
|
||||
tiktoks = []
|
||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||
@@ -775,65 +780,101 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── 2. Dismiss banners (first pass) ──────────────────────────
|
||||
# ── 2. First banner dismissal ────────────────────────────────
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── 3. Reload for clean grid render ──────────────────────────
|
||||
# ── 3. Initial reload for clean grid render ──────────────────
|
||||
# TikTok serves "Hubo un problema" when the page first loaded
|
||||
# while banners were blocking. A reload after dismissal gives
|
||||
# while banners were present. A reload after dismissal gives
|
||||
# TikTok a clean cookie state so the grid renders correctly.
|
||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── 4. Dismiss banners (second pass, post-reload) ────────────
|
||||
# ── 4. Multi-attempt loop ────────────────────────────────────
|
||||
# Each attempt: dismiss banners → wait for grid → click retry
|
||||
# → check for links. Reload between attempts if still empty.
|
||||
video_links = []
|
||||
|
||||
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
|
||||
logging.info(
|
||||
f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..."
|
||||
)
|
||||
|
||||
# 4a. Dismiss any banners that reappeared
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── 5. Wait for video grid ───────────────────────────────────
|
||||
# 4b. Wait for grid selector (soft-fail)
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||
logging.info("✅ TikTok video grid detected.")
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||
logging.info(f"✅ Grid selector found on attempt {attempt}.")
|
||||
except Exception:
|
||||
logging.warning(
|
||||
"⚠️ Grid selector timed out after 30s — continuing anyway."
|
||||
f"⚠️ Grid selector timed out on attempt {attempt}."
|
||||
)
|
||||
take_error_screenshot(
|
||||
page, f"tiktok_grid_timeout_attempt{attempt}"
|
||||
)
|
||||
take_error_screenshot(page, "tiktok_grid_timeout")
|
||||
|
||||
# ── 6. Click "Actualizar" if grid shows error state ──────────
|
||||
# Even when the grid DOM node exists, TikTok may render an
|
||||
# error card inside it. Clicking the retry button triggers a
|
||||
# client-side reload of the video feed without a full page
|
||||
# reload, which often resolves the empty grid.
|
||||
# 4c. Click "Actualizar" if grid shows error state
|
||||
if _click_retry_button(page):
|
||||
# 4d. Wait for grid to repopulate after retry click
|
||||
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||
logging.info("✅ Grid reloaded after retry.")
|
||||
except Exception:
|
||||
logging.warning("⚠️ Grid still not visible after retry click.")
|
||||
logging.warning(
|
||||
"⚠️ Grid still not visible after retry click."
|
||||
)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── 7. Scroll to load more videos ────────────────────────────
|
||||
# 4e. Check if video links appeared
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Attempt {attempt}: found {len(video_links)} video links."
|
||||
)
|
||||
|
||||
if video_links:
|
||||
logging.info(
|
||||
f"✅ Got video links on attempt {attempt} — proceeding."
|
||||
)
|
||||
break
|
||||
|
||||
# 4f. No links yet — full reload before next attempt
|
||||
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
|
||||
logging.info(
|
||||
f"🔄 No videos found — reloading page "
|
||||
f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..."
|
||||
)
|
||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── 5. Scroll to load more videos ────────────────────────────
|
||||
if video_links:
|
||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||
|
||||
# ── 8. Collect video links ───────────────────────────────────
|
||||
# Re-collect after scrolling to include lazy-loaded cards
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Found {len(video_links)} video links. "
|
||||
f"📊 Found {len(video_links)} video links after scroll. "
|
||||
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
|
||||
)
|
||||
|
||||
# ── 6. Bail out if still nothing ─────────────────────────────
|
||||
if not video_links:
|
||||
take_error_screenshot(page, "tiktok_no_video_links")
|
||||
logging.error(
|
||||
"❌ No video links found after scroll. "
|
||||
"TikTok may still be blocking — check screenshot."
|
||||
"❌ No video links found after all attempts. "
|
||||
"Install playwright-stealth: pip install playwright-stealth"
|
||||
)
|
||||
browser.close()
|
||||
return []
|
||||
|
||||
# ── 7. Parse video links into ScrapedTikTok objects ──────────
|
||||
seen_urls = set()
|
||||
for link in video_links:
|
||||
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
||||
|
||||
Reference in New Issue
Block a user