TikTok example 5
This commit is contained in:
@@ -67,6 +67,7 @@ TIKTOK_PAGE_LOAD_WAIT_S = 5.0
|
|||||||
TIKTOK_SCROLL_PAUSE_S = 2.5
|
TIKTOK_SCROLL_PAUSE_S = 2.5
|
||||||
TIKTOK_MAX_SCROLLS = 8
|
TIKTOK_MAX_SCROLLS = 8
|
||||||
TIKTOK_BANNER_WAIT_S = 3.0
|
TIKTOK_BANNER_WAIT_S = 3.0
|
||||||
|
TIKTOK_MAX_LOAD_ATTEMPTS = 3 # full reload attempts before giving up
|
||||||
|
|
||||||
DYNAMIC_ALT_MAX_LENGTH = 150
|
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||||
TRUNCATE_MIN_PREFIX_CHARS = 20
|
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||||
@@ -617,7 +618,6 @@ def _dismiss_banners(page):
|
|||||||
Dismiss all TikTok banners in the correct order:
|
Dismiss all TikTok banners in the correct order:
|
||||||
1. Top RGPD/info banner ("Entendido")
|
1. Top RGPD/info banner ("Entendido")
|
||||||
2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)
|
2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)
|
||||||
|
|
||||||
Returns True if at least one banner was dismissed.
|
Returns True if at least one banner was dismissed.
|
||||||
"""
|
"""
|
||||||
any_dismissed = False
|
any_dismissed = False
|
||||||
@@ -657,7 +657,8 @@ def _dismiss_banners(page):
|
|||||||
def _click_retry_button(page):
|
def _click_retry_button(page):
|
||||||
"""
|
"""
|
||||||
Click the "Actualizar" / "Refresh" button that TikTok shows inside
|
Click the "Actualizar" / "Refresh" button that TikTok shows inside
|
||||||
the video grid when it renders an error state. Returns True if clicked.
|
the video grid when it renders an error state.
|
||||||
|
Returns True if clicked.
|
||||||
"""
|
"""
|
||||||
for selector in RETRY_BUTTON_SELECTORS:
|
for selector in RETRY_BUTTON_SELECTORS:
|
||||||
try:
|
try:
|
||||||
@@ -680,13 +681,17 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
|
|
||||||
Strategy:
|
Strategy:
|
||||||
1. Navigate to profile and wait for page to settle.
|
1. Navigate to profile and wait for page to settle.
|
||||||
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas").
|
2. Dismiss top RGPD banner + cookie modal.
|
||||||
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema").
|
3. Reload page so TikTok renders the grid cleanly.
|
||||||
4. Dismiss any banners that reappear after reload.
|
4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
|
||||||
5. Wait for video grid selector (30 s, soft-fail).
|
a. Dismiss any banners that reappeared.
|
||||||
6. Click "Actualizar" retry button if TikTok shows grid error state.
|
b. Wait for grid selector (15 s soft-fail).
|
||||||
7. Scroll to load more videos.
|
c. Click "Actualizar" retry button if grid shows error.
|
||||||
8. Collect all a[href*="/video/"] links.
|
d. Wait for grid to repopulate.
|
||||||
|
e. Check for video links — break immediately if found.
|
||||||
|
f. If not found and attempts remain → full page reload.
|
||||||
|
5. Scroll to load more videos.
|
||||||
|
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects (returns [] early if no links were found).
|
||||||
"""
|
"""
|
||||||
tiktoks = []
|
tiktoks = []
|
||||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||||
@@ -775,65 +780,101 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── 2. Dismiss banners (first pass) ──────────────────────────
|
# ── 2. First banner dismissal ────────────────────────────────
|
||||||
_dismiss_banners(page)
|
_dismiss_banners(page)
|
||||||
|
|
||||||
# ── 3. Reload for clean grid render ──────────────────────────
|
# ── 3. Initial reload for clean grid render ──────────────────
|
||||||
# TikTok serves "Hubo un problema" when the page first loaded
|
# TikTok serves "Hubo un problema" when the page first loaded
|
||||||
# while banners were blocking. A reload after dismissal gives
|
# while banners were present. A reload after dismissal gives
|
||||||
# TikTok a clean cookie state so the grid renders correctly.
|
# TikTok a clean cookie state so the grid renders correctly.
|
||||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── 4. Dismiss banners (second pass, post-reload) ────────────
|
# ── 4. Multi-attempt loop ────────────────────────────────────
|
||||||
|
# Each attempt: dismiss banners → wait for grid → click retry
|
||||||
|
# → check for links. Reload between attempts if still empty.
|
||||||
|
video_links = []
|
||||||
|
|
||||||
|
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
|
||||||
|
logging.info(
|
||||||
|
f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..."
|
||||||
|
)
|
||||||
|
|
||||||
|
# 4a. Dismiss any banners that reappeared
|
||||||
_dismiss_banners(page)
|
_dismiss_banners(page)
|
||||||
|
|
||||||
# ── 5. Wait for video grid ───────────────────────────────────
|
# 4b. Wait for grid selector (soft-fail)
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||||
logging.info("✅ TikTok video grid detected.")
|
logging.info(f"✅ Grid selector found on attempt {attempt}.")
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"⚠️ Grid selector timed out after 30s — continuing anyway."
|
f"⚠️ Grid selector timed out on attempt {attempt}."
|
||||||
|
)
|
||||||
|
take_error_screenshot(
|
||||||
|
page, f"tiktok_grid_timeout_attempt{attempt}"
|
||||||
)
|
)
|
||||||
take_error_screenshot(page, "tiktok_grid_timeout")
|
|
||||||
|
|
||||||
# ── 6. Click "Actualizar" if grid shows error state ──────────
|
# 4c. Click "Actualizar" if grid shows error state
|
||||||
# Even when the grid DOM node exists, TikTok may render an
|
|
||||||
# error card inside it. Clicking the retry button triggers a
|
|
||||||
# client-side reload of the video feed without a full page
|
|
||||||
# reload, which often resolves the empty grid.
|
|
||||||
if _click_retry_button(page):
|
if _click_retry_button(page):
|
||||||
|
# 4d. Wait for grid to repopulate after retry click
|
||||||
logging.info("⏳ Waiting for grid to reload after retry click...")
|
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||||
logging.info("✅ Grid reloaded after retry.")
|
logging.info("✅ Grid reloaded after retry.")
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning("⚠️ Grid still not visible after retry click.")
|
logging.warning(
|
||||||
|
"⚠️ Grid still not visible after retry click."
|
||||||
|
)
|
||||||
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── 7. Scroll to load more videos ────────────────────────────
|
# 4e. Check if video links appeared
|
||||||
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
|
logging.info(
|
||||||
|
f"📊 Attempt {attempt}: found {len(video_links)} video links."
|
||||||
|
)
|
||||||
|
|
||||||
|
if video_links:
|
||||||
|
logging.info(
|
||||||
|
f"✅ Got video links on attempt {attempt} — proceeding."
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
# 4f. No links yet — full reload before next attempt
|
||||||
|
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
|
||||||
|
logging.info(
|
||||||
|
f"🔄 No videos found — reloading page "
|
||||||
|
f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..."
|
||||||
|
)
|
||||||
|
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||||
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
|
# ── 5. Scroll to load more videos ────────────────────────────
|
||||||
|
if video_links:
|
||||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||||
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
||||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||||
|
|
||||||
# ── 8. Collect video links ───────────────────────────────────
|
# Re-collect after scrolling to include lazy-loaded cards
|
||||||
video_links = page.locator('a[href*="/video/"]').all()
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
logging.info(
|
logging.info(
|
||||||
f"📊 Found {len(video_links)} video links. "
|
f"📊 Found {len(video_links)} video links after scroll. "
|
||||||
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
|
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── 6. Bail out if still nothing ─────────────────────────────
|
||||||
if not video_links:
|
if not video_links:
|
||||||
take_error_screenshot(page, "tiktok_no_video_links")
|
take_error_screenshot(page, "tiktok_no_video_links")
|
||||||
logging.error(
|
logging.error(
|
||||||
"❌ No video links found after scroll. "
|
"❌ No video links found after all attempts. "
|
||||||
"TikTok may still be blocking — check screenshot."
|
"Install playwright-stealth: pip install playwright-stealth"
|
||||||
)
|
)
|
||||||
browser.close()
|
browser.close()
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# ── 7. Parse video links into ScrapedTikTok objects ──────────
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
for link in video_links:
|
for link in video_links:
|
||||||
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
||||||
|
|||||||
Reference in New Issue
Block a user