Added 4
This commit is contained in:
@@ -675,32 +675,20 @@ def _click_retry_button(page):
|
||||
|
||||
# --- TikTok Scraping ---
|
||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
||||
"""
|
||||
Scrape recent TikTok videos from a public profile using Playwright.
|
||||
No login required for public profiles.
|
||||
|
||||
Strategy:
|
||||
1. Navigate to profile and wait for page to settle.
|
||||
2. Dismiss top RGPD banner + cookie modal.
|
||||
3. Reload page so TikTok renders the grid cleanly.
|
||||
4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
|
||||
a. Dismiss any banners that reappeared.
|
||||
b. Wait for grid selector (15 s soft-fail).
|
||||
c. Click "Actualizar" retry button if grid shows error.
|
||||
d. Wait for grid to repopulate.
|
||||
e. Check for video links — break immediately if found.
|
||||
f. If not found and attempts remain → full page reload.
|
||||
5. Scroll to load more videos.
|
||||
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
|
||||
"""
|
||||
tiktoks = []
|
||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||
|
||||
# playwright-stealth — optional but strongly recommended
|
||||
# playwright-stealth 2.x uses Stealth class; 1.x uses stealth_sync
|
||||
try:
|
||||
from playwright_stealth import Stealth
|
||||
USE_STEALTH = "v2"
|
||||
_stealth = Stealth()
|
||||
logging.info("🥷 playwright-stealth 2.x available — stealth mode ON")
|
||||
except ImportError:
|
||||
try:
|
||||
from playwright_stealth import stealth_sync
|
||||
USE_STEALTH = True
|
||||
logging.info("🥷 playwright-stealth available — stealth mode ON")
|
||||
USE_STEALTH = "v1"
|
||||
logging.info("🥷 playwright-stealth 1.x available — stealth mode ON (legacy)")
|
||||
except ImportError:
|
||||
USE_STEALTH = False
|
||||
logging.warning(
|
||||
@@ -750,9 +738,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
|
||||
page = context.new_page()
|
||||
|
||||
if USE_STEALTH:
|
||||
# Apply stealth patches — version-aware
|
||||
if USE_STEALTH == "v2":
|
||||
_stealth.apply_stealth_sync(page)
|
||||
logging.info("🥷 Stealth patches applied (2.x).")
|
||||
elif USE_STEALTH == "v1":
|
||||
stealth_sync(page)
|
||||
logging.info("🥷 Stealth patches applied.")
|
||||
logging.info("🥷 Stealth patches applied (1.x legacy).")
|
||||
|
||||
page.add_init_script("""
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
@@ -784,16 +776,11 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── 3. Initial reload for clean grid render ──────────────────
|
||||
# TikTok serves "Hubo un problema" when the page first loaded
|
||||
# while banners were present. A reload after dismissal gives
|
||||
# TikTok a clean cookie state so the grid renders correctly.
|
||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# ── 4. Multi-attempt loop ────────────────────────────────────
|
||||
# Each attempt: dismiss banners → wait for grid → click retry
|
||||
# → check for links. Reload between attempts if still empty.
|
||||
video_links = []
|
||||
|
||||
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
|
||||
@@ -818,18 +805,15 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
|
||||
# 4c. Click "Actualizar" if grid shows error state
|
||||
if _click_retry_button(page):
|
||||
# 4d. Wait for grid to repopulate after retry click
|
||||
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||
logging.info("✅ Grid reloaded after retry.")
|
||||
except Exception:
|
||||
logging.warning(
|
||||
"⚠️ Grid still not visible after retry click."
|
||||
)
|
||||
logging.warning("⚠️ Grid still not visible after retry click.")
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# 4e. Check if video links appeared
|
||||
# 4d. Check if video links appeared
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Attempt {attempt}: found {len(video_links)} video links."
|
||||
@@ -841,7 +825,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
)
|
||||
break
|
||||
|
||||
# 4f. No links yet — full reload before next attempt
|
||||
# 4e. No links yet — full reload before next attempt
|
||||
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
|
||||
logging.info(
|
||||
f"🔄 No videos found — reloading page "
|
||||
@@ -857,7 +841,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||
|
||||
# Re-collect after scrolling to include lazy-loaded cards
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Found {len(video_links)} video links after scroll. "
|
||||
@@ -869,12 +852,12 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
take_error_screenshot(page, "tiktok_no_video_links")
|
||||
logging.error(
|
||||
"❌ No video links found after all attempts. "
|
||||
"Install playwright-stealth: pip install playwright-stealth"
|
||||
"Check screenshot for current TikTok block state."
|
||||
)
|
||||
browser.close()
|
||||
return []
|
||||
|
||||
# ── 7. Parse video links into ScrapedTikTok objects ──────────
|
||||
# ── 7. Parse video links ──────────────────────────────────────
|
||||
seen_urls = set()
|
||||
for link in video_links:
|
||||
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
||||
@@ -896,7 +879,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
continue
|
||||
seen_urls.add(canonical)
|
||||
|
||||
# Caption
|
||||
caption = ""
|
||||
try:
|
||||
card = link.locator("..").first
|
||||
@@ -910,7 +892,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Thumbnail
|
||||
thumbnail_url = None
|
||||
try:
|
||||
img = link.locator("img").first
|
||||
@@ -943,7 +924,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
||||
return tiktoks
|
||||
|
||||
|
||||
# --- Video URL extraction ---
|
||||
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user