Added 4
This commit is contained in:
@@ -675,32 +675,20 @@ def _click_retry_button(page):
|
|||||||
|
|
||||||
# --- TikTok Scraping ---
|
# --- TikTok Scraping ---
|
||||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
||||||
"""
|
|
||||||
Scrape recent TikTok videos from a public profile using Playwright.
|
|
||||||
No login required for public profiles.
|
|
||||||
|
|
||||||
Strategy:
|
|
||||||
1. Navigate to profile and wait for page to settle.
|
|
||||||
2. Dismiss top RGPD banner + cookie modal.
|
|
||||||
3. Reload page so TikTok renders the grid cleanly.
|
|
||||||
4. Multi-attempt loop (TIKTOK_MAX_LOAD_ATTEMPTS):
|
|
||||||
a. Dismiss any banners that reappeared.
|
|
||||||
b. Wait for grid selector (15 s soft-fail).
|
|
||||||
c. Click "Actualizar" retry button if grid shows error.
|
|
||||||
d. Wait for grid to repopulate.
|
|
||||||
e. Check for video links — break immediately if found.
|
|
||||||
f. If not found and attempts remain → full page reload.
|
|
||||||
5. Scroll to load more videos.
|
|
||||||
6. Parse all a[href*="/video/"] links into ScrapedTikTok objects.
|
|
||||||
"""
|
|
||||||
tiktoks = []
|
tiktoks = []
|
||||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||||
|
|
||||||
# playwright-stealth — optional but strongly recommended
|
# playwright-stealth 2.x uses Stealth class; 1.x uses stealth_sync
|
||||||
|
try:
|
||||||
|
from playwright_stealth import Stealth
|
||||||
|
USE_STEALTH = "v2"
|
||||||
|
_stealth = Stealth()
|
||||||
|
logging.info("🥷 playwright-stealth 2.x available — stealth mode ON")
|
||||||
|
except ImportError:
|
||||||
try:
|
try:
|
||||||
from playwright_stealth import stealth_sync
|
from playwright_stealth import stealth_sync
|
||||||
USE_STEALTH = True
|
USE_STEALTH = "v1"
|
||||||
logging.info("🥷 playwright-stealth available — stealth mode ON")
|
logging.info("🥷 playwright-stealth 1.x available — stealth mode ON (legacy)")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
USE_STEALTH = False
|
USE_STEALTH = False
|
||||||
logging.warning(
|
logging.warning(
|
||||||
@@ -750,9 +738,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
if USE_STEALTH:
|
# Apply stealth patches — version-aware
|
||||||
|
if USE_STEALTH == "v2":
|
||||||
|
_stealth.apply_stealth_sync(page)
|
||||||
|
logging.info("🥷 Stealth patches applied (2.x).")
|
||||||
|
elif USE_STEALTH == "v1":
|
||||||
stealth_sync(page)
|
stealth_sync(page)
|
||||||
logging.info("🥷 Stealth patches applied.")
|
logging.info("🥷 Stealth patches applied (1.x legacy).")
|
||||||
|
|
||||||
page.add_init_script("""
|
page.add_init_script("""
|
||||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
@@ -784,16 +776,11 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
_dismiss_banners(page)
|
_dismiss_banners(page)
|
||||||
|
|
||||||
# ── 3. Initial reload for clean grid render ──────────────────
|
# ── 3. Initial reload for clean grid render ──────────────────
|
||||||
# TikTok serves "Hubo un problema" when the page first loaded
|
|
||||||
# while banners were present. A reload after dismissal gives
|
|
||||||
# TikTok a clean cookie state so the grid renders correctly.
|
|
||||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── 4. Multi-attempt loop ────────────────────────────────────
|
# ── 4. Multi-attempt loop ────────────────────────────────────
|
||||||
# Each attempt: dismiss banners → wait for grid → click retry
|
|
||||||
# → check for links. Reload between attempts if still empty.
|
|
||||||
video_links = []
|
video_links = []
|
||||||
|
|
||||||
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
|
for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1):
|
||||||
@@ -818,18 +805,15 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
|
|
||||||
# 4c. Click "Actualizar" if grid shows error state
|
# 4c. Click "Actualizar" if grid shows error state
|
||||||
if _click_retry_button(page):
|
if _click_retry_button(page):
|
||||||
# 4d. Wait for grid to repopulate after retry click
|
|
||||||
logging.info("⏳ Waiting for grid to reload after retry click...")
|
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||||
logging.info("✅ Grid reloaded after retry.")
|
logging.info("✅ Grid reloaded after retry.")
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(
|
logging.warning("⚠️ Grid still not visible after retry click.")
|
||||||
"⚠️ Grid still not visible after retry click."
|
|
||||||
)
|
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# 4e. Check if video links appeared
|
# 4d. Check if video links appeared
|
||||||
video_links = page.locator('a[href*="/video/"]').all()
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
logging.info(
|
logging.info(
|
||||||
f"📊 Attempt {attempt}: found {len(video_links)} video links."
|
f"📊 Attempt {attempt}: found {len(video_links)} video links."
|
||||||
@@ -841,7 +825,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
# 4f. No links yet — full reload before next attempt
|
# 4e. No links yet — full reload before next attempt
|
||||||
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
|
if attempt < TIKTOK_MAX_LOAD_ATTEMPTS:
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🔄 No videos found — reloading page "
|
f"🔄 No videos found — reloading page "
|
||||||
@@ -857,7 +841,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||||
|
|
||||||
# Re-collect after scrolling to include lazy-loaded cards
|
|
||||||
video_links = page.locator('a[href*="/video/"]').all()
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
logging.info(
|
logging.info(
|
||||||
f"📊 Found {len(video_links)} video links after scroll. "
|
f"📊 Found {len(video_links)} video links after scroll. "
|
||||||
@@ -869,12 +852,12 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
take_error_screenshot(page, "tiktok_no_video_links")
|
take_error_screenshot(page, "tiktok_no_video_links")
|
||||||
logging.error(
|
logging.error(
|
||||||
"❌ No video links found after all attempts. "
|
"❌ No video links found after all attempts. "
|
||||||
"Install playwright-stealth: pip install playwright-stealth"
|
"Check screenshot for current TikTok block state."
|
||||||
)
|
)
|
||||||
browser.close()
|
browser.close()
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# ── 7. Parse video links into ScrapedTikTok objects ──────────
|
# ── 7. Parse video links ──────────────────────────────────────
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
for link in video_links:
|
for link in video_links:
|
||||||
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
if len(tiktoks) >= SCRAPE_VIDEO_LIMIT:
|
||||||
@@ -896,7 +879,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
continue
|
continue
|
||||||
seen_urls.add(canonical)
|
seen_urls.add(canonical)
|
||||||
|
|
||||||
# Caption
|
|
||||||
caption = ""
|
caption = ""
|
||||||
try:
|
try:
|
||||||
card = link.locator("..").first
|
card = link.locator("..").first
|
||||||
@@ -910,7 +892,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Thumbnail
|
|
||||||
thumbnail_url = None
|
thumbnail_url = None
|
||||||
try:
|
try:
|
||||||
img = link.locator("img").first
|
img = link.locator("img").first
|
||||||
@@ -943,7 +924,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
||||||
return tiktoks
|
return tiktoks
|
||||||
|
|
||||||
|
|
||||||
# --- Video URL extraction ---
|
# --- Video URL extraction ---
|
||||||
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None):
|
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user