Tiktok example 5
This commit is contained in:
@@ -106,6 +106,15 @@ GRID_SELECTORS = (
|
|||||||
'[class*="DivVideoFeedV2"]'
|
'[class*="DivVideoFeedV2"]'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- Grid error retry button selectors ---
|
||||||
|
# Selectors for the retry/refresh button shown when the video grid is in an
# error state. _click_retry_button() tries them in this order and clicks the
# first one that is visible, so order determines precedence.
RETRY_BUTTON_SELECTORS = [
    # Text-based matches: Spanish and English button labels.
    'button:has-text("Actualizar")',
    'button:has-text("Refresh")',
    'button:has-text("Retry")',
    'button:has-text("Reintentar")',
    # Attribute-based match that does not depend on the UI language.
    '[data-e2e="retry-button"]',
]
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
@@ -602,7 +611,7 @@ def make_rich(content):
|
|||||||
return text_builder
|
return text_builder
|
||||||
|
|
||||||
|
|
||||||
# --- TikTok Scraping ---
|
# --- Banner helpers ---
|
||||||
def _dismiss_banners(page):
|
def _dismiss_banners(page):
|
||||||
"""
|
"""
|
||||||
Dismiss all TikTok banners in the correct order:
|
Dismiss all TikTok banners in the correct order:
|
||||||
@@ -645,18 +654,39 @@ def _dismiss_banners(page):
|
|||||||
return any_dismissed
|
return any_dismissed
|
||||||
|
|
||||||
|
|
||||||
|
def _click_retry_button(page):
    """
    Click the "Actualizar" / "Refresh" button that TikTok shows inside
    the video grid when it renders an error state.

    Each selector in RETRY_BUTTON_SELECTORS is tried in order; the first
    one whose element is visible gets clicked. After a successful click the
    function sleeps TIKTOK_PAGE_LOAD_WAIT_S seconds before returning.

    This is a best-effort helper: a failure on one selector (invalid
    selector, detached element, click error) is logged at debug level and
    the next selector is tried. It never raises to the caller.

    Args:
        page: Playwright page object exposing ``locator(selector)``.

    Returns:
        True if a retry button was clicked, False otherwise.
    """
    for selector in RETRY_BUTTON_SELECTORS:
        try:
            btn = page.locator(selector).first
            # NOTE(review): Playwright documents the `timeout` argument of
            # Locator.is_visible() as deprecated and ignored (the check
            # returns immediately). Kept as-is to preserve current timing;
            # confirm whether an actual wait was intended here.
            if not btn.is_visible(timeout=2000):
                continue
            btn.click()
        except Exception as exc:
            # Keep going with the remaining selectors, but leave a trace
            # instead of silently discarding the failure.
            logging.debug(f"Retry button selector {selector!r} failed: {exc}")
            continue

        logging.info(f"🔁 Clicked grid retry button: {selector}")
        time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
        return True

    return False
|
||||||
|
|
||||||
|
|
||||||
|
# --- TikTok Scraping ---
|
||||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
||||||
"""
|
"""
|
||||||
Scrape recent TikTok videos from a public profile using Playwright.
|
Scrape recent TikTok videos from a public profile using Playwright.
|
||||||
No login required for public profiles.
|
No login required for public profiles.
|
||||||
|
|
||||||
Banner-handling strategy (fixes applied):
|
Strategy:
|
||||||
1. Dismiss top RGPD info banner ("Entendido") first.
|
1. Navigate to profile and wait for page to settle.
|
||||||
2. Dismiss cookie consent modal ("Permitir todas" / etc.) second.
|
2. Dismiss top RGPD banner ("Entendido") + cookie modal ("Permitir todas").
|
||||||
3. Reload the page after all banners are dismissed so TikTok
|
3. Reload page so TikTok renders the grid cleanly (no "Hubo un problema").
|
||||||
renders the video grid cleanly (avoids "Hubo un problema").
|
4. Dismiss any banners that reappear after reload.
|
||||||
4. playwright-stealth applied before navigation when available.
|
5. Wait for video grid selector (30 s, soft-fail).
|
||||||
5. Broader grid selector list + 30 s timeout + soft-fail on timeout.
|
6. Click "Actualizar" retry button if TikTok shows grid error state.
|
||||||
|
7. Scroll to load more videos.
|
||||||
|
8. Collect all a[href*="/video/"] links.
|
||||||
"""
|
"""
|
||||||
tiktoks = []
|
tiktoks = []
|
||||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||||
@@ -740,43 +770,55 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
|||||||
""")
|
""")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# ── Initial navigation ───────────────────────────────────────
|
# ── 1. Initial navigation ────────────────────────────────────
|
||||||
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
||||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── Dismiss all banners ──────────────────────────────────────
|
# ── 2. Dismiss banners (first pass) ──────────────────────────
|
||||||
_dismiss_banners(page)
|
_dismiss_banners(page)
|
||||||
|
|
||||||
# ── Reload for a clean grid render ───────────────────────────
|
# ── 3. Reload for clean grid render ──────────────────────────
|
||||||
# TikTok renders "Hubo un problema" when the page first loaded
|
# TikTok serves "Hubo un problema" when the page first loaded
|
||||||
# with banners present. A fresh reload after banner dismissal
|
# while banners were blocking. A reload after dismissal gives
|
||||||
# gives TikTok a clean state and the grid renders correctly.
|
# TikTok a clean cookie state so the grid renders correctly.
|
||||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
|
|
||||||
# ── Dismiss any banners that reappear after reload ───────────
|
# ── 4. Dismiss banners (second pass, post-reload) ────────────
|
||||||
_dismiss_banners(page)
|
_dismiss_banners(page)
|
||||||
|
|
||||||
# ── Wait for video grid ──────────────────────────────────────
|
# ── 5. Wait for video grid ───────────────────────────────────
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||||
logging.info("✅ TikTok video grid detected.")
|
logging.info("✅ TikTok video grid detected.")
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"⚠️ Grid selector timed out after 30s — "
|
"⚠️ Grid selector timed out after 30s — continuing anyway."
|
||||||
"attempting scroll anyway (grid may be partially loaded)"
|
|
||||||
)
|
)
|
||||||
take_error_screenshot(page, "tiktok_grid_timeout")
|
take_error_screenshot(page, "tiktok_grid_timeout")
|
||||||
|
|
||||||
# ── Scroll to load more videos ───────────────────────────────
|
# ── 6. Click "Actualizar" if grid shows error state ──────────
|
||||||
|
# Even when the grid DOM node exists, TikTok may render an
|
||||||
|
# error card inside it. Clicking the retry button triggers a
|
||||||
|
# client-side reload of the video feed without a full page
|
||||||
|
# reload, which often resolves the empty grid.
|
||||||
|
if _click_retry_button(page):
|
||||||
|
logging.info("⏳ Waiting for grid to reload after retry click...")
|
||||||
|
try:
|
||||||
|
page.wait_for_selector(GRID_SELECTORS, timeout=15000)
|
||||||
|
logging.info("✅ Grid reloaded after retry.")
|
||||||
|
except Exception:
|
||||||
|
logging.warning("⚠️ Grid still not visible after retry click.")
|
||||||
|
|
||||||
|
# ── 7. Scroll to load more videos ────────────────────────────
|
||||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||||
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
||||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||||
|
|
||||||
# ── Collect video links ──────────────────────────────────────
|
# ── 8. Collect video links ───────────────────────────────────
|
||||||
video_links = page.locator('a[href*="/video/"]').all()
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
logging.info(
|
logging.info(
|
||||||
f"📊 Found {len(video_links)} video links. "
|
f"📊 Found {len(video_links)} video links. "
|
||||||
@@ -970,9 +1012,9 @@ def _probe_video_duration(file_path):
|
|||||||
|
|
||||||
|
|
||||||
def download_and_crop_video(video_url: str, output_path: str):
|
def download_and_crop_video(video_url: str, output_path: str):
|
||||||
temp_input = output_path.replace(".mp4", "_source.mp4")
|
temp_input = output_path.replace(".mp4", "_source.mp4")
|
||||||
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
||||||
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"⬇️ Downloading TikTok video: {video_url}")
|
logging.info(f"⬇️ Downloading TikTok video: {video_url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user