Tiktok example 4
This commit is contained in:
135
tiktok2bsky.py
135
tiktok2bsky.py
@@ -63,18 +63,28 @@ FFPROBE_TIMEOUT_SECONDS = 15
|
||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||
SESSION_FILE_PERMISSIONS = 0o600
|
||||
|
||||
TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # increased from 3.0
|
||||
TIKTOK_PAGE_LOAD_WAIT_S = 5.0
|
||||
TIKTOK_SCROLL_PAUSE_S = 2.5
|
||||
TIKTOK_MAX_SCROLLS = 8 # increased from 5
|
||||
TIKTOK_BANNER_WAIT_S = 3.0 # wait after dismissing cookie banner
|
||||
TIKTOK_MAX_SCROLLS = 8
|
||||
TIKTOK_BANNER_WAIT_S = 3.0
|
||||
|
||||
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||
ORPHAN_DIGIT_MAX_DIGITS = 3
|
||||
|
||||
# --- Cookie banner selectors (Spanish + English) ---
|
||||
# --- Top info/RGPD banner selectors (dismissed first) ---
|
||||
TOP_BANNER_SELECTORS = [
|
||||
'button:has-text("Entendido")',
|
||||
'button:has-text("Got it")',
|
||||
'button:has-text("Understood")',
|
||||
'[data-e2e="top-banner-close"]',
|
||||
'[class*="BannerContainer"] button',
|
||||
'[class*="DivBannerContainer"] button',
|
||||
]
|
||||
|
||||
# --- Cookie consent banner selectors (dismissed second) ---
|
||||
GDPR_SELECTORS = [
|
||||
'button:has-text("Permitir todas")', # ← exact text shown on screen
|
||||
'button:has-text("Permitir todas")',
|
||||
'button:has-text("Rechazar cookies opcionales")',
|
||||
'button:has-text("Entendido")',
|
||||
'button:has-text("Aceptar todo")',
|
||||
@@ -136,7 +146,6 @@ class ScrapedMedia:
|
||||
|
||||
|
||||
class ScrapedTikTok:
|
||||
"""Mirrors ScrapedTweet from twitter2bsky.py."""
|
||||
def __init__(self, created_on, text, video_url, post_url=None, thumbnail_url=None):
|
||||
self.created_on = created_on
|
||||
self.text = text
|
||||
@@ -562,7 +571,6 @@ def build_dynamic_alt(text):
|
||||
|
||||
|
||||
def make_rich(content):
|
||||
"""Build a Bluesky TextBuilder with hashtag and URL facets."""
|
||||
text_builder = client_utils.TextBuilder()
|
||||
content = clean_post_text(content)
|
||||
lines = content.splitlines()
|
||||
@@ -595,25 +603,65 @@ def make_rich(content):
|
||||
|
||||
|
||||
# --- TikTok Scraping ---
|
||||
def _click_first_visible(page, selectors, label, timeout_ms, pause_s):
    """
    Click the first selector in ``selectors`` whose first match is visible.

    Selectors are tried in order. A selector that raises (invalid selector,
    detached element, failed click, ...) is skipped so that a single bad
    entry cannot abort banner handling; the failure is logged at DEBUG
    level instead of being silently discarded.

    Args:
        page: Object exposing ``locator(selector)`` (a Playwright page).
        selectors: Selector strings, tried in order.
        label: Banner name interpolated into the success log message.
        timeout_ms: Value forwarded to ``is_visible(timeout=...)``.
        pause_s: Seconds to sleep after a successful click.

    Returns:
        True if a button was clicked, False if no selector matched.
    """
    for selector in selectors:
        try:
            btn = page.locator(selector).first
            # NOTE(review): current Playwright documents the ``timeout``
            # argument of ``Locator.is_visible`` as ignored (the check
            # returns immediately) — confirm against the pinned version
            # before relying on this as a wait.
            if not btn.is_visible(timeout=timeout_ms):
                continue
            btn.click()
        except Exception as exc:
            # Best-effort: move on to the next selector, but leave a trace.
            logging.debug(f"Skipping {label} selector {selector!r}: {exc}")
            continue

        logging.info(f"✅ Dismissed {label}: {selector}")
        time.sleep(pause_s)
        return True

    return False


def _dismiss_banners(page):
    """
    Dismiss all TikTok banners in the correct order:
      1. Top RGPD/info banner ("Entendido")
      2. Cookie consent modal ("Permitir todas" / "Accept all" / etc.)

    At most one button is clicked per step (the first visible match from
    TOP_BANNER_SELECTORS, then from GDPR_SELECTORS). The cookie step always
    runs, whether or not a top banner was found.

    Args:
        page: Playwright page currently showing a TikTok profile.

    Returns:
        True if at least one banner was dismissed, False otherwise.
    """
    # Step 1: top RGPD info banner — short fixed pause after the click.
    top_dismissed = _click_first_visible(
        page,
        TOP_BANNER_SELECTORS,
        "top banner",
        timeout_ms=2000,
        pause_s=1,
    )

    # Step 2: cookie consent modal — sleeps TIKTOK_BANNER_WAIT_S after the click.
    cookie_dismissed = _click_first_visible(
        page,
        GDPR_SELECTORS,
        "cookie banner",
        timeout_ms=3000,
        pause_s=TIKTOK_BANNER_WAIT_S,
    )

    any_dismissed = top_dismissed or cookie_dismissed
    if not any_dismissed:
        logging.info("ℹ️ No banners found — continuing.")

    return any_dismissed
|
||||
|
||||
|
||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") -> list:
|
||||
"""
|
||||
Scrape recent TikTok videos from a public profile using Playwright.
|
||||
No login required for public profiles.
|
||||
|
||||
Fixes applied:
|
||||
1. Aggressive GDPR/cookie banner dismissal — Spanish + English,
|
||||
waits TIKTOK_BANNER_WAIT_S after click for grid to render.
|
||||
2. Stealth headers: Windows Chrome UA, Europe/Madrid timezone,
|
||||
es-ES locale, sec-ch-ua headers, navigator.webdriver hidden.
|
||||
3. playwright-stealth applied before navigation (graceful fallback
|
||||
if not installed).
|
||||
4. Broader grid selector list + 30s timeout + continues with scroll
|
||||
even if selector times out instead of hard-failing.
|
||||
Banner-handling strategy (fixes applied):
|
||||
1. Dismiss top RGPD info banner ("Entendido") first.
|
||||
2. Dismiss cookie consent modal ("Permitir todas" / etc.) second.
|
||||
3. Reload the page after all banners are dismissed so TikTok
|
||||
renders the video grid cleanly (avoids "Hubo un problema").
|
||||
4. playwright-stealth applied before navigation when available.
|
||||
5. Broader grid selector list + 30 s timeout + soft-fail on timeout.
|
||||
"""
|
||||
tiktoks = []
|
||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||
|
||||
# FIX 3 — playwright-stealth (optional but strongly recommended)
|
||||
# playwright-stealth — optional but strongly recommended
|
||||
try:
|
||||
from playwright_stealth import stealth_sync
|
||||
USE_STEALTH = True
|
||||
@@ -638,7 +686,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
],
|
||||
)
|
||||
|
||||
# FIX 2 — Fake a real Windows Chrome with Spanish locale + Madrid timezone
|
||||
context = browser.new_context(
|
||||
user_agent=(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
@@ -668,12 +715,10 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
|
||||
page = context.new_page()
|
||||
|
||||
# FIX 3 — Apply playwright-stealth before any navigation
|
||||
if USE_STEALTH:
|
||||
stealth_sync(page)
|
||||
logging.info("🥷 Stealth patches applied.")
|
||||
|
||||
# FIX 2 — Hide webdriver flag + fake plugins/languages
|
||||
page.add_init_script("""
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
@@ -695,30 +740,26 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
""")
|
||||
|
||||
try:
|
||||
# ── Initial navigation ───────────────────────────────────────
|
||||
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||
|
||||
# FIX 1 — Wait for page to settle before looking for banner
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# FIX 1 — Dismiss cookie/consent banner BEFORE waiting for grid
|
||||
banner_dismissed = False
|
||||
for selector in GDPR_SELECTORS:
|
||||
try:
|
||||
btn = page.locator(selector).first
|
||||
if btn.is_visible(timeout=3000):
|
||||
btn.click()
|
||||
logging.info(f"✅ Dismissed cookie banner: {selector}")
|
||||
time.sleep(TIKTOK_BANNER_WAIT_S) # wait for grid to render
|
||||
banner_dismissed = True
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
# ── Dismiss all banners ──────────────────────────────────────
|
||||
_dismiss_banners(page)
|
||||
|
||||
if not banner_dismissed:
|
||||
logging.info("ℹ️ No cookie banner found — continuing.")
|
||||
# ── Reload for a clean grid render ───────────────────────────
|
||||
# TikTok renders "Hubo un problema" when the page first loaded
|
||||
# with banners present. A fresh reload after banner dismissal
|
||||
# gives TikTok a clean state and the grid renders correctly.
|
||||
logging.info("🔄 Reloading page after banner dismissal for clean grid render...")
|
||||
page.reload(wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# FIX 4 — Broader selector + longer timeout (30s) + soft fail
|
||||
# ── Dismiss any banners that reappear after reload ───────────
|
||||
_dismiss_banners(page)
|
||||
|
||||
# ── Wait for video grid ──────────────────────────────────────
|
||||
try:
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||
logging.info("✅ TikTok video grid detected.")
|
||||
@@ -729,13 +770,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
)
|
||||
take_error_screenshot(page, "tiktok_grid_timeout")
|
||||
|
||||
# Scroll to load more videos
|
||||
# ── Scroll to load more videos ───────────────────────────────
|
||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||
page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
|
||||
time.sleep(TIKTOK_SCROLL_PAUSE_S)
|
||||
logging.info(f"📜 Scroll {scroll_i + 1}/{TIKTOK_MAX_SCROLLS}")
|
||||
|
||||
# Collect video links
|
||||
# ── Collect video links ──────────────────────────────────────
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(
|
||||
f"📊 Found {len(video_links)} video links. "
|
||||
@@ -772,7 +813,7 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
continue
|
||||
seen_urls.add(canonical)
|
||||
|
||||
# Try to get caption from the card
|
||||
# Caption
|
||||
caption = ""
|
||||
try:
|
||||
card = link.locator("..").first
|
||||
@@ -795,11 +836,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "es-ES") ->
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
created_on = arrow.utcnow().isoformat()
|
||||
|
||||
tiktoks.append(
|
||||
ScrapedTikTok(
|
||||
created_on=created_on,
|
||||
created_on=arrow.utcnow().isoformat(),
|
||||
text=caption,
|
||||
video_url=canonical,
|
||||
post_url=canonical,
|
||||
@@ -849,12 +888,10 @@ def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = No
|
||||
|
||||
if ".m4s" in url_l or "/aud/" in url_l or "mp4a" in url_l:
|
||||
return
|
||||
|
||||
if ".m3u8" in url_l or "mpegurl" in content_type:
|
||||
if best_m3u8_url is None:
|
||||
best_m3u8_url = url
|
||||
return
|
||||
|
||||
if ".mp4" in url_l or "video/mp4" in content_type:
|
||||
if best_mp4_url is None:
|
||||
best_mp4_url = url
|
||||
@@ -933,9 +970,9 @@ def _probe_video_duration(file_path):
|
||||
|
||||
|
||||
def download_and_crop_video(video_url: str, output_path: str):
|
||||
temp_input = output_path.replace(".mp4", "_source.mp4")
|
||||
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
||||
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
||||
temp_input = output_path.replace(".mp4", "_source.mp4")
|
||||
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
||||
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
||||
|
||||
try:
|
||||
logging.info(f"⬇️ Downloading TikTok video: {video_url}")
|
||||
|
||||
Reference in New Issue
Block a user