Tiktok example 2

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 09:29:47 +02:00
parent 7fef3f3ab8
commit ef32a15cbc
2 changed files with 110 additions and 32 deletions

View File

@@ -37,6 +37,7 @@ pipeline {
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \
atproto \
playwright \
playwright-stealth \
httpx \
arrow \
python-dotenv \

View File

@@ -24,12 +24,16 @@ import grapheme
# --- Configuration ---
# Each constant is assigned exactly once in this block. A stale earlier
# `SCRAPE_VIDEO_LIMIT = 15` assignment was removed: it was dead code, because
# the `= 30` assignment below it always overwrote it.
LOG_PATH = "tiktok2bsky.log"
STATE_PATH = "tiktok2bsky_state.json"
DEDUPE_BSKY_LIMIT = 30
VIDEO_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 300
DEFAULT_BSKY_LANGS = ["ca"]
TIKTOK_PAGE_LOAD_WAIT_S = 5.0  # was 3.0 — increased for slower grid render
TIKTOK_MAX_SCROLLS = 8  # was 5 — more scrolls = more videos discovered
SCRAPE_VIDEO_LIMIT = 30  # was 15
VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -64,7 +68,6 @@ DEFAULT_BSKY_BASE_URL = "https://bsky.social"
SESSION_FILE_PERMISSIONS = 0o600
TIKTOK_SCROLL_PAUSE_S = 2.5  # pause between scrolls to let videos load
# NOTE: TIKTOK_MAX_SCROLLS and TIKTOK_PAGE_LOAD_WAIT_S are deliberately NOT
# assigned here. They are set in the configuration block near the top of the
# file; a second assignment at this point runs later at import time and would
# silently override those values.
DYNAMIC_ALT_MAX_LENGTH = 150
TRUNCATE_MIN_PREFIX_CHARS = 20
@@ -531,16 +534,32 @@ def make_rich(content):
return text_builder
# --- TikTok Scraping ---
# --- TikTok Scraping ---
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list:
"""
Scrape recent TikTok videos from a public profile using Playwright.
No login required for public profiles.
Returns a list of ScrapedTikTok objects.
Fixes applied:
1. Aggressive GDPR/consent banner dismissal (Spanish + English)
2. Stealth headers: timezone, locale, sec-ch-ua, webdriver flag hidden
3. playwright-stealth applied before navigation
4. Broader + longer grid selector wait (30s, more selectors)
"""
tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
# playwright-stealth is optional but strongly recommended
try:
from playwright_stealth import stealth_sync
USE_STEALTH = True
logging.info("🥷 playwright-stealth available — stealth mode ON")
except ImportError:
USE_STEALTH = False
logging.warning("⚠️ playwright-stealth not installed — running without stealth")
with sync_playwright() as p:
browser = p.chromium.launch(
headless=True,
@@ -548,59 +567,112 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
"--disable-blink-features=AutomationControlled",
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--window-size=1366,768",
],
)
# FIX 2 — Fake a real Windows Chrome browser with Spanish locale + Madrid timezone
context = browser.new_context(
user_agent=(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/145.0.7632.6 Safari/537.36"
"Chrome/124.0.0.0 Safari/537.36"
),
viewport={"width": 1920, "height": 1080},
locale=locale,
# TikTok checks these headers — set them explicitly
viewport={"width": 1366, "height": 768},
locale="es-ES",
timezone_id="Europe/Madrid",
extra_http_headers={
"Accept-Language": f"{locale},en;q=0.9",
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
"Accept": (
"text/html,application/xhtml+xml,application/xml;"
"q=0.9,image/avif,image/webp,*/*;q=0.8"
),
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
},
)
page = context.new_page()
# FIX 3 — Apply playwright-stealth before any navigation
if USE_STEALTH:
stealth_sync(page)
logging.info("🥷 Stealth patches applied.")
# FIX 2 — Hide webdriver flag + fake plugins/languages via init script
page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', {
get: () => [
{ name: 'Chrome PDF Plugin' },
{ name: 'Chrome PDF Viewer' },
{ name: 'Native Client' }
]
});
Object.defineProperty(navigator, 'languages', {
get: () => ['es-ES', 'es', 'en']
});
window.chrome = {
runtime: {},
loadTimes: function() {},
csi: function() {},
app: {}
};
""")
try:
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# Dismiss cookie/consent banners if present
for selector in [
# Wait longer for the initial page render: TIKTOK_PAGE_LOAD_WAIT_S plus a 2 s margin
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S + 2)
# FIX 1 — Aggressive GDPR/consent banner dismissal (Spanish + English)
GDPR_SELECTORS = [
'button:has-text("Entendido")',
'button:has-text("Aceptar todo")',
'button:has-text("Accept all")',
'button:has-text("Got it")',
'button:has-text("Decline optional")',
'[data-e2e="cookie-banner-accept"]',
]:
'[id*="accept"]',
'[class*="accept-btn"]',
]
for selector in GDPR_SELECTORS:
try:
btn = page.locator(selector).first
if btn.is_visible(timeout=2000):
if btn.is_visible(timeout=3000):
btn.click()
time.sleep(1)
logging.info(f"✅ Dismissed banner: {selector}")
time.sleep(2)
break
except Exception:
pass
# Wait for video grid to appear
# FIX 4 — Broader selector list + longer timeout (30s, was 20s)
GRID_SELECTORS = (
'[data-e2e="user-post-item"], '
'[class*="DivItemContainerV2"], '
'a[href*="/video/"], '
'[class*="video-feed"], '
'div[class*="VideoFeed"], '
'[class*="DivVideoFeedV2"]'
)
try:
page.wait_for_selector(
'[data-e2e="user-post-item"], '
'[class*="DivItemContainerV2"], '
'a[href*="/video/"]',
timeout=20000,
)
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
logging.info("✅ TikTok video grid detected.")
except Exception:
take_error_screenshot(page, "tiktok_profile_load_failed")
logging.error("❌ TikTok video grid did not appear.")
browser.close()
return []
# FIX 4 — Don't give up immediately: try scrolling anyway
logging.warning(
"⚠️ Grid selector timed out — attempting scroll anyway "
"(grid may still be partially loaded)"
)
# Scroll to load more videos
for scroll_i in range(TIKTOK_MAX_SCROLLS):
@@ -610,7 +682,16 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
# Collect video links
video_links = page.locator('a[href*="/video/"]').all()
logging.info(f"📊 Found {len(video_links)} video links. Parsing up to {SCRAPE_VIDEO_LIMIT}...")
logging.info(
f"📊 Found {len(video_links)} video links. "
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
)
if not video_links:
take_error_screenshot(page, "tiktok_no_video_links")
logging.error("❌ No video links found after scroll. TikTok may be blocking.")
browser.close()
return []
seen_urls = set()
for link in video_links:
@@ -633,10 +714,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
continue
seen_urls.add(canonical)
# Try to get caption from the card itself (avoids opening each video)
# Try to get caption from the card itself
caption = ""
try:
# The caption is often in a sibling/child element
card = link.locator("..").first
caption_el = card.locator(
'[data-e2e="video-desc"], '
@@ -657,15 +737,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
except Exception:
pass
# TikTok doesn't expose post timestamps in the grid —
# use now as a conservative estimate; dedup prevents re-posting
created_on = arrow.utcnow().isoformat()
tiktoks.append(
ScrapedTikTok(
created_on=created_on,
text=caption,
video_url=canonical, # placeholder; real URL resolved later
video_url=canonical,
post_url=canonical,
thumbnail_url=thumbnail_url,
)
@@ -685,7 +763,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
return tiktoks
# --- Video extraction ---
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None:
"""