TikTok example 2

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 09:29:47 +02:00
parent 7fef3f3ab8
commit ef32a15cbc
2 changed files with 110 additions and 32 deletions

View File

@@ -37,6 +37,7 @@ pipeline {
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \ "${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \
atproto \ atproto \
playwright \ playwright \
playwright-stealth \
httpx \ httpx \
arrow \ arrow \
python-dotenv \ python-dotenv \

View File

@@ -24,12 +24,16 @@ import grapheme
# --- Configuration --- # --- Configuration ---
LOG_PATH = "tiktok2bsky.log" LOG_PATH = "tiktok2bsky.log"
STATE_PATH = "tiktok2bsky_state.json" STATE_PATH = "tiktok2bsky_state.json"
SCRAPE_VIDEO_LIMIT = 15 # TikTok loads fewer items per scroll than Twitter
DEDUPE_BSKY_LIMIT = 30 DEDUPE_BSKY_LIMIT = 30
VIDEO_MAX_AGE_DAYS = 3 VIDEO_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 300 BSKY_TEXT_MAX_LENGTH = 300
DEFAULT_BSKY_LANGS = ["ca"] DEFAULT_BSKY_LANGS = ["ca"]
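TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # was 3.0 — increased for slower grid render. NOTE(review): the unchanged `TIKTOK_PAGE_LOAD_WAIT_S = 3.0` assignment further down this module runs later and overrides this value — remove that older line for the increase to take effect.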
TIKTOK_MAX_SCROLLS = 8 # was 5 — more scrolls = more videos discovered
SCRAPE_VIDEO_LIMIT = 30 # was 15
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -64,7 +68,6 @@ DEFAULT_BSKY_BASE_URL = "https://bsky.social"
SESSION_FILE_PERMISSIONS = 0o600 SESSION_FILE_PERMISSIONS = 0o600
TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load
TIKTOK_MAX_SCROLLS = 5 # how many times to scroll down the profile
TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads
DYNAMIC_ALT_MAX_LENGTH = 150 DYNAMIC_ALT_MAX_LENGTH = 150
TRUNCATE_MIN_PREFIX_CHARS = 20 TRUNCATE_MIN_PREFIX_CHARS = 20
@@ -531,16 +534,32 @@ def make_rich(content):
return text_builder return text_builder
# --- TikTok Scraping ---
# --- TikTok Scraping --- # --- TikTok Scraping ---
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list: def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list:
""" """
Scrape recent TikTok videos from a public profile using Playwright. Scrape recent TikTok videos from a public profile using Playwright.
No login required for public profiles. No login required for public profiles.
Returns a list of ScrapedTikTok objects. Returns a list of ScrapedTikTok objects.
Fixes applied:
1. Aggressive GDPR/consent banner dismissal (Spanish + English)
2. Stealth headers: timezone, locale, sec-ch-ua, webdriver flag hidden
3. playwright-stealth applied before navigation
4. Broader + longer grid selector wait (30s, more selectors)
""" """
tiktoks = [] tiktoks = []
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
# playwright-stealth is optional but strongly recommended
try:
from playwright_stealth import stealth_sync
USE_STEALTH = True
logging.info("🥷 playwright-stealth available — stealth mode ON")
except ImportError:
USE_STEALTH = False
logging.warning("⚠️ playwright-stealth not installed — running without stealth")
with sync_playwright() as p: with sync_playwright() as p:
browser = p.chromium.launch( browser = p.chromium.launch(
headless=True, headless=True,
@@ -548,59 +567,112 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
"--disable-blink-features=AutomationControlled", "--disable-blink-features=AutomationControlled",
"--no-sandbox", "--no-sandbox",
"--disable-setuid-sandbox", "--disable-setuid-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--window-size=1366,768",
], ],
) )
# FIX 2 — Fake a real Windows Chrome browser with Spanish locale + Madrid timezone
context = browser.new_context( context = browser.new_context(
user_agent=( user_agent=(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) " "AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/145.0.7632.6 Safari/537.36" "Chrome/124.0.0.0 Safari/537.36"
), ),
viewport={"width": 1920, "height": 1080}, viewport={"width": 1366, "height": 768},
locale=locale, locale="es-ES",
# TikTok checks these headers — set them explicitly timezone_id="Europe/Madrid",
extra_http_headers={ extra_http_headers={
"Accept-Language": f"{locale},en;q=0.9", "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
"Accept": (
"text/html,application/xhtml+xml,application/xml;"
"q=0.9,image/avif,image/webp,*/*;q=0.8"
),
"Sec-Fetch-Dest": "document", "Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate", "Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none", "Sec-Fetch-Site": "none",
"Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
}, },
) )
page = context.new_page() page = context.new_page()
# FIX 3 — Apply playwright-stealth before any navigation
if USE_STEALTH:
stealth_sync(page)
logging.info("🥷 Stealth patches applied.")
# FIX 2 — Hide webdriver flag + fake plugins/languages via init script
page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', {
get: () => [
{ name: 'Chrome PDF Plugin' },
{ name: 'Chrome PDF Viewer' },
{ name: 'Native Client' }
]
});
Object.defineProperty(navigator, 'languages', {
get: () => ['es-ES', 'es', 'en']
});
window.chrome = {
runtime: {},
loadTimes: function() {},
csi: function() {},
app: {}
};
""")
try: try:
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
# Dismiss cookie/consent banners if present # FIX 1 — Wait longer for initial page render (was 3.0s)
for selector in [ time.sleep(TIKTOK_PAGE_LOAD_WAIT_S + 2)
# FIX 1 — Aggressive GDPR/consent banner dismissal (Spanish + English)
GDPR_SELECTORS = [
'button:has-text("Entendido")',
'button:has-text("Aceptar todo")',
'button:has-text("Accept all")', 'button:has-text("Accept all")',
'button:has-text("Got it")',
'button:has-text("Decline optional")', 'button:has-text("Decline optional")',
'[data-e2e="cookie-banner-accept"]', '[data-e2e="cookie-banner-accept"]',
]: '[id*="accept"]',
'[class*="accept-btn"]',
]
for selector in GDPR_SELECTORS:
try: try:
btn = page.locator(selector).first btn = page.locator(selector).first
if btn.is_visible(timeout=2000): if btn.is_visible(timeout=3000):
btn.click() btn.click()
time.sleep(1) logging.info(f"✅ Dismissed banner: {selector}")
time.sleep(2)
break break
except Exception: except Exception:
pass pass
# Wait for video grid to appear # FIX 4 — Broader selector list + longer timeout (30s, was 20s)
GRID_SELECTORS = (
'[data-e2e="user-post-item"], '
'[class*="DivItemContainerV2"], '
'a[href*="/video/"], '
'[class*="video-feed"], '
'div[class*="VideoFeed"], '
'[class*="DivVideoFeedV2"]'
)
try: try:
page.wait_for_selector( page.wait_for_selector(GRID_SELECTORS, timeout=30000)
'[data-e2e="user-post-item"], ' logging.info("✅ TikTok video grid detected.")
'[class*="DivItemContainerV2"], '
'a[href*="/video/"]',
timeout=20000,
)
except Exception: except Exception:
take_error_screenshot(page, "tiktok_profile_load_failed") # FIX 4 — Don't give up immediately: try scrolling anyway
logging.error("❌ TikTok video grid did not appear.") logging.warning(
browser.close() "⚠️ Grid selector timed out — attempting scroll anyway "
return [] "(grid may still be partially loaded)"
)
# Scroll to load more videos # Scroll to load more videos
for scroll_i in range(TIKTOK_MAX_SCROLLS): for scroll_i in range(TIKTOK_MAX_SCROLLS):
@@ -610,7 +682,16 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
# Collect video links # Collect video links
video_links = page.locator('a[href*="/video/"]').all() video_links = page.locator('a[href*="/video/"]').all()
logging.info(f"📊 Found {len(video_links)} video links. Parsing up to {SCRAPE_VIDEO_LIMIT}...") logging.info(
f"📊 Found {len(video_links)} video links. "
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
)
if not video_links:
take_error_screenshot(page, "tiktok_no_video_links")
logging.error("❌ No video links found after scroll. TikTok may be blocking.")
browser.close()
return []
seen_urls = set() seen_urls = set()
for link in video_links: for link in video_links:
@@ -633,10 +714,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
continue continue
seen_urls.add(canonical) seen_urls.add(canonical)
# Try to get caption from the card itself (avoids opening each video) # Try to get caption from the card itself
caption = "" caption = ""
try: try:
# The caption is often in a sibling/child element
card = link.locator("..").first card = link.locator("..").first
caption_el = card.locator( caption_el = card.locator(
'[data-e2e="video-desc"], ' '[data-e2e="video-desc"], '
@@ -657,15 +737,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
except Exception: except Exception:
pass pass
# TikTok doesn't expose post timestamps in the grid —
# use now as a conservative estimate; dedup prevents re-posting
created_on = arrow.utcnow().isoformat() created_on = arrow.utcnow().isoformat()
tiktoks.append( tiktoks.append(
ScrapedTikTok( ScrapedTikTok(
created_on=created_on, created_on=created_on,
text=caption, text=caption,
video_url=canonical, # placeholder; real URL resolved later video_url=canonical,
post_url=canonical, post_url=canonical,
thumbnail_url=thumbnail_url, thumbnail_url=thumbnail_url,
) )
@@ -685,7 +763,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.") logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
return tiktoks return tiktoks
# --- Video extraction --- # --- Video extraction ---
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None: def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None:
""" """