TikTok example 2
This commit is contained in:
141
tiktok2bsky.py
141
tiktok2bsky.py
@@ -24,12 +24,16 @@ import grapheme
|
||||
# --- Configuration ---
|
||||
LOG_PATH = "tiktok2bsky.log"
|
||||
STATE_PATH = "tiktok2bsky_state.json"
|
||||
SCRAPE_VIDEO_LIMIT = 15 # TikTok loads fewer items per scroll than Twitter
|
||||
DEDUPE_BSKY_LIMIT = 30
|
||||
VIDEO_MAX_AGE_DAYS = 3
|
||||
BSKY_TEXT_MAX_LENGTH = 300
|
||||
DEFAULT_BSKY_LANGS = ["ca"]
|
||||
|
||||
TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # was 3.0 — increased for slower grid render
|
||||
TIKTOK_MAX_SCROLLS = 8 # was 5 — more scrolls = more videos discovered
|
||||
SCRAPE_VIDEO_LIMIT = 30 # was 15
|
||||
|
||||
|
||||
VIDEO_MAX_DURATION_SECONDS = 179
|
||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||
|
||||
@@ -64,7 +68,6 @@ DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||
SESSION_FILE_PERMISSIONS = 0o600
|
||||
|
||||
TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load
|
||||
TIKTOK_MAX_SCROLLS = 5 # how many times to scroll down the profile
|
||||
TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads
|
||||
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||
@@ -531,16 +534,32 @@ def make_rich(content):
|
||||
return text_builder
|
||||
|
||||
|
||||
# --- TikTok Scraping ---
|
||||
# --- TikTok Scraping ---
|
||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list:
|
||||
"""
|
||||
Scrape recent TikTok videos from a public profile using Playwright.
|
||||
No login required for public profiles.
|
||||
Returns a list of ScrapedTikTok objects.
|
||||
|
||||
Fixes applied:
|
||||
1. Aggressive GDPR/consent banner dismissal (Spanish + English)
|
||||
2. Stealth headers: timezone, locale, sec-ch-ua, webdriver flag hidden
|
||||
3. playwright-stealth applied before navigation
|
||||
4. Broader + longer grid selector wait (30s, more selectors)
|
||||
"""
|
||||
tiktoks = []
|
||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||
|
||||
# playwright-stealth is optional but strongly recommended
|
||||
try:
|
||||
from playwright_stealth import stealth_sync
|
||||
USE_STEALTH = True
|
||||
logging.info("🥷 playwright-stealth available — stealth mode ON")
|
||||
except ImportError:
|
||||
USE_STEALTH = False
|
||||
logging.warning("⚠️ playwright-stealth not installed — running without stealth")
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(
|
||||
headless=True,
|
||||
@@ -548,59 +567,112 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-gpu",
|
||||
"--window-size=1366,768",
|
||||
],
|
||||
)
|
||||
|
||||
# FIX 2 — Fake a real Windows Chrome browser with Spanish locale + Madrid timezone
|
||||
context = browser.new_context(
|
||||
user_agent=(
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/145.0.7632.6 Safari/537.36"
|
||||
"Chrome/124.0.0.0 Safari/537.36"
|
||||
),
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
locale=locale,
|
||||
# TikTok checks these headers — set them explicitly
|
||||
viewport={"width": 1366, "height": 768},
|
||||
locale="es-ES",
|
||||
timezone_id="Europe/Madrid",
|
||||
extra_http_headers={
|
||||
"Accept-Language": f"{locale},en;q=0.9",
|
||||
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
|
||||
"Accept": (
|
||||
"text/html,application/xhtml+xml,application/xml;"
|
||||
"q=0.9,image/avif,image/webp,*/*;q=0.8"
|
||||
),
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
|
||||
"Sec-Ch-Ua-Mobile": "?0",
|
||||
"Sec-Ch-Ua-Platform": '"Windows"',
|
||||
},
|
||||
)
|
||||
|
||||
page = context.new_page()
|
||||
|
||||
# FIX 3 — Apply playwright-stealth before any navigation
|
||||
if USE_STEALTH:
|
||||
stealth_sync(page)
|
||||
logging.info("🥷 Stealth patches applied.")
|
||||
|
||||
# FIX 2 — Hide webdriver flag + fake plugins/languages via init script
|
||||
page.add_init_script("""
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [
|
||||
{ name: 'Chrome PDF Plugin' },
|
||||
{ name: 'Chrome PDF Viewer' },
|
||||
{ name: 'Native Client' }
|
||||
]
|
||||
});
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['es-ES', 'es', 'en']
|
||||
});
|
||||
window.chrome = {
|
||||
runtime: {},
|
||||
loadTimes: function() {},
|
||||
csi: function() {},
|
||||
app: {}
|
||||
};
|
||||
""")
|
||||
|
||||
try:
|
||||
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
||||
|
||||
# Dismiss cookie/consent banners if present
|
||||
for selector in [
|
||||
# FIX 1 — Wait longer for initial page render (was 3.0s)
|
||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S + 2)
|
||||
|
||||
# FIX 1 — Aggressive GDPR/consent banner dismissal (Spanish + English)
|
||||
GDPR_SELECTORS = [
|
||||
'button:has-text("Entendido")',
|
||||
'button:has-text("Aceptar todo")',
|
||||
'button:has-text("Accept all")',
|
||||
'button:has-text("Got it")',
|
||||
'button:has-text("Decline optional")',
|
||||
'[data-e2e="cookie-banner-accept"]',
|
||||
]:
|
||||
'[id*="accept"]',
|
||||
'[class*="accept-btn"]',
|
||||
]
|
||||
for selector in GDPR_SELECTORS:
|
||||
try:
|
||||
btn = page.locator(selector).first
|
||||
if btn.is_visible(timeout=2000):
|
||||
if btn.is_visible(timeout=3000):
|
||||
btn.click()
|
||||
time.sleep(1)
|
||||
logging.info(f"✅ Dismissed banner: {selector}")
|
||||
time.sleep(2)
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Wait for video grid to appear
|
||||
# FIX 4 — Broader selector list + longer timeout (30s, was 20s)
|
||||
GRID_SELECTORS = (
|
||||
'[data-e2e="user-post-item"], '
|
||||
'[class*="DivItemContainerV2"], '
|
||||
'a[href*="/video/"], '
|
||||
'[class*="video-feed"], '
|
||||
'div[class*="VideoFeed"], '
|
||||
'[class*="DivVideoFeedV2"]'
|
||||
)
|
||||
try:
|
||||
page.wait_for_selector(
|
||||
'[data-e2e="user-post-item"], '
|
||||
'[class*="DivItemContainerV2"], '
|
||||
'a[href*="/video/"]',
|
||||
timeout=20000,
|
||||
)
|
||||
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||
logging.info("✅ TikTok video grid detected.")
|
||||
except Exception:
|
||||
take_error_screenshot(page, "tiktok_profile_load_failed")
|
||||
logging.error("❌ TikTok video grid did not appear.")
|
||||
browser.close()
|
||||
return []
|
||||
# FIX 4 — Don't give up immediately: try scrolling anyway
|
||||
logging.warning(
|
||||
"⚠️ Grid selector timed out — attempting scroll anyway "
|
||||
"(grid may still be partially loaded)"
|
||||
)
|
||||
|
||||
# Scroll to load more videos
|
||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||
@@ -610,7 +682,16 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
||||
|
||||
# Collect video links
|
||||
video_links = page.locator('a[href*="/video/"]').all()
|
||||
logging.info(f"📊 Found {len(video_links)} video links. Parsing up to {SCRAPE_VIDEO_LIMIT}...")
|
||||
logging.info(
|
||||
f"📊 Found {len(video_links)} video links. "
|
||||
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
|
||||
)
|
||||
|
||||
if not video_links:
|
||||
take_error_screenshot(page, "tiktok_no_video_links")
|
||||
logging.error("❌ No video links found after scroll. TikTok may be blocking.")
|
||||
browser.close()
|
||||
return []
|
||||
|
||||
seen_urls = set()
|
||||
for link in video_links:
|
||||
@@ -633,10 +714,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
||||
continue
|
||||
seen_urls.add(canonical)
|
||||
|
||||
# Try to get caption from the card itself (avoids opening each video)
|
||||
# Try to get caption from the card itself
|
||||
caption = ""
|
||||
try:
|
||||
# The caption is often in a sibling/child element
|
||||
card = link.locator("..").first
|
||||
caption_el = card.locator(
|
||||
'[data-e2e="video-desc"], '
|
||||
@@ -657,15 +737,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# TikTok doesn't expose post timestamps in the grid —
|
||||
# use now as a conservative estimate; dedup prevents re-posting
|
||||
created_on = arrow.utcnow().isoformat()
|
||||
|
||||
tiktoks.append(
|
||||
ScrapedTikTok(
|
||||
created_on=created_on,
|
||||
text=caption,
|
||||
video_url=canonical, # placeholder; real URL resolved later
|
||||
video_url=canonical,
|
||||
post_url=canonical,
|
||||
thumbnail_url=thumbnail_url,
|
||||
)
|
||||
@@ -685,7 +763,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
||||
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
||||
return tiktoks
|
||||
|
||||
|
||||
# --- Video extraction ---
|
||||
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user