Tiktok example 2
This commit is contained in:
@@ -37,6 +37,7 @@ pipeline {
|
|||||||
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \
|
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U \
|
||||||
atproto \
|
atproto \
|
||||||
playwright \
|
playwright \
|
||||||
|
playwright-stealth \
|
||||||
httpx \
|
httpx \
|
||||||
arrow \
|
arrow \
|
||||||
python-dotenv \
|
python-dotenv \
|
||||||
|
|||||||
137
tiktok2bsky.py
137
tiktok2bsky.py
@@ -24,12 +24,16 @@ import grapheme
|
|||||||
# --- Configuration ---
|
# --- Configuration ---
|
||||||
LOG_PATH = "tiktok2bsky.log"
|
LOG_PATH = "tiktok2bsky.log"
|
||||||
STATE_PATH = "tiktok2bsky_state.json"
|
STATE_PATH = "tiktok2bsky_state.json"
|
||||||
SCRAPE_VIDEO_LIMIT = 15 # TikTok loads fewer items per scroll than Twitter
|
|
||||||
DEDUPE_BSKY_LIMIT = 30
|
DEDUPE_BSKY_LIMIT = 30
|
||||||
VIDEO_MAX_AGE_DAYS = 3
|
VIDEO_MAX_AGE_DAYS = 3
|
||||||
BSKY_TEXT_MAX_LENGTH = 300
|
BSKY_TEXT_MAX_LENGTH = 300
|
||||||
DEFAULT_BSKY_LANGS = ["ca"]
|
DEFAULT_BSKY_LANGS = ["ca"]
|
||||||
|
|
||||||
|
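TIKTOK_PAGE_LOAD_WAIT_S = 5.0 # was 3.0 — increased for slower grid render. NOTE(review): the unchanged `TIKTOK_PAGE_LOAD_WAIT_S = 3.0` assignment further down the file still runs after this one and overrides it; remove that later line for 5.0 to take effect.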
|
||||||
|
TIKTOK_MAX_SCROLLS = 8 # was 5 — more scrolls = more videos discovered
|
||||||
|
SCRAPE_VIDEO_LIMIT = 30 # was 15
|
||||||
|
|
||||||
|
|
||||||
VIDEO_MAX_DURATION_SECONDS = 179
|
VIDEO_MAX_DURATION_SECONDS = 179
|
||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
@@ -64,7 +68,6 @@ DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
|||||||
SESSION_FILE_PERMISSIONS = 0o600
|
SESSION_FILE_PERMISSIONS = 0o600
|
||||||
|
|
||||||
TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load
|
TIKTOK_SCROLL_PAUSE_S = 2.5 # pause between scrolls to let videos load
|
||||||
TIKTOK_MAX_SCROLLS = 5 # how many times to scroll down the profile
|
|
||||||
TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads
|
TIKTOK_PAGE_LOAD_WAIT_S = 3.0 # initial wait after profile page loads
|
||||||
DYNAMIC_ALT_MAX_LENGTH = 150
|
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||||
TRUNCATE_MIN_PREFIX_CHARS = 20
|
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||||
@@ -531,16 +534,32 @@ def make_rich(content):
|
|||||||
return text_builder
|
return text_builder
|
||||||
|
|
||||||
|
|
||||||
|
# --- TikTok Scraping ---
|
||||||
# --- TikTok Scraping ---
|
# --- TikTok Scraping ---
|
||||||
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list:
|
def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") -> list:
|
||||||
"""
|
"""
|
||||||
Scrape recent TikTok videos from a public profile using Playwright.
|
Scrape recent TikTok videos from a public profile using Playwright.
|
||||||
No login required for public profiles.
|
No login required for public profiles.
|
||||||
Returns a list of ScrapedTikTok objects.
|
Returns a list of ScrapedTikTok objects.
|
||||||
|
|
||||||
|
Fixes applied:
|
||||||
|
1. Aggressive GDPR/consent banner dismissal (Spanish + English)
|
||||||
|
2. Stealth headers: timezone, locale, sec-ch-ua, webdriver flag hidden (the browser context hard-codes es-ES / Europe/Madrid instead of using the `locale` argument)
|
||||||
|
3. playwright-stealth applied before navigation
|
||||||
|
4. Broader + longer grid selector wait (30s, more selectors)
|
||||||
"""
|
"""
|
||||||
tiktoks = []
|
tiktoks = []
|
||||||
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}"
|
||||||
|
|
||||||
|
# playwright-stealth is optional but strongly recommended
|
||||||
|
try:
|
||||||
|
from playwright_stealth import stealth_sync
|
||||||
|
USE_STEALTH = True
|
||||||
|
logging.info("🥷 playwright-stealth available — stealth mode ON")
|
||||||
|
except ImportError:
|
||||||
|
USE_STEALTH = False
|
||||||
|
logging.warning("⚠️ playwright-stealth not installed — running without stealth")
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(
|
browser = p.chromium.launch(
|
||||||
headless=True,
|
headless=True,
|
||||||
@@ -548,59 +567,112 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
|||||||
"--disable-blink-features=AutomationControlled",
|
"--disable-blink-features=AutomationControlled",
|
||||||
"--no-sandbox",
|
"--no-sandbox",
|
||||||
"--disable-setuid-sandbox",
|
"--disable-setuid-sandbox",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-gpu",
|
||||||
|
"--window-size=1366,768",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# FIX 2 — Fake a real Windows Chrome browser with Spanish locale + Madrid timezone
|
||||||
context = browser.new_context(
|
context = browser.new_context(
|
||||||
user_agent=(
|
user_agent=(
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
"Chrome/145.0.7632.6 Safari/537.36"
|
"Chrome/124.0.0.0 Safari/537.36"
|
||||||
),
|
),
|
||||||
viewport={"width": 1920, "height": 1080},
|
viewport={"width": 1366, "height": 768},
|
||||||
locale=locale,
|
locale="es-ES",
|
||||||
# TikTok checks these headers — set them explicitly
|
timezone_id="Europe/Madrid",
|
||||||
extra_http_headers={
|
extra_http_headers={
|
||||||
"Accept-Language": f"{locale},en;q=0.9",
|
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
|
||||||
|
"Accept": (
|
||||||
|
"text/html,application/xhtml+xml,application/xml;"
|
||||||
|
"q=0.9,image/avif,image/webp,*/*;q=0.8"
|
||||||
|
),
|
||||||
"Sec-Fetch-Dest": "document",
|
"Sec-Fetch-Dest": "document",
|
||||||
"Sec-Fetch-Mode": "navigate",
|
"Sec-Fetch-Mode": "navigate",
|
||||||
"Sec-Fetch-Site": "none",
|
"Sec-Fetch-Site": "none",
|
||||||
|
"Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
|
||||||
|
"Sec-Ch-Ua-Mobile": "?0",
|
||||||
|
"Sec-Ch-Ua-Platform": '"Windows"',
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
|
# FIX 3 — Apply playwright-stealth before any navigation
|
||||||
|
if USE_STEALTH:
|
||||||
|
stealth_sync(page)
|
||||||
|
logging.info("🥷 Stealth patches applied.")
|
||||||
|
|
||||||
|
# FIX 2 — Hide webdriver flag + fake plugins/languages via init script
|
||||||
|
page.add_init_script("""
|
||||||
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
|
Object.defineProperty(navigator, 'plugins', {
|
||||||
|
get: () => [
|
||||||
|
{ name: 'Chrome PDF Plugin' },
|
||||||
|
{ name: 'Chrome PDF Viewer' },
|
||||||
|
{ name: 'Native Client' }
|
||||||
|
]
|
||||||
|
});
|
||||||
|
Object.defineProperty(navigator, 'languages', {
|
||||||
|
get: () => ['es-ES', 'es', 'en']
|
||||||
|
});
|
||||||
|
window.chrome = {
|
||||||
|
runtime: {},
|
||||||
|
loadTimes: function() {},
|
||||||
|
csi: function() {},
|
||||||
|
app: {}
|
||||||
|
};
|
||||||
|
""")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
logging.info(f"🌐 Navigating to TikTok profile: {profile_url}")
|
||||||
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
page.goto(profile_url, wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S)
|
|
||||||
|
|
||||||
# Dismiss cookie/consent banners if present
|
# Wait longer for the initial page render: TIKTOK_PAGE_LOAD_WAIT_S plus 2 extra seconds (previously just TIKTOK_PAGE_LOAD_WAIT_S)
|
||||||
for selector in [
|
time.sleep(TIKTOK_PAGE_LOAD_WAIT_S + 2)
|
||||||
|
|
||||||
|
# FIX 1 — Aggressive GDPR/consent banner dismissal (Spanish + English)
|
||||||
|
GDPR_SELECTORS = [
|
||||||
|
'button:has-text("Entendido")',
|
||||||
|
'button:has-text("Aceptar todo")',
|
||||||
'button:has-text("Accept all")',
|
'button:has-text("Accept all")',
|
||||||
|
'button:has-text("Got it")',
|
||||||
'button:has-text("Decline optional")',
|
'button:has-text("Decline optional")',
|
||||||
'[data-e2e="cookie-banner-accept"]',
|
'[data-e2e="cookie-banner-accept"]',
|
||||||
]:
|
'[id*="accept"]',
|
||||||
|
'[class*="accept-btn"]',
|
||||||
|
]
|
||||||
|
for selector in GDPR_SELECTORS:
|
||||||
try:
|
try:
|
||||||
btn = page.locator(selector).first
|
btn = page.locator(selector).first
|
||||||
if btn.is_visible(timeout=2000):
|
if btn.is_visible(timeout=3000):
|
||||||
btn.click()
|
btn.click()
|
||||||
time.sleep(1)
|
logging.info(f"✅ Dismissed banner: {selector}")
|
||||||
|
time.sleep(2)
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Wait for video grid to appear
|
# FIX 4 — Broader selector list + longer timeout (30s, was 20s)
|
||||||
try:
|
GRID_SELECTORS = (
|
||||||
page.wait_for_selector(
|
|
||||||
'[data-e2e="user-post-item"], '
|
'[data-e2e="user-post-item"], '
|
||||||
'[class*="DivItemContainerV2"], '
|
'[class*="DivItemContainerV2"], '
|
||||||
'a[href*="/video/"]',
|
'a[href*="/video/"], '
|
||||||
timeout=20000,
|
'[class*="video-feed"], '
|
||||||
|
'div[class*="VideoFeed"], '
|
||||||
|
'[class*="DivVideoFeedV2"]'
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
page.wait_for_selector(GRID_SELECTORS, timeout=30000)
|
||||||
|
logging.info("✅ TikTok video grid detected.")
|
||||||
except Exception:
|
except Exception:
|
||||||
take_error_screenshot(page, "tiktok_profile_load_failed")
|
# FIX 4 — Don't give up immediately: try scrolling anyway
|
||||||
logging.error("❌ TikTok video grid did not appear.")
|
logging.warning(
|
||||||
browser.close()
|
"⚠️ Grid selector timed out — attempting scroll anyway "
|
||||||
return []
|
"(grid may still be partially loaded)"
|
||||||
|
)
|
||||||
|
|
||||||
# Scroll to load more videos
|
# Scroll to load more videos
|
||||||
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
for scroll_i in range(TIKTOK_MAX_SCROLLS):
|
||||||
@@ -610,7 +682,16 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
|||||||
|
|
||||||
# Collect video links
|
# Collect video links
|
||||||
video_links = page.locator('a[href*="/video/"]').all()
|
video_links = page.locator('a[href*="/video/"]').all()
|
||||||
logging.info(f"📊 Found {len(video_links)} video links. Parsing up to {SCRAPE_VIDEO_LIMIT}...")
|
logging.info(
|
||||||
|
f"📊 Found {len(video_links)} video links. "
|
||||||
|
f"Parsing up to {SCRAPE_VIDEO_LIMIT}..."
|
||||||
|
)
|
||||||
|
|
||||||
|
if not video_links:
|
||||||
|
take_error_screenshot(page, "tiktok_no_video_links")
|
||||||
|
logging.error("❌ No video links found after scroll. TikTok may be blocking.")
|
||||||
|
browser.close()
|
||||||
|
return []
|
||||||
|
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
for link in video_links:
|
for link in video_links:
|
||||||
@@ -633,10 +714,9 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
|||||||
continue
|
continue
|
||||||
seen_urls.add(canonical)
|
seen_urls.add(canonical)
|
||||||
|
|
||||||
# Try to get caption from the card itself (avoids opening each video)
|
# Try to get caption from the card itself
|
||||||
caption = ""
|
caption = ""
|
||||||
try:
|
try:
|
||||||
# The caption is often in a sibling/child element
|
|
||||||
card = link.locator("..").first
|
card = link.locator("..").first
|
||||||
caption_el = card.locator(
|
caption_el = card.locator(
|
||||||
'[data-e2e="video-desc"], '
|
'[data-e2e="video-desc"], '
|
||||||
@@ -657,15 +737,13 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# TikTok doesn't expose post timestamps in the grid —
|
|
||||||
# use now as a conservative estimate; dedup prevents re-posting
|
|
||||||
created_on = arrow.utcnow().isoformat()
|
created_on = arrow.utcnow().isoformat()
|
||||||
|
|
||||||
tiktoks.append(
|
tiktoks.append(
|
||||||
ScrapedTikTok(
|
ScrapedTikTok(
|
||||||
created_on=created_on,
|
created_on=created_on,
|
||||||
text=caption,
|
text=caption,
|
||||||
video_url=canonical, # placeholder; real URL resolved later
|
video_url=canonical,
|
||||||
post_url=canonical,
|
post_url=canonical,
|
||||||
thumbnail_url=thumbnail_url,
|
thumbnail_url=thumbnail_url,
|
||||||
)
|
)
|
||||||
@@ -685,7 +763,6 @@ def scrape_tiktoks_via_playwright(target_handle: str, locale: str = "en-US") ->
|
|||||||
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
logging.info(f"✅ Scraped {len(tiktoks)} TikTok videos.")
|
||||||
return tiktoks
|
return tiktoks
|
||||||
|
|
||||||
|
|
||||||
# --- Video extraction ---
|
# --- Video extraction ---
|
||||||
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None:
|
def extract_tiktok_video_url_isolated(browser, post_url: str, video_id: str = None) -> str | None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user