Added all
This commit is contained in:
212
tiktok2bsky.py
212
tiktok2bsky.py
@@ -724,6 +724,161 @@ def _take_debug_screenshot(page, label: str):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Selectors tried when dismissing TikTok's GDPR / privacy notice.
# The first three match the acknowledge button by its visible label
# (one Spanish and two English variants); the last two match by CSS class
# fragment.
TIKTOK_GDPR_SELS = [
    *(
        f'button:has-text("{label}")'
        for label in ("Entendido", "Understood", "Got it")
    ),
    '[class*="gdpr"] button',
    '[class*="privacy"] button:has-text("Entendido")',
]
|
||||||
|
|
||||||
|
|
||||||
|
def _dismiss_all_overlays(page):
    """Dismiss GDPR notices, cookie banners and any other modals.

    Walks every selector in ``TIKTOK_GDPR_SELS``, ``TIKTOK_COOKIE_MODAL_SELS``
    and ``TIKTOK_BANNER_SELS`` (in that order) and clicks the first element
    matching each selector when it reports itself as visible.

    The sweep is best-effort: a failure on one selector is logged at debug
    level and never aborts the remaining selectors or reaches the caller.

    Args:
        page: Object exposing ``locator(selector)``; presumably a Playwright
            sync ``Page`` — confirm against the caller.
    """
    selectors = TIKTOK_GDPR_SELS + TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS
    for sel in selectors:
        try:
            el = page.locator(sel).first
            # NOTE(review): Playwright documents the ``timeout`` argument of
            # Locator.is_visible() as ignored (the call returns immediately);
            # confirm whether a real wait was intended here.
            if not el.is_visible(timeout=1500):
                continue
            el.click(timeout=2000)
        except Exception as exc:
            # Deliberately non-fatal, but leave a trace so a broken selector
            # or an intercepted click can be diagnosed from debug logs.
            logging.debug("Overlay selector %r skipped: %s", sel, exc)
            continue

        logging.info(f"🚫 Dismissed overlay: {sel}")
        # Short pause after a successful click before probing the next selector.
        time.sleep(0.6)
|
||||||
|
|
||||||
|
|
||||||
|
def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
    """
    Repeatedly click the Actualizar / Refresh button until the grid shows up.

    Each round:
      1. clicks the first element matching ``TIKTOK_REFRESH_BTN_SEL``
         (a failed click is ignored),
      2. sleeps ``4 * round`` seconds, so the wait grows every round,
      3. dismisses overlays via ``_dismiss_all_overlays``,
      4. waits up to 6000 ms for ``TIKTOK_VIDEO_GRID_SEL``.

    Returns True as soon as the grid selector is found, False once
    ``max_attempts`` rounds have passed without it.
    """
    for round_no in range(1, max_attempts + 1):
        pause_seconds = 4.0 * round_no
        logging.info(
            "🔄 Grid error detected — clicking Actualizar "
            f"(attempt {round_no}/{max_attempts}, waiting {pause_seconds:.0f}s)..."
        )

        try:
            refresh_button = page.locator(TIKTOK_REFRESH_BTN_SEL).first
            refresh_button.click(timeout=3000)
        except Exception:
            # A missing or unclickable button is tolerated; the wait and the
            # grid check below still run for this round.
            pass

        time.sleep(pause_seconds)
        _dismiss_all_overlays(page)

        try:
            page.wait_for_selector(TIKTOK_VIDEO_GRID_SEL, timeout=6000)
        except Exception:
            # Grid not there yet — move on to the next (longer) round.
            continue

        logging.info("✅ Video grid appeared after refresh.")
        return True

    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_via_api(handle: str, cookies: list) -> list:
    """
    Fallback: hit TikTok's internal item_list API directly with httpx.

    Only a single page (cursor 0, ``SCRAPE_VIDEO_LIMIT`` items) is requested.

    Args:
        handle: TikTok username without the leading "@"; it is interpolated
            as ``@{handle}`` into the Referer and the video URLs.
        cookies: Cookie dicts with ``name`` and ``value`` keys. ``None`` or an
            empty list is accepted and yields an empty Cookie header.

    Returns:
        A list of dicts with the keys ``id``, ``url``, ``desc``,
        ``timestamp`` and ``video_url`` — the same format as the Playwright
        scraper. The list is empty when the user ID cannot be resolved, the
        request fails, or the response carries no usable items.
    """
    logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")

    # Build a cookie header string from the injected cookies.
    # `cookies` may be falsy (the Playwright caller guards it with
    # `if cookies:`), so never iterate it directly.
    cookie_header = "; ".join(
        f"{c.get('name', '')}={c.get('value', '')}"
        for c in (cookies or [])
        if c.get("name") and c.get("value")
    )

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Referer": f"https://www.tiktok.com/@{handle}",
        "Cookie": cookie_header,
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "es-ES,es;q=0.9",
    }

    # Resolve the numeric user ID from the profile page HTML.
    user_id = _resolve_tiktok_user_id(handle, headers)
    if not user_id:
        logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.")
        return []

    videos = []
    cursor = 0  # first page only; no pagination is performed

    try:
        params = {
            "aid": "1988",
            "app_name": "tiktok_web",
            "count": str(SCRAPE_VIDEO_LIMIT),
            "cursor": str(cursor),
            "secUid": "",
            "id": user_id,
            "type": "1",
            "sourceType": "8",
            "appId": "1233",
            "region": "ES",
            "priority_region": "ES",
            "language": "es",
        }
        resp = httpx.get(
            "https://www.tiktok.com/api/post/item_list/",
            params=params,
            headers=headers,
            timeout=20,
            follow_redirects=True,
        )
        resp.raise_for_status()
        data = resp.json()

        # `"itemList": null` must read as "no videos", not as a failure.
        for item in data.get("itemList") or []:
            try:
                vid_id = str(item.get("id") or "")
                if not vid_id:
                    # Without an ID the video URL would be a dead
                    # ".../video/" link, so the entry is useless downstream.
                    logging.warning("⚠️ API item parse error: missing video id")
                    continue
                url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
                videos.append({
                    "id": vid_id,
                    "url": url,
                    "desc": item.get("desc") or "",
                    # NOTE(review): this is the scrape time, not the video's
                    # publish time — confirm that is what consumers expect.
                    "timestamp": arrow.utcnow().isoformat(),
                    "video_url": url,
                })
            except Exception as e:
                # One malformed item must not discard the rest of the page.
                logging.warning(f"⚠️ API item parse error: {e}")

        logging.info(f"✅ API fallback returned {len(videos)} videos.")

    except Exception as e:
        # Network errors, non-2xx statuses and non-JSON bodies all end here;
        # whatever was collected so far (normally nothing) is returned.
        logging.warning(f"⚠️ TikTok API fallback failed: {e}")

    return videos
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
    """Extract the numeric TikTok user ID from the profile page HTML.

    Downloads ``https://www.tiktok.com/@{handle}`` and searches the raw HTML
    with three patterns, most specific first:

    1. a ``"uniqueId"`` equal to ``handle`` (case-insensitive) immediately
       followed by ``"id"``,
    2. any ``"uniqueId"`` immediately followed by ``"id"``,
    3. any ``"authorId"``.

    Args:
        handle: TikTok username without the leading "@".
        headers: HTTP headers sent with the profile request.

    Returns:
        The user ID as a string of digits, or ``None`` when the request fails
        or no pattern matches.
    """
    try:
        resp = httpx.get(
            f"https://www.tiktok.com/@{handle}",
            headers=headers,
            timeout=15,
            follow_redirects=True,
        )
        html = resp.text

        # TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__
        # script tag. The page can embed more than one user object, so the
        # pair belonging to the requested handle is tried before the generic
        # "first pair in the document" patterns.
        patterns = (
            re.compile(
                rf'"uniqueId"\s*:\s*"{re.escape(handle)}"\s*,\s*"id"\s*:\s*"(\d+)"',
                re.IGNORECASE,
            ),
            re.compile(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"'),
            re.compile(r'"authorId"\s*:\s*"(\d+)"'),
        )
        for pattern in patterns:
            match = pattern.search(html)
            if match:
                uid = match.group(1)
                logging.info(f"✅ Resolved TikTok user ID: {uid}")
                return uid
    except Exception as e:
        logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}")
    return None
|
||||||
|
|
||||||
|
|
||||||
def scrape_tiktoks_via_playwright(handle: str) -> list:
|
def scrape_tiktoks_via_playwright(handle: str) -> list:
|
||||||
"""
|
"""
|
||||||
@@ -760,7 +915,6 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
timezone_id="Europe/Madrid",
|
timezone_id="Europe/Madrid",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Inject saved cookies
|
|
||||||
if cookies:
|
if cookies:
|
||||||
inject_cookies_into_context(context, cookies)
|
inject_cookies_into_context(context, cookies)
|
||||||
|
|
||||||
@@ -772,7 +926,6 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
else:
|
else:
|
||||||
stealth_sync(page)
|
stealth_sync(page)
|
||||||
|
|
||||||
# Mask automation signals
|
|
||||||
page.add_init_script("""
|
page.add_init_script("""
|
||||||
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
|
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
|
||||||
window.chrome = { runtime: {} };
|
window.chrome = { runtime: {} };
|
||||||
@@ -780,7 +933,8 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
Object.defineProperty(navigator, 'languages', {get: () => ['es-ES', 'es', 'en']});
|
Object.defineProperty(navigator, 'languages', {get: () => ['es-ES', 'es', 'en']});
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# ── Multi-attempt page load ────────────────────────────────────
|
grid_loaded = False
|
||||||
|
|
||||||
for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1):
|
for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1):
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..."
|
f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..."
|
||||||
@@ -800,28 +954,36 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
break
|
break
|
||||||
|
|
||||||
time.sleep(random.uniform(2.5, 4.0))
|
time.sleep(random.uniform(2.5, 4.0))
|
||||||
_dismiss_overlays(page)
|
|
||||||
|
# ── Dismiss ALL overlays including GDPR ────────────────────
|
||||||
|
_dismiss_all_overlays(page)
|
||||||
time.sleep(1.5)
|
time.sleep(1.5)
|
||||||
|
|
||||||
# Check for grid error state
|
# ── Check for grid error and retry with Actualizar ─────────
|
||||||
try:
|
try:
|
||||||
if page.locator(TIKTOK_GRID_ERROR_SEL).is_visible(timeout=2000):
|
if page.locator(TIKTOK_GRID_ERROR_SEL).is_visible(timeout=2000):
|
||||||
logging.warning("⚠️ Grid error state detected. Clicking Refresh...")
|
if _try_refresh_grid(page, max_attempts=4):
|
||||||
try:
|
grid_loaded = True
|
||||||
page.locator(TIKTOK_REFRESH_BTN_SEL).first.click(timeout=3000)
|
break
|
||||||
|
# Grid still broken — try a full page reload
|
||||||
|
logging.warning(
|
||||||
|
"⚠️ Grid still broken after Actualizar retries. "
|
||||||
|
"Reloading page..."
|
||||||
|
)
|
||||||
|
if attempt < PLAYWRIGHT_MAX_RELOADS:
|
||||||
time.sleep(3.0)
|
time.sleep(3.0)
|
||||||
except Exception:
|
continue
|
||||||
pass
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Wait for video grid
|
# ── Wait for video grid normally ───────────────────────────
|
||||||
try:
|
try:
|
||||||
page.wait_for_selector(
|
page.wait_for_selector(
|
||||||
TIKTOK_VIDEO_GRID_SEL,
|
TIKTOK_VIDEO_GRID_SEL,
|
||||||
timeout=PLAYWRIGHT_TIMEOUT_MS,
|
timeout=PLAYWRIGHT_TIMEOUT_MS,
|
||||||
)
|
)
|
||||||
logging.info("✅ Video grid found.")
|
logging.info("✅ Video grid found.")
|
||||||
|
grid_loaded = True
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
@@ -830,11 +992,16 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
_take_debug_screenshot(page, f"no_grid_{attempt}")
|
_take_debug_screenshot(page, f"no_grid_{attempt}")
|
||||||
if attempt < PLAYWRIGHT_MAX_RELOADS:
|
if attempt < PLAYWRIGHT_MAX_RELOADS:
|
||||||
time.sleep(3.0)
|
time.sleep(3.0)
|
||||||
else:
|
|
||||||
logging.error("❌ Video grid never loaded after all attempts.")
|
if not grid_loaded:
|
||||||
_take_debug_screenshot(page, "final_fail")
|
logging.warning(
|
||||||
|
"⚠️ Playwright grid scraping failed. "
|
||||||
|
"Trying API fallback..."
|
||||||
|
)
|
||||||
|
_take_debug_screenshot(page, "playwright_failed")
|
||||||
browser.close()
|
browser.close()
|
||||||
return []
|
# ── API fallback ───────────────────────────────────────────
|
||||||
|
return _scrape_via_api(handle, cookies)
|
||||||
|
|
||||||
# ── Scroll to load more videos ─────────────────────────────────
|
# ── Scroll to load more videos ─────────────────────────────────
|
||||||
logging.info("📜 Scrolling to load videos...")
|
logging.info("📜 Scrolling to load videos...")
|
||||||
@@ -848,23 +1015,19 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
|
|
||||||
for item in items[:SCRAPE_VIDEO_LIMIT]:
|
for item in items[:SCRAPE_VIDEO_LIMIT]:
|
||||||
try:
|
try:
|
||||||
# Get the link
|
|
||||||
link_el = item.locator("a").first
|
link_el = item.locator("a").first
|
||||||
href = link_el.get_attribute("href") or ""
|
href = link_el.get_attribute("href") or ""
|
||||||
if not href or "/video/" not in href:
|
if not href or "/video/" not in href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Normalise URL
|
|
||||||
if href.startswith("/"):
|
if href.startswith("/"):
|
||||||
href = "https://www.tiktok.com" + href
|
href = "https://www.tiktok.com" + href
|
||||||
|
|
||||||
# Extract video ID
|
|
||||||
vid_match = re.search(r"/video/(\d+)", href)
|
vid_match = re.search(r"/video/(\d+)", href)
|
||||||
if not vid_match:
|
if not vid_match:
|
||||||
continue
|
continue
|
||||||
video_id = vid_match.group(1)
|
video_id = vid_match.group(1)
|
||||||
|
|
||||||
# Get description (best-effort)
|
|
||||||
desc = ""
|
desc = ""
|
||||||
try:
|
try:
|
||||||
desc = item.get_attribute("aria-label") or ""
|
desc = item.get_attribute("aria-label") or ""
|
||||||
@@ -881,7 +1044,7 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
"url": href,
|
"url": href,
|
||||||
"desc": desc,
|
"desc": desc,
|
||||||
"timestamp": arrow.utcnow().isoformat(),
|
"timestamp": arrow.utcnow().isoformat(),
|
||||||
"video_url": href, # resolved later during download
|
"video_url": href,
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -890,10 +1053,15 @@ def scrape_tiktoks_via_playwright(handle: str) -> list:
|
|||||||
|
|
||||||
browser.close()
|
browser.close()
|
||||||
|
|
||||||
|
# ── If Playwright found nothing, try API fallback ──────────────────
|
||||||
|
if not videos:
|
||||||
|
logging.warning(
|
||||||
|
"⚠️ Playwright returned 0 videos. Trying API fallback..."
|
||||||
|
)
|
||||||
|
return _scrape_via_api(handle, cookies)
|
||||||
|
|
||||||
logging.info(f"✅ Scraped {len(videos)} videos from @{handle}.")
|
logging.info(f"✅ Scraped {len(videos)} videos from @{handle}.")
|
||||||
return videos
|
return videos
|
||||||
|
|
||||||
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
# Core: process a single TikTok video → post to Bluesky
|
# Core: process a single TikTok video → post to Bluesky
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user