From b9e1eb66adddcea83ae33fe05d2cca096c44d61c Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 11:12:44 +0200 Subject: [PATCH] Cookies 4 --- tiktok2bsky.py | 1620 ++++++++++++++++++++++-------------------------- 1 file changed, 727 insertions(+), 893 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 7e28533..0ef3d1a 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -1,782 +1,229 @@ +#!/usr/bin/env python3 +""" +tiktok2bsky.py +────────────── +Scrapes recent videos from a public TikTok profile and cross-posts +them to a Bluesky account. + +Usage: + python tiktok2bsky.py \ + --tiktok-handle jijantesfc \ + --bsky-handle jijantesfc.eurosky.social \ + --bsky-app-password xxxx-xxxx-xxxx-xxxx \ + --bsky-base-url https://eurosky.social \ + --bsky-langs es \ + --cookies-path tiktok_cookies.json +""" + import argparse -import arrow -import hashlib import json import logging -import re -import time import os -import subprocess -import uuid import random +import re +import subprocess +import sys import tempfile +import time +from datetime import datetime, timezone from pathlib import Path + +import arrow +import httpx +from atproto import Client from dotenv import load_dotenv -from atproto import Client, client_utils, models from playwright.sync_api import sync_playwright -from moviepy import VideoFileClip -import grapheme +from playwright_stealth import stealth_sync -# ───────────────────────────────────────────── -# Configuration -# ───────────────────────────────────────────── -LOG_PATH = "tiktok2bsky.log" -STATE_PATH = "tiktok2bsky_state.json" -TIKTOK_COOKIES_PATH = "tiktok_cookies.json" # ← export from your browser - -SCRAPE_VIDEO_LIMIT = 30 -DEDUPE_BSKY_LIMIT = 30 -VIDEO_MAX_AGE_DAYS = 3 -BSKY_TEXT_MAX_LENGTH = 300 -DEFAULT_BSKY_LANGS = ["es"] - -VIDEO_MAX_DURATION_SECONDS = 179 -MAX_VIDEO_UPLOAD_SIZE_MB = 45 - -BSKY_BLOB_UPLOAD_MAX_RETRIES = 5 -BSKY_BLOB_UPLOAD_BASE_DELAY = 10 -BSKY_BLOB_UPLOAD_MAX_DELAY = 300 -BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3 -BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 - -BSKY_SEND_POST_MAX_RETRIES = 3 -BSKY_SEND_POST_BASE_DELAY = 5 -BSKY_SEND_POST_MAX_DELAY = 60 - -BSKY_LOGIN_MAX_RETRIES = 4 -BSKY_LOGIN_BASE_DELAY = 10 -BSKY_LOGIN_MAX_DELAY = 600 -BSKY_LOGIN_JITTER_MAX = 1.5 - -SUBPROCESS_TIMEOUT_SECONDS = 180 -FFPROBE_TIMEOUT_SECONDS = 15 -DEFAULT_BSKY_BASE_URL = "https://bsky.social" - -TIKTOK_PAGE_LOAD_WAIT_S = 5.0 -TIKTOK_SCROLL_PAUSE_S = 2.5 -TIKTOK_MAX_SCROLLS = 8 -TIKTOK_BANNER_WAIT_S = 3.0 -TIKTOK_MAX_LOAD_ATTEMPTS = 3 - -DYNAMIC_ALT_MAX_LENGTH = 150 -TRUNCATE_MIN_PREFIX_CHARS = 20 - -# ───────────────────────────────────────────── -# Selectors -# ───────────────────────────────────────────── -GDPR_SELECTORS = [ - 'button:has-text("Permitir todas")', - 'button:has-text("Rechazar cookies opcionales")', - 'button:has-text("Entendido")', - 'button:has-text("Aceptar todo")', - 'button:has-text("Accept all")', - 'button:has-text("Got it")', - 'button:has-text("Decline optional")', - '[data-e2e="cookie-banner-accept"]', - '[id*="accept"]', - '[class*="accept-btn"]', -] - -TOP_BANNER_SELECTORS = [ - 'button:has-text("Entendido")', - 'button:has-text("Got it")', - 'button:has-text("Understood")', - '[data-e2e="top-banner-close"]', - '[class*="BannerContainer"] button', - '[class*="DivBannerContainer"] button', -] - -CAPTCHA_SELECTORS = [ - '[class*="captcha"]', - '[id*="captcha"]', - 'div:has-text("Drag the puzzle")', - 'div:has-text("puzzle piece")', - '[class*="secsdk-captcha"]', - '[class*="tiktok-captcha"]', -] - -GRID_SELECTORS = ( - '[data-e2e="user-post-item"], ' - '[class*="DivItemContainerV2"], ' - 'a[href*="/video/"], ' - '[class*="video-feed"], ' - 'div[class*="VideoFeed"], ' - '[class*="DivVideoFeedV2"]' -) - -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────────────────────── # Logging -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────────────────────── logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ - logging.FileHandler(LOG_PATH, encoding="utf-8"), - logging.StreamHandler(), + logging.StreamHandler(sys.stdout), + logging.FileHandler("tiktok2bsky.log", encoding="utf-8"), ], level=logging.INFO, ) -# ───────────────────────────────────────────── -# Data classes -# ───────────────────────────────────────────── -class ScrapedMedia: - def __init__(self, url, media_type="video"): - self.type = media_type - self.media_url_https = url +# ───────────────────────────────────────────────────────────────────────────── +# Constants & defaults +# ───────────────────────────────────────────────────────────────────────────── +DEFAULT_BSKY_BASE_URL = "https://bsky.social" +DEFAULT_BSKY_LANGS = ["es"] +TIKTOK_COOKIES_PATH = "tiktok_cookies.json" +STATE_FILE = "tiktok2bsky_state.json" +STATE_MAX_ENTRIES = 5000 -class ScrapedTikTok: - def __init__(self, created_on, text, video_url, - post_url=None, thumbnail_url=None): - self.created_on = created_on - self.text = text - self.post_url = post_url - self.thumbnail_url = thumbnail_url - self.media = ([ScrapedMedia(video_url, "video")] - if video_url else []) +SCRAPE_VIDEO_LIMIT = 30 +VIDEO_MAX_AGE_DAYS = 3 +VIDEO_MAX_DURATION_S = 179 # Bluesky hard limit is 180s +VIDEO_MAX_SIZE_BYTES = 45 * 1024 * 1024 # 45 MB -# ───────────────────────────────────────────── -# Generic helpers -# ───────────────────────────────────────────── -def sha256_file(path, chunk_size=1024 * 1024): - h = hashlib.sha256() - with open(path, "rb") as f: - while True: - chunk = f.read(chunk_size) - if not chunk: - break - h.update(chunk) - return h.hexdigest() +# Bluesky login retry config +BSKY_LOGIN_MAX_RETRIES = 4 +BSKY_LOGIN_BASE_DELAY = 15.0 +BSKY_LOGIN_MAX_DELAY = 120.0 +BSKY_LOGIN_JITTER_MAX = 10.0 +# Bluesky upload retry config +BSKY_UPLOAD_MAX_RETRIES = 5 +BSKY_UPLOAD_BASE_DELAY = 10.0 +BSKY_UPLOAD_MAX_DELAY = 120.0 +BSKY_UPLOAD_JITTER_MAX = 5.0 -def take_error_screenshot(page, label): - timestamp = time.strftime("%Y%m%d_%H%M%S") - name = f"screenshot_{label}_{timestamp}.png" - try: - page.screenshot(path=name, full_page=True) - logging.info(f"📸 Screenshot saved: {name}") - except Exception as e: - logging.warning(f"⚠️ Could not save screenshot: {e}") +# Playwright scraping config +PLAYWRIGHT_TIMEOUT_MS = 30_000 +PLAYWRIGHT_SLOW_MO = 50 +PLAYWRIGHT_MAX_RELOADS = 3 +# TikTok selectors +TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' +TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' +TIKTOK_BANNER_SELS = [ + '[id*="banner"]', + '[class*="banner"]', + '[data-e2e="recommend-modal-close"]', + 'button:has-text("Rechazar")', + 'button:has-text("Reject")', + 'button:has-text("Accept")', + 'button:has-text("Aceptar")', + '[aria-label="Close"]', + '[aria-label="Cerrar"]', +] +TIKTOK_COOKIE_MODAL_SELS = [ + 'button:has-text("Decline all")', + 'button:has-text("Rechazar todo")', + 'button:has-text("Reject all")', + 'button:has-text("Accept all")', + 'button:has-text("Aceptar todo")', + '[class*="cookie"] button', + '[id*="cookie"] button', +] +TIKTOK_GRID_ERROR_SEL = '[data-e2e="user-post-item-list-error"]' +TIKTOK_REFRESH_BTN_SEL = 'button:has-text("Actualizar"), button:has-text("Refresh")' -def canonicalize_tiktok_url(url): - if not url: - return None - match = re.search( - r"https?://(?:www\.)?tiktok\.com/@([^/]+)/video/(\d+)", - url, re.IGNORECASE, - ) - if match: - return (f"https://www.tiktok.com/@{match.group(1)}" - f"/video/{match.group(2)}") - return url.strip() - - -def load_state(): - if os.path.exists(STATE_PATH): - with open(STATE_PATH, "r", encoding="utf-8") as f: - return json.load(f) - return {"posted_ids": []} - - -def save_state(state): - with open(STATE_PATH, "w", encoding="utf-8") as f: - json.dump(state, f, indent=2, ensure_ascii=False) - - -def tiktok_id_from_url(url): - if not url: - return None - match = re.search(r"/video/(\d+)", url) - return match.group(1) if match else url - - -def truncate_grapheme(text, max_len, suffix="…"): - clusters = list(grapheme.graphemes(text)) - if len(clusters) <= max_len: - return text - keep = max(TRUNCATE_MIN_PREFIX_CHARS, max_len - len(suffix)) - return "".join(clusters[:keep]) + suffix - - -# ───────────────────────────────────────────── -# Cookie helpers (Option 1) -# ───────────────────────────────────────────── -def load_tiktok_cookies() -> list: - """ - Load TikTok session cookies exported from a real browser. - Supports both Netscape/EditThisCookie JSON format and - the simpler list-of-dicts format used by Cookie-Editor. - """ - if not os.path.exists(TIKTOK_COOKIES_PATH): - logging.warning( - f"⚠️ Cookie file not found at '{TIKTOK_COOKIES_PATH}'. " - "Running without session — CAPTCHA risk is higher." - ) - return [] - - with open(TIKTOK_COOKIES_PATH, "r", encoding="utf-8") as f: - raw = json.load(f) - - # Normalise to Playwright format - cookies = [] - for c in raw: - entry = { - "name": c.get("name", ""), - "value": c.get("value", ""), - "domain": c.get("domain", ".tiktok.com"), - "path": c.get("path", "/"), - } - # sameSite must be one of "Strict" | "Lax" | "None" - ss = c.get("sameSite", "None") - entry["sameSite"] = ss if ss in ("Strict", "Lax", "None") else "None" - if "expirationDate" in c: - entry["expires"] = int(c["expirationDate"]) - elif "expires" in c: - entry["expires"] = int(c["expires"]) - cookies.append(entry) - - logging.info(f"🍪 Loaded {len(cookies)} TikTok cookies from {TIKTOK_COOKIES_PATH}") - return cookies - - -def _is_captcha_visible(page) -> bool: - for sel in CAPTCHA_SELECTORS: +# ───────────────────────────────────────────────────────────────────────────── +# State management +# ───────────────────────────────────────────────────────────────────────────── +def load_state() -> dict: + if os.path.exists(STATE_FILE): try: - if page.locator(sel).first.is_visible(timeout=1500): - logging.warning(f"🚧 CAPTCHA detected via selector: {sel}") - return True - except Exception: - pass - return False - - -# ───────────────────────────────────────────── -# yt-dlp scraper (Option 2 — fallback) -# ───────────────────────────────────────────── -def scrape_tiktoks_via_ytdlp(target_handle: str) -> list: - """ - Use yt-dlp as a fallback scraper when Playwright hits a CAPTCHA. - Extracts video URLs from the public TikTok profile without a browser. - Requires: pip install yt-dlp - """ - logging.info(f"🔄 Falling back to yt-dlp scraper for @{target_handle}...") - tiktoks = [] - - try: - import yt_dlp # noqa: F401 — verify it's installed - except ImportError: - logging.error( - "❌ yt-dlp is not installed. Run: pip install yt-dlp\n" - " Cannot scrape without Playwright session or yt-dlp." - ) - return [] - - profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" - - ydl_opts = { - "quiet": True, - "no_warnings": True, - "extract_flat": "in_playlist", # don't download, just list URLs - "playlistend": SCRAPE_VIDEO_LIMIT, - "ignoreerrors": True, - "socket_timeout": 30, - # Pass cookies file if available so yt-dlp also benefits from session - **({"cookiefile": TIKTOK_COOKIES_PATH} - if os.path.exists(TIKTOK_COOKIES_PATH) else {}), - } - - try: - import yt_dlp - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - logging.info(f"🌐 yt-dlp extracting profile: {profile_url}") - info = ydl.extract_info(profile_url, download=False) - - if not info: - logging.error("❌ yt-dlp returned no info for profile.") - return [] - - entries = info.get("entries", []) - if not entries: - logging.warning("⚠️ yt-dlp found no video entries.") - return [] - - seen_urls = set() - for entry in entries: - if not entry: - continue - if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: - break - - url = entry.get("url") or entry.get("webpage_url") or "" - canonical = canonicalize_tiktok_url(url) - if not canonical or canonical in seen_urls: - continue - if "/video/" not in canonical: - continue - seen_urls.add(canonical) - - # yt-dlp gives us rich metadata for free - title = entry.get("title", "") - timestamp = entry.get("timestamp") - thumbnail = entry.get("thumbnail", "") - created = (arrow.Arrow.fromtimestamp(timestamp).isoformat() - if timestamp else arrow.utcnow().isoformat()) - - tiktoks.append(ScrapedTikTok( - created_on = created, - text = title, - video_url = canonical, - post_url = canonical, - thumbnail_url = thumbnail, - )) - logging.info(f"🎵 [yt-dlp] Scraped: {canonical}") - - logging.info(f"✅ yt-dlp scraped {len(tiktoks)} videos.") - - except Exception as e: - logging.error(f"❌ yt-dlp scrape failed: {e}") - - return tiktoks - - -# ───────────────────────────────────────────── -# Playwright scraper (Option 1 — primary) -# ───────────────────────────────────────────── -def _dismiss_banners(page): - for sel in TOP_BANNER_SELECTORS + GDPR_SELECTORS: - try: - btn = page.locator(sel).first - if btn.is_visible(timeout=2000): - btn.click() - logging.info(f"✅ Dismissed banner: {sel}") - time.sleep(1.0) - return - except Exception: - pass - - -def _click_retry_button(page) -> bool: - for label in ("Actualizar", "Refresh", "Retry", "Reintentar"): - try: - btn = page.locator(f'button:has-text("{label}")').first - if btn.is_visible(timeout=1500): - btn.click() - logging.info(f"🔁 Clicked grid retry button: {label}") - time.sleep(2.0) - return True - except Exception: - pass - return False - - -def scrape_tiktoks_via_playwright(target_handle: str) -> list: - """ - Primary scraper: Playwright + session cookies. - Automatically falls back to yt-dlp if a CAPTCHA is detected. - """ - tiktoks = [] - profile_url = f"https://www.tiktok.com/@{target_handle.lstrip('@')}" - cookies = load_tiktok_cookies() - - # ── Stealth: support both playwright-stealth 2.x and 1.x ────────── - try: - from playwright_stealth import Stealth - USE_STEALTH = "v2" - _stealth = Stealth() - logging.info("🥷 playwright-stealth 2.x — stealth ON") - except ImportError: - try: - from playwright_stealth import stealth_sync - USE_STEALTH = "v1" - logging.info("🥷 playwright-stealth 1.x — stealth ON (legacy)") - except ImportError: - USE_STEALTH = False - logging.warning("⚠️ playwright-stealth not installed — no stealth.") - - with sync_playwright() as p: - browser = p.chromium.launch( - headless=True, - args=[ - "--disable-blink-features=AutomationControlled", - "--no-sandbox", - "--disable-setuid-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--window-size=1366,768", - ], - ) - - context = browser.new_context( - user_agent=( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" - ), - viewport={"width": 1366, "height": 768}, - locale="es-ES", - timezone_id="Europe/Madrid", - extra_http_headers={ - "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", - "Accept": ( - "text/html,application/xhtml+xml,application/xml;" - "q=0.9,image/avif,image/webp,*/*;q=0.8" - ), - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Ch-Ua": ( - '"Chromium";v="124","Google Chrome";v="124",' - '"Not-A.Brand";v="99"' - ), - "Sec-Ch-Ua-Mobile": "?0", - "Sec-Ch-Ua-Platform": '"Windows"', - }, - ) - - # ── Inject session cookies BEFORE navigation ─────────────────── - if cookies: - context.add_cookies(cookies) - logging.info(f"🍪 Injected {len(cookies)} session cookies.") - else: - logging.warning( - "⚠️ No cookies loaded. " - f"Create '{TIKTOK_COOKIES_PATH}' to avoid CAPTCHAs." - ) - - page = context.new_page() - - # ── Apply stealth patches ────────────────────────────────────── - if USE_STEALTH == "v2": - _stealth.apply_stealth_sync(page) - logging.info("🥷 Stealth patches applied (2.x).") - elif USE_STEALTH == "v1": - stealth_sync(page) - logging.info("🥷 Stealth patches applied (1.x).") - - page.add_init_script(""" - Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); - Object.defineProperty(navigator, 'plugins', { - get: () => [ - {name:'Chrome PDF Plugin'}, - {name:'Chrome PDF Viewer'}, - {name:'Native Client'} - ] - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['es-ES','es','en'] - }); - window.chrome = { - runtime:{}, loadTimes:function(){}, - csi:function(){}, app:{} - }; - """) - - try: - # ── 1. Navigate ──────────────────────────────────────────── - logging.info(f"🌐 Navigating to TikTok profile: {profile_url}") - page.goto(profile_url, wait_until="domcontentloaded", timeout=40000) - time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - - # ── 2. CAPTCHA check immediately after load ───────────────── - if _is_captcha_visible(page): - take_error_screenshot(page, "captcha_after_load") - logging.warning( - "🚧 CAPTCHA detected right after page load. " - "Cookies may be expired — falling back to yt-dlp." - ) - browser.close() - return scrape_tiktoks_via_ytdlp(target_handle) - - # ── 3. Dismiss banners ───────────────────────────────────── - _dismiss_banners(page) - - # ── 4. Reload for clean grid ─────────────────────────────── - logging.info("🔄 Reloading page for clean grid render...") - page.reload(wait_until="domcontentloaded", timeout=40000) - time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - - # ── 5. Multi-attempt loop ────────────────────────────────── - video_links = [] - - for attempt in range(1, TIKTOK_MAX_LOAD_ATTEMPTS + 1): + with open(STATE_FILE, "r", encoding="utf-8") as f: + state = json.load(f) logging.info( - f"🔁 Grid load attempt {attempt}/{TIKTOK_MAX_LOAD_ATTEMPTS}..." + f"📂 Loaded state: {len(state.get('posted', {}))} entries." ) - - # CAPTCHA check on every attempt - if _is_captcha_visible(page): - take_error_screenshot(page, f"captcha_attempt_{attempt}") - logging.warning( - f"🚧 CAPTCHA on attempt {attempt} — falling back to yt-dlp." - ) - browser.close() - return scrape_tiktoks_via_ytdlp(target_handle) - - _dismiss_banners(page) - - try: - page.wait_for_selector(GRID_SELECTORS, timeout=15000) - logging.info(f"✅ Grid selector found on attempt {attempt}.") - except Exception: - logging.warning( - f"⚠️ Grid selector timed out on attempt {attempt}." - ) - take_error_screenshot( - page, f"grid_timeout_attempt_{attempt}" - ) - - _click_retry_button(page) - - try: - page.wait_for_selector(GRID_SELECTORS, timeout=10000) - except Exception: - pass - - time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - - video_links = page.locator('a[href*="/video/"]').all() - logging.info( - f"📊 Attempt {attempt}: found {len(video_links)} video links." - ) - - if video_links: - logging.info(f"✅ Got video links on attempt {attempt}.") - break - - if attempt < TIKTOK_MAX_LOAD_ATTEMPTS: - logging.info( - f"🔄 No videos — reloading " - f"(attempt {attempt + 1}/{TIKTOK_MAX_LOAD_ATTEMPTS})..." - ) - page.reload(wait_until="domcontentloaded", timeout=40000) - time.sleep(TIKTOK_PAGE_LOAD_WAIT_S) - - # ── 6. Scroll to load more ───────────────────────────────── - if video_links: - for i in range(TIKTOK_MAX_SCROLLS): - page.evaluate("window.scrollBy(0, window.innerHeight * 2)") - time.sleep(TIKTOK_SCROLL_PAUSE_S) - video_links = page.locator('a[href*="/video/"]').all() - logging.info( - f"📊 {len(video_links)} video links after scrolling." - ) - - # ── 7. Still nothing → yt-dlp fallback ──────────────────── - if not video_links: - take_error_screenshot(page, "no_video_links_final") - logging.warning( - "⚠️ No video links found after all Playwright attempts. " - "Falling back to yt-dlp." - ) - browser.close() - return scrape_tiktoks_via_ytdlp(target_handle) - - # ── 8. Parse video links ─────────────────────────────────── - seen_urls = set() - for link in video_links: - if len(tiktoks) >= SCRAPE_VIDEO_LIMIT: - break - try: - href = link.get_attribute("href") - if not href: - continue - post_url = ( - f"https://www.tiktok.com{href}" - if href.startswith("/") else href - ) - canonical = canonicalize_tiktok_url(post_url) - if not canonical or canonical in seen_urls: - continue - if "/video/" not in canonical: - continue - seen_urls.add(canonical) - - caption = "" - try: - card = link.locator("..").first - cap_el = card.locator( - '[data-e2e="video-desc"], ' - '[class*="SpanUniqueId"], ' - 'p[class*="caption"]' - ).first - if cap_el.is_visible(timeout=1000): - caption = cap_el.inner_text() - except Exception: - pass - - thumbnail_url = None - try: - img = link.locator("img").first - if img.is_visible(timeout=1000): - thumbnail_url = img.get_attribute("src") - except Exception: - pass - - tiktoks.append(ScrapedTikTok( - created_on = arrow.utcnow().isoformat(), - text = caption, - video_url = canonical, - post_url = canonical, - thumbnail_url = thumbnail_url, - )) - logging.info(f"🎵 [Playwright] Scraped: {canonical}") - - except Exception as e: - logging.warning(f"⚠️ Failed to parse video card: {e}") - + return state except Exception as e: - take_error_screenshot(page, "playwright_scrape_failed") - logging.error(f"❌ Playwright scrape failed: {e}") - browser.close() - logging.info("🔄 Attempting yt-dlp fallback after Playwright error...") - return scrape_tiktoks_via_ytdlp(target_handle) - - browser.close() - - logging.info(f"✅ [Playwright] Scraped {len(tiktoks)} videos.") - return tiktoks + logging.warning(f"⚠️ Could not load state file: {e}. Starting fresh.") + return {"posted": {}} -# ───────────────────────────────────────────── -# Video download (yt-dlp) -# ───────────────────────────────────────────── -def download_video_ytdlp(post_url: str, output_dir: str) -> str | None: - """ - Download a single TikTok video using yt-dlp. - Returns the path to the downloaded file, or None on failure. - """ +def save_state(state: dict): + # Prune to last STATE_MAX_ENTRIES + posted = state.get("posted", {}) + if len(posted) > STATE_MAX_ENTRIES: + sorted_keys = sorted( + posted.keys(), + key=lambda k: posted[k].get("posted_at", ""), + ) + for old_key in sorted_keys[: len(posted) - STATE_MAX_ENTRIES]: + del posted[old_key] + state["posted"] = posted + try: - import yt_dlp - except ImportError: - logging.error("❌ yt-dlp not installed. Run: pip install yt-dlp") - return None + with open(STATE_FILE, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2, ensure_ascii=False) + except Exception as e: + logging.error(f"❌ Could not save state: {e}") - output_template = os.path.join(output_dir, "%(id)s.%(ext)s") - ydl_opts = { - "quiet": True, - "no_warnings": True, - "outtmpl": output_template, - "format": "mp4/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "merge_output_format": "mp4", - "socket_timeout": 30, - "retries": 3, - **({"cookiefile": TIKTOK_COOKIES_PATH} - if os.path.exists(TIKTOK_COOKIES_PATH) else {}), + +def is_already_posted(video_id: str, state: dict) -> bool: + return video_id in state.get("posted", {}) + + +def mark_as_posted(video_id: str, state: dict, meta: dict = None): + state.setdefault("posted", {})[video_id] = { + "posted_at": arrow.utcnow().isoformat(), + **(meta or {}), } + save_state(state) +# ───────────────────────────────────────────────────────────────────────────── +# Cookie helpers +# ───────────────────────────────────────────────────────────────────────────── +def load_cookies_from_file(path: str) -> list: + """Load cookies from a JSON file (format produced by generate_tiktok_cookies.py).""" + if not os.path.exists(path): + logging.warning(f"⚠️ Cookie file not found: {path}") + return [] try: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(post_url, download=True) - if not info: - return None - filename = ydl.prepare_filename(info) - # yt-dlp may change extension after merge - for ext in ("mp4", "mkv", "webm"): - candidate = re.sub(r"\.\w+$", f".{ext}", filename) - if os.path.exists(candidate): - logging.info(f"📥 Downloaded via yt-dlp: {candidate}") - return candidate - if os.path.exists(filename): - return filename + with open(path, "r", encoding="utf-8") as f: + cookies = json.load(f) + logging.info(f"🍪 Loaded {len(cookies)} cookies from {path}") + return cookies except Exception as e: - logging.error(f"❌ yt-dlp download failed for {post_url}: {e}") - - return None + logging.warning(f"⚠️ Could not load cookies from {path}: {e}") + return [] -# ───────────────────────────────────────────── -# Video processing helpers -# ───────────────────────────────────────────── -def get_video_duration(path: str) -> float | None: +def inject_cookies_into_context(context, cookies: list): + """Inject a list of cookie dicts into a Playwright browser context.""" + if not cookies: + return + playwright_cookies = [] + for c in cookies: + entry = { + "name": c.get("name", ""), + "value": c.get("value", ""), + "domain": c.get("domain", ".tiktok.com"), + "path": c.get("path", "/"), + "secure": c.get("secure", False), + "httpOnly": c.get("httpOnly", False), + "sameSite": c.get("sameSite", "None"), + } + exp = c.get("expirationDate") or c.get("expires") + if exp and float(exp) > 0: + entry["expires"] = float(exp) + playwright_cookies.append(entry) try: - result = subprocess.run( - [ - "ffprobe", "-v", "error", - "-show_entries", "format=duration", - "-of", "default=noprint_wrappers=1:nokey=1", - path, - ], - capture_output=True, text=True, - timeout=FFPROBE_TIMEOUT_SECONDS, - ) - return float(result.stdout.strip()) + context.add_cookies(playwright_cookies) + logging.info(f"🍪 Injected {len(playwright_cookies)} cookies into browser context.") except Exception as e: - logging.warning(f"⚠️ ffprobe failed: {e}") - return None + logging.warning(f"⚠️ Could not inject cookies: {e}") - -def trim_video(input_path: str, output_path: str, - max_seconds: int = VIDEO_MAX_DURATION_SECONDS) -> bool: - try: - subprocess.run( - [ - "ffmpeg", "-y", "-i", input_path, - "-t", str(max_seconds), - "-c", "copy", output_path, - ], - capture_output=True, check=True, - timeout=SUBPROCESS_TIMEOUT_SECONDS, - ) - return True - except Exception as e: - logging.error(f"❌ ffmpeg trim failed: {e}") - return False - - -def get_video_dimensions(path: str) -> tuple[int, int] | None: - try: - result = subprocess.run( - [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=width,height", - "-of", "csv=p=0", - path, - ], - capture_output=True, text=True, - timeout=FFPROBE_TIMEOUT_SECONDS, - ) - parts = result.stdout.strip().split(",") - if len(parts) == 2: - return int(parts[0]), int(parts[1]) - except Exception as e: - logging.warning(f"⚠️ Could not get video dimensions: {e}") - return None - - -def extract_thumbnail(video_path: str, output_path: str) -> bool: - try: - subprocess.run( - [ - "ffmpeg", "-y", "-i", video_path, - "-ss", "00:00:01", - "-vframes", "1", - "-q:v", "2", - output_path, - ], - capture_output=True, check=True, - timeout=FFPROBE_TIMEOUT_SECONDS, - ) - return os.path.exists(output_path) - except Exception as e: - logging.warning(f"⚠️ Thumbnail extraction failed: {e}") - return False - - -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────────────────────── # Bluesky helpers -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────────────────────── def bsky_login(client: Client, handle: str, password: str, base_url: str) -> bool: for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1): try: client.base_url = base_url client.login(handle, password) - logging.info(f"✅ Logged in to Bluesky as {handle}") + logging.info(f"✅ Logged in to Bluesky as {handle} via {base_url}") return True except Exception as e: + err = str(e) + + # 401 = wrong credentials — no point retrying + if any(x in err for x in ("401", "AuthenticationRequired", + "Invalid identifier", "Invalid password")): + logging.error( + f"❌ Bluesky login failed: invalid handle or app password. " + f"Check your BSKY_JIJANTESFC_APP_PASSWORD credential in Jenkins. " + f"({err})" + ) + return False + + if attempt == BSKY_LOGIN_MAX_RETRIES: + logging.error(f"❌ All {BSKY_LOGIN_MAX_RETRIES} login attempts failed.") + return False + delay = min( BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)) + random.uniform(0, BSKY_LOGIN_JITTER_MAX), @@ -787,242 +234,622 @@ def bsky_login(client: Client, handle: str, password: str, f"Retrying in {delay:.1f}s..." ) time.sleep(delay) - logging.error("❌ All Bluesky login attempts failed.") + return False -def upload_video_blob(client: Client, video_path: str): - size_mb = os.path.getsize(video_path) / (1024 * 1024) - if size_mb > MAX_VIDEO_UPLOAD_SIZE_MB: - logging.error( - f"❌ Video too large: {size_mb:.1f} MB " - f"(max {MAX_VIDEO_UPLOAD_SIZE_MB} MB)" - ) - return None +def bsky_get_recent_post_urls(client: Client, handle: str, + limit: int = 50) -> set: + """Return a set of URLs recently posted to Bluesky (to avoid duplicates).""" + urls: set = set() + try: + feed = client.get_author_feed(actor=handle, limit=limit) + for item in feed.feed: + post = item.post + if hasattr(post, "record") and hasattr(post.record, "embed"): + embed = post.record.embed + if hasattr(embed, "external") and hasattr(embed.external, "uri"): + urls.add(embed.external.uri) + if hasattr(post, "record") and hasattr(post.record, "text"): + text = post.record.text + found = re.findall(r"https?://\S+", text) + urls.update(found) + except Exception as e: + logging.warning(f"⚠️ Could not fetch recent Bluesky posts: {e}") + return urls - for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1): + +def bsky_upload_blob_with_retry(client: Client, data: bytes, + mime_type: str) -> object: + """Upload a blob to Bluesky with retry + exponential backoff.""" + for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): try: - with open(video_path, "rb") as f: - data = f.read() resp = client.upload_blob(data) - logging.info(f"✅ Video blob uploaded on attempt {attempt}.") + logging.info( + f"✅ Blob uploaded ({len(data) / 1024 / 1024:.1f} MB) " + f"on attempt {attempt}." + ) return resp.blob except Exception as e: - err = str(e).lower() - is_transient = any( - k in err for k in ("rate", "timeout", "503", "502", "500") - ) + err = str(e) + is_rate_limit = "429" in err or "RateLimitExceeded" in err + + if attempt == BSKY_UPLOAD_MAX_RETRIES: + logging.error( + f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" + ) + raise + delay = min( - BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)), - BSKY_BLOB_UPLOAD_MAX_DELAY, + BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) + + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), + BSKY_UPLOAD_MAX_DELAY, ) + if is_rate_limit: + delay = max(delay, 60.0) + logging.warning( f"⚠️ Blob upload attempt {attempt} failed: {e}. " - f"Retrying in {delay}s..." + f"Retrying in {delay:.1f}s..." ) - if not is_transient and attempt >= BSKY_BLOB_TRANSIENT_ERROR_RETRIES: - break time.sleep(delay) - logging.error("❌ All blob upload attempts failed.") - return None - -def upload_thumb_blob(client: Client, thumb_path: str): - try: - with open(thumb_path, "rb") as f: - data = f.read() - resp = client.upload_blob(data) - logging.info("✅ Thumbnail blob uploaded.") - return resp.blob - except Exception as e: - logging.warning(f"⚠️ Thumbnail upload failed: {e}") - return None - - -def send_bsky_post(client: Client, text: str, video_blob, - thumb_blob, langs: list, - aspect_ratio: models.AppBskyEmbedDefs.AspectRatio | None, - alt_text: str = "") -> bool: - tb = client_utils.TextBuilder() - tb.text(text) - - video_embed = models.AppBskyEmbedVideo.Main( - video = video_blob, - alt = alt_text[:DYNAMIC_ALT_MAX_LENGTH], - thumbnail = thumb_blob, - **({"aspectRatio": aspect_ratio} if aspect_ratio else {}), - ) - - for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1): +def bsky_create_post_with_retry(client: Client, text: str, + embed=None, langs=None) -> bool: + """Create a Bluesky post with retry + exponential backoff.""" + for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): try: - client.send_post( - text = tb, - embed = video_embed, - langs = langs, - ) - logging.info("✅ Post sent to Bluesky.") + kwargs = {"text": text} + if embed: + kwargs["embed"] = embed + if langs: + kwargs["langs"] = langs + client.send_post(**kwargs) + logging.info(f"✅ Post created on attempt {attempt}.") return True except Exception as e: + err = str(e) + is_rate_limit = "429" in err or "RateLimitExceeded" in err + + if attempt == BSKY_UPLOAD_MAX_RETRIES: + logging.error( + f"❌ Post creation failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" + ) + return False + delay = min( - BSKY_SEND_POST_BASE_DELAY * (2 ** (attempt - 1)), - BSKY_SEND_POST_MAX_DELAY, + BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) + + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), + BSKY_UPLOAD_MAX_DELAY, ) + if is_rate_limit: + delay = max(delay, 60.0) + logging.warning( - f"⚠️ Send post attempt {attempt} failed: {e}. " - f"Retrying in {delay}s..." + f"⚠️ Post creation attempt {attempt} failed: {e}. " + f"Retrying in {delay:.1f}s..." ) time.sleep(delay) - logging.error("❌ All send-post attempts failed.") return False - -# ───────────────────────────────────────────── -# Core sync logic -# ───────────────────────────────────────────── -def already_posted(video_id: str, state: dict) -> bool: - return video_id in state.get("posted_ids", []) +# ───────────────────────────────────────────────────────────────────────────── +# Video processing helpers +# ───────────────────────────────────────────────────────────────────────────── +def get_video_duration(path: str) -> float: + """Return video duration in seconds using ffprobe.""" + try: + result = subprocess.run( + [ + "ffprobe", "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + path, + ], + capture_output=True, text=True, timeout=30, + ) + return float(result.stdout.strip()) + except Exception as e: + logging.warning(f"⚠️ ffprobe failed: {e}") + return 0.0 -def mark_posted(video_id: str, state: dict): - ids = state.setdefault("posted_ids", []) - if video_id not in ids: - ids.append(video_id) - # Keep only the last N to avoid unbounded growth - state["posted_ids"] = ids[-DEDUPE_BSKY_LIMIT * 10:] +def compress_video(input_path: str, output_path: str, + max_duration: int = VIDEO_MAX_DURATION_S, + max_size_bytes: int = VIDEO_MAX_SIZE_BYTES) -> bool: + """ + Trim to max_duration and compress to fit max_size_bytes. + Returns True on success. + """ + try: + duration = get_video_duration(input_path) + trim_to = min(duration, max_duration) + + # Target bitrate calculation (leave 10% headroom) + target_bits = max_size_bytes * 8 * 0.90 + target_kbps = int(target_bits / trim_to / 1000) + video_kbps = max(200, target_kbps - 128) # reserve 128k for audio + + logging.info( + f"🎬 Compressing: duration={duration:.1f}s → trim={trim_to:.1f}s, " + f"video_bitrate={video_kbps}k" + ) + + cmd = [ + "ffmpeg", "-y", + "-i", input_path, + "-t", str(trim_to), + "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease", + "-c:v", "libx264", + "-b:v", f"{video_kbps}k", + "-maxrate", f"{video_kbps * 2}k", + "-bufsize", f"{video_kbps * 4}k", + "-c:a", "aac", + "-b:a", "128k", + "-movflags", "+faststart", + "-pix_fmt", "yuv420p", + output_path, + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode != 0: + logging.error(f"❌ ffmpeg failed:\n{result.stderr}") + return False + + final_size = os.path.getsize(output_path) + logging.info( + f"✅ Compressed video: {final_size / 1024 / 1024:.1f} MB → {output_path}" + ) + return True + + except Exception as e: + logging.error(f"❌ compress_video error: {e}") + return False -def process_tiktok(tiktok: ScrapedTikTok, client: Client, +def download_video(url: str, output_path: str, + cookies: list = None) -> bool: + """ + Download a video from a URL (MP4 or M3U8) using httpx or yt-dlp. + Falls back to yt-dlp for HLS streams or when direct download fails. + """ + # ── Try direct HTTP download first ──────────────────────────────── + if not url.endswith(".m3u8"): + try: + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + "Referer": "https://www.tiktok.com/", + } + with httpx.stream("GET", url, headers=headers, + follow_redirects=True, timeout=60) as r: + r.raise_for_status() + with open(output_path, "wb") as f: + for chunk in r.iter_bytes(chunk_size=1024 * 64): + f.write(chunk) + size = os.path.getsize(output_path) + if size > 10_000: + logging.info( + f"✅ Direct download OK: {size / 1024 / 1024:.1f} MB" + ) + return True + logging.warning( + f"⚠️ Direct download too small ({size} bytes), trying yt-dlp..." + ) + except Exception as e: + logging.warning(f"⚠️ Direct download failed: {e}. Trying yt-dlp...") + + # ── Fall back to yt-dlp ──────────────────────────────────────────── + return download_video_ytdlp(url, output_path, cookies=cookies) + + +def download_video_ytdlp(url: str, output_path: str, + cookies: list = None) -> bool: + """Download a video using yt-dlp, optionally injecting cookies.""" + cookie_file = None + try: + import yt_dlp + + ydl_opts = { + "outtmpl": output_path, + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "quiet": True, + "no_warnings": False, + "merge_output_format": "mp4", + } + + # Write cookies to a temp Netscape file if provided + if cookies: + cookie_file = _write_netscape_cookies(cookies) + if cookie_file: + ydl_opts["cookiefile"] = cookie_file + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + + if os.path.exists(output_path) and os.path.getsize(output_path) > 10_000: + logging.info( + f"✅ yt-dlp download OK: " + f"{os.path.getsize(output_path) / 1024 / 1024:.1f} MB" + ) + return True + + logging.error("❌ yt-dlp produced no output file or file too small.") + return False + + except Exception as e: + logging.error(f"❌ yt-dlp download failed: {e}") + return False + finally: + if cookie_file and os.path.exists(cookie_file): + os.unlink(cookie_file) + + +def _write_netscape_cookies(cookies: list) -> str | None: + """Write cookies list to a Netscape-format temp file for yt-dlp.""" + try: + fd, path = tempfile.mkstemp(suffix=".txt", prefix="tiktok_cookies_") + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write("# Netscape HTTP Cookie File\n") + for c in cookies: + domain = c.get("domain", ".tiktok.com") + flag = "TRUE" if domain.startswith(".") else "FALSE" + path_val = c.get("path", "/") + secure = "TRUE" if c.get("secure") else "FALSE" + exp = int(c.get("expirationDate", 0) or c.get("expires", 0) or 0) + name = c.get("name", "") + value = c.get("value", "") + f.write(f"{domain}\t{flag}\t{path_val}\t{secure}\t{exp}\t{name}\t{value}\n") + return path + except Exception as e: + logging.warning(f"⚠️ Could not write Netscape cookie file: {e}") + return None + +# ───────────────────────────────────────────────────────────────────────────── +# TikTok scraping via Playwright +# ───────────────────────────────────────────────────────────────────────────── +def _dismiss_overlays(page): + """Dismiss cookie banners and RGPD modals.""" + for sel in TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS: + try: + el = page.locator(sel).first + if el.is_visible(timeout=1500): + el.click(timeout=2000) + logging.info(f"🚫 Dismissed overlay: {sel}") + time.sleep(0.5) + except Exception: + pass + + +def _take_debug_screenshot(page, label: str): + """Save a debug screenshot to workspace.""" + try: + path = f"screenshot_{label}_{int(time.time())}.png" + page.screenshot(path=path) + logging.info(f"📸 Screenshot saved: {path}") + except Exception: + pass + + +def scrape_tiktoks_via_playwright(handle: str) -> list: + """ + Scrape recent videos from a public TikTok profile. + Returns a list of dicts: {id, url, desc, timestamp, video_url} + """ + profile_url = f"https://www.tiktok.com/@{handle.lstrip('@')}" + cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) + videos = [] + + logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + slow_mo=PLAYWRIGHT_SLOW_MO, + args=[ + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + "--disable-gpu", + ], + ) + + context = browser.new_context( + user_agent=( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + viewport={"width": 1280, "height": 900}, + locale="es-ES", + timezone_id="Europe/Madrid", + ) + + # Inject saved cookies + if cookies: + inject_cookies_into_context(context, cookies) + + page = context.new_page() + + # Stealth mode + stealth_sync(page) + + # Mask automation signals + page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); + window.chrome = { runtime: {} }; + Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]}); + Object.defineProperty(navigator, 'languages', {get: () => ['es-ES', 'es', 'en']}); + """) + + # ── Multi-attempt page load ──────────────────────────────────── + for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): + logging.info( + f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) + try: + page.goto( + profile_url, + wait_until="domcontentloaded", + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + except Exception as e: + logging.warning(f"⚠️ page.goto failed on attempt {attempt}: {e}") + _take_debug_screenshot(page, f"goto_fail_{attempt}") + if attempt < PLAYWRIGHT_MAX_RELOADS: + time.sleep(3.0) + continue + break + + time.sleep(random.uniform(2.5, 4.0)) + _dismiss_overlays(page) + time.sleep(1.5) + + # Check for grid error state + try: + if page.locator(TIKTOK_GRID_ERROR_SEL).is_visible(timeout=2000): + logging.warning("⚠️ Grid error state detected. Clicking Refresh...") + try: + page.locator(TIKTOK_REFRESH_BTN_SEL).first.click(timeout=3000) + time.sleep(3.0) + except Exception: + pass + except Exception: + pass + + # Wait for video grid + try: + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + logging.info("✅ Video grid found.") + break + except Exception: + logging.warning( + f"⚠️ Video grid not found on attempt {attempt}." + ) + _take_debug_screenshot(page, f"no_grid_{attempt}") + if attempt < PLAYWRIGHT_MAX_RELOADS: + time.sleep(3.0) + else: + logging.error("❌ Video grid never loaded after all attempts.") + _take_debug_screenshot(page, "final_fail") + browser.close() + return [] + + # ── Scroll to load more videos ───────────────────────────────── + logging.info("📜 Scrolling to load videos...") + for _ in range(5): + page.evaluate("window.scrollBy(0, window.innerHeight * 2)") + time.sleep(random.uniform(1.0, 2.0)) + + # ── Extract video items ──────────────────────────────────────── + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + logging.info(f"📋 Found {len(items)} video items in grid.") + + cutoff = arrow.utcnow().shift(days=-VIDEO_MAX_AGE_DAYS) + + for item in items[:SCRAPE_VIDEO_LIMIT]: + try: + # Get the link + link_el = item.locator("a").first + href = link_el.get_attribute("href") or "" + if not href or "/video/" not in href: + continue + + # Normalise URL + if href.startswith("/"): + href = "https://www.tiktok.com" + href + + # Extract video ID + vid_match = re.search(r"/video/(\d+)", href) + if not vid_match: + continue + video_id = vid_match.group(1) + + # Get description (best-effort) + desc = "" + try: + desc = item.get_attribute("aria-label") or "" + if not desc: + desc_el = item.locator('[class*="desc"], [class*="title"]').first + desc = desc_el.inner_text(timeout=1000).strip() + except Exception: + pass + + videos.append({ + "id": video_id, + "url": href, + "desc": desc, + "timestamp": arrow.utcnow().isoformat(), + "video_url": href, # resolved later during download + }) + + except Exception as e: + logging.warning(f"⚠️ Error parsing video item: {e}") + continue + + browser.close() + + logging.info(f"✅ Scraped {len(videos)} videos from @{handle}.") + return videos + +# ───────────────────────────────────────────────────────────────────────────── +# Core: process a single TikTok video → post to Bluesky +# ───────────────────────────────────────────────────────────────────────────── +def process_tiktok(video: dict, client: Client, langs: list, state: dict) -> bool: - """Download, process, and post a single TikTok video to Bluesky.""" - if not tiktok.media: - logging.warning("⚠️ TikTok has no media — skipping.") + """ + Download, compress, and post a single TikTok video to Bluesky. + Returns True if successfully posted. + """ + video_id = video["id"] + video_url = video["url"] + desc = video.get("desc", "") + + # ── Deduplication ────────────────────────────────────────────────── + if is_already_posted(video_id, state): + logging.info(f"⏭️ Skipping already-posted video: {video_id}") return False - post_url = tiktok.post_url or tiktok.media[0].media_url_https - video_id = tiktok_id_from_url(post_url) + logging.info(f"🎬 Processing video {video_id}: {video_url}") - if already_posted(video_id, state): - logging.info(f"⏭️ Already posted {video_id} — skipping.") - return False + cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) with tempfile.TemporaryDirectory() as tmpdir: - # ── Download video ──────────────────────────────────────────── - video_path = download_video_ytdlp(post_url, tmpdir) - if not video_path or not os.path.exists(video_path): - logging.error(f"❌ Could not download video: {post_url}") + raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") + processed_path = os.path.join(tmpdir, f"{video_id}.mp4") + + # ── Download ─────────────────────────────────────────────────── + logging.info(f"⬇️ Downloading: {video_url}") + if not download_video(video_url, raw_path, cookies=cookies): + logging.error(f"❌ Download failed for {video_id}. Skipping.") return False - # ── Check / trim duration ───────────────────────────────────── - duration = get_video_duration(video_path) - if duration and duration > VIDEO_MAX_DURATION_SECONDS: - logging.info( - f"✂️ Video {duration:.0f}s > {VIDEO_MAX_DURATION_SECONDS}s — trimming." - ) - trimmed = os.path.join(tmpdir, "trimmed.mp4") - if not trim_video(video_path, trimmed): - logging.error("❌ Trim failed — skipping.") - return False - video_path = trimmed + # ── Compress / trim ──────────────────────────────────────────── + if not compress_video(raw_path, processed_path): + logging.error(f"❌ Compression failed for {video_id}. Skipping.") + return False - # ── Check file size ─────────────────────────────────────────── - size_mb = os.path.getsize(video_path) / (1024 * 1024) - if size_mb > MAX_VIDEO_UPLOAD_SIZE_MB: + # ── Size guard ───────────────────────────────────────────────── + final_size = os.path.getsize(processed_path) + if final_size > VIDEO_MAX_SIZE_BYTES: logging.error( - f"❌ Video still too large after trim: {size_mb:.1f} MB — skipping." + f"❌ Compressed video still too large: " + f"{final_size / 1024 / 1024:.1f} MB > " + f"{VIDEO_MAX_SIZE_BYTES / 1024 / 1024:.0f} MB. Skipping." ) return False - # ── Thumbnail ───────────────────────────────────────────────── - thumb_path = os.path.join(tmpdir, "thumb.jpg") - if not extract_thumbnail(video_path, thumb_path): - thumb_path = None + # ── Upload to Bluesky ────────────────────────────────────────── + logging.info( + f"⬆️ Uploading to Bluesky " + f"({final_size / 1024 / 1024:.1f} MB)..." + ) + with open(processed_path, "rb") as f: + video_data = f.read() - # ── Aspect ratio ────────────────────────────────────────────── - aspect_ratio = None - dims = get_video_dimensions(video_path) - if dims: - w, h = dims - aspect_ratio = models.AppBskyEmbedDefs.AspectRatio( - width=w, height=h - ) - - # ── Upload blobs ────────────────────────────────────────────── - video_blob = upload_video_blob(client, video_path) - if not video_blob: + try: + blob = bsky_upload_blob_with_retry(client, video_data, "video/mp4") + except Exception as e: + logging.error(f"❌ Blob upload failed for {video_id}: {e}") return False - thumb_blob = None - if thumb_path and os.path.exists(thumb_path): - thumb_blob = upload_thumb_blob(client, thumb_path) + # ── Build post text ──────────────────────────────────────────── + post_text = desc.strip() if desc else "" + if len(post_text) > 280: + post_text = post_text[:277] + "..." + if not post_text: + post_text = f"🎬 {video_url}" - # ── Build post text ─────────────────────────────────────────── - raw_text = tiktok.text or "" - max_chars = BSKY_TEXT_MAX_LENGTH - if post_url: - max_chars -= len(post_url) + 2 # " \n" separator - post_text = truncate_grapheme(raw_text, max_chars) - if post_url: - post_text = f"{post_text}\n{post_url}".strip() + # ── Build video embed ────────────────────────────────────────── + try: + from atproto import models + video_embed = models.AppBskyEmbedVideo.Main( + video=blob, + alt=desc[:1000] if desc else "", + ) + except Exception as e: + logging.error(f"❌ Could not build video embed: {e}") + return False - alt_text = truncate_grapheme(raw_text, DYNAMIC_ALT_MAX_LENGTH) - - # ── Send post ───────────────────────────────────────────────── - success = send_bsky_post( - client, post_text, video_blob, - thumb_blob, langs, aspect_ratio, alt_text, + # ── Create post ──────────────────────────────────────────────── + success = bsky_create_post_with_retry( + client, + text=post_text, + embed=video_embed, + langs=langs, ) if success: - mark_posted(video_id, state) - save_state(state) - logging.info(f"🎉 Posted {video_id} to Bluesky.") + mark_as_posted(video_id, state, { + "tiktok_url": video_url, + "desc": desc[:200] if desc else "", + }) + logging.info(f"✅ Posted video {video_id} to Bluesky.") + return True - return success + logging.error(f"❌ Failed to post video {video_id} to Bluesky.") + return False - -# ───────────────────────────────────────────── +# ───────────────────────────────────────────────────────────────────────────── # Entry point -# ───────────────────────────────────────────── -# ✅ FIXED — global declared at the very top of the function +# ───────────────────────────────────────────────────────────────────────────── def main(): - global TIKTOK_COOKIES_PATH # ← must be first, before any other statement + global TIKTOK_COOKIES_PATH # must be first line in function load_dotenv() parser = argparse.ArgumentParser( description="TikTok → Bluesky cross-poster" ) - parser.add_argument("--tiktok-handle", required=True) - parser.add_argument("--bsky-handle", required=True) - parser.add_argument("--bsky-password", required=True) - parser.add_argument("--bsky-base-url", default=DEFAULT_BSKY_BASE_URL) - parser.add_argument("--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS) - parser.add_argument( - "--cookies-path", - default=TIKTOK_COOKIES_PATH, - help="Path to exported TikTok cookies JSON file.", - ) + parser.add_argument("--tiktok-handle", required=True, + help="TikTok handle to scrape (without @)") + parser.add_argument("--bsky-handle", required=True, + help="Bluesky handle (e.g. user.eurosky.social)") + parser.add_argument("--bsky-app-password", required=True, + help="Bluesky app password (not account password)") + parser.add_argument("--bsky-base-url", default=DEFAULT_BSKY_BASE_URL, + help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})") + parser.add_argument("--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS, + help="Post language codes (default: es)") + parser.add_argument("--cookies-path", default=TIKTOK_COOKIES_PATH, + help="Path to TikTok cookies JSON file") args = parser.parse_args() - # Now safe to reassign the global + # Override global cookie path from CLI TIKTOK_COOKIES_PATH = args.cookies_path - logging.info(f"🤖 TikTok→Bluesky bot started. Scraping @{args.tiktok_handle}") - logging.info(f"🍪 Cookie file: {TIKTOK_COOKIES_PATH} " - f"({'found' if os.path.exists(TIKTOK_COOKIES_PATH) else 'NOT FOUND'})") + logging.info("=" * 60) + logging.info(f"🤖 TikTok→Bluesky bot started") + logging.info(f" TikTok handle : @{args.tiktok_handle}") + logging.info(f" Bluesky handle: {args.bsky_handle}") + logging.info(f" Bluesky PDS : {args.bsky_base_url}") + logging.info(f" Languages : {args.bsky_langs}") + logging.info( + f" Cookie file : {TIKTOK_COOKIES_PATH} " + f"({'✅ found' if os.path.exists(TIKTOK_COOKIES_PATH) else '❌ NOT FOUND'})" + ) + logging.info("=" * 60) state = load_state() client = Client() + # ── Bluesky login ────────────────────────────────────────────────── if not bsky_login(client, args.bsky_handle, - args.bsky_password, args.bsky_base_url): - logging.error("❌ Cannot proceed without Bluesky login.") - return - - logging.info("🔄 Starting TikTok → Bluesky sync cycle...") + args.bsky_app_password, + args.bsky_base_url): + logging.error("❌ Cannot proceed without Bluesky login. Exiting.") + sys.exit(1) + # ── Scrape TikTok ────────────────────────────────────────────────── + logging.info(f"🔄 Scraping @{args.tiktok_handle}...") tiktoks = scrape_tiktoks_via_playwright(args.tiktok_handle) if not tiktoks: @@ -1030,20 +857,27 @@ def main(): logging.info("🤖 Bot finished.") return - logging.info(f"📋 Found {len(tiktoks)} videos. Processing new ones...") + logging.info(f"📋 Found {len(tiktoks)} video(s). Processing new ones...") + # ── Process each video ───────────────────────────────────────────── posted = 0 for tiktok in tiktoks: try: if process_tiktok(tiktok, client, args.bsky_langs, state): posted += 1 + # Polite delay between posts time.sleep(random.uniform(3.0, 7.0)) except Exception as e: - logging.error(f"❌ Unexpected error processing video: {e}") + logging.error( + f"❌ Unexpected error processing video " + f"{tiktok.get('id', '?')}: {e}" + ) continue + logging.info("=" * 60) logging.info(f"✅ Sync complete. Posted {posted} new video(s).") logging.info("🤖 Bot finished.") + logging.info("=" * 60) if __name__ == "__main__":