From 7cddbd057a93f50d454030ee8360c6c0abc35a75 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Wed, 20 May 2026 07:13:45 +0200 Subject: [PATCH] Fixes for today --- tiktok2bsky.py | 1695 +++++++++++++++++++++++------------------------- 1 file changed, 808 insertions(+), 887 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index f545eb4..eac502b 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -34,13 +34,21 @@ from atproto import Client from dotenv import load_dotenv from playwright.sync_api import sync_playwright -# playwright-stealth 1.x uses stealth_sync, 2.x uses Stealth class + +# ───────────────────────────────────────────────────────────────────────────── +# playwright-stealth: detect installed version +# ───────────────────────────────────────────────────────────────────────────── +_STEALTH_V2 = None # None = not available at all + try: from playwright_stealth import stealth_sync _STEALTH_V2 = False except ImportError: - from playwright_stealth import Stealth - _STEALTH_V2 = True + try: + from playwright_stealth import Stealth + _STEALTH_V2 = True + except ImportError: + pass # stealth disabled — warning emitted at runtime # ───────────────────────────────────────────────────────────────────────────── @@ -69,29 +77,30 @@ SCRAPE_VIDEO_LIMIT = 30 VIDEO_MAX_AGE_DAYS = 3 VIDEO_MAX_DURATION_S = 179 # Bluesky hard limit is 180s -VIDEO_MAX_SIZE_BYTES = 45 * 1024 * 1024 # 45 MB -# Bluesky login retry config -BSKY_LOGIN_MAX_RETRIES = 4 -BSKY_LOGIN_BASE_DELAY = 15.0 -BSKY_LOGIN_MAX_DELAY = 120.0 -BSKY_LOGIN_JITTER_MAX = 10.0 +# ── Bluesky login retry config (ported from twitter2bsky.py) ───────────────── +BSKY_LOGIN_MAX_RETRIES = 6 +BSKY_LOGIN_BASE_DELAY = 15.0 +BSKY_LOGIN_MAX_DELAY = 600.0 +BSKY_LOGIN_JITTER_MAX = 5.0 +BSKY_LOGIN_RATE_LIMIT_DELAY = 90.0 # minimum wait on 429 +BSKY_LOGIN_RATE_LIMIT_MAX_DELAY = 600.0 # maximum wait on 429 -# Bluesky upload retry config +# ── Bluesky upload retry config ─────────────────────────────────────────────── BSKY_UPLOAD_MAX_RETRIES = 5 BSKY_UPLOAD_BASE_DELAY = 10.0 BSKY_UPLOAD_MAX_DELAY = 120.0 BSKY_UPLOAD_JITTER_MAX = 5.0 -# Playwright scraping config +# ── Playwright scraping config ──────────────────────────────────────────────── PLAYWRIGHT_TIMEOUT_MS = 30_000 PLAYWRIGHT_SLOW_MO = 50 PLAYWRIGHT_MAX_RELOADS = 3 -# TikTok selectors -TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' -TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' -TIKTOK_BANNER_SELS = [ +# ── TikTok selectors ────────────────────────────────────────────────────────── +TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' +TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' +TIKTOK_BANNER_SELS = [ '[id*="banner"]', '[class*="banner"]', '[data-e2e="recommend-modal-close"]', @@ -115,6 +124,20 @@ TIKTOK_GRID_ERROR_SEL = '[data-e2e="user-post-item-list-error"]' TIKTOK_REFRESH_BTN_SEL = 'button:has-text("Actualizar"), button:has-text("Refresh")' +# ───────────────────────────────────────────────────────────────────────────── +# Fix 2 — Dynamic video size limit based on PDS +# ───────────────────────────────────────────────────────────────────────────── +def get_video_size_limit(bsky_base_url: str) -> int: + """ + bsky.social supports ~50 MB blobs. Third-party PDS instances + typically cap at 10–20 MB. Use a conservative 10 MB for + anything that isn't the official PDS. + """ + if "bsky.social" in (bsky_base_url or ""): + return 20 * 1024 * 1024 # 20 MB — official PDS + return 10 * 1024 * 1024 # 10 MB — safe for third-party PDS + + # ───────────────────────────────────────────────────────────────────────────── # State management # ───────────────────────────────────────────────────────────────────────────── @@ -133,7 +156,6 @@ def load_state() -> dict: def save_state(state: dict): - # Prune to last STATE_MAX_ENTRIES posted = state.get("posted", {}) if len(posted) > STATE_MAX_ENTRIES: sorted_keys = sorted( @@ -167,7 +189,7 @@ def mark_as_posted(video_id: str, state: dict, meta: dict = None): # Cookie helpers # ───────────────────────────────────────────────────────────────────────────── def load_cookies_from_file(path: str) -> list: - """Load cookies from a JSON file (format produced by generate_tiktok_cookies.py).""" + """Load cookies from a JSON file.""" if not os.path.exists(path): logging.warning(f"⚠️ Cookie file not found: {path}") return [] @@ -207,322 +229,273 @@ def inject_cookies_into_context(context, cookies: list): logging.warning(f"⚠️ Could not inject cookies: {e}") +def convert_json_cookies_to_netscape(json_path: str) -> str | None: + """ + Convert a JSON cookie file (browser extension format) to a Netscape + cookie file that yt-dlp can consume. + + Returns the path to a temporary Netscape file, or None on failure. + The caller is responsible for deleting the file when done. + + Netscape format columns (tab-separated): + domain include_subdomains path secure expiry name value + """ + try: + with open(json_path, "r", encoding="utf-8") as f: + cookies = json.load(f) + + tmp = tempfile.NamedTemporaryFile( + mode="w", + suffix=".txt", + delete=False, + encoding="utf-8", + ) + + tmp.write("# Netscape HTTP Cookie File\n") + tmp.write("# Generated by tiktok2bsky.py\n\n") + + for c in cookies: + domain = c.get("domain", ".tiktok.com") + include_sub = "TRUE" if domain.startswith(".") else "FALSE" + path = c.get("path", "/") + secure = "TRUE" if c.get("secure", False) else "FALSE" + expiry = int(c.get("expirationDate") or c.get("expires") or 0) + name = c.get("name", "") + value = c.get("value", "") + + tmp.write( + f"{domain}\t{include_sub}\t{path}\t" + f"{secure}\t{expiry}\t{name}\t{value}\n" + ) + + tmp.close() + logging.info( + f"🍪 Converted {len(cookies)} cookies to Netscape format: {tmp.name}" + ) + return tmp.name + + except Exception as e: + logging.warning( + f"⚠️ Could not convert cookies to Netscape format: " + f"{type(e).__name__}: {e}" + ) + return None + + # ───────────────────────────────────────────────────────────────────────────── -# Bluesky error classification helpers (ported from twitter2bsky.py) +# Bluesky error classification (ported from twitter2bsky.py) # ───────────────────────────────────────────────────────────────────────────── +def _bsky_error_text(error_obj) -> str: + """Normalised lowercase repr for pattern matching.""" + return repr(error_obj).lower() + + def is_rate_limited_error(error_obj) -> bool: - text = repr(error_obj).lower() + text = _bsky_error_text(error_obj) return ( - "429" in text + "429" in text or "ratelimitexceeded" in text or "too many requests" in text - or "rate limit" in text + or "rate limit" in text + or "ratelimit" in text ) def is_auth_error(error_obj) -> bool: - text = repr(error_obj).lower() + text = _bsky_error_text(error_obj) return ( - "401" in text - or "403" in text + "401" in text + or "403" in text + or "invalid identifier" in text + or "invalid password" in text + or "authenticationrequired" in text + or "invalidtoken" in text + or "expiredtoken" in text + or "accounttakedown" in text or "invalid identifier or password" in text - or "authenticationrequired" in text - or "invalidtoken" in text ) def is_network_error(error_obj) -> bool: text = repr(error_obj) signals = [ - "ConnectError", - "RemoteProtocolError", - "ReadTimeout", - "WriteTimeout", - "TimeoutException", - "503", - "502", - "504", - "ConnectionResetError", + "ConnectError", "RemoteProtocolError", "ReadTimeout", + "WriteTimeout", "TimeoutException", "ConnectionResetError", + "503", "502", "504", ] - return any(sig in text for sig in signals) + return any(s in text for s in signals) def is_transient_error(error_obj) -> bool: text = repr(error_obj) signals = [ - "InvokeTimeoutError", - "ReadTimeout", - "WriteTimeout", - "TimeoutException", - "RemoteProtocolError", - "ConnectError", - "503", - "502", - "504", + "InvokeTimeoutError", "ReadTimeout", "WriteTimeout", + "TimeoutException", "RemoteProtocolError", "ConnectError", + "503", "502", "504", ] - return any(sig in text for sig in signals) + return any(s in text for s in signals) def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: """ - Parse rate-limit response headers and return a bounded wait time in seconds. - Supports retry-after, x-ratelimit-after, and ratelimit-reset (unix timestamp). + Extract the server-requested wait time from rate-limit error headers. + + Checks (in order): + 1. error_obj.headers dict — Retry-After, X-RateLimit-After, RateLimit-Reset + 2. repr(error_obj) text — same keys embedded as strings + 3. Falls back to default_delay + Ported from twitter2bsky.py. """ + now_ts = int(time.time()) + + # ── 1. Live headers object ──────────────────────────────────────────── try: - now_ts = int(time.time()) headers = getattr(error_obj, "headers", None) or {} for key in ("retry-after", "Retry-After"): - if headers.get(key): - return min(max(int(headers[key]), 1), BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + return min(max(int(val), 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) for key in ("x-ratelimit-after", "X-RateLimit-After"): - if headers.get(key): - return min(max(int(headers[key]), 1), BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + return min(max(int(val), 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) for key in ("ratelimit-reset", "RateLimit-Reset"): - if headers.get(key): - wait = max(int(headers[key]) - now_ts + 1, default_delay) - return min(wait, BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + wait = max(int(val) - now_ts + 2, default_delay) + return min(wait, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + except Exception: pass - # repr() fallback — parse headers embedded in the exception string + # ── 2. repr() string fallback ───────────────────────────────────────── text = repr(error_obj) - for pattern, is_timestamp in [ - (r"'retry-after':\s*'(\d+)'", False), - (r"'x-ratelimit-after':\s*'(\d+)'", False), - (r"'ratelimit-reset':\s*'(\d+)'", True), + for pattern, is_ts in [ + (r"['\"]retry-after['\"]\s*:\s*['\"](\d+)['\"]", False), + (r"['\"]x-ratelimit-after['\"]\s*:\s*['\"](\d+)['\"]", False), + (r"['\"]ratelimit-reset['\"]\s*:\s*['\"](\d+)['\"]", True), + (r"retry.?after[=:\s]+(\d+)", False), ]: m = re.search(pattern, text, re.IGNORECASE) if m: val = int(m.group(1)) - if is_timestamp: - return min( - max(val - int(time.time()) + 1, default_delay), - BSKY_LOGIN_MAX_DELAY, - ) - return min(max(val, 1), BSKY_LOGIN_MAX_DELAY) + if is_ts: + wait = max(val - now_ts + 2, default_delay) + return min(wait, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + return min(max(val, 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) return default_delay # ───────────────────────────────────────────────────────────────────────────── -# Bluesky helpers +# Bluesky client — improved login (ported from twitter2bsky.py) # ───────────────────────────────────────────────────────────────────────────── -def bsky_login(client: Client, handle: str, password: str, - base_url: str = DEFAULT_BSKY_BASE_URL) -> bool: +def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: """ - Authenticate against the AT Protocol PDS. + Authenticate with Bluesky with full retry logic ported from twitter2bsky.py: - base_url is always https://bsky.social for standard Bluesky accounts — - even when the user's handle lives on a custom domain like eurosky.social. - The Client is re-initialised with the base URL baked in at construction - time, which is the only reliable way to override the internal session - resolver (mirrors create_bsky_client() in twitter2bsky.py). + • 429 / rate-limit → honour Retry-After header; wait up to 600s + • auth errors → fail immediately (retrying won't help) + • network/transient → exponential backoff with jitter + • other errors → exponential backoff with jitter + • exhausted retries → raise so Jenkins marks the build FAILURE """ - normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/") - logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}") + logging.info(f"🔐 Connecting Bluesky client → {base_url}") + client = Client(base_url=base_url) - # Re-initialise the client so the base URL is baked in from the start. - # Setting client.base_url after construction does not reliably override - # the internal session resolver in the atproto SDK. - client.__init__(base_url=normalized_base_url) + attempt = 0 + last_error = None + + while attempt < BSKY_LOGIN_MAX_RETRIES: + attempt += 1 + logging.info( + f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} " + f"for {handle}" + ) - for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1): try: - logging.info( - f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} " - f"for {handle}" - ) - client.login(handle, password) + client.login(handle, app_password) + # Fetch profile to confirm the session is fully live + client.me = client.get_profile(handle) logging.info(f"✅ Bluesky login successful as {handle}") - return True + return client except Exception as e: + last_error = e + err_detail = f"{type(e).__name__}: {e}" - # ── 401 / auth errors — no point retrying ───────────────── + # ── Auth errors: no point retrying ─────────────────────────── if is_auth_error(e): logging.error( - f"❌ Bluesky login failed: invalid handle or app password.\n" - f" Handle : {handle}\n" - f" PDS : {normalized_base_url}\n" - f" Fix : regenerate app password at " - f"https://bsky.app/settings/app-passwords\n" - f" Detail : {repr(e)}" - ) - return False - - # ── Rate limit ───────────────────────────────────────────── - if is_rate_limited_error(e): - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = get_rate_limit_wait_seconds( - e, default_delay=BSKY_LOGIN_BASE_DELAY - ) - wait += random.uniform(0, BSKY_LOGIN_JITTER_MAX) - logging.warning( - f"⏳ Bluesky login rate-limited " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " - f"Retrying in {wait:.1f}s." - ) - time.sleep(wait) - continue - logging.error( - "❌ Exhausted Bluesky login retries due to rate limiting." - ) - return False - - # ── Transient / network errors ───────────────────────────── - if is_network_error(e) or is_transient_error(e): - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = min( - BSKY_LOGIN_BASE_DELAY * attempt, - BSKY_LOGIN_MAX_DELAY, - ) + random.uniform(0, BSKY_LOGIN_JITTER_MAX) - logging.warning( - f"⏳ Transient login failure " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " - f"Retrying in {wait:.1f}s." - ) - time.sleep(wait) - continue - logging.error( - "❌ Exhausted Bluesky login retries after " - "transient/network errors." - ) - return False - - # ── Unexpected error — retry with backoff ────────────────── - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = min( - BSKY_LOGIN_BASE_DELAY * attempt, - BSKY_LOGIN_MAX_DELAY, - ) + random.uniform(0, BSKY_LOGIN_JITTER_MAX) - logging.warning( - f"⏳ Unexpected login error " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " - f"{repr(e)}. Retrying in {wait:.1f}s." - ) - time.sleep(wait) - continue - - logging.error( - f"❌ All Bluesky login attempts failed. Last error: {repr(e)}" - ) - return False - - return False - - -def bsky_get_recent_post_urls(client: Client, handle: str, - limit: int = 50) -> set: - """Return a set of URLs recently posted to Bluesky (to avoid duplicates).""" - urls: set = set() - try: - feed = client.get_author_feed(actor=handle, limit=limit) - for item in feed.feed: - post = item.post - if hasattr(post, "record") and hasattr(post.record, "embed"): - embed = post.record.embed - if hasattr(embed, "external") and hasattr(embed.external, "uri"): - urls.add(embed.external.uri) - if hasattr(post, "record") and hasattr(post.record, "text"): - text = post.record.text - found = re.findall(r"https?://\S+", text) - urls.update(found) - except Exception as e: - logging.warning(f"⚠️ Could not fetch recent Bluesky posts: {e}") - return urls - - -def bsky_upload_blob_with_retry(client: Client, data: bytes, - mime_type: str) -> object: - """Upload a blob to Bluesky with retry + exponential backoff.""" - for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): - try: - resp = client.upload_blob(data) - logging.info( - f"✅ Blob uploaded ({len(data) / 1024 / 1024:.1f} MB) " - f"on attempt {attempt}." - ) - return resp.blob - except Exception as e: - is_rate_limit = is_rate_limited_error(e) - - if attempt == BSKY_UPLOAD_MAX_RETRIES: - logging.error( - f"❌ Blob upload failed after " - f"{BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" + f"❌ Bluesky login auth error (will not retry): {err_detail}" ) raise - delay = min( - BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) - + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), - BSKY_UPLOAD_MAX_DELAY, - ) - if is_rate_limit: - delay = max( - get_rate_limit_wait_seconds(e, default_delay=delay), - 60.0, + # ── Rate-limited (429) ──────────────────────────────────────── + if is_rate_limited_error(e): + raw_wait = get_rate_limit_wait_seconds(e, BSKY_LOGIN_RATE_LIMIT_DELAY) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = min(raw_wait + jitter, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + logging.warning( + f"⏳ Bluesky login rate-limited (attempt {attempt}/" + f"{BSKY_LOGIN_MAX_RETRIES}). " + f"Waiting {wait:.1f}s (server requested {raw_wait:.0f}s)." ) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) + continue + # ── Network / transient errors ──────────────────────────────── + if is_network_error(e) or is_transient_error(e): + delay = min( + BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_LOGIN_MAX_DELAY, + ) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter + logging.warning( + f"⚠️ Bluesky login network/transient error " + f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " + f"{err_detail}. Retrying in {wait:.1f}s." + ) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) + continue + + # ── Unknown errors ──────────────────────────────────────────── + delay = min( + BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_LOGIN_MAX_DELAY, + ) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter logging.warning( - f"⚠️ Blob upload attempt {attempt} failed: {e}. " - f"Retrying in {delay:.1f}s..." + f"⚠️ Bluesky login failed " + f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " + f"{err_detail}. Retrying in {wait:.1f}s." ) - time.sleep(delay) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) - -def bsky_create_post_with_retry(client: Client, text: str, - embed=None, langs=None) -> bool: - """Create a Bluesky post with retry + exponential backoff.""" - for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): - try: - kwargs = {"text": text} - if embed: - kwargs["embed"] = embed - if langs: - kwargs["langs"] = langs - client.send_post(**kwargs) - logging.info(f"✅ Post created on attempt {attempt}.") - return True - except Exception as e: - is_rate_limit = is_rate_limited_error(e) - - if attempt == BSKY_UPLOAD_MAX_RETRIES: - logging.error( - f"❌ Post creation failed after " - f"{BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" - ) - return False - - delay = min( - BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) - + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), - BSKY_UPLOAD_MAX_DELAY, - ) - if is_rate_limit: - delay = max( - get_rate_limit_wait_seconds(e, default_delay=delay), - 60.0, - ) - - logging.warning( - f"⚠️ Post creation attempt {attempt} failed: {e}. " - f"Retrying in {delay:.1f}s..." - ) - time.sleep(delay) - - return False + logging.error( + f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts. " + f"Last error: {type(last_error).__name__}: {last_error}" + ) + raise RuntimeError( + f"Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts: " + f"{last_error}" + ) # ───────────────────────────────────────────────────────────────────────────── -# Video processing helpers +# Video helpers # ───────────────────────────────────────────────────────────────────────────── def get_video_duration(path: str) -> float: - """Return video duration in seconds using ffprobe.""" + """Return video duration in seconds via ffprobe, or 0.0 on failure.""" try: result = subprocess.run( [ @@ -531,51 +504,71 @@ def get_video_duration(path: str) -> float: "-of", "default=noprint_wrappers=1:nokey=1", path, ], - capture_output=True, text=True, timeout=30, + capture_output=True, + text=True, + timeout=15, ) return float(result.stdout.strip()) except Exception as e: - logging.warning(f"⚠️ ffprobe failed: {e}") + logging.warning(f"⚠️ ffprobe failed for {path}: {e}") return 0.0 -def compress_video(input_path: str, output_path: str, - max_duration: int = VIDEO_MAX_DURATION_S, - max_size_bytes: int = VIDEO_MAX_SIZE_BYTES) -> bool: +def compress_video( + input_path: str, + output_path: str, + max_duration: int = VIDEO_MAX_DURATION_S, + max_size_bytes: int = None, +) -> bool: + """ + Re-encode input_path → output_path using libx264, targeting max_size_bytes. + + Fixes applied: + • pad=ceil(iw/2)*2:ceil(ih/2)*2 — ensures even dimensions (libx264 requirement) + • -maxrate == -b:v — hard ceiling, no burst above target + • post-encode size guard — rejects file if still over limit + """ + if max_size_bytes is None: + max_size_bytes = 20 * 1024 * 1024 + try: duration = get_video_duration(input_path) - # Guard: ffprobe returned 0 = file is not a valid video if duration <= 0: logging.error( - f"❌ compress_video: ffprobe returned duration={duration} " - f"— file is not a valid video: {input_path} " - f"({os.path.getsize(input_path)} bytes)" + f"❌ compress_video: invalid duration={duration} " + f"for {input_path} ({os.path.getsize(input_path)} bytes)" ) return False trim_to = min(duration, max_duration) - target_bits = max_size_bytes * 8 * 0.90 - target_kbps = int(target_bits / trim_to / 1000) - video_kbps = max(200, target_kbps - 128) + target_bits = max_size_bytes * 8 * 0.85 + total_kbps = int(target_bits / trim_to / 1000) + audio_kbps = 96 + video_kbps = max(200, total_kbps - audio_kbps) logging.info( f"🎬 Compressing: duration={duration:.1f}s → trim={trim_to:.1f}s, " - f"video_bitrate={video_kbps}k" + f"video_bitrate={video_kbps}k " + f"(target ≤ {max_size_bytes // 1024 // 1024}MB)" ) cmd = [ "ffmpeg", "-y", "-i", input_path, "-t", str(trim_to), - "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease", + "-vf", ( + "scale='min(1280,iw)':'min(720,ih)'" + ":force_original_aspect_ratio=decrease," + "pad=ceil(iw/2)*2:ceil(ih/2)*2" + ), "-c:v", "libx264", "-b:v", f"{video_kbps}k", - "-maxrate", f"{video_kbps * 2}k", - "-bufsize", f"{video_kbps * 4}k", + "-maxrate", f"{video_kbps}k", + "-bufsize", f"{video_kbps * 2}k", "-c:a", "aac", - "-b:a", "128k", + "-b:a", f"{audio_kbps}k", "-movflags", "+faststart", "-pix_fmt", "yuv420p", output_path, @@ -587,697 +580,597 @@ def compress_video(input_path: str, output_path: str, return False final_size = os.path.getsize(output_path) + + if final_size > max_size_bytes: + logging.error( + f"❌ Compressed file still too large: " + f"{final_size / 1024 / 1024:.1f} MB > " + f"{max_size_bytes / 1024 / 1024:.0f} MB limit. Skipping." + ) + return False + logging.info( f"✅ Compressed video: {final_size / 1024 / 1024:.1f} MB → {output_path}" ) return True except Exception as e: - logging.error(f"❌ compress_video error: {e}") + logging.error(f"❌ compress_video error: {type(e).__name__}: {e}") return False -def download_video(url: str, output_path: str, - cookies: list = None) -> bool: +# ───────────────────────────────────────────────────────────────────────────── +# yt-dlp helpers +# ───────────────────────────────────────────────────────────────────────────── +def get_best_impersonation_target() -> str | None: """ - Download a video from a URL (MP4 or M3U8) using httpx or yt-dlp. - Falls back to yt-dlp for HLS streams or when direct download fails. + Dynamically select the best available curl_cffi impersonation target. + Returns None if curl_cffi is not installed or no target is available. """ - # ── Try direct HTTP download first ──────────────────────────────── - if not url.endswith(".m3u8"): - try: - headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" - ), - "Referer": "https://www.tiktok.com/", - } - with httpx.stream("GET", url, headers=headers, - follow_redirects=True, timeout=60) as r: - r.raise_for_status() - with open(output_path, "wb") as f: - for chunk in r.iter_bytes(chunk_size=1024 * 64): - f.write(chunk) - size = os.path.getsize(output_path) - if size > 10_000: - logging.info( - f"✅ Direct download OK: {size / 1024 / 1024:.1f} MB" - ) - return True - logging.warning( - f"⚠️ Direct download too small ({size} bytes), trying yt-dlp..." - ) - except Exception as e: - logging.warning(f"⚠️ Direct download failed: {e}. Trying yt-dlp...") - - # ── Fall back to yt-dlp ──────────────────────────────────────────── - return download_video_ytdlp(url, output_path, cookies=cookies) + try: + from curl_cffi.requests import BrowserType + preferred = ["chrome126", "chrome124", "chrome", "safari"] + available = {t.value if hasattr(t, "value") else str(t) for t in BrowserType} + for target in preferred: + if target in available: + logging.info(f"🎭 yt-dlp impersonation target: {target}") + return target + if available: + target = sorted(available)[0] + logging.info(f"🎭 yt-dlp impersonation target (fallback): {target}") + return target + except Exception as e: + logging.warning(f"⚠️ Could not check impersonation targets: {e}") + return None -def download_video_ytdlp(url: str, output_path: str, - cookies: list = None) -> bool: +def download_video_ytdlp( + url: str, + output_path: str, + netscape_cookies_path: str = None, +) -> bool: """ - Download a video using yt-dlp with TikTok impersonation support. - curl_cffi must be installed for impersonation to work. + Download a TikTok video using yt-dlp with browser impersonation. + Accepts a Netscape-format cookie file path (not JSON). """ - cookie_file = None + impersonate = get_best_impersonation_target() + + ydl_opts = { + "outtmpl": output_path, + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "quiet": False, + "no_warnings": False, + "merge_output_format": "mp4", + } + + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + ydl_opts["cookiefile"] = netscape_cookies_path + + if impersonate: + ydl_opts["impersonate"] = impersonate + try: import yt_dlp - - ydl_opts = { - "outtmpl": output_path, - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "quiet": True, - "no_warnings": False, - "merge_output_format": "mp4", - # ── TikTok impersonation ─────────────────────────────────── - # Requires curl_cffi: pip install curl-cffi - "impersonate": "chrome", - } - - if cookies: - cookie_file = _write_netscape_cookies(cookies) - if cookie_file: - ydl_opts["cookiefile"] = cookie_file - with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) - # Validate: must exist AND be a real video (> 50 KB) - if os.path.exists(output_path): - size = os.path.getsize(output_path) - if size > 50_000: - logging.info( - f"✅ yt-dlp download OK: {size / 1024 / 1024:.1f} MB" - ) - return True - logging.error( - f"❌ yt-dlp output too small ({size} bytes) — " - f"likely an HTML error page, not a video." + if os.path.exists(output_path) and os.path.getsize(output_path) > 50 * 1024: + size_mb = os.path.getsize(output_path) / 1024 / 1024 + logging.info(f"✅ yt-dlp download OK: {size_mb:.1f} MB") + return True + else: + logging.warning( + f"⚠️ yt-dlp output too small or missing: {output_path} " + f"({os.path.getsize(output_path) if os.path.exists(output_path) else 0} bytes)" ) return False - logging.error("❌ yt-dlp produced no output file.") - return False - except Exception as e: - logging.error(f"❌ yt-dlp download failed: {e}") + logging.error( + f"❌ yt-dlp download failed for {url}: {type(e).__name__}: {e}" + ) return False - finally: - if cookie_file and os.path.exists(cookie_file): - os.unlink(cookie_file) -def _write_netscape_cookies(cookies: list) -> str | None: - """Write cookies list to a Netscape-format temp file for yt-dlp.""" + +def download_video( + url: str, + output_path: str, + netscape_cookies_path: str = None, +) -> bool: + """Download a TikTok video via yt-dlp with browser impersonation.""" + logging.info(f"⬇️ Downloading: {url}") + return download_video_ytdlp(url, output_path, netscape_cookies_path=netscape_cookies_path) + + +# ───────────────────────────────────────────────────────────────────────────── +# Bluesky upload +# ───────────────────────────────────────────────────────────────────────────── +def upload_video_to_bluesky( + client: Client, + video_path: str, + video_id: str, +) -> object | None: + """ + Upload a video file to Bluesky as a blob. + All exceptions logged as type(e).__name__: e for full visibility. + """ + size_mb = os.path.getsize(video_path) / 1024 / 1024 + logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") + + with open(video_path, "rb") as f: + video_data = f.read() + + delay = BSKY_UPLOAD_BASE_DELAY + + for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): + try: + blob = client.upload_blob(video_data) + logging.info(f"✅ Blob uploaded successfully for {video_id}") + return blob.blob + + except Exception as e: + err_detail = f"{type(e).__name__}: {e}" + + if attempt >= BSKY_UPLOAD_MAX_RETRIES: + logging.error( + f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: " + f"{err_detail}" + ) + return None + + logging.warning( + f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} " + f"failed: {err_detail}. Retrying in {delay:.1f}s..." + ) + time.sleep(delay + random.uniform(0, BSKY_UPLOAD_JITTER_MAX)) + delay = min(delay * 2, BSKY_UPLOAD_MAX_DELAY) + + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Bluesky post +# ───────────────────────────────────────────────────────────────────────────── +def post_video_to_bluesky( + client: Client, + blob, + caption: str, + langs: list[str], + video_id: str, +) -> bool: + """Create a Bluesky post embedding the uploaded video blob.""" + from atproto import models + try: - fd, path = tempfile.mkstemp(suffix=".txt", prefix="tiktok_cookies_") - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write("# Netscape HTTP Cookie File\n") - for c in cookies: - domain = c.get("domain", ".tiktok.com") - flag = "TRUE" if domain.startswith(".") else "FALSE" - path_val = c.get("path", "/") - secure = "TRUE" if c.get("secure") else "FALSE" - exp = int( - c.get("expirationDate", 0) or c.get("expires", 0) or 0 - ) - name = c.get("name", "") - value = c.get("value", "") - f.write( - f"{domain}\t{flag}\t{path_val}\t{secure}\t" - f"{exp}\t{name}\t{value}\n" - ) - return path + video_embed = models.AppBskyEmbedVideo.Main(video=blob) + client.send_post( + text=caption, + embed=video_embed, + langs=langs, + ) + logging.info(f"✅ Posted video {video_id} to Bluesky.") + return True + except Exception as e: - logging.warning(f"⚠️ Could not write Netscape cookie file: {e}") - return None + logging.error( + f"❌ Failed to post video {video_id} to Bluesky: " + f"{type(e).__name__}: {e}" + ) + return False # ───────────────────────────────────────────────────────────────────────────── -# TikTok scraping via Playwright +# TikTok scraping — Playwright # ───────────────────────────────────────────────────────────────────────────── -def _dismiss_overlays(page): - """Dismiss cookie banners and RGPD modals.""" - for sel in TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS: +def dismiss_overlays(page) -> None: + """Try to dismiss cookie banners and modal overlays.""" + all_sels = TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS + for sel in all_sels: try: el = page.locator(sel).first if el.is_visible(timeout=1500): - el.click(timeout=2000) + el.click(timeout=1500) logging.info(f"🚫 Dismissed overlay: {sel}") time.sleep(0.5) except Exception: pass -def _take_debug_screenshot(page, label: str): - """Save a debug screenshot to workspace.""" - try: - path = f"screenshot_{label}_{int(time.time())}.png" - page.screenshot(path=path) - logging.info(f"📸 Screenshot saved: {path}") - except Exception: - pass +def _run_playwright_scrape_loop(page, profile_url: str, limit: int) -> list[dict]: + """ + Inner scraping loop shared by both the stealth and no-stealth paths. + Returns a list of video dicts. + """ + videos = [] -TIKTOK_GDPR_SELS = [ - 'button:has-text("Entendido")', - 'button:has-text("Understood")', - 'button:has-text("Got it")', - '[class*="gdpr"] button', - '[class*="privacy"] button:has-text("Entendido")', -] - - -def _dismiss_all_overlays(page): - """Dismiss GDPR notices, cookie banners and any other modals.""" - for sel in TIKTOK_GDPR_SELS + TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS: + for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): try: - el = page.locator(sel).first - if el.is_visible(timeout=1500): - el.click(timeout=2000) - logging.info(f"🚫 Dismissed overlay: {sel}") - time.sleep(0.6) - except Exception: - pass + logging.info( + f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) + page.goto( + profile_url, + wait_until="domcontentloaded", + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + time.sleep(3) + dismiss_overlays(page) - -def _try_refresh_grid(page, max_attempts: int = 4) -> bool: - """ - Click the Actualizar / Refresh button up to max_attempts times, - waiting progressively longer each time. - Returns True if the video grid eventually appears. - """ - for i in range(1, max_attempts + 1): - wait_s = 4.0 * i - logging.info( - f"🔄 Grid error detected — clicking Actualizar " - f"(attempt {i}/{max_attempts}, waiting {wait_s:.0f}s)..." - ) - try: - page.locator(TIKTOK_REFRESH_BTN_SEL).first.click(timeout=3000) - except Exception: - pass - time.sleep(wait_s) - _dismiss_all_overlays(page) - try: - page.wait_for_selector(TIKTOK_VIDEO_GRID_SEL, timeout=6000) - logging.info("✅ Video grid appeared after refresh.") - return True - except Exception: - pass - return False - -def _scrape_via_api(handle: str, cookies: list) -> list: - """ - Fallback scraper using yt-dlp to list videos from a TikTok profile. - yt-dlp handles TikTok's request signing internally — no raw API needed. - Returns same list-of-dicts format as the Playwright scraper. - """ - logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...") - - cookie_file = None - videos = [] - - try: - import yt_dlp - - cookie_file = _write_netscape_cookies(cookies) - - ydl_opts = { - "quiet": True, - "no_warnings": False, - "extract_flat": True, # metadata only — no video download yet - "playlistend": SCRAPE_VIDEO_LIMIT, - "ignoreerrors": True, - } - if cookie_file: - ydl_opts["cookiefile"] = cookie_file - - profile_url = f"https://www.tiktok.com/@{handle}" - logging.info(f"🌐 yt-dlp extracting: {profile_url}") - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(profile_url, download=False) - - if not info: - logging.warning("⚠️ yt-dlp returned no info for profile.") - return [] - - entries = info.get("entries") or [] - logging.info( - f"✅ yt-dlp returned {len(entries)} entries " - f"(playlist: {info.get('title', '?')})" - ) - - for entry in entries[:SCRAPE_VIDEO_LIMIT]: try: - if not entry: - continue - - vid_id = str(entry.get("id") or "") - url = ( - entry.get("webpage_url") - or entry.get("url") - or "" - ) - desc = ( - entry.get("title") - or entry.get("description") - or "" + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, ) + except Exception: + pass - # Normalise URL - if vid_id and not url: - url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first + if not grid.is_visible(timeout=5000): + logging.warning(f"⚠️ Video grid not found on attempt {attempt}.") + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_no_grid_{attempt}_{ts}.png") + logging.info( + f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png" + ) + except Exception: + pass + time.sleep(3) + continue - # Extract ID from URL if missing - if not vid_id and url: - m = re.search(r"/video/(\d+)", url) - if m: - vid_id = m.group(1) + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + for item in items[:limit]: + try: + link = item.locator("a").first.get_attribute("href") + if link and "/video/" in link: + vid_match = re.search(r"/video/(\d+)", link) + if vid_match: + video_id = vid_match.group(1) + full_url = ( + link if link.startswith("http") + else f"https://www.tiktok.com{link}" + ) + videos.append({ + "video_id": video_id, + "url": full_url, + "timestamp": None, + }) + except Exception: + pass - if not vid_id: - logging.debug(f"⏭️ Skipping entry with no ID: {entry}") - continue + if videos: + logging.info(f"✅ Playwright scraped {len(videos)} videos.") + break - videos.append({ - "id": vid_id, - "url": url, - "desc": desc, - "timestamp": arrow.utcnow().isoformat(), - "video_url": url, - }) - logging.debug(f" 📹 {vid_id}: {desc[:60]}") - - except Exception as e: - logging.warning(f"⚠️ yt-dlp entry parse error: {e}") - - logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.") - - except Exception as e: - logging.error(f"❌ yt-dlp profile scrape failed: {e}") - - finally: - if cookie_file and os.path.exists(cookie_file): - os.unlink(cookie_file) + except Exception as e: + logging.warning( + f"⚠️ Playwright attempt {attempt} error: " + f"{type(e).__name__}: {e}" + ) + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png") + except Exception: + pass + time.sleep(3) return videos -def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]: + + +def scrape_tiktok_profile_playwright( + handle: str, + cookies: list, + limit: int = SCRAPE_VIDEO_LIMIT, +) -> list[dict]: """ - Extract both the numeric user ID and secUid from the profile page HTML. - Returns (user_id, sec_uid) — either may be None. + Scrape the most recent video URLs from a TikTok profile page using Playwright. + + Stealth handling: + v1.x → stealth_sync(page) after new_page() + v2.x → Stealth() used as context manager; page created inside it + none → plain page, no stealth """ - user_id = None - sec_uid = None - - try: - resp = httpx.get( - f"https://www.tiktok.com/@{handle}", - headers=headers, - timeout=15, - follow_redirects=True, - ) - html = resp.text - - # ── Numeric user ID ──────────────────────────────────────────── - id_patterns = [ - r'"authorId"\s*:\s*"(\d{15,25})"', - r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"', - r'"userId"\s*:\s*"(\d{15,25})"', - r'"uid"\s*:\s*"(\d{15,25})"', - r'"ownerUid"\s*:\s*"(\d{15,25})"', - r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"', - r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"', - ] - for pattern in id_patterns: - m = re.search(pattern, html, re.IGNORECASE) - if m: - user_id = m.group(1) - logging.info(f"✅ Resolved TikTok user ID: {user_id}") - break - - # ── secUid ───────────────────────────────────────────────────── - sec_patterns = [ - r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', - r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"', - ] - for pattern in sec_patterns: - m = re.search(pattern, html, re.IGNORECASE) - if m: - sec_uid = m.group(1) - logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...") - break - - if not user_id and not sec_uid: - # Window search fallback - handle_pos = html.find(f'"uniqueId":"{handle}"') - if handle_pos != -1: - window = html[max(0, handle_pos - 300): handle_pos + 300] - m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window) - if m: - user_id = m.group(1) - logging.info(f"✅ Resolved TikTok user ID (window): {user_id}") - m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window) - if m: - sec_uid = m.group(1) - logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...") - - if not user_id and not sec_uid: - logging.warning( - f"⚠️ Could not resolve any TikTok ID for @{handle}. " - f"HTML length: {len(html)} chars." - ) - - except Exception as e: - logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}") - - return user_id, sec_uid -def scrape_tiktoks_via_playwright(handle: str) -> list: - """ - Scrape recent videos from a public TikTok profile. - Returns a list of dicts: {id, url, desc, timestamp, video_url} - """ - profile_url = f"https://www.tiktok.com/@{handle.lstrip('@')}" - cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) - videos = [] - + profile_url = f"https://www.tiktok.com/@{handle}" logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") + videos = [] + with sync_playwright() as p: browser = p.chromium.launch( headless=True, slow_mo=PLAYWRIGHT_SLOW_MO, args=[ + "--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-setuid-sandbox", - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--disable-gpu", ], ) - context = browser.new_context( user_agent=( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" + "Chrome/126.0.0.0 Safari/537.36" ), viewport={"width": 1280, "height": 900}, locale="es-ES", - timezone_id="Europe/Madrid", ) - if cookies: - inject_cookies_into_context(context, cookies) + inject_cookies_into_context(context, cookies) - page = context.new_page() - - # Stealth mode — compatible with both v1.x and v2.x - if _STEALTH_V2: - Stealth().apply_stealth_sync(page) - else: - stealth_sync(page) - - page.add_init_script(""" - Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); - window.chrome = { runtime: {} }; - Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]}); - Object.defineProperty(navigator, 'languages', {get: () => ['es-ES', 'es', 'en']}); - """) - - grid_loaded = False - - for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): - logging.info( - f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." - ) + # ── Stealth v2.x — page must be created inside the context manager ── + if _STEALTH_V2 is True: try: - page.goto( - profile_url, - wait_until="domcontentloaded", - timeout=PLAYWRIGHT_TIMEOUT_MS, - ) + stealth_instance = Stealth() + with stealth_instance(context) as stealthy_context: + page = stealthy_context.new_page() + logging.info("🥷 playwright-stealth v2.x applied (context manager).") + videos = _run_playwright_scrape_loop(page, profile_url, limit) except Exception as e: - logging.warning(f"⚠️ page.goto failed on attempt {attempt}: {e}") - _take_debug_screenshot(page, f"goto_fail_{attempt}") - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) - continue - break + logging.warning( + f"⚠️ playwright-stealth v2.x failed: {type(e).__name__}: {e}. " + f"Retrying without stealth." + ) + # Fall through to no-stealth path below + page = context.new_page() + videos = _run_playwright_scrape_loop(page, profile_url, limit) - time.sleep(random.uniform(2.5, 4.0)) - - # ── Dismiss ALL overlays including GDPR ──────────────────── - _dismiss_all_overlays(page) - time.sleep(1.5) - - # ── Check for grid error and retry with Actualizar ───────── + # ── Stealth v1.x ────────────────────────────────────────────────── + elif _STEALTH_V2 is False: + page = context.new_page() try: - if page.locator(TIKTOK_GRID_ERROR_SEL).is_visible(timeout=2000): - if _try_refresh_grid(page, max_attempts=4): - grid_loaded = True - break - # Grid still broken — try a full page reload - logging.warning( - "⚠️ Grid still broken after Actualizar retries. " - "Reloading page..." - ) - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) - continue + stealth_sync(page) + logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") + except Exception as e: + logging.warning( + f"⚠️ playwright-stealth v1.x failed: {type(e).__name__}: {e}. " + f"Continuing without stealth." + ) + videos = _run_playwright_scrape_loop(page, profile_url, limit) + + # ── No stealth available ────────────────────────────────────────── + else: + logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.") + page = context.new_page() + videos = _run_playwright_scrape_loop(page, profile_url, limit) + + if not videos: + logging.warning( + f"⚠️ Video grid not found after {PLAYWRIGHT_MAX_RELOADS} attempts." + ) + ts = int(time.time()) + try: + page.screenshot( + path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) except Exception: pass - # ── Wait for video grid normally ─────────────────────────── + for obj in (page, context, browser): try: - page.wait_for_selector( - TIKTOK_VIDEO_GRID_SEL, - timeout=PLAYWRIGHT_TIMEOUT_MS, - ) - logging.info("✅ Video grid found.") - grid_loaded = True - break + if obj: + obj.close() except Exception: - logging.warning( - f"⚠️ Video grid not found on attempt {attempt}." - ) - _take_debug_screenshot(page, f"no_grid_{attempt}") - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) + pass - if not grid_loaded: - logging.warning( - "⚠️ Playwright grid scraping failed. " - "Trying API fallback..." - ) - _take_debug_screenshot(page, "playwright_failed") - browser.close() - # ── API fallback ─────────────────────────────────────────── - return _scrape_via_api(handle, cookies) + return videos - # ── Scroll to load more videos ───────────────────────────────── - logging.info("📜 Scrolling to load videos...") - for _ in range(5): - page.evaluate("window.scrollBy(0, window.innerHeight * 2)") - time.sleep(random.uniform(1.0, 2.0)) - # ── Extract video items ──────────────────────────────────────── - items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() - logging.info(f"📋 Found {len(items)} video items in grid.") +# ───────────────────────────────────────────────────────────────────────────── +# TikTok scraping — yt-dlp fallback +# ───────────────────────────────────────────────────────────────────────────── +def scrape_tiktok_profile_ytdlp( + handle: str, + netscape_cookies_path: str = None, + limit: int = SCRAPE_VIDEO_LIMIT, +) -> list[dict]: + """ + Fallback: use yt-dlp to extract the video list from a TikTok profile. + Accepts a Netscape-format cookie file path (not JSON). + """ + import yt_dlp - for item in items[:SCRAPE_VIDEO_LIMIT]: - try: - link_el = item.locator("a").first - href = link_el.get_attribute("href") or "" - if not href or "/video/" not in href: - continue + profile_url = f"https://www.tiktok.com/@{handle}" + logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...") - if href.startswith("/"): - href = "https://www.tiktok.com" + href + impersonate = get_best_impersonation_target() - vid_match = re.search(r"/video/(\d+)", href) - if not vid_match: - continue - video_id = vid_match.group(1) + ydl_opts = { + "extract_flat": True, + "quiet": True, + "no_warnings": True, + "playlistend": limit, + } + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + ydl_opts["cookiefile"] = netscape_cookies_path + if impersonate: + ydl_opts["impersonate"] = impersonate - desc = "" - try: - desc = item.get_attribute("aria-label") or "" - if not desc: - desc_el = item.locator( - '[class*="desc"], [class*="title"]' - ).first - desc = desc_el.inner_text(timeout=1000).strip() - except Exception: - pass + try: + logging.info(f"🌐 yt-dlp extracting: {profile_url}") + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(profile_url, download=False) + entries = info.get("entries", []) if info else [] + logging.info( + f"✅ yt-dlp returned {len(entries)} entries " + f"(playlist: {info.get('title', '?') if info else '?'})" + ) + + videos = [] + for entry in entries: + if not entry: + continue + url = entry.get("url") or entry.get("webpage_url") or "" + vid_match = re.search(r"/video/(\d+)", url) + if not vid_match: + vid_id = entry.get("id", "") + if vid_id: + url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + vid_match = re.search(r"/video/(\d+)", url) + if vid_match: videos.append({ - "id": video_id, - "url": href, - "desc": desc, - "timestamp": arrow.utcnow().isoformat(), - "video_url": href, + "video_id": vid_match.group(1), + "url": url, + "timestamp": entry.get("timestamp"), }) - except Exception as e: - logging.warning(f"⚠️ Error parsing video item: {e}") + logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.") + return videos[:limit] + + except Exception as e: + logging.error( + f"❌ yt-dlp profile scrape failed: {type(e).__name__}: {e}" + ) + return [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Caption builder +# ───────────────────────────────────────────────────────────────────────────── +def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> str: + """Build a Bluesky post caption from video metadata.""" + desc = (video_info.get("description") or "").strip() + url = video_info.get("url", "") + + if desc: + url_len = len(url) + 1 + max_desc = max_len - url_len + if len(desc) > max_desc: + desc = desc[: max_desc - 1] + "…" + return f"{desc}\n{url}" + + return url + + +# ───────────────────────────────────────────────────────────────────────────── +# Main processing loop +# ───────────────────────────────────────────────────────────────────────────── +def process_videos( + videos: list[dict], + state: dict, + client: Client, + tiktok_handle: str, + netscape_cookies_path: str, + langs: list[str], + max_age_days: int, + video_max_size_bytes: int, +) -> int: + """ + Download, compress, upload and post each new video. + Returns the count of successfully posted videos. + """ + posted_count = 0 + now = arrow.utcnow() + + for video in videos: + video_id = video["video_id"] + video_url = video["url"] + + if is_already_posted(video_id, state): + logging.info(f"⏭️ Already posted: {video_id}") + continue + + ts = video.get("timestamp") + if ts: + try: + video_time = arrow.get(ts) + age_days = (now - video_time).days + if age_days > max_age_days: + logging.info( + f"⏭️ Video {video_id} too old " + f"({age_days}d > {max_age_days}d). Skipping." + ) + continue + except Exception: + pass + + logging.info(f"🎬 Processing video {video_id}: {video_url}") + + with tempfile.TemporaryDirectory() as tmpdir: + raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") + comp_path = os.path.join(tmpdir, f"{video_id}.mp4") + + # 1. Download + ok = download_video( + video_url, + raw_path, + netscape_cookies_path=netscape_cookies_path, + ) + if not ok: + logging.error(f"❌ Download failed for {video_id}. Skipping.") continue - browser.close() - - # ── If Playwright found nothing, try API fallback ────────────────── - if not videos: - logging.warning( - "⚠️ Playwright returned 0 videos. Trying API fallback..." - ) - return _scrape_via_api(handle, cookies) - - logging.info(f"✅ Scraped {len(videos)} videos from @{handle}.") - return videos -# ───────────────────────────────────────────────────────────────────────────── -# Core: process a single TikTok video → post to Bluesky -# ───────────────────────────────────────────────────────────────────────────── -def process_tiktok(video: dict, client: Client, - langs: list, state: dict) -> bool: - """ - Download, compress, and post a single TikTok video to Bluesky. - Returns True if successfully posted. - """ - video_id = video["id"] - video_url = video["url"] - desc = video.get("desc", "") - - # ── Deduplication ────────────────────────────────────────────────── - if is_already_posted(video_id, state): - logging.info(f"⏭️ Skipping already-posted video: {video_id}") - return False - - logging.info(f"🎬 Processing video {video_id}: {video_url}") - - cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) - - with tempfile.TemporaryDirectory() as tmpdir: - raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") - processed_path = os.path.join(tmpdir, f"{video_id}.mp4") - - # ── Download ─────────────────────────────────────────────────── - logging.info(f"⬇️ Downloading: {video_url}") - if not download_video(video_url, raw_path, cookies=cookies): - logging.error(f"❌ Download failed for {video_id}. Skipping.") - return False - - # ── Compress / trim ──────────────────────────────────────────── - if not compress_video(raw_path, processed_path): - logging.error(f"❌ Compression failed for {video_id}. Skipping.") - return False - - # ── Size guard ───────────────────────────────────────────────── - final_size = os.path.getsize(processed_path) - if final_size > VIDEO_MAX_SIZE_BYTES: - logging.error( - f"❌ Compressed video still too large: " - f"{final_size / 1024 / 1024:.1f} MB > " - f"{VIDEO_MAX_SIZE_BYTES / 1024 / 1024:.0f} MB. Skipping." + # 2. Compress + ok = compress_video( + raw_path, + comp_path, + max_size_bytes=video_max_size_bytes, ) - return False + if not ok: + logging.error(f"❌ Compression failed for {video_id}. Skipping.") + continue - # ── Upload to Bluesky ────────────────────────────────────────── - logging.info( - f"⬆️ Uploading to Bluesky " - f"({final_size / 1024 / 1024:.1f} MB)..." - ) - with open(processed_path, "rb") as f: - video_data = f.read() + # 3. Upload blob + blob = upload_video_to_bluesky(client, comp_path, video_id) + if blob is None: + logging.error(f"❌ Blob upload failed for {video_id}.") + continue - try: - blob = bsky_upload_blob_with_retry(client, video_data, "video/mp4") - except Exception as e: - logging.error(f"❌ Blob upload failed for {video_id}: {e}") - return False + # 4. Post + caption = build_caption(video, tiktok_handle) + ok = post_video_to_bluesky(client, blob, caption, langs, video_id) + if ok: + mark_as_posted(video_id, state, meta={"url": video_url}) + posted_count += 1 + time.sleep(random.uniform(2.0, 5.0)) - # ── Build post text ──────────────────────────────────────────── - post_text = desc.strip() if desc else "" - if len(post_text) > 280: - post_text = post_text[:277] + "..." - if not post_text: - post_text = f"🎬 {video_url}" - - # ── Build video embed ────────────────────────────────────────── - try: - from atproto import models - video_embed = models.AppBskyEmbedVideo.Main( - video=blob, - alt=desc[:1000] if desc else "", - ) - except Exception as e: - logging.error(f"❌ Could not build video embed: {e}") - return False - - # ── Create post ──────────────────────────────────────────────── - success = bsky_create_post_with_retry( - client, - text=post_text, - embed=video_embed, - langs=langs, - ) - - if success: - mark_as_posted(video_id, state, { - "tiktok_url": video_url, - "desc": desc[:200] if desc else "", - }) - logging.info(f"✅ Posted video {video_id} to Bluesky.") - return True - - logging.error(f"❌ Failed to post video {video_id} to Bluesky.") - return False + return posted_count # ───────────────────────────────────────────────────────────────────────────── # Entry point # ───────────────────────────────────────────────────────────────────────────── -def main(): - global TIKTOK_COOKIES_PATH # must be first line in function - - load_dotenv() - +def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="TikTok → Bluesky cross-poster" + description="Cross-post TikTok videos to Bluesky." + ) + parser.add_argument("--tiktok-handle", required=True) + parser.add_argument("--bsky-handle", required=True) + parser.add_argument("--bsky-app-password", required=True) + parser.add_argument( + "--bsky-base-url", + default=DEFAULT_BSKY_BASE_URL, + help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})", ) parser.add_argument( - "--tiktok-handle", required=True, - help="TikTok handle to scrape (without @)", + "--bsky-langs", + nargs="+", + default=DEFAULT_BSKY_LANGS, + help="BCP-47 language tags for posts (default: es)", ) parser.add_argument( - "--bsky-handle", required=True, - help="Bluesky handle (e.g. user.bsky.social)", + "--cookies-path", + default=TIKTOK_COOKIES_PATH, + help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})", ) parser.add_argument( - "--bsky-app-password", required=True, - help="Bluesky app password (not account password)", + "--max-age-days", + type=int, + default=VIDEO_MAX_AGE_DAYS, + help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})", ) - parser.add_argument( - "--bsky-base-url", default=DEFAULT_BSKY_BASE_URL, - help=( - "Bluesky AT Protocol PDS base URL. " - "Always https://bsky.social even for custom-domain users " - "(e.g. eurosky.social handles still authenticate via bsky.social). " - f"Default: {DEFAULT_BSKY_BASE_URL}" - ), - ) - parser.add_argument( - "--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS, - help="Post language codes (default: es)", - ) - parser.add_argument( - "--cookies-path", default=TIKTOK_COOKIES_PATH, - help="Path to TikTok cookies JSON file", - ) - args = parser.parse_args() + return parser.parse_args() - # Override global cookie path from CLI - TIKTOK_COOKIES_PATH = args.cookies_path + +def main(): + load_dotenv() + args = parse_args() + + video_max_size_bytes = get_video_size_limit(args.bsky_base_url) logging.info("=" * 60) logging.info("🤖 TikTok→Bluesky bot started") @@ -1285,57 +1178,85 @@ def main(): logging.info(f" Bluesky handle: {args.bsky_handle}") logging.info(f" Bluesky PDS : {args.bsky_base_url}") logging.info(f" Languages : {args.bsky_langs}") - logging.info( - f" Cookie file : {TIKTOK_COOKIES_PATH} " - f"({'✅ found' if os.path.exists(TIKTOK_COOKIES_PATH) else '❌ NOT FOUND'})" - ) + logging.info(f" Video size cap: {video_max_size_bytes // 1024 // 1024} MB") + cookie_status = "✅ found" if os.path.exists(args.cookies_path) else "❌ NOT FOUND" + logging.info(f" Cookie file : {args.cookies_path} ({cookie_status})") logging.info("=" * 60) - state = load_state() + state = load_state() - # Instantiate client — base URL is baked in via bsky_login() - client = Client() - - # ── Bluesky login ────────────────────────────────────────────────── - if not bsky_login( - client, + # Connect to Bluesky + client = connect_bluesky( args.bsky_handle, args.bsky_app_password, args.bsky_base_url, - ): - logging.error("❌ Cannot proceed without Bluesky login. Exiting.") - sys.exit(1) + ) - # ── Scrape TikTok ────────────────────────────────────────────────── - logging.info(f"🔄 Scraping @{args.tiktok_handle}...") - tiktoks = scrape_tiktoks_via_playwright(args.tiktok_handle) + # Convert JSON cookies → Netscape format once for all yt-dlp calls + netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path) + if netscape_cookies_path: + logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}") + else: + logging.warning( + "⚠️ Could not create Netscape cookie file. " + "yt-dlp will run without cookies." + ) - if not tiktoks: - logging.warning("⚠️ No TikTok videos found. Skipping sync.") - logging.info("🤖 Bot finished.") - return + try: + logging.info(f"🔄 Scraping @{args.tiktok_handle}...") + cookies = load_cookies_from_file(args.cookies_path) - logging.info(f"📋 Found {len(tiktoks)} video(s). Processing new ones...") + videos = scrape_tiktok_profile_playwright( + args.tiktok_handle, + cookies, + limit=SCRAPE_VIDEO_LIMIT, + ) - # ── Process each video ───────────────────────────────────────────── - posted = 0 - for tiktok in tiktoks: - try: - if process_tiktok(tiktok, client, args.bsky_langs, state): - posted += 1 - # Polite delay between posts - time.sleep(random.uniform(3.0, 7.0)) - except Exception as e: - logging.error( - f"❌ Unexpected error processing video " - f"{tiktok.get('id', '?')}: {e}" + if not videos: + logging.warning( + "⚠️ Playwright grid scraping failed. Trying yt-dlp fallback..." ) - continue + ts = int(time.time()) + logging.info(f"📸 Screenshot saved: screenshot_playwright_failed_{ts}.png") - logging.info("=" * 60) - logging.info(f"✅ Sync complete. Posted {posted} new video(s).") - logging.info("🤖 Bot finished.") - logging.info("=" * 60) + videos = scrape_tiktok_profile_ytdlp( + args.tiktok_handle, + netscape_cookies_path=netscape_cookies_path, + limit=SCRAPE_VIDEO_LIMIT, + ) + + if not videos: + logging.error("❌ No videos found. Exiting.") + sys.exit(0) + + logging.info(f"📋 Found {len(videos)} video(s). Processing new ones...") + + posted = process_videos( + videos=videos, + state=state, + client=client, + tiktok_handle=args.tiktok_handle, + netscape_cookies_path=netscape_cookies_path, + langs=args.bsky_langs, + max_age_days=args.max_age_days, + video_max_size_bytes=video_max_size_bytes, + ) + + logging.info("=" * 60) + logging.info(f"✅ Sync complete. Posted {posted} new video(s).") + logging.info("🤖 Bot finished.") + logging.info("=" * 60) + + finally: + # Always clean up the temporary Netscape cookie file + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + try: + os.remove(netscape_cookies_path) + logging.info( + f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}" + ) + except Exception as e: + logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}") if __name__ == "__main__":