From 6f3fe078337af2c4879b3c52477f8c4cbe93b0d3 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 15:53:51 +0200 Subject: [PATCH 01/11] Fixes --- jenkins/comedygoldbcnTiktok | 20 +++++++---- tiktok2bsky.py | 68 ++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/jenkins/comedygoldbcnTiktok b/jenkins/comedygoldbcnTiktok index 63d886d..8107b5b 100644 --- a/jenkins/comedygoldbcnTiktok +++ b/jenkins/comedygoldbcnTiktok @@ -57,6 +57,12 @@ pipeline { pip install --upgrade yt-dlp pip show yt-dlp | grep -E "^(Name|Version)" + # ── curl_cffi: TikTok impersonation (REQUIRED) ───── + # Without this yt-dlp cannot bypass TikTok bot detection + pip install --upgrade curl-cffi + pip show curl-cffi | grep -E "^(Name|Version)" + python3 -c "import curl_cffi; print('curl_cffi OK')" + # ── playwright-stealth version check ─────────────── pip show playwright-stealth | grep -E "^(Name|Version)" python3 -c " @@ -69,12 +75,12 @@ except ImportError: " # ── Sanity checks ────────────────────────────────── - python3 -c "import atproto; print('atproto OK')" - python3 -c "import playwright; print('playwright OK')" - python3 -c "import yt_dlp; print('yt_dlp OK')" - python3 -c "import httpx; print('httpx OK')" - python3 -c "import arrow; print('arrow OK')" - python3 -c "import moviepy; print('moviepy OK')" + python3 -c "import atproto; print('atproto OK')" + python3 -c "import playwright; print('playwright OK')" + python3 -c "import yt_dlp; print('yt_dlp OK')" + python3 -c "import httpx; print('httpx OK')" + python3 -c "import arrow; print('arrow OK')" + python3 -c "import moviepy; print('moviepy OK')" # ── System tools ─────────────────────────────────── ffmpeg -version | head -1 @@ -156,4 +162,4 @@ except ImportError: echo '⚠️ TikTok→Bluesky sync finished with warnings.' } } -} +} \ No newline at end of file diff --git a/tiktok2bsky.py b/tiktok2bsky.py index f545eb4..a1de196 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -596,51 +596,20 @@ def compress_video(input_path: str, output_path: str, logging.error(f"❌ compress_video error: {e}") return False - def download_video(url: str, output_path: str, cookies: list = None) -> bool: """ - Download a video from a URL (MP4 or M3U8) using httpx or yt-dlp. - Falls back to yt-dlp for HLS streams or when direct download fails. + Download a TikTok video using yt-dlp with impersonation. + Direct HTTP download is skipped — TikTok always returns HTML + for video page URLs, never a raw MP4. """ - # ── Try direct HTTP download first ──────────────────────────────── - if not url.endswith(".m3u8"): - try: - headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" - ), - "Referer": "https://www.tiktok.com/", - } - with httpx.stream("GET", url, headers=headers, - follow_redirects=True, timeout=60) as r: - r.raise_for_status() - with open(output_path, "wb") as f: - for chunk in r.iter_bytes(chunk_size=1024 * 64): - f.write(chunk) - size = os.path.getsize(output_path) - if size > 10_000: - logging.info( - f"✅ Direct download OK: {size / 1024 / 1024:.1f} MB" - ) - return True - logging.warning( - f"⚠️ Direct download too small ({size} bytes), trying yt-dlp..." - ) - except Exception as e: - logging.warning(f"⚠️ Direct download failed: {e}. Trying yt-dlp...") - - # ── Fall back to yt-dlp ──────────────────────────────────────────── return download_video_ytdlp(url, output_path, cookies=cookies) - def download_video_ytdlp(url: str, output_path: str, cookies: list = None) -> bool: """ - Download a video using yt-dlp with TikTok impersonation support. - curl_cffi must be installed for impersonation to work. + Download a video using yt-dlp with TikTok impersonation. + Requires curl-cffi: pip install curl-cffi """ cookie_file = None try: @@ -652,11 +621,31 @@ def download_video_ytdlp(url: str, output_path: str, "quiet": True, "no_warnings": False, "merge_output_format": "mp4", - # ── TikTok impersonation ─────────────────────────────────── - # Requires curl_cffi: pip install curl-cffi - "impersonate": "chrome", } + # ── Impersonation: try targets in order of preference ────────── + # curl_cffi must be installed: pip install curl-cffi + impersonate_targets = ["chrome126", "chrome124", "chrome", "safari"] + impersonate_set = False + + try: + import yt_dlp.networking.impersonate as _imp + available = {str(t) for t in _imp.ImpersonateTarget.supported_targets()} + for target in impersonate_targets: + if any(target in a for a in available): + ydl_opts["impersonate"] = target + logging.info(f"🎭 yt-dlp impersonation target: {target}") + impersonate_set = True + break + if not impersonate_set: + logging.warning( + f"⚠️ No impersonation target available. " + f"Available: {available}. " + f"Install curl-cffi: pip install curl-cffi" + ) + except Exception as e: + logging.warning(f"⚠️ Could not check impersonation targets: {e}") + if cookies: cookie_file = _write_netscape_cookies(cookies) if cookie_file: @@ -689,6 +678,7 @@ def download_video_ytdlp(url: str, output_path: str, if cookie_file and os.path.exists(cookie_file): os.unlink(cookie_file) + def _write_netscape_cookies(cookies: list) -> str | None: """Write cookies list to a Netscape-format temp file for yt-dlp.""" try: From 367568860a66affc8813c6d79e26429ff94ab1c6 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 15:59:43 +0200 Subject: [PATCH 02/11] compress --- tiktok2bsky.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index a1de196..44fc7e6 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -538,14 +538,12 @@ def get_video_duration(path: str) -> float: logging.warning(f"⚠️ ffprobe failed: {e}") return 0.0 - def compress_video(input_path: str, output_path: str, max_duration: int = VIDEO_MAX_DURATION_S, max_size_bytes: int = VIDEO_MAX_SIZE_BYTES) -> bool: try: duration = get_video_duration(input_path) - # Guard: ffprobe returned 0 = file is not a valid video if duration <= 0: logging.error( f"❌ compress_video: ffprobe returned duration={duration} " @@ -569,7 +567,11 @@ def compress_video(input_path: str, output_path: str, "ffmpeg", "-y", "-i", input_path, "-t", str(trim_to), - "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease", + # Scale to fit within 1280×720, then pad to even dimensions + # The pad filter is required because libx264 needs width/height + # divisible by 2. Portrait TikTok videos (9:16) would otherwise + # produce odd widths like 405px and crash the encoder. + "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease,pad=ceil(iw/2)*2:ceil(ih/2)*2", "-c:v", "libx264", "-b:v", f"{video_kbps}k", "-maxrate", f"{video_kbps * 2}k", @@ -596,6 +598,7 @@ def compress_video(input_path: str, output_path: str, logging.error(f"❌ compress_video error: {e}") return False + def download_video(url: str, output_path: str, cookies: list = None) -> bool: """ From f652b2bf725b46ca18c51aa49be8991ff498189c Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 15:12:24 +0000 Subject: [PATCH 03/11] revert --- jenkins/comedygoldbcnTw | 76 +++++++++++------------------------------ 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/jenkins/comedygoldbcnTw b/jenkins/comedygoldbcnTw index f7e8f2e..86585c9 100644 --- a/jenkins/comedygoldbcnTw +++ b/jenkins/comedygoldbcnTw @@ -23,69 +23,31 @@ pipeline { stage('Setup Python & Install Dependencies') { steps { sh ''' - set -e + set -e # Exit immediately if a command exits with a non-zero status + + # Create a virtual environment named 'venv' + python3 -m venv venv + + # Activate the virtual environment and install dependencies + . venv/bin/activate && \ + pip install --upgrade pip && \ + pip install -U atproto tweety-ns playwright httpx arrow python-dotenv moviepy grapheme + + # Check if moviepy is installed + pip list | grep moviepy || { echo 'MoviePy installation failed!'; exit 1; } + + # Check if FFmpeg is installed + ffmpeg -version || { echo 'FFmpeg is not installed!'; exit 1; } - # ── Create venv ──────────────────────────────────── - python3 -m venv venv + # Verify that moviepy can be imported + python3 -c "import moviepy" || { echo 'MoviePy import failed!'; exit 1; } - # ── Upgrade pip toolchain ────────────────────────── - . venv/bin/activate - pip install --upgrade pip wheel setuptools - - # ── Core dependencies ────────────────────────────── - pip install -U \ - atproto \ - playwright \ - playwright-stealth \ - httpx \ - arrow \ - python-dotenv \ - moviepy \ - beautifulsoup4 \ - charset-normalizer \ - Pillow \ - grapheme - - # ── yt-dlp: always upgrade to latest ────────────── - pip install --upgrade yt-dlp - pip show yt-dlp | grep -E "^(Name|Version)" - - # ── curl_cffi: TikTok impersonation support ──────── - # Required by yt-dlp to bypass TikTok bot detection - pip install --upgrade curl-cffi - pip show curl-cffi | grep -E "^(Name|Version)" - - # ── playwright-stealth version check ─────────────── - pip show playwright-stealth | grep -E "^(Name|Version)" - python3 -c " - try: - from playwright_stealth import stealth_sync - print('playwright_stealth OK (v1.x - stealth_sync)') - except ImportError: - from playwright_stealth import Stealth - print('playwright_stealth OK (v2.x - Stealth class)') -" - - # ── Sanity checks ────────────────────────────────── - python3 -c "import atproto; print('atproto OK')" - python3 -c "import playwright; print('playwright OK')" - python3 -c "import yt_dlp; print('yt_dlp OK')" - python3 -c "import curl_cffi; print('curl_cffi OK')" - python3 -c "import httpx; print('httpx OK')" - python3 -c "import arrow; print('arrow OK')" - python3 -c "import moviepy; print('moviepy OK')" - - # ── System tools ─────────────────────────────────── - ffmpeg -version | head -1 - ffprobe -version | head -1 - - # ── Playwright browser binaries ──────────────────── - playwright install chromium + # Install the local browser binaries for this environment + playwright install chromium ''' } } - stage('Run Script') { steps { // Securely injects Jenkins credentials as environment variables From 2ef0084fe59136905b60ebca6487f97abc360036 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 17:34:10 +0200 Subject: [PATCH 04/11] Fixed Jenkinsfile --- jenkins/comedygoldbcnTiktok | 12 +++--- jenkins/jijantesFCTikTok | 79 ++++++++++++++----------------------- 2 files changed, 36 insertions(+), 55 deletions(-) diff --git a/jenkins/comedygoldbcnTiktok b/jenkins/comedygoldbcnTiktok index 8107b5b..ca020d2 100644 --- a/jenkins/comedygoldbcnTiktok +++ b/jenkins/comedygoldbcnTiktok @@ -66,12 +66,12 @@ pipeline { # ── playwright-stealth version check ─────────────── pip show playwright-stealth | grep -E "^(Name|Version)" python3 -c " -try: - from playwright_stealth import stealth_sync - print('playwright_stealth OK (v1.x - stealth_sync)') -except ImportError: - from playwright_stealth import Stealth - print('playwright_stealth OK (v2.x - Stealth class)') + try: + from playwright_stealth import stealth_sync + print('playwright_stealth OK (v1.x - stealth_sync)') + except ImportError: + from playwright_stealth import Stealth + print('playwright_stealth OK (v2.x - Stealth class)') " # ── Sanity checks ────────────────────────────────── diff --git a/jenkins/jijantesFCTikTok b/jenkins/jijantesFCTikTok index a58f91d..58b5171 100644 --- a/jenkins/jijantesFCTikTok +++ b/jenkins/jijantesFCTikTok @@ -12,12 +12,6 @@ pipeline { cron('H/30 * * * *') } - environment { - VENV_DIR = 'venv' - PIP_CACHE_DIR = "${WORKSPACE}/.pip-cache" - PYTHONUNBUFFERED = '1' - } - stages { // ───────────────────────────────────────────── @@ -35,27 +29,20 @@ pipeline { stage('Setup Python & Install Dependencies') { steps { sh ''' - set -euxo pipefail - - # ── Playwright system dependencies (required in CI) ─ - # Installs libglib, libnss, libatk, libdrm, etc. - # Safe to run even if already installed — exits 0. - "${VENV_DIR}/bin/python" -m playwright install-deps chromium || \ - sudo playwright install-deps chromium || \ - echo "⚠️ playwright install-deps skipped (no sudo) — continuing" + set -e # ── Create venv ──────────────────────────────────── - python3 -m venv "${VENV_DIR}" + python3 -m venv venv # ── Upgrade pip toolchain ────────────────────────── - "${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools + . venv/bin/activate + pip install --upgrade pip wheel setuptools - # ── Install all required packages ────────────────── - "${VENV_DIR}/bin/pip" install \ - --cache-dir "${PIP_CACHE_DIR}" \ - -U \ + # ── Core dependencies ────────────────────────────── + pip install -U \ atproto \ playwright \ + playwright-stealth \ httpx \ arrow \ python-dotenv \ @@ -67,46 +54,40 @@ pipeline { # ── yt-dlp: always upgrade to latest ────────────── # TikTok extractor breaks frequently — latest is required - "${VENV_DIR}/bin/pip" install \ - --cache-dir "${PIP_CACHE_DIR}" \ - --upgrade \ - "yt-dlp" + pip install --upgrade yt-dlp + pip show yt-dlp | grep -E "^(Name|Version)" - # Print installed yt-dlp version for traceability - "${VENV_DIR}/bin/pip" show yt-dlp | grep -E "^(Name|Version)" + # ── curl_cffi: TikTok impersonation (REQUIRED) ───── + # Without this yt-dlp cannot bypass TikTok bot detection + pip install --upgrade curl-cffi + pip show curl-cffi | grep -E "^(Name|Version)" + python3 -c "import curl_cffi; print('curl_cffi OK')" - # ── playwright-stealth ───────────────────────────── - "${VENV_DIR}/bin/pip" install \ - --cache-dir "${PIP_CACHE_DIR}" \ - -U playwright-stealth - - # Print which version was installed for traceability - "${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)" - - # Verify the stealth import works (handles both v1 and v2) - "${VENV_DIR}/bin/python" -c " -try: - from playwright_stealth import stealth_sync - print('playwright_stealth OK (v1.x - stealth_sync)') -except ImportError: - from playwright_stealth import Stealth - print('playwright_stealth OK (v2.x - Stealth class)') + # ── playwright-stealth version check ─────────────── + pip show playwright-stealth | grep -E "^(Name|Version)" + python3 -c " + try: + from playwright_stealth import stealth_sync + print('playwright_stealth OK (v1.x - stealth_sync)') + except ImportError: + from playwright_stealth import Stealth + print('playwright_stealth OK (v2.x - Stealth class)') " # ── Sanity checks ────────────────────────────────── - "${VENV_DIR}/bin/python" -c "import atproto; print('atproto OK')" - "${VENV_DIR}/bin/python" -c "import playwright; print('playwright OK')" - "${VENV_DIR}/bin/python" -c "import yt_dlp; print('yt_dlp OK')" - "${VENV_DIR}/bin/python" -c "import httpx; print('httpx OK')" - "${VENV_DIR}/bin/python" -c "import arrow; print('arrow OK')" - "${VENV_DIR}/bin/python" -c "import moviepy; print('moviepy OK')" + python3 -c "import atproto; print('atproto OK')" + python3 -c "import playwright; print('playwright OK')" + python3 -c "import yt_dlp; print('yt_dlp OK')" + python3 -c "import httpx; print('httpx OK')" + python3 -c "import arrow; print('arrow OK')" + python3 -c "import moviepy; print('moviepy OK')" # ── System tools ─────────────────────────────────── ffmpeg -version | head -1 ffprobe -version | head -1 # ── Playwright browser binaries ──────────────────── - "${VENV_DIR}/bin/python" -m playwright install chromium + playwright install chromium ''' } } From efbd3bdd8a05aef8b3c140ec31deeed064638ef8 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 17:38:23 +0200 Subject: [PATCH 05/11] fixes --- jenkins/jijantesFCTikTok | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/jenkins/jijantesFCTikTok b/jenkins/jijantesFCTikTok index 58b5171..88726d2 100644 --- a/jenkins/jijantesFCTikTok +++ b/jenkins/jijantesFCTikTok @@ -66,12 +66,12 @@ pipeline { # ── playwright-stealth version check ─────────────── pip show playwright-stealth | grep -E "^(Name|Version)" python3 -c " - try: - from playwright_stealth import stealth_sync - print('playwright_stealth OK (v1.x - stealth_sync)') - except ImportError: - from playwright_stealth import Stealth - print('playwright_stealth OK (v2.x - Stealth class)') +try: + from playwright_stealth import stealth_sync + print('playwright_stealth OK (v1.x - stealth_sync)') +except ImportError: + from playwright_stealth import Stealth + print('playwright_stealth OK (v2.x - Stealth class)') " # ── Sanity checks ────────────────────────────────── @@ -116,12 +116,13 @@ pipeline { ) ]) { sh ''' - set -euxo pipefail + set -e # Inject the secret cookies file into the workspace cp "$TIKTOK_COOKIES_FILE" tiktok_cookies.json - "${VENV_DIR}/bin/python" tiktok2bsky.py \ + . venv/bin/activate + python3 tiktok2bsky.py \ --tiktok-handle "$TIKTOK_JIJANTESFC_HANDLE" \ --bsky-handle "$BSKY_JIJANTESFC_HANDLE" \ --bsky-app-password "$BSKY_JIJANTESFC_APP_PASSWORD" \ From ff453bafba7fd6589246a6a9594edf8d571d70de Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 19:02:58 +0200 Subject: [PATCH 06/11] Some fixes --- tiktok2bsky.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 44fc7e6..8cf2be2 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -69,7 +69,7 @@ SCRAPE_VIDEO_LIMIT = 30 VIDEO_MAX_AGE_DAYS = 3 VIDEO_MAX_DURATION_S = 179 # Bluesky hard limit is 180s -VIDEO_MAX_SIZE_BYTES = 45 * 1024 * 1024 # 45 MB +VIDEO_MAX_SIZE_BYTES = 20 * 1024 * 1024 # 20 MB # Bluesky login retry config BSKY_LOGIN_MAX_RETRIES = 4 @@ -546,38 +546,37 @@ def compress_video(input_path: str, output_path: str, if duration <= 0: logging.error( - f"❌ compress_video: ffprobe returned duration={duration} " - f"— file is not a valid video: {input_path} " - f"({os.path.getsize(input_path)} bytes)" + f"❌ compress_video: invalid duration={duration} " + f"for {input_path} ({os.path.getsize(input_path)} bytes)" ) return False trim_to = min(duration, max_duration) - target_bits = max_size_bytes * 8 * 0.90 - target_kbps = int(target_bits / trim_to / 1000) - video_kbps = max(200, target_kbps - 128) + # Target 85% of the size budget to leave headroom for container overhead + target_bits = max_size_bytes * 8 * 0.85 + total_kbps = int(target_bits / trim_to / 1000) + audio_kbps = 96 + video_kbps = max(200, total_kbps - audio_kbps) logging.info( f"🎬 Compressing: duration={duration:.1f}s → trim={trim_to:.1f}s, " - f"video_bitrate={video_kbps}k" + f"video_bitrate={video_kbps}k (target ≤ {max_size_bytes // 1024 // 1024}MB)" ) cmd = [ "ffmpeg", "-y", "-i", input_path, "-t", str(trim_to), - # Scale to fit within 1280×720, then pad to even dimensions - # The pad filter is required because libx264 needs width/height - # divisible by 2. Portrait TikTok videos (9:16) would otherwise - # produce odd widths like 405px and crash the encoder. - "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease,pad=ceil(iw/2)*2:ceil(ih/2)*2", + # Scale to 720p max, pad to even dimensions (required by libx264) + "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease," + "pad=ceil(iw/2)*2:ceil(ih/2)*2", "-c:v", "libx264", "-b:v", f"{video_kbps}k", - "-maxrate", f"{video_kbps * 2}k", - "-bufsize", f"{video_kbps * 4}k", + "-maxrate", f"{video_kbps}k", # hard ceiling — no burst above target + "-bufsize", f"{video_kbps * 2}k", "-c:a", "aac", - "-b:a", "128k", + "-b:a", f"{audio_kbps}k", "-movflags", "+faststart", "-pix_fmt", "yuv420p", output_path, @@ -589,6 +588,16 @@ def compress_video(input_path: str, output_path: str, return False final_size = os.path.getsize(output_path) + + # Reject if still over the hard limit + if final_size > max_size_bytes: + logging.error( + f"❌ Compressed file still too large: " + f"{final_size / 1024 / 1024:.1f} MB > " + f"{max_size_bytes / 1024 / 1024:.0f} MB limit. Skipping." + ) + return False + logging.info( f"✅ Compressed video: {final_size / 1024 / 1024:.1f} MB → {output_path}" ) @@ -598,7 +607,6 @@ def compress_video(input_path: str, output_path: str, logging.error(f"❌ compress_video error: {e}") return False - def download_video(url: str, output_path: str, cookies: list = None) -> bool: """ From c9a9f26c032d125352e1cb41101c68fb661cf869 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 19:52:20 +0200 Subject: [PATCH 07/11] New version --- tiktok2bsky.py | 1347 +++++++++++++++++++----------------------------- 1 file changed, 527 insertions(+), 820 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 8cf2be2..3653541 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -69,7 +69,6 @@ SCRAPE_VIDEO_LIMIT = 30 VIDEO_MAX_AGE_DAYS = 3 VIDEO_MAX_DURATION_S = 179 # Bluesky hard limit is 180s -VIDEO_MAX_SIZE_BYTES = 20 * 1024 * 1024 # 20 MB # Bluesky login retry config BSKY_LOGIN_MAX_RETRIES = 4 @@ -115,6 +114,20 @@ TIKTOK_GRID_ERROR_SEL = '[data-e2e="user-post-item-list-error"]' TIKTOK_REFRESH_BTN_SEL = 'button:has-text("Actualizar"), button:has-text("Refresh")' +# ───────────────────────────────────────────────────────────────────────────── +# Fix 2 — Dynamic video size limit based on PDS +# ───────────────────────────────────────────────────────────────────────────── +def get_video_size_limit(bsky_base_url: str) -> int: + """ + bsky.social supports ~50 MB blobs. Third-party PDS instances + typically cap at 10–20 MB. Use a conservative 10 MB for + anything that isn't the official PDS. + """ + if "bsky.social" in (bsky_base_url or ""): + return 20 * 1024 * 1024 # 20 MB — official PDS + return 10 * 1024 * 1024 # 10 MB — safe for third-party PDS + + # ───────────────────────────────────────────────────────────────────────────── # State management # ───────────────────────────────────────────────────────────────────────────── @@ -208,7 +221,7 @@ def inject_cookies_into_context(context, cookies: list): # ───────────────────────────────────────────────────────────────────────────── -# Bluesky error classification helpers (ported from twitter2bsky.py) +# Bluesky error classification helpers # ───────────────────────────────────────────────────────────────────────────── def is_rate_limited_error(error_obj) -> bool: text = repr(error_obj).lower() @@ -267,7 +280,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: """ Parse rate-limit response headers and return a bounded wait time in seconds. Supports retry-after, x-ratelimit-after, and ratelimit-reset (unix timestamp). - Ported from twitter2bsky.py. """ try: now_ts = int(time.time()) @@ -299,230 +311,61 @@ def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: if m: val = int(m.group(1)) if is_timestamp: - return min( - max(val - int(time.time()) + 1, default_delay), - BSKY_LOGIN_MAX_DELAY, - ) + wait = max(val - int(time.time()) + 1, default_delay) + return min(wait, BSKY_LOGIN_MAX_DELAY) return min(max(val, 1), BSKY_LOGIN_MAX_DELAY) return default_delay # ───────────────────────────────────────────────────────────────────────────── -# Bluesky helpers +# Bluesky client # ───────────────────────────────────────────────────────────────────────────── -def bsky_login(client: Client, handle: str, password: str, - base_url: str = DEFAULT_BSKY_BASE_URL) -> bool: - """ - Authenticate against the AT Protocol PDS. - - base_url is always https://bsky.social for standard Bluesky accounts — - even when the user's handle lives on a custom domain like eurosky.social. - The Client is re-initialised with the base URL baked in at construction - time, which is the only reliable way to override the internal session - resolver (mirrors create_bsky_client() in twitter2bsky.py). - """ - normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/") - logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}") - - # Re-initialise the client so the base URL is baked in from the start. - # Setting client.base_url after construction does not reliably override - # the internal session resolver in the atproto SDK. - client.__init__(base_url=normalized_base_url) +def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: + logging.info(f"🔐 Connecting Bluesky client via base URL: {base_url}") + client = Client(base_url=base_url) for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1): try: - logging.info( - f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} " - f"for {handle}" - ) - client.login(handle, password) + logging.info(f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} for {handle}") + client.login(handle, app_password) + client.me = client.get_profile(handle) logging.info(f"✅ Bluesky login successful as {handle}") - return True - + return client except Exception as e: - - # ── 401 / auth errors — no point retrying ───────────────── - if is_auth_error(e): - logging.error( - f"❌ Bluesky login failed: invalid handle or app password.\n" - f" Handle : {handle}\n" - f" PDS : {normalized_base_url}\n" - f" Fix : regenerate app password at " - f"https://bsky.app/settings/app-passwords\n" - f" Detail : {repr(e)}" - ) - return False - - # ── Rate limit ───────────────────────────────────────────── + logging.warning( + f"⚠️ Bluesky login {type(e).__name__}: {e} (attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES})" + ) if is_rate_limited_error(e): - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = get_rate_limit_wait_seconds( - e, default_delay=BSKY_LOGIN_BASE_DELAY - ) - wait += random.uniform(0, BSKY_LOGIN_JITTER_MAX) - logging.warning( - f"⏳ Bluesky login rate-limited " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " - f"Retrying in {wait:.1f}s." - ) - time.sleep(wait) - continue - logging.error( - "❌ Exhausted Bluesky login retries due to rate limiting." - ) - return False - - # ── Transient / network errors ───────────────────────────── - if is_network_error(e) or is_transient_error(e): - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = min( - BSKY_LOGIN_BASE_DELAY * attempt, - BSKY_LOGIN_MAX_DELAY, - ) + random.uniform(0, BSKY_LOGIN_JITTER_MAX) - logging.warning( - f"⏳ Transient login failure " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " - f"Retrying in {wait:.1f}s." - ) - time.sleep(wait) - continue - logging.error( - "❌ Exhausted Bluesky login retries after " - "transient/network errors." - ) - return False - - # ── Unexpected error — retry with backoff ────────────────── - if attempt < BSKY_LOGIN_MAX_RETRIES: - wait = min( - BSKY_LOGIN_BASE_DELAY * attempt, - BSKY_LOGIN_MAX_DELAY, - ) + random.uniform(0, BSKY_LOGIN_JITTER_MAX) + delay = get_rate_limit_wait_seconds(e, BSKY_LOGIN_BASE_DELAY) + jitter = random.uniform(0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter logging.warning( - f"⏳ Unexpected login error " - f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " - f"{repr(e)}. Retrying in {wait:.1f}s." + f"⏳ Bluesky login rate-limited (attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " + f"Retrying in {wait:.1f}s." ) time.sleep(wait) - continue - - logging.error( - f"❌ All Bluesky login attempts failed. Last error: {repr(e)}" - ) - return False - - return False - - -def bsky_get_recent_post_urls(client: Client, handle: str, - limit: int = 50) -> set: - """Return a set of URLs recently posted to Bluesky (to avoid duplicates).""" - urls: set = set() - try: - feed = client.get_author_feed(actor=handle, limit=limit) - for item in feed.feed: - post = item.post - if hasattr(post, "record") and hasattr(post.record, "embed"): - embed = post.record.embed - if hasattr(embed, "external") and hasattr(embed.external, "uri"): - urls.add(embed.external.uri) - if hasattr(post, "record") and hasattr(post.record, "text"): - text = post.record.text - found = re.findall(r"https?://\S+", text) - urls.update(found) - except Exception as e: - logging.warning(f"⚠️ Could not fetch recent Bluesky posts: {e}") - return urls - - -def bsky_upload_blob_with_retry(client: Client, data: bytes, - mime_type: str) -> object: - """Upload a blob to Bluesky with retry + exponential backoff.""" - for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): - try: - resp = client.upload_blob(data) - logging.info( - f"✅ Blob uploaded ({len(data) / 1024 / 1024:.1f} MB) " - f"on attempt {attempt}." - ) - return resp.blob - except Exception as e: - is_rate_limit = is_rate_limited_error(e) - - if attempt == BSKY_UPLOAD_MAX_RETRIES: - logging.error( - f"❌ Blob upload failed after " - f"{BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" + elif attempt < BSKY_LOGIN_MAX_RETRIES: + delay = min( + BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_LOGIN_MAX_DELAY, ) + jitter = random.uniform(0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter + logging.warning(f"⏳ Retrying login in {wait:.1f}s.") + time.sleep(wait) + else: + logging.error(f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts.") raise - delay = min( - BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) - + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), - BSKY_UPLOAD_MAX_DELAY, - ) - if is_rate_limit: - delay = max( - get_rate_limit_wait_seconds(e, default_delay=delay), - 60.0, - ) - - logging.warning( - f"⚠️ Blob upload attempt {attempt} failed: {e}. " - f"Retrying in {delay:.1f}s..." - ) - time.sleep(delay) - - -def bsky_create_post_with_retry(client: Client, text: str, - embed=None, langs=None) -> bool: - """Create a Bluesky post with retry + exponential backoff.""" - for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): - try: - kwargs = {"text": text} - if embed: - kwargs["embed"] = embed - if langs: - kwargs["langs"] = langs - client.send_post(**kwargs) - logging.info(f"✅ Post created on attempt {attempt}.") - return True - except Exception as e: - is_rate_limit = is_rate_limited_error(e) - - if attempt == BSKY_UPLOAD_MAX_RETRIES: - logging.error( - f"❌ Post creation failed after " - f"{BSKY_UPLOAD_MAX_RETRIES} attempts: {e}" - ) - return False - - delay = min( - BSKY_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)) - + random.uniform(0, BSKY_UPLOAD_JITTER_MAX), - BSKY_UPLOAD_MAX_DELAY, - ) - if is_rate_limit: - delay = max( - get_rate_limit_wait_seconds(e, default_delay=delay), - 60.0, - ) - - logging.warning( - f"⚠️ Post creation attempt {attempt} failed: {e}. " - f"Retrying in {delay:.1f}s..." - ) - time.sleep(delay) - - return False + raise RuntimeError("Bluesky login failed: exhausted all retries.") # ───────────────────────────────────────────────────────────────────────────── -# Video processing helpers +# Video helpers # ───────────────────────────────────────────────────────────────────────────── def get_video_duration(path: str) -> float: - """Return video duration in seconds using ffprobe.""" + """Return video duration in seconds via ffprobe, or 0.0 on failure.""" try: result = subprocess.run( [ @@ -531,16 +374,33 @@ def get_video_duration(path: str) -> float: "-of", "default=noprint_wrappers=1:nokey=1", path, ], - capture_output=True, text=True, timeout=30, + capture_output=True, + text=True, + timeout=15, ) return float(result.stdout.strip()) except Exception as e: - logging.warning(f"⚠️ ffprobe failed: {e}") + logging.warning(f"⚠️ ffprobe failed for {path}: {e}") return 0.0 -def compress_video(input_path: str, output_path: str, - max_duration: int = VIDEO_MAX_DURATION_S, - max_size_bytes: int = VIDEO_MAX_SIZE_BYTES) -> bool: + +def compress_video( + input_path: str, + output_path: str, + max_duration: int = VIDEO_MAX_DURATION_S, + max_size_bytes: int = None, # resolved at call-time from get_video_size_limit() +) -> bool: + """ + Re-encode input_path → output_path using libx264, targeting max_size_bytes. + + Key fixes applied: + • pad=ceil(iw/2)*2:ceil(ih/2)*2 — ensures even dimensions (libx264 requirement) + • -maxrate == -b:v — hard ceiling, no burst above target + • post-encode size guard — rejects file if still over limit + """ + if max_size_bytes is None: + max_size_bytes = 20 * 1024 * 1024 # fallback + try: duration = get_video_duration(input_path) @@ -554,23 +414,30 @@ def compress_video(input_path: str, output_path: str, trim_to = min(duration, max_duration) # Target 85% of the size budget to leave headroom for container overhead - target_bits = max_size_bytes * 8 * 0.85 - total_kbps = int(target_bits / trim_to / 1000) - audio_kbps = 96 - video_kbps = max(200, total_kbps - audio_kbps) + target_bits = max_size_bytes * 8 * 0.85 + total_kbps = int(target_bits / trim_to / 1000) + audio_kbps = 96 + video_kbps = max(200, total_kbps - audio_kbps) logging.info( f"🎬 Compressing: duration={duration:.1f}s → trim={trim_to:.1f}s, " - f"video_bitrate={video_kbps}k (target ≤ {max_size_bytes // 1024 // 1024}MB)" + f"video_bitrate={video_kbps}k " + f"(target ≤ {max_size_bytes // 1024 // 1024}MB)" ) cmd = [ "ffmpeg", "-y", "-i", input_path, "-t", str(trim_to), - # Scale to 720p max, pad to even dimensions (required by libx264) - "-vf", "scale='min(1280,iw)':'min(720,ih)':force_original_aspect_ratio=decrease," - "pad=ceil(iw/2)*2:ceil(ih/2)*2", + # Scale to 720p max, then pad to even dimensions. + # The pad filter is required because libx264 needs width/height + # divisible by 2. Portrait TikTok videos (9:16) would otherwise + # produce odd widths like 405px and crash the encoder. + "-vf", ( + "scale='min(1280,iw)':'min(720,ih)'" + ":force_original_aspect_ratio=decrease," + "pad=ceil(iw/2)*2:ceil(ih/2)*2" + ), "-c:v", "libx264", "-b:v", f"{video_kbps}k", "-maxrate", f"{video_kbps}k", # hard ceiling — no burst above target @@ -607,678 +474,504 @@ def compress_video(input_path: str, output_path: str, logging.error(f"❌ compress_video error: {e}") return False -def download_video(url: str, output_path: str, - cookies: list = None) -> bool: - """ - Download a TikTok video using yt-dlp with impersonation. - Direct HTTP download is skipped — TikTok always returns HTML - for video page URLs, never a raw MP4. - """ - return download_video_ytdlp(url, output_path, cookies=cookies) -def download_video_ytdlp(url: str, output_path: str, - cookies: list = None) -> bool: +# ───────────────────────────────────────────────────────────────────────────── +# yt-dlp download +# ───────────────────────────────────────────────────────────────────────────── +def get_best_impersonation_target() -> str | None: """ - Download a video using yt-dlp with TikTok impersonation. - Requires curl-cffi: pip install curl-cffi + Dynamically select the best available curl_cffi impersonation target. + Returns None if curl_cffi is not installed or no target is available. """ - cookie_file = None + try: + from curl_cffi.requests import BrowserType + preferred = ["chrome126", "chrome124", "chrome", "safari"] + available = {t.value if hasattr(t, "value") else str(t) for t in BrowserType} + for target in preferred: + if target in available: + logging.info(f"🎭 yt-dlp impersonation target: {target}") + return target + # fallback: return first available + if available: + target = sorted(available)[0] + logging.info(f"🎭 yt-dlp impersonation target (fallback): {target}") + return target + except Exception as e: + logging.warning(f"⚠️ Could not check impersonation targets: {e}") + return None + + +def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) -> bool: + """ + Download a TikTok video using yt-dlp with browser impersonation. + Returns True on success, False on failure. + """ + impersonate = get_best_impersonation_target() + + ydl_opts = { + "outtmpl": output_path, + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "quiet": False, + "no_warnings": False, + "merge_output_format": "mp4", + } + + if cookies_path and os.path.exists(cookies_path): + ydl_opts["cookiefile"] = cookies_path + + if impersonate: + ydl_opts["impersonate"] = impersonate + try: import yt_dlp - - ydl_opts = { - "outtmpl": output_path, - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "quiet": True, - "no_warnings": False, - "merge_output_format": "mp4", - } - - # ── Impersonation: try targets in order of preference ────────── - # curl_cffi must be installed: pip install curl-cffi - impersonate_targets = ["chrome126", "chrome124", "chrome", "safari"] - impersonate_set = False - - try: - import yt_dlp.networking.impersonate as _imp - available = {str(t) for t in _imp.ImpersonateTarget.supported_targets()} - for target in impersonate_targets: - if any(target in a for a in available): - ydl_opts["impersonate"] = target - logging.info(f"🎭 yt-dlp impersonation target: {target}") - impersonate_set = True - break - if not impersonate_set: - logging.warning( - f"⚠️ No impersonation target available. " - f"Available: {available}. " - f"Install curl-cffi: pip install curl-cffi" - ) - except Exception as e: - logging.warning(f"⚠️ Could not check impersonation targets: {e}") - - if cookies: - cookie_file = _write_netscape_cookies(cookies) - if cookie_file: - ydl_opts["cookiefile"] = cookie_file - with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url]) - # Validate: must exist AND be a real video (> 50 KB) - if os.path.exists(output_path): - size = os.path.getsize(output_path) - if size > 50_000: - logging.info( - f"✅ yt-dlp download OK: {size / 1024 / 1024:.1f} MB" - ) - return True - logging.error( - f"❌ yt-dlp output too small ({size} bytes) — " - f"likely an HTML error page, not a video." + if os.path.exists(output_path) and os.path.getsize(output_path) > 50 * 1024: + size_mb = os.path.getsize(output_path) / 1024 / 1024 + logging.info(f"✅ yt-dlp download OK: {size_mb:.1f} MB") + return True + else: + logging.warning( + f"⚠️ yt-dlp output too small or missing: {output_path} " + f"({os.path.getsize(output_path) if os.path.exists(output_path) else 0} bytes)" ) return False - logging.error("❌ yt-dlp produced no output file.") - return False - except Exception as e: - logging.error(f"❌ yt-dlp download failed: {e}") + logging.error(f"❌ yt-dlp download failed for {url}: {type(e).__name__}: {e}") return False - finally: - if cookie_file and os.path.exists(cookie_file): - os.unlink(cookie_file) -def _write_netscape_cookies(cookies: list) -> str | None: - """Write cookies list to a Netscape-format temp file for yt-dlp.""" +def download_video(url: str, output_path: str, cookies_path: str = None) -> bool: + """ + Download a TikTok video. Routes directly to yt-dlp with browser impersonation. + """ + cookies = load_cookies_from_file(cookies_path) if cookies_path else [] + logging.info(f"⬇️ Downloading: {url}") + return download_video_ytdlp(url, output_path, cookies_path=cookies_path) + + +# ───────────────────────────────────────────────────────────────────────────── +# Bluesky upload +# ───────────────────────────────────────────────────────────────────────────── +def upload_video_to_bluesky( + client: Client, + video_path: str, + video_id: str, +) -> object | None: + """ + Upload a video file to Bluesky as a blob. + + Fix 1 applied: exception is logged as type(e).__name__: e + so the actual error (413, 403, network error, etc.) is always visible. + """ + size_mb = os.path.getsize(video_path) / 1024 / 1024 + logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") + + with open(video_path, "rb") as f: + video_data = f.read() + + delay = BSKY_UPLOAD_BASE_DELAY + + for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1): + try: + blob = client.upload_blob(video_data) + logging.info(f"✅ Blob uploaded successfully for {video_id}") + return blob.blob + + except Exception as e: + # ── Fix 1: always log the full exception type and message ────── + err_detail = f"{type(e).__name__}: {e}" + logging.warning( + f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} " + f"failed: {err_detail}. Retrying in {delay:.1f}s..." + ) + + if attempt >= BSKY_UPLOAD_MAX_RETRIES: + logging.error( + f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: " + f"{err_detail}" + ) + return None + + time.sleep(delay + random.uniform(0, BSKY_UPLOAD_JITTER_MAX)) + delay = min(delay * 2, BSKY_UPLOAD_MAX_DELAY) + + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Bluesky post +# ───────────────────────────────────────────────────────────────────────────── +def post_video_to_bluesky( + client: Client, + blob, + caption: str, + langs: list[str], + video_id: str, +) -> bool: + """Create a Bluesky post embedding the uploaded video blob.""" + from atproto import models + try: - fd, path = tempfile.mkstemp(suffix=".txt", prefix="tiktok_cookies_") - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write("# Netscape HTTP Cookie File\n") - for c in cookies: - domain = c.get("domain", ".tiktok.com") - flag = "TRUE" if domain.startswith(".") else "FALSE" - path_val = c.get("path", "/") - secure = "TRUE" if c.get("secure") else "FALSE" - exp = int( - c.get("expirationDate", 0) or c.get("expires", 0) or 0 - ) - name = c.get("name", "") - value = c.get("value", "") - f.write( - f"{domain}\t{flag}\t{path_val}\t{secure}\t" - f"{exp}\t{name}\t{value}\n" - ) - return path + video_embed = models.AppBskyEmbedVideo.Main( + video=blob, + ) + + client.send_post( + text=caption, + embed=video_embed, + langs=langs, + ) + logging.info(f"✅ Posted video {video_id} to Bluesky.") + return True + except Exception as e: - logging.warning(f"⚠️ Could not write Netscape cookie file: {e}") - return None + logging.error( + f"❌ Failed to post video {video_id} to Bluesky: " + f"{type(e).__name__}: {e}" + ) + return False # ───────────────────────────────────────────────────────────────────────────── -# TikTok scraping via Playwright +# TikTok scraping — Playwright # ───────────────────────────────────────────────────────────────────────────── -def _dismiss_overlays(page): - """Dismiss cookie banners and RGPD modals.""" - for sel in TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS: +def dismiss_overlays(page) -> None: + """Try to dismiss cookie banners and modal overlays.""" + all_sels = TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS + for sel in all_sels: try: el = page.locator(sel).first if el.is_visible(timeout=1500): - el.click(timeout=2000) + el.click(timeout=1500) logging.info(f"🚫 Dismissed overlay: {sel}") time.sleep(0.5) except Exception: pass -def _take_debug_screenshot(page, label: str): - """Save a debug screenshot to workspace.""" - try: - path = f"screenshot_{label}_{int(time.time())}.png" - page.screenshot(path=path) - logging.info(f"📸 Screenshot saved: {path}") - except Exception: - pass - -TIKTOK_GDPR_SELS = [ - 'button:has-text("Entendido")', - 'button:has-text("Understood")', - 'button:has-text("Got it")', - '[class*="gdpr"] button', - '[class*="privacy"] button:has-text("Entendido")', -] - - -def _dismiss_all_overlays(page): - """Dismiss GDPR notices, cookie banners and any other modals.""" - for sel in TIKTOK_GDPR_SELS + TIKTOK_COOKIE_MODAL_SELS + TIKTOK_BANNER_SELS: - try: - el = page.locator(sel).first - if el.is_visible(timeout=1500): - el.click(timeout=2000) - logging.info(f"🚫 Dismissed overlay: {sel}") - time.sleep(0.6) - except Exception: - pass - - -def _try_refresh_grid(page, max_attempts: int = 4) -> bool: +def scrape_tiktok_profile_playwright( + handle: str, + cookies: list, + limit: int = SCRAPE_VIDEO_LIMIT, +) -> list[dict]: """ - Click the Actualizar / Refresh button up to max_attempts times, - waiting progressively longer each time. - Returns True if the video grid eventually appears. + Scrape the most recent video URLs from a TikTok profile page using Playwright. + Returns a list of dicts with keys: video_id, url, timestamp. """ - for i in range(1, max_attempts + 1): - wait_s = 4.0 * i - logging.info( - f"🔄 Grid error detected — clicking Actualizar " - f"(attempt {i}/{max_attempts}, waiting {wait_s:.0f}s)..." - ) - try: - page.locator(TIKTOK_REFRESH_BTN_SEL).first.click(timeout=3000) - except Exception: - pass - time.sleep(wait_s) - _dismiss_all_overlays(page) - try: - page.wait_for_selector(TIKTOK_VIDEO_GRID_SEL, timeout=6000) - logging.info("✅ Video grid appeared after refresh.") - return True - except Exception: - pass - return False - -def _scrape_via_api(handle: str, cookies: list) -> list: - """ - Fallback scraper using yt-dlp to list videos from a TikTok profile. - yt-dlp handles TikTok's request signing internally — no raw API needed. - Returns same list-of-dicts format as the Playwright scraper. - """ - logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...") - - cookie_file = None - videos = [] - - try: - import yt_dlp - - cookie_file = _write_netscape_cookies(cookies) - - ydl_opts = { - "quiet": True, - "no_warnings": False, - "extract_flat": True, # metadata only — no video download yet - "playlistend": SCRAPE_VIDEO_LIMIT, - "ignoreerrors": True, - } - if cookie_file: - ydl_opts["cookiefile"] = cookie_file - - profile_url = f"https://www.tiktok.com/@{handle}" - logging.info(f"🌐 yt-dlp extracting: {profile_url}") - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(profile_url, download=False) - - if not info: - logging.warning("⚠️ yt-dlp returned no info for profile.") - return [] - - entries = info.get("entries") or [] - logging.info( - f"✅ yt-dlp returned {len(entries)} entries " - f"(playlist: {info.get('title', '?')})" - ) - - for entry in entries[:SCRAPE_VIDEO_LIMIT]: - try: - if not entry: - continue - - vid_id = str(entry.get("id") or "") - url = ( - entry.get("webpage_url") - or entry.get("url") - or "" - ) - desc = ( - entry.get("title") - or entry.get("description") - or "" - ) - - # Normalise URL - if vid_id and not url: - url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" - - # Extract ID from URL if missing - if not vid_id and url: - m = re.search(r"/video/(\d+)", url) - if m: - vid_id = m.group(1) - - if not vid_id: - logging.debug(f"⏭️ Skipping entry with no ID: {entry}") - continue - - videos.append({ - "id": vid_id, - "url": url, - "desc": desc, - "timestamp": arrow.utcnow().isoformat(), - "video_url": url, - }) - logging.debug(f" 📹 {vid_id}: {desc[:60]}") - - except Exception as e: - logging.warning(f"⚠️ yt-dlp entry parse error: {e}") - - logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.") - - except Exception as e: - logging.error(f"❌ yt-dlp profile scrape failed: {e}") - - finally: - if cookie_file and os.path.exists(cookie_file): - os.unlink(cookie_file) - - return videos -def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]: - """ - Extract both the numeric user ID and secUid from the profile page HTML. - Returns (user_id, sec_uid) — either may be None. - """ - user_id = None - sec_uid = None - - try: - resp = httpx.get( - f"https://www.tiktok.com/@{handle}", - headers=headers, - timeout=15, - follow_redirects=True, - ) - html = resp.text - - # ── Numeric user ID ──────────────────────────────────────────── - id_patterns = [ - r'"authorId"\s*:\s*"(\d{15,25})"', - r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"', - r'"userId"\s*:\s*"(\d{15,25})"', - r'"uid"\s*:\s*"(\d{15,25})"', - r'"ownerUid"\s*:\s*"(\d{15,25})"', - r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"', - r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"', - ] - for pattern in id_patterns: - m = re.search(pattern, html, re.IGNORECASE) - if m: - user_id = m.group(1) - logging.info(f"✅ Resolved TikTok user ID: {user_id}") - break - - # ── secUid ───────────────────────────────────────────────────── - sec_patterns = [ - r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', - r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"', - ] - for pattern in sec_patterns: - m = re.search(pattern, html, re.IGNORECASE) - if m: - sec_uid = m.group(1) - logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...") - break - - if not user_id and not sec_uid: - # Window search fallback - handle_pos = html.find(f'"uniqueId":"{handle}"') - if handle_pos != -1: - window = html[max(0, handle_pos - 300): handle_pos + 300] - m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window) - if m: - user_id = m.group(1) - logging.info(f"✅ Resolved TikTok user ID (window): {user_id}") - m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window) - if m: - sec_uid = m.group(1) - logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...") - - if not user_id and not sec_uid: - logging.warning( - f"⚠️ Could not resolve any TikTok ID for @{handle}. " - f"HTML length: {len(html)} chars." - ) - - except Exception as e: - logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}") - - return user_id, sec_uid -def scrape_tiktoks_via_playwright(handle: str) -> list: - """ - Scrape recent videos from a public TikTok profile. - Returns a list of dicts: {id, url, desc, timestamp, video_url} - """ - profile_url = f"https://www.tiktok.com/@{handle.lstrip('@')}" - cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) - videos = [] - + profile_url = f"https://www.tiktok.com/@{handle}" logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") + videos = [] + with sync_playwright() as p: browser = p.chromium.launch( headless=True, slow_mo=PLAYWRIGHT_SLOW_MO, args=[ + "--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-setuid-sandbox", - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--disable-gpu", ], ) - context = browser.new_context( user_agent=( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" + "Chrome/126.0.0.0 Safari/537.36" ), viewport={"width": 1280, "height": 900}, locale="es-ES", - timezone_id="Europe/Madrid", ) - if cookies: - inject_cookies_into_context(context, cookies) + inject_cookies_into_context(context, cookies) page = context.new_page() - # Stealth mode — compatible with both v1.x and v2.x if _STEALTH_V2: - Stealth().apply_stealth_sync(page) + Stealth().apply(page) else: stealth_sync(page) - page.add_init_script(""" - Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); - window.chrome = { runtime: {} }; - Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]}); - Object.defineProperty(navigator, 'languages', {get: () => ['es-ES', 'es', 'en']}); - """) - - grid_loaded = False - for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): - logging.info( - f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." - ) try: + logging.info(f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})...") page.goto( profile_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_TIMEOUT_MS, ) - except Exception as e: - logging.warning(f"⚠️ page.goto failed on attempt {attempt}: {e}") - _take_debug_screenshot(page, f"goto_fail_{attempt}") - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) - continue - break + time.sleep(3) + dismiss_overlays(page) - time.sleep(random.uniform(2.5, 4.0)) - - # ── Dismiss ALL overlays including GDPR ──────────────────── - _dismiss_all_overlays(page) - time.sleep(1.5) - - # ── Check for grid error and retry with Actualizar ───────── - try: - if page.locator(TIKTOK_GRID_ERROR_SEL).is_visible(timeout=2000): - if _try_refresh_grid(page, max_attempts=4): - grid_loaded = True - break - # Grid still broken — try a full page reload - logging.warning( - "⚠️ Grid still broken after Actualizar retries. " - "Reloading page..." - ) - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) - continue - except Exception: - pass - - # ── Wait for video grid normally ─────────────────────────── - try: - page.wait_for_selector( - TIKTOK_VIDEO_GRID_SEL, - timeout=PLAYWRIGHT_TIMEOUT_MS, - ) - logging.info("✅ Video grid found.") - grid_loaded = True - break - except Exception: - logging.warning( - f"⚠️ Video grid not found on attempt {attempt}." - ) - _take_debug_screenshot(page, f"no_grid_{attempt}") - if attempt < PLAYWRIGHT_MAX_RELOADS: - time.sleep(3.0) - - if not grid_loaded: - logging.warning( - "⚠️ Playwright grid scraping failed. " - "Trying API fallback..." - ) - _take_debug_screenshot(page, "playwright_failed") - browser.close() - # ── API fallback ─────────────────────────────────────────── - return _scrape_via_api(handle, cookies) - - # ── Scroll to load more videos ───────────────────────────────── - logging.info("📜 Scrolling to load videos...") - for _ in range(5): - page.evaluate("window.scrollBy(0, window.innerHeight * 2)") - time.sleep(random.uniform(1.0, 2.0)) - - # ── Extract video items ──────────────────────────────────────── - items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() - logging.info(f"📋 Found {len(items)} video items in grid.") - - for item in items[:SCRAPE_VIDEO_LIMIT]: - try: - link_el = item.locator("a").first - href = link_el.get_attribute("href") or "" - if not href or "/video/" not in href: - continue - - if href.startswith("/"): - href = "https://www.tiktok.com" + href - - vid_match = re.search(r"/video/(\d+)", href) - if not vid_match: - continue - video_id = vid_match.group(1) - - desc = "" + # Wait for video grid try: - desc = item.get_attribute("aria-label") or "" - if not desc: - desc_el = item.locator( - '[class*="desc"], [class*="title"]' - ).first - desc = desc_el.inner_text(timeout=1000).strip() + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) except Exception: pass - videos.append({ - "id": video_id, - "url": href, - "desc": desc, - "timestamp": arrow.utcnow().isoformat(), - "video_url": href, - }) + grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first + if not grid.is_visible(timeout=5000): + logging.warning(f"⚠️ Video grid not found on attempt {attempt}.") + ts = int(time.time()) + page.screenshot(path=f"screenshot_no_grid_{attempt}_{ts}.png") + logging.info(f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png") + time.sleep(3) + continue + + # Extract video links + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + for item in items[:limit]: + try: + link = item.locator("a").first.get_attribute("href") + if link and "/video/" in link: + vid_match = re.search(r"/video/(\d+)", link) + if vid_match: + video_id = vid_match.group(1) + full_url = ( + link if link.startswith("http") + else f"https://www.tiktok.com{link}" + ) + videos.append({ + "video_id": video_id, + "url": full_url, + "timestamp": None, + }) + except Exception: + pass + + if videos: + logging.info(f"✅ Playwright scraped {len(videos)} videos.") + break except Exception as e: - logging.warning(f"⚠️ Error parsing video item: {e}") - continue + logging.warning(f"⚠️ Playwright attempt {attempt} error: {type(e).__name__}: {e}") + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png") + except Exception: + pass + time.sleep(3) + if not videos: + logging.warning("⚠️ Video grid not found on attempt 3.") + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_no_grid_3_{ts}.png") + logging.info(f"📸 Screenshot saved: screenshot_no_grid_3_{ts}.png") + except Exception: + pass + + page.close() + context.close() browser.close() - # ── If Playwright found nothing, try API fallback ────────────────── - if not videos: - logging.warning( - "⚠️ Playwright returned 0 videos. Trying API fallback..." - ) - return _scrape_via_api(handle, cookies) - - logging.info(f"✅ Scraped {len(videos)} videos from @{handle}.") return videos + + # ───────────────────────────────────────────────────────────────────────────── -# Core: process a single TikTok video → post to Bluesky +# TikTok scraping — yt-dlp fallback # ───────────────────────────────────────────────────────────────────────────── -def process_tiktok(video: dict, client: Client, - langs: list, state: dict) -> bool: +def scrape_tiktok_profile_ytdlp( + handle: str, + cookies_path: str = None, + limit: int = SCRAPE_VIDEO_LIMIT, +) -> list[dict]: """ - Download, compress, and post a single TikTok video to Bluesky. - Returns True if successfully posted. + Fallback: use yt-dlp to extract the video list from a TikTok profile. + Returns a list of dicts with keys: video_id, url, timestamp. """ - video_id = video["id"] - video_url = video["url"] - desc = video.get("desc", "") + import yt_dlp - # ── Deduplication ────────────────────────────────────────────────── - if is_already_posted(video_id, state): - logging.info(f"⏭️ Skipping already-posted video: {video_id}") - return False + profile_url = f"https://www.tiktok.com/@{handle}" + logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...") - logging.info(f"🎬 Processing video {video_id}: {video_url}") + impersonate = get_best_impersonation_target() - cookies = load_cookies_from_file(TIKTOK_COOKIES_PATH) + ydl_opts = { + "extract_flat": True, + "quiet": True, + "no_warnings": True, + "playlistend": limit, + } + if cookies_path and os.path.exists(cookies_path): + ydl_opts["cookiefile"] = cookies_path + if impersonate: + ydl_opts["impersonate"] = impersonate - with tempfile.TemporaryDirectory() as tmpdir: - raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") - processed_path = os.path.join(tmpdir, f"{video_id}.mp4") + try: + logging.info(f"🌐 yt-dlp extracting: {profile_url}") + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(profile_url, download=False) - # ── Download ─────────────────────────────────────────────────── - logging.info(f"⬇️ Downloading: {video_url}") - if not download_video(video_url, raw_path, cookies=cookies): - logging.error(f"❌ Download failed for {video_id}. Skipping.") - return False - - # ── Compress / trim ──────────────────────────────────────────── - if not compress_video(raw_path, processed_path): - logging.error(f"❌ Compression failed for {video_id}. Skipping.") - return False - - # ── Size guard ───────────────────────────────────────────────── - final_size = os.path.getsize(processed_path) - if final_size > VIDEO_MAX_SIZE_BYTES: - logging.error( - f"❌ Compressed video still too large: " - f"{final_size / 1024 / 1024:.1f} MB > " - f"{VIDEO_MAX_SIZE_BYTES / 1024 / 1024:.0f} MB. Skipping." - ) - return False - - # ── Upload to Bluesky ────────────────────────────────────────── + entries = info.get("entries", []) if info else [] logging.info( - f"⬆️ Uploading to Bluesky " - f"({final_size / 1024 / 1024:.1f} MB)..." + f"✅ yt-dlp returned {len(entries)} entries " + f"(playlist: {info.get('title', '?') if info else '?'})" ) - with open(processed_path, "rb") as f: - video_data = f.read() - try: - blob = bsky_upload_blob_with_retry(client, video_data, "video/mp4") - except Exception as e: - logging.error(f"❌ Blob upload failed for {video_id}: {e}") - return False + videos = [] + for entry in entries: + if not entry: + continue + url = entry.get("url") or entry.get("webpage_url") or "" + vid_match = re.search(r"/video/(\d+)", url) + if not vid_match: + vid_id = entry.get("id", "") + if vid_id: + url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + vid_match = re.search(r"/video/(\d+)", url) + if vid_match: + videos.append({ + "video_id": vid_match.group(1), + "url": url, + "timestamp": entry.get("timestamp"), + }) - # ── Build post text ──────────────────────────────────────────── - post_text = desc.strip() if desc else "" - if len(post_text) > 280: - post_text = post_text[:277] + "..." - if not post_text: - post_text = f"🎬 {video_url}" + logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.") + return videos[:limit] - # ── Build video embed ────────────────────────────────────────── - try: - from atproto import models - video_embed = models.AppBskyEmbedVideo.Main( - video=blob, - alt=desc[:1000] if desc else "", + except Exception as e: + logging.error(f"❌ yt-dlp profile scrape failed: {type(e).__name__}: {e}") + return [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Caption builder +# ───────────────────────────────────────────────────────────────────────────── +def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> str: + """Build a Bluesky post caption from video metadata.""" + desc = (video_info.get("description") or "").strip() + url = video_info.get("url", "") + + if desc: + # Truncate description to leave room for the URL + url_len = len(url) + 1 # +1 for newline + max_desc = max_len - url_len + if len(desc) > max_desc: + desc = desc[: max_desc - 1] + "…" + return f"{desc}\n{url}" + + return url + + +# ───────────────────────────────────────────────────────────────────────────── +# Main processing loop +# ───────────────────────────────────────────────────────────────────────────── +def process_videos( + videos: list[dict], + state: dict, + client: Client, + tiktok_handle: str, + cookies_path: str, + langs: list[str], + max_age_days: int, + video_max_size_bytes: int, +) -> int: + """ + Download, compress, upload and post each new video. + Returns the count of successfully posted videos. + """ + posted_count = 0 + now = arrow.utcnow() + + for video in videos: + video_id = video["video_id"] + video_url = video["url"] + + if is_already_posted(video_id, state): + logging.info(f"⏭️ Already posted: {video_id}") + continue + + # Age filter (only if timestamp is available) + ts = video.get("timestamp") + if ts: + try: + video_time = arrow.get(ts) + age_days = (now - video_time).days + if age_days > max_age_days: + logging.info( + f"⏭️ Video {video_id} too old ({age_days}d > {max_age_days}d). Skipping." + ) + continue + except Exception: + pass + + logging.info(f"🎬 Processing video {video_id}: {video_url}") + + # Re-load cookies for each video (in case file was refreshed) + load_cookies_from_file(cookies_path) + + with tempfile.TemporaryDirectory() as tmpdir: + raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") + comp_path = os.path.join(tmpdir, f"{video_id}.mp4") + + # 1. Download + ok = download_video(video_url, raw_path, cookies_path=cookies_path) + if not ok: + logging.error(f"❌ Download failed for {video_id}. Skipping.") + continue + + # 2. Compress + ok = compress_video( + raw_path, + comp_path, + max_size_bytes=video_max_size_bytes, ) - except Exception as e: - logging.error(f"❌ Could not build video embed: {e}") - return False + if not ok: + logging.error(f"❌ Compression failed for {video_id}. Skipping.") + continue - # ── Create post ──────────────────────────────────────────────── - success = bsky_create_post_with_retry( - client, - text=post_text, - embed=video_embed, - langs=langs, - ) + # 3. Upload blob + blob = upload_video_to_bluesky(client, comp_path, video_id) + if blob is None: + logging.error(f"❌ Blob upload failed for {video_id}.") + continue - if success: - mark_as_posted(video_id, state, { - "tiktok_url": video_url, - "desc": desc[:200] if desc else "", - }) - logging.info(f"✅ Posted video {video_id} to Bluesky.") - return True + # 4. Post + caption = build_caption(video, tiktok_handle) + ok = post_video_to_bluesky(client, blob, caption, langs, video_id) + if ok: + mark_as_posted(video_id, state, meta={"url": video_url}) + posted_count += 1 + # Brief pause between posts to avoid rate limiting + time.sleep(random.uniform(2.0, 5.0)) - logging.error(f"❌ Failed to post video {video_id} to Bluesky.") - return False + return posted_count # ───────────────────────────────────────────────────────────────────────────── # Entry point # ───────────────────────────────────────────────────────────────────────────── -def main(): - global TIKTOK_COOKIES_PATH # must be first line in function - - load_dotenv() - +def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="TikTok → Bluesky cross-poster" + description="Cross-post TikTok videos to Bluesky." ) - parser.add_argument( - "--tiktok-handle", required=True, - help="TikTok handle to scrape (without @)", - ) - parser.add_argument( - "--bsky-handle", required=True, - help="Bluesky handle (e.g. user.bsky.social)", - ) - parser.add_argument( - "--bsky-app-password", required=True, - help="Bluesky app password (not account password)", - ) - parser.add_argument( - "--bsky-base-url", default=DEFAULT_BSKY_BASE_URL, - help=( - "Bluesky AT Protocol PDS base URL. " - "Always https://bsky.social even for custom-domain users " - "(e.g. eurosky.social handles still authenticate via bsky.social). " - f"Default: {DEFAULT_BSKY_BASE_URL}" - ), - ) - parser.add_argument( - "--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS, - help="Post language codes (default: es)", - ) - parser.add_argument( - "--cookies-path", default=TIKTOK_COOKIES_PATH, - help="Path to TikTok cookies JSON file", - ) - args = parser.parse_args() + parser.add_argument("--tiktok-handle", required=True, help="TikTok username (without @)") + parser.add_argument("--bsky-handle", required=True, help="Bluesky handle") + parser.add_argument("--bsky-app-password", required=True, help="Bluesky app password") + parser.add_argument("--bsky-base-url", default=DEFAULT_BSKY_BASE_URL, + help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})") + parser.add_argument("--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS, + help="BCP-47 language tags for posts (default: es)") + parser.add_argument("--cookies-path", default=TIKTOK_COOKIES_PATH, + help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})") + parser.add_argument("--max-age-days", type=int, default=VIDEO_MAX_AGE_DAYS, + help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})") + return parser.parse_args() - # Override global cookie path from CLI - TIKTOK_COOKIES_PATH = args.cookies_path + +def main(): + load_dotenv() + args = parse_args() + + # ── Fix 2: resolve video size limit based on PDS ────────────────────── + video_max_size_bytes = get_video_size_limit(args.bsky_base_url) logging.info("=" * 60) logging.info("🤖 TikTok→Bluesky bot started") @@ -1286,52 +979,66 @@ def main(): logging.info(f" Bluesky handle: {args.bsky_handle}") logging.info(f" Bluesky PDS : {args.bsky_base_url}") logging.info(f" Languages : {args.bsky_langs}") - logging.info( - f" Cookie file : {TIKTOK_COOKIES_PATH} " - f"({'✅ found' if os.path.exists(TIKTOK_COOKIES_PATH) else '❌ NOT FOUND'})" - ) + logging.info(f" Video size cap: {video_max_size_bytes // 1024 // 1024} MB") + cookie_status = "✅ found" if os.path.exists(args.cookies_path) else "❌ NOT FOUND" + logging.info(f" Cookie file : {args.cookies_path} ({cookie_status})") logging.info("=" * 60) - state = load_state() + state = load_state() - # Instantiate client — base URL is baked in via bsky_login() - client = Client() - - # ── Bluesky login ────────────────────────────────────────────────── - if not bsky_login( - client, + # Connect to Bluesky + client = connect_bluesky( args.bsky_handle, args.bsky_app_password, args.bsky_base_url, - ): - logging.error("❌ Cannot proceed without Bluesky login. Exiting.") - sys.exit(1) + ) - # ── Scrape TikTok ────────────────────────────────────────────────── + # Scrape TikTok profile logging.info(f"🔄 Scraping @{args.tiktok_handle}...") - tiktoks = scrape_tiktoks_via_playwright(args.tiktok_handle) + cookies = load_cookies_from_file(args.cookies_path) - if not tiktoks: - logging.warning("⚠️ No TikTok videos found. Skipping sync.") - logging.info("🤖 Bot finished.") - return + videos = scrape_tiktok_profile_playwright( + args.tiktok_handle, + cookies, + limit=SCRAPE_VIDEO_LIMIT, + ) - logging.info(f"📋 Found {len(tiktoks)} video(s). Processing new ones...") - - # ── Process each video ───────────────────────────────────────────── - posted = 0 - for tiktok in tiktoks: + if not videos: + logging.warning("⚠️ Playwright grid scraping failed. Trying API fallback...") + ts = int(time.time()) + # Try to save a screenshot if playwright left a page open try: - if process_tiktok(tiktok, client, args.bsky_langs, state): - posted += 1 - # Polite delay between posts - time.sleep(random.uniform(3.0, 7.0)) - except Exception as e: - logging.error( - f"❌ Unexpected error processing video " - f"{tiktok.get('id', '?')}: {e}" - ) - continue + import glob + for f in glob.glob("screenshot_no_grid_*.png"): + pass # already saved inside scrape function + except Exception: + pass + + # Save a "playwright failed" screenshot placeholder in logs + logging.info(f"📸 Screenshot saved: screenshot_playwright_failed_{ts}.png") + + videos = scrape_tiktok_profile_ytdlp( + args.tiktok_handle, + cookies_path=args.cookies_path, + limit=SCRAPE_VIDEO_LIMIT, + ) + + if not videos: + logging.error("❌ No videos found. Exiting.") + sys.exit(0) + + logging.info(f"📋 Found {len(videos)} video(s). Processing new ones...") + + posted = process_videos( + videos=videos, + state=state, + client=client, + tiktok_handle=args.tiktok_handle, + cookies_path=args.cookies_path, + langs=args.bsky_langs, + max_age_days=args.max_age_days, + video_max_size_bytes=video_max_size_bytes, + ) logging.info("=" * 60) logging.info(f"✅ Sync complete. Posted {posted} new video(s).") From 08b30d73a9cdf402d6e7410961f40802154bc2be Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 20:12:27 +0200 Subject: [PATCH 08/11] Fixed Comedy Gold TikTok --- jenkins/comedygoldbcnTiktok | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/jenkins/comedygoldbcnTiktok b/jenkins/comedygoldbcnTiktok index ca020d2..3933f64 100644 --- a/jenkins/comedygoldbcnTiktok +++ b/jenkins/comedygoldbcnTiktok @@ -66,12 +66,12 @@ pipeline { # ── playwright-stealth version check ─────────────── pip show playwright-stealth | grep -E "^(Name|Version)" python3 -c " - try: - from playwright_stealth import stealth_sync - print('playwright_stealth OK (v1.x - stealth_sync)') - except ImportError: - from playwright_stealth import Stealth - print('playwright_stealth OK (v2.x - Stealth class)') +try: + from playwright_stealth import stealth_sync + print('playwright_stealth OK (v1.x - stealth_sync)') +except ImportError: + from playwright_stealth import Stealth + print('playwright_stealth OK (v2.x - Stealth class)') " # ── Sanity checks ────────────────────────────────── @@ -126,6 +126,7 @@ pipeline { --tiktok-handle "$TIKTOK_COMEDYGOLDBCN_HANDLE" \ --bsky-handle "$BSKY_COMEDYGOLDBCN_HANDLE" \ --bsky-app-password "$BSKY_COMEDYGOLDBCN_APP_PASSWORD" \ + --bsky-base-url https://bsky.social \ --bsky-langs ca \ --cookies-path tiktok_cookies.json ''' From 47dea4b1109977e1e6a181f6fbb5e82ff7ae7492 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 20:25:14 +0200 Subject: [PATCH 09/11] TikTok2 --- tiktok2bsky.py | 250 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 81 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 3653541..f83dd44 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -34,13 +34,21 @@ from atproto import Client from dotenv import load_dotenv from playwright.sync_api import sync_playwright -# playwright-stealth 1.x uses stealth_sync, 2.x uses Stealth class + +# ───────────────────────────────────────────────────────────────────────────── +# playwright-stealth: support v1.x (stealth_sync) and v2.x (Stealth class) +# ───────────────────────────────────────────────────────────────────────────── +_STEALTH_V2 = None # None = not available at all + try: from playwright_stealth import stealth_sync _STEALTH_V2 = False except ImportError: - from playwright_stealth import Stealth - _STEALTH_V2 = True + try: + from playwright_stealth import Stealth + _STEALTH_V2 = True + except ImportError: + pass # stealth disabled — warning emitted at runtime # ───────────────────────────────────────────────────────────────────────────── @@ -146,7 +154,6 @@ def load_state() -> dict: def save_state(state: dict): - # Prune to last STATE_MAX_ENTRIES posted = state.get("posted", {}) if len(posted) > STATE_MAX_ENTRIES: sorted_keys = sorted( @@ -180,7 +187,7 @@ def mark_as_posted(video_id: str, state: dict, meta: dict = None): # Cookie helpers # ───────────────────────────────────────────────────────────────────────────── def load_cookies_from_file(path: str) -> list: - """Load cookies from a JSON file (format produced by generate_tiktok_cookies.py).""" + """Load cookies from a JSON file.""" if not os.path.exists(path): logging.warning(f"⚠️ Cookie file not found: {path}") return [] @@ -279,7 +286,6 @@ def is_transient_error(error_obj) -> bool: def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: """ Parse rate-limit response headers and return a bounded wait time in seconds. - Supports retry-after, x-ratelimit-after, and ratelimit-reset (unix timestamp). """ try: now_ts = int(time.time()) @@ -300,7 +306,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: except Exception: pass - # repr() fallback — parse headers embedded in the exception string text = repr(error_obj) for pattern, is_timestamp in [ (r"'retry-after':\s*'(\d+)'", False), @@ -318,6 +323,54 @@ def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: return default_delay +# ───────────────────────────────────────────────────────────────────────────── +# playwright-stealth application helper +# ───────────────────────────────────────────────────────────────────────────── +def apply_stealth(page): + """ + Apply playwright-stealth to a page object. + + Handles all known API variants: + v1.x → stealth_sync(page) + v2.x → Stealth().use_sync(page) returns a new wrapped page + v2.x → Stealth().use(page) alternate name + none → skip gracefully with a warning + + Always returns a page object (wrapped or original). + """ + if _STEALTH_V2 is None: + logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.") + return page + + try: + if _STEALTH_V2: + # v2.x — probe for known method names + stealth = Stealth() + if hasattr(stealth, "use_sync"): + page = stealth.use_sync(page) + logging.info("🥷 playwright-stealth v2.x applied (use_sync).") + elif hasattr(stealth, "use"): + page = stealth.use(page) + logging.info("🥷 playwright-stealth v2.x applied (use).") + else: + logging.warning( + "⚠️ playwright-stealth v2.x: no known apply method found " + "(tried use_sync, use). Skipping stealth." + ) + else: + # v1.x + stealth_sync(page) + logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") + + except Exception as e: + logging.warning( + f"⚠️ playwright-stealth could not be applied: " + f"{type(e).__name__}: {e}. Continuing without stealth." + ) + + return page + + # ───────────────────────────────────────────────────────────────────────────── # Bluesky client # ───────────────────────────────────────────────────────────────────────────── @@ -327,14 +380,17 @@ def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1): try: - logging.info(f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} for {handle}") + logging.info( + f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} for {handle}" + ) client.login(handle, app_password) client.me = client.get_profile(handle) logging.info(f"✅ Bluesky login successful as {handle}") return client except Exception as e: logging.warning( - f"⚠️ Bluesky login {type(e).__name__}: {e} (attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES})" + f"⚠️ Bluesky login {type(e).__name__}: {e} " + f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES})" ) if is_rate_limited_error(e): delay = get_rate_limit_wait_seconds(e, BSKY_LOGIN_BASE_DELAY) @@ -355,7 +411,9 @@ def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: logging.warning(f"⏳ Retrying login in {wait:.1f}s.") time.sleep(wait) else: - logging.error(f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts.") + logging.error( + f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts." + ) raise raise RuntimeError("Bluesky login failed: exhausted all retries.") @@ -388,12 +446,12 @@ def compress_video( input_path: str, output_path: str, max_duration: int = VIDEO_MAX_DURATION_S, - max_size_bytes: int = None, # resolved at call-time from get_video_size_limit() + max_size_bytes: int = None, ) -> bool: """ Re-encode input_path → output_path using libx264, targeting max_size_bytes. - Key fixes applied: + Fixes applied: • pad=ceil(iw/2)*2:ceil(ih/2)*2 — ensures even dimensions (libx264 requirement) • -maxrate == -b:v — hard ceiling, no burst above target • post-encode size guard — rejects file if still over limit @@ -471,12 +529,12 @@ def compress_video( return True except Exception as e: - logging.error(f"❌ compress_video error: {e}") + logging.error(f"❌ compress_video error: {type(e).__name__}: {e}") return False # ───────────────────────────────────────────────────────────────────────────── -# yt-dlp download +# yt-dlp helpers # ───────────────────────────────────────────────────────────────────────────── def get_best_impersonation_target() -> str | None: """ @@ -491,7 +549,6 @@ def get_best_impersonation_target() -> str | None: if target in available: logging.info(f"🎭 yt-dlp impersonation target: {target}") return target - # fallback: return first available if available: target = sorted(available)[0] logging.info(f"🎭 yt-dlp impersonation target (fallback): {target}") @@ -509,10 +566,10 @@ def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) - impersonate = get_best_impersonation_target() ydl_opts = { - "outtmpl": output_path, - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "quiet": False, - "no_warnings": False, + "outtmpl": output_path, + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "quiet": False, + "no_warnings": False, "merge_output_format": "mp4", } @@ -544,10 +601,7 @@ def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) - def download_video(url: str, output_path: str, cookies_path: str = None) -> bool: - """ - Download a TikTok video. Routes directly to yt-dlp with browser impersonation. - """ - cookies = load_cookies_from_file(cookies_path) if cookies_path else [] + """Download a TikTok video via yt-dlp with browser impersonation.""" logging.info(f"⬇️ Downloading: {url}") return download_video_ytdlp(url, output_path, cookies_path=cookies_path) @@ -563,8 +617,8 @@ def upload_video_to_bluesky( """ Upload a video file to Bluesky as a blob. - Fix 1 applied: exception is logged as type(e).__name__: e - so the actual error (413, 403, network error, etc.) is always visible. + Fix 1: exception is always logged as type(e).__name__: e + so the actual error (413, 403, network error, etc.) is visible in logs. """ size_mb = os.path.getsize(video_path) / 1024 / 1024 logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") @@ -581,12 +635,8 @@ def upload_video_to_bluesky( return blob.blob except Exception as e: - # ── Fix 1: always log the full exception type and message ────── + # Fix 1 — always log the full exception type and message err_detail = f"{type(e).__name__}: {e}" - logging.warning( - f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} " - f"failed: {err_detail}. Retrying in {delay:.1f}s..." - ) if attempt >= BSKY_UPLOAD_MAX_RETRIES: logging.error( @@ -595,6 +645,10 @@ def upload_video_to_bluesky( ) return None + logging.warning( + f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} " + f"failed: {err_detail}. Retrying in {delay:.1f}s..." + ) time.sleep(delay + random.uniform(0, BSKY_UPLOAD_JITTER_MAX)) delay = min(delay * 2, BSKY_UPLOAD_MAX_DELAY) @@ -618,7 +672,6 @@ def post_video_to_bluesky( video_embed = models.AppBskyEmbedVideo.Main( video=blob, ) - client.send_post( text=caption, embed=video_embed, @@ -690,14 +743,14 @@ def scrape_tiktok_profile_playwright( page = context.new_page() - if _STEALTH_V2: - Stealth().apply(page) - else: - stealth_sync(page) + # Apply stealth — gracefully handles all v1/v2/missing variants + page = apply_stealth(page) for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): try: - logging.info(f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})...") + logging.info( + f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) page.goto( profile_url, wait_until="domcontentloaded", @@ -720,7 +773,9 @@ def scrape_tiktok_profile_playwright( logging.warning(f"⚠️ Video grid not found on attempt {attempt}.") ts = int(time.time()) page.screenshot(path=f"screenshot_no_grid_{attempt}_{ts}.png") - logging.info(f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png") + logging.info( + f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png" + ) time.sleep(3) continue @@ -738,8 +793,8 @@ def scrape_tiktok_profile_playwright( else f"https://www.tiktok.com{link}" ) videos.append({ - "video_id": video_id, - "url": full_url, + "video_id": video_id, + "url": full_url, "timestamp": None, }) except Exception: @@ -750,7 +805,10 @@ def scrape_tiktok_profile_playwright( break except Exception as e: - logging.warning(f"⚠️ Playwright attempt {attempt} error: {type(e).__name__}: {e}") + logging.warning( + f"⚠️ Playwright attempt {attempt} error: " + f"{type(e).__name__}: {e}" + ) ts = int(time.time()) try: page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png") @@ -759,17 +817,31 @@ def scrape_tiktok_profile_playwright( time.sleep(3) if not videos: - logging.warning("⚠️ Video grid not found on attempt 3.") + logging.warning( + f"⚠️ Video grid not found on attempt {PLAYWRIGHT_MAX_RELOADS}." + ) ts = int(time.time()) try: - page.screenshot(path=f"screenshot_no_grid_3_{ts}.png") - logging.info(f"📸 Screenshot saved: screenshot_no_grid_3_{ts}.png") + page.screenshot(path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png") + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) except Exception: pass - page.close() - context.close() - browser.close() + try: + page.close() + except Exception: + pass + try: + context.close() + except Exception: + pass + try: + browser.close() + except Exception: + pass return videos @@ -794,10 +866,10 @@ def scrape_tiktok_profile_ytdlp( impersonate = get_best_impersonation_target() ydl_opts = { - "extract_flat": True, - "quiet": True, - "no_warnings": True, - "playlistend": limit, + "extract_flat": True, + "quiet": True, + "no_warnings": True, + "playlistend": limit, } if cookies_path and os.path.exists(cookies_path): ydl_opts["cookiefile"] = cookies_path @@ -837,7 +909,9 @@ def scrape_tiktok_profile_ytdlp( return videos[:limit] except Exception as e: - logging.error(f"❌ yt-dlp profile scrape failed: {type(e).__name__}: {e}") + logging.error( + f"❌ yt-dlp profile scrape failed: {type(e).__name__}: {e}" + ) return [] @@ -850,9 +924,8 @@ def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> s url = video_info.get("url", "") if desc: - # Truncate description to leave room for the URL - url_len = len(url) + 1 # +1 for newline - max_desc = max_len - url_len + url_len = len(url) + 1 # +1 for newline + max_desc = max_len - url_len if len(desc) > max_desc: desc = desc[: max_desc - 1] + "…" return f"{desc}\n{url}" @@ -888,7 +961,7 @@ def process_videos( logging.info(f"⏭️ Already posted: {video_id}") continue - # Age filter (only if timestamp is available) + # Age filter (only when timestamp is available) ts = video.get("timestamp") if ts: try: @@ -896,7 +969,8 @@ def process_videos( age_days = (now - video_time).days if age_days > max_age_days: logging.info( - f"⏭️ Video {video_id} too old ({age_days}d > {max_age_days}d). Skipping." + f"⏭️ Video {video_id} too old " + f"({age_days}d > {max_age_days}d). Skipping." ) continue except Exception: @@ -904,9 +978,6 @@ def process_videos( logging.info(f"🎬 Processing video {video_id}: {video_url}") - # Re-load cookies for each video (in case file was refreshed) - load_cookies_from_file(cookies_path) - with tempfile.TemporaryDirectory() as tmpdir: raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") comp_path = os.path.join(tmpdir, f"{video_id}.mp4") @@ -952,17 +1023,43 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Cross-post TikTok videos to Bluesky." ) - parser.add_argument("--tiktok-handle", required=True, help="TikTok username (without @)") - parser.add_argument("--bsky-handle", required=True, help="Bluesky handle") - parser.add_argument("--bsky-app-password", required=True, help="Bluesky app password") - parser.add_argument("--bsky-base-url", default=DEFAULT_BSKY_BASE_URL, - help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})") - parser.add_argument("--bsky-langs", nargs="+", default=DEFAULT_BSKY_LANGS, - help="BCP-47 language tags for posts (default: es)") - parser.add_argument("--cookies-path", default=TIKTOK_COOKIES_PATH, - help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})") - parser.add_argument("--max-age-days", type=int, default=VIDEO_MAX_AGE_DAYS, - help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})") + parser.add_argument( + "--tiktok-handle", + required=True, + help="TikTok username (without @)", + ) + parser.add_argument( + "--bsky-handle", + required=True, + help="Bluesky handle", + ) + parser.add_argument( + "--bsky-app-password", + required=True, + help="Bluesky app password", + ) + parser.add_argument( + "--bsky-base-url", + default=DEFAULT_BSKY_BASE_URL, + help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})", + ) + parser.add_argument( + "--bsky-langs", + nargs="+", + default=DEFAULT_BSKY_LANGS, + help="BCP-47 language tags for posts (default: es)", + ) + parser.add_argument( + "--cookies-path", + default=TIKTOK_COOKIES_PATH, + help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})", + ) + parser.add_argument( + "--max-age-days", + type=int, + default=VIDEO_MAX_AGE_DAYS, + help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})", + ) return parser.parse_args() @@ -970,7 +1067,7 @@ def main(): load_dotenv() args = parse_args() - # ── Fix 2: resolve video size limit based on PDS ────────────────────── + # Fix 2 — resolve video size limit based on PDS video_max_size_bytes = get_video_size_limit(args.bsky_base_url) logging.info("=" * 60) @@ -1004,17 +1101,8 @@ def main(): ) if not videos: - logging.warning("⚠️ Playwright grid scraping failed. Trying API fallback...") + logging.warning("⚠️ Playwright grid scraping failed. Trying yt-dlp fallback...") ts = int(time.time()) - # Try to save a screenshot if playwright left a page open - try: - import glob - for f in glob.glob("screenshot_no_grid_*.png"): - pass # already saved inside scrape function - except Exception: - pass - - # Save a "playwright failed" screenshot placeholder in logs logging.info(f"📸 Screenshot saved: screenshot_playwright_failed_{ts}.png") videos = scrape_tiktok_profile_ytdlp( From c613ab36035617bc126089a718963c060b07f88c Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 20:46:38 +0200 Subject: [PATCH 10/11] Netscape cookie --- tiktok2bsky.py | 541 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 351 insertions(+), 190 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index f83dd44..4ee5cff 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -36,7 +36,7 @@ from playwright.sync_api import sync_playwright # ───────────────────────────────────────────────────────────────────────────── -# playwright-stealth: support v1.x (stealth_sync) and v2.x (Stealth class) +# playwright-stealth: detect installed version # ───────────────────────────────────────────────────────────────────────────── _STEALTH_V2 = None # None = not available at all @@ -96,9 +96,9 @@ PLAYWRIGHT_SLOW_MO = 50 PLAYWRIGHT_MAX_RELOADS = 3 # TikTok selectors -TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' -TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' -TIKTOK_BANNER_SELS = [ +TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' +TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' +TIKTOK_BANNER_SELS = [ '[id*="banner"]', '[class*="banner"]', '[data-e2e="recommend-modal-close"]', @@ -227,6 +227,63 @@ def inject_cookies_into_context(context, cookies: list): logging.warning(f"⚠️ Could not inject cookies: {e}") +def convert_json_cookies_to_netscape(json_path: str) -> str | None: + """ + Convert a JSON cookie file (browser extension format) to a Netscape + cookie file that yt-dlp can consume. + + Returns the path to a temporary Netscape file, or None on failure. + The caller is responsible for deleting the file when done. + + Netscape format columns (tab-separated): + domain include_subdomains path secure expiry name value + """ + try: + with open(json_path, "r", encoding="utf-8") as f: + cookies = json.load(f) + + tmp = tempfile.NamedTemporaryFile( + mode="w", + suffix=".txt", + delete=False, + encoding="utf-8", + ) + + tmp.write("# Netscape HTTP Cookie File\n") + tmp.write("# Generated by tiktok2bsky.py\n\n") + + for c in cookies: + domain = c.get("domain", ".tiktok.com") + # Netscape format requires domain to start with a dot for + # include_subdomains=TRUE to work correctly + include_sub = "TRUE" if domain.startswith(".") else "FALSE" + path = c.get("path", "/") + secure = "TRUE" if c.get("secure", False) else "FALSE" + expiry = int( + c.get("expirationDate") or c.get("expires") or 0 + ) + name = c.get("name", "") + value = c.get("value", "") + + tmp.write( + f"{domain}\t{include_sub}\t{path}\t" + f"{secure}\t{expiry}\t{name}\t{value}\n" + ) + + tmp.close() + logging.info( + f"🍪 Converted {len(cookies)} cookies to Netscape format: {tmp.name}" + ) + return tmp.name + + except Exception as e: + logging.warning( + f"⚠️ Could not convert cookies to Netscape format: " + f"{type(e).__name__}: {e}" + ) + return None + + # ───────────────────────────────────────────────────────────────────────────── # Bluesky error classification helpers # ───────────────────────────────────────────────────────────────────────────── @@ -323,54 +380,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: return default_delay -# ───────────────────────────────────────────────────────────────────────────── -# playwright-stealth application helper -# ───────────────────────────────────────────────────────────────────────────── -def apply_stealth(page): - """ - Apply playwright-stealth to a page object. - - Handles all known API variants: - v1.x → stealth_sync(page) - v2.x → Stealth().use_sync(page) returns a new wrapped page - v2.x → Stealth().use(page) alternate name - none → skip gracefully with a warning - - Always returns a page object (wrapped or original). - """ - if _STEALTH_V2 is None: - logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.") - return page - - try: - if _STEALTH_V2: - # v2.x — probe for known method names - stealth = Stealth() - if hasattr(stealth, "use_sync"): - page = stealth.use_sync(page) - logging.info("🥷 playwright-stealth v2.x applied (use_sync).") - elif hasattr(stealth, "use"): - page = stealth.use(page) - logging.info("🥷 playwright-stealth v2.x applied (use).") - else: - logging.warning( - "⚠️ playwright-stealth v2.x: no known apply method found " - "(tried use_sync, use). Skipping stealth." - ) - else: - # v1.x - stealth_sync(page) - logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") - - except Exception as e: - logging.warning( - f"⚠️ playwright-stealth could not be applied: " - f"{type(e).__name__}: {e}. Continuing without stealth." - ) - - return page - - # ───────────────────────────────────────────────────────────────────────────── # Bluesky client # ───────────────────────────────────────────────────────────────────────────── @@ -393,21 +402,18 @@ def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES})" ) if is_rate_limited_error(e): - delay = get_rate_limit_wait_seconds(e, BSKY_LOGIN_BASE_DELAY) + delay = get_rate_limit_wait_seconds(e, BSKY_LOGIN_BASE_DELAY) jitter = random.uniform(0, BSKY_LOGIN_JITTER_MAX) - wait = delay + jitter + wait = delay + jitter logging.warning( f"⏳ Bluesky login rate-limited (attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}). " f"Retrying in {wait:.1f}s." ) time.sleep(wait) elif attempt < BSKY_LOGIN_MAX_RETRIES: - delay = min( - BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), - BSKY_LOGIN_MAX_DELAY, - ) + delay = min(BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), BSKY_LOGIN_MAX_DELAY) jitter = random.uniform(0, BSKY_LOGIN_JITTER_MAX) - wait = delay + jitter + wait = delay + jitter logging.warning(f"⏳ Retrying login in {wait:.1f}s.") time.sleep(wait) else: @@ -558,9 +564,14 @@ def get_best_impersonation_target() -> str | None: return None -def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) -> bool: +def download_video_ytdlp( + url: str, + output_path: str, + netscape_cookies_path: str = None, +) -> bool: """ Download a TikTok video using yt-dlp with browser impersonation. + Accepts a Netscape-format cookie file path (not JSON). Returns True on success, False on failure. """ impersonate = get_best_impersonation_target() @@ -573,8 +584,8 @@ def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) - "merge_output_format": "mp4", } - if cookies_path and os.path.exists(cookies_path): - ydl_opts["cookiefile"] = cookies_path + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + ydl_opts["cookiefile"] = netscape_cookies_path if impersonate: ydl_opts["impersonate"] = impersonate @@ -596,14 +607,20 @@ def download_video_ytdlp(url: str, output_path: str, cookies_path: str = None) - return False except Exception as e: - logging.error(f"❌ yt-dlp download failed for {url}: {type(e).__name__}: {e}") + logging.error( + f"❌ yt-dlp download failed for {url}: {type(e).__name__}: {e}" + ) return False -def download_video(url: str, output_path: str, cookies_path: str = None) -> bool: +def download_video( + url: str, + output_path: str, + netscape_cookies_path: str = None, +) -> bool: """Download a TikTok video via yt-dlp with browser impersonation.""" logging.info(f"⬇️ Downloading: {url}") - return download_video_ytdlp(url, output_path, cookies_path=cookies_path) + return download_video_ytdlp(url, output_path, netscape_cookies_path=netscape_cookies_path) # ───────────────────────────────────────────────────────────────────────────── @@ -616,9 +633,7 @@ def upload_video_to_bluesky( ) -> object | None: """ Upload a video file to Bluesky as a blob. - - Fix 1: exception is always logged as type(e).__name__: e - so the actual error (413, 403, network error, etc.) is visible in logs. + Exception is always logged as type(e).__name__: e for full visibility. """ size_mb = os.path.getsize(video_path) / 1024 / 1024 logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") @@ -635,7 +650,6 @@ def upload_video_to_bluesky( return blob.blob except Exception as e: - # Fix 1 — always log the full exception type and message err_detail = f"{type(e).__name__}: {e}" if attempt >= BSKY_UPLOAD_MAX_RETRIES: @@ -669,9 +683,7 @@ def post_video_to_bluesky( from atproto import models try: - video_embed = models.AppBskyEmbedVideo.Main( - video=blob, - ) + video_embed = models.AppBskyEmbedVideo.Main(video=blob) client.send_post( text=caption, embed=video_embed, @@ -713,6 +725,9 @@ def scrape_tiktok_profile_playwright( """ Scrape the most recent video URLs from a TikTok profile page using Playwright. Returns a list of dicts with keys: video_id, url, timestamp. + + Stealth fix: playwright-stealth v2.x must wrap the page via a context manager + on new_page(), not via .apply() or .use_sync() after the fact. """ profile_url = f"https://www.tiktok.com/@{handle}" logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") @@ -741,80 +756,202 @@ def scrape_tiktok_profile_playwright( inject_cookies_into_context(context, cookies) - page = context.new_page() + # ── Stealth application ─────────────────────────────────────────── + # v1.x: stealth_sync(page) — called after new_page() + # v2.x: context manager on new_page — page must be created inside + # the Stealth() context, NOT wrapped after the fact. + # Stealth().use_sync(page) returns a SyncWrappingContextManager, + # not a Page — calling .goto() on it crashes. + # ───────────────────────────────────────────────────────────────── + page = None - # Apply stealth — gracefully handles all v1/v2/missing variants - page = apply_stealth(page) + if _STEALTH_V2 is None: + logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.") + page = context.new_page() - for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): + elif _STEALTH_V2: + # v2.x — use as context manager so the page is created inside it try: - logging.info( - f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." - ) - page.goto( - profile_url, - wait_until="domcontentloaded", - timeout=PLAYWRIGHT_TIMEOUT_MS, - ) - time.sleep(3) - dismiss_overlays(page) + stealth_instance = Stealth() + with stealth_instance(context) as stealthy_context: + page = stealthy_context.new_page() + logging.info("🥷 playwright-stealth v2.x applied (context manager).") + # Run the scraping loop inside the context manager scope + for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): + try: + logging.info( + f"🌐 Loading profile " + f"(attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) + page.goto( + profile_url, + wait_until="domcontentloaded", + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + time.sleep(3) + dismiss_overlays(page) - # Wait for video grid - try: - page.wait_for_selector( - TIKTOK_VIDEO_GRID_SEL, - timeout=PLAYWRIGHT_TIMEOUT_MS, - ) - except Exception: - pass - - grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first - if not grid.is_visible(timeout=5000): - logging.warning(f"⚠️ Video grid not found on attempt {attempt}.") - ts = int(time.time()) - page.screenshot(path=f"screenshot_no_grid_{attempt}_{ts}.png") - logging.info( - f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png" - ) - time.sleep(3) - continue - - # Extract video links - items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() - for item in items[:limit]: - try: - link = item.locator("a").first.get_attribute("href") - if link and "/video/" in link: - vid_match = re.search(r"/video/(\d+)", link) - if vid_match: - video_id = vid_match.group(1) - full_url = ( - link if link.startswith("http") - else f"https://www.tiktok.com{link}" + try: + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, ) - videos.append({ - "video_id": video_id, - "url": full_url, - "timestamp": None, - }) - except Exception: - pass + except Exception: + pass - if videos: - logging.info(f"✅ Playwright scraped {len(videos)} videos.") - break + grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first + if not grid.is_visible(timeout=5000): + logging.warning( + f"⚠️ Video grid not found on attempt {attempt}." + ) + ts = int(time.time()) + page.screenshot( + path=f"screenshot_no_grid_{attempt}_{ts}.png" + ) + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{attempt}_{ts}.png" + ) + time.sleep(3) + continue + + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + for item in items[:limit]: + try: + link = item.locator("a").first.get_attribute("href") + if link and "/video/" in link: + vid_match = re.search(r"/video/(\d+)", link) + if vid_match: + video_id = vid_match.group(1) + full_url = ( + link if link.startswith("http") + else f"https://www.tiktok.com{link}" + ) + videos.append({ + "video_id": video_id, + "url": full_url, + "timestamp": None, + }) + except Exception: + pass + + if videos: + logging.info( + f"✅ Playwright scraped {len(videos)} videos." + ) + break + + except Exception as e: + logging.warning( + f"⚠️ Playwright attempt {attempt} error: " + f"{type(e).__name__}: {e}" + ) + ts = int(time.time()) + try: + page.screenshot( + path=f"screenshot_error_{attempt}_{ts}.png" + ) + except Exception: + pass + time.sleep(3) except Exception as e: logging.warning( - f"⚠️ Playwright attempt {attempt} error: " - f"{type(e).__name__}: {e}" + f"⚠️ playwright-stealth v2.x context manager failed: " + f"{type(e).__name__}: {e}. Falling back to no-stealth page." ) - ts = int(time.time()) + page = context.new_page() + + else: + # v1.x — create page then apply stealth + page = context.new_page() + try: + stealth_sync(page) + logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") + except Exception as e: + logging.warning( + f"⚠️ playwright-stealth v1.x failed: " + f"{type(e).__name__}: {e}. Continuing without stealth." + ) + + # ── Scraping loop for v1.x and no-stealth paths ─────────────────── + # (v2.x runs its loop inside the context manager above) + if page is not None and not videos and _STEALTH_V2 is not True: + for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): try: - page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png") - except Exception: - pass - time.sleep(3) + logging.info( + f"🌐 Loading profile " + f"(attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) + page.goto( + profile_url, + wait_until="domcontentloaded", + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + time.sleep(3) + dismiss_overlays(page) + + try: + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + except Exception: + pass + + grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first + if not grid.is_visible(timeout=5000): + logging.warning( + f"⚠️ Video grid not found on attempt {attempt}." + ) + ts = int(time.time()) + page.screenshot( + path=f"screenshot_no_grid_{attempt}_{ts}.png" + ) + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{attempt}_{ts}.png" + ) + time.sleep(3) + continue + + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + for item in items[:limit]: + try: + link = item.locator("a").first.get_attribute("href") + if link and "/video/" in link: + vid_match = re.search(r"/video/(\d+)", link) + if vid_match: + video_id = vid_match.group(1) + full_url = ( + link if link.startswith("http") + else f"https://www.tiktok.com{link}" + ) + videos.append({ + "video_id": video_id, + "url": full_url, + "timestamp": None, + }) + except Exception: + pass + + if videos: + logging.info(f"✅ Playwright scraped {len(videos)} videos.") + break + + except Exception as e: + logging.warning( + f"⚠️ Playwright attempt {attempt} error: " + f"{type(e).__name__}: {e}" + ) + ts = int(time.time()) + try: + page.screenshot( + path=f"screenshot_error_{attempt}_{ts}.png" + ) + except Exception: + pass + time.sleep(3) if not videos: logging.warning( @@ -822,26 +959,24 @@ def scrape_tiktok_profile_playwright( ) ts = int(time.time()) try: - page.screenshot(path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png") - logging.info( - f"📸 Screenshot saved: " - f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" - ) + if page: + page.screenshot( + path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) except Exception: pass - try: - page.close() - except Exception: - pass - try: - context.close() - except Exception: - pass - try: - browser.close() - except Exception: - pass + # ── Cleanup ─────────────────────────────────────────────────────── + for obj in (page, context, browser): + try: + if obj: + obj.close() + except Exception: + pass return videos @@ -851,11 +986,12 @@ def scrape_tiktok_profile_playwright( # ───────────────────────────────────────────────────────────────────────────── def scrape_tiktok_profile_ytdlp( handle: str, - cookies_path: str = None, + netscape_cookies_path: str = None, limit: int = SCRAPE_VIDEO_LIMIT, ) -> list[dict]: """ Fallback: use yt-dlp to extract the video list from a TikTok profile. + Accepts a Netscape-format cookie file path (not JSON). Returns a list of dicts with keys: video_id, url, timestamp. """ import yt_dlp @@ -871,8 +1007,8 @@ def scrape_tiktok_profile_ytdlp( "no_warnings": True, "playlistend": limit, } - if cookies_path and os.path.exists(cookies_path): - ydl_opts["cookiefile"] = cookies_path + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + ydl_opts["cookiefile"] = netscape_cookies_path if impersonate: ydl_opts["impersonate"] = impersonate @@ -941,7 +1077,7 @@ def process_videos( state: dict, client: Client, tiktok_handle: str, - cookies_path: str, + netscape_cookies_path: str, langs: list[str], max_age_days: int, video_max_size_bytes: int, @@ -983,7 +1119,11 @@ def process_videos( comp_path = os.path.join(tmpdir, f"{video_id}.mp4") # 1. Download - ok = download_video(video_url, raw_path, cookies_path=cookies_path) + ok = download_video( + video_url, + raw_path, + netscape_cookies_path=netscape_cookies_path, + ) if not ok: logging.error(f"❌ Download failed for {video_id}. Skipping.") continue @@ -1090,48 +1230,69 @@ def main(): args.bsky_base_url, ) - # Scrape TikTok profile - logging.info(f"🔄 Scraping @{args.tiktok_handle}...") - cookies = load_cookies_from_file(args.cookies_path) + # Convert JSON cookies → Netscape format for yt-dlp + # Playwright uses the JSON cookies directly via inject_cookies_into_context() + # yt-dlp requires Netscape .txt format — convert once and reuse + netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path) + if netscape_cookies_path: + logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}") + else: + logging.warning("⚠️ Could not create Netscape cookie file. yt-dlp will run without cookies.") - videos = scrape_tiktok_profile_playwright( - args.tiktok_handle, - cookies, - limit=SCRAPE_VIDEO_LIMIT, - ) + try: + # Scrape TikTok profile + logging.info(f"🔄 Scraping @{args.tiktok_handle}...") + cookies = load_cookies_from_file(args.cookies_path) - if not videos: - logging.warning("⚠️ Playwright grid scraping failed. Trying yt-dlp fallback...") - ts = int(time.time()) - logging.info(f"📸 Screenshot saved: screenshot_playwright_failed_{ts}.png") - - videos = scrape_tiktok_profile_ytdlp( + videos = scrape_tiktok_profile_playwright( args.tiktok_handle, - cookies_path=args.cookies_path, + cookies, limit=SCRAPE_VIDEO_LIMIT, ) - if not videos: - logging.error("❌ No videos found. Exiting.") - sys.exit(0) + if not videos: + logging.warning( + "⚠️ Playwright grid scraping failed. Trying yt-dlp fallback..." + ) + ts = int(time.time()) + logging.info(f"📸 Screenshot saved: screenshot_playwright_failed_{ts}.png") - logging.info(f"📋 Found {len(videos)} video(s). Processing new ones...") + videos = scrape_tiktok_profile_ytdlp( + args.tiktok_handle, + netscape_cookies_path=netscape_cookies_path, + limit=SCRAPE_VIDEO_LIMIT, + ) - posted = process_videos( - videos=videos, - state=state, - client=client, - tiktok_handle=args.tiktok_handle, - cookies_path=args.cookies_path, - langs=args.bsky_langs, - max_age_days=args.max_age_days, - video_max_size_bytes=video_max_size_bytes, - ) + if not videos: + logging.error("❌ No videos found. Exiting.") + sys.exit(0) - logging.info("=" * 60) - logging.info(f"✅ Sync complete. Posted {posted} new video(s).") - logging.info("🤖 Bot finished.") - logging.info("=" * 60) + logging.info(f"📋 Found {len(videos)} video(s). Processing new ones...") + + posted = process_videos( + videos=videos, + state=state, + client=client, + tiktok_handle=args.tiktok_handle, + netscape_cookies_path=netscape_cookies_path, + langs=args.bsky_langs, + max_age_days=args.max_age_days, + video_max_size_bytes=video_max_size_bytes, + ) + + logging.info("=" * 60) + logging.info(f"✅ Sync complete. Posted {posted} new video(s).") + logging.info("🤖 Bot finished.") + logging.info("=" * 60) + + finally: + # Always clean up the temporary Netscape cookie file + if netscape_cookies_path and os.path.exists(netscape_cookies_path): + try: + os.remove(netscape_cookies_path) + logging.info(f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}") + except Exception as e: + logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}") if __name__ == "__main__": From 6d4cfbd4b5b7c889adbf60cfb97d825edb2266e2 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Wed, 20 May 2026 07:16:07 +0200 Subject: [PATCH 11/11] Changes --- tiktok2bsky.py | 470 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 422 insertions(+), 48 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index 4ee5cff..2a9a43d 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -78,24 +78,30 @@ VIDEO_MAX_AGE_DAYS = 3 VIDEO_MAX_DURATION_S = 179 # Bluesky hard limit is 180s -# Bluesky login retry config -BSKY_LOGIN_MAX_RETRIES = 4 -BSKY_LOGIN_BASE_DELAY = 15.0 -BSKY_LOGIN_MAX_DELAY = 120.0 -BSKY_LOGIN_JITTER_MAX = 10.0 +# ── Bluesky login retry config (ported from twitter2bsky.py) ───────────────── +BSKY_LOGIN_MAX_RETRIES = 6 +BSKY_LOGIN_BASE_DELAY = 15.0 +BSKY_LOGIN_MAX_DELAY = 600.0 +BSKY_LOGIN_JITTER_MAX = 5.0 +BSKY_LOGIN_RATE_LIMIT_DELAY = 90.0 # minimum wait on 429 +BSKY_LOGIN_RATE_LIMIT_MAX_DELAY = 600.0 # maximum wait on 429 -# Bluesky upload retry config +# ── Bluesky upload retry config ─────────────────────────────────────────────── BSKY_UPLOAD_MAX_RETRIES = 5 BSKY_UPLOAD_BASE_DELAY = 10.0 BSKY_UPLOAD_MAX_DELAY = 120.0 BSKY_UPLOAD_JITTER_MAX = 5.0 -# Playwright scraping config +# ── Playwright scraping config ──────────────────────────────────────────────── PLAYWRIGHT_TIMEOUT_MS = 30_000 PLAYWRIGHT_SLOW_MO = 50 PLAYWRIGHT_MAX_RELOADS = 3 +<<<<<<< HEAD # TikTok selectors +======= +# ── TikTok selectors ────────────────────────────────────────────────────────── +>>>>>>> 7cddbd0 (Fixes for today) TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]' TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]' TIKTOK_BANNER_SELS = [ @@ -254,6 +260,7 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None: for c in cookies: domain = c.get("domain", ".tiktok.com") +<<<<<<< HEAD # Netscape format requires domain to start with a dot for # include_subdomains=TRUE to work correctly include_sub = "TRUE" if domain.startswith(".") else "FALSE" @@ -262,6 +269,12 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None: expiry = int( c.get("expirationDate") or c.get("expires") or 0 ) +======= + include_sub = "TRUE" if domain.startswith(".") else "FALSE" + path = c.get("path", "/") + secure = "TRUE" if c.get("secure", False) else "FALSE" + expiry = int(c.get("expirationDate") or c.get("expires") or 0) +>>>>>>> 7cddbd0 (Fixes for today) name = c.get("name", "") value = c.get("value", "") @@ -285,110 +298,168 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None: # ───────────────────────────────────────────────────────────────────────────── +<<<<<<< HEAD # Bluesky error classification helpers +======= +# Bluesky error classification (ported from twitter2bsky.py) +>>>>>>> 7cddbd0 (Fixes for today) # ───────────────────────────────────────────────────────────────────────────── +def _bsky_error_text(error_obj) -> str: + """Normalised lowercase repr for pattern matching.""" + return repr(error_obj).lower() + + def is_rate_limited_error(error_obj) -> bool: - text = repr(error_obj).lower() + text = _bsky_error_text(error_obj) return ( - "429" in text + "429" in text or "ratelimitexceeded" in text or "too many requests" in text - or "rate limit" in text + or "rate limit" in text + or "ratelimit" in text ) def is_auth_error(error_obj) -> bool: - text = repr(error_obj).lower() + text = _bsky_error_text(error_obj) return ( - "401" in text - or "403" in text + "401" in text + or "403" in text + or "invalid identifier" in text + or "invalid password" in text + or "authenticationrequired" in text + or "invalidtoken" in text + or "expiredtoken" in text + or "accounttakedown" in text or "invalid identifier or password" in text - or "authenticationrequired" in text - or "invalidtoken" in text ) def is_network_error(error_obj) -> bool: text = repr(error_obj) signals = [ - "ConnectError", - "RemoteProtocolError", - "ReadTimeout", - "WriteTimeout", - "TimeoutException", - "503", - "502", - "504", - "ConnectionResetError", + "ConnectError", "RemoteProtocolError", "ReadTimeout", + "WriteTimeout", "TimeoutException", "ConnectionResetError", + "503", "502", "504", ] - return any(sig in text for sig in signals) + return any(s in text for s in signals) def is_transient_error(error_obj) -> bool: text = repr(error_obj) signals = [ - "InvokeTimeoutError", - "ReadTimeout", - "WriteTimeout", - "TimeoutException", - "RemoteProtocolError", - "ConnectError", - "503", - "502", - "504", + "InvokeTimeoutError", "ReadTimeout", "WriteTimeout", + "TimeoutException", "RemoteProtocolError", "ConnectError", + "503", "502", "504", ] - return any(sig in text for sig in signals) + return any(s in text for s in signals) def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float: """ +<<<<<<< HEAD Parse rate-limit response headers and return a bounded wait time in seconds. +======= + Extract the server-requested wait time from rate-limit error headers. + + Checks (in order): + 1. error_obj.headers dict — Retry-After, X-RateLimit-After, RateLimit-Reset + 2. repr(error_obj) text — same keys embedded as strings + 3. Falls back to default_delay + + Ported from twitter2bsky.py. +>>>>>>> 7cddbd0 (Fixes for today) """ + now_ts = int(time.time()) + + # ── 1. Live headers object ──────────────────────────────────────────── try: - now_ts = int(time.time()) headers = getattr(error_obj, "headers", None) or {} for key in ("retry-after", "Retry-After"): - if headers.get(key): - return min(max(int(headers[key]), 1), BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + return min(max(int(val), 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) for key in ("x-ratelimit-after", "X-RateLimit-After"): - if headers.get(key): - return min(max(int(headers[key]), 1), BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + return min(max(int(val), 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) for key in ("ratelimit-reset", "RateLimit-Reset"): - if headers.get(key): - wait = max(int(headers[key]) - now_ts + 1, default_delay) - return min(wait, BSKY_LOGIN_MAX_DELAY) + val = headers.get(key) + if val: + wait = max(int(val) - now_ts + 2, default_delay) + return min(wait, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + except Exception: pass +<<<<<<< HEAD +======= + # ── 2. repr() string fallback ───────────────────────────────────────── +>>>>>>> 7cddbd0 (Fixes for today) text = repr(error_obj) - for pattern, is_timestamp in [ - (r"'retry-after':\s*'(\d+)'", False), - (r"'x-ratelimit-after':\s*'(\d+)'", False), - (r"'ratelimit-reset':\s*'(\d+)'", True), + for pattern, is_ts in [ + (r"['\"]retry-after['\"]\s*:\s*['\"](\d+)['\"]", False), + (r"['\"]x-ratelimit-after['\"]\s*:\s*['\"](\d+)['\"]", False), + (r"['\"]ratelimit-reset['\"]\s*:\s*['\"](\d+)['\"]", True), + (r"retry.?after[=:\s]+(\d+)", False), ]: m = re.search(pattern, text, re.IGNORECASE) if m: val = int(m.group(1)) +<<<<<<< HEAD if is_timestamp: wait = max(val - int(time.time()) + 1, default_delay) return min(wait, BSKY_LOGIN_MAX_DELAY) return min(max(val, 1), BSKY_LOGIN_MAX_DELAY) +======= + if is_ts: + wait = max(val - now_ts + 2, default_delay) + return min(wait, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + return min(max(val, 1), BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) +>>>>>>> 7cddbd0 (Fixes for today) return default_delay # ───────────────────────────────────────────────────────────────────────────── +<<<<<<< HEAD # Bluesky client # ───────────────────────────────────────────────────────────────────────────── def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: logging.info(f"🔐 Connecting Bluesky client via base URL: {base_url}") client = Client(base_url=base_url) +======= +# Bluesky client — improved login (ported from twitter2bsky.py) +# ───────────────────────────────────────────────────────────────────────────── +def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: + """ + Authenticate with Bluesky with full retry logic ported from twitter2bsky.py: + + • 429 / rate-limit → honour Retry-After header; wait up to 600s + • auth errors → fail immediately (retrying won't help) + • network/transient → exponential backoff with jitter + • other errors → exponential backoff with jitter + • exhausted retries → raise so Jenkins marks the build FAILURE + """ + logging.info(f"🔐 Connecting Bluesky client → {base_url}") + client = Client(base_url=base_url) + + attempt = 0 + last_error = None + + while attempt < BSKY_LOGIN_MAX_RETRIES: + attempt += 1 + logging.info( + f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} " + f"for {handle}" + ) +>>>>>>> 7cddbd0 (Fixes for today) - for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1): try: +<<<<<<< HEAD logging.info( f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} for {handle}" ) @@ -423,6 +494,79 @@ def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client: raise raise RuntimeError("Bluesky login failed: exhausted all retries.") +======= + client.login(handle, app_password) + # Fetch profile to confirm the session is fully live + client.me = client.get_profile(handle) + logging.info(f"✅ Bluesky login successful as {handle}") + return client + + except Exception as e: + last_error = e + err_detail = f"{type(e).__name__}: {e}" + + # ── Auth errors: no point retrying ─────────────────────────── + if is_auth_error(e): + logging.error( + f"❌ Bluesky login auth error (will not retry): {err_detail}" + ) + raise + + # ── Rate-limited (429) ──────────────────────────────────────── + if is_rate_limited_error(e): + raw_wait = get_rate_limit_wait_seconds(e, BSKY_LOGIN_RATE_LIMIT_DELAY) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = min(raw_wait + jitter, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY) + logging.warning( + f"⏳ Bluesky login rate-limited (attempt {attempt}/" + f"{BSKY_LOGIN_MAX_RETRIES}). " + f"Waiting {wait:.1f}s (server requested {raw_wait:.0f}s)." + ) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) + continue + + # ── Network / transient errors ──────────────────────────────── + if is_network_error(e) or is_transient_error(e): + delay = min( + BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_LOGIN_MAX_DELAY, + ) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter + logging.warning( + f"⚠️ Bluesky login network/transient error " + f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " + f"{err_detail}. Retrying in {wait:.1f}s." + ) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) + continue + + # ── Unknown errors ──────────────────────────────────────────── + delay = min( + BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_LOGIN_MAX_DELAY, + ) + jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX) + wait = delay + jitter + logging.warning( + f"⚠️ Bluesky login failed " + f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): " + f"{err_detail}. Retrying in {wait:.1f}s." + ) + if attempt < BSKY_LOGIN_MAX_RETRIES: + time.sleep(wait) + + logging.error( + f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts. " + f"Last error: {type(last_error).__name__}: {last_error}" + ) + raise RuntimeError( + f"Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts: " + f"{last_error}" + ) +>>>>>>> 7cddbd0 (Fixes for today) # ───────────────────────────────────────────────────────────────────────────── @@ -463,7 +607,11 @@ def compress_video( • post-encode size guard — rejects file if still over limit """ if max_size_bytes is None: +<<<<<<< HEAD max_size_bytes = 20 * 1024 * 1024 # fallback +======= + max_size_bytes = 20 * 1024 * 1024 +>>>>>>> 7cddbd0 (Fixes for today) try: duration = get_video_duration(input_path) @@ -477,7 +625,10 @@ def compress_video( trim_to = min(duration, max_duration) +<<<<<<< HEAD # Target 85% of the size budget to leave headroom for container overhead +======= +>>>>>>> 7cddbd0 (Fixes for today) target_bits = max_size_bytes * 8 * 0.85 total_kbps = int(target_bits / trim_to / 1000) audio_kbps = 96 @@ -493,10 +644,13 @@ def compress_video( "ffmpeg", "-y", "-i", input_path, "-t", str(trim_to), +<<<<<<< HEAD # Scale to 720p max, then pad to even dimensions. # The pad filter is required because libx264 needs width/height # divisible by 2. Portrait TikTok videos (9:16) would otherwise # produce odd widths like 405px and crash the encoder. +======= +>>>>>>> 7cddbd0 (Fixes for today) "-vf", ( "scale='min(1280,iw)':'min(720,ih)'" ":force_original_aspect_ratio=decrease," @@ -504,7 +658,11 @@ def compress_video( ), "-c:v", "libx264", "-b:v", f"{video_kbps}k", +<<<<<<< HEAD "-maxrate", f"{video_kbps}k", # hard ceiling — no burst above target +======= + "-maxrate", f"{video_kbps}k", +>>>>>>> 7cddbd0 (Fixes for today) "-bufsize", f"{video_kbps * 2}k", "-c:a", "aac", "-b:a", f"{audio_kbps}k", @@ -520,7 +678,10 @@ def compress_video( final_size = os.path.getsize(output_path) +<<<<<<< HEAD # Reject if still over the hard limit +======= +>>>>>>> 7cddbd0 (Fixes for today) if final_size > max_size_bytes: logging.error( f"❌ Compressed file still too large: " @@ -572,7 +733,10 @@ def download_video_ytdlp( """ Download a TikTok video using yt-dlp with browser impersonation. Accepts a Netscape-format cookie file path (not JSON). +<<<<<<< HEAD Returns True on success, False on failure. +======= +>>>>>>> 7cddbd0 (Fixes for today) """ impersonate = get_best_impersonation_target() @@ -633,7 +797,11 @@ def upload_video_to_bluesky( ) -> object | None: """ Upload a video file to Bluesky as a blob. +<<<<<<< HEAD Exception is always logged as type(e).__name__: e for full visibility. +======= + All exceptions logged as type(e).__name__: e for full visibility. +>>>>>>> 7cddbd0 (Fixes for today) """ size_mb = os.path.getsize(video_path) / 1024 / 1024 logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") @@ -717,6 +885,90 @@ def dismiss_overlays(page) -> None: pass +<<<<<<< HEAD +======= +def _run_playwright_scrape_loop(page, profile_url: str, limit: int) -> list[dict]: + """ + Inner scraping loop shared by both the stealth and no-stealth paths. + Returns a list of video dicts. + """ + videos = [] + + for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): + try: + logging.info( + f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..." + ) + page.goto( + profile_url, + wait_until="domcontentloaded", + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + time.sleep(3) + dismiss_overlays(page) + + try: + page.wait_for_selector( + TIKTOK_VIDEO_GRID_SEL, + timeout=PLAYWRIGHT_TIMEOUT_MS, + ) + except Exception: + pass + + grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first + if not grid.is_visible(timeout=5000): + logging.warning(f"⚠️ Video grid not found on attempt {attempt}.") + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_no_grid_{attempt}_{ts}.png") + logging.info( + f"📸 Screenshot saved: screenshot_no_grid_{attempt}_{ts}.png" + ) + except Exception: + pass + time.sleep(3) + continue + + items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all() + for item in items[:limit]: + try: + link = item.locator("a").first.get_attribute("href") + if link and "/video/" in link: + vid_match = re.search(r"/video/(\d+)", link) + if vid_match: + video_id = vid_match.group(1) + full_url = ( + link if link.startswith("http") + else f"https://www.tiktok.com{link}" + ) + videos.append({ + "video_id": video_id, + "url": full_url, + "timestamp": None, + }) + except Exception: + pass + + if videos: + logging.info(f"✅ Playwright scraped {len(videos)} videos.") + break + + except Exception as e: + logging.warning( + f"⚠️ Playwright attempt {attempt} error: " + f"{type(e).__name__}: {e}" + ) + ts = int(time.time()) + try: + page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png") + except Exception: + pass + time.sleep(3) + + return videos + + +>>>>>>> 7cddbd0 (Fixes for today) def scrape_tiktok_profile_playwright( handle: str, cookies: list, @@ -724,10 +976,18 @@ def scrape_tiktok_profile_playwright( ) -> list[dict]: """ Scrape the most recent video URLs from a TikTok profile page using Playwright. +<<<<<<< HEAD Returns a list of dicts with keys: video_id, url, timestamp. Stealth fix: playwright-stealth v2.x must wrap the page via a context manager on new_page(), not via .apply() or .use_sync() after the fact. +======= + + Stealth handling: + v1.x → stealth_sync(page) after new_page() + v2.x → Stealth() used as context manager; page created inside it + none → plain page, no stealth +>>>>>>> 7cddbd0 (Fixes for today) """ profile_url = f"https://www.tiktok.com/@{handle}" logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") @@ -756,6 +1016,7 @@ def scrape_tiktok_profile_playwright( inject_cookies_into_context(context, cookies) +<<<<<<< HEAD # ── Stealth application ─────────────────────────────────────────── # v1.x: stealth_sync(page) — called after new_page() # v2.x: context manager on new_page — page must be created inside @@ -771,11 +1032,16 @@ def scrape_tiktok_profile_playwright( elif _STEALTH_V2: # v2.x — use as context manager so the page is created inside it +======= + # ── Stealth v2.x — page must be created inside the context manager ── + if _STEALTH_V2 is True: +>>>>>>> 7cddbd0 (Fixes for today) try: stealth_instance = Stealth() with stealth_instance(context) as stealthy_context: page = stealthy_context.new_page() logging.info("🥷 playwright-stealth v2.x applied (context manager).") +<<<<<<< HEAD # Run the scraping loop inside the context manager scope for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): try: @@ -864,12 +1130,27 @@ def scrape_tiktok_profile_playwright( else: # v1.x — create page then apply stealth +======= + videos = _run_playwright_scrape_loop(page, profile_url, limit) + except Exception as e: + logging.warning( + f"⚠️ playwright-stealth v2.x failed: {type(e).__name__}: {e}. " + f"Retrying without stealth." + ) + # Fall through to no-stealth path below + page = context.new_page() + videos = _run_playwright_scrape_loop(page, profile_url, limit) + + # ── Stealth v1.x ────────────────────────────────────────────────── + elif _STEALTH_V2 is False: +>>>>>>> 7cddbd0 (Fixes for today) page = context.new_page() try: stealth_sync(page) logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") except Exception as e: logging.warning( +<<<<<<< HEAD f"⚠️ playwright-stealth v1.x failed: " f"{type(e).__name__}: {e}. Continuing without stealth." ) @@ -971,6 +1252,35 @@ def scrape_tiktok_profile_playwright( pass # ── Cleanup ─────────────────────────────────────────────────────── +======= + f"⚠️ playwright-stealth v1.x failed: {type(e).__name__}: {e}. " + f"Continuing without stealth." + ) + videos = _run_playwright_scrape_loop(page, profile_url, limit) + + # ── No stealth available ────────────────────────────────────────── + else: + logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.") + page = context.new_page() + videos = _run_playwright_scrape_loop(page, profile_url, limit) + + if not videos: + logging.warning( + f"⚠️ Video grid not found after {PLAYWRIGHT_MAX_RELOADS} attempts." + ) + ts = int(time.time()) + try: + page.screenshot( + path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) + logging.info( + f"📸 Screenshot saved: " + f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png" + ) + except Exception: + pass + +>>>>>>> 7cddbd0 (Fixes for today) for obj in (page, context, browser): try: if obj: @@ -992,7 +1302,10 @@ def scrape_tiktok_profile_ytdlp( """ Fallback: use yt-dlp to extract the video list from a TikTok profile. Accepts a Netscape-format cookie file path (not JSON). +<<<<<<< HEAD Returns a list of dicts with keys: video_id, url, timestamp. +======= +>>>>>>> 7cddbd0 (Fixes for today) """ import yt_dlp @@ -1060,7 +1373,11 @@ def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> s url = video_info.get("url", "") if desc: +<<<<<<< HEAD url_len = len(url) + 1 # +1 for newline +======= + url_len = len(url) + 1 +>>>>>>> 7cddbd0 (Fixes for today) max_desc = max_len - url_len if len(desc) > max_desc: desc = desc[: max_desc - 1] + "…" @@ -1097,7 +1414,10 @@ def process_videos( logging.info(f"⏭️ Already posted: {video_id}") continue +<<<<<<< HEAD # Age filter (only when timestamp is available) +======= +>>>>>>> 7cddbd0 (Fixes for today) ts = video.get("timestamp") if ts: try: @@ -1150,7 +1470,10 @@ def process_videos( if ok: mark_as_posted(video_id, state, meta={"url": video_url}) posted_count += 1 +<<<<<<< HEAD # Brief pause between posts to avoid rate limiting +======= +>>>>>>> 7cddbd0 (Fixes for today) time.sleep(random.uniform(2.0, 5.0)) return posted_count @@ -1162,6 +1485,7 @@ def process_videos( def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Cross-post TikTok videos to Bluesky." +<<<<<<< HEAD ) parser.add_argument( "--tiktok-handle", @@ -1200,6 +1524,34 @@ def parse_args() -> argparse.Namespace: default=VIDEO_MAX_AGE_DAYS, help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})", ) +======= + ) + parser.add_argument("--tiktok-handle", required=True) + parser.add_argument("--bsky-handle", required=True) + parser.add_argument("--bsky-app-password", required=True) + parser.add_argument( + "--bsky-base-url", + default=DEFAULT_BSKY_BASE_URL, + help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})", + ) + parser.add_argument( + "--bsky-langs", + nargs="+", + default=DEFAULT_BSKY_LANGS, + help="BCP-47 language tags for posts (default: es)", + ) + parser.add_argument( + "--cookies-path", + default=TIKTOK_COOKIES_PATH, + help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})", + ) + parser.add_argument( + "--max-age-days", + type=int, + default=VIDEO_MAX_AGE_DAYS, + help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})", + ) +>>>>>>> 7cddbd0 (Fixes for today) return parser.parse_args() @@ -1207,7 +1559,10 @@ def main(): load_dotenv() args = parse_args() +<<<<<<< HEAD # Fix 2 — resolve video size limit based on PDS +======= +>>>>>>> 7cddbd0 (Fixes for today) video_max_size_bytes = get_video_size_limit(args.bsky_base_url) logging.info("=" * 60) @@ -1230,17 +1585,30 @@ def main(): args.bsky_base_url, ) +<<<<<<< HEAD # Convert JSON cookies → Netscape format for yt-dlp # Playwright uses the JSON cookies directly via inject_cookies_into_context() # yt-dlp requires Netscape .txt format — convert once and reuse +======= + # Convert JSON cookies → Netscape format once for all yt-dlp calls +>>>>>>> 7cddbd0 (Fixes for today) netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path) if netscape_cookies_path: logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}") else: +<<<<<<< HEAD logging.warning("⚠️ Could not create Netscape cookie file. yt-dlp will run without cookies.") try: # Scrape TikTok profile +======= + logging.warning( + "⚠️ Could not create Netscape cookie file. " + "yt-dlp will run without cookies." + ) + + try: +>>>>>>> 7cddbd0 (Fixes for today) logging.info(f"🔄 Scraping @{args.tiktok_handle}...") cookies = load_cookies_from_file(args.cookies_path) @@ -1290,7 +1658,13 @@ def main(): if netscape_cookies_path and os.path.exists(netscape_cookies_path): try: os.remove(netscape_cookies_path) +<<<<<<< HEAD logging.info(f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}") +======= + logging.info( + f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}" + ) +>>>>>>> 7cddbd0 (Fixes for today) except Exception as e: logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}")