#!/usr/bin/env python3
"""
tiktok2bsky.py
──────────────
Scrapes recent videos from a public TikTok profile and cross-posts
them to a Bluesky account.

Usage:
    python tiktok2bsky.py \
        --tiktok-handle jijantesfc \
        --bsky-handle jijantesfc.bsky.social \
        --bsky-app-password xxxx-xxxx-xxxx-xxxx \
        --bsky-base-url https://bsky.social \
        --bsky-langs es \
        --cookies-path tiktok_cookies.json
"""
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import os
|
||
import random
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import time
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
import arrow
|
||
import httpx
|
||
from atproto import Client
|
||
from dotenv import load_dotenv
|
||
from playwright.sync_api import sync_playwright
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# playwright-stealth: detect installed version
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Tri-state flag describing which playwright-stealth API could be imported:
#   None  → the package is not importable at all
#   False → the 1.x API (stealth_sync) is available
#   True  → only the 2.x API (Stealth class) is available
_STEALTH_V2 = None

try:
    from playwright_stealth import stealth_sync
except ImportError:
    try:
        from playwright_stealth import Stealth
    except ImportError:
        pass  # stealth disabled — warning emitted at runtime
    else:
        _STEALTH_V2 = True
else:
    _STEALTH_V2 = False
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Logging
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Root logger: INFO and above, written both to stdout and to tiktok2bsky.log.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler("tiktok2bsky.log", encoding="utf-8"),
]

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_log_handlers,
)
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Constants & defaults
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Defaults used when the corresponding CLI option is not supplied.
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
DEFAULT_BSKY_LANGS = ["es"]
TIKTOK_COOKIES_PATH = "tiktok_cookies.json"

# Persistent record of already-posted videos; capped at STATE_MAX_ENTRIES.
STATE_FILE = "tiktok2bsky_state.json"
STATE_MAX_ENTRIES = 5000

SCRAPE_VIDEO_LIMIT = 30
VIDEO_MAX_AGE_DAYS = 3

VIDEO_MAX_DURATION_S = 179  # Bluesky hard limit is 180s

# ── Bluesky login retry config (ported from twitter2bsky.py) ─────────────────
BSKY_LOGIN_MAX_RETRIES = 6
BSKY_LOGIN_BASE_DELAY = 15.0
BSKY_LOGIN_MAX_DELAY = 600.0
BSKY_LOGIN_JITTER_MAX = 5.0
BSKY_LOGIN_RATE_LIMIT_DELAY = 90.0  # minimum wait on 429
BSKY_LOGIN_RATE_LIMIT_MAX_DELAY = 600.0  # maximum wait on 429

# ── Bluesky upload retry config ───────────────────────────────────────────────
BSKY_UPLOAD_MAX_RETRIES = 5
BSKY_UPLOAD_BASE_DELAY = 10.0
BSKY_UPLOAD_MAX_DELAY = 120.0
BSKY_UPLOAD_JITTER_MAX = 5.0

# ── Playwright scraping config ────────────────────────────────────────────────
PLAYWRIGHT_TIMEOUT_MS = 30_000
PLAYWRIGHT_SLOW_MO = 50
PLAYWRIGHT_MAX_RELOADS = 3

# ── TikTok selectors ──────────────────────────────────────────────────────────
TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]'
TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]'

# Generic banners / modals, with English and Spanish button labels.
TIKTOK_BANNER_SELS = [
    '[id*="banner"]',
    '[class*="banner"]',
    '[data-e2e="recommend-modal-close"]',
    'button:has-text("Rechazar")',
    'button:has-text("Reject")',
    'button:has-text("Accept")',
    'button:has-text("Aceptar")',
    '[aria-label="Close"]',
    '[aria-label="Cerrar"]',
]

# Cookie-consent dialog buttons, with English and Spanish labels.
TIKTOK_COOKIE_MODAL_SELS = [
    'button:has-text("Decline all")',
    'button:has-text("Rechazar todo")',
    'button:has-text("Reject all")',
    'button:has-text("Accept all")',
    'button:has-text("Aceptar todo")',
    '[class*="cookie"] button',
    '[id*="cookie"] button',
]

TIKTOK_GRID_ERROR_SEL = '[data-e2e="user-post-item-list-error"]'
TIKTOK_REFRESH_BTN_SEL = 'button:has-text("Actualizar"), button:has-text("Refresh")'
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Fix 2 — Dynamic video size limit based on PDS
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def get_video_size_limit(bsky_base_url: str) -> int:
    """
    Return the video size budget, in bytes, for the given PDS base URL.

    The official PDS (host ``bsky.social`` or one of its subdomains) gets a
    20 MB budget. Any other host, or an empty/unparseable URL, gets a
    conservative 10 MB budget.

    The decision is made on the parsed hostname rather than on a substring
    of the whole URL, so ``bsky.social`` appearing in a path, query string
    or as part of another domain name does not count as the official PDS.
    """
    from urllib.parse import urlsplit

    raw = (bsky_base_url or "").strip()
    try:
        # urlsplit only populates the hostname when a scheme or a leading
        # "//" is present, so bare hosts such as "bsky.social" get a prefix.
        parsed = urlsplit(raw if "://" in raw else f"//{raw}")
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): treat as third-party.
        host = ""

    if host == "bsky.social" or host.endswith(".bsky.social"):
        return 20 * 1024 * 1024  # 20 MB — official PDS
    return 10 * 1024 * 1024  # 10 MB — safe for third-party PDS
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# State management
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def load_state() -> dict:
    """
    Read the persisted state from STATE_FILE.

    Returns the parsed JSON object on success. When the file does not exist,
    or cannot be read/parsed, a fresh ``{"posted": {}}`` state is returned
    (a warning is logged in the failure case).
    """
    if not os.path.exists(STATE_FILE):
        return {"posted": {}}

    try:
        with open(STATE_FILE, "r", encoding="utf-8") as fh:
            loaded = json.load(fh)
        logging.info(
            f"📂 Loaded state: {len(loaded.get('posted', {}))} entries."
        )
        return loaded
    except Exception as e:
        logging.warning(f"⚠️ Could not load state file: {e}. Starting fresh.")

    return {"posted": {}}
|
||
|
||
|
||
def save_state(state: dict):
    """
    Write ``state`` to STATE_FILE as indented UTF-8 JSON.

    Before writing, the "posted" mapping is trimmed to STATE_MAX_ENTRIES by
    dropping the entries with the smallest "posted_at" values. Write errors
    are logged and not raised.
    """
    entries = state.get("posted", {})
    overflow = len(entries) - STATE_MAX_ENTRIES

    if overflow > 0:
        # Oldest first; entries without "posted_at" sort before everything.
        oldest_first = sorted(
            entries,
            key=lambda vid: entries[vid].get("posted_at", ""),
        )
        for stale_id in oldest_first[:overflow]:
            entries.pop(stale_id)
        state["posted"] = entries

    try:
        with open(STATE_FILE, "w", encoding="utf-8") as fh:
            json.dump(state, fh, indent=2, ensure_ascii=False)
    except Exception as e:
        logging.error(f"❌ Could not save state: {e}")
|
||
|
||
|
||
def is_already_posted(video_id: str, state: dict) -> bool:
    """Return True when ``video_id`` is a key of ``state["posted"]``."""
    posted_ids = state.get("posted", {})
    return video_id in posted_ids
|
||
|
||
|
||
def mark_as_posted(video_id: str, state: dict, meta: dict = None):
    """
    Record ``video_id`` as posted and persist the state immediately.

    The stored record holds the current UTC time under "posted_at" plus any
    key/value pairs from ``meta`` (which take precedence on key clashes).
    """
    record = {"posted_at": arrow.utcnow().isoformat()}
    if meta:
        record.update(meta)

    state.setdefault("posted", {})[video_id] = record
    save_state(state)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Cookie helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def load_cookies_from_file(path: str) -> list:
    """
    Parse the JSON document at ``path`` and return it.

    Returns an empty list (and logs a warning) when the file is missing or
    cannot be read/parsed.
    """
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as fh:
                loaded = json.load(fh)
            logging.info(f"🍪 Loaded {len(loaded)} cookies from {path}")
            return loaded
        except Exception as e:
            logging.warning(f"⚠️ Could not load cookies from {path}: {e}")
    else:
        logging.warning(f"⚠️ Cookie file not found: {path}")

    return []
|
||
|
||
|
||
def inject_cookies_into_context(context, cookies: list):
    """
    Inject a list of cookie dicts into a Playwright browser context.

    Each input dict is converted to the shape passed to
    ``context.add_cookies``: name, value, domain (default ".tiktok.com"),
    path (default "/"), secure, httpOnly, sameSite and, when a positive
    expiry is present under "expirationDate" or "expires", expires.

    ``sameSite`` is normalised to "Strict", "Lax" or "None"; missing or
    unrecognised values become "None". An expiry that is not numeric is
    ignored instead of aborting the whole injection.

    Failures of ``add_cookies`` itself are logged as a warning, not raised.
    Does nothing when ``cookies`` is empty.
    """
    if not cookies:
        return

    # NOTE(review): browser-extension exports appear to use lowercase values
    # such as "no_restriction" / "lax" / "unspecified", while Playwright
    # documents only "Strict" | "Lax" | "None" — confirm against the actual
    # cookie file in use.
    same_site_map = {
        "strict": "Strict",
        "lax": "Lax",
        "none": "None",
        "no_restriction": "None",
    }

    playwright_cookies = []
    for c in cookies:
        raw_same_site = str(c.get("sameSite") or "None").lower()
        entry = {
            "name": c.get("name", ""),
            "value": c.get("value", ""),
            "domain": c.get("domain", ".tiktok.com"),
            "path": c.get("path", "/"),
            "secure": c.get("secure", False),
            "httpOnly": c.get("httpOnly", False),
            "sameSite": same_site_map.get(raw_same_site, "None"),
        }

        exp = c.get("expirationDate") or c.get("expires")
        try:
            exp_value = float(exp) if exp else 0.0
        except (TypeError, ValueError):
            exp_value = 0.0  # unparseable expiry → treat as session cookie
        if exp_value > 0:
            entry["expires"] = exp_value

        playwright_cookies.append(entry)

    try:
        context.add_cookies(playwright_cookies)
        logging.info(f"🍪 Injected {len(playwright_cookies)} cookies into browser context.")
    except Exception as e:
        logging.warning(f"⚠️ Could not inject cookies: {e}")
|
||
|
||
|
||
def convert_json_cookies_to_netscape(json_path: str) -> str | None:
|
||
"""
|
||
Convert a JSON cookie file (browser extension format) to a Netscape
|
||
cookie file that yt-dlp can consume.
|
||
|
||
Returns the path to a temporary Netscape file, or None on failure.
|
||
The caller is responsible for deleting the file when done.
|
||
|
||
Netscape format columns (tab-separated):
|
||
domain include_subdomains path secure expiry name value
|
||
"""
|
||
try:
|
||
with open(json_path, "r", encoding="utf-8") as f:
|
||
cookies = json.load(f)
|
||
|
||
tmp = tempfile.NamedTemporaryFile(
|
||
mode="w",
|
||
suffix=".txt",
|
||
delete=False,
|
||
encoding="utf-8",
|
||
)
|
||
|
||
tmp.write("# Netscape HTTP Cookie File\n")
|
||
tmp.write("# Generated by tiktok2bsky.py\n\n")
|
||
|
||
for c in cookies:
|
||
domain = c.get("domain", ".tiktok.com")
|
||
<<<<<<< HEAD
|
||
# Netscape format requires domain to start with a dot for
|
||
# include_subdomains=TRUE to work correctly
|
||
include_sub = "TRUE" if domain.startswith(".") else "FALSE"
|
||
path = c.get("path", "/")
|
||
secure = "TRUE" if c.get("secure", False) else "FALSE"
|
||
expiry = int(
|
||
c.get("expirationDate") or c.get("expires") or 0
|
||
)
|
||
=======
|
||
include_sub = "TRUE" if domain.startswith(".") else "FALSE"
|
||
path = c.get("path", "/")
|
||
secure = "TRUE" if c.get("secure", False) else "FALSE"
|
||
expiry = int(c.get("expirationDate") or c.get("expires") or 0)
|
||
>>>>>>> 7cddbd0 (Fixes for today)
|
||
name = c.get("name", "")
|
||
value = c.get("value", "")
|
||
|
||
tmp.write(
|
||
f"{domain}\t{include_sub}\t{path}\t"
|
||
f"{secure}\t{expiry}\t{name}\t{value}\n"
|
||
)
|
||
|
||
tmp.close()
|
||
logging.info(
|
||
f"🍪 Converted {len(cookies)} cookies to Netscape format: {tmp.name}"
|
||
)
|
||
return tmp.name
|
||
|
||
except Exception as e:
|
||
logging.warning(
|
||
f"⚠️ Could not convert cookies to Netscape format: "
|
||
f"{type(e).__name__}: {e}"
|
||
)
|
||
return None
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Bluesky error classification (ported from twitter2bsky.py)
# ─────────────────────────────────────────────────────────────────────────────
|
||
def _bsky_error_text(error_obj) -> str:
    """Return ``repr(error_obj)`` lowercased, for substring matching."""
    rendered = repr(error_obj)
    return rendered.lower()
|
||
|
||
|
||
def is_rate_limited_error(error_obj) -> bool:
    """
    Return True when the lowercased repr of ``error_obj`` contains any
    rate-limit marker ("429", "ratelimitexceeded", "too many requests",
    "rate limit", "ratelimit").
    """
    haystack = repr(error_obj).lower()
    markers = (
        "429",
        "ratelimitexceeded",
        "too many requests",
        "rate limit",
        "ratelimit",
    )
    for marker in markers:
        if marker in haystack:
            return True
    return False
|
||
|
||
|
||
def is_auth_error(error_obj) -> bool:
    """
    Return True when the lowercased repr of ``error_obj`` contains any
    authentication-failure marker (HTTP 401/403, invalid credentials,
    invalid/expired token, account takedown).
    """
    haystack = repr(error_obj).lower()
    markers = (
        "401",
        "403",
        "invalid identifier",
        "invalid password",
        "authenticationrequired",
        "invalidtoken",
        "expiredtoken",
        "accounttakedown",
        "invalid identifier or password",
    )
    for marker in markers:
        if marker in haystack:
            return True
    return False
|
||
|
||
|
||
def is_network_error(error_obj) -> bool:
    """
    Return True when ``repr(error_obj)`` contains a network-failure marker.

    Matching is case-sensitive and covers connection/timeout exception
    names plus the status codes 502, 503 and 504.
    """
    rendered = repr(error_obj)
    markers = (
        "ConnectError", "RemoteProtocolError", "ReadTimeout",
        "WriteTimeout", "TimeoutException", "ConnectionResetError",
        "503", "502", "504",
    )
    for marker in markers:
        if marker in rendered:
            return True
    return False
|
||
|
||
|
||
def is_transient_error(error_obj) -> bool:
    """
    Return True when ``repr(error_obj)`` contains a transient-failure marker.

    Matching is case-sensitive and covers timeout/protocol exception names
    (including "InvokeTimeoutError") plus the status codes 502, 503 and 504.
    """
    rendered = repr(error_obj)
    markers = (
        "InvokeTimeoutError", "ReadTimeout", "WriteTimeout",
        "TimeoutException", "RemoteProtocolError", "ConnectError",
        "503", "502", "504",
    )
    for marker in markers:
        if marker in rendered:
            return True
    return False
|
||
|
||
|
||
def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float:
    """
    Extract the server-requested wait time from a rate-limit error.

    Checks (in order):
      1. error_obj.headers — Retry-After, X-RateLimit-After, RateLimit-Reset
      2. repr(error_obj) text — the same keys embedded as strings
      3. Falls back to default_delay

    Retry-After / X-RateLimit-After are read as a number of seconds (at
    least 1). RateLimit-Reset is read as an epoch timestamp: the wait is the
    time remaining plus a 2-second margin, and never less than
    default_delay. Every header-derived value is capped at
    BSKY_LOGIN_RATE_LIMIT_MAX_DELAY.

    Ported from twitter2bsky.py.
    """
    now_ts = int(time.time())
    cap = BSKY_LOGIN_RATE_LIMIT_MAX_DELAY

    def _bounded(value: int, is_reset_timestamp: bool) -> float:
        """Turn a raw header value into a capped wait in seconds."""
        if is_reset_timestamp:
            return min(max(value - now_ts + 2, default_delay), cap)
        return min(max(value, 1), cap)

    # ── 1. Live headers object ────────────────────────────────────────────
    try:
        headers = getattr(error_obj, "headers", None) or {}
        header_specs = (
            (("retry-after", "Retry-After"), False),
            (("x-ratelimit-after", "X-RateLimit-After"), False),
            (("ratelimit-reset", "RateLimit-Reset"), True),
        )
        for names, is_reset_timestamp in header_specs:
            for name in names:
                val = headers.get(name)
                if val:
                    return _bounded(int(val), is_reset_timestamp)
    except Exception:
        # Best effort: a non-dict headers object or a non-integer value
        # simply falls through to the repr() scan below.
        pass

    # ── 2. repr() string fallback ─────────────────────────────────────────
    text = repr(error_obj)
    for pattern, is_reset_timestamp in (
        (r"['\"]retry-after['\"]\s*:\s*['\"](\d+)['\"]", False),
        (r"['\"]x-ratelimit-after['\"]\s*:\s*['\"](\d+)['\"]", False),
        (r"['\"]ratelimit-reset['\"]\s*:\s*['\"](\d+)['\"]", True),
        (r"retry.?after[=:\s]+(\d+)", False),
    ):
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return _bounded(int(m.group(1)), is_reset_timestamp)

    # ── 3. Nothing usable found ───────────────────────────────────────────
    return default_delay
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Bluesky client — improved login (ported from twitter2bsky.py)
# ─────────────────────────────────────────────────────────────────────────────
def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client:
    """
    Authenticate with Bluesky, retrying up to BSKY_LOGIN_MAX_RETRIES times.

      • 429 / rate-limit   → wait the server-requested time (plus jitter),
                             capped at BSKY_LOGIN_RATE_LIMIT_MAX_DELAY
      • auth errors        → re-raised immediately (retrying won't help)
      • network/transient  → exponential backoff with jitter
      • other errors       → exponential backoff with jitter
      • exhausted retries  → RuntimeError, chained to the last error

    Returns the logged-in client, with ``client.me`` set to the profile
    fetched for ``handle``.
    """
    logging.info(f"🔐 Connecting Bluesky client → {base_url}")
    client = Client(base_url=base_url)

    last_error = None

    for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1):
        logging.info(
            f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} "
            f"for {handle}"
        )

        try:
            client.login(handle, app_password)
            # Fetch profile to confirm the session is fully live
            client.me = client.get_profile(handle)
            logging.info(f"✅ Bluesky login successful as {handle}")
            return client

        except Exception as e:
            last_error = e
            err_detail = f"{type(e).__name__}: {e}"

            # ── Auth errors: no point retrying ───────────────────────────
            if is_auth_error(e):
                logging.error(
                    f"❌ Bluesky login auth error (will not retry): {err_detail}"
                )
                raise

            jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX)

            if is_rate_limited_error(e):
                # ── Rate-limited (429) ────────────────────────────────────
                raw_wait = get_rate_limit_wait_seconds(e, BSKY_LOGIN_RATE_LIMIT_DELAY)
                wait = min(raw_wait + jitter, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY)
                logging.warning(
                    f"⏳ Bluesky login rate-limited (attempt {attempt}/"
                    f"{BSKY_LOGIN_MAX_RETRIES}). "
                    f"Waiting {wait:.1f}s (server requested {raw_wait:.0f}s)."
                )
            else:
                # ── Network / transient / unknown errors ──────────────────
                # Same exponential backoff for all of them; only the log
                # wording differs.
                delay = min(
                    BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)),
                    BSKY_LOGIN_MAX_DELAY,
                )
                wait = delay + jitter
                if is_network_error(e) or is_transient_error(e):
                    what = "network/transient error"
                else:
                    what = "failed"
                logging.warning(
                    f"⚠️ Bluesky login {what} "
                    f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): "
                    f"{err_detail}. Retrying in {wait:.1f}s."
                )

            # No point sleeping after the final attempt.
            if attempt < BSKY_LOGIN_MAX_RETRIES:
                time.sleep(wait)

    logging.error(
        f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts. "
        f"Last error: {type(last_error).__name__}: {last_error}"
    )
    raise RuntimeError(
        f"Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts: "
        f"{last_error}"
    ) from last_error
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Video helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def get_video_duration(path: str) -> float:
    """
    Ask ffprobe for the container duration of ``path``.

    Returns the duration in seconds, or 0.0 (after logging a warning) when
    ffprobe cannot be run, times out after 15 s, or prints nothing that
    parses as a float.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    try:
        probe = subprocess.run(
            probe_cmd,
            capture_output=True,
            text=True,
            timeout=15,
        )
        seconds = float(probe.stdout.strip())
    except Exception as e:
        logging.warning(f"⚠️ ffprobe failed for {path}: {e}")
        return 0.0
    return seconds
|
||
|
||
|
||
def compress_video(
    input_path: str,
    output_path: str,
    max_duration: int = VIDEO_MAX_DURATION_S,
    max_size_bytes: int = None,
) -> bool:
    """
    Re-encode input_path → output_path using libx264, targeting max_size_bytes.

    The video is trimmed to ``max_duration`` seconds and the bitrate is
    derived from the size budget. ``max_size_bytes`` defaults to 20 MB when
    not given.

    Fixes applied:
      • pad=ceil(iw/2)*2:ceil(ih/2)*2  — ensures even dimensions (libx264 requirement)
      • -maxrate == -b:v               — hard ceiling, no burst above target
      • post-encode size guard         — rejects file if still over limit

    Returns True when ffmpeg succeeds and the output fits the budget;
    False otherwise (all errors are logged, none are raised).
    """
    if max_size_bytes is None:
        max_size_bytes = 20 * 1024 * 1024  # fallback

    try:
        duration = get_video_duration(input_path)

        if duration <= 0:
            logging.error(
                f"❌ compress_video: invalid duration={duration} "
                f"for {input_path} ({os.path.getsize(input_path)} bytes)"
            )
            return False

        trim_to = min(duration, max_duration)

        # Target 85% of the size budget to leave headroom for container overhead
        target_bits = max_size_bytes * 8 * 0.85
        total_kbps = int(target_bits / trim_to / 1000)
        audio_kbps = 96
        # Never go below 200 kbit/s video, even if that busts the budget;
        # the post-encode size guard below catches that case.
        video_kbps = max(200, total_kbps - audio_kbps)

        logging.info(
            f"🎬 Compressing: duration={duration:.1f}s → trim={trim_to:.1f}s, "
            f"video_bitrate={video_kbps}k "
            f"(target ≤ {max_size_bytes // 1024 // 1024}MB)"
        )

        cmd = [
            "ffmpeg", "-y",
            "-i", input_path,
            "-t", str(trim_to),
            # Scale down to fit within 1280x720, then pad to even dimensions.
            # The pad filter is required because libx264 needs width/height
            # divisible by 2. Portrait TikTok videos (9:16) would otherwise
            # produce odd widths like 405px and crash the encoder.
            "-vf", (
                "scale='min(1280,iw)':'min(720,ih)'"
                ":force_original_aspect_ratio=decrease,"
                "pad=ceil(iw/2)*2:ceil(ih/2)*2"
            ),
            "-c:v", "libx264",
            "-b:v", f"{video_kbps}k",
            "-maxrate", f"{video_kbps}k",  # hard ceiling — no burst above target
            "-bufsize", f"{video_kbps * 2}k",
            "-c:a", "aac",
            "-b:a", f"{audio_kbps}k",
            "-movflags", "+faststart",
            "-pix_fmt", "yuv420p",
            output_path,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)

        if result.returncode != 0:
            logging.error(f"❌ ffmpeg failed:\n{result.stderr}")
            return False

        final_size = os.path.getsize(output_path)

        # Reject if still over the hard limit
        if final_size > max_size_bytes:
            logging.error(
                f"❌ Compressed file still too large: "
                f"{final_size / 1024 / 1024:.1f} MB > "
                f"{max_size_bytes / 1024 / 1024:.0f} MB limit. Skipping."
            )
            return False

        logging.info(
            f"✅ Compressed video: {final_size / 1024 / 1024:.1f} MB → {output_path}"
        )
        return True

    except Exception as e:
        logging.error(f"❌ compress_video error: {type(e).__name__}: {e}")
        return False
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# yt-dlp helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def get_best_impersonation_target() -> str | None:
|
||
"""
|
||
Dynamically select the best available curl_cffi impersonation target.
|
||
Returns None if curl_cffi is not installed or no target is available.
|
||
"""
|
||
try:
|
||
from curl_cffi.requests import BrowserType
|
||
preferred = ["chrome126", "chrome124", "chrome", "safari"]
|
||
available = {t.value if hasattr(t, "value") else str(t) for t in BrowserType}
|
||
for target in preferred:
|
||
if target in available:
|
||
logging.info(f"🎭 yt-dlp impersonation target: {target}")
|
||
return target
|
||
if available:
|
||
target = sorted(available)[0]
|
||
logging.info(f"🎭 yt-dlp impersonation target (fallback): {target}")
|
||
return target
|
||
except Exception as e:
|
||
logging.warning(f"⚠️ Could not check impersonation targets: {e}")
|
||
return None
|
||
|
||
|
||
def download_video_ytdlp(
    url: str,
    output_path: str,
    netscape_cookies_path: str = None,
) -> bool:
    """
    Download a TikTok video using yt-dlp with browser impersonation.

    Accepts a Netscape-format cookie file path (not JSON); it is only
    passed to yt-dlp when the file exists.

    Returns True when ``output_path`` exists afterwards and is larger than
    50 KiB; False on any yt-dlp error or when the output is missing or too
    small. Errors are logged, not raised.
    """
    impersonate = get_best_impersonation_target()

    ydl_opts = {
        "outtmpl": output_path,
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "quiet": False,
        "no_warnings": False,
        "merge_output_format": "mp4",
    }

    if netscape_cookies_path and os.path.exists(netscape_cookies_path):
        ydl_opts["cookiefile"] = netscape_cookies_path

    if impersonate:
        # NOTE(review): the target name is passed as a plain string. Confirm
        # that the installed yt-dlp accepts a string here rather than
        # requiring an ImpersonateTarget object.
        ydl_opts["impersonate"] = impersonate

    try:
        import yt_dlp

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        # Stat the output once; a missing file counts as 0 bytes.
        size_bytes = os.path.getsize(output_path) if os.path.exists(output_path) else 0

        if size_bytes > 50 * 1024:
            size_mb = size_bytes / 1024 / 1024
            logging.info(f"✅ yt-dlp download OK: {size_mb:.1f} MB")
            return True

        logging.warning(
            f"⚠️ yt-dlp output too small or missing: {output_path} "
            f"({size_bytes} bytes)"
        )
        return False

    except Exception as e:
        logging.error(
            f"❌ yt-dlp download failed for {url}: {type(e).__name__}: {e}"
        )
        return False
|
||
|
||
|
||
def download_video(
    url: str,
    output_path: str,
    netscape_cookies_path: str = None,
) -> bool:
    """Log the download and delegate to download_video_ytdlp; return its result."""
    logging.info(f"⬇️ Downloading: {url}")
    succeeded = download_video_ytdlp(
        url,
        output_path,
        netscape_cookies_path=netscape_cookies_path,
    )
    return succeeded
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Bluesky upload
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def upload_video_to_bluesky(
    client: Client,
    video_path: str,
    video_id: str,
) -> object | None:
    """
    Upload a video file to Bluesky as a blob.

    The file is read fully into memory and sent with ``client.upload_blob``,
    retrying up to BSKY_UPLOAD_MAX_RETRIES times. The delay between attempts
    doubles each time (capped at BSKY_UPLOAD_MAX_DELAY) and has random
    jitter added.

    Returns the ``.blob`` attribute of the upload response, or None when
    every attempt failed. All exceptions are logged as
    ``type(e).__name__: e`` for full visibility. Errors reading
    ``video_path`` are not caught here and propagate to the caller.
    """
    size_mb = os.path.getsize(video_path) / 1024 / 1024
    logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...")

    with open(video_path, "rb") as f:
        video_data = f.read()

    delay = BSKY_UPLOAD_BASE_DELAY

    for attempt in range(1, BSKY_UPLOAD_MAX_RETRIES + 1):
        try:
            blob = client.upload_blob(video_data)
            logging.info(f"✅ Blob uploaded successfully for {video_id}")
            return blob.blob

        except Exception as e:
            err_detail = f"{type(e).__name__}: {e}"

            if attempt >= BSKY_UPLOAD_MAX_RETRIES:
                logging.error(
                    f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: "
                    f"{err_detail}"
                )
                return None

            logging.warning(
                f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} "
                f"failed: {err_detail}. Retrying in {delay:.1f}s..."
            )
            # The logged delay excludes the jitter added here.
            time.sleep(delay + random.uniform(0, BSKY_UPLOAD_JITTER_MAX))
            delay = min(delay * 2, BSKY_UPLOAD_MAX_DELAY)

    # Only reached when BSKY_UPLOAD_MAX_RETRIES < 1 (the loop never runs).
    return None
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Bluesky post
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def post_video_to_bluesky(
    client: Client,
    blob,
    caption: str,
    langs: list[str],
    video_id: str,
) -> bool:
    """
    Publish a Bluesky post whose embed is the already-uploaded video blob.

    Returns True when ``client.send_post`` succeeds, False (after logging
    the error) when it raises.
    """
    from atproto import models

    try:
        embed = models.AppBskyEmbedVideo.Main(video=blob)
        post_kwargs = {"text": caption, "embed": embed, "langs": langs}
        client.send_post(**post_kwargs)
        logging.info(f"✅ Posted video {video_id} to Bluesky.")
        return True
    except Exception as e:
        failure = f"{type(e).__name__}: {e}"
        logging.error(
            f"❌ Failed to post video {video_id} to Bluesky: "
            f"{failure}"
        )
        return False
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# TikTok scraping — Playwright
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def dismiss_overlays(page) -> None:
    """
    Click away cookie dialogs and banners, best effort.

    Every selector from TIKTOK_COOKIE_MODAL_SELS then TIKTOK_BANNER_SELS is
    tried in order; the first matching element is clicked if it is visible.
    Any error for a selector is ignored and the next one is tried.
    """
    for selector in [*TIKTOK_COOKIE_MODAL_SELS, *TIKTOK_BANNER_SELS]:
        try:
            candidate = page.locator(selector).first
            if not candidate.is_visible(timeout=1500):
                continue
            candidate.click(timeout=1500)
            logging.info(f"🚫 Dismissed overlay: {selector}")
            time.sleep(0.5)
        except Exception:
            continue
|
||
|
||
|
def _run_playwright_scrape_loop(page, profile_url: str, limit: int) -> list[dict]:
    """
    Inner scraping loop shared by both the stealth and no-stealth paths.

    Loads ``profile_url`` up to PLAYWRIGHT_MAX_RELOADS times. On each attempt
    it dismisses overlays, waits for the video grid and reads the link of at
    most ``limit`` grid items. The loop stops at the first attempt that
    yields at least one video.

    Returns a list of dicts with keys "video_id", "url" and "timestamp"
    (always None here). The list is empty when every attempt failed; a
    screenshot is attempted on each failed attempt.
    """
    videos = []

    for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1):
        try:
            logging.info(
                f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..."
            )
            page.goto(
                profile_url,
                wait_until="domcontentloaded",
                timeout=PLAYWRIGHT_TIMEOUT_MS,
            )
            time.sleep(3)
            dismiss_overlays(page)

            try:
                page.wait_for_selector(
                    TIKTOK_VIDEO_GRID_SEL,
                    timeout=PLAYWRIGHT_TIMEOUT_MS,
                )
            except Exception:
                # A timeout here is handled by the visibility check below.
                pass

            grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first
            if not grid.is_visible(timeout=5000):
                logging.warning(f"⚠️ Video grid not found on attempt {attempt}.")
                ts = int(time.time())
                shot_path = f"screenshot_no_grid_{attempt}_{ts}.png"
                try:
                    page.screenshot(path=shot_path)
                    logging.info(f"📸 Screenshot saved: {shot_path}")
                except Exception:
                    pass  # the screenshot is diagnostic only
                time.sleep(3)
                continue

            items = page.locator(TIKTOK_VIDEO_ITEM_SEL).all()
            for item in items[:limit]:
                try:
                    link = item.locator("a").first.get_attribute("href")
                    if not link or "/video/" not in link:
                        continue
                    vid_match = re.search(r"/video/(\d+)", link)
                    if not vid_match:
                        continue
                    # Relative hrefs are resolved against the TikTok origin.
                    full_url = (
                        link if link.startswith("http")
                        else f"https://www.tiktok.com{link}"
                    )
                    videos.append({
                        "video_id": vid_match.group(1),
                        "url": full_url,
                        "timestamp": None,
                    })
                except Exception:
                    # One unreadable grid item must not abort the whole scrape.
                    pass

            if videos:
                logging.info(f"✅ Playwright scraped {len(videos)} videos.")
                break

        except Exception as e:
            logging.warning(
                f"⚠️ Playwright attempt {attempt} error: "
                f"{type(e).__name__}: {e}"
            )
            ts = int(time.time())
            try:
                page.screenshot(path=f"screenshot_error_{attempt}_{ts}.png")
            except Exception:
                pass  # the screenshot is diagnostic only
            time.sleep(3)

    return videos


|
||
def scrape_tiktok_profile_playwright(
    handle: str,
    cookies: list,
    limit: int = SCRAPE_VIDEO_LIMIT,
) -> list[dict]:
    """
    Scrape the most recent video URLs from a TikTok profile page using Playwright.

    Args:
        handle:  TikTok username (without the leading ``@``).
        cookies: Cookie entries handed to ``inject_cookies_into_context()``.
        limit:   Maximum number of grid items to inspect.

    Returns:
        A list of dicts with keys: video_id, url, timestamp. The list is empty
        when nothing could be scraped.

    Stealth handling (selected by the module-level ``_STEALTH_V2`` flag):
        v1.x (False) → stealth_sync(page) after new_page()
        v2.x (True)  → Stealth() used as context manager; page created inside it
        none (None)  → plain page, no stealth
    """
    profile_url = f"https://www.tiktok.com/@{handle}"
    logging.info(f"🕷️ Scraping TikTok profile: {profile_url}")

    videos = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            slow_mo=PLAYWRIGHT_SLOW_MO,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-setuid-sandbox",
            ],
        )
        # Pre-bind so the cleanup in ``finally`` can run no matter where a
        # failure happens below.
        context = None
        page = None

        try:
            context = browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/126.0.0.0 Safari/537.36"
                ),
                viewport={"width": 1280, "height": 900},
                locale="es-ES",
            )

            inject_cookies_into_context(context, cookies)

            # ── Stealth v2.x — page must be created inside the context manager ──
            if _STEALTH_V2 is True:
                try:
                    # NOTE(review): confirm that a Stealth instance can be called
                    # like this in the installed playwright-stealth release. Any
                    # failure here is caught below and the scrape is retried on
                    # a plain (no-stealth) page.
                    stealth_instance = Stealth()
                    with stealth_instance(context) as stealthy_context:
                        page = stealthy_context.new_page()
                        logging.info("🥷 playwright-stealth v2.x applied (context manager).")
                        videos = _run_playwright_scrape_loop(page, profile_url, limit)
                except Exception as e:
                    logging.warning(
                        f"⚠️ playwright-stealth v2.x failed: {type(e).__name__}: {e}. "
                        f"Retrying without stealth."
                    )
                    # Fall back to a fresh page without stealth.
                    page = context.new_page()
                    videos = _run_playwright_scrape_loop(page, profile_url, limit)

            # ── Stealth v1.x ──────────────────────────────────────────────────
            elif _STEALTH_V2 is False:
                page = context.new_page()
                try:
                    stealth_sync(page)
                    logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).")
                except Exception as e:
                    logging.warning(
                        f"⚠️ playwright-stealth v1.x failed: {type(e).__name__}: {e}. "
                        f"Continuing without stealth."
                    )
                videos = _run_playwright_scrape_loop(page, profile_url, limit)

            # ── No stealth available ──────────────────────────────────────────
            else:
                logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.")
                page = context.new_page()
                videos = _run_playwright_scrape_loop(page, profile_url, limit)

            if not videos:
                logging.warning(
                    f"⚠️ Video grid not found after {PLAYWRIGHT_MAX_RELOADS} attempts."
                )
                ts = int(time.time())
                # Best-effort diagnostic screenshot; never let it mask the result.
                try:
                    if page:
                        page.screenshot(
                            path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png"
                        )
                        logging.info(
                            f"📸 Screenshot saved: "
                            f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png"
                        )
                except Exception:
                    pass

        finally:
            # ── Cleanup ───────────────────────────────────────────────────────
            # Close innermost first; each close is best-effort so one failure
            # does not prevent the remaining objects from being released.
            for obj in (page, context, browser):
                try:
                    if obj:
                        obj.close()
                except Exception:
                    pass

    return videos
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# TikTok scraping — yt-dlp fallback
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def scrape_tiktok_profile_ytdlp(
    handle: str,
    netscape_cookies_path: str = None,
    limit: int = SCRAPE_VIDEO_LIMIT,
) -> list[dict]:
    """
    Fallback: use yt-dlp to extract the video list from a TikTok profile.

    Accepts a Netscape-format cookie file path (not JSON).

    Args:
        handle:                TikTok username (without the leading ``@``).
        netscape_cookies_path: Optional cookie file; only used when the path
                               exists on disk.
        limit:                 Maximum number of videos to return.

    Returns:
        A list of dicts with keys: video_id, url, timestamp. An empty list is
        returned when extraction fails for any reason (the error is logged).
    """
    import yt_dlp

    profile_url = f"https://www.tiktok.com/@{handle}"
    logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...")

    impersonate = get_best_impersonation_target()

    ydl_opts = {
        # Flat extraction: list the playlist entries without resolving each one.
        "extract_flat": True,
        "quiet": True,
        "no_warnings": True,
        "playlistend": limit,
    }
    if netscape_cookies_path and os.path.exists(netscape_cookies_path):
        ydl_opts["cookiefile"] = netscape_cookies_path
    if impersonate:
        ydl_opts["impersonate"] = impersonate

    try:
        logging.info(f"🌐 yt-dlp extracting: {profile_url}")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(profile_url, download=False)

        entries = info.get("entries", []) if info else []
        logging.info(
            f"✅ yt-dlp returned {len(entries)} entries "
            f"(playlist: {info.get('title', '?') if info else '?'})"
        )

        videos = []
        for entry in entries:
            if not entry:
                continue
            url = entry.get("url") or entry.get("webpage_url") or ""
            vid_match = re.search(r"/video/(\d+)", url)
            if not vid_match:
                # The entry URL carries no /video/<id> segment: rebuild a
                # canonical URL from the entry id and match again.
                vid_id = entry.get("id", "")
                if vid_id:
                    url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
                    vid_match = re.search(r"/video/(\d+)", url)
            if vid_match:
                videos.append({
                    "video_id": vid_match.group(1),
                    "url": url,
                    "timestamp": entry.get("timestamp"),
                })

        logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.")
        return videos[:limit]

    except Exception as e:
        logging.error(
            f"❌ yt-dlp profile scrape failed: {type(e).__name__}: {e}"
        )
        return []
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Caption builder
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> str:
    """Build a Bluesky post caption from video metadata.

    The caption is the video description followed by the video URL on its own
    line. The description is shortened (ending in "…") so that the whole
    caption fits in ``max_len`` characters; the URL is never truncated.

    Args:
        video_info:    Dict read for the ``description`` and ``url`` keys.
        tiktok_handle: Unused here; kept so existing callers keep working.
        max_len:       Maximum caption length in characters.

    Returns:
        ``"<description>\\n<url>"``, or just the URL when there is no
        description or no room left for one.
    """
    desc = (video_info.get("description") or "").strip()
    url = video_info.get("url", "")

    if not desc:
        return url

    url_len = len(url) + 1  # +1 for the newline separating text and URL
    max_desc = max_len - url_len

    # The URL alone uses up the budget: slicing with a non-positive bound
    # would produce a meaningless fragment and still overflow max_len.
    if max_desc < 1:
        return url

    if len(desc) > max_desc:
        desc = desc[: max_desc - 1] + "…"
    return f"{desc}\n{url}"
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main processing loop
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def process_videos(
    videos: list[dict],
    state: dict,
    client: Client,
    tiktok_handle: str,
    netscape_cookies_path: str,
    langs: list[str],
    max_age_days: int,
    video_max_size_bytes: int,
) -> int:
    """
    Download, compress, upload and post each new video.

    Args:
        videos:                Dicts with ``video_id`` and ``url`` keys and an
                               optional ``timestamp``.
        state:                 Posted-video state passed to
                               ``is_already_posted()`` / ``mark_as_posted()``.
        client:                Bluesky client used for upload and posting.
        tiktok_handle:         Forwarded to ``build_caption()``.
        netscape_cookies_path: Cookie file forwarded to ``download_video()``.
        langs:                 Language tags forwarded to the post call.
        max_age_days:          Videos with a known timestamp older than this
                               are skipped.
        video_max_size_bytes:  Size cap forwarded to ``compress_video()``.

    Returns:
        The count of successfully posted videos.
    """
    posted_count = 0
    now = arrow.utcnow()

    for video in videos:
        video_id = video["video_id"]
        video_url = video["url"]

        if is_already_posted(video_id, state):
            logging.info(f"⏭️ Already posted: {video_id}")
            continue

        # Age filter (only when timestamp is available)
        ts = video.get("timestamp")
        if ts:
            try:
                video_time = arrow.get(ts)
                age_days = (now - video_time).days
                if age_days > max_age_days:
                    logging.info(
                        f"⏭️ Video {video_id} too old "
                        f"({age_days}d > {max_age_days}d). Skipping."
                    )
                    continue
            except Exception as e:
                # An unparseable timestamp must not block posting: report it
                # and treat the video as if it had no timestamp.
                logging.warning(
                    f"⚠️ Could not evaluate age of video {video_id}: "
                    f"{type(e).__name__}: {e}. Processing anyway."
                )

        logging.info(f"🎬 Processing video {video_id}: {video_url}")

        # All intermediate files live in a throwaway directory that is removed
        # when the block exits, including on the early ``continue`` paths.
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4")
            comp_path = os.path.join(tmpdir, f"{video_id}.mp4")

            # 1. Download
            ok = download_video(
                video_url,
                raw_path,
                netscape_cookies_path=netscape_cookies_path,
            )
            if not ok:
                logging.error(f"❌ Download failed for {video_id}. Skipping.")
                continue

            # 2. Compress
            ok = compress_video(
                raw_path,
                comp_path,
                max_size_bytes=video_max_size_bytes,
            )
            if not ok:
                logging.error(f"❌ Compression failed for {video_id}. Skipping.")
                continue

            # 3. Upload blob
            blob = upload_video_to_bluesky(client, comp_path, video_id)
            if blob is None:
                logging.error(f"❌ Blob upload failed for {video_id}.")
                continue

            # 4. Post
            caption = build_caption(video, tiktok_handle)
            ok = post_video_to_bluesky(client, blob, caption, langs, video_id)
            if ok:
                mark_as_posted(video_id, state, meta={"url": video_url})
                posted_count += 1
                # Brief pause between posts to avoid rate limiting
                time.sleep(random.uniform(2.0, 5.0))

    return posted_count
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Entry point
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the bot.

    Required: ``--tiktok-handle``, ``--bsky-handle``, ``--bsky-app-password``.
    Optional (defaults come from module-level constants): ``--bsky-base-url``,
    ``--bsky-langs`` (one or more values), ``--cookies-path``, ``--max-age-days``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="Cross-post TikTok videos to Bluesky."
    )
    parser.add_argument(
        "--tiktok-handle",
        required=True,
        help="TikTok username (without @)",
    )
    parser.add_argument(
        "--bsky-handle",
        required=True,
        help="Bluesky handle",
    )
    parser.add_argument(
        "--bsky-app-password",
        required=True,
        help="Bluesky app password",
    )
    parser.add_argument(
        "--bsky-base-url",
        default=DEFAULT_BSKY_BASE_URL,
        help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})",
    )
    parser.add_argument(
        "--bsky-langs",
        nargs="+",
        default=DEFAULT_BSKY_LANGS,
        help="BCP-47 language tags for posts (default: es)",
    )
    parser.add_argument(
        "--cookies-path",
        default=TIKTOK_COOKIES_PATH,
        help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})",
    )
    parser.add_argument(
        "--max-age-days",
        type=int,
        default=VIDEO_MAX_AGE_DAYS,
        help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})",
    )
    return parser.parse_args()
|
||
|
||
|
||
def main():
    """Run one sync pass: scrape the TikTok profile and cross-post new videos.

    Steps: load environment and CLI options, resolve the video size cap for
    the chosen PDS, connect to Bluesky, convert the JSON cookies to a Netscape
    file for yt-dlp, scrape with Playwright (falling back to yt-dlp when that
    yields nothing), then hand the videos to ``process_videos()``. The
    temporary Netscape cookie file is removed on every exit path.
    """
    load_dotenv()
    args = parse_args()

    # Resolve video size limit based on PDS
    video_max_size_bytes = get_video_size_limit(args.bsky_base_url)

    logging.info("=" * 60)
    logging.info("🤖 TikTok→Bluesky bot started")
    logging.info(f" TikTok handle : @{args.tiktok_handle}")
    logging.info(f" Bluesky handle: {args.bsky_handle}")
    logging.info(f" Bluesky PDS : {args.bsky_base_url}")
    logging.info(f" Languages : {args.bsky_langs}")
    logging.info(f" Video size cap: {video_max_size_bytes // 1024 // 1024} MB")
    cookie_status = "✅ found" if os.path.exists(args.cookies_path) else "❌ NOT FOUND"
    logging.info(f" Cookie file : {args.cookies_path} ({cookie_status})")
    logging.info("=" * 60)

    state = load_state()

    # Connect to Bluesky
    client = connect_bluesky(
        args.bsky_handle,
        args.bsky_app_password,
        args.bsky_base_url,
    )

    # Convert JSON cookies → Netscape format once for all yt-dlp calls.
    # Playwright uses the JSON cookies directly via inject_cookies_into_context();
    # yt-dlp requires the Netscape .txt format.
    netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path)
    if netscape_cookies_path:
        logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}")
    else:
        logging.warning(
            "⚠️ Could not create Netscape cookie file. "
            "yt-dlp will run without cookies."
        )

    try:
        # Scrape TikTok profile
        logging.info(f"🔄 Scraping @{args.tiktok_handle}...")
        cookies = load_cookies_from_file(args.cookies_path)

        videos = scrape_tiktok_profile_playwright(
            args.tiktok_handle,
            cookies,
            limit=SCRAPE_VIDEO_LIMIT,
        )

        if not videos:
            # No screenshot is taken here, so none is announced in the log.
            logging.warning(
                "⚠️ Playwright grid scraping failed. Trying yt-dlp fallback..."
            )

            videos = scrape_tiktok_profile_ytdlp(
                args.tiktok_handle,
                netscape_cookies_path=netscape_cookies_path,
                limit=SCRAPE_VIDEO_LIMIT,
            )

        if not videos:
            logging.error("❌ No videos found. Exiting.")
            # SystemExit still passes through the ``finally`` cleanup below.
            sys.exit(0)

        logging.info(f"📋 Found {len(videos)} video(s). Processing new ones...")

        posted = process_videos(
            videos=videos,
            state=state,
            client=client,
            tiktok_handle=args.tiktok_handle,
            netscape_cookies_path=netscape_cookies_path,
            langs=args.bsky_langs,
            max_age_days=args.max_age_days,
            video_max_size_bytes=video_max_size_bytes,
        )

        logging.info("=" * 60)
        logging.info(f"✅ Sync complete. Posted {posted} new video(s).")
        logging.info("🤖 Bot finished.")
        logging.info("=" * 60)

    finally:
        # Always clean up the temporary Netscape cookie file
        if netscape_cookies_path and os.path.exists(netscape_cookies_path):
            try:
                os.remove(netscape_cookies_path)
                logging.info(
                    f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}"
                )
            except Exception as e:
                logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}")
||
|
||
|
||
# Run the bot only when executed as a script, not when imported.
if __name__ == "__main__":
    main()