This commit is contained in:
Guillem Hernandez Sola
2026-05-20 07:16:07 +02:00
parent c613ab3603
commit 6d4cfbd4b5

View File

@@ -78,24 +78,30 @@ VIDEO_MAX_AGE_DAYS = 3
VIDEO_MAX_DURATION_S = 179  # Bluesky hard limit is 180s

# ── Bluesky login retry config (ported from twitter2bsky.py) ─────────────────
BSKY_LOGIN_MAX_RETRIES = 6
BSKY_LOGIN_BASE_DELAY = 15.0   # seconds; doubled after every failed attempt
BSKY_LOGIN_MAX_DELAY = 600.0   # cap for the exponential backoff (before jitter)
BSKY_LOGIN_JITTER_MAX = 5.0    # upper bound of the random jitter added to a wait
BSKY_LOGIN_RATE_LIMIT_DELAY = 90.0       # default wait on 429 when no hint is found
BSKY_LOGIN_RATE_LIMIT_MAX_DELAY = 600.0  # maximum wait on 429

# ── Bluesky upload retry config ───────────────────────────────────────────────
BSKY_UPLOAD_MAX_RETRIES = 5
BSKY_UPLOAD_BASE_DELAY = 10.0
BSKY_UPLOAD_MAX_DELAY = 120.0
BSKY_UPLOAD_JITTER_MAX = 5.0

# ── Playwright scraping config ────────────────────────────────────────────────
PLAYWRIGHT_TIMEOUT_MS = 30_000
PLAYWRIGHT_SLOW_MO = 50
PLAYWRIGHT_MAX_RELOADS = 3

# ── TikTok selectors ──────────────────────────────────────────────────────────
TIKTOK_VIDEO_GRID_SEL = '[data-e2e="user-post-item-list"]'
TIKTOK_VIDEO_ITEM_SEL = '[data-e2e="user-post-item"]'
TIKTOK_BANNER_SELS = [ TIKTOK_BANNER_SELS = [
@@ -254,6 +260,7 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None:
for c in cookies: for c in cookies:
domain = c.get("domain", ".tiktok.com") domain = c.get("domain", ".tiktok.com")
<<<<<<< HEAD
# Netscape format requires domain to start with a dot for # Netscape format requires domain to start with a dot for
# include_subdomains=TRUE to work correctly # include_subdomains=TRUE to work correctly
include_sub = "TRUE" if domain.startswith(".") else "FALSE" include_sub = "TRUE" if domain.startswith(".") else "FALSE"
@@ -262,6 +269,12 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None:
expiry = int( expiry = int(
c.get("expirationDate") or c.get("expires") or 0 c.get("expirationDate") or c.get("expires") or 0
) )
=======
include_sub = "TRUE" if domain.startswith(".") else "FALSE"
path = c.get("path", "/")
secure = "TRUE" if c.get("secure", False) else "FALSE"
expiry = int(c.get("expirationDate") or c.get("expires") or 0)
>>>>>>> 7cddbd0 (Fixes for today)
name = c.get("name", "") name = c.get("name", "")
value = c.get("value", "") value = c.get("value", "")
@@ -285,110 +298,168 @@ def convert_json_cookies_to_netscape(json_path: str) -> str | None:
# ─────────────────────────────────────────────────────────────────────────────
# Bluesky error classification (ported from twitter2bsky.py)
# ─────────────────────────────────────────────────────────────────────────────
def _bsky_error_text(error_obj) -> str:
    """Return the lowercase ``repr`` of *error_obj* for pattern matching."""
    return repr(error_obj).lower()


def _mentions_status_code(text: str, *codes: str) -> bool:
    """Return True if *text* contains one of *codes* as a standalone number.

    A plain ``"401" in text`` also fires on unrelated digit runs such as an
    epoch timestamp (``1747401234``) or a length (``14290``), so the code
    must not be directly preceded or followed by another digit.
    """
    alternatives = "|".join(re.escape(code) for code in codes)
    return re.search(rf"(?<!\d)(?:{alternatives})(?!\d)", text) is not None


# Lowercase fragments that identify an authentication failure.
# "invalid identifier" also covers "invalid identifier or password".
_BSKY_AUTH_ERROR_MARKERS = (
    "invalid identifier",
    "invalid password",
    "authenticationrequired",
    "invalidtoken",
    "expiredtoken",
    "accounttakedown",
)


def is_rate_limited_error(error_obj) -> bool:
    """Return True if *error_obj* looks like a rate-limit (HTTP 429) failure.

    Matching is done on the lowercase ``repr`` of the error.  The bare word
    ``ratelimit`` is only accepted when it is not immediately followed by
    ``-``: header names such as ``ratelimit-reset`` can be embedded in the
    repr of an error and do not by themselves mean the request was throttled.
    """
    text = _bsky_error_text(error_obj)
    if _mentions_status_code(text, "429"):
        return True
    if "too many requests" in text or "rate limit" in text:
        return True
    # Matches e.g. "ratelimitexceeded" but not "ratelimit-remaining".
    return re.search(r"ratelimit(?!-)", text) is not None


def is_auth_error(error_obj) -> bool:
    """Return True if *error_obj* looks like an authentication failure.

    Recognises standalone 401/403 status codes and the fragments listed in
    ``_BSKY_AUTH_ERROR_MARKERS`` in the lowercase ``repr`` of the error.
    """
    text = _bsky_error_text(error_obj)
    if _mentions_status_code(text, "401", "403"):
        return True
    return any(marker in text for marker in _BSKY_AUTH_ERROR_MARKERS)
def is_network_error(error_obj) -> bool:
    """Return True if *error_obj* looks like a network-level failure.

    The ``repr`` of the error is searched (case-sensitively) for known
    exception class names, and for the gateway status codes 502/503/504.
    Status codes must stand alone: a digit run that merely contains one
    (e.g. ``15030``) is not treated as a match.
    """
    text = repr(error_obj)
    class_names = (
        "ConnectError", "RemoteProtocolError", "ReadTimeout",
        "WriteTimeout", "TimeoutException", "ConnectionResetError",
    )
    if any(name in text for name in class_names):
        return True
    return re.search(r"(?<!\d)(?:502|503|504)(?!\d)", text) is not None
def is_transient_error(error_obj) -> bool:
    """Return True if *error_obj* looks like a temporary, retryable failure.

    The ``repr`` of the error is searched (case-sensitively) for known
    timeout/protocol exception class names, and for the gateway status codes
    502/503/504.  Status codes must stand alone: a digit run that merely
    contains one (e.g. ``95042``) is not treated as a match.
    """
    text = repr(error_obj)
    class_names = (
        "InvokeTimeoutError", "ReadTimeout", "WriteTimeout",
        "TimeoutException", "RemoteProtocolError", "ConnectError",
    )
    if any(name in text for name in class_names):
        return True
    return re.search(r"(?<!\d)(?:502|503|504)(?!\d)", text) is not None
def get_rate_limit_wait_seconds(error_obj, default_delay: float) -> float:
    """
    Extract the wait time requested by a rate-limit error, in seconds.

    Checks (in order):
      1. ``error_obj.headers`` — Retry-After, X-RateLimit-After,
         RateLimit-Reset (header names are compared case-insensitively)
      2. ``repr(error_obj)`` text — the same keys embedded as strings
      3. Falls back to ``default_delay``

    Retry-After and X-RateLimit-After are read as a number of seconds.
    RateLimit-Reset is read as an epoch timestamp and converted to
    "seconds from now" (plus a 2s margin), never less than ``default_delay``.
    Every value taken from the error is clamped to the range
    ``1 .. BSKY_LOGIN_RATE_LIMIT_MAX_DELAY``.

    Ported from twitter2bsky.py.
    """
    now_ts = int(time.time())

    def _bounded(raw: int, is_timestamp: bool) -> float:
        # Timestamps become a relative wait; plain values are used as-is.
        if is_timestamp:
            wait = max(raw - now_ts + 2, default_delay)
        else:
            wait = max(raw, 1)
        return float(min(wait, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY))

    # ── 1. Live headers object ────────────────────────────────────────────
    raw_headers = getattr(error_obj, "headers", None) or {}
    try:
        headers = {str(key).lower(): value for key, value in raw_headers.items()}
    except (AttributeError, TypeError, ValueError):
        # `headers` is not a mapping we can read — rely on the repr fallback.
        headers = {}

    for key, is_timestamp in (
        ("retry-after", False),
        ("x-ratelimit-after", False),
        ("ratelimit-reset", True),
    ):
        value = headers.get(key)
        if not value:
            continue
        try:
            return _bounded(int(value), is_timestamp)
        except (TypeError, ValueError):
            # Not an integer (Retry-After may also be an HTTP date):
            # keep looking instead of giving up on the remaining headers.
            continue

    # ── 2. repr() string fallback ─────────────────────────────────────────
    text = repr(error_obj)
    for pattern, is_timestamp in (
        (r"['\"]retry-after['\"]\s*:\s*['\"](\d+)['\"]", False),
        (r"['\"]x-ratelimit-after['\"]\s*:\s*['\"](\d+)['\"]", False),
        (r"['\"]ratelimit-reset['\"]\s*:\s*['\"](\d+)['\"]", True),
        (r"retry.?after[=:\s]+(\d+)", False),
    ):
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return _bounded(int(match.group(1)), is_timestamp)

    # ── 3. Nothing usable found ───────────────────────────────────────────
    return default_delay
# ─────────────────────────────────────────────────────────────────────────────
# Bluesky client — improved login (ported from twitter2bsky.py)
# ─────────────────────────────────────────────────────────────────────────────
def connect_bluesky(handle: str, app_password: str, base_url: str) -> Client:
    """
    Authenticate with Bluesky, retrying failures that may be temporary.

    Up to ``BSKY_LOGIN_MAX_RETRIES`` attempts are made:
      • auth errors        → re-raised immediately (retrying won't help)
      • 429 / rate-limit   → wait as reported by get_rate_limit_wait_seconds()
                             plus jitter, capped at
                             BSKY_LOGIN_RATE_LIMIT_MAX_DELAY
      • any other error    → exponential backoff with jitter

    Args:
        handle: Bluesky handle to log in as.
        app_password: App password for that handle.
        base_url: Base URL handed to ``Client``.

    Returns:
        The logged-in client.

    Raises:
        RuntimeError: every attempt failed; chained to the last error.
        Exception: the original exception, when it is classified as an
            authentication error.
    """
    logging.info(f"🔐 Connecting Bluesky client → {base_url}")
    client = Client(base_url=base_url)
    last_error = None

    for attempt in range(1, BSKY_LOGIN_MAX_RETRIES + 1):
        logging.info(
            f"🔐 Bluesky login attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES} "
            f"for {handle}"
        )
        try:
            client.login(handle, app_password)
            # Fetch profile to confirm the session is fully live
            client.me = client.get_profile(handle)
        except Exception as e:
            last_error = e
            err_detail = f"{type(e).__name__}: {e}"

            # ── Auth errors: no point retrying ───────────────────────────
            if is_auth_error(e):
                logging.error(
                    f"❌ Bluesky login auth error (will not retry): {err_detail}"
                )
                raise

            jitter = random.uniform(0.0, BSKY_LOGIN_JITTER_MAX)

            if is_rate_limited_error(e):
                # ── Rate-limited (429) ────────────────────────────────────
                raw_wait = get_rate_limit_wait_seconds(
                    e, BSKY_LOGIN_RATE_LIMIT_DELAY
                )
                wait = min(raw_wait + jitter, BSKY_LOGIN_RATE_LIMIT_MAX_DELAY)
                summary = (
                    f"⏳ Bluesky login rate-limited (attempt {attempt}/"
                    f"{BSKY_LOGIN_MAX_RETRIES})."
                )
                plan = f"Waiting {wait:.1f}s (server requested {raw_wait:.0f}s)."
            else:
                # ── Network / transient / unknown: exponential backoff ────
                backoff = min(
                    BSKY_LOGIN_BASE_DELAY * (2 ** (attempt - 1)),
                    BSKY_LOGIN_MAX_DELAY,
                )
                wait = backoff + jitter
                if is_network_error(e) or is_transient_error(e):
                    label = "network/transient error"
                else:
                    label = "failed"
                summary = (
                    f"⚠️ Bluesky login {label} "
                    f"(attempt {attempt}/{BSKY_LOGIN_MAX_RETRIES}): "
                    f"{err_detail}."
                )
                plan = f"Retrying in {wait:.1f}s."

            if attempt < BSKY_LOGIN_MAX_RETRIES:
                logging.warning(f"{summary} {plan}")
                time.sleep(wait)
            else:
                # Last attempt: do not announce (or sleep for) a retry that
                # will never happen.
                logging.warning(f"{summary} No attempts left.")
        else:
            logging.info(f"✅ Bluesky login successful as {handle}")
            return client

    logging.error(
        f"❌ Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts. "
        f"Last error: {type(last_error).__name__}: {last_error}"
    )
    raise RuntimeError(
        f"Bluesky login failed after {BSKY_LOGIN_MAX_RETRIES} attempts: "
        f"{last_error}"
    ) from last_error
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
@@ -463,7 +607,11 @@ def compress_video(
• post-encode size guard — rejects file if still over limit • post-encode size guard — rejects file if still over limit
""" """
if max_size_bytes is None: if max_size_bytes is None:
<<<<<<< HEAD
max_size_bytes = 20 * 1024 * 1024 # fallback max_size_bytes = 20 * 1024 * 1024 # fallback
=======
max_size_bytes = 20 * 1024 * 1024
>>>>>>> 7cddbd0 (Fixes for today)
try: try:
duration = get_video_duration(input_path) duration = get_video_duration(input_path)
@@ -477,7 +625,10 @@ def compress_video(
trim_to = min(duration, max_duration) trim_to = min(duration, max_duration)
<<<<<<< HEAD
# Target 85% of the size budget to leave headroom for container overhead # Target 85% of the size budget to leave headroom for container overhead
=======
>>>>>>> 7cddbd0 (Fixes for today)
target_bits = max_size_bytes * 8 * 0.85 target_bits = max_size_bytes * 8 * 0.85
total_kbps = int(target_bits / trim_to / 1000) total_kbps = int(target_bits / trim_to / 1000)
audio_kbps = 96 audio_kbps = 96
@@ -493,10 +644,13 @@ def compress_video(
"ffmpeg", "-y", "ffmpeg", "-y",
"-i", input_path, "-i", input_path,
"-t", str(trim_to), "-t", str(trim_to),
<<<<<<< HEAD
# Scale to 720p max, then pad to even dimensions. # Scale to 720p max, then pad to even dimensions.
# The pad filter is required because libx264 needs width/height # The pad filter is required because libx264 needs width/height
# divisible by 2. Portrait TikTok videos (9:16) would otherwise # divisible by 2. Portrait TikTok videos (9:16) would otherwise
# produce odd widths like 405px and crash the encoder. # produce odd widths like 405px and crash the encoder.
=======
>>>>>>> 7cddbd0 (Fixes for today)
"-vf", ( "-vf", (
"scale='min(1280,iw)':'min(720,ih)'" "scale='min(1280,iw)':'min(720,ih)'"
":force_original_aspect_ratio=decrease," ":force_original_aspect_ratio=decrease,"
@@ -504,7 +658,11 @@ def compress_video(
), ),
"-c:v", "libx264", "-c:v", "libx264",
"-b:v", f"{video_kbps}k", "-b:v", f"{video_kbps}k",
<<<<<<< HEAD
"-maxrate", f"{video_kbps}k", # hard ceiling — no burst above target "-maxrate", f"{video_kbps}k", # hard ceiling — no burst above target
=======
"-maxrate", f"{video_kbps}k",
>>>>>>> 7cddbd0 (Fixes for today)
"-bufsize", f"{video_kbps * 2}k", "-bufsize", f"{video_kbps * 2}k",
"-c:a", "aac", "-c:a", "aac",
"-b:a", f"{audio_kbps}k", "-b:a", f"{audio_kbps}k",
@@ -520,7 +678,10 @@ def compress_video(
final_size = os.path.getsize(output_path) final_size = os.path.getsize(output_path)
<<<<<<< HEAD
# Reject if still over the hard limit # Reject if still over the hard limit
=======
>>>>>>> 7cddbd0 (Fixes for today)
if final_size > max_size_bytes: if final_size > max_size_bytes:
logging.error( logging.error(
f"❌ Compressed file still too large: " f"❌ Compressed file still too large: "
@@ -572,7 +733,10 @@ def download_video_ytdlp(
""" """
Download a TikTok video using yt-dlp with browser impersonation. Download a TikTok video using yt-dlp with browser impersonation.
Accepts a Netscape-format cookie file path (not JSON). Accepts a Netscape-format cookie file path (not JSON).
<<<<<<< HEAD
Returns True on success, False on failure. Returns True on success, False on failure.
=======
>>>>>>> 7cddbd0 (Fixes for today)
""" """
impersonate = get_best_impersonation_target() impersonate = get_best_impersonation_target()
@@ -633,7 +797,11 @@ def upload_video_to_bluesky(
) -> object | None: ) -> object | None:
""" """
Upload a video file to Bluesky as a blob. Upload a video file to Bluesky as a blob.
<<<<<<< HEAD
Exception is always logged as type(e).__name__: e for full visibility. Exception is always logged as type(e).__name__: e for full visibility.
=======
All exceptions logged as type(e).__name__: e for full visibility.
>>>>>>> 7cddbd0 (Fixes for today)
""" """
size_mb = os.path.getsize(video_path) / 1024 / 1024 size_mb = os.path.getsize(video_path) / 1024 / 1024
logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...") logging.info(f"⬆️ Uploading to Bluesky ({size_mb:.1f} MB)...")
@@ -717,6 +885,90 @@ def dismiss_overlays(page) -> None:
pass pass
def _scrape_loop_screenshot(page, path: str) -> bool:
    """Best-effort screenshot of *page* to *path*; return True if it was saved."""
    try:
        page.screenshot(path=path)
    except Exception as e:
        logging.debug(f"Screenshot {path} failed: {type(e).__name__}: {e}")
        return False
    return True


def _scrape_loop_video_entry(item) -> dict | None:
    """Build a video dict from one grid item, or None if it has no video link."""
    link = item.locator("a").first.get_attribute("href")
    if not link:
        return None
    vid_match = re.search(r"/video/(\d+)", link)
    if not vid_match:
        return None
    full_url = link if link.startswith("http") else f"https://www.tiktok.com{link}"
    return {
        "video_id": vid_match.group(1),
        "url": full_url,
        "timestamp": None,
    }


def _run_playwright_scrape_loop(page, profile_url: str, limit: int) -> list[dict]:
    """
    Inner scraping loop shared by both the stealth and no-stealth paths.

    Loads ``profile_url`` up to ``PLAYWRIGHT_MAX_RELOADS`` times and stops at
    the first load that yields at least one video.  Only the first ``limit``
    grid items of a load are inspected.

    Returns a list of dicts with keys ``video_id``, ``url`` and ``timestamp``
    (always ``None`` here), one per distinct video id; empty if no load
    produced a video.
    """
    videos = []
    seen_ids = set()  # a video id is reported once, even across reloads

    for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1):
        try:
            logging.info(
                f"🌐 Loading profile (attempt {attempt}/{PLAYWRIGHT_MAX_RELOADS})..."
            )
            page.goto(
                profile_url,
                wait_until="domcontentloaded",
                timeout=PLAYWRIGHT_TIMEOUT_MS,
            )
            time.sleep(3)
            dismiss_overlays(page)

            try:
                page.wait_for_selector(
                    TIKTOK_VIDEO_GRID_SEL,
                    timeout=PLAYWRIGHT_TIMEOUT_MS,
                )
            except Exception as e:
                # Not fatal: the visibility check below decides what happens.
                logging.debug(
                    f"wait_for_selector failed: {type(e).__name__}: {e}"
                )

            grid = page.locator(TIKTOK_VIDEO_GRID_SEL).first
            if not grid.is_visible(timeout=5000):
                logging.warning(f"⚠️ Video grid not found on attempt {attempt}.")
                shot_path = f"screenshot_no_grid_{attempt}_{int(time.time())}.png"
                if _scrape_loop_screenshot(page, shot_path):
                    logging.info(f"📸 Screenshot saved: {shot_path}")
                time.sleep(3)
                continue

            for item in page.locator(TIKTOK_VIDEO_ITEM_SEL).all()[:limit]:
                try:
                    entry = _scrape_loop_video_entry(item)
                except Exception as e:
                    # One unreadable item must not abort the whole page.
                    logging.debug(
                        f"Skipping grid item: {type(e).__name__}: {e}"
                    )
                    continue
                if entry and entry["video_id"] not in seen_ids:
                    seen_ids.add(entry["video_id"])
                    videos.append(entry)

            if videos:
                logging.info(f"✅ Playwright scraped {len(videos)} videos.")
                break

        except Exception as e:
            logging.warning(
                f"⚠️ Playwright attempt {attempt} error: "
                f"{type(e).__name__}: {e}"
            )
            _scrape_loop_screenshot(
                page, f"screenshot_error_{attempt}_{int(time.time())}.png"
            )
            time.sleep(3)

    return videos
def scrape_tiktok_profile_playwright( def scrape_tiktok_profile_playwright(
handle: str, handle: str,
cookies: list, cookies: list,
@@ -724,10 +976,18 @@ def scrape_tiktok_profile_playwright(
) -> list[dict]: ) -> list[dict]:
""" """
Scrape the most recent video URLs from a TikTok profile page using Playwright. Scrape the most recent video URLs from a TikTok profile page using Playwright.
<<<<<<< HEAD
Returns a list of dicts with keys: video_id, url, timestamp. Returns a list of dicts with keys: video_id, url, timestamp.
Stealth fix: playwright-stealth v2.x must wrap the page via a context manager Stealth fix: playwright-stealth v2.x must wrap the page via a context manager
on new_page(), not via .apply() or .use_sync() after the fact. on new_page(), not via .apply() or .use_sync() after the fact.
=======
Stealth handling:
v1.x → stealth_sync(page) after new_page()
v2.x → Stealth() used as context manager; page created inside it
none → plain page, no stealth
>>>>>>> 7cddbd0 (Fixes for today)
""" """
profile_url = f"https://www.tiktok.com/@{handle}" profile_url = f"https://www.tiktok.com/@{handle}"
logging.info(f"🕷️ Scraping TikTok profile: {profile_url}") logging.info(f"🕷️ Scraping TikTok profile: {profile_url}")
@@ -756,6 +1016,7 @@ def scrape_tiktok_profile_playwright(
inject_cookies_into_context(context, cookies) inject_cookies_into_context(context, cookies)
<<<<<<< HEAD
# ── Stealth application ─────────────────────────────────────────── # ── Stealth application ───────────────────────────────────────────
# v1.x: stealth_sync(page) — called after new_page() # v1.x: stealth_sync(page) — called after new_page()
# v2.x: context manager on new_page — page must be created inside # v2.x: context manager on new_page — page must be created inside
@@ -771,11 +1032,16 @@ def scrape_tiktok_profile_playwright(
elif _STEALTH_V2: elif _STEALTH_V2:
# v2.x — use as context manager so the page is created inside it # v2.x — use as context manager so the page is created inside it
=======
# ── Stealth v2.x — page must be created inside the context manager ──
if _STEALTH_V2 is True:
>>>>>>> 7cddbd0 (Fixes for today)
try: try:
stealth_instance = Stealth() stealth_instance = Stealth()
with stealth_instance(context) as stealthy_context: with stealth_instance(context) as stealthy_context:
page = stealthy_context.new_page() page = stealthy_context.new_page()
logging.info("🥷 playwright-stealth v2.x applied (context manager).") logging.info("🥷 playwright-stealth v2.x applied (context manager).")
<<<<<<< HEAD
# Run the scraping loop inside the context manager scope # Run the scraping loop inside the context manager scope
for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1): for attempt in range(1, PLAYWRIGHT_MAX_RELOADS + 1):
try: try:
@@ -864,12 +1130,27 @@ def scrape_tiktok_profile_playwright(
else: else:
# v1.x — create page then apply stealth # v1.x — create page then apply stealth
=======
videos = _run_playwright_scrape_loop(page, profile_url, limit)
except Exception as e:
logging.warning(
f"⚠️ playwright-stealth v2.x failed: {type(e).__name__}: {e}. "
f"Retrying without stealth."
)
# Fall through to no-stealth path below
page = context.new_page()
videos = _run_playwright_scrape_loop(page, profile_url, limit)
# ── Stealth v1.x ──────────────────────────────────────────────────
elif _STEALTH_V2 is False:
>>>>>>> 7cddbd0 (Fixes for today)
page = context.new_page() page = context.new_page()
try: try:
stealth_sync(page) stealth_sync(page)
logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).") logging.info("🥷 playwright-stealth v1.x applied (stealth_sync).")
except Exception as e: except Exception as e:
logging.warning( logging.warning(
<<<<<<< HEAD
f"⚠️ playwright-stealth v1.x failed: " f"⚠️ playwright-stealth v1.x failed: "
f"{type(e).__name__}: {e}. Continuing without stealth." f"{type(e).__name__}: {e}. Continuing without stealth."
) )
@@ -971,6 +1252,35 @@ def scrape_tiktok_profile_playwright(
pass pass
# ── Cleanup ─────────────────────────────────────────────────────── # ── Cleanup ───────────────────────────────────────────────────────
=======
f"⚠️ playwright-stealth v1.x failed: {type(e).__name__}: {e}. "
f"Continuing without stealth."
)
videos = _run_playwright_scrape_loop(page, profile_url, limit)
# ── No stealth available ──────────────────────────────────────────
else:
logging.warning("⚠️ playwright-stealth not installed. Skipping stealth.")
page = context.new_page()
videos = _run_playwright_scrape_loop(page, profile_url, limit)
if not videos:
logging.warning(
f"⚠️ Video grid not found after {PLAYWRIGHT_MAX_RELOADS} attempts."
)
ts = int(time.time())
try:
page.screenshot(
path=f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png"
)
logging.info(
f"📸 Screenshot saved: "
f"screenshot_no_grid_{PLAYWRIGHT_MAX_RELOADS}_{ts}.png"
)
except Exception:
pass
>>>>>>> 7cddbd0 (Fixes for today)
for obj in (page, context, browser): for obj in (page, context, browser):
try: try:
if obj: if obj:
@@ -992,7 +1302,10 @@ def scrape_tiktok_profile_ytdlp(
""" """
Fallback: use yt-dlp to extract the video list from a TikTok profile. Fallback: use yt-dlp to extract the video list from a TikTok profile.
Accepts a Netscape-format cookie file path (not JSON). Accepts a Netscape-format cookie file path (not JSON).
<<<<<<< HEAD
Returns a list of dicts with keys: video_id, url, timestamp. Returns a list of dicts with keys: video_id, url, timestamp.
=======
>>>>>>> 7cddbd0 (Fixes for today)
""" """
import yt_dlp import yt_dlp
@@ -1060,7 +1373,11 @@ def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> s
url = video_info.get("url", "") url = video_info.get("url", "")
if desc: if desc:
<<<<<<< HEAD
url_len = len(url) + 1 # +1 for newline url_len = len(url) + 1 # +1 for newline
=======
url_len = len(url) + 1
>>>>>>> 7cddbd0 (Fixes for today)
max_desc = max_len - url_len max_desc = max_len - url_len
if len(desc) > max_desc: if len(desc) > max_desc:
desc = desc[: max_desc - 1] + "" desc = desc[: max_desc - 1] + ""
@@ -1097,7 +1414,10 @@ def process_videos(
logging.info(f"⏭️ Already posted: {video_id}") logging.info(f"⏭️ Already posted: {video_id}")
continue continue
<<<<<<< HEAD
# Age filter (only when timestamp is available) # Age filter (only when timestamp is available)
=======
>>>>>>> 7cddbd0 (Fixes for today)
ts = video.get("timestamp") ts = video.get("timestamp")
if ts: if ts:
try: try:
@@ -1150,7 +1470,10 @@ def process_videos(
if ok: if ok:
mark_as_posted(video_id, state, meta={"url": video_url}) mark_as_posted(video_id, state, meta={"url": video_url})
posted_count += 1 posted_count += 1
<<<<<<< HEAD
# Brief pause between posts to avoid rate limiting # Brief pause between posts to avoid rate limiting
=======
>>>>>>> 7cddbd0 (Fixes for today)
time.sleep(random.uniform(2.0, 5.0)) time.sleep(random.uniform(2.0, 5.0))
return posted_count return posted_count
@@ -1162,6 +1485,7 @@ def process_videos(
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the TikTok → Bluesky cross-poster.

    ``--tiktok-handle``, ``--bsky-handle`` and ``--bsky-app-password`` are
    required; every other option falls back to a module-level default.
    """
    parser = argparse.ArgumentParser(
        description="Cross-post TikTok videos to Bluesky."
    )
    parser.add_argument(
        "--tiktok-handle",
        required=True,
        help="TikTok account handle, without the leading @",
    )
    parser.add_argument(
        "--bsky-handle",
        required=True,
        help="Bluesky handle to log in as",
    )
    parser.add_argument(
        "--bsky-app-password",
        required=True,
        help="Bluesky app password for --bsky-handle",
    )
    parser.add_argument(
        "--bsky-base-url",
        default=DEFAULT_BSKY_BASE_URL,
        help=f"Bluesky PDS base URL (default: {DEFAULT_BSKY_BASE_URL})",
    )
    parser.add_argument(
        "--bsky-langs",
        nargs="+",
        default=DEFAULT_BSKY_LANGS,
        help="BCP-47 language tags for posts (default: es)",
    )
    parser.add_argument(
        "--cookies-path",
        default=TIKTOK_COOKIES_PATH,
        help=f"Path to TikTok cookies JSON (default: {TIKTOK_COOKIES_PATH})",
    )
    parser.add_argument(
        "--max-age-days",
        type=int,
        default=VIDEO_MAX_AGE_DAYS,
        help=f"Skip videos older than N days (default: {VIDEO_MAX_AGE_DAYS})",
    )
    return parser.parse_args()
@@ -1207,7 +1559,10 @@ def main():
load_dotenv() load_dotenv()
args = parse_args() args = parse_args()
<<<<<<< HEAD
# Fix 2 — resolve video size limit based on PDS # Fix 2 — resolve video size limit based on PDS
=======
>>>>>>> 7cddbd0 (Fixes for today)
video_max_size_bytes = get_video_size_limit(args.bsky_base_url) video_max_size_bytes = get_video_size_limit(args.bsky_base_url)
logging.info("=" * 60) logging.info("=" * 60)
@@ -1230,17 +1585,30 @@ def main():
args.bsky_base_url, args.bsky_base_url,
) )
<<<<<<< HEAD
# Convert JSON cookies → Netscape format for yt-dlp # Convert JSON cookies → Netscape format for yt-dlp
# Playwright uses the JSON cookies directly via inject_cookies_into_context() # Playwright uses the JSON cookies directly via inject_cookies_into_context()
# yt-dlp requires Netscape .txt format — convert once and reuse # yt-dlp requires Netscape .txt format — convert once and reuse
=======
# Convert JSON cookies → Netscape format once for all yt-dlp calls
>>>>>>> 7cddbd0 (Fixes for today)
netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path) netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path)
if netscape_cookies_path: if netscape_cookies_path:
logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}") logging.info(f"🍪 Netscape cookie file ready: {netscape_cookies_path}")
else: else:
<<<<<<< HEAD
logging.warning("⚠️ Could not create Netscape cookie file. yt-dlp will run without cookies.") logging.warning("⚠️ Could not create Netscape cookie file. yt-dlp will run without cookies.")
try: try:
# Scrape TikTok profile # Scrape TikTok profile
=======
logging.warning(
"⚠️ Could not create Netscape cookie file. "
"yt-dlp will run without cookies."
)
try:
>>>>>>> 7cddbd0 (Fixes for today)
logging.info(f"🔄 Scraping @{args.tiktok_handle}...") logging.info(f"🔄 Scraping @{args.tiktok_handle}...")
cookies = load_cookies_from_file(args.cookies_path) cookies = load_cookies_from_file(args.cookies_path)
@@ -1290,7 +1658,13 @@ def main():
if netscape_cookies_path and os.path.exists(netscape_cookies_path): if netscape_cookies_path and os.path.exists(netscape_cookies_path):
try: try:
os.remove(netscape_cookies_path) os.remove(netscape_cookies_path)
<<<<<<< HEAD
logging.info(f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}") logging.info(f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}")
=======
logging.info(
f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}"
)
>>>>>>> 7cddbd0 (Fixes for today)
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}") logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}")