This commit is contained in:
Guillem Hernandez Sola
2026-05-20 09:28:12 +02:00
parent 04384ec91c
commit c0524b76ee

View File

@@ -44,7 +44,6 @@ _STEALTH_SYNC = None # will hold the stealth_sync callable if v1.x is present
try: try:
from playwright_stealth import stealth_sync as _stealth_sync_import from playwright_stealth import stealth_sync as _stealth_sync_import
_STEALTH_SYNC = _stealth_sync_import _STEALTH_SYNC = _stealth_sync_import
logging.getLogger(__name__).debug("playwright-stealth v1.x detected (stealth_sync)")
except ImportError: except ImportError:
# v2.x is installed but its API is too unstable to use reliably — # v2.x is installed but its API is too unstable to use reliably —
# browser launch args provide equivalent protection for our use case # browser launch args provide equivalent protection for our use case
@@ -494,7 +493,7 @@ def compress_video(
) )
return False return False
trim_to = min(duration, max_duration) trim_to = min(duration, max_duration)
target_bits = max_size_bytes * 8 * 0.85 target_bits = max_size_bytes * 8 * 0.85
total_kbps = int(target_bits / trim_to / 1000) total_kbps = int(target_bits / trim_to / 1000)
audio_kbps = 96 audio_kbps = 96
@@ -553,24 +552,18 @@ def compress_video(
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# yt-dlp helpers # yt-dlp helpers
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
def get_best_impersonation_target() -> str | None: def get_best_impersonation_target():
""" """
Ask yt-dlp directly which impersonation targets are actually available Ask yt-dlp directly which impersonation targets are actually available
in the current environment. This is the only reliable method — in the current environment. Returns the best ImpersonateTarget object,
curl_cffi's BrowserType enum values change between versions and do not or None if none are available.
map 1:1 to yt-dlp's target names.
Returns the best available target string, or None if none are available. This is the only reliable method — curl_cffi's BrowserType enum values
change between versions and do not map 1:1 to yt-dlp's target names.
""" """
try: try:
import yt_dlp import yt_dlp
# yt-dlp exposes available impersonation targets via
# ImpersonateTarget.supported_targets() in newer builds,
# or via YoutubeDL._impersonate_target_key in older ones.
# The safest cross-version approach is to instantiate a YoutubeDL
# object with quiet=True and inspect _impersonate_targets.
with yt_dlp.YoutubeDL({"quiet": True, "no_warnings": True}) as ydl: with yt_dlp.YoutubeDL({"quiet": True, "no_warnings": True}) as ydl:
# _impersonate_targets is a dict of {ImpersonateTarget: handler}
targets = getattr(ydl, "_impersonate_targets", None) targets = getattr(ydl, "_impersonate_targets", None)
if not targets: if not targets:
logging.warning( logging.warning(
@@ -578,11 +571,8 @@ def get_best_impersonation_target() -> str | None:
) )
return None return None
# Convert to string representations and pick the best one
preferred = ["chrome", "safari", "firefox", "edge"]
available_strs = [] available_strs = []
for t in targets.keys(): for t in targets.keys():
# ImpersonateTarget has .client and optionally .version
client = getattr(t, "client", None) or str(t) client = getattr(t, "client", None) or str(t)
version = getattr(t, "version", None) version = getattr(t, "version", None)
label = f"{client}-{version}" if version else str(client) label = f"{client}-{version}" if version else str(client)
@@ -593,7 +583,7 @@ def get_best_impersonation_target() -> str | None:
f"{[s for s, _ in available_strs]}" f"{[s for s, _ in available_strs]}"
) )
# Pick highest-versioned chrome first, then others # Prefer highest-versioned chrome, then anything else
chrome_targets = sorted( chrome_targets = sorted(
[(s, t) for s, t in available_strs if "chrome" in s], [(s, t) for s, t in available_strs if "chrome" in s],
key=lambda x: x[0], key=lambda x: x[0],
@@ -602,9 +592,8 @@ def get_best_impersonation_target() -> str | None:
if chrome_targets: if chrome_targets:
best_label, best_target = chrome_targets[0] best_label, best_target = chrome_targets[0]
logging.info(f"🎭 Selected impersonation target: {best_label}") logging.info(f"🎭 Selected impersonation target: {best_label}")
return best_target # return the actual ImpersonateTarget object return best_target
# Fallback to any available target
best_label, best_target = available_strs[0] best_label, best_target = available_strs[0]
logging.info(f"🎭 Selected impersonation target (fallback): {best_label}") logging.info(f"🎭 Selected impersonation target (fallback): {best_label}")
return best_target return best_target
@@ -617,11 +606,76 @@ def get_best_impersonation_target() -> str | None:
return None return None
def fetch_video_metadata_ytdlp(
    url: str,
    netscape_cookies_path: str = None,
) -> dict:
    """
    Look up the metadata of a single TikTok video through yt-dlp without
    downloading the media file.

    The caption is read from the 'description' field of yt-dlp's info
    dict; when that field is empty the 'title' field is used instead.

    Args:
        url: Address of the video to inspect.
        netscape_cookies_path: Optional cookie file passed to yt-dlp as
            'cookiefile'. Ignored when the path does not exist.

    Returns:
        On success, a dict with the keys 'description', 'title',
        'timestamp' and 'uploader'. An empty dict when yt-dlp yields no
        info or raises; the failure is logged as a warning, not re-raised.
    """
    import yt_dlp

    target = get_best_impersonation_target()

    options = {"quiet": True, "no_warnings": True, "skip_download": True}
    if netscape_cookies_path and os.path.exists(netscape_cookies_path):
        options["cookiefile"] = netscape_cookies_path
    if target is not None:
        options["impersonate"] = target

    try:
        with yt_dlp.YoutubeDL(options) as ydl:
            info = ydl.extract_info(url, download=False)

        if not info:
            return {}

        title = (info.get("title") or "").strip()
        # The description carries the full caption (hashtags included);
        # the title is only a fallback.
        caption = (info.get("description") or "").strip() or title

        ellipsis = "..." if len(caption) > 80 else ""
        logging.info(
            f"📝 Fetched metadata for {url}: "
            f"description={caption[:80]!r}{ellipsis}"
        )

        uploader = info.get("uploader") or info.get("channel") or ""
        return {
            "description": caption,
            "title": title,
            "timestamp": info.get("timestamp"),
            "uploader": uploader,
        }
    except Exception as e:
        # Best-effort lookup: callers treat an empty dict as "no metadata".
        logging.warning(
            f"⚠️ Could not fetch metadata for {url}: {type(e).__name__}: {e}"
        )
        return {}
def download_video_ytdlp( def download_video_ytdlp(
url: str, url: str,
output_path: str, output_path: str,
netscape_cookies_path: str = None, netscape_cookies_path: str = None,
) -> bool: ) -> bool:
"""
Download a TikTok video using yt-dlp with browser impersonation.
Accepts a Netscape-format cookie file path (not JSON).
"""
impersonate = get_best_impersonation_target() impersonate = get_best_impersonation_target()
ydl_opts = { ydl_opts = {
@@ -631,10 +685,8 @@ def download_video_ytdlp(
"no_warnings": False, "no_warnings": False,
"merge_output_format": "mp4", "merge_output_format": "mp4",
} }
if netscape_cookies_path and os.path.exists(netscape_cookies_path): if netscape_cookies_path and os.path.exists(netscape_cookies_path):
ydl_opts["cookiefile"] = netscape_cookies_path ydl_opts["cookiefile"] = netscape_cookies_path
if impersonate is not None: if impersonate is not None:
ydl_opts["impersonate"] = impersonate ydl_opts["impersonate"] = impersonate
@@ -665,7 +717,9 @@ def download_video(
netscape_cookies_path: str = None, netscape_cookies_path: str = None,
) -> bool: ) -> bool:
logging.info(f"⬇️ Downloading: {url}") logging.info(f"⬇️ Downloading: {url}")
return download_video_ytdlp(url, output_path, netscape_cookies_path=netscape_cookies_path) return download_video_ytdlp(
url, output_path, netscape_cookies_path=netscape_cookies_path
)
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
@@ -692,14 +746,12 @@ def upload_video_to_bluesky(
except Exception as e: except Exception as e:
err_detail = f"{type(e).__name__}: {e}" err_detail = f"{type(e).__name__}: {e}"
if attempt >= BSKY_UPLOAD_MAX_RETRIES: if attempt >= BSKY_UPLOAD_MAX_RETRIES:
logging.error( logging.error(
f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: " f"❌ Blob upload failed after {BSKY_UPLOAD_MAX_RETRIES} attempts: "
f"{err_detail}" f"{err_detail}"
) )
return None return None
logging.warning( logging.warning(
f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} " f"⚠️ Blob upload attempt {attempt}/{BSKY_UPLOAD_MAX_RETRIES} "
f"failed: {err_detail}. Retrying in {delay:.1f}s..." f"failed: {err_detail}. Retrying in {delay:.1f}s..."
@@ -734,6 +786,43 @@ def post_video_to_bluesky(
return False return False
# ─────────────────────────────────────────────────────────────────────────────
# Caption builder
# ─────────────────────────────────────────────────────────────────────────────
def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> str:
    """
    Build a Bluesky post caption from video metadata.

    Format:
        <description>
        <tiktok_url>

    If description + URL exceeds ``max_len`` characters, the description is
    shortened and terminated with an ellipsis. The cut is moved back to the
    last space before the limit unless that would discard more than half of
    the available room, so words and hashtags are not split when avoidable.

    Args:
        video_info: Mapping read for the 'description' and 'url' keys;
            missing or None values are treated as empty strings.
        tiktok_handle: Unused here; kept so existing callers keep working.
        max_len: Maximum caption length, measured with ``len()``.

    Returns:
        The caption. Only the URL when there is no description or when the
        URL leaves no room for one; only the description when there is no
        URL.
    """
    desc = (video_info.get("description") or "").strip()
    # `or ""` also covers a key that is present but set to None.
    url = (video_info.get("url") or "").strip()

    if not desc:
        # No caption available — just post the URL
        return url

    # Reserve space for newline + URL (nothing to reserve without a URL,
    # which also avoids a dangling trailing newline).
    url_block = f"\n{url}" if url else ""
    max_desc = max_len - len(url_block)
    if max_desc < 1:
        # The URL alone fills the budget; a non-positive slice bound would
        # otherwise keep most of the description and overshoot the limit.
        return url

    if len(desc) > max_desc:
        # Keep one character free for the ellipsis.
        trimmed = desc[:max_desc - 1]
        cut = trimmed.rfind(" ")
        # Only use word boundary if it doesn't cut off too much
        if cut > max_desc // 2:
            trimmed = trimmed[:cut]
        # Written as an escape so the marker survives encoding round-trips.
        desc = trimmed.rstrip() + "\u2026"

    return f"{desc}{url_block}"
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# TikTok scraping — Playwright # TikTok scraping — Playwright
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
@@ -801,9 +890,10 @@ def _run_playwright_scrape_loop(page, profile_url: str, limit: int) -> list[dict
else f"https://www.tiktok.com{link}" else f"https://www.tiktok.com{link}"
) )
videos.append({ videos.append({
"video_id": video_id, "video_id": video_id,
"url": full_url, "url": full_url,
"timestamp": None, "timestamp": None,
"description": "",
}) })
except Exception: except Exception:
pass pass
@@ -864,7 +954,6 @@ def scrape_tiktok_profile_playwright(
), ),
viewport={"width": 1280, "height": 900}, viewport={"width": 1280, "height": 900},
locale="es-ES", locale="es-ES",
# Mask automation signals at the context level
extra_http_headers={ extra_http_headers={
"Accept-Language": "es-ES,es;q=0.9,en;q=0.8", "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}, },
@@ -873,7 +962,7 @@ def scrape_tiktok_profile_playwright(
inject_cookies_into_context(context, cookies) inject_cookies_into_context(context, cookies)
page = context.new_page() page = context.new_page()
# Apply stealth v1.x if available; skip v2.x entirely # Apply stealth v1.x if available; skip v2.x entirely (unstable API)
if _STEALTH_SYNC is not None: if _STEALTH_SYNC is not None:
try: try:
_STEALTH_SYNC(page) _STEALTH_SYNC(page)
@@ -928,6 +1017,9 @@ def scrape_tiktok_profile_ytdlp(
""" """
Fallback: use yt-dlp to extract the video list from a TikTok profile. Fallback: use yt-dlp to extract the video list from a TikTok profile.
Accepts a Netscape-format cookie file path (not JSON). Accepts a Netscape-format cookie file path (not JSON).
Note: flat playlist extraction gives us basic metadata (title, timestamp)
but not the full description — that is fetched per-video in process_videos().
""" """
import yt_dlp import yt_dlp
@@ -970,10 +1062,13 @@ def scrape_tiktok_profile_ytdlp(
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
vid_match = re.search(r"/video/(\d+)", url) vid_match = re.search(r"/video/(\d+)", url)
if vid_match: if vid_match:
# description from flat extraction is usually just the title —
# the full caption is fetched per-video in process_videos()
videos.append({ videos.append({
"video_id": vid_match.group(1), "video_id": vid_match.group(1),
"url": url, "url": url,
"timestamp": entry.get("timestamp"), "timestamp": entry.get("timestamp"),
"description": (entry.get("description") or "").strip(),
}) })
logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.") logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.")
@@ -984,21 +1079,6 @@ def scrape_tiktok_profile_ytdlp(
return [] return []
# ─────────────────────────────────────────────────────────────────────────────
# Caption builder
# ─────────────────────────────────────────────────────────────────────────────
def build_caption(video_info: dict, tiktok_handle: str, max_len: int = 290) -> str:
    """
    Build a post caption of the form "<description>\\n<url>".

    A description that does not fit in ``max_len`` characters together with
    the newline and URL is cut hard and terminated with an ellipsis.

    Args:
        video_info: Mapping read for the 'description' and 'url' keys;
            missing or None values are treated as empty strings.
        tiktok_handle: Unused here; kept so existing callers keep working.
        max_len: Maximum caption length, measured with ``len()``.

    Returns:
        The caption, or only the URL when there is no description or the
        URL leaves no room for one.
    """
    desc = (video_info.get("description") or "").strip()
    # `or ""` keeps a None URL from being rendered as the text "None".
    url = video_info.get("url") or ""
    if not desc:
        return url

    # Room left for the description once the newline and URL are counted.
    max_desc = max_len - (len(url) + 1)
    if max_desc < 1:
        # A non-positive slice bound would keep most of the description
        # and overshoot the limit, so fall back to the URL alone.
        return url

    if len(desc) > max_desc:
        # One character is held back for the ellipsis; written as an
        # escape so the marker survives encoding round-trips.
        desc = desc[: max_desc - 1] + "\u2026"
    return f"{desc}\n{url}"
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# Main processing loop # Main processing loop
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
@@ -1012,6 +1092,14 @@ def process_videos(
max_age_days: int, max_age_days: int,
video_max_size_bytes: int, video_max_size_bytes: int,
) -> int: ) -> int:
"""
For each new video:
0. Fetch full metadata (description/caption) via yt-dlp
1. Download the video file
2. Compress to fit within the PDS size limit
3. Upload blob to Bluesky
4. Create the post with caption + URL
"""
posted_count = 0 posted_count = 0
now = arrow.utcnow() now = arrow.utcnow()
@@ -1023,6 +1111,7 @@ def process_videos(
logging.info(f"⏭️ Already posted: {video_id}") logging.info(f"⏭️ Already posted: {video_id}")
continue continue
# Age filter (only when timestamp is available)
ts = video.get("timestamp") ts = video.get("timestamp")
if ts: if ts:
try: try:
@@ -1039,6 +1128,19 @@ def process_videos(
logging.info(f"🎬 Processing video {video_id}: {video_url}") logging.info(f"🎬 Processing video {video_id}: {video_url}")
# ── 0. Fetch full metadata if description not already populated ───
if not video.get("description"):
logging.info(f"🔍 Fetching metadata for {video_id}...")
meta = fetch_video_metadata_ytdlp(
video_url,
netscape_cookies_path=netscape_cookies_path,
)
if meta:
video["description"] = meta.get("description", "")
# Backfill timestamp if we didn't have one from scraping
if not video.get("timestamp") and meta.get("timestamp"):
video["timestamp"] = meta["timestamp"]
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4") raw_path = os.path.join(tmpdir, f"{video_id}_raw.mp4")
comp_path = os.path.join(tmpdir, f"{video_id}.mp4") comp_path = os.path.join(tmpdir, f"{video_id}.mp4")
@@ -1053,7 +1155,9 @@ def process_videos(
continue continue
# 2. Compress # 2. Compress
ok = compress_video(raw_path, comp_path, max_size_bytes=video_max_size_bytes) ok = compress_video(
raw_path, comp_path, max_size_bytes=video_max_size_bytes
)
if not ok: if not ok:
logging.error(f"❌ Compression failed for {video_id}. Skipping.") logging.error(f"❌ Compression failed for {video_id}. Skipping.")
continue continue
@@ -1066,6 +1170,7 @@ def process_videos(
# 4. Post # 4. Post
caption = build_caption(video, tiktok_handle) caption = build_caption(video, tiktok_handle)
logging.info(f"📝 Caption preview: {caption[:120]!r}")
ok = post_video_to_bluesky(client, blob, caption, langs, video_id) ok = post_video_to_bluesky(client, blob, caption, langs, video_id)
if ok: if ok:
mark_as_posted(video_id, state, meta={"url": video_url}) mark_as_posted(video_id, state, meta={"url": video_url})
@@ -1122,7 +1227,9 @@ def main():
logging.info("=" * 60) logging.info("=" * 60)
state = load_state() state = load_state()
client = connect_bluesky(args.bsky_handle, args.bsky_app_password, args.bsky_base_url) client = connect_bluesky(
args.bsky_handle, args.bsky_app_password, args.bsky_base_url
)
# Convert JSON cookies → Netscape format once for all yt-dlp calls # Convert JSON cookies → Netscape format once for all yt-dlp calls
netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path) netscape_cookies_path = convert_json_cookies_to_netscape(args.cookies_path)
@@ -1178,6 +1285,7 @@ def main():
logging.info("=" * 60) logging.info("=" * 60)
finally: finally:
# Always clean up the temporary Netscape cookie file
if netscape_cookies_path and os.path.exists(netscape_cookies_path): if netscape_cookies_path and os.path.exists(netscape_cookies_path):
try: try:
os.remove(netscape_cookies_path) os.remove(netscape_cookies_path)
@@ -1185,7 +1293,9 @@ def main():
f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}" f"🧹 Removed temporary Netscape cookie file: {netscape_cookies_path}"
) )
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Could not remove Netscape cookie file: {e}") logging.warning(
f"⚠️ Could not remove Netscape cookie file: {e}"
)
if __name__ == "__main__": if __name__ == "__main__":