From cb054cdb32c25784165c05d9fa415c4524e1db55 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Sun, 5 Apr 2026 11:41:39 +0000 Subject: [PATCH] New video limit 179 seconds --- twitter2bsky_daemon.py | 113 ++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 74 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 4333566..7edf4e1 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -8,7 +8,6 @@ import httpx import time import os import subprocess -import tempfile from urllib.parse import urlparse from dotenv import load_dotenv from atproto import Client, client_utils, models @@ -21,9 +20,7 @@ STATE_PATH = "twitter2bsky_state.json" SCRAPE_TWEET_LIMIT = 30 DEDUPE_BSKY_LIMIT = 30 TWEET_MAX_AGE_DAYS = 3 - -STATE_MAX_ENTRIES = 5000 -STATE_MAX_AGE_DAYS = 180 +VIDEO_MAX_DURATION_SECONDS = 179 # --- Logging Setup --- logging.basicConfig( @@ -265,27 +262,6 @@ def build_text_media_key(normalized_text, media_fingerprint): return hashlib.sha256(f"{normalized_text}||{media_fingerprint}".encode("utf-8")).hexdigest() -def safe_remove_file(path): - if path and os.path.exists(path): - try: - os.remove(path) - logging.debug(f"๐Ÿงน Removed temp file: {path}") - except Exception as e: - logging.warning(f"โš ๏ธ Could not remove temp file {path}: {e}") - - -def build_temp_video_output_path(tweet): - """ - Create a unique temp mp4 path for this tweet. - """ - canonical_url = canonicalize_tweet_url(tweet.tweet_url) or "" - seed = canonical_url or f"{tweet.created_on}_{tweet.text[:50]}" - suffix = hashlib.sha256(seed.encode("utf-8")).hexdigest()[:12] - - temp_dir = tempfile.gettempdir() - return os.path.join(temp_dir, f"twitter2bsky_{suffix}.mp4") - - # --- Local State Management --- def default_state(): return { @@ -382,59 +358,47 @@ def candidate_matches_state(candidate, state): if canonical_tweet_url and canonical_tweet_url in posted_tweets: return True, "state:tweet_url" - for record in posted_tweets.values(): + for _, record in posted_tweets.items(): if record.get("text_media_key") == text_media_key: return True, "state:text_media_fingerprint" - for record in posted_tweets.values(): + for _, record in posted_tweets.items(): if record.get("normalized_text") == normalized_text: return True, "state:normalized_text" return False, None -def prune_state(state, max_entries=STATE_MAX_ENTRIES, max_age_days=STATE_MAX_AGE_DAYS): +def prune_state(state, max_entries=5000): """ Keep state file from growing forever. - Prunes: - - entries older than max_age_days - - entries beyond max_entries, keeping newest first - - orphan posted_by_bsky_uri keys + Prunes oldest records by posted_at if necessary. """ posted_tweets = state.get("posted_tweets", {}) - cutoff = arrow.utcnow().shift(days=-max_age_days) - kept_items = [] + if len(posted_tweets) <= max_entries: + return state + sortable = [] for key, record in posted_tweets.items(): - posted_at_raw = record.get("posted_at") - keep = True + posted_at = record.get("posted_at") or "" + sortable.append((key, posted_at)) - if posted_at_raw: - try: - posted_at = arrow.get(posted_at_raw) - if posted_at < cutoff: - keep = False - except Exception: - pass + sortable.sort(key=lambda x: x[1], reverse=True) + keep_keys = {key for key, _ in sortable[:max_entries]} - if keep: - kept_items.append((key, record)) + new_posted_tweets = {} + for key, record in posted_tweets.items(): + if key in keep_keys: + new_posted_tweets[key] = record - kept_items.sort(key=lambda item: item[1].get("posted_at", ""), reverse=True) - kept_items = kept_items[:max_entries] - - keep_keys = {key for key, _ in kept_items} - - state["posted_tweets"] = {key: record for key, record in kept_items} - - posted_by_bsky_uri = state.get("posted_by_bsky_uri", {}) - state["posted_by_bsky_uri"] = { - bsky_uri: key - for bsky_uri, key in posted_by_bsky_uri.items() - if key in keep_keys - } + new_posted_by_bsky_uri = {} + for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items(): + if key in keep_keys: + new_posted_by_bsky_uri[bsky_uri] = key + state["posted_tweets"] = new_posted_tweets + state["posted_by_bsky_uri"] = new_posted_by_bsky_uri return state @@ -458,7 +422,7 @@ def get_recent_bsky_posts(client, handle, limit=30): if getattr(record, "reply", None) is not None: continue - text = getattr(record, "text", "") or "" + text = getattr(record, "text", "") or "" normalized_text = normalize_post_text(text) urls = [] @@ -905,7 +869,7 @@ def download_and_crop_video(video_url, output_path): logging.error("โŒ Downloaded video has invalid or unknown duration.") return None - end_time = min(59, duration) + end_time = min(VIDEO_MAX_DURATION_SECONDS, duration) if hasattr(video_clip, "subclipped"): cropped_clip = video_clip.subclipped(0, end_time) @@ -927,7 +891,7 @@ def download_and_crop_video(video_url, output_path): return None os.replace(temp_output, output_path) - logging.info(f"โœ… Video cropped to 59 seconds: {output_path}") + logging.info(f"โœ… Video cropped to {int(end_time)} seconds: {output_path}") return output_path except Exception as e: @@ -935,8 +899,12 @@ def download_and_crop_video(video_url, output_path): return None finally: - safe_remove_file(temp_input) - safe_remove_file(temp_output) + for path in [temp_input, temp_output]: + if os.path.exists(path): + try: + os.remove(path) + except Exception: + pass def candidate_matches_existing_bsky(candidate, recent_bsky_posts): @@ -971,8 +939,6 @@ def sync_feeds(args): logging.info("๐Ÿ”„ Starting sync cycle...") try: state = load_state(STATE_PATH) - state = prune_state(state) - save_state(state, STATE_PATH) tweets = scrape_tweets_via_playwright( args.twitter_username, @@ -1063,7 +1029,7 @@ def sync_feeds(args): return new_posts = 0 - browser_state_file = "twitter_browser_state.json" + state_file = "twitter_browser_state.json" with sync_playwright() as p: browser = p.chromium.launch( @@ -1078,8 +1044,8 @@ def sync_feeds(args): ), "viewport": {"width": 1920, "height": 1080}, } - if os.path.exists(browser_state_file): - context_kwargs["storage_state"] = browser_state_file + if os.path.exists(state_file): + context_kwargs["storage_state"] = state_file context = browser.new_context(**context_kwargs) @@ -1113,7 +1079,7 @@ def sync_feeds(args): logging.warning("โš ๏ธ Tweet has video marker but no tweet URL. Skipping video.") continue - temp_video_path = build_temp_video_output_path(tweet) + temp_video_path = "temp_video.mp4" try: real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url) @@ -1134,9 +1100,8 @@ def sync_feeds(args): video_embed = build_video_embed(video_blob, dynamic_alt) finally: - safe_remove_file(temp_video_path) - safe_remove_file(temp_video_path.replace(".mp4", "_source.mp4")) - safe_remove_file(temp_video_path.replace(".mp4", "_cropped.mp4")) + if os.path.exists(temp_video_path): + os.remove(temp_video_path) try: post_result = None @@ -1152,7 +1117,7 @@ def sync_feeds(args): bsky_uri = getattr(post_result, "uri", None) remember_posted_tweet(state, candidate, bsky_uri=bsky_uri) - state = prune_state(state) + state = prune_state(state, max_entries=5000) save_state(state, STATE_PATH) recent_bsky_posts.insert(0, { @@ -1222,4 +1187,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()