From cb6355b1f71a160c32e79a081f1f8a167c893a5c Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Sat, 4 Apr 2026 19:36:31 +0000 Subject: [PATCH] Timestamp and duplication improvements --- twitter2bsky_daemon.py | 103 ++++++++++++++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 17 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 04c7911..3985cfe 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -11,14 +11,20 @@ from atproto import Client, client_utils, models from playwright.sync_api import sync_playwright from moviepy import VideoFileClip -# --- Logging Setup --- +# --- Configuration --- LOG_PATH = "twitter2bsky.log" +SCRAPE_TWEET_LIMIT = 30 +DEDUPE_BSKY_LIMIT = 30 +TWEET_MAX_AGE_DAYS = 3 + +# --- Logging Setup --- logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()], level=logging.INFO, ) + # --- Custom Classes --- class ScrapedMedia: def __init__(self, url, media_type="photo"): @@ -80,12 +86,57 @@ def get_blob_from_file(file_path, client): return None -def get_last_bsky(client, handle): - timeline = client.get_author_feed(handle) - for titem in timeline.feed: - if titem.reason is None and getattr(titem.post.record, "reply", None) is None: - return arrow.get(titem.post.record.created_at) - return arrow.get(0) +def normalize_post_text(text): + """ + Normalize post text for duplicate detection. + """ + if not text: + return "" + + text = text.replace("\r", "\n") + text = re.sub(r"\s+", " ", text).strip() + return text.lower() + + +def get_recent_bsky_texts(client, handle, limit=30): + """ + Fetch recent top-level Bluesky post texts for duplicate detection. + """ + recent_texts = [] + + try: + timeline = client.get_author_feed(handle, limit=limit) + + for item in timeline.feed: + try: + if item.reason is not None: + continue + + record = item.post.record + if getattr(record, "reply", None) is not None: + continue + + text = getattr(record, "text", "") or "" + normalized = normalize_post_text(text) + + if normalized: + recent_texts.append(normalized) + + except Exception as e: + logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}") + + except Exception as e: + logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}") + + return recent_texts + + +def is_duplicate_bsky_text(candidate_text, recent_texts): + """ + Returns True if the candidate text already exists in recent Bluesky posts. + """ + normalized_candidate = normalize_post_text(candidate_text) + return normalized_candidate in set(recent_texts) def make_rich(content): @@ -278,9 +329,9 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): time.sleep(3) articles = page.locator("article").all() - logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing...") + logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...") - for article in articles[:10]: + for article in articles[:SCRAPE_TWEET_LIMIT]: try: time_el = article.locator("time").first if not time_el.is_visible(): @@ -385,7 +436,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): logging.info(f"📺 Found HLS playlist URL: {url}") return - if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l): + if ".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l: if is_audio_only_mp4(url, content_type): logging.info(f"🔇 Ignoring audio-only MP4: {url}") return @@ -421,7 +472,6 @@ def extract_video_url_from_tweet_page(context, tweet_url): logging.info("▶️ Clicked video player") except Exception as e: logging.info(f"⚠️ First player click failed: {e}") - else: logging.warning("⚠️ No video player locator found on tweet page") @@ -475,6 +525,7 @@ def download_and_crop_video(video_url, output_path): video_url_l = video_url.lower() if ".m3u8" in video_url_l: + logging.info("📺 Using HLS ffmpeg mode") download_cmd = [ "ffmpeg", "-y", @@ -485,6 +536,7 @@ def download_and_crop_video(video_url, output_path): temp_input, ] else: + logging.info("🎥 Using direct MP4 ffmpeg mode") download_cmd = [ "ffmpeg", "-y", @@ -554,6 +606,7 @@ def download_and_crop_video(video_url, output_path): except Exception: pass + # --- Main Sync Function --- def sync_feeds(args): logging.info("🔄 Starting sync cycle...") @@ -571,7 +624,16 @@ def sync_feeds(args): bsky_client = Client() bsky_client.login(args.bsky_handle, args.bsky_password) - last_bsky_time = get_last_bsky(bsky_client, args.bsky_handle) + + recent_bsky_texts = get_recent_bsky_texts( + bsky_client, + args.bsky_handle, + limit=DEDUPE_BSKY_LIMIT + ) + logging.info(f"🧠 Loaded {len(recent_bsky_texts)} recent Bluesky post texts for duplicate detection.") + + too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS) + logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}") new_posts = 0 state_file = "twitter_browser_state.json" @@ -597,12 +659,10 @@ def sync_feeds(args): for tweet in reversed(tweets): tweet_time = arrow.get(tweet.created_on) - if tweet_time <= last_bsky_time: - #if False: + if tweet_time < too_old_cutoff: + logging.info(f"⏭️ Skipping old tweet from {tweet_time}") continue - logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...") - raw_text = tweet.text.strip() if len(raw_text) > 295: @@ -614,6 +674,12 @@ def sync_feeds(args): raw_text = truncated + "..." logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.") + if is_duplicate_bsky_text(raw_text, recent_bsky_texts): + logging.info("⏭️ Skipping tweet because its text already exists in the last 30 Bluesky posts.") + continue + + logging.info(f"📝 Found candidate tweet from {tweet_time}. Posting to Bluesky...") + rich_text = make_rich(raw_text) dynamic_alt = build_dynamic_alt(raw_text) @@ -670,6 +736,9 @@ def sync_feeds(args): else: bsky_client.send_post(text=rich_text, langs=["ca"]) + recent_bsky_texts.insert(0, normalize_post_text(raw_text)) + recent_bsky_texts = recent_bsky_texts[:DEDUPE_BSKY_LIMIT] + new_posts += 1 logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}") time.sleep(5) @@ -726,4 +795,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()