From cb6355b1f71a160c32e79a081f1f8a167c893a5c Mon Sep 17 00:00:00 2001
From: Guillem Hernandez Sola <guillem@agile611.com>
Date: Sat, 4 Apr 2026 19:36:31 +0000
Subject: [PATCH] Timestamp and duplication improvements

---
 twitter2bsky_daemon.py | 103 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 86 insertions(+), 17 deletions(-)

diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index 04c7911..3985cfe 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -11,14 +11,20 @@ from atproto import Client, client_utils, models
 from playwright.sync_api import sync_playwright
 from moviepy import VideoFileClip
 
-# --- Logging Setup ---
+# --- Configuration ---
 LOG_PATH = "twitter2bsky.log"
+SCRAPE_TWEET_LIMIT = 30
+DEDUPE_BSKY_LIMIT = 30
+TWEET_MAX_AGE_DAYS = 3
+
+# --- Logging Setup ---
 logging.basicConfig(
     format="%(asctime)s [%(levelname)s] %(message)s",
     handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()],
     level=logging.INFO,
 )
 
+
 # --- Custom Classes ---
 class ScrapedMedia:
     def __init__(self, url, media_type="photo"):
@@ -80,12 +86,57 @@ def get_blob_from_file(file_path, client):
     return None
 
 
-def get_last_bsky(client, handle):
-    timeline = client.get_author_feed(handle)
-    for titem in timeline.feed:
-        if titem.reason is None and getattr(titem.post.record, "reply", None) is None:
-            return arrow.get(titem.post.record.created_at)
-    return arrow.get(0)
+def normalize_post_text(text):
+    """
+    Canonicalize post text for duplicate comparison: whitespace runs become
+    one space, the ends are trimmed, case is lowered; falsy input yields "".
+    """
+    if text:
+        # Fold carriage returns, then squeeze every whitespace run to a space.
+        squeezed = re.sub(r"\s+", " ", text.replace("\r", "\n"))
+        return squeezed.strip().lower()
+    return ""
+
+
+def get_recent_bsky_texts(client, handle, limit=30):
+    """
+    Collect normalized texts of the account's recent top-level Bluesky posts.
+
+    Requests up to `limit` author-feed items for `handle`, ignoring items that
+    carry a `reason` (presumably reposts), replies, and posts whose text
+    normalizes to "". Never raises: a failed fetch is logged as a warning and
+    whatever was gathered so far (possibly nothing) is returned, in feed order.
+    """
+    collected = []
+
+    try:
+        feed_items = client.get_author_feed(handle, limit=limit).feed
+
+        for entry in feed_items:
+            try:
+                # Keep only original posts: no `reason` and no reply reference.
+                if entry.reason is None:
+                    post_record = entry.post.record
+                    if getattr(post_record, "reply", None) is None:
+                        cleaned = normalize_post_text(getattr(post_record, "text", "") or "")
+                        if cleaned:
+                            collected.append(cleaned)
+            except Exception as e:
+                # One malformed item must not abort the whole scan.
+                logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}")
+
+    except Exception as e:
+        logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}")
+
+    return collected
+
+
+def is_duplicate_bsky_text(candidate_text, recent_texts):
+    """
+    Return True if the normalized candidate is in recent_texts; empty text never matches.
+    """
+    normalized_candidate = normalize_post_text(candidate_text)  # "" when there is no text
+    return bool(normalized_candidate) and normalized_candidate in recent_texts
 
 
 def make_rich(content):
@@ -278,9 +329,9 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
             time.sleep(3)
 
             articles = page.locator("article").all()
-            logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing...")
+            logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...")
 
-            for article in articles[:10]:
+            for article in articles[:SCRAPE_TWEET_LIMIT]:
                 try:
                     time_el = article.locator("time").first
                     if not time_el.is_visible():
@@ -385,7 +436,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
                     logging.info(f"📺 Found HLS playlist URL: {url}")
                 return
 
-            if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l):
+            if ".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l:
                 if is_audio_only_mp4(url, content_type):
                     logging.info(f"🔇 Ignoring audio-only MP4: {url}")
                     return
@@ -421,7 +472,6 @@ def extract_video_url_from_tweet_page(context, tweet_url):
                 logging.info("▶️ Clicked video player")
             except Exception as e:
                 logging.info(f"⚠️ First player click failed: {e}")
-
         else:
             logging.warning("⚠️ No video player locator found on tweet page")
 
@@ -475,6 +525,7 @@ def download_and_crop_video(video_url, output_path):
         video_url_l = video_url.lower()
 
         if ".m3u8" in video_url_l:
+            logging.info("📺 Using HLS ffmpeg mode")
             download_cmd = [
                 "ffmpeg",
                 "-y",
@@ -485,6 +536,7 @@ def download_and_crop_video(video_url, output_path):
                 temp_input,
             ]
         else:
+            logging.info("🎥 Using direct MP4 ffmpeg mode")
             download_cmd = [
                 "ffmpeg",
                 "-y",
@@ -554,6 +606,7 @@ def download_and_crop_video(video_url, output_path):
                 except Exception:
                     pass
 
+
 # --- Main Sync Function ---
 def sync_feeds(args):
     logging.info("🔄 Starting sync cycle...")
@@ -571,7 +624,16 @@ def sync_feeds(args):
 
         bsky_client = Client()
         bsky_client.login(args.bsky_handle, args.bsky_password)
-        last_bsky_time = get_last_bsky(bsky_client, args.bsky_handle)
+
+        recent_bsky_texts = get_recent_bsky_texts(
+            bsky_client,
+            args.bsky_handle,
+            limit=DEDUPE_BSKY_LIMIT
+        )
+        logging.info(f"🧠 Loaded {len(recent_bsky_texts)} recent Bluesky post texts for duplicate detection.")
+
+        too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS)
+        logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
 
         new_posts = 0
         state_file = "twitter_browser_state.json"
@@ -597,12 +659,10 @@ def sync_feeds(args):
             for tweet in reversed(tweets):
                 tweet_time = arrow.get(tweet.created_on)
 
-                if tweet_time <= last_bsky_time:
-                #if False:
+                if tweet_time < too_old_cutoff:
+                    logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
                     continue
 
-                logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...")
-
                 raw_text = tweet.text.strip()
 
                 if len(raw_text) > 295:
@@ -614,6 +674,12 @@ def sync_feeds(args):
                         raw_text = truncated + "..."
                     logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.")
 
+                if is_duplicate_bsky_text(raw_text, recent_bsky_texts):
+                    logging.info("⏭️ Skipping tweet because its text already exists in the last 30 Bluesky posts.")
+                    continue
+
+                logging.info(f"📝 Found candidate tweet from {tweet_time}. Posting to Bluesky...")
+
                 rich_text = make_rich(raw_text)
                 dynamic_alt = build_dynamic_alt(raw_text)
 
@@ -670,6 +736,9 @@ def sync_feeds(args):
                     else:
                         bsky_client.send_post(text=rich_text, langs=["ca"])
 
+                    recent_bsky_texts.insert(0, normalize_post_text(raw_text))
+                    recent_bsky_texts = recent_bsky_texts[:DEDUPE_BSKY_LIMIT]
+
                     new_posts += 1
                     logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}")
                     time.sleep(5)
@@ -726,4 +795,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()