diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 3985cfe..a42c8a0 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -86,6 +86,23 @@ def get_blob_from_file(file_path, client): return None +def prepare_post_text(text): + """ + Prepare the final text exactly as it would be posted to Bluesky. + """ + raw_text = (text or "").strip() + + if len(raw_text) > 295: + truncated = raw_text[:290] + last_space = truncated.rfind(" ") + if last_space > 0: + raw_text = truncated[:last_space] + "..." + else: + raw_text = truncated + "..." + + return raw_text + + def normalize_post_text(text): """ Normalize post text for duplicate detection. @@ -98,11 +115,12 @@ def normalize_post_text(text): return text.lower() -def get_recent_bsky_texts(client, handle, limit=30): +def get_recent_bsky_posts(client, handle, limit=30): """ - Fetch recent top-level Bluesky post texts for duplicate detection. + Fetch recent top-level Bluesky posts for duplicate detection. + Returns a list of dicts with original and normalized text. 
""" - recent_texts = [] + recent_posts = [] try: timeline = client.get_author_feed(handle, limit=limit) @@ -117,10 +135,15 @@ def get_recent_bsky_texts(client, handle, limit=30): continue text = getattr(record, "text", "") or "" - normalized = normalize_post_text(text) + prepared = prepare_post_text(text) + normalized = normalize_post_text(prepared) if normalized: - recent_texts.append(normalized) + recent_posts.append({ + "text": prepared, + "normalized_text": normalized, + "created_at": getattr(record, "created_at", None), + }) except Exception as e: logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}") @@ -128,15 +151,7 @@ def get_recent_bsky_texts(client, handle, limit=30): except Exception as e: logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}") - return recent_texts - - -def is_duplicate_bsky_text(candidate_text, recent_texts): - """ - Returns True if the candidate text already exists in recent Bluesky posts. - """ - normalized_candidate = normalize_post_text(candidate_text) - return normalized_candidate in set(recent_texts) + return recent_posts def make_rich(content): @@ -377,25 +392,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): def extract_video_url_from_tweet_page(context, tweet_url): - """ - Open tweet page and capture media requests. - - Strategy: - - listen for network responses - - wait for player - - scroll into view - - click player - - poll for a few seconds - - retry interaction once if needed - - Preference: - 1. HLS .m3u8 - 2. 
real video .mp4 - - Ignore: - - .m4s - - audio-only mp4 - """ page = context.new_page() best_m3u8_url = None best_video_mp4_url = None @@ -625,16 +621,61 @@ def sync_feeds(args): bsky_client = Client() bsky_client.login(args.bsky_handle, args.bsky_password) - recent_bsky_texts = get_recent_bsky_texts( + recent_bsky_posts = get_recent_bsky_posts( bsky_client, args.bsky_handle, limit=DEDUPE_BSKY_LIMIT ) - logging.info(f"🧠 Loaded {len(recent_bsky_texts)} recent Bluesky post texts for duplicate detection.") + recent_bsky_text_set = {post["normalized_text"] for post in recent_bsky_posts if post["normalized_text"]} + + logging.info(f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for 30-vs-30 duplicate detection.") + logging.info(f"🧠 Built normalized Bluesky dedupe set with {len(recent_bsky_text_set)} entries.") too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS) logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}") + candidate_tweets = [] + + for tweet in reversed(tweets): + try: + tweet_time = arrow.get(tweet.created_on) + + if tweet_time < too_old_cutoff: + logging.info(f"⏭️ Skipping old tweet from {tweet_time}") + continue + + prepared_text = prepare_post_text(tweet.text) + normalized_text = normalize_post_text(prepared_text) + + if not normalized_text: + logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}") + continue + + candidate_tweets.append({ + "tweet": tweet, + "tweet_time": tweet_time, + "raw_text": prepared_text, + "normalized_text": normalized_text, + }) + + except Exception as e: + logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}") + + logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for comparison against recent Bluesky posts.") + + tweets_to_post = [] + for candidate in candidate_tweets: + if candidate["normalized_text"] in recent_bsky_text_set: + logging.info("⏭️ Skipping candidate because text already exists in the last 30 Bluesky posts.") + continue + 
tweets_to_post.append(candidate) + + logging.info(f"📬 {len(tweets_to_post)} tweets remain after 30-vs-30 duplicate filtering.") + + if not tweets_to_post: + logging.info("✅ No new tweets need posting after duplicate comparison.") + return + new_posts = 0 state_file = "twitter_browser_state.json" @@ -656,29 +697,12 @@ def sync_feeds(args): context = browser.new_context(**context_kwargs) - for tweet in reversed(tweets): - tweet_time = arrow.get(tweet.created_on) + for candidate in tweets_to_post: + tweet = candidate["tweet"] + tweet_time = candidate["tweet_time"] + raw_text = candidate["raw_text"] - if tweet_time < too_old_cutoff: - logging.info(f"⏭️ Skipping old tweet from {tweet_time}") - continue - - raw_text = tweet.text.strip() - - if len(raw_text) > 295: - truncated = raw_text[:290] - last_space = truncated.rfind(" ") - if last_space > 0: - raw_text = truncated[:last_space] + "..." - else: - raw_text = truncated + "..." - logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.") - - if is_duplicate_bsky_text(raw_text, recent_bsky_texts): - logging.info("⏭️ Skipping tweet because its text already exists in the last 30 Bluesky posts.") - continue - - logging.info(f"📝 Found candidate tweet from {tweet_time}. Posting to Bluesky...") + logging.info(f"📝 Posting missing tweet from {tweet_time} to Bluesky...") rich_text = make_rich(raw_text) dynamic_alt = build_dynamic_alt(raw_text) @@ -736,9 +760,7 @@ def sync_feeds(args): else: bsky_client.send_post(text=rich_text, langs=["ca"]) - recent_bsky_texts.insert(0, normalize_post_text(raw_text)) - recent_bsky_texts = recent_bsky_texts[:DEDUPE_BSKY_LIMIT] - + recent_bsky_text_set.add(candidate["normalized_text"]) new_posts += 1 logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}") time.sleep(5)