From 62ec99a03bdf1d2aafc65d2315a8a7b4aad65181 Mon Sep 17 00:00:00 2001
From: Guillem Hernandez Sola <guillem.hernandez.sola@gmail.com>
Date: Thu, 9 Apr 2026 17:15:02 +0200
Subject: [PATCH] fix(sync): preserve full text for long link posts and use
 expanded URL only for Bluesky external cards

---
 twitter2bsky_daemon.py | 102 ++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 42 deletions(-)

diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index 4825ac1..1775bcd 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -289,7 +289,7 @@ def canonicalize_tweet_url(url):
 def is_x_or_twitter_domain(url):
     try:
         hostname = (urlparse(url).hostname or "").lower()
-        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
+        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com", "t.co"}
     except Exception:
         return False
 
@@ -326,6 +326,34 @@ def extract_ordered_non_x_urls(text):
     return ordered
 
 
+def remove_url_from_visible_text(text, url_to_remove):
+    """
+    Return text with every URL matching url_to_remove cut out; falsy inputs are returned as-is.
+    """
+    if not text or not url_to_remove:
+        return text
+    # URLs are compared via canonicalize_url() on both sides, not as raw strings.
+    canonical_target = canonicalize_url(url_to_remove)
+    lines = text.splitlines()
+    cleaned_lines = []
+    # Work line by line so the original line breaks survive the removal.
+    for line in lines:
+        line_urls = extract_urls_from_text(line)
+        new_line = line
+        # Trailing punctuation is stripped only for the comparison; the raw extracted match is what gets cut.
+        for url in line_urls:
+            if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target:
+                new_line = new_line.replace(url, "").strip()
+
+        cleaned_lines.append(new_line)
+    # Tidy up (runs even if nothing matched): collapse space/tab runs, cap newline runs at two.
+    result = "\n".join(cleaned_lines)
+    result = re.sub(r"[ \t]+", " ", result)
+    result = re.sub(r"\n{3,}", "\n\n", result).strip()
+
+    return result
+
+
 def looks_like_title_plus_url_post(text):
     if not text:
         return False
@@ -354,7 +382,16 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
     return truncated + "..."
 
 
-def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
+def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
+    """
+    Choose the final visible Bluesky text.
+
+    Rules:
+    - If the whitespace-stripped full text fits, return it unchanged.
+    - If it doesn't fit, a primary external URL is given and prefer_full_text_without_url is set:
+      - prefer full text WITHOUT the URL if that fits
+      - otherwise fall back to truncation
+    """
     text = (full_clean_text or "").strip()
     if not text:
         return text
@@ -362,38 +399,11 @@ def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
     if len(text) <= BSKY_TEXT_MAX_LENGTH:
         return text
 
-    if keep_url:
-        canonical_keep = canonicalize_url(keep_url)
-        urls = extract_ordered_non_x_urls(text)
-
-        matched_url = None
-        for url in urls:
-            if canonicalize_url(url) == canonical_keep:
-                matched_url = url
-                break
-
-        if matched_url and matched_url in text:
-            idx = text.find(matched_url)
-            prefix = text[:idx].rstrip()
-            suffix = matched_url
-
-            reserve = len(suffix) + 1
-            available = BSKY_TEXT_MAX_LENGTH - reserve
-
-            if available > 10:
-                trimmed_prefix = prefix
-                if len(trimmed_prefix) > available:
-                    trimmed_prefix = trimmed_prefix[:available - 3]
-                    last_space = trimmed_prefix.rfind(" ")
-                    if last_space > 0:
-                        trimmed_prefix = trimmed_prefix[:last_space] + "..."
-                    else:
-                        trimmed_prefix = trimmed_prefix + "..."
-
-                final_text = f"{trimmed_prefix.rstrip()} {suffix}".strip()
-                if len(final_text) <= BSKY_TEXT_MAX_LENGTH:
-                    logging.info("🔗 Preserved non-X URL in final Bluesky text for card generation")
-                    return final_text
+    if primary_non_x_url and prefer_full_text_without_url:
+        text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
+        if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
+            logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
+            return text_without_url
 
     return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
 
@@ -780,10 +790,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
 
 
 def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
-    """
-    Compress/resize normal tweet images so they fit within Bluesky image blob limits.
-    Returns JPEG bytes or None.
-    """
     try:
         with Image.open(io.BytesIO(image_bytes)) as img:
             img = img.convert("RGB")
@@ -1606,14 +1612,26 @@ def sync_feeds(args):
                 canonical_non_x_urls = set(ordered_non_x_urls)
                 primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
 
-                raw_text = prepare_post_text_for_bsky(full_clean_text, keep_url=primary_non_x_url)
+                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
+                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
+
+                # Link-only/text-only posts with external cards get special visible text handling.
+                if primary_non_x_url and not has_video and not has_photo:
+                    raw_text = choose_final_visible_text(
+                        full_clean_text,
+                        primary_non_x_url=primary_non_x_url,
+                        prefer_full_text_without_url=True,
+                    )
+                else:
+                    raw_text = choose_final_visible_text(
+                        full_clean_text,
+                        primary_non_x_url=None,
+                        prefer_full_text_without_url=False,
+                    )
 
                 media_fingerprint = build_media_fingerprint(tweet)
                 text_media_key = build_text_media_key(normalized_text, media_fingerprint)
 
-                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
-                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
-
                 candidate_tweets.append({
                     "tweet": tweet,
                     "tweet_time": tweet_time,