From 3a4b6ce65e296b2484ed8902a8cd8be25e7026dd Mon Sep 17 00:00:00 2001
From: Guillem Hernandez Sola
Date: Fri, 10 Apr 2026 13:36:45 +0200
Subject: [PATCH] fix(sync): preserve exact original tweet text, visible links, and hashtags when post fits Bluesky

---
 twitter2bsky_daemon.py | 188 +++++++++++++++++++++++------------------
 1 file changed, 107 insertions(+), 81 deletions(-)

diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index e4571be..99f3c41 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -297,8 +297,9 @@ def is_x_or_twitter_domain(url):
 def extract_urls_from_text(text):
     if not text:
         return []
+
     repaired = repair_broken_urls(text)
-    return re.findall(r"https?://[^\s]+", repaired)
+    return re.findall(r"https?://[^\s#]+", repaired)
 
 
 def extract_non_x_urls_from_text(text):
@@ -326,6 +327,14 @@
     return ordered
 
 
+def extract_first_visible_non_x_url(text):
+    for url in extract_non_x_urls_from_text(text or ""):
+        canonical = canonicalize_url(url)
+        if canonical:
+            return canonical
+    return None
+
+
 def remove_url_from_visible_text(text, url_to_remove):
     if not text or not url_to_remove:
         return text
@@ -339,9 +348,12 @@ def remove_url_from_visible_text(text, url_to_remove):
 
         new_line = line
         for url in line_urls:
-            if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target:
-                new_line = new_line.replace(url, "").strip()
+            cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(url))
+            if cleaned_candidate == canonical_target:
+                pattern = re.escape(url)
+                new_line = re.sub(pattern, "", new_line)
+                new_line = re.sub(r"[ \t]+", " ", new_line).strip()
 
         cleaned_lines.append(new_line)
 
     result = "\n".join(cleaned_lines)
@@ -368,64 +380,6 @@ def looks_like_title_plus_url_post(text):
     return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
 
 
-def find_tail_preservation_start(text, primary_non_x_url):
-    if not text or not primary_non_x_url:
-        return None
-
-    url_pos = text.find(primary_non_x_url)
-    if url_pos == -1:
-        return None
-
-    hashtag_match = re.search(r"\s#[^\s#]+", text[url_pos:])
-    has_hashtag_after_url = hashtag_match is not None
-
-    candidates = [url_pos]
-
-    # Prefer clause boundaries before the URL.
-    clause_patterns = [
-        r"\.\s+",
-        r":\s+",
-        r";\s+",
-        r"!\s+",
-        r"\?\s+",
-        r",\s+",
-    ]
-
-    before = text[:url_pos]
-    for pattern in clause_patterns:
-        for match in re.finditer(pattern, before):
-            candidates.append(match.end())
-
-    # Prefer previous line break if present.
-    last_newline = before.rfind("\n")
-    if last_newline != -1:
-        candidates.append(last_newline + 1)
-
-    # If there are hashtags after the URL, preserve a more generous block before it.
-    if has_hashtag_after_url:
-        generous_start = max(0, url_pos - 120)
-        while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
-            generous_start -= 1
-        candidates.append(generous_start)
-
-    # Choose the closest reasonable boundary before the URL, but not too close.
-    reasonable_candidates = [
-        c for c in candidates
-        if 0 <= c < url_pos and (url_pos - c) <= 180
-    ]
-
-    if reasonable_candidates:
-        start = min(reasonable_candidates, key=lambda c: (url_pos - c))
-        # If the nearest boundary is too close, fall back to a slightly earlier one.
-        if url_pos - start < 35:
-            farther = [c for c in reasonable_candidates if url_pos - c >= 35]
-            if farther:
-                start = min(farther, key=lambda c: (url_pos - c))
-        return start
-
-    return url_pos
-
-
 def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
     if not text or not primary_non_x_url:
         return False
@@ -445,6 +399,59 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
     return False
 
 
+def find_tail_preservation_start(text, primary_non_x_url):
+    if not text or not primary_non_x_url:
+        return None
+
+    url_pos = text.find(primary_non_x_url)
+    if url_pos == -1:
+        return None
+
+    hashtag_match = re.search(r"\s#[^\s#]+", text[url_pos:])
+    has_hashtag_after_url = hashtag_match is not None
+
+    candidates = [url_pos]
+
+    clause_patterns = [
+        r"\.\s+",
+        r":\s+",
+        r";\s+",
+        r"!\s+",
+        r"\?\s+",
+        r",\s+",
+    ]
+
+    before = text[:url_pos]
+    for pattern in clause_patterns:
+        for match in re.finditer(pattern, before):
+            candidates.append(match.end())
+
+    last_newline = before.rfind("\n")
+    if last_newline != -1:
+        candidates.append(last_newline + 1)
+
+    if has_hashtag_after_url:
+        generous_start = max(0, url_pos - 120)
+        while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
+            generous_start -= 1
+        candidates.append(generous_start)
+
+    reasonable_candidates = [
+        c for c in candidates
+        if 0 <= c < url_pos and (url_pos - c) <= 180
+    ]
+
+    if reasonable_candidates:
+        start = min(reasonable_candidates, key=lambda c: (url_pos - c))
+        if url_pos - start < 35:
+            farther = [c for c in reasonable_candidates if url_pos - c >= 35]
+            if farther:
+                start = min(farther, key=lambda c: (url_pos - c))
+        return start
+
+    return url_pos
+
+
 def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
     if len(text) <= max_length:
         return text
@@ -469,14 +476,13 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
     reserve = len(tail) + 4
 
     if reserve >= max_length:
-        # Tail too large; keep the tail itself and trim from its front carefully.
-        shortened_tail = tail
-        if len(shortened_tail) > max_length - 3:
-            shortened_tail = shortened_tail[-(max_length - 3):]
-        first_space = shortened_tail.find(" ")
-        if first_space > 0 and first_space < 40:
-            shortened_tail = shortened_tail[first_space + 1:]
-        return "..." + shortened_tail[-(max_length - 3):]
+        shortened_tail = tail[-(max_length - 3):].strip()
+
+        first_space = shortened_tail.find(" ")
+        if 0 <= first_space <= 30:
+            shortened_tail = shortened_tail[first_space + 1:].strip()
+
+        return f"...{shortened_tail}"
 
     available_prefix = max_length - reserve
     prefix = text[:tail_start].rstrip()
@@ -485,9 +491,12 @@
         prefix = prefix[:available_prefix].rstrip()
         last_space = prefix.rfind(" ")
         if last_space > 20:
-            prefix = prefix[:last_space]
+            prefix = prefix[:last_space].rstrip()
 
     final_text = f"{prefix}... {tail}".strip()
+    final_text = re.sub(r"[ \t]+", " ", final_text)
+    final_text = re.sub(r"\n{3,}", "\n\n", final_text).strip()
+
     if len(final_text) <= max_length:
         return final_text
@@ -495,11 +504,13 @@
 
 def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
-    text = (full_clean_text or "").strip()
+    text = clean_post_text(full_clean_text or "")
     if not text:
         return text
 
+    # Golden rule: preserve exact original cleaned tweet text if it fits.
     if len(text) <= BSKY_TEXT_MAX_LENGTH:
+        logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
         return text
 
     if primary_non_x_url:
@@ -517,7 +528,9 @@
             logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
             return text_without_url
 
-    return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
+    truncated = truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
+    logging.info("✂️ Falling back to safe truncation for visible Bluesky text")
+    return truncated
 
 
 def normalize_post_text(text):
@@ -1203,21 +1216,31 @@ def make_rich(content):
             text_builder.text(" ")
             continue
 
-        if word.startswith("http://") or word.startswith("https://"):
-            if word.startswith("http://"):
-                word = word.replace("http://", "https://", 1)
+        cleaned_word = strip_trailing_url_punctuation(word)
 
-            word = strip_trailing_url_punctuation(word)
-            clean_url_value = clean_url(word)
+        if cleaned_word.startswith("http://") or cleaned_word.startswith("https://"):
+            if cleaned_word.startswith("http://"):
+                cleaned_word = cleaned_word.replace("http://", "https://", 1)
+
+            clean_url_value = clean_url(cleaned_word)
 
             if clean_url_value and is_valid_url(clean_url_value):
                 text_builder.link(clean_url_value, clean_url_value)
+                trailing = word[len(cleaned_word):]
+                if trailing:
+                    text_builder.text(trailing)
             else:
                 text_builder.text(word)
-        elif word.startswith("#"):
-            clean_tag = word[1:].rstrip(".,;:!?)'\"…")
-            text_builder.tag(word, clean_tag)
+        elif cleaned_word.startswith("#") and len(cleaned_word) > 1:
+            clean_tag = cleaned_word[1:].rstrip(".,;:!?)'\"…")
+            if clean_tag:
+                text_builder.tag(cleaned_word, clean_tag)
+                trailing = word[len(cleaned_word):]
+                if trailing:
+                    text_builder.text(trailing)
+            else:
+                text_builder.text(word)
         else:
             text_builder.text(word)
@@ -1722,7 +1745,10 @@
 
             ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
             canonical_non_x_urls = set(ordered_non_x_urls)
-            primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
+
+            primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
+            if not primary_non_x_url and ordered_non_x_urls:
+                primary_non_x_url = ordered_non_x_urls[0]
 
             has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
             has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))