fix(sync): preserve exact original tweet text, visible links, and hashtags when post fits Bluesky

2026-04-10 13:36:45 +02:00
parent fbc8dda1e2
commit 3a4b6ce65e
1 changed file with 107 additions and 81 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -297,8 +297,9 @@ def is_x_or_twitter_domain(url):
 def extract_urls_from_text(text):
    if not text:
        return []
    repaired = repair_broken_urls(text)
-    return re.findall(r"https?://[^\s]+", repaired)
+    return re.findall(r"https?://[^\s#]+", repaired)
 def extract_non_x_urls_from_text(text):
@@ -326,6 +327,14 @@ def extract_ordered_non_x_urls(text):
    return ordered
 def extract_first_visible_non_x_url(text):
    """Return the first canonicalized URL found in *text*, or None.

    URLs are taken, in order, from ``extract_non_x_urls_from_text`` (a falsy
    *text* is treated as the empty string). Each one is passed through
    ``canonicalize_url``; the first truthy canonical form wins.
    """
    # Lazy generator: canonicalization stops as soon as a usable URL is found.
    canonical_forms = (
        canonicalize_url(candidate)
        for candidate in extract_non_x_urls_from_text(text or "")
    )
    return next((form for form in canonical_forms if form), None)
 def remove_url_from_visible_text(text, url_to_remove):
    if not text or not url_to_remove:
        return text
@@ -339,9 +348,12 @@ def remove_url_from_visible_text(text, url_to_remove):
        new_line = line
        for url in line_urls:
-            if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target:
+            cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(url))
-                new_line = new_line.replace(url, "").strip()
+            if cleaned_candidate == canonical_target:
                pattern = re.escape(url)
                new_line = re.sub(pattern, "", new_line)
        new_line = re.sub(r"[ \t]+", " ", new_line).strip()
        cleaned_lines.append(new_line)
    result = "\n".join(cleaned_lines)
@@ -368,64 +380,6 @@ def looks_like_title_plus_url_post(text):
    return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
 def find_tail_preservation_start(text, primary_non_x_url):
    """Choose the index in *text* where the tail to preserve should begin.

    Returns None when either argument is falsy or the URL does not occur in
    *text*. Otherwise returns a boundary index before the URL (end of a
    punctuation+whitespace run, start of the URL's line, or a whitespace
    position roughly 120 characters back when a hashtag follows the URL).
    Falls back to the URL's own position when no boundary lies within 180
    characters before it.
    """
    if not (text and primary_non_x_url):
        return None

    link_at = text.find(primary_non_x_url)
    if link_at < 0:
        return None

    head = text[:link_at]
    boundaries = [link_at]

    # Clause boundaries: the position right after punctuation + whitespace.
    for boundary_regex in (r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+"):
        boundaries.extend(hit.end() for hit in re.finditer(boundary_regex, head))

    # Start of the line that contains the URL, if there is an earlier newline.
    line_break = head.rfind("\n")
    if line_break >= 0:
        boundaries.append(line_break + 1)

    # A hashtag after the URL earns a more generous block before it:
    # step back ~120 characters, then walk left to a space or newline.
    if re.search(r"\s#[^\s#]+", text[link_at:]) is not None:
        wide = max(0, link_at - 120)
        while wide > 0 and text[wide] not in (" ", "\n"):
            wide -= 1
        boundaries.append(wide)

    # Only boundaries strictly before the URL and at most 180 chars away count.
    usable = [b for b in boundaries if 0 <= b < link_at and link_at - b <= 180]
    if not usable:
        return link_at

    # The largest index is the boundary closest to the URL.
    nearest = max(usable)
    if link_at - nearest >= 35:
        return nearest

    # Nearest boundary is too close; prefer the closest one at least 35 back.
    roomy = [b for b in usable if link_at - b >= 35]
    return max(roomy) if roomy else nearest
 def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
    if not text or not primary_non_x_url:
        return False
@@ -445,6 +399,59 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
    return False
 def find_tail_preservation_start(text, primary_non_x_url):
    """Choose the index in *text* where the tail to preserve should begin.

    Returns None when either argument is falsy or the URL does not occur in
    *text*. Otherwise returns a boundary index before the URL (end of a
    punctuation+whitespace run, start of the URL's line, or a whitespace
    position roughly 120 characters back when a hashtag follows the URL).
    Falls back to the URL's own position when no boundary lies within 180
    characters before it.
    """
    if not (text and primary_non_x_url):
        return None

    link_at = text.find(primary_non_x_url)
    if link_at < 0:
        return None

    head = text[:link_at]
    boundaries = [link_at]

    # Clause boundaries: the position right after punctuation + whitespace.
    for boundary_regex in (r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+"):
        boundaries.extend(hit.end() for hit in re.finditer(boundary_regex, head))

    # Start of the line that contains the URL, if there is an earlier newline.
    line_break = head.rfind("\n")
    if line_break >= 0:
        boundaries.append(line_break + 1)

    # A hashtag after the URL earns a more generous block before it:
    # step back ~120 characters, then walk left to a space or newline.
    if re.search(r"\s#[^\s#]+", text[link_at:]) is not None:
        wide = max(0, link_at - 120)
        while wide > 0 and text[wide] not in (" ", "\n"):
            wide -= 1
        boundaries.append(wide)

    # Only boundaries strictly before the URL and at most 180 chars away count.
    usable = [b for b in boundaries if 0 <= b < link_at and link_at - b <= 180]
    if not usable:
        return link_at

    # The largest index is the boundary closest to the URL.
    nearest = max(usable)
    if link_at - nearest >= 35:
        return nearest

    # Nearest boundary is too close; prefer the closest one at least 35 back.
    roomy = [b for b in usable if link_at - b >= 35]
    return max(roomy) if roomy else nearest
 def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
    if len(text) <= max_length:
        return text
@@ -469,14 +476,13 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
    reserve = len(tail) + 4
    if reserve >= max_length:
-        # Tail too large; keep the tail itself and trim from its front carefully.
+        shortened_tail = tail[-(max_length - 3):].strip()
-        shortened_tail = tail
+
        if len(shortened_tail) > max_length - 3:
            shortened_tail = shortened_tail[-(max_length - 3):]
        first_space = shortened_tail.find(" ")
-            if first_space > 0 and first_space < 40:
+        if 0 <= first_space <= 30:
-                shortened_tail = shortened_tail[first_space + 1:]
+            shortened_tail = shortened_tail[first_space + 1:].strip()
-        return "..." + shortened_tail[-(max_length - 3):]
+
        return f"...{shortened_tail}"
    available_prefix = max_length - reserve
    prefix = text[:tail_start].rstrip()
@@ -485,9 +491,12 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
        prefix = prefix[:available_prefix].rstrip()
        last_space = prefix.rfind(" ")
        if last_space > 20:
-            prefix = prefix[:last_space]
+            prefix = prefix[:last_space].rstrip()
    final_text = f"{prefix}... {tail}".strip()
    final_text = re.sub(r"[ \t]+", " ", final_text)
    final_text = re.sub(r"\n{3,}", "\n\n", final_text).strip()
    if len(final_text) <= max_length:
        return final_text
@@ -495,11 +504,13 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
 def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
-    text = (full_clean_text or "").strip()
+    text = clean_post_text(full_clean_text or "")
    if not text:
        return text
    # Golden rule: preserve exact original cleaned tweet text if it fits.
    if len(text) <= BSKY_TEXT_MAX_LENGTH:
        logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
        return text
    if primary_non_x_url:
@@ -517,7 +528,9 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
                logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
                return text_without_url
-    return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
+    truncated = truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
    logging.info("✂️ Falling back to safe truncation for visible Bluesky text")
    return truncated
 def normalize_post_text(text):
@@ -1203,21 +1216,31 @@ def make_rich(content):
                    text_builder.text(" ")
                continue
-            if word.startswith("http://") or word.startswith("https://"):
+            cleaned_word = strip_trailing_url_punctuation(word)
                if word.startswith("http://"):
                    word = word.replace("http://", "https://", 1)
-                word = strip_trailing_url_punctuation(word)
+            if cleaned_word.startswith("http://") or cleaned_word.startswith("https://"):
-                clean_url_value = clean_url(word)
+                if cleaned_word.startswith("http://"):
                    cleaned_word = cleaned_word.replace("http://", "https://", 1)
                clean_url_value = clean_url(cleaned_word)
                if clean_url_value and is_valid_url(clean_url_value):
                    text_builder.link(clean_url_value, clean_url_value)
                    trailing = word[len(cleaned_word):]
                    if trailing:
                        text_builder.text(trailing)
                else:
                    text_builder.text(word)
-            elif word.startswith("#"):
+            elif cleaned_word.startswith("#") and len(cleaned_word) > 1:
-                clean_tag = word[1:].rstrip(".,;:!?)'\"…")
+                clean_tag = cleaned_word[1:].rstrip(".,;:!?)'\"…")
-                text_builder.tag(word, clean_tag)
+                if clean_tag:
                    text_builder.tag(cleaned_word, clean_tag)
                    trailing = word[len(cleaned_word):]
                    if trailing:
                        text_builder.text(trailing)
                else:
                    text_builder.text(word)
            else:
                text_builder.text(word)
@@ -1722,7 +1745,10 @@ def sync_feeds(args):
                ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
                canonical_non_x_urls = set(ordered_non_x_urls)
-                primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
+
                primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
                if not primary_non_x_url and ordered_non_x_urls:
                    primary_non_x_url = ordered_non_x_urls[0]
                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))