fix(sync): preserve meaningful url/tag tails in long tweet text instead of truncating them away

2026-04-10 09:49:01 +02:00
parent 28dfe6d718
commit 3c666a87c3
1 changed files with 76 additions and 19 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -327,9 +327,6 @@ def extract_ordered_non_x_urls(text):
 def remove_url_from_visible_text(text, url_to_remove):
    """
    Remove a specific URL from visible text while preserving paragraph structure as much as possible.
    """
    if not text or not url_to_remove:
        return text
@@ -371,6 +368,33 @@ def looks_like_title_plus_url_post(text):
    return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
 def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
    """
    Decide whether the primary external URL begins a tail worth keeping visible.

    The text is first passed through repair_broken_urls(); the "tail" is everything
    from the first occurrence of primary_non_x_url to the end of the repaired text.

    Returns True when either:
    - the tail contains a URL that is later followed by a whitespace-separated
      hashtag (on the same line or on a following line), or
    - some line of the repaired text contains the URL but does not start with
      "http://" / "https://", i.e. the URL shares its line with leading text.

    Returns False when text or primary_non_x_url is empty, when the URL is not
    found in the repaired text, or when neither rule above applies.
    """
    if not text or not primary_non_x_url:
        return False
    repaired = repair_broken_urls(text)
    idx = repaired.find(primary_non_x_url)
    if idx == -1:
        return False
    tail = repaired[idx:].strip()
    if not tail.startswith(("http://", "https://")):
        return False
    # URL followed by hashtags (optionally after other trailing words) is a
    # meaningful tail we should try to preserve.
    # - The hashtag must be preceded by whitespace, so a URL fragment such as
    #   "https://site/page#section" is not mistaken for a hashtag.
    # - re.DOTALL lets "." cross newlines, so hashtags placed on the line(s)
    #   after the URL are recognised as part of the same tail.
    if re.search(r"https?://\S+\s(?:.*\s)?#[^\s#]+", tail, re.DOTALL):
        return True
    # Also treat the URL as important when it is embedded in a line of text
    # rather than standing on its own line. Blank lines can never contain the
    # URL, so they need no separate filtering.
    return any(
        primary_non_x_url in line and not line.startswith(("http://", "https://"))
        for line in (raw_line.strip() for raw_line in repaired.splitlines())
    )
 def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
    if len(text) <= max_length:
        return text
@@ -382,16 +406,39 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
    return truncated + "..."
-def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
+def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
    # Shorten `text` while keeping the tail that begins at index `tail_start`
    # intact. A text that already fits is returned unchanged; otherwise the
    # result has the shape "<prefix>... <tail>". Whenever that shape cannot be
    # produced (invalid tail_start, empty tail, tail too long), the function
    # delegates to truncate_text_safely(text, max_length) instead.
    # NOTE(review): the first guard runs before the length check, so a falsy
    # `text` (including None) is also handed to truncate_text_safely - confirm
    # that helper copes with None, or that callers never pass it.
-    """
+    if not text or tail_start is None or tail_start < 0 or tail_start >= len(text):
-    Choose the final visible Bluesky text.
+        return truncate_text_safely(text, max_length)
-    Rules:
+    if len(text) <= max_length:
-    - If full text fits, keep it exactly.
+        return text
-    - If it doesn't fit and there is a long external URL:
+
-      - prefer full text WITHOUT the URL if that fits
+    tail = text[tail_start:].strip()
-      - otherwise fall back to truncation
+    if not tail:
-    """
+        return truncate_text_safely(text, max_length)
    # Need room for "... " + tail
    # (the "... " separator written into final_text below is 4 characters long)
    reserve = len(tail) + 4
    # A tail that uses up the whole budget leaves no room for any prefix, so
    # tail preservation is abandoned in favour of the plain truncation helper.
    if reserve >= max_length:
        return truncate_text_safely(text, max_length)
    # Number of characters the text before the tail may occupy.
    available_prefix = max_length - reserve
    prefix = text[:tail_start].rstrip()
    if len(prefix) > available_prefix:
        # Hard cut to the budget first, then drop trailing whitespace.
        prefix = prefix[:available_prefix].rstrip()
        # Back up to the last space (presumably to avoid ending mid-word), but
        # only when that space lies beyond index 20 of the prefix; otherwise
        # the hard cut is kept as is. Only " " is considered, not newlines.
        last_space = prefix.rfind(" ")
        if last_space > 20:
            prefix = prefix[:last_space]
    # The ellipsis separator is inserted even when the prefix was not shortened.
    final_text = f"{prefix}... {tail}".strip()
    # len(prefix) <= available_prefix at this point, so the assembled text
    # should already fit; the check below is a safeguard before returning.
    if len(final_text) <= max_length:
        return final_text
    return truncate_text_safely(text, max_length)
 def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
    text = (full_clean_text or "").strip()
    if not text:
        return text
@@ -399,7 +446,18 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
    if len(text) <= BSKY_TEXT_MAX_LENGTH:
        return text
-    if primary_non_x_url and prefer_full_text_without_url:
+    if primary_non_x_url:
        # If the URL and hashtag tail are semantically important, preserve the tail first.
        if looks_like_url_and_tag_tail(text, primary_non_x_url):
            url_pos = text.find(primary_non_x_url)
            if url_pos != -1:
                preserved = truncate_text_preserving_tail(text, url_pos, BSKY_TEXT_MAX_LENGTH)
                if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
                    logging.info("🔗 Preserving meaningful URL/tag tail in visible Bluesky text")
                    return preserved
        # For article-card-style posts, prefer removing the URL entirely from visible text.
        if prefer_full_text_without_url:
            text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
            if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
                logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
@@ -1615,7 +1673,6 @@ def sync_feeds(args):
                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
                # Link-only/text-only posts with external cards get special visible text handling.
                if primary_non_x_url and not has_video and not has_photo:
                    raw_text = choose_final_visible_text(
                        full_clean_text,
@@ -1625,7 +1682,7 @@ def sync_feeds(args):
                else:
                    raw_text = choose_final_visible_text(
                        full_clean_text,
-                        primary_non_x_url=None,
+                        primary_non_x_url=primary_non_x_url,
                        prefer_full_text_without_url=False,
                    )