From 62ec99a03bdf1d2aafc65d2315a8a7b4aad65181 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Thu, 9 Apr 2026 17:15:02 +0200 Subject: [PATCH] fix(sync): preserve full text for long link posts and use expanded URL only for Bluesky external cards --- twitter2bsky_daemon.py | 102 ++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 42 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 4825ac1..1775bcd 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -289,7 +289,7 @@ def canonicalize_tweet_url(url): def is_x_or_twitter_domain(url): try: hostname = (urlparse(url).hostname or "").lower() - return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"} + return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com", "t.co"} except Exception: return False @@ -326,6 +326,34 @@ def extract_ordered_non_x_urls(text): return ordered +def remove_url_from_visible_text(text, url_to_remove): + """ + Remove a specific URL from visible text while preserving paragraph structure as much as possible. + """ + if not text or not url_to_remove: + return text + + canonical_target = canonicalize_url(url_to_remove) + lines = text.splitlines() + cleaned_lines = [] + + for line in lines: + line_urls = extract_urls_from_text(line) + new_line = line + + for url in line_urls: + if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target: + new_line = new_line.replace(url, "").strip() + + cleaned_lines.append(new_line) + + result = "\n".join(cleaned_lines) + result = re.sub(r"[ \t]+", " ", result) + result = re.sub(r"\n{3,}", "\n\n", result).strip() + + return result + + def looks_like_title_plus_url_post(text): if not text: return False @@ -354,7 +382,16 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): return truncated + "..." -def prepare_post_text_for_bsky(full_clean_text, keep_url=None): +def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True): + """ + Choose the final visible Bluesky text. + + Rules: + - If full text fits, keep it exactly. + - If it doesn't fit and there is a long external URL: + - prefer full text WITHOUT the URL if that fits + - otherwise fall back to truncation + """ text = (full_clean_text or "").strip() if not text: return text @@ -362,38 +399,11 @@ def prepare_post_text_for_bsky(full_clean_text, keep_url=None): if len(text) <= BSKY_TEXT_MAX_LENGTH: return text - if keep_url: - canonical_keep = canonicalize_url(keep_url) - urls = extract_ordered_non_x_urls(text) - - matched_url = None - for url in urls: - if canonicalize_url(url) == canonical_keep: - matched_url = url - break - - if matched_url and matched_url in text: - idx = text.find(matched_url) - prefix = text[:idx].rstrip() - suffix = matched_url - - reserve = len(suffix) + 1 - available = BSKY_TEXT_MAX_LENGTH - reserve - - if available > 10: - trimmed_prefix = prefix - if len(trimmed_prefix) > available: - trimmed_prefix = trimmed_prefix[:available - 3] - last_space = trimmed_prefix.rfind(" ") - if last_space > 0: - trimmed_prefix = trimmed_prefix[:last_space] + "..." - else: - trimmed_prefix = trimmed_prefix + "..." - - final_text = f"{trimmed_prefix.rstrip()} {suffix}".strip() - if len(final_text) <= BSKY_TEXT_MAX_LENGTH: - logging.info("🔗 Preserved non-X URL in final Bluesky text for card generation") - return final_text + if primary_non_x_url and prefer_full_text_without_url: + text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip() + if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH: + logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card") + return text_without_url return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH) @@ -780,10 +790,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): - """ - Compress/resize normal tweet images so they fit within Bluesky image blob limits. - Returns JPEG bytes or None. - """ try: with Image.open(io.BytesIO(image_bytes)) as img: img = img.convert("RGB") @@ -1606,14 +1612,26 @@ def sync_feeds(args): canonical_non_x_urls = set(ordered_non_x_urls) primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None - raw_text = prepare_post_text_for_bsky(full_clean_text, keep_url=primary_non_x_url) + has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or [])) + has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or [])) + + # Link-only/text-only posts with external cards get special visible text handling. + if primary_non_x_url and not has_video and not has_photo: + raw_text = choose_final_visible_text( + full_clean_text, + primary_non_x_url=primary_non_x_url, + prefer_full_text_without_url=True, + ) + else: + raw_text = choose_final_visible_text( + full_clean_text, + primary_non_x_url=None, + prefer_full_text_without_url=False, + ) media_fingerprint = build_media_fingerprint(tweet) text_media_key = build_text_media_key(normalized_text, media_fingerprint) - has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or [])) - has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or [])) - candidate_tweets.append({ "tweet": tweet, "tweet_time": tweet_time,