fix(sync): preserve full text for long link posts and use expanded URL only for Bluesky external cards

This commit is contained in:
Guillem Hernandez Sola
2026-04-09 17:15:02 +02:00
parent 24abd8d32f
commit 62ec99a03b

View File

@@ -289,7 +289,7 @@ def canonicalize_tweet_url(url):
def is_x_or_twitter_domain(url):
    """
    Return True when the URL's hostname is an X/Twitter-owned host.

    The hostname is extracted with ``urlparse`` and compared
    case-insensitively against a fixed set of hosts, which includes the
    ``t.co`` host so that such links are classified the same way as
    x.com / twitter.com links.

    Any failure while parsing (for example a malformed URL or a
    non-string argument) is treated as "not an X/Twitter domain" and
    yields False rather than raising.
    """
    try:
        # ``hostname`` is None for URLs without a network location.
        hostname = (urlparse(url).hostname or "").lower()
        return hostname in {
            "x.com",
            "www.x.com",
            "twitter.com",
            "www.twitter.com",
            "mobile.twitter.com",
            "t.co",
        }
    except Exception:
        # Deliberate best-effort: callers rely on a plain boolean answer.
        return False
@@ -326,6 +326,34 @@ def extract_ordered_non_x_urls(text):
return ordered
def remove_url_from_visible_text(text, url_to_remove):
    """
    Strip every occurrence of one URL from the text, line by line.

    A URL found in a line is removed when its canonical form (computed
    after ``strip_trailing_url_punctuation``) equals the canonical form
    of ``url_to_remove``. Line breaks are kept so paragraphs survive;
    afterwards runs of spaces/tabs are collapsed to a single space and
    three or more consecutive newlines are reduced to two.

    Returns ``text`` unchanged when either argument is empty.
    """
    if not (text and url_to_remove):
        return text

    wanted = canonicalize_url(url_to_remove)

    def _scrub(line):
        # URLs are extracted once from the untouched line; each match is
        # then cut out of the progressively cleaned line.
        for candidate in extract_urls_from_text(line):
            if canonicalize_url(strip_trailing_url_punctuation(candidate)) == wanted:
                line = line.replace(candidate, "").strip()
        return line

    joined = "\n".join(_scrub(line) for line in text.splitlines())
    joined = re.sub(r"[ \t]+", " ", joined)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()
def looks_like_title_plus_url_post(text):
if not text:
return False
@@ -354,7 +382,16 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
return truncated + "..."
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
"""
Choose the final visible Bluesky text.
Rules:
- If full text fits, keep it exactly.
- If it doesn't fit and there is a long external URL:
- prefer full text WITHOUT the URL if that fits
- otherwise fall back to truncation
"""
text = (full_clean_text or "").strip()
if not text:
return text
@@ -362,38 +399,11 @@ def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
if len(text) <= BSKY_TEXT_MAX_LENGTH:
return text
if keep_url:
canonical_keep = canonicalize_url(keep_url)
urls = extract_ordered_non_x_urls(text)
matched_url = None
for url in urls:
if canonicalize_url(url) == canonical_keep:
matched_url = url
break
if matched_url and matched_url in text:
idx = text.find(matched_url)
prefix = text[:idx].rstrip()
suffix = matched_url
reserve = len(suffix) + 1
available = BSKY_TEXT_MAX_LENGTH - reserve
if available > 10:
trimmed_prefix = prefix
if len(trimmed_prefix) > available:
trimmed_prefix = trimmed_prefix[:available - 3]
last_space = trimmed_prefix.rfind(" ")
if last_space > 0:
trimmed_prefix = trimmed_prefix[:last_space] + "..."
else:
trimmed_prefix = trimmed_prefix + "..."
final_text = f"{trimmed_prefix.rstrip()} {suffix}".strip()
if len(final_text) <= BSKY_TEXT_MAX_LENGTH:
logging.info("🔗 Preserved non-X URL in final Bluesky text for card generation")
return final_text
if primary_non_x_url and prefer_full_text_without_url:
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
return text_without_url
return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
@@ -780,10 +790,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
"""
Compress/resize normal tweet images so they fit within Bluesky image blob limits.
Returns JPEG bytes or None.
"""
try:
with Image.open(io.BytesIO(image_bytes)) as img:
img = img.convert("RGB")
@@ -1606,14 +1612,26 @@ def sync_feeds(args):
canonical_non_x_urls = set(ordered_non_x_urls)
primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
raw_text = prepare_post_text_for_bsky(full_clean_text, keep_url=primary_non_x_url)
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
# Link-only/text-only posts with external cards get special visible text handling.
if primary_non_x_url and not has_video and not has_photo:
raw_text = choose_final_visible_text(
full_clean_text,
primary_non_x_url=primary_non_x_url,
prefer_full_text_without_url=True,
)
else:
raw_text = choose_final_visible_text(
full_clean_text,
primary_non_x_url=None,
prefer_full_text_without_url=False,
)
media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
candidate_tweets.append({
"tweet": tweet,
"tweet_time": tweet_time,