fix(sync): preserve full text for long link posts and use expanded URL only for Bluesky external cards
This commit is contained in:
@@ -289,7 +289,7 @@ def canonicalize_tweet_url(url):
|
||||
def is_x_or_twitter_domain(url):
    """
    Return True when the hostname of `url` is an X/Twitter host.

    Recognized hosts are x.com, twitter.com, their `www.` variants,
    mobile.twitter.com and the t.co link shortener. The comparison is
    case-insensitive and uses the exact hostname, so look-alike hosts such
    as `t.co.example.com` do not match.

    Any failure while parsing `url` (malformed URL, non-string input) is
    treated as "not an X/Twitter domain" and yields False.
    """
    try:
        hostname = (urlparse(url).hostname or "").lower()
    except Exception:
        # Deliberately best-effort: callers only need a yes/no answer.
        return False

    return hostname in {
        "x.com",
        "www.x.com",
        "twitter.com",
        "www.twitter.com",
        "mobile.twitter.com",
        "t.co",
    }
|
||||
|
||||
@@ -326,6 +326,34 @@ def extract_ordered_non_x_urls(text):
|
||||
return ordered
|
||||
|
||||
|
||||
def remove_url_from_visible_text(text, url_to_remove):
    """
    Drop every occurrence of one URL from `text`, keeping the line layout.

    The text is processed line by line. On each line, every URL reported by
    extract_urls_from_text is compared (after strip_trailing_url_punctuation
    and canonicalize_url) with the canonical form of `url_to_remove`; matching
    URLs are deleted from the line and the line is then stripped.

    Afterwards runs of spaces/tabs collapse to a single space, three or more
    consecutive newlines collapse to two, and the result is stripped.

    If `text` or `url_to_remove` is empty/None, `text` is returned unchanged.
    """
    if not (text and url_to_remove):
        return text

    target = canonicalize_url(url_to_remove)

    def _scrub(line):
        # Remove each URL on this line whose canonical form equals the target.
        scrubbed = line
        for candidate in extract_urls_from_text(line):
            normalized = canonicalize_url(strip_trailing_url_punctuation(candidate))
            if normalized == target:
                scrubbed = scrubbed.replace(candidate, "").strip()
        return scrubbed

    joined = "\n".join(_scrub(line) for line in text.splitlines())
    joined = re.sub(r"[ \t]+", " ", joined)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()
|
||||
|
||||
|
||||
def looks_like_title_plus_url_post(text):
|
||||
if not text:
|
||||
return False
|
||||
@@ -354,7 +382,16 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
return truncated + "..."
|
||||
|
||||
|
||||
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
||||
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
    """
    Choose the final visible Bluesky text.

    Rules:
    - Leading/trailing whitespace is stripped; empty or None input yields "".
    - If the full text fits within BSKY_TEXT_MAX_LENGTH, keep it exactly.
    - If it doesn't fit, `primary_non_x_url` is given and
      `prefer_full_text_without_url` is true:
        - prefer the full text WITHOUT that URL if that version is non-empty
          and fits
    - Otherwise fall back to truncate_text_safely.
    """
    text = (full_clean_text or "").strip()
    if not text:
        return text

    if len(text) <= BSKY_TEXT_MAX_LENGTH:
        return text

    if primary_non_x_url and prefer_full_text_without_url:
        # Dropping the long link from the body can make the whole text fit.
        text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
        if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
            logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
            return text_without_url

    return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
|
||||
|
||||
@@ -780,10 +790,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
||||
|
||||
|
||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||
"""
|
||||
Compress/resize normal tweet images so they fit within Bluesky image blob limits.
|
||||
Returns JPEG bytes or None.
|
||||
"""
|
||||
try:
|
||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||
img = img.convert("RGB")
|
||||
@@ -1606,14 +1612,26 @@ def sync_feeds(args):
|
||||
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||
primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
|
||||
|
||||
raw_text = prepare_post_text_for_bsky(full_clean_text, keep_url=primary_non_x_url)
|
||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||
|
||||
# Link-only/text-only posts with external cards get special visible text handling.
|
||||
if primary_non_x_url and not has_video and not has_photo:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=primary_non_x_url,
|
||||
prefer_full_text_without_url=True,
|
||||
)
|
||||
else:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=None,
|
||||
prefer_full_text_without_url=False,
|
||||
)
|
||||
|
||||
media_fingerprint = build_media_fingerprint(tweet)
|
||||
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
||||
|
||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||
|
||||
candidate_tweets.append({
|
||||
"tweet": tweet,
|
||||
"tweet_time": tweet_time,
|
||||
|
||||
Reference in New Issue
Block a user