fix(sync): preserve meaningful url/tag tails in long tweet text instead of truncating them away
This commit is contained in:
@@ -327,9 +327,6 @@ def extract_ordered_non_x_urls(text):
|
|||||||
|
|
||||||
|
|
||||||
def remove_url_from_visible_text(text, url_to_remove):
|
def remove_url_from_visible_text(text, url_to_remove):
|
||||||
"""
|
|
||||||
Remove a specific URL from visible text while preserving paragraph structure as much as possible.
|
|
||||||
"""
|
|
||||||
if not text or not url_to_remove:
|
if not text or not url_to_remove:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -371,6 +368,33 @@ def looks_like_title_plus_url_post(text):
|
|||||||
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
    """
    Decide whether the primary external URL begins a tail worth keeping visible.

    The text is first passed through repair_broken_urls(). The "tail" is
    everything from the first occurrence of primary_non_x_url to the end of
    that repaired text, with surrounding whitespace stripped.

    Returns True when either:
    - the tail starts with an http(s) URL that is followed, on the same line
      or a later one, by a whitespace-separated hashtag, or
    - the URL appears on a line that does not itself start with
      "http://" / "https://" (i.e. it is embedded after other text).

    Returns False for empty text, a missing URL argument, a URL that cannot be
    found in the repaired text, or a tail that does not start with http(s).
    """
    if not text or not primary_non_x_url:
        return False

    repaired = repair_broken_urls(text)
    idx = repaired.find(primary_non_x_url)
    if idx == -1:
        return False

    tail = repaired[idx:].strip()
    if not tail.startswith(("http://", "https://")):
        return False

    # A real hashtag is separated from what precedes it by whitespace. Requiring
    # that whitespace keeps a URL fragment ("https://site/page#section") from
    # being mistaken for a hashtag. DOTALL lets ".*" cross line breaks so that
    # hashtags placed on the line(s) after the URL are recognised as well.
    if re.search(r"https?://\S+.*\s#[^\s#]+", tail, re.DOTALL):
        return True

    # A URL embedded in a line that starts with other text is also treated as
    # meaningful; a URL sitting alone at the start of its line is not.
    return any(
        primary_non_x_url in line and not line.startswith(("http://", "https://"))
        for line in (raw_line.strip() for raw_line in repaired.splitlines())
    )
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
if len(text) <= max_length:
|
if len(text) <= max_length:
|
||||||
return text
|
return text
|
||||||
@@ -382,16 +406,39 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
return truncated + "..."
|
return truncated + "..."
|
||||||
|
|
||||||
|
|
||||||
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
|
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
    """
    Shorten text to max_length while keeping the segment that starts at tail_start.

    The result has the form "<head>... <tail>", where <tail> is
    text[tail_start:] stripped of surrounding whitespace and <head> is the
    leading part of the text cut down to whatever room is left.

    Text that already fits is returned unchanged. The function defers to
    truncate_text_safely(text, max_length) whenever the tail cannot be honoured:
    empty text, a missing or out-of-range tail_start, an empty tail, a tail
    that leaves no room for any head, or an assembled result that is still
    too long.
    """
    tail_is_addressable = (
        bool(text) and tail_start is not None and 0 <= tail_start < len(text)
    )
    if not tail_is_addressable:
        return truncate_text_safely(text, max_length)

    if len(text) <= max_length:
        return text

    kept_tail = text[tail_start:].strip()
    # Four characters are set aside for the "... " joiner placed before the tail.
    room_for_head = max_length - (len(kept_tail) + 4)
    if not kept_tail or room_for_head <= 0:
        return truncate_text_safely(text, max_length)

    head = text[:tail_start].rstrip()
    if len(head) > room_for_head:
        head = head[:room_for_head].rstrip()
        # Back up to the last space so a word is not cut in half, but only
        # when that space lies beyond index 20 (keeps the head from collapsing).
        word_break = head.rfind(" ")
        if word_break > 20:
            head = head[:word_break]

    candidate = f"{head}... {kept_tail}".strip()
    if len(candidate) > max_length:
        return truncate_text_safely(text, max_length)
    return candidate
|
||||||
|
|
||||||
|
|
||||||
|
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
|
||||||
text = (full_clean_text or "").strip()
|
text = (full_clean_text or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
@@ -399,7 +446,18 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
|
|||||||
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
if primary_non_x_url and prefer_full_text_without_url:
|
if primary_non_x_url:
|
||||||
|
# If the URL and hashtag tail are semantically important, preserve the tail first.
|
||||||
|
if looks_like_url_and_tag_tail(text, primary_non_x_url):
|
||||||
|
url_pos = text.find(primary_non_x_url)
|
||||||
|
if url_pos != -1:
|
||||||
|
preserved = truncate_text_preserving_tail(text, url_pos, BSKY_TEXT_MAX_LENGTH)
|
||||||
|
if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
|
logging.info("🔗 Preserving meaningful URL/tag tail in visible Bluesky text")
|
||||||
|
return preserved
|
||||||
|
|
||||||
|
# For article-card-style posts, prefer removing the URL entirely from visible text.
|
||||||
|
if prefer_full_text_without_url:
|
||||||
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
|
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
|
||||||
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
||||||
@@ -1615,7 +1673,6 @@ def sync_feeds(args):
|
|||||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||||
|
|
||||||
# Link-only/text-only posts with external cards get special visible text handling.
|
|
||||||
if primary_non_x_url and not has_video and not has_photo:
|
if primary_non_x_url and not has_video and not has_photo:
|
||||||
raw_text = choose_final_visible_text(
|
raw_text = choose_final_visible_text(
|
||||||
full_clean_text,
|
full_clean_text,
|
||||||
@@ -1625,7 +1682,7 @@ def sync_feeds(args):
|
|||||||
else:
|
else:
|
||||||
raw_text = choose_final_visible_text(
|
raw_text = choose_final_visible_text(
|
||||||
full_clean_text,
|
full_clean_text,
|
||||||
primary_non_x_url=None,
|
primary_non_x_url=primary_non_x_url,
|
||||||
prefer_full_text_without_url=False,
|
prefer_full_text_without_url=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user