def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
    """
    Decide whether the primary external URL opens a tail worth keeping visible.

    The text is first passed through repair_broken_urls(). The result is True when:
      - the URL is followed, after whitespace, by at least one hashtag
        (on the same line or on a later line), or
      - the URL appears on a line that does not itself start with http:// or https://.

    Returns False when either argument is empty, when the URL is not found in the
    repaired text, or when the text from the URL onward does not start with http(s)://.
    """
    if not text or not primary_non_x_url:
        return False

    repaired = repair_broken_urls(text)
    url_start = repaired.find(primary_non_x_url)
    if url_start == -1:
        return False

    tail = repaired[url_start:].strip()
    if not tail.startswith(("http://", "https://")):
        return False

    # Search only what comes AFTER the URL and require whitespace before "#":
    # a fragment such as https://site/page#section must not count as a hashtag,
    # while "\n#tag" on the following line must (no reliance on "." matching newlines).
    after_url = repaired[url_start + len(primary_non_x_url):]
    if re.search(r"\s#[^\s#]+", after_url):
        return True

    # URL embedded in a line that starts with other text: treat it as part of the message.
    return any(
        primary_non_x_url in line and not line.startswith(("http://", "https://"))
        for line in (raw_line.strip() for raw_line in repaired.splitlines())
    )
def _compose_with_preserved_tail(text, tail_start, max_length):
    """
    Build "<shortened prefix>... <tail>", where tail is text[tail_start:] stripped.

    Returns the composed string when it fits in max_length, or None when the tail
    is empty or too long to keep. Callers decide what to fall back to.
    """
    tail = text[tail_start:].strip()
    if not tail:
        return None

    # Room needed for the "... " joiner (4 characters) plus the tail itself.
    reserve = len(tail) + 4
    if reserve >= max_length:
        return None

    available_prefix = max_length - reserve
    prefix = text[:tail_start].rstrip()
    if len(prefix) > available_prefix:
        prefix = prefix[:available_prefix].rstrip()
        # Prefer cutting at a word boundary, unless that would leave almost nothing.
        last_space = prefix.rfind(" ")
        if last_space > 20:
            prefix = prefix[:last_space]

    composed = f"{prefix}... {tail}".strip()
    return composed if len(composed) <= max_length else None


def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
    """
    Shorten text to max_length while keeping everything from tail_start onward.

    Text that already fits is returned unchanged. When tail_start is None or out of
    range, or the tail cannot be kept within max_length, the result is whatever
    truncate_text_safely(text, max_length) returns. Empty or None text yields ""
    instead of reaching len() with a None value.
    """
    if not text:
        return text or ""

    if len(text) <= max_length:
        return text

    if tail_start is None or not 0 <= tail_start < len(text):
        return truncate_text_safely(text, max_length)

    composed = _compose_with_preserved_tail(text, tail_start, max_length)
    if composed is not None:
        return composed

    return truncate_text_safely(text, max_length)


def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
    """
    Choose the final visible Bluesky text.

    Rules, applied in order once the stripped text exceeds BSKY_TEXT_MAX_LENGTH:
      1. If a primary external URL is given and looks_like_url_and_tag_tail() accepts it,
         shorten the text before the URL and keep the URL plus what follows it.
      2. If prefer_full_text_without_url is set, use the text with the URL removed
         when that fits.
      3. Otherwise fall back to truncate_text_safely().

    Step 1 only returns when the tail was actually kept. If the tail is too long,
    steps 2 and 3 still run rather than returning a plainly truncated string under
    a "preserved" log message.

    The sync_feeds call site visible in this patch passes the URL together with
    prefer_full_text_without_url=False in its fallback branch, so step 2 is skipped there.
    """
    text = (full_clean_text or "").strip()
    if not text or len(text) <= BSKY_TEXT_MAX_LENGTH:
        return text

    if primary_non_x_url:
        # If the URL and hashtag tail are semantically important, preserve the tail first.
        if looks_like_url_and_tag_tail(text, primary_non_x_url):
            url_pos = text.find(primary_non_x_url)
            if url_pos != -1:
                preserved = _compose_with_preserved_tail(text, url_pos, BSKY_TEXT_MAX_LENGTH)
                if preserved is not None:
                    logging.info("🔗 Preserving meaningful URL/tag tail in visible Bluesky text")
                    return preserved

        # For article-card-style posts, prefer removing the URL entirely from visible text.
        if prefer_full_text_without_url:
            text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
            if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
                logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
                return text_without_url

    return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)