diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index 2b014fa..e4571be 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -368,6 +368,57 @@ def looks_like_title_plus_url_post(text):
     return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
 
 
+def find_tail_preservation_start(text, primary_non_x_url):
+    """Return the index in text where the ending block to preserve begins.
+
+    choose_final_visible_text passes the result to
+    truncate_text_preserving_tail as tail_start.
+
+    Returns None when either argument is empty or primary_non_x_url does not
+    occur in text. Otherwise returns the chosen boundary before the URL, or
+    the URL's own position when no boundary lies within 180 characters of it.
+    """
+    if not text or not primary_non_x_url:
+        return None
+
+    url_pos = text.find(primary_non_x_url)
+    if url_pos == -1:
+        return None
+
+    before = text[:url_pos]
+
+    # Prefer clause boundaries before the URL: right after punctuation + whitespace.
+    candidates = [match.end() for match in re.finditer(r"[.:;!?,]\s+", before)]
+
+    # Prefer previous line break if present.
+    last_newline = before.rfind("\n")
+    if last_newline != -1:
+        candidates.append(last_newline + 1)
+
+    # If there are hashtags after the URL, preserve a more generous block before it.
+    if re.search(r"\s#[^\s#]+", text[url_pos:]):
+        generous_start = max(0, url_pos - 120)
+        while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
+            generous_start -= 1
+        # Step past the separator so this candidate starts on a word, like the clause candidates.
+        while generous_start < url_pos and text[generous_start] in {" ", "\n"}:
+            generous_start += 1
+        candidates.append(generous_start)
+
+    # Keep only boundaries strictly before the URL and at most 180 characters away.
+    reasonable_candidates = [
+        c for c in candidates
+        if c < url_pos and (url_pos - c) <= 180
+    ]
+    if not reasonable_candidates:
+        return url_pos
+
+    # The closest boundary wins, unless it is under 35 characters from the URL
+    # and a farther one exists; then the closest of the farther ones is used.
+    farther = [c for c in reasonable_candidates if url_pos - c >= 35]
+    return max(farther) if farther else max(reasonable_candidates)
+
+
 def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
     if not text or not primary_non_x_url:
         return False
@@ -381,17 +432,9 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
     if not tail.startswith(("http://", "https://")):
         return False
 
-    # URL followed by optional hashtags / trailing words is a meaningful tail we should try to preserve.
     if re.search(r"https?://\S+.*#[^\s#]+", tail):
         return True
 
-    # Also treat URL preceded by meaningful text ending as important if the URL is not on its own line.
-    lines = [line.strip() for line in repaired.splitlines() if line.strip()]
-    if lines:
-        for line in lines:
-            if primary_non_x_url in line and not line.startswith(("http://", "https://")):
-                return True
-
     return False
 
 
@@ -417,10 +460,16 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
     if not tail:
         return truncate_text_safely(text, max_length)
 
-    # Need room for "... " + tail
     reserve = len(tail) + 4
     if reserve >= max_length:
-        return truncate_text_safely(text, max_length)
+        # Tail too large; keep the tail itself and trim from its front carefully.
+        shortened_tail = tail
+        if len(shortened_tail) > max_length - 3:
+            shortened_tail = shortened_tail[-(max_length - 3):]
+            first_space = shortened_tail.find(" ")
+            if first_space > 0 and first_space < 40:
+                shortened_tail = shortened_tail[first_space + 1:]
+        return "..." + shortened_tail[-(max_length - 3):]
 
     available_prefix = max_length - reserve
     prefix = text[:tail_start].rstrip()
@@ -447,17 +496,15 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
         return text
 
     if primary_non_x_url:
-        # If the URL and hashtag tail are semantically important, preserve the tail first.
-        if looks_like_url_and_tag_tail(text, primary_non_x_url):
-            url_pos = text.find(primary_non_x_url)
-            if url_pos != -1:
-                preserved = truncate_text_preserving_tail(text, url_pos, BSKY_TEXT_MAX_LENGTH)
-                if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
-                    logging.info("🔗 Preserving meaningful URL/tag tail in visible Bluesky text")
-                    return preserved
+        tail_start = find_tail_preservation_start(text, primary_non_x_url)
 
-        # For article-card-style posts, prefer removing the URL entirely from visible text.
-        if prefer_full_text_without_url:
+        if tail_start is not None:
+            preserved = truncate_text_preserving_tail(text, tail_start, BSKY_TEXT_MAX_LENGTH)
+            if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
+                logging.info("🔗 Preserving meaningful ending block with URL/hashtags in visible Bluesky text")
+                return preserved
+
+        if prefer_full_text_without_url and not looks_like_url_and_tag_tail(text, primary_non_x_url):
             text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
             if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
                 logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")