fix(sync): preserve full ending clause before external url and hashtags in long tweet text
This commit is contained in:
@@ -368,6 +368,64 @@ def looks_like_title_plus_url_post(text):
|
|||||||
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
||||||
|
|
||||||
|
|
||||||
|
def find_tail_preservation_start(text, primary_non_x_url):
    """Return the index in *text* where the ending block to preserve begins.

    The ending block is the stretch of text leading up to (and continuing
    past) the first occurrence of *primary_non_x_url*. Candidate start
    positions are collected from the text that precedes the URL:

    * the position just after a clause terminator (``. : ; ! ? ,``) that is
      followed by whitespace;
    * the position just after the last line break;
    * when a hashtag follows the URL, the first non-whitespace position after
      a space/newline found at or before 120 characters ahead of the URL.

    Candidates more than 180 characters before the URL are discarded. The
    candidate closest to the URL wins, unless it lies within 35 characters of
    the URL and a farther candidate exists; then the closest candidate that is
    at least 35 characters away is used instead.

    Args:
        text: Full text to inspect.
        primary_non_x_url: URL substring to locate inside *text*.

    Returns:
        ``None`` if either argument is empty or the URL does not occur in
        *text*. Otherwise an index ``i`` with ``0 <= i <= url position``; the
        URL position itself is returned when no usable boundary exists.
    """
    if not text or not primary_non_x_url:
        return None

    url_pos = text.find(primary_non_x_url)
    if url_pos == -1:
        return None

    max_lookback = 180       # boundaries farther than this from the URL are ignored
    min_distance = 35        # a boundary closer than this is considered "too close"
    generous_lookback = 120  # how far back to reach when hashtags follow the URL
    boundary_whitespace = " \n"

    before = text[:url_pos]

    # Clause boundaries before the URL: each candidate is the index of the
    # first character after "<punctuation><whitespace...>".
    clause_patterns = (
        r"\.\s+",
        r":\s+",
        r";\s+",
        r"!\s+",
        r"\?\s+",
        r",\s+",
    )
    candidates = {
        match.end()
        for pattern in clause_patterns
        for match in re.finditer(pattern, before)
    }

    # Start of the line that contains the URL, if there is a previous line.
    last_newline = before.rfind("\n")
    if last_newline != -1:
        candidates.add(last_newline + 1)

    # If hashtags follow the URL, offer a more generous block before it.
    if re.search(r"\s#[^\s#]+", text[url_pos:]) is not None:
        generous_start = max(0, url_pos - generous_lookback)
        # Walk back to a word boundary so the block never starts mid-word.
        while generous_start > 0 and text[generous_start] not in boundary_whitespace:
            generous_start -= 1
        # Step past the boundary whitespace so this candidate, like the
        # others, points at the first character of the preserved block.
        while generous_start < url_pos and text[generous_start] in boundary_whitespace:
            generous_start += 1
        candidates.add(generous_start)

    # Keep only boundaries strictly before the URL and within the lookback window.
    usable = [
        c for c in candidates
        if c < url_pos and url_pos - c <= max_lookback
    ]
    if not usable:
        return url_pos

    # The largest index is the boundary closest to the URL.
    nearest = max(usable)
    if url_pos - nearest >= min_distance:
        return nearest

    # Nearest boundary is too close: fall back to the closest one that still
    # leaves at least `min_distance` characters, if any.
    roomy = [c for c in usable if url_pos - c >= min_distance]
    return max(roomy) if roomy else nearest
|
||||||
|
|
||||||
|
|
||||||
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
||||||
if not text or not primary_non_x_url:
|
if not text or not primary_non_x_url:
|
||||||
return False
|
return False
|
||||||
@@ -381,17 +439,9 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
|||||||
if not tail.startswith(("http://", "https://")):
|
if not tail.startswith(("http://", "https://")):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# URL followed by optional hashtags / trailing words is a meaningful tail we should try to preserve.
|
|
||||||
if re.search(r"https?://\S+.*#[^\s#]+", tail):
|
if re.search(r"https?://\S+.*#[^\s#]+", tail):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Also treat URL preceded by meaningful text ending as important if the URL is not on its own line.
|
|
||||||
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
|
||||||
if lines:
|
|
||||||
for line in lines:
|
|
||||||
if primary_non_x_url in line and not line.startswith(("http://", "https://")):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@@ -417,10 +467,16 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
|||||||
if not tail:
|
if not tail:
|
||||||
return truncate_text_safely(text, max_length)
|
return truncate_text_safely(text, max_length)
|
||||||
|
|
||||||
# Need room for "... " + tail
|
|
||||||
reserve = len(tail) + 4
|
reserve = len(tail) + 4
|
||||||
if reserve >= max_length:
|
if reserve >= max_length:
|
||||||
return truncate_text_safely(text, max_length)
|
# Tail too large; keep the tail itself and trim from its front carefully.
|
||||||
|
shortened_tail = tail
|
||||||
|
if len(shortened_tail) > max_length - 3:
|
||||||
|
shortened_tail = shortened_tail[-(max_length - 3):]
|
||||||
|
first_space = shortened_tail.find(" ")
|
||||||
|
if first_space > 0 and first_space < 40:
|
||||||
|
shortened_tail = shortened_tail[first_space + 1:]
|
||||||
|
return "..." + shortened_tail[-(max_length - 3):]
|
||||||
|
|
||||||
available_prefix = max_length - reserve
|
available_prefix = max_length - reserve
|
||||||
prefix = text[:tail_start].rstrip()
|
prefix = text[:tail_start].rstrip()
|
||||||
@@ -447,17 +503,15 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
if primary_non_x_url:
|
if primary_non_x_url:
|
||||||
# If the URL and hashtag tail are semantically important, preserve the tail first.
|
tail_start = find_tail_preservation_start(text, primary_non_x_url)
|
||||||
if looks_like_url_and_tag_tail(text, primary_non_x_url):
|
|
||||||
url_pos = text.find(primary_non_x_url)
|
if tail_start is not None:
|
||||||
if url_pos != -1:
|
preserved = truncate_text_preserving_tail(text, tail_start, BSKY_TEXT_MAX_LENGTH)
|
||||||
preserved = truncate_text_preserving_tail(text, url_pos, BSKY_TEXT_MAX_LENGTH)
|
|
||||||
if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
|
if preserved and len(preserved) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
logging.info("🔗 Preserving meaningful URL/tag tail in visible Bluesky text")
|
logging.info("🔗 Preserving meaningful ending block with URL/hashtags in visible Bluesky text")
|
||||||
return preserved
|
return preserved
|
||||||
|
|
||||||
# For article-card-style posts, prefer removing the URL entirely from visible text.
|
if prefer_full_text_without_url and not looks_like_url_and_tag_tail(text, primary_non_x_url):
|
||||||
if prefer_full_text_without_url:
|
|
||||||
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
|
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
|
||||||
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
||||||
|
|||||||
Reference in New Issue
Block a user