fix(sync): preserve exact original tweet text, visible links, and hashtags when post fits Bluesky
This commit is contained in:
@@ -297,8 +297,9 @@ def is_x_or_twitter_domain(url):
|
|||||||
def extract_urls_from_text(text):
    """Return every http(s) URL substring found in *text*.

    The text is first passed through ``repair_broken_urls`` and the scan runs
    on the repaired result, so returned URLs may differ from the raw input
    wherever that helper changed it.

    Args:
        text: Text to scan. Any falsy value (``None``, ``""``) is accepted.

    Returns:
        A list of matched URL strings in order of appearance; an empty list
        when *text* is falsy.
    """
    if text:
        # A match runs until the first whitespace character or '#'.
        # NOTE(review): stopping at '#' also cuts off genuine URL fragments
        # ("https://host/page#section" -> "https://host/page") -- confirm
        # this is acceptable for callers that build links from the result.
        return re.findall(r"https?://[^\s#]+", repair_broken_urls(text))
    return []
|
||||||
|
|
||||||
|
|
||||||
def extract_non_x_urls_from_text(text):
|
def extract_non_x_urls_from_text(text):
|
||||||
@@ -326,6 +327,14 @@ def extract_ordered_non_x_urls(text):
|
|||||||
return ordered
|
return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def extract_first_visible_non_x_url(text):
    """Return the first URL in *text* that canonicalizes to a truthy value.

    Candidates come from ``extract_non_x_urls_from_text`` (``None`` is treated
    as an empty string) and are tried in the order that helper returns them.
    Each one is passed through ``canonicalize_url``; the first truthy result
    wins.

    Args:
        text: Text to scan; may be ``None`` or empty.

    Returns:
        The canonicalized URL, or ``None`` when no candidate yields a truthy
        canonical form.
    """
    canonical_forms = (
        canonicalize_url(candidate)
        for candidate in extract_non_x_urls_from_text(text or "")
    )
    # Lazy evaluation: canonicalization stops at the first usable URL.
    return next((form for form in canonical_forms if form), None)
|
||||||
|
|
||||||
|
|
||||||
def remove_url_from_visible_text(text, url_to_remove):
|
def remove_url_from_visible_text(text, url_to_remove):
|
||||||
if not text or not url_to_remove:
|
if not text or not url_to_remove:
|
||||||
return text
|
return text
|
||||||
@@ -339,9 +348,12 @@ def remove_url_from_visible_text(text, url_to_remove):
|
|||||||
new_line = line
|
new_line = line
|
||||||
|
|
||||||
for url in line_urls:
|
for url in line_urls:
|
||||||
if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target:
|
cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(url))
|
||||||
new_line = new_line.replace(url, "").strip()
|
if cleaned_candidate == canonical_target:
|
||||||
|
pattern = re.escape(url)
|
||||||
|
new_line = re.sub(pattern, "", new_line)
|
||||||
|
|
||||||
|
new_line = re.sub(r"[ \t]+", " ", new_line).strip()
|
||||||
cleaned_lines.append(new_line)
|
cleaned_lines.append(new_line)
|
||||||
|
|
||||||
result = "\n".join(cleaned_lines)
|
result = "\n".join(cleaned_lines)
|
||||||
@@ -368,64 +380,6 @@ def looks_like_title_plus_url_post(text):
|
|||||||
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
|
||||||
|
|
||||||
|
|
||||||
def find_tail_preservation_start(text, primary_non_x_url):
|
|
||||||
if not text or not primary_non_x_url:
|
|
||||||
return None
|
|
||||||
|
|
||||||
url_pos = text.find(primary_non_x_url)
|
|
||||||
if url_pos == -1:
|
|
||||||
return None
|
|
||||||
|
|
||||||
hashtag_match = re.search(r"\s#[^\s#]+", text[url_pos:])
|
|
||||||
has_hashtag_after_url = hashtag_match is not None
|
|
||||||
|
|
||||||
candidates = [url_pos]
|
|
||||||
|
|
||||||
# Prefer clause boundaries before the URL.
|
|
||||||
clause_patterns = [
|
|
||||||
r"\.\s+",
|
|
||||||
r":\s+",
|
|
||||||
r";\s+",
|
|
||||||
r"!\s+",
|
|
||||||
r"\?\s+",
|
|
||||||
r",\s+",
|
|
||||||
]
|
|
||||||
|
|
||||||
before = text[:url_pos]
|
|
||||||
for pattern in clause_patterns:
|
|
||||||
for match in re.finditer(pattern, before):
|
|
||||||
candidates.append(match.end())
|
|
||||||
|
|
||||||
# Prefer previous line break if present.
|
|
||||||
last_newline = before.rfind("\n")
|
|
||||||
if last_newline != -1:
|
|
||||||
candidates.append(last_newline + 1)
|
|
||||||
|
|
||||||
# If there are hashtags after the URL, preserve a more generous block before it.
|
|
||||||
if has_hashtag_after_url:
|
|
||||||
generous_start = max(0, url_pos - 120)
|
|
||||||
while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
|
|
||||||
generous_start -= 1
|
|
||||||
candidates.append(generous_start)
|
|
||||||
|
|
||||||
# Choose the closest reasonable boundary before the URL, but not too close.
|
|
||||||
reasonable_candidates = [
|
|
||||||
c for c in candidates
|
|
||||||
if 0 <= c < url_pos and (url_pos - c) <= 180
|
|
||||||
]
|
|
||||||
|
|
||||||
if reasonable_candidates:
|
|
||||||
start = min(reasonable_candidates, key=lambda c: (url_pos - c))
|
|
||||||
# If the nearest boundary is too close, fall back to a slightly earlier one.
|
|
||||||
if url_pos - start < 35:
|
|
||||||
farther = [c for c in reasonable_candidates if url_pos - c >= 35]
|
|
||||||
if farther:
|
|
||||||
start = min(farther, key=lambda c: (url_pos - c))
|
|
||||||
return start
|
|
||||||
|
|
||||||
return url_pos
|
|
||||||
|
|
||||||
|
|
||||||
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
||||||
if not text or not primary_non_x_url:
|
if not text or not primary_non_x_url:
|
||||||
return False
|
return False
|
||||||
@@ -445,6 +399,59 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def find_tail_preservation_start(text, primary_non_x_url):
    """Choose the index in *text* where the tail to be preserved begins.

    The tail always contains ``primary_non_x_url``. The function looks for a
    natural boundary shortly before the URL so that the preserved tail starts
    at the beginning of a clause, line, or word instead of at the bare URL.

    Boundary candidates, all located before the URL:
      * the position just after a ``. : ; ! ? ,`` followed by whitespace;
      * the position just after the last newline;
      * when a hashtag follows the URL, a word start roughly 120 characters
        before the URL.

    Only candidates at most 180 characters before the URL are considered.
    The one closest to the URL wins, unless it is fewer than 35 characters
    away and a candidate at least 35 characters away exists; then the
    closest of those farther candidates is used.

    Args:
        text: Full post text.
        primary_non_x_url: URL expected to occur verbatim in *text*.

    Returns:
        ``None`` if either argument is falsy or the URL does not occur in
        *text*; otherwise an index in ``[0, url_pos]``. The URL position
        itself is returned when no suitable earlier boundary exists.
    """
    if not text or not primary_non_x_url:
        return None

    url_pos = text.find(primary_non_x_url)
    if url_pos == -1:
        return None

    before = text[:url_pos]

    # One scan with a character class finds the same boundaries as scanning
    # separately for each punctuation mark: every match is one punctuation
    # character plus the whitespace run after it, so matches never overlap.
    candidates = [m.end() for m in re.finditer(r"[.:;!?,]\s+", before)]

    last_newline = before.rfind("\n")
    if last_newline != -1:
        candidates.append(last_newline + 1)

    # Hashtags after the URL: offer a more generous block before the URL.
    if re.search(r"\s#[^\s#]+", text[url_pos:]) is not None:
        generous_start = max(0, url_pos - 120)
        # Walk back to the nearest space/newline so no word is cut in half.
        while generous_start > 0 and text[generous_start] not in (" ", "\n"):
            generous_start -= 1
        # Step past that whitespace so the boundary points at the first
        # character of a word, like every other candidate does.
        while generous_start < url_pos and text[generous_start] in (" ", "\n"):
            generous_start += 1
        candidates.append(generous_start)

    reasonable = [
        c for c in candidates
        if 0 <= c < url_pos and url_pos - c <= 180
    ]
    if not reasonable:
        return url_pos

    # The largest index is the boundary closest to the URL.
    start = max(reasonable)
    if url_pos - start < 35:
        # Too close to the URL to carry useful context; prefer the nearest
        # boundary that keeps at least 35 characters, if any.
        farther = [c for c in reasonable if url_pos - c >= 35]
        if farther:
            start = max(farther)
    return start
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
if len(text) <= max_length:
|
if len(text) <= max_length:
|
||||||
return text
|
return text
|
||||||
@@ -469,14 +476,13 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
|||||||
|
|
||||||
reserve = len(tail) + 4
|
reserve = len(tail) + 4
|
||||||
if reserve >= max_length:
|
if reserve >= max_length:
|
||||||
# Tail too large; keep the tail itself and trim from its front carefully.
|
shortened_tail = tail[-(max_length - 3):].strip()
|
||||||
shortened_tail = tail
|
|
||||||
if len(shortened_tail) > max_length - 3:
|
|
||||||
shortened_tail = shortened_tail[-(max_length - 3):]
|
|
||||||
first_space = shortened_tail.find(" ")
|
first_space = shortened_tail.find(" ")
|
||||||
if first_space > 0 and first_space < 40:
|
if 0 <= first_space <= 30:
|
||||||
shortened_tail = shortened_tail[first_space + 1:]
|
shortened_tail = shortened_tail[first_space + 1:].strip()
|
||||||
return "..." + shortened_tail[-(max_length - 3):]
|
|
||||||
|
return f"...{shortened_tail}"
|
||||||
|
|
||||||
available_prefix = max_length - reserve
|
available_prefix = max_length - reserve
|
||||||
prefix = text[:tail_start].rstrip()
|
prefix = text[:tail_start].rstrip()
|
||||||
@@ -485,9 +491,12 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
|||||||
prefix = prefix[:available_prefix].rstrip()
|
prefix = prefix[:available_prefix].rstrip()
|
||||||
last_space = prefix.rfind(" ")
|
last_space = prefix.rfind(" ")
|
||||||
if last_space > 20:
|
if last_space > 20:
|
||||||
prefix = prefix[:last_space]
|
prefix = prefix[:last_space].rstrip()
|
||||||
|
|
||||||
final_text = f"{prefix}... {tail}".strip()
|
final_text = f"{prefix}... {tail}".strip()
|
||||||
|
final_text = re.sub(r"[ \t]+", " ", final_text)
|
||||||
|
final_text = re.sub(r"\n{3,}", "\n\n", final_text).strip()
|
||||||
|
|
||||||
if len(final_text) <= max_length:
|
if len(final_text) <= max_length:
|
||||||
return final_text
|
return final_text
|
||||||
|
|
||||||
@@ -495,11 +504,13 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
|||||||
|
|
||||||
|
|
||||||
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
|
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
|
||||||
text = (full_clean_text or "").strip()
|
text = clean_post_text(full_clean_text or "")
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
# Golden rule: preserve exact original cleaned tweet text if it fits.
|
||||||
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
|
logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
if primary_non_x_url:
|
if primary_non_x_url:
|
||||||
@@ -517,7 +528,9 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
|
|||||||
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
||||||
return text_without_url
|
return text_without_url
|
||||||
|
|
||||||
return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
|
truncated = truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
|
||||||
|
logging.info("✂️ Falling back to safe truncation for visible Bluesky text")
|
||||||
|
return truncated
|
||||||
|
|
||||||
|
|
||||||
def normalize_post_text(text):
|
def normalize_post_text(text):
|
||||||
@@ -1203,21 +1216,31 @@ def make_rich(content):
|
|||||||
text_builder.text(" ")
|
text_builder.text(" ")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if word.startswith("http://") or word.startswith("https://"):
|
cleaned_word = strip_trailing_url_punctuation(word)
|
||||||
if word.startswith("http://"):
|
|
||||||
word = word.replace("http://", "https://", 1)
|
|
||||||
|
|
||||||
word = strip_trailing_url_punctuation(word)
|
if cleaned_word.startswith("http://") or cleaned_word.startswith("https://"):
|
||||||
clean_url_value = clean_url(word)
|
if cleaned_word.startswith("http://"):
|
||||||
|
cleaned_word = cleaned_word.replace("http://", "https://", 1)
|
||||||
|
|
||||||
|
clean_url_value = clean_url(cleaned_word)
|
||||||
|
|
||||||
if clean_url_value and is_valid_url(clean_url_value):
|
if clean_url_value and is_valid_url(clean_url_value):
|
||||||
text_builder.link(clean_url_value, clean_url_value)
|
text_builder.link(clean_url_value, clean_url_value)
|
||||||
|
trailing = word[len(cleaned_word):]
|
||||||
|
if trailing:
|
||||||
|
text_builder.text(trailing)
|
||||||
else:
|
else:
|
||||||
text_builder.text(word)
|
text_builder.text(word)
|
||||||
|
|
||||||
elif word.startswith("#"):
|
elif cleaned_word.startswith("#") and len(cleaned_word) > 1:
|
||||||
clean_tag = word[1:].rstrip(".,;:!?)'\"…")
|
clean_tag = cleaned_word[1:].rstrip(".,;:!?)'\"…")
|
||||||
text_builder.tag(word, clean_tag)
|
if clean_tag:
|
||||||
|
text_builder.tag(cleaned_word, clean_tag)
|
||||||
|
trailing = word[len(cleaned_word):]
|
||||||
|
if trailing:
|
||||||
|
text_builder.text(trailing)
|
||||||
|
else:
|
||||||
|
text_builder.text(word)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
text_builder.text(word)
|
text_builder.text(word)
|
||||||
@@ -1722,7 +1745,10 @@ def sync_feeds(args):
|
|||||||
|
|
||||||
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
||||||
canonical_non_x_urls = set(ordered_non_x_urls)
|
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||||
primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
|
|
||||||
|
primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
|
||||||
|
if not primary_non_x_url and ordered_non_x_urls:
|
||||||
|
primary_non_x_url = ordered_non_x_urls[0]
|
||||||
|
|
||||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||||
|
|||||||
Reference in New Issue
Block a user