fix(sync): preserve full text for long link posts and use expanded URL only for Bluesky external cards
This commit is contained in:
@@ -289,7 +289,7 @@ def canonicalize_tweet_url(url):
|
|||||||
def is_x_or_twitter_domain(url):
    """
    Return True when the hostname of *url* is an X/Twitter host or the t.co shortener.

    The hostname is extracted with ``urlparse`` and lower-cased, then compared
    by exact match against a fixed set of hosts. Subdomains that are not
    listed explicitly do not match.

    Args:
        url: The URL string to inspect.

    Returns:
        True if the hostname is one of the listed hosts, otherwise False.
        Any exception raised while parsing yields False rather than
        propagating.
    """
    try:
        # `hostname` is None when the URL has no network location; fall back
        # to "" so the membership test simply fails.
        hostname = (urlparse(url).hostname or "").lower()
        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com", "t.co"}
    except Exception:
        # Deliberate best-effort: a URL that cannot be parsed is treated as
        # "not an X/Twitter domain".
        return False
|
||||||
|
|
||||||
@@ -326,6 +326,34 @@ def extract_ordered_non_x_urls(text):
|
|||||||
return ordered
|
return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def remove_url_from_visible_text(text, url_to_remove):
    """
    Remove a specific URL from visible text while preserving paragraph structure as much as possible.

    The text is processed line by line. On each line, every URL reported by
    ``extract_urls_from_text`` is compared to the target after
    ``strip_trailing_url_punctuation`` and ``canonicalize_url`` are applied.
    Matching URLs are deleted from the line and the line is stripped.
    Afterwards runs of spaces/tabs are collapsed to a single space, three or
    more consecutive newlines are reduced to two, and the result is stripped.

    Args:
        text: The visible post text.
        url_to_remove: The URL whose occurrences should be removed.

    Returns:
        The cleaned text. If ``text`` or ``url_to_remove`` is falsy, ``text``
        is returned unchanged (including ``None`` or ``""``).
    """
    if not text or not url_to_remove:
        return text

    canonical_target = canonicalize_url(url_to_remove)
    cleaned_lines = []

    for line in text.splitlines():
        new_line = line

        for url in extract_urls_from_text(line):
            if canonicalize_url(strip_trailing_url_punctuation(url)) == canonical_target:
                # The raw extracted token is what gets removed, not its
                # punctuation-stripped form.
                # NOTE(review): presumably the extractor can return trailing
                # punctuation attached to the URL; confirm against
                # extract_urls_from_text.
                new_line = new_line.replace(url, "").strip()

        cleaned_lines.append(new_line)

    result = "\n".join(cleaned_lines)
    # Collapse horizontal whitespace left behind where a URL was cut out.
    result = re.sub(r"[ \t]+", " ", result)
    # A removed URL line can leave a run of blank lines; cap at one blank line.
    result = re.sub(r"\n{3,}", "\n\n", result).strip()

    return result
|
||||||
|
|
||||||
|
|
||||||
def looks_like_title_plus_url_post(text):
|
def looks_like_title_plus_url_post(text):
|
||||||
if not text:
|
if not text:
|
||||||
return False
|
return False
|
||||||
@@ -354,7 +382,16 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
return truncated + "..."
|
return truncated + "..."
|
||||||
|
|
||||||
|
|
||||||
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True):
|
||||||
|
"""
|
||||||
|
Choose the final visible Bluesky text.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- If full text fits, keep it exactly.
|
||||||
|
- If it doesn't fit and there is a long external URL:
|
||||||
|
- prefer full text WITHOUT the URL if that fits
|
||||||
|
- otherwise fall back to truncation
|
||||||
|
"""
|
||||||
text = (full_clean_text or "").strip()
|
text = (full_clean_text or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
@@ -362,38 +399,11 @@ def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
|||||||
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
if keep_url:
|
if primary_non_x_url and prefer_full_text_without_url:
|
||||||
canonical_keep = canonicalize_url(keep_url)
|
text_without_url = remove_url_from_visible_text(text, primary_non_x_url).strip()
|
||||||
urls = extract_ordered_non_x_urls(text)
|
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
|
logging.info("🔗 Keeping full visible text by removing long external URL from body and using external card")
|
||||||
matched_url = None
|
return text_without_url
|
||||||
for url in urls:
|
|
||||||
if canonicalize_url(url) == canonical_keep:
|
|
||||||
matched_url = url
|
|
||||||
break
|
|
||||||
|
|
||||||
if matched_url and matched_url in text:
|
|
||||||
idx = text.find(matched_url)
|
|
||||||
prefix = text[:idx].rstrip()
|
|
||||||
suffix = matched_url
|
|
||||||
|
|
||||||
reserve = len(suffix) + 1
|
|
||||||
available = BSKY_TEXT_MAX_LENGTH - reserve
|
|
||||||
|
|
||||||
if available > 10:
|
|
||||||
trimmed_prefix = prefix
|
|
||||||
if len(trimmed_prefix) > available:
|
|
||||||
trimmed_prefix = trimmed_prefix[:available - 3]
|
|
||||||
last_space = trimmed_prefix.rfind(" ")
|
|
||||||
if last_space > 0:
|
|
||||||
trimmed_prefix = trimmed_prefix[:last_space] + "..."
|
|
||||||
else:
|
|
||||||
trimmed_prefix = trimmed_prefix + "..."
|
|
||||||
|
|
||||||
final_text = f"{trimmed_prefix.rstrip()} {suffix}".strip()
|
|
||||||
if len(final_text) <= BSKY_TEXT_MAX_LENGTH:
|
|
||||||
logging.info("🔗 Preserved non-X URL in final Bluesky text for card generation")
|
|
||||||
return final_text
|
|
||||||
|
|
||||||
return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
|
return truncate_text_safely(text, BSKY_TEXT_MAX_LENGTH)
|
||||||
|
|
||||||
@@ -780,10 +790,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
|
|
||||||
|
|
||||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||||
"""
|
|
||||||
Compress/resize normal tweet images so they fit within Bluesky image blob limits.
|
|
||||||
Returns JPEG bytes or None.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||||
img = img.convert("RGB")
|
img = img.convert("RGB")
|
||||||
@@ -1606,14 +1612,26 @@ def sync_feeds(args):
|
|||||||
canonical_non_x_urls = set(ordered_non_x_urls)
|
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||||
primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
|
primary_non_x_url = ordered_non_x_urls[0] if ordered_non_x_urls else None
|
||||||
|
|
||||||
raw_text = prepare_post_text_for_bsky(full_clean_text, keep_url=primary_non_x_url)
|
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||||
|
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||||
|
|
||||||
|
# Link-only/text-only posts with external cards get special visible text handling.
|
||||||
|
if primary_non_x_url and not has_video and not has_photo:
|
||||||
|
raw_text = choose_final_visible_text(
|
||||||
|
full_clean_text,
|
||||||
|
primary_non_x_url=primary_non_x_url,
|
||||||
|
prefer_full_text_without_url=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raw_text = choose_final_visible_text(
|
||||||
|
full_clean_text,
|
||||||
|
primary_non_x_url=None,
|
||||||
|
prefer_full_text_without_url=False,
|
||||||
|
)
|
||||||
|
|
||||||
media_fingerprint = build_media_fingerprint(tweet)
|
media_fingerprint = build_media_fingerprint(tweet)
|
||||||
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
||||||
|
|
||||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
|
||||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
|
||||||
|
|
||||||
candidate_tweets.append({
|
candidate_tweets.append({
|
||||||
"tweet": tweet,
|
"tweet": tweet,
|
||||||
"tweet_time": tweet_time,
|
"tweet_time": tweet_time,
|
||||||
|
|||||||
Reference in New Issue
Block a user