fix(rss): try full title+link posts before truncation and add production-safe fallback strategy

This commit is contained in:
Guillem Hernandez Sola
2026-04-09 18:40:58 +02:00
parent e8b727c942
commit e2051baffe

View File

@@ -15,7 +15,13 @@ import html
from urllib.parse import urlparse from urllib.parse import urlparse
from atproto import Client, client_utils, models from atproto import Client, client_utils, models
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from PIL import Image
try:
from PIL import Image
PIL_AVAILABLE = True
except ImportError:
Image = None
PIL_AVAILABLE = False
# --- Configuration --- # --- Configuration ---
STATE_PATH = "rss2bsky_state.json" STATE_PATH = "rss2bsky_state.json"
@@ -33,6 +39,7 @@ BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
HTTP_TIMEOUT = 20 HTTP_TIMEOUT = 20
POST_RETRY_DELAY_SECONDS = 2
# --- Logging --- # --- Logging ---
logging.basicConfig( logging.basicConfig(
@@ -41,13 +48,15 @@ logging.basicConfig(
stream=sys.stdout stream=sys.stdout
) )
if not PIL_AVAILABLE:
logging.warning("Pillow is not installed. External card thumbnail compression is disabled.")
# --- Encoding / text helpers --- # --- Encoding / text helpers ---
def fix_encoding(text):
    """
    Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The string is re-encoded as Latin-1 and decoded again as UTF-8. When
    either step fails the input is assumed to be fine already and is
    returned untouched.

    Args:
        text: The string to repair. Non-string values (for example None
            for a missing field) are returned unchanged.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    # Only str has the encode/decode round trip used below; anything else
    # would raise AttributeError, which the except clause does not cover.
    if not isinstance(text, str):
        return text

    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Expected for text that is already correct: either it contains
        # characters outside Latin-1, or its Latin-1 bytes are not valid
        # UTF-8. Both mean there is nothing to fix.
        return text
@@ -118,37 +127,53 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
return truncated + "..." return truncated + "..."
def build_post_text_variants(title_text, link):
    """
    Return candidate post texts for one feed entry, best first.

    Order of the returned list (duplicates and empty texts are dropped):
    1. Full title + blank line + URL
    2. Title shortened to fit the length budget + blank line + URL
    3. Full title only
    4. Title truncated by truncate_text_safely
    5. URL only

    Variants 1 and 3 are not length-checked here; the caller is expected
    to fall through to the next variant if one is rejected.
    """
    title = clean_whitespace(title_text)
    url = canonicalize_url(link) or link or ""

    ordered = []
    emitted = set()

    def _offer(raw_text):
        # NOTE(review): every variant goes through clean_whitespace, also
        # the ones containing "\n\n" - confirm it keeps the blank line.
        text = clean_whitespace(raw_text)
        if not text or text in emitted:
            return
        emitted.add(text)
        ordered.append(text)

    if title and url:
        _offer(f"{title}\n\n{url}")

        # Characters left for the title once the URL and the two newline
        # characters have been reserved.
        budget = BSKY_TEXT_MAX_LENGTH - (len(url) + 2)
        if budget > 10:
            short_title = title
            if len(title) > budget:
                head = title[:budget - 3]
                cut = head.rfind(" ")
                # Prefer cutting at the last word boundary when there is one.
                short_title = (head[:cut] if cut > 0 else head) + "..."
            _offer(f"{short_title}\n\n{url}")

    if title:
        _offer(title)
        _offer(truncate_text_safely(title))

    if url:
        _offer(url)

    return ordered
# --- URL / duplicate helpers --- # --- URL / duplicate helpers ---
@@ -261,7 +286,7 @@ def prune_state(state, max_entries=5000):
return state return state
def remember_posted_entry(state, candidate, bsky_uri=None): def remember_posted_entry(state, candidate, posted_text, bsky_uri=None):
canonical_link = candidate.get("canonical_link") canonical_link = candidate.get("canonical_link")
fallback_key = f"fp:{candidate['entry_fingerprint']}" fallback_key = f"fp:{candidate['entry_fingerprint']}"
state_key = canonical_link or fallback_key state_key = canonical_link or fallback_key
@@ -271,7 +296,7 @@ def remember_posted_entry(state, candidate, bsky_uri=None):
"title_text": candidate["title_text"], "title_text": candidate["title_text"],
"normalized_title": candidate["normalized_title"], "normalized_title": candidate["normalized_title"],
"entry_fingerprint": candidate["entry_fingerprint"], "entry_fingerprint": candidate["entry_fingerprint"],
"post_text": candidate["post_text"], "post_text": posted_text,
"published_at": candidate.get("published_at"), "published_at": candidate.get("published_at"),
"bsky_uri": bsky_uri, "bsky_uri": bsky_uri,
"posted_at": arrow.utcnow().isoformat(), "posted_at": arrow.utcnow().isoformat(),
@@ -371,7 +396,6 @@ def get_recent_bsky_posts(client, handle, limit=DEDUPE_BSKY_LIMIT):
def candidate_matches_existing_bsky(candidate, recent_bsky_posts): def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
candidate_link = candidate["canonical_link"] candidate_link = candidate["canonical_link"]
candidate_post_text_normalized = normalize_text(candidate["post_text"])
candidate_title_normalized = candidate["normalized_title"] candidate_title_normalized = candidate["normalized_title"]
for existing in recent_bsky_posts: for existing in recent_bsky_posts:
@@ -381,11 +405,8 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
if candidate_link and candidate_link in existing_urls: if candidate_link and candidate_link in existing_urls:
return True, "bsky:canonical_link" return True, "bsky:canonical_link"
if candidate_post_text_normalized == existing_text_normalized:
return True, "bsky:normalized_post_text"
if candidate_title_normalized and candidate_title_normalized in existing_text_normalized: if candidate_title_normalized and candidate_title_normalized in existing_text_normalized:
if candidate_link and candidate_link in existing_urls: if not candidate_link or candidate_link in existing_urls:
return True, "bsky:title_plus_link" return True, "bsky:title_plus_link"
return False, None return False, None
@@ -523,6 +544,9 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES): def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
if not PIL_AVAILABLE:
return None
try: try:
with Image.open(io.BytesIO(image_bytes)) as img: with Image.open(io.BytesIO(image_bytes)) as img:
img = img.convert("RGB") img = img.convert("RGB")
@@ -534,7 +558,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim
new_size = (max(1, int(width * scale)), max(1, int(height * scale))) new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
img = img.resize(new_size, Image.LANCZOS) img = img.resize(new_size, Image.LANCZOS)
logging.info(f"Resized external thumb to {new_size[0]}x{new_size[1]}")
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
out = io.BytesIO() out = io.BytesIO()
@@ -676,7 +699,6 @@ def build_candidates_from_feed(feed):
logging.info("Skipping feed item with no usable title and no link.") logging.info("Skipping feed item with no usable title and no link.")
continue continue
post_text = build_post_text(title_text, link or "")
normalized_title = normalize_text(title_text) normalized_title = normalize_text(title_text)
entry_fingerprint = build_entry_fingerprint(normalized_title, link) entry_fingerprint = build_entry_fingerprint(normalized_title, link)
@@ -687,18 +709,54 @@ def build_candidates_from_feed(feed):
"canonical_link": link, "canonical_link": link,
"published_at": published_at.isoformat() if published_at else None, "published_at": published_at.isoformat() if published_at else None,
"published_arrow": published_at, "published_arrow": published_at,
"post_text": post_text,
"entry_fingerprint": entry_fingerprint, "entry_fingerprint": entry_fingerprint,
"post_text_variants": build_post_text_variants(title_text, link),
}) })
except Exception as e: except Exception as e:
logging.warning(f"Failed to prepare feed entry candidate: {e}") logging.warning(f"Failed to prepare feed entry candidate: {e}")
# Sort oldest to newest so posts appear in order
candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0)) candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0))
return candidates return candidates
# --- Posting helpers ---
def is_probable_length_error(exc):
    """
    Guess whether a failed post was rejected because its text is too long.

    The decision is a case-insensitive substring search over ``repr(exc)``.

    Args:
        exc: The exception raised while sending the post.

    Returns:
        True when the exception text contains one of the known
        "text too long" markers, False otherwise.
    """
    # A bare "length" marker is intentionally not listed: it also matches
    # unrelated text such as a "Content-Length" header echoed inside the
    # repr of an HTTP error, which would classify network or auth failures
    # as length problems.
    signals = (
        "TextTooLong",
        "text too long",
        "Invalid app.bsky.feed.post record",
        "string too long",
        "maxLength",
        # NOTE(review): believed to be the wording of the server-side
        # record validation message for over-long text - confirm.
        "must not be longer than",
    )
    haystack = repr(exc).lower()
    return any(signal.lower() in haystack for signal in signals)
def try_send_post_with_variants(client, text_variants, embed, post_lang):
    """
    Send a post, falling back through the text variants in order.

    Each variant is turned into rich text with make_rich and sent with
    client.send_post. A failure that is_probable_length_error classifies as
    a length problem moves on to the next variant; any other failure is
    re-raised immediately.

    Returns:
        A (result, variant) tuple: the value returned by client.send_post
        and the text variant that was accepted.

    Raises:
        Exception: the first non-length error, or the last length error
            when every variant was rejected.
        RuntimeError: when text_variants yields nothing to try.
    """
    failure = None
    position = 0

    for candidate_text in text_variants:
        position += 1
        try:
            logging.info(f"Trying post text variant {position}/{len(text_variants)} (length={len(candidate_text)})")
            sent = client.send_post(
                text=make_rich(candidate_text),
                embed=embed,
                langs=[post_lang],
            )
        except Exception as err:
            failure = err
            logging.warning(f"Post variant {position} failed: {repr(err)}")
            if is_probable_length_error(err):
                # Too long: a shorter variant may still be accepted.
                continue
            raise
        else:
            return sent, candidate_text

    if not failure:
        raise RuntimeError("No text variants available to post.")
    raise failure
# --- Main --- # --- Main ---
def main(): def main():
parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.") parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.")
@@ -788,12 +846,10 @@ def main():
for candidate in entries_to_post: for candidate in entries_to_post:
title_text = candidate["title_text"] title_text = candidate["title_text"]
canonical_link = candidate["canonical_link"] canonical_link = candidate["canonical_link"]
post_text = candidate["post_text"] text_variants = candidate["post_text_variants"]
logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}") logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}")
rich_text = make_rich(post_text)
embed = None embed = None
if canonical_link: if canonical_link:
embed = build_external_link_embed( embed = build_external_link_embed(
@@ -804,22 +860,23 @@ def main():
) )
try: try:
post_result = client.send_post( post_result, posted_text = try_send_post_with_variants(
text=rich_text, client=client,
text_variants=text_variants,
embed=embed, embed=embed,
langs=[post_lang] post_lang=post_lang
) )
bsky_uri = getattr(post_result, "uri", None) bsky_uri = getattr(post_result, "uri", None)
remember_posted_entry(state, candidate, bsky_uri=bsky_uri) remember_posted_entry(state, candidate, posted_text=posted_text, bsky_uri=bsky_uri)
state = prune_state(state, max_entries=5000) state = prune_state(state, max_entries=5000)
save_state(state, state_path) save_state(state, state_path)
recent_bsky_posts.insert(0, { recent_bsky_posts.insert(0, {
"uri": bsky_uri, "uri": bsky_uri,
"text": post_text, "text": posted_text,
"normalized_text": normalize_text(post_text), "normalized_text": normalize_text(posted_text),
"canonical_non_x_urls": {canonical_link} if canonical_link else set(), "canonical_non_x_urls": {canonical_link} if canonical_link else set(),
"created_at": arrow.utcnow().isoformat(), "created_at": arrow.utcnow().isoformat(),
}) })
@@ -827,7 +884,7 @@ def main():
noves_entrades += 1 noves_entrades += 1
logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}") logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}")
time.sleep(2) time.sleep(POST_RETRY_DELAY_SECONDS)
except Exception: except Exception:
logging.exception(f"Failed to post RSS entry {canonical_link or title_text}") logging.exception(f"Failed to post RSS entry {canonical_link or title_text}")