fix(rss): try full title+link posts before truncation and add production-safe fallback strategy

This commit is contained in:
Guillem Hernandez Sola
2026-04-09 18:40:58 +02:00
parent e8b727c942
commit e2051baffe

View File

@@ -15,7 +15,13 @@ import html
from urllib.parse import urlparse
from atproto import Client, client_utils, models
from bs4 import BeautifulSoup
try:
from PIL import Image
PIL_AVAILABLE = True
except ImportError:
Image = None
PIL_AVAILABLE = False
# --- Configuration ---
STATE_PATH = "rss2bsky_state.json"
@@ -33,6 +39,7 @@ BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
HTTP_TIMEOUT = 20
POST_RETRY_DELAY_SECONDS = 2
# --- Logging ---
logging.basicConfig(
@@ -41,13 +48,15 @@ logging.basicConfig(
stream=sys.stdout
)
if not PIL_AVAILABLE:
logging.warning("Pillow is not installed. External card thumbnail compression is disabled.")
# --- Encoding / text helpers ---
def fix_encoding(text):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The string is re-encoded as Latin-1 and decoded again as UTF-8. When that
    round trip is impossible (the text has characters outside Latin-1, or the
    resulting bytes are not valid UTF-8) the input is returned unchanged and a
    warning is logged.

    Args:
        text: The string to repair. Non-string values (for example ``None``)
            are returned unchanged instead of raising ``AttributeError``.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    if not isinstance(text, str):
        # Only str has the .encode() round trip used below; anything else
        # (None, bytes, ...) has nothing to repair here.
        return text

    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        logging.warning(f"Error correcting encoding: {text}")
        return text
@@ -118,27 +127,35 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
return truncated + "..."
def build_post_text(title_text, link):
def build_post_text_variants(title_text, link):
"""
For RSS posts we usually want 'title + newline + link'.
If it doesn't fit, prefer truncating the title while keeping the link visible.
Build text variants from best to worst.
Preferred:
1. Full title + blank line + real URL
Fallbacks:
2. Truncated title + blank line + real URL
3. Full title only
4. Truncated title only
"""
title_text = clean_whitespace(title_text)
link = canonicalize_url(link) or link
link = canonicalize_url(link) or link or ""
if not title_text:
return truncate_text_safely(link)
variants = []
seen = set()
combined = f"{title_text}\n{link}"
if len(combined) <= BSKY_TEXT_MAX_LENGTH:
return combined
def add_variant(text):
cleaned = clean_whitespace(text)
if cleaned and cleaned not in seen:
seen.add(cleaned)
variants.append(cleaned)
reserve = len(link) + 1
if title_text and link:
add_variant(f"{title_text}\n\n{link}")
reserve = len(link) + 2
available = BSKY_TEXT_MAX_LENGTH - reserve
if available <= 10:
return truncate_text_safely(combined)
if available > 10:
trimmed_title = title_text
if len(trimmed_title) > available:
trimmed_title = trimmed_title[:available - 3]
@@ -147,8 +164,16 @@ def build_post_text(title_text, link):
trimmed_title = trimmed_title[:last_space] + "..."
else:
trimmed_title = trimmed_title + "..."
add_variant(f"{trimmed_title}\n\n{link}")
return f"{trimmed_title}\n{link}"
if title_text:
add_variant(title_text)
add_variant(truncate_text_safely(title_text))
if link:
add_variant(link)
return variants
# --- URL / duplicate helpers ---
@@ -261,7 +286,7 @@ def prune_state(state, max_entries=5000):
return state
def remember_posted_entry(state, candidate, bsky_uri=None):
def remember_posted_entry(state, candidate, posted_text, bsky_uri=None):
canonical_link = candidate.get("canonical_link")
fallback_key = f"fp:{candidate['entry_fingerprint']}"
state_key = canonical_link or fallback_key
@@ -271,7 +296,7 @@ def remember_posted_entry(state, candidate, bsky_uri=None):
"title_text": candidate["title_text"],
"normalized_title": candidate["normalized_title"],
"entry_fingerprint": candidate["entry_fingerprint"],
"post_text": candidate["post_text"],
"post_text": posted_text,
"published_at": candidate.get("published_at"),
"bsky_uri": bsky_uri,
"posted_at": arrow.utcnow().isoformat(),
@@ -371,7 +396,6 @@ def get_recent_bsky_posts(client, handle, limit=DEDUPE_BSKY_LIMIT):
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
candidate_link = candidate["canonical_link"]
candidate_post_text_normalized = normalize_text(candidate["post_text"])
candidate_title_normalized = candidate["normalized_title"]
for existing in recent_bsky_posts:
@@ -381,11 +405,8 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
if candidate_link and candidate_link in existing_urls:
return True, "bsky:canonical_link"
if candidate_post_text_normalized == existing_text_normalized:
return True, "bsky:normalized_post_text"
if candidate_title_normalized and candidate_title_normalized in existing_text_normalized:
if candidate_link and candidate_link in existing_urls:
if not candidate_link or candidate_link in existing_urls:
return True, "bsky:title_plus_link"
return False, None
@@ -523,6 +544,9 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
if not PIL_AVAILABLE:
return None
try:
with Image.open(io.BytesIO(image_bytes)) as img:
img = img.convert("RGB")
@@ -534,7 +558,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim
new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
img = img.resize(new_size, Image.LANCZOS)
logging.info(f"Resized external thumb to {new_size[0]}x{new_size[1]}")
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
out = io.BytesIO()
@@ -676,7 +699,6 @@ def build_candidates_from_feed(feed):
logging.info("Skipping feed item with no usable title and no link.")
continue
post_text = build_post_text(title_text, link or "")
normalized_title = normalize_text(title_text)
entry_fingerprint = build_entry_fingerprint(normalized_title, link)
@@ -687,18 +709,54 @@ def build_candidates_from_feed(feed):
"canonical_link": link,
"published_at": published_at.isoformat() if published_at else None,
"published_arrow": published_at,
"post_text": post_text,
"entry_fingerprint": entry_fingerprint,
"post_text_variants": build_post_text_variants(title_text, link),
})
except Exception as e:
logging.warning(f"Failed to prepare feed entry candidate: {e}")
# Sort oldest to newest so posts appear in order
candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0))
return candidates
# --- Posting helpers ---
def is_probable_length_error(exc):
    """Heuristically decide whether *exc* reports that the post text is too long.

    The check is a case-insensitive substring search over ``repr(exc)``.

    Args:
        exc: The exception raised while sending a post.

    Returns:
        True if the exception text contains one of the known "too long"
        signals, False otherwise.
    """
    # An exception repr may embed HTTP response headers. Drop "content-length"
    # first so the generic "length" signal below does not match unrelated
    # request failures purely because of that header name.
    haystack = repr(exc).lower().replace("content-length", "")

    signals = (
        "texttoolong",
        "text too long",
        "invalid app.bsky.feed.post record",
        "string too long",
        "maxlength",
        # NOTE(review): presumably the wording used by server-side record
        # validation ("... must not be longer than N graphemes") - confirm.
        "must not be longer than",
        "length",
    )
    return any(signal in haystack for signal in signals)
def try_send_post_with_variants(client, text_variants, embed, post_lang):
    """Send a post, walking through the text variants until one is accepted.

    Each variant is converted with ``make_rich`` and passed to
    ``client.send_post`` together with *embed* and ``langs=[post_lang]``.
    A failure that ``is_probable_length_error`` recognises moves on to the
    next variant; any other failure is re-raised immediately.

    Args:
        client: Object exposing ``send_post(text=..., embed=..., langs=...)``.
        text_variants: Sequence of candidate post texts, tried in order.
        embed: Embed passed through unchanged to ``send_post``.
        post_lang: Language code, sent as a one-element ``langs`` list.

    Returns:
        A ``(result, variant)`` tuple: the value returned by ``send_post``
        and the text variant that was accepted.

    Raises:
        RuntimeError: If *text_variants* is empty.
        Exception: The last length-related error when every variant failed,
            or the first non-length error encountered.
    """
    failure = None
    attempt = 0

    for candidate_text in text_variants:
        attempt += 1
        try:
            logging.info(
                f"Trying post text variant {attempt}/{len(text_variants)} "
                f"(length={len(candidate_text)})"
            )
            sent = client.send_post(
                text=make_rich(candidate_text),
                embed=embed,
                langs=[post_lang],
            )
            return sent, candidate_text
        except Exception as err:
            failure = err
            logging.warning(f"Post variant {attempt} failed: {err!r}")
            if is_probable_length_error(err):
                # Too long: fall through to the next variant.
                continue
            raise

    if not failure:
        raise RuntimeError("No text variants available to post.")
    raise failure
# --- Main ---
def main():
parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.")
@@ -788,12 +846,10 @@ def main():
for candidate in entries_to_post:
title_text = candidate["title_text"]
canonical_link = candidate["canonical_link"]
post_text = candidate["post_text"]
text_variants = candidate["post_text_variants"]
logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}")
rich_text = make_rich(post_text)
embed = None
if canonical_link:
embed = build_external_link_embed(
@@ -804,22 +860,23 @@ def main():
)
try:
post_result = client.send_post(
text=rich_text,
post_result, posted_text = try_send_post_with_variants(
client=client,
text_variants=text_variants,
embed=embed,
langs=[post_lang]
post_lang=post_lang
)
bsky_uri = getattr(post_result, "uri", None)
remember_posted_entry(state, candidate, bsky_uri=bsky_uri)
remember_posted_entry(state, candidate, posted_text=posted_text, bsky_uri=bsky_uri)
state = prune_state(state, max_entries=5000)
save_state(state, state_path)
recent_bsky_posts.insert(0, {
"uri": bsky_uri,
"text": post_text,
"normalized_text": normalize_text(post_text),
"text": posted_text,
"normalized_text": normalize_text(posted_text),
"canonical_non_x_urls": {canonical_link} if canonical_link else set(),
"created_at": arrow.utcnow().isoformat(),
})
@@ -827,7 +884,7 @@ def main():
noves_entrades += 1
logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}")
time.sleep(2)
time.sleep(POST_RETRY_DELAY_SECONDS)
except Exception:
logging.exception(f"Failed to post RSS entry {canonical_link or title_text}")