From e2051baffeb18d3cdd4c7c9d7089b85824df2f39 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Thu, 9 Apr 2026 18:40:58 +0200 Subject: [PATCH] fix(rss): try full title+link posts before truncation and add production-safe fallback strategy --- rss2bsky.py | 147 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 45 deletions(-) diff --git a/rss2bsky.py b/rss2bsky.py index 3724147..88c78fe 100644 --- a/rss2bsky.py +++ b/rss2bsky.py @@ -15,7 +15,13 @@ import html from urllib.parse import urlparse from atproto import Client, client_utils, models from bs4 import BeautifulSoup -from PIL import Image + +try: + from PIL import Image + PIL_AVAILABLE = True +except ImportError: + Image = None + PIL_AVAILABLE = False # --- Configuration --- STATE_PATH = "rss2bsky_state.json" @@ -33,6 +39,7 @@ BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3 BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 HTTP_TIMEOUT = 20 +POST_RETRY_DELAY_SECONDS = 2 # --- Logging --- logging.basicConfig( @@ -41,13 +48,15 @@ logging.basicConfig( stream=sys.stdout ) +if not PIL_AVAILABLE: + logging.warning("Pillow is not installed. External card thumbnail compression is disabled.") + # --- Encoding / text helpers --- def fix_encoding(text): try: return text.encode("latin-1").decode("utf-8") except (UnicodeEncodeError, UnicodeDecodeError): - logging.warning(f"Error correcting encoding: {text}") return text @@ -118,37 +127,53 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): return truncated + "..." -def build_post_text(title_text, link): +def build_post_text_variants(title_text, link): """ - For RSS posts we usually want 'title + newline + link'. - If it doesn't fit, prefer truncating the title while keeping the link visible. + Build text variants from best to worst. + + Preferred: + 1. Full title + blank line + real URL + Fallbacks: + 2. Truncated title + blank line + real URL + 3. Full title only + 4. Truncated title only """ title_text = clean_whitespace(title_text) - link = canonicalize_url(link) or link + link = canonicalize_url(link) or link or "" - if not title_text: - return truncate_text_safely(link) + variants = [] + seen = set() - combined = f"{title_text}\n{link}" - if len(combined) <= BSKY_TEXT_MAX_LENGTH: - return combined + def add_variant(text): + cleaned = clean_whitespace(text) + if cleaned and cleaned not in seen: + seen.add(cleaned) + variants.append(cleaned) - reserve = len(link) + 1 - available = BSKY_TEXT_MAX_LENGTH - reserve + if title_text and link: + add_variant(f"{title_text}\n\n{link}") - if available <= 10: - return truncate_text_safely(combined) + reserve = len(link) + 2 + available = BSKY_TEXT_MAX_LENGTH - reserve + if available > 10: + trimmed_title = title_text + if len(trimmed_title) > available: + trimmed_title = trimmed_title[:available - 3] + last_space = trimmed_title.rfind(" ") + if last_space > 0: + trimmed_title = trimmed_title[:last_space] + "..." + else: + trimmed_title = trimmed_title + "..." + add_variant(f"{trimmed_title}\n\n{link}") - trimmed_title = title_text - if len(trimmed_title) > available: - trimmed_title = trimmed_title[:available - 3] - last_space = trimmed_title.rfind(" ") - if last_space > 0: - trimmed_title = trimmed_title[:last_space] + "..." - else: - trimmed_title = trimmed_title + "..." + if title_text: + add_variant(title_text) + add_variant(truncate_text_safely(title_text)) - return f"{trimmed_title}\n{link}" + if link: + add_variant(link) + + return variants # --- URL / duplicate helpers --- @@ -261,7 +286,7 @@ def prune_state(state, max_entries=5000): return state -def remember_posted_entry(state, candidate, bsky_uri=None): +def remember_posted_entry(state, candidate, posted_text, bsky_uri=None): canonical_link = candidate.get("canonical_link") fallback_key = f"fp:{candidate['entry_fingerprint']}" state_key = canonical_link or fallback_key @@ -271,7 +296,7 @@ def remember_posted_entry(state, candidate, bsky_uri=None): "title_text": candidate["title_text"], "normalized_title": candidate["normalized_title"], "entry_fingerprint": candidate["entry_fingerprint"], - "post_text": candidate["post_text"], + "post_text": posted_text, "published_at": candidate.get("published_at"), "bsky_uri": bsky_uri, "posted_at": arrow.utcnow().isoformat(), @@ -371,7 +396,6 @@ def get_recent_bsky_posts(client, handle, limit=DEDUPE_BSKY_LIMIT): def candidate_matches_existing_bsky(candidate, recent_bsky_posts): candidate_link = candidate["canonical_link"] - candidate_post_text_normalized = normalize_text(candidate["post_text"]) candidate_title_normalized = candidate["normalized_title"] for existing in recent_bsky_posts: @@ -381,11 +405,8 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): if candidate_link and candidate_link in existing_urls: return True, "bsky:canonical_link" - if candidate_post_text_normalized == existing_text_normalized: - return True, "bsky:normalized_post_text" - if candidate_title_normalized and candidate_title_normalized in existing_text_normalized: - if candidate_link and candidate_link in existing_urls: + if not candidate_link or candidate_link in existing_urls: return True, "bsky:title_plus_link" return False, None @@ -523,6 +544,9 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES): + if not PIL_AVAILABLE: + return None + try: with Image.open(io.BytesIO(image_bytes)) as img: img = img.convert("RGB") @@ -534,7 +558,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim new_size = (max(1, int(width * scale)), max(1, int(height * scale))) img = img.resize(new_size, Image.LANCZOS) - logging.info(f"Resized external thumb to {new_size[0]}x{new_size[1]}") for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: out = io.BytesIO() @@ -676,7 +699,6 @@ def build_candidates_from_feed(feed): logging.info("Skipping feed item with no usable title and no link.") continue - post_text = build_post_text(title_text, link or "") normalized_title = normalize_text(title_text) entry_fingerprint = build_entry_fingerprint(normalized_title, link) @@ -687,18 +709,54 @@ def build_candidates_from_feed(feed): "canonical_link": link, "published_at": published_at.isoformat() if published_at else None, "published_arrow": published_at, - "post_text": post_text, "entry_fingerprint": entry_fingerprint, + "post_text_variants": build_post_text_variants(title_text, link), }) except Exception as e: logging.warning(f"Failed to prepare feed entry candidate: {e}") - # Sort oldest to newest so posts appear in order candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0)) return candidates +# --- Posting helpers --- +def is_probable_length_error(exc): + text = repr(exc) + signals = [ + "TextTooLong", + "text too long", + "Invalid app.bsky.feed.post record", + "string too long", + "maxLength", + "length", + ] + return any(signal.lower() in text.lower() for signal in signals) + + +def try_send_post_with_variants(client, text_variants, embed, post_lang): + last_exception = None + + for idx, variant in enumerate(text_variants, start=1): + try: + logging.info(f"Trying post text variant {idx}/{len(text_variants)} (length={len(variant)})") + rich_text = make_rich(variant) + result = client.send_post(text=rich_text, embed=embed, langs=[post_lang]) + return result, variant + + except Exception as e: + last_exception = e + logging.warning(f"Post variant {idx} failed: {repr(e)}") + + if not is_probable_length_error(e): + raise + + if last_exception: + raise last_exception + + raise RuntimeError("No text variants available to post.") + + # --- Main --- def main(): parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.") @@ -788,12 +846,10 @@ def main(): for candidate in entries_to_post: title_text = candidate["title_text"] canonical_link = candidate["canonical_link"] - post_text = candidate["post_text"] + text_variants = candidate["post_text_variants"] logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}") - rich_text = make_rich(post_text) - embed = None if canonical_link: embed = build_external_link_embed( @@ -804,22 +860,23 @@ def main(): ) try: - post_result = client.send_post( - text=rich_text, + post_result, posted_text = try_send_post_with_variants( + client=client, + text_variants=text_variants, embed=embed, - langs=[post_lang] + post_lang=post_lang ) bsky_uri = getattr(post_result, "uri", None) - remember_posted_entry(state, candidate, bsky_uri=bsky_uri) + remember_posted_entry(state, candidate, posted_text=posted_text, bsky_uri=bsky_uri) state = prune_state(state, max_entries=5000) save_state(state, state_path) recent_bsky_posts.insert(0, { "uri": bsky_uri, - "text": post_text, - "normalized_text": normalize_text(post_text), + "text": posted_text, + "normalized_text": normalize_text(posted_text), "canonical_non_x_urls": {canonical_link} if canonical_link else set(), "created_at": arrow.utcnow().isoformat(), }) @@ -827,7 +884,7 @@ def main(): noves_entrades += 1 logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}") - time.sleep(2) + time.sleep(POST_RETRY_DELAY_SECONDS) except Exception: logging.exception(f"Failed to post RSS entry {canonical_link or title_text}")