fix(rss): try full title+link posts before truncation and add production-safe fallback strategy
This commit is contained in:
131
rss2bsky.py
131
rss2bsky.py
@@ -15,7 +15,13 @@ import html
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from atproto import Client, client_utils, models
|
from atproto import Client, client_utils, models
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
try:
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
PIL_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
Image = None
|
||||||
|
PIL_AVAILABLE = False
|
||||||
|
|
||||||
# --- Configuration ---
|
# --- Configuration ---
|
||||||
STATE_PATH = "rss2bsky_state.json"
|
STATE_PATH = "rss2bsky_state.json"
|
||||||
@@ -33,6 +39,7 @@ BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
|
|||||||
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
||||||
|
|
||||||
HTTP_TIMEOUT = 20
|
HTTP_TIMEOUT = 20
|
||||||
|
POST_RETRY_DELAY_SECONDS = 2
|
||||||
|
|
||||||
# --- Logging ---
|
# --- Logging ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -41,13 +48,15 @@ logging.basicConfig(
|
|||||||
stream=sys.stdout
|
stream=sys.stdout
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not PIL_AVAILABLE:
|
||||||
|
logging.warning("Pillow is not installed. External card thumbnail compression is disabled.")
|
||||||
|
|
||||||
|
|
||||||
# --- Encoding / text helpers ---
|
# --- Encoding / text helpers ---
|
||||||
def fix_encoding(text):
    """
    Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The string is re-encoded as Latin-1 and decoded as UTF-8. When that
    round trip is not possible (the text contains characters outside
    Latin-1, or the resulting bytes are not valid UTF-8) the input is
    returned unchanged.

    Args:
        text: The string to repair. Non-string values (e.g. ``None`` for a
            missing feed field) are returned as-is instead of raising.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    # Guard against missing/non-text fields: only str has the
    # encode/decode round trip used below.
    if not isinstance(text, str):
        return text

    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Failing the round trip is the normal case for text that was
        # never mis-decoded, so stay silent and keep the input.
        return text
|
||||||
|
|
||||||
|
|
||||||
@@ -118,27 +127,35 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
return truncated + "..."
|
return truncated + "..."
|
||||||
|
|
||||||
|
|
||||||
def build_post_text(title_text, link):
|
def build_post_text_variants(title_text, link):
|
||||||
"""
|
"""
|
||||||
For RSS posts we usually want 'title + newline + link'.
|
Build text variants from best to worst.
|
||||||
If it doesn't fit, prefer truncating the title while keeping the link visible.
|
|
||||||
|
Preferred:
|
||||||
|
1. Full title + blank line + real URL
|
||||||
|
Fallbacks:
|
||||||
|
2. Truncated title + blank line + real URL
|
||||||
|
3. Full title only
|
||||||
|
4. Truncated title only
|
||||||
"""
|
"""
|
||||||
title_text = clean_whitespace(title_text)
|
title_text = clean_whitespace(title_text)
|
||||||
link = canonicalize_url(link) or link
|
link = canonicalize_url(link) or link or ""
|
||||||
|
|
||||||
if not title_text:
|
variants = []
|
||||||
return truncate_text_safely(link)
|
seen = set()
|
||||||
|
|
||||||
combined = f"{title_text}\n{link}"
|
def add_variant(text):
|
||||||
if len(combined) <= BSKY_TEXT_MAX_LENGTH:
|
cleaned = clean_whitespace(text)
|
||||||
return combined
|
if cleaned and cleaned not in seen:
|
||||||
|
seen.add(cleaned)
|
||||||
|
variants.append(cleaned)
|
||||||
|
|
||||||
reserve = len(link) + 1
|
if title_text and link:
|
||||||
|
add_variant(f"{title_text}\n\n{link}")
|
||||||
|
|
||||||
|
reserve = len(link) + 2
|
||||||
available = BSKY_TEXT_MAX_LENGTH - reserve
|
available = BSKY_TEXT_MAX_LENGTH - reserve
|
||||||
|
if available > 10:
|
||||||
if available <= 10:
|
|
||||||
return truncate_text_safely(combined)
|
|
||||||
|
|
||||||
trimmed_title = title_text
|
trimmed_title = title_text
|
||||||
if len(trimmed_title) > available:
|
if len(trimmed_title) > available:
|
||||||
trimmed_title = trimmed_title[:available - 3]
|
trimmed_title = trimmed_title[:available - 3]
|
||||||
@@ -147,8 +164,16 @@ def build_post_text(title_text, link):
|
|||||||
trimmed_title = trimmed_title[:last_space] + "..."
|
trimmed_title = trimmed_title[:last_space] + "..."
|
||||||
else:
|
else:
|
||||||
trimmed_title = trimmed_title + "..."
|
trimmed_title = trimmed_title + "..."
|
||||||
|
add_variant(f"{trimmed_title}\n\n{link}")
|
||||||
|
|
||||||
return f"{trimmed_title}\n{link}"
|
if title_text:
|
||||||
|
add_variant(title_text)
|
||||||
|
add_variant(truncate_text_safely(title_text))
|
||||||
|
|
||||||
|
if link:
|
||||||
|
add_variant(link)
|
||||||
|
|
||||||
|
return variants
|
||||||
|
|
||||||
|
|
||||||
# --- URL / duplicate helpers ---
|
# --- URL / duplicate helpers ---
|
||||||
@@ -261,7 +286,7 @@ def prune_state(state, max_entries=5000):
|
|||||||
return state
|
return state
|
||||||
|
|
||||||
|
|
||||||
def remember_posted_entry(state, candidate, bsky_uri=None):
|
def remember_posted_entry(state, candidate, posted_text, bsky_uri=None):
|
||||||
canonical_link = candidate.get("canonical_link")
|
canonical_link = candidate.get("canonical_link")
|
||||||
fallback_key = f"fp:{candidate['entry_fingerprint']}"
|
fallback_key = f"fp:{candidate['entry_fingerprint']}"
|
||||||
state_key = canonical_link or fallback_key
|
state_key = canonical_link or fallback_key
|
||||||
@@ -271,7 +296,7 @@ def remember_posted_entry(state, candidate, bsky_uri=None):
|
|||||||
"title_text": candidate["title_text"],
|
"title_text": candidate["title_text"],
|
||||||
"normalized_title": candidate["normalized_title"],
|
"normalized_title": candidate["normalized_title"],
|
||||||
"entry_fingerprint": candidate["entry_fingerprint"],
|
"entry_fingerprint": candidate["entry_fingerprint"],
|
||||||
"post_text": candidate["post_text"],
|
"post_text": posted_text,
|
||||||
"published_at": candidate.get("published_at"),
|
"published_at": candidate.get("published_at"),
|
||||||
"bsky_uri": bsky_uri,
|
"bsky_uri": bsky_uri,
|
||||||
"posted_at": arrow.utcnow().isoformat(),
|
"posted_at": arrow.utcnow().isoformat(),
|
||||||
@@ -371,7 +396,6 @@ def get_recent_bsky_posts(client, handle, limit=DEDUPE_BSKY_LIMIT):
|
|||||||
|
|
||||||
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
||||||
candidate_link = candidate["canonical_link"]
|
candidate_link = candidate["canonical_link"]
|
||||||
candidate_post_text_normalized = normalize_text(candidate["post_text"])
|
|
||||||
candidate_title_normalized = candidate["normalized_title"]
|
candidate_title_normalized = candidate["normalized_title"]
|
||||||
|
|
||||||
for existing in recent_bsky_posts:
|
for existing in recent_bsky_posts:
|
||||||
@@ -381,11 +405,8 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
|||||||
if candidate_link and candidate_link in existing_urls:
|
if candidate_link and candidate_link in existing_urls:
|
||||||
return True, "bsky:canonical_link"
|
return True, "bsky:canonical_link"
|
||||||
|
|
||||||
if candidate_post_text_normalized == existing_text_normalized:
|
|
||||||
return True, "bsky:normalized_post_text"
|
|
||||||
|
|
||||||
if candidate_title_normalized and candidate_title_normalized in existing_text_normalized:
|
if candidate_title_normalized and candidate_title_normalized in existing_text_normalized:
|
||||||
if candidate_link and candidate_link in existing_urls:
|
if not candidate_link or candidate_link in existing_urls:
|
||||||
return True, "bsky:title_plus_link"
|
return True, "bsky:title_plus_link"
|
||||||
|
|
||||||
return False, None
|
return False, None
|
||||||
@@ -523,6 +544,9 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
|
|
||||||
|
|
||||||
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
|
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
|
||||||
|
if not PIL_AVAILABLE:
|
||||||
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||||
img = img.convert("RGB")
|
img = img.convert("RGB")
|
||||||
@@ -534,7 +558,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim
|
scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim
|
||||||
new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
|
new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
|
||||||
img = img.resize(new_size, Image.LANCZOS)
|
img = img.resize(new_size, Image.LANCZOS)
|
||||||
logging.info(f"Resized external thumb to {new_size[0]}x{new_size[1]}")
|
|
||||||
|
|
||||||
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
@@ -676,7 +699,6 @@ def build_candidates_from_feed(feed):
|
|||||||
logging.info("Skipping feed item with no usable title and no link.")
|
logging.info("Skipping feed item with no usable title and no link.")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
post_text = build_post_text(title_text, link or "")
|
|
||||||
normalized_title = normalize_text(title_text)
|
normalized_title = normalize_text(title_text)
|
||||||
entry_fingerprint = build_entry_fingerprint(normalized_title, link)
|
entry_fingerprint = build_entry_fingerprint(normalized_title, link)
|
||||||
|
|
||||||
@@ -687,18 +709,54 @@ def build_candidates_from_feed(feed):
|
|||||||
"canonical_link": link,
|
"canonical_link": link,
|
||||||
"published_at": published_at.isoformat() if published_at else None,
|
"published_at": published_at.isoformat() if published_at else None,
|
||||||
"published_arrow": published_at,
|
"published_arrow": published_at,
|
||||||
"post_text": post_text,
|
|
||||||
"entry_fingerprint": entry_fingerprint,
|
"entry_fingerprint": entry_fingerprint,
|
||||||
|
"post_text_variants": build_post_text_variants(title_text, link),
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Failed to prepare feed entry candidate: {e}")
|
logging.warning(f"Failed to prepare feed entry candidate: {e}")
|
||||||
|
|
||||||
# Sort oldest to newest so posts appear in order
|
|
||||||
candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0))
|
candidates.sort(key=lambda c: c["published_arrow"] or arrow.get(0))
|
||||||
return candidates
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
# --- Posting helpers ---
|
||||||
|
def is_probable_length_error(exc):
    """
    Heuristically decide whether a posting failure was caused by text length.

    The check is a case-insensitive substring match against ``repr(exc)``.
    It is used to decide whether retrying with a shorter text variant is
    worthwhile, so it must not match unrelated failures.

    Args:
        exc: The exception raised while sending the post.

    Returns:
        True if the exception text contains one of the known length-limit
        phrases, False otherwise.
    """
    haystack = repr(exc).lower()

    # NOTE: a bare "length" signal is deliberately NOT used. Error reprs
    # commonly embed HTTP response headers such as "Content-Length", which
    # would classify almost every API failure (auth, rate limit, network)
    # as a length problem and trigger pointless retries.
    signals = (
        "texttoolong",
        "text too long",
        "invalid app.bsky.feed.post record",
        "string too long",
        "maxlength",
        # Phrases presumably used by the server-side record validator for
        # oversize text — TODO confirm against a real rejected post.
        "must not be longer than",
        "graphemes",
    )
    return any(signal in haystack for signal in signals)
|
||||||
|
|
||||||
|
|
||||||
|
def try_send_post_with_variants(client, text_variants, embed, post_lang):
    """
    Send a post using the first text variant that is accepted.

    Variants are attempted in the order given. A failure that
    ``is_probable_length_error`` classifies as length-related moves on to
    the next variant; any other failure is re-raised immediately.

    Args:
        client: Object exposing ``send_post(text=..., embed=..., langs=...)``.
        text_variants: Ordered sequence of candidate post texts.
        embed: Embed passed through unchanged to ``send_post``.
        post_lang: Language code, sent as a single-element ``langs`` list.

    Returns:
        A ``(result, variant)`` tuple: the ``send_post`` return value and
        the text variant that was posted.

    Raises:
        Exception: The non-length error from a variant, or the last
            length-related error once every variant has failed.
        RuntimeError: If ``text_variants`` is empty.
    """
    failure = None

    for position, candidate_text in enumerate(text_variants, start=1):
        try:
            logging.info(
                f"Trying post text variant {position}/{len(text_variants)} (length={len(candidate_text)})"
            )
            response = client.send_post(
                text=make_rich(candidate_text),
                embed=embed,
                langs=[post_lang],
            )
            return response, candidate_text

        except Exception as e:
            failure = e
            logging.warning(f"Post variant {position} failed: {repr(e)}")

            # Only length-related failures justify trying another variant.
            if is_probable_length_error(e):
                continue
            raise

    if not failure:
        raise RuntimeError("No text variants available to post.")

    # Every variant was rejected for length; surface the most recent error.
    raise failure
|
||||||
|
|
||||||
|
|
||||||
# --- Main ---
|
# --- Main ---
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.")
|
parser = argparse.ArgumentParser(description="Post RSS to Bluesky with JSON state tracking.")
|
||||||
@@ -788,12 +846,10 @@ def main():
|
|||||||
for candidate in entries_to_post:
|
for candidate in entries_to_post:
|
||||||
title_text = candidate["title_text"]
|
title_text = candidate["title_text"]
|
||||||
canonical_link = candidate["canonical_link"]
|
canonical_link = candidate["canonical_link"]
|
||||||
post_text = candidate["post_text"]
|
text_variants = candidate["post_text_variants"]
|
||||||
|
|
||||||
logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}")
|
logging.info(f"Preparing to post RSS entry: {canonical_link or title_text}")
|
||||||
|
|
||||||
rich_text = make_rich(post_text)
|
|
||||||
|
|
||||||
embed = None
|
embed = None
|
||||||
if canonical_link:
|
if canonical_link:
|
||||||
embed = build_external_link_embed(
|
embed = build_external_link_embed(
|
||||||
@@ -804,22 +860,23 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post_result = client.send_post(
|
post_result, posted_text = try_send_post_with_variants(
|
||||||
text=rich_text,
|
client=client,
|
||||||
|
text_variants=text_variants,
|
||||||
embed=embed,
|
embed=embed,
|
||||||
langs=[post_lang]
|
post_lang=post_lang
|
||||||
)
|
)
|
||||||
|
|
||||||
bsky_uri = getattr(post_result, "uri", None)
|
bsky_uri = getattr(post_result, "uri", None)
|
||||||
|
|
||||||
remember_posted_entry(state, candidate, bsky_uri=bsky_uri)
|
remember_posted_entry(state, candidate, posted_text=posted_text, bsky_uri=bsky_uri)
|
||||||
state = prune_state(state, max_entries=5000)
|
state = prune_state(state, max_entries=5000)
|
||||||
save_state(state, state_path)
|
save_state(state, state_path)
|
||||||
|
|
||||||
recent_bsky_posts.insert(0, {
|
recent_bsky_posts.insert(0, {
|
||||||
"uri": bsky_uri,
|
"uri": bsky_uri,
|
||||||
"text": post_text,
|
"text": posted_text,
|
||||||
"normalized_text": normalize_text(post_text),
|
"normalized_text": normalize_text(posted_text),
|
||||||
"canonical_non_x_urls": {canonical_link} if canonical_link else set(),
|
"canonical_non_x_urls": {canonical_link} if canonical_link else set(),
|
||||||
"created_at": arrow.utcnow().isoformat(),
|
"created_at": arrow.utcnow().isoformat(),
|
||||||
})
|
})
|
||||||
@@ -827,7 +884,7 @@ def main():
|
|||||||
|
|
||||||
noves_entrades += 1
|
noves_entrades += 1
|
||||||
logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}")
|
logging.info(f"Posted RSS entry to Bluesky: {canonical_link or title_text}")
|
||||||
time.sleep(2)
|
time.sleep(POST_RETRY_DELAY_SECONDS)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.exception(f"Failed to post RSS entry {canonical_link or title_text}")
|
logging.exception(f"Failed to post RSS entry {canonical_link or title_text}")
|
||||||
|
|||||||
Reference in New Issue
Block a user