From e4334f44d2f8c0ca5d2af690f02c9e967cbfe9f6 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Mon, 6 Apr 2026 09:28:45 +0200 Subject: [PATCH] Controlling snippet image on blobs --- twitter2bsky_daemon.py | 136 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 130 insertions(+), 6 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 01999d2..7d680ee 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -1,6 +1,7 @@ import argparse import arrow import hashlib +import io import json import logging import re @@ -14,6 +15,7 @@ from atproto import Client, client_utils, models from playwright.sync_api import sync_playwright from moviepy import VideoFileClip from bs4 import BeautifulSoup +from PIL import Image # --- Configuration --- LOG_PATH = "twitter2bsky.log" @@ -26,6 +28,14 @@ BSKY_TEXT_MAX_LENGTH = 275 VIDEO_MAX_DURATION_SECONDS = 179 MAX_VIDEO_UPLOAD_SIZE_MB = 45 +# External-card thumbnail constraints: +# The user's PDS returned: +# BlobTooLarge: 1.15MB > 976.56KB +# So we conservatively target a slightly smaller max size for safety. +EXTERNAL_THUMB_MAX_BYTES = 950 * 1024 +EXTERNAL_THUMB_MAX_DIMENSION = 1200 +EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40 + BSKY_BLOB_UPLOAD_MAX_RETRIES = 5 BSKY_BLOB_UPLOAD_BASE_DELAY = 10 BSKY_BLOB_UPLOAD_MAX_DELAY = 300 @@ -99,11 +109,8 @@ def repair_broken_urls(text): original = text - # Join protocol line breaks: https://\nexample.com -> https://example.com text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE) - # Join URL-internal line breaks when the next chunk still looks like URL content. - # This is intentionally conservative but effective for wrapped article URLs. prev_text = None while prev_text != text: prev_text = text @@ -114,7 +121,6 @@ def repair_broken_urls(text): flags=re.IGNORECASE ) - # Also fix accidental spaces inserted inside URLs after the protocol. text = re.sub( r"((?:https?://|www\.)[^\s<>\"]*)\s+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)", r"\1\2", @@ -348,6 +354,116 @@ def get_blob_from_file(file_path, client): return None +def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES): + """ + Compress/resize an image to fit external thumbnail blob size limits. + Returns JPEG bytes or None. + """ + try: + with Image.open(io.BytesIO(image_bytes)) as img: + img = img.convert("RGB") + + width, height = img.size + max_dim = max(width, height) + + if max_dim > EXTERNAL_THUMB_MAX_DIMENSION: + scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim + new_size = (max(1, int(width * scale)), max(1, int(height * scale))) + img = img.resize(new_size, Image.LANCZOS) + logging.info(f"đŸ–ŧī¸ Resized external thumb to {new_size[0]}x{new_size[1]}") + + # Try progressively lower qualities. + for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: + out = io.BytesIO() + img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True) + data = out.getvalue() + + logging.info( + f"đŸ–ŧī¸ External thumb candidate size at JPEG quality {quality}: " + f"{len(data) / 1024:.2f} KB" + ) + + if len(data) <= max_bytes: + return data + + # If still too large, try a second resize pass. + for target_dim in [1000, 900, 800, 700, 600]: + resized = img.copy() + width, height = resized.size + max_dim = max(width, height) + + if max_dim > target_dim: + scale = target_dim / max_dim + new_size = (max(1, int(width * scale)), max(1, int(height * scale))) + resized = resized.resize(new_size, Image.LANCZOS) + + for quality in [60, 50, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: + out = io.BytesIO() + resized.save(out, format="JPEG", quality=quality, optimize=True, progressive=True) + data = out.getvalue() + + logging.info( + f"đŸ–ŧī¸ External thumb resized to <= {target_dim}px at quality {quality}: " + f"{len(data) / 1024:.2f} KB" + ) + + if len(data) <= max_bytes: + return data + + except Exception as e: + logging.warning(f"Could not compress external thumbnail: {repr(e)}") + + return None + + +def get_external_thumb_blob_from_url(image_url, client, http_client): + """ + Download, size-check, compress if needed, and upload an external-card thumbnail blob. + If the image cannot fit within the PDS blob limit, return None so the external card + can still be posted without a thumbnail. + """ + try: + r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True) + if r.status_code != 200: + logging.warning(f"Could not fetch external thumb {image_url}: HTTP {r.status_code}") + return None + + content = r.content + if not content: + logging.warning(f"Could not fetch external thumb {image_url}: empty body") + return None + + original_size_kb = len(content) / 1024 + logging.info(f"đŸ–ŧī¸ Downloaded external thumb {image_url} ({original_size_kb:.2f} KB)") + + upload_bytes = content + if len(upload_bytes) > EXTERNAL_THUMB_MAX_BYTES: + logging.info( + f"đŸ–ŧī¸ External thumb exceeds safe limit " + f"({original_size_kb:.2f} KB > {EXTERNAL_THUMB_MAX_BYTES / 1024:.2f} KB). Compressing..." + ) + compressed = compress_external_thumb_to_limit(upload_bytes, EXTERNAL_THUMB_MAX_BYTES) + if compressed: + upload_bytes = compressed + logging.info(f"✅ External thumb compressed to {len(upload_bytes) / 1024:.2f} KB") + else: + logging.warning("âš ī¸ Could not compress external thumb to fit limit. Will omit thumbnail.") + return None + else: + logging.info("✅ External thumb already within safe size limit.") + + blob = upload_blob_with_retry(client, upload_bytes, media_label=f"external-thumb:{image_url}") + if blob: + return blob + + logging.warning("âš ī¸ External thumb upload failed. Will omit thumbnail.") + return None + + except Exception as e: + logging.warning(f"Could not fetch/upload external thumb {image_url}: {repr(e)}") + return None + + def fetch_link_metadata(url, http_client): try: r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True) @@ -376,11 +492,19 @@ def fetch_link_metadata(url, http_client): def build_external_link_embed(url, client, http_client, fallback_title="Link"): + """ + Build a Bluesky external embed from a URL. + If the thumbnail image is too large, omit the thumbnail but still return the link card. + """ link_metadata = fetch_link_metadata(url, http_client) thumb_blob = None if link_metadata.get("image"): - thumb_blob = get_blob_from_url(link_metadata["image"], client, http_client) + thumb_blob = get_external_thumb_blob_from_url(link_metadata["image"], client, http_client) + if thumb_blob: + logging.info("✅ External link card thumbnail prepared successfully") + else: + logging.info("â„šī¸ External link card will be posted without thumbnail") if link_metadata.get("title") or link_metadata.get("description") or thumb_blob: return models.AppBskyEmbedExternal.Main( @@ -641,7 +765,7 @@ def get_recent_bsky_posts(client, handle, limit=30): if item.reason is not None: continue - record = item.post.record + record = item.post.record if getattr(record, "reply", None) is not None: continue