Controlling snippet image on blobs

This commit is contained in:
2026-04-06 09:28:45 +02:00
parent 5abd9d685a
commit e4334f44d2

View File

@@ -1,6 +1,7 @@
import argparse
import arrow
import hashlib
import io
import json
import logging
import re
@@ -14,6 +15,7 @@ from atproto import Client, client_utils, models
from playwright.sync_api import sync_playwright
from moviepy import VideoFileClip
from bs4 import BeautifulSoup
from PIL import Image
# --- Configuration ---
LOG_PATH = "twitter2bsky.log"
@@ -26,6 +28,14 @@ BSKY_TEXT_MAX_LENGTH = 275
VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45
# External-card thumbnail constraints:
# The user's PDS returned:
# BlobTooLarge: 1.15MB > 976.56KB
# So we conservatively target a slightly smaller max size for safety.
# 950 KiB = 972,800 bytes, which leaves headroom below the 976.56 KB limit
# reported in the error above.
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
# Longest side (in pixels) an external thumbnail may have before the
# compressor downscales it.
EXTERNAL_THUMB_MAX_DIMENSION = 1200
# Lowest JPEG quality the thumbnail compressor tries at a given image size.
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
# Blob-upload retry tuning. NOTE(review): the retry helper that consumes these
# is not visible here; presumably an attempt count and backoff delays in
# seconds - confirm against that helper.
BSKY_BLOB_UPLOAD_MAX_RETRIES = 5
BSKY_BLOB_UPLOAD_BASE_DELAY = 10
BSKY_BLOB_UPLOAD_MAX_DELAY = 300
@@ -99,11 +109,8 @@ def repair_broken_urls(text):
original = text
# Join protocol line breaks: https://\nexample.com -> https://example.com
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
# Join URL-internal line breaks when the next chunk still looks like URL content.
# This is intentionally conservative but effective for wrapped article URLs.
prev_text = None
while prev_text != text:
prev_text = text
@@ -114,7 +121,6 @@ def repair_broken_urls(text):
flags=re.IGNORECASE
)
# Also fix accidental spaces inserted inside URLs after the protocol.
text = re.sub(
r"((?:https?://|www\.)[^\s<>\"]*)\s+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
r"\1\2",
@@ -348,6 +354,116 @@ def get_blob_from_file(file_path, client):
return None
def _flatten_thumb_to_rgb(img):
    """Return an RGB version of *img*, compositing any transparency onto white."""
    has_alpha = img.mode in ("RGBA", "LA", "PA") or (
        img.mode == "P" and "transparency" in img.info
    )
    if not has_alpha:
        return img.convert("RGB")

    rgba = img.convert("RGBA")
    canvas = Image.new("RGB", rgba.size, (255, 255, 255))
    # JPEG has no alpha channel. A plain convert("RGB") would just drop alpha
    # and expose whatever colour is stored under transparent pixels (usually
    # black). Using the alpha band as the paste mask keeps the white canvas
    # wherever the source is transparent.
    canvas.paste(rgba, mask=rgba.split()[-1])
    return canvas


def _encode_thumb_jpeg(img, quality):
    """Encode *img* as an optimized, progressive JPEG and return the raw bytes."""
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG", quality=quality, optimize=True, progressive=True)
    return buffer.getvalue()


def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
    """
    Compress/resize an image to fit external thumbnail blob size limits.

    The image is flattened to RGB (transparent areas become white), capped at
    EXTERNAL_THUMB_MAX_DIMENSION on its longest side, and encoded as JPEG at
    progressively lower qualities. If that is still too large, progressively
    smaller dimensions are tried as well.

    Args:
        image_bytes: Raw bytes of the source image in any format Pillow can open.
        max_bytes: Maximum allowed size of the returned JPEG, in bytes.

    Returns:
        JPEG bytes no larger than ``max_bytes``, or None when no attempted
        size/quality combination fits or the image cannot be processed
        (the error is logged as a warning, never raised).
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as source:
            img = _flatten_thumb_to_rgb(source)

            width, height = img.size
            max_dim = max(width, height)
            if max_dim > EXTERNAL_THUMB_MAX_DIMENSION:
                scale = EXTERNAL_THUMB_MAX_DIMENSION / max_dim
                new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
                img = img.resize(new_size, Image.LANCZOS)
                logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")

            # Try progressively lower qualities.
            for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
                data = _encode_thumb_jpeg(img, quality)
                logging.info(
                    f"🖼️ External thumb candidate size at JPEG quality {quality}: "
                    f"{len(data) / 1024:.2f} KB"
                )
                if len(data) <= max_bytes:
                    return data

            # If still too large, try a second resize pass.
            width, height = img.size
            max_dim = max(width, height)
            for target_dim in [1000, 900, 800, 700, 600]:
                if max_dim <= target_dim:
                    # This target would not shrink the image, so encoding it
                    # would only repeat the pixels that already failed at the
                    # minimum quality in the first pass.
                    continue

                scale = target_dim / max_dim
                new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
                resized = img.resize(new_size, Image.LANCZOS)

                for quality in [60, 50, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
                    data = _encode_thumb_jpeg(resized, quality)
                    logging.info(
                        f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: "
                        f"{len(data) / 1024:.2f} KB"
                    )
                    if len(data) <= max_bytes:
                        return data

    except Exception as e:
        # Best effort: a thumbnail that cannot be processed is simply omitted
        # by the caller, so report the problem and fall through to None.
        logging.warning(f"Could not compress external thumbnail: {repr(e)}")

    return None
def get_external_thumb_blob_from_url(image_url, client, http_client):
    """
    Fetch an external-card thumbnail and upload it as a blob.

    The image is downloaded with ``http_client``. If it is larger than
    EXTERNAL_THUMB_MAX_BYTES it is first passed through
    compress_external_thumb_to_limit; otherwise the downloaded bytes are
    uploaded unchanged.

    Returns the uploaded blob, or None when the download fails, the body is
    empty, the image cannot be compressed to fit, the upload fails, or any
    exception is raised along the way. A None result lets the caller post the
    link card without a thumbnail.
    """
    try:
        response = http_client.get(
            image_url,
            timeout=MEDIA_DOWNLOAD_TIMEOUT,
            follow_redirects=True,
        )
        if response.status_code != 200:
            logging.warning(f"Could not fetch external thumb {image_url}: HTTP {response.status_code}")
            return None

        payload = response.content
        if not payload:
            logging.warning(f"Could not fetch external thumb {image_url}: empty body")
            return None

        downloaded_kb = len(payload) / 1024
        logging.info(f"🖼️ Downloaded external thumb {image_url} ({downloaded_kb:.2f} KB)")

        if len(payload) <= EXTERNAL_THUMB_MAX_BYTES:
            logging.info("✅ External thumb already within safe size limit.")
        else:
            logging.info(
                f"🖼️ External thumb exceeds safe limit "
                f"({downloaded_kb:.2f} KB > {EXTERNAL_THUMB_MAX_BYTES / 1024:.2f} KB). Compressing..."
            )
            shrunk = compress_external_thumb_to_limit(payload, EXTERNAL_THUMB_MAX_BYTES)
            if not shrunk:
                logging.warning("⚠️ Could not compress external thumb to fit limit. Will omit thumbnail.")
                return None
            payload = shrunk
            logging.info(f"✅ External thumb compressed to {len(payload) / 1024:.2f} KB")

        uploaded = upload_blob_with_retry(client, payload, media_label=f"external-thumb:{image_url}")
        if not uploaded:
            logging.warning("⚠️ External thumb upload failed. Will omit thumbnail.")
            return None
        return uploaded

    except Exception as exc:
        # Thumbnails are optional, so any failure degrades to "no thumbnail".
        logging.warning(f"Could not fetch/upload external thumb {image_url}: {repr(exc)}")
        return None
def fetch_link_metadata(url, http_client):
try:
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
@@ -376,11 +492,19 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
If the thumbnail image is too large, omit the thumbnail but still return the link card.
"""
link_metadata = fetch_link_metadata(url, http_client)
thumb_blob = None
if link_metadata.get("image"):
thumb_blob = get_blob_from_url(link_metadata["image"], client, http_client)
thumb_blob = get_external_thumb_blob_from_url(link_metadata["image"], client, http_client)
if thumb_blob:
logging.info("✅ External link card thumbnail prepared successfully")
else:
logging.info(" External link card will be posted without thumbnail")
if link_metadata.get("title") or link_metadata.get("description") or thumb_blob:
return models.AppBskyEmbedExternal.Main(