fix(sync): prefer video posts over image fallback and retry transient blob upload failures

This commit is contained in:
Guillem Hernandez Sola
2026-04-07 09:47:51 +02:00
parent 22d39ef2cf
commit bc39e57a78

View File

@@ -28,8 +28,6 @@ BSKY_TEXT_MAX_LENGTH = 275
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
# External-card thumbnail constraints:
# Conservative safe target below the observed PDS max (~976.56 KB).
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024 EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
EXTERNAL_THUMB_MAX_DIMENSION = 1200 EXTERNAL_THUMB_MAX_DIMENSION = 1200
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40 EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
@@ -41,6 +39,10 @@ MEDIA_DOWNLOAD_TIMEOUT = 30
LINK_METADATA_TIMEOUT = 10 LINK_METADATA_TIMEOUT = 10
DEFAULT_BSKY_BASE_URL = "https://bsky.social" DEFAULT_BSKY_BASE_URL = "https://bsky.social"
# Extra timeout retry tuning for transient blob upload failures
# (timeouts, connection/protocol errors, 502/503/504 responses).
# Maximum number of transient-failure retries allowed for a single upload.
BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
# Base wait in seconds; the actual wait is this value multiplied by the
# transient attempt number, so it grows linearly with each retry.
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
# --- Logging Setup --- # --- Logging Setup ---
logging.basicConfig( logging.basicConfig(
format="%(asctime)s [%(levelname)s] %(message)s", format="%(asctime)s [%(levelname)s] %(message)s",
@@ -88,20 +90,6 @@ def strip_trailing_url_punctuation(url):
def repair_broken_urls(text): def repair_broken_urls(text):
"""
Repair URLs that were split by copied/scraped line breaks.
Examples:
https://
3cat.cat/path
becomes:
https://3cat.cat/path
https://3cat.cat/some-pa
th/article
becomes:
https://3cat.cat/some-path/article
"""
if not text: if not text:
return text return text
@@ -133,33 +121,6 @@ def repair_broken_urls(text):
def repair_broken_mentions(text): def repair_broken_mentions(text):
"""
Repair mention-related line wrapping in scraped text.
Handles cases like:
Ho explica
@martamartorell
La
@sanfenerea
tenia un repte
Hospital
@parctauli
.
conjunt
@bomberscat
-SEM.
becoming:
Ho explica @martamartorell
La @sanfenerea tenia un repte
Hospital @parctauli .
conjunt @bomberscat -SEM.
while preserving real paragraph breaks and standalone mention lines.
"""
if not text: if not text:
return text return text
@@ -183,7 +144,6 @@ def repair_broken_mentions(text):
i += 1 i += 1
continue continue
# If current line is only a mention, try to attach it backward.
if is_mention_only_line(current): if is_mention_only_line(current):
if result and result[-1].strip(): if result and result[-1].strip():
result[-1] = result[-1].rstrip() + " " + stripped result[-1] = result[-1].rstrip() + " " + stripped
@@ -193,8 +153,6 @@ def repair_broken_mentions(text):
i += 1 i += 1
# Attach immediately following continuation lines if they are not blank
# and not another standalone mention.
while i < len(lines): while i < len(lines):
next_line = lines[i] next_line = lines[i]
next_stripped = next_line.strip() next_stripped = next_line.strip()
@@ -214,7 +172,6 @@ def repair_broken_mentions(text):
continue continue
# If current line has text and next line is a mention, merge them.
if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]): if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
merged = stripped + " " + lines[i + 1].strip() merged = stripped + " " + lines[i + 1].strip()
changed = True changed = True
@@ -252,10 +209,6 @@ def repair_broken_mentions(text):
def strip_line_edge_whitespace(text): def strip_line_edge_whitespace(text):
"""
Remove leading/trailing whitespace from each line while preserving
the line structure and intentional blank lines.
"""
if not text: if not text:
return text return text
@@ -278,12 +231,6 @@ def strip_line_edge_whitespace(text):
def remove_trailing_ellipsis_line(text): def remove_trailing_ellipsis_line(text):
"""
Remove trailing lines that are only ellipsis markers.
Handles:
- ...
- …
"""
if not text: if not text:
return text return text
@@ -415,8 +362,25 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
return default_delay return default_delay
def is_transient_blob_error(error_obj):
    """Return True when a blob-upload error looks transient and worth retrying.

    The check is purely textual: ``repr(error_obj)`` is searched for known
    timeout/connection exception names and for the gateway status codes
    502, 503 and 504.

    Status codes are matched only as standalone numbers (not preceded or
    followed by another digit), so digits embedded in a longer number such as
    a byte count or timestamp do not make a permanent failure look retryable.

    Args:
        error_obj: The exception (or any object) raised by the upload call.

    Returns:
        bool: True if the error text contains a transient signal.
    """
    import re  # local import keeps the block self-contained

    error_text = repr(error_obj)

    # Exception class names that indicate a timeout or a dropped connection.
    transient_names = (
        "InvokeTimeoutError",
        "ReadTimeout",
        "WriteTimeout",
        "TimeoutException",
        "RemoteProtocolError",
        "ConnectError",
    )
    if any(name in error_text for name in transient_names):
        return True

    # Gateway-style HTTP status codes, rejected when part of a longer number.
    return re.search(r"(?<!\d)(?:502|503|504)(?!\d)", error_text) is not None
def upload_blob_with_retry(client, binary_data, media_label="media"): def upload_blob_with_retry(client, binary_data, media_label="media"):
last_exception = None last_exception = None
transient_attempts = 0
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1): for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
try: try:
@@ -428,34 +392,46 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
error_text = str(e) error_text = str(e)
is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text
if not is_rate_limited: if is_rate_limited:
logging.warning(f"Could not upload {media_label}: {repr(e)}") backoff_delay = min(
BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)),
BSKY_BLOB_UPLOAD_MAX_DELAY
)
wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay)
if hasattr(e, "response") and e.response is not None: if attempt < BSKY_BLOB_UPLOAD_MAX_RETRIES:
try: logging.warning(
logging.warning(f"Upload response status: {e.response.status_code}") f"⏳ Bluesky blob upload rate-limited for {media_label}. "
logging.warning(f"Upload response body: {e.response.text}") f"Retry {attempt}/{BSKY_BLOB_UPLOAD_MAX_RETRIES} after {wait_seconds}s."
except Exception: )
pass time.sleep(wait_seconds)
continue
else:
logging.warning(
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
)
break
return None if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES:
transient_attempts += 1
backoff_delay = min( wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)),
BSKY_BLOB_UPLOAD_MAX_DELAY
)
wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay)
if attempt < BSKY_BLOB_UPLOAD_MAX_RETRIES:
logging.warning( logging.warning(
f"Bluesky blob upload rate-limited for {media_label}. " f"Transient blob upload failure for {media_label}: {repr(e)}. "
f"Retry {attempt}/{BSKY_BLOB_UPLOAD_MAX_RETRIES} after {wait_seconds}s." f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
) )
time.sleep(wait_seconds) time.sleep(wait_seconds)
else: continue
logging.warning(
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}" logging.warning(f"Could not upload {media_label}: {repr(e)}")
)
if hasattr(e, "response") and e.response is not None:
try:
logging.warning(f"Upload response status: {e.response.status_code}")
logging.warning(f"Upload response body: {e.response.text}")
except Exception:
pass
return None
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}") logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
return None return None
@@ -517,10 +493,6 @@ def get_blob_from_file(file_path, client):
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES): def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
"""
Compress/resize an image to fit external thumbnail blob size limits.
Returns JPEG bytes or None.
"""
try: try:
with Image.open(io.BytesIO(image_bytes)) as img: with Image.open(io.BytesIO(image_bytes)) as img:
img = img.convert("RGB") img = img.convert("RGB")
@@ -577,11 +549,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
def get_external_thumb_blob_from_url(image_url, client, http_client): def get_external_thumb_blob_from_url(image_url, client, http_client):
"""
Download, size-check, compress if needed, and upload an external-card thumbnail blob.
If the image cannot fit within the PDS blob limit, return None so the external card
can still be posted without a thumbnail.
"""
try: try:
r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True) r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
if r.status_code != 200: if r.status_code != 200:
@@ -652,10 +619,6 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"): def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
If the thumbnail image is too large, omit the thumbnail but still return the link card.
"""
link_metadata = fetch_link_metadata(url, http_client) link_metadata = fetch_link_metadata(url, http_client)
thumb_blob = None thumb_blob = None
@@ -1519,6 +1482,9 @@ def sync_feeds(args):
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text) ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
canonical_non_x_urls = set(ordered_non_x_urls) canonical_non_x_urls = set(ordered_non_x_urls)
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
candidate_tweets.append({ candidate_tweets.append({
"tweet": tweet, "tweet": tweet,
"tweet_time": tweet_time, "tweet_time": tweet_time,
@@ -1530,6 +1496,8 @@ def sync_feeds(args):
"canonical_non_x_urls": canonical_non_x_urls, "canonical_non_x_urls": canonical_non_x_urls,
"ordered_non_x_urls": ordered_non_x_urls, "ordered_non_x_urls": ordered_non_x_urls,
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text), "looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
"has_video": has_video,
"has_photo": has_photo,
}) })
except Exception as e: except Exception as e:
@@ -1593,55 +1561,67 @@ def sync_feeds(args):
external_embed = None external_embed = None
media_upload_failures = [] media_upload_failures = []
if tweet.media: has_video = candidate.get("has_video", False)
for media in tweet.media:
if media.type == "photo":
blob = get_blob_from_url(media.media_url_https, bsky_client, media_http_client)
if blob:
image_embeds.append(
models.AppBskyEmbedImages.Image(
alt=dynamic_alt,
image=blob
)
)
else:
media_upload_failures.append(f"photo:{media.media_url_https}")
elif media.type == "video": # --- VIDEO-FIRST POLICY ---
if not tweet.tweet_url: # If the tweet contains video, try video first and do not degrade to photos
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.") # from the same tweet if video processing/upload fails.
media_upload_failures.append("video:no_tweet_url") if has_video:
continue video_media = next((m for m in (tweet.media or []) if getattr(m, "type", None) == "video"), None)
if video_media:
if not tweet.tweet_url:
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.")
media_upload_failures.append("video:no_tweet_url")
else:
temp_video_path = "temp_video.mp4" temp_video_path = "temp_video.mp4"
try: try:
real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url) real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url)
if not real_video_url: if not real_video_url:
logging.warning(f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}") logging.warning(f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}")
media_upload_failures.append(f"video:resolve_failed:{tweet.tweet_url}") media_upload_failures.append(f"video:resolve_failed:{tweet.tweet_url}")
continue else:
cropped_video_path = download_and_crop_video(real_video_url, temp_video_path)
cropped_video_path = download_and_crop_video(real_video_url, temp_video_path) if not cropped_video_path:
if not cropped_video_path: logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}")
logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}") media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}")
media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}") else:
continue video_blob = get_blob_from_file(cropped_video_path, bsky_client)
if not video_blob:
video_blob = get_blob_from_file(cropped_video_path, bsky_client) logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}")
if not video_blob: media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}")
logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}") else:
media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}") video_embed = build_video_embed(video_blob, dynamic_alt)
continue if not video_embed:
media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}")
video_embed = build_video_embed(video_blob, dynamic_alt)
if not video_embed:
media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}")
finally: finally:
if os.path.exists(temp_video_path): if os.path.exists(temp_video_path):
os.remove(temp_video_path) os.remove(temp_video_path)
# Important: if tweet had video, do NOT upload photos as fallback.
if not video_embed:
logging.warning(
"⚠️ Tweet contains video, but video could not be posted. "
"Skipping photo fallback for this tweet."
)
else:
# Photo-only tweets can post images normally.
if tweet.media:
for media in tweet.media:
if media.type == "photo":
blob = get_blob_from_url(media.media_url_https, bsky_client, media_http_client)
if blob:
image_embeds.append(
models.AppBskyEmbedImages.Image(
alt=dynamic_alt,
image=blob
)
)
else:
media_upload_failures.append(f"photo:{media.media_url_https}")
# If nothing media-based is available, optionally degrade to external card / text-only
if not video_embed and not image_embeds: if not video_embed and not image_embeds:
candidate_url = None candidate_url = None