diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index f80f4d9..993e76a 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -28,8 +28,6 @@ BSKY_TEXT_MAX_LENGTH = 275 VIDEO_MAX_DURATION_SECONDS = 179 MAX_VIDEO_UPLOAD_SIZE_MB = 45 -# External-card thumbnail constraints: -# Conservative safe target below the observed PDS max (~976.56 KB). EXTERNAL_THUMB_MAX_BYTES = 950 * 1024 EXTERNAL_THUMB_MAX_DIMENSION = 1200 EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40 @@ -41,6 +39,10 @@ MEDIA_DOWNLOAD_TIMEOUT = 30 LINK_METADATA_TIMEOUT = 10 DEFAULT_BSKY_BASE_URL = "https://bsky.social" +# Extra timeout retry tuning for transient blob upload failures +BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3 +BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 + # --- Logging Setup --- logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", @@ -88,20 +90,6 @@ def strip_trailing_url_punctuation(url): def repair_broken_urls(text): - """ - Repair URLs that were split by copied/scraped line breaks. - - Examples: - https:// - 3cat.cat/path - becomes: - https://3cat.cat/path - - https://3cat.cat/some-pa - th/article - becomes: - https://3cat.cat/some-path/article - """ if not text: return text @@ -133,33 +121,6 @@ def repair_broken_urls(text): def repair_broken_mentions(text): - """ - Repair mention-related line wrapping in scraped text. - - Handles cases like: - Ho explica - @martamartorell - - La - @sanfenerea - tenia un repte - - Hospital - @parctauli - . - - conjunt - @bomberscat - -SEM. - - becoming: - Ho explica @martamartorell - La @sanfenerea tenia un repte - Hospital @parctauli . - conjunt @bomberscat -SEM. - - while preserving real paragraph breaks and standalone mention lines. - """ if not text: return text @@ -183,7 +144,6 @@ def repair_broken_mentions(text): i += 1 continue - # If current line is only a mention, try to attach it backward. 
if is_mention_only_line(current): if result and result[-1].strip(): result[-1] = result[-1].rstrip() + " " + stripped @@ -193,8 +153,6 @@ def repair_broken_mentions(text): i += 1 - # Attach immediately following continuation lines if they are not blank - # and not another standalone mention. while i < len(lines): next_line = lines[i] next_stripped = next_line.strip() @@ -214,7 +172,6 @@ def repair_broken_mentions(text): continue - # If current line has text and next line is a mention, merge them. if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]): merged = stripped + " " + lines[i + 1].strip() changed = True @@ -252,10 +209,6 @@ def repair_broken_mentions(text): def strip_line_edge_whitespace(text): - """ - Remove leading/trailing whitespace from each line while preserving - the line structure and intentional blank lines. - """ if not text: return text @@ -278,12 +231,6 @@ def strip_line_edge_whitespace(text): def remove_trailing_ellipsis_line(text): - """ - Remove trailing lines that are only ellipsis markers. - Handles: - - ... 
def is_transient_blob_error(error_obj):
    """Return True when a blob-upload failure looks transient and worth retrying.

    An error is treated as transient when any of the following holds:

    * it carries a ``response`` attribute whose ``status_code`` is 502, 503
      or 504;
    * its ``repr`` contains one of the known timeout / connection exception
      names;
    * its ``repr`` contains 502, 503 or 504 as a standalone number.

    Args:
        error_obj: The exception (or any object) raised by the upload call.

    Returns:
        bool: True if the failure matches a transient signal, else False.
    """
    # Local import keeps this block self-contained regardless of what the
    # module already imports at file level.
    import re

    gateway_status_codes = (502, 503, 504)

    # Prefer the structured status code when the exception exposes one.
    response = getattr(error_obj, "response", None)
    if getattr(response, "status_code", None) in gateway_status_codes:
        return True

    error_text = repr(error_obj)

    transient_markers = (
        "InvokeTimeoutError",
        "ReadTimeout",
        "WriteTimeout",
        "TimeoutException",
        "RemoteProtocolError",
        "ConnectError",
    )
    if any(marker in error_text for marker in transient_markers):
        return True

    # Match the status codes only when they are not embedded in a longer
    # digit run; a plain substring test would flag unrelated numbers such as
    # byte counts or ids (e.g. "1250345") as a 503.
    return re.search(r"(?<!\d)50[234](?!\d)", error_text) is not None
+ ) + time.sleep(wait_seconds) + continue + else: + logging.warning( + f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}" + ) + break - return None - - backoff_delay = min( - BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)), - BSKY_BLOB_UPLOAD_MAX_DELAY - ) - wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay) - - if attempt < BSKY_BLOB_UPLOAD_MAX_RETRIES: + if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES: + transient_attempts += 1 + wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts logging.warning( - f"⏳ Bluesky blob upload rate-limited for {media_label}. " - f"Retry {attempt}/{BSKY_BLOB_UPLOAD_MAX_RETRIES} after {wait_seconds}s." + f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. " + f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s." ) time.sleep(wait_seconds) - else: - logging.warning( - f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}" - ) + continue + + logging.warning(f"Could not upload {media_label}: {repr(e)}") + + if hasattr(e, "response") and e.response is not None: + try: + logging.warning(f"Upload response status: {e.response.status_code}") + logging.warning(f"Upload response body: {e.response.text}") + except Exception: + pass + + return None logging.warning(f"Could not upload {media_label}: {repr(last_exception)}") return None @@ -517,10 +493,6 @@ def get_blob_from_file(file_path, client): def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES): - """ - Compress/resize an image to fit external thumbnail blob size limits. - Returns JPEG bytes or None. 
- """ try: with Image.open(io.BytesIO(image_bytes)) as img: img = img.convert("RGB") @@ -577,11 +549,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B def get_external_thumb_blob_from_url(image_url, client, http_client): - """ - Download, size-check, compress if needed, and upload an external-card thumbnail blob. - If the image cannot fit within the PDS blob limit, return None so the external card - can still be posted without a thumbnail. - """ try: r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True) if r.status_code != 200: @@ -652,10 +619,6 @@ def fetch_link_metadata(url, http_client): def build_external_link_embed(url, client, http_client, fallback_title="Link"): - """ - Build a Bluesky external embed from a URL. - If the thumbnail image is too large, omit the thumbnail but still return the link card. - """ link_metadata = fetch_link_metadata(url, http_client) thumb_blob = None @@ -1519,6 +1482,9 @@ def sync_feeds(args): ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text) canonical_non_x_urls = set(ordered_non_x_urls) + has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or [])) + has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or [])) + candidate_tweets.append({ "tweet": tweet, "tweet_time": tweet_time, @@ -1530,6 +1496,8 @@ def sync_feeds(args): "canonical_non_x_urls": canonical_non_x_urls, "ordered_non_x_urls": ordered_non_x_urls, "looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text), + "has_video": has_video, + "has_photo": has_photo, }) except Exception as e: @@ -1593,55 +1561,67 @@ def sync_feeds(args): external_embed = None media_upload_failures = [] - if tweet.media: - for media in tweet.media: - if media.type == "photo": - blob = get_blob_from_url(media.media_url_https, bsky_client, media_http_client) - if blob: - image_embeds.append( - models.AppBskyEmbedImages.Image( - alt=dynamic_alt, - image=blob - ) 
- ) - else: - media_upload_failures.append(f"photo:{media.media_url_https}") + has_video = candidate.get("has_video", False) - elif media.type == "video": - if not tweet.tweet_url: - logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.") - media_upload_failures.append("video:no_tweet_url") - continue + # --- VIDEO-FIRST POLICY --- + # If the tweet contains video, try video first and do not degrade to photos + # from the same tweet if video processing/upload fails. + if has_video: + video_media = next((m for m in (tweet.media or []) if getattr(m, "type", None) == "video"), None) + if video_media: + if not tweet.tweet_url: + logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.") + media_upload_failures.append("video:no_tweet_url") + else: temp_video_path = "temp_video.mp4" - try: real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url) if not real_video_url: logging.warning(f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}") media_upload_failures.append(f"video:resolve_failed:{tweet.tweet_url}") - continue - - cropped_video_path = download_and_crop_video(real_video_url, temp_video_path) - if not cropped_video_path: - logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}") - media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}") - continue - - video_blob = get_blob_from_file(cropped_video_path, bsky_client) - if not video_blob: - logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}") - media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}") - continue - - video_embed = build_video_embed(video_blob, dynamic_alt) - if not video_embed: - media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}") - + else: + cropped_video_path = download_and_crop_video(real_video_url, temp_video_path) + if not cropped_video_path: + logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}") + 
media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}") + else: + video_blob = get_blob_from_file(cropped_video_path, bsky_client) + if not video_blob: + logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}") + media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}") + else: + video_embed = build_video_embed(video_blob, dynamic_alt) + if not video_embed: + media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}") finally: if os.path.exists(temp_video_path): os.remove(temp_video_path) + # Important: if tweet had video, do NOT upload photos as fallback. + if not video_embed: + logging.warning( + "⚠️ Tweet contains video, but video could not be posted. " + "Skipping photo fallback for this tweet." + ) + + else: + # Photo-only tweets can post images normally. + if tweet.media: + for media in tweet.media: + if media.type == "photo": + blob = get_blob_from_url(media.media_url_https, bsky_client, media_http_client) + if blob: + image_embeds.append( + models.AppBskyEmbedImages.Image( + alt=dynamic_alt, + image=blob + ) + ) + else: + media_upload_failures.append(f"photo:{media.media_url_https}") + + # If nothing media-based is available, optionally degrade to external card / text-only if not video_embed and not image_embeds: candidate_url = None