fix(sync): prefer video posts over image fallback and retry transient blob upload failures
This commit is contained in:
@@ -28,8 +28,6 @@ BSKY_TEXT_MAX_LENGTH = 275
|
|||||||
VIDEO_MAX_DURATION_SECONDS = 179
|
VIDEO_MAX_DURATION_SECONDS = 179
|
||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
# External-card thumbnail constraints:
|
|
||||||
# Conservative safe target below the observed PDS max (~976.56 KB).
|
|
||||||
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
||||||
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
||||||
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
||||||
@@ -41,6 +39,10 @@ MEDIA_DOWNLOAD_TIMEOUT = 30
|
|||||||
LINK_METADATA_TIMEOUT = 10
|
LINK_METADATA_TIMEOUT = 10
|
||||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||||
|
|
||||||
|
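# Retry tuning for transient blob upload failures (timeouts, connection errors, 502/503/504): max extra retries and base delay in seconds, multiplied by the transient attempt number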
|
||||||
|
BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
|
||||||
|
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
@@ -88,20 +90,6 @@ def strip_trailing_url_punctuation(url):
|
|||||||
|
|
||||||
|
|
||||||
def repair_broken_urls(text):
|
def repair_broken_urls(text):
|
||||||
"""
|
|
||||||
Repair URLs that were split by copied/scraped line breaks.
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
https://
|
|
||||||
3cat.cat/path
|
|
||||||
becomes:
|
|
||||||
https://3cat.cat/path
|
|
||||||
|
|
||||||
https://3cat.cat/some-pa
|
|
||||||
th/article
|
|
||||||
becomes:
|
|
||||||
https://3cat.cat/some-path/article
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -133,33 +121,6 @@ def repair_broken_urls(text):
|
|||||||
|
|
||||||
|
|
||||||
def repair_broken_mentions(text):
|
def repair_broken_mentions(text):
|
||||||
"""
|
|
||||||
Repair mention-related line wrapping in scraped text.
|
|
||||||
|
|
||||||
Handles cases like:
|
|
||||||
Ho explica
|
|
||||||
@martamartorell
|
|
||||||
|
|
||||||
La
|
|
||||||
@sanfenerea
|
|
||||||
tenia un repte
|
|
||||||
|
|
||||||
Hospital
|
|
||||||
@parctauli
|
|
||||||
.
|
|
||||||
|
|
||||||
conjunt
|
|
||||||
@bomberscat
|
|
||||||
-SEM.
|
|
||||||
|
|
||||||
becoming:
|
|
||||||
Ho explica @martamartorell
|
|
||||||
La @sanfenerea tenia un repte
|
|
||||||
Hospital @parctauli .
|
|
||||||
conjunt @bomberscat -SEM.
|
|
||||||
|
|
||||||
while preserving real paragraph breaks and standalone mention lines.
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -183,7 +144,6 @@ def repair_broken_mentions(text):
|
|||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If current line is only a mention, try to attach it backward.
|
|
||||||
if is_mention_only_line(current):
|
if is_mention_only_line(current):
|
||||||
if result and result[-1].strip():
|
if result and result[-1].strip():
|
||||||
result[-1] = result[-1].rstrip() + " " + stripped
|
result[-1] = result[-1].rstrip() + " " + stripped
|
||||||
@@ -193,8 +153,6 @@ def repair_broken_mentions(text):
|
|||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
# Attach immediately following continuation lines if they are not blank
|
|
||||||
# and not another standalone mention.
|
|
||||||
while i < len(lines):
|
while i < len(lines):
|
||||||
next_line = lines[i]
|
next_line = lines[i]
|
||||||
next_stripped = next_line.strip()
|
next_stripped = next_line.strip()
|
||||||
@@ -214,7 +172,6 @@ def repair_broken_mentions(text):
|
|||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If current line has text and next line is a mention, merge them.
|
|
||||||
if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
|
if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
|
||||||
merged = stripped + " " + lines[i + 1].strip()
|
merged = stripped + " " + lines[i + 1].strip()
|
||||||
changed = True
|
changed = True
|
||||||
@@ -252,10 +209,6 @@ def repair_broken_mentions(text):
|
|||||||
|
|
||||||
|
|
||||||
def strip_line_edge_whitespace(text):
|
def strip_line_edge_whitespace(text):
|
||||||
"""
|
|
||||||
Remove leading/trailing whitespace from each line while preserving
|
|
||||||
the line structure and intentional blank lines.
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -278,12 +231,6 @@ def strip_line_edge_whitespace(text):
|
|||||||
|
|
||||||
|
|
||||||
def remove_trailing_ellipsis_line(text):
|
def remove_trailing_ellipsis_line(text):
|
||||||
"""
|
|
||||||
Remove trailing lines that are only ellipsis markers.
|
|
||||||
Handles:
|
|
||||||
- ...
|
|
||||||
- …
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -415,8 +362,25 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
|||||||
return default_delay
|
return default_delay
|
||||||
|
|
||||||
|
|
||||||
|
def is_transient_blob_error(error_obj):
    """
    Return True when a blob upload error looks transient and worth retrying.

    An error is treated as transient when any of the following holds:
      - it carries a ``response`` whose ``status_code`` is 502, 503 or 504;
      - its ``repr()`` names a known timeout/connection exception;
      - its ``repr()`` contains 502, 503 or 504 as a standalone number.

    Args:
        error_obj: The exception (or any object) raised by the upload call.

    Returns:
        bool: True if the failure looks transient, False otherwise.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import section.
    import re

    transient_status_codes = (502, 503, 504)

    # Prefer the structured status code when the error exposes a response.
    response = getattr(error_obj, "response", None)
    if getattr(response, "status_code", None) in transient_status_codes:
        return True

    error_text = repr(error_obj)
    transient_signals = (
        "InvokeTimeoutError",
        "ReadTimeout",
        "WriteTimeout",
        "TimeoutException",
        "RemoteProtocolError",
        "ConnectError",
    )
    if any(signal in error_text for signal in transient_signals):
        return True

    # Match status codes only when they are not part of a longer number, so
    # unrelated digits (e.g. "15030 bytes") do not trigger a retry.
    return re.search(r"(?<!\d)(?:502|503|504)(?!\d)", error_text) is not None
|
||||||
|
|
||||||
|
|
||||||
def upload_blob_with_retry(client, binary_data, media_label="media"):
|
def upload_blob_with_retry(client, binary_data, media_label="media"):
|
||||||
last_exception = None
|
last_exception = None
|
||||||
|
transient_attempts = 0
|
||||||
|
|
||||||
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
|
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
|
||||||
try:
|
try:
|
||||||
@@ -428,18 +392,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
error_text = str(e)
|
error_text = str(e)
|
||||||
is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text
|
is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text
|
||||||
|
|
||||||
if not is_rate_limited:
|
if is_rate_limited:
|
||||||
logging.warning(f"Could not upload {media_label}: {repr(e)}")
|
|
||||||
|
|
||||||
if hasattr(e, "response") and e.response is not None:
|
|
||||||
try:
|
|
||||||
logging.warning(f"Upload response status: {e.response.status_code}")
|
|
||||||
logging.warning(f"Upload response body: {e.response.text}")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
backoff_delay = min(
|
backoff_delay = min(
|
||||||
BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)),
|
BSKY_BLOB_UPLOAD_BASE_DELAY * (2 ** (attempt - 1)),
|
||||||
BSKY_BLOB_UPLOAD_MAX_DELAY
|
BSKY_BLOB_UPLOAD_MAX_DELAY
|
||||||
@@ -452,10 +405,33 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
f"Retry {attempt}/{BSKY_BLOB_UPLOAD_MAX_RETRIES} after {wait_seconds}s."
|
f"Retry {attempt}/{BSKY_BLOB_UPLOAD_MAX_RETRIES} after {wait_seconds}s."
|
||||||
)
|
)
|
||||||
time.sleep(wait_seconds)
|
time.sleep(wait_seconds)
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
|
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
|
||||||
)
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES:
|
||||||
|
transient_attempts += 1
|
||||||
|
wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
|
||||||
|
logging.warning(
|
||||||
|
f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. "
|
||||||
|
f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
|
||||||
|
)
|
||||||
|
time.sleep(wait_seconds)
|
||||||
|
continue
|
||||||
|
|
||||||
|
logging.warning(f"Could not upload {media_label}: {repr(e)}")
|
||||||
|
|
||||||
|
if hasattr(e, "response") and e.response is not None:
|
||||||
|
try:
|
||||||
|
logging.warning(f"Upload response status: {e.response.status_code}")
|
||||||
|
logging.warning(f"Upload response body: {e.response.text}")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
|
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
|
||||||
return None
|
return None
|
||||||
@@ -517,10 +493,6 @@ def get_blob_from_file(file_path, client):
|
|||||||
|
|
||||||
|
|
||||||
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
|
def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_BYTES):
|
||||||
"""
|
|
||||||
Compress/resize an image to fit external thumbnail blob size limits.
|
|
||||||
Returns JPEG bytes or None.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||||
img = img.convert("RGB")
|
img = img.convert("RGB")
|
||||||
@@ -577,11 +549,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
|
|
||||||
|
|
||||||
def get_external_thumb_blob_from_url(image_url, client, http_client):
|
def get_external_thumb_blob_from_url(image_url, client, http_client):
|
||||||
"""
|
|
||||||
Download, size-check, compress if needed, and upload an external-card thumbnail blob.
|
|
||||||
If the image cannot fit within the PDS blob limit, return None so the external card
|
|
||||||
can still be posted without a thumbnail.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
|
r = http_client.get(image_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
@@ -652,10 +619,6 @@ def fetch_link_metadata(url, http_client):
|
|||||||
|
|
||||||
|
|
||||||
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||||
"""
|
|
||||||
Build a Bluesky external embed from a URL.
|
|
||||||
If the thumbnail image is too large, omit the thumbnail but still return the link card.
|
|
||||||
"""
|
|
||||||
link_metadata = fetch_link_metadata(url, http_client)
|
link_metadata = fetch_link_metadata(url, http_client)
|
||||||
|
|
||||||
thumb_blob = None
|
thumb_blob = None
|
||||||
@@ -1519,6 +1482,9 @@ def sync_feeds(args):
|
|||||||
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
|
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
|
||||||
canonical_non_x_urls = set(ordered_non_x_urls)
|
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||||
|
|
||||||
|
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||||
|
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||||
|
|
||||||
candidate_tweets.append({
|
candidate_tweets.append({
|
||||||
"tweet": tweet,
|
"tweet": tweet,
|
||||||
"tweet_time": tweet_time,
|
"tweet_time": tweet_time,
|
||||||
@@ -1530,6 +1496,8 @@ def sync_feeds(args):
|
|||||||
"canonical_non_x_urls": canonical_non_x_urls,
|
"canonical_non_x_urls": canonical_non_x_urls,
|
||||||
"ordered_non_x_urls": ordered_non_x_urls,
|
"ordered_non_x_urls": ordered_non_x_urls,
|
||||||
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
|
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
|
||||||
|
"has_video": has_video,
|
||||||
|
"has_photo": has_photo,
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -1593,6 +1561,52 @@ def sync_feeds(args):
|
|||||||
external_embed = None
|
external_embed = None
|
||||||
media_upload_failures = []
|
media_upload_failures = []
|
||||||
|
|
||||||
|
has_video = candidate.get("has_video", False)
|
||||||
|
|
||||||
|
# --- VIDEO-FIRST POLICY ---
|
||||||
|
# If the tweet contains video, try video first and do not degrade to photos
|
||||||
|
# from the same tweet if video processing/upload fails.
|
||||||
|
if has_video:
|
||||||
|
video_media = next((m for m in (tweet.media or []) if getattr(m, "type", None) == "video"), None)
|
||||||
|
|
||||||
|
if video_media:
|
||||||
|
if not tweet.tweet_url:
|
||||||
|
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.")
|
||||||
|
media_upload_failures.append("video:no_tweet_url")
|
||||||
|
else:
|
||||||
|
temp_video_path = "temp_video.mp4"
|
||||||
|
try:
|
||||||
|
real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url)
|
||||||
|
if not real_video_url:
|
||||||
|
logging.warning(f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}")
|
||||||
|
media_upload_failures.append(f"video:resolve_failed:{tweet.tweet_url}")
|
||||||
|
else:
|
||||||
|
cropped_video_path = download_and_crop_video(real_video_url, temp_video_path)
|
||||||
|
if not cropped_video_path:
|
||||||
|
logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}")
|
||||||
|
media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}")
|
||||||
|
else:
|
||||||
|
video_blob = get_blob_from_file(cropped_video_path, bsky_client)
|
||||||
|
if not video_blob:
|
||||||
|
logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}")
|
||||||
|
media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}")
|
||||||
|
else:
|
||||||
|
video_embed = build_video_embed(video_blob, dynamic_alt)
|
||||||
|
if not video_embed:
|
||||||
|
media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}")
|
||||||
|
finally:
|
||||||
|
if os.path.exists(temp_video_path):
|
||||||
|
os.remove(temp_video_path)
|
||||||
|
|
||||||
|
# Important: if tweet had video, do NOT upload photos as fallback.
|
||||||
|
if not video_embed:
|
||||||
|
logging.warning(
|
||||||
|
"⚠️ Tweet contains video, but video could not be posted. "
|
||||||
|
"Skipping photo fallback for this tweet."
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Photo-only tweets can post images normally.
|
||||||
if tweet.media:
|
if tweet.media:
|
||||||
for media in tweet.media:
|
for media in tweet.media:
|
||||||
if media.type == "photo":
|
if media.type == "photo":
|
||||||
@@ -1607,41 +1621,7 @@ def sync_feeds(args):
|
|||||||
else:
|
else:
|
||||||
media_upload_failures.append(f"photo:{media.media_url_https}")
|
media_upload_failures.append(f"photo:{media.media_url_https}")
|
||||||
|
|
||||||
elif media.type == "video":
|
# If nothing media-based is available, optionally degrade to external card / text-only
|
||||||
if not tweet.tweet_url:
|
|
||||||
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.")
|
|
||||||
media_upload_failures.append("video:no_tweet_url")
|
|
||||||
continue
|
|
||||||
|
|
||||||
temp_video_path = "temp_video.mp4"
|
|
||||||
|
|
||||||
try:
|
|
||||||
real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url)
|
|
||||||
if not real_video_url:
|
|
||||||
logging.warning(f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}")
|
|
||||||
media_upload_failures.append(f"video:resolve_failed:{tweet.tweet_url}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
cropped_video_path = download_and_crop_video(real_video_url, temp_video_path)
|
|
||||||
if not cropped_video_path:
|
|
||||||
logging.warning(f"⚠️ Video download/crop failed for {tweet.tweet_url}")
|
|
||||||
media_upload_failures.append(f"video:crop_failed:{tweet.tweet_url}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
video_blob = get_blob_from_file(cropped_video_path, bsky_client)
|
|
||||||
if not video_blob:
|
|
||||||
logging.warning(f"⚠️ Video upload blob failed for {tweet.tweet_url}")
|
|
||||||
media_upload_failures.append(f"video:upload_failed:{tweet.tweet_url}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
video_embed = build_video_embed(video_blob, dynamic_alt)
|
|
||||||
if not video_embed:
|
|
||||||
media_upload_failures.append(f"video:embed_failed:{tweet.tweet_url}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
if os.path.exists(temp_video_path):
|
|
||||||
os.remove(temp_video_path)
|
|
||||||
|
|
||||||
if not video_embed and not image_embeds:
|
if not video_embed and not image_embeds:
|
||||||
candidate_url = None
|
candidate_url = None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user