fix(sync): resolve t.co links to final external URLs for Bluesky external cards
This commit is contained in:
@@ -28,12 +28,10 @@ BSKY_TEXT_MAX_LENGTH = 275
|
||||
VIDEO_MAX_DURATION_SECONDS = 179
|
||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||
|
||||
# Tweet image upload safety limits
|
||||
BSKY_IMAGE_MAX_BYTES = 950 * 1024
|
||||
BSKY_IMAGE_MAX_DIMENSION = 2000
|
||||
BSKY_IMAGE_MIN_JPEG_QUALITY = 45
|
||||
|
||||
# External card thumbnail limits
|
||||
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
||||
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
||||
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
||||
@@ -46,6 +44,7 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
||||
|
||||
MEDIA_DOWNLOAD_TIMEOUT = 30
|
||||
LINK_METADATA_TIMEOUT = 10
|
||||
URL_RESOLVE_TIMEOUT = 10
|
||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||
|
||||
# --- Logging Setup ---
|
||||
@@ -289,7 +288,15 @@ def canonicalize_tweet_url(url):
|
||||
def is_x_or_twitter_domain(url):
    """
    Return True when the URL's hostname is an X/Twitter site domain.

    The t.co shortener is deliberately not part of this set: it is detected
    separately by is_tco_domain() so that short links can be resolved to
    their final destination instead of being discarded as X-internal links.

    Any failure while parsing the URL is treated as "not an X/Twitter URL"
    and yields False.
    """
    try:
        hostname = (urlparse(url).hostname or "").lower()
        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
    except Exception:
        # Best-effort classification: an unparsable URL is simply not matched.
        return False
|
||||
|
||||
|
||||
def is_tco_domain(url):
    """
    Return True when the URL's hostname is exactly "t.co" (case-insensitive).

    Subdomains and other hosts do not match. A URL without a hostname, or
    one that cannot be parsed, yields False.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return False
        return host.lower() == "t.co"
    except Exception:
        return False
|
||||
|
||||
@@ -302,13 +309,51 @@ def extract_urls_from_text(text):
|
||||
return re.findall(r"https?://[^\s#]+", repaired)
|
||||
|
||||
|
||||
def resolve_url_if_needed(url, http_client):
    """
    Canonicalize a URL and, for t.co short links, follow redirects to the target.

    Args:
        url: The URL to process. Falsy values produce None.
        http_client: Object whose get(url, timeout=..., follow_redirects=...)
            returns a response exposing a .url attribute (the sync loop in
            this file passes an httpx.Client).

    Returns:
        None when the input is empty or cannot be canonicalized; the
        canonical URL unchanged when it is not a t.co link; otherwise the
        canonicalized final URL after redirects. If the request fails or the
        final URL canonicalizes to nothing, the canonical t.co URL is
        returned as a fallback.

    No domain filtering happens here: a short link that lands on X/Twitter
    is returned as that X/Twitter URL.
    """
    cleaned = canonicalize_url(url) if url else None
    if not cleaned:
        return None

    if is_tco_domain(cleaned):
        try:
            response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
            final_url = canonicalize_url(str(response.url))
            if final_url:
                logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}")
                return final_url
        except Exception as e:
            # Resolution is best-effort; fall through to the unresolved URL.
            logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}")

    return cleaned
|
||||
|
||||
|
||||
def extract_non_x_urls_from_text(text):
    """
    Collect, in order of appearance, the URLs in text that are not X/Twitter links.

    Each URL found by extract_urls_from_text() has trailing punctuation
    stripped; entries that become empty are skipped. t.co short links are
    always kept so they can be resolved later, and every other URL is kept
    only when it is not on an X/Twitter domain.

    Returns:
        A list of cleaned URL strings (possibly empty).
    """
    result = []

    for url in extract_urls_from_text(text):
        cleaned = strip_trailing_url_punctuation(url)
        if not cleaned:
            continue

        # Keep t.co here for later resolution; do not discard it early.
        # The explicit t.co test comes first so the outcome does not depend
        # on whether is_x_or_twitter_domain() also matches t.co.
        if is_tco_domain(cleaned) or not is_x_or_twitter_domain(cleaned):
            result.append(cleaned)

    return result
|
||||
@@ -335,6 +380,25 @@ def extract_first_visible_non_x_url(text):
|
||||
return None
|
||||
|
||||
|
||||
def extract_first_resolved_external_url(text, http_client):
    """
    Return the first URL in text that resolves to an external (non-X) link.

    Candidates come from extract_non_x_urls_from_text() and are tried in
    order; each one is passed through resolve_url_if_needed(). A candidate is
    skipped when it resolves to nothing, is still a t.co link after
    resolution, or lands on an X/Twitter domain.

    Args:
        text: Text to scan; None is treated as an empty string.
        http_client: Client forwarded to resolve_url_if_needed().

    Returns:
        The first qualifying resolved URL, or None when no candidate qualifies.
    """
    candidates = extract_non_x_urls_from_text(text or "")

    for candidate in candidates:
        resolved = resolve_url_if_needed(candidate, http_client)
        if resolved and not is_tco_domain(resolved) and not is_x_or_twitter_domain(resolved):
            return resolved

    return None
|
||||
|
||||
|
||||
def remove_url_from_visible_text(text, url_to_remove):
|
||||
if not text or not url_to_remove:
|
||||
return text
|
||||
@@ -508,7 +572,6 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
|
||||
if not text:
|
||||
return text
|
||||
|
||||
# Golden rule: preserve exact original cleaned tweet text if it fits.
|
||||
if len(text) <= BSKY_TEXT_MAX_LENGTH:
|
||||
logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
|
||||
return text
|
||||
@@ -693,6 +756,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
|
||||
"text_media_key": candidate["text_media_key"],
|
||||
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
|
||||
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
|
||||
"resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
|
||||
"bsky_uri": bsky_uri,
|
||||
"tweet_created_on": candidate["tweet"].created_on,
|
||||
"tweet_url": candidate["tweet"].tweet_url,
|
||||
@@ -796,7 +860,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
|
||||
canonical_non_x_urls = set()
|
||||
for url in urls:
|
||||
if not is_x_or_twitter_domain(url):
|
||||
if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
|
||||
canonical = canonicalize_url(url)
|
||||
if canonical:
|
||||
canonical_non_x_urls.add(canonical)
|
||||
@@ -1728,66 +1792,77 @@ def sync_feeds(args):
|
||||
|
||||
candidate_tweets = []
|
||||
|
||||
for tweet in reversed(tweets):
|
||||
try:
|
||||
tweet_time = arrow.get(tweet.created_on)
|
||||
with httpx.Client() as resolve_http_client:
|
||||
for tweet in reversed(tweets):
|
||||
try:
|
||||
tweet_time = arrow.get(tweet.created_on)
|
||||
|
||||
if tweet_time < too_old_cutoff:
|
||||
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||
continue
|
||||
if tweet_time < too_old_cutoff:
|
||||
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||
continue
|
||||
|
||||
full_clean_text = clean_post_text(tweet.text)
|
||||
normalized_text = normalize_post_text(full_clean_text)
|
||||
full_clean_text = clean_post_text(tweet.text)
|
||||
normalized_text = normalize_post_text(full_clean_text)
|
||||
|
||||
if not normalized_text:
|
||||
logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
|
||||
continue
|
||||
if not normalized_text:
|
||||
logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
|
||||
continue
|
||||
|
||||
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
||||
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
||||
|
||||
primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
|
||||
if not primary_non_x_url and ordered_non_x_urls:
|
||||
primary_non_x_url = ordered_non_x_urls[0]
|
||||
resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)
|
||||
|
||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||
canonical_non_x_urls = set()
|
||||
if resolved_primary_external_url:
|
||||
canonical_non_x_urls.add(canonicalize_url(resolved_primary_external_url))
|
||||
|
||||
if primary_non_x_url and not has_video and not has_photo:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=primary_non_x_url,
|
||||
prefer_full_text_without_url=True,
|
||||
)
|
||||
else:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=primary_non_x_url,
|
||||
prefer_full_text_without_url=False,
|
||||
)
|
||||
for raw_url in ordered_non_x_urls:
|
||||
if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url):
|
||||
canonical_non_x_urls.add(canonicalize_url(raw_url))
|
||||
|
||||
media_fingerprint = build_media_fingerprint(tweet)
|
||||
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
||||
primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
|
||||
if not primary_non_x_url and ordered_non_x_urls:
|
||||
primary_non_x_url = ordered_non_x_urls[0]
|
||||
|
||||
candidate_tweets.append({
|
||||
"tweet": tweet,
|
||||
"tweet_time": tweet_time,
|
||||
"raw_text": raw_text,
|
||||
"full_clean_text": full_clean_text,
|
||||
"normalized_text": normalized_text,
|
||||
"media_fingerprint": media_fingerprint,
|
||||
"text_media_key": text_media_key,
|
||||
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
|
||||
"canonical_non_x_urls": canonical_non_x_urls,
|
||||
"ordered_non_x_urls": ordered_non_x_urls,
|
||||
"primary_non_x_url": primary_non_x_url,
|
||||
"looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
|
||||
"has_video": has_video,
|
||||
"has_photo": has_photo,
|
||||
})
|
||||
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
|
||||
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
|
||||
if primary_non_x_url and not has_video and not has_photo:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=primary_non_x_url,
|
||||
prefer_full_text_without_url=True,
|
||||
)
|
||||
else:
|
||||
raw_text = choose_final_visible_text(
|
||||
full_clean_text,
|
||||
primary_non_x_url=primary_non_x_url,
|
||||
prefer_full_text_without_url=False,
|
||||
)
|
||||
|
||||
media_fingerprint = build_media_fingerprint(tweet)
|
||||
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
||||
|
||||
candidate_tweets.append({
|
||||
"tweet": tweet,
|
||||
"tweet_time": tweet_time,
|
||||
"raw_text": raw_text,
|
||||
"full_clean_text": full_clean_text,
|
||||
"normalized_text": normalized_text,
|
||||
"media_fingerprint": media_fingerprint,
|
||||
"text_media_key": text_media_key,
|
||||
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
|
||||
"canonical_non_x_urls": canonical_non_x_urls,
|
||||
"ordered_non_x_urls": ordered_non_x_urls,
|
||||
"primary_non_x_url": primary_non_x_url,
|
||||
"resolved_primary_external_url": resolved_primary_external_url,
|
||||
"looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
|
||||
"has_video": has_video,
|
||||
"has_photo": has_photo,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
|
||||
|
||||
logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for duplicate comparison.")
|
||||
|
||||
@@ -1904,13 +1979,13 @@ def sync_feeds(args):
|
||||
media_upload_failures.append(f"photo:{media.media_url_https}")
|
||||
|
||||
if not video_embed and not image_embeds:
|
||||
candidate_url = candidate.get("primary_non_x_url")
|
||||
candidate_url = candidate.get("resolved_primary_external_url")
|
||||
|
||||
if candidate_url:
|
||||
if candidate.get("looks_like_title_plus_url"):
|
||||
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
|
||||
logging.info(f"🔗 Detected title+URL post style. Using resolved URL for external card: {candidate_url}")
|
||||
else:
|
||||
logging.info(f"🔗 Using first non-X URL for external card: {candidate_url}")
|
||||
logging.info(f"🔗 Using resolved first external URL for external card: {candidate_url}")
|
||||
|
||||
external_embed = build_external_link_embed(
|
||||
candidate_url,
|
||||
|
||||
Reference in New Issue
Block a user