fix(sync): resolve t.co links to final external URLs for Bluesky external cards

2026-04-13 09:41:34 +02:00
parent eb7b629be3
commit 3f50e7d786
1 changed files with 134 additions and 59 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -28,12 +28,10 @@ BSKY_TEXT_MAX_LENGTH = 275
 VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45

-# Tweet image upload safety limits
 BSKY_IMAGE_MAX_BYTES = 950 * 1024
 BSKY_IMAGE_MAX_DIMENSION = 2000
 BSKY_IMAGE_MIN_JPEG_QUALITY = 45

-# External card thumbnail limits
 EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
 EXTERNAL_THUMB_MAX_DIMENSION = 1200
 EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
@@ -46,6 +44,7 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15

 MEDIA_DOWNLOAD_TIMEOUT = 30
 LINK_METADATA_TIMEOUT = 10
+URL_RESOLVE_TIMEOUT = 10
 DEFAULT_BSKY_BASE_URL = "https://bsky.social"

 # --- Logging Setup ---
@@ -289,7 +288,15 @@ def canonicalize_tweet_url(url):
 def is_x_or_twitter_domain(url):
    try:
        hostname = (urlparse(url).hostname or "").lower()
-        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com", "t.co"}
+        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
+    except Exception:
+        return False
+
+
+def is_tco_domain(url):
+    try:
+        hostname = (urlparse(url).hostname or "").lower()
+        return hostname == "t.co"
    except Exception:
        return False

@@ -302,13 +309,51 @@ def extract_urls_from_text(text):
    return re.findall(r"https?://[^\s#]+", repaired)


+def resolve_url_if_needed(url, http_client):
+    """
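+    Resolve t.co short links to their final destination; any other URL is returned canonicalized but unresolved.
+    The final URL is returned even if it is an X/Twitter URL; if resolution fails, the canonicalized t.co URL is returned.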
+    """
+    if not url:
+        return None
+
+    cleaned = canonicalize_url(url)
+    if not cleaned:
+        return None
+
+    if not is_tco_domain(cleaned):
+        return cleaned
+
+    try:
+        response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
+        final_url = str(response.url)
+        final_url = canonicalize_url(final_url)
+
+        if final_url:
+            logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}")
+            return final_url
+
+    except Exception as e:
+        logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}")
+
+    return cleaned
+
+
 def extract_non_x_urls_from_text(text):
    urls = extract_urls_from_text(text)
    result = []

    for url in urls:
        cleaned = strip_trailing_url_punctuation(url)
-        if cleaned and not is_x_or_twitter_domain(cleaned):
+        if not cleaned:
+            continue
+
+        # Keep t.co here for later resolution; do not discard it early.
+        if is_tco_domain(cleaned):
+            result.append(cleaned)
+            continue
+
+        if not is_x_or_twitter_domain(cleaned):
            result.append(cleaned)

    return result
@@ -335,6 +380,25 @@ def extract_first_visible_non_x_url(text):
    return None


+def extract_first_resolved_external_url(text, http_client):
+    """
+    Walk the candidate URLs in order, resolving t.co links as needed,
+    and return the first whose final URL is neither t.co nor an X/Twitter URL (None if there is none).
+    """
+    for url in extract_non_x_urls_from_text(text or ""):
+        resolved = resolve_url_if_needed(url, http_client)
+        if not resolved:
+            continue
+
+        if is_tco_domain(resolved):
+            continue
+
+        if not is_x_or_twitter_domain(resolved):
+            return resolved
+
+    return None
+
+
 def remove_url_from_visible_text(text, url_to_remove):
    if not text or not url_to_remove:
        return text
@@ -508,7 +572,6 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
    if not text:
        return text

-    # Golden rule: preserve exact original cleaned tweet text if it fits.
    if len(text) <= BSKY_TEXT_MAX_LENGTH:
        logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
        return text
@@ -693,6 +756,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
        "text_media_key": candidate["text_media_key"],
        "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
        "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
+        "resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
        "bsky_uri": bsky_uri,
        "tweet_created_on": candidate["tweet"].created_on,
        "tweet_url": candidate["tweet"].tweet_url,
@@ -796,7 +860,7 @@ def get_recent_bsky_posts(client, handle, limit=30):

                canonical_non_x_urls = set()
                for url in urls:
-                    if not is_x_or_twitter_domain(url):
+                    if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
                        canonical = canonicalize_url(url)
                        if canonical:
                            canonical_non_x_urls.add(canonical)
@@ -1728,66 +1792,77 @@ def sync_feeds(args):

        candidate_tweets = []

-        for tweet in reversed(tweets):
-            try:
-                tweet_time = arrow.get(tweet.created_on)
+        with httpx.Client() as resolve_http_client:
+            for tweet in reversed(tweets):
+                try:
+                    tweet_time = arrow.get(tweet.created_on)

-                if tweet_time < too_old_cutoff:
-                    logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
-                    continue
+                    if tweet_time < too_old_cutoff:
+                        logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
+                        continue

-                full_clean_text = clean_post_text(tweet.text)
-                normalized_text = normalize_post_text(full_clean_text)
+                    full_clean_text = clean_post_text(tweet.text)
+                    normalized_text = normalize_post_text(full_clean_text)

-                if not normalized_text:
-                    logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
-                    continue
+                    if not normalized_text:
+                        logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
+                        continue

-                ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
-                canonical_non_x_urls = set(ordered_non_x_urls)
+                    ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)

-                primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
-                if not primary_non_x_url and ordered_non_x_urls:
-                    primary_non_x_url = ordered_non_x_urls[0]
+                    resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)

-                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
-                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
+                    canonical_non_x_urls = set()
+                    if resolved_primary_external_url:
+                        canonical_non_x_urls.add(canonicalize_url(resolved_primary_external_url))

-                if primary_non_x_url and not has_video and not has_photo:
-                    raw_text = choose_final_visible_text(
-                        full_clean_text,
-                        primary_non_x_url=primary_non_x_url,
-                        prefer_full_text_without_url=True,
-                    )
-                else:
-                    raw_text = choose_final_visible_text(
-                        full_clean_text,
-                        primary_non_x_url=primary_non_x_url,
-                        prefer_full_text_without_url=False,
-                    )
+                    for raw_url in ordered_non_x_urls:
+                        if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url):
+                            canonical_non_x_urls.add(canonicalize_url(raw_url))

-                media_fingerprint = build_media_fingerprint(tweet)
-                text_media_key = build_text_media_key(normalized_text, media_fingerprint)
+                    primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
+                    if not primary_non_x_url and ordered_non_x_urls:
+                        primary_non_x_url = ordered_non_x_urls[0]

-                candidate_tweets.append({
-                    "tweet": tweet,
-                    "tweet_time": tweet_time,
-                    "raw_text": raw_text,
-                    "full_clean_text": full_clean_text,
-                    "normalized_text": normalized_text,
-                    "media_fingerprint": media_fingerprint,
-                    "text_media_key": text_media_key,
-                    "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
-                    "canonical_non_x_urls": canonical_non_x_urls,
-                    "ordered_non_x_urls": ordered_non_x_urls,
-                    "primary_non_x_url": primary_non_x_url,
-                    "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
-                    "has_video": has_video,
-                    "has_photo": has_photo,
-                })
+                    has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
+                    has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))

-            except Exception as e:
-                logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
+                    if primary_non_x_url and not has_video and not has_photo:
+                        raw_text = choose_final_visible_text(
+                            full_clean_text,
+                            primary_non_x_url=primary_non_x_url,
+                            prefer_full_text_without_url=True,
+                        )
+                    else:
+                        raw_text = choose_final_visible_text(
+                            full_clean_text,
+                            primary_non_x_url=primary_non_x_url,
+                            prefer_full_text_without_url=False,
+                        )
+
+                    media_fingerprint = build_media_fingerprint(tweet)
+                    text_media_key = build_text_media_key(normalized_text, media_fingerprint)
+
+                    candidate_tweets.append({
+                        "tweet": tweet,
+                        "tweet_time": tweet_time,
+                        "raw_text": raw_text,
+                        "full_clean_text": full_clean_text,
+                        "normalized_text": normalized_text,
+                        "media_fingerprint": media_fingerprint,
+                        "text_media_key": text_media_key,
+                        "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
+                        "canonical_non_x_urls": canonical_non_x_urls,
+                        "ordered_non_x_urls": ordered_non_x_urls,
+                        "primary_non_x_url": primary_non_x_url,
+                        "resolved_primary_external_url": resolved_primary_external_url,
+                        "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
+                        "has_video": has_video,
+                        "has_photo": has_photo,
+                    })
+
+                except Exception as e:
+                    logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")

        logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for duplicate comparison.")

@@ -1904,13 +1979,13 @@ def sync_feeds(args):
                                    media_upload_failures.append(f"photo:{media.media_url_https}")

                if not video_embed and not image_embeds:
-                    candidate_url = candidate.get("primary_non_x_url")
+                    candidate_url = candidate.get("resolved_primary_external_url")

                    if candidate_url:
                        if candidate.get("looks_like_title_plus_url"):
-                            logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
+                            logging.info(f"🔗 Detected title+URL post style. Using resolved URL for external card: {candidate_url}")
                        else:
-                            logging.info(f"🔗 Using first non-X URL for external card: {candidate_url}")
+                            logging.info(f"🔗 Using resolved first external URL for external card: {candidate_url}")

                        external_embed = build_external_link_embed(
                            candidate_url,