Improve t.co URL resolution

2026-04-13 10:14:15 +00:00
parent 07bfd5e2d5
commit f6c6be8fab
1 changed files with 115 additions and 19 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -44,7 +44,8 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15

 MEDIA_DOWNLOAD_TIMEOUT = 30
 LINK_METADATA_TIMEOUT = 10
-URL_RESOLVE_TIMEOUT = 10
+URL_RESOLVE_TIMEOUT = 12
+PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
 DEFAULT_BSKY_BASE_URL = "https://bsky.social"

 # --- Logging Setup ---
@@ -301,6 +302,12 @@ def is_tco_domain(url):
        return False


+def is_external_non_x_url(url):
+    """Return True for a non-empty URL that is neither a t.co nor an X/Twitter link."""
+    internal_or_missing = (not url) or is_tco_domain(url) or is_x_or_twitter_domain(url)
+    return not internal_or_missing
+
+
 def extract_urls_from_text(text):
    if not text:
        return []
@@ -309,10 +316,102 @@ def extract_urls_from_text(text):
    return re.findall(r"https?://[^\s#]+", repaired)


+def resolve_tco_with_httpx(url, http_client):
+    """Follow redirects via httpx; return the landing URL, else the canonicalized input."""
+    final_url = None
+    try:
+        reply = http_client.get(url, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
+        final_url = canonicalize_url(str(reply.url))
+        if final_url:
+            logging.info(f"🔗 Resolved t.co with httpx: {url} -> {final_url}")
+    except Exception as e:
+        logging.warning(f"⚠️ httpx t.co resolution failed for {url}: {repr(e)}")
+    return final_url or canonicalize_url(url)
+
+
+def resolve_tco_with_playwright(url):
+    """
+    Browser-based fallback for t.co links that do not yield a usable
+    final external URL via httpx.
+
+    Opens ``url`` in headless Chromium and polls ``page.url`` until it is an
+    external (non-t.co, non-X) URL or the polling attempts are used up.
+
+    Returns the canonicalized last-seen browser URL (possibly still t.co/X), or
+    ``canonicalize_url(url)`` when the browser path raises or yields no URL.
+    """
+    try:
+        logging.info(f"🌐 Resolving t.co with Playwright: {url}")
+
+        with sync_playwright() as p:
+            browser = p.chromium.launch(
+                headless=True,
+                args=["--disable-blink-features=AutomationControlled"]
+            )
+            page = None
+            try:
+                context = browser.new_context(
+                    user_agent=(
+                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/145.0.7632.6 Safari/537.36"
+                    ),
+                    viewport={"width": 1280, "height": 900}
+                )
+                page = context.new_page()
+
+                try:
+                    page.goto(url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS)
+                except Exception as e:
+                    # Deliberately non-fatal: page.url is still inspected below.
+                    logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}")
+
+                time.sleep(3)
+
+                final_url = canonicalize_url(page.url)
+
+                # Re-check page.url up to 6 times, giving late navigations time to finish.
+                for _ in range(6):
+                    if final_url and is_external_non_x_url(final_url):
+                        break
+
+                    try:
+                        page.wait_for_load_state("networkidle", timeout=3000)
+                    except Exception:
+                        pass  # a network-idle timeout is fine; just poll again
+
+                    time.sleep(1)
+                    final_url = canonicalize_url(page.url)
+
+                logging.info(f"🌐 Playwright final URL for {url}: {final_url}")
+                if final_url:
+                    return final_url
+            except Exception:
+                # Capture the screenshot while the page is still alive: leaving the
+                # ``with`` block stops Playwright, after which the page is unusable.
+                if page:
+                    try:
+                        take_error_screenshot(page, "tco_resolve_failed")
+                    except Exception:
+                        pass
+                raise
+            finally:
+                try:
+                    browser.close()  # also closes the browser's contexts and pages
+                except Exception:
+                    pass
+
+    except Exception as e:
+        logging.warning(f"⚠️ Playwright t.co resolution failed for {url}: {repr(e)}")
+
+    return canonicalize_url(url)
+
+
 def resolve_url_if_needed(url, http_client):
    """
    Resolve redirecting URLs such as t.co to their final destination.
-    Keep X/Twitter status URLs if they resolve there.
+    Uses httpx first, then Playwright fallback if still unresolved or
+    still trapped on t.co/X.
    """
    if not url:
        return None
@@ -324,17 +423,17 @@ def resolve_url_if_needed(url, http_client):
    if not is_tco_domain(cleaned):
        return cleaned

-    try:
-        response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
-        final_url = str(response.url)
-        final_url = canonicalize_url(final_url)
+    resolved_http = resolve_tco_with_httpx(cleaned, http_client)
+    if is_external_non_x_url(resolved_http):
+        return resolved_http

-        if final_url:
-            logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}")
-            return final_url
+    resolved_browser = resolve_tco_with_playwright(cleaned)
+    if is_external_non_x_url(resolved_browser):
+        logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}")
+        return resolved_browser

-    except Exception as e:
-        logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}")
+    if resolved_http and not is_tco_domain(resolved_http):
+        return resolved_http

    return cleaned

@@ -348,7 +447,7 @@ def extract_non_x_urls_from_text(text):
        if not cleaned:
            continue

-        # Keep t.co here for later resolution; do not discard it early.
+        # Keep t.co for later resolution.
        if is_tco_domain(cleaned):
            result.append(cleaned)
            continue
@@ -390,10 +489,8 @@ def extract_first_resolved_external_url(text, http_client):
        if not resolved:
            continue

-        if is_tco_domain(resolved):
-            continue
-
-        if not is_x_or_twitter_domain(resolved):
+        if is_external_non_x_url(resolved):
+            logging.info(f"✅ Selected resolved external URL for card: {resolved}")
            return resolved

    return None
@@ -1394,7 +1491,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                logging.info(f"👤 Entering username: {username}...")
                time.sleep(1)

-                username_input = page.locator('input[autocomplete="username"]')
+                username_input = page.locator('input[autocomplete="username"]').first
                username_input.wait_for(state="visible", timeout=15000)
                username_input.click(force=True)
                username_input.press_sequentially(username, delay=100)
@@ -1809,7 +1906,6 @@ def sync_feeds(args):
                        continue

                    ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
-
                    resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)

                    canonical_non_x_urls = set()
@@ -2100,4 +2196,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()