diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 39096d3..d464317 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -44,7 +44,8 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 MEDIA_DOWNLOAD_TIMEOUT = 30 LINK_METADATA_TIMEOUT = 10 -URL_RESOLVE_TIMEOUT = 10 +URL_RESOLVE_TIMEOUT = 12 +PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000 DEFAULT_BSKY_BASE_URL = "https://bsky.social" # --- Logging Setup --- @@ -301,6 +302,12 @@ def is_tco_domain(url): return False +def is_external_non_x_url(url): + if not url: + return False + return (not is_tco_domain(url)) and (not is_x_or_twitter_domain(url)) + + def extract_urls_from_text(text): if not text: return [] @@ -309,10 +316,102 @@ def extract_urls_from_text(text): return re.findall(r"https?://[^\s#]+", repaired) +def resolve_tco_with_httpx(url, http_client): + try: + response = http_client.get(url, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True) + final_url = canonicalize_url(str(response.url)) + if final_url: + logging.info(f"🔗 Resolved t.co with httpx: {url} -> {final_url}") + return final_url + except Exception as e: + logging.warning(f"⚠️ httpx t.co resolution failed for {url}: {repr(e)}") + + return canonicalize_url(url) + + +def resolve_tco_with_playwright(url): + """ + Browser-based fallback for t.co links that do not yield a usable + final external URL via httpx. + """ + browser = None + context = None + page = None + + try: + logging.info(f"🌐 Resolving t.co with Playwright: {url}") + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled"] + ) + context = browser.new_context( + user_agent=( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/145.0.7632.6 Safari/537.36" + ), + viewport={"width": 1280, "height": 900} + ) + page = context.new_page() + + try: + page.goto(url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS) + except Exception as e: + logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}") + + time.sleep(3) + + final_url = canonicalize_url(page.url) + + for _ in range(6): + if final_url and is_external_non_x_url(final_url): + break + + try: + page.wait_for_load_state("networkidle", timeout=3000) + except Exception: + pass + + time.sleep(1) + final_url = canonicalize_url(page.url) + + logging.info(f"🌐 Playwright final URL for {url}: {final_url}") + return final_url + + except Exception as e: + logging.warning(f"⚠️ Playwright t.co resolution failed for {url}: {repr(e)}") + try: + if page: + take_error_screenshot(page, "tco_resolve_failed") + except Exception: + pass + finally: + try: + if page: + page.close() + except Exception: + pass + try: + if context: + context.close() + except Exception: + pass + try: + if browser: + browser.close() + except Exception: + pass + + return canonicalize_url(url) + + def resolve_url_if_needed(url, http_client): """ Resolve redirecting URLs such as t.co to their final destination. - Keep X/Twitter status URLs if they resolve there. + Uses httpx first, then Playwright fallback if still unresolved or + still trapped on t.co/X. """ if not url: return None @@ -324,17 +423,17 @@ def resolve_url_if_needed(url, http_client): if not is_tco_domain(cleaned): return cleaned - try: - response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True) - final_url = str(response.url) - final_url = canonicalize_url(final_url) + resolved_http = resolve_tco_with_httpx(cleaned, http_client) + if is_external_non_x_url(resolved_http): + return resolved_http - if final_url: - logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}") - return final_url + resolved_browser = resolve_tco_with_playwright(cleaned) + if is_external_non_x_url(resolved_browser): + logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}") + return resolved_browser - except Exception as e: - logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}") + if resolved_http and not is_tco_domain(resolved_http): + return resolved_http return cleaned @@ -348,7 +447,7 @@ def extract_non_x_urls_from_text(text): if not cleaned: continue - # Keep t.co here for later resolution; do not discard it early. + # Keep t.co for later resolution. if is_tco_domain(cleaned): result.append(cleaned) continue @@ -390,10 +489,8 @@ def extract_first_resolved_external_url(text, http_client): if not resolved: continue - if is_tco_domain(resolved): - continue - - if not is_x_or_twitter_domain(resolved): + if is_external_non_x_url(resolved): + logging.info(f"✅ Selected resolved external URL for card: {resolved}") return resolved return None @@ -1394,7 +1491,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): logging.info(f"👤 Entering username: {username}...") time.sleep(1) - username_input = page.locator('input[autocomplete="username"]') + username_input = page.locator('input[autocomplete="username"]').first username_input.wait_for(state="visible", timeout=15000) username_input.click(force=True) username_input.press_sequentially(username, delay=100) @@ -1809,7 +1906,6 @@ def sync_feeds(args): continue ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text) - resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client) canonical_non_x_urls = set() @@ -2100,4 +2196,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()