Improving t.co resolving URLs
This commit is contained in:
@@ -44,7 +44,8 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
|||||||
|
|
||||||
MEDIA_DOWNLOAD_TIMEOUT = 30
|
MEDIA_DOWNLOAD_TIMEOUT = 30
|
||||||
LINK_METADATA_TIMEOUT = 10
|
LINK_METADATA_TIMEOUT = 10
|
||||||
URL_RESOLVE_TIMEOUT = 10
|
URL_RESOLVE_TIMEOUT = 12
|
||||||
|
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
|
||||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
@@ -301,6 +302,12 @@ def is_tco_domain(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_external_non_x_url(url):
    """
    Tell whether `url` points somewhere other than t.co or X/Twitter.

    Returns False for an empty/None `url`; otherwise True only when neither
    is_tco_domain() nor is_x_or_twitter_domain() recognises it.
    """
    # The t.co check runs first and short-circuits the X/Twitter check.
    return bool(url) and not (is_tco_domain(url) or is_x_or_twitter_domain(url))
|
||||||
|
|
||||||
|
|
||||||
def extract_urls_from_text(text):
|
def extract_urls_from_text(text):
|
||||||
if not text:
|
if not text:
|
||||||
return []
|
return []
|
||||||
@@ -309,10 +316,102 @@ def extract_urls_from_text(text):
|
|||||||
return re.findall(r"https?://[^\s#]+", repaired)
|
return re.findall(r"https?://[^\s#]+", repaired)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_tco_with_httpx(url, http_client):
    """
    Follow HTTP redirects from `url` using the supplied client.

    Issues a GET with redirect-following enabled and returns the
    canonicalized URL the response ended up on. If the request raises, or
    the canonicalized landing URL is empty, the canonicalized input `url`
    is returned instead. Request failures are logged as warnings and never
    propagate to the caller.
    """
    try:
        reply = http_client.get(
            url,
            follow_redirects=True,
            timeout=URL_RESOLVE_TIMEOUT,
        )
        landed_on = canonicalize_url(str(reply.url))
        if landed_on:
            logging.info(f"🔗 Resolved t.co with httpx: {url} -> {landed_on}")
            return landed_on
    except Exception as exc:
        logging.warning(f"⚠️ httpx t.co resolution failed for {url}: {repr(exc)}")

    # Nothing usable came back: hand the (canonicalized) input through.
    return canonicalize_url(url)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_tco_with_playwright(url):
    """
    Browser-based fallback for t.co links that do not yield a usable
    final external URL via httpx.

    Launches headless Chromium, navigates to `url`, and polls the page URL
    until is_external_non_x_url() accepts it or the polling budget runs out.

    Returns the canonicalized URL the page ended on (which may still be a
    t.co / X URL if no external redirect happened). If anything fails, the
    failure is logged, an error screenshot is attempted, and the
    canonicalized input `url` is returned. Never raises.
    """
    logging.info(f"🌐 Resolving t.co with Playwright: {url}")

    try:
        with sync_playwright() as p:
            browser = None
            context = None
            page = None
            # Error handling and cleanup must happen INSIDE the `with` block:
            # once sync_playwright() exits, the driver is stopped and neither
            # screenshots nor close() calls can reach the browser any more.
            try:
                browser = p.chromium.launch(
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )
                context = browser.new_context(
                    user_agent=(
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/145.0.7632.6 Safari/537.36"
                    ),
                    viewport={"width": 1280, "height": 900},
                )
                page = context.new_page()

                final_url = _poll_page_for_external_url(page, url)
                logging.info(f"🌐 Playwright final URL for {url}: {final_url}")
                return final_url

            except Exception as e:
                logging.warning(f"⚠️ Playwright t.co resolution failed for {url}: {repr(e)}")
                if page is not None:
                    try:
                        take_error_screenshot(page, "tco_resolve_failed")
                    except Exception as screenshot_error:
                        # Best-effort diagnostics only; never mask the fallback.
                        logging.debug(
                            f"Could not capture t.co failure screenshot: {repr(screenshot_error)}"
                        )
            finally:
                # Innermost resource first, while the driver is still alive.
                _close_playwright_resource(page, "page")
                _close_playwright_resource(context, "context")
                _close_playwright_resource(browser, "browser")

    except Exception as e:
        # Reached when the Playwright driver itself fails to start or stop.
        logging.warning(f"⚠️ Playwright t.co resolution failed for {url}: {repr(e)}")

    return canonicalize_url(url)


def _poll_page_for_external_url(page, url):
    """Navigate `page` to `url` and poll page.url until it is external or the budget ends."""
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS)
    except Exception as e:
        # A failed/timed-out goto is not fatal: the page may still have been
        # redirected, so keep going and inspect where it ended up.
        logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}")

    # page.wait_for_timeout() is used instead of time.sleep(): the Playwright
    # sync API documentation warns that time.sleep() prevents pending browser
    # events from being processed, which can leave page.url outdated.
    page.wait_for_timeout(3000)
    final_url = canonicalize_url(page.url)

    for _ in range(6):
        if final_url and is_external_non_x_url(final_url):
            break

        try:
            page.wait_for_load_state("networkidle", timeout=3000)
        except Exception:
            # Busy pages may never go idle; the URL check below still applies.
            pass

        page.wait_for_timeout(1000)
        final_url = canonicalize_url(page.url)

    return final_url


def _close_playwright_resource(resource, label):
    """Close a Playwright page/context/browser if it was created, logging any error."""
    if resource is None:
        return
    try:
        resource.close()
    except Exception as e:
        logging.debug(f"Ignoring error while closing Playwright {label}: {repr(e)}")
|
||||||
|
|
||||||
|
|
||||||
def resolve_url_if_needed(url, http_client):
|
def resolve_url_if_needed(url, http_client):
|
||||||
"""
|
"""
|
||||||
Resolve redirecting URLs such as t.co to their final destination.
|
Resolve redirecting URLs such as t.co to their final destination.
|
||||||
Keep X/Twitter status URLs if they resolve there.
|
Uses httpx first, then Playwright fallback if still unresolved or
|
||||||
|
still trapped on t.co/X.
|
||||||
"""
|
"""
|
||||||
if not url:
|
if not url:
|
||||||
return None
|
return None
|
||||||
@@ -324,17 +423,17 @@ def resolve_url_if_needed(url, http_client):
|
|||||||
if not is_tco_domain(cleaned):
|
if not is_tco_domain(cleaned):
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
try:
|
resolved_http = resolve_tco_with_httpx(cleaned, http_client)
|
||||||
response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
|
if is_external_non_x_url(resolved_http):
|
||||||
final_url = str(response.url)
|
return resolved_http
|
||||||
final_url = canonicalize_url(final_url)
|
|
||||||
|
|
||||||
if final_url:
|
resolved_browser = resolve_tco_with_playwright(cleaned)
|
||||||
logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}")
|
if is_external_non_x_url(resolved_browser):
|
||||||
return final_url
|
logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}")
|
||||||
|
return resolved_browser
|
||||||
|
|
||||||
except Exception as e:
|
if resolved_http and not is_tco_domain(resolved_http):
|
||||||
logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}")
|
return resolved_http
|
||||||
|
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
@@ -348,7 +447,7 @@ def extract_non_x_urls_from_text(text):
|
|||||||
if not cleaned:
|
if not cleaned:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Keep t.co here for later resolution; do not discard it early.
|
# Keep t.co for later resolution.
|
||||||
if is_tco_domain(cleaned):
|
if is_tco_domain(cleaned):
|
||||||
result.append(cleaned)
|
result.append(cleaned)
|
||||||
continue
|
continue
|
||||||
@@ -390,10 +489,8 @@ def extract_first_resolved_external_url(text, http_client):
|
|||||||
if not resolved:
|
if not resolved:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if is_tco_domain(resolved):
|
if is_external_non_x_url(resolved):
|
||||||
continue
|
logging.info(f"✅ Selected resolved external URL for card: {resolved}")
|
||||||
|
|
||||||
if not is_x_or_twitter_domain(resolved):
|
|
||||||
return resolved
|
return resolved
|
||||||
|
|
||||||
return None
|
return None
|
||||||
@@ -1394,7 +1491,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
logging.info(f"👤 Entering username: {username}...")
|
logging.info(f"👤 Entering username: {username}...")
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
username_input = page.locator('input[autocomplete="username"]')
|
username_input = page.locator('input[autocomplete="username"]').first
|
||||||
username_input.wait_for(state="visible", timeout=15000)
|
username_input.wait_for(state="visible", timeout=15000)
|
||||||
username_input.click(force=True)
|
username_input.click(force=True)
|
||||||
username_input.press_sequentially(username, delay=100)
|
username_input.press_sequentially(username, delay=100)
|
||||||
@@ -1809,7 +1906,6 @@ def sync_feeds(args):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
||||||
|
|
||||||
resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)
|
resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)
|
||||||
|
|
||||||
canonical_non_x_urls = set()
|
canonical_non_x_urls = set()
|
||||||
|
|||||||
Reference in New Issue
Block a user