The bot was doing expensive URL-resolution and og:title work for tweets that it later discarded as already posted.

This commit is contained in:
Guillem Hernandez Sola
2026-04-13 18:37:39 +02:00
parent 455a4198a2
commit 351eec4840

View File

@@ -57,6 +57,10 @@ logging.basicConfig(
level=logging.INFO, level=logging.INFO,
) )
# --- Per-run caches for efficiency ---
OG_TITLE_CACHE = {}
URL_RESOLUTION_CACHE = {}
# --- Custom Classes --- # --- Custom Classes ---
class ScrapedMedia: class ScrapedMedia:
@@ -399,7 +403,36 @@ def extract_quoted_text_from_og_title(og_title):
return None return None
def should_fetch_og_title(tweet):
    """
    Decide whether fetching the tweet's og:title is worth a browser round-trip.

    og:title extraction is expensive (Playwright page load), so it is only
    done when the scraped text looks incomplete: empty, truncated, very
    short, or still containing unresolved t.co links.

    Args:
        tweet: scraped tweet object exposing at least a ``text`` attribute.

    Returns:
        bool: True when fetching og:title is likely to improve the text.
    """
    text = clean_post_text(tweet.text or "")
    urls = extract_urls_from_text(text)
    if not text:
        # No scraped text at all — og:title is the only possible source.
        return True
    if any(is_tco_domain(normalize_urlish_token(u) or u) for u in urls):
        # Unresolved t.co links suggest the scraped text is a shortened form.
        return True
    # BUGFIX: the original condition was `"" in text`, which is vacuously
    # True for every string, so this function always returned True and the
    # cost-saving gate never skipped anything. The intended check is
    # presumably a truncation marker — the Unicode ellipsis that X appends
    # to cut-off tweets. TODO(review): confirm the exact marker character
    # against real scraper output.
    if "…" in text or text.endswith("..."):
        return True
    if len(text) < 35:
        # Suspiciously short text is likely a truncated preview.
        return True
    return False
def fetch_tweet_og_title_text(tweet_url): def fetch_tweet_og_title_text(tweet_url):
if not tweet_url:
return None
if tweet_url in OG_TITLE_CACHE:
logging.info(f"⚡ Using cached og:title text for {tweet_url}")
return OG_TITLE_CACHE[tweet_url]
browser = None browser = None
context = None context = None
page = None page = None
@@ -424,7 +457,7 @@ def fetch_tweet_og_title_text(tweet_url):
page.goto(tweet_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS) page.goto(tweet_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS)
try: try:
page.wait_for_selector('meta[property="og:title"]', timeout=10000) page.wait_for_selector('meta[property="og:title"]', timeout=7000)
except Exception: except Exception:
pass pass
@@ -432,10 +465,13 @@ def fetch_tweet_og_title_text(tweet_url):
extracted = extract_quoted_text_from_og_title(og_title) extracted = extract_quoted_text_from_og_title(og_title)
if extracted: if extracted:
extracted = clean_post_text(extracted)
OG_TITLE_CACHE[tweet_url] = extracted
logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}") logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}")
return clean_post_text(extracted) return extracted
logging.info(f" No usable og:title text extracted for {tweet_url}") logging.info(f" No usable og:title text extracted for {tweet_url}")
OG_TITLE_CACHE[tweet_url] = None
return None return None
except Exception as e: except Exception as e:
@@ -445,6 +481,7 @@ def fetch_tweet_og_title_text(tweet_url):
take_error_screenshot(page, "tweet_og_title_failed") take_error_screenshot(page, "tweet_og_title_failed")
except Exception: except Exception:
pass pass
OG_TITLE_CACHE[tweet_url] = None
return None return None
finally: finally:
try: try:
@@ -505,19 +542,19 @@ def resolve_tco_with_playwright(url):
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}") logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}")
time.sleep(3) time.sleep(2)
final_url = canonicalize_url(page.url) final_url = canonicalize_url(page.url)
for _ in range(6): for _ in range(4):
if final_url and is_external_non_x_url(final_url): if final_url and is_external_non_x_url(final_url):
break break
try: try:
page.wait_for_load_state("networkidle", timeout=3000) page.wait_for_load_state("networkidle", timeout=2000)
except Exception: except Exception:
pass pass
time.sleep(1) time.sleep(0.8)
final_url = canonicalize_url(page.url) final_url = canonicalize_url(page.url)
logging.info(f"🌐 Playwright final URL for {url}: {final_url}") logging.info(f"🌐 Playwright final URL for {url}: {final_url}")
@@ -550,7 +587,7 @@ def resolve_tco_with_playwright(url):
return canonicalize_url(url) return canonicalize_url(url)
def resolve_url_if_needed(url, http_client): def resolve_url_if_needed(url, http_client, allow_playwright_fallback=True):
if not url: if not url:
return None return None
@@ -559,21 +596,34 @@ def resolve_url_if_needed(url, http_client):
if not cleaned: if not cleaned:
return None return None
if cleaned in URL_RESOLUTION_CACHE:
logging.info(f"⚡ Using cached URL resolution: {cleaned} -> {URL_RESOLUTION_CACHE[cleaned]}")
return URL_RESOLUTION_CACHE[cleaned]
if not is_tco_domain(cleaned): if not is_tco_domain(cleaned):
URL_RESOLUTION_CACHE[cleaned] = cleaned
return cleaned return cleaned
resolved_http = resolve_tco_with_httpx(cleaned, http_client) resolved_http = resolve_tco_with_httpx(cleaned, http_client)
if is_external_non_x_url(resolved_http): if is_external_non_x_url(resolved_http):
URL_RESOLUTION_CACHE[cleaned] = resolved_http
return resolved_http
if not allow_playwright_fallback:
URL_RESOLUTION_CACHE[cleaned] = resolved_http
return resolved_http return resolved_http
resolved_browser = resolve_tco_with_playwright(cleaned) resolved_browser = resolve_tco_with_playwright(cleaned)
if is_external_non_x_url(resolved_browser): if is_external_non_x_url(resolved_browser):
logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}") logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}")
URL_RESOLUTION_CACHE[cleaned] = resolved_browser
return resolved_browser return resolved_browser
if resolved_http and not is_tco_domain(resolved_http): if resolved_http and not is_tco_domain(resolved_http):
URL_RESOLUTION_CACHE[cleaned] = resolved_http
return resolved_http return resolved_http
URL_RESOLUTION_CACHE[cleaned] = cleaned
return cleaned return cleaned
@@ -618,9 +668,9 @@ def extract_first_visible_non_x_url(text):
return None return None
def extract_first_resolved_external_url(text, http_client): def extract_first_resolved_external_url(text, http_client, allow_playwright_fallback=True):
for url in extract_non_x_urls_from_text(text or ""): for url in extract_non_x_urls_from_text(text or ""):
resolved = resolve_url_if_needed(url, http_client) resolved = resolve_url_if_needed(url, http_client, allow_playwright_fallback=allow_playwright_fallback)
if not resolved: if not resolved:
continue continue
@@ -631,7 +681,13 @@ def extract_first_resolved_external_url(text, http_client):
return None return None
def sanitize_visible_urls_in_text(text, http_client): def sanitize_visible_urls_in_text(text, http_client, has_media=False):
"""
Faster logic:
- remove x/twitter URLs from visible text
- resolve t.co
- if a t.co resolves to x/twitter and tweet has media, do not use Playwright fallback
"""
if not text: if not text:
return text, None return text, None
@@ -659,9 +715,20 @@ def sanitize_visible_urls_in_text(text, http_client):
final_url = cleaned final_url = cleaned
if is_tco_domain(cleaned): if is_tco_domain(cleaned):
resolved = resolve_url_if_needed(cleaned, http_client) resolved_http_first = resolve_tco_with_httpx(cleaned, http_client)
if resolved:
final_url = resolved if is_external_non_x_url(resolved_http_first):
final_url = resolved_http_first
URL_RESOLUTION_CACHE[cleaned] = final_url
else:
if has_media and resolved_http_first and is_x_or_twitter_domain(resolved_http_first):
final_url = resolved_http_first
URL_RESOLUTION_CACHE[cleaned] = final_url
logging.info(
f"⚡ Skipping Playwright t.co fallback because tweet has media and httpx already resolved to X/Twitter URL: {final_url}"
)
else:
final_url = resolve_url_if_needed(cleaned, http_client, allow_playwright_fallback=True)
if is_x_or_twitter_domain(final_url): if is_x_or_twitter_domain(final_url):
replacements[raw_url] = "" replacements[raw_url] = ""
@@ -727,9 +794,10 @@ def sanitize_visible_urls_in_text(text, http_client):
def build_effective_tweet_text(tweet, http_client): def build_effective_tweet_text(tweet, http_client):
scraped_text = clean_post_text(tweet.text or "") scraped_text = clean_post_text(tweet.text or "")
has_media = bool(tweet.media)
og_title_text = None og_title_text = None
if tweet.tweet_url: if should_fetch_og_title(tweet):
og_title_text = fetch_tweet_og_title_text(tweet.tweet_url) og_title_text = fetch_tweet_og_title_text(tweet.tweet_url)
candidate_text = scraped_text candidate_text = scraped_text
@@ -741,11 +809,19 @@ def build_effective_tweet_text(tweet, http_client):
candidate_text = og_title_text candidate_text = og_title_text
logging.info("🧾 Using og:title-derived tweet text as primary content") logging.info("🧾 Using og:title-derived tweet text as primary content")
candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(candidate_text, http_client) candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(
candidate_text,
http_client,
has_media=has_media,
)
candidate_text = clean_post_text(candidate_text) candidate_text = clean_post_text(candidate_text)
if not resolved_primary_external_url: if not resolved_primary_external_url:
resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client) resolved_primary_external_url = extract_first_resolved_external_url(
candidate_text,
http_client,
allow_playwright_fallback=not has_media,
)
return candidate_text, resolved_primary_external_url return candidate_text, resolved_primary_external_url
@@ -1203,6 +1279,8 @@ def get_recent_bsky_posts(client, handle, limit=30):
if getattr(record, "reply", None) is not None: if getattr(record, "reply", None) is not None:
continue continue
# no-op
text = getattr(record, "text", "") or "" text = getattr(record, "text", "") or ""
normalized_text = normalize_post_text(text) normalized_text = normalize_post_text(text)
@@ -1720,7 +1798,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
) )
page = context.new_page() page = context.new_page()
page.goto("https://x.com/home") page.goto("https://x.com/home")
time.sleep(4) time.sleep(3)
if page.locator('[data-testid="SideNav_NewTweet_Button"]').is_visible() or "/home" in page.url: if page.locator('[data-testid="SideNav_NewTweet_Button"]').is_visible() or "/home" in page.url:
logging.info("✅ Session is valid!") logging.info("✅ Session is valid!")
@@ -1793,7 +1871,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
try: try:
page.wait_for_selector("article", timeout=20000) page.wait_for_selector("article", timeout=20000)
time.sleep(3) time.sleep(2)
articles = page.locator("article").all() articles = page.locator("article").all()
logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...") logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...")
@@ -1905,7 +1983,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
try: try:
logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}") logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}")
page.goto(tweet_url, wait_until="domcontentloaded", timeout=30000) page.goto(tweet_url, wait_until="domcontentloaded", timeout=30000)
time.sleep(3) time.sleep(2)
player = page.locator('[data-testid="videoPlayer"]').first player = page.locator('[data-testid="videoPlayer"]').first
@@ -1923,7 +2001,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
else: else:
logging.warning("⚠️ No video player locator found on tweet page") logging.warning("⚠️ No video player locator found on tweet page")
for _ in range(12): for _ in range(8):
if current_best(): if current_best():
break break
time.sleep(1) time.sleep(1)
@@ -1942,7 +2020,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
except Exception: except Exception:
pass pass
for _ in range(8): for _ in range(5):
if current_best(): if current_best():
break break
time.sleep(1) time.sleep(1)
@@ -2143,20 +2221,41 @@ def sync_feeds(args):
candidate_tweets = [] candidate_tweets = []
# --- Cheap prefilter before expensive processing ---
cheap_candidates = []
for tweet in reversed(tweets):
try:
tweet_time = arrow.get(tweet.created_on)
if tweet_time < too_old_cutoff:
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
continue
canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url)
if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}):
logging.info(f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}")
continue
scraped_text = clean_post_text(tweet.text or "")
if not scraped_text and not tweet.media:
logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
continue
cheap_candidates.append((tweet, tweet_time, canonical_tweet_url))
except Exception as e:
logging.warning(f"⚠️ Failed during cheap prefilter: {e}")
logging.info(f"{len(cheap_candidates)} tweets remain after cheap prefilter.")
with httpx.Client() as resolve_http_client: with httpx.Client() as resolve_http_client:
for tweet in reversed(tweets): for tweet, tweet_time, canonical_tweet_url in cheap_candidates:
try: try:
tweet_time = arrow.get(tweet.created_on)
if tweet_time < too_old_cutoff:
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
continue
full_clean_text, resolved_primary_external_url = build_effective_tweet_text(tweet, resolve_http_client) full_clean_text, resolved_primary_external_url = build_effective_tweet_text(tweet, resolve_http_client)
normalized_text = normalize_post_text(full_clean_text) normalized_text = normalize_post_text(full_clean_text)
if not normalized_text: if not normalized_text and not tweet.media:
logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}") logging.info(f"⏭️ Skipping empty/blank tweet after enrichment from {tweet_time}")
continue continue
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text) ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
@@ -2189,7 +2288,7 @@ def sync_feeds(args):
media_fingerprint = build_media_fingerprint(tweet) media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint) text_media_key = build_text_media_key(normalized_text, media_fingerprint)
candidate_tweets.append({ candidate = {
"tweet": tweet, "tweet": tweet,
"tweet_time": tweet_time, "tweet_time": tweet_time,
"raw_text": raw_text, "raw_text": raw_text,
@@ -2197,7 +2296,7 @@ def sync_feeds(args):
"normalized_text": normalized_text, "normalized_text": normalized_text,
"media_fingerprint": media_fingerprint, "media_fingerprint": media_fingerprint,
"text_media_key": text_media_key, "text_media_key": text_media_key,
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url), "canonical_tweet_url": canonical_tweet_url,
"canonical_non_x_urls": canonical_non_x_urls, "canonical_non_x_urls": canonical_non_x_urls,
"ordered_non_x_urls": ordered_non_x_urls, "ordered_non_x_urls": ordered_non_x_urls,
"primary_non_x_url": primary_non_x_url, "primary_non_x_url": primary_non_x_url,
@@ -2205,30 +2304,26 @@ def sync_feeds(args):
"looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text), "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
"has_video": has_video, "has_video": has_video,
"has_photo": has_photo, "has_photo": has_photo,
}) }
is_dup_state, reason_state = candidate_matches_state(candidate, state)
if is_dup_state:
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
continue
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
if is_dup_bsky:
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
continue
candidate_tweets.append(candidate)
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}") logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for duplicate comparison.") logging.info(f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering.")
tweets_to_post = [] if not candidate_tweets:
for candidate in candidate_tweets:
is_dup_state, reason_state = candidate_matches_state(candidate, state)
if is_dup_state:
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
continue
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
if is_dup_bsky:
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
continue
tweets_to_post.append(candidate)
logging.info(f"📬 {len(tweets_to_post)} tweets remain after duplicate filtering.")
if not tweets_to_post:
logging.info("✅ No new tweets need posting after duplicate comparison.") logging.info("✅ No new tweets need posting after duplicate comparison.")
return return
@@ -2253,7 +2348,7 @@ def sync_feeds(args):
context = browser.new_context(**context_kwargs) context = browser.new_context(**context_kwargs)
for candidate in tweets_to_post: for candidate in candidate_tweets:
tweet = candidate["tweet"] tweet = candidate["tweet"]
tweet_time = candidate["tweet_time"] tweet_time = candidate["tweet_time"]
raw_text = candidate["raw_text"] raw_text = candidate["raw_text"]