diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index b53c99c..23a096f 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -57,6 +57,10 @@ logging.basicConfig( level=logging.INFO, ) +# --- Per-run caches for efficiency --- +OG_TITLE_CACHE = {} +URL_RESOLUTION_CACHE = {} + # --- Custom Classes --- class ScrapedMedia: @@ -399,7 +403,36 @@ def extract_quoted_text_from_og_title(og_title): return None +def should_fetch_og_title(tweet): + """ + Avoid fetching og:title unless it is likely to improve the text. + """ + text = clean_post_text(tweet.text or "") + urls = extract_urls_from_text(text) + + if not text: + return True + + if any(is_tco_domain(normalize_urlish_token(u) or u) for u in urls): + return True + + if "…" in text or text.endswith("..."): + return True + + if len(text) < 35: + return True + + return False + + def fetch_tweet_og_title_text(tweet_url): + if not tweet_url: + return None + + if tweet_url in OG_TITLE_CACHE: + logging.info(f"⚡ Using cached og:title text for {tweet_url}") + return OG_TITLE_CACHE[tweet_url] + browser = None context = None page = None @@ -424,7 +457,7 @@ def fetch_tweet_og_title_text(tweet_url): page.goto(tweet_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS) try: - page.wait_for_selector('meta[property="og:title"]', timeout=10000) + page.wait_for_selector('meta[property="og:title"]', timeout=7000) except Exception: pass @@ -432,10 +465,13 @@ def fetch_tweet_og_title_text(tweet_url): extracted = extract_quoted_text_from_og_title(og_title) if extracted: + extracted = clean_post_text(extracted) + OG_TITLE_CACHE[tweet_url] = extracted logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}") - return clean_post_text(extracted) + return extracted logging.info(f"ℹ️ No usable og:title text extracted for {tweet_url}") + OG_TITLE_CACHE[tweet_url] = None return None except Exception as e: @@ -445,6 +481,7 @@ def fetch_tweet_og_title_text(tweet_url): take_error_screenshot(page, "tweet_og_title_failed") except Exception: pass + OG_TITLE_CACHE[tweet_url] = None return None finally: try: @@ -505,19 +542,19 @@ def resolve_tco_with_playwright(url): except Exception as e: logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}") - time.sleep(3) + time.sleep(2) final_url = canonicalize_url(page.url) - for _ in range(6): + for _ in range(4): if final_url and is_external_non_x_url(final_url): break try: - page.wait_for_load_state("networkidle", timeout=3000) + page.wait_for_load_state("networkidle", timeout=2000) except Exception: pass - time.sleep(1) + time.sleep(0.8) final_url = canonicalize_url(page.url) logging.info(f"🌐 Playwright final URL for {url}: {final_url}") @@ -550,7 +587,7 @@ def resolve_tco_with_playwright(url): return canonicalize_url(url) -def resolve_url_if_needed(url, http_client): +def resolve_url_if_needed(url, http_client, allow_playwright_fallback=True): if not url: return None @@ -559,21 +596,34 @@ def resolve_url_if_needed(url, http_client): if not cleaned: return None + if cleaned in URL_RESOLUTION_CACHE: + logging.info(f"⚡ Using cached URL resolution: {cleaned} -> {URL_RESOLUTION_CACHE[cleaned]}") + return URL_RESOLUTION_CACHE[cleaned] + if not is_tco_domain(cleaned): + URL_RESOLUTION_CACHE[cleaned] = cleaned return cleaned resolved_http = resolve_tco_with_httpx(cleaned, http_client) if is_external_non_x_url(resolved_http): + URL_RESOLUTION_CACHE[cleaned] = resolved_http + return resolved_http + + if not allow_playwright_fallback: + URL_RESOLUTION_CACHE[cleaned] = resolved_http return resolved_http resolved_browser = resolve_tco_with_playwright(cleaned) if is_external_non_x_url(resolved_browser): logging.info(f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}") + URL_RESOLUTION_CACHE[cleaned] = resolved_browser return resolved_browser if resolved_http and not is_tco_domain(resolved_http): + URL_RESOLUTION_CACHE[cleaned] = resolved_http return resolved_http + URL_RESOLUTION_CACHE[cleaned] = cleaned return cleaned @@ -618,9 +668,9 @@ def extract_first_visible_non_x_url(text): return None -def extract_first_resolved_external_url(text, http_client): +def extract_first_resolved_external_url(text, http_client, allow_playwright_fallback=True): for url in extract_non_x_urls_from_text(text or ""): - resolved = resolve_url_if_needed(url, http_client) + resolved = resolve_url_if_needed(url, http_client, allow_playwright_fallback=allow_playwright_fallback) if not resolved: continue @@ -631,7 +681,13 @@ def extract_first_resolved_external_url(text, http_client): return None -def sanitize_visible_urls_in_text(text, http_client): +def sanitize_visible_urls_in_text(text, http_client, has_media=False): + """ + Faster logic: + - remove x/twitter URLs from visible text + - resolve t.co + - if a t.co resolves to x/twitter and tweet has media, do not use Playwright fallback + """ if not text: return text, None @@ -659,9 +715,20 @@ def sanitize_visible_urls_in_text(text, http_client): final_url = cleaned if is_tco_domain(cleaned): - resolved = resolve_url_if_needed(cleaned, http_client) - if resolved: - final_url = resolved + resolved_http_first = resolve_tco_with_httpx(cleaned, http_client) + + if is_external_non_x_url(resolved_http_first): + final_url = resolved_http_first + URL_RESOLUTION_CACHE[cleaned] = final_url + else: + if has_media and resolved_http_first and is_x_or_twitter_domain(resolved_http_first): + final_url = resolved_http_first + URL_RESOLUTION_CACHE[cleaned] = final_url + logging.info( + f"⚡ Skipping Playwright t.co fallback because tweet has media and httpx already resolved to X/Twitter URL: {final_url}" + ) + else: + final_url = resolve_url_if_needed(cleaned, http_client, allow_playwright_fallback=True) if is_x_or_twitter_domain(final_url): replacements[raw_url] = "" @@ -727,9 +794,10 @@ def sanitize_visible_urls_in_text(text, http_client): def build_effective_tweet_text(tweet, http_client): scraped_text = clean_post_text(tweet.text or "") + has_media = bool(tweet.media) og_title_text = None - if tweet.tweet_url: + if should_fetch_og_title(tweet): og_title_text = fetch_tweet_og_title_text(tweet.tweet_url) candidate_text = scraped_text @@ -741,11 +809,19 @@ def build_effective_tweet_text(tweet, http_client): candidate_text = og_title_text logging.info("🧾 Using og:title-derived tweet text as primary content") - candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(candidate_text, http_client) + candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text( + candidate_text, + http_client, + has_media=has_media, + ) candidate_text = clean_post_text(candidate_text) if not resolved_primary_external_url: - resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client) + resolved_primary_external_url = extract_first_resolved_external_url( + candidate_text, + http_client, + allow_playwright_fallback=not has_media, + ) return candidate_text, resolved_primary_external_url @@ -1203,6 +1279,8 @@ def get_recent_bsky_posts(client, handle, limit=30): if getattr(record, "reply", None) is not None: continue + # no-op + text = getattr(record, "text", "") or "" normalized_text = normalize_post_text(text) @@ -1720,7 +1798,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): ) page = context.new_page() page.goto("https://x.com/home") - time.sleep(4) + time.sleep(3) if page.locator('[data-testid="SideNav_NewTweet_Button"]').is_visible() or "/home" in page.url: logging.info("✅ Session is valid!") @@ -1793,7 +1871,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): try: page.wait_for_selector("article", timeout=20000) - time.sleep(3) + time.sleep(2) articles = page.locator("article").all() logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...") @@ -1905,7 +1983,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): try: logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}") page.goto(tweet_url, wait_until="domcontentloaded", timeout=30000) - time.sleep(3) + time.sleep(2) player = page.locator('[data-testid="videoPlayer"]').first @@ -1923,7 +2001,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): else: logging.warning("⚠️ No video player locator found on tweet page") - for _ in range(12): + for _ in range(8): if current_best(): break time.sleep(1) @@ -1942,7 +2020,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): except Exception: pass - for _ in range(8): + for _ in range(5): if current_best(): break time.sleep(1) @@ -2143,20 +2221,41 @@ def sync_feeds(args): candidate_tweets = [] + # --- Cheap prefilter before expensive processing --- + cheap_candidates = [] + for tweet in reversed(tweets): + try: + tweet_time = arrow.get(tweet.created_on) + + if tweet_time < too_old_cutoff: + logging.info(f"⏭️ Skipping old tweet from {tweet_time}") + continue + + canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url) + if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}): + logging.info(f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}") + continue + + scraped_text = clean_post_text(tweet.text or "") + if not scraped_text and not tweet.media: + logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}") + continue + + cheap_candidates.append((tweet, tweet_time, canonical_tweet_url)) + + except Exception as e: + logging.warning(f"⚠️ Failed during cheap prefilter: {e}") + + logging.info(f"⚡ {len(cheap_candidates)} tweets remain after cheap prefilter.") + with httpx.Client() as resolve_http_client: - for tweet in reversed(tweets): + for tweet, tweet_time, canonical_tweet_url in cheap_candidates: try: - tweet_time = arrow.get(tweet.created_on) - - if tweet_time < too_old_cutoff: - logging.info(f"⏭️ Skipping old tweet from {tweet_time}") - continue - full_clean_text, resolved_primary_external_url = build_effective_tweet_text(tweet, resolve_http_client) normalized_text = normalize_post_text(full_clean_text) - if not normalized_text: - logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}") + if not normalized_text and not tweet.media: + logging.info(f"⏭️ Skipping empty/blank tweet after enrichment from {tweet_time}") continue ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text) @@ -2189,7 +2288,7 @@ def sync_feeds(args): media_fingerprint = build_media_fingerprint(tweet) text_media_key = build_text_media_key(normalized_text, media_fingerprint) - candidate_tweets.append({ + candidate = { "tweet": tweet, "tweet_time": tweet_time, "raw_text": raw_text, @@ -2197,7 +2296,7 @@ def sync_feeds(args): "normalized_text": normalized_text, "media_fingerprint": media_fingerprint, "text_media_key": text_media_key, - "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url), + "canonical_tweet_url": canonical_tweet_url, "canonical_non_x_urls": canonical_non_x_urls, "ordered_non_x_urls": ordered_non_x_urls, "primary_non_x_url": primary_non_x_url, @@ -2205,30 +2304,26 @@ def sync_feeds(args): "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text), "has_video": has_video, "has_photo": has_photo, - }) + } + + is_dup_state, reason_state = candidate_matches_state(candidate, state) + if is_dup_state: + logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}") + continue + + is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts) + if is_dup_bsky: + logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}") + continue + + candidate_tweets.append(candidate) except Exception as e: logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}") - logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for duplicate comparison.") + logging.info(f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering.") - tweets_to_post = [] - for candidate in candidate_tweets: - is_dup_state, reason_state = candidate_matches_state(candidate, state) - if is_dup_state: - logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}") - continue - - is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts) - if is_dup_bsky: - logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}") - continue - - tweets_to_post.append(candidate) - - logging.info(f"📬 {len(tweets_to_post)} tweets remain after duplicate filtering.") - - if not tweets_to_post: + if not candidate_tweets: logging.info("✅ No new tweets need posting after duplicate comparison.") return @@ -2253,7 +2348,7 @@ def sync_feeds(args): context = browser.new_context(**context_kwargs) - for candidate in tweets_to_post: + for candidate in candidate_tweets: tweet = candidate["tweet"] tweet_time = candidate["tweet_time"] raw_text = candidate["raw_text"]