From ab2f6768782b6ba647fd64060763d7aee4bab111 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Mon, 13 Apr 2026 16:11:23 +0200 Subject: [PATCH] new og meta --- twitter2bsky_daemon.py | 171 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 151 insertions(+), 20 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index d464317..f70b1ac 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -1,6 +1,7 @@ import argparse import arrow import hashlib +import html import io import json import logging @@ -316,6 +317,102 @@ def extract_urls_from_text(text): return re.findall(r"https?://[^\s#]+", repaired) +def extract_quoted_text_from_og_title(og_title): + """ + Example input: + btv esports on X: "⚽️ Clément Turpin... + https://t.co/bQ89PSZe8R" / X + """ + if not og_title: + return None + + decoded = html.unescape(og_title).strip() + + match = re.search(r'on X:\s*"(?P<text>.*)"\s*/\s*X\s*$', decoded, flags=re.DOTALL) + if match: + extracted = match.group("text").strip() + if extracted: + return extracted + + first_quote = decoded.find('"') + last_quote = decoded.rfind('"') + if 0 <= first_quote < last_quote: + extracted = decoded[first_quote + 1:last_quote].strip() + if extracted: + return extracted + + return None + + +def fetch_tweet_og_title_text(tweet_url): + """ + Open the tweet page and extract the user-facing tweet text from og:title. + This is especially useful when the scraped tweet body misses the t.co URL. 
+ """ + browser = None + context = None + page = None + + try: + logging.info(f"🧾 Fetching og:title from tweet page: {tweet_url}") + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled"] + ) + context = browser.new_context( + user_agent=( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/145.0.7632.6 Safari/537.36" + ), + viewport={"width": 1280, "height": 900} + ) + page = context.new_page() + page.goto(tweet_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS) + + try: + page.wait_for_selector('meta[property="og:title"]', timeout=10000) + except Exception: + pass + + og_title = page.locator('meta[property="og:title"]').first.get_attribute("content") + extracted = extract_quoted_text_from_og_title(og_title) + + if extracted: + logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}") + return clean_post_text(extracted) + + logging.info(f"ℹ️ No usable og:title text extracted for {tweet_url}") + return None + + except Exception as e: + logging.warning(f"⚠️ Could not extract og:title text from {tweet_url}: {repr(e)}") + try: + if page: + take_error_screenshot(page, "tweet_og_title_failed") + except Exception: + pass + return None + finally: + try: + if page: + page.close() + except Exception: + pass + try: + if context: + context.close() + except Exception: + pass + try: + if browser: + browser.close() + except Exception: + pass + + def resolve_tco_with_httpx(url, http_client): try: response = http_client.get(url, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True) @@ -330,10 +427,6 @@ def resolve_tco_with_httpx(url, http_client): def resolve_tco_with_playwright(url): - """ - Browser-based fallback for t.co links that do not yield a usable - final external URL via httpx. 
- """ browser = None context = None page = None @@ -362,7 +455,6 @@ def resolve_tco_with_playwright(url): logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}") time.sleep(3) - final_url = canonicalize_url(page.url) for _ in range(6): @@ -408,11 +500,6 @@ def resolve_tco_with_playwright(url): def resolve_url_if_needed(url, http_client): - """ - Resolve redirecting URLs such as t.co to their final destination. - Uses httpx first, then Playwright fallback if still unresolved or - still trapped on t.co/X. - """ if not url: return None @@ -447,7 +534,6 @@ def extract_non_x_urls_from_text(text): if not cleaned: continue - # Keep t.co for later resolution. if is_tco_domain(cleaned): result.append(cleaned) continue @@ -480,10 +566,6 @@ def extract_first_visible_non_x_url(text): def extract_first_resolved_external_url(text, http_client): - """ - Find the first visible candidate URL, resolve t.co if needed, - and return only if the final URL is a non-X external URL. - """ for url in extract_non_x_urls_from_text(text or ""): resolved = resolve_url_if_needed(url, http_client) if not resolved: @@ -496,6 +578,52 @@ def extract_first_resolved_external_url(text, http_client): return None +def replace_first_tco_with_resolved_url(text, resolved_url, http_client): + if not text or not resolved_url: + return text + + def replacer(match): + candidate = strip_trailing_url_punctuation(match.group(0)) + if is_tco_domain(candidate): + resolved = resolve_url_if_needed(candidate, http_client) + if resolved and is_external_non_x_url(resolved): + logging.info(f"🔁 Replacing visible t.co URL with resolved URL: {candidate} -> {resolved}") + return resolved + return match.group(0) + + return re.sub(r"https?://[^\s#]+", replacer, text, count=1) + + +def build_effective_tweet_text(tweet, http_client): + """ + Prefer the richer og:title-derived text when available, especially if it + contains a t.co URL absent from the scraped body. 
Then replace visible t.co + with the fully resolved external URL. + """ + scraped_text = clean_post_text(tweet.text or "") + og_title_text = None + + if tweet.tweet_url: + og_title_text = fetch_tweet_og_title_text(tweet.tweet_url) + + candidate_text = scraped_text + if og_title_text: + scraped_urls = extract_urls_from_text(scraped_text) + og_urls = extract_urls_from_text(og_title_text) + + if len(og_title_text) >= len(scraped_text) or (og_urls and not scraped_urls): + candidate_text = og_title_text + logging.info("🧾 Using og:title-derived tweet text as primary content") + + resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client) + + if resolved_primary_external_url: + candidate_text = replace_first_tco_with_resolved_url(candidate_text, resolved_primary_external_url, http_client) + + candidate_text = clean_post_text(candidate_text) + return candidate_text, resolved_primary_external_url + + def remove_url_from_visible_text(text, url_to_remove): if not text or not url_to_remove: return text @@ -1898,7 +2026,7 @@ def sync_feeds(args): logging.info(f"⏭️ Skipping old tweet from {tweet_time}") continue - full_clean_text = clean_post_text(tweet.text) + full_clean_text, resolved_primary_external_url = build_effective_tweet_text(tweet, resolve_http_client) normalized_text = normalize_post_text(full_clean_text) if not normalized_text: @@ -1906,7 +2034,6 @@ def sync_feeds(args): continue ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text) - resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client) canonical_non_x_urls = set() if resolved_primary_external_url: @@ -1916,9 +2043,13 @@ def sync_feeds(args): if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url): canonical_non_x_urls.add(canonicalize_url(raw_url)) - primary_non_x_url = extract_first_visible_non_x_url(full_clean_text) - if not primary_non_x_url and ordered_non_x_urls: - primary_non_x_url = 
ordered_non_x_urls[0] + primary_non_x_url = None + if resolved_primary_external_url: + primary_non_x_url = resolved_primary_external_url + else: + primary_non_x_url = extract_first_visible_non_x_url(full_clean_text) + if not primary_non_x_url and ordered_non_x_urls: + primary_non_x_url = ordered_non_x_urls[0] has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or [])) has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))