Extract tweet text from og:title metadata as a fallback for the scraped body
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import arrow
|
import arrow
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import html
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -316,6 +317,102 @@ def extract_urls_from_text(text):
|
|||||||
return re.findall(r"https?://[^\s#]+", repaired)
|
return re.findall(r"https?://[^\s#]+", repaired)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_quoted_text_from_og_title(og_title):
    """
    Pull the quoted tweet body out of an X page's og:title value.

    Example input:
    btv esports on X: "⚽️ Clément Turpin...
    https://t.co/bQ89PSZe8R" / X

    Returns the text between the quotes, or None when nothing usable
    can be extracted.
    """
    if not og_title:
        return None

    title = html.unescape(og_title).strip()

    # Preferred path: the canonical 'on X: "..." / X' wrapper.
    wrapper = re.search(r'on X:\s*"(?P<text>.*)"\s*/\s*X\s*$', title, flags=re.DOTALL)
    if wrapper is not None:
        body = wrapper.group("text").strip()
        if body:
            return body

    # Fallback: whatever sits between the outermost pair of double quotes.
    opening = title.find('"')
    closing = title.rfind('"')
    if -1 < opening < closing:
        body = title[opening + 1:closing].strip()
        if body:
            return body

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tweet_og_title_text(tweet_url):
    """
    Open the tweet page and extract the user-facing tweet text from og:title.
    This is especially useful when the scraped tweet body misses the t.co URL.

    Parameters:
        tweet_url: absolute URL of the tweet page to load.

    Returns:
        The cleaned tweet text derived from the og:title meta tag, or None
        when no usable text is found or any step fails. This function never
        raises: every failure path is logged and swallowed.
    """
    browser = None
    context = None
    page = None

    try:
        logging.info(f"🧾 Fetching og:title from tweet page: {tweet_url}")

        with sync_playwright() as p:
            # Headless Chromium with the automation-controlled blink feature
            # disabled so the page serves its normal og:* meta tags.
            browser = p.chromium.launch(
                headless=True,
                args=["--disable-blink-features=AutomationControlled"]
            )
            # Desktop-looking context (real UA string + viewport).
            context = browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/145.0.7632.6 Safari/537.36"
                ),
                viewport={"width": 1280, "height": 900}
            )
            page = context.new_page()
            page.goto(tweet_url, wait_until="domcontentloaded", timeout=PLAYWRIGHT_RESOLVE_TIMEOUT_MS)

            # Best effort: give the meta tag up to 10s to appear. Absence is
            # tolerated — get_attribute below simply returns None.
            try:
                page.wait_for_selector('meta[property="og:title"]', timeout=10000)
            except Exception:
                pass

            og_title = page.locator('meta[property="og:title"]').first.get_attribute("content")
            extracted = extract_quoted_text_from_og_title(og_title)

            if extracted:
                logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}")
                return clean_post_text(extracted)

            logging.info(f"ℹ️ No usable og:title text extracted for {tweet_url}")
            return None

    except Exception as e:
        logging.warning(f"⚠️ Could not extract og:title text from {tweet_url}: {repr(e)}")
        # NOTE(review): by the time control reaches here the `with
        # sync_playwright()` block has already exited, so this screenshot is
        # likely taken on a dead page; the wrapping try keeps that harmless.
        # Confirm whether the screenshot should instead happen inside the with.
        try:
            if page:
                take_error_screenshot(page, "tweet_og_title_failed")
        except Exception:
            pass
        return None
    finally:
        # Defensive teardown. The `with sync_playwright()` context normally
        # tears these down itself; each close is individually wrapped so a
        # double-close (or a close after the driver stopped) never masks the
        # function's real outcome.
        try:
            if page:
                page.close()
        except Exception:
            pass
        try:
            if context:
                context.close()
        except Exception:
            pass
        try:
            if browser:
                browser.close()
        except Exception:
            pass
|
||||||
|
|
||||||
|
|
||||||
def resolve_tco_with_httpx(url, http_client):
|
def resolve_tco_with_httpx(url, http_client):
|
||||||
try:
|
try:
|
||||||
response = http_client.get(url, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
|
response = http_client.get(url, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
|
||||||
@@ -330,10 +427,6 @@ def resolve_tco_with_httpx(url, http_client):
|
|||||||
|
|
||||||
|
|
||||||
def resolve_tco_with_playwright(url):
|
def resolve_tco_with_playwright(url):
|
||||||
"""
|
|
||||||
Browser-based fallback for t.co links that do not yield a usable
|
|
||||||
final external URL via httpx.
|
|
||||||
"""
|
|
||||||
browser = None
|
browser = None
|
||||||
context = None
|
context = None
|
||||||
page = None
|
page = None
|
||||||
@@ -362,7 +455,6 @@ def resolve_tco_with_playwright(url):
|
|||||||
logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}")
|
logging.warning(f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}")
|
||||||
|
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
final_url = canonicalize_url(page.url)
|
final_url = canonicalize_url(page.url)
|
||||||
|
|
||||||
for _ in range(6):
|
for _ in range(6):
|
||||||
@@ -408,11 +500,6 @@ def resolve_tco_with_playwright(url):
|
|||||||
|
|
||||||
|
|
||||||
def resolve_url_if_needed(url, http_client):
|
def resolve_url_if_needed(url, http_client):
|
||||||
"""
|
|
||||||
Resolve redirecting URLs such as t.co to their final destination.
|
|
||||||
Uses httpx first, then Playwright fallback if still unresolved or
|
|
||||||
still trapped on t.co/X.
|
|
||||||
"""
|
|
||||||
if not url:
|
if not url:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -447,7 +534,6 @@ def extract_non_x_urls_from_text(text):
|
|||||||
if not cleaned:
|
if not cleaned:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Keep t.co for later resolution.
|
|
||||||
if is_tco_domain(cleaned):
|
if is_tco_domain(cleaned):
|
||||||
result.append(cleaned)
|
result.append(cleaned)
|
||||||
continue
|
continue
|
||||||
@@ -480,10 +566,6 @@ def extract_first_visible_non_x_url(text):
|
|||||||
|
|
||||||
|
|
||||||
def extract_first_resolved_external_url(text, http_client):
|
def extract_first_resolved_external_url(text, http_client):
|
||||||
"""
|
|
||||||
Find the first visible candidate URL, resolve t.co if needed,
|
|
||||||
and return only if the final URL is a non-X external URL.
|
|
||||||
"""
|
|
||||||
for url in extract_non_x_urls_from_text(text or ""):
|
for url in extract_non_x_urls_from_text(text or ""):
|
||||||
resolved = resolve_url_if_needed(url, http_client)
|
resolved = resolve_url_if_needed(url, http_client)
|
||||||
if not resolved:
|
if not resolved:
|
||||||
@@ -496,6 +578,52 @@ def extract_first_resolved_external_url(text, http_client):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def replace_first_tco_with_resolved_url(text, resolved_url, http_client):
    """
    Rewrite the first visible URL in *text* when it is a t.co link that
    resolves to an external (non-X) destination; otherwise return the text
    unchanged.

    NOTE(review): resolved_url acts only as a guard here — the replacement
    value is re-resolved via resolve_url_if_needed. Confirm whether the
    caller-supplied value could be reused instead.
    """
    if not text or not resolved_url:
        return text

    def _swap(match):
        raw = match.group(0)
        candidate = strip_trailing_url_punctuation(raw)
        if not is_tco_domain(candidate):
            return raw
        destination = resolve_url_if_needed(candidate, http_client)
        if destination and is_external_non_x_url(destination):
            logging.info(f"🔁 Replacing visible t.co URL with resolved URL: {candidate} -> {destination}")
            return destination
        return raw

    # Only the first URL occurrence is eligible for replacement.
    return re.sub(r"https?://[^\s#]+", _swap, text, count=1)
|
||||||
|
|
||||||
|
|
||||||
|
def build_effective_tweet_text(tweet, http_client):
    """
    Assemble the best available tweet text and its primary external URL.

    Prefer the richer og:title-derived text when available, especially if it
    contains a t.co URL absent from the scraped body. Then replace visible t.co
    with the fully resolved external URL.

    Returns:
        A (text, resolved_primary_external_url) tuple; the URL may be None.
    """
    body = clean_post_text(tweet.text or "")

    og_text = fetch_tweet_og_title_text(tweet.tweet_url) if tweet.tweet_url else None

    chosen = body
    if og_text:
        # Switch to the og:title text when it is at least as long as the
        # scraped body, or when it carries URLs the scraped body lacks.
        urls_in_body = extract_urls_from_text(body)
        urls_in_og = extract_urls_from_text(og_text)
        if len(og_text) >= len(body) or (urls_in_og and not urls_in_body):
            chosen = og_text
            logging.info("🧾 Using og:title-derived tweet text as primary content")

    primary_url = extract_first_resolved_external_url(chosen, http_client)
    if primary_url:
        chosen = replace_first_tco_with_resolved_url(chosen, primary_url, http_client)

    return clean_post_text(chosen), primary_url
|
||||||
|
|
||||||
|
|
||||||
def remove_url_from_visible_text(text, url_to_remove):
|
def remove_url_from_visible_text(text, url_to_remove):
|
||||||
if not text or not url_to_remove:
|
if not text or not url_to_remove:
|
||||||
return text
|
return text
|
||||||
@@ -1898,7 +2026,7 @@ def sync_feeds(args):
|
|||||||
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
full_clean_text = clean_post_text(tweet.text)
|
full_clean_text, resolved_primary_external_url = build_effective_tweet_text(tweet, resolve_http_client)
|
||||||
normalized_text = normalize_post_text(full_clean_text)
|
normalized_text = normalize_post_text(full_clean_text)
|
||||||
|
|
||||||
if not normalized_text:
|
if not normalized_text:
|
||||||
@@ -1906,7 +2034,6 @@ def sync_feeds(args):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
|
||||||
resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)
|
|
||||||
|
|
||||||
canonical_non_x_urls = set()
|
canonical_non_x_urls = set()
|
||||||
if resolved_primary_external_url:
|
if resolved_primary_external_url:
|
||||||
@@ -1916,6 +2043,10 @@ def sync_feeds(args):
|
|||||||
if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url):
|
if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url):
|
||||||
canonical_non_x_urls.add(canonicalize_url(raw_url))
|
canonical_non_x_urls.add(canonicalize_url(raw_url))
|
||||||
|
|
||||||
|
primary_non_x_url = None
|
||||||
|
if resolved_primary_external_url:
|
||||||
|
primary_non_x_url = resolved_primary_external_url
|
||||||
|
else:
|
||||||
primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
|
primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
|
||||||
if not primary_non_x_url and ordered_non_x_urls:
|
if not primary_non_x_url and ordered_non_x_urls:
|
||||||
primary_non_x_url = ordered_non_x_urls[0]
|
primary_non_x_url = ordered_non_x_urls[0]
|
||||||
|
|||||||
Reference in New Issue
Block a user