fix(sync): sanitize visible tweet URLs by resolving t.co links and removing concatenated duplicates
This commit is contained in:
@@ -95,11 +95,26 @@ def strip_trailing_url_punctuation(url):
|
|||||||
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
|
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
|
||||||
|
|
||||||
|
|
||||||
|
def split_concatenated_urls(text):
    """
    Insert a single space between URLs that were glued together, e.g.:
        https://t.co/aaahttps://t.co/bbb -> https://t.co/aaa https://t.co/bbb

    Every boundary in a run is split, so three or more concatenated URLs
    are fully separated in a single pass.

    Args:
        text: Text to scan; may be ``None`` or empty.

    Returns:
        The text with a space inserted before each ``http://`` / ``https://``
        that directly follows URL characters. Falsy input is returned as-is.

    Note:
        A URL that embeds another unencoded URL (for example in a query
        string) is split as well, because any ``http(s)://`` inside a
        non-whitespace run is treated as the start of a new URL.
    """
    if not text:
        return text

    # The following scheme is matched with a lookahead instead of being
    # consumed: a consumed scheme cannot start the next match, which would
    # leave every second boundary in a chain of URLs unsplit.
    fixed = re.sub(r"(https?://\S+?)(?=https?://)", r"\1 ", text)
    if fixed != text:
        logging.info("🔧 Split concatenated URLs in text")
    return fixed
|
||||||
|
|
||||||
|
|
||||||
def repair_broken_urls(text):
|
def repair_broken_urls(text):
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
original = text
|
original = text
|
||||||
|
text = split_concatenated_urls(text)
|
||||||
|
|
||||||
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
||||||
|
|
||||||
@@ -120,6 +135,8 @@ def repair_broken_urls(text):
|
|||||||
flags=re.IGNORECASE
|
flags=re.IGNORECASE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
text = split_concatenated_urls(text)
|
||||||
|
|
||||||
if text != original:
|
if text != original:
|
||||||
logging.info("🔧 Repaired broken URL wrapping in scraped text")
|
logging.info("🔧 Repaired broken URL wrapping in scraped text")
|
||||||
|
|
||||||
@@ -318,16 +335,10 @@ def extract_urls_from_text(text):
|
|||||||
|
|
||||||
|
|
||||||
def extract_quoted_text_from_og_title(og_title):
|
def extract_quoted_text_from_og_title(og_title):
|
||||||
"""
|
|
||||||
Example input:
|
|
||||||
btv esports on X: "⚽️ Clément Turpin...
|
|
||||||
https://t.co/bQ89PSZe8R" / X
|
|
||||||
"""
|
|
||||||
if not og_title:
|
if not og_title:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
decoded = html.unescape(og_title).strip()
|
decoded = html.unescape(og_title).strip()
|
||||||
|
|
||||||
match = re.search(r'on X:\s*"(?P<text>.*)"\s*/\s*X\s*$', decoded, flags=re.DOTALL)
|
match = re.search(r'on X:\s*"(?P<text>.*)"\s*/\s*X\s*$', decoded, flags=re.DOTALL)
|
||||||
if match:
|
if match:
|
||||||
extracted = match.group("text").strip()
|
extracted = match.group("text").strip()
|
||||||
@@ -345,10 +356,6 @@ def extract_quoted_text_from_og_title(og_title):
|
|||||||
|
|
||||||
|
|
||||||
def fetch_tweet_og_title_text(tweet_url):
|
def fetch_tweet_og_title_text(tweet_url):
|
||||||
"""
|
|
||||||
Open the tweet page and extract the user-facing tweet text from og:title.
|
|
||||||
This is especially useful when the scraped tweet body misses the t.co URL.
|
|
||||||
"""
|
|
||||||
browser = None
|
browser = None
|
||||||
context = None
|
context = None
|
||||||
page = None
|
page = None
|
||||||
@@ -578,28 +585,82 @@ def extract_first_resolved_external_url(text, http_client):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def replace_first_tco_with_resolved_url(text, resolved_url, http_client):
    """
    Swap the first URL found in ``text`` for its resolved target when that
    URL is a t.co link whose resolution is an external (non-X) address.

    Only the first ``https?://`` match is examined (``count=1``). The text is
    returned untouched when ``text`` or ``resolved_url`` is empty; beyond that
    guard ``resolved_url`` is not used, the link is re-resolved through
    ``http_client``.
    """
    if not (text and resolved_url):
        return text

    def replacer(match):
        matched = match.group(0)
        candidate = strip_trailing_url_punctuation(matched)
        if not is_tco_domain(candidate):
            return matched

        resolved = resolve_url_if_needed(candidate, http_client)
        if not (resolved and is_external_non_x_url(resolved)):
            return matched

        logging.info(f"🔁 Replacing visible t.co URL with resolved URL: {candidate} -> {resolved}")
        return resolved

    return re.sub(r"https?://[^\s#]+", replacer, text, count=1)


def sanitize_visible_urls_in_text(text, http_client):
    """
    Rewrite the URLs visible in ``text`` and report the first external one.

    Steps, in order:
      1. Clean the text with ``clean_post_text`` and collect its URLs with
         ``extract_urls_from_text``.
      2. Canonicalize each URL; t.co URLs go through ``resolve_url_if_needed``
         and are replaced by the result when one is returned.
      3. On every line holding more than one URL, drop URLs whose canonical
         form was already kept (earlier in that line or on an earlier
         multi-URL line) and move the surviving URLs to the end of the line.
      4. Collapse runs of spaces/tabs and cap consecutive newlines at two.

    Returns:
        A ``(text, first_external_url)`` tuple. The second item is ``None``
        when the input is empty, contains no URLs, or no URL satisfied
        ``is_external_non_x_url``.
    """
    if not text:
        return text, None

    url_pattern = r"https?://[^\s#]+"

    working = clean_post_text(text)
    found_urls = extract_urls_from_text(working)
    if not found_urls:
        return working, None

    # Map each canonical URL to the URL that should be shown in its place.
    replacements = {}
    first_external_resolved = None

    for found in found_urls:
        canonical = canonicalize_url(found)
        if not canonical:
            continue

        target = canonical
        if is_tco_domain(canonical):
            target = resolve_url_if_needed(canonical, http_client) or canonical

        if is_external_non_x_url(target) and not first_external_resolved:
            first_external_resolved = target

        replacements[canonical] = target

    working = re.sub(
        url_pattern,
        lambda m: replacements.get(canonicalize_url(m.group(0)), m.group(0)),
        working,
    )

    # Deduplicate same visible URL repeated back to back or multiple times.
    kept_canonicals = set()
    output_lines = []
    for line in working.splitlines():
        urls_in_line = re.findall(url_pattern, line)
        if len(urls_in_line) <= 1:
            # Lines with zero or one URL are only trimmed, never deduplicated.
            output_lines.append(line.strip())
            continue

        survivors = []
        for url in urls_in_line:
            key = canonicalize_url(url)
            if key and key not in kept_canonicals:
                survivors.append(url)
                kept_canonicals.add(key)

        # Non-URL text first, then the surviving URLs appended after it.
        remainder = re.sub(url_pattern, "", line).strip()
        pieces = ([remainder] if remainder else []) + survivors
        output_lines.append(" ".join(pieces).strip())

    joined = "\n".join(output_lines)
    joined = re.sub(r"[ \t]+", " ", joined)
    joined = re.sub(r"\n{3,}", "\n\n", joined).strip()

    return joined, first_external_resolved
|
||||||
|
|
||||||
|
|
||||||
def build_effective_tweet_text(tweet, http_client):
|
def build_effective_tweet_text(tweet, http_client):
|
||||||
"""
|
|
||||||
Prefer the richer og:title-derived text when available, especially if it
|
|
||||||
contains a t.co URL absent from the scraped body. Then replace visible t.co
|
|
||||||
with the fully resolved external URL.
|
|
||||||
"""
|
|
||||||
scraped_text = clean_post_text(tweet.text or "")
|
scraped_text = clean_post_text(tweet.text or "")
|
||||||
og_title_text = None
|
og_title_text = None
|
||||||
|
|
||||||
@@ -615,12 +676,12 @@ def build_effective_tweet_text(tweet, http_client):
|
|||||||
candidate_text = og_title_text
|
candidate_text = og_title_text
|
||||||
logging.info("🧾 Using og:title-derived tweet text as primary content")
|
logging.info("🧾 Using og:title-derived tweet text as primary content")
|
||||||
|
|
||||||
resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client)
|
candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(candidate_text, http_client)
|
||||||
|
|
||||||
if resolved_primary_external_url:
|
|
||||||
candidate_text = replace_first_tco_with_resolved_url(candidate_text, resolved_primary_external_url, http_client)
|
|
||||||
|
|
||||||
candidate_text = clean_post_text(candidate_text)
|
candidate_text = clean_post_text(candidate_text)
|
||||||
|
|
||||||
|
if not resolved_primary_external_url:
|
||||||
|
resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client)
|
||||||
|
|
||||||
return candidate_text, resolved_primary_external_url
|
return candidate_text, resolved_primary_external_url
|
||||||
|
|
||||||
|
|
||||||
@@ -2058,7 +2119,7 @@ def sync_feeds(args):
|
|||||||
raw_text = choose_final_visible_text(
|
raw_text = choose_final_visible_text(
|
||||||
full_clean_text,
|
full_clean_text,
|
||||||
primary_non_x_url=primary_non_x_url,
|
primary_non_x_url=primary_non_x_url,
|
||||||
prefer_full_text_without_url=True,
|
prefer_full_text_without_url=False,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raw_text = choose_final_visible_text(
|
raw_text = choose_final_visible_text(
|
||||||
|
|||||||
Reference in New Issue
Block a user