From ba313787b686f4453f93ea217ca0441f70fdee62 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Mon, 13 Apr 2026 17:02:54 +0200 Subject: [PATCH] fix(sync): sanitize visible tweet URLs by resolving t.co links and removing concatenated duplicates --- twitter2bsky_daemon.py | 129 ++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 34 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index f70b1ac..299b110 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -95,11 +95,26 @@ def strip_trailing_url_punctuation(url): return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip()) +def split_concatenated_urls(text): + """ + Insert whitespace between concatenated URLs like: + https://t.co/aaahttps://t.co/bbb + """ + if not text: + return text + + fixed = re.sub(r"(https?://[^\s]+?)(https?://)", r"\1 \2", text) + if fixed != text: + logging.info("🔧 Split concatenated URLs in text") + return fixed + + def repair_broken_urls(text): if not text: return text original = text + text = split_concatenated_urls(text) text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE) @@ -120,6 +135,8 @@ def repair_broken_urls(text): flags=re.IGNORECASE ) + text = split_concatenated_urls(text) + if text != original: logging.info("🔧 Repaired broken URL wrapping in scraped text") @@ -318,16 +335,10 @@ def extract_urls_from_text(text): def extract_quoted_text_from_og_title(og_title): - """ - Example input: - btv esports on X: "⚽️ Clément Turpin... - https://t.co/bQ89PSZe8R" / X - """ if not og_title: return None decoded = html.unescape(og_title).strip() - match = re.search(r'on X:\s*"(?P<text>.*)"\s*/\s*X\s*$', decoded, flags=re.DOTALL) if match: extracted = match.group("text").strip() @@ -345,10 +356,6 @@ def extract_quoted_text_from_og_title(og_title): def fetch_tweet_og_title_text(tweet_url): - """ - Open the tweet page and extract the user-facing tweet text from og:title. 
- This is especially useful when the scraped tweet body misses the t.co URL. - """ browser = None context = None page = None @@ -578,28 +585,82 @@ def extract_first_resolved_external_url(text, http_client): return None -def replace_first_tco_with_resolved_url(text, resolved_url, http_client): - if not text or not resolved_url: - return text +def sanitize_visible_urls_in_text(text, http_client): + """ + Resolve visible t.co URLs in the text, split malformed concatenations, + and deduplicate repeated URLs. + """ + if not text: + return text, None - def replacer(match): - candidate = strip_trailing_url_punctuation(match.group(0)) - if is_tco_domain(candidate): - resolved = resolve_url_if_needed(candidate, http_client) - if resolved and is_external_non_x_url(resolved): - logging.info(f"🔁 Replacing visible t.co URL with resolved URL: {candidate} -> {resolved}") - return resolved - return match.group(0) + working = clean_post_text(text) + urls = extract_urls_from_text(working) - return re.sub(r"https?://[^\s#]+", replacer, text, count=1) + if not urls: + return working, None + + replacements = {} + first_external_resolved = None + seen_final_urls = set() + + for raw_url in urls: + cleaned = canonicalize_url(raw_url) + if not cleaned: + continue + + final_url = cleaned + if is_tco_domain(cleaned): + resolved = resolve_url_if_needed(cleaned, http_client) + if resolved: + final_url = resolved + + if is_external_non_x_url(final_url) and not first_external_resolved: + first_external_resolved = final_url + + replacements[cleaned] = final_url + + def replace_match(match): + raw = match.group(0) + cleaned = canonicalize_url(raw) + replacement = replacements.get(cleaned, raw) + return replacement + + working = re.sub(r"https?://[^\s#]+", replace_match, working) + + # Deduplicate same visible URL repeated back to back or multiple times. 
+ deduped_lines = [] + for line in working.splitlines(): + line_urls = re.findall(r"https?://[^\s#]+", line) + if len(line_urls) > 1: + rebuilt = line + unique_urls = [] + for url in line_urls: + c = canonicalize_url(url) + if c and c not in seen_final_urls: + unique_urls.append(url) + seen_final_urls.add(c) + + if unique_urls: + prefix = re.sub(r"https?://[^\s#]+", "", line).strip() + if prefix: + rebuilt = prefix + " " + " ".join(unique_urls) + else: + rebuilt = " ".join(unique_urls) + else: + rebuilt = re.sub(r"https?://[^\s#]+", "", line).strip() + + deduped_lines.append(rebuilt.strip()) + else: + deduped_lines.append(line.strip()) + + working = "\n".join(deduped_lines) + working = re.sub(r"[ \t]+", " ", working) + working = re.sub(r"\n{3,}", "\n\n", working).strip() + + return working, first_external_resolved def build_effective_tweet_text(tweet, http_client): - """ - Prefer the richer og:title-derived text when available, especially if it - contains a t.co URL absent from the scraped body. Then replace visible t.co - with the fully resolved external URL. 
- """ scraped_text = clean_post_text(tweet.text or "") og_title_text = None @@ -615,12 +676,12 @@ def build_effective_tweet_text(tweet, http_client): candidate_text = og_title_text logging.info("🧾 Using og:title-derived tweet text as primary content") - resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client) - - if resolved_primary_external_url: - candidate_text = replace_first_tco_with_resolved_url(candidate_text, resolved_primary_external_url, http_client) - + candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(candidate_text, http_client) candidate_text = clean_post_text(candidate_text) + + if not resolved_primary_external_url: + resolved_primary_external_url = extract_first_resolved_external_url(candidate_text, http_client) + return candidate_text, resolved_primary_external_url @@ -2058,7 +2119,7 @@ def sync_feeds(args): raw_text = choose_final_visible_text( full_clean_text, primary_non_x_url=primary_non_x_url, - prefer_full_text_without_url=True, + prefer_full_text_without_url=False, ) else: raw_text = choose_final_visible_text( @@ -2327,4 +2388,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file