fix(sync): sanitize visible tweet URLs by resolving t.co links and removing concatenated duplicates 2

2026-04-13 17:40:19 +02:00
parent ba313787b6
commit 526272fe30
1 changed files with 104 additions and 66 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -96,10 +96,6 @@ def strip_trailing_url_punctuation(url):
 def split_concatenated_urls(text):
    """
    Insert whitespace between concatenated URLs like:
    https://t.co/aaahttps://t.co/bbb
    """
    if not text:
        return text
@@ -290,6 +286,23 @@ def canonicalize_url(url):
    return strip_trailing_url_punctuation(url.strip())
 def normalize_urlish_token(token):
    if not token:
        return None
    token = strip_trailing_url_punctuation(token.strip())
    if not token:
        return None
    if token.startswith(("http://", "https://")):
        return token
    if token.startswith("www."):
        return f"https://{token}"
    return None
 def canonicalize_tweet_url(url):
    if not url:
        return None
@@ -306,7 +319,8 @@ def canonicalize_tweet_url(url):
 def is_x_or_twitter_domain(url):
    try:
-        hostname = (urlparse(url).hostname or "").lower()
+        normalized = normalize_urlish_token(url) or url
        hostname = (urlparse(normalized).hostname or "").lower()
        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
    except Exception:
        return False
@@ -314,7 +328,8 @@ def is_x_or_twitter_domain(url):
 def is_tco_domain(url):
    try:
-        hostname = (urlparse(url).hostname or "").lower()
+        normalized = normalize_urlish_token(url) or url
        hostname = (urlparse(normalized).hostname or "").lower()
        return hostname == "t.co"
    except Exception:
        return False
@@ -331,7 +346,9 @@ def extract_urls_from_text(text):
        return []
    repaired = repair_broken_urls(text)
-    return re.findall(r"https?://[^\s#]+", repaired)
+
    pattern = r'(?:(?:https?://)|(?:www\.))[^\s<>"\']+'
    return re.findall(pattern, repaired)
 def extract_quoted_text_from_og_title(og_title):
@@ -510,7 +527,8 @@ def resolve_url_if_needed(url, http_client):
    if not url:
        return None
-    cleaned = canonicalize_url(url)
+    normalized = normalize_urlish_token(url) or url
    cleaned = canonicalize_url(normalized)
    if not cleaned:
        return None
@@ -537,7 +555,8 @@ def extract_non_x_urls_from_text(text):
    result = []
    for url in urls:
-        cleaned = strip_trailing_url_punctuation(url)
+        normalized = normalize_urlish_token(url)
        cleaned = strip_trailing_url_punctuation(normalized or url)
        if not cleaned:
            continue
@@ -587,71 +606,94 @@ def extract_first_resolved_external_url(text, http_client):
 def sanitize_visible_urls_in_text(text, http_client):
    """
-    Resolve visible t.co URLs in the text, split malformed concatenations,
+    Resolve visible t.co URLs in the text, remove x.com/twitter.com URLs from
-    and deduplicate repeated URLs.
+    visible text, normalize www. URLs, and deduplicate repeated external URLs.
    """
    if not text:
        return text, None
    working = clean_post_text(text)
-    urls = extract_urls_from_text(working)
+    url_pattern = r'(?:(?:https?://)|(?:www\.))[^\s<>"\']+'
    urls = re.findall(url_pattern, working)
    if not urls:
        return working, None
    replacements = {}
    first_external_resolved = None
-    seen_final_urls = set()
+    seen_external_per_line = set()
    for raw_url in urls:
-        cleaned = canonicalize_url(raw_url)
+        normalized = normalize_urlish_token(raw_url)
        cleaned = canonicalize_url(normalized or raw_url)
        if not cleaned:
            continue
        if is_x_or_twitter_domain(cleaned):
            replacements[raw_url] = ""
            logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}")
            continue
        final_url = cleaned
        if is_tco_domain(cleaned):
            resolved = resolve_url_if_needed(cleaned, http_client)
            if resolved:
                final_url = resolved
            if is_x_or_twitter_domain(final_url):
                replacements[raw_url] = ""
                logging.info(f"🧹 Removing resolved X/Twitter URL from visible text: {final_url}")
                continue
        if normalized and normalized.startswith("https://www."):
            final_url = normalized
        elif normalized and normalized.startswith("http://www."):
            final_url = normalized
        if is_external_non_x_url(final_url) and not first_external_resolved:
            first_external_resolved = final_url
-        replacements[cleaned] = final_url
+        replacements[raw_url] = final_url
    def replace_match(match):
        raw = match.group(0)
-        cleaned = canonicalize_url(raw)
+        return replacements.get(raw, raw)
        replacement = replacements.get(cleaned, raw)
        return replacement
-    working = re.sub(r"https?://[^\s#]+", replace_match, working)
+    working = re.sub(url_pattern, replace_match, working)
    # Deduplicate same visible URL repeated back to back or multiple times.
    deduped_lines = []
    for line in working.splitlines():
-        line_urls = re.findall(r"https?://[^\s#]+", line)
+        line_urls = re.findall(url_pattern, line)
        if len(line_urls) > 1:
-            rebuilt = line
+            prefix = re.sub(url_pattern, "", line).strip()
-            unique_urls = []
+            kept_urls = []
            for url in line_urls:
                c = canonicalize_url(url)
                if c and c not in seen_final_urls:
                    unique_urls.append(url)
                    seen_final_urls.add(c)
-            if unique_urls:
+            seen_external_per_line.clear()
-                prefix = re.sub(r"https?://[^\s#]+", "", line).strip()
+            for url in line_urls:
-                if prefix:
+                normalized = normalize_urlish_token(url) or url
-                    rebuilt = prefix + " " + " ".join(unique_urls)
+                canonical = canonicalize_url(normalized)
-                else:
+
-                    rebuilt = " ".join(unique_urls)
+                if not canonical:
                    continue
                if is_x_or_twitter_domain(canonical):
                    continue
                if canonical in seen_external_per_line:
                    continue
                seen_external_per_line.add(canonical)
                kept_urls.append(url)
            if prefix and kept_urls:
                rebuilt = prefix + " " + " ".join(kept_urls)
            elif prefix:
                rebuilt = prefix
            else:
-                rebuilt = re.sub(r"https?://[^\s#]+", "", line).strip()
+                rebuilt = " ".join(kept_urls)
            deduped_lines.append(rebuilt.strip())
        else:
-            deduped_lines.append(line.strip())
+            cleaned_line = re.sub(r"\s{2,}", " ", line).strip()
            deduped_lines.append(cleaned_line)
    working = "\n".join(deduped_lines)
    working = re.sub(r"[ \t]+", " ", working)
@@ -698,7 +740,8 @@ def remove_url_from_visible_text(text, url_to_remove):
        new_line = line
        for url in line_urls:
-            cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(url))
+            normalized = normalize_urlish_token(url) or url
            cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(normalized))
            if cleaned_candidate == canonical_target:
                pattern = re.escape(url)
                new_line = re.sub(pattern, "", new_line)
@@ -727,7 +770,7 @@ def looks_like_title_plus_url_post(text):
    urls_in_last_line = extract_ordered_non_x_urls(last_line)
    total_urls = extract_ordered_non_x_urls(repaired)
-    return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
+    return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://", "www."))
 def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
@@ -740,10 +783,10 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
        return False
    tail = repaired[idx:].strip()
-    if not tail.startswith(("http://", "https://")):
+    if not tail.startswith(("http://", "https://", "www.")):
        return False
-    if re.search(r"https?://\S+.*#[^\s#]+", tail):
+    if re.search(r"(?:https?://|www\.)\S+.*#[^\s#]+", tail):
        return True
    return False
@@ -1147,7 +1190,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
                canonical_non_x_urls = set()
                for url in urls:
                    if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
-                        canonical = canonicalize_url(url)
+                        canonical = canonicalize_url(normalize_urlish_token(url) or url)
                        if canonical:
                            canonical_non_x_urls.add(canonical)
@@ -1567,20 +1610,22 @@ def make_rich(content):
                continue
            cleaned_word = strip_trailing_url_punctuation(word)
            normalized_candidate = normalize_urlish_token(cleaned_word)
-            if cleaned_word.startswith("http://") or cleaned_word.startswith("https://"):
+            if normalized_candidate:
-                if cleaned_word.startswith("http://"):
+                if is_x_or_twitter_domain(normalized_candidate):
                    cleaned_word = cleaned_word.replace("http://", "https://", 1)
                clean_url_value = clean_url(cleaned_word)
                if clean_url_value and is_valid_url(clean_url_value):
                    text_builder.link(clean_url_value, clean_url_value)
                    trailing = word[len(cleaned_word):]
                    if trailing:
                        text_builder.text(trailing)
                else:
                    text_builder.text(word)
                else:
                    clean_url_value = clean_url(normalized_candidate)
                    if clean_url_value and is_valid_url(clean_url_value):
                        display_text = cleaned_word
                        text_builder.link(display_text, clean_url_value)
                        trailing = word[len(cleaned_word):]
                        if trailing:
                            text_builder.text(trailing)
                    else:
                        text_builder.text(word)
            elif cleaned_word.startswith("#") and len(cleaned_word) > 1:
                clean_tag = cleaned_word[1:].rstrip(".,;:!?)'\"…")
@@ -1607,7 +1652,7 @@ def make_rich(content):
 def build_dynamic_alt(raw_text):
    dynamic_alt = clean_post_text(raw_text)
    dynamic_alt = dynamic_alt.replace("\n", " ").strip()
-    dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
+    dynamic_alt = re.sub(r"(?:(?:https?://)|(?:www\.))\S+", "", dynamic_alt).strip()
    if len(dynamic_alt) > 150:
        dynamic_alt = dynamic_alt[:147] + "..."
@@ -2115,18 +2160,11 @@ def sync_feeds(args):
                    has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
                    has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
-                    if primary_non_x_url and not has_video and not has_photo:
+                    raw_text = choose_final_visible_text(
-                        raw_text = choose_final_visible_text(
+                        full_clean_text,
-                            full_clean_text,
+                        primary_non_x_url=primary_non_x_url,
-                            primary_non_x_url=primary_non_x_url,
+                        prefer_full_text_without_url=False,
-                            prefer_full_text_without_url=False,
+                    )
                        )
                    else:
                        raw_text = choose_final_visible_text(
                            full_clean_text,
                            primary_non_x_url=primary_non_x_url,
                            prefer_full_text_without_url=False,
                        )
                    media_fingerprint = build_media_fingerprint(tweet)
                    text_media_key = build_text_media_key(normalized_text, media_fingerprint)