fix(sync): sanitize visible tweet URLs by resolving t.co links and removing concatenated duplicates

This commit is contained in:
Guillem Hernandez Sola
2026-04-13 17:40:19 +02:00
parent ba313787b6
commit 526272fe30

View File

@@ -96,10 +96,6 @@ def strip_trailing_url_punctuation(url):
def split_concatenated_urls(text):
"""
Insert whitespace between concatenated URLs like:
https://t.co/aaahttps://t.co/bbb
"""
if not text:
return text
@@ -290,6 +286,23 @@ def canonicalize_url(url):
return strip_trailing_url_punctuation(url.strip())
def normalize_urlish_token(token):
    """Normalize a URL-ish token into an absolute ``http(s)://`` form.

    Surrounding whitespace and trailing punctuation are stripped first.
    A bare ``www.`` token is promoted to ``https://``; a token that
    already carries an http(s) scheme passes through unchanged; anything
    else is rejected.

    Returns the normalized URL string, or ``None`` when the token is
    empty or does not look like a URL.
    """
    if not token:
        return None
    candidate = strip_trailing_url_punctuation(token.strip())
    if not candidate:
        return None
    # Scheme-less www. tokens get an explicit https scheme so urlparse
    # can extract a hostname from them later.
    if candidate.startswith("www."):
        return "https://" + candidate
    return candidate if candidate.startswith(("http://", "https://")) else None
def canonicalize_tweet_url(url):
if not url:
return None
@@ -306,7 +319,8 @@ def canonicalize_tweet_url(url):
def is_x_or_twitter_domain(url):
    """Return True when *url* points at an X/Twitter web host.

    The candidate is normalized first (so scheme-less ``www.x.com/...``
    tokens can be parsed); when normalization rejects the token, the raw
    value is parsed as-is. Any parsing failure is treated as "not an
    X/Twitter domain".
    """
    try:
        # Fix: drop the leftover pre-patch assignment that computed the
        # hostname from the raw URL only to overwrite it immediately —
        # parse the normalized candidate once.
        normalized = normalize_urlish_token(url) or url
        hostname = (urlparse(normalized).hostname or "").lower()
        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
    except Exception:
        # Malformed input: conservatively report "not X/Twitter".
        return False
@@ -314,7 +328,8 @@ def is_x_or_twitter_domain(url):
def is_tco_domain(url):
    """Return True when *url*'s hostname is Twitter's ``t.co`` shortener.

    The candidate is normalized first (so scheme-less tokens can be
    parsed); when normalization rejects the token, the raw value is
    parsed as-is. Any parsing failure yields False.
    """
    try:
        # Fix: drop the leftover pre-patch assignment that computed the
        # hostname from the raw URL only to overwrite it immediately —
        # parse the normalized candidate once.
        normalized = normalize_urlish_token(url) or url
        hostname = (urlparse(normalized).hostname or "").lower()
        return hostname == "t.co"
    except Exception:
        # Malformed input: conservatively report "not t.co".
        return False
@@ -331,7 +346,9 @@ def extract_urls_from_text(text):
return []
repaired = repair_broken_urls(text)
return re.findall(r"https?://[^\s#]+", repaired)
pattern = r'(?:(?:https?://)|(?:www\.))[^\s<>"\']+'
return re.findall(pattern, repaired)
def extract_quoted_text_from_og_title(og_title):
@@ -510,7 +527,8 @@ def resolve_url_if_needed(url, http_client):
if not url:
return None
cleaned = canonicalize_url(url)
normalized = normalize_urlish_token(url) or url
cleaned = canonicalize_url(normalized)
if not cleaned:
return None
@@ -537,7 +555,8 @@ def extract_non_x_urls_from_text(text):
result = []
for url in urls:
cleaned = strip_trailing_url_punctuation(url)
normalized = normalize_urlish_token(url)
cleaned = strip_trailing_url_punctuation(normalized or url)
if not cleaned:
continue
@@ -587,71 +606,94 @@ def extract_first_resolved_external_url(text, http_client):
def sanitize_visible_urls_in_text(text, http_client):
"""
Resolve visible t.co URLs in the text, split malformed concatenations,
and deduplicate repeated URLs.
Resolve visible t.co URLs in the text, remove x.com/twitter.com URLs from
visible text, normalize www. URLs, and deduplicate repeated external URLs.
"""
if not text:
return text, None
working = clean_post_text(text)
urls = extract_urls_from_text(working)
url_pattern = r'(?:(?:https?://)|(?:www\.))[^\s<>"\']+'
urls = re.findall(url_pattern, working)
if not urls:
return working, None
replacements = {}
first_external_resolved = None
seen_final_urls = set()
seen_external_per_line = set()
for raw_url in urls:
cleaned = canonicalize_url(raw_url)
normalized = normalize_urlish_token(raw_url)
cleaned = canonicalize_url(normalized or raw_url)
if not cleaned:
continue
if is_x_or_twitter_domain(cleaned):
replacements[raw_url] = ""
logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}")
continue
final_url = cleaned
if is_tco_domain(cleaned):
resolved = resolve_url_if_needed(cleaned, http_client)
if resolved:
final_url = resolved
if is_x_or_twitter_domain(final_url):
replacements[raw_url] = ""
logging.info(f"🧹 Removing resolved X/Twitter URL from visible text: {final_url}")
continue
if normalized and normalized.startswith("https://www."):
final_url = normalized
elif normalized and normalized.startswith("http://www."):
final_url = normalized
if is_external_non_x_url(final_url) and not first_external_resolved:
first_external_resolved = final_url
replacements[cleaned] = final_url
replacements[raw_url] = final_url
def replace_match(match):
raw = match.group(0)
cleaned = canonicalize_url(raw)
replacement = replacements.get(cleaned, raw)
return replacement
return replacements.get(raw, raw)
working = re.sub(r"https?://[^\s#]+", replace_match, working)
working = re.sub(url_pattern, replace_match, working)
# Deduplicate same visible URL repeated back to back or multiple times.
deduped_lines = []
for line in working.splitlines():
line_urls = re.findall(r"https?://[^\s#]+", line)
line_urls = re.findall(url_pattern, line)
if len(line_urls) > 1:
rebuilt = line
unique_urls = []
for url in line_urls:
c = canonicalize_url(url)
if c and c not in seen_final_urls:
unique_urls.append(url)
seen_final_urls.add(c)
prefix = re.sub(url_pattern, "", line).strip()
kept_urls = []
if unique_urls:
prefix = re.sub(r"https?://[^\s#]+", "", line).strip()
if prefix:
rebuilt = prefix + " " + " ".join(unique_urls)
else:
rebuilt = " ".join(unique_urls)
seen_external_per_line.clear()
for url in line_urls:
normalized = normalize_urlish_token(url) or url
canonical = canonicalize_url(normalized)
if not canonical:
continue
if is_x_or_twitter_domain(canonical):
continue
if canonical in seen_external_per_line:
continue
seen_external_per_line.add(canonical)
kept_urls.append(url)
if prefix and kept_urls:
rebuilt = prefix + " " + " ".join(kept_urls)
elif prefix:
rebuilt = prefix
else:
rebuilt = re.sub(r"https?://[^\s#]+", "", line).strip()
rebuilt = " ".join(kept_urls)
deduped_lines.append(rebuilt.strip())
else:
deduped_lines.append(line.strip())
cleaned_line = re.sub(r"\s{2,}", " ", line).strip()
deduped_lines.append(cleaned_line)
working = "\n".join(deduped_lines)
working = re.sub(r"[ \t]+", " ", working)
@@ -698,7 +740,8 @@ def remove_url_from_visible_text(text, url_to_remove):
new_line = line
for url in line_urls:
cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(url))
normalized = normalize_urlish_token(url) or url
cleaned_candidate = canonicalize_url(strip_trailing_url_punctuation(normalized))
if cleaned_candidate == canonical_target:
pattern = re.escape(url)
new_line = re.sub(pattern, "", new_line)
@@ -727,7 +770,7 @@ def looks_like_title_plus_url_post(text):
urls_in_last_line = extract_ordered_non_x_urls(last_line)
total_urls = extract_ordered_non_x_urls(repaired)
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://", "www."))
def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
@@ -740,10 +783,10 @@ def looks_like_url_and_tag_tail(text, primary_non_x_url=None):
return False
tail = repaired[idx:].strip()
if not tail.startswith(("http://", "https://")):
if not tail.startswith(("http://", "https://", "www.")):
return False
if re.search(r"https?://\S+.*#[^\s#]+", tail):
if re.search(r"(?:https?://|www\.)\S+.*#[^\s#]+", tail):
return True
return False
@@ -1147,7 +1190,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
canonical_non_x_urls = set()
for url in urls:
if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
canonical = canonicalize_url(url)
canonical = canonicalize_url(normalize_urlish_token(url) or url)
if canonical:
canonical_non_x_urls.add(canonical)
@@ -1567,20 +1610,22 @@ def make_rich(content):
continue
cleaned_word = strip_trailing_url_punctuation(word)
normalized_candidate = normalize_urlish_token(cleaned_word)
if cleaned_word.startswith("http://") or cleaned_word.startswith("https://"):
if cleaned_word.startswith("http://"):
cleaned_word = cleaned_word.replace("http://", "https://", 1)
clean_url_value = clean_url(cleaned_word)
if clean_url_value and is_valid_url(clean_url_value):
text_builder.link(clean_url_value, clean_url_value)
trailing = word[len(cleaned_word):]
if trailing:
text_builder.text(trailing)
else:
if normalized_candidate:
if is_x_or_twitter_domain(normalized_candidate):
text_builder.text(word)
else:
clean_url_value = clean_url(normalized_candidate)
if clean_url_value and is_valid_url(clean_url_value):
display_text = cleaned_word
text_builder.link(display_text, clean_url_value)
trailing = word[len(cleaned_word):]
if trailing:
text_builder.text(trailing)
else:
text_builder.text(word)
elif cleaned_word.startswith("#") and len(cleaned_word) > 1:
clean_tag = cleaned_word[1:].rstrip(".,;:!?)'\"")
@@ -1607,7 +1652,7 @@ def make_rich(content):
def build_dynamic_alt(raw_text):
dynamic_alt = clean_post_text(raw_text)
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
dynamic_alt = re.sub(r"(?:(?:https?://)|(?:www\.))\S+", "", dynamic_alt).strip()
if len(dynamic_alt) > 150:
dynamic_alt = dynamic_alt[:147] + "..."
@@ -2115,18 +2160,11 @@ def sync_feeds(args):
has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
if primary_non_x_url and not has_video and not has_photo:
raw_text = choose_final_visible_text(
full_clean_text,
primary_non_x_url=primary_non_x_url,
prefer_full_text_without_url=False,
)
else:
raw_text = choose_final_visible_text(
full_clean_text,
primary_non_x_url=primary_non_x_url,
prefer_full_text_without_url=False,
)
raw_text = choose_final_visible_text(
full_clean_text,
primary_non_x_url=primary_non_x_url,
prefer_full_text_without_url=False,
)
media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint)