🔧 Repaired broken mention wrapping in scraped text

This commit is contained in:
Guillem Hernandez Sola
2026-04-09 16:44:08 +02:00
parent 19ec55717e
commit 3d1e202d62

View File

@@ -250,6 +250,93 @@ def clean_post_text(text):
return raw_text.strip()
def clean_url(url):
    """Normalize a raw URL string and return it only when it is valid.

    Removes surrounding and internal whitespace, then trims trailing
    punctuation. Returns None when the cleaned result fails validation.
    """
    collapsed = re.sub(r"\s+", "", url.strip())
    candidate = strip_trailing_url_punctuation(collapsed)
    return candidate if is_valid_url(candidate) else None
def canonicalize_url(url):
    """Return *url* stripped of whitespace and trailing punctuation.

    Falsy input (None, empty string) yields None.
    """
    return strip_trailing_url_punctuation(url.strip()) if url else None
def canonicalize_tweet_url(url):
    """Reduce a tweet URL to canonical https://x.com/<handle>/status/<id> form.

    Matches both x.com and twitter.com (optionally www-prefixed), case
    insensitively. URLs that are not tweet permalinks are returned
    lower-cased; falsy input yields None.
    """
    if not url:
        return None
    stripped = url.strip()
    tweet_pattern = r"https?://(?:www\.)?(?:x\.com|twitter\.com)/([^/]+)/status/(\d+)"
    found = re.search(tweet_pattern, stripped, re.IGNORECASE)
    if found is None:
        # Not a tweet permalink: fall back to a simple lower-cased form.
        return stripped.lower()
    return "https://x.com/{0}/status/{1}".format(found.group(1).lower(), found.group(2))
def is_x_or_twitter_domain(url):
    """Return True when *url*'s hostname is a known X/Twitter host.

    Any parsing failure is treated as "not X/Twitter" (best-effort check).
    """
    known_hosts = ("x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com")
    try:
        host = urlparse(url).hostname
    except Exception:
        # Unparseable input is simply not an X/Twitter URL.
        return False
    return (host or "").lower() in known_hosts
def extract_urls_from_text(text):
    """Find every http(s) URL in *text*, repairing line-wrapped URLs first.

    Returns an empty list for falsy input.
    """
    if not text:
        return []
    return re.findall(r"https?://[^\s]+", repair_broken_urls(text))
def extract_non_x_urls_from_text(text):
    """Return URLs found in *text* that do not point at X/Twitter.

    Each URL has trailing punctuation stripped; empty results are dropped.
    """
    stripped = (strip_trailing_url_punctuation(u) for u in extract_urls_from_text(text))
    return [u for u in stripped if u and not is_x_or_twitter_domain(u)]
def extract_ordered_non_x_urls(text):
    """Return canonicalized non-X URLs from *text*, deduplicated in first-seen order."""
    # dict preserves insertion order, giving order-stable deduplication.
    unique = {}
    for raw in extract_non_x_urls_from_text(text):
        canonical = canonicalize_url(raw)
        if canonical:
            unique.setdefault(canonical, None)
    return list(unique)
def looks_like_title_plus_url_post(text):
    """Heuristic: does *text* look like a title followed by one external URL?

    True when the text has at least two non-blank lines, the last line
    starts with an http(s) URL, and that URL is the only non-X URL in
    the whole text.
    """
    if not text:
        return False
    normalized = strip_line_edge_whitespace(repair_broken_urls(text))
    content_lines = [ln.strip() for ln in normalized.splitlines() if ln.strip()]
    if len(content_lines) < 2:
        return False
    tail = content_lines[-1]
    if not tail.startswith(("http://", "https://")):
        return False
    return len(extract_ordered_non_x_urls(tail)) == 1 and len(extract_ordered_non_x_urls(normalized)) == 1
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
if len(text) <= max_length:
return text
@@ -262,11 +349,6 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
"""
Prepare final Bluesky post text.
If keep_url is provided and exists in the text, try to preserve it in the final output
by truncating the body before the URL instead of cutting the URL away.
"""
text = (full_clean_text or "").strip()
if not text:
return text