🔧 Repaired broken mention wrapping in scraped text

2026-04-09 16:44:08 +02:00
parent 19ec55717e
commit 3d1e202d62
1 changed file with 88 additions and 6 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -250,6 +250,93 @@ def clean_post_text(text):
    return raw_text.strip()
 def clean_url(url):
    """
    Normalise a URL string and return it only if it validates.

    Every whitespace run is removed, trailing punctuation is stripped via
    strip_trailing_url_punctuation(), and the result is checked with
    is_valid_url().

    Returns the cleaned URL, or None when the input is None/empty or the
    cleaned value does not pass validation.
    """
    # Guard falsy input the same way canonicalize_url() does; without this a
    # None argument raised AttributeError on .strip().
    if not url:
        return None
    trimmed_url = url.strip()
    cleaned_url = re.sub(r"\s+", "", trimmed_url)
    cleaned_url = strip_trailing_url_punctuation(cleaned_url)
    if is_valid_url(cleaned_url):
        return cleaned_url
    return None
 def canonicalize_url(url):
    """
    Return the canonical form of a URL.

    Falsy input (None or "") yields None. Anything else has its surrounding
    whitespace removed and is then passed through
    strip_trailing_url_punctuation().
    """
    if url:
        trimmed = url.strip()
        return strip_trailing_url_punctuation(trimmed)
    return None
 def canonicalize_tweet_url(url):
    """
    Reduce a tweet/status URL to the form https://x.com/<handle>/status/<id>.

    The handle is lowercased and anything after the numeric status id (query
    string, extra path segments) is dropped. The recognised hosts are x.com,
    www.x.com, twitter.com, www.twitter.com and mobile.twitter.com, i.e. the
    same set is_x_or_twitter_domain() accepts, so a mobile link and a desktop
    link to the same status canonicalize identically.

    Returns None for falsy input. A URL that does not look like a status link
    is returned stripped and lowercased.
    """
    if not url:
        return None
    url = url.strip()
    match = re.search(
        r"https?://(?:(?:www\.)?(?:x\.com|twitter\.com)|mobile\.twitter\.com)/([^/]+)/status/(\d+)",
        url,
        re.IGNORECASE,
    )
    if not match:
        return url.lower()
    handle = match.group(1).lower()
    tweet_id = match.group(2)
    return f"https://x.com/{handle}/status/{tweet_id}"
 def is_x_or_twitter_domain(url):
    """
    Tell whether a URL's hostname is one of the known X/Twitter hosts.

    The hostname is lowercased and compared against x.com, www.x.com,
    twitter.com, www.twitter.com and mobile.twitter.com. Any exception raised
    while parsing or comparing makes the function return False instead of
    propagating.
    """
    known_hosts = {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
    try:
        host = urlparse(url).hostname
        # hostname is None when the URL has no network location.
        normalized = host.lower() if host else ""
        return normalized in known_hosts
    except Exception:
        return False
 def extract_urls_from_text(text):
    """
    Collect every http(s) URL found in text, in order of appearance.

    The text is first passed through repair_broken_urls(). Each match runs
    from the scheme up to the next whitespace character, so any trailing
    punctuation is still attached to the returned strings. Falsy input yields
    an empty list.
    """
    if text:
        url_pattern = re.compile(r"https?://[^\s]+")
        return url_pattern.findall(repair_broken_urls(text))
    return []
 def extract_non_x_urls_from_text(text):
    """
    Return the URLs in text that do not point at an X/Twitter host.

    Each URL from extract_urls_from_text() has its trailing punctuation
    removed with strip_trailing_url_punctuation(). Empty results and URLs for
    which is_x_or_twitter_domain() is true are dropped. Order of appearance is
    kept and duplicates are not removed.
    """
    # Lazy generator: each URL is cleaned and then filtered before the next
    # one is touched.
    cleaned_urls = (
        strip_trailing_url_punctuation(found)
        for found in extract_urls_from_text(text)
    )
    return [
        candidate
        for candidate in cleaned_urls
        if candidate and not is_x_or_twitter_domain(candidate)
    ]
 def extract_ordered_non_x_urls(text):
    """
    Return the distinct non-X URLs in text, in first-seen order.

    Every URL from extract_non_x_urls_from_text() is run through
    canonicalize_url(). Falsy canonical forms are skipped and only the first
    occurrence of each canonical URL is kept.
    """
    canonical_urls = (
        canonicalize_url(found)
        for found in extract_non_x_urls_from_text(text)
    )
    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(c for c in canonical_urls if c))
 def looks_like_title_plus_url_post(text):
    """
    Detect text made of one or more lines followed by a single link line.

    Returns True only when, after repair_broken_urls() and
    strip_line_edge_whitespace(), all of the following hold:
    the text has at least two non-blank lines, the last non-blank line starts
    with http:// or https://, and both that last line and the whole text
    contain exactly one distinct non-X URL. Falsy input returns False.
    """
    if not text:
        return False

    normalized = strip_line_edge_whitespace(repair_broken_urls(text))
    non_blank = [stripped for stripped in map(str.strip, normalized.splitlines()) if stripped]
    if len(non_blank) < 2:
        return False

    final_line = non_blank[-1]
    final_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(normalized)

    if len(final_line_urls) != 1 or len(all_urls) != 1:
        return False
    return final_line.startswith(("http://", "https://"))
 def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
    if len(text) <= max_length:
        return text
@@ -262,11 +349,6 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
 def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
    """
    Prepare final Bluesky post text.
    If keep_url is provided and exists in the text, try to preserve it in the final output
    by truncating the body before the URL instead of cutting the URL away.
    """
    text = (full_clean_text or "").strip()
    if not text:
        return text
@@ -1699,4 +1781,4 @@ def main():
 if __name__ == "__main__":
-    main()
+    main()