🔧 Repaired broken mention wrapping in scraped text

This commit is contained in:
Guillem Hernandez Sola
2026-04-09 16:44:08 +02:00
parent 19ec55717e
commit 3d1e202d62

View File

@@ -250,6 +250,93 @@ def clean_post_text(text):
return raw_text.strip()
def clean_url(url):
    """Normalize a raw URL string and return it only when it is valid.

    Removes surrounding and internal whitespace, then trims trailing
    punctuation. Returns None when the cleaned result fails validation.
    """
    collapsed = re.sub(r"\s+", "", url.strip())
    candidate = strip_trailing_url_punctuation(collapsed)
    return candidate if is_valid_url(candidate) else None
def canonicalize_url(url):
    """Return *url* stripped of whitespace and trailing punctuation.

    Falsy input (None, empty string) yields None.
    """
    return strip_trailing_url_punctuation(url.strip()) if url else None
def canonicalize_tweet_url(url):
    """Reduce a tweet URL to canonical https://x.com/<handle>/status/<id> form.

    Matches both x.com and twitter.com (optionally www-prefixed), case
    insensitively. URLs that are not tweet permalinks are returned
    lower-cased; falsy input yields None.
    """
    if not url:
        return None
    stripped = url.strip()
    tweet_pattern = r"https?://(?:www\.)?(?:x\.com|twitter\.com)/([^/]+)/status/(\d+)"
    found = re.search(tweet_pattern, stripped, re.IGNORECASE)
    if found is None:
        # Not a tweet permalink: fall back to a simple lower-cased form.
        return stripped.lower()
    return "https://x.com/{0}/status/{1}".format(found.group(1).lower(), found.group(2))
def is_x_or_twitter_domain(url):
    """Return True when *url*'s hostname is a known X/Twitter host.

    Any parsing failure is treated as "not X/Twitter" (best-effort check).
    """
    known_hosts = ("x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com")
    try:
        host = urlparse(url).hostname
    except Exception:
        # Unparseable input is simply not an X/Twitter URL.
        return False
    return (host or "").lower() in known_hosts
def extract_urls_from_text(text):
    """Find every http(s) URL in *text*, repairing line-wrapped URLs first.

    Returns an empty list for falsy input.
    """
    if not text:
        return []
    return re.findall(r"https?://[^\s]+", repair_broken_urls(text))
def extract_non_x_urls_from_text(text):
    """Return URLs found in *text* that do not point at X/Twitter.

    Each URL has trailing punctuation stripped; empty results are dropped.
    """
    stripped = (strip_trailing_url_punctuation(u) for u in extract_urls_from_text(text))
    return [u for u in stripped if u and not is_x_or_twitter_domain(u)]
def extract_ordered_non_x_urls(text):
    """Return canonicalized non-X URLs from *text*, deduplicated in first-seen order."""
    # dict preserves insertion order, giving order-stable deduplication.
    unique = {}
    for raw in extract_non_x_urls_from_text(text):
        canonical = canonicalize_url(raw)
        if canonical:
            unique.setdefault(canonical, None)
    return list(unique)
def looks_like_title_plus_url_post(text):
    """Heuristic: does *text* look like a title followed by one external URL?

    True when the text has at least two non-blank lines, the last line
    starts with an http(s) URL, and that URL is the only non-X URL in
    the whole text.
    """
    if not text:
        return False
    normalized = strip_line_edge_whitespace(repair_broken_urls(text))
    content_lines = [ln.strip() for ln in normalized.splitlines() if ln.strip()]
    if len(content_lines) < 2:
        return False
    tail = content_lines[-1]
    if not tail.startswith(("http://", "https://")):
        return False
    return len(extract_ordered_non_x_urls(tail)) == 1 and len(extract_ordered_non_x_urls(normalized)) == 1
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
if len(text) <= max_length:
return text
@@ -262,11 +349,6 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
"""
Prepare final Bluesky post text.
If keep_url is provided and exists in the text, try to preserve it in the final output
by truncating the body before the URL instead of cutting the URL away.
"""
text = (full_clean_text or "").strip()
if not text:
return text