🔧 Repaired broken mention wrapping in scraped text
This commit is contained in:
@@ -250,6 +250,93 @@ def clean_post_text(text):
|
||||
return raw_text.strip()
|
||||
|
||||
|
||||
def clean_url(url):
    """Normalize *url* and return it, or ``None`` when it fails validation.

    All internal whitespace is removed (scraped URLs are often wrapped
    across lines) and trailing punctuation is stripped before validating.
    """
    candidate = re.sub(r"\s+", "", url.strip())
    candidate = strip_trailing_url_punctuation(candidate)
    # Only hand back URLs that survive validation; everything else is dropped.
    return candidate if is_valid_url(candidate) else None
|
||||
|
||||
|
||||
def canonicalize_url(url):
    """Return *url* stripped of surrounding whitespace and trailing
    punctuation, or ``None`` for falsy input."""
    return strip_trailing_url_punctuation(url.strip()) if url else None
|
||||
|
||||
|
||||
def canonicalize_tweet_url(url):
    """Canonicalize a tweet URL to ``https://x.com/<handle>/status/<id>``.

    The handle is lowercased and twitter.com/www hosts are folded into
    x.com. URLs that do not look like tweet links are returned lowercased;
    falsy input yields ``None``.
    """
    if not url:
        return None

    stripped = url.strip()
    found = re.search(
        r"https?://(?:www\.)?(?:x\.com|twitter\.com)/([^/]+)/status/(\d+)",
        stripped,
        re.IGNORECASE,
    )
    if found is None:
        # Not a tweet link — lowercase so comparisons are case-insensitive.
        return stripped.lower()

    handle = found.group(1).lower()
    tweet_id = found.group(2)
    return f"https://x.com/{handle}/status/{tweet_id}"
|
||||
|
||||
|
||||
def is_x_or_twitter_domain(url):
    """Return ``True`` when *url*'s hostname is a known X/Twitter host."""
    known_hosts = {
        "x.com",
        "www.x.com",
        "twitter.com",
        "www.twitter.com",
        "mobile.twitter.com",
    }
    try:
        host = urlparse(url).hostname
    except Exception:
        # Malformed input (urlparse can raise ValueError) — treat as non-X.
        return False
    return (host or "").lower() in known_hosts
|
||||
|
||||
|
||||
def extract_urls_from_text(text):
    """Return every http(s) URL found in *text*.

    Line-wrapped URLs are repaired first so a URL split across lines is
    matched as a single token. Empty/falsy input yields an empty list.
    """
    if not text:
        return []
    return re.findall(r"https?://[^\s]+", repair_broken_urls(text))
|
||||
|
||||
|
||||
def extract_non_x_urls_from_text(text):
    """Return cleaned URLs from *text*, excluding X/Twitter links."""
    return [
        cleaned
        for cleaned in (
            strip_trailing_url_punctuation(raw)
            for raw in extract_urls_from_text(text)
        )
        if cleaned and not is_x_or_twitter_domain(cleaned)
    ]
|
||||
|
||||
|
||||
def extract_ordered_non_x_urls(text):
    """Return canonicalized non-X URLs from *text*, de-duplicated while
    preserving first-seen order."""
    # dict keys keep insertion order, giving ordered de-duplication for free.
    ordered = {}
    for raw in extract_non_x_urls_from_text(text):
        canonical = canonicalize_url(raw)
        if canonical:
            ordered.setdefault(canonical, None)
    return list(ordered)
|
||||
|
||||
|
||||
def looks_like_title_plus_url_post(text):
    """Heuristic: is *text* a "title ... <link>" style post?

    True only when the post has at least two non-empty lines, the final
    line begins with a URL, and that URL is the post's single non-X link.
    """
    if not text:
        return False

    repaired = strip_line_edge_whitespace(repair_broken_urls(text))
    lines = [ln.strip() for ln in repaired.splitlines() if ln.strip()]
    if len(lines) < 2:
        return False

    final_line = lines[-1]
    if not final_line.startswith(("http://", "https://")):
        return False
    return (
        len(extract_ordered_non_x_urls(final_line)) == 1
        and len(extract_ordered_non_x_urls(repaired)) == 1
    )
|
||||
|
||||
|
||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
if len(text) <= max_length:
|
||||
return text
|
||||
@@ -262,11 +349,6 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
|
||||
|
||||
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
||||
"""
|
||||
Prepare final Bluesky post text.
|
||||
If keep_url is provided and exists in the text, try to preserve it in the final output
|
||||
by truncating the body before the URL instead of cutting the URL away.
|
||||
"""
|
||||
text = (full_clean_text or "").strip()
|
||||
if not text:
|
||||
return text
|
||||
|
||||
Reference in New Issue
Block a user