🔧 Repaired broken mention wrapping in scraped text
This commit is contained in:
@@ -250,6 +250,93 @@ def clean_post_text(text):
|
|||||||
return raw_text.strip()
|
return raw_text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def clean_url(url):
    """Normalize a scraped URL and return it only if it validates.

    All whitespace is removed from ``url`` (scraped links may be wrapped
    across lines), trailing punctuation is trimmed via
    ``strip_trailing_url_punctuation``, and the result is checked with
    ``is_valid_url``.

    Args:
        url: Raw URL string; may be ``None`` or empty.

    Returns:
        The cleaned URL string, or ``None`` when the input is missing/empty
        or the cleaned value does not pass ``is_valid_url``.
    """
    # Guard falsy input the same way canonicalize_url() and
    # canonicalize_tweet_url() do, instead of raising AttributeError on None.
    if not url:
        return None

    # Removing every whitespace run also covers leading/trailing whitespace,
    # so a separate .strip() is unnecessary.
    cleaned_url = re.sub(r"\s+", "", url)
    cleaned_url = strip_trailing_url_punctuation(cleaned_url)

    return cleaned_url if is_valid_url(cleaned_url) else None
|
||||||
|
|
||||||
|
|
||||||
|
def canonicalize_url(url):
    """Return a comparison-friendly form of ``url``.

    The value is stripped of surrounding whitespace and then passed through
    ``strip_trailing_url_punctuation``.

    Args:
        url: URL string; may be ``None`` or empty.

    Returns:
        The trimmed URL, or ``None`` when ``url`` is falsy.
    """
    if url:
        return strip_trailing_url_punctuation(url.strip())
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def canonicalize_tweet_url(url):
    """Return a canonical ``https://x.com/<handle>/status/<id>`` form of a tweet URL.

    Status links on ``x.com`` / ``twitter.com`` (bare, ``www.`` or
    ``mobile.`` host, http or https) are rewritten to a single form with a
    lowercased handle, dropping any trailing path segments or query string.
    The ``mobile.`` prefix is accepted so the set of recognised hosts agrees
    with ``is_x_or_twitter_domain``.

    Args:
        url: URL string; may be ``None`` or empty.

    Returns:
        ``None`` when ``url`` is falsy; the canonical status URL when a
        tweet status link is found; otherwise the stripped URL lowercased.
    """
    if not url:
        return None

    url = url.strip()
    match = re.search(
        r"https?://(?:(?:www|mobile)\.)?(?:x\.com|twitter\.com)/([^/]+)/status/(\d+)",
        url,
        re.IGNORECASE,
    )
    if not match:
        # Not a tweet status link: fall back to a case-insensitive key.
        return url.lower()

    handle = match.group(1).lower()
    tweet_id = match.group(2)
    return f"https://x.com/{handle}/status/{tweet_id}"
|
||||||
|
|
||||||
|
|
||||||
|
def is_x_or_twitter_domain(url):
    """Tell whether ``url`` points at a known X / Twitter hostname.

    The hostname is taken from ``urlparse(url)`` and compared
    case-insensitively against a fixed set of hosts.

    Args:
        url: URL string to inspect.

    Returns:
        ``True`` when the parsed hostname is one of the known X/Twitter
        hosts; ``False`` otherwise, including when parsing raises.
    """
    known_hosts = {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
    try:
        parsed_host = urlparse(url).hostname
        # A URL without a network location has no hostname; treat it as "".
        normalized_host = parsed_host.lower() if parsed_host else ""
        return normalized_host in known_hosts
    except Exception:
        # Best-effort check: anything unparsable is simply "not X/Twitter".
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def extract_urls_from_text(text):
    """Collect every http(s) URL found in ``text``.

    The text is first passed through ``repair_broken_urls``; each match then
    runs from ``http://`` / ``https://`` up to the next whitespace character.

    Args:
        text: Text to scan; may be ``None`` or empty.

    Returns:
        A list of URL strings in order of appearance (empty for falsy input).
    """
    if text:
        return re.findall(r"https?://[^\s]+", repair_broken_urls(text))
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def extract_non_x_urls_from_text(text):
    """Return the URLs in ``text`` that are not on an X / Twitter host.

    Each URL from ``extract_urls_from_text`` has trailing punctuation trimmed
    with ``strip_trailing_url_punctuation``; values that end up empty or that
    ``is_x_or_twitter_domain`` accepts are dropped.

    Args:
        text: Text to scan.

    Returns:
        A list of trimmed URL strings in order of appearance (duplicates kept).
    """
    trimmed_urls = (
        strip_trailing_url_punctuation(found)
        for found in extract_urls_from_text(text)
    )
    return [
        candidate
        for candidate in trimmed_urls
        if candidate and not is_x_or_twitter_domain(candidate)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ordered_non_x_urls(text):
    """Return the distinct non-X/Twitter URLs in ``text``, in first-seen order.

    URLs come from ``extract_non_x_urls_from_text`` and are canonicalized
    with ``canonicalize_url``; falsy canonical values are skipped and later
    duplicates of an already-seen canonical URL are dropped.

    Args:
        text: Text to scan.

    Returns:
        A list of unique canonical URL strings.
    """
    canonical_forms = (
        canonicalize_url(found) for found in extract_non_x_urls_from_text(text)
    )
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(form for form in canonical_forms if form))
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_title_plus_url_post(text):
    """Detect a post shaped as "some text, then a single link on the last line".

    After ``repair_broken_urls`` and ``strip_line_edge_whitespace``, the post
    qualifies when it has at least two non-blank lines, the last non-blank
    line starts with ``http://`` or ``https://``, and
    ``extract_ordered_non_x_urls`` finds exactly one URL both in that last
    line and in the whole text.

    Args:
        text: Post text; may be ``None`` or empty.

    Returns:
        ``True`` when the text matches that shape, ``False`` otherwise.
    """
    if not text:
        return False

    normalized = strip_line_edge_whitespace(repair_broken_urls(text))
    content_lines = [
        stripped
        for stripped in (raw.strip() for raw in normalized.splitlines())
        if stripped
    ]
    if len(content_lines) < 2:
        return False

    final_line = content_lines[-1]
    final_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(normalized)

    # Exactly one link overall, and it must be the one on the last line.
    if len(final_line_urls) != 1 or len(all_urls) != 1:
        return False
    return final_line.startswith(("http://", "https://"))
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
if len(text) <= max_length:
|
if len(text) <= max_length:
|
||||||
return text
|
return text
|
||||||
@@ -262,11 +349,6 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
def prepare_post_text_for_bsky(full_clean_text, keep_url=None):
|
||||||
"""
|
|
||||||
Prepare final Bluesky post text.
|
|
||||||
If keep_url is provided and exists in the text, try to preserve it in the final output
|
|
||||||
by truncating the body before the URL instead of cutting the URL away.
|
|
||||||
"""
|
|
||||||
text = (full_clean_text or "").strip()
|
text = (full_clean_text or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
@@ -1699,4 +1781,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Reference in New Issue
Block a user