# --- URL helper functions -------------------------------------------------
# These helpers normalize, extract, and classify URLs found in tweet text
# before cross-posting to Bluesky. They rely on file-level helpers
# (strip_trailing_url_punctuation, is_valid_url, repair_broken_urls,
# strip_line_edge_whitespace) defined elsewhere in this module.

# Tweet permalink on twitter.com or x.com (optionally www-prefixed),
# capturing the handle and the numeric status id.
_TWEET_URL_RE = re.compile(
    r"https?://(?:www\.)?(?:x\.com|twitter\.com)/([^/]+)/status/(\d+)",
    re.IGNORECASE,
)

# Bare "scheme://run-of-non-whitespace"; trailing punctuation is stripped
# by the callers that need it.
_URL_RE = re.compile(r"https?://[^\s]+")

# Hostnames treated as Twitter/X itself (links to these are self-links,
# not external URLs).
# NOTE(review): mobile.x.com is not listed — confirm whether it should be
# filtered like mobile.twitter.com.
_X_HOSTNAMES = frozenset(
    {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
)


def clean_url(url):
    """Remove all whitespace (edges and interior runs) and trailing
    punctuation from *url*; return the result, or None if it does not
    pass is_valid_url()."""
    candidate = re.sub(r"\s+", "", url.strip())
    candidate = strip_trailing_url_punctuation(candidate)
    return candidate if is_valid_url(candidate) else None


def canonicalize_url(url):
    """Normalize a URL for de-duplication: trim edge whitespace and drop
    trailing punctuation. Falsy input yields None."""
    if not url:
        return None
    return strip_trailing_url_punctuation(url.strip())


def canonicalize_tweet_url(url):
    """Rewrite any tweet permalink to the canonical
    ``https://x.com/<handle>/status/<id>`` form (handle lowercased).

    URLs that are not tweet permalinks are returned lowercased wholesale;
    falsy input yields None.
    """
    if not url:
        return None

    stripped = url.strip()
    found = _TWEET_URL_RE.search(stripped)
    if found is None:
        return stripped.lower()
    return f"https://x.com/{found.group(1).lower()}/status/{found.group(2)}"


def is_x_or_twitter_domain(url):
    """Return True when *url*'s hostname is Twitter/X itself; any parse
    failure counts as False."""
    try:
        host = (urlparse(url).hostname or "").lower()
    except Exception:
        return False
    return host in _X_HOSTNAMES


def extract_urls_from_text(text):
    """Return every raw http(s) URL found in *text*, after first repairing
    line-wrapped/broken URLs. Empty or None input yields []."""
    if not text:
        return []
    return _URL_RE.findall(repair_broken_urls(text))


def extract_non_x_urls_from_text(text):
    """Like extract_urls_from_text, but with trailing punctuation stripped
    from each hit and Twitter/X self-links filtered out."""
    stripped_urls = (
        strip_trailing_url_punctuation(raw) for raw in extract_urls_from_text(text)
    )
    return [u for u in stripped_urls if u and not is_x_or_twitter_domain(u)]


def extract_ordered_non_x_urls(text):
    """Return canonicalized external (non-X) URLs from *text*, in
    first-seen order, de-duplicated."""
    ordered = []
    seen = set()
    for raw in extract_non_x_urls_from_text(text):
        canonical = canonicalize_url(raw)
        if canonical and canonical not in seen:
            seen.add(canonical)
            ordered.append(canonical)
    return ordered


def looks_like_title_plus_url_post(text):
    """Heuristic: does *text* look like a "title, then link" post?

    True when, after repairing broken URLs and trimming line edges, there
    are at least two non-blank lines, the final line begins with
    http(s)://, and both that line and the whole text contain exactly one
    external (non-X) URL.
    """
    if not text:
        return False

    repaired = strip_line_edge_whitespace(repair_broken_urls(text))
    content_lines = [ln.strip() for ln in repaired.splitlines() if ln.strip()]
    if len(content_lines) < 2:
        return False

    tail = content_lines[-1]
    if not tail.startswith(("http://", "https://")):
        return False
    return (
        len(extract_ordered_non_x_urls(tail)) == 1
        and len(extract_ordered_non_x_urls(repaired)) == 1
    )