From 5abd9d685accb743c60f66217a00c35ce7d913fa Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Sun, 5 Apr 2026 22:51:18 +0200 Subject: [PATCH] New test for rich snippet --- twitter2bsky_daemon.py | 136 +++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index c3d2bfa..01999d2 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30 TWEET_MAX_AGE_DAYS = 3 BSKY_TEXT_MAX_LENGTH = 275 -# Video handling notes: -# - Bluesky video support is constrained not just by duration, but also by -# practical upload limits like final file size, bitrate, resolution, and -# server-side proxy/PDS body-size caps. -# - Custom PDSes such as eurosky.social may accept images fine but fail on -# larger video blob uploads. VIDEO_MAX_DURATION_SECONDS = 179 MAX_VIDEO_UPLOAD_SIZE_MB = 45 @@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url): return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip()) +def repair_broken_urls(text): + """ + Repair URLs that were split by copied/scraped line breaks. + + Examples: + https:// + 3cat.cat/path + becomes: + https://3cat.cat/path + + https://3cat.cat/some-pa + th/article + becomes: + https://3cat.cat/some-path/article + """ + if not text: + return text + + original = text + + # Join protocol line breaks: https://\nexample.com -> https://example.com + text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE) + + # Join URL-internal line breaks when the next chunk still looks like URL content. + # This is intentionally conservative but effective for wrapped article URLs. + prev_text = None + while prev_text != text: + prev_text = text + text = re.sub( + r"((?:https?://|www\.)[^\s<>\"]*)[\r\n]+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)", + r"\1\2", + text, + flags=re.IGNORECASE + ) + + # Also fix accidental spaces inserted inside URLs after the protocol. + text = re.sub( + r"((?:https?://|www\.)[^\s<>\"]*)\s+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)", + r"\1\2", + text, + flags=re.IGNORECASE + ) + + if text != original: + logging.info("šŸ”§ Repaired broken URL wrapping in scraped text") + + return text + + def clean_url(url): trimmed_url = url.strip() cleaned_url = re.sub(r"\s+", "", trimmed_url) @@ -102,9 +145,6 @@ def canonicalize_url(url): def canonicalize_tweet_url(url): - """ - Canonicalize x.com/twitter.com status URLs for internal dedupe only. - """ if not url: return None @@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url): def extract_urls_from_text(text): if not text: return [] - return re.findall(r"https?://[^\s]+", text) + repaired = repair_broken_urls(text) + return re.findall(r"https?://[^\s]+", repaired) def extract_non_x_urls_from_text(text): @@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text): def extract_ordered_non_x_urls(text): - """ - Extract non-X URLs preserving original order and uniqueness. - This is used for posting decisions, especially external link-card creation. - """ seen = set() ordered = [] @@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text): def extract_urls_from_facets(record): - """ - Extract link URLs from Bluesky rich text facets if present. - """ urls = [] try: @@ -182,25 +216,17 @@ def extract_urls_from_facets(record): def looks_like_title_plus_url_post(text): - """ - Detect the specific desired style: - - some title/body text - - one non-X URL, typically on the last line - - Example: - Headline text... - https://example.com/story - """ if not text: return False - lines = [line.strip() for line in text.splitlines() if line.strip()] + repaired = repair_broken_urls(text) + lines = [line.strip() for line in repaired.splitlines() if line.strip()] if len(lines) < 2: return False last_line = lines[-1] urls_in_last_line = extract_ordered_non_x_urls(last_line) - total_urls = extract_ordered_non_x_urls(text) + total_urls = extract_ordered_non_x_urls(repaired) return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://")) @@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client): def fetch_link_metadata(url, http_client): - """ - Fetch metadata used to build a Bluesky external link card. - """ try: r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True) r.raise_for_status() @@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client): def build_external_link_embed(url, client, http_client, fallback_title="Link"): - """ - Build a Bluesky external embed from a URL. - This is only used when there is no image/video embed. - """ link_metadata = fetch_link_metadata(url, http_client) thumb_blob = None @@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"): def prepare_post_text(text): - raw_text = (text or "").strip() + raw_text = repair_broken_urls((text or "").strip()) if len(raw_text) > BSKY_TEXT_MAX_LENGTH: truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3] @@ -394,6 +413,7 @@ def normalize_post_text(text): if not text: return "" + text = repair_broken_urls(text) text = text.replace("\r", "\n") text = re.sub(r"\s+", " ", text).strip() return text.lower() @@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password): return client -# --- Local State Management --- def default_state(): return { "version": 1, @@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000): return state -# --- Bluesky Post History --- def get_recent_bsky_posts(client, handle, limit=30): recent_posts = [] @@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30): def make_rich(content): text_builder = client_utils.TextBuilder() - - def repair_url(match): - raw = match.group(0) - - if "\n" not in raw and "\r" not in raw: - return strip_trailing_url_punctuation(raw) - - glued = raw.replace("\n", "").replace("\r", "") - test_url = strip_trailing_url_punctuation(glued) - - if is_valid_url(test_url): - return test_url - - parts = raw.split("\n") - test_part0 = strip_trailing_url_punctuation(parts[0]) - if is_valid_url(test_part0): - return raw - - return test_url - - content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip()) + content = repair_broken_urls(content.strip()) lines = content.splitlines() for line_idx, line in enumerate(lines): @@ -730,7 +728,7 @@ def make_rich(content): def build_dynamic_alt(raw_text): - dynamic_alt = raw_text.replace("\n", " ").strip() + dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip() dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip() if len(dynamic_alt) > 150: @@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text): return None -# --- Playwright Scraping --- def scrape_tweets_via_playwright(username, password, email, target_handle): tweets = [] state_file = "twitter_browser_state.json" @@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): return False, None -# --- Main Sync Function --- def sync_feeds(args): logging.info("šŸ”„ Starting sync cycle...") try: @@ -1348,19 +1344,16 @@ def sync_feeds(args): if os.path.exists(temp_video_path): os.remove(temp_video_path) - # Only create the external rich snippet when there is no uploaded media. - # This specifically supports posts in the style: - # headline text - # https://news-site/article if not video_embed and not image_embeds: candidate_url = None - if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"): + if candidate.get("ordered_non_x_urls"): candidate_url = candidate["ordered_non_x_urls"][0] - logging.info(f"šŸ”— Detected title+URL post style. Using URL for external card: {candidate_url}") - elif candidate.get("ordered_non_x_urls"): - candidate_url = candidate["ordered_non_x_urls"][0] - logging.info(f"šŸ”— Text-only post with non-X URL. Using first URL for external card: {candidate_url}") + + if candidate.get("looks_like_title_plus_url"): + logging.info(f"šŸ”— Detected title+URL post style. Using URL for external card: {candidate_url}") + else: + logging.info(f"šŸ”— Text-only post with non-X URL. Using first URL for external card: {candidate_url}") if candidate_url: external_embed = build_external_link_embed( @@ -1433,7 +1426,6 @@ def sync_feeds(args): logging.error(f"āŒ Error during sync cycle: {e}") -# --- Main Execution --- def main(): load_dotenv()