From 5abd9d685accb743c60f66217a00c35ce7d913fa Mon Sep 17 00:00:00 2001
From: Guillem Hernandez Sola <guillem@agile611.com>
Date: Sun, 5 Apr 2026 22:51:18 +0200
Subject: [PATCH] New test for rich snippet

---
 twitter2bsky_daemon.py | 136 +++++++++++++++++++----------------------
 1 file changed, 64 insertions(+), 72 deletions(-)

diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index c3d2bfa..01999d2 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
 TWEET_MAX_AGE_DAYS = 3
 BSKY_TEXT_MAX_LENGTH = 275
 
-# Video handling notes:
-# - Bluesky video support is constrained not just by duration, but also by
-#   practical upload limits like final file size, bitrate, resolution, and
-#   server-side proxy/PDS body-size caps.
-# - Custom PDSes such as eurosky.social may accept images fine but fail on
-#   larger video blob uploads.
 VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45
 
@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
     return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
 
 
+def repair_broken_urls(text):
+    """
+    Repair URLs that were split by copied/scraped line breaks.
+
+    Examples:
+      https://
+      3cat.cat/path
+    becomes:
+      https://3cat.cat/path
+
+      https://3cat.cat/some-pa
+      th/article
+    becomes:
+      https://3cat.cat/some-path/article
+
+    Only whitespace that sits inside a URL is removed. Text that merely
+    follows a complete URL (plain words, another URL) is never glued onto
+    it, so "https://a.com/x read more" is returned unchanged.
+
+    Returns the repaired text; falsy input is returned as-is.
+    """
+    if not text:
+        return text
+
+    original = text
+
+    # No valid URL has whitespace right after the scheme, so always join it:
+    # "https://\nexample.com" and "https:// example.com" -> "https://example.com"
+    text = re.sub(r"(https?://)\s+", r"\1", text, flags=re.IGNORECASE)
+
+    # Join a line-wrapped URL only when the next token contains "/" (i.e. it
+    # looks like more path) and does not itself start a new URL. A blanket
+    # join would corrupt links such as "https://a.com/x\nvia someone".
+    wrapped = re.compile(
+        r"((?:https?://|www\.)[^\s<>\"]*)\s*[\r\n]\s*"
+        r"(?!https?://|www\.)(?=[^\s<>\"]*/)",
+        flags=re.IGNORECASE,
+    )
+    prev_text = None
+    while prev_text != text:  # repeat: a URL may be wrapped more than once
+        prev_text = text
+        text = wrapped.sub(r"\1", text)
+
+    if text != original:
+        logging.info("🔧 Repaired broken URL wrapping in scraped text")
+
+    return text
+
+
 def clean_url(url):
     trimmed_url = url.strip()
     cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -102,9 +145,6 @@ def canonicalize_url(url):
 
 
 def canonicalize_tweet_url(url):
-    """
-    Canonicalize x.com/twitter.com status URLs for internal dedupe only.
-    """
     if not url:
         return None
 
@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
 def extract_urls_from_text(text):
     if not text:
         return []
-    return re.findall(r"https?://[^\s]+", text)
+    repaired = repair_broken_urls(text)
+    return re.findall(r"https?://[^\s]+", repaired)
 
 
 def extract_non_x_urls_from_text(text):
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):
 
 
 def extract_ordered_non_x_urls(text):
-    """
-    Extract non-X URLs preserving original order and uniqueness.
-    This is used for posting decisions, especially external link-card creation.
-    """
     seen = set()
     ordered = []
 
@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):
 
 
 def extract_urls_from_facets(record):
-    """
-    Extract link URLs from Bluesky rich text facets if present.
-    """
     urls = []
 
     try:
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):
 
 
 def looks_like_title_plus_url_post(text):
-    """
-    Detect the specific desired style:
-    - some title/body text
-    - one non-X URL, typically on the last line
-
-    Example:
-      Headline text...
-      https://example.com/story
-    """
     if not text:
         return False
 
-    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    repaired = repair_broken_urls(text)
+    lines = [line.strip() for line in repaired.splitlines() if line.strip()]
     if len(lines) < 2:
         return False
 
     last_line = lines[-1]
     urls_in_last_line = extract_ordered_non_x_urls(last_line)
-    total_urls = extract_ordered_non_x_urls(text)
+    total_urls = extract_ordered_non_x_urls(repaired)
 
     return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
 
@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):
 
 
 def fetch_link_metadata(url, http_client):
-    """
-    Fetch metadata used to build a Bluesky external link card.
-    """
     try:
         r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
         r.raise_for_status()
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):
 
 
 def build_external_link_embed(url, client, http_client, fallback_title="Link"):
-    """
-    Build a Bluesky external embed from a URL.
-    This is only used when there is no image/video embed.
-    """
     link_metadata = fetch_link_metadata(url, http_client)
 
     thumb_blob = None
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
 
 
 def prepare_post_text(text):
-    raw_text = (text or "").strip()
+    raw_text = repair_broken_urls((text or "").strip())
 
     if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
         truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -394,6 +413,7 @@ def normalize_post_text(text):
     if not text:
         return ""
 
+    text = repair_broken_urls(text)
     text = text.replace("\r", "\n")
     text = re.sub(r"\s+", " ", text).strip()
     return text.lower()
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
     return client
 
 
-# --- Local State Management ---
 def default_state():
     return {
         "version": 1,
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
     return state
 
 
-# --- Bluesky Post History ---
 def get_recent_bsky_posts(client, handle, limit=30):
     recent_posts = []
 
@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
 
 def make_rich(content):
     text_builder = client_utils.TextBuilder()
-
-    def repair_url(match):
-        raw = match.group(0)
-
-        if "\n" not in raw and "\r" not in raw:
-            return strip_trailing_url_punctuation(raw)
-
-        glued = raw.replace("\n", "").replace("\r", "")
-        test_url = strip_trailing_url_punctuation(glued)
-
-        if is_valid_url(test_url):
-            return test_url
-
-        parts = raw.split("\n")
-        test_part0 = strip_trailing_url_punctuation(parts[0])
-        if is_valid_url(test_part0):
-            return raw
-
-        return test_url
-
-    content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
+    content = repair_broken_urls(content.strip())
     lines = content.splitlines()
 
     for line_idx, line in enumerate(lines):
@@ -730,7 +728,7 @@ def make_rich(content):
 
 
 def build_dynamic_alt(raw_text):
-    dynamic_alt = raw_text.replace("\n", " ").strip()
+    dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
     dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
 
     if len(dynamic_alt) > 150:
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
         return None
 
 
-# --- Playwright Scraping ---
 def scrape_tweets_via_playwright(username, password, email, target_handle):
     tweets = []
     state_file = "twitter_browser_state.json"
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
     return False, None
 
 
-# --- Main Sync Function ---
 def sync_feeds(args):
     logging.info("🔄 Starting sync cycle...")
     try:
@@ -1348,19 +1344,16 @@ def sync_feeds(args):
                                 if os.path.exists(temp_video_path):
                                     os.remove(temp_video_path)
 
-                # Only create the external rich snippet when there is no uploaded media.
-                # This specifically supports posts in the style:
-                #   headline text
-                #   https://news-site/article
                 if not video_embed and not image_embeds:
                     candidate_url = None
 
-                    if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
+                    if candidate.get("ordered_non_x_urls"):
                         candidate_url = candidate["ordered_non_x_urls"][0]
-                        logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
-                    elif candidate.get("ordered_non_x_urls"):
-                        candidate_url = candidate["ordered_non_x_urls"][0]
-                        logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
+
+                        if candidate.get("looks_like_title_plus_url"):
+                            logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
+                        else:
+                            logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
 
                     if candidate_url:
                         external_embed = build_external_link_embed(
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
         logging.error(f"❌ Error during sync cycle: {e}")
 
 
-# --- Main Execution ---
 def main():
     load_dotenv()