New test for rich snippet

2026-04-05 22:51:18 +02:00
parent 7614545893
commit 5abd9d685a
1 changed files with 64 additions and 72 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
 TWEET_MAX_AGE_DAYS = 3
 BSKY_TEXT_MAX_LENGTH = 275

-# Video handling notes:
-# - Bluesky video support is constrained not just by duration, but also by
-#   practical upload limits like final file size, bitrate, resolution, and
-#   server-side proxy/PDS body-size caps.
-# - Custom PDSes such as eurosky.social may accept images fine but fail on
-#   larger video blob uploads.
 VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45

@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
    return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())


+def repair_broken_urls(text):
+    """
+    Repair URLs that were split by copied/scraped line breaks.
+
+    Examples:
+      https://
+      3cat.cat/path
+    becomes:
+      https://3cat.cat/path
+
+      https://3cat.cat/some-pa
+      th/article
+    becomes:
+      https://3cat.cat/some-path/article
+
+    Only two joins are made: whitespace right after a bare protocol, and a
+    line break inside a URL when the next chunk looks like URL content (it
+    is not a new URL, hashtag or mention, and has "/", "?", "=" or "&"
+    followed by a letter or digit). Plain words after a URL are left alone.
+    Falsy input is returned as-is; an INFO line is logged when text changes.
+    """
+    if not text:
+        return text
+
+    original = text
+    url_chars = r"A-Za-z0-9/\-._~%!$&'()*+,;=:@?#"
+
+    # Nothing valid can follow "https://" except a host, so any whitespace
+    # (spaces or line breaks) directly after the protocol is a broken wrap.
+    text = re.sub(r"(https?://)\s+(?=\S)", r"\1", text, flags=re.IGNORECASE)
+
+    # Heuristic continuation test (see docstring). Lookaheads leave rejected
+    # chunks unconsumed so a URL on the following line can still be repaired.
+    wrapped_url = re.compile(
+        r"((?:https?://|www\.)[^\s<>\"]*)[\r\n]+(?!https?://|www\.|[#@])"
+        r"(?=[" + url_chars + r"]*[/?=&][A-Za-z0-9])([" + url_chars + r"]+)",
+        flags=re.IGNORECASE,
+    )
+    prev_text = None
+    while prev_text != text:
+        prev_text = text
+        text = wrapped_url.sub(r"\1\2", text)
+
+    if text != original:
+        logging.info("🔧 Repaired broken URL wrapping in scraped text")
+
+    return text
+
+
 def clean_url(url):
    trimmed_url = url.strip()
    cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -102,9 +145,6 @@ def canonicalize_url(url):


 def canonicalize_tweet_url(url):
-    """
-    Canonicalize x.com/twitter.com status URLs for internal dedupe only.
-    """
    if not url:
        return None

@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
 def extract_urls_from_text(text):
    if not text:
        return []
-    return re.findall(r"https?://[^\s]+", text)
+    repaired = repair_broken_urls(text)
+    return re.findall(r"https?://[^\s]+", repaired)


 def extract_non_x_urls_from_text(text):
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):


 def extract_ordered_non_x_urls(text):
-    """
-    Extract non-X URLs preserving original order and uniqueness.
-    This is used for posting decisions, especially external link-card creation.
-    """
    seen = set()
    ordered = []

@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):


 def extract_urls_from_facets(record):
-    """
-    Extract link URLs from Bluesky rich text facets if present.
-    """
    urls = []

    try:
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):


 def looks_like_title_plus_url_post(text):
-    """
-    Detect the specific desired style:
-    - some title/body text
-    - one non-X URL, typically on the last line
-
-    Example:
-      Headline text...
-      https://example.com/story
-    """
    if not text:
        return False

-    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    repaired = repair_broken_urls(text)
+    lines = [line.strip() for line in repaired.splitlines() if line.strip()]
    if len(lines) < 2:
        return False

    last_line = lines[-1]
    urls_in_last_line = extract_ordered_non_x_urls(last_line)
-    total_urls = extract_ordered_non_x_urls(text)
+    total_urls = extract_ordered_non_x_urls(repaired)

    return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))

@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):


 def fetch_link_metadata(url, http_client):
-    """
-    Fetch metadata used to build a Bluesky external link card.
-    """
    try:
        r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
        r.raise_for_status()
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):


 def build_external_link_embed(url, client, http_client, fallback_title="Link"):
-    """
-    Build a Bluesky external embed from a URL.
-    This is only used when there is no image/video embed.
-    """
    link_metadata = fetch_link_metadata(url, http_client)

    thumb_blob = None
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):


 def prepare_post_text(text):
-    raw_text = (text or "").strip()
+    raw_text = repair_broken_urls((text or "").strip())

    if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
        truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -394,6 +413,7 @@ def normalize_post_text(text):
    if not text:
        return ""

+    text = repair_broken_urls(text)
    text = text.replace("\r", "\n")
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
    return client


-# --- Local State Management ---
 def default_state():
    return {
        "version": 1,
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
    return state


-# --- Bluesky Post History ---
 def get_recent_bsky_posts(client, handle, limit=30):
    recent_posts = []

@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):

 def make_rich(content):
    text_builder = client_utils.TextBuilder()
-
-    def repair_url(match):
-        raw = match.group(0)
-
-        if "\n" not in raw and "\r" not in raw:
-            return strip_trailing_url_punctuation(raw)
-
-        glued = raw.replace("\n", "").replace("\r", "")
-        test_url = strip_trailing_url_punctuation(glued)
-
-        if is_valid_url(test_url):
-            return test_url
-
-        parts = raw.split("\n")
-        test_part0 = strip_trailing_url_punctuation(parts[0])
-        if is_valid_url(test_part0):
-            return raw
-
-        return test_url
-
-    content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
+    content = repair_broken_urls(content.strip())
    lines = content.splitlines()

    for line_idx, line in enumerate(lines):
@@ -730,7 +728,7 @@ def make_rich(content):


 def build_dynamic_alt(raw_text):
-    dynamic_alt = raw_text.replace("\n", " ").strip()
+    dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
    dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()

    if len(dynamic_alt) > 150:
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
        return None


-# --- Playwright Scraping ---
 def scrape_tweets_via_playwright(username, password, email, target_handle):
    tweets = []
    state_file = "twitter_browser_state.json"
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
    return False, None


-# --- Main Sync Function ---
 def sync_feeds(args):
    logging.info("🔄 Starting sync cycle...")
    try:
@@ -1348,18 +1344,15 @@ def sync_feeds(args):
                                if os.path.exists(temp_video_path):
                                    os.remove(temp_video_path)

-                # Only create the external rich snippet when there is no uploaded media.
-                # This specifically supports posts in the style:
-                #   headline text
-                #   https://news-site/article
                if not video_embed and not image_embeds:
                    candidate_url = None

-                    if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
+                    if candidate.get("ordered_non_x_urls"):
                        candidate_url = candidate["ordered_non_x_urls"][0]
+
+                        if candidate.get("looks_like_title_plus_url"):
                            logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
-                    elif candidate.get("ordered_non_x_urls"):
-                        candidate_url = candidate["ordered_non_x_urls"][0]
+                        else:
                            logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")

                    if candidate_url:
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
        logging.error(f"❌ Error during sync cycle: {e}")


-# --- Main Execution ---
 def main():
    load_dotenv()