New test for rich snippet

This commit is contained in:
2026-04-05 22:51:18 +02:00
parent 7614545893
commit 5abd9d685a

View File

@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
TWEET_MAX_AGE_DAYS = 3 TWEET_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 275 BSKY_TEXT_MAX_LENGTH = 275
# Video handling notes:
# - Bluesky video support is constrained not just by duration, but also by
# practical upload limits like final file size, bitrate, resolution, and
# server-side proxy/PDS body-size caps.
# - Custom PDSes such as eurosky.social may accept images fine but fail on
# larger video blob uploads.
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip()) return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
def repair_broken_urls(text):
    """
    Repair URLs that were split by copied/scraped line breaks.

    Examples:
        https://
        3cat.cat/path
    becomes:
        https://3cat.cat/path

        https://3cat.cat/some-pa
        th/article
    becomes:
        https://3cat.cat/some-path/article

    Falsy input (None, "") is returned unchanged.
    """
    if not text:
        return text
    original = text

    # Join protocol line breaks: "https://\nexample.com" -> "https://example.com".
    text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)

    # Join URL-internal breaks while the next chunk still looks like URL content.
    # Repeat until a fixpoint so URLs wrapped across several lines get fully glued.
    prev_text = None
    while prev_text != text:
        prev_text = text
        # Line-break joins: any URL-ish fragment on the next line is glued on.
        text = re.sub(
            r"((?:https?://|www\.)[^\s<>\"]*)[\r\n]+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
            r"\1\2",
            text,
            flags=re.IGNORECASE,
        )
        # Space joins must be stricter: only glue when the fragment contains a
        # URL-structural character (/ or %). Without this guard an ordinary
        # word following a URL ("see https://a.com now.") would be fused onto
        # it, corrupting normal prose.
        text = re.sub(
            r"((?:https?://|www\.)[^\s<>\"]*)[ \t]+"
            r"((?=[A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]*[/%])[A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
            r"\1\2",
            text,
            flags=re.IGNORECASE,
        )

    if text != original:
        logging.info("🔧 Repaired broken URL wrapping in scraped text")
    return text
def clean_url(url): def clean_url(url):
trimmed_url = url.strip() trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url) cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -102,9 +145,6 @@ def canonicalize_url(url):
def canonicalize_tweet_url(url): def canonicalize_tweet_url(url):
"""
Canonicalize x.com/twitter.com status URLs for internal dedupe only.
"""
if not url: if not url:
return None return None
@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
def extract_urls_from_text(text): def extract_urls_from_text(text):
if not text: if not text:
return [] return []
return re.findall(r"https?://[^\s]+", text) repaired = repair_broken_urls(text)
return re.findall(r"https?://[^\s]+", repaired)
def extract_non_x_urls_from_text(text): def extract_non_x_urls_from_text(text):
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):
def extract_ordered_non_x_urls(text): def extract_ordered_non_x_urls(text):
"""
Extract non-X URLs preserving original order and uniqueness.
This is used for posting decisions, especially external link-card creation.
"""
seen = set() seen = set()
ordered = [] ordered = []
@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):
def extract_urls_from_facets(record): def extract_urls_from_facets(record):
"""
Extract link URLs from Bluesky rich text facets if present.
"""
urls = [] urls = []
try: try:
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):
def looks_like_title_plus_url_post(text): def looks_like_title_plus_url_post(text):
"""
Detect the specific desired style:
- some title/body text
- one non-X URL, typically on the last line
Example:
Headline text...
https://example.com/story
"""
if not text: if not text:
return False return False
lines = [line.strip() for line in text.splitlines() if line.strip()] repaired = repair_broken_urls(text)
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
if len(lines) < 2: if len(lines) < 2:
return False return False
last_line = lines[-1] last_line = lines[-1]
urls_in_last_line = extract_ordered_non_x_urls(last_line) urls_in_last_line = extract_ordered_non_x_urls(last_line)
total_urls = extract_ordered_non_x_urls(text) total_urls = extract_ordered_non_x_urls(repaired)
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://")) return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):
def fetch_link_metadata(url, http_client): def fetch_link_metadata(url, http_client):
"""
Fetch metadata used to build a Bluesky external link card.
"""
try: try:
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True) r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
r.raise_for_status() r.raise_for_status()
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"): def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
This is only used when there is no image/video embed.
"""
link_metadata = fetch_link_metadata(url, http_client) link_metadata = fetch_link_metadata(url, http_client)
thumb_blob = None thumb_blob = None
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text): def prepare_post_text(text):
raw_text = (text or "").strip() raw_text = repair_broken_urls((text or "").strip())
if len(raw_text) > BSKY_TEXT_MAX_LENGTH: if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3] truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -394,6 +413,7 @@ def normalize_post_text(text):
if not text: if not text:
return "" return ""
text = repair_broken_urls(text)
text = text.replace("\r", "\n") text = text.replace("\r", "\n")
text = re.sub(r"\s+", " ", text).strip() text = re.sub(r"\s+", " ", text).strip()
return text.lower() return text.lower()
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
return client return client
# --- Local State Management ---
def default_state(): def default_state():
return { return {
"version": 1, "version": 1,
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
return state return state
# --- Bluesky Post History ---
def get_recent_bsky_posts(client, handle, limit=30): def get_recent_bsky_posts(client, handle, limit=30):
recent_posts = [] recent_posts = []
@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
def make_rich(content): def make_rich(content):
text_builder = client_utils.TextBuilder() text_builder = client_utils.TextBuilder()
content = repair_broken_urls(content.strip())
def repair_url(match):
raw = match.group(0)
if "\n" not in raw and "\r" not in raw:
return strip_trailing_url_punctuation(raw)
glued = raw.replace("\n", "").replace("\r", "")
test_url = strip_trailing_url_punctuation(glued)
if is_valid_url(test_url):
return test_url
parts = raw.split("\n")
test_part0 = strip_trailing_url_punctuation(parts[0])
if is_valid_url(test_part0):
return raw
return test_url
content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
lines = content.splitlines() lines = content.splitlines()
for line_idx, line in enumerate(lines): for line_idx, line in enumerate(lines):
@@ -730,7 +728,7 @@ def make_rich(content):
def build_dynamic_alt(raw_text): def build_dynamic_alt(raw_text):
dynamic_alt = raw_text.replace("\n", " ").strip() dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip() dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
if len(dynamic_alt) > 150: if len(dynamic_alt) > 150:
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
return None return None
# --- Playwright Scraping ---
def scrape_tweets_via_playwright(username, password, email, target_handle): def scrape_tweets_via_playwright(username, password, email, target_handle):
tweets = [] tweets = []
state_file = "twitter_browser_state.json" state_file = "twitter_browser_state.json"
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
return False, None return False, None
# --- Main Sync Function ---
def sync_feeds(args): def sync_feeds(args):
logging.info("🔄 Starting sync cycle...") logging.info("🔄 Starting sync cycle...")
try: try:
@@ -1348,18 +1344,15 @@ def sync_feeds(args):
if os.path.exists(temp_video_path): if os.path.exists(temp_video_path):
os.remove(temp_video_path) os.remove(temp_video_path)
# Only create the external rich snippet when there is no uploaded media.
# This specifically supports posts in the style:
# headline text
# https://news-site/article
if not video_embed and not image_embeds: if not video_embed and not image_embeds:
candidate_url = None candidate_url = None
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"): if candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0] candidate_url = candidate["ordered_non_x_urls"][0]
if candidate.get("looks_like_title_plus_url"):
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}") logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
elif candidate.get("ordered_non_x_urls"): else:
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}") logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
if candidate_url: if candidate_url:
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
logging.error(f"❌ Error during sync cycle: {e}") logging.error(f"❌ Error during sync cycle: {e}")
# --- Main Execution ---
def main(): def main():
load_dotenv() load_dotenv()