fix(sync): clean scraped post text and improve external link card thumbnail handling
This commit is contained in:
@@ -29,9 +29,7 @@ VIDEO_MAX_DURATION_SECONDS = 179
|
|||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
# External-card thumbnail constraints:
|
# External-card thumbnail constraints:
|
||||||
# The user's PDS returned:
|
# Conservative safe target below the observed PDS max (~976.56 KB).
|
||||||
# BlobTooLarge: 1.15MB > 976.56KB
|
|
||||||
# So we conservatively target a slightly smaller max size for safety.
|
|
||||||
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
||||||
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
||||||
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
||||||
@@ -134,6 +132,169 @@ def repair_broken_urls(text):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def repair_broken_mentions(text):
    """
    Repair mention-related line wrapping in scraped text.

    Scraped posts sometimes arrive with an @mention split onto its own
    line. Handles cases like:

        Ho explica
        @martamartorell

        La
        @sanfenerea
        tenia un repte

    becoming:

        Ho explica @martamartorell
        La @sanfenerea tenia un repte

    while preserving real paragraph breaks and standalone mention lines
    (a mention that forms its own paragraph stays on its own line).

    Falsy input is returned unchanged. Non-blank lines are emitted with
    their edge whitespace stripped.
    """
    if not text:
        return text

    lines = text.splitlines()
    result = []
    i = 0
    changed = False

    def is_mention_only_line(s):
        # NOTE(review): matches plain word-character handles only; dotted
        # handles such as @user.example.com would NOT match — confirm
        # whether those appear in scraped input.
        return bool(re.fullmatch(r"@[A-Za-z0-9_]+", s.strip()))

    def is_blank_line(s):
        return not s.strip()

    while i < len(lines):
        current = lines[i]
        stripped = current.strip()

        # Blank lines delimit paragraphs; keep them as-is.
        if is_blank_line(current):
            result.append("")
            i += 1
            continue

        # Case 1: current line is only a mention — attach it backward to
        # the previous non-blank output line if there is one; otherwise
        # keep it standalone (start of text, or its own paragraph).
        if is_mention_only_line(current):
            if result and result[-1].strip():
                result[-1] = result[-1].rstrip() + " " + stripped
                changed = True
            else:
                result.append(stripped)

            i += 1

            # Absorb immediately following continuation lines until a
            # blank line or another standalone mention.
            # (The original code repeated the blank-line test after the
            # increment as well; that check was dead — the loop head
            # performs it with no intervening side effects — and has
            # been removed.)
            while i < len(lines):
                next_stripped = lines[i].strip()
                if not next_stripped or is_mention_only_line(lines[i]):
                    break
                result[-1] = result[-1].rstrip() + " " + next_stripped
                changed = True
                i += 1

            continue

        # Case 2: current line has text and the next line is a bare
        # mention — merge them, then absorb continuation lines exactly
        # as in Case 1.
        if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
            merged = stripped + " " + lines[i + 1].strip()
            changed = True
            i += 2

            while i < len(lines):
                next_stripped = lines[i].strip()
                if not next_stripped or is_mention_only_line(lines[i]):
                    break
                merged = merged.rstrip() + " " + next_stripped
                changed = True
                i += 1

            result.append(merged)
            continue

        # Plain text line: keep it, stripped of edge whitespace.
        result.append(stripped)
        i += 1

    new_text = "\n".join(result)

    if changed:
        logging.info("🔧 Repaired broken mention wrapping in scraped text")

    return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def strip_line_edge_whitespace(text):
    """
    Trim whitespace from both ends of every line of *text*.

    Line structure — including intentionally blank lines — is preserved;
    only leading/trailing spaces on each individual line are removed.
    Falsy input is returned unchanged. Logs once if anything was trimmed.
    """
    if not text:
        return text

    original_lines = text.splitlines()
    trimmed_lines = [raw.strip() for raw in original_lines]

    # Log only when at least one line actually changed.
    if any(trimmed != raw for trimmed, raw in zip(trimmed_lines, original_lines)):
        logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines")

    return "\n".join(trimmed_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_trailing_ellipsis_line(text):
    """
    Drop trailing lines that consist solely of an ellipsis marker.

    Handles:
    - ...
    - …

    The surviving text is returned with outer whitespace stripped.
    Falsy input is returned unchanged.
    """
    if not text:
        return text

    ellipsis_markers = ("...", "…")
    remaining = text.splitlines()

    # Peel ellipsis-only lines off the end, one at a time.
    while remaining:
        if remaining[-1].strip() not in ellipsis_markers:
            break
        del remaining[-1]

    return "\n".join(remaining).strip()
|
||||||
|
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
trimmed_url = url.strip()
|
trimmed_url = url.strip()
|
||||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||||
@@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
repaired = repair_broken_urls(text)
|
repaired = repair_broken_urls(text)
|
||||||
|
repaired = strip_line_edge_whitespace(repaired)
|
||||||
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
||||||
if len(lines) < 2:
|
if len(lines) < 2:
|
||||||
return False
|
return False
|
||||||
@@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
img = img.resize(new_size, Image.LANCZOS)
|
img = img.resize(new_size, Image.LANCZOS)
|
||||||
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
|
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
|
||||||
|
|
||||||
# Try progressively lower qualities.
|
|
||||||
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
|
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
|
||||||
@@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# If still too large, try a second resize pass.
|
|
||||||
for target_dim in [1000, 900, 800, 700, 600]:
|
for target_dim in [1000, 900, 800, 700, 600]:
|
||||||
resized = img.copy()
|
resized = img.copy()
|
||||||
width, height = resized.size
|
width, height = resized.size
|
||||||
@@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_post_text(text):
|
def prepare_post_text(text):
|
||||||
raw_text = repair_broken_urls((text or "").strip())
|
raw_text = (text or "").strip()
|
||||||
|
raw_text = repair_broken_urls(raw_text)
|
||||||
|
raw_text = repair_broken_mentions(raw_text)
|
||||||
|
raw_text = strip_line_edge_whitespace(raw_text)
|
||||||
|
raw_text = remove_trailing_ellipsis_line(raw_text)
|
||||||
|
|
||||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||||
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
||||||
@@ -530,7 +694,7 @@ def prepare_post_text(text):
|
|||||||
else:
|
else:
|
||||||
raw_text = truncated + "..."
|
raw_text = truncated + "..."
|
||||||
|
|
||||||
return raw_text
|
return raw_text.strip()
|
||||||
|
|
||||||
|
|
||||||
def normalize_post_text(text):
|
def normalize_post_text(text):
|
||||||
@@ -538,6 +702,9 @@ def normalize_post_text(text):
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
text = repair_broken_urls(text)
|
text = repair_broken_urls(text)
|
||||||
|
text = repair_broken_mentions(text)
|
||||||
|
text = strip_line_edge_whitespace(text)
|
||||||
|
text = remove_trailing_ellipsis_line(text)
|
||||||
text = text.replace("\r", "\n")
|
text = text.replace("\r", "\n")
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
return text.lower()
|
return text.lower()
|
||||||
@@ -765,7 +932,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
if item.reason is not None:
|
if item.reason is not None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
record = item.post.record
|
record = item.post.record
|
||||||
if getattr(record, "reply", None) is not None:
|
if getattr(record, "reply", None) is not None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
text_builder = client_utils.TextBuilder()
|
text_builder = client_utils.TextBuilder()
|
||||||
content = repair_broken_urls(content.strip())
|
content = repair_broken_urls(content.strip())
|
||||||
|
content = repair_broken_mentions(content)
|
||||||
|
content = strip_line_edge_whitespace(content)
|
||||||
|
content = remove_trailing_ellipsis_line(content)
|
||||||
lines = content.splitlines()
|
lines = content.splitlines()
|
||||||
|
|
||||||
for line_idx, line in enumerate(lines):
|
for line_idx, line in enumerate(lines):
|
||||||
@@ -852,7 +1022,11 @@ def make_rich(content):
|
|||||||
|
|
||||||
|
|
||||||
def build_dynamic_alt(raw_text):
|
def build_dynamic_alt(raw_text):
|
||||||
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
|
dynamic_alt = repair_broken_urls(raw_text)
|
||||||
|
dynamic_alt = repair_broken_mentions(dynamic_alt)
|
||||||
|
dynamic_alt = strip_line_edge_whitespace(dynamic_alt)
|
||||||
|
dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt)
|
||||||
|
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
|
||||||
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
||||||
|
|
||||||
if len(dynamic_alt) > 150:
|
if len(dynamic_alt) > 150:
|
||||||
|
|||||||
Reference in New Issue
Block a user