fix(sync): clean scraped post text and improve external link card thumbnail handling

2026-04-06 22:23:04 +02:00
parent e4334f44d2
commit 8529016386
1 changed file with 183 additions and 9 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -29,9 +29,7 @@ VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45

 # External-card thumbnail constraints:
-# The user's PDS returned:
-#   BlobTooLarge: 1.15MB > 976.56KB
-# So we conservatively target a slightly smaller max size for safety.
+# Conservative safe target below the observed PDS max (~976.56 KB).
 EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
 EXTERNAL_THUMB_MAX_DIMENSION = 1200
 EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
@@ -134,6 +132,169 @@ def repair_broken_urls(text):
    return text


+def repair_broken_mentions(text):
+    """
+    Re-join scraped lines that were split around an @mention.
+
+    A line made up solely of a mention ("@handle") is glued onto the
+    preceding non-blank output line, and the non-blank, non-mention lines
+    that follow it are glued on too, each separated by a single space:
+
+      Ho explica            ->  Ho explica @martamartorell
+      @martamartorell
+
+      La                    ->  La @sanfenerea tenia un repte
+      @sanfenerea
+      tenia un repte
+
+      Hospital              ->  Hospital @parctauli .
+      @parctauli
+      .
+
+      conjunt               ->  conjunt @bomberscat -SEM.
+      @bomberscat
+      -SEM.
+
+    Blank lines are kept (as empty lines) and always end a merge, so
+    paragraph breaks survive. A mention line that opens the text or
+    follows a blank line starts its own output line (and still absorbs
+    the continuation lines after it). Every emitted line is stripped of
+    edge whitespace. Falsy input is returned unchanged. An info message
+    is logged when at least one merge happened.
+    """
+    if not text:
+        return text
+
+    mention_pattern = re.compile(r"@[A-Za-z0-9_]+")
+    source = text.splitlines()
+    total = len(source)
+
+    def is_mention(candidate):
+        return mention_pattern.fullmatch(candidate.strip()) is not None
+
+    def take_continuation(start):
+        # Gather stripped lines from `start` up to (not including) the next
+        # blank or mention-only line; return them with the stop index.
+        pieces = []
+        cursor = start
+        while cursor < total:
+            piece = source[cursor].strip()
+            if not piece or is_mention(piece):
+                break
+            pieces.append(piece)
+            cursor += 1
+        return pieces, cursor
+
+    merged_lines = []
+    did_merge = False
+    pos = 0
+
+    while pos < total:
+        token = source[pos].strip()
+
+        if not token:
+            # Blank line: keep it as a paragraph separator.
+            merged_lines.append("")
+            pos += 1
+        elif is_mention(token):
+            tail, pos = take_continuation(pos + 1)
+            if merged_lines and merged_lines[-1].strip():
+                # The mention belongs to the previous output line.
+                merged_lines[-1] = " ".join([merged_lines[-1].rstrip(), token, *tail])
+                did_merge = True
+            else:
+                # Nothing to attach to: the mention opens its own line.
+                merged_lines.append(" ".join([token, *tail]))
+                did_merge = did_merge or bool(tail)
+        elif pos + 1 < total and is_mention(source[pos + 1]):
+            # Text line directly followed by a mention-only line.
+            tail, next_pos = take_continuation(pos + 2)
+            merged_lines.append(" ".join([token, source[pos + 1].strip(), *tail]))
+            did_merge = True
+            pos = next_pos
+        else:
+            merged_lines.append(token)
+            pos += 1
+
+    if did_merge:
+        logging.info("🔧 Repaired broken mention wrapping in scraped text")
+
+    return "\n".join(merged_lines)
+
+
+def strip_line_edge_whitespace(text):
+    """
+    Trim whitespace from both ends of every line.
+
+    The number and order of lines is kept, so intentional blank lines
+    survive as empty lines. Falsy input is returned unchanged. An info
+    message is logged when at least one line was actually altered.
+    """
+    if not text:
+        return text
+
+    original_lines = text.splitlines()
+    trimmed_lines = [entry.strip() for entry in original_lines]
+
+    # The lists differ exactly when some line had edge whitespace removed.
+    if trimmed_lines != original_lines:
+        logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines")
+
+    return "\n".join(trimmed_lines)
+
+
+def remove_trailing_ellipsis_line(text):
+    """
+    Remove trailing lines that are only ellipsis markers.
+    Handles:
+    - ...
+    - …
+
+    Blank lines at the end of the text are dropped together with the
+    markers, so an ellipsis line that is followed by empty lines is still
+    removed. Ellipses inside a line of real text are left untouched.
+    Falsy input is returned unchanged; otherwise the result is stripped
+    of leading/trailing whitespace.
+    """
+    if not text:
+        return text
+
+    lines = text.splitlines()
+
+    # Blank trailing lines would otherwise hide an ellipsis marker from the
+    # check below; the final strip() would have discarded them anyway.
+    while lines and lines[-1].strip() in {"", "...", "…"}:
+        lines.pop()
+
+    return "\n".join(lines).strip()
+
+
 def clean_url(url):
    trimmed_url = url.strip()
    cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text):
        return False

    repaired = repair_broken_urls(text)
+    repaired = strip_line_edge_whitespace(repaired)
    lines = [line.strip() for line in repaired.splitlines() if line.strip()]
    if len(lines) < 2:
        return False
@@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
                img = img.resize(new_size, Image.LANCZOS)
                logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")

-            # Try progressively lower qualities.
            for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
                out = io.BytesIO()
                img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
@@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
                if len(data) <= max_bytes:
                    return data

-            # If still too large, try a second resize pass.
            for target_dim in [1000, 900, 800, 700, 600]:
                resized = img.copy()
                width, height = resized.size
@@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):


 def prepare_post_text(text):
-    raw_text = repair_broken_urls((text or "").strip())
+    raw_text = (text or "").strip()
+    raw_text = repair_broken_urls(raw_text)
+    raw_text = repair_broken_mentions(raw_text)
+    raw_text = strip_line_edge_whitespace(raw_text)
+    raw_text = remove_trailing_ellipsis_line(raw_text)

    if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
        truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -530,7 +694,7 @@ def prepare_post_text(text):
        else:
            raw_text = truncated + "..."

-    return raw_text
+    return raw_text.strip()


 def normalize_post_text(text):
@@ -538,6 +702,9 @@ def normalize_post_text(text):
        return ""

    text = repair_broken_urls(text)
+    text = repair_broken_mentions(text)
+    text = strip_line_edge_whitespace(text)
+    text = remove_trailing_ellipsis_line(text)
    text = text.replace("\r", "\n")
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()
@@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30):
 def make_rich(content):
    text_builder = client_utils.TextBuilder()
    content = repair_broken_urls(content.strip())
+    content = repair_broken_mentions(content)
+    content = strip_line_edge_whitespace(content)
+    content = remove_trailing_ellipsis_line(content)
    lines = content.splitlines()

    for line_idx, line in enumerate(lines):
@@ -852,7 +1022,11 @@ def make_rich(content):


 def build_dynamic_alt(raw_text):
-    dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
+    dynamic_alt = repair_broken_urls(raw_text)
+    dynamic_alt = repair_broken_mentions(dynamic_alt)
+    dynamic_alt = strip_line_edge_whitespace(dynamic_alt)
+    dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt)
+    dynamic_alt = dynamic_alt.replace("\n", " ").strip()
    dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()

    if len(dynamic_alt) > 150: