def repair_broken_mentions(text):
    """
    Repair mention-related line wrapping in scraped text.

    Scraped tweets often break a @mention onto its own line, e.g.:

        Ho explica
        @martamartorell

        La
        @sanfenerea
        tenia un repte

    becoming:

        Ho explica @martamartorell
        La @sanfenerea tenia un repte

    Real paragraph breaks (blank lines) and mentions that genuinely stand
    alone are preserved.  Logs once when any rewrapping happened.

    :param text: raw scraped post text (may be None/empty — returned as-is)
    :return: text with wrapped mentions glued back onto their line
    """
    if not text:
        return text

    def _is_mention_only(line):
        # A line whose stripped content is exactly one @handle.
        return bool(re.fullmatch(r"@[A-Za-z0-9_]+", line.strip()))

    def _is_blank(line):
        return not line.strip()

    lines = text.splitlines()
    result = []
    changed = False

    def _absorb_continuations(start):
        # Glue subsequent lines onto result[-1] until a blank line (real
        # paragraph break) or another standalone mention begins a new unit.
        # Extracted because both merge branches below need identical logic.
        nonlocal changed
        idx = start
        while (idx < len(lines)
               and not _is_blank(lines[idx])
               and not _is_mention_only(lines[idx])):
            result[-1] = result[-1].rstrip() + " " + lines[idx].strip()
            changed = True
            idx += 1
        return idx

    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Preserve intentional paragraph breaks.
        if not stripped:
            result.append("")
            i += 1
            continue

        if _is_mention_only(lines[i]):
            # Standalone mention: attach it backward onto the previous
            # non-blank line when one exists, otherwise keep it standalone.
            if result and result[-1].strip():
                result[-1] = result[-1].rstrip() + " " + stripped
                changed = True
            else:
                result.append(stripped)
            i = _absorb_continuations(i + 1)
            continue

        if i + 1 < len(lines) and _is_mention_only(lines[i + 1]):
            # Text line immediately followed by a wrapped mention: merge
            # forward, then pull in any trailing continuation lines.
            result.append(stripped + " " + lines[i + 1].strip())
            changed = True
            i = _absorb_continuations(i + 2)
            continue

        result.append(stripped)
        i += 1

    if changed:
        logging.info("🔧 Repaired broken mention wrapping in scraped text")

    return "\n".join(result)
+ """ + if not text: + return text + + lines = text.splitlines() + cleaned_lines = [] + + changed = False + for line in lines: + cleaned = line.strip() + if cleaned != line: + changed = True + cleaned_lines.append(cleaned) + + new_text = "\n".join(cleaned_lines) + + if changed: + logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines") + + return new_text + + +def remove_trailing_ellipsis_line(text): + """ + Remove trailing lines that are only ellipsis markers. + Handles: + - ... + - … + """ + if not text: + return text + + lines = text.splitlines() + + while lines and lines[-1].strip() in {"...", "…"}: + lines.pop() + + return "\n".join(lines).strip() + + def clean_url(url): trimmed_url = url.strip() cleaned_url = re.sub(r"\s+", "", trimmed_url) @@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text): return False repaired = repair_broken_urls(text) + repaired = strip_line_edge_whitespace(repaired) lines = [line.strip() for line in repaired.splitlines() if line.strip()] if len(lines) < 2: return False @@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B img = img.resize(new_size, Image.LANCZOS) logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}") - # Try progressively lower qualities. for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: out = io.BytesIO() img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True) @@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B if len(data) <= max_bytes: return data - # If still too large, try a second resize pass. 
for target_dim in [1000, 900, 800, 700, 600]: resized = img.copy() width, height = resized.size @@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"): def prepare_post_text(text): - raw_text = repair_broken_urls((text or "").strip()) + raw_text = (text or "").strip() + raw_text = repair_broken_urls(raw_text) + raw_text = repair_broken_mentions(raw_text) + raw_text = strip_line_edge_whitespace(raw_text) + raw_text = remove_trailing_ellipsis_line(raw_text) if len(raw_text) > BSKY_TEXT_MAX_LENGTH: truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3] @@ -530,7 +694,7 @@ def prepare_post_text(text): else: raw_text = truncated + "..." - return raw_text + return raw_text.strip() def normalize_post_text(text): @@ -538,6 +702,9 @@ def normalize_post_text(text): return "" text = repair_broken_urls(text) + text = repair_broken_mentions(text) + text = strip_line_edge_whitespace(text) + text = remove_trailing_ellipsis_line(text) text = text.replace("\r", "\n") text = re.sub(r"\s+", " ", text).strip() return text.lower() @@ -765,7 +932,7 @@ def get_recent_bsky_posts(client, handle, limit=30): if item.reason is not None: continue - record = item.post.record + record = item.post.record if getattr(record, "reply", None) is not None: continue @@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30): def make_rich(content): text_builder = client_utils.TextBuilder() content = repair_broken_urls(content.strip()) + content = repair_broken_mentions(content) + content = strip_line_edge_whitespace(content) + content = remove_trailing_ellipsis_line(content) lines = content.splitlines() for line_idx, line in enumerate(lines): @@ -852,7 +1022,11 @@ def make_rich(content): def build_dynamic_alt(raw_text): - dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip() + dynamic_alt = repair_broken_urls(raw_text) + dynamic_alt = repair_broken_mentions(dynamic_alt) + dynamic_alt = strip_line_edge_whitespace(dynamic_alt) + 
dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt) + dynamic_alt = dynamic_alt.replace("\n", " ").strip() dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip() if len(dynamic_alt) > 150: