fix(sync): clean scraped post text and improve external link card thumbnail handling

This commit is contained in:
Guillem Hernandez Sola
2026-04-06 22:23:04 +02:00
parent e4334f44d2
commit 8529016386

View File

@@ -29,9 +29,7 @@ VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45
# External-card thumbnail constraints.
# The user's PDS rejected an oversized thumbnail with:
#     BlobTooLarge: 1.15MB > 976.56KB
# so the byte budget below is a conservative target kept safely under that
# observed PDS maximum (~976.56 KB).
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024  # 950 KiB
# NOTE(review): presumably the longest-side cap (in pixels) applied when the
# thumbnail is resized before JPEG encoding -- confirm against the compressor.
EXTERNAL_THUMB_MAX_DIMENSION = 1200
# Lowest JPEG quality the thumbnail compressor tries; it is the final entry
# of its descending quality ladder.
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
@@ -134,6 +132,169 @@ def repair_broken_urls(text):
return text
def _is_mention_only_line(line):
    """Return True when *line*, ignoring edge whitespace, is a lone @handle."""
    return re.fullmatch(r"@[A-Za-z0-9_]+", line.strip()) is not None


def _collect_continuation_lines(lines, start):
    """
    Gather the run of continuation lines that begins at index *start*.

    A continuation line is any line that is neither blank nor a standalone
    mention. Returns ``(pieces, next_index)`` where *pieces* holds the
    stripped continuation lines and *next_index* is the first index that was
    not consumed (a blank line, a standalone mention, or ``len(lines)``).
    """
    pieces = []
    idx = start
    while idx < len(lines):
        candidate = lines[idx]
        if not candidate.strip() or _is_mention_only_line(candidate):
            break
        pieces.append(candidate.strip())
        idx += 1
    return pieces, idx


def repair_broken_mentions(text):
    """
    Repair mention-related line wrapping in scraped text.

    Handles cases like:

        Ho explica
        @martamartorell

        La
        @sanfenerea
        tenia un repte

        Hospital
        @parctauli
        .

        conjunt
        @bomberscat
        -SEM.

    becoming:

        Ho explica @martamartorell

        La @sanfenerea tenia un repte

        Hospital @parctauli .

        conjunt @bomberscat -SEM.

    Blank lines are kept as paragraph breaks, and a mention line that has no
    non-blank line directly before it stays at the start of its own line.
    Every output line is stripped of leading/trailing whitespace and lines
    are re-joined with "\\n".

    Args:
        text: Scraped post text. Falsy values (None, "") are returned as-is.

    Returns:
        The text with wrapped mentions joined back onto their sentence.
    """
    if not text:
        return text

    lines = text.splitlines()
    result = []
    changed = False
    i = 0

    while i < len(lines):
        stripped = lines[i].strip()

        # Blank lines are real paragraph breaks: keep them, never merge across.
        if not stripped:
            result.append("")
            i += 1
            continue

        if _is_mention_only_line(stripped):
            # A lone mention belongs to the previous non-blank line, if any.
            if result and result[-1].strip():
                result[-1] = result[-1].rstrip() + " " + stripped
                changed = True
            else:
                result.append(stripped)
            tail_start = i + 1
        elif i + 1 < len(lines) and _is_mention_only_line(lines[i + 1]):
            # Text followed by a lone mention: pull the mention up.
            result.append(stripped + " " + lines[i + 1].strip())
            changed = True
            tail_start = i + 2
        else:
            # Ordinary line with no mention wrapping around it.
            result.append(stripped)
            i += 1
            continue

        # After a repaired mention, the rest of the wrapped sentence follows
        # on the next non-blank, non-mention lines; fold them in as well.
        pieces, i = _collect_continuation_lines(lines, tail_start)
        if pieces:
            result[-1] = " ".join([result[-1], *pieces])
            changed = True

    new_text = "\n".join(result)
    if changed:
        logging.info("🔧 Repaired broken mention wrapping in scraped text")
    return new_text
def strip_line_edge_whitespace(text):
    """
    Trim leading and trailing whitespace on every line of *text*.

    The number of lines is unchanged, so intentional blank lines survive
    (whitespace-only lines become empty). Lines are re-joined with "\\n".
    Falsy input (None, "") is returned untouched.
    """
    if not text:
        return text

    original_lines = text.splitlines()
    trimmed_lines = [raw.strip() for raw in original_lines]

    # Only log when at least one line actually lost whitespace.
    if trimmed_lines != original_lines:
        logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines")

    return "\n".join(trimmed_lines)
def remove_trailing_ellipsis_line(text):
    """
    Remove trailing lines that are only ellipsis markers.

    Handles:
    - ...  (three ASCII dots)
    - …    (the single U+2026 ellipsis character)

    Blank lines at the end of the text (including ones sitting between
    ellipsis-only lines) are dropped as well, and the final result is
    stripped of surrounding whitespace. An ellipsis that is part of a line
    with other content is left alone.

    Args:
        text: Text to clean. Falsy values (None, "") are returned as-is.

    Returns:
        The text without trailing ellipsis-only lines.
    """
    if not text:
        return text

    # "\u2026" is the single-character ellipsis; "" lets blank lines between
    # trailing markers be popped so every marker is reached.
    removable = {"...", "\u2026", ""}

    lines = text.splitlines()
    while lines and lines[-1].strip() in removable:
        lines.pop()

    return "\n".join(lines).strip()
def clean_url(url):
trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text):
return False
repaired = repair_broken_urls(text)
repaired = strip_line_edge_whitespace(repaired)
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
if len(lines) < 2:
return False
@@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
img = img.resize(new_size, Image.LANCZOS)
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
# Try progressively lower qualities.
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
out = io.BytesIO()
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
@@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
if len(data) <= max_bytes:
return data
# If still too large, try a second resize pass.
for target_dim in [1000, 900, 800, 700, 600]:
resized = img.copy()
width, height = resized.size
@@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text):
raw_text = repair_broken_urls((text or "").strip())
raw_text = (text or "").strip()
raw_text = repair_broken_urls(raw_text)
raw_text = repair_broken_mentions(raw_text)
raw_text = strip_line_edge_whitespace(raw_text)
raw_text = remove_trailing_ellipsis_line(raw_text)
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -530,7 +694,7 @@ def prepare_post_text(text):
else:
raw_text = truncated + "..."
return raw_text
return raw_text.strip()
def normalize_post_text(text):
@@ -538,6 +702,9 @@ def normalize_post_text(text):
return ""
text = repair_broken_urls(text)
text = repair_broken_mentions(text)
text = strip_line_edge_whitespace(text)
text = remove_trailing_ellipsis_line(text)
text = text.replace("\r", "\n")
text = re.sub(r"\s+", " ", text).strip()
return text.lower()
@@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30):
def make_rich(content):
text_builder = client_utils.TextBuilder()
content = repair_broken_urls(content.strip())
content = repair_broken_mentions(content)
content = strip_line_edge_whitespace(content)
content = remove_trailing_ellipsis_line(content)
lines = content.splitlines()
for line_idx, line in enumerate(lines):
@@ -852,7 +1022,11 @@ def make_rich(content):
def build_dynamic_alt(raw_text):
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
dynamic_alt = repair_broken_urls(raw_text)
dynamic_alt = repair_broken_mentions(dynamic_alt)
dynamic_alt = strip_line_edge_whitespace(dynamic_alt)
dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt)
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
if len(dynamic_alt) > 150: