fix(sync): clean scraped post text and improve external link card thumbnail handling
This commit is contained in:
@@ -29,9 +29,7 @@ VIDEO_MAX_DURATION_SECONDS = 179
|
||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||
|
||||
# External-card thumbnail constraints:
|
||||
# The user's PDS returned:
|
||||
# BlobTooLarge: 1.15MB > 976.56KB
|
||||
# So we conservatively target a slightly smaller max size for safety.
|
||||
# Conservative safe target below the observed PDS max (~976.56 KB).
|
||||
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
||||
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
||||
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
||||
@@ -134,6 +132,169 @@ def repair_broken_urls(text):
|
||||
return text
|
||||
|
||||
|
||||
def repair_broken_mentions(text):
    """
    Repair mention-related line wrapping in scraped text.

    Scraped posts frequently break a "@mention" onto its own line, e.g.:

        Ho explica
        @martamartorell

        La
        @sanfenerea
        tenia un repte

        Hospital
        @parctauli
        .

        conjunt
        @bomberscat
        -SEM.

    becoming:

        Ho explica @martamartorell
        La @sanfenerea tenia un repte
        Hospital @parctauli .
        conjunt @bomberscat -SEM.

    while preserving real paragraph breaks (blank lines) and genuinely
    standalone mention lines.
    """
    if not text:
        # Covers both "" and None; return the input unchanged.
        return text

    lines = text.splitlines()
    result = []
    i = 0
    changed = False

    def is_mention_only_line(s):
        # A line consisting solely of a Twitter-style handle (@word).
        return bool(re.fullmatch(r"@[A-Za-z0-9_]+", s.strip()))

    def is_blank_line(s):
        return not s.strip()

    def absorb_continuations(base, start):
        # Append subsequent non-blank, non-mention lines onto `base`,
        # stopping at a paragraph break or another standalone mention.
        # Returns (merged text, next index, whether anything was absorbed).
        j = start
        merged = base
        while j < len(lines):
            nxt = lines[j]
            if is_blank_line(nxt) or is_mention_only_line(nxt):
                break
            merged = merged.rstrip() + " " + nxt.strip()
            j += 1
        return merged, j, j != start

    while i < len(lines):
        current = lines[i]
        stripped = current.strip()

        if is_blank_line(current):
            # Keep paragraph breaks intact.
            result.append("")
            i += 1
            continue

        # If current line is only a mention, try to attach it backward.
        if is_mention_only_line(current):
            if result and result[-1].strip():
                result[-1] = result[-1].rstrip() + " " + stripped
                changed = True
            else:
                # Nothing textual to attach to: keep the mention standalone.
                result.append(stripped)

            # Attach immediately following continuation lines (e.g. ".",
            # "-SEM.", "tenia un repte") if they are not blank and not
            # another standalone mention.
            result[-1], i, absorbed = absorb_continuations(result[-1], i + 1)
            changed = changed or absorbed
            continue

        # If current line has text and the next line is a mention, merge
        # them, then absorb any continuation fragment after the mention.
        if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
            merged, i, _ = absorb_continuations(
                stripped + " " + lines[i + 1].strip(), i + 2
            )
            changed = True
            result.append(merged)
            continue

        result.append(stripped)
        i += 1

    new_text = "\n".join(result)

    if changed:
        logging.info("🔧 Repaired broken mention wrapping in scraped text")

    return new_text
|
||||
|
||||
|
||||
def strip_line_edge_whitespace(text):
    """
    Remove leading/trailing whitespace from each line while preserving
    the line structure and intentional blank lines.
    """
    if not text:
        # "" and None pass through untouched.
        return text

    original_lines = text.splitlines()
    trimmed_lines = [line.strip() for line in original_lines]

    # Only log when at least one line actually had edge whitespace.
    if any(before != after for before, after in zip(original_lines, trimmed_lines)):
        logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines")

    return "\n".join(trimmed_lines)
|
||||
|
||||
|
||||
def remove_trailing_ellipsis_line(text):
    """
    Remove trailing lines that are only ellipsis markers.
    Handles:
      - ...
      - …
    The surviving text is also stripped of outer whitespace.
    """
    if not text:
        return text

    lines = text.splitlines()

    # Walk backwards past every ellipsis-only line.
    end = len(lines)
    while end and lines[end - 1].strip() in {"…", "..."}:
        end -= 1

    return "\n".join(lines[:end]).strip()
|
||||
|
||||
|
||||
def clean_url(url):
|
||||
trimmed_url = url.strip()
|
||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||
@@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text):
|
||||
return False
|
||||
|
||||
repaired = repair_broken_urls(text)
|
||||
repaired = strip_line_edge_whitespace(repaired)
|
||||
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
||||
if len(lines) < 2:
|
||||
return False
|
||||
@@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
||||
img = img.resize(new_size, Image.LANCZOS)
|
||||
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
|
||||
|
||||
# Try progressively lower qualities.
|
||||
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
||||
out = io.BytesIO()
|
||||
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
|
||||
@@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
||||
if len(data) <= max_bytes:
|
||||
return data
|
||||
|
||||
# If still too large, try a second resize pass.
|
||||
for target_dim in [1000, 900, 800, 700, 600]:
|
||||
resized = img.copy()
|
||||
width, height = resized.size
|
||||
@@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||
|
||||
|
||||
def prepare_post_text(text):
|
||||
raw_text = repair_broken_urls((text or "").strip())
|
||||
raw_text = (text or "").strip()
|
||||
raw_text = repair_broken_urls(raw_text)
|
||||
raw_text = repair_broken_mentions(raw_text)
|
||||
raw_text = strip_line_edge_whitespace(raw_text)
|
||||
raw_text = remove_trailing_ellipsis_line(raw_text)
|
||||
|
||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
||||
@@ -530,7 +694,7 @@ def prepare_post_text(text):
|
||||
else:
|
||||
raw_text = truncated + "..."
|
||||
|
||||
return raw_text
|
||||
return raw_text.strip()
|
||||
|
||||
|
||||
def normalize_post_text(text):
|
||||
@@ -538,6 +702,9 @@ def normalize_post_text(text):
|
||||
return ""
|
||||
|
||||
text = repair_broken_urls(text)
|
||||
text = repair_broken_mentions(text)
|
||||
text = strip_line_edge_whitespace(text)
|
||||
text = remove_trailing_ellipsis_line(text)
|
||||
text = text.replace("\r", "\n")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text.lower()
|
||||
@@ -765,7 +932,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
if item.reason is not None:
|
||||
continue
|
||||
|
||||
record = item.post.record
|
||||
record = item.post.record
|
||||
if getattr(record, "reply", None) is not None:
|
||||
continue
|
||||
|
||||
@@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
def make_rich(content):
|
||||
text_builder = client_utils.TextBuilder()
|
||||
content = repair_broken_urls(content.strip())
|
||||
content = repair_broken_mentions(content)
|
||||
content = strip_line_edge_whitespace(content)
|
||||
content = remove_trailing_ellipsis_line(content)
|
||||
lines = content.splitlines()
|
||||
|
||||
for line_idx, line in enumerate(lines):
|
||||
@@ -852,7 +1022,11 @@ def make_rich(content):
|
||||
|
||||
|
||||
def build_dynamic_alt(raw_text):
|
||||
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
|
||||
dynamic_alt = repair_broken_urls(raw_text)
|
||||
dynamic_alt = repair_broken_mentions(dynamic_alt)
|
||||
dynamic_alt = strip_line_edge_whitespace(dynamic_alt)
|
||||
dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt)
|
||||
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
|
||||
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
||||
|
||||
if len(dynamic_alt) > 150:
|
||||
|
||||
Reference in New Issue
Block a user