fix(sync): clean scraped post text and improve external link card thumbnail handling
This commit is contained in:
@@ -29,9 +29,7 @@ VIDEO_MAX_DURATION_SECONDS = 179
|
|||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
# External-card thumbnail constraints:
|
# External-card thumbnail constraints:
|
||||||
# The user's PDS returned:
|
# Conservative safe target below the observed PDS max (~976.56 KB).
|
||||||
# BlobTooLarge: 1.15MB > 976.56KB
|
|
||||||
# So we conservatively target a slightly smaller max size for safety.
|
|
||||||
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
|
||||||
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
EXTERNAL_THUMB_MAX_DIMENSION = 1200
|
||||||
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
|
||||||
@@ -134,6 +132,169 @@ def repair_broken_urls(text):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def repair_broken_mentions(text):
    """
    Repair mention-related line wrapping in scraped text.

    Scraped posts sometimes arrive with an @mention split onto its own
    line. Handles cases like:

        Ho explica
        @martamartorell

        La
        @sanfenerea
        tenia un repte

    becoming:

        Ho explica @martamartorell
        La @sanfenerea tenia un repte

    while preserving real paragraph breaks and standalone mention lines
    (a mention that forms its own paragraph stays on its own line).

    Falsy input is returned unchanged. Non-blank lines are emitted with
    their edge whitespace stripped.
    """
    if not text:
        return text

    lines = text.splitlines()
    result = []
    i = 0
    changed = False

    def is_mention_only_line(s):
        # NOTE(review): matches plain word-character handles only; dotted
        # handles such as @user.example.com would NOT match — confirm
        # whether those appear in scraped input.
        return bool(re.fullmatch(r"@[A-Za-z0-9_]+", s.strip()))

    def is_blank_line(s):
        return not s.strip()

    while i < len(lines):
        current = lines[i]
        stripped = current.strip()

        # Blank lines delimit paragraphs; keep them as-is.
        if is_blank_line(current):
            result.append("")
            i += 1
            continue

        # Case 1: current line is only a mention — attach it backward to
        # the previous non-blank output line if there is one; otherwise
        # keep it standalone (start of text, or its own paragraph).
        if is_mention_only_line(current):
            if result and result[-1].strip():
                result[-1] = result[-1].rstrip() + " " + stripped
                changed = True
            else:
                result.append(stripped)

            i += 1

            # Absorb immediately following continuation lines until a
            # blank line or another standalone mention.
            # (The original code repeated the blank-line test after the
            # increment as well; that check was dead — the loop head
            # performs it with no intervening side effects — and has
            # been removed.)
            while i < len(lines):
                next_stripped = lines[i].strip()
                if not next_stripped or is_mention_only_line(lines[i]):
                    break
                result[-1] = result[-1].rstrip() + " " + next_stripped
                changed = True
                i += 1

            continue

        # Case 2: current line has text and the next line is a bare
        # mention — merge them, then absorb continuation lines exactly
        # as in Case 1.
        if i + 1 < len(lines) and is_mention_only_line(lines[i + 1]):
            merged = stripped + " " + lines[i + 1].strip()
            changed = True
            i += 2

            while i < len(lines):
                next_stripped = lines[i].strip()
                if not next_stripped or is_mention_only_line(lines[i]):
                    break
                merged = merged.rstrip() + " " + next_stripped
                changed = True
                i += 1

            result.append(merged)
            continue

        # Plain text line: keep it, stripped of edge whitespace.
        result.append(stripped)
        i += 1

    new_text = "\n".join(result)

    if changed:
        logging.info("🔧 Repaired broken mention wrapping in scraped text")

    return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def strip_line_edge_whitespace(text):
    """
    Trim whitespace from both ends of every line of *text*.

    Line structure — including intentionally blank lines — is preserved;
    only leading/trailing spaces on each individual line are removed.
    Falsy input is returned unchanged. Logs once if anything was trimmed.
    """
    if not text:
        return text

    original_lines = text.splitlines()
    trimmed_lines = [raw.strip() for raw in original_lines]

    # Log only when at least one line actually changed.
    if any(trimmed != raw for trimmed, raw in zip(trimmed_lines, original_lines)):
        logging.info("🔧 Stripped leading/trailing whitespace from scraped text lines")

    return "\n".join(trimmed_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_trailing_ellipsis_line(text):
    """
    Drop trailing lines that consist solely of an ellipsis marker.

    Handles:
    - ...
    - …

    The surviving text is returned with outer whitespace stripped.
    Falsy input is returned unchanged.
    """
    if not text:
        return text

    ellipsis_markers = ("...", "…")
    remaining = text.splitlines()

    # Peel ellipsis-only lines off the end, one at a time.
    while remaining:
        if remaining[-1].strip() not in ellipsis_markers:
            break
        del remaining[-1]

    return "\n".join(remaining).strip()
|
||||||
|
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
trimmed_url = url.strip()
|
trimmed_url = url.strip()
|
||||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||||
@@ -226,6 +387,7 @@ def looks_like_title_plus_url_post(text):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
repaired = repair_broken_urls(text)
|
repaired = repair_broken_urls(text)
|
||||||
|
repaired = strip_line_edge_whitespace(repaired)
|
||||||
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
|
||||||
if len(lines) < 2:
|
if len(lines) < 2:
|
||||||
return False
|
return False
|
||||||
@@ -372,7 +534,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
img = img.resize(new_size, Image.LANCZOS)
|
img = img.resize(new_size, Image.LANCZOS)
|
||||||
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
|
logging.info(f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}")
|
||||||
|
|
||||||
# Try progressively lower qualities.
|
|
||||||
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]:
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
|
img.save(out, format="JPEG", quality=quality, optimize=True, progressive=True)
|
||||||
@@ -386,7 +547,6 @@ def compress_external_thumb_to_limit(image_bytes, max_bytes=EXTERNAL_THUMB_MAX_B
|
|||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# If still too large, try a second resize pass.
|
|
||||||
for target_dim in [1000, 900, 800, 700, 600]:
|
for target_dim in [1000, 900, 800, 700, 600]:
|
||||||
resized = img.copy()
|
resized = img.copy()
|
||||||
width, height = resized.size
|
width, height = resized.size
|
||||||
@@ -520,7 +680,11 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_post_text(text):
|
def prepare_post_text(text):
|
||||||
raw_text = repair_broken_urls((text or "").strip())
|
raw_text = (text or "").strip()
|
||||||
|
raw_text = repair_broken_urls(raw_text)
|
||||||
|
raw_text = repair_broken_mentions(raw_text)
|
||||||
|
raw_text = strip_line_edge_whitespace(raw_text)
|
||||||
|
raw_text = remove_trailing_ellipsis_line(raw_text)
|
||||||
|
|
||||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||||
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
||||||
@@ -530,7 +694,7 @@ def prepare_post_text(text):
|
|||||||
else:
|
else:
|
||||||
raw_text = truncated + "..."
|
raw_text = truncated + "..."
|
||||||
|
|
||||||
return raw_text
|
return raw_text.strip()
|
||||||
|
|
||||||
|
|
||||||
def normalize_post_text(text):
|
def normalize_post_text(text):
|
||||||
@@ -538,6 +702,9 @@ def normalize_post_text(text):
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
text = repair_broken_urls(text)
|
text = repair_broken_urls(text)
|
||||||
|
text = repair_broken_mentions(text)
|
||||||
|
text = strip_line_edge_whitespace(text)
|
||||||
|
text = remove_trailing_ellipsis_line(text)
|
||||||
text = text.replace("\r", "\n")
|
text = text.replace("\r", "\n")
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
return text.lower()
|
return text.lower()
|
||||||
@@ -765,7 +932,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
if item.reason is not None:
|
if item.reason is not None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
record = item.post.record
|
record = item.post.record
|
||||||
if getattr(record, "reply", None) is not None:
|
if getattr(record, "reply", None) is not None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -808,6 +975,9 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
text_builder = client_utils.TextBuilder()
|
text_builder = client_utils.TextBuilder()
|
||||||
content = repair_broken_urls(content.strip())
|
content = repair_broken_urls(content.strip())
|
||||||
|
content = repair_broken_mentions(content)
|
||||||
|
content = strip_line_edge_whitespace(content)
|
||||||
|
content = remove_trailing_ellipsis_line(content)
|
||||||
lines = content.splitlines()
|
lines = content.splitlines()
|
||||||
|
|
||||||
for line_idx, line in enumerate(lines):
|
for line_idx, line in enumerate(lines):
|
||||||
@@ -852,7 +1022,11 @@ def make_rich(content):
|
|||||||
|
|
||||||
|
|
||||||
def build_dynamic_alt(raw_text):
|
def build_dynamic_alt(raw_text):
|
||||||
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
|
dynamic_alt = repair_broken_urls(raw_text)
|
||||||
|
dynamic_alt = repair_broken_mentions(dynamic_alt)
|
||||||
|
dynamic_alt = strip_line_edge_whitespace(dynamic_alt)
|
||||||
|
dynamic_alt = remove_trailing_ellipsis_line(dynamic_alt)
|
||||||
|
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
|
||||||
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
||||||
|
|
||||||
if len(dynamic_alt) > 150:
|
if len(dynamic_alt) > 150:
|
||||||
|
|||||||
Reference in New Issue
Block a user