Fixed text truncation to count grapheme clusters instead of code points

This commit is contained in:
Guillem Hernandez Sola
2026-04-23 20:01:57 +02:00
parent baa055a36e
commit 2401da8e5f
2 changed files with 11 additions and 5 deletions

View File

@@ -46,7 +46,8 @@ pipeline {
fastfeedparser \ fastfeedparser \
beautifulsoup4 \ beautifulsoup4 \
charset-normalizer \ charset-normalizer \
Pillow Pillow \
grapheme
# Verify required imports # Verify required imports
"${VENV_DIR}/bin/python" -c "import fastfeedparser; print('fastfeedparser OK')" "${VENV_DIR}/bin/python" -c "import fastfeedparser; print('fastfeedparser OK')"

View File

@@ -19,6 +19,7 @@ from playwright.sync_api import sync_playwright
from moviepy import VideoFileClip from moviepy import VideoFileClip
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from PIL import Image from PIL import Image
import grapheme  # grapheme-cluster counting and splitting for post text limits
# --- Configuration --- # --- Configuration ---
LOG_PATH = "twitter2bsky.log" LOG_PATH = "twitter2bsky.log"
@@ -26,7 +27,7 @@ STATE_PATH = "twitter2bsky_state.json"
SCRAPE_TWEET_LIMIT = 30 SCRAPE_TWEET_LIMIT = 30
DEDUPE_BSKY_LIMIT = 30 DEDUPE_BSKY_LIMIT = 30
TWEET_MAX_AGE_DAYS = 3 TWEET_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 275 BSKY_TEXT_MAX_LENGTH = 300
DEFAULT_BSKY_LANGS = ["ca"] DEFAULT_BSKY_LANGS = ["ca"]
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
@@ -108,6 +109,9 @@ _cache = _RunCache()
def reset_caches(): def reset_caches():
_cache.clear() _cache.clear()
def grapheme_len(text):
    """Count the grapheme clusters in ``text``.

    The count comes from ``grapheme.length`` and is intended to match
    Bluesky's character counting, so a user-perceived character made of
    several code points is counted once.
    """
    cluster_count = grapheme.length(text)
    return cluster_count
# --- Custom Classes --- # --- Custom Classes ---
class ScrapedMedia: class ScrapedMedia:
@@ -1160,16 +1164,17 @@ def find_tail_preservation_start(text, primary_non_x_url):
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
if len(text) <= max_length: if grapheme_len(text) <= max_length:
return text return text
truncated = text[: max_length - 3] # Truncate by grapheme clusters
clusters = list(grapheme.graphemes(text))
truncated = "".join(clusters[: max_length - 3])
last_space = truncated.rfind(" ") last_space = truncated.rfind(" ")
if last_space > TRUNCATE_MIN_PREFIX_CHARS: if last_space > TRUNCATE_MIN_PREFIX_CHARS:
return truncated[:last_space] + "..." return truncated[:last_space] + "..."
return truncated + "..." return truncated + "..."
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH): def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
if ( if (
not text not text