From 375191c497cd3e870148083c725a7851756f0c07 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Thu, 30 Apr 2026 10:20:41 +0200 Subject: [PATCH] Add --bsky-langs ca to Jenkins pipeline for specific language tagging --- jenkins/3catTw | 3 ++- twitter2bsky_daemon.py | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/jenkins/3catTw b/jenkins/3catTw index a38bbdc..7fe9926 100644 --- a/jenkins/3catTw +++ b/jenkins/3catTw @@ -82,7 +82,8 @@ pipeline { --twitter-handle "$TWITTER_3CAT_HANDLE" \ --bsky-handle "$BSKY_3CAT_HANDLE" \ --bsky-password "$BSKY_3CAT_APP_PASSWORD" \ - --bsky-base-url https://eurosky.social + --bsky-base-url https://eurosky.social \ + --bsky-langs ca ''' } } diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 72fb1f2..53411bf 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -97,6 +97,7 @@ class _RunCache: self.og_title: dict = {} self.url_resolution: dict = {} self.url_validity: dict = {} + self.locale: str = "en-US" # locale string for Playwright browser contexts; defaults to en-US def clear(self): self.og_title.clear() @@ -113,6 +114,30 @@ def grapheme_len(text): """Return the grapheme cluster count, matching Bluesky's character counting.""" return grapheme.length(text) +# BCP-47 language tag → sensible locale for Playwright +_LANG_TO_LOCALE = { + "ca": "ca-ES", + "es": "es-ES", + "en": "en-US", + "fr": "fr-FR", + "de": "de-DE", + "pt": "pt-PT", + "it": "it-IT", + "nl": "nl-NL", + "eu": "eu-ES", + "gl": "gl-ES", +} + +def bsky_langs_to_playwright_locale(bsky_langs): + """ + Convert the first configured Bluesky language tag to a Playwright locale + string (e.g. ['ca'] → 'ca-ES'). Returns 'en-US' when no language is configured; an unmapped tag 'xx' becomes 'xx-XX'. 
+ """ + if not bsky_langs: + return "en-US" + primary = bsky_langs[0].strip().lower() + return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}") + # --- Custom Classes --- class ScrapedMedia: def __init__(self, url, media_type="photo"): @@ -562,7 +587,7 @@ def should_fetch_og_title(tweet): return False -def fetch_tweet_og_title_text(tweet_url): +def fetch_tweet_og_title_text(tweet_url, locale="en-US"): if not tweet_url: return None @@ -589,6 +614,7 @@ def fetch_tweet_og_title_text(tweet_url): "Chrome/145.0.7632.6 Safari/537.36" ), viewport={"width": 1280, "height": 900}, + locale=_cache.locale, # NOTE(review): read from the run cache, not from the locale= argument — confirm which is intended ) page = browser_context.new_page() page.goto( @@ -665,7 +691,7 @@ def resolve_tco_with_httpx(url, http_client): return canonicalize_url(url) -def resolve_tco_with_playwright(url): +def resolve_tco_with_playwright(url, locale="en-US"): browser = None browser_context = None page = None @@ -685,6 +711,7 @@ def resolve_tco_with_playwright(url): "Chrome/145.0.7632.6 Safari/537.36" ), viewport={"width": 1280, "height": 900}, + locale=locale, ) page = browser_context.new_page() @@ -2337,7 +2364,7 @@ def build_video_embed(video_blob, alt_text): # --- Twitter Scraping --- -def scrape_tweets_via_playwright(username, password, email, target_handle): +def scrape_tweets_via_playwright(username, password, email, target_handle, locale="en-US"): tweets = [] state_file = "twitter_browser_state.json" @@ -2372,6 +2399,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): user_agent=clean_ua, viewport={"width": 1920, "height": 1080}, storage_state=state_file, + locale=locale, ) session_check_page = browser_context.new_page() session_check_page.goto("https://x.com/home") @@ -2406,6 +2434,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): browser_context = browser.new_context( user_agent=clean_ua, viewport={"width": 1920, "height": 1080}, + locale=locale, # same locale as the saved-session context ) login_page = browser_context.new_page() @@ 
-2980,9 +3009,10 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): # --- Main Sync Logic --- def sync_feeds(args): logging.info("🔄 Starting sync cycle...") - dry_run = getattr(args, "dry_run", False) bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS + bot_locale = bsky_langs_to_playwright_locale(bsky_langs) # Playwright locale derived from the first Bluesky language tag + _cache.locale = bot_locale if dry_run: logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.") @@ -2996,6 +3026,7 @@ def sync_feeds(args): args.twitter_password, args.twitter_email, args.twitter_handle, + locale=bot_locale, ) if not tweets: @@ -3213,6 +3244,7 @@ def sync_feeds(args): "Chrome/145.0.7632.6 Safari/537.36" ), "viewport": {"width": 1920, "height": 1080}, + "locale": bot_locale, } if os.path.exists(browser_state_file): context_kwargs["storage_state"] = browser_state_file