Add --bsky-langs ca to Jenkins pipeline for specific language tagging

This commit is contained in:
Guillem Hernandez Sola
2026-04-30 10:20:41 +02:00
parent 19b8e09f29
commit 375191c497
2 changed files with 38 additions and 5 deletions

View File

@@ -82,7 +82,8 @@ pipeline {
--twitter-handle "$TWITTER_3CAT_HANDLE" \
--bsky-handle "$BSKY_3CAT_HANDLE" \
--bsky-password "$BSKY_3CAT_APP_PASSWORD" \
--bsky-base-url https://eurosky.social
--bsky-base-url https://eurosky.social \
--bsky-langs ca
'''
}
}

View File

@@ -97,6 +97,7 @@ class _RunCache:
self.og_title: dict = {}
self.url_resolution: dict = {}
self.url_validity: dict = {}
self.locale: str = "en-US" # ← ADDED locale cache here
def clear(self):
self.og_title.clear()
@@ -113,6 +114,30 @@ def grapheme_len(text):
"""Return the grapheme cluster count, matching Bluesky's character counting."""
return grapheme.length(text)
# Primary BCP-47 language subtag → default regional locale for Playwright.
# Only consulted when the configured tag does not already name a region.
_LANG_TO_LOCALE = {
    "ca": "ca-ES",
    "es": "es-ES",
    "en": "en-US",
    "fr": "fr-FR",
    "de": "de-DE",
    "pt": "pt-PT",
    "it": "it-IT",
    "nl": "nl-NL",
    "eu": "eu-ES",
    "gl": "gl-ES",
}


def bsky_langs_to_playwright_locale(bsky_langs):
    """
    Convert the first configured Bluesky language tag to a Playwright locale
    string (e.g. ['ca'] → 'ca-ES'). Falls back to 'en-US' if unknown.

    Args:
        bsky_langs: A sequence of BCP-47 language tags (only the first entry
            is used), a single tag or comma-separated tags given as one
            string, or None / an empty sequence.

    Returns:
        A locale string of the form 'll-RR':
        * a tag that already carries a region ('pt-BR', 'pt_br',
          'zh-Hant-TW') keeps that region, normalised to 'pt-BR' / 'zh-TW';
        * a bare language is looked up in _LANG_TO_LOCALE;
        * anything else (missing, blank, non-string, or unmapped language)
          yields 'en-US'.
    """
    fallback = "en-US"
    if not bsky_langs:
        return fallback

    # A plain string must not be indexed: "ca"[0] would yield "c".
    if isinstance(bsky_langs, str):
        first = bsky_langs.split(",")[0]
    else:
        first = bsky_langs[0]
    if not isinstance(first, str):
        return fallback

    # Accept both "pt-BR" and the POSIX-style "pt_BR" spelling.
    subtags = first.strip().lower().replace("_", "-").split("-")
    language = subtags[0]
    if not language:
        return fallback

    # Honour an explicit region subtag (two ASCII letters or three digits)
    # instead of appending a second, made-up region to the tag.
    for subtag in subtags[1:]:
        if not subtag.isascii():
            continue
        is_alpha_region = len(subtag) == 2 and subtag.isalpha()
        is_numeric_region = len(subtag) == 3 and subtag.isdigit()
        if is_alpha_region or is_numeric_region:
            return f"{language}-{subtag.upper()}"

    # Unmapped languages use the documented default rather than a fabricated
    # tag such as "ja-JA", which names no real region.
    return _LANG_TO_LOCALE.get(language, fallback)
# --- Custom Classes ---
class ScrapedMedia:
def __init__(self, url, media_type="photo"):
@@ -562,7 +587,7 @@ def should_fetch_og_title(tweet):
return False
def fetch_tweet_og_title_text(tweet_url):
def fetch_tweet_og_title_text(tweet_url, locale="en-US"):
if not tweet_url:
return None
@@ -589,6 +614,7 @@ def fetch_tweet_og_title_text(tweet_url):
"Chrome/145.0.7632.6 Safari/537.36"
),
viewport={"width": 1280, "height": 900},
locale=_cache.locale, # ← USE CACHE
)
page = browser_context.new_page()
page.goto(
@@ -665,7 +691,7 @@ def resolve_tco_with_httpx(url, http_client):
return canonicalize_url(url)
def resolve_tco_with_playwright(url):
def resolve_tco_with_playwright(url, locale="en-US"):
browser = None
browser_context = None
page = None
@@ -685,6 +711,7 @@ def resolve_tco_with_playwright(url):
"Chrome/145.0.7632.6 Safari/537.36"
),
viewport={"width": 1280, "height": 900},
locale=locale,
)
page = browser_context.new_page()
@@ -2337,7 +2364,7 @@ def build_video_embed(video_blob, alt_text):
# --- Twitter Scraping ---
def scrape_tweets_via_playwright(username, password, email, target_handle):
def scrape_tweets_via_playwright(username, password, email, target_handle, locale="en-US"):
tweets = []
state_file = "twitter_browser_state.json"
@@ -2372,6 +2399,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
user_agent=clean_ua,
viewport={"width": 1920, "height": 1080},
storage_state=state_file,
locale=locale,
)
session_check_page = browser_context.new_page()
session_check_page.goto("https://x.com/home")
@@ -2406,6 +2434,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
browser_context = browser.new_context(
user_agent=clean_ua,
viewport={"width": 1920, "height": 1080},
locale=locale, # ✅ add this
)
login_page = browser_context.new_page()
@@ -2980,9 +3009,10 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
# --- Main Sync Logic ---
def sync_feeds(args):
logging.info("🔄 Starting sync cycle...")
dry_run = getattr(args, "dry_run", False)
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
bot_locale = bsky_langs_to_playwright_locale(bsky_langs) # ✅ now defined
_cache.locale = bot_locale
if dry_run:
logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
@@ -2996,6 +3026,7 @@ def sync_feeds(args):
args.twitter_password,
args.twitter_email,
args.twitter_handle,
locale=bot_locale,
)
if not tweets:
@@ -3213,6 +3244,7 @@ def sync_feeds(args):
"Chrome/145.0.7632.6 Safari/537.36"
),
"viewport": {"width": 1920, "height": 1080},
"locale": bot_locale,
}
if os.path.exists(browser_state_file):
context_kwargs["storage_state"] = browser_state_file