Updated indentation

This commit is contained in:
2026-05-10 06:12:34 +00:00
parent 51216085da
commit 9f5af47dda

View File

@@ -1,3 +1,7 @@
id: twitter2bsky
name: twitter2bsky.py
type: code.python
content: |-
import argparse import argparse
import arrow import arrow
import hashlib import hashlib
@@ -19,7 +23,7 @@ from playwright.sync_api import sync_playwright
from moviepy import VideoFileClip from moviepy import VideoFileClip
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from PIL import Image from PIL import Image
import grapheme # add to imports at top import grapheme
# --- Configuration --- # --- Configuration ---
LOG_PATH = "twitter2bsky.log" LOG_PATH = "twitter2bsky.log"
@@ -51,7 +55,6 @@ BSKY_SEND_POST_MAX_RETRIES = 3
BSKY_SEND_POST_BASE_DELAY = 5 BSKY_SEND_POST_BASE_DELAY = 5
BSKY_SEND_POST_MAX_DELAY = 60 BSKY_SEND_POST_MAX_DELAY = 60
# --- Login hardening (NEW) ---
BSKY_LOGIN_MAX_RETRIES = 4 BSKY_LOGIN_MAX_RETRIES = 4
BSKY_LOGIN_BASE_DELAY = 10 BSKY_LOGIN_BASE_DELAY = 10
BSKY_LOGIN_MAX_DELAY = 600 BSKY_LOGIN_MAX_DELAY = 600
@@ -97,9 +100,9 @@ class _RunCache:
self.og_title: dict = {} self.og_title: dict = {}
self.url_resolution: dict = {} self.url_resolution: dict = {}
self.url_validity: dict = {} self.url_validity: dict = {}
self.locale: str = "en-US" # ← ADDED locale cache here self.locale: str = "en-US"
self.video_hash_owner: dict = {} # sha256 -> tweet_id self.video_hash_owner: dict = {}
self.video_url_owner: dict = {} # media_url -> tweet_id self.video_url_owner: dict = {}
def clear(self): def clear(self):
self.og_title.clear() self.og_title.clear()
@@ -112,12 +115,14 @@ _cache = _RunCache()
def reset_caches(): def reset_caches():
_cache.clear()
# === VIDEO BINDING PATCH APPLIED === # === VIDEO BINDING PATCH APPLIED ===
def sha256_bytes(data: bytes): def sha256_bytes(data: bytes):
return hashlib.sha256(data).hexdigest() return hashlib.sha256(data).hexdigest()
def sha256_file(path, chunk_size=1024 * 1024): def sha256_file(path, chunk_size=1024 * 1024):
h = hashlib.sha256() h = hashlib.sha256()
with open(path, "rb") as f: with open(path, "rb") as f:
@@ -128,15 +133,17 @@ def sha256_file(path, chunk_size=1024 * 1024):
h.update(chunk) h.update(chunk)
return h.hexdigest() return h.hexdigest()
def media_url_looks_audio_only(url): def media_url_looks_audio_only(url):
u = (url or "").lower() u = (url or "").lower()
return "/aud/" in u or "/audio/" in u or "mp4a" in u return "/aud/" in u or "/audio/" in u or "mp4a" in u
_cache.clear()
def grapheme_len(text): def grapheme_len(text):
"""Return the grapheme cluster count, matching Bluesky's character counting.""" """Return the grapheme cluster count, matching Bluesky's character counting."""
return grapheme.length(text) return grapheme.length(text)
# BCP-47 language tag → sensible locale for Playwright # BCP-47 language tag → sensible locale for Playwright
_LANG_TO_LOCALE = { _LANG_TO_LOCALE = {
"ca": "ca-ES", "ca": "ca-ES",
@@ -151,6 +158,7 @@ _LANG_TO_LOCALE = {
"gl": "gl-ES", "gl": "gl-ES",
} }
def bsky_langs_to_playwright_locale(bsky_langs): def bsky_langs_to_playwright_locale(bsky_langs):
""" """
Convert the first configured Bluesky language tag to a Playwright locale Convert the first configured Bluesky language tag to a Playwright locale
@@ -161,6 +169,7 @@ def bsky_langs_to_playwright_locale(bsky_langs):
primary = bsky_langs[0].strip().lower() primary = bsky_langs[0].strip().lower()
return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}") return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}")
# --- Custom Classes --- # --- Custom Classes ---
class ScrapedMedia: class ScrapedMedia:
def __init__(self, url, media_type="photo"): def __init__(self, url, media_type="photo"):
@@ -204,11 +213,6 @@ def is_valid_url(url):
def strip_trailing_url_punctuation(url): def strip_trailing_url_punctuation(url):
if not url: if not url:
return url return url
# Strip a trailing hashtag-style fragment (#Word) that is really a social
# hashtag glued to the end of a URL with no space, e.g.
# https://cit.transit.gencat.cat#SCT → https://cit.transit.gencat.cat
# Only stripped when it starts with a letter so real anchors like
# /page#section-2 inside a longer sentence are left alone.
url = re.sub(r"#[A-Za-z]\w*$", "", url.strip()) url = re.sub(r"#[A-Za-z]\w*$", "", url.strip())
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url) return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url)
@@ -254,7 +258,6 @@ def repair_broken_urls(text):
original = text original = text
text = split_concatenated_urls(text) text = split_concatenated_urls(text)
# Split glued hashtag suffixes before any rejoining passes
text = split_url_hashtag_suffix(text) text = split_url_hashtag_suffix(text)
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE) text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
@@ -277,8 +280,6 @@ def repair_broken_urls(text):
) )
text = split_concatenated_urls(text) text = split_concatenated_urls(text)
# Run hashtag split again after rejoining passes — the rejoining regex
# contains # in its character class so it can re-glue a fragment.
text = split_url_hashtag_suffix(text) text = split_url_hashtag_suffix(text)
if text != original: if text != original:
@@ -637,7 +638,7 @@ def fetch_tweet_og_title_text(tweet_url, locale="en-US"):
"Chrome/145.0.7632.6 Safari/537.36" "Chrome/145.0.7632.6 Safari/537.36"
), ),
viewport={"width": 1280, "height": 900}, viewport={"width": 1280, "height": 900},
locale=_cache.locale, # ← USE CACHE locale=_cache.locale,
) )
page = browser_context.new_page() page = browser_context.new_page()
page.goto( page.goto(
@@ -1212,6 +1213,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
return url_pos return url_pos
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
if grapheme_len(text) <= max_length: if grapheme_len(text) <= max_length:
return text return text
@@ -1222,6 +1224,8 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
if last_space > TRUNCATE_MIN_PREFIX_CHARS: if last_space > TRUNCATE_MIN_PREFIX_CHARS:
return truncated[:last_space] return truncated[:last_space]
return truncated return truncated
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH): def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
if ( if (
not text not text
@@ -1264,6 +1268,7 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
return truncate_text_safely(text, max_length) return truncate_text_safely(text, max_length)
def choose_final_visible_text( def choose_final_visible_text(
full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True
): ):
@@ -1398,7 +1403,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
).hexdigest() ).hexdigest()
# --- Login hardening helpers (NEW) --- # --- Login hardening helpers ---
def is_rate_limited_error(error_obj): def is_rate_limited_error(error_obj):
text = repr(error_obj).lower() text = repr(error_obj).lower()
return ( return (
@@ -1457,12 +1462,10 @@ def create_bsky_client(base_url, handle, password):
except Exception as e: except Exception as e:
logging.exception("❌ Bluesky login exception") logging.exception("❌ Bluesky login exception")
# Fail fast on invalid credentials
if is_auth_error(e): if is_auth_error(e):
logging.error("❌ Bluesky auth failed (invalid handle/app password).") logging.error("❌ Bluesky auth failed (invalid handle/app password).")
raise raise
# Respect explicit rate-limit timing
if is_rate_limited_error(e): if is_rate_limited_error(e):
if attempt < max_attempts: if attempt < max_attempts:
wait = get_rate_limit_wait_seconds(e, default_delay=base_delay) wait = get_rate_limit_wait_seconds(e, default_delay=base_delay)
@@ -1477,7 +1480,6 @@ def create_bsky_client(base_url, handle, password):
logging.error("❌ Exhausted Bluesky login retries due to rate limiting.") logging.error("❌ Exhausted Bluesky login retries due to rate limiting.")
raise raise
# Retry transient/network problems
if is_network_error(e) or is_transient_error(e): if is_network_error(e) or is_transient_error(e):
if attempt < max_attempts: if attempt < max_attempts:
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max) wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
@@ -1491,7 +1493,6 @@ def create_bsky_client(base_url, handle, password):
logging.error("❌ Exhausted Bluesky login retries after transient/network errors.") logging.error("❌ Exhausted Bluesky login retries after transient/network errors.")
raise raise
# Unknown errors: bounded retry anyway
if attempt < max_attempts: if attempt < max_attempts:
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max) wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
logging.warning( logging.warning(
@@ -1720,7 +1721,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
return recent_posts return recent_posts
# --- Upload / Retry Helpers --- # --- Upload / Retry Helpers ---
def get_rate_limit_wait_seconds(error_obj, default_delay): def get_rate_limit_wait_seconds(error_obj, default_delay):
""" """
@@ -1733,7 +1733,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
try: try:
now_ts = int(time.time()) now_ts = int(time.time())
# Direct headers on exception
headers = getattr(error_obj, "headers", None) or {} headers = getattr(error_obj, "headers", None) or {}
retry_after = headers.get("retry-after") or headers.get("Retry-After") retry_after = headers.get("retry-after") or headers.get("Retry-After")
if retry_after: if retry_after:
@@ -1751,7 +1750,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
pass pass
try: try:
# Nested response headers
response = getattr(error_obj, "response", None) response = getattr(error_obj, "response", None)
headers = getattr(response, "headers", None) or {} headers = getattr(response, "headers", None) or {}
now_ts = int(time.time()) now_ts = int(time.time())
@@ -1921,6 +1919,7 @@ def send_post_with_retry(client, **kwargs):
raise last_exception raise last_exception
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
try: try:
with Image.open(io.BytesIO(image_bytes)) as img: with Image.open(io.BytesIO(image_bytes)) as img:
@@ -2304,11 +2303,6 @@ def build_external_link_embed(
def make_rich(content): def make_rich(content):
# NOTE: Bluesky supports native @mention facets, but resolving a Twitter
# handle to a Bluesky DID requires an external lookup. That mapping is not
# available here so @mentions are passed through as plain text. If you add
# a handle-mapping table in the future, call
# text_builder.mention(word, did) here instead of text_builder.text(word).
text_builder = client_utils.TextBuilder() text_builder = client_utils.TextBuilder()
content = clean_post_text(content) content = clean_post_text(content)
lines = content.splitlines() lines = content.splitlines()
@@ -2464,7 +2458,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle, local
browser_context = browser.new_context( browser_context = browser.new_context(
user_agent=clean_ua, user_agent=clean_ua,
viewport={"width": 1920, "height": 1080}, viewport={"width": 1920, "height": 1080},
locale=locale, # ✅ add this locale=locale,
) )
login_page = browser_context.new_page() login_page = browser_context.new_page()
@@ -3024,7 +3018,7 @@ def sync_feeds(args):
logging.info("🔄 Starting sync cycle...") logging.info("🔄 Starting sync cycle...")
dry_run = getattr(args, "dry_run", False) dry_run = getattr(args, "dry_run", False)
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
bot_locale = bsky_langs_to_playwright_locale(bsky_langs) # ✅ now defined bot_locale = bsky_langs_to_playwright_locale(bsky_langs)
_cache.locale = bot_locale _cache.locale = bot_locale
if dry_run: if dry_run:
@@ -3239,8 +3233,7 @@ def sync_feeds(args):
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering." f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
) )
# Pre-resolve video URLs in isolated contexts
# Pre-resolve video URLs in isolated contexts (deterministic text<->media binding)
if candidate_tweets: if candidate_tweets:
with sync_playwright() as p_pre: with sync_playwright() as p_pre:
pre_browser = p_pre.chromium.launch( pre_browser = p_pre.chromium.launch(
@@ -3304,9 +3297,7 @@ def sync_feeds(args):
) )
if dry_run: if dry_run:
logging.info( logging.info(f" 📄 Text: {raw_text[:200]}")
f" 📄 Text: {raw_text[:200]}"
)
logging.info( logging.info(
f" 🔗 Primary external URL: " f" 🔗 Primary external URL: "
f"{candidate.get('resolved_primary_external_url', 'None')}" f"{candidate.get('resolved_primary_external_url', 'None')}"
@@ -3603,6 +3594,7 @@ def sync_feeds(args):
except Exception as e: except Exception as e:
logging.error(f"❌ Error during sync cycle: {e}") logging.error(f"❌ Error during sync cycle: {e}")
def main(): def main():
load_dotenv() load_dotenv()
@@ -3613,8 +3605,6 @@ def main():
) )
parser.add_argument( parser.add_argument(
"--twitter-password", "--twitter-password",
# NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
# Prefer setting TWITTER_PASSWORD in your .env file instead.
help="Your Twitter login password", help="Your Twitter login password",
) )
parser.add_argument( parser.add_argument(
@@ -3631,8 +3621,6 @@ def main():
) )
parser.add_argument( parser.add_argument(
"--bsky-password", "--bsky-password",
# NOTE (FIX #15): same warning as --twitter-password above.
# Prefer setting BSKY_APP_PASSWORD in your .env file instead.
help="Your Bluesky app password", help="Your Bluesky app password",
) )
parser.add_argument( parser.add_argument(
@@ -3657,8 +3645,7 @@ def main():
args = parser.parse_args() args = parser.parse_args()
# Resolve credentials: CLI args take priority, then env vars. # Resolve credentials: CLI args take priority, then env vars.
# FIX #15 — env vars are the secure path; CLI args expose secrets in # Prefer .env / environment variables to avoid exposing secrets in process list.
# the process list. Operators should prefer .env / environment variables.
args.twitter_username = args.twitter_username or os.getenv( args.twitter_username = args.twitter_username or os.getenv(
"TWITTER_USERNAME" "TWITTER_USERNAME"
) )