Updated indentation
This commit is contained in:
@@ -1,3 +1,7 @@
|
||||
id: twitter2bsky
|
||||
name: twitter2bsky.py
|
||||
type: code.python
|
||||
content: |-
|
||||
import argparse
|
||||
import arrow
|
||||
import hashlib
|
||||
@@ -19,7 +23,7 @@ from playwright.sync_api import sync_playwright
|
||||
from moviepy import VideoFileClip
|
||||
from bs4 import BeautifulSoup
|
||||
from PIL import Image
|
||||
import grapheme # add to imports at top
|
||||
import grapheme
|
||||
|
||||
# --- Configuration ---
|
||||
LOG_PATH = "twitter2bsky.log"
|
||||
@@ -51,7 +55,6 @@ BSKY_SEND_POST_MAX_RETRIES = 3
|
||||
BSKY_SEND_POST_BASE_DELAY = 5
|
||||
BSKY_SEND_POST_MAX_DELAY = 60
|
||||
|
||||
# --- Login hardening (NEW) ---
|
||||
BSKY_LOGIN_MAX_RETRIES = 4
|
||||
BSKY_LOGIN_BASE_DELAY = 10
|
||||
BSKY_LOGIN_MAX_DELAY = 600
|
||||
@@ -97,9 +100,9 @@ class _RunCache:
|
||||
self.og_title: dict = {}
|
||||
self.url_resolution: dict = {}
|
||||
self.url_validity: dict = {}
|
||||
self.locale: str = "en-US" # ← ADDED locale cache here
|
||||
self.video_hash_owner: dict = {} # sha256 -> tweet_id
|
||||
self.video_url_owner: dict = {} # media_url -> tweet_id
|
||||
self.locale: str = "en-US"
|
||||
self.video_hash_owner: dict = {}
|
||||
self.video_url_owner: dict = {}
|
||||
|
||||
def clear(self):
|
||||
self.og_title.clear()
|
||||
@@ -112,12 +115,14 @@ _cache = _RunCache()
|
||||
|
||||
|
||||
def reset_caches():
    """Reset the module-level run cache by delegating to ``_cache.clear()``."""
    _cache.clear()
|
||||
|
||||
|
||||
# === VIDEO BINDING PATCH APPLIED ===
|
||||
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hexadecimal string."""
    return hashlib.sha256(data).hexdigest()
|
||||
|
||||
|
||||
def sha256_file(path, chunk_size=1024 * 1024):
|
||||
h = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
@@ -128,15 +133,17 @@ def sha256_file(path, chunk_size=1024 * 1024):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
def media_url_looks_audio_only(url):
    """Return True when ``url`` contains a marker of an audio-only stream.

    The check is a case-insensitive substring test for ``/aud/``,
    ``/audio/`` or ``mp4a``. A falsy ``url`` (``None`` or ``""``) is
    treated as an empty string and therefore yields False.
    """
    u = (url or "").lower()
    return "/aud/" in u or "/audio/" in u or "mp4a" in u
|
||||
_cache.clear()
|
||||
|
||||
|
||||
def grapheme_len(text):
    """Return the grapheme cluster count, matching Bluesky's character counting.

    Empty or ``None`` input counts as zero; the ``grapheme`` library is
    only consulted for non-empty text.
    """
    if not text:
        return 0
    return grapheme.length(text)
|
||||
|
||||
|
||||
# BCP-47 language tag → sensible locale for Playwright
|
||||
_LANG_TO_LOCALE = {
|
||||
"ca": "ca-ES",
|
||||
@@ -151,6 +158,7 @@ _LANG_TO_LOCALE = {
|
||||
"gl": "gl-ES",
|
||||
}
|
||||
|
||||
|
||||
def bsky_langs_to_playwright_locale(bsky_langs):
|
||||
"""
|
||||
Convert the first configured Bluesky language tag to a Playwright locale
|
||||
@@ -161,6 +169,7 @@ def bsky_langs_to_playwright_locale(bsky_langs):
|
||||
primary = bsky_langs[0].strip().lower()
|
||||
return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}")
|
||||
|
||||
|
||||
# --- Custom Classes ---
|
||||
class ScrapedMedia:
|
||||
def __init__(self, url, media_type="photo"):
|
||||
@@ -204,11 +213,6 @@ def is_valid_url(url):
|
||||
def strip_trailing_url_punctuation(url):
    """Remove trailing punctuation and a glued hashtag-style fragment from ``url``.

    A falsy ``url`` (``None`` or ``""``) is returned unchanged.

    Two kinds of trailing noise are removed:

    * whitespace, ellipsis and sentence punctuation (``. , ; : ! ? ) ] " '``);
    * a final ``#Word`` fragment that starts with a letter and consists only
      of word characters, i.e. a social hashtag glued to the URL without a
      space (``https://cit.transit.gencat.cat#SCT``). Fragments containing
      other characters, such as ``#section-2``, do not match and are kept.

    Punctuation is stripped both before and after the hashtag pass so that
    ``https://host#SCT.`` and ``https://host.#SCT`` both reduce to
    ``https://host``.
    """
    if not url:
        return url

    trailing_punct = r"[\s…\.,;:!?)\]\"']+$"

    # Drop punctuation first: otherwise a hashtag followed by "." or ")"
    # is not anchored at the end of the string and survives.
    url = re.sub(trailing_punct, "", url.strip())
    url = re.sub(r"#[A-Za-z]\w*$", "", url)
    # Removing the hashtag may expose punctuation that preceded it.
    return re.sub(trailing_punct, "", url)
|
||||
|
||||
@@ -254,7 +258,6 @@ def repair_broken_urls(text):
|
||||
|
||||
original = text
|
||||
text = split_concatenated_urls(text)
|
||||
# Split glued hashtag suffixes before any rejoining passes
|
||||
text = split_url_hashtag_suffix(text)
|
||||
|
||||
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
||||
@@ -277,8 +280,6 @@ def repair_broken_urls(text):
|
||||
)
|
||||
|
||||
text = split_concatenated_urls(text)
|
||||
# Run hashtag split again after rejoining passes — the rejoining regex
|
||||
# contains # in its character class so it can re-glue a fragment.
|
||||
text = split_url_hashtag_suffix(text)
|
||||
|
||||
if text != original:
|
||||
@@ -637,7 +638,7 @@ def fetch_tweet_og_title_text(tweet_url, locale="en-US"):
|
||||
"Chrome/145.0.7632.6 Safari/537.36"
|
||||
),
|
||||
viewport={"width": 1280, "height": 900},
|
||||
locale=_cache.locale, # ← USE CACHE
|
||||
locale=_cache.locale,
|
||||
)
|
||||
page = browser_context.new_page()
|
||||
page.goto(
|
||||
@@ -1212,6 +1213,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
|
||||
|
||||
return url_pos
|
||||
|
||||
|
||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
if grapheme_len(text) <= max_length:
|
||||
return text
|
||||
@@ -1222,6 +1224,8 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
|
||||
return truncated[:last_space]
|
||||
return truncated
|
||||
|
||||
|
||||
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||
if (
|
||||
not text
|
||||
@@ -1264,6 +1268,7 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
||||
|
||||
return truncate_text_safely(text, max_length)
|
||||
|
||||
|
||||
def choose_final_visible_text(
|
||||
full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True
|
||||
):
|
||||
@@ -1398,7 +1403,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
|
||||
).hexdigest()
|
||||
|
||||
|
||||
# --- Login hardening helpers (NEW) ---
|
||||
# --- Login hardening helpers ---
|
||||
def is_rate_limited_error(error_obj):
|
||||
text = repr(error_obj).lower()
|
||||
return (
|
||||
@@ -1457,12 +1462,10 @@ def create_bsky_client(base_url, handle, password):
|
||||
except Exception as e:
|
||||
logging.exception("❌ Bluesky login exception")
|
||||
|
||||
# Fail fast on invalid credentials
|
||||
if is_auth_error(e):
|
||||
logging.error("❌ Bluesky auth failed (invalid handle/app password).")
|
||||
raise
|
||||
|
||||
# Respect explicit rate-limit timing
|
||||
if is_rate_limited_error(e):
|
||||
if attempt < max_attempts:
|
||||
wait = get_rate_limit_wait_seconds(e, default_delay=base_delay)
|
||||
@@ -1477,7 +1480,6 @@ def create_bsky_client(base_url, handle, password):
|
||||
logging.error("❌ Exhausted Bluesky login retries due to rate limiting.")
|
||||
raise
|
||||
|
||||
# Retry transient/network problems
|
||||
if is_network_error(e) or is_transient_error(e):
|
||||
if attempt < max_attempts:
|
||||
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||
@@ -1491,7 +1493,6 @@ def create_bsky_client(base_url, handle, password):
|
||||
logging.error("❌ Exhausted Bluesky login retries after transient/network errors.")
|
||||
raise
|
||||
|
||||
# Unknown errors: bounded retry anyway
|
||||
if attempt < max_attempts:
|
||||
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||
logging.warning(
|
||||
@@ -1720,7 +1721,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
|
||||
return recent_posts
|
||||
|
||||
|
||||
# --- Upload / Retry Helpers ---
|
||||
def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||
"""
|
||||
@@ -1733,7 +1733,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||
try:
|
||||
now_ts = int(time.time())
|
||||
|
||||
# Direct headers on exception
|
||||
headers = getattr(error_obj, "headers", None) or {}
|
||||
retry_after = headers.get("retry-after") or headers.get("Retry-After")
|
||||
if retry_after:
|
||||
@@ -1751,7 +1750,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||
pass
|
||||
|
||||
try:
|
||||
# Nested response headers
|
||||
response = getattr(error_obj, "response", None)
|
||||
headers = getattr(response, "headers", None) or {}
|
||||
now_ts = int(time.time())
|
||||
@@ -1921,6 +1919,7 @@ def send_post_with_retry(client, **kwargs):
|
||||
|
||||
raise last_exception
|
||||
|
||||
|
||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||
try:
|
||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||
@@ -2304,11 +2303,6 @@ def build_external_link_embed(
|
||||
|
||||
|
||||
def make_rich(content):
|
||||
# NOTE: Bluesky supports native @mention facets, but resolving a Twitter
|
||||
# handle to a Bluesky DID requires an external lookup. That mapping is not
|
||||
# available here so @mentions are passed through as plain text. If you add
|
||||
# a handle-mapping table in the future, call
|
||||
# text_builder.mention(word, did) here instead of text_builder.text(word).
|
||||
text_builder = client_utils.TextBuilder()
|
||||
content = clean_post_text(content)
|
||||
lines = content.splitlines()
|
||||
@@ -2464,7 +2458,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle, local
|
||||
browser_context = browser.new_context(
|
||||
user_agent=clean_ua,
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
locale=locale, # ✅ add this
|
||||
locale=locale,
|
||||
)
|
||||
login_page = browser_context.new_page()
|
||||
|
||||
@@ -3024,7 +3018,7 @@ def sync_feeds(args):
|
||||
logging.info("🔄 Starting sync cycle...")
|
||||
dry_run = getattr(args, "dry_run", False)
|
||||
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
||||
bot_locale = bsky_langs_to_playwright_locale(bsky_langs) # ✅ now defined
|
||||
bot_locale = bsky_langs_to_playwright_locale(bsky_langs)
|
||||
_cache.locale = bot_locale
|
||||
|
||||
if dry_run:
|
||||
@@ -3239,8 +3233,7 @@ def sync_feeds(args):
|
||||
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
|
||||
)
|
||||
|
||||
|
||||
# Pre-resolve video URLs in isolated contexts (deterministic text<->media binding)
|
||||
# Pre-resolve video URLs in isolated contexts
|
||||
if candidate_tweets:
|
||||
with sync_playwright() as p_pre:
|
||||
pre_browser = p_pre.chromium.launch(
|
||||
@@ -3304,9 +3297,7 @@ def sync_feeds(args):
|
||||
)
|
||||
|
||||
if dry_run:
|
||||
logging.info(
|
||||
f" 📄 Text: {raw_text[:200]}"
|
||||
)
|
||||
logging.info(f" 📄 Text: {raw_text[:200]}")
|
||||
logging.info(
|
||||
f" 🔗 Primary external URL: "
|
||||
f"{candidate.get('resolved_primary_external_url', 'None')}"
|
||||
@@ -3603,6 +3594,7 @@ def sync_feeds(args):
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Error during sync cycle: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
load_dotenv()
|
||||
|
||||
@@ -3613,8 +3605,6 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--twitter-password",
|
||||
# NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
|
||||
# Prefer setting TWITTER_PASSWORD in your .env file instead.
|
||||
help="Your Twitter login password",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -3631,8 +3621,6 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bsky-password",
|
||||
# NOTE (FIX #15): same warning as --twitter-password above.
|
||||
# Prefer setting BSKY_APP_PASSWORD in your .env file instead.
|
||||
help="Your Bluesky app password",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -3657,8 +3645,7 @@ def main():
|
||||
args = parser.parse_args()
|
||||
|
||||
# Resolve credentials: CLI args take priority, then env vars.
|
||||
# FIX #15 — env vars are the secure path; CLI args expose secrets in
|
||||
# the process list. Operators should prefer .env / environment variables.
|
||||
# Prefer .env / environment variables to avoid exposing secrets in process list.
|
||||
args.twitter_username = args.twitter_username or os.getenv(
|
||||
"TWITTER_USERNAME"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user