Updated indentation
This commit is contained in:
@@ -1,3 +1,7 @@
|
|||||||
|
id: twitter2bsky
|
||||||
|
name: twitter2bsky.py
|
||||||
|
type: code.python
|
||||||
|
content: |-
|
||||||
import argparse
|
import argparse
|
||||||
import arrow
|
import arrow
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -19,7 +23,7 @@ from playwright.sync_api import sync_playwright
|
|||||||
from moviepy import VideoFileClip
|
from moviepy import VideoFileClip
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import grapheme # add to imports at top
|
import grapheme
|
||||||
|
|
||||||
# --- Configuration ---
|
# --- Configuration ---
|
||||||
LOG_PATH = "twitter2bsky.log"
|
LOG_PATH = "twitter2bsky.log"
|
||||||
@@ -51,7 +55,6 @@ BSKY_SEND_POST_MAX_RETRIES = 3
|
|||||||
BSKY_SEND_POST_BASE_DELAY = 5
|
BSKY_SEND_POST_BASE_DELAY = 5
|
||||||
BSKY_SEND_POST_MAX_DELAY = 60
|
BSKY_SEND_POST_MAX_DELAY = 60
|
||||||
|
|
||||||
# --- Login hardening (NEW) ---
|
|
||||||
BSKY_LOGIN_MAX_RETRIES = 4
|
BSKY_LOGIN_MAX_RETRIES = 4
|
||||||
BSKY_LOGIN_BASE_DELAY = 10
|
BSKY_LOGIN_BASE_DELAY = 10
|
||||||
BSKY_LOGIN_MAX_DELAY = 600
|
BSKY_LOGIN_MAX_DELAY = 600
|
||||||
@@ -97,9 +100,9 @@ class _RunCache:
|
|||||||
self.og_title: dict = {}
|
self.og_title: dict = {}
|
||||||
self.url_resolution: dict = {}
|
self.url_resolution: dict = {}
|
||||||
self.url_validity: dict = {}
|
self.url_validity: dict = {}
|
||||||
self.locale: str = "en-US" # ← ADDED locale cache here
|
self.locale: str = "en-US"
|
||||||
self.video_hash_owner: dict = {} # sha256 -> tweet_id
|
self.video_hash_owner: dict = {}
|
||||||
self.video_url_owner: dict = {} # media_url -> tweet_id
|
self.video_url_owner: dict = {}
|
||||||
|
|
||||||
def clear(self):
|
def clear(self):
|
||||||
self.og_title.clear()
|
self.og_title.clear()
|
||||||
@@ -112,12 +115,14 @@ _cache = _RunCache()
|
|||||||
|
|
||||||
|
|
||||||
def reset_caches():
|
def reset_caches():
|
||||||
|
_cache.clear()
|
||||||
|
|
||||||
|
|
||||||
# === VIDEO BINDING PATCH APPLIED ===
|
# === VIDEO BINDING PATCH APPLIED ===
|
||||||
def sha256_bytes(data: bytes):
|
def sha256_bytes(data: bytes):
|
||||||
return hashlib.sha256(data).hexdigest()
|
return hashlib.sha256(data).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def sha256_file(path, chunk_size=1024 * 1024):
|
def sha256_file(path, chunk_size=1024 * 1024):
|
||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
@@ -128,15 +133,17 @@ def sha256_file(path, chunk_size=1024 * 1024):
|
|||||||
h.update(chunk)
|
h.update(chunk)
|
||||||
return h.hexdigest()
|
return h.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def media_url_looks_audio_only(url):
|
def media_url_looks_audio_only(url):
|
||||||
u = (url or "").lower()
|
u = (url or "").lower()
|
||||||
return "/aud/" in u or "/audio/" in u or "mp4a" in u
|
return "/aud/" in u or "/audio/" in u or "mp4a" in u
|
||||||
_cache.clear()
|
|
||||||
|
|
||||||
def grapheme_len(text):
|
def grapheme_len(text):
|
||||||
"""Return the grapheme cluster count, matching Bluesky's character counting."""
|
"""Return the grapheme cluster count, matching Bluesky's character counting."""
|
||||||
return grapheme.length(text)
|
return grapheme.length(text)
|
||||||
|
|
||||||
|
|
||||||
# BCP-47 language tag → sensible locale for Playwright
|
# BCP-47 language tag → sensible locale for Playwright
|
||||||
_LANG_TO_LOCALE = {
|
_LANG_TO_LOCALE = {
|
||||||
"ca": "ca-ES",
|
"ca": "ca-ES",
|
||||||
@@ -151,6 +158,7 @@ _LANG_TO_LOCALE = {
|
|||||||
"gl": "gl-ES",
|
"gl": "gl-ES",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def bsky_langs_to_playwright_locale(bsky_langs):
|
def bsky_langs_to_playwright_locale(bsky_langs):
|
||||||
"""
|
"""
|
||||||
Convert the first configured Bluesky language tag to a Playwright locale
|
Convert the first configured Bluesky language tag to a Playwright locale
|
||||||
@@ -161,6 +169,7 @@ def bsky_langs_to_playwright_locale(bsky_langs):
|
|||||||
primary = bsky_langs[0].strip().lower()
|
primary = bsky_langs[0].strip().lower()
|
||||||
return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}")
|
return _LANG_TO_LOCALE.get(primary, f"{primary}-{primary.upper()}")
|
||||||
|
|
||||||
|
|
||||||
# --- Custom Classes ---
|
# --- Custom Classes ---
|
||||||
class ScrapedMedia:
|
class ScrapedMedia:
|
||||||
def __init__(self, url, media_type="photo"):
|
def __init__(self, url, media_type="photo"):
|
||||||
@@ -204,11 +213,6 @@ def is_valid_url(url):
|
|||||||
def strip_trailing_url_punctuation(url):
|
def strip_trailing_url_punctuation(url):
|
||||||
if not url:
|
if not url:
|
||||||
return url
|
return url
|
||||||
# Strip a trailing hashtag-style fragment (#Word) that is really a social
|
|
||||||
# hashtag glued to the end of a URL with no space, e.g.
|
|
||||||
# https://cit.transit.gencat.cat#SCT → https://cit.transit.gencat.cat
|
|
||||||
# Only stripped when it starts with a letter so real anchors like
|
|
||||||
# /page#section-2 inside a longer sentence are left alone.
|
|
||||||
url = re.sub(r"#[A-Za-z]\w*$", "", url.strip())
|
url = re.sub(r"#[A-Za-z]\w*$", "", url.strip())
|
||||||
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url)
|
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url)
|
||||||
|
|
||||||
@@ -254,7 +258,6 @@ def repair_broken_urls(text):
|
|||||||
|
|
||||||
original = text
|
original = text
|
||||||
text = split_concatenated_urls(text)
|
text = split_concatenated_urls(text)
|
||||||
# Split glued hashtag suffixes before any rejoining passes
|
|
||||||
text = split_url_hashtag_suffix(text)
|
text = split_url_hashtag_suffix(text)
|
||||||
|
|
||||||
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
||||||
@@ -277,8 +280,6 @@ def repair_broken_urls(text):
|
|||||||
)
|
)
|
||||||
|
|
||||||
text = split_concatenated_urls(text)
|
text = split_concatenated_urls(text)
|
||||||
# Run hashtag split again after rejoining passes — the rejoining regex
|
|
||||||
# contains # in its character class so it can re-glue a fragment.
|
|
||||||
text = split_url_hashtag_suffix(text)
|
text = split_url_hashtag_suffix(text)
|
||||||
|
|
||||||
if text != original:
|
if text != original:
|
||||||
@@ -637,7 +638,7 @@ def fetch_tweet_og_title_text(tweet_url, locale="en-US"):
|
|||||||
"Chrome/145.0.7632.6 Safari/537.36"
|
"Chrome/145.0.7632.6 Safari/537.36"
|
||||||
),
|
),
|
||||||
viewport={"width": 1280, "height": 900},
|
viewport={"width": 1280, "height": 900},
|
||||||
locale=_cache.locale, # ← USE CACHE
|
locale=_cache.locale,
|
||||||
)
|
)
|
||||||
page = browser_context.new_page()
|
page = browser_context.new_page()
|
||||||
page.goto(
|
page.goto(
|
||||||
@@ -1212,6 +1213,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
|
|||||||
|
|
||||||
return url_pos
|
return url_pos
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
if grapheme_len(text) <= max_length:
|
if grapheme_len(text) <= max_length:
|
||||||
return text
|
return text
|
||||||
@@ -1222,6 +1224,8 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
|
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
|
||||||
return truncated[:last_space]
|
return truncated[:last_space]
|
||||||
return truncated
|
return truncated
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
|
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
if (
|
if (
|
||||||
not text
|
not text
|
||||||
@@ -1264,6 +1268,7 @@ def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LEN
|
|||||||
|
|
||||||
return truncate_text_safely(text, max_length)
|
return truncate_text_safely(text, max_length)
|
||||||
|
|
||||||
|
|
||||||
def choose_final_visible_text(
|
def choose_final_visible_text(
|
||||||
full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True
|
full_clean_text, primary_non_x_url=None, prefer_full_text_without_url=True
|
||||||
):
|
):
|
||||||
@@ -1398,7 +1403,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
|
|||||||
).hexdigest()
|
).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
# --- Login hardening helpers (NEW) ---
|
# --- Login hardening helpers ---
|
||||||
def is_rate_limited_error(error_obj):
|
def is_rate_limited_error(error_obj):
|
||||||
text = repr(error_obj).lower()
|
text = repr(error_obj).lower()
|
||||||
return (
|
return (
|
||||||
@@ -1457,12 +1462,10 @@ def create_bsky_client(base_url, handle, password):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception("❌ Bluesky login exception")
|
logging.exception("❌ Bluesky login exception")
|
||||||
|
|
||||||
# Fail fast on invalid credentials
|
|
||||||
if is_auth_error(e):
|
if is_auth_error(e):
|
||||||
logging.error("❌ Bluesky auth failed (invalid handle/app password).")
|
logging.error("❌ Bluesky auth failed (invalid handle/app password).")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Respect explicit rate-limit timing
|
|
||||||
if is_rate_limited_error(e):
|
if is_rate_limited_error(e):
|
||||||
if attempt < max_attempts:
|
if attempt < max_attempts:
|
||||||
wait = get_rate_limit_wait_seconds(e, default_delay=base_delay)
|
wait = get_rate_limit_wait_seconds(e, default_delay=base_delay)
|
||||||
@@ -1477,7 +1480,6 @@ def create_bsky_client(base_url, handle, password):
|
|||||||
logging.error("❌ Exhausted Bluesky login retries due to rate limiting.")
|
logging.error("❌ Exhausted Bluesky login retries due to rate limiting.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Retry transient/network problems
|
|
||||||
if is_network_error(e) or is_transient_error(e):
|
if is_network_error(e) or is_transient_error(e):
|
||||||
if attempt < max_attempts:
|
if attempt < max_attempts:
|
||||||
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||||
@@ -1491,7 +1493,6 @@ def create_bsky_client(base_url, handle, password):
|
|||||||
logging.error("❌ Exhausted Bluesky login retries after transient/network errors.")
|
logging.error("❌ Exhausted Bluesky login retries after transient/network errors.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Unknown errors: bounded retry anyway
|
|
||||||
if attempt < max_attempts:
|
if attempt < max_attempts:
|
||||||
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
wait = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||||
logging.warning(
|
logging.warning(
|
||||||
@@ -1720,7 +1721,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
|
|
||||||
return recent_posts
|
return recent_posts
|
||||||
|
|
||||||
|
|
||||||
# --- Upload / Retry Helpers ---
|
# --- Upload / Retry Helpers ---
|
||||||
def get_rate_limit_wait_seconds(error_obj, default_delay):
|
def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||||
"""
|
"""
|
||||||
@@ -1733,7 +1733,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
|||||||
try:
|
try:
|
||||||
now_ts = int(time.time())
|
now_ts = int(time.time())
|
||||||
|
|
||||||
# Direct headers on exception
|
|
||||||
headers = getattr(error_obj, "headers", None) or {}
|
headers = getattr(error_obj, "headers", None) or {}
|
||||||
retry_after = headers.get("retry-after") or headers.get("Retry-After")
|
retry_after = headers.get("retry-after") or headers.get("Retry-After")
|
||||||
if retry_after:
|
if retry_after:
|
||||||
@@ -1751,7 +1750,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Nested response headers
|
|
||||||
response = getattr(error_obj, "response", None)
|
response = getattr(error_obj, "response", None)
|
||||||
headers = getattr(response, "headers", None) or {}
|
headers = getattr(response, "headers", None) or {}
|
||||||
now_ts = int(time.time())
|
now_ts = int(time.time())
|
||||||
@@ -1921,6 +1919,7 @@ def send_post_with_retry(client, **kwargs):
|
|||||||
|
|
||||||
raise last_exception
|
raise last_exception
|
||||||
|
|
||||||
|
|
||||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||||
try:
|
try:
|
||||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||||
@@ -2304,11 +2303,6 @@ def build_external_link_embed(
|
|||||||
|
|
||||||
|
|
||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
# NOTE: Bluesky supports native @mention facets, but resolving a Twitter
|
|
||||||
# handle to a Bluesky DID requires an external lookup. That mapping is not
|
|
||||||
# available here so @mentions are passed through as plain text. If you add
|
|
||||||
# a handle-mapping table in the future, call
|
|
||||||
# text_builder.mention(word, did) here instead of text_builder.text(word).
|
|
||||||
text_builder = client_utils.TextBuilder()
|
text_builder = client_utils.TextBuilder()
|
||||||
content = clean_post_text(content)
|
content = clean_post_text(content)
|
||||||
lines = content.splitlines()
|
lines = content.splitlines()
|
||||||
@@ -2464,7 +2458,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle, local
|
|||||||
browser_context = browser.new_context(
|
browser_context = browser.new_context(
|
||||||
user_agent=clean_ua,
|
user_agent=clean_ua,
|
||||||
viewport={"width": 1920, "height": 1080},
|
viewport={"width": 1920, "height": 1080},
|
||||||
locale=locale, # ✅ add this
|
locale=locale,
|
||||||
)
|
)
|
||||||
login_page = browser_context.new_page()
|
login_page = browser_context.new_page()
|
||||||
|
|
||||||
@@ -3024,7 +3018,7 @@ def sync_feeds(args):
|
|||||||
logging.info("🔄 Starting sync cycle...")
|
logging.info("🔄 Starting sync cycle...")
|
||||||
dry_run = getattr(args, "dry_run", False)
|
dry_run = getattr(args, "dry_run", False)
|
||||||
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
||||||
bot_locale = bsky_langs_to_playwright_locale(bsky_langs) # ✅ now defined
|
bot_locale = bsky_langs_to_playwright_locale(bsky_langs)
|
||||||
_cache.locale = bot_locale
|
_cache.locale = bot_locale
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
@@ -3239,8 +3233,7 @@ def sync_feeds(args):
|
|||||||
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
|
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Pre-resolve video URLs in isolated contexts
|
||||||
# Pre-resolve video URLs in isolated contexts (deterministic text<->media binding)
|
|
||||||
if candidate_tweets:
|
if candidate_tweets:
|
||||||
with sync_playwright() as p_pre:
|
with sync_playwright() as p_pre:
|
||||||
pre_browser = p_pre.chromium.launch(
|
pre_browser = p_pre.chromium.launch(
|
||||||
@@ -3304,9 +3297,7 @@ def sync_feeds(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
logging.info(
|
logging.info(f" 📄 Text: {raw_text[:200]}")
|
||||||
f" 📄 Text: {raw_text[:200]}"
|
|
||||||
)
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f" 🔗 Primary external URL: "
|
f" 🔗 Primary external URL: "
|
||||||
f"{candidate.get('resolved_primary_external_url', 'None')}"
|
f"{candidate.get('resolved_primary_external_url', 'None')}"
|
||||||
@@ -3603,6 +3594,7 @@ def sync_feeds(args):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"❌ Error during sync cycle: {e}")
|
logging.error(f"❌ Error during sync cycle: {e}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
@@ -3613,8 +3605,6 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--twitter-password",
|
"--twitter-password",
|
||||||
# NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
|
|
||||||
# Prefer setting TWITTER_PASSWORD in your .env file instead.
|
|
||||||
help="Your Twitter login password",
|
help="Your Twitter login password",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -3631,8 +3621,6 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bsky-password",
|
"--bsky-password",
|
||||||
# NOTE (FIX #15): same warning as --twitter-password above.
|
|
||||||
# Prefer setting BSKY_APP_PASSWORD in your .env file instead.
|
|
||||||
help="Your Bluesky app password",
|
help="Your Bluesky app password",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -3657,8 +3645,7 @@ def main():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Resolve credentials: CLI args take priority, then env vars.
|
# Resolve credentials: CLI args take priority, then env vars.
|
||||||
# FIX #15 — env vars are the secure path; CLI args expose secrets in
|
# Prefer .env / environment variables to avoid exposing secrets in process list.
|
||||||
# the process list. Operators should prefer .env / environment variables.
|
|
||||||
args.twitter_username = args.twitter_username or os.getenv(
|
args.twitter_username = args.twitter_username or os.getenv(
|
||||||
"TWITTER_USERNAME"
|
"TWITTER_USERNAME"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user