From 6f67822e7e2f9a5f1c4d3dee9cb235c85705f894 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Mon, 13 Apr 2026 19:17:21 +0000 Subject: [PATCH] Some fixes --- twitter2bsky_daemon.py | 792 +++++++++++++++++++++++------------------ 1 file changed, 446 insertions(+), 346 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index d698b61..29a7a01 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -54,8 +54,26 @@ LINK_METADATA_TIMEOUT = 10 URL_RESOLVE_TIMEOUT = 12 PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000 SUBPROCESS_TIMEOUT_SECONDS = 180 +FFPROBE_TIMEOUT_SECONDS = 15 # FIX #6 — named constant for ffprobe probe timeout DEFAULT_BSKY_BASE_URL = "https://bsky.social" +# FIX #11 — named constants replacing magic numbers scattered across the codebase +OG_TITLE_WAIT_TIMEOUT_MS = 7000 # ms to wait for og:title meta tag +PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0 # seconds to sleep after page.goto in resolvers +PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8 # seconds between idle-state polls +PLAYWRIGHT_IDLE_POLL_ROUNDS = 4 # number of idle-state poll rounds +PLAYWRIGHT_RETRY_SLEEP_S = 2.0 # seconds to sleep before retry interaction +VIDEO_PLAYER_WAIT_ROUNDS = 8 # rounds waiting for video URL after first click +VIDEO_PLAYER_RETRY_ROUNDS = 5 # rounds waiting for video URL after retry click +URL_TAIL_MIN_PREFIX_CHARS = 35 # minimum prefix chars before URL for tail detection +URL_TAIL_MAX_LOOKBACK_CHARS = 120 # generous lookback window when hashtags follow URL +URL_TAIL_MAX_CLAUSE_DISTANCE = 180 # max chars a clause boundary may be from URL start +DYNAMIC_ALT_MAX_LENGTH = 150 # max chars for dynamic alt text +TRUNCATE_MIN_PREFIX_CHARS = 20 # min prefix length before inserting ellipsis +SHORT_TWEET_OG_FETCH_THRESHOLD = 35 # tweets shorter than this get og:title enrichment +ORPHAN_DIGIT_MAX_DIGITS = 3 # max digit count for orphaned-digit-line detection +SESSION_FILE_PERMISSIONS = 0o600 # FIX #14 — restrictive permissions for session cookie file + # --- Logging Setup --- logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", @@ -64,15 +82,25 @@ logging.basicConfig( ) # --- Per-run caches for efficiency --- -OG_TITLE_CACHE = {} -URL_RESOLUTION_CACHE = {} -URL_VALIDITY_CACHE = {} +# FIX #12 — caches are still module-level but now encapsulated in a class so they +# can be passed explicitly and are safe to reset between daemon cycles without +# relying on global mutation from arbitrary call sites. +class _RunCache: + def __init__(self): + self.og_title: dict = {} + self.url_resolution: dict = {} + self.url_validity: dict = {} + + def clear(self): + self.og_title.clear() + self.url_resolution.clear() + self.url_validity.clear() + +_cache = _RunCache() def reset_caches(): - OG_TITLE_CACHE.clear() - URL_RESOLUTION_CACHE.clear() - URL_VALIDITY_CACHE.clear() + _cache.clear() # --- Custom Classes --- @@ -102,8 +130,8 @@ def take_error_screenshot(page, error_msg): def is_valid_url(url): - if url in URL_VALIDITY_CACHE: - return URL_VALIDITY_CACHE[url] + if url in _cache.url_validity: + return _cache.url_validity[url] try: response = httpx.head(url, timeout=5, follow_redirects=True) @@ -111,7 +139,7 @@ def is_valid_url(url): except Exception: result = False - URL_VALIDITY_CACHE[url] = result + _cache.url_validity[url] = result return result @@ -304,12 +332,15 @@ def remove_orphaned_digit_lines_before_hashtags(text): changed = False i = 0 + # FIX #11 — use named constant ORPHAN_DIGIT_MAX_DIGITS instead of literal 3 + orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}") + while i < len(lines): stripped = lines[i].strip() if ( stripped - and re.fullmatch(r"\d{1,3}", stripped) + and orphan_pattern.fullmatch(stripped) and i + 1 < len(lines) and lines[i + 1].strip().startswith("#") ): @@ -488,7 +519,8 @@ def should_fetch_og_title(tweet): if "…" in text or text.endswith("..."): return True - if len(text) < 35: + # FIX #11 — use named constant SHORT_TWEET_OG_FETCH_THRESHOLD instead of literal 35 + if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD: return True return False @@ -498,12 +530,12 @@ def fetch_tweet_og_title_text(tweet_url): if not tweet_url: return None - if tweet_url in OG_TITLE_CACHE: + if tweet_url in _cache.og_title: logging.info(f"⚡ Using cached og:title text for {tweet_url}") - return OG_TITLE_CACHE[tweet_url] + return _cache.og_title[tweet_url] browser = None - context = None + browser_context = None # FIX #1 — renamed from 'context' to avoid collision page = None try: @@ -514,7 +546,7 @@ def fetch_tweet_og_title_text(tweet_url): headless=True, args=["--disable-blink-features=AutomationControlled"], ) - context = browser.new_context( + browser_context = browser.new_context( user_agent=( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) " @@ -522,7 +554,7 @@ def fetch_tweet_og_title_text(tweet_url): ), viewport={"width": 1280, "height": 900}, ) - page = context.new_page() + page = browser_context.new_page() page.goto( tweet_url, wait_until="domcontentloaded", @@ -530,7 +562,8 @@ def fetch_tweet_og_title_text(tweet_url): ) try: - page.wait_for_selector('meta[property="og:title"]', timeout=7000) + # FIX #11 — use named constant OG_TITLE_WAIT_TIMEOUT_MS instead of literal 7000 + page.wait_for_selector('meta[property="og:title"]', timeout=OG_TITLE_WAIT_TIMEOUT_MS) except Exception: pass @@ -542,12 +575,12 @@ def fetch_tweet_og_title_text(tweet_url): if extracted: extracted = clean_post_text(extracted) - OG_TITLE_CACHE[tweet_url] = extracted + _cache.og_title[tweet_url] = extracted logging.info(f"✅ Extracted tweet text from og:title for {tweet_url}") return extracted logging.info(f"ℹ️ No usable og:title text extracted for {tweet_url}") - OG_TITLE_CACHE[tweet_url] = None + _cache.og_title[tweet_url] = None return None except Exception as e: @@ -559,7 +592,7 @@ def fetch_tweet_og_title_text(tweet_url): take_error_screenshot(page, "tweet_og_title_failed") except Exception: pass - OG_TITLE_CACHE[tweet_url] = None + _cache.og_title[tweet_url] = None return None finally: try: @@ -568,8 +601,8 @@ def fetch_tweet_og_title_text(tweet_url): except Exception: pass try: - if context: - context.close() + if browser_context: + browser_context.close() except Exception: pass try: @@ -596,7 +629,7 @@ def resolve_tco_with_httpx(url, http_client): def resolve_tco_with_playwright(url): browser = None - context = None + browser_context = None # FIX #1 — renamed from 'context' page = None try: @@ -607,7 +640,7 @@ def resolve_tco_with_playwright(url): headless=True, args=["--disable-blink-features=AutomationControlled"], ) - context = browser.new_context( + browser_context = browser.new_context( user_agent=( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) " @@ -615,7 +648,7 @@ def resolve_tco_with_playwright(url): ), viewport={"width": 1280, "height": 900}, ) - page = context.new_page() + page = browser_context.new_page() try: page.goto( @@ -628,10 +661,12 @@ def resolve_tco_with_playwright(url): f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}" ) - time.sleep(2) + # FIX #11 — use named constant PLAYWRIGHT_POST_GOTO_SLEEP_S + time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S) final_url = canonicalize_url(page.url) - for _ in range(4): + # FIX #11 — use named constants for poll rounds and sleep + for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS): if final_url and is_external_non_x_url(final_url): break @@ -640,7 +675,7 @@ def resolve_tco_with_playwright(url): except Exception: pass - time.sleep(0.8) + time.sleep(PLAYWRIGHT_IDLE_POLL_SLEEP_S) final_url = canonicalize_url(page.url) logging.info(f"🌐 Playwright final URL for {url}: {final_url}") @@ -662,8 +697,8 @@ def resolve_tco_with_playwright(url): except Exception: pass try: - if context: - context.close() + if browser_context: + browser_context.close() except Exception: pass try: @@ -684,23 +719,23 @@ def resolve_url_if_needed(url, http_client, allow_playwright_fallback=True): if not cleaned: return None - if cleaned in URL_RESOLUTION_CACHE: + if cleaned in _cache.url_resolution: logging.info( - f"⚡ Using cached URL resolution: {cleaned} -> {URL_RESOLUTION_CACHE[cleaned]}" + f"⚡ Using cached URL resolution: {cleaned} -> {_cache.url_resolution[cleaned]}" ) - return URL_RESOLUTION_CACHE[cleaned] + return _cache.url_resolution[cleaned] if not is_tco_domain(cleaned): - URL_RESOLUTION_CACHE[cleaned] = cleaned + _cache.url_resolution[cleaned] = cleaned return cleaned resolved_http = resolve_tco_with_httpx(cleaned, http_client) if is_external_non_x_url(resolved_http): - URL_RESOLUTION_CACHE[cleaned] = resolved_http + _cache.url_resolution[cleaned] = resolved_http return resolved_http if not allow_playwright_fallback: - URL_RESOLUTION_CACHE[cleaned] = resolved_http + _cache.url_resolution[cleaned] = resolved_http return resolved_http resolved_browser = resolve_tco_with_playwright(cleaned) @@ -708,14 +743,14 @@ def resolve_url_if_needed(url, http_client, allow_playwright_fallback=True): logging.info( f"✅ Resolved t.co via Playwright to external URL: {resolved_browser}" ) - URL_RESOLUTION_CACHE[cleaned] = resolved_browser + _cache.url_resolution[cleaned] = resolved_browser return resolved_browser if resolved_http and not is_tco_domain(resolved_http): - URL_RESOLUTION_CACHE[cleaned] = resolved_http + _cache.url_resolution[cleaned] = resolved_http return resolved_http - URL_RESOLUTION_CACHE[cleaned] = cleaned + _cache.url_resolution[cleaned] = cleaned return cleaned @@ -825,7 +860,6 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): replacements = {} first_external_resolved = None - seen_external_per_line = set() for raw_url in urls: normalized = normalize_urlish_token(raw_url) @@ -846,7 +880,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): if is_external_non_x_url(resolved_http_first): final_url = resolved_http_first - URL_RESOLUTION_CACHE[cleaned] = final_url + _cache.url_resolution[cleaned] = final_url else: if ( has_media @@ -854,7 +888,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): and is_x_or_twitter_domain(resolved_http_first) ): final_url = resolved_http_first - URL_RESOLUTION_CACHE[cleaned] = final_url + _cache.url_resolution[cleaned] = final_url logging.info( f"⚡ Skipping Playwright t.co fallback because tweet has media " f"and httpx already resolved to X/Twitter URL: {final_url}" @@ -894,7 +928,8 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): prefix = re.sub(url_pattern, "", line).strip() kept_urls = [] - seen_external_per_line.clear() + # FIX #4 — local set per line, not shared outer state + seen_in_line: set = set() for url in line_urls: normalized = normalize_urlish_token(url) or url canonical = canonicalize_url(normalized) @@ -903,10 +938,10 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): continue if is_x_or_twitter_domain(canonical): continue - if canonical in seen_external_per_line: + if canonical in seen_in_line: continue - seen_external_per_line.add(canonical) + seen_in_line.add(canonical) kept_urls.append(url) if prefix and kept_urls: @@ -1085,19 +1120,22 @@ def find_tail_preservation_start(text, primary_non_x_url): candidates.append(last_newline + 1) if has_hashtag_after_url: - generous_start = max(0, url_pos - 120) + # FIX #11 — use named constant URL_TAIL_MAX_LOOKBACK_CHARS instead of literal 120 + generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS) while generous_start > 0 and text[generous_start] not in {" ", "\n"}: generous_start -= 1 candidates.append(generous_start) + # FIX #11 — use named constant URL_TAIL_MAX_CLAUSE_DISTANCE instead of literal 180 reasonable_candidates = [ - c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= 180 + c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE ] if reasonable_candidates: start = min(reasonable_candidates, key=lambda c: (url_pos - c)) - if url_pos - start < 35: - farther = [c for c in reasonable_candidates if url_pos - c >= 35] + # FIX #11 — use named constant URL_TAIL_MIN_PREFIX_CHARS instead of literal 35 + if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS: + farther = [c for c in reasonable_candidates if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS] if farther: start = min(farther, key=lambda c: (url_pos - c)) return start @@ -1111,7 +1149,8 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): truncated = text[: max_length - 3] last_space = truncated.rfind(" ") - if last_space > 0: + # FIX #11 — use named constant TRUNCATE_MIN_PREFIX_CHARS instead of literal 0 + if last_space > TRUNCATE_MIN_PREFIX_CHARS: return truncated[:last_space] + "..." return truncated + "..." @@ -1533,8 +1572,10 @@ def get_recent_bsky_posts(client, handle, limit=30): ) except Exception as e: + # FIX #9 — elevated to WARNING so operators notice live dedup is disabled logging.warning( - f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}" + f"⚠️ Could not fetch recent Bluesky posts for duplicate detection " + f"(live dedup disabled for this cycle): {e}" ) return recent_posts @@ -1644,7 +1685,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): logging.warning(f"Could not upload {media_label}: {repr(last_exception)}") return None - def send_post_with_retry(client, **kwargs): """ Wrapper around client.send_post() with retry logic for transient errors @@ -1776,6 +1816,7 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): return None + def get_blob_from_url(media_url, client, http_client): try: r = http_client.get( @@ -1905,9 +1946,7 @@ def compress_external_thumb_to_limit( f"🖼️ Resized external thumb to {new_size[0]}x{new_size[1]}" ) - for quality in [ - 85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY - ]: + for quality in [85, 75, 65, 55, 45, EXTERNAL_THUMB_MIN_JPEG_QUALITY]: out = io.BytesIO() img.save( out, @@ -2009,9 +2048,7 @@ def get_external_thumb_blob_from_url(image_url, client, http_client): ) return None else: - logging.info( - "✅ External thumb already within safe size limit." - ) + logging.info("✅ External thumb already within safe size limit.") blob = upload_blob_with_retry( client, @@ -2021,9 +2058,7 @@ def get_external_thumb_blob_from_url(image_url, client, http_client): if blob: return blob - logging.warning( - "⚠️ External thumb upload failed. Will omit thumbnail." - ) + logging.warning("⚠️ External thumb upload failed. Will omit thumbnail.") return None except Exception as e: @@ -2042,32 +2077,26 @@ def fetch_link_metadata(url, http_client): soup = BeautifulSoup(r.text, "html.parser") title = soup.find("meta", property="og:title") or soup.find("title") - desc = soup.find( - "meta", property="og:description" - ) or soup.find("meta", attrs={"name": "description"}) - image = soup.find( - "meta", property="og:image" - ) or soup.find("meta", attrs={"name": "twitter:image"}) + desc = ( + soup.find("meta", property="og:description") + or soup.find("meta", attrs={"name": "description"}) + ) + image = ( + soup.find("meta", property="og:image") + or soup.find("meta", attrs={"name": "twitter:image"}) + ) return { "title": ( title["content"] if title and title.has_attr("content") - else ( - title.text.strip() - if title and title.text - else "" - ) + else (title.text.strip() if title and title.text else "") ), "description": ( - desc["content"] - if desc and desc.has_attr("content") - else "" + desc["content"] if desc and desc.has_attr("content") else "" ), "image": ( - image["content"] - if image and image.has_attr("content") - else None + image["content"] if image and image.has_attr("content") else None ), } @@ -2079,9 +2108,13 @@ def fetch_link_metadata(url, http_client): def build_external_link_embed( - url, client, http_client, fallback_title="Link" + url, client, http_client, fallback_title="Link", + prefetched_metadata=None, ): - link_metadata = fetch_link_metadata(url, http_client) + # FIX #5 — accept pre-fetched metadata to avoid a duplicate HTTP request + # when the caller already fetched it for build_dynamic_alt. + link_metadata = prefetched_metadata if prefetched_metadata is not None \ + else fetch_link_metadata(url, http_client) thumb_blob = None if link_metadata.get("image"): @@ -2089,13 +2122,9 @@ def build_external_link_embed( link_metadata["image"], client, http_client ) if thumb_blob: - logging.info( - "✅ External link card thumbnail prepared successfully" - ) + logging.info("✅ External link card thumbnail prepared successfully") else: - logging.info( - "ℹ️ External link card will be posted without thumbnail" - ) + logging.info("ℹ️ External link card will be posted without thumbnail") if ( link_metadata.get("title") @@ -2115,6 +2144,13 @@ def build_external_link_embed( def make_rich(content): + # FIX #10 — note explaining @mention limitation. + # Bluesky supports native @mention facets, but resolving a Twitter handle + # to a Bluesky DID requires an external lookup (e.g. via the atproto + # identity resolution API). That mapping is not available here, so + # @mentions are intentionally passed through as plain text. If you add a + # handle-mapping table in the future, call + # text_builder.mention(word, did) here instead of text_builder.text(word). text_builder = client_utils.TextBuilder() content = clean_post_text(content) lines = content.splitlines() @@ -2172,15 +2208,21 @@ def make_rich(content): return text_builder -def build_dynamic_alt(raw_text): +def build_dynamic_alt(raw_text, link_title=None): + # FIX #5 — accept optional link_title so URL-only tweets get a richer alt + # instead of always falling back to the generic "Attached video or image" string. dynamic_alt = clean_post_text(raw_text) dynamic_alt = dynamic_alt.replace("\n", " ").strip() dynamic_alt = re.sub( r"(?:(?:https?://)|(?:www\.))\S+", "", dynamic_alt ).strip() - if len(dynamic_alt) > 150: - dynamic_alt = dynamic_alt[:147] + "..." + if not dynamic_alt and link_title: + dynamic_alt = link_title.strip() + + # FIX #11 — use named constant DYNAMIC_ALT_MAX_LENGTH instead of literal 150 + if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH: + dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..." elif not dynamic_alt: dynamic_alt = "Attached video or image from tweet" @@ -2200,12 +2242,19 @@ def build_video_embed(video_blob, alt_text): # --- Twitter Scraping --- -def scrape_tweets_via_playwright( - username, password, email, target_handle -): +def scrape_tweets_via_playwright(username, password, email, target_handle): tweets = [] state_file = "twitter_browser_state.json" + # FIX #14 — enforce restrictive permissions on the session cookie file + if os.path.exists(state_file): + try: + os.chmod(state_file, SESSION_FILE_PERMISSIONS) + except Exception as e: + logging.warning( + f"⚠️ Could not set permissions on {state_file}: {e}" + ) + with sync_playwright() as p: browser = p.chromium.launch( headless=True, @@ -2217,27 +2266,35 @@ def scrape_tweets_via_playwright( "Chrome/145.0.7632.6 Safari/537.36" ) - context = None + # FIX #1 — all Playwright browser context variables renamed to + # 'browser_context' throughout this function to eliminate the name + # collision with the 'context_text' / 'social_context_el' variables + # used inside the per-article parsing loop below. + browser_context = None needs_login = True + # FIX #7 — track the session-check page explicitly so we can close + # it before opening the profile scrape page, preventing a page leak. + session_check_page = None + if os.path.exists(state_file): logging.info( "✅ Found existing browser state. Attempting to bypass login..." ) - context = browser.new_context( + browser_context = browser.new_context( user_agent=clean_ua, viewport={"width": 1920, "height": 1080}, storage_state=state_file, ) - page = context.new_page() - page.goto("https://x.com/home") + session_check_page = browser_context.new_page() + session_check_page.goto("https://x.com/home") time.sleep(3) if ( - page.locator( + session_check_page.locator( '[data-testid="SideNav_NewTweet_Button"]' ).is_visible() - or "/home" in page.url + or "/home" in session_check_page.url ): logging.info("✅ Session is valid!") needs_login = False @@ -2245,26 +2302,36 @@ def scrape_tweets_via_playwright( logging.warning( "⚠️ Saved session expired or invalid. Re-logging in..." ) - context.close() + # FIX #7 — close the check page before closing the context + session_check_page.close() + session_check_page = None + browser_context.close() + browser_context = None os.remove(state_file) + # FIX #7 — always close the session-check page before opening the + # profile page, whether a re-login was needed or not. + if session_check_page is not None: + session_check_page.close() + session_check_page = None + if needs_login: logging.info( "🚀 Launching fresh browser for automated Twitter login..." ) - context = browser.new_context( + browser_context = browser.new_context( user_agent=clean_ua, viewport={"width": 1920, "height": 1080}, ) - page = context.new_page() + login_page = browser_context.new_page() try: - page.goto("https://x.com") - sign_in_button = page.get_by_text("Sign in", exact=True) + login_page.goto("https://x.com") + sign_in_button = login_page.get_by_text("Sign in", exact=True) sign_in_button.wait_for(state="visible", timeout=15000) sign_in_button.click(force=True) - page.wait_for_selector( + login_page.wait_for_selector( 'h1:has-text("Sign in to X")', state="visible", timeout=25000, @@ -2272,73 +2339,89 @@ def scrape_tweets_via_playwright( logging.info(f"👤 Entering username: {username}...") time.sleep(1) - username_input = page.locator( + username_input = login_page.locator( 'input[autocomplete="username"]' ).first username_input.wait_for(state="visible", timeout=15000) username_input.click(force=True) username_input.press_sequentially(username, delay=100) - page.locator('button:has-text("Next")').first.click( + login_page.locator('button:has-text("Next")').first.click( force=True ) - page.wait_for_selector( - 'input[name="password"], input[data-testid="ocfEnterTextTextInput"], input[name="text"]', + login_page.wait_for_selector( + 'input[name="password"], ' + 'input[data-testid="ocfEnterTextTextInput"], ' + 'input[name="text"]', timeout=15000, ) time.sleep(1) - if page.locator( + if login_page.locator( 'input[data-testid="ocfEnterTextTextInput"]' - ).is_visible() or page.locator( + ).is_visible() or login_page.locator( 'input[name="text"]' ).is_visible(): logging.warning( "🛡️ Security challenge detected! Entering email/phone..." ) - page.fill( - 'input[data-testid="ocfEnterTextTextInput"], input[name="text"]', + login_page.fill( + 'input[data-testid="ocfEnterTextTextInput"], ' + 'input[name="text"]', email, ) - sec_next = page.locator( - '[data-testid="ocfEnterTextNextButton"], span:has-text("Next")' + sec_next = login_page.locator( + '[data-testid="ocfEnterTextNextButton"], ' + 'span:has-text("Next")' ).first if sec_next.is_visible(): sec_next.click(force=True) else: - page.keyboard.press("Enter") - page.wait_for_selector( + login_page.keyboard.press("Enter") + login_page.wait_for_selector( 'input[name="password"]', timeout=15000 ) time.sleep(1) logging.info("🔑 Entering password...") - page.fill('input[name="password"]', password) - page.locator('span:has-text("Log in")').first.click() + login_page.fill('input[name="password"]', password) + login_page.locator('span:has-text("Log in")').first.click() - page.wait_for_url("**/home", timeout=20000) + login_page.wait_for_url("**/home", timeout=20000) time.sleep(3) - context.storage_state(path=state_file) + browser_context.storage_state(path=state_file) + # FIX #14 — set restrictive permissions immediately after writing + try: + os.chmod(state_file, SESSION_FILE_PERMISSIONS) + except Exception as chmod_err: + logging.warning( + f"⚠️ Could not set permissions on {state_file} " + f"after save: {chmod_err}" + ) logging.info("✅ Login successful. Browser state saved.") except Exception as e: - take_error_screenshot(page, "login_failed") + take_error_screenshot(login_page, "login_failed") logging.error(f"❌ Login failed: {e}") + login_page.close() browser.close() return [] + # FIX #7 — close the login page cleanly before opening scrape page + login_page.close() + logging.info( f"🌐 Navigating to https://x.com/{target_handle} to scrape tweets..." ) - page = context.new_page() - page.goto(f"https://x.com/{target_handle}") + scrape_page = browser_context.new_page() + scrape_page.goto(f"https://x.com/{target_handle}") try: - page.wait_for_selector("article", timeout=20000) + scrape_page.wait_for_selector("article", timeout=20000) time.sleep(2) - articles = page.locator("article").all() + articles = scrape_page.locator("article").all() logging.info( f"📊 Found {len(articles)} tweets on screen. " f"Parsing up to {SCRAPE_TWEET_LIMIT}..." @@ -2366,13 +2449,12 @@ def scrape_tweets_via_playwright( # --- Retweet detection --- is_retweet = False try: - social_context = article.locator( + # FIX #1 — renamed from 'context' to 'social_context_el' + social_context_el = article.locator( '[data-testid="socialContext"]' ).first - if social_context.is_visible(): - context_text = ( - social_context.inner_text().lower() - ) + if social_context_el.is_visible(): + context_text = social_context_el.inner_text().lower() repost_keywords = [ "reposted", "retweeted", @@ -2382,10 +2464,7 @@ def scrape_tweets_via_playwright( "ha reposteado", "retuiteó", ] - if any( - kw in context_text - for kw in repost_keywords - ): + if any(kw in context_text for kw in repost_keywords): is_retweet = True logging.info( f"🔁 Detected retweet/repost: {tweet_url}" @@ -2410,9 +2489,7 @@ def scrape_tweets_via_playwright( for img in photo_locators: src = img.get_attribute("src") if src: - src = re.sub( - r"&name=\w+", "&name=large", src - ) + src = re.sub(r"&name=\w+", "&name=large", src) media_urls.append((src, "photo")) video_locators = article.locator( @@ -2447,9 +2524,7 @@ def scrape_tweets_via_playwright( "a[href]" ).first if card_a.is_visible(): - card_href = card_a.get_attribute( - "href" - ) + card_href = card_a.get_attribute("href") if card_href: card_url = card_href.strip() logging.info( @@ -2476,7 +2551,7 @@ def scrape_tweets_via_playwright( continue except Exception as e: - take_error_screenshot(page, "scrape_failed") + take_error_screenshot(scrape_page, "scrape_failed") logging.error(f"❌ Failed to scrape profile: {e}") browser.close() @@ -2484,8 +2559,9 @@ def scrape_tweets_via_playwright( # --- Video Extraction & Processing --- -def extract_video_url_from_tweet_page(context, tweet_url): - page = context.new_page() +def extract_video_url_from_tweet_page(browser_context, tweet_url): + # FIX #1 — parameter renamed from 'context' to 'browser_context' + page = browser_context.new_page() best_m3u8_url = None best_video_mp4_url = None seen_urls = set() @@ -2534,9 +2610,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): or "audio/mp4" in content_type_l ): if is_audio_only_mp4(url, content_type): - logging.info( - f"🔇 Ignoring audio-only MP4: {url}" - ) + logging.info(f"🔇 Ignoring audio-only MP4: {url}") return if best_video_mp4_url is None: @@ -2556,9 +2630,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): logging.info( f"🎬 Opening tweet page to capture video URL: {tweet_url}" ) - page.goto( - tweet_url, wait_until="domcontentloaded", timeout=30000 - ) + page.goto(tweet_url, wait_until="domcontentloaded", timeout=30000) time.sleep(2) player = page.locator('[data-testid="videoPlayer"]').first @@ -2575,11 +2647,10 @@ def extract_video_url_from_tweet_page(context, tweet_url): except Exception as e: logging.info(f"⚠️ First player click failed: {e}") else: - logging.warning( - "⚠️ No video player locator found on tweet page" - ) + logging.warning("⚠️ No video player locator found on tweet page") - for _ in range(8): + # FIX #11 — use named constant VIDEO_PLAYER_WAIT_ROUNDS + for _ in range(VIDEO_PLAYER_WAIT_ROUNDS): if current_best(): break time.sleep(1) @@ -2590,7 +2661,8 @@ def extract_video_url_from_tweet_page(context, tweet_url): ) try: player.click(force=True, timeout=5000) - time.sleep(2) + # FIX #11 — use named constant PLAYWRIGHT_RETRY_SLEEP_S + time.sleep(PLAYWRIGHT_RETRY_SLEEP_S) except Exception as e: logging.info(f"⚠️ Retry click failed: {e}") @@ -2600,16 +2672,15 @@ def extract_video_url_from_tweet_page(context, tweet_url): except Exception: pass - for _ in range(5): + # FIX #11 — use named constant VIDEO_PLAYER_RETRY_ROUNDS + for _ in range(VIDEO_PLAYER_RETRY_ROUNDS): if current_best(): break time.sleep(1) selected_url = current_best() if selected_url: - logging.info( - f"✅ Selected media URL for download: {selected_url}" - ) + logging.info(f"✅ Selected media URL for download: {selected_url}") else: logging.warning( f"⚠️ No playable media URL detected on tweet page: {tweet_url}" @@ -2626,6 +2697,42 @@ def extract_video_url_from_tweet_page(context, tweet_url): page.close() +def _probe_video_duration(file_path): + """ + FIX #6 — Use ffprobe via subprocess instead of VideoFileClip to get video + duration. This avoids a potential hang on corrupt/truncated files since we + apply a hard timeout to the subprocess call. + Returns duration in seconds as a float, or raises RuntimeError on failure. + """ + probe_cmd = [ + "ffprobe", + "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + file_path, + ] + try: + result = subprocess.run( + probe_cmd, + capture_output=True, + text=True, + timeout=FFPROBE_TIMEOUT_SECONDS, + ) + if result.returncode != 0: + raise RuntimeError( + f"ffprobe exited with code {result.returncode}: " + f"{result.stderr.strip()}" + ) + duration_str = result.stdout.strip() + if not duration_str: + raise RuntimeError("ffprobe returned empty duration output") + return float(duration_str) + except subprocess.TimeoutExpired: + raise RuntimeError( + f"ffprobe timed out after {FFPROBE_TIMEOUT_SECONDS}s on {file_path}" + ) + + def download_and_crop_video(video_url, output_path): temp_input = output_path.replace(".mp4", "_source.mp4") temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") @@ -2641,27 +2748,19 @@ def download_and_crop_video(video_url, output_path): if ".m3u8" in video_url_l: logging.info("📺 Using HLS ffmpeg mode") download_cmd = [ - "ffmpeg", - "-y", - "-protocol_whitelist", - "file,http,https,tcp,tls,crypto", - "-allowed_extensions", - "ALL", - "-i", - video_url, - "-c", - "copy", + "ffmpeg", "-y", + "-protocol_whitelist", "file,http,https,tcp,tls,crypto", + "-allowed_extensions", "ALL", + "-i", video_url, + "-c", "copy", temp_input, ] else: logging.info("🎥 Using direct MP4 ffmpeg mode") download_cmd = [ - "ffmpeg", - "-y", - "-i", - video_url, - "-c", - "copy", + "ffmpeg", "-y", + "-i", video_url, + "-c", "copy", temp_input, ] @@ -2689,13 +2788,15 @@ def download_and_crop_video(video_url, output_path): logging.info(f"✅ Video downloaded: {temp_input}") - video_clip = VideoFileClip(temp_input) - duration = ( - float(video_clip.duration) if video_clip.duration else 0 - ) + # FIX #6 — probe duration with ffprobe (hard timeout) instead of + # VideoFileClip, which can hang indefinitely on corrupt files. + try: + duration = _probe_video_duration(temp_input) + except RuntimeError as probe_err: + logging.error(f"❌ Could not probe video duration: {probe_err}") + return None if duration <= 0: - video_clip.close() logging.error( "❌ Downloaded video has invalid or unknown duration." ) @@ -2703,31 +2804,36 @@ def download_and_crop_video(video_url, output_path): end_time = min(VIDEO_MAX_DURATION_SECONDS, duration) - if hasattr(video_clip, "subclipped"): - cropped_clip = video_clip.subclipped(0, end_time) - else: - cropped_clip = video_clip.subclip(0, end_time) + # FIX #2 — wrap VideoFileClip usage in nested try/finally blocks so + # both the source clip and the subclip handles are always closed, even + # if write_videofile raises an exception mid-way. + video_clip = VideoFileClip(temp_input) + try: + if hasattr(video_clip, "subclipped"): + cropped_clip = video_clip.subclipped(0, end_time) + else: + cropped_clip = video_clip.subclip(0, end_time) - cropped_clip.write_videofile( - temp_trimmed, - codec="libx264", - audio_codec="aac", - preset="veryfast", - bitrate="1800k", - audio_bitrate="128k", - logger=None, - ) - - video_clip.close() - cropped_clip.close() + try: + cropped_clip.write_videofile( + temp_trimmed, + codec="libx264", + audio_codec="aac", + preset="veryfast", + bitrate="1800k", + audio_bitrate="128k", + logger=None, + ) + finally: + cropped_clip.close() # FIX #2 — always close subclip + finally: + video_clip.close() # FIX #2 — always close source clip if ( not os.path.exists(temp_trimmed) or os.path.getsize(temp_trimmed) == 0 ): - logging.error( - "❌ Trimmed video output is missing or empty." - ) + logging.error("❌ Trimmed video output is missing or empty.") return None trimmed_size_mb = os.path.getsize(temp_trimmed) / (1024 * 1024) @@ -2736,28 +2842,17 @@ def download_and_crop_video(video_url, output_path): ) compress_cmd = [ - "ffmpeg", - "-y", - "-i", - temp_trimmed, - "-vf", - "scale='min(720,iw)':-2", - "-c:v", - "libx264", - "-preset", - "veryfast", - "-crf", - "30", - "-maxrate", - "1800k", - "-bufsize", - "3600k", - "-c:a", - "aac", - "-b:a", - "128k", - "-movflags", - "+faststart", + "ffmpeg", "-y", + "-i", temp_trimmed, + "-vf", "scale='min(720,iw)':-2", + "-c:v", "libx264", + "-preset", "veryfast", + "-crf", "30", + "-maxrate", "1800k", + "-bufsize", "3600k", + "-c:a", "aac", + "-b:a", "128k", + "-movflags", "+faststart", temp_output, ] @@ -2778,9 +2873,7 @@ def download_and_crop_video(video_url, output_path): not os.path.exists(temp_output) or os.path.getsize(temp_output) == 0 ): - logging.error( - "❌ Compressed video output is missing or empty." - ) + logging.error("❌ Compressed video output is missing or empty.") return None final_size_mb = os.path.getsize(temp_output) / (1024 * 1024) @@ -2805,6 +2898,8 @@ def download_and_crop_video(video_url, output_path): finally: remove_file_quietly(temp_input) remove_file_quietly(temp_trimmed) + # temp_output was either renamed to output_path via os.replace() + # or never created; remove_file_quietly is a no-op if it doesn't exist. remove_file_quietly(temp_output) @@ -2833,7 +2928,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): return False, None -# --- Main Sync Logic --- def sync_feeds(args): logging.info("🔄 Starting sync cycle...") @@ -2847,6 +2941,9 @@ def sync_feeds(args): try: state = load_state(STATE_PATH) + # FIX #8 — prune on load so the state file never grows unbounded + # between runs, not only after individual posts. + state = prune_state(state, max_entries=5000) tweets = scrape_tweets_via_playwright( args.twitter_username, @@ -2857,7 +2954,8 @@ def sync_feeds(args): if not tweets: logging.warning( - "⚠️ No tweets found or failed to fetch. Skipping Bluesky sync for this cycle." + "⚠️ No tweets found or failed to fetch. " + "Skipping Bluesky sync for this cycle." ) return @@ -2878,16 +2976,16 @@ def sync_feeds(args): ) logging.info( - f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for duplicate detection." + f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts " + f"for duplicate detection." ) logging.info( - f"🧠 Local state currently tracks {len(state.get('posted_tweets', {}))} posted items." + f"🧠 Local state currently tracks " + f"{len(state.get('posted_tweets', {}))} posted items." ) too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS) - logging.info( - f"🕒 Will ignore tweets older than: {too_old_cutoff}" - ) + logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}") candidate_tweets = [] @@ -2898,26 +2996,22 @@ def sync_feeds(args): tweet_time = arrow.get(tweet.created_on) if tweet_time < too_old_cutoff: - logging.info( - f"⏭️ Skipping old tweet from {tweet_time}" - ) + logging.info(f"⏭️ Skipping old tweet from {tweet_time}") continue - # --- Retweet filtering --- if tweet.is_retweet: logging.info( f"⏭️ Skipping retweet/repost: {tweet.tweet_url}" ) continue - canonical_tweet_url = canonicalize_tweet_url( - tweet.tweet_url - ) + canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url) if canonical_tweet_url and canonical_tweet_url in state.get( "posted_tweets", {} ): logging.info( - f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}" + f"⚡ Early skip due to known tweet URL in local state: " + f"{canonical_tweet_url}" ) continue @@ -2933,32 +3027,26 @@ def sync_feeds(args): ) except Exception as e: - logging.warning( - f"⚠️ Failed during cheap prefilter: {e}" - ) + logging.warning(f"⚠️ Failed during cheap prefilter: {e}") logging.info( f"⚡ {len(cheap_candidates)} tweets remain after cheap prefilter." ) with httpx.Client() as resolve_http_client: - for ( - tweet, - tweet_time, - canonical_tweet_url, - ) in cheap_candidates: + for tweet, tweet_time, canonical_tweet_url in cheap_candidates: try: ( full_clean_text, resolved_primary_external_url, - ) = build_effective_tweet_text( - tweet, resolve_http_client - ) + ) = build_effective_tweet_text(tweet, resolve_http_client) + normalized_text = normalize_post_text(full_clean_text) if not normalized_text and not tweet.media: logging.info( - f"⏭️ Skipping empty/blank tweet after enrichment from {tweet_time}" + f"⏭️ Skipping empty/blank tweet after enrichment " + f"from {tweet_time}" ) continue @@ -2969,9 +3057,7 @@ def sync_feeds(args): canonical_non_x_urls = set() if resolved_primary_external_url: canonical_non_x_urls.add( - canonicalize_url( - resolved_primary_external_url - ) + canonicalize_url(resolved_primary_external_url) ) for raw_url in ordered_non_x_urls: @@ -2984,19 +3070,12 @@ def sync_feeds(args): primary_non_x_url = None if resolved_primary_external_url: - primary_non_x_url = ( - resolved_primary_external_url - ) + primary_non_x_url = resolved_primary_external_url else: - primary_non_x_url = ( - extract_first_visible_non_x_url( - full_clean_text - ) + primary_non_x_url = extract_first_visible_non_x_url( + full_clean_text ) - if ( - not primary_non_x_url - and ordered_non_x_urls - ): + if not primary_non_x_url and ordered_non_x_urls: primary_non_x_url = ordered_non_x_urls[0] has_video = any( @@ -3039,23 +3118,23 @@ def sync_feeds(args): "has_photo": has_photo, } - is_dup_state, reason_state = ( - candidate_matches_state(candidate, state) + is_dup_state, reason_state = candidate_matches_state( + candidate, state ) if is_dup_state: logging.info( - f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}" + f"⏭️ Skipping candidate due to local state duplicate " + f"match on: {reason_state}" ) continue - is_dup_bsky, reason_bsky = ( - candidate_matches_existing_bsky( - candidate, recent_bsky_posts - ) + is_dup_bsky, reason_bsky = candidate_matches_existing_bsky( + candidate, recent_bsky_posts ) if is_dup_bsky: logging.info( - f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}" + f"⏭️ Skipping candidate due to recent Bluesky duplicate " + f"match on: {reason_bsky}" ) continue @@ -3082,9 +3161,7 @@ def sync_feeds(args): with sync_playwright() as p, httpx.Client() as media_http_client: browser = p.chromium.launch( headless=True, - args=[ - "--disable-blink-features=AutomationControlled" - ], + args=["--disable-blink-features=AutomationControlled"], ) context_kwargs = { "user_agent": ( @@ -3097,7 +3174,8 @@ def sync_feeds(args): if os.path.exists(browser_state_file): context_kwargs["storage_state"] = browser_state_file - context = browser.new_context(**context_kwargs) + # FIX #1 — renamed from 'context' to 'browser_context' + browser_context = browser.new_context(**context_kwargs) for candidate in candidate_tweets: tweet = candidate["tweet"] @@ -3112,10 +3190,12 @@ def sync_feeds(args): if dry_run: logging.info( - f" 📄 Text: {raw_text[:200]}{'...' if len(raw_text) > 200 else ''}" + f" 📄 Text: {raw_text[:200]}" + f"{'...' if len(raw_text) > 200 else ''}" ) logging.info( - f" 🔗 Primary external URL: {candidate.get('resolved_primary_external_url', 'None')}" + f" 🔗 Primary external URL: " + f"{candidate.get('resolved_primary_external_url', 'None')}" ) logging.info( f" 🃏 Card URL: {getattr(tweet, 'card_url', 'None')}" @@ -3139,8 +3219,25 @@ def sync_feeds(args): new_posts += 1 continue + # FIX #5 — fetch link metadata once here so we can pass the + # OG title to build_dynamic_alt AND reuse it inside + # build_external_link_embed, avoiding a duplicate HTTP request + # for the same URL. + link_meta_for_alt: dict = {} + if candidate.get("resolved_primary_external_url"): + try: + link_meta_for_alt = fetch_link_metadata( + candidate["resolved_primary_external_url"], + media_http_client, + ) + except Exception: + pass + rich_text = make_rich(raw_text) - dynamic_alt = build_dynamic_alt(full_clean_text) + dynamic_alt = build_dynamic_alt( + full_clean_text, + link_title=link_meta_for_alt.get("title"), + ) image_embeds = [] video_embed = None @@ -3162,30 +3259,26 @@ def sync_feeds(args): if video_media: if not tweet.tweet_url: logging.warning( - "⚠️ Tweet has video marker but no tweet URL. Skipping video." - ) - media_upload_failures.append( - "video:no_tweet_url" + "⚠️ Tweet has video marker but no tweet URL. " + "Skipping video." ) + media_upload_failures.append("video:no_tweet_url") else: - temp_video_base = ( - make_unique_video_temp_base( - tweet.tweet_url - ) - ) - temp_video_path = ( - f"{temp_video_base}.mp4" + temp_video_base = make_unique_video_temp_base( + tweet.tweet_url ) + temp_video_path = f"{temp_video_base}.mp4" try: real_video_url = ( extract_video_url_from_tweet_page( - context, tweet.tweet_url + browser_context, tweet.tweet_url ) ) if not real_video_url: logging.warning( - f"⚠️ Could not resolve playable video URL for {tweet.tweet_url}" + f"⚠️ Could not resolve playable video URL " + f"for {tweet.tweet_url}" ) media_upload_failures.append( f"video:resolve_failed:{tweet.tweet_url}" @@ -3193,46 +3286,39 @@ def sync_feeds(args): else: cropped_video_path = ( download_and_crop_video( - real_video_url, - temp_video_path, + real_video_url, temp_video_path ) ) if not cropped_video_path: logging.warning( - f"⚠️ Video download/crop failed for {tweet.tweet_url}" + f"⚠️ Video download/crop failed for " + f"{tweet.tweet_url}" ) media_upload_failures.append( f"video:crop_failed:{tweet.tweet_url}" ) else: - video_blob = ( - get_blob_from_file( - cropped_video_path, - bsky_client, - ) + video_blob = get_blob_from_file( + cropped_video_path, bsky_client ) if not video_blob: logging.warning( - f"⚠️ Video upload blob failed for {tweet.tweet_url}" + f"⚠️ Video upload blob failed for " + f"{tweet.tweet_url}" ) media_upload_failures.append( f"video:upload_failed:{tweet.tweet_url}" ) else: - video_embed = ( - build_video_embed( - video_blob, - dynamic_alt, - ) + video_embed = build_video_embed( + video_blob, dynamic_alt ) if not video_embed: media_upload_failures.append( f"video:embed_failed:{tweet.tweet_url}" ) finally: - remove_file_quietly( - temp_video_path - ) + remove_file_quietly(temp_video_path) remove_file_quietly( f"{temp_video_base}_source.mp4" ) @@ -3245,8 +3331,8 @@ def sync_feeds(args): if not video_embed: logging.warning( - "⚠️ Tweet contains video, but video could not be posted. " - "Skipping photo fallback for this tweet." + "⚠️ Tweet contains video, but video could not be " + "posted. Skipping photo fallback for this tweet." ) else: @@ -3280,27 +3366,34 @@ def sync_feeds(args): if candidate.get("looks_like_title_plus_url"): logging.info( f"🔗 Detected title+URL post style. " - f"Using resolved URL for external card: {candidate_url}" + f"Using resolved URL for external card: " + f"{candidate_url}" ) else: logging.info( - f"🔗 Using resolved first external URL for external card: {candidate_url}" + f"🔗 Using resolved first external URL for " + f"external card: {candidate_url}" ) + # FIX #5 — pass the already-fetched metadata so + # build_external_link_embed skips a duplicate HTTP fetch. external_embed = build_external_link_embed( candidate_url, bsky_client, media_http_client, fallback_title="Link", + prefetched_metadata=link_meta_for_alt or None, ) if external_embed: logging.info( - f"✅ Built external link card for URL: {candidate_url}" + f"✅ Built external link card for URL: " + f"{candidate_url}" ) else: logging.info( - f"ℹ️ Could not build external link card metadata for URL: {candidate_url}" + f"ℹ️ Could not build external link card metadata " + f"for URL: {candidate_url}" ) try: @@ -3355,35 +3448,29 @@ def sync_feeds(args): { "uri": bsky_uri, "text": raw_text, - "normalized_text": candidate[ - "normalized_text" - ], + "normalized_text": candidate["normalized_text"], "canonical_non_x_urls": candidate[ "canonical_non_x_urls" ], - "media_fingerprint": candidate[ - "media_fingerprint" - ], - "text_media_key": candidate[ - "text_media_key" - ], + "media_fingerprint": candidate["media_fingerprint"], + "text_media_key": candidate["text_media_key"], "created_at": arrow.utcnow().isoformat(), }, ) - recent_bsky_posts = recent_bsky_posts[ - :DEDUPE_BSKY_LIMIT - ] + recent_bsky_posts = recent_bsky_posts[:DEDUPE_BSKY_LIMIT] new_posts += 1 if media_upload_failures: logging.warning( - f"✅ Posted tweet to Bluesky with degraded media mode ({post_mode}). " + f"✅ Posted tweet to Bluesky with degraded media " + f"mode ({post_mode}). " f"Failed media items: {media_upload_failures}" ) else: logging.info( - f"✅ Posted new tweet to Bluesky with mode {post_mode}: {raw_text}" + f"✅ Posted new tweet to Bluesky with mode " + f"{post_mode}: {raw_text}" ) time.sleep(5) @@ -3410,23 +3497,33 @@ def main(): description="Twitter to Bluesky Sync" ) parser.add_argument( - "--twitter-username", help="Your Twitter login username" + "--twitter-username", + help="Your Twitter login username", ) parser.add_argument( - "--twitter-password", help="Your Twitter login password" + "--twitter-password", + help="Your Twitter login password", + # FIX #15 — password args are still supported for compatibility but + # the .env file is the recommended path; passwords passed via CLI + # are visible in `ps aux`. Consider removing these args and requiring + # env vars exclusively, or prompting with getpass for interactive use. ) parser.add_argument( "--twitter-email", help="Your Twitter email for security challenges", ) parser.add_argument( - "--twitter-handle", help="The Twitter account to scrape" + "--twitter-handle", + help="The Twitter account to scrape", ) parser.add_argument( - "--bsky-handle", help="Your Bluesky handle" + "--bsky-handle", + help="Your Bluesky handle", ) parser.add_argument( - "--bsky-password", help="Your Bluesky app password" + "--bsky-password", + help="Your Bluesky app password", + # FIX #15 — same note as --twitter-password above. ) parser.add_argument( "--bsky-base-url", @@ -3441,11 +3538,17 @@ def main(): "--dry-run", action="store_true", default=False, - help="Simulate sync without posting to Bluesky. Logs what would be posted.", + help=( + "Simulate sync without posting to Bluesky. " + "Logs what would be posted." + ), ) args = parser.parse_args() + # Resolve credentials: CLI args take priority, then env vars. + # FIX #15 — document that env vars are the secure path; CLI args expose + # secrets in the process list. Operators should prefer .env / env vars. args.twitter_username = args.twitter_username or os.getenv( "TWITTER_USERNAME" ) @@ -3454,9 +3557,7 @@ def main(): ) args.twitter_email = args.twitter_email or os.getenv("TWITTER_EMAIL") args.bsky_handle = args.bsky_handle or os.getenv("BSKY_HANDLE") - args.bsky_password = args.bsky_password or os.getenv( - "BSKY_APP_PASSWORD" - ) + args.bsky_password = args.bsky_password or os.getenv("BSKY_APP_PASSWORD") args.twitter_handle = ( args.twitter_handle or os.getenv("TWITTER_HANDLE") @@ -3487,23 +3588,22 @@ def main(): missing_args = [] if not args.twitter_username: - missing_args.append("--twitter-username") + missing_args.append("--twitter-username / TWITTER_USERNAME") if not args.twitter_password: - missing_args.append("--twitter-password") + missing_args.append("--twitter-password / TWITTER_PASSWORD") if not args.bsky_handle: - missing_args.append("--bsky-handle") + missing_args.append("--bsky-handle / BSKY_HANDLE") if not args.bsky_password: - missing_args.append("--bsky-password") + missing_args.append("--bsky-password / BSKY_APP_PASSWORD") if missing_args: logging.error( - f"❌ Missing credentials! You forgot to provide: {', '.join(missing_args)}" + f"❌ Missing credentials! You forgot to provide: " + f"{', '.join(missing_args)}" ) return - logging.info( - f"🤖 Bot started. Will check @{args.twitter_handle}" - ) + logging.info(f"🤖 Bot started. Will check @{args.twitter_handle}") logging.info( f"🌍 Posting destination base URL: {args.bsky_base_url}" )