diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 29a7a01..4aa324a 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -54,37 +54,36 @@ LINK_METADATA_TIMEOUT = 10 URL_RESOLVE_TIMEOUT = 12 PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000 SUBPROCESS_TIMEOUT_SECONDS = 180 -FFPROBE_TIMEOUT_SECONDS = 15 # FIX #6 — named constant for ffprobe probe timeout +FFPROBE_TIMEOUT_SECONDS = 15 DEFAULT_BSKY_BASE_URL = "https://bsky.social" -# FIX #11 — named constants replacing magic numbers scattered across the codebase -OG_TITLE_WAIT_TIMEOUT_MS = 7000 # ms to wait for og:title meta tag -PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0 # seconds to sleep after page.goto in resolvers -PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8 # seconds between idle-state polls -PLAYWRIGHT_IDLE_POLL_ROUNDS = 4 # number of idle-state poll rounds -PLAYWRIGHT_RETRY_SLEEP_S = 2.0 # seconds to sleep before retry interaction -VIDEO_PLAYER_WAIT_ROUNDS = 8 # rounds waiting for video URL after first click -VIDEO_PLAYER_RETRY_ROUNDS = 5 # rounds waiting for video URL after retry click -URL_TAIL_MIN_PREFIX_CHARS = 35 # minimum prefix chars before URL for tail detection -URL_TAIL_MAX_LOOKBACK_CHARS = 120 # generous lookback window when hashtags follow URL -URL_TAIL_MAX_CLAUSE_DISTANCE = 180 # max chars a clause boundary may be from URL start -DYNAMIC_ALT_MAX_LENGTH = 150 # max chars for dynamic alt text -TRUNCATE_MIN_PREFIX_CHARS = 20 # min prefix length before inserting ellipsis -SHORT_TWEET_OG_FETCH_THRESHOLD = 35 # tweets shorter than this get og:title enrichment -ORPHAN_DIGIT_MAX_DIGITS = 3 # max digit count for orphaned-digit-line detection -SESSION_FILE_PERMISSIONS = 0o600 # FIX #14 — restrictive permissions for session cookie file +OG_TITLE_WAIT_TIMEOUT_MS = 7000 +PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0 +PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8 +PLAYWRIGHT_IDLE_POLL_ROUNDS = 4 +PLAYWRIGHT_RETRY_SLEEP_S = 2.0 +VIDEO_PLAYER_WAIT_ROUNDS = 8 +VIDEO_PLAYER_RETRY_ROUNDS = 5 +URL_TAIL_MIN_PREFIX_CHARS = 35 +URL_TAIL_MAX_LOOKBACK_CHARS = 120 +URL_TAIL_MAX_CLAUSE_DISTANCE = 180 +DYNAMIC_ALT_MAX_LENGTH = 150 +TRUNCATE_MIN_PREFIX_CHARS = 20 +SHORT_TWEET_OG_FETCH_THRESHOLD = 35 +ORPHAN_DIGIT_MAX_DIGITS = 3 +SESSION_FILE_PERMISSIONS = 0o600 # --- Logging Setup --- logging.basicConfig( format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()], + handlers=[ + logging.FileHandler(LOG_PATH, encoding="utf-8"), + logging.StreamHandler(), + ], level=logging.INFO, ) -# --- Per-run caches for efficiency --- -# FIX #12 — caches are still module-level but now encapsulated in a class so they -# can be passed explicitly and are safe to reset between daemon cycles without -# relying on global mutation from arbitrary call sites. +# --- Per-run caches --- class _RunCache: def __init__(self): self.og_title: dict = {} @@ -146,7 +145,38 @@ def is_valid_url(url): def strip_trailing_url_punctuation(url): if not url: return url - return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip()) + # Strip a trailing hashtag-style fragment (#Word) that is really a social + # hashtag glued to the end of a URL with no space, e.g. + # https://cit.transit.gencat.cat#SCT → https://cit.transit.gencat.cat + # Only stripped when it starts with a letter so real anchors like + # /page#section-2 inside a longer sentence are left alone. + url = re.sub(r"#[A-Za-z]\w*$", "", url.strip()) + return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url) + + +def split_url_hashtag_suffix(text): + """ + Split a URL that has a hashtag fragment glued to it with no space, e.g.: + 'https://cit.transit.gencat.cat#SCT' + becomes: + 'https://cit.transit.gencat.cat #SCT' + + Only splits when the fragment looks like a social hashtag: starts with # + followed by a letter then word characters. The lookahead (?=\\s|$) ensures + we only act at a word boundary so mid-sentence anchors followed by more + URL path are left untouched. + """ + if not text: + return text + + fixed = re.sub( + r"(https?://[^\s#<>\"']+)(#[A-Za-z]\w*)(?=\s|$)", + r"\1 \2", + text, + ) + if fixed != text: + logging.info("🔧 Split hashtag suffix from URL in text") + return fixed def split_concatenated_urls(text): @@ -165,6 +195,8 @@ def repair_broken_urls(text): original = text text = split_concatenated_urls(text) + # Split glued hashtag suffixes before any rejoining passes + text = split_url_hashtag_suffix(text) text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE) @@ -186,6 +218,9 @@ def repair_broken_urls(text): ) text = split_concatenated_urls(text) + # Run hashtag split again after rejoining passes — the rejoining regex + # contains # in its character class so it can re-glue a fragment. + text = split_url_hashtag_suffix(text) if text != original: logging.info("🔧 Repaired broken URL wrapping in scraped text") @@ -232,7 +267,6 @@ def repair_broken_mentions(text): if is_blank_line(next_line): break - if is_mention_only_line(next_line): break @@ -256,7 +290,6 @@ def repair_broken_mentions(text): if is_blank_line(next_line): break - if is_mention_only_line(next_line): break @@ -287,8 +320,8 @@ def strip_line_edge_whitespace(text): lines = text.splitlines() cleaned_lines = [] - changed = False + for line in lines: cleaned = line.strip() if cleaned != line: @@ -316,11 +349,6 @@ def remove_trailing_ellipsis_line(text): def remove_orphaned_digit_lines_before_hashtags(text): - """ - Remove lines that contain only a number (e.g. '5') when they appear - immediately before a line starting with a hashtag. These are typically - scraped UI artifacts (image counts, engagement badges, etc.). - """ if not text: return text @@ -331,8 +359,6 @@ def remove_orphaned_digit_lines_before_hashtags(text): result = [] changed = False i = 0 - - # FIX #11 — use named constant ORPHAN_DIGIT_MAX_DIGITS instead of literal 3 orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}") while i < len(lines): @@ -519,7 +545,6 @@ def should_fetch_og_title(tweet): if "…" in text or text.endswith("..."): return True - # FIX #11 — use named constant SHORT_TWEET_OG_FETCH_THRESHOLD instead of literal 35 if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD: return True @@ -535,7 +560,7 @@ def fetch_tweet_og_title_text(tweet_url): return _cache.og_title[tweet_url] browser = None - browser_context = None # FIX #1 — renamed from 'context' to avoid collision + browser_context = None page = None try: @@ -562,8 +587,10 @@ def fetch_tweet_og_title_text(tweet_url): ) try: - # FIX #11 — use named constant OG_TITLE_WAIT_TIMEOUT_MS instead of literal 7000 - page.wait_for_selector('meta[property="og:title"]', timeout=OG_TITLE_WAIT_TIMEOUT_MS) + page.wait_for_selector( + 'meta[property="og:title"]', + timeout=OG_TITLE_WAIT_TIMEOUT_MS, + ) except Exception: pass @@ -629,7 +656,7 @@ def resolve_tco_with_httpx(url, http_client): def resolve_tco_with_playwright(url): browser = None - browser_context = None # FIX #1 — renamed from 'context' + browser_context = None page = None try: @@ -661,11 +688,9 @@ def resolve_tco_with_playwright(url): f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}" ) - # FIX #11 — use named constant PLAYWRIGHT_POST_GOTO_SLEEP_S time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S) final_url = canonicalize_url(page.url) - # FIX #11 — use named constants for poll rounds and sleep for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS): if final_url and is_external_non_x_url(final_url): break @@ -815,10 +840,6 @@ def extract_first_resolved_external_url( def resolve_card_url(card_url, http_client): - """ - Resolve a card URL (typically t.co) scraped from the tweet's link preview card. - Returns the final external URL or None. - """ if not card_url: return None @@ -869,9 +890,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): if is_x_or_twitter_domain(cleaned): replacements[raw_url] = "" - logging.info( - f"🧹 Removing X/Twitter URL from visible text: {cleaned}" - ) + logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}") continue final_url = cleaned @@ -927,9 +946,8 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False): if len(line_urls) > 1: prefix = re.sub(url_pattern, "", line).strip() kept_urls = [] - - # FIX #4 — local set per line, not shared outer state seen_in_line: set = set() + for url in line_urls: normalized = normalize_urlish_token(url) or url canonical = canonicalize_url(normalized) @@ -976,20 +994,15 @@ def build_effective_tweet_text(tweet, http_client): scraped_urls = extract_urls_from_text(scraped_text) og_urls = extract_urls_from_text(og_title_text) - if len(og_title_text) >= len(scraped_text) or ( - og_urls and not scraped_urls - ): + if len(og_title_text) >= len(scraped_text) or (og_urls and not scraped_urls): candidate_text = og_title_text logging.info("🧾 Using og:title-derived tweet text as primary content") candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text( - candidate_text, - http_client, - has_media=has_media, + candidate_text, http_client, has_media=has_media, ) candidate_text = clean_post_text(candidate_text) - # --- Resolve the card_url scraped from the tweet's link preview --- resolved_card_url = resolve_card_url( getattr(tweet, "card_url", None), http_client ) @@ -1102,12 +1115,7 @@ def find_tail_preservation_start(text, primary_non_x_url): candidates = [url_pos] clause_patterns = [ - r"\.\s+", - r":\s+", - r";\s+", - r"!\s+", - r"\?\s+", - r",\s+", + r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+", ] before = text[:url_pos] @@ -1120,22 +1128,23 @@ def find_tail_preservation_start(text, primary_non_x_url): candidates.append(last_newline + 1) if has_hashtag_after_url: - # FIX #11 — use named constant URL_TAIL_MAX_LOOKBACK_CHARS instead of literal 120 generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS) while generous_start > 0 and text[generous_start] not in {" ", "\n"}: generous_start -= 1 candidates.append(generous_start) - # FIX #11 — use named constant URL_TAIL_MAX_CLAUSE_DISTANCE instead of literal 180 reasonable_candidates = [ - c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE + c for c in candidates + if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE ] if reasonable_candidates: start = min(reasonable_candidates, key=lambda c: (url_pos - c)) - # FIX #11 — use named constant URL_TAIL_MIN_PREFIX_CHARS instead of literal 35 if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS: - farther = [c for c in reasonable_candidates if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS] + farther = [ + c for c in reasonable_candidates + if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS + ] if farther: start = min(farther, key=lambda c: (url_pos - c)) return start @@ -1149,15 +1158,12 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH): truncated = text[: max_length - 3] last_space = truncated.rfind(" ") - # FIX #11 — use named constant TRUNCATE_MIN_PREFIX_CHARS instead of literal 0 if last_space > TRUNCATE_MIN_PREFIX_CHARS: return truncated[:last_space] + "..." return truncated + "..." -def truncate_text_preserving_tail( - text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH -): +def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH): if ( not text or tail_start is None @@ -1176,11 +1182,9 @@ def truncate_text_preserving_tail( reserve = len(tail) + 4 if reserve >= max_length: shortened_tail = tail[-(max_length - 3) :].strip() - first_space = shortened_tail.find(" ") if 0 <= first_space <= 30: shortened_tail = shortened_tail[first_space + 1 :].strip() - return f"...{shortened_tail}" available_prefix = max_length - reserve @@ -1234,10 +1238,7 @@ def choose_final_visible_text( text_without_url = remove_url_from_visible_text( text, primary_non_x_url ).strip() - if ( - text_without_url - and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH - ): + if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH: logging.info( "🔗 Keeping full visible text by removing long external URL from body and using external card" ) @@ -1267,7 +1268,6 @@ def build_media_fingerprint(tweet): for media in tweet.media: media_type = getattr(media, "type", "unknown") media_url = getattr(media, "media_url_https", "") or "" - stable_value = media_url if media_type == "photo": @@ -1338,9 +1338,7 @@ def build_text_media_key(normalized_text, media_fingerprint): def create_bsky_client(base_url, handle, password): normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/") - logging.info( - f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}" - ) + logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}") try: client = Client(base_url=normalized_base_url) @@ -1355,9 +1353,7 @@ def create_bsky_client(base_url, handle, password): elif hasattr(client, "_base_url"): client._base_url = normalized_base_url except Exception as e: - logging.warning( - f"⚠️ Could not apply custom base URL cleanly: {e}" - ) + logging.warning(f"⚠️ Could not apply custom base URL cleanly: {e}") client.login(handle, password) return client @@ -1426,16 +1422,12 @@ def remember_posted_tweet(state, candidate, bsky_uri=None): "canonical_tweet_url": canonical_tweet_url, "normalized_text": candidate["normalized_text"], "raw_text": candidate["raw_text"], - "full_clean_text": candidate.get( - "full_clean_text", candidate["raw_text"] - ), + "full_clean_text": candidate.get("full_clean_text", candidate["raw_text"]), "media_fingerprint": candidate["media_fingerprint"], "text_media_key": candidate["text_media_key"], "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]), "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []), - "resolved_primary_external_url": candidate.get( - "resolved_primary_external_url" - ), + "resolved_primary_external_url": candidate.get("resolved_primary_external_url"), "bsky_uri": bsky_uri, "tweet_created_on": candidate["tweet"].created_on, "tweet_url": candidate["tweet"].tweet_url, @@ -1483,15 +1475,16 @@ def prune_state(state, max_entries=5000): sortable.sort(key=lambda x: x[1], reverse=True) keep_keys = {key for key, _ in sortable[:max_entries]} - new_posted_tweets = {} - for key, record in posted_tweets.items(): - if key in keep_keys: - new_posted_tweets[key] = record - - new_posted_by_bsky_uri = {} - for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items(): - if key in keep_keys: - new_posted_by_bsky_uri[bsky_uri] = key + new_posted_tweets = { + key: record + for key, record in posted_tweets.items() + if key in keep_keys + } + new_posted_by_bsky_uri = { + bsky_uri: key + for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items() + if key in keep_keys + } state["posted_tweets"] = new_posted_tweets state["posted_by_bsky_uri"] = new_posted_by_bsky_uri @@ -1540,9 +1533,7 @@ def get_recent_bsky_posts(client, handle, limit=30): canonical_non_x_urls = set() for url in urls: - if not is_tco_domain(url) and not is_x_or_twitter_domain( - url - ): + if not is_tco_domain(url) and not is_x_or_twitter_domain(url): canonical = canonicalize_url( normalize_urlish_token(url) or url ) @@ -1572,7 +1563,6 @@ def get_recent_bsky_posts(client, handle, limit=30): ) except Exception as e: - # FIX #9 — elevated to WARNING so operators notice live dedup is disabled logging.warning( f"⚠️ Could not fetch recent Bluesky posts for duplicate detection " f"(live dedup disabled for this cycle): {e}" @@ -1648,7 +1638,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): continue else: logging.warning( - f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}" + f"❌ Exhausted blob upload retries for {media_label} " + f"after rate limiting: {repr(e)}" ) break @@ -1657,12 +1648,11 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES ): transient_attempts += 1 - wait_seconds = ( - BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts - ) + wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts logging.warning( f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. " - f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s." + f"Transient retry {transient_attempts}/" + f"{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s." ) time.sleep(wait_seconds) continue @@ -1674,9 +1664,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): logging.warning( f"Upload response status: {e.response.status_code}" ) - logging.warning( - f"Upload response body: {e.response.text}" - ) + logging.warning(f"Upload response body: {e.response.text}") except Exception: pass @@ -1685,11 +1673,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): logging.warning(f"Could not upload {media_label}: {repr(last_exception)}") return None + def send_post_with_retry(client, **kwargs): - """ - Wrapper around client.send_post() with retry logic for transient errors - and rate limiting. - """ last_exception = None for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1): @@ -1723,10 +1708,7 @@ def send_post_with_retry(client, **kwargs): ) raise - if ( - is_transient_error(e) - and attempt < BSKY_SEND_POST_MAX_RETRIES - ): + if is_transient_error(e) and attempt < BSKY_SEND_POST_MAX_RETRIES: wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt logging.warning( f"⏳ Transient send_post failure: {repr(e)}. " @@ -1739,8 +1721,6 @@ def send_post_with_retry(client, **kwargs): raise last_exception - -# --- Image Compression --- def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): try: with Image.open(io.BytesIO(image_bytes)) as img: @@ -1770,12 +1750,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): progressive=True, ) data = out.getvalue() - logging.info( f"🖼️ Post image candidate size at JPEG quality {quality}: " f"{len(data)} bytes ({len(data) / 1024:.2f} KB)" ) - if len(data) <= max_bytes: return data @@ -1802,12 +1780,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): progressive=True, ) data = out.getvalue() - logging.info( f"🖼️ Post image resized to <= {target_dim}px at quality {quality}: " f"{len(data)} bytes ({len(data) / 1024:.2f} KB)" ) - if len(data) <= max_bytes: return data @@ -1865,9 +1841,7 @@ def get_blob_from_url(media_url, client, http_client): ) return None - return upload_blob_with_retry( - client, upload_bytes, media_label=media_url - ) + return upload_blob_with_retry(client, upload_bytes, media_label=media_url) except Exception as e: logging.warning(f"Could not fetch media {media_url}: {repr(e)}") @@ -1902,23 +1876,17 @@ def get_blob_from_file(file_path, client): with open(file_path, "rb") as f: binary_data = f.read() - return upload_blob_with_retry( - client, binary_data, media_label=file_path - ) + return upload_blob_with_retry(client, binary_data, media_label=file_path) except Exception as e: - logging.warning( - f"Could not upload local file {file_path}: {repr(e)}" - ) + logging.warning(f"Could not upload local file {file_path}: {repr(e)}") if hasattr(e, "response") and e.response is not None: try: logging.warning( f"Upload response status: {e.response.status_code}" ) - logging.warning( - f"Upload response body: {e.response.text}" - ) + logging.warning(f"Upload response body: {e.response.text}") except Exception: pass @@ -1956,12 +1924,10 @@ def compress_external_thumb_to_limit( progressive=True, ) data = out.getvalue() - logging.info( f"🖼️ External thumb candidate size at JPEG quality {quality}: " f"{len(data) / 1024:.2f} KB" ) - if len(data) <= max_bytes: return data @@ -1988,19 +1954,15 @@ def compress_external_thumb_to_limit( progressive=True, ) data = out.getvalue() - logging.info( f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: " f"{len(data) / 1024:.2f} KB" ) - if len(data) <= max_bytes: return data except Exception as e: - logging.warning( - f"Could not compress external thumbnail: {repr(e)}" - ) + logging.warning(f"Could not compress external thumbnail: {repr(e)}") return None @@ -2101,20 +2063,18 @@ def fetch_link_metadata(url, http_client): } except Exception as e: - logging.warning( - f"Could not fetch link metadata for {url}: {repr(e)}" - ) + logging.warning(f"Could not fetch link metadata for {url}: {repr(e)}") return {} def build_external_link_embed( - url, client, http_client, fallback_title="Link", - prefetched_metadata=None, + url, client, http_client, fallback_title="Link", prefetched_metadata=None, ): - # FIX #5 — accept pre-fetched metadata to avoid a duplicate HTTP request - # when the caller already fetched it for build_dynamic_alt. - link_metadata = prefetched_metadata if prefetched_metadata is not None \ + link_metadata = ( + prefetched_metadata + if prefetched_metadata is not None else fetch_link_metadata(url, http_client) + ) thumb_blob = None if link_metadata.get("image"): @@ -2144,12 +2104,10 @@ def build_external_link_embed( def make_rich(content): - # FIX #10 — note explaining @mention limitation. - # Bluesky supports native @mention facets, but resolving a Twitter handle - # to a Bluesky DID requires an external lookup (e.g. via the atproto - # identity resolution API). That mapping is not available here, so - # @mentions are intentionally passed through as plain text. If you add a - # handle-mapping table in the future, call + # NOTE: Bluesky supports native @mention facets, but resolving a Twitter + # handle to a Bluesky DID requires an external lookup. That mapping is not + # available here so @mentions are passed through as plain text. If you add + # a handle-mapping table in the future, call # text_builder.mention(word, did) here instead of text_builder.text(word). text_builder = client_utils.TextBuilder() content = clean_post_text(content) @@ -2178,8 +2136,7 @@ def make_rich(content): clean_url_value = clean_url(normalized_candidate) if clean_url_value and is_valid_url(clean_url_value): - display_text = cleaned_word - text_builder.link(display_text, clean_url_value) + text_builder.link(cleaned_word, clean_url_value) trailing = word[len(cleaned_word):] if trailing: text_builder.text(trailing) @@ -2209,8 +2166,6 @@ def make_rich(content): def build_dynamic_alt(raw_text, link_title=None): - # FIX #5 — accept optional link_title so URL-only tweets get a richer alt - # instead of always falling back to the generic "Attached video or image" string. dynamic_alt = clean_post_text(raw_text) dynamic_alt = dynamic_alt.replace("\n", " ").strip() dynamic_alt = re.sub( @@ -2220,7 +2175,6 @@ def build_dynamic_alt(raw_text, link_title=None): if not dynamic_alt and link_title: dynamic_alt = link_title.strip() - # FIX #11 — use named constant DYNAMIC_ALT_MAX_LENGTH instead of literal 150 if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH: dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..." elif not dynamic_alt: @@ -2231,9 +2185,7 @@ def build_dynamic_alt(raw_text, link_title=None): def build_video_embed(video_blob, alt_text): try: - return models.AppBskyEmbedVideo.Main( - video=video_blob, alt=alt_text - ) + return models.AppBskyEmbedVideo.Main(video=video_blob, alt=alt_text) except AttributeError: logging.error( "❌ Your atproto version does not support AppBskyEmbedVideo. Upgrade atproto." @@ -2246,7 +2198,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): tweets = [] state_file = "twitter_browser_state.json" - # FIX #14 — enforce restrictive permissions on the session cookie file if os.path.exists(state_file): try: os.chmod(state_file, SESSION_FILE_PERMISSIONS) @@ -2266,15 +2217,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): "Chrome/145.0.7632.6 Safari/537.36" ) - # FIX #1 — all Playwright browser context variables renamed to - # 'browser_context' throughout this function to eliminate the name - # collision with the 'context_text' / 'social_context_el' variables - # used inside the per-article parsing loop below. browser_context = None needs_login = True - - # FIX #7 — track the session-check page explicitly so we can close - # it before opening the profile scrape page, preventing a page leak. session_check_page = None if os.path.exists(state_file): @@ -2302,15 +2246,12 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): logging.warning( "⚠️ Saved session expired or invalid. Re-logging in..." ) - # FIX #7 — close the check page before closing the context session_check_page.close() session_check_page = None browser_context.close() browser_context = None os.remove(state_file) - # FIX #7 — always close the session-check page before opening the - # profile page, whether a re-login was needed or not. if session_check_page is not None: session_check_page.close() session_check_page = None @@ -2391,7 +2332,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): time.sleep(3) browser_context.storage_state(path=state_file) - # FIX #14 — set restrictive permissions immediately after writing try: os.chmod(state_file, SESSION_FILE_PERMISSIONS) except Exception as chmod_err: @@ -2408,7 +2348,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): browser.close() return [] - # FIX #7 — close the login page cleanly before opening scrape page login_page.close() logging.info( @@ -2446,10 +2385,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): else href ) - # --- Retweet detection --- is_retweet = False try: - # FIX #1 — renamed from 'context' to 'social_context_el' social_context_el = article.locator( '[data-testid="socialContext"]' ).first @@ -2498,7 +2435,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): if video_locators: media_urls.append((tweet_url or "", "video")) - # --- Card URL extraction (link preview card) --- card_url = None try: card_locator = article.locator( @@ -2520,9 +2456,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): '[data-testid="card.wrapper"] [role="link"]' ).first if card_role_link.is_visible(): - card_a = card_role_link.locator( - "a[href]" - ).first + card_a = card_role_link.locator("a[href]").first if card_a.is_visible(): card_href = card_a.get_attribute("href") if card_href: @@ -2545,9 +2479,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): ) except Exception as e: - logging.warning( - f"⚠️ Failed to parse a specific tweet: {e}" - ) + logging.warning(f"⚠️ Failed to parse a specific tweet: {e}") continue except Exception as e: @@ -2560,7 +2492,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): # --- Video Extraction & Processing --- def extract_video_url_from_tweet_page(browser_context, tweet_url): - # FIX #1 — parameter renamed from 'context' to 'browser_context' page = browser_context.new_page() best_m3u8_url = None best_video_mp4_url = None @@ -2573,10 +2504,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url): "/aud/" in url_l or "/audio/" in url_l or "mp4a" in url_l - or ( - "audio/" in content_type_l - and "video/" not in content_type_l - ) + or ("audio/" in content_type_l and "video/" not in content_type_l) ) def handle_response(response): @@ -2649,7 +2577,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url): else: logging.warning("⚠️ No video player locator found on tweet page") - # FIX #11 — use named constant VIDEO_PLAYER_WAIT_ROUNDS for _ in range(VIDEO_PLAYER_WAIT_ROUNDS): if current_best(): break @@ -2661,7 +2588,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url): ) try: player.click(force=True, timeout=5000) - # FIX #11 — use named constant PLAYWRIGHT_RETRY_SLEEP_S time.sleep(PLAYWRIGHT_RETRY_SLEEP_S) except Exception as e: logging.info(f"⚠️ Retry click failed: {e}") @@ -2672,7 +2598,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url): except Exception: pass - # FIX #11 — use named constant VIDEO_PLAYER_RETRY_ROUNDS for _ in range(VIDEO_PLAYER_RETRY_ROUNDS): if current_best(): break @@ -2698,12 +2623,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url): def _probe_video_duration(file_path): - """ - FIX #6 — Use ffprobe via subprocess instead of VideoFileClip to get video - duration. This avoids a potential hang on corrupt/truncated files since we - apply a hard timeout to the subprocess call. - Returns duration in seconds as a float, or raises RuntimeError on failure. - """ probe_cmd = [ "ffprobe", "-v", "error", @@ -2739,9 +2658,7 @@ def download_and_crop_video(video_url, output_path): temp_output = output_path.replace(".mp4", "_compressed.mp4") try: - logging.info( - f"⬇️ Downloading video source with ffmpeg: {video_url}" - ) + logging.info(f"⬇️ Downloading video source with ffmpeg: {video_url}") video_url_l = video_url.lower() @@ -2781,15 +2698,11 @@ def download_and_crop_video(video_url, output_path): not os.path.exists(temp_input) or os.path.getsize(temp_input) == 0 ): - logging.error( - "❌ Downloaded video source file is missing or empty." - ) + logging.error("❌ Downloaded video source file is missing or empty.") return None logging.info(f"✅ Video downloaded: {temp_input}") - # FIX #6 — probe duration with ffprobe (hard timeout) instead of - # VideoFileClip, which can hang indefinitely on corrupt files. try: duration = _probe_video_duration(temp_input) except RuntimeError as probe_err: @@ -2797,16 +2710,11 @@ def download_and_crop_video(video_url, output_path): return None if duration <= 0: - logging.error( - "❌ Downloaded video has invalid or unknown duration." - ) + logging.error("❌ Downloaded video has invalid or unknown duration.") return None end_time = min(VIDEO_MAX_DURATION_SECONDS, duration) - # FIX #2 — wrap VideoFileClip usage in nested try/finally blocks so - # both the source clip and the subclip handles are always closed, even - # if write_videofile raises an exception mid-way. video_clip = VideoFileClip(temp_input) try: if hasattr(video_clip, "subclipped"): @@ -2825,9 +2733,9 @@ def download_and_crop_video(video_url, output_path): logger=None, ) finally: - cropped_clip.close() # FIX #2 — always close subclip + cropped_clip.close() finally: - video_clip.close() # FIX #2 — always close source clip + video_clip.close() if ( not os.path.exists(temp_trimmed) @@ -2898,8 +2806,6 @@ def download_and_crop_video(video_url, output_path): finally: remove_file_quietly(temp_input) remove_file_quietly(temp_trimmed) - # temp_output was either renamed to output_path via os.replace() - # or never created; remove_file_quietly is a no-op if it doesn't exist. remove_file_quietly(temp_output) @@ -2928,6 +2834,7 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): return False, None +# --- Main Sync Logic --- def sync_feeds(args): logging.info("🔄 Starting sync cycle...") @@ -2935,14 +2842,10 @@ def sync_feeds(args): bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS if dry_run: - logging.info( - "🧪 DRY RUN MODE — no posts will be created on Bluesky." - ) + logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.") try: state = load_state(STATE_PATH) - # FIX #8 — prune on load so the state file never grows unbounded - # between runs, not only after individual posts. state = prune_state(state, max_entries=5000) tweets = scrape_tweets_via_playwright( @@ -2988,9 +2891,8 @@ def sync_feeds(args): logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}") candidate_tweets = [] - - # --- Cheap prefilter before expensive processing --- cheap_candidates = [] + for tweet in reversed(tweets): try: tweet_time = arrow.get(tweet.created_on) @@ -3064,9 +2966,7 @@ def sync_feeds(args): if not is_tco_domain( raw_url ) and not is_x_or_twitter_domain(raw_url): - canonical_non_x_urls.add( - canonicalize_url(raw_url) - ) + canonical_non_x_urls.add(canonicalize_url(raw_url)) primary_non_x_url = None if resolved_primary_external_url: @@ -3174,7 +3074,6 @@ def sync_feeds(args): if os.path.exists(browser_state_file): context_kwargs["storage_state"] = browser_state_file - # FIX #1 — renamed from 'context' to 'browser_context' browser_context = browser.new_context(**context_kwargs) for candidate in candidate_tweets: @@ -3219,10 +3118,6 @@ def sync_feeds(args): new_posts += 1 continue - # FIX #5 — fetch link metadata once here so we can pass the - # OG title to build_dynamic_alt AND reuse it inside - # build_external_link_embed, avoiding a duplicate HTTP request - # for the same URL. link_meta_for_alt: dict = {} if candidate.get("resolved_primary_external_url"): try: @@ -3284,10 +3179,8 @@ def sync_feeds(args): f"video:resolve_failed:{tweet.tweet_url}" ) else: - cropped_video_path = ( - download_and_crop_video( - real_video_url, temp_video_path - ) + cropped_video_path = download_and_crop_video( + real_video_url, temp_video_path ) if not cropped_video_path: logging.warning( @@ -3375,8 +3268,6 @@ def sync_feeds(args): f"external card: {candidate_url}" ) - # FIX #5 — pass the already-fetched metadata so - # build_external_link_embed skips a duplicate HTTP fetch. external_embed = build_external_link_embed( candidate_url, bsky_client, @@ -3489,24 +3380,19 @@ def sync_feeds(args): except Exception as e: logging.error(f"❌ Error during sync cycle: {e}") - def main(): load_dotenv() - parser = argparse.ArgumentParser( - description="Twitter to Bluesky Sync" - ) + parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync") parser.add_argument( "--twitter-username", help="Your Twitter login username", ) parser.add_argument( "--twitter-password", + # NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`. + # Prefer setting TWITTER_PASSWORD in your .env file instead. help="Your Twitter login password", - # FIX #15 — password args are still supported for compatibility but - # the .env file is the recommended path; passwords passed via CLI - # are visible in `ps aux`. Consider removing these args and requiring - # env vars exclusively, or prompting with getpass for interactive use. ) parser.add_argument( "--twitter-email", @@ -3522,8 +3408,9 @@ def main(): ) parser.add_argument( "--bsky-password", + # NOTE (FIX #15): same warning as --twitter-password above. + # Prefer setting BSKY_APP_PASSWORD in your .env file instead. help="Your Bluesky app password", - # FIX #15 — same note as --twitter-password above. ) parser.add_argument( "--bsky-base-url", @@ -3547,8 +3434,8 @@ def main(): args = parser.parse_args() # Resolve credentials: CLI args take priority, then env vars. - # FIX #15 — document that env vars are the secure path; CLI args expose - # secrets in the process list. Operators should prefer .env / env vars. + # FIX #15 — env vars are the secure path; CLI args expose secrets in + # the process list. Operators should prefer .env / environment variables. args.twitter_username = args.twitter_username or os.getenv( "TWITTER_USERNAME" ) @@ -3620,3 +3507,4 @@ def main(): if __name__ == "__main__": main() +