Added #fix

2026-04-13 19:33:59 +00:00
parent 6f67822e7e
commit 3810cef150
1 changed files with 140 additions and 252 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -54,37 +54,36 @@ LINK_METADATA_TIMEOUT = 10
 URL_RESOLVE_TIMEOUT = 12
 PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
 SUBPROCESS_TIMEOUT_SECONDS = 180
-FFPROBE_TIMEOUT_SECONDS = 15          # FIX #6 — named constant for ffprobe probe timeout
+FFPROBE_TIMEOUT_SECONDS = 15
 DEFAULT_BSKY_BASE_URL = "https://bsky.social"
-# FIX #11 — named constants replacing magic numbers scattered across the codebase
+OG_TITLE_WAIT_TIMEOUT_MS = 7000
-OG_TITLE_WAIT_TIMEOUT_MS = 7000       # ms to wait for og:title meta tag
+PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0
-PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0    # seconds to sleep after page.goto in resolvers
+PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8
-PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8    # seconds between idle-state polls
+PLAYWRIGHT_IDLE_POLL_ROUNDS = 4
-PLAYWRIGHT_IDLE_POLL_ROUNDS = 4       # number of idle-state poll rounds
+PLAYWRIGHT_RETRY_SLEEP_S = 2.0
-PLAYWRIGHT_RETRY_SLEEP_S = 2.0        # seconds to sleep before retry interaction
+VIDEO_PLAYER_WAIT_ROUNDS = 8
-VIDEO_PLAYER_WAIT_ROUNDS = 8          # rounds waiting for video URL after first click
+VIDEO_PLAYER_RETRY_ROUNDS = 5
-VIDEO_PLAYER_RETRY_ROUNDS = 5         # rounds waiting for video URL after retry click
+URL_TAIL_MIN_PREFIX_CHARS = 35
-URL_TAIL_MIN_PREFIX_CHARS = 35        # minimum prefix chars before URL for tail detection
+URL_TAIL_MAX_LOOKBACK_CHARS = 120
-URL_TAIL_MAX_LOOKBACK_CHARS = 120     # generous lookback window when hashtags follow URL
+URL_TAIL_MAX_CLAUSE_DISTANCE = 180
-URL_TAIL_MAX_CLAUSE_DISTANCE = 180    # max chars a clause boundary may be from URL start
+DYNAMIC_ALT_MAX_LENGTH = 150
-DYNAMIC_ALT_MAX_LENGTH = 150          # max chars for dynamic alt text
+TRUNCATE_MIN_PREFIX_CHARS = 20
-TRUNCATE_MIN_PREFIX_CHARS = 20        # min prefix length before inserting ellipsis
+SHORT_TWEET_OG_FETCH_THRESHOLD = 35
-SHORT_TWEET_OG_FETCH_THRESHOLD = 35   # tweets shorter than this get og:title enrichment
+ORPHAN_DIGIT_MAX_DIGITS = 3
-ORPHAN_DIGIT_MAX_DIGITS = 3           # max digit count for orphaned-digit-line detection
+SESSION_FILE_PERMISSIONS = 0o600
 SESSION_FILE_PERMISSIONS = 0o600      # FIX #14 — restrictive permissions for session cookie file
 # --- Logging Setup ---
 logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
-    handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()],
+    handlers=[
        logging.FileHandler(LOG_PATH, encoding="utf-8"),
        logging.StreamHandler(),
    ],
    level=logging.INFO,
 )
-# --- Per-run caches for efficiency ---
+# --- Per-run caches ---
 # FIX #12 — caches are still module-level but now encapsulated in a class so they
 # can be passed explicitly and are safe to reset between daemon cycles without
 # relying on global mutation from arbitrary call sites.
 class _RunCache:
    def __init__(self):
        self.og_title: dict = {}
@@ -146,7 +145,38 @@ def is_valid_url(url):
 def strip_trailing_url_punctuation(url):
    """
    Remove trailing sentence punctuation and a glued social hashtag from a URL.

    Falsy input (None or an empty string) is returned unchanged. Any other
    value is stripped of surrounding whitespace and cleaned in three passes:

    1. trailing whitespace and punctuation (ellipsis . , ; : ! ? ) ] quotes);
    2. one trailing hashtag-style fragment (#Word) glued to the URL with no
       space, e.g. https://cit.transit.gencat.cat#SCT becomes
       https://cit.transit.gencat.cat;
    3. trailing punctuation again.

    The fragment is only removed when it starts with an ASCII letter and
    consists solely of word characters up to the end of the string, so an
    anchor such as /page#section-2 is left alone.
    """
    if not url:
        return url
    trailing_punctuation = r"[\s…\.,;:!?)\]\"']+$"
    # Punctuation goes first: the hashtag pattern is anchored at the end of
    # the string, so 'https://host#Tag.' would otherwise keep its '#Tag'.
    cleaned = re.sub(trailing_punctuation, "", url.strip())
    cleaned = re.sub(r"#[A-Za-z]\w*$", "", cleaned)
    # Dropping the hashtag can expose punctuation that sat in front of it
    # (e.g. 'https://host,#Tag'), hence the second punctuation pass.
    return re.sub(trailing_punctuation, "", cleaned)
 def split_url_hashtag_suffix(text):
    """
    Insert a space between a URL and a social hashtag glued to its end.

    Example: 'https://cit.transit.gencat.cat#SCT' becomes
    'https://cit.transit.gencat.cat #SCT'.

    A fragment is treated as a hashtag only when it is '#' followed by an
    ASCII letter and then word characters, and when it is followed by
    whitespace or the end of the text, optionally after closing sentence
    punctuation (so '...#SCT.' at the end of a sentence is split as well).
    Anchors that continue with more characters, such as '#section-2' or
    '#b.html', are left untouched.

    Falsy input is returned unchanged. An info line is logged whenever the
    text was modified. Returns the (possibly modified) text.
    """
    if not text:
        return text
    fixed = re.sub(
        # group 1: URL up to the first '#'; group 2: the hashtag candidate.
        # The lookahead skips trailing punctuation but still demands
        # whitespace or end-of-text, i.e. a real word boundary.
        r"(https?://[^\s#<>\"']+)(#[A-Za-z]\w*)(?=[…\.,;:!?)\]\"']*(?:\s|$))",
        r"\1 \2",
        text,
    )
    if fixed != text:
        logging.info("🔧 Split hashtag suffix from URL in text")
    return fixed
 def split_concatenated_urls(text):
@@ -165,6 +195,8 @@ def repair_broken_urls(text):
    original = text
    text = split_concatenated_urls(text)
    # Split glued hashtag suffixes before any rejoining passes
    text = split_url_hashtag_suffix(text)
    text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
@@ -186,6 +218,9 @@ def repair_broken_urls(text):
    )
    text = split_concatenated_urls(text)
    # Run hashtag split again after rejoining passes — the rejoining regex
    # contains # in its character class so it can re-glue a fragment.
    text = split_url_hashtag_suffix(text)
    if text != original:
        logging.info("🔧 Repaired broken URL wrapping in scraped text")
@@ -232,7 +267,6 @@ def repair_broken_mentions(text):
                if is_blank_line(next_line):
                    break
                if is_mention_only_line(next_line):
                    break
@@ -256,7 +290,6 @@ def repair_broken_mentions(text):
                if is_blank_line(next_line):
                    break
                if is_mention_only_line(next_line):
                    break
@@ -287,8 +320,8 @@ def strip_line_edge_whitespace(text):
    lines = text.splitlines()
    cleaned_lines = []
    changed = False
    for line in lines:
        cleaned = line.strip()
        if cleaned != line:
@@ -316,11 +349,6 @@ def remove_trailing_ellipsis_line(text):
 def remove_orphaned_digit_lines_before_hashtags(text):
    """
    Remove lines that contain only a number (e.g. '5') when they appear
    immediately before a line starting with a hashtag. These are typically
    scraped UI artifacts (image counts, engagement badges, etc.).
    """
    if not text:
        return text
@@ -331,8 +359,6 @@ def remove_orphaned_digit_lines_before_hashtags(text):
    result = []
    changed = False
    i = 0
    # FIX #11 — use named constant ORPHAN_DIGIT_MAX_DIGITS instead of literal 3
    orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}")
    while i < len(lines):
@@ -519,7 +545,6 @@ def should_fetch_og_title(tweet):
    if "…" in text or text.endswith("..."):
        return True
    # FIX #11 — use named constant SHORT_TWEET_OG_FETCH_THRESHOLD instead of literal 35
    if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD:
        return True
@@ -535,7 +560,7 @@ def fetch_tweet_og_title_text(tweet_url):
        return _cache.og_title[tweet_url]
    browser = None
-    browser_context = None  # FIX #1 — renamed from 'context' to avoid collision
+    browser_context = None
    page = None
    try:
@@ -562,8 +587,10 @@ def fetch_tweet_og_title_text(tweet_url):
            )
            try:
-                # FIX #11 — use named constant OG_TITLE_WAIT_TIMEOUT_MS instead of literal 7000
+                page.wait_for_selector(
-                page.wait_for_selector('meta[property="og:title"]', timeout=OG_TITLE_WAIT_TIMEOUT_MS)
+                    'meta[property="og:title"]',
                    timeout=OG_TITLE_WAIT_TIMEOUT_MS,
                )
            except Exception:
                pass
@@ -629,7 +656,7 @@ def resolve_tco_with_httpx(url, http_client):
 def resolve_tco_with_playwright(url):
    browser = None
-    browser_context = None  # FIX #1 — renamed from 'context'
+    browser_context = None
    page = None
    try:
@@ -661,11 +688,9 @@ def resolve_tco_with_playwright(url):
                    f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}"
                )
            # FIX #11 — use named constant PLAYWRIGHT_POST_GOTO_SLEEP_S
            time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S)
            final_url = canonicalize_url(page.url)
            # FIX #11 — use named constants for poll rounds and sleep
            for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS):
                if final_url and is_external_non_x_url(final_url):
                    break
@@ -815,10 +840,6 @@ def extract_first_resolved_external_url(
 def resolve_card_url(card_url, http_client):
    """
    Resolve a card URL (typically t.co) scraped from the tweet's link preview card.
    Returns the final external URL or None.
    """
    if not card_url:
        return None
@@ -869,9 +890,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
        if is_x_or_twitter_domain(cleaned):
            replacements[raw_url] = ""
-            logging.info(
+            logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}")
                f"🧹 Removing X/Twitter URL from visible text: {cleaned}"
            )
            continue
        final_url = cleaned
@@ -927,9 +946,8 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
        if len(line_urls) > 1:
            prefix = re.sub(url_pattern, "", line).strip()
            kept_urls = []
            # FIX #4 — local set per line, not shared outer state
            seen_in_line: set = set()
            for url in line_urls:
                normalized = normalize_urlish_token(url) or url
                canonical = canonicalize_url(normalized)
@@ -976,20 +994,15 @@ def build_effective_tweet_text(tweet, http_client):
        scraped_urls = extract_urls_from_text(scraped_text)
        og_urls = extract_urls_from_text(og_title_text)
-        if len(og_title_text) >= len(scraped_text) or (
+        if len(og_title_text) >= len(scraped_text) or (og_urls and not scraped_urls):
            og_urls and not scraped_urls
        ):
            candidate_text = og_title_text
            logging.info("🧾 Using og:title-derived tweet text as primary content")
    candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(
-        candidate_text,
+        candidate_text, http_client, has_media=has_media,
        http_client,
        has_media=has_media,
    )
    candidate_text = clean_post_text(candidate_text)
    # --- Resolve the card_url scraped from the tweet's link preview ---
    resolved_card_url = resolve_card_url(
        getattr(tweet, "card_url", None), http_client
    )
@@ -1102,12 +1115,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
    candidates = [url_pos]
    clause_patterns = [
-        r"\.\s+",
+        r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+",
        r":\s+",
        r";\s+",
        r"!\s+",
        r"\?\s+",
        r",\s+",
    ]
    before = text[:url_pos]
@@ -1120,22 +1128,23 @@ def find_tail_preservation_start(text, primary_non_x_url):
        candidates.append(last_newline + 1)
    if has_hashtag_after_url:
        # FIX #11 — use named constant URL_TAIL_MAX_LOOKBACK_CHARS instead of literal 120
        generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS)
        while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
            generous_start -= 1
        candidates.append(generous_start)
    # FIX #11 — use named constant URL_TAIL_MAX_CLAUSE_DISTANCE instead of literal 180
    reasonable_candidates = [
-        c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
+        c for c in candidates
        if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
    ]
    if reasonable_candidates:
        start = min(reasonable_candidates, key=lambda c: (url_pos - c))
        # FIX #11 — use named constant URL_TAIL_MIN_PREFIX_CHARS instead of literal 35
        if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS:
-            farther = [c for c in reasonable_candidates if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS]
+            farther = [
                c for c in reasonable_candidates
                if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS
            ]
            if farther:
                start = min(farther, key=lambda c: (url_pos - c))
        return start
@@ -1149,15 +1158,12 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
    truncated = text[: max_length - 3]
    last_space = truncated.rfind(" ")
    # FIX #11 — use named constant TRUNCATE_MIN_PREFIX_CHARS instead of literal 0
    if last_space > TRUNCATE_MIN_PREFIX_CHARS:
        return truncated[:last_space] + "..."
    return truncated + "..."
-def truncate_text_preserving_tail(
+def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
    text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH
 ):
    if (
        not text
        or tail_start is None
@@ -1176,11 +1182,9 @@ def truncate_text_preserving_tail(
    reserve = len(tail) + 4
    if reserve >= max_length:
        shortened_tail = tail[-(max_length - 3) :].strip()
        first_space = shortened_tail.find(" ")
        if 0 <= first_space <= 30:
            shortened_tail = shortened_tail[first_space + 1 :].strip()
        return f"...{shortened_tail}"
    available_prefix = max_length - reserve
@@ -1234,10 +1238,7 @@ def choose_final_visible_text(
            text_without_url = remove_url_from_visible_text(
                text, primary_non_x_url
            ).strip()
-            if (
+            if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
                text_without_url
                and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH
            ):
                logging.info(
                    "🔗 Keeping full visible text by removing long external URL from body and using external card"
                )
@@ -1267,7 +1268,6 @@ def build_media_fingerprint(tweet):
    for media in tweet.media:
        media_type = getattr(media, "type", "unknown")
        media_url = getattr(media, "media_url_https", "") or ""
        stable_value = media_url
        if media_type == "photo":
@@ -1338,9 +1338,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
 def create_bsky_client(base_url, handle, password):
    normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
-    logging.info(
+    logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
        f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}"
    )
    try:
        client = Client(base_url=normalized_base_url)
@@ -1355,9 +1353,7 @@ def create_bsky_client(base_url, handle, password):
            elif hasattr(client, "_base_url"):
                client._base_url = normalized_base_url
        except Exception as e:
-            logging.warning(
+            logging.warning(f"⚠️ Could not apply custom base URL cleanly: {e}")
                f"⚠️ Could not apply custom base URL cleanly: {e}"
            )
    client.login(handle, password)
    return client
@@ -1426,16 +1422,12 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
        "canonical_tweet_url": canonical_tweet_url,
        "normalized_text": candidate["normalized_text"],
        "raw_text": candidate["raw_text"],
-        "full_clean_text": candidate.get(
+        "full_clean_text": candidate.get("full_clean_text", candidate["raw_text"]),
            "full_clean_text", candidate["raw_text"]
        ),
        "media_fingerprint": candidate["media_fingerprint"],
        "text_media_key": candidate["text_media_key"],
        "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
        "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
-        "resolved_primary_external_url": candidate.get(
+        "resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
            "resolved_primary_external_url"
        ),
        "bsky_uri": bsky_uri,
        "tweet_created_on": candidate["tweet"].created_on,
        "tweet_url": candidate["tweet"].tweet_url,
@@ -1483,15 +1475,16 @@ def prune_state(state, max_entries=5000):
    sortable.sort(key=lambda x: x[1], reverse=True)
    keep_keys = {key for key, _ in sortable[:max_entries]}
-    new_posted_tweets = {}
+    new_posted_tweets = {
-    for key, record in posted_tweets.items():
+        key: record
-        if key in keep_keys:
+        for key, record in posted_tweets.items()
-            new_posted_tweets[key] = record
+        if key in keep_keys
-
+    }
-    new_posted_by_bsky_uri = {}
+    new_posted_by_bsky_uri = {
-    for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items():
+        bsky_uri: key
-        if key in keep_keys:
+        for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items()
-            new_posted_by_bsky_uri[bsky_uri] = key
+        if key in keep_keys
    }
    state["posted_tweets"] = new_posted_tweets
    state["posted_by_bsky_uri"] = new_posted_by_bsky_uri
@@ -1540,9 +1533,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
                canonical_non_x_urls = set()
                for url in urls:
-                    if not is_tco_domain(url) and not is_x_or_twitter_domain(
+                    if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
                        url
                    ):
                        canonical = canonicalize_url(
                            normalize_urlish_token(url) or url
                        )
@@ -1572,7 +1563,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
                )
    except Exception as e:
        # FIX #9 — elevated to WARNING so operators notice live dedup is disabled
        logging.warning(
            f"⚠️ Could not fetch recent Bluesky posts for duplicate detection "
            f"(live dedup disabled for this cycle): {e}"
@@ -1648,7 +1638,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
                    continue
                else:
                    logging.warning(
-                        f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
+                        f"❌ Exhausted blob upload retries for {media_label} "
                        f"after rate limiting: {repr(e)}"
                    )
                    break
@@ -1657,12 +1648,11 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
                and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES
            ):
                transient_attempts += 1
-                wait_seconds = (
+                wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
                    BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
                )
                logging.warning(
                    f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. "
-                    f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
+                    f"Transient retry {transient_attempts}/"
                    f"{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
                )
                time.sleep(wait_seconds)
                continue
@@ -1674,9 +1664,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
                    logging.warning(
                        f"Upload response status: {e.response.status_code}"
                    )
-                    logging.warning(
+                    logging.warning(f"Upload response body: {e.response.text}")
                        f"Upload response body: {e.response.text}"
                    )
                except Exception:
                    pass
@@ -1685,11 +1673,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
    logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
    return None
 def send_post_with_retry(client, **kwargs):
    """
    Wrapper around client.send_post() with retry logic for transient errors
    and rate limiting.
    """
    last_exception = None
    for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1):
@@ -1723,10 +1708,7 @@ def send_post_with_retry(client, **kwargs):
                    )
                    raise
-            if (
+            if is_transient_error(e) and attempt < BSKY_SEND_POST_MAX_RETRIES:
                is_transient_error(e)
                and attempt < BSKY_SEND_POST_MAX_RETRIES
            ):
                wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
                logging.warning(
                    f"⏳ Transient send_post failure: {repr(e)}. "
@@ -1739,8 +1721,6 @@ def send_post_with_retry(client, **kwargs):
    raise last_exception
 # --- Image Compression ---
 def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
    try:
        with Image.open(io.BytesIO(image_bytes)) as img:
@@ -1770,12 +1750,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
                    progressive=True,
                )
                data = out.getvalue()
                logging.info(
                    f"🖼️ Post image candidate size at JPEG quality {quality}: "
                    f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
                )
                if len(data) <= max_bytes:
                    return data
@@ -1802,12 +1780,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
                        progressive=True,
                    )
                    data = out.getvalue()
                    logging.info(
                        f"🖼️ Post image resized to <= {target_dim}px at quality {quality}: "
                        f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
                    )
                    if len(data) <= max_bytes:
                        return data
@@ -1865,9 +1841,7 @@ def get_blob_from_url(media_url, client, http_client):
                    )
                    return None
-        return upload_blob_with_retry(
+        return upload_blob_with_retry(client, upload_bytes, media_label=media_url)
            client, upload_bytes, media_label=media_url
        )
    except Exception as e:
        logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
@@ -1902,23 +1876,17 @@ def get_blob_from_file(file_path, client):
        with open(file_path, "rb") as f:
            binary_data = f.read()
-        return upload_blob_with_retry(
+        return upload_blob_with_retry(client, binary_data, media_label=file_path)
            client, binary_data, media_label=file_path
        )
    except Exception as e:
-        logging.warning(
+        logging.warning(f"Could not upload local file {file_path}: {repr(e)}")
            f"Could not upload local file {file_path}: {repr(e)}"
        )
        if hasattr(e, "response") and e.response is not None:
            try:
                logging.warning(
                    f"Upload response status: {e.response.status_code}"
                )
-                logging.warning(
+                logging.warning(f"Upload response body: {e.response.text}")
                    f"Upload response body: {e.response.text}"
                )
            except Exception:
                pass
@@ -1956,12 +1924,10 @@ def compress_external_thumb_to_limit(
                    progressive=True,
                )
                data = out.getvalue()
                logging.info(
                    f"🖼️ External thumb candidate size at JPEG quality {quality}: "
                    f"{len(data) / 1024:.2f} KB"
                )
                if len(data) <= max_bytes:
                    return data
@@ -1988,19 +1954,15 @@ def compress_external_thumb_to_limit(
                        progressive=True,
                    )
                    data = out.getvalue()
                    logging.info(
                        f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: "
                        f"{len(data) / 1024:.2f} KB"
                    )
                    if len(data) <= max_bytes:
                        return data
    except Exception as e:
-        logging.warning(
+        logging.warning(f"Could not compress external thumbnail: {repr(e)}")
            f"Could not compress external thumbnail: {repr(e)}"
        )
    return None
@@ -2101,20 +2063,18 @@ def fetch_link_metadata(url, http_client):
        }
    except Exception as e:
-        logging.warning(
+        logging.warning(f"Could not fetch link metadata for {url}: {repr(e)}")
            f"Could not fetch link metadata for {url}: {repr(e)}"
        )
        return {}
 def build_external_link_embed(
-    url, client, http_client, fallback_title="Link",
+    url, client, http_client, fallback_title="Link", prefetched_metadata=None,
    prefetched_metadata=None,
 ):
-    # FIX #5 — accept pre-fetched metadata to avoid a duplicate HTTP request
+    link_metadata = (
-    # when the caller already fetched it for build_dynamic_alt.
+        prefetched_metadata
-    link_metadata = prefetched_metadata if prefetched_metadata is not None \
+        if prefetched_metadata is not None
        else fetch_link_metadata(url, http_client)
    )
    thumb_blob = None
    if link_metadata.get("image"):
@@ -2144,12 +2104,10 @@ def build_external_link_embed(
 def make_rich(content):
-    # FIX #10 — note explaining @mention limitation.
+    # NOTE: Bluesky supports native @mention facets, but resolving a Twitter
-    # Bluesky supports native @mention facets, but resolving a Twitter handle
+    # handle to a Bluesky DID requires an external lookup. That mapping is not
-    # to a Bluesky DID requires an external lookup (e.g. via the atproto
+    # available here so @mentions are passed through as plain text. If you add
-    # identity resolution API). That mapping is not available here, so
+    # a handle-mapping table in the future, call
    # @mentions are intentionally passed through as plain text. If you add a
    # handle-mapping table in the future, call
    # text_builder.mention(word, did) here instead of text_builder.text(word).
    text_builder = client_utils.TextBuilder()
    content = clean_post_text(content)
@@ -2178,8 +2136,7 @@ def make_rich(content):
                    clean_url_value = clean_url(normalized_candidate)
                    if clean_url_value and is_valid_url(clean_url_value):
-                        display_text = cleaned_word
+                        text_builder.link(cleaned_word, clean_url_value)
                        text_builder.link(display_text, clean_url_value)
                        trailing = word[len(cleaned_word):]
                        if trailing:
                            text_builder.text(trailing)
@@ -2209,8 +2166,6 @@ def make_rich(content):
 def build_dynamic_alt(raw_text, link_title=None):
    # FIX #5 — accept optional link_title so URL-only tweets get a richer alt
    # instead of always falling back to the generic "Attached video or image" string.
    dynamic_alt = clean_post_text(raw_text)
    dynamic_alt = dynamic_alt.replace("\n", " ").strip()
    dynamic_alt = re.sub(
@@ -2220,7 +2175,6 @@ def build_dynamic_alt(raw_text, link_title=None):
    if not dynamic_alt and link_title:
        dynamic_alt = link_title.strip()
    # FIX #11 — use named constant DYNAMIC_ALT_MAX_LENGTH instead of literal 150
    if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH:
        dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..."
    elif not dynamic_alt:
@@ -2231,9 +2185,7 @@ def build_dynamic_alt(raw_text, link_title=None):
 def build_video_embed(video_blob, alt_text):
    try:
-        return models.AppBskyEmbedVideo.Main(
+        return models.AppBskyEmbedVideo.Main(video=video_blob, alt=alt_text)
            video=video_blob, alt=alt_text
        )
    except AttributeError:
        logging.error(
            "❌ Your atproto version does not support AppBskyEmbedVideo. Upgrade atproto."
@@ -2246,7 +2198,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
    tweets = []
    state_file = "twitter_browser_state.json"
    # FIX #14 — enforce restrictive permissions on the session cookie file
    if os.path.exists(state_file):
        try:
            os.chmod(state_file, SESSION_FILE_PERMISSIONS)
@@ -2266,15 +2217,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
            "Chrome/145.0.7632.6 Safari/537.36"
        )
        # FIX #1 — all Playwright browser context variables renamed to
        # 'browser_context' throughout this function to eliminate the name
        # collision with the 'context_text' / 'social_context_el' variables
        # used inside the per-article parsing loop below.
        browser_context = None
        needs_login = True
        # FIX #7 — track the session-check page explicitly so we can close
        # it before opening the profile scrape page, preventing a page leak.
        session_check_page = None
        if os.path.exists(state_file):
@@ -2302,15 +2246,12 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                logging.warning(
                    "⚠️ Saved session expired or invalid. Re-logging in..."
                )
                # FIX #7 — close the check page before closing the context
                session_check_page.close()
                session_check_page = None
                browser_context.close()
                browser_context = None
                os.remove(state_file)
        # FIX #7 — always close the session-check page before opening the
        # profile page, whether a re-login was needed or not.
        if session_check_page is not None:
            session_check_page.close()
            session_check_page = None
@@ -2391,7 +2332,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                time.sleep(3)
                browser_context.storage_state(path=state_file)
                # FIX #14 — set restrictive permissions immediately after writing
                try:
                    os.chmod(state_file, SESSION_FILE_PERMISSIONS)
                except Exception as chmod_err:
@@ -2408,7 +2348,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                browser.close()
                return []
            # FIX #7 — close the login page cleanly before opening scrape page
            login_page.close()
        logging.info(
@@ -2446,10 +2385,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                                else href
                            )
                    # --- Retweet detection ---
                    is_retweet = False
                    try:
                        # FIX #1 — renamed from 'context' to 'social_context_el'
                        social_context_el = article.locator(
                            '[data-testid="socialContext"]'
                        ).first
@@ -2498,7 +2435,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                    if video_locators:
                        media_urls.append((tweet_url or "", "video"))
                    # --- Card URL extraction (link preview card) ---
                    card_url = None
                    try:
                        card_locator = article.locator(
@@ -2520,9 +2456,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                                '[data-testid="card.wrapper"] [role="link"]'
                            ).first
                            if card_role_link.is_visible():
-                                card_a = card_role_link.locator(
+                                card_a = card_role_link.locator("a[href]").first
                                    "a[href]"
                                ).first
                                if card_a.is_visible():
                                    card_href = card_a.get_attribute("href")
                                    if card_href:
@@ -2545,9 +2479,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
                    )
                except Exception as e:
-                    logging.warning(
+                    logging.warning(f"⚠️ Failed to parse a specific tweet: {e}")
                        f"⚠️ Failed to parse a specific tweet: {e}"
                    )
                    continue
        except Exception as e:
@@ -2560,7 +2492,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
 # --- Video Extraction & Processing ---
 def extract_video_url_from_tweet_page(browser_context, tweet_url):
    # FIX #1 — parameter renamed from 'context' to 'browser_context'
    page = browser_context.new_page()
    best_m3u8_url = None
    best_video_mp4_url = None
@@ -2573,10 +2504,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
            "/aud/" in url_l
            or "/audio/" in url_l
            or "mp4a" in url_l
-            or (
+            or ("audio/" in content_type_l and "video/" not in content_type_l)
                "audio/" in content_type_l
                and "video/" not in content_type_l
            )
        )
    def handle_response(response):
@@ -2649,7 +2577,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
        else:
            logging.warning("⚠️ No video player locator found on tweet page")
        # FIX #11 — use named constant VIDEO_PLAYER_WAIT_ROUNDS
        for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
            if current_best():
                break
@@ -2661,7 +2588,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
            )
            try:
                player.click(force=True, timeout=5000)
                # FIX #11 — use named constant PLAYWRIGHT_RETRY_SLEEP_S
                time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
            except Exception as e:
                logging.info(f"⚠️ Retry click failed: {e}")
@@ -2672,7 +2598,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
            except Exception:
                pass
            # FIX #11 — use named constant VIDEO_PLAYER_RETRY_ROUNDS
            for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
                if current_best():
                    break
@@ -2698,12 +2623,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
 def _probe_video_duration(file_path):
    """
    FIX #6 — Use ffprobe via subprocess instead of VideoFileClip to get video
    duration. This avoids a potential hang on corrupt/truncated files since we
    apply a hard timeout to the subprocess call.
    Returns duration in seconds as a float, or raises RuntimeError on failure.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
@@ -2739,9 +2658,7 @@ def download_and_crop_video(video_url, output_path):
    temp_output = output_path.replace(".mp4", "_compressed.mp4")
    try:
-        logging.info(
+        logging.info(f"⬇️ Downloading video source with ffmpeg: {video_url}")
            f"⬇️ Downloading video source with ffmpeg: {video_url}"
        )
        video_url_l = video_url.lower()
@@ -2781,15 +2698,11 @@ def download_and_crop_video(video_url, output_path):
            not os.path.exists(temp_input)
            or os.path.getsize(temp_input) == 0
        ):
-            logging.error(
+            logging.error("❌ Downloaded video source file is missing or empty.")
                "❌ Downloaded video source file is missing or empty."
            )
            return None
        logging.info(f"✅ Video downloaded: {temp_input}")
        # FIX #6 — probe duration with ffprobe (hard timeout) instead of
        # VideoFileClip, which can hang indefinitely on corrupt files.
        try:
            duration = _probe_video_duration(temp_input)
        except RuntimeError as probe_err:
@@ -2797,16 +2710,11 @@ def download_and_crop_video(video_url, output_path):
            return None
        if duration <= 0:
-            logging.error(
+            logging.error("❌ Downloaded video has invalid or unknown duration.")
                "❌ Downloaded video has invalid or unknown duration."
            )
            return None
        end_time = min(VIDEO_MAX_DURATION_SECONDS, duration)
        # FIX #2 — wrap VideoFileClip usage in nested try/finally blocks so
        # both the source clip and the subclip handles are always closed, even
        # if write_videofile raises an exception mid-way.
        video_clip = VideoFileClip(temp_input)
        try:
            if hasattr(video_clip, "subclipped"):
@@ -2825,9 +2733,9 @@ def download_and_crop_video(video_url, output_path):
                    logger=None,
                )
            finally:
-                cropped_clip.close()  # FIX #2 — always close subclip
+                cropped_clip.close()
        finally:
-            video_clip.close()  # FIX #2 — always close source clip
+            video_clip.close()
        if (
            not os.path.exists(temp_trimmed)
@@ -2898,8 +2806,6 @@ def download_and_crop_video(video_url, output_path):
    finally:
        remove_file_quietly(temp_input)
        remove_file_quietly(temp_trimmed)
        # temp_output was either renamed to output_path via os.replace()
        # or never created; remove_file_quietly is a no-op if it doesn't exist.
        remove_file_quietly(temp_output)
@@ -2928,6 +2834,7 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
    return False, None
 # --- Main Sync Logic ---
 def sync_feeds(args):
    logging.info("🔄 Starting sync cycle...")
@@ -2935,14 +2842,10 @@ def sync_feeds(args):
    bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
    if dry_run:
-        logging.info(
+        logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
            "🧪 DRY RUN MODE — no posts will be created on Bluesky."
        )
    try:
        state = load_state(STATE_PATH)
        # FIX #8 — prune on load so the state file never grows unbounded
        # between runs, not only after individual posts.
        state = prune_state(state, max_entries=5000)
        tweets = scrape_tweets_via_playwright(
@@ -2988,9 +2891,8 @@ def sync_feeds(args):
        logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
        candidate_tweets = []
        # --- Cheap prefilter before expensive processing ---
        cheap_candidates = []
        for tweet in reversed(tweets):
            try:
                tweet_time = arrow.get(tweet.created_on)
@@ -3064,9 +2966,7 @@ def sync_feeds(args):
                        if not is_tco_domain(
                            raw_url
                        ) and not is_x_or_twitter_domain(raw_url):
-                            canonical_non_x_urls.add(
+                            canonical_non_x_urls.add(canonicalize_url(raw_url))
                                canonicalize_url(raw_url)
                            )
                    primary_non_x_url = None
                    if resolved_primary_external_url:
@@ -3174,7 +3074,6 @@ def sync_feeds(args):
            if os.path.exists(browser_state_file):
                context_kwargs["storage_state"] = browser_state_file
            # FIX #1 — renamed from 'context' to 'browser_context'
            browser_context = browser.new_context(**context_kwargs)
            for candidate in candidate_tweets:
@@ -3219,10 +3118,6 @@ def sync_feeds(args):
                    new_posts += 1
                    continue
                # FIX #5 — fetch link metadata once here so we can pass the
                # OG title to build_dynamic_alt AND reuse it inside
                # build_external_link_embed, avoiding a duplicate HTTP request
                # for the same URL.
                link_meta_for_alt: dict = {}
                if candidate.get("resolved_primary_external_url"):
                    try:
@@ -3284,11 +3179,9 @@ def sync_feeds(args):
                                        f"video:resolve_failed:{tweet.tweet_url}"
                                    )
                                else:
-                                    cropped_video_path = (
+                                    cropped_video_path = download_and_crop_video(
                                        download_and_crop_video(
                                        real_video_url, temp_video_path
                                    )
                                    )
                                    if not cropped_video_path:
                                        logging.warning(
                                            f"⚠️ Video download/crop failed for "
@@ -3375,8 +3268,6 @@ def sync_feeds(args):
                                f"external card: {candidate_url}"
                            )
                        # FIX #5 — pass the already-fetched metadata so
                        # build_external_link_embed skips a duplicate HTTP fetch.
                        external_embed = build_external_link_embed(
                            candidate_url,
                            bsky_client,
@@ -3489,24 +3380,19 @@ def sync_feeds(args):
    except Exception as e:
        logging.error(f"❌ Error during sync cycle: {e}")
 def main():
    load_dotenv()
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync")
        description="Twitter to Bluesky Sync"
    )
    parser.add_argument(
        "--twitter-username",
        help="Your Twitter login username",
    )
    parser.add_argument(
        "--twitter-password",
        # NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
        # Prefer setting TWITTER_PASSWORD in your .env file instead.
        help="Your Twitter login password",
        # FIX #15 — password args are still supported for compatibility but
        # the .env file is the recommended path; passwords passed via CLI
        # are visible in `ps aux`. Consider removing these args and requiring
        # env vars exclusively, or prompting with getpass for interactive use.
    )
    parser.add_argument(
        "--twitter-email",
@@ -3522,8 +3408,9 @@ def main():
    )
    parser.add_argument(
        "--bsky-password",
        # NOTE (FIX #15): same warning as --twitter-password above.
        # Prefer setting BSKY_APP_PASSWORD in your .env file instead.
        help="Your Bluesky app password",
        # FIX #15 — same note as --twitter-password above.
    )
    parser.add_argument(
        "--bsky-base-url",
@@ -3547,8 +3434,8 @@ def main():
    args = parser.parse_args()
    # Resolve credentials: CLI args take priority, then env vars.
-    # FIX #15 — document that env vars are the secure path; CLI args expose
+    # FIX #15 — env vars are the secure path; CLI args expose secrets in
-    # secrets in the process list. Operators should prefer .env / env vars.
+    # the process list. Operators should prefer .env / environment variables.
    args.twitter_username = args.twitter_username or os.getenv(
        "TWITTER_USERNAME"
    )
@@ -3620,3 +3507,4 @@ def main():
 if __name__ == "__main__":
    main()