Added #fix

This commit is contained in:
2026-04-13 19:33:59 +00:00
parent 6f67822e7e
commit 3810cef150

View File

@@ -54,37 +54,36 @@ LINK_METADATA_TIMEOUT = 10
URL_RESOLVE_TIMEOUT = 12
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
SUBPROCESS_TIMEOUT_SECONDS = 180
FFPROBE_TIMEOUT_SECONDS = 15  # hard timeout for the ffprobe duration probe
DEFAULT_BSKY_BASE_URL = "https://bsky.social"

# Named constants replacing magic numbers scattered across the codebase.
OG_TITLE_WAIT_TIMEOUT_MS = 7000  # ms to wait for og:title meta tag
PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0  # seconds to sleep after page.goto in resolvers
PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8  # seconds between idle-state polls
PLAYWRIGHT_IDLE_POLL_ROUNDS = 4  # number of idle-state poll rounds
PLAYWRIGHT_RETRY_SLEEP_S = 2.0  # seconds to sleep before retry interaction
VIDEO_PLAYER_WAIT_ROUNDS = 8  # rounds waiting for video URL after first click
VIDEO_PLAYER_RETRY_ROUNDS = 5  # rounds waiting for video URL after retry click
URL_TAIL_MIN_PREFIX_CHARS = 35  # minimum prefix chars before URL for tail detection
URL_TAIL_MAX_LOOKBACK_CHARS = 120  # generous lookback window when hashtags follow URL
URL_TAIL_MAX_CLAUSE_DISTANCE = 180  # max chars a clause boundary may be from URL start
DYNAMIC_ALT_MAX_LENGTH = 150  # max chars for dynamic alt text
TRUNCATE_MIN_PREFIX_CHARS = 20  # min prefix length before inserting ellipsis
SHORT_TWEET_OG_FETCH_THRESHOLD = 35  # tweets shorter than this get og:title enrichment
ORPHAN_DIGIT_MAX_DIGITS = 3  # max digit count for orphaned-digit-line detection
SESSION_FILE_PERMISSIONS = 0o600  # restrictive permissions for session cookie file
# --- Logging Setup ---
# Log to both a file (UTF-8, path from LOG_PATH defined above) and stderr.
# NOTE: the stripped diff showed `handlers=` twice; exactly one keyword
# argument is kept here so the call is syntactically valid.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_PATH, encoding="utf-8"),
        logging.StreamHandler(),
    ],
    level=logging.INFO,
)
# --- Per-run caches for efficiency ---
# FIX #12 — caches are still module-level but now encapsulated in a class so they
# can be passed explicitly and are safe to reset between daemon cycles without
# relying on global mutation from arbitrary call sites.
# --- Per-run caches ---
class _RunCache:
def __init__(self):
self.og_title: dict = {}
@@ -146,7 +145,38 @@ def is_valid_url(url):
def strip_trailing_url_punctuation(url):
    """Clean junk characters glued to the end of a scraped URL.

    First strips a trailing hashtag-style fragment (``#Word``) that is really
    a social hashtag glued to the end of a URL with no space, e.g.
    ``https://cit.transit.gencat.cat#SCT`` -> ``https://cit.transit.gencat.cat``.
    The fragment is only stripped when it starts with a letter and runs as a
    single word to the end of the string, so anchors such as
    ``/page#section-2`` are left alone. Then strips trailing whitespace,
    ellipses, and sentence punctuation.

    Returns the cleaned URL; a falsy input is returned unchanged.
    """
    if not url:
        return url
    # NOTE: the stale pre-image `return re.sub(...)` that sat above this line
    # in the merged diff has been removed — it made the hashtag-stripping
    # step below unreachable.
    url = re.sub(r"#[A-Za-z]\w*$", "", url.strip())
    return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url)
def split_url_hashtag_suffix(text):
    """Insert a space between a URL and a hashtag fragment glued to it.

    ``'https://cit.transit.gencat.cat#SCT'`` becomes
    ``'https://cit.transit.gencat.cat #SCT'``.

    Only acts when the fragment looks like a social hashtag (``#`` followed
    by a letter, then word characters) and the ``(?=\\s|$)`` lookahead sees a
    word boundary right after it, so mid-sentence anchors followed by more
    URL path are left untouched. Falsy input is returned unchanged.
    """
    if not text:
        return text
    glued_hashtag = re.compile(r"(https?://[^\s#<>\"']+)(#[A-Za-z]\w*)(?=\s|$)")
    repaired = glued_hashtag.sub(r"\1 \2", text)
    if repaired != text:
        logging.info("🔧 Split hashtag suffix from URL in text")
    return repaired
def split_concatenated_urls(text):
@@ -165,6 +195,8 @@ def repair_broken_urls(text):
original = text
text = split_concatenated_urls(text)
# Split glued hashtag suffixes before any rejoining passes
text = split_url_hashtag_suffix(text)
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
@@ -186,6 +218,9 @@ def repair_broken_urls(text):
)
text = split_concatenated_urls(text)
# Run hashtag split again after rejoining passes — the rejoining regex
# contains # in its character class so it can re-glue a fragment.
text = split_url_hashtag_suffix(text)
if text != original:
logging.info("🔧 Repaired broken URL wrapping in scraped text")
@@ -232,7 +267,6 @@ def repair_broken_mentions(text):
if is_blank_line(next_line):
break
if is_mention_only_line(next_line):
break
@@ -256,7 +290,6 @@ def repair_broken_mentions(text):
if is_blank_line(next_line):
break
if is_mention_only_line(next_line):
break
@@ -287,8 +320,8 @@ def strip_line_edge_whitespace(text):
lines = text.splitlines()
cleaned_lines = []
changed = False
for line in lines:
cleaned = line.strip()
if cleaned != line:
@@ -316,11 +349,6 @@ def remove_trailing_ellipsis_line(text):
def remove_orphaned_digit_lines_before_hashtags(text):
"""
Remove lines that contain only a number (e.g. '5') when they appear
immediately before a line starting with a hashtag. These are typically
scraped UI artifacts (image counts, engagement badges, etc.).
"""
if not text:
return text
@@ -331,8 +359,6 @@ def remove_orphaned_digit_lines_before_hashtags(text):
result = []
changed = False
i = 0
# FIX #11 — use named constant ORPHAN_DIGIT_MAX_DIGITS instead of literal 3
orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}")
while i < len(lines):
@@ -519,7 +545,6 @@ def should_fetch_og_title(tweet):
if "" in text or text.endswith("..."):
return True
# FIX #11 — use named constant SHORT_TWEET_OG_FETCH_THRESHOLD instead of literal 35
if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD:
return True
@@ -535,7 +560,7 @@ def fetch_tweet_og_title_text(tweet_url):
return _cache.og_title[tweet_url]
browser = None
browser_context = None # FIX #1 — renamed from 'context' to avoid collision
browser_context = None
page = None
try:
@@ -562,8 +587,10 @@ def fetch_tweet_og_title_text(tweet_url):
)
try:
# FIX #11 — use named constant OG_TITLE_WAIT_TIMEOUT_MS instead of literal 7000
page.wait_for_selector('meta[property="og:title"]', timeout=OG_TITLE_WAIT_TIMEOUT_MS)
page.wait_for_selector(
'meta[property="og:title"]',
timeout=OG_TITLE_WAIT_TIMEOUT_MS,
)
except Exception:
pass
@@ -629,7 +656,7 @@ def resolve_tco_with_httpx(url, http_client):
def resolve_tco_with_playwright(url):
browser = None
browser_context = None # FIX #1 — renamed from 'context'
browser_context = None
page = None
try:
@@ -661,11 +688,9 @@ def resolve_tco_with_playwright(url):
f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}"
)
# FIX #11 — use named constant PLAYWRIGHT_POST_GOTO_SLEEP_S
time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S)
final_url = canonicalize_url(page.url)
# FIX #11 — use named constants for poll rounds and sleep
for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS):
if final_url and is_external_non_x_url(final_url):
break
@@ -815,10 +840,6 @@ def extract_first_resolved_external_url(
def resolve_card_url(card_url, http_client):
"""
Resolve a card URL (typically t.co) scraped from the tweet's link preview card.
Returns the final external URL or None.
"""
if not card_url:
return None
@@ -869,9 +890,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
if is_x_or_twitter_domain(cleaned):
replacements[raw_url] = ""
logging.info(
f"🧹 Removing X/Twitter URL from visible text: {cleaned}"
)
logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}")
continue
final_url = cleaned
@@ -927,9 +946,8 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
if len(line_urls) > 1:
prefix = re.sub(url_pattern, "", line).strip()
kept_urls = []
# FIX #4 — local set per line, not shared outer state
seen_in_line: set = set()
for url in line_urls:
normalized = normalize_urlish_token(url) or url
canonical = canonicalize_url(normalized)
@@ -976,20 +994,15 @@ def build_effective_tweet_text(tweet, http_client):
scraped_urls = extract_urls_from_text(scraped_text)
og_urls = extract_urls_from_text(og_title_text)
if len(og_title_text) >= len(scraped_text) or (
og_urls and not scraped_urls
):
if len(og_title_text) >= len(scraped_text) or (og_urls and not scraped_urls):
candidate_text = og_title_text
logging.info("🧾 Using og:title-derived tweet text as primary content")
candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(
candidate_text,
http_client,
has_media=has_media,
candidate_text, http_client, has_media=has_media,
)
candidate_text = clean_post_text(candidate_text)
# --- Resolve the card_url scraped from the tweet's link preview ---
resolved_card_url = resolve_card_url(
getattr(tweet, "card_url", None), http_client
)
@@ -1102,12 +1115,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
candidates = [url_pos]
clause_patterns = [
r"\.\s+",
r":\s+",
r";\s+",
r"!\s+",
r"\?\s+",
r",\s+",
r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+",
]
before = text[:url_pos]
@@ -1120,22 +1128,23 @@ def find_tail_preservation_start(text, primary_non_x_url):
candidates.append(last_newline + 1)
if has_hashtag_after_url:
# FIX #11 — use named constant URL_TAIL_MAX_LOOKBACK_CHARS instead of literal 120
generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS)
while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
generous_start -= 1
candidates.append(generous_start)
# FIX #11 — use named constant URL_TAIL_MAX_CLAUSE_DISTANCE instead of literal 180
reasonable_candidates = [
c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
c for c in candidates
if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
]
if reasonable_candidates:
start = min(reasonable_candidates, key=lambda c: (url_pos - c))
# FIX #11 — use named constant URL_TAIL_MIN_PREFIX_CHARS instead of literal 35
if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS:
farther = [c for c in reasonable_candidates if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS]
farther = [
c for c in reasonable_candidates
if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS
]
if farther:
start = min(farther, key=lambda c: (url_pos - c))
return start
@@ -1149,15 +1158,12 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
truncated = text[: max_length - 3]
last_space = truncated.rfind(" ")
# FIX #11 — use named constant TRUNCATE_MIN_PREFIX_CHARS instead of literal 0
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
return truncated[:last_space] + "..."
return truncated + "..."
def truncate_text_preserving_tail(
text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH
):
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
if (
not text
or tail_start is None
@@ -1176,11 +1182,9 @@ def truncate_text_preserving_tail(
reserve = len(tail) + 4
if reserve >= max_length:
shortened_tail = tail[-(max_length - 3) :].strip()
first_space = shortened_tail.find(" ")
if 0 <= first_space <= 30:
shortened_tail = shortened_tail[first_space + 1 :].strip()
return f"...{shortened_tail}"
available_prefix = max_length - reserve
@@ -1234,10 +1238,7 @@ def choose_final_visible_text(
text_without_url = remove_url_from_visible_text(
text, primary_non_x_url
).strip()
if (
text_without_url
and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH
):
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
logging.info(
"🔗 Keeping full visible text by removing long external URL from body and using external card"
)
@@ -1267,7 +1268,6 @@ def build_media_fingerprint(tweet):
for media in tweet.media:
media_type = getattr(media, "type", "unknown")
media_url = getattr(media, "media_url_https", "") or ""
stable_value = media_url
if media_type == "photo":
@@ -1338,9 +1338,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
def create_bsky_client(base_url, handle, password):
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
logging.info(
f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}"
)
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
try:
client = Client(base_url=normalized_base_url)
@@ -1355,9 +1353,7 @@ def create_bsky_client(base_url, handle, password):
elif hasattr(client, "_base_url"):
client._base_url = normalized_base_url
except Exception as e:
logging.warning(
f"⚠️ Could not apply custom base URL cleanly: {e}"
)
logging.warning(f"⚠️ Could not apply custom base URL cleanly: {e}")
client.login(handle, password)
return client
@@ -1426,16 +1422,12 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
"canonical_tweet_url": canonical_tweet_url,
"normalized_text": candidate["normalized_text"],
"raw_text": candidate["raw_text"],
"full_clean_text": candidate.get(
"full_clean_text", candidate["raw_text"]
),
"full_clean_text": candidate.get("full_clean_text", candidate["raw_text"]),
"media_fingerprint": candidate["media_fingerprint"],
"text_media_key": candidate["text_media_key"],
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
"resolved_primary_external_url": candidate.get(
"resolved_primary_external_url"
),
"resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
"bsky_uri": bsky_uri,
"tweet_created_on": candidate["tweet"].created_on,
"tweet_url": candidate["tweet"].tweet_url,
@@ -1483,15 +1475,16 @@ def prune_state(state, max_entries=5000):
sortable.sort(key=lambda x: x[1], reverse=True)
keep_keys = {key for key, _ in sortable[:max_entries]}
new_posted_tweets = {}
for key, record in posted_tweets.items():
if key in keep_keys:
new_posted_tweets[key] = record
new_posted_by_bsky_uri = {}
for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items():
if key in keep_keys:
new_posted_by_bsky_uri[bsky_uri] = key
new_posted_tweets = {
key: record
for key, record in posted_tweets.items()
if key in keep_keys
}
new_posted_by_bsky_uri = {
bsky_uri: key
for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items()
if key in keep_keys
}
state["posted_tweets"] = new_posted_tweets
state["posted_by_bsky_uri"] = new_posted_by_bsky_uri
@@ -1540,9 +1533,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
canonical_non_x_urls = set()
for url in urls:
if not is_tco_domain(url) and not is_x_or_twitter_domain(
url
):
if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
canonical = canonicalize_url(
normalize_urlish_token(url) or url
)
@@ -1572,7 +1563,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
)
except Exception as e:
# FIX #9 — elevated to WARNING so operators notice live dedup is disabled
logging.warning(
f"⚠️ Could not fetch recent Bluesky posts for duplicate detection "
f"(live dedup disabled for this cycle): {e}"
@@ -1648,7 +1638,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
continue
else:
logging.warning(
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
f"❌ Exhausted blob upload retries for {media_label} "
f"after rate limiting: {repr(e)}"
)
break
@@ -1657,12 +1648,11 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES
):
transient_attempts += 1
wait_seconds = (
BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
)
wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
logging.warning(
f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. "
f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
f"Transient retry {transient_attempts}/"
f"{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
)
time.sleep(wait_seconds)
continue
@@ -1674,9 +1664,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
logging.warning(
f"Upload response status: {e.response.status_code}"
)
logging.warning(
f"Upload response body: {e.response.text}"
)
logging.warning(f"Upload response body: {e.response.text}")
except Exception:
pass
@@ -1685,11 +1673,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
return None
def send_post_with_retry(client, **kwargs):
"""
Wrapper around client.send_post() with retry logic for transient errors
and rate limiting.
"""
last_exception = None
for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1):
@@ -1723,10 +1708,7 @@ def send_post_with_retry(client, **kwargs):
)
raise
if (
is_transient_error(e)
and attempt < BSKY_SEND_POST_MAX_RETRIES
):
if is_transient_error(e) and attempt < BSKY_SEND_POST_MAX_RETRIES:
wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
logging.warning(
f"⏳ Transient send_post failure: {repr(e)}. "
@@ -1739,8 +1721,6 @@ def send_post_with_retry(client, **kwargs):
raise last_exception
# --- Image Compression ---
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
try:
with Image.open(io.BytesIO(image_bytes)) as img:
@@ -1770,12 +1750,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
progressive=True,
)
data = out.getvalue()
logging.info(
f"🖼️ Post image candidate size at JPEG quality {quality}: "
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
)
if len(data) <= max_bytes:
return data
@@ -1802,12 +1780,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
progressive=True,
)
data = out.getvalue()
logging.info(
f"🖼️ Post image resized to <= {target_dim}px at quality {quality}: "
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
)
if len(data) <= max_bytes:
return data
@@ -1865,9 +1841,7 @@ def get_blob_from_url(media_url, client, http_client):
)
return None
return upload_blob_with_retry(
client, upload_bytes, media_label=media_url
)
return upload_blob_with_retry(client, upload_bytes, media_label=media_url)
except Exception as e:
logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
@@ -1902,23 +1876,17 @@ def get_blob_from_file(file_path, client):
with open(file_path, "rb") as f:
binary_data = f.read()
return upload_blob_with_retry(
client, binary_data, media_label=file_path
)
return upload_blob_with_retry(client, binary_data, media_label=file_path)
except Exception as e:
logging.warning(
f"Could not upload local file {file_path}: {repr(e)}"
)
logging.warning(f"Could not upload local file {file_path}: {repr(e)}")
if hasattr(e, "response") and e.response is not None:
try:
logging.warning(
f"Upload response status: {e.response.status_code}"
)
logging.warning(
f"Upload response body: {e.response.text}"
)
logging.warning(f"Upload response body: {e.response.text}")
except Exception:
pass
@@ -1956,12 +1924,10 @@ def compress_external_thumb_to_limit(
progressive=True,
)
data = out.getvalue()
logging.info(
f"🖼️ External thumb candidate size at JPEG quality {quality}: "
f"{len(data) / 1024:.2f} KB"
)
if len(data) <= max_bytes:
return data
@@ -1988,19 +1954,15 @@ def compress_external_thumb_to_limit(
progressive=True,
)
data = out.getvalue()
logging.info(
f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: "
f"{len(data) / 1024:.2f} KB"
)
if len(data) <= max_bytes:
return data
except Exception as e:
logging.warning(
f"Could not compress external thumbnail: {repr(e)}"
)
logging.warning(f"Could not compress external thumbnail: {repr(e)}")
return None
@@ -2101,20 +2063,18 @@ def fetch_link_metadata(url, http_client):
}
except Exception as e:
logging.warning(
f"Could not fetch link metadata for {url}: {repr(e)}"
)
logging.warning(f"Could not fetch link metadata for {url}: {repr(e)}")
return {}
def build_external_link_embed(
url, client, http_client, fallback_title="Link",
prefetched_metadata=None,
url, client, http_client, fallback_title="Link", prefetched_metadata=None,
):
# FIX #5 — accept pre-fetched metadata to avoid a duplicate HTTP request
# when the caller already fetched it for build_dynamic_alt.
link_metadata = prefetched_metadata if prefetched_metadata is not None \
link_metadata = (
prefetched_metadata
if prefetched_metadata is not None
else fetch_link_metadata(url, http_client)
)
thumb_blob = None
if link_metadata.get("image"):
@@ -2144,12 +2104,10 @@ def build_external_link_embed(
def make_rich(content):
# FIX #10 — note explaining @mention limitation.
# Bluesky supports native @mention facets, but resolving a Twitter handle
# to a Bluesky DID requires an external lookup (e.g. via the atproto
# identity resolution API). That mapping is not available here, so
# @mentions are intentionally passed through as plain text. If you add a
# handle-mapping table in the future, call
# NOTE: Bluesky supports native @mention facets, but resolving a Twitter
# handle to a Bluesky DID requires an external lookup. That mapping is not
# available here so @mentions are passed through as plain text. If you add
# a handle-mapping table in the future, call
# text_builder.mention(word, did) here instead of text_builder.text(word).
text_builder = client_utils.TextBuilder()
content = clean_post_text(content)
@@ -2178,8 +2136,7 @@ def make_rich(content):
clean_url_value = clean_url(normalized_candidate)
if clean_url_value and is_valid_url(clean_url_value):
display_text = cleaned_word
text_builder.link(display_text, clean_url_value)
text_builder.link(cleaned_word, clean_url_value)
trailing = word[len(cleaned_word):]
if trailing:
text_builder.text(trailing)
@@ -2209,8 +2166,6 @@ def make_rich(content):
def build_dynamic_alt(raw_text, link_title=None):
# FIX #5 — accept optional link_title so URL-only tweets get a richer alt
# instead of always falling back to the generic "Attached video or image" string.
dynamic_alt = clean_post_text(raw_text)
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
dynamic_alt = re.sub(
@@ -2220,7 +2175,6 @@ def build_dynamic_alt(raw_text, link_title=None):
if not dynamic_alt and link_title:
dynamic_alt = link_title.strip()
# FIX #11 — use named constant DYNAMIC_ALT_MAX_LENGTH instead of literal 150
if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH:
dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..."
elif not dynamic_alt:
@@ -2231,9 +2185,7 @@ def build_dynamic_alt(raw_text, link_title=None):
def build_video_embed(video_blob, alt_text):
try:
return models.AppBskyEmbedVideo.Main(
video=video_blob, alt=alt_text
)
return models.AppBskyEmbedVideo.Main(video=video_blob, alt=alt_text)
except AttributeError:
logging.error(
"❌ Your atproto version does not support AppBskyEmbedVideo. Upgrade atproto."
@@ -2246,7 +2198,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
tweets = []
state_file = "twitter_browser_state.json"
# FIX #14 — enforce restrictive permissions on the session cookie file
if os.path.exists(state_file):
try:
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
@@ -2266,15 +2217,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
"Chrome/145.0.7632.6 Safari/537.36"
)
# FIX #1 — all Playwright browser context variables renamed to
# 'browser_context' throughout this function to eliminate the name
# collision with the 'context_text' / 'social_context_el' variables
# used inside the per-article parsing loop below.
browser_context = None
needs_login = True
# FIX #7 — track the session-check page explicitly so we can close
# it before opening the profile scrape page, preventing a page leak.
session_check_page = None
if os.path.exists(state_file):
@@ -2302,15 +2246,12 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
logging.warning(
"⚠️ Saved session expired or invalid. Re-logging in..."
)
# FIX #7 — close the check page before closing the context
session_check_page.close()
session_check_page = None
browser_context.close()
browser_context = None
os.remove(state_file)
# FIX #7 — always close the session-check page before opening the
# profile page, whether a re-login was needed or not.
if session_check_page is not None:
session_check_page.close()
session_check_page = None
@@ -2391,7 +2332,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
time.sleep(3)
browser_context.storage_state(path=state_file)
# FIX #14 — set restrictive permissions immediately after writing
try:
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
except Exception as chmod_err:
@@ -2408,7 +2348,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
browser.close()
return []
# FIX #7 — close the login page cleanly before opening scrape page
login_page.close()
logging.info(
@@ -2446,10 +2385,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
else href
)
# --- Retweet detection ---
is_retweet = False
try:
# FIX #1 — renamed from 'context' to 'social_context_el'
social_context_el = article.locator(
'[data-testid="socialContext"]'
).first
@@ -2498,7 +2435,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
if video_locators:
media_urls.append((tweet_url or "", "video"))
# --- Card URL extraction (link preview card) ---
card_url = None
try:
card_locator = article.locator(
@@ -2520,9 +2456,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
'[data-testid="card.wrapper"] [role="link"]'
).first
if card_role_link.is_visible():
card_a = card_role_link.locator(
"a[href]"
).first
card_a = card_role_link.locator("a[href]").first
if card_a.is_visible():
card_href = card_a.get_attribute("href")
if card_href:
@@ -2545,9 +2479,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
)
except Exception as e:
logging.warning(
f"⚠️ Failed to parse a specific tweet: {e}"
)
logging.warning(f"⚠️ Failed to parse a specific tweet: {e}")
continue
except Exception as e:
@@ -2560,7 +2492,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
# --- Video Extraction & Processing ---
def extract_video_url_from_tweet_page(browser_context, tweet_url):
# FIX #1 — parameter renamed from 'context' to 'browser_context'
page = browser_context.new_page()
best_m3u8_url = None
best_video_mp4_url = None
@@ -2573,10 +2504,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
"/aud/" in url_l
or "/audio/" in url_l
or "mp4a" in url_l
or (
"audio/" in content_type_l
and "video/" not in content_type_l
)
or ("audio/" in content_type_l and "video/" not in content_type_l)
)
def handle_response(response):
@@ -2649,7 +2577,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
else:
logging.warning("⚠️ No video player locator found on tweet page")
# FIX #11 — use named constant VIDEO_PLAYER_WAIT_ROUNDS
for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
if current_best():
break
@@ -2661,7 +2588,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
)
try:
player.click(force=True, timeout=5000)
# FIX #11 — use named constant PLAYWRIGHT_RETRY_SLEEP_S
time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
except Exception as e:
logging.info(f"⚠️ Retry click failed: {e}")
@@ -2672,7 +2598,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
except Exception:
pass
# FIX #11 — use named constant VIDEO_PLAYER_RETRY_ROUNDS
for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
if current_best():
break
@@ -2698,12 +2623,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
def _probe_video_duration(file_path):
"""
FIX #6 — Use ffprobe via subprocess instead of VideoFileClip to get video
duration. This avoids a potential hang on corrupt/truncated files since we
apply a hard timeout to the subprocess call.
Returns duration in seconds as a float, or raises RuntimeError on failure.
"""
probe_cmd = [
"ffprobe",
"-v", "error",
@@ -2739,9 +2658,7 @@ def download_and_crop_video(video_url, output_path):
temp_output = output_path.replace(".mp4", "_compressed.mp4")
try:
logging.info(
f"⬇️ Downloading video source with ffmpeg: {video_url}"
)
logging.info(f"⬇️ Downloading video source with ffmpeg: {video_url}")
video_url_l = video_url.lower()
@@ -2781,15 +2698,11 @@ def download_and_crop_video(video_url, output_path):
not os.path.exists(temp_input)
or os.path.getsize(temp_input) == 0
):
logging.error(
"❌ Downloaded video source file is missing or empty."
)
logging.error("❌ Downloaded video source file is missing or empty.")
return None
logging.info(f"✅ Video downloaded: {temp_input}")
# FIX #6 — probe duration with ffprobe (hard timeout) instead of
# VideoFileClip, which can hang indefinitely on corrupt files.
try:
duration = _probe_video_duration(temp_input)
except RuntimeError as probe_err:
@@ -2797,16 +2710,11 @@ def download_and_crop_video(video_url, output_path):
return None
if duration <= 0:
logging.error(
"❌ Downloaded video has invalid or unknown duration."
)
logging.error("❌ Downloaded video has invalid or unknown duration.")
return None
end_time = min(VIDEO_MAX_DURATION_SECONDS, duration)
# FIX #2 — wrap VideoFileClip usage in nested try/finally blocks so
# both the source clip and the subclip handles are always closed, even
# if write_videofile raises an exception mid-way.
video_clip = VideoFileClip(temp_input)
try:
if hasattr(video_clip, "subclipped"):
@@ -2825,9 +2733,9 @@ def download_and_crop_video(video_url, output_path):
logger=None,
)
finally:
cropped_clip.close() # FIX #2 — always close subclip
cropped_clip.close()
finally:
video_clip.close() # FIX #2 — always close source clip
video_clip.close()
if (
not os.path.exists(temp_trimmed)
@@ -2898,8 +2806,6 @@ def download_and_crop_video(video_url, output_path):
finally:
remove_file_quietly(temp_input)
remove_file_quietly(temp_trimmed)
# temp_output was either renamed to output_path via os.replace()
# or never created; remove_file_quietly is a no-op if it doesn't exist.
remove_file_quietly(temp_output)
@@ -2928,6 +2834,7 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
return False, None
# --- Main Sync Logic ---
def sync_feeds(args):
logging.info("🔄 Starting sync cycle...")
@@ -2935,14 +2842,10 @@ def sync_feeds(args):
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
if dry_run:
logging.info(
"🧪 DRY RUN MODE — no posts will be created on Bluesky."
)
logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
try:
state = load_state(STATE_PATH)
# FIX #8 — prune on load so the state file never grows unbounded
# between runs, not only after individual posts.
state = prune_state(state, max_entries=5000)
tweets = scrape_tweets_via_playwright(
@@ -2988,9 +2891,8 @@ def sync_feeds(args):
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
candidate_tweets = []
# --- Cheap prefilter before expensive processing ---
cheap_candidates = []
for tweet in reversed(tweets):
try:
tweet_time = arrow.get(tweet.created_on)
@@ -3064,9 +2966,7 @@ def sync_feeds(args):
if not is_tco_domain(
raw_url
) and not is_x_or_twitter_domain(raw_url):
canonical_non_x_urls.add(
canonicalize_url(raw_url)
)
canonical_non_x_urls.add(canonicalize_url(raw_url))
primary_non_x_url = None
if resolved_primary_external_url:
@@ -3174,7 +3074,6 @@ def sync_feeds(args):
if os.path.exists(browser_state_file):
context_kwargs["storage_state"] = browser_state_file
# FIX #1 — renamed from 'context' to 'browser_context'
browser_context = browser.new_context(**context_kwargs)
for candidate in candidate_tweets:
@@ -3219,10 +3118,6 @@ def sync_feeds(args):
new_posts += 1
continue
# FIX #5 — fetch link metadata once here so we can pass the
# OG title to build_dynamic_alt AND reuse it inside
# build_external_link_embed, avoiding a duplicate HTTP request
# for the same URL.
link_meta_for_alt: dict = {}
if candidate.get("resolved_primary_external_url"):
try:
@@ -3284,11 +3179,9 @@ def sync_feeds(args):
f"video:resolve_failed:{tweet.tweet_url}"
)
else:
cropped_video_path = (
download_and_crop_video(
cropped_video_path = download_and_crop_video(
real_video_url, temp_video_path
)
)
if not cropped_video_path:
logging.warning(
f"⚠️ Video download/crop failed for "
@@ -3375,8 +3268,6 @@ def sync_feeds(args):
f"external card: {candidate_url}"
)
# FIX #5 — pass the already-fetched metadata so
# build_external_link_embed skips a duplicate HTTP fetch.
external_embed = build_external_link_embed(
candidate_url,
bsky_client,
@@ -3489,24 +3380,19 @@ def sync_feeds(args):
except Exception as e:
logging.error(f"❌ Error during sync cycle: {e}")
def main():
load_dotenv()
parser = argparse.ArgumentParser(
description="Twitter to Bluesky Sync"
)
parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync")
parser.add_argument(
"--twitter-username",
help="Your Twitter login username",
)
parser.add_argument(
"--twitter-password",
# NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
# Prefer setting TWITTER_PASSWORD in your .env file instead.
help="Your Twitter login password",
# FIX #15 — password args are still supported for compatibility but
# the .env file is the recommended path; passwords passed via CLI
# are visible in `ps aux`. Consider removing these args and requiring
# env vars exclusively, or prompting with getpass for interactive use.
)
parser.add_argument(
"--twitter-email",
@@ -3522,8 +3408,9 @@ def main():
)
parser.add_argument(
"--bsky-password",
# NOTE (FIX #15): same warning as --twitter-password above.
# Prefer setting BSKY_APP_PASSWORD in your .env file instead.
help="Your Bluesky app password",
# FIX #15 — same note as --twitter-password above.
)
parser.add_argument(
"--bsky-base-url",
@@ -3547,8 +3434,8 @@ def main():
args = parser.parse_args()
# Resolve credentials: CLI args take priority, then env vars.
# FIX #15 — document that env vars are the secure path; CLI args expose
# secrets in the process list. Operators should prefer .env / env vars.
# FIX #15 — env vars are the secure path; CLI args expose secrets in
# the process list. Operators should prefer .env / environment variables.
args.twitter_username = args.twitter_username or os.getenv(
"TWITTER_USERNAME"
)
@@ -3620,3 +3507,4 @@ def main():
if __name__ == "__main__":
main()