Added #fix
This commit is contained in:
@@ -54,37 +54,36 @@ LINK_METADATA_TIMEOUT = 10
|
|||||||
URL_RESOLVE_TIMEOUT = 12
|
URL_RESOLVE_TIMEOUT = 12
|
||||||
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
|
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
|
||||||
SUBPROCESS_TIMEOUT_SECONDS = 180
|
SUBPROCESS_TIMEOUT_SECONDS = 180
|
||||||
FFPROBE_TIMEOUT_SECONDS = 15 # FIX #6 — named constant for ffprobe probe timeout
|
FFPROBE_TIMEOUT_SECONDS = 15
|
||||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||||
|
|
||||||
# FIX #11 — named constants replacing magic numbers scattered across the codebase
|
OG_TITLE_WAIT_TIMEOUT_MS = 7000
|
||||||
OG_TITLE_WAIT_TIMEOUT_MS = 7000 # ms to wait for og:title meta tag
|
PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0
|
||||||
PLAYWRIGHT_POST_GOTO_SLEEP_S = 2.0 # seconds to sleep after page.goto in resolvers
|
PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8
|
||||||
PLAYWRIGHT_IDLE_POLL_SLEEP_S = 0.8 # seconds between idle-state polls
|
PLAYWRIGHT_IDLE_POLL_ROUNDS = 4
|
||||||
PLAYWRIGHT_IDLE_POLL_ROUNDS = 4 # number of idle-state poll rounds
|
PLAYWRIGHT_RETRY_SLEEP_S = 2.0
|
||||||
PLAYWRIGHT_RETRY_SLEEP_S = 2.0 # seconds to sleep before retry interaction
|
VIDEO_PLAYER_WAIT_ROUNDS = 8
|
||||||
VIDEO_PLAYER_WAIT_ROUNDS = 8 # rounds waiting for video URL after first click
|
VIDEO_PLAYER_RETRY_ROUNDS = 5
|
||||||
VIDEO_PLAYER_RETRY_ROUNDS = 5 # rounds waiting for video URL after retry click
|
URL_TAIL_MIN_PREFIX_CHARS = 35
|
||||||
URL_TAIL_MIN_PREFIX_CHARS = 35 # minimum prefix chars before URL for tail detection
|
URL_TAIL_MAX_LOOKBACK_CHARS = 120
|
||||||
URL_TAIL_MAX_LOOKBACK_CHARS = 120 # generous lookback window when hashtags follow URL
|
URL_TAIL_MAX_CLAUSE_DISTANCE = 180
|
||||||
URL_TAIL_MAX_CLAUSE_DISTANCE = 180 # max chars a clause boundary may be from URL start
|
DYNAMIC_ALT_MAX_LENGTH = 150
|
||||||
DYNAMIC_ALT_MAX_LENGTH = 150 # max chars for dynamic alt text
|
TRUNCATE_MIN_PREFIX_CHARS = 20
|
||||||
TRUNCATE_MIN_PREFIX_CHARS = 20 # min prefix length before inserting ellipsis
|
SHORT_TWEET_OG_FETCH_THRESHOLD = 35
|
||||||
SHORT_TWEET_OG_FETCH_THRESHOLD = 35 # tweets shorter than this get og:title enrichment
|
ORPHAN_DIGIT_MAX_DIGITS = 3
|
||||||
ORPHAN_DIGIT_MAX_DIGITS = 3 # max digit count for orphaned-digit-line detection
|
SESSION_FILE_PERMISSIONS = 0o600
|
||||||
SESSION_FILE_PERMISSIONS = 0o600 # FIX #14 — restrictive permissions for session cookie file
|
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()],
|
handlers=[
|
||||||
|
logging.FileHandler(LOG_PATH, encoding="utf-8"),
|
||||||
|
logging.StreamHandler(),
|
||||||
|
],
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Per-run caches for efficiency ---
|
# --- Per-run caches ---
|
||||||
# FIX #12 — caches are still module-level but now encapsulated in a class so they
|
|
||||||
# can be passed explicitly and are safe to reset between daemon cycles without
|
|
||||||
# relying on global mutation from arbitrary call sites.
|
|
||||||
class _RunCache:
|
class _RunCache:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.og_title: dict = {}
|
self.og_title: dict = {}
|
||||||
@@ -146,7 +145,38 @@ def is_valid_url(url):
|
|||||||
def strip_trailing_url_punctuation(url):
    """Return *url* without trailing punctuation or a glued social hashtag.

    The clean-up runs in three passes on the whitespace-trimmed input:

    1. Drop trailing whitespace / sentence punctuation (``…``, ``.``, ``,``,
       ``;``, ``:``, ``!``, ``?``, ``)``, ``]``, quotes).
    2. Drop a final hashtag-style fragment (``#`` + ASCII letter + word
       characters) that was glued to the URL with no space, e.g.
       ``https://cit.transit.gencat.cat#SCT`` -> ``https://cit.transit.gencat.cat``.
    3. Drop any punctuation that was sitting in front of that hashtag.

    Punctuation is stripped *before* the hashtag because the hashtag pattern
    is anchored at the end of the string: with the opposite order an input
    such as ``https://host#SCT.`` would keep its ``#SCT`` suffix.

    Fragments that do not look like a single word (for example
    ``#section-2``) are left untouched. A real one-word anchor at the very
    end of the URL is removed as well.

    Falsy input (``None``, ``""``) is returned unchanged.
    """
    if not url:
        return url

    trailing_punctuation = r"[\s…\.,;:!?)\]\"']+$"

    cleaned = re.sub(trailing_punctuation, "", url.strip())
    # Only a fragment that starts with a letter is treated as a hashtag.
    cleaned = re.sub(r"#[A-Za-z]\w*$", "", cleaned)
    # Removing the hashtag may expose punctuation that preceded it.
    return re.sub(trailing_punctuation, "", cleaned)
|
||||||
|
|
||||||
|
|
||||||
|
def split_url_hashtag_suffix(text):
    """Insert a space between a URL and a hashtag glued to its end.

    Example::

        'https://cit.transit.gencat.cat#SCT'  ->  'https://cit.transit.gencat.cat #SCT'

    A fragment is treated as a social hashtag only when it is ``#`` followed
    by an ASCII letter and then word characters, and when it ends the token:
    it must be followed by whitespace, the end of the text, or sentence
    punctuation that is itself followed by whitespace / end of text (so
    ``...cat#SCT.`` and ``(...cat#SCT)`` are split too). Fragments followed
    by more URL characters, such as ``#section-2`` or ``#frag.html``, are
    left untouched.

    The URL scheme is matched case-insensitively.

    Falsy input (``None``, ``""``) is returned unchanged. An info-level log
    line is emitted when at least one split was made.
    """
    if not text:
        return text

    fixed, split_count = re.subn(
        # group 1: the URL up to (not including) the first '#'
        # group 2: the hashtag-looking fragment
        # lookahead: optional trailing punctuation, then whitespace or end
        r"(https?://[^\s#<>\"']+)(#[A-Za-z]\w*)(?=[…\.,;:!?)\]\"']*(?:\s|$))",
        r"\1 \2",
        text,
        flags=re.IGNORECASE,
    )
    # Every substitution inserts a space, so a non-zero count means the
    # text actually changed.
    if split_count:
        logging.info("🔧 Split hashtag suffix from URL in text")
    return fixed
|
||||||
|
|
||||||
|
|
||||||
def split_concatenated_urls(text):
|
def split_concatenated_urls(text):
|
||||||
@@ -165,6 +195,8 @@ def repair_broken_urls(text):
|
|||||||
|
|
||||||
original = text
|
original = text
|
||||||
text = split_concatenated_urls(text)
|
text = split_concatenated_urls(text)
|
||||||
|
# Split glued hashtag suffixes before any rejoining passes
|
||||||
|
text = split_url_hashtag_suffix(text)
|
||||||
|
|
||||||
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)
|
||||||
|
|
||||||
@@ -186,6 +218,9 @@ def repair_broken_urls(text):
|
|||||||
)
|
)
|
||||||
|
|
||||||
text = split_concatenated_urls(text)
|
text = split_concatenated_urls(text)
|
||||||
|
# Run hashtag split again after rejoining passes — the rejoining regex
|
||||||
|
# contains # in its character class so it can re-glue a fragment.
|
||||||
|
text = split_url_hashtag_suffix(text)
|
||||||
|
|
||||||
if text != original:
|
if text != original:
|
||||||
logging.info("🔧 Repaired broken URL wrapping in scraped text")
|
logging.info("🔧 Repaired broken URL wrapping in scraped text")
|
||||||
@@ -232,7 +267,6 @@ def repair_broken_mentions(text):
|
|||||||
|
|
||||||
if is_blank_line(next_line):
|
if is_blank_line(next_line):
|
||||||
break
|
break
|
||||||
|
|
||||||
if is_mention_only_line(next_line):
|
if is_mention_only_line(next_line):
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -256,7 +290,6 @@ def repair_broken_mentions(text):
|
|||||||
|
|
||||||
if is_blank_line(next_line):
|
if is_blank_line(next_line):
|
||||||
break
|
break
|
||||||
|
|
||||||
if is_mention_only_line(next_line):
|
if is_mention_only_line(next_line):
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -287,8 +320,8 @@ def strip_line_edge_whitespace(text):
|
|||||||
|
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
cleaned_lines = []
|
cleaned_lines = []
|
||||||
|
|
||||||
changed = False
|
changed = False
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
cleaned = line.strip()
|
cleaned = line.strip()
|
||||||
if cleaned != line:
|
if cleaned != line:
|
||||||
@@ -316,11 +349,6 @@ def remove_trailing_ellipsis_line(text):
|
|||||||
|
|
||||||
|
|
||||||
def remove_orphaned_digit_lines_before_hashtags(text):
|
def remove_orphaned_digit_lines_before_hashtags(text):
|
||||||
"""
|
|
||||||
Remove lines that contain only a number (e.g. '5') when they appear
|
|
||||||
immediately before a line starting with a hashtag. These are typically
|
|
||||||
scraped UI artifacts (image counts, engagement badges, etc.).
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@@ -331,8 +359,6 @@ def remove_orphaned_digit_lines_before_hashtags(text):
|
|||||||
result = []
|
result = []
|
||||||
changed = False
|
changed = False
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
# FIX #11 — use named constant ORPHAN_DIGIT_MAX_DIGITS instead of literal 3
|
|
||||||
orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}")
|
orphan_pattern = re.compile(rf"\d{{1,{ORPHAN_DIGIT_MAX_DIGITS}}}")
|
||||||
|
|
||||||
while i < len(lines):
|
while i < len(lines):
|
||||||
@@ -519,7 +545,6 @@ def should_fetch_og_title(tweet):
|
|||||||
if "…" in text or text.endswith("..."):
|
if "…" in text or text.endswith("..."):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# FIX #11 — use named constant SHORT_TWEET_OG_FETCH_THRESHOLD instead of literal 35
|
|
||||||
if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD:
|
if len(text) < SHORT_TWEET_OG_FETCH_THRESHOLD:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -535,7 +560,7 @@ def fetch_tweet_og_title_text(tweet_url):
|
|||||||
return _cache.og_title[tweet_url]
|
return _cache.og_title[tweet_url]
|
||||||
|
|
||||||
browser = None
|
browser = None
|
||||||
browser_context = None # FIX #1 — renamed from 'context' to avoid collision
|
browser_context = None
|
||||||
page = None
|
page = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -562,8 +587,10 @@ def fetch_tweet_og_title_text(tweet_url):
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# FIX #11 — use named constant OG_TITLE_WAIT_TIMEOUT_MS instead of literal 7000
|
page.wait_for_selector(
|
||||||
page.wait_for_selector('meta[property="og:title"]', timeout=OG_TITLE_WAIT_TIMEOUT_MS)
|
'meta[property="og:title"]',
|
||||||
|
timeout=OG_TITLE_WAIT_TIMEOUT_MS,
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -629,7 +656,7 @@ def resolve_tco_with_httpx(url, http_client):
|
|||||||
|
|
||||||
def resolve_tco_with_playwright(url):
|
def resolve_tco_with_playwright(url):
|
||||||
browser = None
|
browser = None
|
||||||
browser_context = None # FIX #1 — renamed from 'context'
|
browser_context = None
|
||||||
page = None
|
page = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -661,11 +688,9 @@ def resolve_tco_with_playwright(url):
|
|||||||
f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}"
|
f"⚠️ Initial Playwright goto failed for {url}: {repr(e)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# FIX #11 — use named constant PLAYWRIGHT_POST_GOTO_SLEEP_S
|
|
||||||
time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S)
|
time.sleep(PLAYWRIGHT_POST_GOTO_SLEEP_S)
|
||||||
final_url = canonicalize_url(page.url)
|
final_url = canonicalize_url(page.url)
|
||||||
|
|
||||||
# FIX #11 — use named constants for poll rounds and sleep
|
|
||||||
for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS):
|
for _ in range(PLAYWRIGHT_IDLE_POLL_ROUNDS):
|
||||||
if final_url and is_external_non_x_url(final_url):
|
if final_url and is_external_non_x_url(final_url):
|
||||||
break
|
break
|
||||||
@@ -815,10 +840,6 @@ def extract_first_resolved_external_url(
|
|||||||
|
|
||||||
|
|
||||||
def resolve_card_url(card_url, http_client):
|
def resolve_card_url(card_url, http_client):
|
||||||
"""
|
|
||||||
Resolve a card URL (typically t.co) scraped from the tweet's link preview card.
|
|
||||||
Returns the final external URL or None.
|
|
||||||
"""
|
|
||||||
if not card_url:
|
if not card_url:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -869,9 +890,7 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
|
|||||||
|
|
||||||
if is_x_or_twitter_domain(cleaned):
|
if is_x_or_twitter_domain(cleaned):
|
||||||
replacements[raw_url] = ""
|
replacements[raw_url] = ""
|
||||||
logging.info(
|
logging.info(f"🧹 Removing X/Twitter URL from visible text: {cleaned}")
|
||||||
f"🧹 Removing X/Twitter URL from visible text: {cleaned}"
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
final_url = cleaned
|
final_url = cleaned
|
||||||
@@ -927,9 +946,8 @@ def sanitize_visible_urls_in_text(text, http_client, has_media=False):
|
|||||||
if len(line_urls) > 1:
|
if len(line_urls) > 1:
|
||||||
prefix = re.sub(url_pattern, "", line).strip()
|
prefix = re.sub(url_pattern, "", line).strip()
|
||||||
kept_urls = []
|
kept_urls = []
|
||||||
|
|
||||||
# FIX #4 — local set per line, not shared outer state
|
|
||||||
seen_in_line: set = set()
|
seen_in_line: set = set()
|
||||||
|
|
||||||
for url in line_urls:
|
for url in line_urls:
|
||||||
normalized = normalize_urlish_token(url) or url
|
normalized = normalize_urlish_token(url) or url
|
||||||
canonical = canonicalize_url(normalized)
|
canonical = canonicalize_url(normalized)
|
||||||
@@ -976,20 +994,15 @@ def build_effective_tweet_text(tweet, http_client):
|
|||||||
scraped_urls = extract_urls_from_text(scraped_text)
|
scraped_urls = extract_urls_from_text(scraped_text)
|
||||||
og_urls = extract_urls_from_text(og_title_text)
|
og_urls = extract_urls_from_text(og_title_text)
|
||||||
|
|
||||||
if len(og_title_text) >= len(scraped_text) or (
|
if len(og_title_text) >= len(scraped_text) or (og_urls and not scraped_urls):
|
||||||
og_urls and not scraped_urls
|
|
||||||
):
|
|
||||||
candidate_text = og_title_text
|
candidate_text = og_title_text
|
||||||
logging.info("🧾 Using og:title-derived tweet text as primary content")
|
logging.info("🧾 Using og:title-derived tweet text as primary content")
|
||||||
|
|
||||||
candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(
|
candidate_text, resolved_primary_external_url = sanitize_visible_urls_in_text(
|
||||||
candidate_text,
|
candidate_text, http_client, has_media=has_media,
|
||||||
http_client,
|
|
||||||
has_media=has_media,
|
|
||||||
)
|
)
|
||||||
candidate_text = clean_post_text(candidate_text)
|
candidate_text = clean_post_text(candidate_text)
|
||||||
|
|
||||||
# --- Resolve the card_url scraped from the tweet's link preview ---
|
|
||||||
resolved_card_url = resolve_card_url(
|
resolved_card_url = resolve_card_url(
|
||||||
getattr(tweet, "card_url", None), http_client
|
getattr(tweet, "card_url", None), http_client
|
||||||
)
|
)
|
||||||
@@ -1102,12 +1115,7 @@ def find_tail_preservation_start(text, primary_non_x_url):
|
|||||||
candidates = [url_pos]
|
candidates = [url_pos]
|
||||||
|
|
||||||
clause_patterns = [
|
clause_patterns = [
|
||||||
r"\.\s+",
|
r"\.\s+", r":\s+", r";\s+", r"!\s+", r"\?\s+", r",\s+",
|
||||||
r":\s+",
|
|
||||||
r";\s+",
|
|
||||||
r"!\s+",
|
|
||||||
r"\?\s+",
|
|
||||||
r",\s+",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
before = text[:url_pos]
|
before = text[:url_pos]
|
||||||
@@ -1120,22 +1128,23 @@ def find_tail_preservation_start(text, primary_non_x_url):
|
|||||||
candidates.append(last_newline + 1)
|
candidates.append(last_newline + 1)
|
||||||
|
|
||||||
if has_hashtag_after_url:
|
if has_hashtag_after_url:
|
||||||
# FIX #11 — use named constant URL_TAIL_MAX_LOOKBACK_CHARS instead of literal 120
|
|
||||||
generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS)
|
generous_start = max(0, url_pos - URL_TAIL_MAX_LOOKBACK_CHARS)
|
||||||
while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
|
while generous_start > 0 and text[generous_start] not in {" ", "\n"}:
|
||||||
generous_start -= 1
|
generous_start -= 1
|
||||||
candidates.append(generous_start)
|
candidates.append(generous_start)
|
||||||
|
|
||||||
# FIX #11 — use named constant URL_TAIL_MAX_CLAUSE_DISTANCE instead of literal 180
|
|
||||||
reasonable_candidates = [
|
reasonable_candidates = [
|
||||||
c for c in candidates if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
|
c for c in candidates
|
||||||
|
if 0 <= c < url_pos and (url_pos - c) <= URL_TAIL_MAX_CLAUSE_DISTANCE
|
||||||
]
|
]
|
||||||
|
|
||||||
if reasonable_candidates:
|
if reasonable_candidates:
|
||||||
start = min(reasonable_candidates, key=lambda c: (url_pos - c))
|
start = min(reasonable_candidates, key=lambda c: (url_pos - c))
|
||||||
# FIX #11 — use named constant URL_TAIL_MIN_PREFIX_CHARS instead of literal 35
|
|
||||||
if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS:
|
if url_pos - start < URL_TAIL_MIN_PREFIX_CHARS:
|
||||||
farther = [c for c in reasonable_candidates if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS]
|
farther = [
|
||||||
|
c for c in reasonable_candidates
|
||||||
|
if url_pos - c >= URL_TAIL_MIN_PREFIX_CHARS
|
||||||
|
]
|
||||||
if farther:
|
if farther:
|
||||||
start = min(farther, key=lambda c: (url_pos - c))
|
start = min(farther, key=lambda c: (url_pos - c))
|
||||||
return start
|
return start
|
||||||
@@ -1149,15 +1158,12 @@ def truncate_text_safely(text, max_length=BSKY_TEXT_MAX_LENGTH):
|
|||||||
|
|
||||||
truncated = text[: max_length - 3]
|
truncated = text[: max_length - 3]
|
||||||
last_space = truncated.rfind(" ")
|
last_space = truncated.rfind(" ")
|
||||||
# FIX #11 — use named constant TRUNCATE_MIN_PREFIX_CHARS instead of literal 0
|
|
||||||
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
|
if last_space > TRUNCATE_MIN_PREFIX_CHARS:
|
||||||
return truncated[:last_space] + "..."
|
return truncated[:last_space] + "..."
|
||||||
return truncated + "..."
|
return truncated + "..."
|
||||||
|
|
||||||
|
|
||||||
def truncate_text_preserving_tail(
|
def truncate_text_preserving_tail(text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH):
|
||||||
text, tail_start, max_length=BSKY_TEXT_MAX_LENGTH
|
|
||||||
):
|
|
||||||
if (
|
if (
|
||||||
not text
|
not text
|
||||||
or tail_start is None
|
or tail_start is None
|
||||||
@@ -1176,11 +1182,9 @@ def truncate_text_preserving_tail(
|
|||||||
reserve = len(tail) + 4
|
reserve = len(tail) + 4
|
||||||
if reserve >= max_length:
|
if reserve >= max_length:
|
||||||
shortened_tail = tail[-(max_length - 3) :].strip()
|
shortened_tail = tail[-(max_length - 3) :].strip()
|
||||||
|
|
||||||
first_space = shortened_tail.find(" ")
|
first_space = shortened_tail.find(" ")
|
||||||
if 0 <= first_space <= 30:
|
if 0 <= first_space <= 30:
|
||||||
shortened_tail = shortened_tail[first_space + 1 :].strip()
|
shortened_tail = shortened_tail[first_space + 1 :].strip()
|
||||||
|
|
||||||
return f"...{shortened_tail}"
|
return f"...{shortened_tail}"
|
||||||
|
|
||||||
available_prefix = max_length - reserve
|
available_prefix = max_length - reserve
|
||||||
@@ -1234,10 +1238,7 @@ def choose_final_visible_text(
|
|||||||
text_without_url = remove_url_from_visible_text(
|
text_without_url = remove_url_from_visible_text(
|
||||||
text, primary_non_x_url
|
text, primary_non_x_url
|
||||||
).strip()
|
).strip()
|
||||||
if (
|
if text_without_url and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH:
|
||||||
text_without_url
|
|
||||||
and len(text_without_url) <= BSKY_TEXT_MAX_LENGTH
|
|
||||||
):
|
|
||||||
logging.info(
|
logging.info(
|
||||||
"🔗 Keeping full visible text by removing long external URL from body and using external card"
|
"🔗 Keeping full visible text by removing long external URL from body and using external card"
|
||||||
)
|
)
|
||||||
@@ -1267,7 +1268,6 @@ def build_media_fingerprint(tweet):
|
|||||||
for media in tweet.media:
|
for media in tweet.media:
|
||||||
media_type = getattr(media, "type", "unknown")
|
media_type = getattr(media, "type", "unknown")
|
||||||
media_url = getattr(media, "media_url_https", "") or ""
|
media_url = getattr(media, "media_url_https", "") or ""
|
||||||
|
|
||||||
stable_value = media_url
|
stable_value = media_url
|
||||||
|
|
||||||
if media_type == "photo":
|
if media_type == "photo":
|
||||||
@@ -1338,9 +1338,7 @@ def build_text_media_key(normalized_text, media_fingerprint):
|
|||||||
|
|
||||||
def create_bsky_client(base_url, handle, password):
|
def create_bsky_client(base_url, handle, password):
|
||||||
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
|
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
|
||||||
logging.info(
|
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
|
||||||
f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = Client(base_url=normalized_base_url)
|
client = Client(base_url=normalized_base_url)
|
||||||
@@ -1355,9 +1353,7 @@ def create_bsky_client(base_url, handle, password):
|
|||||||
elif hasattr(client, "_base_url"):
|
elif hasattr(client, "_base_url"):
|
||||||
client._base_url = normalized_base_url
|
client._base_url = normalized_base_url
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"⚠️ Could not apply custom base URL cleanly: {e}")
|
||||||
f"⚠️ Could not apply custom base URL cleanly: {e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
client.login(handle, password)
|
client.login(handle, password)
|
||||||
return client
|
return client
|
||||||
@@ -1426,16 +1422,12 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
|
|||||||
"canonical_tweet_url": canonical_tweet_url,
|
"canonical_tweet_url": canonical_tweet_url,
|
||||||
"normalized_text": candidate["normalized_text"],
|
"normalized_text": candidate["normalized_text"],
|
||||||
"raw_text": candidate["raw_text"],
|
"raw_text": candidate["raw_text"],
|
||||||
"full_clean_text": candidate.get(
|
"full_clean_text": candidate.get("full_clean_text", candidate["raw_text"]),
|
||||||
"full_clean_text", candidate["raw_text"]
|
|
||||||
),
|
|
||||||
"media_fingerprint": candidate["media_fingerprint"],
|
"media_fingerprint": candidate["media_fingerprint"],
|
||||||
"text_media_key": candidate["text_media_key"],
|
"text_media_key": candidate["text_media_key"],
|
||||||
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
|
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
|
||||||
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
|
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
|
||||||
"resolved_primary_external_url": candidate.get(
|
"resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
|
||||||
"resolved_primary_external_url"
|
|
||||||
),
|
|
||||||
"bsky_uri": bsky_uri,
|
"bsky_uri": bsky_uri,
|
||||||
"tweet_created_on": candidate["tweet"].created_on,
|
"tweet_created_on": candidate["tweet"].created_on,
|
||||||
"tweet_url": candidate["tweet"].tweet_url,
|
"tweet_url": candidate["tweet"].tweet_url,
|
||||||
@@ -1483,15 +1475,16 @@ def prune_state(state, max_entries=5000):
|
|||||||
sortable.sort(key=lambda x: x[1], reverse=True)
|
sortable.sort(key=lambda x: x[1], reverse=True)
|
||||||
keep_keys = {key for key, _ in sortable[:max_entries]}
|
keep_keys = {key for key, _ in sortable[:max_entries]}
|
||||||
|
|
||||||
new_posted_tweets = {}
|
new_posted_tweets = {
|
||||||
for key, record in posted_tweets.items():
|
key: record
|
||||||
if key in keep_keys:
|
for key, record in posted_tweets.items()
|
||||||
new_posted_tweets[key] = record
|
if key in keep_keys
|
||||||
|
}
|
||||||
new_posted_by_bsky_uri = {}
|
new_posted_by_bsky_uri = {
|
||||||
for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items():
|
bsky_uri: key
|
||||||
if key in keep_keys:
|
for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items()
|
||||||
new_posted_by_bsky_uri[bsky_uri] = key
|
if key in keep_keys
|
||||||
|
}
|
||||||
|
|
||||||
state["posted_tweets"] = new_posted_tweets
|
state["posted_tweets"] = new_posted_tweets
|
||||||
state["posted_by_bsky_uri"] = new_posted_by_bsky_uri
|
state["posted_by_bsky_uri"] = new_posted_by_bsky_uri
|
||||||
@@ -1540,9 +1533,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
|
|
||||||
canonical_non_x_urls = set()
|
canonical_non_x_urls = set()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
if not is_tco_domain(url) and not is_x_or_twitter_domain(
|
if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
|
||||||
url
|
|
||||||
):
|
|
||||||
canonical = canonicalize_url(
|
canonical = canonicalize_url(
|
||||||
normalize_urlish_token(url) or url
|
normalize_urlish_token(url) or url
|
||||||
)
|
)
|
||||||
@@ -1572,7 +1563,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# FIX #9 — elevated to WARNING so operators notice live dedup is disabled
|
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⚠️ Could not fetch recent Bluesky posts for duplicate detection "
|
f"⚠️ Could not fetch recent Bluesky posts for duplicate detection "
|
||||||
f"(live dedup disabled for this cycle): {e}"
|
f"(live dedup disabled for this cycle): {e}"
|
||||||
@@ -1648,7 +1638,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"❌ Exhausted blob upload retries for {media_label} after rate limiting: {repr(e)}"
|
f"❌ Exhausted blob upload retries for {media_label} "
|
||||||
|
f"after rate limiting: {repr(e)}"
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -1657,12 +1648,11 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES
|
and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES
|
||||||
):
|
):
|
||||||
transient_attempts += 1
|
transient_attempts += 1
|
||||||
wait_seconds = (
|
wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
|
||||||
BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
|
|
||||||
)
|
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. "
|
f"⏳ Transient blob upload failure for {media_label}: {repr(e)}. "
|
||||||
f"Transient retry {transient_attempts}/{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
|
f"Transient retry {transient_attempts}/"
|
||||||
|
f"{BSKY_BLOB_TRANSIENT_ERROR_RETRIES} after {wait_seconds}s."
|
||||||
)
|
)
|
||||||
time.sleep(wait_seconds)
|
time.sleep(wait_seconds)
|
||||||
continue
|
continue
|
||||||
@@ -1674,9 +1664,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
logging.warning(
|
logging.warning(
|
||||||
f"Upload response status: {e.response.status_code}"
|
f"Upload response status: {e.response.status_code}"
|
||||||
)
|
)
|
||||||
logging.warning(
|
logging.warning(f"Upload response body: {e.response.text}")
|
||||||
f"Upload response body: {e.response.text}"
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1685,11 +1673,8 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
|
logging.warning(f"Could not upload {media_label}: {repr(last_exception)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def send_post_with_retry(client, **kwargs):
|
def send_post_with_retry(client, **kwargs):
|
||||||
"""
|
|
||||||
Wrapper around client.send_post() with retry logic for transient errors
|
|
||||||
and rate limiting.
|
|
||||||
"""
|
|
||||||
last_exception = None
|
last_exception = None
|
||||||
|
|
||||||
for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1):
|
for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1):
|
||||||
@@ -1723,10 +1708,7 @@ def send_post_with_retry(client, **kwargs):
|
|||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if (
|
if is_transient_error(e) and attempt < BSKY_SEND_POST_MAX_RETRIES:
|
||||||
is_transient_error(e)
|
|
||||||
and attempt < BSKY_SEND_POST_MAX_RETRIES
|
|
||||||
):
|
|
||||||
wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
|
wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⏳ Transient send_post failure: {repr(e)}. "
|
f"⏳ Transient send_post failure: {repr(e)}. "
|
||||||
@@ -1739,8 +1721,6 @@ def send_post_with_retry(client, **kwargs):
|
|||||||
|
|
||||||
raise last_exception
|
raise last_exception
|
||||||
|
|
||||||
|
|
||||||
# --- Image Compression ---
|
|
||||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||||
try:
|
try:
|
||||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||||
@@ -1770,12 +1750,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
|||||||
progressive=True,
|
progressive=True,
|
||||||
)
|
)
|
||||||
data = out.getvalue()
|
data = out.getvalue()
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🖼️ Post image candidate size at JPEG quality {quality}: "
|
f"🖼️ Post image candidate size at JPEG quality {quality}: "
|
||||||
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
|
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@@ -1802,12 +1780,10 @@ def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
|||||||
progressive=True,
|
progressive=True,
|
||||||
)
|
)
|
||||||
data = out.getvalue()
|
data = out.getvalue()
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🖼️ Post image resized to <= {target_dim}px at quality {quality}: "
|
f"🖼️ Post image resized to <= {target_dim}px at quality {quality}: "
|
||||||
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
|
f"{len(data)} bytes ({len(data) / 1024:.2f} KB)"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@@ -1865,9 +1841,7 @@ def get_blob_from_url(media_url, client, http_client):
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return upload_blob_with_retry(
|
return upload_blob_with_retry(client, upload_bytes, media_label=media_url)
|
||||||
client, upload_bytes, media_label=media_url
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
|
logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
|
||||||
@@ -1902,23 +1876,17 @@ def get_blob_from_file(file_path, client):
|
|||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
binary_data = f.read()
|
binary_data = f.read()
|
||||||
|
|
||||||
return upload_blob_with_retry(
|
return upload_blob_with_retry(client, binary_data, media_label=file_path)
|
||||||
client, binary_data, media_label=file_path
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"Could not upload local file {file_path}: {repr(e)}")
|
||||||
f"Could not upload local file {file_path}: {repr(e)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if hasattr(e, "response") and e.response is not None:
|
if hasattr(e, "response") and e.response is not None:
|
||||||
try:
|
try:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"Upload response status: {e.response.status_code}"
|
f"Upload response status: {e.response.status_code}"
|
||||||
)
|
)
|
||||||
logging.warning(
|
logging.warning(f"Upload response body: {e.response.text}")
|
||||||
f"Upload response body: {e.response.text}"
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1956,12 +1924,10 @@ def compress_external_thumb_to_limit(
|
|||||||
progressive=True,
|
progressive=True,
|
||||||
)
|
)
|
||||||
data = out.getvalue()
|
data = out.getvalue()
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🖼️ External thumb candidate size at JPEG quality {quality}: "
|
f"🖼️ External thumb candidate size at JPEG quality {quality}: "
|
||||||
f"{len(data) / 1024:.2f} KB"
|
f"{len(data) / 1024:.2f} KB"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@@ -1988,19 +1954,15 @@ def compress_external_thumb_to_limit(
|
|||||||
progressive=True,
|
progressive=True,
|
||||||
)
|
)
|
||||||
data = out.getvalue()
|
data = out.getvalue()
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: "
|
f"🖼️ External thumb resized to <= {target_dim}px at quality {quality}: "
|
||||||
f"{len(data) / 1024:.2f} KB"
|
f"{len(data) / 1024:.2f} KB"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(data) <= max_bytes:
|
if len(data) <= max_bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"Could not compress external thumbnail: {repr(e)}")
|
||||||
f"Could not compress external thumbnail: {repr(e)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -2101,20 +2063,18 @@ def fetch_link_metadata(url, http_client):
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"Could not fetch link metadata for {url}: {repr(e)}")
|
||||||
f"Could not fetch link metadata for {url}: {repr(e)}"
|
|
||||||
)
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def build_external_link_embed(
|
def build_external_link_embed(
|
||||||
url, client, http_client, fallback_title="Link",
|
url, client, http_client, fallback_title="Link", prefetched_metadata=None,
|
||||||
prefetched_metadata=None,
|
|
||||||
):
|
):
|
||||||
# FIX #5 — accept pre-fetched metadata to avoid a duplicate HTTP request
|
link_metadata = (
|
||||||
# when the caller already fetched it for build_dynamic_alt.
|
prefetched_metadata
|
||||||
link_metadata = prefetched_metadata if prefetched_metadata is not None \
|
if prefetched_metadata is not None
|
||||||
else fetch_link_metadata(url, http_client)
|
else fetch_link_metadata(url, http_client)
|
||||||
|
)
|
||||||
|
|
||||||
thumb_blob = None
|
thumb_blob = None
|
||||||
if link_metadata.get("image"):
|
if link_metadata.get("image"):
|
||||||
@@ -2144,12 +2104,10 @@ def build_external_link_embed(
|
|||||||
|
|
||||||
|
|
||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
# FIX #10 — note explaining @mention limitation.
|
# NOTE: Bluesky supports native @mention facets, but resolving a Twitter
|
||||||
# Bluesky supports native @mention facets, but resolving a Twitter handle
|
# handle to a Bluesky DID requires an external lookup. That mapping is not
|
||||||
# to a Bluesky DID requires an external lookup (e.g. via the atproto
|
# available here so @mentions are passed through as plain text. If you add
|
||||||
# identity resolution API). That mapping is not available here, so
|
# a handle-mapping table in the future, call
|
||||||
# @mentions are intentionally passed through as plain text. If you add a
|
|
||||||
# handle-mapping table in the future, call
|
|
||||||
# text_builder.mention(word, did) here instead of text_builder.text(word).
|
# text_builder.mention(word, did) here instead of text_builder.text(word).
|
||||||
text_builder = client_utils.TextBuilder()
|
text_builder = client_utils.TextBuilder()
|
||||||
content = clean_post_text(content)
|
content = clean_post_text(content)
|
||||||
@@ -2178,8 +2136,7 @@ def make_rich(content):
|
|||||||
clean_url_value = clean_url(normalized_candidate)
|
clean_url_value = clean_url(normalized_candidate)
|
||||||
|
|
||||||
if clean_url_value and is_valid_url(clean_url_value):
|
if clean_url_value and is_valid_url(clean_url_value):
|
||||||
display_text = cleaned_word
|
text_builder.link(cleaned_word, clean_url_value)
|
||||||
text_builder.link(display_text, clean_url_value)
|
|
||||||
trailing = word[len(cleaned_word):]
|
trailing = word[len(cleaned_word):]
|
||||||
if trailing:
|
if trailing:
|
||||||
text_builder.text(trailing)
|
text_builder.text(trailing)
|
||||||
@@ -2209,8 +2166,6 @@ def make_rich(content):
|
|||||||
|
|
||||||
|
|
||||||
def build_dynamic_alt(raw_text, link_title=None):
|
def build_dynamic_alt(raw_text, link_title=None):
|
||||||
# FIX #5 — accept optional link_title so URL-only tweets get a richer alt
|
|
||||||
# instead of always falling back to the generic "Attached video or image" string.
|
|
||||||
dynamic_alt = clean_post_text(raw_text)
|
dynamic_alt = clean_post_text(raw_text)
|
||||||
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
|
dynamic_alt = dynamic_alt.replace("\n", " ").strip()
|
||||||
dynamic_alt = re.sub(
|
dynamic_alt = re.sub(
|
||||||
@@ -2220,7 +2175,6 @@ def build_dynamic_alt(raw_text, link_title=None):
|
|||||||
if not dynamic_alt and link_title:
|
if not dynamic_alt and link_title:
|
||||||
dynamic_alt = link_title.strip()
|
dynamic_alt = link_title.strip()
|
||||||
|
|
||||||
# FIX #11 — use named constant DYNAMIC_ALT_MAX_LENGTH instead of literal 150
|
|
||||||
if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH:
|
if len(dynamic_alt) > DYNAMIC_ALT_MAX_LENGTH:
|
||||||
dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..."
|
dynamic_alt = dynamic_alt[:DYNAMIC_ALT_MAX_LENGTH - 3] + "..."
|
||||||
elif not dynamic_alt:
|
elif not dynamic_alt:
|
||||||
@@ -2231,9 +2185,7 @@ def build_dynamic_alt(raw_text, link_title=None):
|
|||||||
|
|
||||||
def build_video_embed(video_blob, alt_text):
|
def build_video_embed(video_blob, alt_text):
|
||||||
try:
|
try:
|
||||||
return models.AppBskyEmbedVideo.Main(
|
return models.AppBskyEmbedVideo.Main(video=video_blob, alt=alt_text)
|
||||||
video=video_blob, alt=alt_text
|
|
||||||
)
|
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
logging.error(
|
logging.error(
|
||||||
"❌ Your atproto version does not support AppBskyEmbedVideo. Upgrade atproto."
|
"❌ Your atproto version does not support AppBskyEmbedVideo. Upgrade atproto."
|
||||||
@@ -2246,7 +2198,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
tweets = []
|
tweets = []
|
||||||
state_file = "twitter_browser_state.json"
|
state_file = "twitter_browser_state.json"
|
||||||
|
|
||||||
# FIX #14 — enforce restrictive permissions on the session cookie file
|
|
||||||
if os.path.exists(state_file):
|
if os.path.exists(state_file):
|
||||||
try:
|
try:
|
||||||
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
|
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
|
||||||
@@ -2266,15 +2217,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
"Chrome/145.0.7632.6 Safari/537.36"
|
"Chrome/145.0.7632.6 Safari/537.36"
|
||||||
)
|
)
|
||||||
|
|
||||||
# FIX #1 — all Playwright browser context variables renamed to
|
|
||||||
# 'browser_context' throughout this function to eliminate the name
|
|
||||||
# collision with the 'context_text' / 'social_context_el' variables
|
|
||||||
# used inside the per-article parsing loop below.
|
|
||||||
browser_context = None
|
browser_context = None
|
||||||
needs_login = True
|
needs_login = True
|
||||||
|
|
||||||
# FIX #7 — track the session-check page explicitly so we can close
|
|
||||||
# it before opening the profile scrape page, preventing a page leak.
|
|
||||||
session_check_page = None
|
session_check_page = None
|
||||||
|
|
||||||
if os.path.exists(state_file):
|
if os.path.exists(state_file):
|
||||||
@@ -2302,15 +2246,12 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
logging.warning(
|
logging.warning(
|
||||||
"⚠️ Saved session expired or invalid. Re-logging in..."
|
"⚠️ Saved session expired or invalid. Re-logging in..."
|
||||||
)
|
)
|
||||||
# FIX #7 — close the check page before closing the context
|
|
||||||
session_check_page.close()
|
session_check_page.close()
|
||||||
session_check_page = None
|
session_check_page = None
|
||||||
browser_context.close()
|
browser_context.close()
|
||||||
browser_context = None
|
browser_context = None
|
||||||
os.remove(state_file)
|
os.remove(state_file)
|
||||||
|
|
||||||
# FIX #7 — always close the session-check page before opening the
|
|
||||||
# profile page, whether a re-login was needed or not.
|
|
||||||
if session_check_page is not None:
|
if session_check_page is not None:
|
||||||
session_check_page.close()
|
session_check_page.close()
|
||||||
session_check_page = None
|
session_check_page = None
|
||||||
@@ -2391,7 +2332,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
browser_context.storage_state(path=state_file)
|
browser_context.storage_state(path=state_file)
|
||||||
# FIX #14 — set restrictive permissions immediately after writing
|
|
||||||
try:
|
try:
|
||||||
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
|
os.chmod(state_file, SESSION_FILE_PERMISSIONS)
|
||||||
except Exception as chmod_err:
|
except Exception as chmod_err:
|
||||||
@@ -2408,7 +2348,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
browser.close()
|
browser.close()
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# FIX #7 — close the login page cleanly before opening scrape page
|
|
||||||
login_page.close()
|
login_page.close()
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
@@ -2446,10 +2385,8 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
else href
|
else href
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Retweet detection ---
|
|
||||||
is_retweet = False
|
is_retweet = False
|
||||||
try:
|
try:
|
||||||
# FIX #1 — renamed from 'context' to 'social_context_el'
|
|
||||||
social_context_el = article.locator(
|
social_context_el = article.locator(
|
||||||
'[data-testid="socialContext"]'
|
'[data-testid="socialContext"]'
|
||||||
).first
|
).first
|
||||||
@@ -2498,7 +2435,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
if video_locators:
|
if video_locators:
|
||||||
media_urls.append((tweet_url or "", "video"))
|
media_urls.append((tweet_url or "", "video"))
|
||||||
|
|
||||||
# --- Card URL extraction (link preview card) ---
|
|
||||||
card_url = None
|
card_url = None
|
||||||
try:
|
try:
|
||||||
card_locator = article.locator(
|
card_locator = article.locator(
|
||||||
@@ -2520,9 +2456,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
'[data-testid="card.wrapper"] [role="link"]'
|
'[data-testid="card.wrapper"] [role="link"]'
|
||||||
).first
|
).first
|
||||||
if card_role_link.is_visible():
|
if card_role_link.is_visible():
|
||||||
card_a = card_role_link.locator(
|
card_a = card_role_link.locator("a[href]").first
|
||||||
"a[href]"
|
|
||||||
).first
|
|
||||||
if card_a.is_visible():
|
if card_a.is_visible():
|
||||||
card_href = card_a.get_attribute("href")
|
card_href = card_a.get_attribute("href")
|
||||||
if card_href:
|
if card_href:
|
||||||
@@ -2545,9 +2479,7 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"⚠️ Failed to parse a specific tweet: {e}")
|
||||||
f"⚠️ Failed to parse a specific tweet: {e}"
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -2560,7 +2492,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
|
|
||||||
# --- Video Extraction & Processing ---
|
# --- Video Extraction & Processing ---
|
||||||
def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
||||||
# FIX #1 — parameter renamed from 'context' to 'browser_context'
|
|
||||||
page = browser_context.new_page()
|
page = browser_context.new_page()
|
||||||
best_m3u8_url = None
|
best_m3u8_url = None
|
||||||
best_video_mp4_url = None
|
best_video_mp4_url = None
|
||||||
@@ -2573,10 +2504,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
"/aud/" in url_l
|
"/aud/" in url_l
|
||||||
or "/audio/" in url_l
|
or "/audio/" in url_l
|
||||||
or "mp4a" in url_l
|
or "mp4a" in url_l
|
||||||
or (
|
or ("audio/" in content_type_l and "video/" not in content_type_l)
|
||||||
"audio/" in content_type_l
|
|
||||||
and "video/" not in content_type_l
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_response(response):
|
def handle_response(response):
|
||||||
@@ -2649,7 +2577,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
else:
|
else:
|
||||||
logging.warning("⚠️ No video player locator found on tweet page")
|
logging.warning("⚠️ No video player locator found on tweet page")
|
||||||
|
|
||||||
# FIX #11 — use named constant VIDEO_PLAYER_WAIT_ROUNDS
|
|
||||||
for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
|
for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
|
||||||
if current_best():
|
if current_best():
|
||||||
break
|
break
|
||||||
@@ -2661,7 +2588,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
player.click(force=True, timeout=5000)
|
player.click(force=True, timeout=5000)
|
||||||
# FIX #11 — use named constant PLAYWRIGHT_RETRY_SLEEP_S
|
|
||||||
time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
|
time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(f"⚠️ Retry click failed: {e}")
|
logging.info(f"⚠️ Retry click failed: {e}")
|
||||||
@@ -2672,7 +2598,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# FIX #11 — use named constant VIDEO_PLAYER_RETRY_ROUNDS
|
|
||||||
for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
|
for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
|
||||||
if current_best():
|
if current_best():
|
||||||
break
|
break
|
||||||
@@ -2698,12 +2623,6 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
|
|
||||||
|
|
||||||
def _probe_video_duration(file_path):
|
def _probe_video_duration(file_path):
|
||||||
"""
|
|
||||||
FIX #6 — Use ffprobe via subprocess instead of VideoFileClip to get video
|
|
||||||
duration. This avoids a potential hang on corrupt/truncated files since we
|
|
||||||
apply a hard timeout to the subprocess call.
|
|
||||||
Returns duration in seconds as a float, or raises RuntimeError on failure.
|
|
||||||
"""
|
|
||||||
probe_cmd = [
|
probe_cmd = [
|
||||||
"ffprobe",
|
"ffprobe",
|
||||||
"-v", "error",
|
"-v", "error",
|
||||||
@@ -2739,9 +2658,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(
|
logging.info(f"⬇️ Downloading video source with ffmpeg: {video_url}")
|
||||||
f"⬇️ Downloading video source with ffmpeg: {video_url}"
|
|
||||||
)
|
|
||||||
|
|
||||||
video_url_l = video_url.lower()
|
video_url_l = video_url.lower()
|
||||||
|
|
||||||
@@ -2781,15 +2698,11 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
not os.path.exists(temp_input)
|
not os.path.exists(temp_input)
|
||||||
or os.path.getsize(temp_input) == 0
|
or os.path.getsize(temp_input) == 0
|
||||||
):
|
):
|
||||||
logging.error(
|
logging.error("❌ Downloaded video source file is missing or empty.")
|
||||||
"❌ Downloaded video source file is missing or empty."
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
logging.info(f"✅ Video downloaded: {temp_input}")
|
logging.info(f"✅ Video downloaded: {temp_input}")
|
||||||
|
|
||||||
# FIX #6 — probe duration with ffprobe (hard timeout) instead of
|
|
||||||
# VideoFileClip, which can hang indefinitely on corrupt files.
|
|
||||||
try:
|
try:
|
||||||
duration = _probe_video_duration(temp_input)
|
duration = _probe_video_duration(temp_input)
|
||||||
except RuntimeError as probe_err:
|
except RuntimeError as probe_err:
|
||||||
@@ -2797,16 +2710,11 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if duration <= 0:
|
if duration <= 0:
|
||||||
logging.error(
|
logging.error("❌ Downloaded video has invalid or unknown duration.")
|
||||||
"❌ Downloaded video has invalid or unknown duration."
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
end_time = min(VIDEO_MAX_DURATION_SECONDS, duration)
|
end_time = min(VIDEO_MAX_DURATION_SECONDS, duration)
|
||||||
|
|
||||||
# FIX #2 — wrap VideoFileClip usage in nested try/finally blocks so
|
|
||||||
# both the source clip and the subclip handles are always closed, even
|
|
||||||
# if write_videofile raises an exception mid-way.
|
|
||||||
video_clip = VideoFileClip(temp_input)
|
video_clip = VideoFileClip(temp_input)
|
||||||
try:
|
try:
|
||||||
if hasattr(video_clip, "subclipped"):
|
if hasattr(video_clip, "subclipped"):
|
||||||
@@ -2825,9 +2733,9 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
logger=None,
|
logger=None,
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
cropped_clip.close() # FIX #2 — always close subclip
|
cropped_clip.close()
|
||||||
finally:
|
finally:
|
||||||
video_clip.close() # FIX #2 — always close source clip
|
video_clip.close()
|
||||||
|
|
||||||
if (
|
if (
|
||||||
not os.path.exists(temp_trimmed)
|
not os.path.exists(temp_trimmed)
|
||||||
@@ -2898,8 +2806,6 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
finally:
|
finally:
|
||||||
remove_file_quietly(temp_input)
|
remove_file_quietly(temp_input)
|
||||||
remove_file_quietly(temp_trimmed)
|
remove_file_quietly(temp_trimmed)
|
||||||
# temp_output was either renamed to output_path via os.replace()
|
|
||||||
# or never created; remove_file_quietly is a no-op if it doesn't exist.
|
|
||||||
remove_file_quietly(temp_output)
|
remove_file_quietly(temp_output)
|
||||||
|
|
||||||
|
|
||||||
@@ -2928,6 +2834,7 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
|||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Sync Logic ---
|
||||||
def sync_feeds(args):
|
def sync_feeds(args):
|
||||||
logging.info("🔄 Starting sync cycle...")
|
logging.info("🔄 Starting sync cycle...")
|
||||||
|
|
||||||
@@ -2935,14 +2842,10 @@ def sync_feeds(args):
|
|||||||
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
logging.info(
|
logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
|
||||||
"🧪 DRY RUN MODE — no posts will be created on Bluesky."
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
state = load_state(STATE_PATH)
|
state = load_state(STATE_PATH)
|
||||||
# FIX #8 — prune on load so the state file never grows unbounded
|
|
||||||
# between runs, not only after individual posts.
|
|
||||||
state = prune_state(state, max_entries=5000)
|
state = prune_state(state, max_entries=5000)
|
||||||
|
|
||||||
tweets = scrape_tweets_via_playwright(
|
tweets = scrape_tweets_via_playwright(
|
||||||
@@ -2988,9 +2891,8 @@ def sync_feeds(args):
|
|||||||
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
|
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
|
||||||
|
|
||||||
candidate_tweets = []
|
candidate_tweets = []
|
||||||
|
|
||||||
# --- Cheap prefilter before expensive processing ---
|
|
||||||
cheap_candidates = []
|
cheap_candidates = []
|
||||||
|
|
||||||
for tweet in reversed(tweets):
|
for tweet in reversed(tweets):
|
||||||
try:
|
try:
|
||||||
tweet_time = arrow.get(tweet.created_on)
|
tweet_time = arrow.get(tweet.created_on)
|
||||||
@@ -3064,9 +2966,7 @@ def sync_feeds(args):
|
|||||||
if not is_tco_domain(
|
if not is_tco_domain(
|
||||||
raw_url
|
raw_url
|
||||||
) and not is_x_or_twitter_domain(raw_url):
|
) and not is_x_or_twitter_domain(raw_url):
|
||||||
canonical_non_x_urls.add(
|
canonical_non_x_urls.add(canonicalize_url(raw_url))
|
||||||
canonicalize_url(raw_url)
|
|
||||||
)
|
|
||||||
|
|
||||||
primary_non_x_url = None
|
primary_non_x_url = None
|
||||||
if resolved_primary_external_url:
|
if resolved_primary_external_url:
|
||||||
@@ -3174,7 +3074,6 @@ def sync_feeds(args):
|
|||||||
if os.path.exists(browser_state_file):
|
if os.path.exists(browser_state_file):
|
||||||
context_kwargs["storage_state"] = browser_state_file
|
context_kwargs["storage_state"] = browser_state_file
|
||||||
|
|
||||||
# FIX #1 — renamed from 'context' to 'browser_context'
|
|
||||||
browser_context = browser.new_context(**context_kwargs)
|
browser_context = browser.new_context(**context_kwargs)
|
||||||
|
|
||||||
for candidate in candidate_tweets:
|
for candidate in candidate_tweets:
|
||||||
@@ -3219,10 +3118,6 @@ def sync_feeds(args):
|
|||||||
new_posts += 1
|
new_posts += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# FIX #5 — fetch link metadata once here so we can pass the
|
|
||||||
# OG title to build_dynamic_alt AND reuse it inside
|
|
||||||
# build_external_link_embed, avoiding a duplicate HTTP request
|
|
||||||
# for the same URL.
|
|
||||||
link_meta_for_alt: dict = {}
|
link_meta_for_alt: dict = {}
|
||||||
if candidate.get("resolved_primary_external_url"):
|
if candidate.get("resolved_primary_external_url"):
|
||||||
try:
|
try:
|
||||||
@@ -3284,11 +3179,9 @@ def sync_feeds(args):
|
|||||||
f"video:resolve_failed:{tweet.tweet_url}"
|
f"video:resolve_failed:{tweet.tweet_url}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
cropped_video_path = (
|
cropped_video_path = download_and_crop_video(
|
||||||
download_and_crop_video(
|
|
||||||
real_video_url, temp_video_path
|
real_video_url, temp_video_path
|
||||||
)
|
)
|
||||||
)
|
|
||||||
if not cropped_video_path:
|
if not cropped_video_path:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⚠️ Video download/crop failed for "
|
f"⚠️ Video download/crop failed for "
|
||||||
@@ -3375,8 +3268,6 @@ def sync_feeds(args):
|
|||||||
f"external card: {candidate_url}"
|
f"external card: {candidate_url}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# FIX #5 — pass the already-fetched metadata so
|
|
||||||
# build_external_link_embed skips a duplicate HTTP fetch.
|
|
||||||
external_embed = build_external_link_embed(
|
external_embed = build_external_link_embed(
|
||||||
candidate_url,
|
candidate_url,
|
||||||
bsky_client,
|
bsky_client,
|
||||||
@@ -3489,24 +3380,19 @@ def sync_feeds(args):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"❌ Error during sync cycle: {e}")
|
logging.error(f"❌ Error during sync cycle: {e}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync")
|
||||||
description="Twitter to Bluesky Sync"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--twitter-username",
|
"--twitter-username",
|
||||||
help="Your Twitter login username",
|
help="Your Twitter login username",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--twitter-password",
|
"--twitter-password",
|
||||||
|
# NOTE (FIX #15): passwords passed via CLI are visible in `ps aux`.
|
||||||
|
# Prefer setting TWITTER_PASSWORD in your .env file instead.
|
||||||
help="Your Twitter login password",
|
help="Your Twitter login password",
|
||||||
# FIX #15 — password args are still supported for compatibility but
|
|
||||||
# the .env file is the recommended path; passwords passed via CLI
|
|
||||||
# are visible in `ps aux`. Consider removing these args and requiring
|
|
||||||
# env vars exclusively, or prompting with getpass for interactive use.
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--twitter-email",
|
"--twitter-email",
|
||||||
@@ -3522,8 +3408,9 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bsky-password",
|
"--bsky-password",
|
||||||
|
# NOTE (FIX #15): same warning as --twitter-password above.
|
||||||
|
# Prefer setting BSKY_APP_PASSWORD in your .env file instead.
|
||||||
help="Your Bluesky app password",
|
help="Your Bluesky app password",
|
||||||
# FIX #15 — same note as --twitter-password above.
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bsky-base-url",
|
"--bsky-base-url",
|
||||||
@@ -3547,8 +3434,8 @@ def main():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Resolve credentials: CLI args take priority, then env vars.
|
# Resolve credentials: CLI args take priority, then env vars.
|
||||||
# FIX #15 — document that env vars are the secure path; CLI args expose
|
# FIX #15 — env vars are the secure path; CLI args expose secrets in
|
||||||
# secrets in the process list. Operators should prefer .env / env vars.
|
# the process list. Operators should prefer .env / environment variables.
|
||||||
args.twitter_username = args.twitter_username or os.getenv(
|
args.twitter_username = args.twitter_username or os.getenv(
|
||||||
"TWITTER_USERNAME"
|
"TWITTER_USERNAME"
|
||||||
)
|
)
|
||||||
@@ -3620,3 +3507,4 @@ def main():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user