Refactor using Claude 4.6 Opus

This commit is contained in:
2026-04-13 17:38:40 +00:00
parent 351eec4840
commit 35ba7a7a5e

View File

@@ -26,6 +26,7 @@ SCRAPE_TWEET_LIMIT = 30
DEDUPE_BSKY_LIMIT = 30 DEDUPE_BSKY_LIMIT = 30
TWEET_MAX_AGE_DAYS = 3 TWEET_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 275 BSKY_TEXT_MAX_LENGTH = 275
DEFAULT_BSKY_LANGS = ["ca"]
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -44,10 +45,15 @@ BSKY_BLOB_UPLOAD_MAX_DELAY = 300
BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3 BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
BSKY_SEND_POST_MAX_RETRIES = 3
BSKY_SEND_POST_BASE_DELAY = 5
BSKY_SEND_POST_MAX_DELAY = 60
MEDIA_DOWNLOAD_TIMEOUT = 30 MEDIA_DOWNLOAD_TIMEOUT = 30
LINK_METADATA_TIMEOUT = 10 LINK_METADATA_TIMEOUT = 10
URL_RESOLVE_TIMEOUT = 12 URL_RESOLVE_TIMEOUT = 12
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000 PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
SUBPROCESS_TIMEOUT_SECONDS = 180
DEFAULT_BSKY_BASE_URL = "https://bsky.social" DEFAULT_BSKY_BASE_URL = "https://bsky.social"
# --- Logging Setup --- # --- Logging Setup ---
@@ -60,6 +66,13 @@ logging.basicConfig(
# --- Per-run caches for efficiency --- # --- Per-run caches for efficiency ---
OG_TITLE_CACHE = {} OG_TITLE_CACHE = {}
URL_RESOLUTION_CACHE = {} URL_RESOLUTION_CACHE = {}
URL_VALIDITY_CACHE = {}
def reset_caches():
    """Empty every per-run cache in place.

    Clears the module-level ``OG_TITLE_CACHE``, ``URL_RESOLUTION_CACHE`` and
    ``URL_VALIDITY_CACHE`` dictionaries without rebinding them, so other
    references to the same dict objects see the cleared state.
    """
    for cache in (OG_TITLE_CACHE, URL_RESOLUTION_CACHE, URL_VALIDITY_CACHE):
        cache.clear()
# --- Custom Classes --- # --- Custom Classes ---
@@ -70,10 +83,12 @@ class ScrapedMedia:
class ScrapedTweet: class ScrapedTweet:
def __init__(self, created_on, text, media_urls, tweet_url=None): def __init__(self, created_on, text, media_urls, tweet_url=None, card_url=None, is_retweet=False):
self.created_on = created_on self.created_on = created_on
self.text = text self.text = text
self.tweet_url = tweet_url self.tweet_url = tweet_url
self.card_url = card_url
self.is_retweet = is_retweet
self.media = [ScrapedMedia(url, media_type) for url, media_type in media_urls] self.media = [ScrapedMedia(url, media_type) for url, media_type in media_urls]
@@ -87,11 +102,17 @@ def take_error_screenshot(page, error_msg):
def is_valid_url(url):
    """Return whether *url* answers a HEAD request with a status below 500.

    The verdict is memoized in ``URL_VALIDITY_CACHE``. Any exception raised
    while performing the request counts as "not valid" and that ``False``
    result is cached as well.
    """
    cached = URL_VALIDITY_CACHE.get(url)
    if cached is not None:
        # Cached values are always booleans, so None means "not seen yet".
        return cached

    try:
        is_valid = httpx.head(url, timeout=5, follow_redirects=True).status_code < 500
    except Exception:
        is_valid = False

    URL_VALIDITY_CACHE[url] = is_valid
    return is_valid
def strip_trailing_url_punctuation(url): def strip_trailing_url_punctuation(url):
@@ -404,9 +425,6 @@ def extract_quoted_text_from_og_title(og_title):
def should_fetch_og_title(tweet): def should_fetch_og_title(tweet):
"""
Avoid fetching og:title unless it is likely to improve the text.
"""
text = clean_post_text(tweet.text or "") text = clean_post_text(tweet.text or "")
urls = extract_urls_from_text(text) urls = extract_urls_from_text(text)
@@ -681,12 +699,40 @@ def extract_first_resolved_external_url(text, http_client, allow_playwright_fall
return None return None
def resolve_card_url(card_url, http_client):
    """
    Turn the link scraped from a tweet's preview card into a usable URL.

    The input is stripped and canonicalized, then handled in this order:
    - already an external (non-X) URL: returned as-is;
    - a t.co link: resolved (Playwright fallback allowed) and the result is
      returned when that result is an external (non-X) URL;
    - canonicalized URL on an X/Twitter domain: ignored, returns None.

    A URL that matches none of the above is returned in canonicalized form.
    Returns None for an empty input or when canonicalization yields nothing.
    """
    canonical = canonicalize_url(card_url.strip()) if card_url else None
    if not canonical:
        return None

    if is_external_non_x_url(canonical):
        logging.info(f"🔗 Card URL is already external: {canonical}")
        return canonical

    if is_tco_domain(canonical):
        target = resolve_url_if_needed(canonical, http_client, allow_playwright_fallback=True)
        if target and is_external_non_x_url(target):
            logging.info(f"🔗 Resolved card t.co URL: {canonical} -> {target}")
            return target
        # An unresolved t.co link falls through to the checks below.

    if not is_x_or_twitter_domain(canonical):
        return canonical

    logging.info(f" Card URL resolves to X/Twitter domain, ignoring: {canonical}")
    return None
def sanitize_visible_urls_in_text(text, http_client, has_media=False): def sanitize_visible_urls_in_text(text, http_client, has_media=False):
""" """
Faster logic:
- remove x/twitter URLs from visible text - remove x/twitter URLs from visible text
- resolve t.co - resolve t.co
- if a t.co resolves to x/twitter and tweet has media, do not use Playwright fallback - if a t.co resolves to x/twitter and tweet has media, skip Playwright fallback
""" """
if not text: if not text:
return text, None return text, None
@@ -816,6 +862,20 @@ def build_effective_tweet_text(tweet, http_client):
) )
candidate_text = clean_post_text(candidate_text) candidate_text = clean_post_text(candidate_text)
# --- KEY FIX: also resolve the card_url scraped from the tweet's link preview ---
resolved_card_url = resolve_card_url(getattr(tweet, "card_url", None), http_client)
if resolved_card_url and is_external_non_x_url(resolved_card_url):
if not resolved_primary_external_url:
resolved_primary_external_url = resolved_card_url
logging.info(f"🔗 Using resolved card URL as primary external URL: {resolved_card_url}")
elif resolved_primary_external_url != resolved_card_url:
logging.info(
f" Card URL ({resolved_card_url}) differs from text URL ({resolved_primary_external_url}). "
f"Preferring card URL for external embed."
)
resolved_primary_external_url = resolved_card_url
if not resolved_primary_external_url: if not resolved_primary_external_url:
resolved_primary_external_url = extract_first_resolved_external_url( resolved_primary_external_url = extract_first_resolved_external_url(
candidate_text, candidate_text,
@@ -1279,8 +1339,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
if getattr(record, "reply", None) is not None: if getattr(record, "reply", None) is not None:
continue continue
# no-op
text = getattr(record, "text", "") or "" text = getattr(record, "text", "") or ""
normalized_text = normalize_post_text(text) normalized_text = normalize_post_text(text)
@@ -1333,7 +1391,7 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
return default_delay return default_delay
def is_transient_blob_error(error_obj): def is_transient_error(error_obj):
error_text = repr(error_obj) error_text = repr(error_obj)
transient_signals = [ transient_signals = [
"InvokeTimeoutError", "InvokeTimeoutError",
@@ -1383,7 +1441,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
) )
break break
if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES: if is_transient_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES:
transient_attempts += 1 transient_attempts += 1
wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
logging.warning( logging.warning(
@@ -1408,6 +1466,54 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
return None return None
def send_post_with_retry(client, **kwargs):
    """
    Call ``client.send_post(**kwargs)`` with retries for rate limiting and
    transient errors.

    Args:
        client: object exposing ``send_post``; all keyword arguments are
            forwarded to it unchanged.
        **kwargs: passed straight through to ``client.send_post``.

    Returns:
        Whatever ``client.send_post`` returns on the first successful attempt.

    Raises:
        The exception from ``client.send_post`` when it is neither
        rate-limited nor transient, or when the attempts are exhausted.

    Retry policy:
        - Rate limiting (``"429"`` or ``"RateLimitExceeded"`` in ``str(e)``):
          wait for ``get_rate_limit_wait_seconds(e, backoff)`` where
          ``backoff`` is exponential from ``BSKY_SEND_POST_BASE_DELAY`` and
          capped at ``BSKY_SEND_POST_MAX_DELAY``.
        - Transient errors (per ``is_transient_error``): linear wait of
          ``BSKY_SEND_POST_BASE_DELAY * attempt``.
    """
    # Always try at least once: with a limit below 1 the loop would be
    # skipped, the post never sent, and the old trailing `raise None`
    # would surface as an unrelated TypeError.
    max_attempts = max(1, BSKY_SEND_POST_MAX_RETRIES)

    for attempt in range(1, max_attempts + 1):
        try:
            return client.send_post(**kwargs)
        except Exception as e:
            can_retry = attempt < max_attempts
            error_text = str(e)
            is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text

            if is_rate_limited:
                if not can_retry:
                    logging.error(f"❌ Exhausted send_post retries after rate limiting: {repr(e)}")
                    raise

                backoff_delay = min(
                    BSKY_SEND_POST_BASE_DELAY * (2 ** (attempt - 1)),
                    BSKY_SEND_POST_MAX_DELAY,
                )
                # The helper is given the computed backoff as its default delay.
                wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay)
                logging.warning(
                    f"⏳ Bluesky send_post rate-limited. "
                    f"Retry {attempt}/{max_attempts} after {wait_seconds}s."
                )
                time.sleep(wait_seconds)
                continue

            if can_retry and is_transient_error(e):
                wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
                logging.warning(
                    f"⏳ Transient send_post failure: {repr(e)}. "
                    f"Retry {attempt}/{max_attempts} after {wait_seconds}s."
                )
                time.sleep(wait_seconds)
                continue

            # Non-retryable error, or the final attempt failed: propagate as-is.
            raise

    # Not reachable: the final attempt always returns or re-raises above.
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
try: try:
with Image.open(io.BytesIO(image_bytes)) as img: with Image.open(io.BytesIO(image_bytes)) as img:
@@ -1505,7 +1611,6 @@ def get_blob_from_url(media_url, client, http_client):
logging.warning(f"Could not fetch media {media_url}: {repr(e)}") logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
return None return None
def get_blob_from_file(file_path, client): def get_blob_from_file(file_path, client):
try: try:
if not os.path.exists(file_path): if not os.path.exists(file_path):
@@ -1891,6 +1996,18 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
if href: if href:
tweet_url = f"https://x.com{href}" if href.startswith("/") else href tweet_url = f"https://x.com{href}" if href.startswith("/") else href
# --- Retweet detection ---
is_retweet = False
try:
social_context = article.locator('[data-testid="socialContext"]').first
if social_context.is_visible():
context_text = social_context.inner_text().lower()
if "reposted" in context_text or "retweeted" in context_text or "ha repostejat" in context_text or "ha retuitat" in context_text or "repostejat" in context_text:
is_retweet = True
logging.info(f"🔁 Detected retweet/repost: {tweet_url}")
except Exception:
pass
text_locator = article.locator('[data-testid="tweetText"]').first text_locator = article.locator('[data-testid="tweetText"]').first
text = text_locator.inner_text() if text_locator.is_visible() else "" text = text_locator.inner_text() if text_locator.is_visible() else ""
@@ -1907,7 +2024,38 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
if video_locators: if video_locators:
media_urls.append((tweet_url or "", "video")) media_urls.append((tweet_url or "", "video"))
tweets.append(ScrapedTweet(created_at, text, media_urls, tweet_url=tweet_url)) # --- Card URL extraction (link preview card) ---
card_url = None
try:
card_locator = article.locator('[data-testid="card.wrapper"] a[href]').first
if card_locator.is_visible():
card_href = card_locator.get_attribute("href")
if card_href:
card_url = card_href.strip()
logging.info(f"🃏 Scraped card URL from tweet: {card_url}")
except Exception:
pass
# Fallback: try to find card link via role="link" inside card wrapper
if not card_url:
try:
card_role_link = article.locator('[data-testid="card.wrapper"] [role="link"]').first
if card_role_link.is_visible():
card_a = card_role_link.locator("a[href]").first
if card_a.is_visible():
card_href = card_a.get_attribute("href")
if card_href:
card_url = card_href.strip()
logging.info(f"🃏 Scraped card URL (fallback) from tweet: {card_url}")
except Exception:
pass
tweets.append(ScrapedTweet(
created_at, text, media_urls,
tweet_url=tweet_url,
card_url=card_url,
is_retweet=is_retweet,
))
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Failed to parse a specific tweet: {e}") logging.warning(f"⚠️ Failed to parse a specific tweet: {e}")
@@ -2071,7 +2219,10 @@ def download_and_crop_video(video_url, output_path):
temp_input, temp_input,
] ]
download_result = subprocess.run(download_cmd, capture_output=True, text=True) download_result = subprocess.run(
download_cmd, capture_output=True, text=True,
timeout=SUBPROCESS_TIMEOUT_SECONDS
)
if download_result.returncode != 0: if download_result.returncode != 0:
logging.error(f"❌ ffmpeg download failed:\n{download_result.stderr}") logging.error(f"❌ ffmpeg download failed:\n{download_result.stderr}")
@@ -2134,7 +2285,10 @@ def download_and_crop_video(video_url, output_path):
temp_output, temp_output,
] ]
compress_result = subprocess.run(compress_cmd, capture_output=True, text=True) compress_result = subprocess.run(
compress_cmd, capture_output=True, text=True,
timeout=SUBPROCESS_TIMEOUT_SECONDS
)
if compress_result.returncode != 0: if compress_result.returncode != 0:
logging.error(f"❌ ffmpeg compression failed:\n{compress_result.stderr}") logging.error(f"❌ ffmpeg compression failed:\n{compress_result.stderr}")
@@ -2151,6 +2305,10 @@ def download_and_crop_video(video_url, output_path):
logging.info(f"✅ Final video ready: {output_path}") logging.info(f"✅ Final video ready: {output_path}")
return output_path return output_path
except subprocess.TimeoutExpired:
logging.error(f"❌ ffmpeg subprocess timed out after {SUBPROCESS_TIMEOUT_SECONDS}s")
return None
except Exception as e: except Exception as e:
logging.error(f"❌ Error processing video: {repr(e)}") logging.error(f"❌ Error processing video: {repr(e)}")
return None return None
@@ -2187,6 +2345,13 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
def sync_feeds(args): def sync_feeds(args):
logging.info("🔄 Starting sync cycle...") logging.info("🔄 Starting sync cycle...")
dry_run = getattr(args, "dry_run", False)
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
if dry_run:
logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
try: try:
state = load_state(STATE_PATH) state = load_state(STATE_PATH)
@@ -2201,17 +2366,21 @@ def sync_feeds(args):
logging.warning("⚠️ No tweets found or failed to fetch. Skipping Bluesky sync for this cycle.") logging.warning("⚠️ No tweets found or failed to fetch. Skipping Bluesky sync for this cycle.")
return return
bsky_client = create_bsky_client( bsky_client = None
args.bsky_base_url, if not dry_run:
args.bsky_handle, bsky_client = create_bsky_client(
args.bsky_password args.bsky_base_url,
) args.bsky_handle,
args.bsky_password
)
recent_bsky_posts = get_recent_bsky_posts( recent_bsky_posts = []
bsky_client, if not dry_run:
args.bsky_handle, recent_bsky_posts = get_recent_bsky_posts(
limit=DEDUPE_BSKY_LIMIT bsky_client,
) args.bsky_handle,
limit=DEDUPE_BSKY_LIMIT
)
logging.info(f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for duplicate detection.") logging.info(f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for duplicate detection.")
logging.info(f"🧠 Local state currently tracks {len(state.get('posted_tweets', {}))} posted items.") logging.info(f"🧠 Local state currently tracks {len(state.get('posted_tweets', {}))} posted items.")
@@ -2231,6 +2400,11 @@ def sync_feeds(args):
logging.info(f"⏭️ Skipping old tweet from {tweet_time}") logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
continue continue
# --- Retweet filtering ---
if tweet.is_retweet:
logging.info(f"⏭️ Skipping retweet/repost: {tweet.tweet_url}")
continue
canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url) canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url)
if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}): if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}):
logging.info(f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}") logging.info(f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}")
@@ -2354,7 +2528,20 @@ def sync_feeds(args):
raw_text = candidate["raw_text"] raw_text = candidate["raw_text"]
full_clean_text = candidate["full_clean_text"] full_clean_text = candidate["full_clean_text"]
logging.info(f"📝 Posting missing tweet from {tweet_time} to Bluesky...") logging.info(f"📝 {'[DRY RUN] Would post' if dry_run else 'Posting'} missing tweet from {tweet_time} to Bluesky...")
if dry_run:
logging.info(f" 📄 Text: {raw_text[:200]}{'...' if len(raw_text) > 200 else ''}")
logging.info(f" 🔗 Primary external URL: {candidate.get('resolved_primary_external_url', 'None')}")
logging.info(f" 🃏 Card URL: {getattr(tweet, 'card_url', 'None')}")
logging.info(f" 🎬 Has video: {candidate.get('has_video', False)}")
logging.info(f" 🖼️ Has photo: {candidate.get('has_photo', False)}")
logging.info(f" 🔁 Is retweet: {getattr(tweet, 'is_retweet', False)}")
remember_posted_tweet(state, candidate, bsky_uri=f"dry-run:{arrow.utcnow().isoformat()}")
save_state(state, STATE_PATH)
new_posts += 1
continue
rich_text = make_rich(raw_text) rich_text = make_rich(raw_text)
dynamic_alt = build_dynamic_alt(full_clean_text) dynamic_alt = build_dynamic_alt(full_clean_text)
@@ -2423,6 +2610,7 @@ def sync_feeds(args):
else: else:
media_upload_failures.append(f"photo:{media.media_url_https}") media_upload_failures.append(f"photo:{media.media_url_https}")
# --- External link card logic (KEY FIX for t.co card URLs) ---
if not video_embed and not image_embeds: if not video_embed and not image_embeds:
candidate_url = candidate.get("resolved_primary_external_url") candidate_url = candidate.get("resolved_primary_external_url")
@@ -2449,17 +2637,25 @@ def sync_feeds(args):
post_mode = "text" post_mode = "text"
if video_embed: if video_embed:
post_result = bsky_client.send_post(text=rich_text, embed=video_embed, langs=["ca"]) post_result = send_post_with_retry(
bsky_client, text=rich_text, embed=video_embed, langs=bsky_langs
)
post_mode = "video" post_mode = "video"
elif image_embeds: elif image_embeds:
embed = models.AppBskyEmbedImages.Main(images=image_embeds) embed = models.AppBskyEmbedImages.Main(images=image_embeds)
post_result = bsky_client.send_post(text=rich_text, embed=embed, langs=["ca"]) post_result = send_post_with_retry(
bsky_client, text=rich_text, embed=embed, langs=bsky_langs
)
post_mode = f"images:{len(image_embeds)}" post_mode = f"images:{len(image_embeds)}"
elif external_embed: elif external_embed:
post_result = bsky_client.send_post(text=rich_text, embed=external_embed, langs=["ca"]) post_result = send_post_with_retry(
bsky_client, text=rich_text, embed=external_embed, langs=bsky_langs
)
post_mode = "external_link_card" post_mode = "external_link_card"
else: else:
post_result = bsky_client.send_post(text=rich_text, langs=["ca"]) post_result = send_post_with_retry(
bsky_client, text=rich_text, langs=bsky_langs
)
post_mode = "text_only" post_mode = "text_only"
bsky_uri = getattr(post_result, "uri", None) bsky_uri = getattr(post_result, "uri", None)
@@ -2513,6 +2709,17 @@ def main():
parser.add_argument("--bsky-handle", help="Your Bluesky handle") parser.add_argument("--bsky-handle", help="Your Bluesky handle")
parser.add_argument("--bsky-password", help="Your Bluesky app password") parser.add_argument("--bsky-password", help="Your Bluesky app password")
parser.add_argument("--bsky-base-url", help="Bluesky/ATProto PDS base URL, e.g. https://eurosky.social") parser.add_argument("--bsky-base-url", help="Bluesky/ATProto PDS base URL, e.g. https://eurosky.social")
parser.add_argument(
"--bsky-langs",
help="Comma-separated language codes for Bluesky posts (default: ca)",
default=None,
)
parser.add_argument(
"--dry-run",
action="store_true",
default=False,
help="Simulate sync without posting to Bluesky. Logs what would be posted.",
)
args = parser.parse_args() args = parser.parse_args()
@@ -2524,6 +2731,15 @@ def main():
args.twitter_handle = args.twitter_handle or os.getenv("TWITTER_HANDLE") or args.twitter_username args.twitter_handle = args.twitter_handle or os.getenv("TWITTER_HANDLE") or args.twitter_username
args.bsky_base_url = args.bsky_base_url if args.bsky_base_url else DEFAULT_BSKY_BASE_URL args.bsky_base_url = args.bsky_base_url if args.bsky_base_url else DEFAULT_BSKY_BASE_URL
# --- Language handling: CLI > env > default (Catalan) ---
raw_langs = args.bsky_langs or os.getenv("BSKY_LANGS")
if raw_langs:
args.bsky_langs = [lang.strip() for lang in raw_langs.split(",") if lang.strip()]
logging.info(f"🌍 Using configured Bluesky languages: {args.bsky_langs}")
else:
args.bsky_langs = DEFAULT_BSKY_LANGS
logging.info(f"🌍 Using default Bluesky languages: {args.bsky_langs}")
missing_args = [] missing_args = []
if not args.twitter_username: if not args.twitter_username:
missing_args.append("--twitter-username") missing_args.append("--twitter-username")
@@ -2540,6 +2756,11 @@ def main():
logging.info(f"🤖 Bot started. Will check @{args.twitter_handle}") logging.info(f"🤖 Bot started. Will check @{args.twitter_handle}")
logging.info(f"🌍 Posting destination base URL: {args.bsky_base_url}") logging.info(f"🌍 Posting destination base URL: {args.bsky_base_url}")
if args.dry_run:
logging.info("🧪 DRY RUN MODE ENABLED — no posts will be created.")
reset_caches()
sync_feeds(args) sync_feeds(args)
logging.info("🤖 Bot finished.") logging.info("🤖 Bot finished.")