From 35ba7a7a5e3858720275abfa75bc3de0e4b676e3 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Mon, 13 Apr 2026 17:38:40 +0000 Subject: [PATCH] Refactor using Claude 4.6 Opus --- twitter2bsky_daemon.py | 285 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 253 insertions(+), 32 deletions(-) diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index 23a096f..eefc880 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -26,6 +26,7 @@ SCRAPE_TWEET_LIMIT = 30 DEDUPE_BSKY_LIMIT = 30 TWEET_MAX_AGE_DAYS = 3 BSKY_TEXT_MAX_LENGTH = 275 +DEFAULT_BSKY_LANGS = ["ca"] VIDEO_MAX_DURATION_SECONDS = 179 MAX_VIDEO_UPLOAD_SIZE_MB = 45 @@ -44,10 +45,15 @@ BSKY_BLOB_UPLOAD_MAX_DELAY = 300 BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3 BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15 +BSKY_SEND_POST_MAX_RETRIES = 3 +BSKY_SEND_POST_BASE_DELAY = 5 +BSKY_SEND_POST_MAX_DELAY = 60 + MEDIA_DOWNLOAD_TIMEOUT = 30 LINK_METADATA_TIMEOUT = 10 URL_RESOLVE_TIMEOUT = 12 PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000 +SUBPROCESS_TIMEOUT_SECONDS = 180 DEFAULT_BSKY_BASE_URL = "https://bsky.social" # --- Logging Setup --- @@ -60,6 +66,13 @@ logging.basicConfig( # --- Per-run caches for efficiency --- OG_TITLE_CACHE = {} URL_RESOLUTION_CACHE = {} +URL_VALIDITY_CACHE = {} + + +def reset_caches(): + OG_TITLE_CACHE.clear() + URL_RESOLUTION_CACHE.clear() + URL_VALIDITY_CACHE.clear() # --- Custom Classes --- @@ -70,10 +83,12 @@ class ScrapedMedia: class ScrapedTweet: - def __init__(self, created_on, text, media_urls, tweet_url=None): + def __init__(self, created_on, text, media_urls, tweet_url=None, card_url=None, is_retweet=False): self.created_on = created_on self.text = text self.tweet_url = tweet_url + self.card_url = card_url + self.is_retweet = is_retweet self.media = [ScrapedMedia(url, media_type) for url, media_type in media_urls] @@ -87,11 +102,17 @@ def take_error_screenshot(page, error_msg): def is_valid_url(url): + if url in URL_VALIDITY_CACHE: + return URL_VALIDITY_CACHE[url] + try: response = httpx.head(url, timeout=5, follow_redirects=True) - return response.status_code < 500 + result = response.status_code < 500 except Exception: - return False + result = False + + URL_VALIDITY_CACHE[url] = result + return result def strip_trailing_url_punctuation(url): @@ -404,9 +425,6 @@ def extract_quoted_text_from_og_title(og_title): def should_fetch_og_title(tweet): - """ - Avoid fetching og:title unless it is likely to improve the text. - """ text = clean_post_text(tweet.text or "") urls = extract_urls_from_text(text) @@ -681,12 +699,40 @@ def extract_first_resolved_external_url(text, http_client, allow_playwright_fall return None +def resolve_card_url(card_url, http_client): + """ + Resolve a card URL (typically t.co) scraped from the tweet's link preview card. + Returns the final external URL or None. + """ + if not card_url: + return None + + cleaned = canonicalize_url(card_url.strip()) + if not cleaned: + return None + + if is_external_non_x_url(cleaned): + logging.info(f"๐Ÿ”— Card URL is already external: {cleaned}") + return cleaned + + if is_tco_domain(cleaned): + resolved = resolve_url_if_needed(cleaned, http_client, allow_playwright_fallback=True) + if resolved and is_external_non_x_url(resolved): + logging.info(f"๐Ÿ”— Resolved card t.co URL: {cleaned} -> {resolved}") + return resolved + + if is_x_or_twitter_domain(cleaned): + logging.info(f"โ„น๏ธ Card URL resolves to X/Twitter domain, ignoring: {cleaned}") + return None + + return cleaned + + def sanitize_visible_urls_in_text(text, http_client, has_media=False): """ - Faster logic: - remove x/twitter URLs from visible text - resolve t.co - - if a t.co resolves to x/twitter and tweet has media, do not use Playwright fallback + - if a t.co resolves to x/twitter and tweet has media, skip Playwright fallback """ if not text: return text, None @@ -816,6 +862,20 @@ def build_effective_tweet_text(tweet, http_client): ) candidate_text = clean_post_text(candidate_text) + # --- KEY FIX: also resolve the card_url scraped from the tweet's link preview --- + resolved_card_url = resolve_card_url(getattr(tweet, "card_url", None), http_client) + + if resolved_card_url and is_external_non_x_url(resolved_card_url): + if not resolved_primary_external_url: + resolved_primary_external_url = resolved_card_url + logging.info(f"๐Ÿ”— Using resolved card URL as primary external URL: {resolved_card_url}") + elif resolved_primary_external_url != resolved_card_url: + logging.info( + f"โ„น๏ธ Card URL ({resolved_card_url}) differs from text URL ({resolved_primary_external_url}). " + f"Preferring card URL for external embed." + ) + resolved_primary_external_url = resolved_card_url + if not resolved_primary_external_url: resolved_primary_external_url = extract_first_resolved_external_url( candidate_text, @@ -1279,8 +1339,6 @@ def get_recent_bsky_posts(client, handle, limit=30): if getattr(record, "reply", None) is not None: continue - # no-op - text = getattr(record, "text", "") or "" normalized_text = normalize_post_text(text) @@ -1333,7 +1391,7 @@ def get_rate_limit_wait_seconds(error_obj, default_delay): return default_delay -def is_transient_blob_error(error_obj): +def is_transient_error(error_obj): error_text = repr(error_obj) transient_signals = [ "InvokeTimeoutError", @@ -1383,7 +1441,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): ) break - if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES: + if is_transient_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES: transient_attempts += 1 wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts logging.warning( @@ -1408,6 +1466,54 @@ def upload_blob_with_retry(client, binary_data, media_label="media"): return None +def send_post_with_retry(client, **kwargs): + """ + Wrapper around client.send_post() with retry logic for transient errors + and rate limiting. + """ + last_exception = None + + for attempt in range(1, BSKY_SEND_POST_MAX_RETRIES + 1): + try: + return client.send_post(**kwargs) + + except Exception as e: + last_exception = e + error_text = str(e) + is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text + + if is_rate_limited: + backoff_delay = min( + BSKY_SEND_POST_BASE_DELAY * (2 ** (attempt - 1)), + BSKY_SEND_POST_MAX_DELAY + ) + wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay) + + if attempt < BSKY_SEND_POST_MAX_RETRIES: + logging.warning( + f"โณ Bluesky send_post rate-limited. " + f"Retry {attempt}/{BSKY_SEND_POST_MAX_RETRIES} after {wait_seconds}s." + ) + time.sleep(wait_seconds) + continue + else: + logging.error(f"โŒ Exhausted send_post retries after rate limiting: {repr(e)}") + raise + + if is_transient_error(e) and attempt < BSKY_SEND_POST_MAX_RETRIES: + wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt + logging.warning( + f"โณ Transient send_post failure: {repr(e)}. " + f"Retry {attempt}/{BSKY_SEND_POST_MAX_RETRIES} after {wait_seconds}s." + ) + time.sleep(wait_seconds) + continue + + raise + + raise last_exception + + def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES): try: with Image.open(io.BytesIO(image_bytes)) as img: @@ -1505,7 +1611,6 @@ def get_blob_from_url(media_url, client, http_client): logging.warning(f"Could not fetch media {media_url}: {repr(e)}") return None - def get_blob_from_file(file_path, client): try: if not os.path.exists(file_path): @@ -1891,6 +1996,18 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): if href: tweet_url = f"https://x.com{href}" if href.startswith("/") else href + # --- Retweet detection --- + is_retweet = False + try: + social_context = article.locator('[data-testid="socialContext"]').first + if social_context.is_visible(): + context_text = social_context.inner_text().lower() + if "reposted" in context_text or "retweeted" in context_text or "ha repostejat" in context_text or "ha retuitat" in context_text or "repostejat" in context_text: + is_retweet = True + logging.info(f"๐Ÿ” Detected retweet/repost: {tweet_url}") + except Exception: + pass + text_locator = article.locator('[data-testid="tweetText"]').first text = text_locator.inner_text() if text_locator.is_visible() else "" @@ -1907,7 +2024,38 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): if video_locators: media_urls.append((tweet_url or "", "video")) - tweets.append(ScrapedTweet(created_at, text, media_urls, tweet_url=tweet_url)) + # --- Card URL extraction (link preview card) --- + card_url = None + try: + card_locator = article.locator('[data-testid="card.wrapper"] a[href]').first + if card_locator.is_visible(): + card_href = card_locator.get_attribute("href") + if card_href: + card_url = card_href.strip() + logging.info(f"๐Ÿƒ Scraped card URL from tweet: {card_url}") + except Exception: + pass + + # Fallback: try to find card link via role="link" inside card wrapper + if not card_url: + try: + card_role_link = article.locator('[data-testid="card.wrapper"] [role="link"]').first + if card_role_link.is_visible(): + card_a = card_role_link.locator("a[href]").first + if card_a.is_visible(): + card_href = card_a.get_attribute("href") + if card_href: + card_url = card_href.strip() + logging.info(f"๐Ÿƒ Scraped card URL (fallback) from tweet: {card_url}") + except Exception: + pass + + tweets.append(ScrapedTweet( + created_at, text, media_urls, + tweet_url=tweet_url, + card_url=card_url, + is_retweet=is_retweet, + )) except Exception as e: logging.warning(f"โš ๏ธ Failed to parse a specific tweet: {e}") @@ -2071,7 +2219,10 @@ def download_and_crop_video(video_url, output_path): temp_input, ] - download_result = subprocess.run(download_cmd, capture_output=True, text=True) + download_result = subprocess.run( + download_cmd, capture_output=True, text=True, + timeout=SUBPROCESS_TIMEOUT_SECONDS + ) if download_result.returncode != 0: logging.error(f"โŒ ffmpeg download failed:\n{download_result.stderr}") @@ -2134,7 +2285,10 @@ def download_and_crop_video(video_url, output_path): temp_output, ] - compress_result = subprocess.run(compress_cmd, capture_output=True, text=True) + compress_result = subprocess.run( + compress_cmd, capture_output=True, text=True, + timeout=SUBPROCESS_TIMEOUT_SECONDS + ) if compress_result.returncode != 0: logging.error(f"โŒ ffmpeg compression failed:\n{compress_result.stderr}") @@ -2151,6 +2305,10 @@ def download_and_crop_video(video_url, output_path): logging.info(f"โœ… Final video ready: {output_path}") return output_path + except subprocess.TimeoutExpired: + logging.error(f"โŒ ffmpeg subprocess timed out after {SUBPROCESS_TIMEOUT_SECONDS}s") + return None + except Exception as e: logging.error(f"โŒ Error processing video: {repr(e)}") return None @@ -2187,6 +2345,13 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts): def sync_feeds(args): logging.info("๐Ÿ”„ Starting sync cycle...") + + dry_run = getattr(args, "dry_run", False) + bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS + + if dry_run: + logging.info("๐Ÿงช DRY RUN MODE โ€” no posts will be created on Bluesky.") + try: state = load_state(STATE_PATH) @@ -2201,17 +2366,21 @@ def sync_feeds(args): logging.warning("โš ๏ธ No tweets found or failed to fetch. Skipping Bluesky sync for this cycle.") return - bsky_client = create_bsky_client( - args.bsky_base_url, - args.bsky_handle, - args.bsky_password - ) + bsky_client = None + if not dry_run: + bsky_client = create_bsky_client( + args.bsky_base_url, + args.bsky_handle, + args.bsky_password + ) - recent_bsky_posts = get_recent_bsky_posts( - bsky_client, - args.bsky_handle, - limit=DEDUPE_BSKY_LIMIT - ) + recent_bsky_posts = [] + if not dry_run: + recent_bsky_posts = get_recent_bsky_posts( + bsky_client, + args.bsky_handle, + limit=DEDUPE_BSKY_LIMIT + ) logging.info(f"๐Ÿง  Loaded {len(recent_bsky_posts)} recent Bluesky posts for duplicate detection.") logging.info(f"๐Ÿง  Local state currently tracks {len(state.get('posted_tweets', {}))} posted items.") @@ -2231,6 +2400,11 @@ def sync_feeds(args): logging.info(f"โญ๏ธ Skipping old tweet from {tweet_time}") continue + # --- Retweet filtering --- + if tweet.is_retweet: + logging.info(f"โญ๏ธ Skipping retweet/repost: {tweet.tweet_url}") + continue + canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url) if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}): logging.info(f"โšก Early skip due to known tweet URL in local state: {canonical_tweet_url}") @@ -2354,7 +2528,20 @@ def sync_feeds(args): raw_text = candidate["raw_text"] full_clean_text = candidate["full_clean_text"] - logging.info(f"๐Ÿ“ Posting missing tweet from {tweet_time} to Bluesky...") + logging.info(f"๐Ÿ“ {'[DRY RUN] Would post' if dry_run else 'Posting'} missing tweet from {tweet_time} to Bluesky...") + + if dry_run: + logging.info(f" ๐Ÿ“„ Text: {raw_text[:200]}{'...' if len(raw_text) > 200 else ''}") + logging.info(f" ๐Ÿ”— Primary external URL: {candidate.get('resolved_primary_external_url', 'None')}") + logging.info(f" ๐Ÿƒ Card URL: {getattr(tweet, 'card_url', 'None')}") + logging.info(f" ๐ŸŽฌ Has video: {candidate.get('has_video', False)}") + logging.info(f" ๐Ÿ–ผ๏ธ Has photo: {candidate.get('has_photo', False)}") + logging.info(f" ๐Ÿ” Is retweet: {getattr(tweet, 'is_retweet', False)}") + + remember_posted_tweet(state, candidate, bsky_uri=f"dry-run:{arrow.utcnow().isoformat()}") + save_state(state, STATE_PATH) + new_posts += 1 + continue rich_text = make_rich(raw_text) dynamic_alt = build_dynamic_alt(full_clean_text) @@ -2423,6 +2610,7 @@ def sync_feeds(args): else: media_upload_failures.append(f"photo:{media.media_url_https}") + # --- External link card logic (KEY FIX for t.co card URLs) --- if not video_embed and not image_embeds: candidate_url = candidate.get("resolved_primary_external_url") @@ -2449,17 +2637,25 @@ def sync_feeds(args): post_mode = "text" if video_embed: - post_result = bsky_client.send_post(text=rich_text, embed=video_embed, langs=["ca"]) + post_result = send_post_with_retry( + bsky_client, text=rich_text, embed=video_embed, langs=bsky_langs + ) post_mode = "video" elif image_embeds: embed = models.AppBskyEmbedImages.Main(images=image_embeds) - post_result = bsky_client.send_post(text=rich_text, embed=embed, langs=["ca"]) + post_result = send_post_with_retry( + bsky_client, text=rich_text, embed=embed, langs=bsky_langs + ) post_mode = f"images:{len(image_embeds)}" elif external_embed: - post_result = bsky_client.send_post(text=rich_text, embed=external_embed, langs=["ca"]) + post_result = send_post_with_retry( + bsky_client, text=rich_text, embed=external_embed, langs=bsky_langs + ) post_mode = "external_link_card" else: - post_result = bsky_client.send_post(text=rich_text, langs=["ca"]) + post_result = send_post_with_retry( + bsky_client, text=rich_text, langs=bsky_langs + ) post_mode = "text_only" bsky_uri = getattr(post_result, "uri", None) @@ -2513,6 +2709,17 @@ def main(): parser.add_argument("--bsky-handle", help="Your Bluesky handle") parser.add_argument("--bsky-password", help="Your Bluesky app password") parser.add_argument("--bsky-base-url", help="Bluesky/ATProto PDS base URL, e.g. https://eurosky.social") + parser.add_argument( + "--bsky-langs", + help="Comma-separated language codes for Bluesky posts (default: ca)", + default=None, + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="Simulate sync without posting to Bluesky. Logs what would be posted.", + ) args = parser.parse_args() @@ -2524,6 +2731,15 @@ def main(): args.twitter_handle = args.twitter_handle or os.getenv("TWITTER_HANDLE") or args.twitter_username args.bsky_base_url = args.bsky_base_url if args.bsky_base_url else DEFAULT_BSKY_BASE_URL + # --- Language handling: CLI > env > default (Catalan) --- + raw_langs = args.bsky_langs or os.getenv("BSKY_LANGS") + if raw_langs: + args.bsky_langs = [lang.strip() for lang in raw_langs.split(",") if lang.strip()] + logging.info(f"๐ŸŒ Using configured Bluesky languages: {args.bsky_langs}") + else: + args.bsky_langs = DEFAULT_BSKY_LANGS + logging.info(f"๐ŸŒ Using default Bluesky languages: {args.bsky_langs}") + missing_args = [] if not args.twitter_username: missing_args.append("--twitter-username") @@ -2540,9 +2756,14 @@ def main(): logging.info(f"๐Ÿค– Bot started. Will check @{args.twitter_handle}") logging.info(f"๐ŸŒ Posting destination base URL: {args.bsky_base_url}") + + if args.dry_run: + logging.info("๐Ÿงช DRY RUN MODE ENABLED โ€” no posts will be created.") + + reset_caches() sync_feeds(args) logging.info("๐Ÿค– Bot finished.") if __name__ == "__main__": - main() \ No newline at end of file + main()