Refactor using Claude 4.6 Opus
This commit is contained in:
@@ -26,6 +26,7 @@ SCRAPE_TWEET_LIMIT = 30
|
||||
DEDUPE_BSKY_LIMIT = 30
|
||||
TWEET_MAX_AGE_DAYS = 3
|
||||
BSKY_TEXT_MAX_LENGTH = 275
|
||||
DEFAULT_BSKY_LANGS = ["ca"]
|
||||
|
||||
VIDEO_MAX_DURATION_SECONDS = 179
|
||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||
@@ -44,10 +45,15 @@ BSKY_BLOB_UPLOAD_MAX_DELAY = 300
|
||||
BSKY_BLOB_TRANSIENT_ERROR_RETRIES = 3
|
||||
BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
|
||||
|
||||
BSKY_SEND_POST_MAX_RETRIES = 3
|
||||
BSKY_SEND_POST_BASE_DELAY = 5
|
||||
BSKY_SEND_POST_MAX_DELAY = 60
|
||||
|
||||
MEDIA_DOWNLOAD_TIMEOUT = 30
|
||||
LINK_METADATA_TIMEOUT = 10
|
||||
URL_RESOLVE_TIMEOUT = 12
|
||||
PLAYWRIGHT_RESOLVE_TIMEOUT_MS = 30000
|
||||
SUBPROCESS_TIMEOUT_SECONDS = 180
|
||||
DEFAULT_BSKY_BASE_URL = "https://bsky.social"
|
||||
|
||||
# --- Logging Setup ---
|
||||
@@ -60,6 +66,13 @@ logging.basicConfig(
|
||||
# --- Per-run caches for efficiency ---
|
||||
OG_TITLE_CACHE = {}
|
||||
URL_RESOLUTION_CACHE = {}
|
||||
URL_VALIDITY_CACHE = {}
|
||||
|
||||
|
||||
def reset_caches():
    """Empty every per-run lookup cache in place.

    The dict objects themselves are kept (only their contents are dropped),
    so any module that already holds a reference keeps seeing the same cache.
    """
    for cache in (OG_TITLE_CACHE, URL_RESOLUTION_CACHE, URL_VALIDITY_CACHE):
        cache.clear()
|
||||
|
||||
|
||||
# --- Custom Classes ---
|
||||
@@ -70,10 +83,12 @@ class ScrapedMedia:
|
||||
|
||||
|
||||
class ScrapedTweet:
    """A tweet captured by the scraper, with the metadata needed to repost it.

    Attributes:
        created_on: Creation timestamp exactly as supplied by the caller.
        text: Visible tweet text.
        tweet_url: URL of the tweet, or None when it could not be scraped.
        card_url: href taken from the tweet's link-preview card, or None.
        is_retweet: True when the scraper flagged the tweet as a retweet/repost.
        media: One ScrapedMedia per ``(url, media_type)`` pair in *media_urls*.
    """

    def __init__(self, created_on, text, media_urls, tweet_url=None, card_url=None, is_retweet=False):
        """Store the scraped fields and wrap each media pair in ScrapedMedia.

        Args:
            created_on: Creation timestamp of the tweet.
            text: Tweet text.
            media_urls: Iterable of ``(url, media_type)`` pairs; None or an
                empty iterable yields an empty ``media`` list.
            tweet_url: Optional URL of the tweet itself.
            card_url: Optional URL scraped from the link-preview card.
            is_retweet: Whether the tweet is a retweet/repost.
        """
        self.created_on = created_on
        self.text = text
        self.tweet_url = tweet_url
        self.card_url = card_url
        self.is_retweet = is_retweet
        # Tolerate None so a media-less tweet needs no special-casing by callers.
        self.media = [ScrapedMedia(url, media_type) for url, media_type in (media_urls or [])]
|
||||
|
||||
|
||||
@@ -87,11 +102,17 @@ def take_error_screenshot(page, error_msg):
|
||||
|
||||
|
||||
def is_valid_url(url):
    """Return True when a HEAD request to *url* answers with a status below 500.

    The outcome, positive or negative, is stored in URL_VALIDITY_CACHE, so a
    given URL is probed at most once until that cache is cleared.

    Args:
        url: The URL to probe.

    Returns:
        bool: True for any response with status code < 500; False for a 5xx
        response or when the request raises.
    """
    cached = URL_VALIDITY_CACHE.get(url)
    if cached is not None:
        return cached

    try:
        response = httpx.head(url, timeout=5, follow_redirects=True)
        result = response.status_code < 500
    except Exception:
        # Best-effort probe: any failure to complete the request means "unusable".
        result = False

    # Cache failures too, so a dead link is not re-probed on every mention.
    URL_VALIDITY_CACHE[url] = result
    return result
|
||||
|
||||
|
||||
def strip_trailing_url_punctuation(url):
|
||||
@@ -404,9 +425,6 @@ def extract_quoted_text_from_og_title(og_title):
|
||||
|
||||
|
||||
def should_fetch_og_title(tweet):
|
||||
"""
|
||||
Avoid fetching og:title unless it is likely to improve the text.
|
||||
"""
|
||||
text = clean_post_text(tweet.text or "")
|
||||
urls = extract_urls_from_text(text)
|
||||
|
||||
@@ -681,12 +699,40 @@ def extract_first_resolved_external_url(text, http_client, allow_playwright_fall
|
||||
return None
|
||||
|
||||
|
||||
def resolve_card_url(card_url, http_client):
    """Resolve a card URL (typically t.co) scraped from a tweet's link preview.

    Args:
        card_url: Raw href taken from the preview card; may be None or blank.
        http_client: HTTP client forwarded to resolve_url_if_needed().

    Returns:
        A URL accepted by is_external_non_x_url(), or None when the input is
        empty, cannot be canonicalized, cannot be resolved, or ends up on an
        X/Twitter domain. An unresolved shortener link is never returned.
    """
    if not card_url:
        return None

    stripped = card_url.strip()
    if not stripped:
        # Whitespace-only href: nothing to canonicalize.
        return None

    cleaned = canonicalize_url(stripped)
    if not cleaned:
        return None

    if is_external_non_x_url(cleaned):
        logging.info(f"🔗 Card URL is already external: {cleaned}")
        return cleaned

    if is_tco_domain(cleaned):
        resolved = resolve_url_if_needed(cleaned, http_client, allow_playwright_fallback=True)
        if resolved and is_external_non_x_url(resolved):
            logging.info(f"🔗 Resolved card t.co URL: {cleaned} -> {resolved}")
            return resolved

        # Judge the *resolved* target, not the t.co wrapper, when reporting
        # that the card points back at X/Twitter.
        if resolved and is_x_or_twitter_domain(resolved):
            logging.info(f"ℹ️ Card URL resolves to X/Twitter domain, ignoring: {cleaned}")
        # Never hand back the unresolved short link as if it were external.
        return None

    if is_x_or_twitter_domain(cleaned):
        logging.info(f"ℹ️ Card URL resolves to X/Twitter domain, ignoring: {cleaned}")

    # Anything reaching this point failed is_external_non_x_url() above, so it
    # does not satisfy the "external URL or None" contract.
    return None
|
||||
|
||||
|
||||
def sanitize_visible_urls_in_text(text, http_client, has_media=False):
|
||||
"""
|
||||
Faster logic:
|
||||
- remove x/twitter URLs from visible text
|
||||
- resolve t.co
|
||||
- if a t.co resolves to x/twitter and tweet has media, do not use Playwright fallback
|
||||
- if a t.co resolves to x/twitter and tweet has media, skip Playwright fallback
|
||||
"""
|
||||
if not text:
|
||||
return text, None
|
||||
@@ -816,6 +862,20 @@ def build_effective_tweet_text(tweet, http_client):
|
||||
)
|
||||
candidate_text = clean_post_text(candidate_text)
|
||||
|
||||
# --- KEY FIX: also resolve the card_url scraped from the tweet's link preview ---
|
||||
resolved_card_url = resolve_card_url(getattr(tweet, "card_url", None), http_client)
|
||||
|
||||
if resolved_card_url and is_external_non_x_url(resolved_card_url):
|
||||
if not resolved_primary_external_url:
|
||||
resolved_primary_external_url = resolved_card_url
|
||||
logging.info(f"🔗 Using resolved card URL as primary external URL: {resolved_card_url}")
|
||||
elif resolved_primary_external_url != resolved_card_url:
|
||||
logging.info(
|
||||
f"ℹ️ Card URL ({resolved_card_url}) differs from text URL ({resolved_primary_external_url}). "
|
||||
f"Preferring card URL for external embed."
|
||||
)
|
||||
resolved_primary_external_url = resolved_card_url
|
||||
|
||||
if not resolved_primary_external_url:
|
||||
resolved_primary_external_url = extract_first_resolved_external_url(
|
||||
candidate_text,
|
||||
@@ -1279,8 +1339,6 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
if getattr(record, "reply", None) is not None:
|
||||
continue
|
||||
|
||||
# no-op
|
||||
|
||||
text = getattr(record, "text", "") or ""
|
||||
normalized_text = normalize_post_text(text)
|
||||
|
||||
@@ -1333,7 +1391,7 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||
return default_delay
|
||||
|
||||
|
||||
def is_transient_blob_error(error_obj):
|
||||
def is_transient_error(error_obj):
|
||||
error_text = repr(error_obj)
|
||||
transient_signals = [
|
||||
"InvokeTimeoutError",
|
||||
@@ -1383,7 +1441,7 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
||||
)
|
||||
break
|
||||
|
||||
if is_transient_blob_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES:
|
||||
if is_transient_error(e) and transient_attempts < BSKY_BLOB_TRANSIENT_ERROR_RETRIES:
|
||||
transient_attempts += 1
|
||||
wait_seconds = BSKY_BLOB_TRANSIENT_ERROR_DELAY * transient_attempts
|
||||
logging.warning(
|
||||
@@ -1408,6 +1466,54 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
||||
return None
|
||||
|
||||
|
||||
def send_post_with_retry(client, **kwargs):
    """Call ``client.send_post(**kwargs)``, retrying on rate limits and transient errors.

    Rate-limited failures (error text containing "429" or "RateLimitExceeded")
    wait for the value returned by get_rate_limit_wait_seconds(), which is
    given an exponential backoff capped at BSKY_SEND_POST_MAX_DELAY as its
    default. Failures recognised by is_transient_error() wait
    ``BSKY_SEND_POST_BASE_DELAY * attempt`` seconds. Any other failure, or any
    failure on the last attempt, is re-raised unchanged.

    Args:
        client: Object exposing ``send_post``.
        **kwargs: Passed through to ``client.send_post`` untouched.

    Returns:
        Whatever ``client.send_post`` returns on the first successful attempt.

    Raises:
        Exception: The original exception from ``client.send_post`` once it is
            non-retryable or the attempts are exhausted.
    """
    # Always try at least once, even if the constant is misconfigured to <= 0;
    # otherwise the loop would never run and no post would be attempted.
    max_attempts = max(1, BSKY_SEND_POST_MAX_RETRIES)

    # NOTE(review): a retry after a timeout could duplicate a post if the
    # first request actually reached the server — confirm send_post semantics.
    for attempt in range(1, max_attempts + 1):
        try:
            return client.send_post(**kwargs)

        except Exception as e:
            is_last_attempt = attempt >= max_attempts
            error_text = str(e)
            is_rate_limited = "429" in error_text or "RateLimitExceeded" in error_text

            if is_rate_limited:
                if is_last_attempt:
                    logging.error(f"❌ Exhausted send_post retries after rate limiting: {repr(e)}")
                    raise

                backoff_delay = min(
                    BSKY_SEND_POST_BASE_DELAY * (2 ** (attempt - 1)),
                    BSKY_SEND_POST_MAX_DELAY,
                )
                wait_seconds = get_rate_limit_wait_seconds(e, backoff_delay)
                logging.warning(
                    f"⏳ Bluesky send_post rate-limited. "
                    f"Retry {attempt}/{max_attempts} after {wait_seconds}s."
                )
                time.sleep(wait_seconds)
                continue

            if is_last_attempt or not is_transient_error(e):
                # Non-retryable, or nothing left to retry with: surface as-is.
                raise

            wait_seconds = BSKY_SEND_POST_BASE_DELAY * attempt
            logging.warning(
                f"⏳ Transient send_post failure: {repr(e)}. "
                f"Retry {attempt}/{max_attempts} after {wait_seconds}s."
            )
            time.sleep(wait_seconds)
|
||||
|
||||
|
||||
def compress_post_image_to_limit(image_bytes, max_bytes=BSKY_IMAGE_MAX_BYTES):
|
||||
try:
|
||||
with Image.open(io.BytesIO(image_bytes)) as img:
|
||||
@@ -1505,7 +1611,6 @@ def get_blob_from_url(media_url, client, http_client):
|
||||
logging.warning(f"Could not fetch media {media_url}: {repr(e)}")
|
||||
return None
|
||||
|
||||
|
||||
def get_blob_from_file(file_path, client):
|
||||
try:
|
||||
if not os.path.exists(file_path):
|
||||
@@ -1891,6 +1996,18 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
||||
if href:
|
||||
tweet_url = f"https://x.com{href}" if href.startswith("/") else href
|
||||
|
||||
# --- Retweet detection ---
|
||||
is_retweet = False
|
||||
try:
|
||||
social_context = article.locator('[data-testid="socialContext"]').first
|
||||
if social_context.is_visible():
|
||||
context_text = social_context.inner_text().lower()
|
||||
if "reposted" in context_text or "retweeted" in context_text or "ha repostejat" in context_text or "ha retuitat" in context_text or "repostejat" in context_text:
|
||||
is_retweet = True
|
||||
logging.info(f"🔁 Detected retweet/repost: {tweet_url}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
text_locator = article.locator('[data-testid="tweetText"]').first
|
||||
text = text_locator.inner_text() if text_locator.is_visible() else ""
|
||||
|
||||
@@ -1907,7 +2024,38 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
||||
if video_locators:
|
||||
media_urls.append((tweet_url or "", "video"))
|
||||
|
||||
tweets.append(ScrapedTweet(created_at, text, media_urls, tweet_url=tweet_url))
|
||||
# --- Card URL extraction (link preview card) ---
|
||||
card_url = None
|
||||
try:
|
||||
card_locator = article.locator('[data-testid="card.wrapper"] a[href]').first
|
||||
if card_locator.is_visible():
|
||||
card_href = card_locator.get_attribute("href")
|
||||
if card_href:
|
||||
card_url = card_href.strip()
|
||||
logging.info(f"🃏 Scraped card URL from tweet: {card_url}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback: try to find card link via role="link" inside card wrapper
|
||||
if not card_url:
|
||||
try:
|
||||
card_role_link = article.locator('[data-testid="card.wrapper"] [role="link"]').first
|
||||
if card_role_link.is_visible():
|
||||
card_a = card_role_link.locator("a[href]").first
|
||||
if card_a.is_visible():
|
||||
card_href = card_a.get_attribute("href")
|
||||
if card_href:
|
||||
card_url = card_href.strip()
|
||||
logging.info(f"🃏 Scraped card URL (fallback) from tweet: {card_url}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
tweets.append(ScrapedTweet(
|
||||
created_at, text, media_urls,
|
||||
tweet_url=tweet_url,
|
||||
card_url=card_url,
|
||||
is_retweet=is_retweet,
|
||||
))
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"⚠️ Failed to parse a specific tweet: {e}")
|
||||
@@ -2071,7 +2219,10 @@ def download_and_crop_video(video_url, output_path):
|
||||
temp_input,
|
||||
]
|
||||
|
||||
download_result = subprocess.run(download_cmd, capture_output=True, text=True)
|
||||
download_result = subprocess.run(
|
||||
download_cmd, capture_output=True, text=True,
|
||||
timeout=SUBPROCESS_TIMEOUT_SECONDS
|
||||
)
|
||||
|
||||
if download_result.returncode != 0:
|
||||
logging.error(f"❌ ffmpeg download failed:\n{download_result.stderr}")
|
||||
@@ -2134,7 +2285,10 @@ def download_and_crop_video(video_url, output_path):
|
||||
temp_output,
|
||||
]
|
||||
|
||||
compress_result = subprocess.run(compress_cmd, capture_output=True, text=True)
|
||||
compress_result = subprocess.run(
|
||||
compress_cmd, capture_output=True, text=True,
|
||||
timeout=SUBPROCESS_TIMEOUT_SECONDS
|
||||
)
|
||||
|
||||
if compress_result.returncode != 0:
|
||||
logging.error(f"❌ ffmpeg compression failed:\n{compress_result.stderr}")
|
||||
@@ -2151,6 +2305,10 @@ def download_and_crop_video(video_url, output_path):
|
||||
logging.info(f"✅ Final video ready: {output_path}")
|
||||
return output_path
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logging.error(f"❌ ffmpeg subprocess timed out after {SUBPROCESS_TIMEOUT_SECONDS}s")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"❌ Error processing video: {repr(e)}")
|
||||
return None
|
||||
@@ -2187,6 +2345,13 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
||||
|
||||
def sync_feeds(args):
|
||||
logging.info("🔄 Starting sync cycle...")
|
||||
|
||||
dry_run = getattr(args, "dry_run", False)
|
||||
bsky_langs = getattr(args, "bsky_langs", None) or DEFAULT_BSKY_LANGS
|
||||
|
||||
if dry_run:
|
||||
logging.info("🧪 DRY RUN MODE — no posts will be created on Bluesky.")
|
||||
|
||||
try:
|
||||
state = load_state(STATE_PATH)
|
||||
|
||||
@@ -2201,17 +2366,21 @@ def sync_feeds(args):
|
||||
logging.warning("⚠️ No tweets found or failed to fetch. Skipping Bluesky sync for this cycle.")
|
||||
return
|
||||
|
||||
bsky_client = create_bsky_client(
|
||||
args.bsky_base_url,
|
||||
args.bsky_handle,
|
||||
args.bsky_password
|
||||
)
|
||||
bsky_client = None
|
||||
if not dry_run:
|
||||
bsky_client = create_bsky_client(
|
||||
args.bsky_base_url,
|
||||
args.bsky_handle,
|
||||
args.bsky_password
|
||||
)
|
||||
|
||||
recent_bsky_posts = get_recent_bsky_posts(
|
||||
bsky_client,
|
||||
args.bsky_handle,
|
||||
limit=DEDUPE_BSKY_LIMIT
|
||||
)
|
||||
recent_bsky_posts = []
|
||||
if not dry_run:
|
||||
recent_bsky_posts = get_recent_bsky_posts(
|
||||
bsky_client,
|
||||
args.bsky_handle,
|
||||
limit=DEDUPE_BSKY_LIMIT
|
||||
)
|
||||
|
||||
logging.info(f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for duplicate detection.")
|
||||
logging.info(f"🧠 Local state currently tracks {len(state.get('posted_tweets', {}))} posted items.")
|
||||
@@ -2231,6 +2400,11 @@ def sync_feeds(args):
|
||||
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||
continue
|
||||
|
||||
# --- Retweet filtering ---
|
||||
if tweet.is_retweet:
|
||||
logging.info(f"⏭️ Skipping retweet/repost: {tweet.tweet_url}")
|
||||
continue
|
||||
|
||||
canonical_tweet_url = canonicalize_tweet_url(tweet.tweet_url)
|
||||
if canonical_tweet_url and canonical_tweet_url in state.get("posted_tweets", {}):
|
||||
logging.info(f"⚡ Early skip due to known tweet URL in local state: {canonical_tweet_url}")
|
||||
@@ -2354,7 +2528,20 @@ def sync_feeds(args):
|
||||
raw_text = candidate["raw_text"]
|
||||
full_clean_text = candidate["full_clean_text"]
|
||||
|
||||
logging.info(f"📝 Posting missing tweet from {tweet_time} to Bluesky...")
|
||||
logging.info(f"📝 {'[DRY RUN] Would post' if dry_run else 'Posting'} missing tweet from {tweet_time} to Bluesky...")
|
||||
|
||||
if dry_run:
|
||||
logging.info(f" 📄 Text: {raw_text[:200]}{'...' if len(raw_text) > 200 else ''}")
|
||||
logging.info(f" 🔗 Primary external URL: {candidate.get('resolved_primary_external_url', 'None')}")
|
||||
logging.info(f" 🃏 Card URL: {getattr(tweet, 'card_url', 'None')}")
|
||||
logging.info(f" 🎬 Has video: {candidate.get('has_video', False)}")
|
||||
logging.info(f" 🖼️ Has photo: {candidate.get('has_photo', False)}")
|
||||
logging.info(f" 🔁 Is retweet: {getattr(tweet, 'is_retweet', False)}")
|
||||
|
||||
remember_posted_tweet(state, candidate, bsky_uri=f"dry-run:{arrow.utcnow().isoformat()}")
|
||||
save_state(state, STATE_PATH)
|
||||
new_posts += 1
|
||||
continue
|
||||
|
||||
rich_text = make_rich(raw_text)
|
||||
dynamic_alt = build_dynamic_alt(full_clean_text)
|
||||
@@ -2423,6 +2610,7 @@ def sync_feeds(args):
|
||||
else:
|
||||
media_upload_failures.append(f"photo:{media.media_url_https}")
|
||||
|
||||
# --- External link card logic (KEY FIX for t.co card URLs) ---
|
||||
if not video_embed and not image_embeds:
|
||||
candidate_url = candidate.get("resolved_primary_external_url")
|
||||
|
||||
@@ -2449,17 +2637,25 @@ def sync_feeds(args):
|
||||
post_mode = "text"
|
||||
|
||||
if video_embed:
|
||||
post_result = bsky_client.send_post(text=rich_text, embed=video_embed, langs=["ca"])
|
||||
post_result = send_post_with_retry(
|
||||
bsky_client, text=rich_text, embed=video_embed, langs=bsky_langs
|
||||
)
|
||||
post_mode = "video"
|
||||
elif image_embeds:
|
||||
embed = models.AppBskyEmbedImages.Main(images=image_embeds)
|
||||
post_result = bsky_client.send_post(text=rich_text, embed=embed, langs=["ca"])
|
||||
post_result = send_post_with_retry(
|
||||
bsky_client, text=rich_text, embed=embed, langs=bsky_langs
|
||||
)
|
||||
post_mode = f"images:{len(image_embeds)}"
|
||||
elif external_embed:
|
||||
post_result = bsky_client.send_post(text=rich_text, embed=external_embed, langs=["ca"])
|
||||
post_result = send_post_with_retry(
|
||||
bsky_client, text=rich_text, embed=external_embed, langs=bsky_langs
|
||||
)
|
||||
post_mode = "external_link_card"
|
||||
else:
|
||||
post_result = bsky_client.send_post(text=rich_text, langs=["ca"])
|
||||
post_result = send_post_with_retry(
|
||||
bsky_client, text=rich_text, langs=bsky_langs
|
||||
)
|
||||
post_mode = "text_only"
|
||||
|
||||
bsky_uri = getattr(post_result, "uri", None)
|
||||
@@ -2513,6 +2709,17 @@ def main():
|
||||
parser.add_argument("--bsky-handle", help="Your Bluesky handle")
|
||||
parser.add_argument("--bsky-password", help="Your Bluesky app password")
|
||||
parser.add_argument("--bsky-base-url", help="Bluesky/ATProto PDS base URL, e.g. https://eurosky.social")
|
||||
parser.add_argument(
|
||||
"--bsky-langs",
|
||||
help="Comma-separated language codes for Bluesky posts (default: ca)",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Simulate sync without posting to Bluesky. Logs what would be posted.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -2524,6 +2731,15 @@ def main():
|
||||
args.twitter_handle = args.twitter_handle or os.getenv("TWITTER_HANDLE") or args.twitter_username
|
||||
args.bsky_base_url = args.bsky_base_url if args.bsky_base_url else DEFAULT_BSKY_BASE_URL
|
||||
|
||||
# --- Language handling: CLI > env > default (Catalan) ---
|
||||
raw_langs = args.bsky_langs or os.getenv("BSKY_LANGS")
|
||||
if raw_langs:
|
||||
args.bsky_langs = [lang.strip() for lang in raw_langs.split(",") if lang.strip()]
|
||||
logging.info(f"🌍 Using configured Bluesky languages: {args.bsky_langs}")
|
||||
else:
|
||||
args.bsky_langs = DEFAULT_BSKY_LANGS
|
||||
logging.info(f"🌍 Using default Bluesky languages: {args.bsky_langs}")
|
||||
|
||||
missing_args = []
|
||||
if not args.twitter_username:
|
||||
missing_args.append("--twitter-username")
|
||||
@@ -2540,9 +2756,14 @@ def main():
|
||||
|
||||
logging.info(f"🤖 Bot started. Will check @{args.twitter_handle}")
|
||||
logging.info(f"🌍 Posting destination base URL: {args.bsky_base_url}")
|
||||
|
||||
if args.dry_run:
|
||||
logging.info("🧪 DRY RUN MODE ENABLED — no posts will be created.")
|
||||
|
||||
reset_caches()
|
||||
sync_feeds(args)
|
||||
logging.info("🤖 Bot finished.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user