# --- Video binding helpers ---
def sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *data*."""
    return hashlib.sha256(data).hexdigest()


def sha256_file(path, chunk_size=1024 * 1024):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and read in pieces of *chunk_size*
    bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        # Two-argument iter() keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def media_url_looks_audio_only(url, content_type=None):
    """Return True when a media response looks like an audio-only track.

    Args:
        url: Media URL; ``None`` or an empty string is treated as "".
        content_type: Optional Content-Type header value. When supplied, a
            value containing "audio/" but not "video/" also marks the
            response as audio-only. Omitting it keeps the URL-only check.

    Returns:
        bool: True if the URL contains "/aud/", "/audio/" or "mp4a"
        (case-insensitive), or if *content_type* is audio without video.
    """
    url_l = (url or "").lower()
    if "/aud/" in url_l or "/audio/" in url_l or "mp4a" in url_l:
        return True

    content_type_l = (content_type or "").lower()
    return "audio/" in content_type_l and "video/" not in content_type_l


def reset_caches():
    """Clear the module-level run cache."""
    # The body must sit directly under the def; the helpers above were
    # previously inserted between this header and its body.
    _cache.clear()
def extract_video_url_from_tweet_page_isolated(browser, tweet_url, tweet_id=None, locale="en-US"):
    """Open *tweet_url* in a fresh browser context and capture its media URL.

    A new context and page are created from *browser* for this call only and
    are closed before returning. Network responses are inspected while the
    video player is clicked; the first HLS playlist seen is preferred over
    the first non-audio MP4.

    Args:
        browser: Object exposing ``new_context(...)`` (presumably a Playwright
            ``Browser`` — confirm against the caller).
        tweet_url: URL of the tweet page to open.
        tweet_id: Optional identifier used for log prefixes and for the
            ``_cache.video_url_owner`` ownership check.
        locale: Locale passed to the new browser context.

    Returns:
        The selected media URL, or ``None`` if nothing playable was seen or
        an exception occurred.
    """
    ctx = None
    page = None
    best_m3u8_url = None
    best_video_mp4_url = None
    seen_urls = set()

    def current_best():
        # HLS playlist wins over a direct MP4 when both were captured.
        return best_m3u8_url or best_video_mp4_url

    def handle_response(response):
        nonlocal best_m3u8_url, best_video_mp4_url
        try:
            url = response.url
            if not url or url in seen_urls:
                return
            seen_urls.add(url)

            # Refuse a URL already bound to a different tweet in this run.
            owner = _cache.video_url_owner.get(url)
            if owner and tweet_id and owner != tweet_id:
                logging.warning(f"[tweet_id={tweet_id}] Rejecting URL owned by tweet_id={owner}: {url}")
                return

            content_type = (response.headers.get("content-type") or "").lower()
            url_l = url.lower()

            # Skip individual media segments.
            if ".m4s" in url_l:
                return

            if (
                ".m3u8" in url_l
                or "application/vnd.apple.mpegurl" in content_type
                or "application/x-mpegurl" in content_type
            ):
                if best_m3u8_url is None:
                    best_m3u8_url = url
                return

            if ".mp4" in url_l or "video/mp4" in content_type or "audio/mp4" in content_type:
                # The URL pattern alone misses audio tracks served from a
                # neutral path, so the Content-Type header is checked too.
                audio_only_by_type = "audio/" in content_type and "video/" not in content_type
                if media_url_looks_audio_only(url) or audio_only_by_type:
                    return
                if best_video_mp4_url is None:
                    best_video_mp4_url = url
                return
        except Exception as e:
            logging.debug(f"[tweet_id={tweet_id}] response parse error: {e}")

    try:
        ctx = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/145.0.7632.6 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale=locale,
        )
        page = ctx.new_page()
        page.on("response", handle_response)

        logging.info(f"[tweet_id={tweet_id}] 🎬 Opening tweet page to capture video URL: {tweet_url}")
        page.goto(tweet_url, wait_until="domcontentloaded", timeout=40000)
        time.sleep(2)

        player = page.locator('[data-testid="videoPlayer"]').first
        if player.count() > 0:
            # Both interactions are best-effort; failures are only logged.
            try:
                player.scroll_into_view_if_needed(timeout=5000)
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] scroll into view failed: {e}")
            try:
                player.click(force=True, timeout=5000)
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] player click failed: {e}")

        for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
            if current_best():
                break
            time.sleep(1)

        if not current_best() and player.count() > 0:
            # Second attempt: click again, then try the Space key.
            try:
                player.click(force=True, timeout=5000)
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] retry click failed: {e}")
            try:
                page.keyboard.press("Space")
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] Space key press failed: {e}")
            for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
                if current_best():
                    break
                time.sleep(1)

        selected = current_best()
        if selected:
            if tweet_id:
                _cache.video_url_owner[selected] = tweet_id
            logging.info(f"[tweet_id={tweet_id}] ✅ Selected media URL for download: {selected}")
        else:
            # Do not report success when nothing was captured.
            logging.warning(
                f"[tweet_id={tweet_id}] ⚠️ No playable media URL detected on tweet page: {tweet_url}"
            )
        return selected

    except Exception as e:
        logging.warning(f"[tweet_id={tweet_id}] ⚠️ Could not extract video URL: {e}")
        return None
    finally:
        # Each cleanup step is guarded separately so one failure does not
        # skip the remaining ones.
        if page:
            try:
                page.remove_listener("response", handle_response)
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] remove_listener failed: {e}")
            try:
                page.close()
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] page close failed: {e}")
        if ctx:
            try:
                ctx.close()
            except Exception as e:
                logging.debug(f"[tweet_id={tweet_id}] context close failed: {e}")
) + + # Pre-resolve video URLs in isolated contexts (deterministic text<->media binding) + if candidate_tweets: + with sync_playwright() as p_pre: + pre_browser = p_pre.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled"], + ) + try: + for c in candidate_tweets: + if not c.get("has_video"): + continue + t = c["tweet"] + tid = c.get("tweet_id") + if not t.tweet_url or not tid: + continue + c["resolved_video_url"] = extract_video_url_from_tweet_page_isolated( + pre_browser, + t.tweet_url, + tweet_id=tid, + locale=bot_locale, + ) + finally: + pre_browser.close() + if not candidate_tweets: logging.info( "✅ No new tweets need posting after duplicate comparison." @@ -3335,11 +3380,8 @@ def sync_feeds(args): temp_video_path = f"{temp_video_base}.mp4" try: - real_video_url = ( - extract_video_url_from_tweet_page( - browser_context, tweet.tweet_url - ) - ) + tweet_id = candidate.get("tweet_id") + real_video_url = candidate.get("resolved_video_url") if not real_video_url: logging.warning( f"⚠️ Could not resolve playable video URL " @@ -3361,9 +3403,20 @@ def sync_feeds(args): f"video:crop_failed:{tweet.tweet_url}" ) else: - video_blob = get_blob_from_file( - cropped_video_path, bsky_client - ) + video_hash = sha256_file(cropped_video_path) + candidate["resolved_video_hash"] = video_hash + owner = _cache.video_hash_owner.get(video_hash) + if owner and owner != tweet_id: + logging.warning( + f"[tweet_id={tweet_id}] ⚠️ Video hash already owned by tweet_id={owner}. Rejecting media." + ) + media_upload_failures.append(f"video:hash_owned_by:{owner}") + video_blob = None + else: + _cache.video_hash_owner[video_hash] = tweet_id + video_blob = get_blob_from_file( + cropped_video_path, bsky_client + ) if not video_blob: logging.warning( f"⚠️ Video upload blob failed for "