Bind resolved videos to their source tweet: isolated per-tweet URL extraction plus URL/hash ownership caches

2026-05-10 07:59:55 +02:00
parent 6b1de20d09
commit 51216085da
1 changed files with 130 additions and 77 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -98,16 +98,39 @@ class _RunCache:
        self.url_resolution: dict = {}
        self.url_validity: dict = {}
        self.locale: str = "en-US"   # ← ADDED locale cache here
+        self.video_hash_owner: dict = {}  # sha256 -> tweet_id
+        self.video_url_owner: dict = {}   # media_url -> tweet_id

    def clear(self):
        self.og_title.clear()
        self.url_resolution.clear()
        self.url_validity.clear()
+        self.video_hash_owner.clear()
+        self.video_url_owner.clear()

 _cache = _RunCache()


 def reset_caches():
+
+
+# === VIDEO BINDING PATCH APPLIED ===
+def sha256_bytes(data: bytes):
+    return hashlib.sha256(data).hexdigest()
+
+def sha256_file(path, chunk_size=1024 * 1024):
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        while True:
+            chunk = f.read(chunk_size)
+            if not chunk:
+                break
+            h.update(chunk)
+    return h.hexdigest()
+
+def media_url_looks_audio_only(url):
+    u = (url or "").lower()
+    return "/aud/" in u or "/audio/" in u or "mp4a" in u
    _cache.clear()

 def grapheme_len(text):
@@ -1294,7 +1317,7 @@ def normalize_post_text(text):
    return text.lower()


-def build_media_fingerprint(tweet):
+def build_media_fingerprint(tweet, resolved_video_hash=None):
    if not tweet or not tweet.media:
        return "no-media"

@@ -1309,9 +1332,13 @@ def build_media_fingerprint(tweet):
            stable_value = re.sub(r"[?&]name=\w+", "", stable_value)
            stable_value = re.sub(r"[?&]format=\w+", "", stable_value)
        elif media_type == "video":
-            stable_value = canonicalize_tweet_url(
+            tweet_key = canonicalize_tweet_url(
                tweet.tweet_url or media_url or ""
            )
+            if resolved_video_hash:
+                stable_value = f"{tweet_key}|vh:{resolved_video_hash}"
+            else:
+                stable_value = tweet_key

        parts.append(f"{media_type}:{stable_value}")

@@ -1551,6 +1578,9 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
        "bsky_uri": bsky_uri,
        "tweet_created_on": candidate["tweet"].created_on,
        "tweet_url": candidate["tweet"].tweet_url,
+        "tweet_id": candidate.get("tweet_id"),
+        "resolved_video_url": candidate.get("resolved_video_url"),
+        "resolved_video_hash": candidate.get("resolved_video_hash"),
        "posted_at": arrow.utcnow().isoformat(),
    }

@@ -2663,89 +2693,76 @@ def scrape_tweets_via_playwright(username, password, email, target_handle, local


 # --- Video Extraction & Processing ---
-def extract_video_url_from_tweet_page(browser_context, tweet_url):
-    page = browser_context.new_page()
+def extract_video_url_from_tweet_page_isolated(browser, tweet_url, tweet_id=None, locale="en-US"):
+    ctx = None
+    page = None
    best_m3u8_url = None
    best_video_mp4_url = None
-    seen_urls = set()  # ← scoped per call, so already reset per tweet ✅
+    seen_urls = set()

-    def is_audio_only_mp4(url, content_type):
-        url_l = url.lower()
-        content_type_l = content_type.lower()
-        return (
-            "/aud/" in url_l
-            or "/audio/" in url_l
-            or "mp4a" in url_l
-            or ("audio/" in content_type_l and "video/" not in content_type_l)
-        )
+    def current_best():
+        return best_m3u8_url or best_video_mp4_url

    def handle_response(response):
        nonlocal best_m3u8_url, best_video_mp4_url
        try:
            url = response.url
-            if url in seen_urls:
+            if not url or url in seen_urls:
                return
            seen_urls.add(url)

+            owner = _cache.video_url_owner.get(url)
+            if owner and tweet_id and owner != tweet_id:
+                logging.warning(f"[tweet_id={tweet_id}] Rejecting URL owned by tweet_id={owner}: {url}")
+                return
+
+            content_type = (response.headers.get("content-type") or "").lower()
            url_l = url.lower()
-            content_type = response.headers.get("content-type", "")
-            content_type_l = content_type.lower()

            if ".m4s" in url_l:
                return

-            if (
-                ".m3u8" in url_l
-                or "application/vnd.apple.mpegurl" in content_type_l
-                or "application/x-mpegurl" in content_type_l
-            ):
+            if ".m3u8" in url_l or "application/vnd.apple.mpegurl" in content_type or "application/x-mpegurl" in content_type:
                if best_m3u8_url is None:
                    best_m3u8_url = url
-                    logging.info(f"📺 Found HLS playlist URL: {url}")
                return

-            if (
-                ".mp4" in url_l
-                or "video/mp4" in content_type_l
-                or "audio/mp4" in content_type_l
-            ):
-                if is_audio_only_mp4(url, content_type):
-                    logging.info(f"🔇 Ignoring audio-only MP4: {url}")
+            if ".mp4" in url_l or "video/mp4" in content_type or "audio/mp4" in content_type:
+                if media_url_looks_audio_only(url) or ("audio/" in content_type and "video/" not in content_type):
                    return
-
                if best_video_mp4_url is None:
                    best_video_mp4_url = url
-                    logging.info(f"🎥 Found VIDEO MP4 URL: {url}")
                return
-
        except Exception as e:
-            logging.debug(f"Response parsing error: {e}")
-
-    page.on("response", handle_response)
-
-    def current_best():
-        return best_m3u8_url or best_video_mp4_url
+            logging.debug(f"[tweet_id={tweet_id}] response parse error: {e}")

    try:
-        logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}")
+        ctx = browser.new_context(
+            user_agent=(
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/145.0.7632.6 Safari/537.36"
+            ),
+            viewport={"width": 1920, "height": 1080},
+            locale=locale,
+        )
+        page = ctx.new_page()
+        page.on("response", handle_response)
+
+        logging.info(f"[tweet_id={tweet_id}] 🎬 Opening tweet page to capture video URL: {tweet_url}")
        page.goto(tweet_url, wait_until="domcontentloaded", timeout=40000)
        time.sleep(2)

        player = page.locator('[data-testid="videoPlayer"]').first
-
        if player.count() > 0:
            try:
                player.scroll_into_view_if_needed(timeout=5000)
            except Exception:
                pass
-
            try:
                player.click(force=True, timeout=5000)
-                logging.info("▶️ Clicked video player")
-            except Exception as e:
-                logging.info(f"⚠️ First player click failed: {e}")
-        else:
-            logging.warning("⚠️ No video player locator found on tweet page")
+            except Exception:
+                pass

        for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
            if current_best():
@@ -2753,42 +2770,42 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
            time.sleep(1)

        if not current_best() and player.count() > 0:
-            logging.info("🔁 No media URL found yet, retrying player interaction...")
            try:
                player.click(force=True, timeout=5000)
-                time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
-            except Exception as e:
-                logging.info(f"⚠️ Retry click failed: {e}")
-
-            try:
-                page.keyboard.press("Space")
-                time.sleep(1)
            except Exception:
                pass
-
+            try:
+                page.keyboard.press("Space")
+            except Exception:
+                pass
            for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
                if current_best():
                    break
                time.sleep(1)

-        selected_url = current_best()
-        if selected_url:
-            logging.info(f"✅ Selected media URL for download: {selected_url}")
-        else:
-            logging.warning(
-                f"⚠️ No playable media URL detected on tweet page: {tweet_url}"
-            )
+        selected = current_best()
+        if selected and tweet_id:
+            _cache.video_url_owner[selected] = tweet_id

-        return selected_url
+        logging.info(f"[tweet_id={tweet_id}] ✅ Selected media URL for download: {selected}")
+        return selected

    except Exception as e:
-        logging.warning(
-            f"⚠️ Could not extract video URL from tweet page {tweet_url}: {e}"
-        )
+        logging.warning(f"[tweet_id={tweet_id}] ⚠️ Could not extract video URL: {e}")
        return None
    finally:
-        page.remove_listener("response", handle_response)  # ← FIX 1: detach before close
-        page.close()
+        try:
+            if page:
+                page.remove_listener("response", handle_response)
+                page.close()
+        except Exception:
+            pass
+        try:
+            if ctx:
+                ctx.close()
+        except Exception:
+            pass
+

 def _probe_video_duration(file_path):
    probe_cmd = [
@@ -3186,6 +3203,9 @@ def sync_feeds(args):
                        ),
                        "has_video": has_video,
                        "has_photo": has_photo,
+                        "tweet_id": extract_tweet_id(tweet.tweet_url),
+                        "resolved_video_url": None,
+                        "resolved_video_hash": None,
                    }

                    is_dup_state, reason_state = candidate_matches_state(
@@ -3219,6 +3239,31 @@ def sync_feeds(args):
            f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
        )

+
+        # Pre-resolve video URLs in isolated contexts (deterministic text<->media binding)
+        if any(c.get("has_video") for c in candidate_tweets):
+            with sync_playwright() as p_pre:
+                pre_browser = p_pre.chromium.launch(
+                    headless=True,
+                    args=["--disable-blink-features=AutomationControlled"],
+                )
+                try:
+                    for c in candidate_tweets:
+                        if not c.get("has_video"):
+                            continue
+                        t = c["tweet"]
+                        tid = c.get("tweet_id")
+                        if not t.tweet_url or not tid:
+                            continue
+                        c["resolved_video_url"] = extract_video_url_from_tweet_page_isolated(
+                            pre_browser,
+                            t.tweet_url,
+                            tweet_id=tid,
+                            locale=bot_locale,
+                        )
+                finally:
+                    pre_browser.close()
+
        if not candidate_tweets:
            logging.info(
                "✅ No new tweets need posting after duplicate comparison."
@@ -3335,11 +3380,8 @@ def sync_feeds(args):
                            temp_video_path = f"{temp_video_base}.mp4"

                            try:
-                                real_video_url = (
-                                    extract_video_url_from_tweet_page(
-                                        browser_context, tweet.tweet_url
-                                    )
-                                )
+                                tweet_id = candidate.get("tweet_id")
+                                real_video_url = candidate.get("resolved_video_url")
                                if not real_video_url:
                                    logging.warning(
                                        f"⚠️ Could not resolve playable video URL "
@@ -3361,9 +3403,20 @@ def sync_feeds(args):
                                            f"video:crop_failed:{tweet.tweet_url}"
                                        )
                                    else:
-                                        video_blob = get_blob_from_file(
-                                            cropped_video_path, bsky_client
-                                        )
+                                        video_hash = sha256_file(cropped_video_path)
+                                        candidate["resolved_video_hash"] = video_hash
+                                        owner = _cache.video_hash_owner.get(video_hash)
+                                        if owner and owner != tweet_id:
+                                            logging.warning(
+                                                f"[tweet_id={tweet_id}] ⚠️ Video hash already owned by tweet_id={owner}. Rejecting media."
+                                            )
+                                            media_upload_failures.append(f"video:hash_owned_by:{owner}")
+                                            video_blob = None
+                                        else:
+                                            _cache.video_hash_owner[video_hash] = tweet_id
+                                            video_blob = get_blob_from_file(
+                                                cropped_video_path, bsky_client
+                                            )
                                        if not video_blob:
                                            logging.warning(
                                                f"⚠️ Video upload blob failed for "