fix: prevent video URL bleed-through and stale listener across tweets

- Add `page.remove_listener("response", handle_response)` to the
  `finally` block of `extract_video_url_from_tweet_page` to detach the
  network listener before the page is closed, preventing ghost callbacks
  from leaking captured URLs across tweet iterations.

- Confirmed `build_media_fingerprint` already uses `canonicalize_tweet_url`
  as the stable video identifier instead of the unreliable `media_url_https`,
  avoiding false fingerprint collisions between different video tweets.

- Confirmed `sync_feeds` already guards against a `None` video URL after
  extraction, ensuring no silent fallthrough to stale captures.
This commit is contained in:
Guillem Hernandez Sola
2026-05-05 13:39:22 +02:00
parent 388c327a13
commit 09b9d16791

View File

@@ -2667,7 +2667,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
page = browser_context.new_page()
best_m3u8_url = None
best_video_mp4_url = None
seen_urls = set()
seen_urls = set() # ← scoped per call, so already reset per tweet ✅
def is_audio_only_mp4(url, content_type):
url_l = url.lower()
@@ -2727,9 +2727,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
return best_m3u8_url or best_video_mp4_url
try:
logging.info(
f"🎬 Opening tweet page to capture video URL: {tweet_url}"
)
logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}")
page.goto(tweet_url, wait_until="domcontentloaded", timeout=40000)
time.sleep(2)
@@ -2755,9 +2753,7 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
time.sleep(1)
if not current_best() and player.count() > 0:
logging.info(
"🔁 No media URL found yet, retrying player interaction..."
)
logging.info("🔁 No media URL found yet, retrying player interaction...")
try:
player.click(force=True, timeout=5000)
time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
@@ -2791,9 +2787,9 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
)
return None
finally:
page.remove_listener("response", handle_response) # ← FIX 1: detach before close
page.close()
def _probe_video_duration(file_path):
probe_cmd = [
"ffprobe",