Added fixes
This commit is contained in:
@@ -98,16 +98,39 @@ class _RunCache:
|
|||||||
self.url_resolution: dict = {}
|
self.url_resolution: dict = {}
|
||||||
self.url_validity: dict = {}
|
self.url_validity: dict = {}
|
||||||
self.locale: str = "en-US" # ← ADDED locale cache here
|
self.locale: str = "en-US" # ← ADDED locale cache here
|
||||||
|
self.video_hash_owner: dict = {} # sha256 -> tweet_id
|
||||||
|
self.video_url_owner: dict = {} # media_url -> tweet_id
|
||||||
|
|
||||||
def clear(self):
    """Empty every dictionary-backed cache held by this object.

    Only the lookup tables are reset; the ``locale`` attribute is not
    touched here.
    """
    # Walk the tables by attribute name, in the same order as before, so
    # each one is looked up and emptied in turn.
    for table_name in (
        "og_title",
        "url_resolution",
        "url_validity",
        "video_hash_owner",  # sha256 -> tweet_id
        "video_url_owner",  # media_url -> tweet_id
    ):
        getattr(self, table_name).clear()
|
||||||
|
|
||||||
_cache = _RunCache()
|
_cache = _RunCache()
|
||||||
|
|
||||||
|
|
||||||
def reset_caches():
    """Empty the module-level ``_cache`` by calling its ``clear()`` method."""
    _cache.clear()


# --- Video binding helpers ---
# These are module-level functions (not nested inside reset_caches) because
# other code in this file calls them by name.


def sha256_bytes(data: bytes):
    """Return the hexadecimal SHA-256 digest of ``data``."""
    return hashlib.sha256(data).hexdigest()


def sha256_file(path, chunk_size=1024 * 1024):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and hashed ``chunk_size`` bytes at a
    time, so the whole file never has to be held in memory at once.

    Args:
        path: Location of the file to hash; passed straight to ``open``.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        The digest as a lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        # iter(callable, sentinel) keeps reading until read() returns b"",
        # which is how a binary file signals end-of-file.
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def media_url_looks_audio_only(url, content_type=None):
    """Report whether a media response looks like an audio-only stream.

    Args:
        url: Media URL to inspect. ``None`` or an empty string is treated
            as "no URL" and never matches.
        content_type: Optional Content-Type header value. When given, a
            value that mentions ``audio/`` but not ``video/`` also counts
            as audio-only. Omitting it keeps the URL-only behavior.

    Returns:
        True if the URL contains ``/aud/``, ``/audio/`` or ``mp4a``
        (case-insensitive), or if ``content_type`` indicates audio without
        video; False otherwise.
    """
    lowered_url = (url or "").lower()
    if any(marker in lowered_url for marker in ("/aud/", "/audio/", "mp4a")):
        return True

    lowered_type = (content_type or "").lower()
    return "audio/" in lowered_type and "video/" not in lowered_type
|
||||||
|
|
||||||
def grapheme_len(text):
|
def grapheme_len(text):
|
||||||
@@ -1294,7 +1317,7 @@ def normalize_post_text(text):
|
|||||||
return text.lower()
|
return text.lower()
|
||||||
|
|
||||||
|
|
||||||
def build_media_fingerprint(tweet):
|
def build_media_fingerprint(tweet, resolved_video_hash=None):
|
||||||
if not tweet or not tweet.media:
|
if not tweet or not tweet.media:
|
||||||
return "no-media"
|
return "no-media"
|
||||||
|
|
||||||
@@ -1309,9 +1332,13 @@ def build_media_fingerprint(tweet):
|
|||||||
stable_value = re.sub(r"[?&]name=\w+", "", stable_value)
|
stable_value = re.sub(r"[?&]name=\w+", "", stable_value)
|
||||||
stable_value = re.sub(r"[?&]format=\w+", "", stable_value)
|
stable_value = re.sub(r"[?&]format=\w+", "", stable_value)
|
||||||
elif media_type == "video":
|
elif media_type == "video":
|
||||||
stable_value = canonicalize_tweet_url(
|
tweet_key = canonicalize_tweet_url(
|
||||||
tweet.tweet_url or media_url or ""
|
tweet.tweet_url or media_url or ""
|
||||||
)
|
)
|
||||||
|
if resolved_video_hash:
|
||||||
|
stable_value = f"{tweet_key}|vh:{resolved_video_hash}"
|
||||||
|
else:
|
||||||
|
stable_value = tweet_key
|
||||||
|
|
||||||
parts.append(f"{media_type}:{stable_value}")
|
parts.append(f"{media_type}:{stable_value}")
|
||||||
|
|
||||||
@@ -1551,6 +1578,9 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
|
|||||||
"bsky_uri": bsky_uri,
|
"bsky_uri": bsky_uri,
|
||||||
"tweet_created_on": candidate["tweet"].created_on,
|
"tweet_created_on": candidate["tweet"].created_on,
|
||||||
"tweet_url": candidate["tweet"].tweet_url,
|
"tweet_url": candidate["tweet"].tweet_url,
|
||||||
|
"tweet_id": candidate.get("tweet_id"),
|
||||||
|
"resolved_video_url": candidate.get("resolved_video_url"),
|
||||||
|
"resolved_video_hash": candidate.get("resolved_video_hash"),
|
||||||
"posted_at": arrow.utcnow().isoformat(),
|
"posted_at": arrow.utcnow().isoformat(),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2663,89 +2693,76 @@ def scrape_tweets_via_playwright(username, password, email, target_handle, local
|
|||||||
|
|
||||||
|
|
||||||
# --- Video Extraction & Processing ---
|
# --- Video Extraction & Processing ---
|
||||||
def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
def extract_video_url_from_tweet_page_isolated(browser, tweet_url, tweet_id=None, locale="en-US"):
|
||||||
page = browser_context.new_page()
|
ctx = None
|
||||||
|
page = None
|
||||||
best_m3u8_url = None
|
best_m3u8_url = None
|
||||||
best_video_mp4_url = None
|
best_video_mp4_url = None
|
||||||
seen_urls = set() # ← scoped per call, so already reset per tweet ✅
|
seen_urls = set()
|
||||||
|
|
||||||
def is_audio_only_mp4(url, content_type):
|
def current_best():
|
||||||
url_l = url.lower()
|
return best_m3u8_url or best_video_mp4_url
|
||||||
content_type_l = content_type.lower()
|
|
||||||
return (
|
|
||||||
"/aud/" in url_l
|
|
||||||
or "/audio/" in url_l
|
|
||||||
or "mp4a" in url_l
|
|
||||||
or ("audio/" in content_type_l and "video/" not in content_type_l)
|
|
||||||
)
|
|
||||||
|
|
||||||
def handle_response(response):
|
def handle_response(response):
|
||||||
nonlocal best_m3u8_url, best_video_mp4_url
|
nonlocal best_m3u8_url, best_video_mp4_url
|
||||||
try:
|
try:
|
||||||
url = response.url
|
url = response.url
|
||||||
if url in seen_urls:
|
if not url or url in seen_urls:
|
||||||
return
|
return
|
||||||
seen_urls.add(url)
|
seen_urls.add(url)
|
||||||
|
|
||||||
|
owner = _cache.video_url_owner.get(url)
|
||||||
|
if owner and tweet_id and owner != tweet_id:
|
||||||
|
logging.warning(f"[tweet_id={tweet_id}] Rejecting URL owned by tweet_id={owner}: {url}")
|
||||||
|
return
|
||||||
|
|
||||||
|
content_type = (response.headers.get("content-type") or "").lower()
|
||||||
url_l = url.lower()
|
url_l = url.lower()
|
||||||
content_type = response.headers.get("content-type", "")
|
|
||||||
content_type_l = content_type.lower()
|
|
||||||
|
|
||||||
if ".m4s" in url_l:
|
if ".m4s" in url_l:
|
||||||
return
|
return
|
||||||
|
|
||||||
if (
|
if ".m3u8" in url_l or "application/vnd.apple.mpegurl" in content_type or "application/x-mpegurl" in content_type:
|
||||||
".m3u8" in url_l
|
|
||||||
or "application/vnd.apple.mpegurl" in content_type_l
|
|
||||||
or "application/x-mpegurl" in content_type_l
|
|
||||||
):
|
|
||||||
if best_m3u8_url is None:
|
if best_m3u8_url is None:
|
||||||
best_m3u8_url = url
|
best_m3u8_url = url
|
||||||
logging.info(f"📺 Found HLS playlist URL: {url}")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if (
|
if ".mp4" in url_l or "video/mp4" in content_type or "audio/mp4" in content_type:
|
||||||
".mp4" in url_l
|
if media_url_looks_audio_only(url):
|
||||||
or "video/mp4" in content_type_l
|
|
||||||
or "audio/mp4" in content_type_l
|
|
||||||
):
|
|
||||||
if is_audio_only_mp4(url, content_type):
|
|
||||||
logging.info(f"🔇 Ignoring audio-only MP4: {url}")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if best_video_mp4_url is None:
|
if best_video_mp4_url is None:
|
||||||
best_video_mp4_url = url
|
best_video_mp4_url = url
|
||||||
logging.info(f"🎥 Found VIDEO MP4 URL: {url}")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(f"Response parsing error: {e}")
|
logging.debug(f"[tweet_id={tweet_id}] response parse error: {e}")
|
||||||
|
|
||||||
page.on("response", handle_response)
|
|
||||||
|
|
||||||
def current_best():
|
|
||||||
return best_m3u8_url or best_video_mp4_url
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logging.info(f"🎬 Opening tweet page to capture video URL: {tweet_url}")
|
ctx = browser.new_context(
|
||||||
|
user_agent=(
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/145.0.7632.6 Safari/537.36"
|
||||||
|
),
|
||||||
|
viewport={"width": 1920, "height": 1080},
|
||||||
|
locale=locale,
|
||||||
|
)
|
||||||
|
page = ctx.new_page()
|
||||||
|
page.on("response", handle_response)
|
||||||
|
|
||||||
|
logging.info(f"[tweet_id={tweet_id}] 🎬 Opening tweet page to capture video URL: {tweet_url}")
|
||||||
page.goto(tweet_url, wait_until="domcontentloaded", timeout=40000)
|
page.goto(tweet_url, wait_until="domcontentloaded", timeout=40000)
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
player = page.locator('[data-testid="videoPlayer"]').first
|
player = page.locator('[data-testid="videoPlayer"]').first
|
||||||
|
|
||||||
if player.count() > 0:
|
if player.count() > 0:
|
||||||
try:
|
try:
|
||||||
player.scroll_into_view_if_needed(timeout=5000)
|
player.scroll_into_view_if_needed(timeout=5000)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
player.click(force=True, timeout=5000)
|
player.click(force=True, timeout=5000)
|
||||||
logging.info("▶️ Clicked video player")
|
except Exception:
|
||||||
except Exception as e:
|
pass
|
||||||
logging.info(f"⚠️ First player click failed: {e}")
|
|
||||||
else:
|
|
||||||
logging.warning("⚠️ No video player locator found on tweet page")
|
|
||||||
|
|
||||||
for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
|
for _ in range(VIDEO_PLAYER_WAIT_ROUNDS):
|
||||||
if current_best():
|
if current_best():
|
||||||
@@ -2753,42 +2770,42 @@ def extract_video_url_from_tweet_page(browser_context, tweet_url):
|
|||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
if not current_best() and player.count() > 0:
|
if not current_best() and player.count() > 0:
|
||||||
logging.info("🔁 No media URL found yet, retrying player interaction...")
|
|
||||||
try:
|
try:
|
||||||
player.click(force=True, timeout=5000)
|
player.click(force=True, timeout=5000)
|
||||||
time.sleep(PLAYWRIGHT_RETRY_SLEEP_S)
|
|
||||||
except Exception as e:
|
|
||||||
logging.info(f"⚠️ Retry click failed: {e}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
page.keyboard.press("Space")
|
|
||||||
time.sleep(1)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
try:
|
||||||
|
page.keyboard.press("Space")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
|
for _ in range(VIDEO_PLAYER_RETRY_ROUNDS):
|
||||||
if current_best():
|
if current_best():
|
||||||
break
|
break
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
selected_url = current_best()
|
selected = current_best()
|
||||||
if selected_url:
|
if selected and tweet_id:
|
||||||
logging.info(f"✅ Selected media URL for download: {selected_url}")
|
_cache.video_url_owner[selected] = tweet_id
|
||||||
else:
|
|
||||||
logging.warning(
|
|
||||||
f"⚠️ No playable media URL detected on tweet page: {tweet_url}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return selected_url
|
logging.info(f"[tweet_id={tweet_id}] ✅ Selected media URL for download: {selected}")
|
||||||
|
return selected
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(
|
logging.warning(f"[tweet_id={tweet_id}] ⚠️ Could not extract video URL: {e}")
|
||||||
f"⚠️ Could not extract video URL from tweet page {tweet_url}: {e}"
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
finally:
|
finally:
|
||||||
page.remove_listener("response", handle_response) # ← FIX 1: detach before close
|
try:
|
||||||
page.close()
|
if page:
|
||||||
|
page.remove_listener("response", handle_response)
|
||||||
|
page.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
if ctx:
|
||||||
|
ctx.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _probe_video_duration(file_path):
|
def _probe_video_duration(file_path):
|
||||||
probe_cmd = [
|
probe_cmd = [
|
||||||
@@ -3186,6 +3203,9 @@ def sync_feeds(args):
|
|||||||
),
|
),
|
||||||
"has_video": has_video,
|
"has_video": has_video,
|
||||||
"has_photo": has_photo,
|
"has_photo": has_photo,
|
||||||
|
"tweet_id": extract_tweet_id(tweet.tweet_url),
|
||||||
|
"resolved_video_url": None,
|
||||||
|
"resolved_video_hash": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
is_dup_state, reason_state = candidate_matches_state(
|
is_dup_state, reason_state = candidate_matches_state(
|
||||||
@@ -3219,6 +3239,31 @@ def sync_feeds(args):
|
|||||||
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
|
f"📬 {len(candidate_tweets)} tweets remain after duplicate filtering."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Pre-resolve video URLs in isolated contexts (deterministic text<->media binding)
|
||||||
|
if candidate_tweets:
|
||||||
|
with sync_playwright() as p_pre:
|
||||||
|
pre_browser = p_pre.chromium.launch(
|
||||||
|
headless=True,
|
||||||
|
args=["--disable-blink-features=AutomationControlled"],
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
for c in candidate_tweets:
|
||||||
|
if not c.get("has_video"):
|
||||||
|
continue
|
||||||
|
t = c["tweet"]
|
||||||
|
tid = c.get("tweet_id")
|
||||||
|
if not t.tweet_url or not tid:
|
||||||
|
continue
|
||||||
|
c["resolved_video_url"] = extract_video_url_from_tweet_page_isolated(
|
||||||
|
pre_browser,
|
||||||
|
t.tweet_url,
|
||||||
|
tweet_id=tid,
|
||||||
|
locale=bot_locale,
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
pre_browser.close()
|
||||||
|
|
||||||
if not candidate_tweets:
|
if not candidate_tweets:
|
||||||
logging.info(
|
logging.info(
|
||||||
"✅ No new tweets need posting after duplicate comparison."
|
"✅ No new tweets need posting after duplicate comparison."
|
||||||
@@ -3335,11 +3380,8 @@ def sync_feeds(args):
|
|||||||
temp_video_path = f"{temp_video_base}.mp4"
|
temp_video_path = f"{temp_video_base}.mp4"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
real_video_url = (
|
tweet_id = candidate.get("tweet_id")
|
||||||
extract_video_url_from_tweet_page(
|
real_video_url = candidate.get("resolved_video_url")
|
||||||
browser_context, tweet.tweet_url
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if not real_video_url:
|
if not real_video_url:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⚠️ Could not resolve playable video URL "
|
f"⚠️ Could not resolve playable video URL "
|
||||||
@@ -3361,9 +3403,20 @@ def sync_feeds(args):
|
|||||||
f"video:crop_failed:{tweet.tweet_url}"
|
f"video:crop_failed:{tweet.tweet_url}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
video_blob = get_blob_from_file(
|
video_hash = sha256_file(cropped_video_path)
|
||||||
cropped_video_path, bsky_client
|
candidate["resolved_video_hash"] = video_hash
|
||||||
)
|
owner = _cache.video_hash_owner.get(video_hash)
|
||||||
|
if owner and owner != tweet_id:
|
||||||
|
logging.warning(
|
||||||
|
f"[tweet_id={tweet_id}] ⚠️ Video hash already owned by tweet_id={owner}. Rejecting media."
|
||||||
|
)
|
||||||
|
media_upload_failures.append(f"video:hash_owned_by:{owner}")
|
||||||
|
video_blob = None
|
||||||
|
else:
|
||||||
|
_cache.video_hash_owner[video_hash] = tweet_id
|
||||||
|
video_blob = get_blob_from_file(
|
||||||
|
cropped_video_path, bsky_client
|
||||||
|
)
|
||||||
if not video_blob:
|
if not video_blob:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"⚠️ Video upload blob failed for "
|
f"⚠️ Video upload blob failed for "
|
||||||
|
|||||||
Reference in New Issue
Block a user