diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py index ddf8fbd..5840e2c 100644 --- a/twitter2bsky_daemon.py +++ b/twitter2bsky_daemon.py @@ -181,7 +181,6 @@ def build_video_embed(video_blob, alt_text): # --- Playwright Scraping --- def scrape_tweets_via_playwright(username, password, email, target_handle): - """Logs in (or loads session) and scrapes tweets directly from the DOM.""" tweets = [] state_file = "twitter_browser_state.json" @@ -333,19 +332,18 @@ def scrape_tweets_via_playwright(username, password, email, target_handle): def extract_video_url_from_tweet_page(context, tweet_url): """ - Opens a tweet page and captures the best real video URL. + Prefer HLS playlist first because it usually contains the full playable stream + with audio + video and proper timing metadata. - Preference order: - 1. real video .mp4 - 2. .m3u8 playlist + Fallback to direct video MP4 only if no HLS playlist is found. - Ignores: + Ignore: - .m4s fragments - - audio-only mp4 URLs + - audio-only MP4 URLs """ page = context.new_page() - best_video_mp4_url = None best_m3u8_url = None + best_video_mp4_url = None def is_audio_only_mp4(url, content_type): url_l = url.lower() @@ -358,7 +356,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): ) def handle_response(response): - nonlocal best_video_mp4_url, best_m3u8_url + nonlocal best_m3u8_url, best_video_mp4_url try: url = response.url url_l = url.lower() @@ -368,16 +366,6 @@ def extract_video_url_from_tweet_page(context, tweet_url): if ".m4s" in url_l: return - if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l) and is_audio_only_mp4(url, content_type): - logging.info(f"🔇 Ignoring audio-only MP4: {url}") - return - - if ".mp4" in url_l or "video/mp4" in content_type_l: - if best_video_mp4_url is None: - best_video_mp4_url = url - logging.info(f"🎥 Found VIDEO MP4 URL: {url}") - return - if ( ".m3u8" in url_l or "application/vnd.apple.mpegurl" in content_type_l or @@ -388,6 +376,16 @@ def extract_video_url_from_tweet_page(context, tweet_url): logging.info(f"📺 Found HLS playlist URL: {url}") return + if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l) and is_audio_only_mp4(url, content_type): + logging.info(f"🔇 Ignoring audio-only MP4: {url}") + return + + if ".mp4" in url_l or "video/mp4" in content_type_l: + if best_video_mp4_url is None: + best_video_mp4_url = url + logging.info(f"🎥 Found VIDEO MP4 URL: {url}") + return + except Exception: pass @@ -406,7 +404,10 @@ def extract_video_url_from_tweet_page(context, tweet_url): except Exception: pass - return best_video_mp4_url or best_m3u8_url + selected_url = best_m3u8_url or best_video_mp4_url + if selected_url: + logging.info(f"✅ Selected media URL for download: {selected_url}") + return selected_url except Exception as e: logging.warning(f"⚠️ Could not extract video URL from tweet page {tweet_url}: {e}") @@ -419,7 +420,9 @@ def extract_video_url_from_tweet_page(context, tweet_url): def download_and_crop_video(video_url, output_path): """ Downloads a video from MP4 or HLS (.m3u8), then trims it to max 59 seconds. - Requires ffmpeg installed on the system. + Uses ffmpeg for download and MoviePy for crop. + + HLS is preferred because it usually produces a complete muxed file. """ temp_input = output_path.replace(".mp4", "_source.mp4") temp_output = output_path.replace(".mp4", "_cropped.mp4") @@ -430,10 +433,13 @@ def download_and_crop_video(video_url, output_path): download_cmd = [ "ffmpeg", "-y", + "-protocol_whitelist", "file,http,https,tcp,tls,crypto", + "-allowed_extensions", "ALL", "-i", video_url, "-c", "copy", temp_input, ] + download_result = subprocess.run( download_cmd, capture_output=True, @@ -480,10 +486,6 @@ def download_and_crop_video(video_url, output_path): return None os.replace(temp_output, output_path) - - if os.path.exists(temp_input): - os.remove(temp_input) - logging.info(f"✅ Video cropped to 59 seconds: {output_path}") return output_path @@ -543,8 +545,8 @@ def sync_feeds(args): for tweet in reversed(tweets): tweet_time = arrow.get(tweet.created_on) - if tweet_time <= last_bsky_time: - #if False: + #if tweet_time <= last_bsky_time: + if False: continue logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...") @@ -636,7 +638,6 @@ def main(): load_dotenv() parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync") - parser.add_argument("--twitter-username", help="Your Twitter login username") parser.add_argument("--twitter-password", help="Your Twitter login password") parser.add_argument("--twitter-email", help="Your Twitter email for security challenges")