Added new yml

This commit is contained in:
Guillem Hernandez Sola
2026-03-30 17:54:40 +02:00
parent 2450ab75b2
commit 9faabf48d0

View File

@@ -181,7 +181,6 @@ def build_video_embed(video_blob, alt_text):
# --- Playwright Scraping --- # --- Playwright Scraping ---
def scrape_tweets_via_playwright(username, password, email, target_handle): def scrape_tweets_via_playwright(username, password, email, target_handle):
"""Logs in (or loads session) and scrapes tweets directly from the DOM."""
tweets = [] tweets = []
state_file = "twitter_browser_state.json" state_file = "twitter_browser_state.json"
@@ -333,19 +332,18 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
def extract_video_url_from_tweet_page(context, tweet_url): def extract_video_url_from_tweet_page(context, tweet_url):
""" """
Opens a tweet page and captures the best real video URL. Prefer HLS playlist first because it usually contains the full playable stream
with audio + video and proper timing metadata.
Preference order: Fallback to direct video MP4 only if no HLS playlist is found.
1. real video .mp4
2. .m3u8 playlist
Ignores: Ignore:
- .m4s fragments - .m4s fragments
- audio-only mp4 URLs - audio-only MP4 URLs
""" """
page = context.new_page() page = context.new_page()
best_video_mp4_url = None
best_m3u8_url = None best_m3u8_url = None
best_video_mp4_url = None
def is_audio_only_mp4(url, content_type): def is_audio_only_mp4(url, content_type):
url_l = url.lower() url_l = url.lower()
@@ -358,7 +356,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
) )
def handle_response(response): def handle_response(response):
nonlocal best_video_mp4_url, best_m3u8_url nonlocal best_m3u8_url, best_video_mp4_url
try: try:
url = response.url url = response.url
url_l = url.lower() url_l = url.lower()
@@ -368,16 +366,6 @@ def extract_video_url_from_tweet_page(context, tweet_url):
if ".m4s" in url_l: if ".m4s" in url_l:
return return
if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l) and is_audio_only_mp4(url, content_type):
logging.info(f"🔇 Ignoring audio-only MP4: {url}")
return
if ".mp4" in url_l or "video/mp4" in content_type_l:
if best_video_mp4_url is None:
best_video_mp4_url = url
logging.info(f"🎥 Found VIDEO MP4 URL: {url}")
return
if ( if (
".m3u8" in url_l or ".m3u8" in url_l or
"application/vnd.apple.mpegurl" in content_type_l or "application/vnd.apple.mpegurl" in content_type_l or
@@ -388,6 +376,16 @@ def extract_video_url_from_tweet_page(context, tweet_url):
logging.info(f"📺 Found HLS playlist URL: {url}") logging.info(f"📺 Found HLS playlist URL: {url}")
return return
if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l) and is_audio_only_mp4(url, content_type):
logging.info(f"🔇 Ignoring audio-only MP4: {url}")
return
if ".mp4" in url_l or "video/mp4" in content_type_l:
if best_video_mp4_url is None:
best_video_mp4_url = url
logging.info(f"🎥 Found VIDEO MP4 URL: {url}")
return
except Exception: except Exception:
pass pass
@@ -406,7 +404,10 @@ def extract_video_url_from_tweet_page(context, tweet_url):
except Exception: except Exception:
pass pass
return best_video_mp4_url or best_m3u8_url selected_url = best_m3u8_url or best_video_mp4_url
if selected_url:
logging.info(f"✅ Selected media URL for download: {selected_url}")
return selected_url
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Could not extract video URL from tweet page {tweet_url}: {e}") logging.warning(f"⚠️ Could not extract video URL from tweet page {tweet_url}: {e}")
@@ -419,7 +420,9 @@ def extract_video_url_from_tweet_page(context, tweet_url):
def download_and_crop_video(video_url, output_path): def download_and_crop_video(video_url, output_path):
""" """
Downloads a video from MP4 or HLS (.m3u8), then trims it to max 59 seconds. Downloads a video from MP4 or HLS (.m3u8), then trims it to max 59 seconds.
Requires ffmpeg installed on the system. Uses ffmpeg for download and MoviePy for crop.
HLS is preferred because it usually produces a complete muxed file.
""" """
temp_input = output_path.replace(".mp4", "_source.mp4") temp_input = output_path.replace(".mp4", "_source.mp4")
temp_output = output_path.replace(".mp4", "_cropped.mp4") temp_output = output_path.replace(".mp4", "_cropped.mp4")
@@ -430,10 +433,13 @@ def download_and_crop_video(video_url, output_path):
download_cmd = [ download_cmd = [
"ffmpeg", "ffmpeg",
"-y", "-y",
"-protocol_whitelist", "file,http,https,tcp,tls,crypto",
"-allowed_extensions", "ALL",
"-i", video_url, "-i", video_url,
"-c", "copy", "-c", "copy",
temp_input, temp_input,
] ]
download_result = subprocess.run( download_result = subprocess.run(
download_cmd, download_cmd,
capture_output=True, capture_output=True,
@@ -480,10 +486,6 @@ def download_and_crop_video(video_url, output_path):
return None return None
os.replace(temp_output, output_path) os.replace(temp_output, output_path)
if os.path.exists(temp_input):
os.remove(temp_input)
logging.info(f"✅ Video cropped to 59 seconds: {output_path}") logging.info(f"✅ Video cropped to 59 seconds: {output_path}")
return output_path return output_path
@@ -543,8 +545,8 @@ def sync_feeds(args):
for tweet in reversed(tweets): for tweet in reversed(tweets):
tweet_time = arrow.get(tweet.created_on) tweet_time = arrow.get(tweet.created_on)
if tweet_time <= last_bsky_time: #if tweet_time <= last_bsky_time:
#if False: if False:
continue continue
logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...") logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...")
@@ -636,7 +638,6 @@ def main():
load_dotenv() load_dotenv()
parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync") parser = argparse.ArgumentParser(description="Twitter to Bluesky Sync")
parser.add_argument("--twitter-username", help="Your Twitter login username") parser.add_argument("--twitter-username", help="Your Twitter login username")
parser.add_argument("--twitter-password", help="Your Twitter login password") parser.add_argument("--twitter-password", help="Your Twitter login password")
parser.add_argument("--twitter-email", help="Your Twitter email for security challenges") parser.add_argument("--twitter-email", help="Your Twitter email for security challenges")