New video limit 179 seconds
This commit is contained in:
@@ -8,7 +8,6 @@ import httpx
|
|||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from atproto import Client, client_utils, models
|
from atproto import Client, client_utils, models
|
||||||
@@ -21,9 +20,7 @@ STATE_PATH = "twitter2bsky_state.json"
|
|||||||
SCRAPE_TWEET_LIMIT = 30
|
SCRAPE_TWEET_LIMIT = 30
|
||||||
DEDUPE_BSKY_LIMIT = 30
|
DEDUPE_BSKY_LIMIT = 30
|
||||||
TWEET_MAX_AGE_DAYS = 3
|
TWEET_MAX_AGE_DAYS = 3
|
||||||
|
VIDEO_MAX_DURATION_SECONDS = 179
|
||||||
STATE_MAX_ENTRIES = 5000
|
|
||||||
STATE_MAX_AGE_DAYS = 180
|
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -265,27 +262,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
|
|||||||
return hashlib.sha256(f"{normalized_text}||{media_fingerprint}".encode("utf-8")).hexdigest()
|
return hashlib.sha256(f"{normalized_text}||{media_fingerprint}".encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def safe_remove_file(path):
|
|
||||||
if path and os.path.exists(path):
|
|
||||||
try:
|
|
||||||
os.remove(path)
|
|
||||||
logging.debug(f"🧹 Removed temp file: {path}")
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f"⚠️ Could not remove temp file {path}: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
def build_temp_video_output_path(tweet):
|
|
||||||
"""
|
|
||||||
Create a unique temp mp4 path for this tweet.
|
|
||||||
"""
|
|
||||||
canonical_url = canonicalize_tweet_url(tweet.tweet_url) or ""
|
|
||||||
seed = canonical_url or f"{tweet.created_on}_{tweet.text[:50]}"
|
|
||||||
suffix = hashlib.sha256(seed.encode("utf-8")).hexdigest()[:12]
|
|
||||||
|
|
||||||
temp_dir = tempfile.gettempdir()
|
|
||||||
return os.path.join(temp_dir, f"twitter2bsky_{suffix}.mp4")
|
|
||||||
|
|
||||||
|
|
||||||
# --- Local State Management ---
|
# --- Local State Management ---
|
||||||
def default_state():
|
def default_state():
|
||||||
return {
|
return {
|
||||||
@@ -382,59 +358,47 @@ def candidate_matches_state(candidate, state):
|
|||||||
if canonical_tweet_url and canonical_tweet_url in posted_tweets:
|
if canonical_tweet_url and canonical_tweet_url in posted_tweets:
|
||||||
return True, "state:tweet_url"
|
return True, "state:tweet_url"
|
||||||
|
|
||||||
for record in posted_tweets.values():
|
for _, record in posted_tweets.items():
|
||||||
if record.get("text_media_key") == text_media_key:
|
if record.get("text_media_key") == text_media_key:
|
||||||
return True, "state:text_media_fingerprint"
|
return True, "state:text_media_fingerprint"
|
||||||
|
|
||||||
for record in posted_tweets.values():
|
for _, record in posted_tweets.items():
|
||||||
if record.get("normalized_text") == normalized_text:
|
if record.get("normalized_text") == normalized_text:
|
||||||
return True, "state:normalized_text"
|
return True, "state:normalized_text"
|
||||||
|
|
||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
def prune_state(state, max_entries=STATE_MAX_ENTRIES, max_age_days=STATE_MAX_AGE_DAYS):
|
def prune_state(state, max_entries=5000):
|
||||||
"""
|
"""
|
||||||
Keep state file from growing forever.
|
Keep state file from growing forever.
|
||||||
Prunes:
|
Prunes oldest records by posted_at if necessary.
|
||||||
- entries older than max_age_days
|
|
||||||
- entries beyond max_entries, keeping newest first
|
|
||||||
- orphan posted_by_bsky_uri keys
|
|
||||||
"""
|
"""
|
||||||
posted_tweets = state.get("posted_tweets", {})
|
posted_tweets = state.get("posted_tweets", {})
|
||||||
cutoff = arrow.utcnow().shift(days=-max_age_days)
|
|
||||||
|
|
||||||
kept_items = []
|
if len(posted_tweets) <= max_entries:
|
||||||
|
return state
|
||||||
|
|
||||||
|
sortable = []
|
||||||
for key, record in posted_tweets.items():
|
for key, record in posted_tweets.items():
|
||||||
posted_at_raw = record.get("posted_at")
|
posted_at = record.get("posted_at") or ""
|
||||||
keep = True
|
sortable.append((key, posted_at))
|
||||||
|
|
||||||
if posted_at_raw:
|
sortable.sort(key=lambda x: x[1], reverse=True)
|
||||||
try:
|
keep_keys = {key for key, _ in sortable[:max_entries]}
|
||||||
posted_at = arrow.get(posted_at_raw)
|
|
||||||
if posted_at < cutoff:
|
|
||||||
keep = False
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if keep:
|
new_posted_tweets = {}
|
||||||
kept_items.append((key, record))
|
for key, record in posted_tweets.items():
|
||||||
|
if key in keep_keys:
|
||||||
|
new_posted_tweets[key] = record
|
||||||
|
|
||||||
kept_items.sort(key=lambda item: item[1].get("posted_at", ""), reverse=True)
|
new_posted_by_bsky_uri = {}
|
||||||
kept_items = kept_items[:max_entries]
|
for bsky_uri, key in state.get("posted_by_bsky_uri", {}).items():
|
||||||
|
if key in keep_keys:
|
||||||
keep_keys = {key for key, _ in kept_items}
|
new_posted_by_bsky_uri[bsky_uri] = key
|
||||||
|
|
||||||
state["posted_tweets"] = {key: record for key, record in kept_items}
|
|
||||||
|
|
||||||
posted_by_bsky_uri = state.get("posted_by_bsky_uri", {})
|
|
||||||
state["posted_by_bsky_uri"] = {
|
|
||||||
bsky_uri: key
|
|
||||||
for bsky_uri, key in posted_by_bsky_uri.items()
|
|
||||||
if key in keep_keys
|
|
||||||
}
|
|
||||||
|
|
||||||
|
state["posted_tweets"] = new_posted_tweets
|
||||||
|
state["posted_by_bsky_uri"] = new_posted_by_bsky_uri
|
||||||
return state
|
return state
|
||||||
|
|
||||||
|
|
||||||
@@ -458,7 +422,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
if getattr(record, "reply", None) is not None:
|
if getattr(record, "reply", None) is not None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
text = getattr(record, "text", "") or ""
|
text = getattr(record, "text", "") or ""
|
||||||
normalized_text = normalize_post_text(text)
|
normalized_text = normalize_post_text(text)
|
||||||
|
|
||||||
urls = []
|
urls = []
|
||||||
@@ -905,7 +869,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
logging.error("❌ Downloaded video has invalid or unknown duration.")
|
logging.error("❌ Downloaded video has invalid or unknown duration.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
end_time = min(59, duration)
|
end_time = min(VIDEO_MAX_DURATION_SECONDS, duration)
|
||||||
|
|
||||||
if hasattr(video_clip, "subclipped"):
|
if hasattr(video_clip, "subclipped"):
|
||||||
cropped_clip = video_clip.subclipped(0, end_time)
|
cropped_clip = video_clip.subclipped(0, end_time)
|
||||||
@@ -927,7 +891,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
os.replace(temp_output, output_path)
|
os.replace(temp_output, output_path)
|
||||||
logging.info(f"✅ Video cropped to 59 seconds: {output_path}")
|
logging.info(f"✅ Video cropped to {int(end_time)} seconds: {output_path}")
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -935,8 +899,12 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
safe_remove_file(temp_input)
|
for path in [temp_input, temp_output]:
|
||||||
safe_remove_file(temp_output)
|
if os.path.exists(path):
|
||||||
|
try:
|
||||||
|
os.remove(path)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
||||||
@@ -971,8 +939,6 @@ def sync_feeds(args):
|
|||||||
logging.info("🔄 Starting sync cycle...")
|
logging.info("🔄 Starting sync cycle...")
|
||||||
try:
|
try:
|
||||||
state = load_state(STATE_PATH)
|
state = load_state(STATE_PATH)
|
||||||
state = prune_state(state)
|
|
||||||
save_state(state, STATE_PATH)
|
|
||||||
|
|
||||||
tweets = scrape_tweets_via_playwright(
|
tweets = scrape_tweets_via_playwright(
|
||||||
args.twitter_username,
|
args.twitter_username,
|
||||||
@@ -1063,7 +1029,7 @@ def sync_feeds(args):
|
|||||||
return
|
return
|
||||||
|
|
||||||
new_posts = 0
|
new_posts = 0
|
||||||
browser_state_file = "twitter_browser_state.json"
|
state_file = "twitter_browser_state.json"
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(
|
browser = p.chromium.launch(
|
||||||
@@ -1078,8 +1044,8 @@ def sync_feeds(args):
|
|||||||
),
|
),
|
||||||
"viewport": {"width": 1920, "height": 1080},
|
"viewport": {"width": 1920, "height": 1080},
|
||||||
}
|
}
|
||||||
if os.path.exists(browser_state_file):
|
if os.path.exists(state_file):
|
||||||
context_kwargs["storage_state"] = browser_state_file
|
context_kwargs["storage_state"] = state_file
|
||||||
|
|
||||||
context = browser.new_context(**context_kwargs)
|
context = browser.new_context(**context_kwargs)
|
||||||
|
|
||||||
@@ -1113,7 +1079,7 @@ def sync_feeds(args):
|
|||||||
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.")
|
logging.warning("⚠️ Tweet has video marker but no tweet URL. Skipping video.")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
temp_video_path = build_temp_video_output_path(tweet)
|
temp_video_path = "temp_video.mp4"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url)
|
real_video_url = extract_video_url_from_tweet_page(context, tweet.tweet_url)
|
||||||
@@ -1134,9 +1100,8 @@ def sync_feeds(args):
|
|||||||
video_embed = build_video_embed(video_blob, dynamic_alt)
|
video_embed = build_video_embed(video_blob, dynamic_alt)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
safe_remove_file(temp_video_path)
|
if os.path.exists(temp_video_path):
|
||||||
safe_remove_file(temp_video_path.replace(".mp4", "_source.mp4"))
|
os.remove(temp_video_path)
|
||||||
safe_remove_file(temp_video_path.replace(".mp4", "_cropped.mp4"))
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post_result = None
|
post_result = None
|
||||||
@@ -1152,7 +1117,7 @@ def sync_feeds(args):
|
|||||||
bsky_uri = getattr(post_result, "uri", None)
|
bsky_uri = getattr(post_result, "uri", None)
|
||||||
|
|
||||||
remember_posted_tweet(state, candidate, bsky_uri=bsky_uri)
|
remember_posted_tweet(state, candidate, bsky_uri=bsky_uri)
|
||||||
state = prune_state(state)
|
state = prune_state(state, max_entries=5000)
|
||||||
save_state(state, STATE_PATH)
|
save_state(state, STATE_PATH)
|
||||||
|
|
||||||
recent_bsky_posts.insert(0, {
|
recent_bsky_posts.insert(0, {
|
||||||
|
|||||||
Reference in New Issue
Block a user