Timestamp and duplication improvements
This commit is contained in:
@@ -11,14 +11,20 @@ from atproto import Client, client_utils, models
|
|||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from moviepy import VideoFileClip
|
from moviepy import VideoFileClip
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Configuration ---
|
||||||
LOG_PATH = "twitter2bsky.log"
|
LOG_PATH = "twitter2bsky.log"
|
||||||
|
SCRAPE_TWEET_LIMIT = 30
|
||||||
|
DEDUPE_BSKY_LIMIT = 30
|
||||||
|
TWEET_MAX_AGE_DAYS = 3
|
||||||
|
|
||||||
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()],
|
handlers=[logging.FileHandler(LOG_PATH, encoding="utf-8"), logging.StreamHandler()],
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# --- Custom Classes ---
|
# --- Custom Classes ---
|
||||||
class ScrapedMedia:
|
class ScrapedMedia:
|
||||||
def __init__(self, url, media_type="photo"):
|
def __init__(self, url, media_type="photo"):
|
||||||
@@ -80,12 +86,57 @@ def get_blob_from_file(file_path, client):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_last_bsky(client, handle):
    """
    Return the creation time of the first original post in the author's feed.

    Feed entries that carry a ``reason`` or whose record has a ``reply``
    attribute are skipped (presumably reposts and replies - confirm against
    the atproto feed schema).

    Args:
        client: Object exposing ``get_author_feed(handle)``.
        handle: Account handle whose feed is inspected.

    Returns:
        ``arrow.get(created_at)`` of the first matching entry, or
        ``arrow.get(0)`` (the Unix epoch) when no entry matches.
    """
    timeline = client.get_author_feed(handle)
    for titem in timeline.feed:
        if titem.reason is None and getattr(titem.post.record, "reply", None) is None:
            return arrow.get(titem.post.record.created_at)
    return arrow.get(0)


def normalize_post_text(text):
    """
    Normalize post text for duplicate detection.

    The text is brought to Unicode NFC form, case-folded, and every run of
    whitespace (including CR/LF) is collapsed to a single space with the
    ends trimmed, so that visually identical posts compare equal.

    Args:
        text: Raw post text; may be ``None`` or empty.

    Returns:
        The normalized string, or ``""`` when ``text`` is falsy.
    """
    # Local import keeps this helper self-contained; unicodedata is stdlib.
    import unicodedata

    if not text:
        return ""

    # NFC makes precomposed and combining-mark spellings of the same
    # character (e.g. "é") identical before comparison.
    text = unicodedata.normalize("NFC", text)

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so this both collapses and strips.
    text = " ".join(text.split())

    # casefold() is the Unicode-aware form of lower() meant for caseless
    # matching.
    return text.casefold()
|
||||||
|
|
||||||
|
|
||||||
|
def get_recent_bsky_texts(client, handle, limit=30):
    """
    Collect normalized texts of recent top-level Bluesky posts.

    Requests up to ``limit`` entries from the author's feed and keeps the
    normalized text of each entry that has no ``reason`` and whose record
    has no ``reply`` (presumably filtering out reposts and replies).
    Because filtering happens after the fetch, fewer than ``limit`` texts
    may be returned. Entries whose normalized text is empty are dropped.

    Failures are never raised: a failing entry is logged at debug level and
    skipped, and a failing feed request is logged as a warning.

    Args:
        client: Object exposing ``get_author_feed(handle, limit=...)``.
        handle: Account handle whose feed is read.
        limit: Number of feed entries to request.

    Returns:
        List of normalized post texts in feed order (possibly empty).
    """
    collected = []

    try:
        feed_response = client.get_author_feed(handle, limit=limit)

        for entry in feed_response.feed:
            try:
                # Only entries without a "reason" are considered.
                if entry.reason is None:
                    post_record = entry.post.record

                    # Records carrying a "reply" reference are not top-level.
                    if getattr(post_record, "reply", None) is None:
                        raw = getattr(post_record, "text", "") or ""
                        cleaned = normalize_post_text(raw)

                        if cleaned:
                            collected.append(cleaned)

            except Exception as e:
                # One malformed entry must not abort the whole fetch.
                logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}")

    except Exception as e:
        # Best effort: return whatever was gathered before the failure.
        logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}")

    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def is_duplicate_bsky_text(candidate_text, recent_texts):
    """
    Report whether the candidate text already exists in recent Bluesky posts.

    The candidate is normalized with ``normalize_post_text`` and compared
    against ``recent_texts``, which is expected to hold already-normalized
    strings.

    Args:
        candidate_text: Raw text of the post about to be published.
        recent_texts: Collection of normalized texts of recent posts.

    Returns:
        True when the normalized candidate is non-empty and present in
        ``recent_texts``; False otherwise.
    """
    normalized_candidate = normalize_post_text(candidate_text)

    # A text-less post (e.g. media only) carries nothing to compare on.
    # Without this guard, an empty entry in recent_texts would make every
    # later text-less post look like a duplicate.
    if not normalized_candidate:
        return False

    # Direct membership test; building a set per call only added an extra
    # full pass and allocation for a single lookup.
    return normalized_candidate in recent_texts
|
||||||
|
|
||||||
|
|
||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
@@ -278,9 +329,9 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
articles = page.locator("article").all()
|
articles = page.locator("article").all()
|
||||||
logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing...")
|
logging.info(f"📊 Found {len(articles)} tweets on screen. Parsing up to {SCRAPE_TWEET_LIMIT}...")
|
||||||
|
|
||||||
for article in articles[:10]:
|
for article in articles[:SCRAPE_TWEET_LIMIT]:
|
||||||
try:
|
try:
|
||||||
time_el = article.locator("time").first
|
time_el = article.locator("time").first
|
||||||
if not time_el.is_visible():
|
if not time_el.is_visible():
|
||||||
@@ -385,7 +436,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
|
|||||||
logging.info(f"📺 Found HLS playlist URL: {url}")
|
logging.info(f"📺 Found HLS playlist URL: {url}")
|
||||||
return
|
return
|
||||||
|
|
||||||
if (".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l):
|
if ".mp4" in url_l or "video/mp4" in content_type_l or "audio/mp4" in content_type_l:
|
||||||
if is_audio_only_mp4(url, content_type):
|
if is_audio_only_mp4(url, content_type):
|
||||||
logging.info(f"🔇 Ignoring audio-only MP4: {url}")
|
logging.info(f"🔇 Ignoring audio-only MP4: {url}")
|
||||||
return
|
return
|
||||||
@@ -421,7 +472,6 @@ def extract_video_url_from_tweet_page(context, tweet_url):
|
|||||||
logging.info("▶️ Clicked video player")
|
logging.info("▶️ Clicked video player")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(f"⚠️ First player click failed: {e}")
|
logging.info(f"⚠️ First player click failed: {e}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logging.warning("⚠️ No video player locator found on tweet page")
|
logging.warning("⚠️ No video player locator found on tweet page")
|
||||||
|
|
||||||
@@ -475,6 +525,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
video_url_l = video_url.lower()
|
video_url_l = video_url.lower()
|
||||||
|
|
||||||
if ".m3u8" in video_url_l:
|
if ".m3u8" in video_url_l:
|
||||||
|
logging.info("📺 Using HLS ffmpeg mode")
|
||||||
download_cmd = [
|
download_cmd = [
|
||||||
"ffmpeg",
|
"ffmpeg",
|
||||||
"-y",
|
"-y",
|
||||||
@@ -485,6 +536,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
temp_input,
|
temp_input,
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
|
logging.info("🎥 Using direct MP4 ffmpeg mode")
|
||||||
download_cmd = [
|
download_cmd = [
|
||||||
"ffmpeg",
|
"ffmpeg",
|
||||||
"-y",
|
"-y",
|
||||||
@@ -554,6 +606,7 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
# --- Main Sync Function ---
|
# --- Main Sync Function ---
|
||||||
def sync_feeds(args):
|
def sync_feeds(args):
|
||||||
logging.info("🔄 Starting sync cycle...")
|
logging.info("🔄 Starting sync cycle...")
|
||||||
@@ -571,7 +624,16 @@ def sync_feeds(args):
|
|||||||
|
|
||||||
bsky_client = Client()
|
bsky_client = Client()
|
||||||
bsky_client.login(args.bsky_handle, args.bsky_password)
|
bsky_client.login(args.bsky_handle, args.bsky_password)
|
||||||
last_bsky_time = get_last_bsky(bsky_client, args.bsky_handle)
|
|
||||||
|
recent_bsky_texts = get_recent_bsky_texts(
|
||||||
|
bsky_client,
|
||||||
|
args.bsky_handle,
|
||||||
|
limit=DEDUPE_BSKY_LIMIT
|
||||||
|
)
|
||||||
|
logging.info(f"🧠 Loaded {len(recent_bsky_texts)} recent Bluesky post texts for duplicate detection.")
|
||||||
|
|
||||||
|
too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS)
|
||||||
|
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
|
||||||
|
|
||||||
new_posts = 0
|
new_posts = 0
|
||||||
state_file = "twitter_browser_state.json"
|
state_file = "twitter_browser_state.json"
|
||||||
@@ -597,12 +659,10 @@ def sync_feeds(args):
|
|||||||
for tweet in reversed(tweets):
|
for tweet in reversed(tweets):
|
||||||
tweet_time = arrow.get(tweet.created_on)
|
tweet_time = arrow.get(tweet.created_on)
|
||||||
|
|
||||||
if tweet_time <= last_bsky_time:
|
if tweet_time < too_old_cutoff:
|
||||||
#if False:
|
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logging.info(f"📝 Found new tweet from {tweet_time}. Posting to Bluesky...")
|
|
||||||
|
|
||||||
raw_text = tweet.text.strip()
|
raw_text = tweet.text.strip()
|
||||||
|
|
||||||
if len(raw_text) > 295:
|
if len(raw_text) > 295:
|
||||||
@@ -614,6 +674,12 @@ def sync_feeds(args):
|
|||||||
raw_text = truncated + "..."
|
raw_text = truncated + "..."
|
||||||
logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.")
|
logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.")
|
||||||
|
|
||||||
|
if is_duplicate_bsky_text(raw_text, recent_bsky_texts):
|
||||||
|
logging.info("⏭️ Skipping tweet because its text already exists in the last 30 Bluesky posts.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
logging.info(f"📝 Found candidate tweet from {tweet_time}. Posting to Bluesky...")
|
||||||
|
|
||||||
rich_text = make_rich(raw_text)
|
rich_text = make_rich(raw_text)
|
||||||
dynamic_alt = build_dynamic_alt(raw_text)
|
dynamic_alt = build_dynamic_alt(raw_text)
|
||||||
|
|
||||||
@@ -670,6 +736,9 @@ def sync_feeds(args):
|
|||||||
else:
|
else:
|
||||||
bsky_client.send_post(text=rich_text, langs=["ca"])
|
bsky_client.send_post(text=rich_text, langs=["ca"])
|
||||||
|
|
||||||
|
recent_bsky_texts.insert(0, normalize_post_text(raw_text))
|
||||||
|
recent_bsky_texts = recent_bsky_texts[:DEDUPE_BSKY_LIMIT]
|
||||||
|
|
||||||
new_posts += 1
|
new_posts += 1
|
||||||
logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}")
|
logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}")
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|||||||
Reference in New Issue
Block a user