30 vs 30
This commit is contained in:
@@ -86,6 +86,23 @@ def get_blob_from_file(file_path, client):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_post_text(text, *, max_length=295, truncate_at=290, suffix="..."):
    """
    Prepare the final text exactly as it would be posted to Bluesky.

    The text is stripped of surrounding whitespace. If the stripped text is
    longer than ``max_length`` characters it is cut to ``truncate_at``
    characters, backed up to the last space inside that cut (so a word is
    not split when a space is available) and ``suffix`` is appended.

    With the default settings a truncated result is at most 293 characters
    (290 + len("...")), and texts of up to 295 characters pass through
    unchanged. Because a truncated result is always shorter than
    ``max_length``, calling this function on its own output returns the
    same string, which keeps duplicate comparisons stable.

    Args:
        text: Source text; ``None`` or any other falsy value is treated as
            an empty string.
        max_length: Longest text (in ``len()`` units, i.e. code points)
            that is returned without truncation.
        truncate_at: Number of leading characters kept before looking for
            a word boundary.
        suffix: Marker appended to truncated text.

    Returns:
        The stripped, possibly truncated text.
    """
    raw_text = (text or "").strip()

    if len(raw_text) <= max_length:
        return raw_text

    truncated = raw_text[:truncate_at]

    # Only a plain space counts as a word boundary here. The text was
    # stripped above, so index 0 can never be a space and "> 0" simply
    # means "a space was found".
    last_space = truncated.rfind(" ")
    if last_space > 0:
        truncated = truncated[:last_space]

    return truncated + suffix
|
||||||
|
|
||||||
|
|
||||||
def normalize_post_text(text):
|
def normalize_post_text(text):
|
||||||
"""
|
"""
|
||||||
Normalize post text for duplicate detection.
|
Normalize post text for duplicate detection.
|
||||||
@@ -98,11 +115,12 @@ def normalize_post_text(text):
|
|||||||
return text.lower()
|
return text.lower()
|
||||||
|
|
||||||
|
|
||||||
def get_recent_bsky_texts(client, handle, limit=30):
|
def get_recent_bsky_posts(client, handle, limit=30):
|
||||||
"""
|
"""
|
||||||
Fetch recent top-level Bluesky post texts for duplicate detection.
|
Fetch recent top-level Bluesky posts for duplicate detection.
|
||||||
|
Returns a list of dicts with original and normalized text.
|
||||||
"""
|
"""
|
||||||
recent_texts = []
|
recent_posts = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
timeline = client.get_author_feed(handle, limit=limit)
|
timeline = client.get_author_feed(handle, limit=limit)
|
||||||
@@ -117,10 +135,15 @@ def get_recent_bsky_texts(client, handle, limit=30):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
text = getattr(record, "text", "") or ""
|
text = getattr(record, "text", "") or ""
|
||||||
normalized = normalize_post_text(text)
|
prepared = prepare_post_text(text)
|
||||||
|
normalized = normalize_post_text(prepared)
|
||||||
|
|
||||||
if normalized:
|
if normalized:
|
||||||
recent_texts.append(normalized)
|
recent_posts.append({
|
||||||
|
"text": prepared,
|
||||||
|
"normalized_text": normalized,
|
||||||
|
"created_at": getattr(record, "created_at", None),
|
||||||
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}")
|
logging.debug(f"Skipping one Bluesky feed item during dedupe fetch: {e}")
|
||||||
@@ -128,15 +151,7 @@ def get_recent_bsky_texts(client, handle, limit=30):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}")
|
logging.warning(f"⚠️ Could not fetch recent Bluesky posts for duplicate detection: {e}")
|
||||||
|
|
||||||
return recent_texts
|
return recent_posts
|
||||||
|
|
||||||
|
|
||||||
def is_duplicate_bsky_text(candidate_text, recent_texts):
|
|
||||||
"""
|
|
||||||
Returns True if the candidate text already exists in recent Bluesky posts.
|
|
||||||
"""
|
|
||||||
normalized_candidate = normalize_post_text(candidate_text)
|
|
||||||
return normalized_candidate in set(recent_texts)
|
|
||||||
|
|
||||||
|
|
||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
@@ -377,25 +392,6 @@ def scrape_tweets_via_playwright(username, password, email, target_handle):
|
|||||||
|
|
||||||
|
|
||||||
def extract_video_url_from_tweet_page(context, tweet_url):
|
def extract_video_url_from_tweet_page(context, tweet_url):
|
||||||
"""
|
|
||||||
Open tweet page and capture media requests.
|
|
||||||
|
|
||||||
Strategy:
|
|
||||||
- listen for network responses
|
|
||||||
- wait for player
|
|
||||||
- scroll into view
|
|
||||||
- click player
|
|
||||||
- poll for a few seconds
|
|
||||||
- retry interaction once if needed
|
|
||||||
|
|
||||||
Preference:
|
|
||||||
1. HLS .m3u8
|
|
||||||
2. real video .mp4
|
|
||||||
|
|
||||||
Ignore:
|
|
||||||
- .m4s
|
|
||||||
- audio-only mp4
|
|
||||||
"""
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
best_m3u8_url = None
|
best_m3u8_url = None
|
||||||
best_video_mp4_url = None
|
best_video_mp4_url = None
|
||||||
@@ -625,16 +621,61 @@ def sync_feeds(args):
|
|||||||
bsky_client = Client()
|
bsky_client = Client()
|
||||||
bsky_client.login(args.bsky_handle, args.bsky_password)
|
bsky_client.login(args.bsky_handle, args.bsky_password)
|
||||||
|
|
||||||
recent_bsky_texts = get_recent_bsky_texts(
|
recent_bsky_posts = get_recent_bsky_posts(
|
||||||
bsky_client,
|
bsky_client,
|
||||||
args.bsky_handle,
|
args.bsky_handle,
|
||||||
limit=DEDUPE_BSKY_LIMIT
|
limit=DEDUPE_BSKY_LIMIT
|
||||||
)
|
)
|
||||||
logging.info(f"🧠 Loaded {len(recent_bsky_texts)} recent Bluesky post texts for duplicate detection.")
|
recent_bsky_text_set = {post["normalized_text"] for post in recent_bsky_posts if post["normalized_text"]}
|
||||||
|
|
||||||
|
logging.info(f"🧠 Loaded {len(recent_bsky_posts)} recent Bluesky posts for 30-vs-30 duplicate detection.")
|
||||||
|
logging.info(f"🧠 Built normalized Bluesky dedupe set with {len(recent_bsky_text_set)} entries.")
|
||||||
|
|
||||||
too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS)
|
too_old_cutoff = arrow.utcnow().shift(days=-TWEET_MAX_AGE_DAYS)
|
||||||
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
|
logging.info(f"🕒 Will ignore tweets older than: {too_old_cutoff}")
|
||||||
|
|
||||||
|
candidate_tweets = []
|
||||||
|
|
||||||
|
for tweet in reversed(tweets):
|
||||||
|
try:
|
||||||
|
tweet_time = arrow.get(tweet.created_on)
|
||||||
|
|
||||||
|
if tweet_time < too_old_cutoff:
|
||||||
|
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
prepared_text = prepare_post_text(tweet.text)
|
||||||
|
normalized_text = normalize_post_text(prepared_text)
|
||||||
|
|
||||||
|
if not normalized_text:
|
||||||
|
logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
candidate_tweets.append({
|
||||||
|
"tweet": tweet,
|
||||||
|
"tweet_time": tweet_time,
|
||||||
|
"raw_text": prepared_text,
|
||||||
|
"normalized_text": normalized_text,
|
||||||
|
})
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
|
||||||
|
|
||||||
|
logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for comparison against recent Bluesky posts.")
|
||||||
|
|
||||||
|
tweets_to_post = []
|
||||||
|
for candidate in candidate_tweets:
|
||||||
|
if candidate["normalized_text"] in recent_bsky_text_set:
|
||||||
|
logging.info("⏭️ Skipping candidate because text already exists in the last 30 Bluesky posts.")
|
||||||
|
continue
|
||||||
|
tweets_to_post.append(candidate)
|
||||||
|
|
||||||
|
logging.info(f"📬 {len(tweets_to_post)} tweets remain after 30-vs-30 duplicate filtering.")
|
||||||
|
|
||||||
|
if not tweets_to_post:
|
||||||
|
logging.info("✅ No new tweets need posting after duplicate comparison.")
|
||||||
|
return
|
||||||
|
|
||||||
new_posts = 0
|
new_posts = 0
|
||||||
state_file = "twitter_browser_state.json"
|
state_file = "twitter_browser_state.json"
|
||||||
|
|
||||||
@@ -656,29 +697,12 @@ def sync_feeds(args):
|
|||||||
|
|
||||||
context = browser.new_context(**context_kwargs)
|
context = browser.new_context(**context_kwargs)
|
||||||
|
|
||||||
for tweet in reversed(tweets):
|
for candidate in tweets_to_post:
|
||||||
tweet_time = arrow.get(tweet.created_on)
|
tweet = candidate["tweet"]
|
||||||
|
tweet_time = candidate["tweet_time"]
|
||||||
|
raw_text = candidate["raw_text"]
|
||||||
|
|
||||||
if tweet_time < too_old_cutoff:
|
logging.info(f"📝 Posting missing tweet from {tweet_time} to Bluesky...")
|
||||||
logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_text = tweet.text.strip()
|
|
||||||
|
|
||||||
if len(raw_text) > 295:
|
|
||||||
truncated = raw_text[:290]
|
|
||||||
last_space = truncated.rfind(" ")
|
|
||||||
if last_space > 0:
|
|
||||||
raw_text = truncated[:last_space] + "..."
|
|
||||||
else:
|
|
||||||
raw_text = truncated + "..."
|
|
||||||
logging.info("✂️ Tweet exceeded 300 characters. Truncated safely for Bluesky.")
|
|
||||||
|
|
||||||
if is_duplicate_bsky_text(raw_text, recent_bsky_texts):
|
|
||||||
logging.info("⏭️ Skipping tweet because its text already exists in the last 30 Bluesky posts.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
logging.info(f"📝 Found candidate tweet from {tweet_time}. Posting to Bluesky...")
|
|
||||||
|
|
||||||
rich_text = make_rich(raw_text)
|
rich_text = make_rich(raw_text)
|
||||||
dynamic_alt = build_dynamic_alt(raw_text)
|
dynamic_alt = build_dynamic_alt(raw_text)
|
||||||
@@ -736,9 +760,7 @@ def sync_feeds(args):
|
|||||||
else:
|
else:
|
||||||
bsky_client.send_post(text=rich_text, langs=["ca"])
|
bsky_client.send_post(text=rich_text, langs=["ca"])
|
||||||
|
|
||||||
recent_bsky_texts.insert(0, normalize_post_text(raw_text))
|
recent_bsky_text_set.add(candidate["normalized_text"])
|
||||||
recent_bsky_texts = recent_bsky_texts[:DEDUPE_BSKY_LIMIT]
|
|
||||||
|
|
||||||
new_posts += 1
|
new_posts += 1
|
||||||
logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}")
|
logging.info(f"✅ Posted new tweet to Bluesky: {raw_text}")
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|||||||
Reference in New Issue
Block a user