Added snippets only in urls 2

This commit is contained in:
2026-04-05 21:44:18 +02:00
parent c1a9065744
commit 7614545893

View File

@@ -29,11 +29,6 @@ BSKY_TEXT_MAX_LENGTH = 275
# server-side proxy/PDS body-size caps. # server-side proxy/PDS body-size caps.
# - Custom PDSes such as eurosky.social may accept images fine but fail on # - Custom PDSes such as eurosky.social may accept images fine but fail on
# larger video blob uploads. # larger video blob uploads.
# - The safest approach is to:
# 1. cap duration
# 2. compress aggressively
# 3. log final file size
# 4. skip obviously too-large uploads
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -84,10 +79,16 @@ def is_valid_url(url):
return False return False
def strip_trailing_url_punctuation(url):
    """
    Remove trailing whitespace and sentence punctuation from a URL.

    Scraped text often glues punctuation onto a link, e.g.
    "https://a.b/c." or "(see https://a.b/c)". Trailing whitespace, the
    ellipsis character, ``. , ; : ! ?`` and quote characters are always
    dropped. A trailing ``)`` or ``]`` is dropped only when it has no matching
    opener earlier in the URL, so links such as
    ``https://en.wikipedia.org/wiki/Python_(programming_language)`` or
    ``http://[::1]`` keep the bracket that is genuinely part of the URL.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not url:
        return url

    always_strip = "….,;:!?\"'"
    closer_to_opener = {")": "(", "]": "["}

    cleaned = url.strip()
    while cleaned:
        last_char = cleaned[-1]
        if last_char.isspace() or last_char in always_strip:
            cleaned = cleaned[:-1]
            continue

        opener = closer_to_opener.get(last_char)
        # A closing bracket is only sentence punctuation when it is
        # unbalanced; a balanced one belongs to the URL itself.
        if opener is not None and cleaned.count(last_char) > cleaned.count(opener):
            cleaned = cleaned[:-1]
            continue

        break

    return cleaned
def clean_url(url): def clean_url(url):
trimmed_url = url.strip() trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url) cleaned_url = re.sub(r"\s+", "", trimmed_url)
cleaned_url = re.sub(r"[…\.]+$", "", cleaned_url) cleaned_url = strip_trailing_url_punctuation(cleaned_url)
if is_valid_url(cleaned_url): if is_valid_url(cleaned_url):
return cleaned_url return cleaned_url
@@ -97,7 +98,7 @@ def clean_url(url):
def canonicalize_url(url): def canonicalize_url(url):
if not url: if not url:
return None return None
return url.strip() return strip_trailing_url_punctuation(url.strip())
def canonicalize_tweet_url(url): def canonicalize_tweet_url(url):
@@ -136,13 +137,30 @@ def extract_non_x_urls_from_text(text):
result = [] result = []
for url in urls: for url in urls:
cleaned = re.sub(r"[…\.]+$", "", url.strip()) cleaned = strip_trailing_url_punctuation(url)
if cleaned and not is_x_or_twitter_domain(cleaned): if cleaned and not is_x_or_twitter_domain(cleaned):
result.append(cleaned) result.append(cleaned)
return result return result
def extract_ordered_non_x_urls(text):
    """
    Return the canonical non-X URLs found in ``text``, de-duplicated and in
    first-occurrence order.

    Each URL yielded by ``extract_non_x_urls_from_text`` is passed through
    ``canonicalize_url``; falsy canonical values are discarded and repeated
    canonical values are kept only once. The ordered result is used for
    posting decisions such as choosing the URL for an external link card.
    """
    canonical_urls = (
        canonicalize_url(found_url)
        for found_url in extract_non_x_urls_from_text(text)
    )
    # dict keys keep insertion order, giving order-preserving uniqueness.
    return list(dict.fromkeys(value for value in canonical_urls if value))
def extract_urls_from_facets(record): def extract_urls_from_facets(record):
""" """
Extract link URLs from Bluesky rich text facets if present. Extract link URLs from Bluesky rich text facets if present.
@@ -163,10 +181,31 @@ def extract_urls_from_facets(record):
return urls return urls
def looks_like_title_plus_url_post(text):
    """
    Report whether ``text`` has the "title followed by a link" layout.

    The layout is recognised when all of the following hold:
    - there are at least two non-blank lines,
    - the last non-blank line starts with ``http://`` or ``https://``,
    - that last line yields exactly one non-X URL, and
    - the whole text yields exactly one non-X URL.

    Example:
        Headline text...
        https://example.com/story

    Returns False for empty or ``None`` input.
    """
    if not text:
        return False

    non_blank_lines = [
        stripped for stripped in map(str.strip, text.splitlines()) if stripped
    ]
    if len(non_blank_lines) < 2:
        return False

    final_line = non_blank_lines[-1]
    final_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(text)

    if len(final_line_urls) != 1 or len(all_urls) != 1:
        return False
    return final_line.startswith(("http://", "https://"))
def get_rate_limit_wait_seconds(error_obj, default_delay): def get_rate_limit_wait_seconds(error_obj, default_delay):
"""
Try to extract a sensible wait time from atproto/http error objects.
"""
try: try:
headers = getattr(error_obj, "headers", None) headers = getattr(error_obj, "headers", None)
if headers: if headers:
@@ -183,14 +222,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
def upload_blob_with_retry(client, binary_data, media_label="media"): def upload_blob_with_retry(client, binary_data, media_label="media"):
"""
Retry Bluesky blob upload when rate-limited.
Diagnostic note:
On alternate PDSes, large video uploads may fail for reasons other than
429 rate limits. In those cases we log the exception more explicitly and
return None so the caller can degrade gracefully.
"""
last_exception = None last_exception = None
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1): for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
@@ -237,9 +268,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def get_blob_from_url(media_url, client, http_client): def get_blob_from_url(media_url, client, http_client):
"""
Download media and upload to Bluesky with retry support for upload rate limits.
"""
try: try:
r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True) r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
if r.status_code != 200: if r.status_code != 200:
@@ -259,15 +287,6 @@ def get_blob_from_url(media_url, client, http_client):
def get_blob_from_file(file_path, client): def get_blob_from_file(file_path, client):
"""
Upload a local file as a Bluesky blob.
Diagnostic notes:
- We log the final file size because this is often the real reason a custom
PDS rejects video uploads.
- Self-hosted or alternate services may have stricter proxy/body-size limits
than bsky.social.
"""
try: try:
if not os.path.exists(file_path): if not os.path.exists(file_path):
logging.warning(f"Could not upload local file {file_path}: file does not exist") logging.warning(f"Could not upload local file {file_path}: file does not exist")
@@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"): def build_external_link_embed(url, client, http_client, fallback_title="Link"):
""" """
Build a Bluesky external embed from a URL. Build a Bluesky external embed from a URL.
This is only used when there is no image/video embed.
This should only be used when the post has no image/video embed, because
Bluesky posts can only carry one embed type.
""" """
link_metadata = fetch_link_metadata(url, http_client) link_metadata = fetch_link_metadata(url, http_client)
@@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text): def prepare_post_text(text):
"""
Prepare the final public text exactly as it should be posted to Bluesky.
Does NOT append the source X URL.
Enforces the Bluesky text limit.
"""
raw_text = (text or "").strip() raw_text = (text or "").strip()
if len(raw_text) > BSKY_TEXT_MAX_LENGTH: if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
@@ -379,9 +391,6 @@ def prepare_post_text(text):
def normalize_post_text(text): def normalize_post_text(text):
"""
Normalize post text for duplicate detection.
"""
if not text: if not text:
return "" return ""
@@ -391,9 +400,6 @@ def normalize_post_text(text):
def build_media_fingerprint(tweet): def build_media_fingerprint(tweet):
"""
Build a deterministic media fingerprint from scraped tweet media.
"""
if not tweet or not tweet.media: if not tweet or not tweet.media:
return "no-media" return "no-media"
@@ -419,9 +425,6 @@ def build_media_fingerprint(tweet):
def build_bsky_media_fingerprint(post_view): def build_bsky_media_fingerprint(post_view):
"""
Best-effort media fingerprint from Bluesky embed structure.
"""
try: try:
embed = getattr(post_view, "embed", None) embed = getattr(post_view, "embed", None)
if not embed: if not embed:
@@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
def create_bsky_client(base_url, handle, password): def create_bsky_client(base_url, handle, password):
"""
Create a Bluesky/ATProto client pointed at the desired PDS or service host.
Supports custom hosts like eurosky.social.
"""
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/") normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}") logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
@@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH):
def remember_posted_tweet(state, candidate, bsky_uri=None): def remember_posted_tweet(state, candidate, bsky_uri=None):
"""
Store successful post in local state.
Primary key is canonical tweet URL when available.
Fallback key uses text_media_key.
"""
canonical_tweet_url = candidate.get("canonical_tweet_url") canonical_tweet_url = candidate.get("canonical_tweet_url")
fallback_key = f"textmedia:{candidate['text_media_key']}" fallback_key = f"textmedia:{candidate['text_media_key']}"
state_key = canonical_tweet_url or fallback_key state_key = canonical_tweet_url or fallback_key
@@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
"media_fingerprint": candidate["media_fingerprint"], "media_fingerprint": candidate["media_fingerprint"],
"text_media_key": candidate["text_media_key"], "text_media_key": candidate["text_media_key"],
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]), "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
"bsky_uri": bsky_uri, "bsky_uri": bsky_uri,
"tweet_created_on": candidate["tweet"].created_on, "tweet_created_on": candidate["tweet"].created_on,
"tweet_url": candidate["tweet"].tweet_url, "tweet_url": candidate["tweet"].tweet_url,
@@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
def candidate_matches_state(candidate, state): def candidate_matches_state(candidate, state):
"""
Strong private dedupe using local persistent state.
Match order:
1. canonical tweet URL
2. text + media fingerprint
3. normalized text
"""
canonical_tweet_url = candidate["canonical_tweet_url"] canonical_tweet_url = candidate["canonical_tweet_url"]
text_media_key = candidate["text_media_key"] text_media_key = candidate["text_media_key"]
normalized_text = candidate["normalized_text"] normalized_text = candidate["normalized_text"]
@@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state):
def prune_state(state, max_entries=5000): def prune_state(state, max_entries=5000):
"""
Keep state file from growing forever.
Prunes oldest records by posted_at if necessary.
"""
posted_tweets = state.get("posted_tweets", {}) posted_tweets = state.get("posted_tweets", {})
if len(posted_tweets) <= max_entries: if len(posted_tweets) <= max_entries:
@@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000):
# --- Bluesky Post History --- # --- Bluesky Post History ---
def get_recent_bsky_posts(client, handle, limit=30): def get_recent_bsky_posts(client, handle, limit=30):
"""
Fetch recent top-level Bluesky posts for duplicate detection.
Returns a list of dicts with dedupe keys.
"""
recent_posts = [] recent_posts = []
try: try:
@@ -690,16 +670,16 @@ def make_rich(content):
raw = match.group(0) raw = match.group(0)
if "\n" not in raw and "\r" not in raw: if "\n" not in raw and "\r" not in raw:
return re.sub(r"[…\.]+$", "", raw) return strip_trailing_url_punctuation(raw)
glued = raw.replace("\n", "").replace("\r", "") glued = raw.replace("\n", "").replace("\r", "")
test_url = re.sub(r"[…\.]+$", "", glued) test_url = strip_trailing_url_punctuation(glued)
if is_valid_url(test_url): if is_valid_url(test_url):
return test_url return test_url
parts = raw.split("\n") parts = raw.split("\n")
test_part0 = re.sub(r"[…\.]+$", "", parts[0]) test_part0 = strip_trailing_url_punctuation(parts[0])
if is_valid_url(test_part0): if is_valid_url(test_part0):
return raw return raw
@@ -725,7 +705,7 @@ def make_rich(content):
if word.startswith("http://"): if word.startswith("http://"):
word = word.replace("http://", "https://", 1) word = word.replace("http://", "https://", 1)
word = re.sub(r"[…\.]+$", "", word) word = strip_trailing_url_punctuation(word)
clean_url_value = clean_url(word) clean_url_value = clean_url(word)
if clean_url_value and is_valid_url(clean_url_value): if clean_url_value and is_valid_url(clean_url_value):
@@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
page.close() page.close()
# --- Video Processing ---
def download_and_crop_video(video_url, output_path): def download_and_crop_video(video_url, output_path):
"""
Download, trim, and compress video before upload.
"""
temp_input = output_path.replace(".mp4", "_source.mp4") temp_input = output_path.replace(".mp4", "_source.mp4")
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
temp_output = output_path.replace(".mp4", "_compressed.mp4") temp_output = output_path.replace(".mp4", "_compressed.mp4")
@@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path):
def candidate_matches_existing_bsky(candidate, recent_bsky_posts): def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
"""
Multi-signal dedupe against recent Bluesky posts.
"""
candidate_non_x_urls = candidate["canonical_non_x_urls"] candidate_non_x_urls = candidate["canonical_non_x_urls"]
candidate_text_media_key = candidate["text_media_key"] candidate_text_media_key = candidate["text_media_key"]
candidate_normalized_text = candidate["normalized_text"] candidate_normalized_text = candidate["normalized_text"]
@@ -1249,11 +1222,8 @@ def sync_feeds(args):
media_fingerprint = build_media_fingerprint(tweet) media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint) text_media_key = build_text_media_key(normalized_text, media_fingerprint)
canonical_non_x_urls = set() ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
for url in extract_non_x_urls_from_text(prepared_text): canonical_non_x_urls = set(ordered_non_x_urls)
canonical = canonicalize_url(url)
if canonical:
canonical_non_x_urls.add(canonical)
candidate_tweets.append({ candidate_tweets.append({
"tweet": tweet, "tweet": tweet,
@@ -1264,6 +1234,8 @@ def sync_feeds(args):
"text_media_key": text_media_key, "text_media_key": text_media_key,
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url), "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
"canonical_non_x_urls": canonical_non_x_urls, "canonical_non_x_urls": canonical_non_x_urls,
"ordered_non_x_urls": ordered_non_x_urls,
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
}) })
except Exception as e: except Exception as e:
@@ -1376,19 +1348,32 @@ def sync_feeds(args):
if os.path.exists(temp_video_path): if os.path.exists(temp_video_path):
os.remove(temp_video_path) os.remove(temp_video_path)
# Only create an external link card if no image/video embed will be used. # Only create the external rich snippet when there is no uploaded media.
if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]: # This specifically supports posts in the style:
first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0] # headline text
external_embed = build_external_link_embed( # https://news-site/article
first_non_x_url, if not video_embed and not image_embeds:
bsky_client, candidate_url = None
media_http_client,
fallback_title="Link" if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
) candidate_url = candidate["ordered_non_x_urls"][0]
if external_embed: logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
logging.info(f"🔗 Built external link card for URL: {first_non_x_url}") elif candidate.get("ordered_non_x_urls"):
else: candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f" No external link card metadata available for URL: {first_non_x_url}") logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
if candidate_url:
external_embed = build_external_link_embed(
candidate_url,
bsky_client,
media_http_client,
fallback_title="Link"
)
if external_embed:
logging.info(f"✅ Built external link card for URL: {candidate_url}")
else:
logging.info(f" Could not build external link card metadata for URL: {candidate_url}")
try: try:
post_result = None post_result = None