Add link snippets only for URL posts (take 2)

This commit is contained in:
2026-04-05 21:44:18 +02:00
parent c1a9065744
commit 7614545893

View File

@@ -29,11 +29,6 @@ BSKY_TEXT_MAX_LENGTH = 275
# server-side proxy/PDS body-size caps.
# - Custom PDSes such as eurosky.social may accept images fine but fail on
# larger video blob uploads.
# - The safest approach is to:
# 1. cap duration
# 2. compress aggressively
# 3. log final file size
# 4. skip obviously too-large uploads
VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -84,10 +79,16 @@ def is_valid_url(url):
return False
def strip_trailing_url_punctuation(url):
    """Remove trailing whitespace and common punctuation glued to a URL.

    Handles sentence-final periods, ellipses, closing brackets/quotes, and
    similar characters that text extraction often leaves attached to a link.
    Falsy input (None or empty string) is returned unchanged.
    """
    if not url:
        return url
    # Strip any run of junk characters anchored at the end of the URL.
    trailing_junk = re.compile(r"[\s…\.,;:!?)\]\"']+$")
    return trailing_junk.sub("", url.strip())
def clean_url(url):
trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url)
cleaned_url = re.sub(r"[…\.]+$", "", cleaned_url)
cleaned_url = strip_trailing_url_punctuation(cleaned_url)
if is_valid_url(cleaned_url):
return cleaned_url
@@ -97,7 +98,7 @@ def clean_url(url):
def canonicalize_url(url):
    """Return the canonical form of *url* used for dedupe comparisons.

    Strips surrounding whitespace and trailing punctuation via
    strip_trailing_url_punctuation. Returns None for falsy input.
    """
    if not url:
        return None
    # Fix: the earlier bare `return url.strip()` made the punctuation-stripping
    # return unreachable; canonical URLs must have trailing punctuation removed
    # so that "https://x/a" and "https://x/a." dedupe to the same key.
    return strip_trailing_url_punctuation(url.strip())
def canonicalize_tweet_url(url):
@@ -136,13 +137,30 @@ def extract_non_x_urls_from_text(text):
result = []
for url in urls:
cleaned = re.sub(r"[…\.]+$", "", url.strip())
cleaned = strip_trailing_url_punctuation(url)
if cleaned and not is_x_or_twitter_domain(cleaned):
result.append(cleaned)
return result
def extract_ordered_non_x_urls(text):
    """
    Extract non-X URLs preserving original order and uniqueness.
    This is used for posting decisions, especially external link-card creation.
    """
    ordered_unique = []
    already_seen = set()
    for raw_url in extract_non_x_urls_from_text(text):
        canonical_url = canonicalize_url(raw_url)
        # Skip URLs that failed canonicalization or were already collected.
        if not canonical_url or canonical_url in already_seen:
            continue
        already_seen.add(canonical_url)
        ordered_unique.append(canonical_url)
    return ordered_unique
def extract_urls_from_facets(record):
"""
Extract link URLs from Bluesky rich text facets if present.
@@ -163,10 +181,31 @@ def extract_urls_from_facets(record):
return urls
def looks_like_title_plus_url_post(text):
    """
    Detect the specific desired style:
      - some title/body text
      - one non-X URL, typically on the last line

    Example:
        Headline text...
        https://example.com/story
    """
    if not text:
        return False
    # Collapse to non-blank lines only; we need at least a title line plus a URL line.
    nonblank_lines = [stripped for stripped in (ln.strip() for ln in text.splitlines()) if stripped]
    if len(nonblank_lines) < 2:
        return False
    final_line = nonblank_lines[-1]
    if not final_line.startswith(("http://", "https://")):
        return False
    # Exactly one non-X URL overall, and it must be the one on the final line.
    return (
        len(extract_ordered_non_x_urls(final_line)) == 1
        and len(extract_ordered_non_x_urls(text)) == 1
    )
def get_rate_limit_wait_seconds(error_obj, default_delay):
"""
Try to extract a sensible wait time from atproto/http error objects.
"""
try:
headers = getattr(error_obj, "headers", None)
if headers:
@@ -183,14 +222,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
def upload_blob_with_retry(client, binary_data, media_label="media"):
"""
Retry Bluesky blob upload when rate-limited.
Diagnostic note:
On alternate PDSes, large video uploads may fail for reasons other than
429 rate limits. In those cases we log the exception more explicitly and
return None so the caller can degrade gracefully.
"""
last_exception = None
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
@@ -237,9 +268,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def get_blob_from_url(media_url, client, http_client):
"""
Download media and upload to Bluesky with retry support for upload rate limits.
"""
try:
r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
if r.status_code != 200:
@@ -259,15 +287,6 @@ def get_blob_from_url(media_url, client, http_client):
def get_blob_from_file(file_path, client):
"""
Upload a local file as a Bluesky blob.
Diagnostic notes:
- We log the final file size because this is often the real reason a custom
PDS rejects video uploads.
- Self-hosted or alternate services may have stricter proxy/body-size limits
than bsky.social.
"""
try:
if not os.path.exists(file_path):
logging.warning(f"Could not upload local file {file_path}: file does not exist")
@@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
This should only be used when the post has no image/video embed, because
Bluesky posts can only carry one embed type.
This is only used when there is no image/video embed.
"""
link_metadata = fetch_link_metadata(url, http_client)
@@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text):
"""
Prepare the final public text exactly as it should be posted to Bluesky.
Does NOT append the source X URL.
Enforces the Bluesky text limit.
"""
raw_text = (text or "").strip()
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
@@ -379,9 +391,6 @@ def prepare_post_text(text):
def normalize_post_text(text):
"""
Normalize post text for duplicate detection.
"""
if not text:
return ""
@@ -391,9 +400,6 @@ def normalize_post_text(text):
def build_media_fingerprint(tweet):
"""
Build a deterministic media fingerprint from scraped tweet media.
"""
if not tweet or not tweet.media:
return "no-media"
@@ -419,9 +425,6 @@ def build_media_fingerprint(tweet):
def build_bsky_media_fingerprint(post_view):
"""
Best-effort media fingerprint from Bluesky embed structure.
"""
try:
embed = getattr(post_view, "embed", None)
if not embed:
@@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
def create_bsky_client(base_url, handle, password):
"""
Create a Bluesky/ATProto client pointed at the desired PDS or service host.
Supports custom hosts like eurosky.social.
"""
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
@@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH):
def remember_posted_tweet(state, candidate, bsky_uri=None):
"""
Store successful post in local state.
Primary key is canonical tweet URL when available.
Fallback key uses text_media_key.
"""
canonical_tweet_url = candidate.get("canonical_tweet_url")
fallback_key = f"textmedia:{candidate['text_media_key']}"
state_key = canonical_tweet_url or fallback_key
@@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
"media_fingerprint": candidate["media_fingerprint"],
"text_media_key": candidate["text_media_key"],
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
"bsky_uri": bsky_uri,
"tweet_created_on": candidate["tweet"].created_on,
"tweet_url": candidate["tweet"].tweet_url,
@@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
def candidate_matches_state(candidate, state):
"""
Strong private dedupe using local persistent state.
Match order:
1. canonical tweet URL
2. text + media fingerprint
3. normalized text
"""
canonical_tweet_url = candidate["canonical_tweet_url"]
text_media_key = candidate["text_media_key"]
normalized_text = candidate["normalized_text"]
@@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state):
def prune_state(state, max_entries=5000):
"""
Keep state file from growing forever.
Prunes oldest records by posted_at if necessary.
"""
posted_tweets = state.get("posted_tweets", {})
if len(posted_tweets) <= max_entries:
@@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000):
# --- Bluesky Post History ---
def get_recent_bsky_posts(client, handle, limit=30):
"""
Fetch recent top-level Bluesky posts for duplicate detection.
Returns a list of dicts with dedupe keys.
"""
recent_posts = []
try:
@@ -690,16 +670,16 @@ def make_rich(content):
raw = match.group(0)
if "\n" not in raw and "\r" not in raw:
return re.sub(r"[…\.]+$", "", raw)
return strip_trailing_url_punctuation(raw)
glued = raw.replace("\n", "").replace("\r", "")
test_url = re.sub(r"[…\.]+$", "", glued)
test_url = strip_trailing_url_punctuation(glued)
if is_valid_url(test_url):
return test_url
parts = raw.split("\n")
test_part0 = re.sub(r"[…\.]+$", "", parts[0])
test_part0 = strip_trailing_url_punctuation(parts[0])
if is_valid_url(test_part0):
return raw
@@ -725,7 +705,7 @@ def make_rich(content):
if word.startswith("http://"):
word = word.replace("http://", "https://", 1)
word = re.sub(r"[…\.]+$", "", word)
word = strip_trailing_url_punctuation(word)
clean_url_value = clean_url(word)
if clean_url_value and is_valid_url(clean_url_value):
@@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
page.close()
# --- Video Processing ---
def download_and_crop_video(video_url, output_path):
"""
Download, trim, and compress video before upload.
"""
temp_input = output_path.replace(".mp4", "_source.mp4")
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
temp_output = output_path.replace(".mp4", "_compressed.mp4")
@@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path):
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
"""
Multi-signal dedupe against recent Bluesky posts.
"""
candidate_non_x_urls = candidate["canonical_non_x_urls"]
candidate_text_media_key = candidate["text_media_key"]
candidate_normalized_text = candidate["normalized_text"]
@@ -1249,11 +1222,8 @@ def sync_feeds(args):
media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
canonical_non_x_urls = set()
for url in extract_non_x_urls_from_text(prepared_text):
canonical = canonicalize_url(url)
if canonical:
canonical_non_x_urls.add(canonical)
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
canonical_non_x_urls = set(ordered_non_x_urls)
candidate_tweets.append({
"tweet": tweet,
@@ -1264,6 +1234,8 @@ def sync_feeds(args):
"text_media_key": text_media_key,
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
"canonical_non_x_urls": canonical_non_x_urls,
"ordered_non_x_urls": ordered_non_x_urls,
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
})
except Exception as e:
@@ -1376,19 +1348,32 @@ def sync_feeds(args):
if os.path.exists(temp_video_path):
os.remove(temp_video_path)
# Only create an external link card if no image/video embed will be used.
if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]:
first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0]
external_embed = build_external_link_embed(
first_non_x_url,
bsky_client,
media_http_client,
fallback_title="Link"
)
if external_embed:
logging.info(f"🔗 Built external link card for URL: {first_non_x_url}")
else:
logging.info(f" No external link card metadata available for URL: {first_non_x_url}")
# Only create the external rich snippet when there is no uploaded media.
# This specifically supports posts in the style:
# headline text
# https://news-site/article
if not video_embed and not image_embeds:
candidate_url = None
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
elif candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
if candidate_url:
external_embed = build_external_link_embed(
candidate_url,
bsky_client,
media_http_client,
fallback_title="Link"
)
if external_embed:
logging.info(f"✅ Built external link card for URL: {candidate_url}")
else:
logging.info(f" Could not build external link card metadata for URL: {candidate_url}")
try:
post_result = None