Added snippets only in urls 2
This commit is contained in:
@@ -29,11 +29,6 @@ BSKY_TEXT_MAX_LENGTH = 275
|
|||||||
# server-side proxy/PDS body-size caps.
|
# server-side proxy/PDS body-size caps.
|
||||||
# - Custom PDSes such as eurosky.social may accept images fine but fail on
|
# - Custom PDSes such as eurosky.social may accept images fine but fail on
|
||||||
# larger video blob uploads.
|
# larger video blob uploads.
|
||||||
# - The safest approach is to:
|
|
||||||
# 1. cap duration
|
|
||||||
# 2. compress aggressively
|
|
||||||
# 3. log final file size
|
|
||||||
# 4. skip obviously too-large uploads
|
|
||||||
VIDEO_MAX_DURATION_SECONDS = 179
|
VIDEO_MAX_DURATION_SECONDS = 179
|
||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
@@ -84,10 +79,16 @@ def is_valid_url(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def strip_trailing_url_punctuation(url):
    """
    Remove trailing whitespace and sentence punctuation from a URL.

    Scraped text often glues punctuation onto the end of a link
    ("https://example.com/story...", "https://example.com/a)."). This strips
    that residue from the right-hand side only.

    Closing brackets are handled carefully: a trailing ")" or "]" is removed
    only when it has no matching opener inside the URL, so links such as
    https://en.wikipedia.org/wiki/Python_(programming_language) keep their
    final parenthesis.

    Args:
        url: The raw URL string. Falsy values (None, "") are returned unchanged.

    Returns:
        The URL with surrounding whitespace and trailing punctuation removed.
    """
    if not url:
        return url

    # Characters that are never a meaningful final character of a URL here.
    always_strip = "….,;:!?\"'"
    # Closing bracket -> its opener; only unbalanced closers are stripped.
    bracket_openers = {")": "(", "]": "["}

    text = url.strip()
    end = len(text)

    while end:
        last_char = text[end - 1]

        if last_char.isspace() or last_char in always_strip:
            end -= 1
            continue

        opener = bracket_openers.get(last_char)
        if opener is not None and text.count(last_char, 0, end) > text.count(opener, 0, end):
            # More closers than openers: this bracket belongs to the
            # surrounding sentence, not to the URL itself.
            end -= 1
            continue

        break

    return text[:end]
|
||||||
|
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
trimmed_url = url.strip()
|
trimmed_url = url.strip()
|
||||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||||
cleaned_url = re.sub(r"[…\.]+$", "", cleaned_url)
|
cleaned_url = strip_trailing_url_punctuation(cleaned_url)
|
||||||
|
|
||||||
if is_valid_url(cleaned_url):
|
if is_valid_url(cleaned_url):
|
||||||
return cleaned_url
|
return cleaned_url
|
||||||
@@ -97,7 +98,7 @@ def clean_url(url):
|
|||||||
def canonicalize_url(url):
|
def canonicalize_url(url):
|
||||||
if not url:
|
if not url:
|
||||||
return None
|
return None
|
||||||
return url.strip()
|
return strip_trailing_url_punctuation(url.strip())
|
||||||
|
|
||||||
|
|
||||||
def canonicalize_tweet_url(url):
|
def canonicalize_tweet_url(url):
|
||||||
@@ -136,13 +137,30 @@ def extract_non_x_urls_from_text(text):
|
|||||||
result = []
|
result = []
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
cleaned = re.sub(r"[…\.]+$", "", url.strip())
|
cleaned = strip_trailing_url_punctuation(url)
|
||||||
if cleaned and not is_x_or_twitter_domain(cleaned):
|
if cleaned and not is_x_or_twitter_domain(cleaned):
|
||||||
result.append(cleaned)
|
result.append(cleaned)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ordered_non_x_urls(text):
    """
    Return the canonical non-X URLs found in ``text``, de-duplicated and in
    first-appearance order.

    Each URL reported by ``extract_non_x_urls_from_text`` is canonicalized via
    ``canonicalize_url``; empty canonical values are skipped and repeated ones
    keep only their first position. Used for posting decisions, especially
    external link-card creation.
    """
    # A dict preserves insertion order, so it serves as an ordered set here.
    first_seen = {}

    for raw_url in extract_non_x_urls_from_text(text):
        canonical_url = canonicalize_url(raw_url)
        if not canonical_url:
            continue
        first_seen.setdefault(canonical_url, None)

    return list(first_seen)
|
||||||
|
|
||||||
|
|
||||||
def extract_urls_from_facets(record):
|
def extract_urls_from_facets(record):
|
||||||
"""
|
"""
|
||||||
Extract link URLs from Bluesky rich text facets if present.
|
Extract link URLs from Bluesky rich text facets if present.
|
||||||
@@ -163,10 +181,31 @@ def extract_urls_from_facets(record):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_title_plus_url_post(text):
    """
    Report whether ``text`` has the "headline followed by a link" shape.

    The shape is:
    - at least two non-blank lines
    - exactly one non-X URL in the whole text
    - that URL sits on the last non-blank line, which itself starts with
      http:// or https://

    Example:
    Headline text...
    https://example.com/story
    """
    if not text:
        return False

    # Keep only lines that still have content after trimming.
    non_blank_lines = [
        trimmed
        for trimmed in (raw_line.strip() for raw_line in text.splitlines())
        if trimmed
    ]
    if len(non_blank_lines) <= 1:
        return False

    final_line = non_blank_lines[-1]
    final_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(text)

    # Exactly one link overall, and it must be the one on the final line.
    if len(final_line_urls) != 1 or len(all_urls) != 1:
        return False

    return final_line.startswith(("http://", "https://"))
|
||||||
|
|
||||||
|
|
||||||
def get_rate_limit_wait_seconds(error_obj, default_delay):
|
def get_rate_limit_wait_seconds(error_obj, default_delay):
|
||||||
"""
|
|
||||||
Try to extract a sensible wait time from atproto/http error objects.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
headers = getattr(error_obj, "headers", None)
|
headers = getattr(error_obj, "headers", None)
|
||||||
if headers:
|
if headers:
|
||||||
@@ -183,14 +222,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
|
|||||||
|
|
||||||
|
|
||||||
def upload_blob_with_retry(client, binary_data, media_label="media"):
|
def upload_blob_with_retry(client, binary_data, media_label="media"):
|
||||||
"""
|
|
||||||
Retry Bluesky blob upload when rate-limited.
|
|
||||||
|
|
||||||
Diagnostic note:
|
|
||||||
On alternate PDSes, large video uploads may fail for reasons other than
|
|
||||||
429 rate limits. In those cases we log the exception more explicitly and
|
|
||||||
return None so the caller can degrade gracefully.
|
|
||||||
"""
|
|
||||||
last_exception = None
|
last_exception = None
|
||||||
|
|
||||||
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
|
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
|
||||||
@@ -237,9 +268,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
|
|||||||
|
|
||||||
|
|
||||||
def get_blob_from_url(media_url, client, http_client):
|
def get_blob_from_url(media_url, client, http_client):
|
||||||
"""
|
|
||||||
Download media and upload to Bluesky with retry support for upload rate limits.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
|
r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
@@ -259,15 +287,6 @@ def get_blob_from_url(media_url, client, http_client):
|
|||||||
|
|
||||||
|
|
||||||
def get_blob_from_file(file_path, client):
|
def get_blob_from_file(file_path, client):
|
||||||
"""
|
|
||||||
Upload a local file as a Bluesky blob.
|
|
||||||
|
|
||||||
Diagnostic notes:
|
|
||||||
- We log the final file size because this is often the real reason a custom
|
|
||||||
PDS rejects video uploads.
|
|
||||||
- Self-hosted or alternate services may have stricter proxy/body-size limits
|
|
||||||
than bsky.social.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
logging.warning(f"Could not upload local file {file_path}: file does not exist")
|
logging.warning(f"Could not upload local file {file_path}: file does not exist")
|
||||||
@@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client):
|
|||||||
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||||
"""
|
"""
|
||||||
Build a Bluesky external embed from a URL.
|
Build a Bluesky external embed from a URL.
|
||||||
|
This is only used when there is no image/video embed.
|
||||||
This should only be used when the post has no image/video embed, because
|
|
||||||
Bluesky posts can only carry one embed type.
|
|
||||||
"""
|
"""
|
||||||
link_metadata = fetch_link_metadata(url, http_client)
|
link_metadata = fetch_link_metadata(url, http_client)
|
||||||
|
|
||||||
@@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_post_text(text):
|
def prepare_post_text(text):
|
||||||
"""
|
|
||||||
Prepare the final public text exactly as it should be posted to Bluesky.
|
|
||||||
Does NOT append the source X URL.
|
|
||||||
Enforces the Bluesky text limit.
|
|
||||||
"""
|
|
||||||
raw_text = (text or "").strip()
|
raw_text = (text or "").strip()
|
||||||
|
|
||||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||||
@@ -379,9 +391,6 @@ def prepare_post_text(text):
|
|||||||
|
|
||||||
|
|
||||||
def normalize_post_text(text):
|
def normalize_post_text(text):
|
||||||
"""
|
|
||||||
Normalize post text for duplicate detection.
|
|
||||||
"""
|
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@@ -391,9 +400,6 @@ def normalize_post_text(text):
|
|||||||
|
|
||||||
|
|
||||||
def build_media_fingerprint(tweet):
|
def build_media_fingerprint(tweet):
|
||||||
"""
|
|
||||||
Build a deterministic media fingerprint from scraped tweet media.
|
|
||||||
"""
|
|
||||||
if not tweet or not tweet.media:
|
if not tweet or not tweet.media:
|
||||||
return "no-media"
|
return "no-media"
|
||||||
|
|
||||||
@@ -419,9 +425,6 @@ def build_media_fingerprint(tweet):
|
|||||||
|
|
||||||
|
|
||||||
def build_bsky_media_fingerprint(post_view):
|
def build_bsky_media_fingerprint(post_view):
|
||||||
"""
|
|
||||||
Best-effort media fingerprint from Bluesky embed structure.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
embed = getattr(post_view, "embed", None)
|
embed = getattr(post_view, "embed", None)
|
||||||
if not embed:
|
if not embed:
|
||||||
@@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
|
|||||||
|
|
||||||
|
|
||||||
def create_bsky_client(base_url, handle, password):
|
def create_bsky_client(base_url, handle, password):
|
||||||
"""
|
|
||||||
Create a Bluesky/ATProto client pointed at the desired PDS or service host.
|
|
||||||
Supports custom hosts like eurosky.social.
|
|
||||||
"""
|
|
||||||
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
|
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
|
||||||
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
|
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
|
||||||
|
|
||||||
@@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH):
|
|||||||
|
|
||||||
|
|
||||||
def remember_posted_tweet(state, candidate, bsky_uri=None):
|
def remember_posted_tweet(state, candidate, bsky_uri=None):
|
||||||
"""
|
|
||||||
Store successful post in local state.
|
|
||||||
Primary key is canonical tweet URL when available.
|
|
||||||
Fallback key uses text_media_key.
|
|
||||||
"""
|
|
||||||
canonical_tweet_url = candidate.get("canonical_tweet_url")
|
canonical_tweet_url = candidate.get("canonical_tweet_url")
|
||||||
fallback_key = f"textmedia:{candidate['text_media_key']}"
|
fallback_key = f"textmedia:{candidate['text_media_key']}"
|
||||||
state_key = canonical_tweet_url or fallback_key
|
state_key = canonical_tweet_url or fallback_key
|
||||||
@@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
|
|||||||
"media_fingerprint": candidate["media_fingerprint"],
|
"media_fingerprint": candidate["media_fingerprint"],
|
||||||
"text_media_key": candidate["text_media_key"],
|
"text_media_key": candidate["text_media_key"],
|
||||||
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
|
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
|
||||||
|
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
|
||||||
"bsky_uri": bsky_uri,
|
"bsky_uri": bsky_uri,
|
||||||
"tweet_created_on": candidate["tweet"].created_on,
|
"tweet_created_on": candidate["tweet"].created_on,
|
||||||
"tweet_url": candidate["tweet"].tweet_url,
|
"tweet_url": candidate["tweet"].tweet_url,
|
||||||
@@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
|
|||||||
|
|
||||||
|
|
||||||
def candidate_matches_state(candidate, state):
|
def candidate_matches_state(candidate, state):
|
||||||
"""
|
|
||||||
Strong private dedupe using local persistent state.
|
|
||||||
Match order:
|
|
||||||
1. canonical tweet URL
|
|
||||||
2. text + media fingerprint
|
|
||||||
3. normalized text
|
|
||||||
"""
|
|
||||||
canonical_tweet_url = candidate["canonical_tweet_url"]
|
canonical_tweet_url = candidate["canonical_tweet_url"]
|
||||||
text_media_key = candidate["text_media_key"]
|
text_media_key = candidate["text_media_key"]
|
||||||
normalized_text = candidate["normalized_text"]
|
normalized_text = candidate["normalized_text"]
|
||||||
@@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state):
|
|||||||
|
|
||||||
|
|
||||||
def prune_state(state, max_entries=5000):
|
def prune_state(state, max_entries=5000):
|
||||||
"""
|
|
||||||
Keep state file from growing forever.
|
|
||||||
Prunes oldest records by posted_at if necessary.
|
|
||||||
"""
|
|
||||||
posted_tweets = state.get("posted_tweets", {})
|
posted_tweets = state.get("posted_tweets", {})
|
||||||
|
|
||||||
if len(posted_tweets) <= max_entries:
|
if len(posted_tweets) <= max_entries:
|
||||||
@@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000):
|
|||||||
|
|
||||||
# --- Bluesky Post History ---
|
# --- Bluesky Post History ---
|
||||||
def get_recent_bsky_posts(client, handle, limit=30):
|
def get_recent_bsky_posts(client, handle, limit=30):
|
||||||
"""
|
|
||||||
Fetch recent top-level Bluesky posts for duplicate detection.
|
|
||||||
Returns a list of dicts with dedupe keys.
|
|
||||||
"""
|
|
||||||
recent_posts = []
|
recent_posts = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -690,16 +670,16 @@ def make_rich(content):
|
|||||||
raw = match.group(0)
|
raw = match.group(0)
|
||||||
|
|
||||||
if "\n" not in raw and "\r" not in raw:
|
if "\n" not in raw and "\r" not in raw:
|
||||||
return re.sub(r"[…\.]+$", "", raw)
|
return strip_trailing_url_punctuation(raw)
|
||||||
|
|
||||||
glued = raw.replace("\n", "").replace("\r", "")
|
glued = raw.replace("\n", "").replace("\r", "")
|
||||||
test_url = re.sub(r"[…\.]+$", "", glued)
|
test_url = strip_trailing_url_punctuation(glued)
|
||||||
|
|
||||||
if is_valid_url(test_url):
|
if is_valid_url(test_url):
|
||||||
return test_url
|
return test_url
|
||||||
|
|
||||||
parts = raw.split("\n")
|
parts = raw.split("\n")
|
||||||
test_part0 = re.sub(r"[…\.]+$", "", parts[0])
|
test_part0 = strip_trailing_url_punctuation(parts[0])
|
||||||
if is_valid_url(test_part0):
|
if is_valid_url(test_part0):
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
@@ -725,7 +705,7 @@ def make_rich(content):
|
|||||||
if word.startswith("http://"):
|
if word.startswith("http://"):
|
||||||
word = word.replace("http://", "https://", 1)
|
word = word.replace("http://", "https://", 1)
|
||||||
|
|
||||||
word = re.sub(r"[…\.]+$", "", word)
|
word = strip_trailing_url_punctuation(word)
|
||||||
clean_url_value = clean_url(word)
|
clean_url_value = clean_url(word)
|
||||||
|
|
||||||
if clean_url_value and is_valid_url(clean_url_value):
|
if clean_url_value and is_valid_url(clean_url_value):
|
||||||
@@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
|
|||||||
page.close()
|
page.close()
|
||||||
|
|
||||||
|
|
||||||
# --- Video Processing ---
|
|
||||||
def download_and_crop_video(video_url, output_path):
|
def download_and_crop_video(video_url, output_path):
|
||||||
"""
|
|
||||||
Download, trim, and compress video before upload.
|
|
||||||
"""
|
|
||||||
temp_input = output_path.replace(".mp4", "_source.mp4")
|
temp_input = output_path.replace(".mp4", "_source.mp4")
|
||||||
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
|
||||||
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
temp_output = output_path.replace(".mp4", "_compressed.mp4")
|
||||||
@@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path):
|
|||||||
|
|
||||||
|
|
||||||
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
||||||
"""
|
|
||||||
Multi-signal dedupe against recent Bluesky posts.
|
|
||||||
"""
|
|
||||||
candidate_non_x_urls = candidate["canonical_non_x_urls"]
|
candidate_non_x_urls = candidate["canonical_non_x_urls"]
|
||||||
candidate_text_media_key = candidate["text_media_key"]
|
candidate_text_media_key = candidate["text_media_key"]
|
||||||
candidate_normalized_text = candidate["normalized_text"]
|
candidate_normalized_text = candidate["normalized_text"]
|
||||||
@@ -1249,11 +1222,8 @@ def sync_feeds(args):
|
|||||||
media_fingerprint = build_media_fingerprint(tweet)
|
media_fingerprint = build_media_fingerprint(tweet)
|
||||||
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
|
||||||
|
|
||||||
canonical_non_x_urls = set()
|
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
|
||||||
for url in extract_non_x_urls_from_text(prepared_text):
|
canonical_non_x_urls = set(ordered_non_x_urls)
|
||||||
canonical = canonicalize_url(url)
|
|
||||||
if canonical:
|
|
||||||
canonical_non_x_urls.add(canonical)
|
|
||||||
|
|
||||||
candidate_tweets.append({
|
candidate_tweets.append({
|
||||||
"tweet": tweet,
|
"tweet": tweet,
|
||||||
@@ -1264,6 +1234,8 @@ def sync_feeds(args):
|
|||||||
"text_media_key": text_media_key,
|
"text_media_key": text_media_key,
|
||||||
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
|
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
|
||||||
"canonical_non_x_urls": canonical_non_x_urls,
|
"canonical_non_x_urls": canonical_non_x_urls,
|
||||||
|
"ordered_non_x_urls": ordered_non_x_urls,
|
||||||
|
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -1376,19 +1348,32 @@ def sync_feeds(args):
|
|||||||
if os.path.exists(temp_video_path):
|
if os.path.exists(temp_video_path):
|
||||||
os.remove(temp_video_path)
|
os.remove(temp_video_path)
|
||||||
|
|
||||||
# Only create an external link card if no image/video embed will be used.
|
# Only create the external rich snippet when there is no uploaded media.
|
||||||
if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]:
|
# This specifically supports posts in the style:
|
||||||
first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0]
|
# headline text
|
||||||
external_embed = build_external_link_embed(
|
# https://news-site/article
|
||||||
first_non_x_url,
|
if not video_embed and not image_embeds:
|
||||||
bsky_client,
|
candidate_url = None
|
||||||
media_http_client,
|
|
||||||
fallback_title="Link"
|
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
|
||||||
)
|
candidate_url = candidate["ordered_non_x_urls"][0]
|
||||||
if external_embed:
|
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
|
||||||
logging.info(f"🔗 Built external link card for URL: {first_non_x_url}")
|
elif candidate.get("ordered_non_x_urls"):
|
||||||
else:
|
candidate_url = candidate["ordered_non_x_urls"][0]
|
||||||
logging.info(f"ℹ️ No external link card metadata available for URL: {first_non_x_url}")
|
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
|
||||||
|
|
||||||
|
if candidate_url:
|
||||||
|
external_embed = build_external_link_embed(
|
||||||
|
candidate_url,
|
||||||
|
bsky_client,
|
||||||
|
media_http_client,
|
||||||
|
fallback_title="Link"
|
||||||
|
)
|
||||||
|
|
||||||
|
if external_embed:
|
||||||
|
logging.info(f"✅ Built external link card for URL: {candidate_url}")
|
||||||
|
else:
|
||||||
|
logging.info(f"ℹ️ Could not build external link card metadata for URL: {candidate_url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
post_result = None
|
post_result = None
|
||||||
|
|||||||
Reference in New Issue
Block a user