# Characters that are treated as sentence punctuation (not part of the URL)
# when they appear at the very end of a URL candidate. "\u2026" is the
# horizontal ellipsis that truncated tweets commonly end with.
_TRAILING_URL_PUNCTUATION = frozenset("\u2026.,;:!?\"'")

# Closing brackets are only stripped when they have no matching opener inside
# the URL, so "https://en.wikipedia.org/wiki/Foo_(bar)" and "http://[::1]"
# survive while "https://example.com)" is trimmed.
_URL_CLOSING_BRACKETS = {")": "(", "]": "["}


def strip_trailing_url_punctuation(url):
    """
    Remove whitespace and sentence punctuation glued to the end of a URL.

    Strips surrounding whitespace, then repeatedly drops trailing whitespace,
    ellipsis/period/comma/semicolon/colon/!/?/quote characters, and any
    *unbalanced* closing ")" or "]". A closing bracket that is matched by an
    opener earlier in the URL is kept, because it is part of the URL itself.

    Falsy input (None, "") is returned unchanged. The result may be an empty
    string when the input consisted only of punctuation.
    """
    if not url:
        return url

    cleaned = url.strip()
    end = len(cleaned)

    # Walk backwards with an index instead of re-slicing, so long punctuation
    # runs stay linear.
    while end > 0:
        char = cleaned[end - 1]

        if char.isspace() or char in _TRAILING_URL_PUNCTUATION:
            end -= 1
            continue

        opener = _URL_CLOSING_BRACKETS.get(char)
        if opener is None:
            break

        # Keep the bracket when every closer seen so far has an opener.
        if cleaned.count(char, 0, end) <= cleaned.count(opener, 0, end):
            break
        end -= 1

    return cleaned[:end]


def canonicalize_url(url):
    """
    Return the URL with surrounding whitespace and trailing punctuation removed.

    Returns None for falsy input. Whitespace trimming is done by
    strip_trailing_url_punctuation, so no separate strip is needed here.
    """
    if not url:
        return None
    return strip_trailing_url_punctuation(url)


def extract_ordered_non_x_urls(text):
    """
    Extract non-X URLs preserving original order and uniqueness.

    This is used for posting decisions, especially external link-card creation.
    URLs come from extract_non_x_urls_from_text (defined elsewhere in this
    file) and are canonicalized before de-duplication, so two spellings that
    differ only in trailing punctuation count as one URL.
    """
    seen = set()
    ordered = []

    for url in extract_non_x_urls_from_text(text):
        canonical = canonicalize_url(url)
        if canonical and canonical not in seen:
            seen.add(canonical)
            ordered.append(canonical)

    return ordered
def looks_like_title_plus_url_post(text):
    """
    Detect the specific desired style:
    - some title/body text
    - one non-X URL, typically on the last line

    Example:
        Headline text...
        https://example.com/story

    Returns True only when the text has at least two non-blank lines, the
    last non-blank line starts with http:// or https://, that line contains
    exactly one non-X URL, and the whole text contains exactly one non-X URL.
    """
    if not text:
        return False

    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if len(lines) < 2:
        return False

    last_line = lines[-1]

    # Cheap prefix test first: most posts fail here, which avoids running the
    # URL extraction over the text at all.
    if not last_line.startswith(("http://", "https://")):
        return False

    if len(extract_ordered_non_x_urls(last_line)) != 1:
        return False

    # The URL on the last line must be the only non-X URL in the whole post.
    return len(extract_ordered_non_x_urls(text)) == 1
- - Diagnostic notes: - - We log the final file size because this is often the real reason a custom - PDS rejects video uploads. - - Self-hosted or alternate services may have stricter proxy/body-size limits - than bsky.social. - """ try: if not os.path.exists(file_path): logging.warning(f"Could not upload local file {file_path}: file does not exist") @@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client): def build_external_link_embed(url, client, http_client, fallback_title="Link"): """ Build a Bluesky external embed from a URL. - - This should only be used when the post has no image/video embed, because - Bluesky posts can only carry one embed type. + This is only used when there is no image/video embed. """ link_metadata = fetch_link_metadata(url, http_client) @@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"): def prepare_post_text(text): - """ - Prepare the final public text exactly as it should be posted to Bluesky. - Does NOT append the source X URL. - Enforces the Bluesky text limit. - """ raw_text = (text or "").strip() if len(raw_text) > BSKY_TEXT_MAX_LENGTH: @@ -379,9 +391,6 @@ def prepare_post_text(text): def normalize_post_text(text): - """ - Normalize post text for duplicate detection. - """ if not text: return "" @@ -391,9 +400,6 @@ def normalize_post_text(text): def build_media_fingerprint(tweet): - """ - Build a deterministic media fingerprint from scraped tweet media. - """ if not tweet or not tweet.media: return "no-media" @@ -419,9 +425,6 @@ def build_media_fingerprint(tweet): def build_bsky_media_fingerprint(post_view): - """ - Best-effort media fingerprint from Bluesky embed structure. - """ try: embed = getattr(post_view, "embed", None) if not embed: @@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint): def create_bsky_client(base_url, handle, password): - """ - Create a Bluesky/ATProto client pointed at the desired PDS or service host. 
- Supports custom hosts like eurosky.social. - """ normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/") logging.info(f"šŸ” Connecting Bluesky client via base URL: {normalized_base_url}") @@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH): def remember_posted_tweet(state, candidate, bsky_uri=None): - """ - Store successful post in local state. - Primary key is canonical tweet URL when available. - Fallback key uses text_media_key. - """ canonical_tweet_url = candidate.get("canonical_tweet_url") fallback_key = f"textmedia:{candidate['text_media_key']}" state_key = canonical_tweet_url or fallback_key @@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None): "media_fingerprint": candidate["media_fingerprint"], "text_media_key": candidate["text_media_key"], "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]), + "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []), "bsky_uri": bsky_uri, "tweet_created_on": candidate["tweet"].created_on, "tweet_url": candidate["tweet"].tweet_url, @@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None): def candidate_matches_state(candidate, state): - """ - Strong private dedupe using local persistent state. - Match order: - 1. canonical tweet URL - 2. text + media fingerprint - 3. normalized text - """ canonical_tweet_url = candidate["canonical_tweet_url"] text_media_key = candidate["text_media_key"] normalized_text = candidate["normalized_text"] @@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state): def prune_state(state, max_entries=5000): - """ - Keep state file from growing forever. - Prunes oldest records by posted_at if necessary. 
- """ posted_tweets = state.get("posted_tweets", {}) if len(posted_tweets) <= max_entries: @@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000): # --- Bluesky Post History --- def get_recent_bsky_posts(client, handle, limit=30): - """ - Fetch recent top-level Bluesky posts for duplicate detection. - Returns a list of dicts with dedupe keys. - """ recent_posts = [] try: @@ -690,16 +670,16 @@ def make_rich(content): raw = match.group(0) if "\n" not in raw and "\r" not in raw: - return re.sub(r"[…\.]+$", "", raw) + return strip_trailing_url_punctuation(raw) glued = raw.replace("\n", "").replace("\r", "") - test_url = re.sub(r"[…\.]+$", "", glued) + test_url = strip_trailing_url_punctuation(glued) if is_valid_url(test_url): return test_url parts = raw.split("\n") - test_part0 = re.sub(r"[…\.]+$", "", parts[0]) + test_part0 = strip_trailing_url_punctuation(parts[0]) if is_valid_url(test_part0): return raw @@ -725,7 +705,7 @@ def make_rich(content): if word.startswith("http://"): word = word.replace("http://", "https://", 1) - word = re.sub(r"[…\.]+$", "", word) + word = strip_trailing_url_punctuation(word) clean_url_value = clean_url(word) if clean_url_value and is_valid_url(clean_url_value): @@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url): page.close() -# --- Video Processing --- def download_and_crop_video(video_url, output_path): - """ - Download, trim, and compress video before upload. - """ temp_input = output_path.replace(".mp4", "_source.mp4") temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4") temp_output = output_path.replace(".mp4", "_compressed.mp4") @@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path): def candidate_matches_existing_bsky(candidate, recent_bsky_posts): - """ - Multi-signal dedupe against recent Bluesky posts. 
- """ candidate_non_x_urls = candidate["canonical_non_x_urls"] candidate_text_media_key = candidate["text_media_key"] candidate_normalized_text = candidate["normalized_text"] @@ -1249,11 +1222,8 @@ def sync_feeds(args): media_fingerprint = build_media_fingerprint(tweet) text_media_key = build_text_media_key(normalized_text, media_fingerprint) - canonical_non_x_urls = set() - for url in extract_non_x_urls_from_text(prepared_text): - canonical = canonicalize_url(url) - if canonical: - canonical_non_x_urls.add(canonical) + ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text) + canonical_non_x_urls = set(ordered_non_x_urls) candidate_tweets.append({ "tweet": tweet, @@ -1264,6 +1234,8 @@ def sync_feeds(args): "text_media_key": text_media_key, "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url), "canonical_non_x_urls": canonical_non_x_urls, + "ordered_non_x_urls": ordered_non_x_urls, + "looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text), }) except Exception as e: @@ -1376,19 +1348,32 @@ def sync_feeds(args): if os.path.exists(temp_video_path): os.remove(temp_video_path) - # Only create an external link card if no image/video embed will be used. - if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]: - first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0] - external_embed = build_external_link_embed( - first_non_x_url, - bsky_client, - media_http_client, - fallback_title="Link" - ) - if external_embed: - logging.info(f"šŸ”— Built external link card for URL: {first_non_x_url}") - else: - logging.info(f"ā„¹ļø No external link card metadata available for URL: {first_non_x_url}") + # Only create the external rich snippet when there is no uploaded media. 
+ # This specifically supports posts in the style: + # headline text + # https://news-site/article + if not video_embed and not image_embeds: + candidate_url = None + + if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"): + candidate_url = candidate["ordered_non_x_urls"][0] + logging.info(f"šŸ”— Detected title+URL post style. Using URL for external card: {candidate_url}") + elif candidate.get("ordered_non_x_urls"): + candidate_url = candidate["ordered_non_x_urls"][0] + logging.info(f"šŸ”— Text-only post with non-X URL. Using first URL for external card: {candidate_url}") + + if candidate_url: + external_embed = build_external_link_embed( + candidate_url, + bsky_client, + media_http_client, + fallback_title="Link" + ) + + if external_embed: + logging.info(f"āœ… Built external link card for URL: {candidate_url}") + else: + logging.info(f"ā„¹ļø Could not build external link card metadata for URL: {candidate_url}") try: post_result = None