Resolve t.co links before building Bluesky external cards.

t.co is split out of is_x_or_twitter_domain() into is_tco_domain() so that
shortened links are kept, resolved over HTTP (URL_RESOLVE_TIMEOUT), and used
for the external card only when the final URL is not an X/Twitter URL.

diff --git a/twitter2bsky_daemon.py b/twitter2bsky_daemon.py
index 99f3c41..39096d3 100644
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -28,12 +28,10 @@ BSKY_TEXT_MAX_LENGTH = 275
 VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45
 
-# Tweet image upload safety limits
 BSKY_IMAGE_MAX_BYTES = 950 * 1024
 BSKY_IMAGE_MAX_DIMENSION = 2000
 BSKY_IMAGE_MIN_JPEG_QUALITY = 45
 
-# External card thumbnail limits
 EXTERNAL_THUMB_MAX_BYTES = 950 * 1024
 EXTERNAL_THUMB_MAX_DIMENSION = 1200
 EXTERNAL_THUMB_MIN_JPEG_QUALITY = 40
@@ -46,6 +44,7 @@ BSKY_BLOB_TRANSIENT_ERROR_DELAY = 15
 
 MEDIA_DOWNLOAD_TIMEOUT = 30
 LINK_METADATA_TIMEOUT = 10
+URL_RESOLVE_TIMEOUT = 10
 DEFAULT_BSKY_BASE_URL = "https://bsky.social"
 
 # --- Logging Setup ---
@@ -289,7 +288,15 @@ def canonicalize_tweet_url(url):
 def is_x_or_twitter_domain(url):
     try:
         hostname = (urlparse(url).hostname or "").lower()
-        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com", "t.co"}
+        return hostname in {"x.com", "www.x.com", "twitter.com", "www.twitter.com", "mobile.twitter.com"}
+    except Exception:
+        return False
+
+
+def is_tco_domain(url):
+    try:
+        hostname = (urlparse(url).hostname or "").lower()
+        return hostname == "t.co"
     except Exception:
         return False
 
@@ -302,13 +309,51 @@ def extract_urls_from_text(text):
     return re.findall(r"https?://[^\s#]+", repaired)
 
 
+def resolve_url_if_needed(url, http_client):
+    """
+    Resolve redirecting URLs such as t.co to their final destination.
+    Keep X/Twitter status URLs if they resolve there.
+    """
+    if not url:
+        return None
+
+    cleaned = canonicalize_url(url)
+    if not cleaned:
+        return None
+
+    if not is_tco_domain(cleaned):
+        return cleaned
+
+    try:
+        response = http_client.get(cleaned, timeout=URL_RESOLVE_TIMEOUT, follow_redirects=True)
+        final_url = str(response.url)
+        final_url = canonicalize_url(final_url)
+
+        if final_url:
+            logging.info(f"🔗 Resolved t.co URL {cleaned} -> {final_url}")
+            return final_url
+
+    except Exception as e:
+        logging.warning(f"⚠️ Could not resolve t.co URL {cleaned}: {repr(e)}")
+
+    return cleaned
+
+
 def extract_non_x_urls_from_text(text):
     urls = extract_urls_from_text(text)
     result = []
 
     for url in urls:
         cleaned = strip_trailing_url_punctuation(url)
-        if cleaned and not is_x_or_twitter_domain(cleaned):
+        if not cleaned:
+            continue
+
+        # Keep t.co here for later resolution; do not discard it early.
+        if is_tco_domain(cleaned):
+            result.append(cleaned)
+            continue
+
+        if not is_x_or_twitter_domain(cleaned):
             result.append(cleaned)
 
     return result
@@ -335,6 +380,25 @@ def extract_first_visible_non_x_url(text):
     return None
 
 
+def extract_first_resolved_external_url(text, http_client):
+    """
+    Find the first visible candidate URL, resolve t.co if needed,
+    and return only if the final URL is a non-X external URL.
+    """
+    for url in extract_non_x_urls_from_text(text or ""):
+        resolved = resolve_url_if_needed(url, http_client)
+        if not resolved:
+            continue
+
+        if is_tco_domain(resolved):
+            continue
+
+        if not is_x_or_twitter_domain(resolved):
+            return resolved
+
+    return None
+
+
 def remove_url_from_visible_text(text, url_to_remove):
     if not text or not url_to_remove:
         return text
@@ -508,7 +572,6 @@ def choose_final_visible_text(full_clean_text, primary_non_x_url=None, prefer_fu
     if not text:
         return text
 
-    # Golden rule: preserve exact original cleaned tweet text if it fits.
     if len(text) <= BSKY_TEXT_MAX_LENGTH:
         logging.info("🟢 Original cleaned tweet text fits in Bluesky. Preserving exact text.")
         return text
@@ -693,6 +756,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
         "text_media_key": candidate["text_media_key"],
         "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
         "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
+        "resolved_primary_external_url": candidate.get("resolved_primary_external_url"),
         "bsky_uri": bsky_uri,
         "tweet_created_on": candidate["tweet"].created_on,
         "tweet_url": candidate["tweet"].tweet_url,
@@ -796,7 +860,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
 
             canonical_non_x_urls = set()
             for url in urls:
-                if not is_x_or_twitter_domain(url):
+                if not is_tco_domain(url) and not is_x_or_twitter_domain(url):
                     canonical = canonicalize_url(url)
                     if canonical:
                         canonical_non_x_urls.add(canonical)
@@ -1728,66 +1792,80 @@ def sync_feeds(args):
 
         candidate_tweets = []
 
-        for tweet in reversed(tweets):
-            try:
-                tweet_time = arrow.get(tweet.created_on)
+        with httpx.Client() as resolve_http_client:
+            for tweet in reversed(tweets):
+                try:
+                    tweet_time = arrow.get(tweet.created_on)
 
-                if tweet_time < too_old_cutoff:
-                    logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
-                    continue
+                    if tweet_time < too_old_cutoff:
+                        logging.info(f"⏭️ Skipping old tweet from {tweet_time}")
+                        continue
 
-                full_clean_text = clean_post_text(tweet.text)
-                normalized_text = normalize_post_text(full_clean_text)
+                    full_clean_text = clean_post_text(tweet.text)
+                    normalized_text = normalize_post_text(full_clean_text)
 
-                if not normalized_text:
-                    logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
-                    continue
+                    if not normalized_text:
+                        logging.info(f"⏭️ Skipping empty/blank tweet from {tweet_time}")
+                        continue
 
-                ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
-                canonical_non_x_urls = set(ordered_non_x_urls)
+                    ordered_non_x_urls = extract_ordered_non_x_urls(full_clean_text)
 
-                primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
-                if not primary_non_x_url and ordered_non_x_urls:
-                    primary_non_x_url = ordered_non_x_urls[0]
+                    resolved_primary_external_url = extract_first_resolved_external_url(full_clean_text, resolve_http_client)
 
-                has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
-                has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
+                    canonical_non_x_urls = set()
+                    if resolved_primary_external_url:
+                        canonical_non_x_urls.add(canonicalize_url(resolved_primary_external_url))
 
-                if primary_non_x_url and not has_video and not has_photo:
-                    raw_text = choose_final_visible_text(
-                        full_clean_text,
-                        primary_non_x_url=primary_non_x_url,
-                        prefer_full_text_without_url=True,
-                    )
-                else:
-                    raw_text = choose_final_visible_text(
-                        full_clean_text,
-                        primary_non_x_url=primary_non_x_url,
-                        prefer_full_text_without_url=False,
-                    )
+                    for raw_url in ordered_non_x_urls:
+                        if not is_tco_domain(raw_url) and not is_x_or_twitter_domain(raw_url):
+                            canonical_non_x_urls.add(canonicalize_url(raw_url))
+
+                    # canonicalize_url() may return None; drop it so sorted() in remember_posted_tweet() cannot fail.
+                    canonical_non_x_urls.discard(None)
 
-                media_fingerprint = build_media_fingerprint(tweet)
-                text_media_key = build_text_media_key(normalized_text, media_fingerprint)
+                    primary_non_x_url = extract_first_visible_non_x_url(full_clean_text)
+                    if not primary_non_x_url and ordered_non_x_urls:
+                        primary_non_x_url = ordered_non_x_urls[0]
 
-                candidate_tweets.append({
-                    "tweet": tweet,
-                    "tweet_time": tweet_time,
-                    "raw_text": raw_text,
-                    "full_clean_text": full_clean_text,
-                    "normalized_text": normalized_text,
-                    "media_fingerprint": media_fingerprint,
-                    "text_media_key": text_media_key,
-                    "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
-                    "canonical_non_x_urls": canonical_non_x_urls,
-                    "ordered_non_x_urls": ordered_non_x_urls,
-                    "primary_non_x_url": primary_non_x_url,
-                    "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
-                    "has_video": has_video,
-                    "has_photo": has_photo,
-                })
+                    has_video = any(getattr(m, "type", None) == "video" for m in (tweet.media or []))
+                    has_photo = any(getattr(m, "type", None) == "photo" for m in (tweet.media or []))
 
-            except Exception as e:
-                logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
+                    if primary_non_x_url and not has_video and not has_photo:
+                        raw_text = choose_final_visible_text(
+                            full_clean_text,
+                            primary_non_x_url=primary_non_x_url,
+                            prefer_full_text_without_url=True,
+                        )
+                    else:
+                        raw_text = choose_final_visible_text(
+                            full_clean_text,
+                            primary_non_x_url=primary_non_x_url,
+                            prefer_full_text_without_url=False,
+                        )
+
+                    media_fingerprint = build_media_fingerprint(tweet)
+                    text_media_key = build_text_media_key(normalized_text, media_fingerprint)
+
+                    candidate_tweets.append({
+                        "tweet": tweet,
+                        "tweet_time": tweet_time,
+                        "raw_text": raw_text,
+                        "full_clean_text": full_clean_text,
+                        "normalized_text": normalized_text,
+                        "media_fingerprint": media_fingerprint,
+                        "text_media_key": text_media_key,
+                        "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
+                        "canonical_non_x_urls": canonical_non_x_urls,
+                        "ordered_non_x_urls": ordered_non_x_urls,
+                        "primary_non_x_url": primary_non_x_url,
+                        "resolved_primary_external_url": resolved_primary_external_url,
+                        "looks_like_title_plus_url": looks_like_title_plus_url_post(full_clean_text),
+                        "has_video": has_video,
+                        "has_photo": has_photo,
+                    })
+
+                except Exception as e:
+                    logging.warning(f"⚠️ Failed to prepare candidate tweet: {e}")
 
         logging.info(f"🧪 Prepared {len(candidate_tweets)} candidate tweets for duplicate comparison.")
 
@@ -1904,13 +1982,13 @@ def sync_feeds(args):
                                     media_upload_failures.append(f"photo:{media.media_url_https}")
 
                     if not video_embed and not image_embeds:
-                        candidate_url = candidate.get("primary_non_x_url")
+                        candidate_url = candidate.get("resolved_primary_external_url")
 
                         if candidate_url:
                             if candidate.get("looks_like_title_plus_url"):
-                                logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
+                                logging.info(f"🔗 Detected title+URL post style. Using resolved URL for external card: {candidate_url}")
                             else:
-                                logging.info(f"🔗 Using first non-X URL for external card: {candidate_url}")
+                                logging.info(f"🔗 Using resolved first external URL for external card: {candidate_url}")
 
                             external_embed = build_external_link_embed(
                                 candidate_url,