Added snippets only in urls 2

2026-04-05 21:44:18 +02:00
parent c1a9065744
commit 7614545893
1 changed files with 86 additions and 101 deletions
--- a/twitter2bsky_daemon.py
+++ b/twitter2bsky_daemon.py
@@ -29,11 +29,6 @@ BSKY_TEXT_MAX_LENGTH = 275
 #   server-side proxy/PDS body-size caps.
 # - Custom PDSes such as eurosky.social may accept images fine but fail on
 #   larger video blob uploads.
 # - The safest approach is to:
 #     1. cap duration
 #     2. compress aggressively
 #     3. log final file size
 #     4. skip obviously too-large uploads
 VIDEO_MAX_DURATION_SECONDS = 179
 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -84,10 +79,16 @@ def is_valid_url(url):
        return False
 def strip_trailing_url_punctuation(url):
    """
    Remove trailing whitespace and sentence punctuation from a URL.

    Trailing whitespace, the ellipsis character and the characters
    . , ; : ! ? " ' are always removed. A trailing ")" or "]" is removed
    only when it has no matching opener earlier in the URL, so links such as
    https://en.wikipedia.org/wiki/Python_(programming_language) keep their
    closing bracket, while "https://example.com)" (closer borrowed from the
    surrounding sentence) is still trimmed.

    Falsy input (None or "") is returned unchanged.
    """
    if not url:
        return url
    always_strip = "….,;:!?\"'"
    openers = {")": "(", "]": "["}
    trimmed = url.strip()
    while trimmed:
        last_char = trimmed[-1]
        if last_char in openers:
            # A balanced closer belongs to the URL itself; stop trimming there.
            if trimmed.count(openers[last_char]) >= trimmed.count(last_char):
                break
        elif not (last_char.isspace() or last_char in always_strip):
            break
        trimmed = trimmed[:-1]
    return trimmed
 def clean_url(url):
    trimmed_url = url.strip()
    cleaned_url = re.sub(r"\s+", "", trimmed_url)
-    cleaned_url = re.sub(r"[…\.]+$", "", cleaned_url)
+    cleaned_url = strip_trailing_url_punctuation(cleaned_url)
    if is_valid_url(cleaned_url):
        return cleaned_url
@@ -97,7 +98,7 @@ def clean_url(url):
 def canonicalize_url(url):
    if not url:
        return None
-    return url.strip()
+    return strip_trailing_url_punctuation(url.strip())
 def canonicalize_tweet_url(url):
@@ -136,13 +137,30 @@ def extract_non_x_urls_from_text(text):
    result = []
    for url in urls:
-        cleaned = re.sub(r"[…\.]+$", "", url.strip())
+        cleaned = strip_trailing_url_punctuation(url)
        if cleaned and not is_x_or_twitter_domain(cleaned):
            result.append(cleaned)
    return result
 def extract_ordered_non_x_urls(text):
    """
    Return the canonical non-X URLs found in ``text``, first occurrence first.

    Each URL reported by extract_non_x_urls_from_text() is passed through
    canonicalize_url(); empty results are dropped and duplicates are removed
    while keeping the original order. The result feeds posting decisions such
    as choosing the URL for an external link card.
    """
    canonical_candidates = (
        canonicalize_url(found_url)
        for found_url in extract_non_x_urls_from_text(text)
    )
    # dict keys keep insertion order, which gives an ordered de-duplication.
    unique_in_order = dict.fromkeys(
        candidate for candidate in canonical_candidates if candidate
    )
    return list(unique_in_order)
 def extract_urls_from_facets(record):
    """
    Extract link URLs from Bluesky rich text facets if present.
@@ -163,10 +181,31 @@ def extract_urls_from_facets(record):
    return urls
 def looks_like_title_plus_url_post(text):
    """
    Tell whether a post has the "title text, then one link" layout.

    The post qualifies when it has at least two non-blank lines, contains
    exactly one non-X URL overall, and its last non-blank line starts with
    that http(s) URL. Example:

      Headline text...
      https://example.com/story

    Returns False for empty text or for text with fewer than two
    non-blank lines.
    """
    if not text:
        return False

    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    content_lines = [entry for entry in stripped_lines if entry]
    if len(content_lines) <= 1:
        return False

    final_line = content_lines[-1]
    final_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(text)

    # Exactly one link in the whole post, and it must sit on the last line.
    if len(final_line_urls) != 1 or len(all_urls) != 1:
        return False
    return final_line.startswith(("http://", "https://"))
 def get_rate_limit_wait_seconds(error_obj, default_delay):
    """
    Try to extract a sensible wait time from atproto/http error objects.
    """
    try:
        headers = getattr(error_obj, "headers", None)
        if headers:
@@ -183,14 +222,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
 def upload_blob_with_retry(client, binary_data, media_label="media"):
    """
    Retry Bluesky blob upload when rate-limited.
    Diagnostic note:
    On alternate PDSes, large video uploads may fail for reasons other than
    429 rate limits. In those cases we log the exception more explicitly and
    return None so the caller can degrade gracefully.
    """
    last_exception = None
    for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
@@ -237,9 +268,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
 def get_blob_from_url(media_url, client, http_client):
    """
    Download media and upload to Bluesky with retry support for upload rate limits.
    """
    try:
        r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
        if r.status_code != 200:
@@ -259,15 +287,6 @@ def get_blob_from_url(media_url, client, http_client):
 def get_blob_from_file(file_path, client):
    """
    Upload a local file as a Bluesky blob.
    Diagnostic notes:
    - We log the final file size because this is often the real reason a custom
      PDS rejects video uploads.
    - Self-hosted or alternate services may have stricter proxy/body-size limits
      than bsky.social.
    """
    try:
        if not os.path.exists(file_path):
            logging.warning(f"Could not upload local file {file_path}: file does not exist")
@@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client):
 def build_external_link_embed(url, client, http_client, fallback_title="Link"):
    """
    Build a Bluesky external embed from a URL.
-
+    This is only used when there is no image/video embed.
    This should only be used when the post has no image/video embed, because
    Bluesky posts can only carry one embed type.
    """
    link_metadata = fetch_link_metadata(url, http_client)
@@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
 def prepare_post_text(text):
    """
    Prepare the final public text exactly as it should be posted to Bluesky.
    Does NOT append the source X URL.
    Enforces the Bluesky text limit.
    """
    raw_text = (text or "").strip()
    if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
@@ -379,9 +391,6 @@ def prepare_post_text(text):
 def normalize_post_text(text):
    """
    Normalize post text for duplicate detection.
    """
    if not text:
        return ""
@@ -391,9 +400,6 @@ def normalize_post_text(text):
 def build_media_fingerprint(tweet):
    """
    Build a deterministic media fingerprint from scraped tweet media.
    """
    if not tweet or not tweet.media:
        return "no-media"
@@ -419,9 +425,6 @@ def build_media_fingerprint(tweet):
 def build_bsky_media_fingerprint(post_view):
    """
    Best-effort media fingerprint from Bluesky embed structure.
    """
    try:
        embed = getattr(post_view, "embed", None)
        if not embed:
@@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
 def create_bsky_client(base_url, handle, password):
    """
    Create a Bluesky/ATProto client pointed at the desired PDS or service host.
    Supports custom hosts like eurosky.social.
    """
    normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
    logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
@@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH):
 def remember_posted_tweet(state, candidate, bsky_uri=None):
    """
    Store successful post in local state.
    Primary key is canonical tweet URL when available.
    Fallback key uses text_media_key.
    """
    canonical_tweet_url = candidate.get("canonical_tweet_url")
    fallback_key = f"textmedia:{candidate['text_media_key']}"
    state_key = canonical_tweet_url or fallback_key
@@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
        "media_fingerprint": candidate["media_fingerprint"],
        "text_media_key": candidate["text_media_key"],
        "canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
        "ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
        "bsky_uri": bsky_uri,
        "tweet_created_on": candidate["tweet"].created_on,
        "tweet_url": candidate["tweet"].tweet_url,
@@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
 def candidate_matches_state(candidate, state):
    """
    Strong private dedupe using local persistent state.
    Match order:
    1. canonical tweet URL
    2. text + media fingerprint
    3. normalized text
    """
    canonical_tweet_url = candidate["canonical_tweet_url"]
    text_media_key = candidate["text_media_key"]
    normalized_text = candidate["normalized_text"]
@@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state):
 def prune_state(state, max_entries=5000):
    """
    Keep state file from growing forever.
    Prunes oldest records by posted_at if necessary.
    """
    posted_tweets = state.get("posted_tweets", {})
    if len(posted_tweets) <= max_entries:
@@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000):
 # --- Bluesky Post History ---
 def get_recent_bsky_posts(client, handle, limit=30):
    """
    Fetch recent top-level Bluesky posts for duplicate detection.
    Returns a list of dicts with dedupe keys.
    """
    recent_posts = []
    try:
@@ -690,16 +670,16 @@ def make_rich(content):
        raw = match.group(0)
        if "\n" not in raw and "\r" not in raw:
-            return re.sub(r"[…\.]+$", "", raw)
+            return strip_trailing_url_punctuation(raw)
        glued = raw.replace("\n", "").replace("\r", "")
-        test_url = re.sub(r"[…\.]+$", "", glued)
+        test_url = strip_trailing_url_punctuation(glued)
        if is_valid_url(test_url):
            return test_url
        parts = raw.split("\n")
-        test_part0 = re.sub(r"[…\.]+$", "", parts[0])
+        test_part0 = strip_trailing_url_punctuation(parts[0])
        if is_valid_url(test_part0):
            return raw
@@ -725,7 +705,7 @@ def make_rich(content):
                if word.startswith("http://"):
                    word = word.replace("http://", "https://", 1)
-                word = re.sub(r"[…\.]+$", "", word)
+                word = strip_trailing_url_punctuation(word)
                clean_url_value = clean_url(word)
                if clean_url_value and is_valid_url(clean_url_value):
@@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
        page.close()
 # --- Video Processing ---
 def download_and_crop_video(video_url, output_path):
    """
    Download, trim, and compress video before upload.
    """
    temp_input = output_path.replace(".mp4", "_source.mp4")
    temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
    temp_output = output_path.replace(".mp4", "_compressed.mp4")
@@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path):
 def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
    """
    Multi-signal dedupe against recent Bluesky posts.
    """
    candidate_non_x_urls = candidate["canonical_non_x_urls"]
    candidate_text_media_key = candidate["text_media_key"]
    candidate_normalized_text = candidate["normalized_text"]
@@ -1249,11 +1222,8 @@ def sync_feeds(args):
                media_fingerprint = build_media_fingerprint(tweet)
                text_media_key = build_text_media_key(normalized_text, media_fingerprint)
-                canonical_non_x_urls = set()
+                ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
-                for url in extract_non_x_urls_from_text(prepared_text):
+                canonical_non_x_urls = set(ordered_non_x_urls)
                    canonical = canonicalize_url(url)
                    if canonical:
                        canonical_non_x_urls.add(canonical)
                candidate_tweets.append({
                    "tweet": tweet,
@@ -1264,6 +1234,8 @@ def sync_feeds(args):
                    "text_media_key": text_media_key,
                    "canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
                    "canonical_non_x_urls": canonical_non_x_urls,
                    "ordered_non_x_urls": ordered_non_x_urls,
                    "looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
                })
            except Exception as e:
@@ -1376,19 +1348,32 @@ def sync_feeds(args):
                                if os.path.exists(temp_video_path):
                                    os.remove(temp_video_path)
-                # Only create an external link card if no image/video embed will be used.
+                # Only create the external rich snippet when there is no uploaded media.
-                if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]:
+                # This specifically supports posts in the style:
-                    first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0]
+                #   headline text
-                    external_embed = build_external_link_embed(
+                #   https://news-site/article
-                        first_non_x_url,
+                if not video_embed and not image_embeds:
-                        bsky_client,
+                    candidate_url = None
-                        media_http_client,
+
-                        fallback_title="Link"
+                    if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
-                    )
+                        candidate_url = candidate["ordered_non_x_urls"][0]
-                    if external_embed:
+                        logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
-                        logging.info(f"🔗 Built external link card for URL: {first_non_x_url}")
+                    elif candidate.get("ordered_non_x_urls"):
-                    else:
+                        candidate_url = candidate["ordered_non_x_urls"][0]
-                        logging.info(f"ℹ️ No external link card metadata available for URL: {first_non_x_url}")
+                        logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
                    if candidate_url:
                        external_embed = build_external_link_embed(
                            candidate_url,
                            bsky_client,
                            media_http_client,
                            fallback_title="Link"
                        )
                        if external_embed:
                            logging.info(f"✅ Built external link card for URL: {candidate_url}")
                        else:
                            logging.info(f"ℹ️ Could not build external link card metadata for URL: {candidate_url}")
                try:
                    post_result = None