Add link snippets only for URL posts (take 2)

This commit is contained in:
2026-04-05 21:44:18 +02:00
parent c1a9065744
commit 7614545893

View File

@@ -29,11 +29,6 @@ BSKY_TEXT_MAX_LENGTH = 275
# server-side proxy/PDS body-size caps.
# - Custom PDSes such as eurosky.social may accept images fine but fail on
# larger video blob uploads.
# - The safest approach is to:
# 1. cap duration
# 2. compress aggressively
# 3. log final file size
# 4. skip obviously too-large uploads
VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -84,10 +79,16 @@ def is_valid_url(url):
return False
def strip_trailing_url_punctuation(url):
    """Remove trailing whitespace and common punctuation glued to a URL.

    Handles sentence-final periods, ellipses, closing brackets/quotes, and
    similar characters that text extraction often leaves attached to a link.
    Falsy input (None or empty string) is returned unchanged.
    """
    if not url:
        return url
    # Strip any run of junk characters anchored at the end of the URL.
    trailing_junk = re.compile(r"[\s…\.,;:!?)\]\"']+$")
    return trailing_junk.sub("", url.strip())
def clean_url(url):
trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url)
cleaned_url = re.sub(r"[…\.]+$", "", cleaned_url)
cleaned_url = strip_trailing_url_punctuation(cleaned_url)
if is_valid_url(cleaned_url):
return cleaned_url
@@ -97,7 +98,7 @@ def clean_url(url):
def canonicalize_url(url):
    """Return the canonical form of *url* used for dedupe comparisons.

    Strips surrounding whitespace and trailing punctuation via
    strip_trailing_url_punctuation. Returns None for falsy input.
    """
    if not url:
        return None
    # Fix: the earlier bare `return url.strip()` made the punctuation-stripping
    # return unreachable; canonical URLs must have trailing punctuation removed
    # so that "https://x/a" and "https://x/a." dedupe to the same key.
    return strip_trailing_url_punctuation(url.strip())
def canonicalize_tweet_url(url):
@@ -136,13 +137,30 @@ def extract_non_x_urls_from_text(text):
result = []
for url in urls:
cleaned = re.sub(r"[…\.]+$", "", url.strip())
cleaned = strip_trailing_url_punctuation(url)
if cleaned and not is_x_or_twitter_domain(cleaned):
result.append(cleaned)
return result
def extract_ordered_non_x_urls(text):
    """
    Extract non-X URLs preserving original order and uniqueness.
    This is used for posting decisions, especially external link-card creation.
    """
    ordered_unique = []
    already_seen = set()
    for raw_url in extract_non_x_urls_from_text(text):
        canonical_url = canonicalize_url(raw_url)
        # Skip URLs that failed canonicalization or were already collected.
        if not canonical_url or canonical_url in already_seen:
            continue
        already_seen.add(canonical_url)
        ordered_unique.append(canonical_url)
    return ordered_unique
def extract_urls_from_facets(record):
"""
Extract link URLs from Bluesky rich text facets if present.
@@ -163,10 +181,31 @@ def extract_urls_from_facets(record):
return urls
def looks_like_title_plus_url_post(text):
    """
    Detect the specific desired style:
      - some title/body text
      - one non-X URL, typically on the last line

    Example:
        Headline text...
        https://example.com/story
    """
    if not text:
        return False
    # Collapse to non-blank lines only; we need at least a title line plus a URL line.
    nonblank_lines = [stripped for stripped in (ln.strip() for ln in text.splitlines()) if stripped]
    if len(nonblank_lines) < 2:
        return False
    final_line = nonblank_lines[-1]
    if not final_line.startswith(("http://", "https://")):
        return False
    # Exactly one non-X URL overall, and it must be the one on the final line.
    return (
        len(extract_ordered_non_x_urls(final_line)) == 1
        and len(extract_ordered_non_x_urls(text)) == 1
    )
def get_rate_limit_wait_seconds(error_obj, default_delay):
"""
Try to extract a sensible wait time from atproto/http error objects.
"""
try:
headers = getattr(error_obj, "headers", None)
if headers:
@@ -183,14 +222,6 @@ def get_rate_limit_wait_seconds(error_obj, default_delay):
def upload_blob_with_retry(client, binary_data, media_label="media"):
"""
Retry Bluesky blob upload when rate-limited.
Diagnostic note:
On alternate PDSes, large video uploads may fail for reasons other than
429 rate limits. In those cases we log the exception more explicitly and
return None so the caller can degrade gracefully.
"""
last_exception = None
for attempt in range(1, BSKY_BLOB_UPLOAD_MAX_RETRIES + 1):
@@ -237,9 +268,6 @@ def upload_blob_with_retry(client, binary_data, media_label="media"):
def get_blob_from_url(media_url, client, http_client):
"""
Download media and upload to Bluesky with retry support for upload rate limits.
"""
try:
r = http_client.get(media_url, timeout=MEDIA_DOWNLOAD_TIMEOUT, follow_redirects=True)
if r.status_code != 200:
@@ -259,15 +287,6 @@ def get_blob_from_url(media_url, client, http_client):
def get_blob_from_file(file_path, client):
"""
Upload a local file as a Bluesky blob.
Diagnostic notes:
- We log the final file size because this is often the real reason a custom
PDS rejects video uploads.
- Self-hosted or alternate services may have stricter proxy/body-size limits
than bsky.social.
"""
try:
if not os.path.exists(file_path):
logging.warning(f"Could not upload local file {file_path}: file does not exist")
@@ -336,9 +355,7 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
This should only be used when the post has no image/video embed, because
Bluesky posts can only carry one embed type.
This is only used when there is no image/video embed.
"""
link_metadata = fetch_link_metadata(url, http_client)
@@ -360,11 +377,6 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text):
"""
Prepare the final public text exactly as it should be posted to Bluesky.
Does NOT append the source X URL.
Enforces the Bluesky text limit.
"""
raw_text = (text or "").strip()
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
@@ -379,9 +391,6 @@ def prepare_post_text(text):
def normalize_post_text(text):
"""
Normalize post text for duplicate detection.
"""
if not text:
return ""
@@ -391,9 +400,6 @@ def normalize_post_text(text):
def build_media_fingerprint(tweet):
"""
Build a deterministic media fingerprint from scraped tweet media.
"""
if not tweet or not tweet.media:
return "no-media"
@@ -419,9 +425,6 @@ def build_media_fingerprint(tweet):
def build_bsky_media_fingerprint(post_view):
"""
Best-effort media fingerprint from Bluesky embed structure.
"""
try:
embed = getattr(post_view, "embed", None)
if not embed:
@@ -463,10 +466,6 @@ def build_text_media_key(normalized_text, media_fingerprint):
def create_bsky_client(base_url, handle, password):
"""
Create a Bluesky/ATProto client pointed at the desired PDS or service host.
Supports custom hosts like eurosky.social.
"""
normalized_base_url = (base_url or DEFAULT_BSKY_BASE_URL).strip().rstrip("/")
logging.info(f"🔐 Connecting Bluesky client via base URL: {normalized_base_url}")
@@ -538,11 +537,6 @@ def save_state(state, state_path=STATE_PATH):
def remember_posted_tweet(state, candidate, bsky_uri=None):
"""
Store successful post in local state.
Primary key is canonical tweet URL when available.
Fallback key uses text_media_key.
"""
canonical_tweet_url = candidate.get("canonical_tweet_url")
fallback_key = f"textmedia:{candidate['text_media_key']}"
state_key = canonical_tweet_url or fallback_key
@@ -554,6 +548,7 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
"media_fingerprint": candidate["media_fingerprint"],
"text_media_key": candidate["text_media_key"],
"canonical_non_x_urls": sorted(candidate["canonical_non_x_urls"]),
"ordered_non_x_urls": candidate.get("ordered_non_x_urls", []),
"bsky_uri": bsky_uri,
"tweet_created_on": candidate["tweet"].created_on,
"tweet_url": candidate["tweet"].tweet_url,
@@ -567,13 +562,6 @@ def remember_posted_tweet(state, candidate, bsky_uri=None):
def candidate_matches_state(candidate, state):
"""
Strong private dedupe using local persistent state.
Match order:
1. canonical tweet URL
2. text + media fingerprint
3. normalized text
"""
canonical_tweet_url = candidate["canonical_tweet_url"]
text_media_key = candidate["text_media_key"]
normalized_text = candidate["normalized_text"]
@@ -595,10 +583,6 @@ def candidate_matches_state(candidate, state):
def prune_state(state, max_entries=5000):
"""
Keep state file from growing forever.
Prunes oldest records by posted_at if necessary.
"""
posted_tweets = state.get("posted_tweets", {})
if len(posted_tweets) <= max_entries:
@@ -629,10 +613,6 @@ def prune_state(state, max_entries=5000):
# --- Bluesky Post History ---
def get_recent_bsky_posts(client, handle, limit=30):
"""
Fetch recent top-level Bluesky posts for duplicate detection.
Returns a list of dicts with dedupe keys.
"""
recent_posts = []
try:
@@ -690,16 +670,16 @@ def make_rich(content):
raw = match.group(0)
if "\n" not in raw and "\r" not in raw:
return re.sub(r"[…\.]+$", "", raw)
return strip_trailing_url_punctuation(raw)
glued = raw.replace("\n", "").replace("\r", "")
test_url = re.sub(r"[…\.]+$", "", glued)
test_url = strip_trailing_url_punctuation(glued)
if is_valid_url(test_url):
return test_url
parts = raw.split("\n")
test_part0 = re.sub(r"[…\.]+$", "", parts[0])
test_part0 = strip_trailing_url_punctuation(parts[0])
if is_valid_url(test_part0):
return raw
@@ -725,7 +705,7 @@ def make_rich(content):
if word.startswith("http://"):
word = word.replace("http://", "https://", 1)
word = re.sub(r"[…\.]+$", "", word)
word = strip_trailing_url_punctuation(word)
clean_url_value = clean_url(word)
if clean_url_value and is_valid_url(clean_url_value):
@@ -1039,11 +1019,7 @@ def extract_video_url_from_tweet_page(context, tweet_url):
page.close()
# --- Video Processing ---
def download_and_crop_video(video_url, output_path):
"""
Download, trim, and compress video before upload.
"""
temp_input = output_path.replace(".mp4", "_source.mp4")
temp_trimmed = output_path.replace(".mp4", "_trimmed.mp4")
temp_output = output_path.replace(".mp4", "_compressed.mp4")
@@ -1168,9 +1144,6 @@ def download_and_crop_video(video_url, output_path):
def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
"""
Multi-signal dedupe against recent Bluesky posts.
"""
candidate_non_x_urls = candidate["canonical_non_x_urls"]
candidate_text_media_key = candidate["text_media_key"]
candidate_normalized_text = candidate["normalized_text"]
@@ -1249,11 +1222,8 @@ def sync_feeds(args):
media_fingerprint = build_media_fingerprint(tweet)
text_media_key = build_text_media_key(normalized_text, media_fingerprint)
canonical_non_x_urls = set()
for url in extract_non_x_urls_from_text(prepared_text):
canonical = canonicalize_url(url)
if canonical:
canonical_non_x_urls.add(canonical)
ordered_non_x_urls = extract_ordered_non_x_urls(prepared_text)
canonical_non_x_urls = set(ordered_non_x_urls)
candidate_tweets.append({
"tweet": tweet,
@@ -1264,6 +1234,8 @@ def sync_feeds(args):
"text_media_key": text_media_key,
"canonical_tweet_url": canonicalize_tweet_url(tweet.tweet_url),
"canonical_non_x_urls": canonical_non_x_urls,
"ordered_non_x_urls": ordered_non_x_urls,
"looks_like_title_plus_url": looks_like_title_plus_url_post(prepared_text),
})
except Exception as e:
@@ -1376,19 +1348,32 @@ def sync_feeds(args):
if os.path.exists(temp_video_path):
os.remove(temp_video_path)
# Only create an external link card if no image/video embed will be used.
if not video_embed and not image_embeds and candidate["canonical_non_x_urls"]:
first_non_x_url = sorted(candidate["canonical_non_x_urls"])[0]
external_embed = build_external_link_embed(
first_non_x_url,
bsky_client,
media_http_client,
fallback_title="Link"
)
if external_embed:
logging.info(f"🔗 Built external link card for URL: {first_non_x_url}")
else:
logging.info(f" No external link card metadata available for URL: {first_non_x_url}")
# Only create the external rich snippet when there is no uploaded media.
# This specifically supports posts in the style:
# headline text
# https://news-site/article
if not video_embed and not image_embeds:
candidate_url = None
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
elif candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
if candidate_url:
external_embed = build_external_link_embed(
candidate_url,
bsky_client,
media_http_client,
fallback_title="Link"
)
if external_embed:
logging.info(f"✅ Built external link card for URL: {candidate_url}")
else:
logging.info(f" Could not build external link card metadata for URL: {candidate_url}")
try:
post_result = None