New test for rich snippet

This commit is contained in:
2026-04-05 22:51:18 +02:00
parent 7614545893
commit 5abd9d685a

View File

@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
TWEET_MAX_AGE_DAYS = 3 TWEET_MAX_AGE_DAYS = 3
BSKY_TEXT_MAX_LENGTH = 275 BSKY_TEXT_MAX_LENGTH = 275
# Video handling notes:
# - Bluesky video support is constrained not just by duration, but also by
# practical upload limits like final file size, bitrate, resolution, and
# server-side proxy/PDS body-size caps.
# - Custom PDSes such as eurosky.social may accept images fine but fail on
# larger video blob uploads.
VIDEO_MAX_DURATION_SECONDS = 179 VIDEO_MAX_DURATION_SECONDS = 179
MAX_VIDEO_UPLOAD_SIZE_MB = 45 MAX_VIDEO_UPLOAD_SIZE_MB = 45
@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip()) return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
def repair_broken_urls(text):
    """
    Repair URLs that were split by copied/scraped line breaks.

    Examples:
        https://
        3cat.cat/path
    becomes:
        https://3cat.cat/path

        https://3cat.cat/some-pa
        th/article
    becomes:
        https://3cat.cat/some-path/article

    Falsy input (None, "") is returned unchanged.
    """
    if not text:
        return text
    original = text

    # Join protocol line breaks: "https://\nexample.com" -> "https://example.com".
    text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)

    # Join URL-internal breaks while the next chunk still looks like URL content.
    # Repeat until a fixpoint so URLs wrapped across several lines get fully glued.
    prev_text = None
    while prev_text != text:
        prev_text = text
        # Line-break joins: any URL-ish fragment on the next line is glued on.
        text = re.sub(
            r"((?:https?://|www\.)[^\s<>\"]*)[\r\n]+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
            r"\1\2",
            text,
            flags=re.IGNORECASE,
        )
        # Space joins must be stricter: only glue when the fragment contains a
        # URL-structural character (/ or %). Without this guard an ordinary
        # word following a URL ("see https://a.com now.") would be fused onto
        # it, corrupting normal prose.
        text = re.sub(
            r"((?:https?://|www\.)[^\s<>\"]*)[ \t]+"
            r"((?=[A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]*[/%])[A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
            r"\1\2",
            text,
            flags=re.IGNORECASE,
        )

    if text != original:
        logging.info("🔧 Repaired broken URL wrapping in scraped text")
    return text
def clean_url(url): def clean_url(url):
trimmed_url = url.strip() trimmed_url = url.strip()
cleaned_url = re.sub(r"\s+", "", trimmed_url) cleaned_url = re.sub(r"\s+", "", trimmed_url)
@@ -102,9 +145,6 @@ def canonicalize_url(url):
def canonicalize_tweet_url(url): def canonicalize_tweet_url(url):
"""
Canonicalize x.com/twitter.com status URLs for internal dedupe only.
"""
if not url: if not url:
return None return None
@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
def extract_urls_from_text(text): def extract_urls_from_text(text):
if not text: if not text:
return [] return []
return re.findall(r"https?://[^\s]+", text) repaired = repair_broken_urls(text)
return re.findall(r"https?://[^\s]+", repaired)
def extract_non_x_urls_from_text(text): def extract_non_x_urls_from_text(text):
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):
def extract_ordered_non_x_urls(text): def extract_ordered_non_x_urls(text):
"""
Extract non-X URLs preserving original order and uniqueness.
This is used for posting decisions, especially external link-card creation.
"""
seen = set() seen = set()
ordered = [] ordered = []
@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):
def extract_urls_from_facets(record): def extract_urls_from_facets(record):
"""
Extract link URLs from Bluesky rich text facets if present.
"""
urls = [] urls = []
try: try:
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):
def looks_like_title_plus_url_post(text): def looks_like_title_plus_url_post(text):
"""
Detect the specific desired style:
- some title/body text
- one non-X URL, typically on the last line
Example:
Headline text...
https://example.com/story
"""
if not text: if not text:
return False return False
lines = [line.strip() for line in text.splitlines() if line.strip()] repaired = repair_broken_urls(text)
lines = [line.strip() for line in repaired.splitlines() if line.strip()]
if len(lines) < 2: if len(lines) < 2:
return False return False
last_line = lines[-1] last_line = lines[-1]
urls_in_last_line = extract_ordered_non_x_urls(last_line) urls_in_last_line = extract_ordered_non_x_urls(last_line)
total_urls = extract_ordered_non_x_urls(text) total_urls = extract_ordered_non_x_urls(repaired)
return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://")) return len(urls_in_last_line) == 1 and len(total_urls) == 1 and last_line.startswith(("http://", "https://"))
@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):
def fetch_link_metadata(url, http_client): def fetch_link_metadata(url, http_client):
"""
Fetch metadata used to build a Bluesky external link card.
"""
try: try:
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True) r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
r.raise_for_status() r.raise_for_status()
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):
def build_external_link_embed(url, client, http_client, fallback_title="Link"): def build_external_link_embed(url, client, http_client, fallback_title="Link"):
"""
Build a Bluesky external embed from a URL.
This is only used when there is no image/video embed.
"""
link_metadata = fetch_link_metadata(url, http_client) link_metadata = fetch_link_metadata(url, http_client)
thumb_blob = None thumb_blob = None
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
def prepare_post_text(text): def prepare_post_text(text):
raw_text = (text or "").strip() raw_text = repair_broken_urls((text or "").strip())
if len(raw_text) > BSKY_TEXT_MAX_LENGTH: if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3] truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
@@ -394,6 +413,7 @@ def normalize_post_text(text):
if not text: if not text:
return "" return ""
text = repair_broken_urls(text)
text = text.replace("\r", "\n") text = text.replace("\r", "\n")
text = re.sub(r"\s+", " ", text).strip() text = re.sub(r"\s+", " ", text).strip()
return text.lower() return text.lower()
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
return client return client
# --- Local State Management ---
def default_state(): def default_state():
return { return {
"version": 1, "version": 1,
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
return state return state
# --- Bluesky Post History ---
def get_recent_bsky_posts(client, handle, limit=30): def get_recent_bsky_posts(client, handle, limit=30):
recent_posts = [] recent_posts = []
@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
def make_rich(content): def make_rich(content):
text_builder = client_utils.TextBuilder() text_builder = client_utils.TextBuilder()
content = repair_broken_urls(content.strip())
def repair_url(match):
raw = match.group(0)
if "\n" not in raw and "\r" not in raw:
return strip_trailing_url_punctuation(raw)
glued = raw.replace("\n", "").replace("\r", "")
test_url = strip_trailing_url_punctuation(glued)
if is_valid_url(test_url):
return test_url
parts = raw.split("\n")
test_part0 = strip_trailing_url_punctuation(parts[0])
if is_valid_url(test_part0):
return raw
return test_url
content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
lines = content.splitlines() lines = content.splitlines()
for line_idx, line in enumerate(lines): for line_idx, line in enumerate(lines):
@@ -730,7 +728,7 @@ def make_rich(content):
def build_dynamic_alt(raw_text): def build_dynamic_alt(raw_text):
dynamic_alt = raw_text.replace("\n", " ").strip() dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip() dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
if len(dynamic_alt) > 150: if len(dynamic_alt) > 150:
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
return None return None
# --- Playwright Scraping ---
def scrape_tweets_via_playwright(username, password, email, target_handle): def scrape_tweets_via_playwright(username, password, email, target_handle):
tweets = [] tweets = []
state_file = "twitter_browser_state.json" state_file = "twitter_browser_state.json"
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
return False, None return False, None
# --- Main Sync Function ---
def sync_feeds(args): def sync_feeds(args):
logging.info("🔄 Starting sync cycle...") logging.info("🔄 Starting sync cycle...")
try: try:
@@ -1348,18 +1344,15 @@ def sync_feeds(args):
if os.path.exists(temp_video_path): if os.path.exists(temp_video_path):
os.remove(temp_video_path) os.remove(temp_video_path)
# Only create the external rich snippet when there is no uploaded media.
# This specifically supports posts in the style:
# headline text
# https://news-site/article
if not video_embed and not image_embeds: if not video_embed and not image_embeds:
candidate_url = None candidate_url = None
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"): if candidate.get("ordered_non_x_urls"):
candidate_url = candidate["ordered_non_x_urls"][0] candidate_url = candidate["ordered_non_x_urls"][0]
if candidate.get("looks_like_title_plus_url"):
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}") logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
elif candidate.get("ordered_non_x_urls"): else:
candidate_url = candidate["ordered_non_x_urls"][0]
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}") logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
if candidate_url: if candidate_url:
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
logging.error(f"❌ Error during sync cycle: {e}") logging.error(f"❌ Error during sync cycle: {e}")
# --- Main Execution ---
def main(): def main():
load_dotenv() load_dotenv()