New test for rich snippet
This commit is contained in:
@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
|
||||
TWEET_MAX_AGE_DAYS = 3
|
||||
BSKY_TEXT_MAX_LENGTH = 275
|
||||
|
||||
# Video handling notes:
|
||||
# - Bluesky video support is constrained not just by duration, but also by
|
||||
# practical upload limits like final file size, bitrate, resolution, and
|
||||
# server-side proxy/PDS body-size caps.
|
||||
# - Custom PDSes such as eurosky.social may accept images fine but fail on
|
||||
# larger video blob uploads.
|
||||
VIDEO_MAX_DURATION_SECONDS = 179
|
||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||
|
||||
@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
|
||||
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
|
||||
|
||||
|
||||
# Characters allowed in the chunk that may continue a wrapped URL.
_URL_TAIL_CHARS = r"[A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+"

# A URL-looking head, the whitespace run after it and, peeked via lookahead
# (not consumed), the chunk that follows. Leaving the tail unconsumed lets it
# act as the head of the next match when it is itself a URL.
_WRAPPED_URL_RE = re.compile(
    r"((?:https?://|www\.)[^\s<>\"]*)(\s+)(?=(" + _URL_TAIL_CHARS + r"))",
    re.IGNORECASE,
)

_URL_PREFIXES = ("http://", "https://", "www.")


def _is_url_continuation(head, tail):
    """Return True when ``tail`` plausibly belongs to the URL in ``head``."""
    if tail.lower().startswith(_URL_PREFIXES):
        # A second, independent URL: never fuse two URLs together.
        return False
    if tail[0] in "#@":
        # Hashtags and mentions commonly follow a link in post text.
        return False
    if head.lower() in _URL_PREFIXES:
        # A bare scheme / "www." is never a complete URL on its own.
        return True
    if tail.replace("/", "").isdigit():
        # Thread counters ("1/2") and dates ("12/05/2024") are prose.
        return False
    # Ordinary words carry no URL structure; require a path or query marker.
    return "/" in tail or "=" in tail


def _join_wrapped_url(match):
    """Regex callback: drop the whitespace gap only for real continuations."""
    head, gap, tail = match.group(1), match.group(2), match.group(3)
    if _is_url_continuation(head, tail):
        return head
    return head + gap


def repair_broken_urls(text):
    """
    Repair URLs that were split by copied/scraped line breaks or stray spaces.

    Examples:
        https://
        3cat.cat/path
    becomes:
        https://3cat.cat/path

        https://3cat.cat/some-pa
        th/article
    becomes:
        https://3cat.cat/some-path/article

    A chunk following a URL is joined only when it looks like URL content:
    it contains "/" or "=", is not a new URL, is not a hashtag or mention,
    and is not a bare counter such as "1/2". Ordinary prose after a link is
    left untouched, so a wrapped URL whose continuation has no such marker is
    deliberately not repaired.

    Args:
        text: The scraped text. ``None`` and empty strings are returned as-is.

    Returns:
        The text with split URLs re-joined; everything else is unchanged.
    """
    if not text:
        return text

    original = text

    # Join protocol line breaks: https://\nexample.com -> https://example.com
    text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)

    # One pass joins at most one break per URL, so repeat until stable.
    # Each join strictly shortens the text, so the loop always terminates.
    prev_text = None
    while prev_text != text:
        prev_text = text
        text = _WRAPPED_URL_RE.sub(_join_wrapped_url, text)

    if text != original:
        logging.info("🔧 Repaired broken URL wrapping in scraped text")

    return text
|
||||
|
||||
|
||||
def clean_url(url):
|
||||
trimmed_url = url.strip()
|
||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||
@@ -102,9 +145,6 @@ def canonicalize_url(url):
|
||||
|
||||
|
||||
def canonicalize_tweet_url(url):
|
||||
"""
|
||||
Canonicalize x.com/twitter.com status URLs for internal dedupe only.
|
||||
"""
|
||||
if not url:
|
||||
return None
|
||||
|
||||
@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
|
||||
def extract_urls_from_text(text):
    """
    Return every http(s) URL found in ``text``, in order of appearance.

    The text is passed through ``repair_broken_urls`` first, so the regex
    runs on the repaired text rather than on the raw scrape.

    Args:
        text: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched URL strings (each runs up to the next whitespace).
    """
    if not text:
        return []

    # Repair before matching: on raw text, a URL wrapped across lines would
    # be returned only up to the line break.
    repaired = repair_broken_urls(text)
    return re.findall(r"https?://[^\s]+", repaired)
|
||||
|
||||
|
||||
def extract_non_x_urls_from_text(text):
|
||||
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):
|
||||
|
||||
|
||||
def extract_ordered_non_x_urls(text):
|
||||
"""
|
||||
Extract non-X URLs preserving original order and uniqueness.
|
||||
This is used for posting decisions, especially external link-card creation.
|
||||
"""
|
||||
seen = set()
|
||||
ordered = []
|
||||
|
||||
@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):
|
||||
|
||||
|
||||
def extract_urls_from_facets(record):
|
||||
"""
|
||||
Extract link URLs from Bluesky rich text facets if present.
|
||||
"""
|
||||
urls = []
|
||||
|
||||
try:
|
||||
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):
|
||||
|
||||
|
||||
def looks_like_title_plus_url_post(text):
    """
    Detect the specific desired style:
    - some title/body text
    - one non-X URL, typically on the last line

    Example:
        Headline text...
        https://example.com/story

    The text is passed through ``repair_broken_urls`` before it is split
    into lines.

    Args:
        text: Post text to inspect. ``None`` or empty text returns ``False``.

    Returns:
        ``True`` only when the text has at least two non-blank lines, the
        whole text contains exactly one non-X URL, and the last non-blank
        line starts with ``http://`` or ``https://`` and holds that URL.
    """
    if not text:
        return False

    # All checks run on the repaired text so they agree with each other.
    repaired = repair_broken_urls(text)
    lines = [line.strip() for line in repaired.splitlines() if line.strip()]
    if len(lines) < 2:
        return False

    last_line = lines[-1]
    urls_in_last_line = extract_ordered_non_x_urls(last_line)
    total_urls = extract_ordered_non_x_urls(repaired)

    return (
        len(urls_in_last_line) == 1
        and len(total_urls) == 1
        and last_line.startswith(("http://", "https://"))
    )
|
||||
|
||||
@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):
|
||||
|
||||
|
||||
def fetch_link_metadata(url, http_client):
|
||||
"""
|
||||
Fetch metadata used to build a Bluesky external link card.
|
||||
"""
|
||||
try:
|
||||
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
|
||||
r.raise_for_status()
|
||||
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):
|
||||
|
||||
|
||||
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||
"""
|
||||
Build a Bluesky external embed from a URL.
|
||||
This is only used when there is no image/video embed.
|
||||
"""
|
||||
link_metadata = fetch_link_metadata(url, http_client)
|
||||
|
||||
thumb_blob = None
|
||||
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||
|
||||
|
||||
def prepare_post_text(text):
|
||||
raw_text = (text or "").strip()
|
||||
raw_text = repair_broken_urls((text or "").strip())
|
||||
|
||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
||||
@@ -394,6 +413,7 @@ def normalize_post_text(text):
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
text = repair_broken_urls(text)
|
||||
text = text.replace("\r", "\n")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text.lower()
|
||||
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
|
||||
return client
|
||||
|
||||
|
||||
# --- Local State Management ---
|
||||
def default_state():
|
||||
return {
|
||||
"version": 1,
|
||||
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
|
||||
return state
|
||||
|
||||
|
||||
# --- Bluesky Post History ---
|
||||
def get_recent_bsky_posts(client, handle, limit=30):
|
||||
recent_posts = []
|
||||
|
||||
@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
||||
|
||||
def make_rich(content):
|
||||
text_builder = client_utils.TextBuilder()
|
||||
|
||||
def repair_url(match):
|
||||
raw = match.group(0)
|
||||
|
||||
if "\n" not in raw and "\r" not in raw:
|
||||
return strip_trailing_url_punctuation(raw)
|
||||
|
||||
glued = raw.replace("\n", "").replace("\r", "")
|
||||
test_url = strip_trailing_url_punctuation(glued)
|
||||
|
||||
if is_valid_url(test_url):
|
||||
return test_url
|
||||
|
||||
parts = raw.split("\n")
|
||||
test_part0 = strip_trailing_url_punctuation(parts[0])
|
||||
if is_valid_url(test_part0):
|
||||
return raw
|
||||
|
||||
return test_url
|
||||
|
||||
content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
|
||||
content = repair_broken_urls(content.strip())
|
||||
lines = content.splitlines()
|
||||
|
||||
for line_idx, line in enumerate(lines):
|
||||
@@ -730,7 +728,7 @@ def make_rich(content):
|
||||
|
||||
|
||||
def build_dynamic_alt(raw_text):
|
||||
dynamic_alt = raw_text.replace("\n", " ").strip()
|
||||
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
|
||||
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
||||
|
||||
if len(dynamic_alt) > 150:
|
||||
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
|
||||
return None
|
||||
|
||||
|
||||
# --- Playwright Scraping ---
|
||||
def scrape_tweets_via_playwright(username, password, email, target_handle):
|
||||
tweets = []
|
||||
state_file = "twitter_browser_state.json"
|
||||
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
||||
return False, None
|
||||
|
||||
|
||||
# --- Main Sync Function ---
|
||||
def sync_feeds(args):
|
||||
logging.info("🔄 Starting sync cycle...")
|
||||
try:
|
||||
@@ -1348,18 +1344,15 @@ def sync_feeds(args):
|
||||
if os.path.exists(temp_video_path):
|
||||
os.remove(temp_video_path)
|
||||
|
||||
# Only create the external rich snippet when there is no uploaded media.
|
||||
# This specifically supports posts in the style:
|
||||
# headline text
|
||||
# https://news-site/article
|
||||
if not video_embed and not image_embeds:
|
||||
candidate_url = None
|
||||
|
||||
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
|
||||
if candidate.get("ordered_non_x_urls"):
|
||||
candidate_url = candidate["ordered_non_x_urls"][0]
|
||||
|
||||
if candidate.get("looks_like_title_plus_url"):
|
||||
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
|
||||
elif candidate.get("ordered_non_x_urls"):
|
||||
candidate_url = candidate["ordered_non_x_urls"][0]
|
||||
else:
|
||||
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
|
||||
|
||||
if candidate_url:
|
||||
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
|
||||
logging.error(f"❌ Error during sync cycle: {e}")
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
def main():
|
||||
load_dotenv()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user