New test for rich snippet
This commit is contained in:
@@ -23,12 +23,6 @@ DEDUPE_BSKY_LIMIT = 30
|
|||||||
TWEET_MAX_AGE_DAYS = 3
|
TWEET_MAX_AGE_DAYS = 3
|
||||||
BSKY_TEXT_MAX_LENGTH = 275
|
BSKY_TEXT_MAX_LENGTH = 275
|
||||||
|
|
||||||
# Video handling notes:
|
|
||||||
# - Bluesky video support is constrained not just by duration, but also by
|
|
||||||
# practical upload limits like final file size, bitrate, resolution, and
|
|
||||||
# server-side proxy/PDS body-size caps.
|
|
||||||
# - Custom PDSes such as eurosky.social may accept images fine but fail on
|
|
||||||
# larger video blob uploads.
|
|
||||||
VIDEO_MAX_DURATION_SECONDS = 179
|
VIDEO_MAX_DURATION_SECONDS = 179
|
||||||
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
MAX_VIDEO_UPLOAD_SIZE_MB = 45
|
||||||
|
|
||||||
@@ -85,6 +79,55 @@ def strip_trailing_url_punctuation(url):
|
|||||||
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
|
return re.sub(r"[\s…\.,;:!?)\]\"']+$", "", url.strip())
|
||||||
|
|
||||||
|
|
||||||
|
def repair_broken_urls(text):
    """
    Repair URLs that were split by copied/scraped line breaks.

    Examples:
        https://
        3cat.cat/path
    becomes:
        https://3cat.cat/path

        https://3cat.cat/some-pa
        th/article
    becomes:
        https://3cat.cat/some-path/article
    """
    if not text:
        # Preserve falsy inputs (None, "") exactly as given.
        return text

    original = text

    # Join protocol line breaks: https://\nexample.com -> https://example.com
    text = re.sub(r"(https?://)\s*[\r\n]+\s*", r"\1", text, flags=re.IGNORECASE)

    def _join_if_url_continuation(match):
        # Only glue the next chunk onto the URL when it plausibly belongs to it.
        # Without this guard, an ordinary word following a URL (e.g.
        # "https://a.com\nSecond line") would be fused into the URL and
        # corrupt the surrounding text. Heuristic: the URL fragment ends with a
        # truncation-looking character, or the continuation contains URL-ish
        # punctuation ("/" or ".") that plain words rarely carry.
        head = match.group(1)
        tail = match.group(2)
        if head.endswith(("-", "/", "_", "%")) or "/" in tail or "." in tail:
            return head + tail
        return match.group(0)

    # Join URL-internal line breaks when the next chunk still looks like URL
    # content. Intentionally conservative but effective for wrapped article
    # URLs; repeat until stable so URLs wrapped across several lines rejoin.
    url_break_re = re.compile(
        r"((?:https?://|www\.)[^\s<>\"]*)[\r\n]+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
        flags=re.IGNORECASE,
    )
    prev_text = None
    while prev_text != text:
        prev_text = text
        text = url_break_re.sub(_join_if_url_continuation, text)

    # Also fix accidental spaces inserted inside URLs after the protocol,
    # applying the same conservative continuation check.
    text = re.sub(
        r"((?:https?://|www\.)[^\s<>\"]*)\s+([A-Za-z0-9/\-._~%!$&'()*+,;=:@?#]+)",
        _join_if_url_continuation,
        text,
        flags=re.IGNORECASE,
    )

    if text != original:
        logging.info("🔧 Repaired broken URL wrapping in scraped text")

    return text
|
||||||
|
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
trimmed_url = url.strip()
|
trimmed_url = url.strip()
|
||||||
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
cleaned_url = re.sub(r"\s+", "", trimmed_url)
|
||||||
@@ -102,9 +145,6 @@ def canonicalize_url(url):
|
|||||||
|
|
||||||
|
|
||||||
def canonicalize_tweet_url(url):
|
def canonicalize_tweet_url(url):
|
||||||
"""
|
|
||||||
Canonicalize x.com/twitter.com status URLs for internal dedupe only.
|
|
||||||
"""
|
|
||||||
if not url:
|
if not url:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -129,7 +169,8 @@ def is_x_or_twitter_domain(url):
|
|||||||
def extract_urls_from_text(text):
    """Return every http(s) URL found in *text*, repairing wrapped URLs first."""
    if not text:
        return []
    mended = repair_broken_urls(text)
    url_pattern = r"https?://[^\s]+"
    return re.findall(url_pattern, mended)
|
||||||
|
|
||||||
|
|
||||||
def extract_non_x_urls_from_text(text):
|
def extract_non_x_urls_from_text(text):
|
||||||
@@ -145,10 +186,6 @@ def extract_non_x_urls_from_text(text):
|
|||||||
|
|
||||||
|
|
||||||
def extract_ordered_non_x_urls(text):
|
def extract_ordered_non_x_urls(text):
|
||||||
"""
|
|
||||||
Extract non-X URLs preserving original order and uniqueness.
|
|
||||||
This is used for posting decisions, especially external link-card creation.
|
|
||||||
"""
|
|
||||||
seen = set()
|
seen = set()
|
||||||
ordered = []
|
ordered = []
|
||||||
|
|
||||||
@@ -162,9 +199,6 @@ def extract_ordered_non_x_urls(text):
|
|||||||
|
|
||||||
|
|
||||||
def extract_urls_from_facets(record):
|
def extract_urls_from_facets(record):
|
||||||
"""
|
|
||||||
Extract link URLs from Bluesky rich text facets if present.
|
|
||||||
"""
|
|
||||||
urls = []
|
urls = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -182,25 +216,17 @@ def extract_urls_from_facets(record):
|
|||||||
|
|
||||||
|
|
||||||
def looks_like_title_plus_url_post(text):
    """
    Detect the specific desired style:
    - some title/body text
    - one non-X URL, typically on the last line

    Example:
        Headline text...
        https://example.com/story
    """
    if not text:
        return False

    mended = repair_broken_urls(text)
    stripped_lines = [ln.strip() for ln in mended.splitlines() if ln.strip()]
    if len(stripped_lines) < 2:
        return False

    final_line = stripped_lines[-1]
    last_line_urls = extract_ordered_non_x_urls(final_line)
    all_urls = extract_ordered_non_x_urls(mended)

    return (
        len(last_line_urls) == 1
        and len(all_urls) == 1
        and final_line.startswith(("http://", "https://"))
    )
|
||||||
|
|
||||||
@@ -323,9 +349,6 @@ def get_blob_from_file(file_path, client):
|
|||||||
|
|
||||||
|
|
||||||
def fetch_link_metadata(url, http_client):
|
def fetch_link_metadata(url, http_client):
|
||||||
"""
|
|
||||||
Fetch metadata used to build a Bluesky external link card.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
|
r = http_client.get(url, timeout=LINK_METADATA_TIMEOUT, follow_redirects=True)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
@@ -353,10 +376,6 @@ def fetch_link_metadata(url, http_client):
|
|||||||
|
|
||||||
|
|
||||||
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
||||||
"""
|
|
||||||
Build a Bluesky external embed from a URL.
|
|
||||||
This is only used when there is no image/video embed.
|
|
||||||
"""
|
|
||||||
link_metadata = fetch_link_metadata(url, http_client)
|
link_metadata = fetch_link_metadata(url, http_client)
|
||||||
|
|
||||||
thumb_blob = None
|
thumb_blob = None
|
||||||
@@ -377,7 +396,7 @@ def build_external_link_embed(url, client, http_client, fallback_title="Link"):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_post_text(text):
|
def prepare_post_text(text):
|
||||||
raw_text = (text or "").strip()
|
raw_text = repair_broken_urls((text or "").strip())
|
||||||
|
|
||||||
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
if len(raw_text) > BSKY_TEXT_MAX_LENGTH:
|
||||||
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
truncated = raw_text[:BSKY_TEXT_MAX_LENGTH - 3]
|
||||||
@@ -394,6 +413,7 @@ def normalize_post_text(text):
|
|||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
text = repair_broken_urls(text)
|
||||||
text = text.replace("\r", "\n")
|
text = text.replace("\r", "\n")
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
return text.lower()
|
return text.lower()
|
||||||
@@ -486,7 +506,6 @@ def create_bsky_client(base_url, handle, password):
|
|||||||
return client
|
return client
|
||||||
|
|
||||||
|
|
||||||
# --- Local State Management ---
|
|
||||||
def default_state():
|
def default_state():
|
||||||
return {
|
return {
|
||||||
"version": 1,
|
"version": 1,
|
||||||
@@ -611,7 +630,6 @@ def prune_state(state, max_entries=5000):
|
|||||||
return state
|
return state
|
||||||
|
|
||||||
|
|
||||||
# --- Bluesky Post History ---
|
|
||||||
def get_recent_bsky_posts(client, handle, limit=30):
|
def get_recent_bsky_posts(client, handle, limit=30):
|
||||||
recent_posts = []
|
recent_posts = []
|
||||||
|
|
||||||
@@ -665,27 +683,7 @@ def get_recent_bsky_posts(client, handle, limit=30):
|
|||||||
|
|
||||||
def make_rich(content):
|
def make_rich(content):
|
||||||
text_builder = client_utils.TextBuilder()
|
text_builder = client_utils.TextBuilder()
|
||||||
|
content = repair_broken_urls(content.strip())
|
||||||
def repair_url(match):
|
|
||||||
raw = match.group(0)
|
|
||||||
|
|
||||||
if "\n" not in raw and "\r" not in raw:
|
|
||||||
return strip_trailing_url_punctuation(raw)
|
|
||||||
|
|
||||||
glued = raw.replace("\n", "").replace("\r", "")
|
|
||||||
test_url = strip_trailing_url_punctuation(glued)
|
|
||||||
|
|
||||||
if is_valid_url(test_url):
|
|
||||||
return test_url
|
|
||||||
|
|
||||||
parts = raw.split("\n")
|
|
||||||
test_part0 = strip_trailing_url_punctuation(parts[0])
|
|
||||||
if is_valid_url(test_part0):
|
|
||||||
return raw
|
|
||||||
|
|
||||||
return test_url
|
|
||||||
|
|
||||||
content = re.sub(r"https?://[^\ \t]+", repair_url, content.strip())
|
|
||||||
lines = content.splitlines()
|
lines = content.splitlines()
|
||||||
|
|
||||||
for line_idx, line in enumerate(lines):
|
for line_idx, line in enumerate(lines):
|
||||||
@@ -730,7 +728,7 @@ def make_rich(content):
|
|||||||
|
|
||||||
|
|
||||||
def build_dynamic_alt(raw_text):
|
def build_dynamic_alt(raw_text):
|
||||||
dynamic_alt = raw_text.replace("\n", " ").strip()
|
dynamic_alt = repair_broken_urls(raw_text).replace("\n", " ").strip()
|
||||||
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
dynamic_alt = re.sub(r"https?://\S+", "", dynamic_alt).strip()
|
||||||
|
|
||||||
if len(dynamic_alt) > 150:
|
if len(dynamic_alt) > 150:
|
||||||
@@ -749,7 +747,6 @@ def build_video_embed(video_blob, alt_text):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# --- Playwright Scraping ---
|
|
||||||
def scrape_tweets_via_playwright(username, password, email, target_handle):
|
def scrape_tweets_via_playwright(username, password, email, target_handle):
|
||||||
tweets = []
|
tweets = []
|
||||||
state_file = "twitter_browser_state.json"
|
state_file = "twitter_browser_state.json"
|
||||||
@@ -1167,7 +1164,6 @@ def candidate_matches_existing_bsky(candidate, recent_bsky_posts):
|
|||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
# --- Main Sync Function ---
|
|
||||||
def sync_feeds(args):
|
def sync_feeds(args):
|
||||||
logging.info("🔄 Starting sync cycle...")
|
logging.info("🔄 Starting sync cycle...")
|
||||||
try:
|
try:
|
||||||
@@ -1348,18 +1344,15 @@ def sync_feeds(args):
|
|||||||
if os.path.exists(temp_video_path):
|
if os.path.exists(temp_video_path):
|
||||||
os.remove(temp_video_path)
|
os.remove(temp_video_path)
|
||||||
|
|
||||||
# Only create the external rich snippet when there is no uploaded media.
|
|
||||||
# This specifically supports posts in the style:
|
|
||||||
# headline text
|
|
||||||
# https://news-site/article
|
|
||||||
if not video_embed and not image_embeds:
|
if not video_embed and not image_embeds:
|
||||||
candidate_url = None
|
candidate_url = None
|
||||||
|
|
||||||
if candidate.get("looks_like_title_plus_url") and candidate.get("ordered_non_x_urls"):
|
if candidate.get("ordered_non_x_urls"):
|
||||||
candidate_url = candidate["ordered_non_x_urls"][0]
|
candidate_url = candidate["ordered_non_x_urls"][0]
|
||||||
|
|
||||||
|
if candidate.get("looks_like_title_plus_url"):
|
||||||
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
|
logging.info(f"🔗 Detected title+URL post style. Using URL for external card: {candidate_url}")
|
||||||
elif candidate.get("ordered_non_x_urls"):
|
else:
|
||||||
candidate_url = candidate["ordered_non_x_urls"][0]
|
|
||||||
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
|
logging.info(f"🔗 Text-only post with non-X URL. Using first URL for external card: {candidate_url}")
|
||||||
|
|
||||||
if candidate_url:
|
if candidate_url:
|
||||||
@@ -1433,7 +1426,6 @@ def sync_feeds(args):
|
|||||||
logging.error(f"❌ Error during sync cycle: {e}")
|
logging.error(f"❌ Error during sync cycle: {e}")
|
||||||
|
|
||||||
|
|
||||||
# --- Main Execution ---
|
|
||||||
def main():
|
def main():
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user