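feat: add --max-age-days filter to prevent posting stale RSS entries

Introduce an age-based filter: when --max-age-days is greater than 0 (default: 7), entries published before the cutoff, or entries with no publication date, are skipped before the duplicate checks run. Passing 0 disables the filter.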
This commit is contained in:
84
rss2bsky.py
84
rss2bsky.py
@@ -401,6 +401,7 @@ def process_title(title: str) -> str:
|
|||||||
title_text = clean_whitespace(title_text)
|
title_text = clean_whitespace(title_text)
|
||||||
return title_text
|
return title_text
|
||||||
|
|
||||||
|
|
||||||
def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
||||||
title_text = clean_whitespace(title_text)
|
title_text = clean_whitespace(title_text)
|
||||||
link = canonicalize_url(link) or link or ""
|
link = canonicalize_url(link) or link or ""
|
||||||
@@ -414,13 +415,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
|||||||
seen.add(cleaned)
|
seen.add(cleaned)
|
||||||
variants.append(cleaned)
|
variants.append(cleaned)
|
||||||
|
|
||||||
# Variant 1: títol + link (si cap sencer)
|
# Variant 1: title + link (if fits whole)
|
||||||
if title_text and link:
|
if title_text and link:
|
||||||
full = f"{title_text}\n\n{link}"
|
full = f"{title_text}\n\n{link}"
|
||||||
if len(full) <= max_length:
|
if len(full) <= max_length:
|
||||||
add_variant(full)
|
add_variant(full)
|
||||||
|
|
||||||
# Variant 2: només títol complet (sense truncar)
|
# Variant 2: full title only (no truncation)
|
||||||
if title_text:
|
if title_text:
|
||||||
if len(title_text) <= max_length:
|
if len(title_text) <= max_length:
|
||||||
add_variant(title_text)
|
add_variant(title_text)
|
||||||
@@ -428,7 +429,7 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
|||||||
truncated = title_text[:max_length].rstrip(" .")
|
truncated = title_text[:max_length].rstrip(" .")
|
||||||
add_variant(truncated)
|
add_variant(truncated)
|
||||||
|
|
||||||
# Variant 3: títol truncat + link (si el títol complet+link no hi cap)
|
# Variant 3: truncated title + link (when full title+link doesn't fit)
|
||||||
if title_text and link:
|
if title_text and link:
|
||||||
full = f"{title_text}\n\n{link}"
|
full = f"{title_text}\n\n{link}"
|
||||||
if len(full) > max_length:
|
if len(full) > max_length:
|
||||||
@@ -438,12 +439,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
|||||||
truncated_title = title_text[:available].rstrip(" .")
|
truncated_title = title_text[:available].rstrip(" .")
|
||||||
add_variant(f"{truncated_title}\n\n{link}")
|
add_variant(f"{truncated_title}\n\n{link}")
|
||||||
|
|
||||||
# Variant 4: només link (si no hi ha títol)
|
# Variant 4: link only (when no title)
|
||||||
if link and not title_text:
|
if link and not title_text:
|
||||||
add_variant(link)
|
add_variant(link)
|
||||||
|
|
||||||
return variants
|
return variants
|
||||||
|
|
||||||
|
|
||||||
def is_x_or_twitter_domain(url: str) -> bool:
|
def is_x_or_twitter_domain(url: str) -> bool:
|
||||||
try:
|
try:
|
||||||
hostname = (urlparse(url).hostname or "").lower()
|
hostname = (urlparse(url).hostname or "").lower()
|
||||||
@@ -1076,7 +1078,7 @@ def build_candidates_from_feed(feed, max_length: int = 300) -> List[EntryCandida
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# Orchestration
|
# Login
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def login_with_backoff(
|
def login_with_backoff(
|
||||||
client: Client,
|
client: Client,
|
||||||
@@ -1110,7 +1112,6 @@ def login_with_backoff(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception("❌ Login exception")
|
logging.exception("❌ Login exception")
|
||||||
|
|
||||||
# Rate-limited login: retry first, cooldown only if exhausted
|
|
||||||
if is_rate_limited_error(e):
|
if is_rate_limited_error(e):
|
||||||
if attempt < max_attempts:
|
if attempt < max_attempts:
|
||||||
wait_seconds = get_rate_limit_wait_seconds(e, base_delay, cfg)
|
wait_seconds = get_rate_limit_wait_seconds(e, base_delay, cfg)
|
||||||
@@ -1125,19 +1126,16 @@ def login_with_backoff(
|
|||||||
activate_post_creation_cooldown_from_error(e, cooldown_path, cfg)
|
activate_post_creation_cooldown_from_error(e, cooldown_path, cfg)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Bad credentials: fail fast
|
|
||||||
if is_auth_error(e):
|
if is_auth_error(e):
|
||||||
logging.error("❌ Authentication failed (bad handle/password/app-password).")
|
logging.error("❌ Authentication failed (bad handle/password/app-password).")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Network/transient: bounded retry
|
|
||||||
if attempt < max_attempts and (is_network_error(e) or is_timeout_error(e)):
|
if attempt < max_attempts and (is_network_error(e) or is_timeout_error(e)):
|
||||||
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||||
logging.warning(f"⏳ Transient login failure. Retrying in {delay:.1f}s...")
|
logging.warning(f"⏳ Transient login failure. Retrying in {delay:.1f}s...")
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Other errors: bounded retry
|
|
||||||
if attempt < max_attempts:
|
if attempt < max_attempts:
|
||||||
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||||
logging.warning(f"⏳ Login retry in {delay:.1f}s...")
|
logging.warning(f"⏳ Login retry in {delay:.1f}s...")
|
||||||
@@ -1148,6 +1146,10 @@ def login_with_backoff(
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Orchestration
|
||||||
|
# ============================================================
|
||||||
def run_once(
|
def run_once(
|
||||||
rss_feed: str,
|
rss_feed: str,
|
||||||
bsky_handle: str,
|
bsky_handle: str,
|
||||||
@@ -1158,13 +1160,26 @@ def run_once(
|
|||||||
state_path: str,
|
state_path: str,
|
||||||
cooldown_path: str,
|
cooldown_path: str,
|
||||||
cfg: AppConfig,
|
cfg: AppConfig,
|
||||||
max_posts: int = 5 # ← NEW PARAMETER
|
max_posts: int = 5,
|
||||||
|
max_age_days: int = 7, # ← NEW: 0 = disabled
|
||||||
) -> RunResult:
|
) -> RunResult:
|
||||||
if not PIL_AVAILABLE:
|
if not PIL_AVAILABLE:
|
||||||
logging.warning("🟡 Pillow is not installed. External card thumbnail compression is disabled.")
|
logging.warning("🟡 Pillow is not installed. External card thumbnail compression is disabled.")
|
||||||
|
|
||||||
logging.info(f"🌍 Posting language(s): {post_langs}")
|
logging.info(f"🌍 Posting language(s): {post_langs}")
|
||||||
|
|
||||||
|
# ── Age-filter cutoff ────────────────────────────────────────
|
||||||
|
if max_age_days > 0:
|
||||||
|
cutoff = arrow.utcnow().shift(days=-max_age_days)
|
||||||
|
logging.info(
|
||||||
|
f"📅 Age filter active: skipping entries published before "
|
||||||
|
f"{cutoff.isoformat()} (max_age_days={max_age_days})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cutoff = None
|
||||||
|
logging.info("📅 Age filter disabled (max_age_days=0).")
|
||||||
|
# ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
if check_post_cooldown_or_log(cooldown_path):
|
if check_post_cooldown_or_log(cooldown_path):
|
||||||
return RunResult(published_count=0, stopped_reason="global_post_cooldown_active")
|
return RunResult(published_count=0, stopped_reason="global_post_cooldown_active")
|
||||||
|
|
||||||
@@ -1197,12 +1212,35 @@ def run_once(
|
|||||||
logging.info(f"📰 Prepared {len(candidates)} feed entry candidates for duplicate comparison.")
|
logging.info(f"📰 Prepared {len(candidates)} feed entry candidates for duplicate comparison.")
|
||||||
|
|
||||||
entries_to_post: List[EntryCandidate] = []
|
entries_to_post: List[EntryCandidate] = []
|
||||||
|
|
||||||
for candidate in candidates:
|
for candidate in candidates:
|
||||||
|
|
||||||
|
# ── Age filter ───────────────────────────────────────────
|
||||||
|
if cutoff is not None:
|
||||||
|
pub = candidate.published_arrow
|
||||||
|
if pub is None:
|
||||||
|
logging.info(
|
||||||
|
f"⏭️ Skipping entry with no publication date "
|
||||||
|
f"(age filter active, max_age_days={max_age_days}): "
|
||||||
|
f"{candidate.canonical_link or candidate.title_text}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
if pub < cutoff:
|
||||||
|
logging.info(
|
||||||
|
f"⏭️ Skipping old entry published {pub.isoformat()} "
|
||||||
|
f"(cutoff: {cutoff.isoformat()}): "
|
||||||
|
f"{candidate.canonical_link or candidate.title_text}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
# ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# ── Deduplication: local state ───────────────────────────
|
||||||
is_dup_state, reason_state = candidate_matches_state(candidate, state)
|
is_dup_state, reason_state = candidate_matches_state(candidate, state)
|
||||||
if is_dup_state:
|
if is_dup_state:
|
||||||
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
|
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# ── Deduplication: recent Bluesky posts ──────────────────
|
||||||
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
|
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
|
||||||
if is_dup_bsky:
|
if is_dup_bsky:
|
||||||
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
|
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
|
||||||
@@ -1212,9 +1250,11 @@ def run_once(
|
|||||||
|
|
||||||
logging.info(f"📬 {len(entries_to_post)} entries remain after duplicate filtering.")
|
logging.info(f"📬 {len(entries_to_post)} entries remain after duplicate filtering.")
|
||||||
|
|
||||||
# ← NEW: log the effective cap before starting the loop
|
|
||||||
if len(entries_to_post) > max_posts:
|
if len(entries_to_post) > max_posts:
|
||||||
logging.info(f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} of {len(entries_to_post)} entries this run.")
|
logging.info(
|
||||||
|
f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} "
|
||||||
|
f"of {len(entries_to_post)} entries this run."
|
||||||
|
)
|
||||||
|
|
||||||
if not entries_to_post:
|
if not entries_to_post:
|
||||||
logging.info("ℹ️ Execution finished: no new entries to publish.")
|
logging.info("ℹ️ Execution finished: no new entries to publish.")
|
||||||
@@ -1227,7 +1267,6 @@ def run_once(
|
|||||||
|
|
||||||
for candidate in entries_to_post:
|
for candidate in entries_to_post:
|
||||||
|
|
||||||
# ← NEW: hard cap check at the top of every iteration
|
|
||||||
if published >= max_posts:
|
if published >= max_posts:
|
||||||
logging.info(f"🔢 === MAX POSTS REACHED === Stopping after {published} posts (limit: {max_posts}).")
|
logging.info(f"🔢 === MAX POSTS REACHED === Stopping after {published} posts (limit: {max_posts}).")
|
||||||
break
|
break
|
||||||
@@ -1311,6 +1350,7 @@ def run_once(
|
|||||||
|
|
||||||
return RunResult(published_count=published)
|
return RunResult(published_count=published)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# CLI
|
# CLI
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -1330,10 +1370,15 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument("--state-path", default=DEFAULT_STATE_PATH, help="Path to local JSON state file")
|
parser.add_argument("--state-path", default=DEFAULT_STATE_PATH, help="Path to local JSON state file")
|
||||||
parser.add_argument("--cooldown-path", default=DEFAULT_COOLDOWN_STATE_PATH, help="Path to shared cooldown JSON state file")
|
parser.add_argument("--cooldown-path", default=DEFAULT_COOLDOWN_STATE_PATH, help="Path to shared cooldown JSON state file")
|
||||||
parser.add_argument('--max-posts', type=int, default=5, help='Max new posts to publish per run')
|
parser.add_argument("--max-posts", type=int, default=5, help="Max new posts to publish per run (default: 5)")
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-age-days",
|
||||||
|
type=int,
|
||||||
|
default=7,
|
||||||
|
help="Skip entries older than this many days (default: 7). Use 0 to disable the age filter.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Parse comma-separated langs: "ca,es" → ["ca", "es"]
|
|
||||||
post_langs = [lang.strip() for lang in args.lang.split(",") if lang.strip()]
|
post_langs = [lang.strip() for lang in args.lang.split(",") if lang.strip()]
|
||||||
if not post_langs:
|
if not post_langs:
|
||||||
post_langs = ["ca"]
|
post_langs = ["ca"]
|
||||||
@@ -1348,13 +1393,14 @@ def main():
|
|||||||
bsky_username=args.bsky_username,
|
bsky_username=args.bsky_username,
|
||||||
bsky_password=args.bsky_app_password,
|
bsky_password=args.bsky_app_password,
|
||||||
service_url=args.service,
|
service_url=args.service,
|
||||||
post_langs=args.lang.split(","),
|
post_langs=post_langs,
|
||||||
state_path=args.state_path,
|
state_path=args.state_path,
|
||||||
cooldown_path=args.cooldown_path,
|
cooldown_path=args.cooldown_path,
|
||||||
cfg=AppConfig(),
|
cfg=cfg,
|
||||||
max_posts=args.max_posts
|
max_posts=args.max_posts,
|
||||||
|
max_age_days=args.max_age_days, # ← NEW
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user