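feat: add --max-age-days filter to prevent posting stale RSS entries

Introduce an age-based filter: entries published more than --max-age-days days ago (default: 7), as well as entries with no publication date, are skipped before duplicate checking. Passing 0 disables the filter.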

This commit is contained in:
2026-05-17 07:14:21 +00:00
parent 6027ac58b4
commit 1a633d54e2

View File

@@ -401,6 +401,7 @@ def process_title(title: str) -> str:
title_text = clean_whitespace(title_text)
return title_text
def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
title_text = clean_whitespace(title_text)
link = canonicalize_url(link) or link or ""
@@ -414,13 +415,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
seen.add(cleaned)
variants.append(cleaned)
# Variant 1: títol + link (si cap sencer)
# Variant 1: title + link (if fits whole)
if title_text and link:
full = f"{title_text}\n\n{link}"
if len(full) <= max_length:
add_variant(full)
# Variant 2: només títol complet (sense truncar)
# Variant 2: full title only (no truncation)
if title_text:
if len(title_text) <= max_length:
add_variant(title_text)
@@ -428,7 +429,7 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
truncated = title_text[:max_length].rstrip(" .")
add_variant(truncated)
# Variant 3: títol truncat + link (si el títol complet+link no hi cap)
# Variant 3: truncated title + link (when full title+link doesn't fit)
if title_text and link:
full = f"{title_text}\n\n{link}"
if len(full) > max_length:
@@ -438,12 +439,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
truncated_title = title_text[:available].rstrip(" .")
add_variant(f"{truncated_title}\n\n{link}")
# Variant 4: només link (si no hi ha títol)
# Variant 4: link only (when no title)
if link and not title_text:
add_variant(link)
return variants
def is_x_or_twitter_domain(url: str) -> bool:
try:
hostname = (urlparse(url).hostname or "").lower()
@@ -1076,7 +1078,7 @@ def build_candidates_from_feed(feed, max_length: int = 300) -> List[EntryCandida
# ============================================================
# Orchestration
# Login
# ============================================================
def login_with_backoff(
client: Client,
@@ -1110,7 +1112,6 @@ def login_with_backoff(
except Exception as e:
logging.exception("❌ Login exception")
# Rate-limited login: retry first, cooldown only if exhausted
if is_rate_limited_error(e):
if attempt < max_attempts:
wait_seconds = get_rate_limit_wait_seconds(e, base_delay, cfg)
@@ -1125,19 +1126,16 @@ def login_with_backoff(
activate_post_creation_cooldown_from_error(e, cooldown_path, cfg)
return False
# Bad credentials: fail fast
if is_auth_error(e):
logging.error("❌ Authentication failed (bad handle/password/app-password).")
return False
# Network/transient: bounded retry
if attempt < max_attempts and (is_network_error(e) or is_timeout_error(e)):
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
logging.warning(f"⏳ Transient login failure. Retrying in {delay:.1f}s...")
time.sleep(delay)
continue
# Other errors: bounded retry
if attempt < max_attempts:
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
logging.warning(f"⏳ Login retry in {delay:.1f}s...")
@@ -1148,6 +1146,10 @@ def login_with_backoff(
return False
# ============================================================
# Orchestration
# ============================================================
def run_once(
rss_feed: str,
bsky_handle: str,
@@ -1158,13 +1160,26 @@ def run_once(
state_path: str,
cooldown_path: str,
cfg: AppConfig,
max_posts: int = 5 # ← NEW PARAMETER
max_posts: int = 5,
max_age_days: int = 7, # ← NEW: 0 = disabled
) -> RunResult:
if not PIL_AVAILABLE:
logging.warning("🟡 Pillow is not installed. External card thumbnail compression is disabled.")
logging.info(f"🌍 Posting language(s): {post_langs}")
# ── Age-filter cutoff ────────────────────────────────────────
if max_age_days > 0:
cutoff = arrow.utcnow().shift(days=-max_age_days)
logging.info(
f"📅 Age filter active: skipping entries published before "
f"{cutoff.isoformat()} (max_age_days={max_age_days})"
)
else:
cutoff = None
logging.info("📅 Age filter disabled (max_age_days=0).")
# ────────────────────────────────────────────────────────────
if check_post_cooldown_or_log(cooldown_path):
return RunResult(published_count=0, stopped_reason="global_post_cooldown_active")
@@ -1197,12 +1212,35 @@ def run_once(
logging.info(f"📰 Prepared {len(candidates)} feed entry candidates for duplicate comparison.")
entries_to_post: List[EntryCandidate] = []
for candidate in candidates:
# ── Age filter ───────────────────────────────────────────
if cutoff is not None:
pub = candidate.published_arrow
if pub is None:
logging.info(
f"⏭️ Skipping entry with no publication date "
f"(age filter active, max_age_days={max_age_days}): "
f"{candidate.canonical_link or candidate.title_text}"
)
continue
if pub < cutoff:
logging.info(
f"⏭️ Skipping old entry published {pub.isoformat()} "
f"(cutoff: {cutoff.isoformat()}): "
f"{candidate.canonical_link or candidate.title_text}"
)
continue
# ────────────────────────────────────────────────────────
# ── Deduplication: local state ───────────────────────────
is_dup_state, reason_state = candidate_matches_state(candidate, state)
if is_dup_state:
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
continue
# ── Deduplication: recent Bluesky posts ──────────────────
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
if is_dup_bsky:
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
@@ -1212,9 +1250,11 @@ def run_once(
logging.info(f"📬 {len(entries_to_post)} entries remain after duplicate filtering.")
# ← NEW: log the effective cap before starting the loop
if len(entries_to_post) > max_posts:
logging.info(f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} of {len(entries_to_post)} entries this run.")
logging.info(
f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} "
f"of {len(entries_to_post)} entries this run."
)
if not entries_to_post:
logging.info(" Execution finished: no new entries to publish.")
@@ -1227,7 +1267,6 @@ def run_once(
for candidate in entries_to_post:
# ← NEW: hard cap check at the top of every iteration
if published >= max_posts:
logging.info(f"🔢 === MAX POSTS REACHED === Stopping after {published} posts (limit: {max_posts}).")
break
@@ -1311,6 +1350,7 @@ def run_once(
return RunResult(published_count=published)
# ============================================================
# CLI
# ============================================================
@@ -1330,10 +1370,15 @@ def main():
)
parser.add_argument("--state-path", default=DEFAULT_STATE_PATH, help="Path to local JSON state file")
parser.add_argument("--cooldown-path", default=DEFAULT_COOLDOWN_STATE_PATH, help="Path to shared cooldown JSON state file")
parser.add_argument('--max-posts', type=int, default=5, help='Max new posts to publish per run')
parser.add_argument("--max-posts", type=int, default=5, help="Max new posts to publish per run (default: 5)")
parser.add_argument(
"--max-age-days",
type=int,
default=7,
help="Skip entries older than this many days (default: 7). Use 0 to disable the age filter.",
)
args = parser.parse_args()
# Parse comma-separated langs: "ca,es" → ["ca", "es"]
post_langs = [lang.strip() for lang in args.lang.split(",") if lang.strip()]
if not post_langs:
post_langs = ["ca"]
@@ -1348,11 +1393,12 @@ def main():
bsky_username=args.bsky_username,
bsky_password=args.bsky_app_password,
service_url=args.service,
post_langs=args.lang.split(","),
post_langs=post_langs,
state_path=args.state_path,
cooldown_path=args.cooldown_path,
cfg=AppConfig(),
max_posts=args.max_posts
cfg=cfg,
max_posts=args.max_posts,
max_age_days=args.max_age_days, # ← NEW
)