feat: add --max-age-days filter to prevent posting stale RSS entries

Introduce an age-based filter.
This commit is contained in:
84
rss2bsky.py
84
rss2bsky.py
@@ -401,6 +401,7 @@ def process_title(title: str) -> str:
|
||||
title_text = clean_whitespace(title_text)
|
||||
return title_text
|
||||
|
||||
|
||||
def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
||||
title_text = clean_whitespace(title_text)
|
||||
link = canonicalize_url(link) or link or ""
|
||||
@@ -414,13 +415,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
||||
seen.add(cleaned)
|
||||
variants.append(cleaned)
|
||||
|
||||
# Variant 1: títol + link (si cap sencer)
|
||||
# Variant 1: title + link (if fits whole)
|
||||
if title_text and link:
|
||||
full = f"{title_text}\n\n{link}"
|
||||
if len(full) <= max_length:
|
||||
add_variant(full)
|
||||
|
||||
# Variant 2: només títol complet (sense truncar)
|
||||
# Variant 2: full title only (no truncation)
|
||||
if title_text:
|
||||
if len(title_text) <= max_length:
|
||||
add_variant(title_text)
|
||||
@@ -428,7 +429,7 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
||||
truncated = title_text[:max_length].rstrip(" .")
|
||||
add_variant(truncated)
|
||||
|
||||
# Variant 3: títol truncat + link (si el títol complet+link no hi cap)
|
||||
# Variant 3: truncated title + link (when full title+link doesn't fit)
|
||||
if title_text and link:
|
||||
full = f"{title_text}\n\n{link}"
|
||||
if len(full) > max_length:
|
||||
@@ -438,12 +439,13 @@ def build_post_text_variants(title_text: str, link: str, max_length: int = 300):
|
||||
truncated_title = title_text[:available].rstrip(" .")
|
||||
add_variant(f"{truncated_title}\n\n{link}")
|
||||
|
||||
# Variant 4: només link (si no hi ha títol)
|
||||
# Variant 4: link only (when no title)
|
||||
if link and not title_text:
|
||||
add_variant(link)
|
||||
|
||||
return variants
|
||||
|
||||
|
||||
def is_x_or_twitter_domain(url: str) -> bool:
|
||||
try:
|
||||
hostname = (urlparse(url).hostname or "").lower()
|
||||
@@ -1076,7 +1078,7 @@ def build_candidates_from_feed(feed, max_length: int = 300) -> List[EntryCandida
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Orchestration
|
||||
# Login
|
||||
# ============================================================
|
||||
def login_with_backoff(
|
||||
client: Client,
|
||||
@@ -1110,7 +1112,6 @@ def login_with_backoff(
|
||||
except Exception as e:
|
||||
logging.exception("❌ Login exception")
|
||||
|
||||
# Rate-limited login: retry first, cooldown only if exhausted
|
||||
if is_rate_limited_error(e):
|
||||
if attempt < max_attempts:
|
||||
wait_seconds = get_rate_limit_wait_seconds(e, base_delay, cfg)
|
||||
@@ -1125,19 +1126,16 @@ def login_with_backoff(
|
||||
activate_post_creation_cooldown_from_error(e, cooldown_path, cfg)
|
||||
return False
|
||||
|
||||
# Bad credentials: fail fast
|
||||
if is_auth_error(e):
|
||||
logging.error("❌ Authentication failed (bad handle/password/app-password).")
|
||||
return False
|
||||
|
||||
# Network/transient: bounded retry
|
||||
if attempt < max_attempts and (is_network_error(e) or is_timeout_error(e)):
|
||||
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||
logging.warning(f"⏳ Transient login failure. Retrying in {delay:.1f}s...")
|
||||
time.sleep(delay)
|
||||
continue
|
||||
|
||||
# Other errors: bounded retry
|
||||
if attempt < max_attempts:
|
||||
delay = min(base_delay * attempt, max_delay) + random.uniform(0, jitter_max)
|
||||
logging.warning(f"⏳ Login retry in {delay:.1f}s...")
|
||||
@@ -1148,6 +1146,10 @@ def login_with_backoff(
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Orchestration
|
||||
# ============================================================
|
||||
def run_once(
|
||||
rss_feed: str,
|
||||
bsky_handle: str,
|
||||
@@ -1158,13 +1160,26 @@ def run_once(
|
||||
state_path: str,
|
||||
cooldown_path: str,
|
||||
cfg: AppConfig,
|
||||
max_posts: int = 5 # ← NEW PARAMETER
|
||||
max_posts: int = 5,
|
||||
max_age_days: int = 7, # ← NEW: 0 = disabled
|
||||
) -> RunResult:
|
||||
if not PIL_AVAILABLE:
|
||||
logging.warning("🟡 Pillow is not installed. External card thumbnail compression is disabled.")
|
||||
|
||||
logging.info(f"🌍 Posting language(s): {post_langs}")
|
||||
|
||||
# ── Age-filter cutoff ────────────────────────────────────────
|
||||
if max_age_days > 0:
|
||||
cutoff = arrow.utcnow().shift(days=-max_age_days)
|
||||
logging.info(
|
||||
f"📅 Age filter active: skipping entries published before "
|
||||
f"{cutoff.isoformat()} (max_age_days={max_age_days})"
|
||||
)
|
||||
else:
|
||||
cutoff = None
|
||||
logging.info("📅 Age filter disabled (max_age_days=0).")
|
||||
# ────────────────────────────────────────────────────────────
|
||||
|
||||
if check_post_cooldown_or_log(cooldown_path):
|
||||
return RunResult(published_count=0, stopped_reason="global_post_cooldown_active")
|
||||
|
||||
@@ -1197,12 +1212,35 @@ def run_once(
|
||||
logging.info(f"📰 Prepared {len(candidates)} feed entry candidates for duplicate comparison.")
|
||||
|
||||
entries_to_post: List[EntryCandidate] = []
|
||||
|
||||
for candidate in candidates:
|
||||
|
||||
# ── Age filter ───────────────────────────────────────────
|
||||
if cutoff is not None:
|
||||
pub = candidate.published_arrow
|
||||
if pub is None:
|
||||
logging.info(
|
||||
f"⏭️ Skipping entry with no publication date "
|
||||
f"(age filter active, max_age_days={max_age_days}): "
|
||||
f"{candidate.canonical_link or candidate.title_text}"
|
||||
)
|
||||
continue
|
||||
if pub < cutoff:
|
||||
logging.info(
|
||||
f"⏭️ Skipping old entry published {pub.isoformat()} "
|
||||
f"(cutoff: {cutoff.isoformat()}): "
|
||||
f"{candidate.canonical_link or candidate.title_text}"
|
||||
)
|
||||
continue
|
||||
# ────────────────────────────────────────────────────────
|
||||
|
||||
# ── Deduplication: local state ───────────────────────────
|
||||
is_dup_state, reason_state = candidate_matches_state(candidate, state)
|
||||
if is_dup_state:
|
||||
logging.info(f"⏭️ Skipping candidate due to local state duplicate match on: {reason_state}")
|
||||
continue
|
||||
|
||||
# ── Deduplication: recent Bluesky posts ──────────────────
|
||||
is_dup_bsky, reason_bsky = candidate_matches_existing_bsky(candidate, recent_bsky_posts)
|
||||
if is_dup_bsky:
|
||||
logging.info(f"⏭️ Skipping candidate due to recent Bluesky duplicate match on: {reason_bsky}")
|
||||
@@ -1212,9 +1250,11 @@ def run_once(
|
||||
|
||||
logging.info(f"📬 {len(entries_to_post)} entries remain after duplicate filtering.")
|
||||
|
||||
# ← NEW: log the effective cap before starting the loop
|
||||
if len(entries_to_post) > max_posts:
|
||||
logging.info(f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} of {len(entries_to_post)} entries this run.")
|
||||
logging.info(
|
||||
f"🔢 max-posts cap is {max_posts}: will publish at most {max_posts} "
|
||||
f"of {len(entries_to_post)} entries this run."
|
||||
)
|
||||
|
||||
if not entries_to_post:
|
||||
logging.info("ℹ️ Execution finished: no new entries to publish.")
|
||||
@@ -1227,7 +1267,6 @@ def run_once(
|
||||
|
||||
for candidate in entries_to_post:
|
||||
|
||||
# ← NEW: hard cap check at the top of every iteration
|
||||
if published >= max_posts:
|
||||
logging.info(f"🔢 === MAX POSTS REACHED === Stopping after {published} posts (limit: {max_posts}).")
|
||||
break
|
||||
@@ -1311,6 +1350,7 @@ def run_once(
|
||||
|
||||
return RunResult(published_count=published)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# CLI
|
||||
# ============================================================
|
||||
@@ -1330,10 +1370,15 @@ def main():
|
||||
)
|
||||
parser.add_argument("--state-path", default=DEFAULT_STATE_PATH, help="Path to local JSON state file")
|
||||
parser.add_argument("--cooldown-path", default=DEFAULT_COOLDOWN_STATE_PATH, help="Path to shared cooldown JSON state file")
|
||||
parser.add_argument('--max-posts', type=int, default=5, help='Max new posts to publish per run')
|
||||
parser.add_argument("--max-posts", type=int, default=5, help="Max new posts to publish per run (default: 5)")
|
||||
parser.add_argument(
|
||||
"--max-age-days",
|
||||
type=int,
|
||||
default=7,
|
||||
help="Skip entries older than this many days (default: 7). Use 0 to disable the age filter.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse comma-separated langs: "ca,es" → ["ca", "es"]
|
||||
post_langs = [lang.strip() for lang in args.lang.split(",") if lang.strip()]
|
||||
if not post_langs:
|
||||
post_langs = ["ca"]
|
||||
@@ -1348,13 +1393,14 @@ def main():
|
||||
bsky_username=args.bsky_username,
|
||||
bsky_password=args.bsky_app_password,
|
||||
service_url=args.service,
|
||||
post_langs=args.lang.split(","),
|
||||
post_langs=post_langs,
|
||||
state_path=args.state_path,
|
||||
cooldown_path=args.cooldown_path,
|
||||
cfg=AppConfig(),
|
||||
max_posts=args.max_posts
|
||||
cfg=cfg,
|
||||
max_posts=args.max_posts,
|
||||
max_age_days=args.max_age_days, # ← NEW
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user