Files
scripts/bsky/remove_duplicates.py
Guillem Hernandez Sola 621ff10fbb bsky mgm
2026-04-09 11:53:37 +02:00

56 lines
1.8 KiB
Python

import pandas as pd
import os
# ── Config ────────────────────────────────────────────────────────────────────
# CSV that main() loads; remove_duplicates() reads its 'Handle' column.
INPUT_FILE = 'users_to_keep.csv'
# CSV that main() writes the de-duplicated rows to (saved without an index).
OUTPUT_FILE = 'users_to_keep_clean.csv'
# ──────────────────────────────────────────────────────────────────────────────
def load_csv(filepath: str) -> pd.DataFrame:
    """Read the CSV at *filepath* into a DataFrame.

    Raises:
        FileNotFoundError: if nothing exists at *filepath*.
    """
    # Check up front so a missing input fails with this script's own message.
    if os.path.exists(filepath):
        return pd.read_csv(filepath)
    raise FileNotFoundError(f"File not found: {filepath}")
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with handles normalized and duplicates dropped.

    The 'Handle' column is stripped of surrounding whitespace and lower-cased,
    then only the first row for each normalized handle is kept. The input
    DataFrame is left untouched.

    Raises:
        KeyError: if *df* has no 'Handle' column.
    """
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    normalized = df.assign(Handle=df['Handle'].str.strip().str.lower())
    return normalized.drop_duplicates(subset='Handle', keep='first')
def save_csv(df: pd.DataFrame, filepath: str) -> None:
    """Write *df* to *filepath* as CSV, omitting the index column."""
    df.to_csv(path_or_buf=filepath, index=False)
def print_report(before: int, after: int, output: str) -> None:
    """Print a summary of the de-duplication run to stdout.

    Args:
        before: Row count before duplicates were removed.
        after: Row count after duplicates were removed.
        output: Path the cleaned data was saved to.
    """
    # Repeating an empty string prints a blank line; use a visible rule.
    separator = "─" * 40
    removed = before - after
    print(separator)
    print(f" Rows before : {before}")
    print(f" Rows after : {after}")
    print(f" Removed : {removed}")
    print(f" Saved to : {output}")
    print(separator)
def main():
    """Load INPUT_FILE, drop duplicate handles, save to OUTPUT_FILE, report."""
    print(f"\n📂 Loading '{INPUT_FILE}'...")
    raw = load_csv(INPUT_FILE)
    # Capture the input size before de-duplication for the final report.
    rows_in = len(raw)

    print("🔍 Removing duplicates...")
    deduped = remove_duplicates(raw)
    rows_out = len(deduped)

    print(f"💾 Saving to '{OUTPUT_FILE}'...")
    save_csv(deduped, OUTPUT_FILE)

    print_report(before=rows_in, after=rows_out, output=OUTPUT_FILE)
    print("✅ Done!\n")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()