# Listing metadata from the original paste: 56 lines, 1.8 KiB, Python.
import pandas as pd
|
|
import os
|
|
|
|
# ── Config ────────────────────────────────────────────────────────────────────
|
|
# CSV that main() loads before de-duplicating.
INPUT_FILE = "users_to_keep.csv"
# CSV that main() writes the de-duplicated rows to.
OUTPUT_FILE = "users_to_keep_clean.csv"
|
|
# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def load_csv(filepath: str) -> pd.DataFrame:
    """Read the CSV at *filepath* into a DataFrame.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The parsed contents as returned by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at *filepath*.
    """
    if os.path.exists(filepath):
        return pd.read_csv(filepath)
    # Checked up front so the caller gets this message rather than the
    # parser's own error.
    raise FileNotFoundError(f"File not found: {filepath}")
|
|
|
|
|
|
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize handles and remove duplicate rows.

    Values in the ``Handle`` column are stripped of surrounding whitespace
    and lower-cased; then only the first row for each normalized handle is
    kept.

    Args:
        df: Frame with a string-valued ``Handle`` column. It is left
            unmodified.

    Returns:
        A new DataFrame with normalized handles and duplicate handles dropped.

    Raises:
        KeyError: If *df* has no ``Handle`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original handle values instead of being rewritten in place.
    normalized = df.assign(Handle=df['Handle'].str.strip().str.lower())
    return normalized.drop_duplicates(subset='Handle', keep='first')
|
|
|
|
|
|
def save_csv(df: pd.DataFrame, filepath: str) -> None:
    """Write *df* to *filepath* as CSV, leaving out the index column.

    Args:
        df: Frame to serialize.
        filepath: Destination path for the CSV file.
    """
    df.to_csv(path_or_buf=filepath, index=False)
|
|
|
|
|
|
def print_report(before: int, after: int, output: str) -> None:
    """Print a run summary framed by two horizontal rules.

    Args:
        before: Value shown as the "Rows before" count.
        after: Value shown as the "Rows after" count.
        output: Value shown as the "Saved to" location.
    """
    rule = "─" * 40
    dropped = before - after
    report_lines = (
        rule,
        f" Rows before : {before}",
        f" Rows after : {after}",
        f" Removed : {dropped}",
        f" Saved to : {output}",
        rule,
    )
    for line in report_lines:
        print(line)
|
|
|
|
|
|
def main():
    """Run the script: load INPUT_FILE, de-duplicate, save, then report.

    Progress messages are printed before each step and a row-count summary
    is printed once the cleaned data has been written to OUTPUT_FILE.
    """
    print(f"\n📂 Loading '{INPUT_FILE}'...")
    source_frame = load_csv(INPUT_FILE)
    rows_in = len(source_frame)

    print("🔍 Removing duplicates...")
    deduped = remove_duplicates(source_frame)
    rows_out = len(deduped)

    print(f"💾 Saving to '{OUTPUT_FILE}'...")
    save_csv(deduped, OUTPUT_FILE)

    print_report(before=rows_in, after=rows_out, output=OUTPUT_FILE)
    print("✅ Done!\n")
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()