bsky mgm
This commit is contained in:
56
bsky/remove_duplicates.py
Normal file
56
bsky/remove_duplicates.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
# CSV that main() reads the user rows from.
INPUT_FILE = "users_to_keep.csv"
# CSV that main() writes the de-duplicated rows to.
OUTPUT_FILE = "users_to_keep_clean.csv"
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def load_csv(filepath: str) -> pd.DataFrame:
    """Read the CSV at *filepath* into a DataFrame.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.

    Raises:
        FileNotFoundError: If nothing exists at *filepath*.
    """
    if os.path.exists(filepath):
        return pd.read_csv(filepath)
    raise FileNotFoundError(f"File not found: {filepath}")
|
||||
|
||||
|
||||
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with handles normalized and duplicate rows dropped.

    The ``Handle`` column is stripped of surrounding whitespace and
    lower-cased, then rows are de-duplicated on that normalized value,
    keeping the first occurrence of each handle. The input frame is never
    modified, so callers can keep using their original data afterwards.

    Args:
        df: Frame containing a string-valued ``Handle`` column.

    Returns:
        A new DataFrame with the normalized ``Handle`` column and one row per
        distinct handle. Surviving rows keep their original index labels.

    Raises:
        KeyError: If *df* has no ``Handle`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame is not rewritten
    # as a side effect of normalizing the column.
    normalized = df.assign(Handle=df['Handle'].str.strip().str.lower())
    # NOTE: missing handles compare equal to each other during
    # de-duplication, so at most one row without a handle is kept.
    return normalized.drop_duplicates(subset='Handle', keep='first')
|
||||
|
||||
|
||||
def save_csv(df: pd.DataFrame, filepath: str) -> None:
    """Write *df* to *filepath* as CSV, omitting the index column.

    Args:
        df: Frame to serialize.
        filepath: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=filepath, index=False)
|
||||
|
||||
|
||||
def print_report(before: int, after: int, output: str) -> None:
    """Print a framed summary of the de-duplication run to stdout.

    Args:
        before: Row count before duplicates were removed.
        after: Row count after duplicates were removed.
        output: Path the cleaned data was saved to.
    """
    rule = "─" * 40
    summary = [
        rule,
        f" Rows before : {before}",
        f" Rows after : {after}",
        f" Removed : {before - after}",
        f" Saved to : {output}",
        rule,
    ]
    for line in summary:
        print(line)
|
||||
|
||||
|
||||
def main():
    """Load INPUT_FILE, drop duplicate handles, save to OUTPUT_FILE, report."""
    print(f"\n📂 Loading '{INPUT_FILE}'...")
    source_rows = load_csv(INPUT_FILE)
    # Capture the count before de-duplication runs.
    rows_in = len(source_rows)

    print("🔍 Removing duplicates...")
    unique_rows = remove_duplicates(source_rows)

    print(f"💾 Saving to '{OUTPUT_FILE}'...")
    save_csv(unique_rows, OUTPUT_FILE)

    print_report(rows_in, len(unique_rows), OUTPUT_FILE)
    print("✅ Done!\n")
|
||||
|
||||
|
||||
# Run the clean-up only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user