KaraokeVideoDownloader/data/cleanup_duplicate_files.py

164 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""
Utility script to identify and clean up duplicate files with (2), (3) suffixes.
This helps clean up files that were created before the duplicate prevention was implemented.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple
def find_duplicate_files(downloads_dir: str = "downloads") -> Dict[str, List[Tuple[Path, int]]]:
    """
    Find duplicate files with (2), (3), etc. suffixes in the downloads directory.

    The directory is scanned recursively for ``*.mp4`` files whose name ends in
    a parenthesised number, e.g. ``Song (2).mp4``. Matches are grouped per
    directory, so same-named files that live in different subfolders are never
    treated as duplicates of each other.

    Args:
        downloads_dir: Path to downloads directory

    Returns:
        Dictionary mapping each base filename to a list of
        ``(path, suffix_number)`` tuples sorted by suffix number. For files in
        a subfolder the key is prefixed with the folder path relative to
        ``downloads_dir``; top-level files are keyed by the bare base name.
        Empty if the directory does not exist or nothing matched.
    """
    downloads_path = Path(downloads_dir)
    if not downloads_path.exists():
        print(f"❌ Downloads directory not found: {downloads_dir}")
        return {}

    # Compiled once; applied to every file name found by the scan.
    suffix_pattern = re.compile(r'^(.+?)\s*\((\d+)\)\.mp4$')
    duplicates: Dict[str, List[Tuple[Path, int]]] = {}

    # Scan all MP4 files in the downloads directory
    for mp4_file in downloads_path.rglob("*.mp4"):
        match = suffix_pattern.match(mp4_file.name)
        if not match:
            continue
        # Directories or broken symlinks named "*.mp4" are not downloadable
        # videos and cannot be sized or removed like regular files.
        if not mp4_file.is_file():
            continue
        # Key by location as well as name: a bare base name would merge
        # unrelated files from different folders into one "duplicate" set.
        relative_dir = mp4_file.parent.relative_to(downloads_path)
        group_key = str(relative_dir / match.group(1))
        duplicates.setdefault(group_key, []).append((mp4_file, int(match.group(2))))

    # Sort duplicates by suffix number; the file name breaks ties so the
    # result does not depend on directory enumeration order.
    for entries in duplicates.values():
        entries.sort(key=lambda entry: (entry[1], entry[0].name))
    return duplicates
def analyze_duplicates(duplicates: Dict[str, List[Tuple[Path, int]]]) -> None:
    """
    Print a human-readable report of the duplicate groups.

    For every group the base name is printed, followed by one line per file
    showing its suffix number, file name and size in MB. A closing summary
    line gives the number of groups and the number of copies beyond the first
    entry of each group.

    Args:
        duplicates: Dictionary mapping a base name to its list of
            ``(path, suffix_number)`` tuples
    """
    if not duplicates:
        print("✅ No duplicate files found!")
        return

    group_count = len(duplicates)
    print(f"🔍 Found {group_count} sets of duplicate files:")
    print()

    extra_copies = 0
    for name in duplicates:
        members = duplicates[name]
        print(f"📁 {name}")
        for path, number in members:
            size_mb = path.stat().st_size / (1024 * 1024)  # bytes -> MB
            print(f" ({number}) {path.name} - {size_mb:.1f} MB")
        print()
        # The first entry of each group is the copy that would be kept.
        extra_copies += len(members) - 1

    print(f"📊 Summary: {group_count} base files with {extra_copies} duplicate files")
def cleanup_duplicates(duplicates: Dict[str, List[Tuple[Path, int]]], dry_run: bool = True) -> None:
    """
    Clean up duplicate files, keeping only the first entry of each group.

    Each group's list is expected to be ordered so that the file to keep comes
    first (lowest suffix number); every later entry is deleted, or merely
    reported when ``dry_run`` is true. A file that cannot be inspected or
    removed (already gone, permission denied, ...) is reported and skipped so
    that one bad entry does not abort the rest of the cleanup.

    Args:
        duplicates: Dictionary mapping a base name to its list of
            ``(path, suffix_number)`` tuples
        dry_run: If True, only show what would be deleted without actually deleting
    """
    if not duplicates:
        print("✅ No duplicates to clean up!")
        return

    mode = "DRY RUN" if dry_run else "ACTUAL CLEANUP"
    print(f"🧹 Starting {mode}...")
    print()

    total_deleted = 0
    total_would_delete = 0
    total_size_freed = 0.0

    for base_name, files in duplicates.items():
        print(f"📁 Processing: {base_name}")
        # Keep the first file (lowest suffix number), delete the rest
        for file_path, _suffix in files[1:]:
            # The size must be read before unlinking; the file may also have
            # disappeared since the scan, which must not crash the run.
            try:
                file_size = file_path.stat().st_size / (1024 * 1024)  # MB
            except OSError as e:
                print(f" ❌ Skipping {file_path.name}: {e}")
                continue

            if dry_run:
                print(f" 🗑️ Would delete: {file_path.name} ({file_size:.1f} MB)")
                total_would_delete += 1
                continue

            try:
                file_path.unlink()
            except OSError as e:
                print(f" ❌ Failed to delete {file_path.name}: {e}")
            else:
                print(f" ✅ Deleted: {file_path.name} ({file_size:.1f} MB)")
                total_deleted += 1
                total_size_freed += file_size
        print()

    if dry_run:
        print(f"📊 DRY RUN SUMMARY: Would delete {total_would_delete} files")
    else:
        print(f"📊 CLEANUP SUMMARY: Deleted {total_deleted} files, freed {total_size_freed:.1f} MB")
def main():
    """Interactive entry point: scan, report, then ask the user what to do.

    Scans the default downloads directory, prints the duplicate report and
    shows a three-option menu (dry run, real deletion after confirmation, or
    exit). The menu repeats only while the input is not one of 1, 2 or 3.
    """
    print("🎵 Karaoke Video Downloader - Duplicate File Cleanup")
    print("=" * 50)
    print()

    # Locate the suffixed copies first; nothing else to do if there are none.
    found = find_duplicate_files()
    if not found:
        print("✅ No duplicate files found!")
        return

    analyze_duplicates(found)
    print()

    menu_lines = (
        "Options:",
        "1. Dry run (show what would be deleted)",
        "2. Actually delete duplicate files",
        "3. Exit without doing anything",
    )

    while True:
        for line in menu_lines:
            print(line)
        selection = input("\nEnter your choice (1-3): ").strip()

        if selection not in ("1", "2", "3"):
            print("❌ Invalid choice. Please enter 1, 2, or 3.")
            continue

        if selection == "1":
            cleanup_duplicates(found, dry_run=True)
        elif selection == "2":
            # Real deletion requires an explicit second confirmation.
            answer = input("⚠️ Are you sure you want to delete duplicate files? (yes/no): ").strip().lower()
            if answer in ("yes", "y"):
                cleanup_duplicates(found, dry_run=False)
            else:
                print("❌ Cleanup cancelled.")
        else:
            print("❌ Exiting without cleanup.")
        return
# Run the interactive cleanup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()