From 613b64601a4259e88261644e6191ca557df5befb Mon Sep 17 00:00:00 2001 From: mbrucedogs Date: Mon, 28 Jul 2025 07:51:40 -0500 Subject: [PATCH] Signed-off-by: mbrucedogs --- PRD.md | 55 ++++++++ README.md | 54 ++++++++ data/cleanup_duplicate_files.py | 164 ++++++++++++++++++++++++ data/config.json | 2 +- karaoke_downloader/download_pipeline.py | 22 +++- karaoke_downloader/file_utils.py | 39 ++++++ 6 files changed, 331 insertions(+), 5 deletions(-) create mode 100644 data/cleanup_duplicate_files.py diff --git a/PRD.md b/PRD.md index cd40ea2..30ddc8d 100644 --- a/PRD.md +++ b/PRD.md @@ -273,6 +273,7 @@ The codebase has been comprehensively refactored to improve maintainability and - [x] **Parallel downloads for improved speed** ✅ **COMPLETED** - [x] **Enhanced fuzzy matching with improved video title parsing** ✅ **COMPLETED** - [x] **Consolidated extract_artist_title function** ✅ **COMPLETED** +- [x] **Duplicate file prevention and filename consistency** ✅ **COMPLETED** - [ ] Unit tests for all modules - [ ] Integration tests for end-to-end workflows - [ ] Plugin system for custom file operations @@ -304,3 +305,57 @@ The codebase has been comprehensively refactored to improve maintainability and - **Consistent behavior**: All parts of the system use the same parsing logic, eliminating edge cases where different modules would parse the same title differently - **Improved performance**: The `--limit` parameter now works as expected, providing faster processing for targeted downloads - **Cleaner codebase**: Eliminated duplicate code and import conflicts, making the system more maintainable + +## 🔧 Recent Bug Fixes & Improvements (v3.4.2) +### **Duplicate File Prevention & Filename Consistency** +- **Enhanced file existence checking**: `check_file_exists_with_patterns()` now detects files with `(2)`, `(3)`, etc.
suffixes that yt-dlp creates +- **Automatic duplicate prevention**: Download pipeline skips downloads when files already exist (including duplicates) +- **Updated yt-dlp configuration**: Set `"nooverwrites": false` to prevent yt-dlp from creating duplicate files with suffixes +- **Cleanup utility**: `data/cleanup_duplicate_files.py` provides interactive cleanup of existing duplicate files +- **Filename vs ID3 tag consistency**: Removed "(Karaoke Version)" suffix from ID3 tags to match filenames exactly +- **Unified parsing**: Both filename generation and ID3 tagging use the same artist/title extraction logic + +### **Benefits of Duplicate Prevention** +- **No more duplicate files**: Eliminates `(2)`, `(3)` suffix files that waste disk space +- **Consistent metadata**: Filename and ID3 tag use identical artist/title format +- **Efficient disk usage**: Prevents unnecessary downloads of existing files +- **Clear file identification**: Consistent naming across all file operations + +## 🛠️ Maintenance + +### **Regular Cleanup** +- Run the cleanup utility periodically to remove any duplicate files +- Monitor downloads for any new duplicate creation (should be rare with fixes) + +### **Configuration** +- Keep `"nooverwrites": false` in `data/config.json` +- This prevents yt-dlp from creating duplicate files + +### **Monitoring** +- Check logs for "⏭️ Skipping download - file already exists" messages +- These indicate the duplicate prevention is working correctly + +## 📚 Documentation Standards + +### **Documentation Location** +- **All changes, refactoring, and improvements should be documented in the PRD.md and README.md files** +- **Do NOT create separate .md files for documenting changes, refactoring, or improvements** +- **Use the existing sections in PRD.md and README.md to track all project evolution** + +### **Where to Document Changes** +- **PRD.md**: Technical details, architecture changes, bug fixes, and implementation specifics +- **README.md**:
User-facing features, usage instructions, and high-level improvements +- **CHANGELOG.md**: Version-specific release notes and change summaries + +### **Documentation Requirements** +- **All new features must be documented in both PRD.md and README.md** +- **All refactoring efforts must be documented in the appropriate sections** +- **All bug fixes must be documented with technical details** +- **Version numbers and dates should be clearly marked** +- **Benefits and improvements should be explicitly stated** + +### **Maintenance Responsibility** +- **Keep PRD.md and README.md synchronized with code changes** +- **Update documentation immediately when implementing new features** +- **Remove outdated information and consolidate related changes** +- **Ensure all CLI options and features are documented in both files** diff --git a/README.md b/README.md index ffb3806..db42962 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ A Python-based Windows CLI tool to download karaoke videos from YouTube channels - 🏷️ **Server Duplicates Tracking**: Automatically checks against local songs.json file and marks duplicates for future skipping, preventing re-downloads of songs already on the server - ⚡ **Parallel Downloads**: Enable concurrent downloads with `--parallel --workers N` for significantly faster batch downloads (3-5x speedup) - 📊 **Unmatched Songs Reports**: Generate detailed reports of songs that couldn't be found in any channel with `--generate-unmatched-report` +- 🛡️ **Duplicate File Prevention**: Automatically detects and prevents duplicate files with `(2)`, `(3)` suffixes, with cleanup utility for existing duplicates +- 🏷️ **Consistent Metadata**: Filename and ID3 tag use identical artist/title format for clear file identification ## 🏗️ Architecture The codebase has been comprehensively refactored into a modular architecture with centralized utilities for improved maintainability, error handling, and code reuse: @@ -99,6 +101,33 @@ The
codebase has been comprehensively refactored into a modular architecture wit - **Fixed import conflicts**: Resolved inconsistencies between different parsing implementations - **Single source of truth**: All title parsing logic is now centralized in `fuzzy_matcher.py` +## 🛡️ Duplicate File Prevention & Filename Consistency (v3.4.2) +### **Duplicate File Prevention** +- **Enhanced file existence checking**: Now detects files with `(2)`, `(3)`, etc. suffixes that yt-dlp creates +- **Automatic duplicate prevention**: Skips downloads when files already exist (including duplicates) +- **Updated yt-dlp configuration**: Set `"nooverwrites": false` to prevent yt-dlp from creating duplicate files +- **Cleanup utility**: `data/cleanup_duplicate_files.py` helps identify and remove existing duplicate files + +### **Filename vs ID3 Tag Consistency** +- **Consistent metadata**: Filename and ID3 tag now use identical artist/title format +- **Removed extra suffixes**: No more "(Karaoke Version)" in ID3 tags that don't match filenames +- **Unified parsing**: Both filename generation and ID3 tagging use the same artist/title extraction + +### **Benefits** +- ✅ **No more duplicate files** with `(2)`, `(3)` suffixes +- ✅ **Consistent metadata** between filename and ID3 tags +- ✅ **Efficient disk usage** by preventing unnecessary downloads +- ✅ **Clear file identification** with consistent naming + +### **Clean Up Existing Duplicates** +```bash +# Run the cleanup utility to find and remove existing duplicates +python data/cleanup_duplicate_files.py + +# Choose option 1 for dry run (recommended first) +# Choose option 2 to actually delete duplicates +``` + ## 📋 Requirements - **Windows 10/11** - **Python 3.7+** @@ -370,6 +399,31 @@ python download_karaoke.py --generate-unmatched-report --fuzzy-match --fuzzy-thr > **🔄 Maintenance Note**: The `commands.txt` file should be kept up to date with any CLI changes.
When adding new command-line options or modifying existing ones, update this file to reflect all available commands and their usage. +## ๐Ÿ“š Documentation Standards + +### **Documentation Location** +- **All changes, refactoring, and improvements should be documented in the PRD.md and README.md files** +- **Do NOT create separate .md files for documenting changes, refactoring, or improvements** +- **Use the existing sections in PRD.md and README.md to track all project evolution** + +### **Where to Document Changes** +- **PRD.md**: Technical details, architecture changes, bug fixes, and implementation specifics +- **README.md**: User-facing features, usage instructions, and high-level improvements +- **CHANGELOG.md**: Version-specific release notes and change summaries + +### **Documentation Requirements** +- **All new features must be documented in both PRD.md and README.md** +- **All refactoring efforts must be documented in the appropriate sections** +- **All bug fixes must be documented with technical details** +- **Version numbers and dates should be clearly marked** +- **Benefits and improvements should be explicitly stated** + +### **Maintenance Responsibility** +- **Keep PRD.md and README.md synchronized with code changes** +- **Update documentation immediately when implementing new features** +- **Remove outdated information and consolidate related changes** +- **Ensure all CLI options and features are documented in both files** + ## ๐Ÿ”ง Refactoring Improvements (v3.3) The codebase has been comprehensively refactored to improve maintainability and reduce code duplication. Recent improvements have enhanced reliability, performance, and code organization: diff --git a/data/cleanup_duplicate_files.py b/data/cleanup_duplicate_files.py new file mode 100644 index 0000000..be8c376 --- /dev/null +++ b/data/cleanup_duplicate_files.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Utility script to identify and clean up duplicate files with (2), (3) suffixes. 
+This helps clean up files that were created before the duplicate prevention was implemented. +""" + +import json +import re +from pathlib import Path +from typing import Dict, List, Tuple + +def find_duplicate_files(downloads_dir: str = "downloads") -> Dict[str, List[Path]]: + """ + Find duplicate files with (2), (3), etc. suffixes in the downloads directory. + + Args: + downloads_dir: Path to downloads directory + + Returns: + Dictionary mapping base filenames to lists of duplicate files + """ + downloads_path = Path(downloads_dir) + if not downloads_path.exists(): + print(f"โŒ Downloads directory not found: {downloads_dir}") + return {} + + duplicates = {} + + # Scan all MP4 files in the downloads directory + for mp4_file in downloads_path.rglob("*.mp4"): + filename = mp4_file.name + + # Check if this is a duplicate file with (2), (3), etc. + match = re.match(r'^(.+?)\s*\((\d+)\)\.mp4$', filename) + if match: + base_name = match.group(1) + suffix_num = int(match.group(2)) + + if base_name not in duplicates: + duplicates[base_name] = [] + + duplicates[base_name].append((mp4_file, suffix_num)) + + # Sort duplicates by suffix number + for base_name in duplicates: + duplicates[base_name].sort(key=lambda x: x[1]) + + return duplicates + +def analyze_duplicates(duplicates: Dict[str, List[Tuple[Path, int]]]) -> None: + """ + Analyze and display information about found duplicates. 
+ + Args: + duplicates: Dictionary of duplicate files + """ + if not duplicates: + print("โœ… No duplicate files found!") + return + + print(f"๐Ÿ” Found {len(duplicates)} sets of duplicate files:") + print() + + total_duplicates = 0 + for base_name, files in duplicates.items(): + print(f"๐Ÿ“ {base_name}") + for file_path, suffix in files: + file_size = file_path.stat().st_size / (1024 * 1024) # MB + print(f" ({suffix}) {file_path.name} - {file_size:.1f} MB") + print() + total_duplicates += len(files) - 1 # -1 because we keep the original + + print(f"๐Ÿ“Š Summary: {len(duplicates)} base files with {total_duplicates} duplicate files") + +def cleanup_duplicates(duplicates: Dict[str, List[Tuple[Path, int]]], dry_run: bool = True) -> None: + """ + Clean up duplicate files, keeping only the first occurrence. + + Args: + duplicates: Dictionary of duplicate files + dry_run: If True, only show what would be deleted without actually deleting + """ + if not duplicates: + print("โœ… No duplicates to clean up!") + return + + mode = "DRY RUN" if dry_run else "ACTUAL CLEANUP" + print(f"๐Ÿงน Starting {mode}...") + print() + + total_deleted = 0 + total_size_freed = 0 + + for base_name, files in duplicates.items(): + print(f"๐Ÿ“ Processing: {base_name}") + + # Keep the first file (lowest suffix number), delete the rest + files_to_delete = files[1:] # Skip the first file + + for file_path, suffix in files_to_delete: + file_size = file_path.stat().st_size / (1024 * 1024) # MB + + if dry_run: + print(f" ๐Ÿ—‘๏ธ Would delete: {file_path.name} ({file_size:.1f} MB)") + else: + try: + file_path.unlink() + print(f" โœ… Deleted: {file_path.name} ({file_size:.1f} MB)") + total_deleted += 1 + total_size_freed += file_size + except Exception as e: + print(f" โŒ Failed to delete {file_path.name}: {e}") + + print() + + if dry_run: + print(f"๐Ÿ“Š DRY RUN SUMMARY: Would delete {len([f for files in duplicates.values() for f in files[1:]])} files") + else: + print(f"๐Ÿ“Š CLEANUP SUMMARY: Deleted 
{total_deleted} files, freed {total_size_freed:.1f} MB") + +def main(): + """Main function to run the duplicate file cleanup.""" + print("๐ŸŽต Karaoke Video Downloader - Duplicate File Cleanup") + print("=" * 50) + print() + + # Find duplicates + duplicates = find_duplicate_files() + + if not duplicates: + print("โœ… No duplicate files found!") + return + + # Analyze duplicates + analyze_duplicates(duplicates) + print() + + # Ask user what to do + while True: + print("Options:") + print("1. Dry run (show what would be deleted)") + print("2. Actually delete duplicate files") + print("3. Exit without doing anything") + + choice = input("\nEnter your choice (1-3): ").strip() + + if choice == "1": + cleanup_duplicates(duplicates, dry_run=True) + break + elif choice == "2": + confirm = input("โš ๏ธ Are you sure you want to delete duplicate files? (yes/no): ").strip().lower() + if confirm in ["yes", "y"]: + cleanup_duplicates(duplicates, dry_run=False) + else: + print("โŒ Cleanup cancelled.") + break + elif choice == "3": + print("โŒ Exiting without cleanup.") + break + else: + print("โŒ Invalid choice. 
Please enter 1, 2, or 3.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/data/config.json b/data/config.json index 65a5ed7..30f408b 100644 --- a/data/config.json +++ b/data/config.json @@ -19,7 +19,7 @@ "writethumbnail": false, "embed_metadata": false, "continuedl": true, - "nooverwrites": true, + "nooverwrites": false, "ignoreerrors": true, "no_warnings": false }, diff --git a/karaoke_downloader/download_pipeline.py b/karaoke_downloader/download_pipeline.py index 2ce5485..07705c8 100644 --- a/karaoke_downloader/download_pipeline.py +++ b/karaoke_downloader/download_pipeline.py @@ -20,6 +20,12 @@ from karaoke_downloader.youtube_utils import ( execute_yt_dlp_command, show_available_formats, ) +from karaoke_downloader.file_utils import ( + cleanup_temp_files, + get_unique_filename, + is_valid_mp4_file, + sanitize_filename, +) class DownloadPipeline: @@ -63,9 +69,15 @@ class DownloadPipeline: True if successful, False otherwise """ try: - # Step 1: Prepare file path - filename = sanitize_filename(artist, title) - output_path = self.downloads_dir / channel_name / filename + # Step 1: Prepare file path and check for existing files + output_path, file_exists = get_unique_filename(self.downloads_dir, channel_name, artist, title) + + if file_exists: + print(f"โญ๏ธ Skipping download - file already exists: {output_path.name}") + # Still add tags and track the existing file + if self._add_tags(output_path, artist, title, channel_name): + self._track_download(output_path, artist, title, video_id, channel_name) + return True # Step 2: Download video if not self._download_video(video_id, output_path, artist, title, channel_name): @@ -214,8 +226,10 @@ class DownloadPipeline: ) -> bool: """Step 3: Add ID3 tags to the downloaded file.""" try: + # Use the same artist/title as the filename for consistency + # Don't add "(Karaoke Version)" to the ID3 tag title add_id3_tags( - output_path, f"{artist} - {title} (Karaoke Version)", channel_name + 
output_path, f"{artist} - {title}", channel_name ) print(f"๐Ÿท๏ธ Added ID3 tags: {artist} - {title}") return True diff --git a/karaoke_downloader/file_utils.py b/karaoke_downloader/file_utils.py index 096beee..f9c9931 100644 --- a/karaoke_downloader/file_utils.py +++ b/karaoke_downloader/file_utils.py @@ -112,6 +112,7 @@ def check_file_exists_with_patterns( ) -> Tuple[bool, Optional[Path]]: """ Check if a file exists using multiple possible filename patterns. + Also checks for files with (2), (3), etc. suffixes that yt-dlp might create. Args: downloads_dir: Base downloads directory @@ -132,13 +133,51 @@ def check_file_exists_with_patterns( safe_title = sanitize_title_for_filenames(title) filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4" + # Check for exact filename match file_path = channel_dir / filename if file_path.exists() and file_path.stat().st_size > 0: return True, file_path + # Check for files with (2), (3), etc. suffixes + base_name = filename.replace(".mp4", "") + for suffix in range(2, 10): # Check up to (9) + suffixed_filename = f"{base_name} ({suffix}).mp4" + suffixed_path = channel_dir / suffixed_filename + if suffixed_path.exists() and suffixed_path.stat().st_size > 0: + return True, suffixed_path + return False, None +def get_unique_filename( + downloads_dir: Path, channel_name: str, artist: str, title: str +) -> Tuple[Path, bool]: + """ + Get a unique filename for download, checking for existing files including duplicates. 
+ + Args: + downloads_dir: Base downloads directory + channel_name: Channel name + artist: Song artist + title: Song title + + Returns: + Tuple of (file_path, is_existing) where is_existing indicates if a file already exists + """ + filename = sanitize_filename(artist, title) + channel_dir = downloads_dir / channel_name + file_path = channel_dir / filename + + # Check if file already exists + exists, existing_path = check_file_exists_with_patterns(downloads_dir, channel_name, artist, title) + + if exists and existing_path: + print(f"๐Ÿ“ File already exists: {existing_path.name}") + return existing_path, True + + return file_path, False + + def ensure_directory_exists(directory: Path) -> None: """ Ensure a directory exists, creating it if necessary.