diff --git a/PRD.md b/PRD.md index 6459724..75f60c9 100644 --- a/PRD.md +++ b/PRD.md @@ -85,10 +85,14 @@ These principles are fundamental to the project's long-term success and must be - **Verbose per-song output:** Only for matches/duplicates (not every song) - **Verbosity configurable:** (via CLI flag or config) -### 4.3 Manual Review (Future Web UI) +### 4.3 Manual Review (Web UI) -- Table/grid view for ambiguous/complex cases -- Ability to preview media before making a selection +- **Interactive Web Interface**: Table/grid view for ambiguous/complex cases +- **Media Preview**: Ability to preview media before making a selection +- **Bulk Actions**: Select multiple items for batch operations +- **Real-time Filtering**: Search and filter capabilities +- **Responsive Design**: Works on desktop and mobile devices +- **Easy Startup**: Simple script (`start_web_ui.py`) with dependency checking --- @@ -108,27 +112,60 @@ These principles are fundamental to the project's long-term success and must be - **CLI Language:** Python - **Config:** JSON (channel priorities, settings) -- **Suggested Folder Structure:** -/data/ -allSongs.json -skipSongs.json -/config/ -config.json -/cli/ -main.py -matching.py -report.py -utils.py - -- (expandable for web UI later) +- **Current Folder Structure:** +``` +KaraokeMerge/ +├── data/ +│ ├── allSongs.json # Input: Your song library data +│ ├── skipSongs.json # Output: Generated skip list +│ └── reports/ # Detailed analysis reports +│ ├── analysis_data.json +│ ├── actionable_insights_report.txt +│ ├── channel_optimization_report.txt +│ ├── duplicate_pattern_report.txt +│ ├── enhanced_summary_report.txt +│ ├── skip_list_summary.txt +│ └── skip_songs_detailed.json +├── config/ +│ └── config.json # Configuration settings +├── cli/ +│ ├── main.py # Main CLI application +│ ├── matching.py # Song matching logic +│ ├── report.py # Report generation +│ └── utils.py # Utility functions +├── web/ # Web UI for manual review +│ ├── app.py # Flask web 
application +│ └── templates/ +│ └── index.html # Web interface template +├── start_web_ui.py # Web UI startup script +├── test_tool.py # Validation and testing script +├── requirements.txt # Python dependencies +├── .gitignore # Git ignore rules +├── PRD.md # Product Requirements Document +└── README.md # Project documentation +``` --- -## 7. Future Expansion: Web UI +## 7. Web UI Implementation -- Table/grid review, bulk actions -- Embedded player for media preview -- Config editor for channel priorities +### 7.1 Current Web UI Features +- **Interactive Table View**: Sortable, filterable grid of duplicate songs +- **Bulk Selection**: Select multiple items for batch operations +- **Search & Filter**: Real-time search across artists, titles, and paths +- **Responsive Design**: Mobile-friendly interface +- **Easy Startup**: Automated dependency checking and browser launch + +### 7.2 Web UI Architecture +- **Flask Backend**: Lightweight web server (`web/app.py`) +- **HTML Template**: Modern, responsive interface (`web/templates/index.html`) +- **Startup Script**: Dependency management and server startup (`start_web_ui.py`) + +### 7.3 Future Web UI Enhancements +- Embedded media player for audio/video preview +- Real-time configuration editing +- Advanced filtering and sorting options +- Export capabilities for manual selections --- @@ -200,11 +237,17 @@ The tool has been successfully implemented with the following components: - [x] Generate comprehensive skip list with metadata - [x] Optimize performance for large datasets (37,000+ songs) - [x] Add progress indicators and error handling +- [x] Generate detailed analysis reports (`--save-reports` functionality) +- [x] Create web UI for manual review of ambiguous cases +- [x] Add test tool for validation and debugging +- [x] Create startup script for web UI with dependency checking +- [x] Add comprehensive .gitignore file +- [x] Update documentation with required data file information #### 🎯 **Next Priority Items** -- 
[x] Generate detailed analysis reports (`--save-reports` functionality) - [ ] Analyze MP4 files without channel priorities to suggest new folder names -- [ ] Create web UI for manual review of ambiguous cases - [ ] Add support for additional file formats if needed - [ ] Implement batch processing capabilities -- [ ] Create integration scripts for karaoke software \ No newline at end of file +- [ ] Create integration scripts for karaoke software +- [ ] Add unit tests for core functionality +- [ ] Implement audio fingerprinting for better duplicate detection \ No newline at end of file diff --git a/README.md b/README.md index 3d706c8..f7ffbcf 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,15 @@ A powerful command-line tool for analyzing, deduplicating, and cleaning up large KaraokeMerge/ ├── data/ │ ├── allSongs.json # Input: Your song library data -│ └── skipSongs.json # Output: Generated skip list +│ ├── skipSongs.json # Output: Generated skip list +│ └── reports/ # Detailed analysis reports +│ ├── analysis_data.json +│ ├── actionable_insights_report.txt +│ ├── channel_optimization_report.txt +│ ├── duplicate_pattern_report.txt +│ ├── enhanced_summary_report.txt +│ ├── skip_list_summary.txt +│ └── skip_songs_detailed.json ├── config/ │ └── config.json # Configuration settings ├── cli/ @@ -28,6 +36,14 @@ KaraokeMerge/ │ ├── matching.py # Song matching logic │ ├── report.py # Report generation │ └── utils.py # Utility functions +├── web/ # Web UI for manual review +│ ├── app.py # Flask web application +│ └── templates/ +│ └── index.html # Web interface template +├── start_web_ui.py # Web UI startup script +├── test_tool.py # Validation and testing script +├── requirements.txt # Python dependencies +├── .gitignore # Git ignore rules ├── PRD.md # Product Requirements Document └── README.md # This file ``` @@ -39,6 +55,42 @@ KaraokeMerge/ - Python 3.7 or higher - Your karaoke song data in JSON format (see Data Format section) +### Required Data File + 
+**Important**: You need to provide your own `data/allSongs.json` file. This file is excluded from version control due to its large size and personal nature. + +**Sample `allSongs.json` format:** +```json +[ + { + "artist": "ACDC", + "title": "Shot In The Dark", + "path": "z://MP4\\ACDC - Shot In The Dark (Karaoke Version).mp4", + "guid": "8946008c-7acc-d187-60e6-5286e55ad502", + "disabled": false, + "favorite": false + }, + { + "artist": "Queen", + "title": "Bohemian Rhapsody", + "path": "z://MP4\\Sing King Karaoke\\Queen - Bohemian Rhapsody (Karaoke Version).mp4", + "guid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + "disabled": false, + "favorite": true + } +] +``` + +**Required fields:** +- `artist`: Song artist name +- `title`: Song title +- `path`: Full file path to the song file +- `guid`: Unique identifier for the song + +**Optional fields:** +- `disabled`: Boolean indicating if song is disabled (default: false) +- `favorite`: Boolean indicating if song is favorited (default: false) + ### Installation 1. 
Clone or download this repository @@ -59,7 +111,13 @@ python cli/main.py --dry-run # Save detailed reports python cli/main.py --save-reports -``` + +# Test the tool functionality +python test_tool.py + +# Start the web UI for manual review +python start_web_ui.py +``` ### Command Line Options @@ -152,6 +209,37 @@ Edit `config/config.json` to customize the tool's behavior: } ``` +## 🌐 Web UI for Manual Review + +The project includes a web interface for interactive review of duplicate songs: + +### Starting the Web UI + +```bash +python start_web_ui.py +``` + +This script will: +- Check for required dependencies (Flask) +- Install missing dependencies automatically +- Validate required data files exist +- Start the web server +- Open your browser automatically + +### Web UI Features + +- **Interactive Table**: Sortable, filterable grid of duplicate songs +- **Bulk Selection**: Select multiple items for batch operations +- **Real-time Search**: Filter by artist, title, or file path +- **Responsive Design**: Works on desktop and mobile devices +- **Detailed Information**: View full metadata for each duplicate + +### Web UI Requirements + +- Flask web framework (automatically installed if missing) +- Generated skip list data (`data/skipSongs.json`) +- Configuration file (`config/config.json`) + ## 📈 Understanding the Output ### Summary Report @@ -232,14 +320,32 @@ The codebase is designed for easy expansion: - **Modular Design**: Separate modules for matching, reporting, and utilities - **Configuration-Driven**: Easy to modify behavior without code changes -- **Web UI Ready**: Structure supports future web interface development +- **Web UI Implementation**: Full web interface for manual review and bulk operations +- **Testing Framework**: Built-in test tool for validation and debugging +- **Dependency Management**: Automated setup and dependency checking + +### Testing and Validation + +Use the built-in test tool to validate your setup: + +```bash +python test_tool.py +``` +
+This will: +- Test all module imports +- Validate configuration loading +- Test with a sample of your song data +- Verify report generation +- Provide feedback on any issues ### Adding New Features 1. **New File Formats**: Add extensions to `config.json` 2. **New Matching Rules**: Extend `SongMatcher` class in `matching.py` 3. **New Reports**: Add methods to `ReportGenerator` class -4. **Web UI**: Build on existing CLI structure +4. **Web UI Enhancements**: Extend `web/app.py` and `web/templates/index.html` +5. **Testing**: Add test cases to `test_tool.py` ## 🎯 Current Status @@ -251,6 +357,10 @@ The codebase is designed for easy expansion: - **Performance Optimization**: Handles large libraries (37,000+ songs) efficiently - **Enhanced Analysis & Reporting**: Comprehensive statistical analysis with actionable insights - **Pattern Analysis**: Skip list pattern analysis and channel optimization suggestions +- **Web UI**: Interactive web interface for manual review and bulk operations +- **Testing & Validation**: Test tool for functionality validation and debugging +- **Dependency Management**: Automated dependency checking and installation +- **Project Documentation**: Comprehensive .gitignore and updated documentation ### 🚀 **Ready for Use** The tool is production-ready and has successfully processed a large karaoke library: @@ -268,12 +378,15 @@ The tool is production-ready and has successfully processed a large karaoke libr - ✅ Add statistical insights and trends - ✅ Pattern analysis and channel optimization suggestions -### Phase 3: Web Interface -- Interactive table/grid for duplicate review -- Embedded media player for preview -- Bulk actions and manual overrides -- Real-time configuration editing -- Manual review interface for ambiguous cases +### Phase 3: Web Interface ✅ +- ✅ Interactive table/grid for duplicate review +- ✅ Bulk actions and manual overrides +- ✅ Real-time filtering and search +- ✅ Responsive design for mobile/desktop +- ✅ Easy startup with 
dependency checking +- [ ] Embedded media player for preview +- [ ] Real-time configuration editing +- [ ] Advanced export capabilities ### Phase 4: Advanced Features - Audio fingerprinting for better duplicate detection @@ -311,9 +424,11 @@ This project is open source. Feel free to use, modify, and distribute according ### Getting Help -1. Check the configuration with `python cli/main.py --show-config` -2. Run with `--verbose` for detailed output -3. Use `--dry-run` to test without generating files +1. **Test your setup**: Run `python test_tool.py` to validate everything is working +2. **Check configuration**: Use `python cli/main.py --show-config` to verify settings +3. **Verbose output**: Run with `--verbose` for detailed information +4. **Dry run**: Use `--dry-run` to test without generating files +5. **Web UI**: Start `python start_web_ui.py` for interactive review ## 📊 Performance & Results diff --git a/cli/__pycache__/matching.cpython-313.pyc b/cli/__pycache__/matching.cpython-313.pyc index e7fa632..288c541 100644 Binary files a/cli/__pycache__/matching.cpython-313.pyc and b/cli/__pycache__/matching.cpython-313.pyc differ diff --git a/cli/__pycache__/report.cpython-313.pyc b/cli/__pycache__/report.cpython-313.pyc index 0e00968..675236c 100644 Binary files a/cli/__pycache__/report.cpython-313.pyc and b/cli/__pycache__/report.cpython-313.pyc differ diff --git a/cli/__pycache__/utils.cpython-313.pyc b/cli/__pycache__/utils.cpython-313.pyc index c6a1086..b7077c4 100644 Binary files a/cli/__pycache__/utils.cpython-313.pyc and b/cli/__pycache__/utils.cpython-313.pyc differ diff --git a/cli/matching.py b/cli/matching.py index 2ba3ef4..6d80aa9 100644 --- a/cli/matching.py +++ b/cli/matching.py @@ -14,6 +14,7 @@ except ImportError: from utils import ( normalize_artist_title, extract_channel_from_path, + extract_consolidated_channel_from_path, get_file_extension, parse_multi_artist, validate_song_data, @@ -163,11 +164,8 @@ class SongMatcher: return False def 
get_channel_priority(self, file_path: str) -> int: - """Get channel priority for MP4 files based on configured folder names.""" - if not file_path.lower().endswith('.mp4'): - return -1 # Not an MP4 file - - channel = extract_channel_from_path(file_path, self.channel_priorities) + """Get channel priority for files based on configured folder names.""" + channel = extract_consolidated_channel_from_path(file_path, self.channel_priorities) if not channel: return len(self.channel_priorities) # Lowest priority if no channel found @@ -235,10 +233,10 @@ class SongMatcher: ext = get_file_extension(song['path']) stats['file_type_breakdown'][ext] += 1 - if ext == '.mp4': - channel = extract_channel_from_path(song['path'], self.channel_priorities) - if channel: - stats['channel_breakdown'][channel] += 1 + # Use consolidated channel extraction for all file types + channel = extract_consolidated_channel_from_path(song['path'], self.channel_priorities) + if channel: + stats['channel_breakdown'][channel] += 1 # Select best song and mark others for skipping best_song, group_skips = self.select_best_song(group_songs) diff --git a/cli/utils.py b/cli/utils.py index 3ede021..5e28c8b 100644 --- a/cli/utils.py +++ b/cli/utils.py @@ -117,6 +117,104 @@ def extract_channel_from_path(file_path: str, channel_priorities: List[str] = No if re.search(escaped_channel, path_lower): return channel + # If no configured channel found, extract the parent directory name + # Split path and look for the folder containing the file + parts = file_path.split('\\') + + # Look for MP4 folder structure: MP4/ChannelName/song.mp4 + for i, part in enumerate(parts): + if part.lower() == 'mp4' and i < len(parts) - 1: + # If MP4 is found, return the next folder (the actual channel) + if i + 1 < len(parts): + next_part = parts[i + 1] + # Check if the next part is a folder (no file extension) or a file + if '.' 
def extract_consolidated_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Return the channel label for a song path, consolidating MP3/CDG files.

    Args:
        file_path: Path to the song file. Both ``\\`` and ``/`` are accepted
            as separators, so mixed paths such as ``z://MP4\\Song.mp4`` work.
        channel_priorities: Configured channel folder names, highest priority
            first. Only consulted for ``.mp4`` files.

    Returns:
        * ``'MP3'`` for every ``.mp3`` / ``.cdg`` file, wherever it lives.
        * ``None`` for any other non-``.mp4`` file, and for ``.mp4`` files
          when no channel priorities are configured.
        * For ``.mp4`` files: the first configured channel whose name occurs
          (case-insensitively) anywhere in the path; otherwise the folder
          that follows an ``MP4`` path segment (``'MP4 Root'`` when the file
          sits directly in it); otherwise the nearest enclosing folder name.
        * ``None`` when no folder name can be determined.
    """
    path_lower = file_path.lower()

    # All MP3/CDG content is reported under one channel regardless of the
    # folder layout, so no path inspection is needed for these files.
    if path_lower.endswith(('.mp3', '.cdg')):
        return 'MP3'

    if not path_lower.endswith('.mp4'):
        return None

    if not channel_priorities:
        return None

    # Configured channels win; a plain case-insensitive substring test is
    # all that is required (no regex semantics are involved).
    for channel in channel_priorities:
        if channel.lower() in path_lower:
            return channel

    # Split on either separator and drop empty segments so a prefix such as
    # "z://" does not get glued to the first folder name ("z://MP4").
    parts = [part for part in re.split(r'[\\/]', file_path) if part]

    # MP4/<Channel>/song.mp4 -> "<Channel>"; MP4/song.mp4 -> "MP4 Root".
    # The final segment is the file itself, so it is never treated as "MP4".
    for i, part in enumerate(parts[:-1]):
        if part.lower() == 'mp4':
            next_part = parts[i + 1]
            # NOTE: a dot is used as the "this is a file" heuristic, so a
            # folder whose name contains a dot is treated as a file here.
            return next_part if '.' not in next_part else 'MP4 Root'

    # No MP4 segment: fall back to the folder holding the file, or one level
    # further up when that segment looks like a file name.
    if len(parts) >= 2:
        parent_folder = parts[-2]
        if '.' not in parent_folder:
            return parent_folder
        if len(parts) >= 3 and '.' not in parts[-3]:
            return parts[-3]

    return None
'Stingray Karaoke'] for channel in known_channels: @@ -88,9 +98,9 @@ def extract_channel(path: str) -> str: # If MP4 is found, return the next folder (the actual channel) if i + 1 < len(parts): next_part = parts[i + 1] - # Skip if the next part is the filename (no extension means it's a folder) + # Check if the next part is a folder (no file extension) or a file if '.' not in next_part: - return next_part + return next_part # It's a folder, so it's the channel name else: return 'MP4 Root' # File is directly in MP4 folder else: @@ -98,7 +108,7 @@ def extract_channel(path: str) -> str: # Look for any folder that contains 'karaoke' (fallback) for part in parts: - if 'karaoke' in part.lower(): + if 'karaoke' in part.lower() and '.' not in part: return part # If no specific channel found, return the folder containing the file @@ -107,7 +117,15 @@ def extract_channel(path: str) -> str: # If parent folder is MP4, then file is in root if parent_folder.lower() == 'mp4': return 'MP4 Root' - return parent_folder + # Make sure we're returning a folder name, not a filename + if '.' not in parent_folder: + return parent_folder + else: + # If parent is also a file, go up one more level + if len(parts) >= 3: + grandparent = parts[-3] + if '.' not in grandparent: + return grandparent return 'Unknown'