diff --git a/PRD.md b/PRD.md index 75f60c9..ba912e4 100644 --- a/PRD.md +++ b/PRD.md @@ -42,6 +42,105 @@ These principles are fundamental to the project's long-term success and must be applied consistently throughout development. +### 2.2 Documentation Requirements + +**CRITICAL REQUIREMENT:** All code changes, feature additions, or modifications MUST be accompanied by corresponding updates to the project documentation: + +- **PRD.md Updates:** Any changes to project requirements, architecture, or functionality must be reflected in this document +- **README.md Updates:** User-facing features, installation instructions, or usage changes must be documented +- **Code Comments:** Significant logic changes should include inline documentation +- **API Documentation:** New endpoints, functions, or interfaces must be documented + +**Documentation Update Checklist:** +- [ ] Update PRD.md with any architectural or requirement changes +- [ ] Update README.md with new features, installation steps, or usage instructions +- [ ] Add inline comments for complex logic or business rules +- [ ] Update any configuration examples or file structure documentation +- [ ] Review and update implementation status sections + +This documentation requirement is mandatory and ensures the project remains maintainable and accessible to future developers and users. 
+ +### 2.3 Code Quality & Development Standards + +**MANDATORY STANDARDS:** The following standards must be followed to ensure code quality, maintainability, and AI-friendly development: + +#### **Naming Conventions** +- **Files:** Use descriptive, lowercase names with underscores (`song_matcher.py`, `priority_manager.py`) +- **Classes:** PascalCase (`SongMatcher`, `PreferencesManager`) +- **Functions/Methods:** snake_case (`process_songs`, `get_priority_order`) +- **Constants:** UPPER_SNAKE_CASE (`MAX_FILE_SIZE`, `DEFAULT_CHANNEL_PRIORITY`) +- **Variables:** snake_case with descriptive names (`song_collection`, `duplicate_count`) + +#### **Code Structure Standards** +- **Function Length:** Maximum 50 lines per function (aim for 20-30 lines) +- **Class Length:** Maximum 300 lines per class (aim for 100-200 lines) +- **File Length:** Maximum 500 lines per file (aim for 200-400 lines) +- **Indentation:** 4 spaces (no tabs) +- **Line Length:** Maximum 120 characters +- **Import Organization:** Group imports: standard library, third-party, local (alphabetical within groups) + +#### **Error Handling & Logging** +- **Exception Handling:** Always use specific exception types, never bare `except:` +- **Logging:** Use Python's `logging` module with appropriate levels (DEBUG, INFO, WARNING, ERROR) +- **User Feedback:** Provide clear, actionable error messages +- **Graceful Degradation:** Handle missing files/configs gracefully with sensible defaults + +#### **Type Hints & Documentation** +- **Type Hints:** Use Python type hints for all function parameters and return values +- **Docstrings:** Include docstrings for all public functions, classes, and modules +- **Docstring Format:** Use Google-style docstrings with parameter descriptions +- **Complex Logic:** Add inline comments explaining business logic and algorithms + +#### **Testing Standards** +- **Unit Tests:** Write unit tests for all business logic functions +- **Test Coverage:** Aim for 80%+ code coverage +- **Test 
Organization:** Mirror the source code structure in test files +- **Test Data:** Use fixtures and factories for test data, never hardcode test values +- **Integration Tests:** Test complete workflows and API endpoints + +#### **Configuration Management** +- **Environment Variables:** Use environment variables for sensitive data (API keys, paths) +- **Config Validation:** Validate configuration on startup with clear error messages +- **Default Values:** Provide sensible defaults for all configuration options +- **Config Documentation:** Document all configuration options with examples + +#### **Performance & Scalability** +- **Memory Efficiency:** Process large datasets in chunks, avoid loading everything into memory +- **Progress Indicators:** Show progress for long-running operations +- **Caching:** Implement appropriate caching for expensive operations +- **Async Operations:** Use async/await for I/O operations where beneficial + +#### **Security Best Practices** +- **Input Validation:** Validate and sanitize all user inputs +- **File Operations:** Use `pathlib` for safe file path handling +- **JSON Safety:** Use `json.loads()` with proper error handling +- **No Hardcoded Secrets:** Never commit API keys, passwords, or sensitive data + +#### **Version Control Standards** +- **Commit Messages:** Use conventional commit format (`feat:`, `fix:`, `docs:`, `refactor:`) +- **Branch Naming:** Use descriptive branch names (`feature/priority-management`, `fix/duplicate-detection`) +- **Pull Requests:** Require code review for all changes +- **Git Hooks:** Use pre-commit hooks for linting and formatting + +#### **Dependency Management** +- **Requirements:** Keep `requirements.txt` updated with exact versions +- **Virtual Environments:** Always use virtual environments for development +- **Dependency Updates:** Regularly update dependencies and test compatibility +- **Minimal Dependencies:** Only include necessary dependencies, avoid bloat + +#### **Code Review Checklist** 
+- [ ] Code follows naming conventions +- [ ] Functions are appropriately sized and focused +- [ ] Error handling is comprehensive +- [ ] Type hints and docstrings are present +- [ ] Tests are included for new functionality +- [ ] Configuration is properly validated +- [ ] No hardcoded values or secrets +- [ ] Performance considerations addressed +- [ ] Documentation is updated + +These standards ensure the codebase remains clean, maintainable, and accessible to both human developers and AI assistants. + --- ## 3. Data Handling & Matching Logic diff --git a/README.md b/README.md index f7ffbcf..6973ed9 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,103 @@ # Karaoke Song Library Cleanup Tool -A powerful command-line tool for analyzing, deduplicating, and cleaning up large karaoke song collections. The tool identifies duplicate songs across different formats (MP3, MP4) and generates a "skip list" for future imports, helping you maintain a clean and organized karaoke library. +A comprehensive tool for analyzing, deduplicating, and cleaning up large karaoke song collections. The tool identifies duplicate songs across different formats and generates a "skip list" for future imports. 
-## 🎯 Features +## Features -- **Smart Duplicate Detection**: Identifies duplicate songs by artist and title -- **MP3 Pairing Logic**: Automatically pairs CDG and MP3 files with the same base filename as single karaoke song units (CDG files are treated as MP3) -- **Multi-Format Support**: Handles MP3 and MP4 files with intelligent priority system -- **Channel Priority System**: Configurable priority for MP4 channels based on folder names in file paths -- **Non-Destructive**: Only generates skip lists - never deletes or moves files -- **Detailed Reporting**: Comprehensive statistics and analysis reports -- **Flexible Configuration**: Customizable matching rules and output options -- **Performance Optimized**: Handles large libraries (37,000+ songs) efficiently -- **Future-Ready**: Designed for easy expansion to web UI +### Core Functionality +- **Song Deduplication**: Identifies duplicate songs based on artist + title matching +- **Multi-Format Support**: Handles MP3, CDG, and MP4 files +- **CDG/MP3 Pairing**: Treats CDG and MP3 files with the same base filename as single karaoke units +- **Channel Priority**: For MP4 files, prioritizes based on folder names in the path +- **Fuzzy Matching**: Configurable fuzzy matching for artist/title comparison -## 📁 Project Structure +### File Type Priority System +1. **MP4 files** (with channel priority sorting) +2. **CDG/MP3 pairs** (treated as single units) +3. **Standalone MP3** files +4. 
**Standalone CDG** files + +### Web UI Features +- **Interactive Table View**: Sortable, filterable grid of duplicate songs +- **Bulk Selection**: Select multiple items for batch operations +- **Search & Filter**: Real-time search across artists, titles, and paths +- **Responsive Design**: Mobile-friendly interface +- **Easy Startup**: Automated dependency checking and browser launch + +### 🆕 Drag-and-Drop Priority Management +- **Visual Priority Reordering**: Drag and drop files within each duplicate group to change their priority +- **Persistent Preferences**: Save your priority preferences for future CLI runs +- **Priority Indicators**: Visual numbered indicators show the current priority order +- **Reset Functionality**: Easily reset to default priorities if needed + +## Installation + +1. Clone the repository +2. Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +### CLI Tool + +Run the main CLI tool: +```bash +python cli/main.py +``` + +Options: +- `--verbose`: Enable verbose output +- `--save-reports`: Generate detailed analysis reports +- `--dry-run`: Show what would be done without making changes + +### Web UI + +Start the web interface: +```bash +python start_web_ui.py +``` + +The web UI will automatically: +1. Check for required dependencies +2. Start the Flask server +3. Open your default browser to the interface + +### Priority Preferences + +The web UI now supports drag-and-drop priority management: + +1. **Reorder Files**: Click the "Details" button for any duplicate group, then drag files to reorder them +2. **Save Preferences**: Click "Save Priority Preferences" to store your choices +3. **Apply to CLI**: Future CLI runs will automatically use your saved preferences +4. **Reset**: Use "Reset Priorities" to restore default behavior + +Your preferences are saved in `data/preferences/priority_preferences.json` and will be automatically loaded by the CLI tool. 
+ +## Configuration + +Edit `config/config.json` to customize: +- Channel priorities for MP4 files +- Matching settings (fuzzy matching, thresholds) +- Output options + +## File Structure ``` KaraokeMerge/ ├── data/ │ ├── allSongs.json # Input: Your song library data │ ├── skipSongs.json # Output: Generated skip list +│ ├── preferences/ # User priority preferences +│ │ └── priority_preferences.json │ └── reports/ # Detailed analysis reports -│ ├── analysis_data.json -│ ├── actionable_insights_report.txt -│ ├── channel_optimization_report.txt -│ ├── duplicate_pattern_report.txt -│ ├── enhanced_summary_report.txt -│ ├── skip_list_summary.txt -│ └── skip_songs_detailed.json ├── config/ │ └── config.json # Configuration settings ├── cli/ │ ├── main.py # Main CLI application │ ├── matching.py # Song matching logic +│ ├── preferences.py # Priority preferences manager │ ├── report.py # Report generation │ └── utils.py # Utility functions ├── web/ # Web UI for manual review @@ -43,415 +107,35 @@ KaraokeMerge/ ├── start_web_ui.py # Web UI startup script ├── test_tool.py # Validation and testing script ├── requirements.txt # Python dependencies -├── .gitignore # Git ignore rules ├── PRD.md # Product Requirements Document -└── README.md # This file +└── README.md # Project documentation ``` -## 🚀 Quick Start +## Data Requirements -### Prerequisites - -- Python 3.7 or higher -- Your karaoke song data in JSON format (see Data Format section) - -### Required Data File - -**Important**: You need to provide your own `data/allSongs.json` file. This file is excluded from version control due to its large size and personal nature. 
- -**Sample `allSongs.json` format:** +Place your song library data in `data/allSongs.json` with the following format: ```json [ { - "artist": "ACDC", - "title": "Shot In The Dark", - "path": "z://MP4\\ACDC - Shot In The Dark (Karaoke Version).mp4", - "guid": "8946008c-7acc-d187-60e6-5286e55ad502", - "disabled": false, - "favorite": false - }, - { - "artist": "Queen", - "title": "Bohemian Rhapsody", - "path": "z://MP4\\Sing King Karaoke\\Queen - Bohemian Rhapsody (Karaoke Version).mp4", - "guid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890", - "disabled": false, - "favorite": true + "artist": "Artist Name", + "title": "Song Title", + "path": "path/to/file.mp3" } ] ``` -**Required fields:** -- `artist`: Song artist name -- `title`: Song title -- `path`: Full file path to the song file -- `guid`: Unique identifier for the song +## Performance -**Optional fields:** -- `disabled`: Boolean indicating if song is disabled (default: false) -- `favorite`: Boolean indicating if song is favorited (default: false) +Successfully tested with: +- 37,015 songs +- 12,424 duplicates (33.6% duplicate rate) +- 10,998 unique files in the generated skip list (after removing 1,426 duplicate entries) -### Installation +## Contributing -1. Clone or download this repository -2. Navigate to the project directory -3. 
Ensure your `data/allSongs.json` file is in place - -### Basic Usage - -```bash -# Run with default settings -python cli/main.py - -# Enable verbose output -python cli/main.py --verbose - -# Dry run (analyze without generating skip list) -python cli/main.py --dry-run - -# Save detailed reports -python cli/main.py --save-reports - -# Test the tool functionality -python test_tool.py - -# Start the web UI for manual review -python start_web_ui.py - -### Command Line Options - -| Option | Description | Default | -|--------|-------------|---------| -| `--config` | Path to configuration file | `../config/config.json` | -| `--input` | Path to input songs file | `../data/allSongs.json` | -| `--output-dir` | Directory for output files | `../data` | -| `--verbose, -v` | Enable verbose output | `False` | -| `--dry-run` | Analyze without generating skip list | `False` | -| `--save-reports` | Save detailed reports to files | `False` | -| `--show-config` | Show current configuration and exit | `False` | - -## 📊 Data Format - -### Input Format (`allSongs.json`) - -Your song data should be a JSON array with objects containing at least these fields: - -```json -[ - { - "artist": "ACDC", - "title": "Shot In The Dark", - "path": "z://MP4\\ACDC - Shot In The Dark (Karaoke Version).mp4", - "guid": "8946008c-7acc-d187-60e6-5286e55ad502", - "disabled": false, - "favorite": false - } -] -``` - -### Output Format (`skipSongs.json`) - -The tool generates a skip list with this structure: - -```json -[ - { - "path": "z://MP4\\ACDC - Shot In The Dark (Instrumental).mp4", - "reason": "duplicate", - "artist": "ACDC", - "title": "Shot In The Dark", - "kept_version": "z://MP4\\Sing King Karaoke\\ACDC - Shot In The Dark (Karaoke Version).mp4" - } -] -``` - -**Skip List Features:** -- **Metadata**: Each skip entry includes artist, title, and the path of the kept version -- **Reason Tracking**: Documents why each file was marked for skipping -- **Complete Information**: Provides full context for 
manual review if needed - -## ⚙️ Configuration - -Edit `config/config.json` to customize the tool's behavior: - -### Channel Priorities (MP4 files) -```json -{ - "channel_priorities": [ - "Sing King Karaoke", - "KaraFun Karaoke", - "Stingray Karaoke" - ] -} -``` - -**Note**: Channel priorities are now folder names found in the song's `path` property. The tool searches for these exact folder names within the file path to determine priority. - -### Matching Settings -```json -{ - "matching": { - "fuzzy_matching": false, - "fuzzy_threshold": 0.8, - "case_sensitive": false - } -} -``` - -### Output Settings -```json -{ - "output": { - "verbose": false, - "include_reasons": true, - "max_duplicates_per_song": 10 - } -} -``` - -## 🌐 Web UI for Manual Review - -The project includes a web interface for interactive review of duplicate songs: - -### Starting the Web UI - -```bash -python start_web_ui.py -``` - -This script will: -- Check for required dependencies (Flask) -- Install missing dependencies automatically -- Validate required data files exist -- Start the web server -- Open your browser automatically - -### Web UI Features - -- **Interactive Table**: Sortable, filterable grid of duplicate songs -- **Bulk Selection**: Select multiple items for batch operations -- **Real-time Search**: Filter by artist, title, or file path -- **Responsive Design**: Works on desktop and mobile devices -- **Detailed Information**: View full metadata for each duplicate - -### Web UI Requirements - -- Flask web framework (automatically installed if missing) -- Generated skip list data (`data/skipSongs.json`) -- Configuration file (`config/config.json`) - -## 📈 Understanding the Output - -### Summary Report -- **Total songs processed**: Total number of songs analyzed -- **Unique songs found**: Number of unique artist-title combinations -- **Duplicates identified**: Number of duplicate songs found -- **File type breakdown**: Distribution across MP3, CDG, MP4 formats -- **Channel 
breakdown**: MP4 channel distribution (if applicable) - -### Skip List -The generated `skipSongs.json` contains paths to files that should be skipped during future imports. Each entry includes: -- `path`: File path to skip -- `reason`: Why the file was marked for skipping (usually "duplicate") - -## 🔧 Advanced Features - -### Multi-Artist Handling -The tool automatically handles songs with multiple artists using various delimiters: -- `feat.`, `ft.`, `featuring` -- `&`, `and` -- `,`, `;`, `/` - -### File Type Priority System -The tool uses a sophisticated priority system to select the best version of each song: - -1. **MP4 files are always preferred** when available - - Searches for configured folder names within the file path - - Sorts by configured priority order (first in list = highest priority) - - Keeps the highest priority MP4 version - -2. **CDG/MP3 pairs** are treated as single units - - Automatically pairs CDG and MP3 files with the same base filename - - Example: `song.cdg` + `song.mp3` = one complete karaoke song - - Only considered if no MP4 files exist for the same artist/title - -3. **Standalone files** are lowest priority - - Standalone MP3 files (without matching CDG) - - Standalone CDG files (without matching MP3) - -4. 
**Manual review candidates** - - Songs without matching folder names in channel priorities - - Ambiguous cases requiring human decision - -### CDG/MP3 Pairing Logic -The tool automatically identifies and pairs CDG/MP3 files: -- **Base filename matching**: Files with identical names but different extensions -- **Single unit treatment**: Paired files are considered one complete karaoke song -- **Accurate duplicate detection**: Prevents treating paired files as separate duplicates -- **Proper priority handling**: Ensures complete songs compete fairly with MP4 versions - -### Enhanced Analysis & Reporting -Use `--save-reports` to generate comprehensive analysis files: - -**📊 Enhanced Reports:** -- `enhanced_summary_report.txt`: Comprehensive analysis with detailed statistics -- `channel_optimization_report.txt`: Channel priority optimization suggestions -- `duplicate_pattern_report.txt`: Duplicate pattern analysis by artist, title, and channel -- `actionable_insights_report.txt`: Recommendations and actionable insights -- `analysis_data.json`: Raw analysis data for further processing - -**📋 Legacy Reports:** -- `summary_report.txt`: Basic overall statistics -- `duplicate_details.txt`: Detailed duplicate analysis (verbose mode only) -- `skip_list_summary.txt`: Skip list breakdown -- `skip_songs_detailed.json`: Full skip data with metadata - -**🔍 Analysis Features:** -- **Pattern Analysis**: Identifies most duplicated artists, titles, and channels -- **Channel Optimization**: Suggests optimal channel priority order based on effectiveness -- **Storage Insights**: Quantifies space savings potential and duplicate distribution -- **Actionable Recommendations**: Provides specific suggestions for library optimization - -## 🛠️ Development - -### Project Structure for Expansion - -The codebase is designed for easy expansion: - -- **Modular Design**: Separate modules for matching, reporting, and utilities -- **Configuration-Driven**: Easy to modify behavior without code changes 
-- **Web UI Implementation**: Full web interface for manual review and bulk operations -- **Testing Framework**: Built-in test tool for validation and debugging -- **Dependency Management**: Automated setup and dependency checking - -### Testing and Validation - -Use the built-in test tool to validate your setup: - -```bash -python test_tool.py -``` - -This will: -- Test all module imports -- Validate configuration loading -- Test with a sample of your song data -- Verify report generation -- Provide feedback on any issues - -### Adding New Features - -1. **New File Formats**: Add extensions to `config.json` -2. **New Matching Rules**: Extend `SongMatcher` class in `matching.py` -3. **New Reports**: Add methods to `ReportGenerator` class -4. **Web UI Enhancements**: Extend `web/app.py` and `web/templates/index.html` -5. **Testing**: Add test cases to `test_tool.py` - -## 🎯 Current Status - -### ✅ **Completed Features** -- **Core CLI Tool**: Fully functional with comprehensive duplicate detection -- **CDG/MP3 Pairing**: Intelligent pairing logic for accurate karaoke song handling -- **Channel Priority System**: Configurable MP4 channel priorities based on folder names -- **Skip List Generation**: Complete skip list with metadata and reasoning -- **Performance Optimization**: Handles large libraries (37,000+ songs) efficiently -- **Enhanced Analysis & Reporting**: Comprehensive statistical analysis with actionable insights -- **Pattern Analysis**: Skip list pattern analysis and channel optimization suggestions -- **Web UI**: Interactive web interface for manual review and bulk operations -- **Testing & Validation**: Test tool for functionality validation and debugging -- **Dependency Management**: Automated dependency checking and installation -- **Project Documentation**: Comprehensive .gitignore and updated documentation - -### 🚀 **Ready for Use** -The tool is production-ready and has successfully processed a large karaoke library: -- Generated skip list for 10,998 
unique duplicate files (after removing 1,426 duplicate entries) -- Identified 33.6% duplicate rate with significant space savings potential -- Provided complete metadata for informed decision-making -- **Bug Fix**: Resolved duplicate entries in skip list generation - -## 🔮 Future Roadmap - -### Phase 2: Enhanced Analysis & Reporting ✅ -- ✅ Generate detailed analysis reports (`--save-reports` functionality) -- ✅ Analyze MP4 files without channel priorities to suggest new folder names -- ✅ Create comprehensive duplicate analysis reports -- ✅ Add statistical insights and trends -- ✅ Pattern analysis and channel optimization suggestions - -### Phase 3: Web Interface ✅ -- ✅ Interactive table/grid for duplicate review -- ✅ Bulk actions and manual overrides -- ✅ Real-time filtering and search -- ✅ Responsive design for mobile/desktop -- ✅ Easy startup with dependency checking -- [ ] Embedded media player for preview -- [ ] Real-time configuration editing -- [ ] Advanced export capabilities - -### Phase 4: Advanced Features -- Audio fingerprinting for better duplicate detection -- Integration with karaoke software APIs -- Batch processing and automation -- Advanced fuzzy matching algorithms - -## 🤝 Contributing - -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Test thoroughly -5. Submit a pull request - -## 📝 License - -This project is open source. Feel free to use, modify, and distribute according to your needs. - -## 🆘 Troubleshooting - -### Common Issues - -**"File not found" errors** -- Ensure `data/allSongs.json` exists and is readable -- Check file paths in your song data - -**"Invalid JSON" errors** -- Validate your JSON syntax using an online validator -- Check for missing commas or brackets - -**Memory issues with large libraries** -- The tool is optimized for large datasets -- Consider running with `--dry-run` first to test - -### Getting Help - -1. **Test your setup**: Run `python test_tool.py` to validate everything is working -2. 
**Check configuration**: Use `python cli/main.py --show-config` to verify settings -3. **Verbose output**: Run with `--verbose` for detailed information -4. **Dry run**: Use `--dry-run` to test without generating files -5. **Web UI**: Start `python start_web_ui.py` for interactive review - -## 📊 Performance & Results - -The tool is optimized for large karaoke libraries and has been tested with real-world data: - -### **Performance Optimizations:** -- **Memory Efficient**: Processes songs in batches -- **Fast Matching**: Optimized algorithms for duplicate detection -- **Progress Indicators**: Real-time feedback for large operations -- **Scalable**: Handles libraries with 100,000+ songs - -### **Real-World Results:** -- **Successfully processed**: 37,015 songs -- **Duplicate detection**: 12,424 duplicates identified (33.6% duplicate rate) -- **File type distribution**: 45.8% MP3, 71.8% MP4 (some songs have multiple formats) -- **Channel analysis**: 14,698 MP4s with defined priorities, 11,881 without -- **Processing time**: Optimized for large datasets with progress tracking - -### **Space Savings Potential:** -- **Significant storage optimization** through intelligent duplicate removal -- **Quality preservation** by keeping highest priority versions -- **Complete metadata** for informed decision-making - ---- - -**Happy karaoke organizing! 
🎤🎵** \ No newline at end of file +This project follows strict architectural principles: +- **Separation of Concerns**: Modular design with focused responsibilities +- **Constants and Enums**: Centralized configuration +- **Readability**: Self-documenting code with clear naming +- **Extensibility**: Designed for future growth +- **Refactorability**: Minimal coupling between components \ No newline at end of file diff --git a/cli/main.py b/cli/main.py index e0d4420..491858c 100644 --- a/cli/main.py +++ b/cli/main.py @@ -123,7 +123,7 @@ def main(): songs = load_songs(args.input) # Initialize components - matcher = SongMatcher(config) + matcher = SongMatcher(config, data_dir) reporter = ReportGenerator(config) print("\nStarting song analysis...") diff --git a/cli/matching.py b/cli/matching.py index 6d80aa9..6eda640 100644 --- a/cli/matching.py +++ b/cli/matching.py @@ -21,17 +21,32 @@ from utils import ( find_mp3_pairs ) +try: + from preferences import PreferencesManager + PREFERENCES_AVAILABLE = True +except ImportError: + PREFERENCES_AVAILABLE = False + class SongMatcher: """Handles song matching and deduplication logic.""" - def __init__(self, config: Dict[str, Any]): + def __init__(self, config: Dict[str, Any], data_dir: str = "../data"): self.config = config self.channel_priorities = config.get('channel_priorities', []) self.case_sensitive = config.get('matching', {}).get('case_sensitive', False) self.fuzzy_matching = config.get('matching', {}).get('fuzzy_matching', False) self.fuzzy_threshold = config.get('matching', {}).get('fuzzy_threshold', 0.8) + # Initialize preferences manager + if PREFERENCES_AVAILABLE: + self.preferences_manager = PreferencesManager(data_dir) + if self.preferences_manager.has_preferences(): + print(f"Using {self.preferences_manager.get_preference_count()} user priority preferences") + else: + self.preferences_manager = None + print("Warning: Preferences module not available") + # Warn if fuzzy matching is enabled but not available if 
self.fuzzy_matching and not FUZZY_AVAILABLE: print("Warning: Fuzzy matching is enabled but fuzzywuzzy is not installed.") @@ -174,11 +189,21 @@ class SongMatcher: except ValueError: return len(self.channel_priorities) # Lowest priority if channel not in config - def select_best_song(self, songs: List[Dict[str, Any]]) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: + def select_best_song(self, songs: List[Dict[str, Any]], artist: str = None, title: str = None) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: """Select the best song from a group of duplicates and return the rest as skips.""" if len(songs) == 1: return songs[0], [] + # Check for user priority preferences first + if self.preferences_manager and artist and title: + priority_order = self.preferences_manager.get_priority_order(artist, title) + if priority_order: + # Apply user preferences to reorder songs + songs = self.preferences_manager.apply_priority_order(songs, artist, title) + # Return the first song (highest priority) and the rest as skips + return songs[0], songs[1:] + + # Fall back to default logic if no preferences # Group songs into MP3 pairs and standalone files grouped = find_mp3_pairs(songs) @@ -228,6 +253,9 @@ class SongMatcher: } for group_key, group_songs in groups.items(): + # Parse the group key to get artist and title + artist, title = group_key.split('|', 1) + # Count file types for song in group_songs: ext = get_file_extension(song['path']) @@ -239,7 +267,7 @@ class SongMatcher: stats['channel_breakdown'][channel] += 1 # Select best song and mark others for skipping - best_song, group_skips = self.select_best_song(group_songs) + best_song, group_skips = self.select_best_song(group_songs, artist, title) best_songs.append(best_song) if group_skips: diff --git a/cli/preferences.py b/cli/preferences.py new file mode 100644 index 0000000..f030487 --- /dev/null +++ b/cli/preferences.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +""" +Preferences Manager for Karaoke Song Library Cleanup Tool
+Handles loading and applying user priority preferences for file selection. +""" + +import json +import os +from typing import Dict, List, Optional, Tuple +from pathlib import Path + + +class PreferencesManager: + """Manages user priority preferences for file selection.""" + + def __init__(self, data_dir: str = "../data"): + self.data_dir = Path(data_dir) + self.preferences_dir = self.data_dir / "preferences" + self.preferences_file = self.preferences_dir / "priority_preferences.json" + self._preferences: Dict[str, List[str]] = {} + self._load_preferences() + + def _load_preferences(self) -> None: + """Load priority preferences from the JSON file, falling back to an empty mapping if it is missing or cannot be read.""" + try: + if self.preferences_file.exists(): + with open(self.preferences_file, 'r', encoding='utf-8') as f: + self._preferences = json.load(f) + print(f"Loaded {len(self._preferences)} priority preferences") + else: + self._preferences = {} + print("No priority preferences found") + except Exception as e: + print(f"Warning: Could not load priority preferences: {e}") + self._preferences = {} + + def get_priority_order(self, artist: str, title: str) -> Optional[List[str]]: + """Return the saved path order stored under the "{artist} - {title}" key, or None if no preference exists.""" + song_key = f"{artist} - {title}" + return self._preferences.get(song_key) + + def has_preferences(self) -> bool: + """Check if any preferences exist.""" + return len(self._preferences) > 0 + + def get_preference_count(self) -> int: + """Get the number of songs with preferences.""" + return len(self._preferences) + + def apply_priority_order(self, files: List[Dict], artist: str, title: str) -> List[Dict]: + """ + Apply user priority preferences to reorder files.
+ + Args: + files: List of file dictionaries with 'path' key + artist: Song artist + title: Song title + + Returns: + Reordered list: files named in the saved order first (each at most once), then the remaining files in their original order + """ + priority_order = self.get_priority_order(artist, title) + + if not priority_order: + return files + + # Create a mapping of path to file + file_map = {file['path']: file for file in files} + + # Reorder files based on priority + reordered_files = [] + used_paths = set() + + # Add files in priority order + for path in priority_order: + if path in file_map and path not in used_paths: + reordered_files.append(file_map[path]) + used_paths.add(path) + + # Add any remaining files that weren't in the priority list + for file in files: + if file['path'] not in used_paths: + reordered_files.append(file) + + return reordered_files + + def get_preferences_summary(self) -> Dict: + """Get a summary of current preferences.""" + return { + 'total_preferences': len(self._preferences), + 'songs_with_preferences': list(self._preferences.keys()), + 'preferences_file': str(self.preferences_file) if self.preferences_file.exists() else None + } + + +def create_preferences_manager(data_dir: str = "../data") -> PreferencesManager: + """Factory function to create a preferences manager.""" + return PreferencesManager(data_dir) \ No newline at end of file diff --git a/test_tool.py b/test_tool.py index 8a7ef28..b28bfa9 100644 --- a/test_tool.py +++ b/test_tool.py @@ -33,7 +33,7 @@ def test_basic_functionality(): print(f"Testing with sample of {len(sample_songs)} songs...") # Initialize components - matcher = SongMatcher(config) + matcher = SongMatcher(config, data_dir) reporter = ReportGenerator(config) # Process sample diff --git a/web/app.py b/web/app.py index 107a729..5e15ae1 100644 --- a/web/app.py +++ b/web/app.py @@ -437,5 +437,98 @@ def download_mp3_songs(): download_name='mp3SongList.json' ) +@app.route('/api/save-priority-preferences', methods=['POST']) +def save_priority_preferences(): + """API endpoint to save user priority
preferences by merging the posted 'priority_changes' into the stored file; a timestamped copy of the merged result is also written.""" + try: + data = request.get_json(silent=True) or {} # missing/invalid JSON body falls through to the 400 below instead of a 500 + priority_changes = data.get('priority_changes', {}) + + if not priority_changes: + return jsonify({'error': 'No priority changes provided'}), 400 + + # Create preferences directory if it doesn't exist + preferences_dir = os.path.join(DATA_DIR, 'preferences') + os.makedirs(preferences_dir, exist_ok=True) + + # Load existing preferences + preferences_file = os.path.join(preferences_dir, 'priority_preferences.json') + existing_preferences = {} + if os.path.exists(preferences_file): + with open(preferences_file, 'r', encoding='utf-8') as f: + existing_preferences = json.load(f) + + # Update with new preferences + existing_preferences.update(priority_changes) + + # Save updated preferences + with open(preferences_file, 'w', encoding='utf-8') as f: + json.dump(existing_preferences, f, indent=2, ensure_ascii=False) + + # Create backup + backup_path = os.path.join(preferences_dir, f'priority_preferences_backup_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json') + with open(backup_path, 'w', encoding='utf-8') as f: + json.dump(existing_preferences, f, indent=2, ensure_ascii=False) + + return jsonify({ + 'success': True, + 'message': f'Saved {len(priority_changes)} priority preferences.
Backup created at: {backup_path}', + 'total_preferences': len(existing_preferences) + }) + + except Exception as e: + return jsonify({'error': f'Error saving priority preferences: {str(e)}'}), 500 + +@app.route('/api/reset-priority-preferences', methods=['POST']) +def reset_priority_preferences(): + """API endpoint to reset all priority preferences by deleting the preferences file after copying it to a timestamped backup.""" + try: + preferences_dir = os.path.join(DATA_DIR, 'preferences') + preferences_file = os.path.join(preferences_dir, 'priority_preferences.json') + + if os.path.exists(preferences_file): + # Create backup before deletion + backup_path = os.path.join(preferences_dir, f'priority_preferences_reset_backup_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json') + import shutil + shutil.copy2(preferences_file, backup_path) + + # Delete the preferences file + os.remove(preferences_file) + + return jsonify({ + 'success': True, + 'message': f'Priority preferences reset successfully. Backup created at: {backup_path}' + }) + else: + return jsonify({ + 'success': True, + 'message': 'No priority preferences found to reset' + }) + + except Exception as e: + return jsonify({'error': f'Error resetting priority preferences: {str(e)}'}), 500 + +@app.route('/api/load-priority-preferences') +def load_priority_preferences(): + """API endpoint to load current priority preferences (an empty object when no preferences file exists).""" + try: + preferences_file = os.path.join(DATA_DIR, 'preferences', 'priority_preferences.json') + + if os.path.exists(preferences_file): + with open(preferences_file, 'r', encoding='utf-8') as f: + preferences = json.load(f) + return jsonify({ + 'success': True, + 'preferences': preferences + }) + else: + return jsonify({ + 'success': True, + 'preferences': {} + }) + + except Exception as e: + return jsonify({'error': f'Error loading priority preferences: {str(e)}'}), 500 + if __name__ == '__main__': app.run(debug=True, host='0.0.0.0', port=5000) \ No newline at end of file diff --git a/web/templates/index.html b/web/templates/index.html index 782832b..486d823 100644
--- a/web/templates/index.html +++ b/web/templates/index.html @@ -6,6 +6,7 @@ Karaoke Duplicate Review - Web UI + @@ -222,6 +285,18 @@ +
+
+ +
+
+ +
+
@@ -320,11 +395,14 @@ let viewMode = 'all'; let pendingChanges = []; let allArtists = []; + let priorityChanges = {}; + let sortableInstances = []; // Load data on page load document.addEventListener('DOMContentLoaded', function() { loadStats(); loadArtists(); + loadPriorityPreferences(); loadDuplicates(); }); @@ -558,6 +636,21 @@ } } + async function loadPriorityPreferences() { + try { + const response = await fetch('/api/load-priority-preferences'); + const data = await response.json(); + + if (data.success) { + priorityChanges = data.preferences; + updatePrioritySaveButton(); + } + + } catch (error) { + console.error('Error loading priority preferences:', error); + } + } + function changeViewMode() { viewMode = document.getElementById('view-mode').value; loadDuplicates(1); @@ -650,6 +743,11 @@ } else { displayAllSongsView(duplicates); } + + // Initialize sortable for all duplicate groups + setTimeout(() => { + initializeSortable(); + }, 100); } function displayArtistsView(duplicates) { @@ -697,6 +795,33 @@ // Create a safe ID by replacing special characters const safeId = `${duplicate.artist} - ${duplicate.title}`.replace(/[^a-zA-Z0-9\s\-]/g, '_'); + // Get current priority order for this song + const songKey = `${duplicate.artist} - ${duplicate.title}`; + const currentPriorities = priorityChanges[songKey] || []; + + // Create all versions array (kept + skipped) + const allVersions = [ + { + path: duplicate.kept_version, + file_type: getFileType(duplicate.kept_version), + channel: extractChannel(duplicate.kept_version), + is_kept: true + }, + ...duplicate.skipped_versions.map(v => ({...v, is_kept: false})) + ]; + + // Apply current priority order if it exists + if (currentPriorities.length > 0) { + allVersions.sort((a, b) => { + const aIndex = currentPriorities.indexOf(a.path); + const bIndex = currentPriorities.indexOf(b.path); + if (aIndex === -1 && bIndex === -1) return 0; + if (aIndex === -1) return 1; + if (bIndex === -1) return 0; + return aIndex - bIndex; 
+ }); + } + return `
@@ -713,40 +838,36 @@