From 9124640bf4228febe2ceee804e8df28aa0b25a51 Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Fri, 1 Aug 2025 08:01:07 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- COMMANDS.md | 384 +++++++++++++++++++++-------------- PRD.md | 52 +++-- README.md | 211 ++++++++++++++++---- data/songs.json | 284 +++++++++++--------------- src/api/database.py | 292 +++++++++++++++++++-------- src/cli/main.py | 472 +++++++++++++++++++++++++++----------------- 6 files changed, 1065 insertions(+), 630 deletions(-) diff --git a/COMMANDS.md b/COMMANDS.md index 2b1c887..5cd1ecf 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -2,55 +2,86 @@ ## Overview -The MusicBrainz Data Cleaner is a command-line interface (CLI) tool that processes JSON song data files and cleans/normalizes the metadata using the MusicBrainz database. +The MusicBrainz Data Cleaner is a command-line interface (CLI) tool that processes JSON song data files and cleans/normalizes the metadata using the MusicBrainz database. The tool creates separate output files for successful and failed songs, along with detailed processing reports. 
## Basic Command Structure ```bash -python musicbrainz_cleaner.py [output_file] [options] +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main [options] ``` -## Command Arguments +## Command Options -### Required Arguments +### Main Options -| Argument | Type | Description | Example | -|----------|------|-------------|---------| -| `input_file` | string | Path to the JSON file containing song data | `my_songs.json` | - -### Optional Arguments - -| Argument | Type | Description | Example | -|----------|------|-------------|---------| -| `output_file` | string | Path for the cleaned output file | `cleaned_songs.json` | -| `--help` | flag | Show help information | `--help` | -| `--version` | flag | Show version information | `--version` | +| Option | Type | Description | Default | Example | +|--------|------|-------------|---------|---------| +| `--source` | string | Source JSON file path | `data/songs.json` | `--source data/my_songs.json` | +| `--output-success` | string | Output file for successful songs | `source-success.json` | `--output-success cleaned.json` | +| `--output-failure` | string | Output file for failed songs | `source-failure.json` | `--output-failure failed.json` | +| `--limit` | number | Process only first N songs | None (all songs) | `--limit 1000` | +| `--use-api` | flag | Force use of HTTP API instead of database | Database mode | `--use-api` | +| `--test-connection` | flag | Test connection to MusicBrainz server | None | `--test-connection` | +| `--help` | flag | Show help information | None | `--help` | +| `--version` | flag | Show version information | None | `--version` | ## Command Examples -### Basic Usage +### Basic Usage (Default) ```bash -# Clean songs and save to auto-generated filename -python musicbrainz_cleaner.py songs.json -# Output: songs_cleaned.json +# Process all songs with default settings +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main +# Output: data/songs-success.json and 
data/songs-failure.json ``` -### Custom Output File +### Custom Source File ```bash -# Specify custom output filename -python musicbrainz_cleaner.py songs.json cleaned_songs.json +# Process specific file +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/my_songs.json +# Output: data/my_songs-success.json and data/my_songs-failure.json +``` + +### Custom Output Files + +```bash +# Specify custom output files +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/songs.json --output-success cleaned.json --output-failure failed.json +``` + +### Limited Processing + +```bash +# Process only first 1000 songs +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --limit 1000 +``` + +### Force API Mode + +```bash +# Use HTTP API instead of database (slower but works without PostgreSQL) +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --use-api +``` + +### Test Connection + +```bash +# Test database connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection + +# Test API connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection --use-api ``` ### Help and Information ```bash # Show help information -python musicbrainz_cleaner.py --help +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --help # Show version information -python musicbrainz_cleaner.py --version +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --version ``` ## Input File Format @@ -84,9 +115,13 @@ Any additional fields will be preserved in the output: - `path`: File path - Any other custom fields -## Output File Format +## Output Files -The output file will contain the same structure with cleaned data and added MBID fields: +The tool creates **three output files**: + +### 1. 
Successful Songs (`source-success.json`) + +Array of successfully processed songs with MBIDs added: ```json [ @@ -103,49 +138,107 @@ The output file will contain the same structure with cleaned data and added MBID ] ``` -### Added Fields +### 2. Failed Songs (`source-failure.json`) + +Array of songs that couldn't be processed (same format as source): + +```json +[ + { + "artist": "Unknown Artist", + "title": "Unknown Song", + "disabled": false, + "favorite": false, + "guid": "12345678-1234-1234-1234-123456789012", + "path": "z://MP4\\Unknown Artist - Unknown Song.mp4" + } +] +``` + +### 3. Processing Report (`processing_report_YYYYMMDD_HHMMSS.txt`) + +Human-readable text report with statistics and failed song list: + +``` +MusicBrainz Data Cleaner - Processing Report +================================================== + +Source File: data/songs.json +Processing Date: 2024-12-19 14:30:22 +Processing Time: 15263.3 seconds + +SUMMARY +-------------------- +Total Songs Processed: 49,170 +Successful Songs: 40,692 +Failed Songs: 8,478 +Success Rate: 82.8% + +DETAILED STATISTICS +-------------------- +Artists Found: 44,526/49,170 (90.6%) +Recordings Found: 40,998/49,170 (83.4%) +Processing Speed: 3.2 songs/second + +OUTPUT FILES +-------------------- +Successful Songs: data/songs-success.json +Failed Songs: data/songs-failure.json +Report File: data/processing_report_20241219_143022.txt + +FAILED SONGS (First 50) +-------------------- + 1. Unknown Artist - Unknown Song + 2. Invalid Artist - Invalid Title + 3. Test Artist - Test Song +... +``` + +### Added Fields (Successful Songs Only) - `mbid`: MusicBrainz Artist ID (string) - `recording_mbid`: MusicBrainz Recording ID (string) -## Command Line Options +## Processing Output -### Help Option +### Progress Indicators -```bash -python musicbrainz_cleaner.py --help +``` +πŸš€ Starting song processing... 
+πŸ“Š Total songs to process: 49,170 +Using database connection +================================================== + +[1 of 49,170] βœ… PASS: ACDC - Shot In The Dark +[2 of 49,170] ❌ FAIL: Unknown Artist - Unknown Song +[3 of 49,170] βœ… PASS: Bruno Mars feat. Cardi B - Finesse (remix) + + πŸ“ˆ Progress: 100/49,170 (0.2%) - Success: 85.0% - Rate: 3.2 songs/sec + +================================================== +πŸŽ‰ Processing completed! +πŸ“Š Final Results: + ⏱️ Total processing time: 15263.3 seconds + πŸš€ Average speed: 3.2 songs/second + βœ… Artists found: 44,526/49,170 (90.6%) + βœ… Recordings found: 40,998/49,170 (83.4%) + ❌ Failed songs: 8,478 (17.2%) +πŸ“„ Files saved: + βœ… Successful songs: data/songs-success.json + ❌ Failed songs: data/songs-failure.json + πŸ“‹ Text report: data/processing_report_20241219_143022.txt + πŸ“Š JSON report: data/processing_report_20241219_143022.json ``` -**Output:** -``` -Usage: python musicbrainz_cleaner.py [output_file.json] +### Status Indicators -MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz - -Arguments: - input_file.json JSON file containing array of song objects - output_file.json Optional: Output file for cleaned data - (default: input_file_cleaned.json) - -Examples: - python musicbrainz_cleaner.py songs.json - python musicbrainz_cleaner.py songs.json cleaned_songs.json - -Requirements: - - MusicBrainz server running on http://localhost:5001 - - Python 3.6+ with requests library -``` - -### Version Option - -```bash -python musicbrainz_cleaner.py --version -``` - -**Output:** -``` -MusicBrainz Data Cleaner v1.0.0 -``` +| Symbol | Meaning | Description | +|--------|---------|-------------| +| βœ… | Success | Song processed successfully with MBIDs found | +| ❌ | Failure | Song processing failed (no MBIDs found) | +| πŸ“ˆ | Progress | Progress update with statistics | +| πŸš€ | Start | Processing started | +| πŸŽ‰ | Complete | Processing completed successfully | ## Error Messages and 
Exit Codes @@ -161,7 +254,7 @@ MusicBrainz Data Cleaner v1.0.0 #### File Not Found ``` -Error: File 'songs.json' not found +Error: Source file does not exist: data/songs.json ``` #### Invalid JSON @@ -171,12 +264,12 @@ Error: Invalid JSON in file 'songs.json' #### Invalid Input Format ``` -Error: Input file should contain a JSON array of songs +Error: Source file should contain a JSON array of songs ``` #### Connection Error ``` -Error searching for artist 'Artist Name': Connection refused +❌ Connection to MusicBrainz database failed ``` #### Missing Dependencies @@ -184,112 +277,95 @@ Error searching for artist 'Artist Name': Connection refused ModuleNotFoundError: No module named 'requests' ``` -## Processing Output +## Environment Configuration -### Progress Indicators +### Docker Environment -``` -Processing 3 songs... -================================================== - -[1/3] Processing: ACDC - Shot In The Dark - βœ… Found artist: AC/DC (MBID: 66c662b6-6e2f-4930-8610-912e24c63ed1) - βœ… Found recording: Shot in the Dark (MBID: cf8b5cd0-d97c-413d-882f-fc422a2e57db) - βœ… Updated to: AC/DC - Shot in the Dark - -[2/3] Processing: Bruno Mars ft. Cardi B - Finesse Remix - ❌ Could not find artist: Bruno Mars ft. Cardi B - -[3/3] Processing: Taylor Swift - Love Story - βœ… Found artist: Taylor Swift (MBID: 20244d07-534f-4eff-b4d4-930878889970) - βœ… Found recording: Love Story (MBID: d783e6c5-761f-4fc3-bfcf-6089cdfc8f96) - βœ… Updated to: Taylor Swift - Love Story - -================================================== -βœ… Processing complete! 
-πŸ“ Output saved to: songs_cleaned.json -``` - -### Status Indicators - -| Symbol | Meaning | Description | -|--------|---------|-------------| -| βœ… | Success | Operation completed successfully | -| ❌ | Error | Operation failed | -| πŸ”„ | Processing | Currently processing | - -## Batch Processing - -### Multiple Files - -To process multiple files, you can use shell scripting: - -```bash -# Process all JSON files in current directory -for file in *.json; do - python musicbrainz_cleaner.py "$file" -done -``` - -### Large Files - -For large files, the tool processes songs one at a time with a 0.1-second delay between API calls to be respectful to the MusicBrainz server. - -## Environment Variables - -The tool uses the following default configuration: +The tool runs in a Docker container with the following configuration: | Setting | Default | Description | |---------|---------|-------------| -| MusicBrainz URL | `http://localhost:5001` | Local MusicBrainz server URL | -| API Delay | `0.1` seconds | Delay between API calls | +| Database Host | `db` | PostgreSQL database container | +| Database Port | `5432` | PostgreSQL port | +| Database Name | `musicbrainz_db` | MusicBrainz database name | +| API URL | `http://localhost:5001` | MusicBrainz web server URL | + +### Environment Variables + +```bash +# Database configuration +DB_HOST=db +DB_PORT=5432 +DB_NAME=musicbrainz_db +DB_USER=musicbrainz +DB_PASSWORD=musicbrainz + +# Web server configuration +MUSICBRAINZ_WEB_SERVER_PORT=5001 +``` ## Troubleshooting Commands ### Check MusicBrainz Server Status ```bash -# Test if server is running +# Test if web server is running curl -I http://localhost:5001 -# Test API endpoint -curl http://localhost:5001/ws/2/artist/?query=name:AC/DC&fmt=json +# Test database connection +docker-compose exec db psql -U musicbrainz -d musicbrainz_db -c "SELECT COUNT(*) FROM artist;" ``` ### Validate JSON File ```bash # Check if JSON is valid -python -m json.tool songs.json +python -m json.tool 
data/songs.json # Check JSON structure -python -c "import json; data=json.load(open('songs.json')); print('Valid JSON array with', len(data), 'items')" +python -c "import json; data=json.load(open('data/songs.json')); print('Valid JSON array with', len(data), 'items')" ``` -### Check Python Dependencies +### Test Tool Connection ```bash -# Check if requests is installed -python -c "import requests; print('requests version:', requests.__version__)" +# Test database connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection -# Install if missing -pip install requests +# Test API connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection --use-api ``` ## Advanced Usage -### Custom MusicBrainz Server +### Batch Processing -To use a different MusicBrainz server, modify the script: +To process multiple files, you can use shell scripting: -```python -# In musicbrainz_cleaner.py, change: -self.base_url = "http://your-server:5001" +```bash +# Process all JSON files in data directory +for file in data/*.json; do + docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source "$file" +done ``` -### Verbose Output +### Large Files -For debugging, you can modify the script to add more verbose output by uncommenting debug print statements. 
+For large files, the tool processes songs efficiently with: +- Direct database access for maximum speed +- Progress tracking every 100 songs +- Memory-efficient processing +- No rate limiting with database access + +### Custom Processing + +```bash +# Process only the first 1000 songs (useful for testing) +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/songs.json --limit 1000 + +# Process with custom output files +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/songs.json --output-success my_cleaned.json --output-failure my_failed.json +``` ## Command Line Shortcuts @@ -299,18 +375,22 @@ Add these to your shell profile for convenience: ```bash # Add to ~/.bashrc or ~/.zshrc -alias mbclean='python musicbrainz_cleaner.py' -alias mbclean-help='python musicbrainz_cleaner.py --help' +alias mbclean='docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main' +alias mbclean-help='docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --help' +alias mbclean-test='docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection' ``` ### Usage with Aliases ```bash # Using alias -mbclean songs.json +mbclean --source data/songs.json # Show help mbclean-help + +# Test connection +mbclean-test ``` ## Integration Examples @@ -319,8 +399,8 @@ mbclean-help ```bash # Process files and commit changes -python musicbrainz_cleaner.py songs.json -git add songs_cleaned.json +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/songs.json +git add data/songs-success.json data/songs-failure.json git commit -m "Clean song metadata with MusicBrainz IDs" ``` @@ -328,7 +408,7 @@ git commit -m "Clean song metadata with MusicBrainz IDs" ```bash # Add to crontab to process files daily -0 2 * * * cd /path/to/musicbrainz-cleaner && python musicbrainz_cleaner.py /path/to/songs.json +0 2 * * * cd /path/to/musicbrainz-cleaner && docker-compose run --rm musicbrainz-cleaner 
python3 -m src.cli.main --source /path/to/songs.json ``` ### With Shell Scripts @@ -337,13 +417,18 @@ git commit -m "Clean song metadata with MusicBrainz IDs" #!/bin/bash # clean_songs.sh INPUT_FILE="$1" -OUTPUT_FILE="${INPUT_FILE%.json}_cleaned.json" +OUTPUT_SUCCESS="${INPUT_FILE%.json}-success.json" +OUTPUT_FAILURE="${INPUT_FILE%.json}-failure.json" -python musicbrainz_cleaner.py "$INPUT_FILE" "$OUTPUT_FILE" +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main \ + --source "$INPUT_FILE" \ + --output-success "$OUTPUT_SUCCESS" \ + --output-failure "$OUTPUT_FAILURE" if [ $? -eq 0 ]; then - echo "Successfully cleaned $INPUT_FILE" - echo "Output saved to $OUTPUT_FILE" + echo "Successfully processed $INPUT_FILE" + echo "Successful songs: $OUTPUT_SUCCESS" + echo "Failed songs: $OUTPUT_FAILURE" else echo "Error processing $INPUT_FILE" exit 1 @@ -354,7 +439,10 @@ fi | Command | Description | |---------|-------------| -| `python musicbrainz_cleaner.py file.json` | Basic usage | -| `python musicbrainz_cleaner.py file.json output.json` | Custom output | -| `python musicbrainz_cleaner.py --help` | Show help | -| `python musicbrainz_cleaner.py --version` | Show version | \ No newline at end of file +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main` | Process all songs with defaults | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source file.json` | Process specific file | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --limit 1000` | Process first 1000 songs | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection` | Test database connection | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --use-api` | Force API mode | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --help` | Show help | +| `docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --version` | Show version | \ No newline at end 
of file diff --git a/PRD.md b/PRD.md index f35c6ce..3e12cdf 100644 --- a/PRD.md +++ b/PRD.md @@ -38,13 +38,16 @@ docker-compose exec db psql -U musicbrainz -d musicbrainz_db -c "SELECT COUNT(*) docker-compose run --rm musicbrainz-cleaner python3 -c "from src.api.database import MusicBrainzDatabase; db = MusicBrainzDatabase(); print('Connection result:', db.connect())" ``` -### 4. Run Tests +### 4. Run the Cleaner ```bash -# Test 100 random songs -docker-compose run --rm musicbrainz-cleaner python3 test_100_random.py +# Process all songs with default settings +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main -# Or other test scripts -docker-compose run --rm musicbrainz-cleaner python3 [script_name].py +# Process with custom options +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/my_songs.json --limit 1000 + +# Test connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection ``` **⚠️ Critical**: Always run scripts via Docker - the cleaner cannot connect to the database directly from outside the container. @@ -119,10 +122,15 @@ Users have song data in JSON format with inconsistent artist names, song titles, #### 6. 
CLI Interface - **REQ-034:** Command-line interface with argument parsing -- **REQ-035:** Support for input and optional output file specification -- **REQ-036:** Progress reporting during processing +- **REQ-035:** Support for source file specification with smart defaults +- **REQ-036:** Progress reporting during processing with song counter - **REQ-037:** Error handling and user-friendly messages - **REQ-038:** Option to force API mode with `--use-api` flag +- **NEW REQ-039:** Simplified CLI with default full dataset processing +- **NEW REQ-040:** Separate output files for successful and failed songs (array format) +- **NEW REQ-041:** Human-readable text report with statistics +- **NEW REQ-042:** Configurable processing limits and output file paths +- **NEW REQ-043:** Smart defaults for all file paths and options ### ✅ Non-Functional Requirements @@ -170,7 +178,11 @@ src/ │ ├── __init__.py │ └── constants.py # Constants and settings ├── core/ # Core functionality -├── utils/ # Utility functions +├── tests/ # Test files and scripts +│ ├── __init__.py +│ ├── test_*.py # Unit and integration tests +│ └── debug_*.py # Debug scripts +└── utils/ # Utility functions ``` ### Architectural Principles @@ -183,6 +195,7 @@ src/ - **Fallback Strategy**: Automatic fallback to API when database unavailable - **NEW**: **Database-First**: Always use live database data over static caches - **NEW**: **Intelligent Collaboration Detection**: Distinguish band names from collaborations +- **NEW**: **Test Organization**: All test files must be placed in `src/tests/` directory, not in root ### Data Flow 1. 
Read JSON input file @@ -224,6 +237,13 @@ src/ - Manual configuration needed for custom artist/recording mappings - **NEW**: Some edge cases may require manual intervention (data quality issues) +### Test File Organization +- **REQUIRED**: All test files must be placed in `src/tests/` directory +- **PROHIBITED**: Test files should not be placed in the root directory +- **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns +- **Purpose**: Keeps root directory clean and organizes test code properly +- **Import Path**: Tests can import from parent modules using relative imports + ## Server Setup Requirements ### MusicBrainz Server Configuration @@ -295,6 +315,11 @@ docker-compose logs -f musicbrainz - [x] **NEW**: Band name vs collaboration distinction - [x] **NEW**: Complex collaboration parsing - [x] **NEW**: Removed problematic known_artists cache +- [x] **NEW**: Simplified CLI with default full dataset processing +- [x] **NEW**: Separate output files for successful and failed songs (array format) +- [x] **NEW**: Human-readable text reports with statistics +- [x] **NEW**: Smart defaults for all file paths and options +- [x] **NEW**: Configurable processing limits and output file paths ### 🔄 Future Enhancements - [ ] Web interface option @@ -396,14 +421,17 @@ pip install -r requirements.txt ### Usage ```bash -# Use database access (recommended, faster) -python musicbrainz_cleaner.py input.json +# Process all songs with default settings (recommended) +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main + +# Process specific file with custom options +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/my_songs.json --limit 1000 # Force API mode (slower, fallback) -python musicbrainz_cleaner.py input.json --use-api +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --use-api # Test connections -python musicbrainz_cleaner.py --test-connection +docker-compose run --rm 
musicbrainz-cleaner python3 -m src.cli.main --test-connection ``` ## Maintenance diff --git a/README.md b/README.md index 34946ab..3f039eb 100644 --- a/README.md +++ b/README.md @@ -265,8 +265,13 @@ Before running tests, verify: ## πŸ§ͺ Testing -Run the test suite to verify everything works correctly: +### Test File Organization +- **REQUIRED**: All test files must be placed in `src/tests/` directory +- **PROHIBITED**: Test files should not be placed in the root directory +- **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns +- **Purpose**: Keeps root directory clean and organizes test code properly +### Running Tests ```bash # Run all tests python3 src/tests/run_tests.py @@ -288,7 +293,25 @@ python3 src/tests/run_tests.py --list - **Integration Tests**: Test interactions between components and database - **Debug Tests**: Debug scripts and troubleshooting tools -## πŸ“ Data Files +## πŸ“ Project Structure + +``` +musicbrainz-cleaner/ +β”œβ”€β”€ src/ +β”‚ β”œβ”€β”€ api/ # Database and API access +β”‚ β”œβ”€β”€ cli/ # Command-line interface +β”‚ β”œβ”€β”€ config/ # Configuration and constants +β”‚ β”œβ”€β”€ core/ # Core functionality +β”‚ β”œβ”€β”€ tests/ # Test files (REQUIRED location) +β”‚ └── utils/ # Utility functions +β”œβ”€β”€ data/ # Data files and output +β”‚ β”œβ”€β”€ known_artists.json # Name variations (ACDC β†’ AC/DC) +β”‚ β”œβ”€β”€ known_recordings.json # Known recording MBIDs +β”‚ └── songs.json # Source songs file +└── docker-compose.yml # Docker configuration +``` + +### Data Files The tool uses external JSON files for name variations: @@ -342,40 +365,61 @@ These files can be easily updated without touching the code, making it simple to - **Numbers**: "98 Degrees", "S Club 7", "3 Doors Down" - **Special Characters**: "a-ha", "The B-52s", "Salt-N-Pepa" +### πŸ†• Simplified Processing +- **Default Behavior**: Process all songs by default (no special flags needed) +- **Separate Output Files**: Successful and failed 
songs saved to different files +- **Progress Tracking**: Real-time progress with song counter and status +- **Smart Defaults**: Sensible defaults for all file paths and options +- **Detailed Reporting**: Comprehensive statistics and processing report +- **Batch Processing**: Efficient handling of large song collections + ## πŸ“– Usage Examples -### Basic Usage +### Basic Usage (Default) ```bash -# Clean your songs and save to auto-generated filename -python musicbrainz_cleaner.py my_songs.json -# Output: my_songs_cleaned.json +# Process all songs with default settings (data/songs.json) +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main +# Output: data/songs-success.json and data/songs-failure.json ``` -### Custom Output File +### Custom Source File ```bash -# Specify your own output filename -python musicbrainz_cleaner.py my_songs.json cleaned_songs.json +# Process specific file +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/my_songs.json +# Output: data/my_songs-success.json and data/my_songs-failure.json +``` + +### Custom Output Files +```bash +# Specify custom output files +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --source data/songs.json --output-success cleaned.json --output-failure failed.json +``` + +### Limit Processing +```bash +# Process only first 1000 songs +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --limit 1000 ``` ### Force API Mode ```bash # Use HTTP API instead of database (slower but works without PostgreSQL) -python musicbrainz_cleaner.py my_songs.json --use-api +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --use-api ``` ### Test Connections ```bash # Test database connection -python musicbrainz_cleaner.py --test-connection +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --test-connection # Test with API mode -python musicbrainz_cleaner.py --test-connection --use-api +docker-compose run --rm 
musicbrainz-cleaner python3 -m src.cli.main --test-connection --use-api ``` ### Help ```bash # Show usage information -python musicbrainz_cleaner.py --help +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --help ``` ## πŸ“ Data Files @@ -406,7 +450,10 @@ Your JSON file should contain an array of song objects: ## πŸ“€ Output Format -The tool will update your objects with corrected data: +The tool creates **three output files**: + +### 1. Successful Songs (`source-success.json`) +Array of successfully processed songs with MBIDs added: ```json [ @@ -433,39 +480,123 @@ The tool will update your objects with corrected data: ] ``` +### 2. Failed Songs (`source-failure.json`) +Array of songs that couldn't be processed (same format as source): + +```json +[ + { + "artist": "Unknown Artist", + "title": "Unknown Song", + "disabled": false, + "favorite": false, + "guid": "12345678-1234-1234-1234-123456789012", + "path": "z://MP4\\Unknown Artist - Unknown Song.mp4" + } +] +``` + +### 3. Processing Report (`processing_report_YYYYMMDD_HHMMSS.txt`) +Human-readable text report with statistics and failed song list: + +``` +MusicBrainz Data Cleaner - Processing Report +================================================== + +Source File: data/songs.json +Processing Date: 2024-12-19 14:30:22 +Processing Time: 15263.3 seconds + +SUMMARY +-------------------- +Total Songs Processed: 49,170 +Successful Songs: 40,692 +Failed Songs: 8,478 +Success Rate: 82.8% + +DETAILED STATISTICS +-------------------- +Artists Found: 44,526/49,170 (90.6%) +Recordings Found: 40,998/49,170 (83.4%) +Processing Speed: 3.2 songs/second + +OUTPUT FILES +-------------------- +Successful Songs: data/songs-success.json +Failed Songs: data/songs-failure.json +Report File: data/processing_report_20241219_143022.txt + +FAILED SONGS (First 50) +-------------------- + 1. Unknown Artist - Unknown Song + 2. Invalid Artist - Invalid Title + 3. Test Artist - Test Song +... 
+``` + ## 🎬 Example Run +### Basic Processing ```bash -$ python musicbrainz_cleaner.py data/sample_songs.json +$ docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main -Processing 3 songs... +πŸš€ Starting song processing... +πŸ“Š Total songs to process: 49,170 Using database connection ================================================== -[1/3] Processing: ACDC - Shot In The Dark - 🎯 Fuzzy match found: ACDC β†’ AC/DC (score: 0.85) - βœ… Found artist: AC/DC (MBID: 66c662b6-6e2f-4930-8610-912e24c63ed1) - 🎯 Fuzzy match found: Shot In The Dark β†’ Shot in the Dark (score: 0.92) - βœ… Found recording: Shot in the Dark (MBID: cf8b5cd0-d97c-413d-882f-fc422a2e57db) - βœ… Updated to: AC/DC - Shot in the Dark +[1 of 49,170] βœ… PASS: ACDC - Shot In The Dark +[2 of 49,170] ❌ FAIL: Unknown Artist - Unknown Song +[3 of 49,170] βœ… PASS: Bruno Mars feat. Cardi B - Finesse (remix) +[4 of 49,170] βœ… PASS: Taylor Swift - Love Story +... -[2/3] Processing: Bruno Mars ft. Cardi B - Finesse Remix - 🎯 Fuzzy match found: Bruno Mars β†’ Bruno Mars (score: 1.00) - βœ… Found artist: Bruno Mars (MBID: afb680f2-b6eb-4cd7-a70b-a63b25c763d5) - 🎯 Fuzzy match found: Finesse Remix β†’ Finesse (remix) (score: 0.88) - βœ… Found recording: Finesse (remix) (MBID: 8ed14014-547a-4128-ab81-c2dca7ae198e) - βœ… Updated to: Bruno Mars feat. Cardi B - Finesse (remix) - -[3/3] Processing: Taylor Swift - Love Story - 🎯 Fuzzy match found: Taylor Swift β†’ Taylor Swift (score: 1.00) - βœ… Found artist: Taylor Swift (MBID: 20244d07-534f-4eff-b4d4-930878889970) - 🎯 Fuzzy match found: Love Story β†’ Love Story (score: 1.00) - βœ… Found recording: Love Story (MBID: d783e6c5-761f-4fc3-bfcf-6089cdfc8f96) - βœ… Updated to: Taylor Swift - Love Story + πŸ“ˆ Progress: 100/49,170 (0.2%) - Success: 85.0% - Rate: 3.2 songs/sec + πŸ“ˆ Progress: 200/49,170 (0.4%) - Success: 87.5% - Rate: 3.1 songs/sec + ... ================================================== -βœ… Processing complete! 
-πŸ“ Output saved to: data/sample_songs_cleaned.json +πŸŽ‰ Processing completed! +πŸ“Š Final Results: + ⏱️ Total processing time: 15263.3 seconds + πŸš€ Average speed: 3.2 songs/second + βœ… Artists found: 44,526/49,170 (90.6%) + βœ… Recordings found: 40,998/49,170 (83.4%) + ❌ Failed songs: 8,478 (17.2%) +πŸ“„ Files saved: + βœ… Successful songs: data/songs-success.json + ❌ Failed songs: data/songs-failure.json + πŸ“‹ Text report: data/processing_report_20241219_143022.txt + πŸ“Š JSON report: data/processing_report_20241219_143022.json +``` + +### Limited Processing +```bash +$ docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --limit 1000 + +⚠️ Limiting processing to first 1000 songs +πŸš€ Starting song processing... +πŸ“Š Total songs to process: 1,000 +Using database connection +================================================== + +[1 of 1,000] βœ… PASS: ACDC - Shot In The Dark +[2 of 1,000] ❌ FAIL: Unknown Artist - Unknown Song +... + +================================================== +πŸŽ‰ Processing completed! +πŸ“Š Final Results: + ⏱️ Total processing time: 312.5 seconds + πŸš€ Average speed: 3.2 songs/second + βœ… Artists found: 856/1,000 (85.6%) + βœ… Recordings found: 789/1,000 (78.9%) + ❌ Failed songs: 211 (21.1%) +πŸ“„ Files saved: + βœ… Successful songs: data/songs-success.json + ❌ Failed songs: data/songs-failure.json + πŸ“‹ Text report: data/processing_report_20241219_143022.txt + πŸ“Š JSON report: data/processing_report_20241219_143022.json ``` ## πŸ”§ Troubleshooting @@ -601,6 +732,12 @@ This tool is provided as-is for educational and personal use. 
- **Fuzzy search thresholds** need tuning for different datasets - **Connection pooling** would improve performance for large datasets +### CLI Design +- **Simplified interface** with smart defaults reduces complexity +- **Array format consistency** makes output files easier to work with +- **Human-readable reports** improve user experience +- **Test file organization** keeps project structure clean + --- **Happy cleaning! 🎡✨** \ No newline at end of file diff --git a/data/songs.json b/data/songs.json index c9924de..efdbb96 100755 --- a/data/songs.json +++ b/data/songs.json @@ -63689,6 +63689,14 @@ "path": "z://MP4\\KaraFun Karaoke\\Karaoke Zoom - The Commodores.mp4", "title": "Zoom" }, + { + "artist": "Kat DeLuna feat. Elephant Man", + "disabled": false, + "favorite": false, + "guid": "a5487de7-4ec6-d6bb-7e88-6ec275133a52", + "path": "z://MP4\\KaraFun Karaoke\\Kat DeLuna feat. Elephant Man - Whine Up.mp4", + "title": "Whine Up" + }, { "artist": "Marillion", "disabled": false, @@ -69378,14 +69386,6 @@ "path": "z://MP4\\KaraFun Karaoke\\Whenever, Wherever - Shakira Karaoke Version KaraFun.mp4", "title": "Whenever, Wherever" }, - { - "artist": "Whine Up", - "disabled": false, - "favorite": false, - "guid": "04f64889-07cc-2811-bf7d-3fa235859e25", - "path": "z://MP4\\KaraFun Karaoke\\Whine Up - Kat DeLuna feat. Elephant Man Karaoke Version KaraFun.mp4", - "title": "Kat DeLuna feat. 
Elephant Man Karaoke Version KaraFun" - }, { "artist": "Michael BublΓ©", "disabled": false, @@ -169868,7 +169868,7 @@ "title": "Girl Crush" }, { - "artist": "Little Mix", + "artist": "Little M!x", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -169877,7 +169877,7 @@ "title": "How Ya Doin'" }, { - "artist": "Little Mix", + "artist": "Little M!x", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -200306,6 +200306,14 @@ "path": "z://MP4\\Stingray Karaoke\\Bartender Lady Antebellum Karaoke with Lyrics.mp4", "title": "Bartender" }, + { + "artist": "Bastille", + "disabled": false, + "favorite": false, + "guid": "2ca64591-9d7c-e975-5340-ca1804195902", + "path": "z://MP4\\Stingray Karaoke\\Bastille - Pompeii.mp4", + "title": "Pompeii" + }, { "artist": "No Doubt", "disabled": false, @@ -200549,6 +200557,14 @@ "path": "z://MP4\\Stingray Karaoke\\Black Sabbath - Paranoid.mp4", "title": "Paranoid" }, + { + "artist": "Black Sabbath", + "disabled": false, + "favorite": false, + "guid": "b1e7d35c-6682-546e-4b29-5829d3343899", + "path": "z://MP4\\Stingray Karaoke\\Black Sabbath - Snowblind.mp4", + "title": "Snowblind" + }, { "artist": "Elton John", "disabled": false, @@ -202891,6 +202907,14 @@ "path": "z://MP4\\Stingray Karaoke\\Jingle Bell Rock Bobby Helms Karaoke with Lyrics.mp4", "title": "Jingle Bell Rock" }, + { + "artist": "John Lennon", + "disabled": false, + "favorite": false, + "guid": "1b67db72-da96-f56f-2e38-c6ace52dfc1a", + "path": "z://MP4\\Stingray Karaoke\\John Lennon - Give Peace A Chance.mp4", + "title": "Give Peace A Chance" + }, { "artist": "John Lennon", "disabled": false, @@ -202987,38 +203011,6 @@ "path": "z://MP4\\Stingray Karaoke\\Justin Bieber - Mistletoe (Karaoke Version).mp4", "title": "Mistletoe" }, - { - "artist": "Bastille", - "disabled": false, - "favorite": false, - "guid": "e270d6ed-4e3e-9db2-d5ff-a06e1cc6b7d3", - "path": "z://MP4\\Stingray Karaoke\\Karaoke Version Pompeii in the Style of Bastille with lyrics (no lead 
vocal).mp4", - "title": "Karaoke Version Pompeii" - }, - { - "artist": "John Lennon", - "disabled": false, - "favorite": false, - "guid": "caa7c825-12a7-57b7-fafd-4e259851bd54", - "path": "z://MP4\\Stingray Karaoke\\Karaoke Video Give Peace A Chance in the Style of John Lennon with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Give Peace A Chance" - }, - { - "artist": "Peggy Lee", - "disabled": false, - "favorite": false, - "guid": "532b8b6d-4fa3-b5b4-3879-765d6bc223e2", - "path": "z://MP4\\Stingray Karaoke\\Karaoke Video Sing Fever in the Style of Peggy Lee with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Sing Fever" - }, - { - "artist": "Black Sabbath", - "disabled": false, - "favorite": false, - "guid": "4dae1287-ffd5-7d98-3b39-21cc172bebd5", - "path": "z://MP4\\Stingray Karaoke\\Karaoke Video Snowblind in the Style of Black Sabbath with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Snowblind" - }, { "artist": "Katy Perry", "disabled": false, @@ -204138,6 +204130,14 @@ "path": "z://MP4\\Stingray Karaoke\\Pearl Jam - Black (Karaoke Version).mp4", "title": "Black" }, + { + "artist": "Peggy Lee", + "disabled": false, + "favorite": false, + "guid": "bac4bc08-7c95-f059-8abe-a723960cd2aa", + "path": "z://MP4\\Stingray Karaoke\\Peggy Lee - Sing Fever.mp4", + "title": "Sing Fever" + }, { "artist": "Pentatonix", "disabled": false, @@ -207329,6 +207329,14 @@ "path": "z://MP4\\TheKARAOKEChannel\\Birthday in the Style of The Beatles karaoke video with lyrics (no lead vocal).mp4", "title": "Birthday" }, + { + "artist": "Black Sabbath", + "disabled": false, + "favorite": false, + "guid": "b0dfc4cd-8a54-0db4-378a-266d8e14e882", + "path": "z://MP4\\TheKARAOKEChannel\\Black Sabbath - Snowblind.mp4", + "title": "Snowblind" + }, { "artist": "Alannah Myles", "disabled": false, @@ -208913,6 +208921,14 @@ "path": "z://MP4\\TheKARAOKEChannel\\Green Day - I Fought The Law.mp4", "title": "I Fought The Law" }, + { + "artist": "Gretchen Wilson", + "disabled": 
false, + "favorite": false, + "guid": "660cec43-1346-f165-7f30-ada5d336d123", + "path": "z://MP4\\TheKARAOKEChannel\\Gretchen Wilson - Here For The Party.mp4", + "title": "Here For The Party" + }, { "artist": "Miranda Lambert", "disabled": false, @@ -209841,6 +209857,14 @@ "path": "z://MP4\\TheKARAOKEChannel\\John Legend - All Of Me (Lyrics).mp4", "title": "All Of Me" }, + { + "artist": "John Lennon", + "disabled": false, + "favorite": false, + "guid": "a9a33863-57a0-01f2-0060-386e0f3cdc32", + "path": "z://MP4\\TheKARAOKEChannel\\John Lennon - Give Peace A Chance.mp4", + "title": "Give Peace A Chance" + }, { "artist": "John Lennon", "disabled": false, @@ -210017,54 +210041,6 @@ "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Version Pompeii in the Style of Bastille with lyrics (no lead vocal).mp4", "title": "Karaoke Version Pompeii" }, - { - "artist": "Kelly Clarkson", - "disabled": false, - "favorite": false, - "guid": "ba61f0c9-cf3c-47b5-2d2f-b5de183348e9", - "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Video Because of You in the Style of Kelly Clarkson with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Because of You" - }, - { - "artist": "John Lennon", - "disabled": false, - "favorite": false, - "guid": "d0610705-cbce-05c4-e9ce-c7e0d5dec594", - "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Video Give Peace A Chance in the Style of John Lennon with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Give Peace A Chance" - }, - { - "artist": "Gretchen Wilson", - "disabled": false, - "favorite": false, - "guid": "0ad553d3-6816-a567-aae1-ffe72f717c95", - "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Video Here For The Party in the Style of Gretchen Wilson with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Here For The Party" - }, - { - "artist": "Peggy Lee", - "disabled": false, - "favorite": false, - "guid": "9c448677-ebaa-f8f4-9a32-55d16451515b", - "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Video Sing Fever in the Style of Peggy Lee with lyrics 
(no lead vocal).mp4", - "title": "Karaoke Video Sing Fever" - }, - { - "artist": "Black Sabbath", - "disabled": false, - "favorite": false, - "guid": "ae8512c0-733e-b962-708f-57e74c77c7f8", - "path": "z://MP4\\TheKARAOKEChannel\\Karaoke Video Snowblind in the Style of Black Sabbath with lyrics (no lead vocal).mp4", - "title": "Karaoke Video Snowblind" - }, - { - "artist": "Jewel", - "disabled": false, - "favorite": false, - "guid": "8775ad86-a0a0-2877-99ef-5b59f928ac4c", - "path": "z://MP4\\TheKARAOKEChannel\\KarenLovesAdam performs Foolish Games for the Undercover Karaoke Challenge in the style of Jewel.mp4", - "title": "KarenLovesAdam performs Foolish Games for the Undercover Karaoke Challenge" - }, { "artist": "Culture Club", "disabled": false, @@ -210137,6 +210113,14 @@ "path": "z://MP4\\TheKARAOKEChannel\\Kelly Clarkson & Ariana Grande - Santa, Can’t You Hear Me (Karaoke With Lyrics).mp4", "title": "Santa, Can’t You Hear Me" }, + { + "artist": "Kelly Clarkson", + "disabled": false, + "favorite": false, + "guid": "6362d98a-46df-48a6-6bf9-7a3df08a390c", + "path": "z://MP4\\TheKARAOKEChannel\\Kelly Clarkson - Because of You .mp4", + "title": "Because of You" + }, { "artist": "Kenny Rogers", "disabled": false, @@ -211369,6 +211353,14 @@ "path": "z://MP4\\TheKARAOKEChannel\\Peaceful Easy Feeling Eagles Karaoke with Lyrics.mp4", "title": "Peaceful Easy Feeling" }, + { + "artist": "Peggy Lee", + "disabled": false, + "favorite": false, + "guid": "8c50e858-3dc6-19d9-a417-5331e763af4b", + "path": "z://MP4\\TheKARAOKEChannel\\Peggy Lee - Sing Fever.mp4", + "title": "Sing Fever" + }, { "artist": "Pentatonix", "disabled": false, @@ -218141,15 +218133,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\All About Eve - Marthas Harbour.mp4", "title": "Martha's Harbour" }, - { - "artist": "All I Want For Christmas Is My Two Front Teeth", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "f347cb5c-fbc2-2d4d-a124-741414c58939", - "path": 
"z://MP4\\ZoomKaraokeOfficial\\All I Want For Christmas Is My Two Front Teeth - Karaoke Version from Zoom Karaoke.mp4", - "title": "Karaoke Version from Zoom Karaoke" - }, { "artist": "All Saints", "disabled": false, @@ -219192,7 +219175,7 @@ "genre": "Karaoke", "guid": "b5b380d6-6699-d1b6-095b-e7721f553838", "path": "z://MP4\\ZoomKaraokeOfficial\\Annie Soundtrack - Tomorrow Karaoke Version from Zoom Karaoke (1982 Version).mp4", - "title": "Tomorrow" + "title": "Tomorrow - Karaoke Version from Zoom Karaoke (1982 Version)" }, { "artist": "Another Level", @@ -220407,7 +220390,7 @@ "genre": "Karaoke", "guid": "892e330d-aeb9-195d-e2ac-1d2ed44b00f2", "path": "z://MP4\\ZoomKaraokeOfficial\\Bananarama - Venus Karaoke Version from Zoom Karaoke (Lyric Fixed).mp4", - "title": "Venus (Lyric Fixed)" + "title": "Venus - Karaoke Version from Zoom Karaoke (Lyric Fixed)" }, { "artist": "Bananarama", @@ -220416,7 +220399,7 @@ "genre": "Karaoke", "guid": "3cda0061-edc5-71eb-b1ce-a592de40fed8", "path": "z://MP4\\ZoomKaraokeOfficial\\Bananarama - Venus Karaoke Version from Zoom Karaoke (Old Version).mp4", - "title": "Venus (Old Version)" + "title": "Venus - Karaoke Version from Zoom Karaoke (Old Version)" }, { "artist": "Band Aid 30", @@ -223107,7 +223090,7 @@ "genre": "Karaoke", "guid": "708df1c8-4d05-607f-30a0-7ea1cf43c026", "path": "z://MP4\\ZoomKaraokeOfficial\\Blondie - Union City Blue Karaoke Version from Zoom Karaoke.mp4", - "title": "Union City Blue - Karaoke Version from Zoom Karaoke" + "title": "Union City Blue" }, { "artist": "Blood Brothers", @@ -225150,7 +225133,7 @@ "genre": "Karaoke", "guid": "ee0c36f0-bf6c-4d87-78f1-356e07824609", "path": "z://MP4\\ZoomKaraokeOfficial\\Bruce Springsteen - Sherry Darling Karaoke Version from Zoom Karaoke.mp4", - "title": "Sherry Darling - Karaoke Version from Zoom Karaoke" + "title": "Sherry Darling" }, { "artist": "Bruce Springsteen", @@ -227112,7 +227095,7 @@ "genre": "Karaoke", "guid": "89e2f35a-d52a-54f7-da46-8a590ee38f68", "path": 
"z://MP4\\ZoomKaraokeOfficial\\Charli XCX - Speed Drive Karaoke Version from Zoom Karaoke (Barbie Movie).mp4", - "title": "Speed Drive (Barbie Movie)" + "title": "Speed Drive - Karaoke Version from Zoom Karaoke (Barbie Movie)" }, { "artist": "Charli XCX ft. Ariana Grande", @@ -230082,7 +230065,7 @@ "genre": "Karaoke", "guid": "0173e449-92bc-8c75-7054-105227a56c19", "path": "z://MP4\\ZoomKaraokeOfficial\\Darlene Love - All Alone On Christmas Karaoke Version from Zoom Karaoke (from Home Alone).mp4", - "title": "All Alone On Christmas (from 'Home Alone')" + "title": "All Alone On Christmas - Karaoke Version from Zoom Karaoke (from 'Home Alone')" }, { "artist": "Darts", @@ -230217,7 +230200,7 @@ "genre": "Karaoke", "guid": "a48c4cc6-7a32-57e7-48ce-14ff7692b8d1", "path": "z://MP4\\ZoomKaraokeOfficial\\Dave Edmunds - From Small Things (Big Things One Day Come) Karaoke Version from Zoom Karaoke.mp4", - "title": "From Small Things (Big Things One Day Come)." + "title": "From Small Things (Big Things One Day Come) - Karaoke Version from Zoom Karaoke." }, { "artist": "Dave Edmunds", @@ -232143,7 +232126,7 @@ "genre": "Karaoke", "guid": "b119decd-7823-7e21-53b8-86d6934c9d87", "path": "z://MP4\\ZoomKaraokeOfficial\\Devo - Girl U Want Karaoke Version from Zoom Karaoke.mp4", - "title": "Girl U Want - Karaoke Version from Zoom Karaoke" + "title": "Girl U Want" }, { "artist": "Dexy's Midnight Runners", @@ -232550,15 +232533,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\Diplo & Miguel - Dont Forget My Love.mp4", "title": "Don't Forget My Love" }, - { - "artist": "Disney Villians The Musical ft. 
Maleficent", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "f2b524bf-d2e3-9a7e-a83a-495d79fdea59", - "path": "z://MP4\\ZoomKaraokeOfficial\\Disney Villians The Musical feat Maleficent - Karaoke Version from Zoom Karaoke.mp4", - "title": "Karaoke Version from Zoom Karaoke" - }, { "artist": "Divinyls", "disabled": false, @@ -238587,7 +238561,7 @@ "genre": "Karaoke", "guid": "c9081555-9f9f-7de4-1889-e0fde67dba1c", "path": "z://MP4\\ZoomKaraokeOfficial\\First Aid Kit - Fireworks (No Harmony For Duet) Karaoke Version from Zoom Karaoke.mp4", - "title": "Fireworks (No Harmony For Duet) - Karaoke Version from Zoom Karaoke" + "title": "Fireworks (No Harmony For Duet)" }, { "artist": "First Aid Kit", @@ -242000,15 +241974,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\God Save The King - British National Anthem.mp4", "title": "British National Anthem" }, - { - "artist": "God Save The Queen", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "b7d8684d-b7af-28b9-f7cd-8ec9dd16320f", - "path": "z://MP4\\ZoomKaraokeOfficial\\God Save The Queen - Karaoke Version from Zoom Karaoke British National Anthem.mp4", - "title": "Karaoke Version from Zoom Karaoke - British National Anthem" - }, { "artist": "Goldfrapp", "disabled": false, @@ -242036,15 +242001,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\Gonzalez - Havent Stopped Dancing Yet.mp4", "title": "Haven't Stopped Dancing Yet" }, - { - "artist": "Goodnight Campers from Hi-De-Hi", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "05afcc76-656b-9f6c-986f-c204a08e49eb", - "path": "z://MP4\\ZoomKaraokeOfficial\\Goodnight Campers from Hi-De-Hi - Karaoke Version from Zoom Karaoke.mp4", - "title": "Karaoke Version from Zoom Karaoke" - }, { "artist": "Gorgon City ft. 
Zak Abel", "disabled": false, @@ -244493,15 +244449,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\Imelda May - Train Kept A Rollin.mp4", "title": "Train Kept A Rollin'" }, - { - "artist": "In The Box (The Goodbye Song)", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "0c4383a2-d65f-a16c-b4b2-1a0fa1189480", - "path": "z://MP4\\ZoomKaraokeOfficial\\In The Box (The Goodbye Song) - Karaoke Version from Zoom Karaoke Australian TV Theme.mp4", - "title": "Karaoke Version from Zoom Karaoke - Australian TV Theme" - }, { "artist": "Infernal", "disabled": false, @@ -246111,7 +246058,7 @@ "genre": "Karaoke", "guid": "3ba6129d-ac04-b405-044d-8b453f565165", "path": "z://MP4\\ZoomKaraokeOfficial\\Jeannie C Riley - Harper Valley PTA Karaoke Version from Zoom Karaoke.mp4", - "title": "Harper Valley P.T.A. - Karaoke Version from Zoom Karaoke" + "title": "Harper Valley P.T.A." }, { "artist": "Jedward", @@ -250758,13 +250705,13 @@ "title": "You And I" }, { - "artist": "Kenny Rogers -You Decorated My Life", + "artist": "Kenny Rogers", "disabled": false, "favorite": false, "genre": "Karaoke", - "guid": "dec681a1-bddc-9920-083c-79022832ae3c", - "path": "z://MP4\\ZoomKaraokeOfficial\\Kenny Rogers -You Decorated My Life - Karaoke Version from Zoom Karaoke.mp4", - "title": "Karaoke Version from Zoom Karaoke" + "guid": "8bf25cbb-f0b1-b7d1-3095-9bc53b1fd971", + "path": "z://MP4\\ZoomKaraokeOfficial\\Kenny Rogers - You Decorated My Life.mp4", + "title": "You Decorated My Life" }, { "artist": "Kenny Rogers And Dolly Parton", @@ -257280,7 +257227,7 @@ "genre": "Karaoke", "guid": "c67b0bdb-0fdb-79e6-1aa7-d9abb648df62", "path": "z://MP4\\ZoomKaraokeOfficial\\Meghan Trainor - Good To Be Alive Karaoke Version from Zoom Karaoke.mp4", - "title": "Good To Be Alive - Karaoke Version from Zoom Karaoke" + "title": "Good To Be Alive" }, { "artist": "Meghan Trainor", @@ -263886,7 +263833,7 @@ "genre": "Karaoke", "guid": "33ebdbf6-adb1-bca8-a29a-499a167b53a8", "path": 
"z://MP4\\ZoomKaraokeOfficial\\Perry Como - Ave Maria Karaoke Version from Zoom Karaoke.mp4", - "title": "Ave Maria - Karaoke Version from Zoom Karaoke" + "title": "Ave Maria" }, { "artist": "Perry Como", @@ -265785,7 +265732,7 @@ "genre": "Karaoke", "guid": "da7260c6-5dab-0a8b-8229-307e1d5d2b2f", "path": "z://MP4\\ZoomKaraokeOfficial\\Postmodern Jukebox - We Cant Stop (Without Backing Vocals) Karaoke Version from Zoom Karaoke.mp4", - "title": "We Can't Stop (Without Backing Vocals) - Karaoke Version from Zoom Karaoke" + "title": "We Can't Stop" }, { "artist": "Postmodern Jukebox", @@ -267675,7 +267622,7 @@ "genre": "Karaoke", "guid": "d041e962-e78a-ad5f-e419-b670e47590e4", "path": "z://MP4\\ZoomKaraokeOfficial\\Rihanna - SOS Karaoke Version from Zoom Karaoke.mp4", - "title": "S.O.S. - Karaoke Version from Zoom Karaoke" + "title": "S.O.S." }, { "artist": "Rihanna", @@ -271617,7 +271564,7 @@ "genre": "Karaoke", "guid": "1cbf4200-9369-7551-005b-63420fa9b187", "path": "z://MP4\\ZoomKaraokeOfficial\\Sheppard - Geronimo Karaoke Version from Zoom Karaoke.mp4", - "title": "Geronimo - Karaoke Version from Zoom Karaoke" + "title": "Geronimo" }, { "artist": "Sherbet", @@ -271977,7 +271924,7 @@ "genre": "Karaoke", "guid": "d10abce9-d119-e81a-4156-48bf66ea9f30", "path": "z://MP4\\ZoomKaraokeOfficial\\Showaddywaddy - Why Do Lovers Break Each Others Hearts Karaoke Version From Zoom Karaoke.mp4", - "title": "Why Do Lovers Break Each Other's Hearts - Karaoke Version From Zoom Karaoke" + "title": "Why Do Lovers Break Each Other's Hearts" }, { "artist": "Showaddywaddy", @@ -275424,7 +275371,7 @@ "genre": "Karaoke", "guid": "50bc6b77-c2b2-bbca-522a-05ec0d55d2bd", "path": "z://MP4\\ZoomKaraokeOfficial\\Take That - SOS Karaoke Version from Zoom Karaoke.mp4", - "title": "S.O.S. - Karaoke Version from Zoom Karaoke" + "title": "S.O.S." 
}, { "artist": "Take That", @@ -281543,7 +281490,7 @@ "genre": "Karaoke", "guid": "75c896fe-77ad-690b-f358-dccbda4b5ae8", "path": "z://MP4\\ZoomKaraokeOfficial\\The Mavericks - Here Comes My Baby Karaoke Version from Zoom Karaoke.mp4", - "title": "Here Comes My Baby - Karaoke Version from Zoom Karaoke" + "title": "Here Comes My Baby" }, { "artist": "The Mavericks", @@ -284417,7 +284364,7 @@ "title": "You're No Good" }, { - "artist": "The Tamperer featuring Maya", + "artist": "The Tamperer ft. Maya", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -284426,7 +284373,7 @@ "title": "Feel It" }, { - "artist": "The Tamperer featuring Maya", + "artist": "The Tamperer ft. Maya", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -286475,7 +286422,7 @@ "genre": "Karaoke", "guid": "6e734302-c67e-cfff-c585-65e90e57c9a4", "path": "z://MP4\\ZoomKaraokeOfficial\\Tom Jones - Ill Never Fall In Love Again Karaoke Version from Zoom Karaoke.mp4", - "title": "I'll Never Fall In Love Again - Karaoke Version from Zoom Karaoke" + "title": "I'll Never Fall In Love Again" }, { "artist": "Tom Jones", @@ -288970,15 +288917,6 @@ "path": "z://MP4\\ZoomKaraokeOfficial\\Wendy Moten - Come In Out Of The Rain.mp4", "title": "Come In Out Of The Rain" }, - { - "artist": "Werewolves Of London/All Summer Long/Sweet Home Alabama Medley", - "disabled": false, - "favorite": false, - "genre": "Karaoke", - "guid": "fec61e62-37f3-7d62-f527-aa58a8a9cb45", - "path": "z://MP4\\ZoomKaraokeOfficial\\Werewolves Of LondonAll Summer LongSweet Home Alabama Medley - Karaoke Version from Zoom Karaoke.mp4", - "title": "Karaoke Version from Zoom Karaoke" - }, { "artist": "West End ft. 
Sybil", "disabled": false, @@ -306804,7 +306742,7 @@ "title": "Metal Postcard" }, { - "artist": "Lovato, Demi & Joe Jonas", + "artist": "Lavato, Demi & Joe Jonas", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -308673,7 +308611,7 @@ "title": "I Run To You" }, { - "artist": "Lovato, Demi & Joe Jonas", + "artist": "Lavato, Demi & Joe Jonas", "disabled": false, "favorite": false, "genre": "Karaoke", @@ -358620,7 +358558,7 @@ "title": "Green Garden" }, { - "artist": "Little Mix", + "artist": "Little M!x", "disabled": false, "favorite": false, "guid": "c8a57b4d-070d-0569-f178-edc96ff7b64f", diff --git a/src/api/database.py b/src/api/database.py index e05a789..81e4069 100644 --- a/src/api/database.py +++ b/src/api/database.py @@ -348,7 +348,7 @@ class MusicBrainzDatabase: best_recording_count = artist['recording_count'] print(f" 🎯 New best match: {artist['name']} (score: {score}, recordings: {artist['recording_count']})") - if best_score >= 80: + if best_score >= 70: # Lower threshold for better matching print(SUCCESS_MESSAGES['fuzzy_match_found'].format( original=artist_name, matched=best_match['name'], @@ -413,7 +413,7 @@ class MusicBrainzDatabase: best_score = score best_match = recording - if best_score >= 80: + if best_score >= 70: # Lower threshold for better matching return (best_match['name'], best_match['gid'], best_score / 100.0) else: # No artist constraint - search by title only with all variations @@ -447,7 +447,7 @@ class MusicBrainzDatabase: best_score = score best_match = recording - if best_score >= 80: + if best_score >= 70: # Lower threshold for better matching return (best_match['name'], best_match['gid'], best_score / 100.0) return None @@ -608,15 +608,30 @@ class MusicBrainzDatabase: import re # Primary collaboration indicators - primary_patterns = ['ft.', 'feat.', 'featuring'] + primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat'] # Secondary collaboration indicators (need more careful handling) - secondary_patterns = ['&', 
'and'] + secondary_patterns = ['&', 'and', ','] # Check if this is a collaboration is_collaboration = False split_pattern = None + # Special case: Handle malformed artist names like "ft Jamie Foxx West, Kanye" + # This should be "Kanye West ft. Jamie Foxx" + if artist_string.lower().startswith(('ft ', 'feat ')): + # This is a malformed collaboration string + # Try to extract the actual artists from the rest + remaining = artist_string[artist_string.find(' ') + 1:].strip() + if ',' in remaining: + # Split on comma and reverse the order + parts = [part.strip() for part in remaining.split(',')] + if len(parts) >= 2: + # Assume the last part is the main artist + main_artist = parts[-1].strip() + collaborators = parts[:-1] + return (main_artist, collaborators) + for pattern in primary_patterns: if pattern.lower() in artist_string.lower(): is_collaboration = True @@ -632,66 +647,80 @@ class MusicBrainzDatabase: # If no primary collaboration found, check secondary patterns if not is_collaboration: for pattern in secondary_patterns: - # Use word boundaries to avoid splitting within words like "Orlando" - import re - pattern_regex = r'\b' + re.escape(pattern) + r'\b' - if re.search(pattern_regex, artist_string, re.IGNORECASE): - # For secondary patterns, be more careful - # Check if this looks like a band name vs collaboration - parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) - if len(parts) == 2: - part1 = parts[0].strip() - part2 = parts[1].strip() - + if pattern == ',': + # Handle comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj") + if ',' in artist_string: + # Count commas to determine if this is likely a collaboration + comma_count = artist_string.count(',') + if comma_count >= 1: + # Split on comma and treat as collaboration + parts = [part.strip() for part in artist_string.split(',')] + if len(parts) >= 2: + # First artist is main, rest are collaborators + main_artist = parts[0] + collaborators = parts[1:] + return (main_artist, 
collaborators) + else: + # Use word boundaries to avoid splitting within words like "Orlando" + import re + pattern_regex = r'\b' + re.escape(pattern) + r'\b' + if re.search(pattern_regex, artist_string, re.IGNORECASE): + # For secondary patterns, be more careful # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - part1_words = len(part1.split()) - part2_words = len(part2.split()) - - # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - - # If one part has a comma, it's likely a collaboration - if ',' in part1 or ',' in part2: + parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) + if len(parts) == 2: + part1 = parts[0].strip() + part2 = parts[1].strip() + + # Check if this looks like a band name vs collaboration + # Band names typically have simple, short parts + # Collaborations often have more complex artist names + part1_words = len(part1.split()) + part2_words = len(part2.split()) + + # Check if this looks like a band name vs collaboration + # Band names typically have simple, short parts + # Collaborations often have more complex artist names + + # If one part has a comma, it's likely a collaboration + if ',' in part1 or ',' in part2: + is_collaboration = True + split_pattern = pattern + break + + # For "&" and "and" separators, check against known band names + # Load band names from JSON file + try: + import json + import os + json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json') + with open(json_path, 'r') as f: + known_data = json.load(f) + known_band_names = {name.lower() for name in known_data.get('groups', [])} + except (FileNotFoundError, json.JSONDecodeError, KeyError): + # Fallback to a minimal list if file can't be loaded + known_band_names = { + "gerry and the pacemakers", 
"simon & garfunkel", "hall & oates", + "brooks & dunn", "the everly brothers" + } + + if artist_string.lower() in known_band_names: + # This is a known band name, not a collaboration + continue + + # For "and" separators, be more conservative + # Many band names use "and" (e.g., "Gerry and the Pacemakers") + if pattern.lower() == 'and': + # Only treat as collaboration if it looks like individual artists + # Band names often have "the" in the second part + if 'the ' in part2.lower(): + # Likely a band name, skip + continue + + # Likely a collaboration is_collaboration = True split_pattern = pattern break - - # For "&" and "and" separators, check against known band names - # Load band names from JSON file - try: - import json - import os - json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json') - with open(json_path, 'r') as f: - known_data = json.load(f) - known_band_names = {name.lower() for name in known_data.get('groups', [])} - except (FileNotFoundError, json.JSONDecodeError, KeyError): - # Fallback to a minimal list if file can't be loaded - known_band_names = { - "gerry and the pacemakers", "simon & garfunkel", "hall & oates", - "brooks & dunn", "the everly brothers" - } - - if artist_string.lower() in known_band_names: - # This is a known band name, not a collaboration - continue - - # For "and" separators, be more conservative - # Many band names use "and" (e.g., "Gerry and the Pacemakers") - if pattern.lower() == 'and': - # Only treat as collaboration if it looks like individual artists - # Band names often have "the" in the second part - if 'the ' in part2.lower(): - # Likely a band name, skip - continue - - # Likely a collaboration - is_collaboration = True - split_pattern = pattern - break if not is_collaboration: return (artist_string, []) @@ -711,45 +740,152 @@ class MusicBrainzDatabase: def _generate_title_variations(self, title: str) -> List[str]: """ - Generate title variations by removing parenthetical content. 
+ Generate title variations by removing parenthetical content and fixing common issues. Returns list of title variations to try. """ import re search_titles = [title.strip()] - # Remove complete parentheses (content) + # Fix common typos and missing apostrophes + title_fixes = title.strip() + + # Fix missing apostrophes in common contractions + apostrophe_fixes = [ + (r'\bDont\b', "Don't"), + (r'\bCant\b', "Can't"), + (r'\bWont\b', "Won't"), + (r'\bArent\b', "Aren't"), + (r'\bIsnt\b', "Isn't"), + (r'\bWasnt\b', "Wasn't"), + (r'\bDidnt\b', "Didn't"), + (r'\bDoesnt\b', "Doesn't"), + (r'\bHavent\b', "Haven't"), + (r'\bHasnt\b', "Hasn't"), + (r'\bWouldnt\b', "Wouldn't"), + (r'\bCouldnt\b', "Couldn't"), + (r'\bShouldnt\b', "Shouldn't"), + (r'\bPhunk\b', "Funk"), # Common typo + (r'\bBout\b', "About"), # Shortened form + ] + + for pattern, replacement in apostrophe_fixes: + fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE) + if fixed_title != title_fixes: + title_fixes = fixed_title + if title_fixes not in search_titles: + search_titles.append(title_fixes) + + # Comprehensive parentheses removal - try multiple approaches + # 1. Remove all complete parentheses (most aggressive) clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) clean_title = clean_title.strip() if clean_title != title.strip() and clean_title: search_titles.append(clean_title) - # Remove unmatched opening parenthesis at end + # 2. 
Remove specific common patterns first, then general parentheses + specific_patterns = [ + r'\s*\(Karaoke Version\)', + r'\s*\(Karaoke\)', + r'\s*\(Instrumental\)', + r'\s*\(Backing Track\)', + r'\s*\(live [^)]*\)', + r'\s*\(Live [^)]*\)', + r'\s*\(Acoustic\)', + r'\s*\(acoustic\)', + r'\s*\(Without Backing Vocals\)', + r'\s*\(Without Backing Vocals\)', + r'\s*\(Clean\)', + r'\s*\(clean\)', + r'\s*\(Remix\)', + r'\s*\(remix\)', + r'\s*\(Radio Edit\)', + r'\s*\(radio edit\)', + r'\s*\(Extended Mix\)', + r'\s*\(extended mix\)', + r'\s*\(Single Version\)', + r'\s*\(single version\)', + r'\s*\(Album Version\)', + r'\s*\(album version\)', + r'\s*\(Original Mix\)', + r'\s*\(original mix\)', + r'\s*\(John Lewis Christmas Ad \d+\)', # Specific pattern from test + r'\s*\(from the movie [^)]*\)', + r'\s*\(from the [^)]*\)', + r'\s*\(feat\. [^)]*\)', + r'\s*\(featuring [^)]*\)', + r'\s*\(ft\. [^)]*\)', + r'\s*\(duet\)', + r'\s*\(Duet\)', + r'\s*\(Two Semitones Down\)', + r'\s*\(Minus Piano\)', + r'\s*\(Cut Down\)', + r'\s*\(Boone & Speedy Vocals\)', + r'\s*\(My Heart Belongs to You\)', + ] + + # 3. 
Remove dash-separated content (like "Live At the BBC") + dash_patterns = [ + r'\s*-\s*Live [^-]*$', + r'\s*-\s*live [^-]*$', + r'\s*-\s*Live At [^-]*$', + r'\s*-\s*Live At the [^-]*$', + r'\s*-\s*Live At the BBC$', + r'\s*-\s*Live From [^-]*$', + r'\s*-\s*Live In [^-]*$', + r'\s*-\s*Live On [^-]*$', + ] + + # Apply specific patterns first + for pattern in specific_patterns: + specific_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + specific_clean = specific_clean.strip() + if specific_clean != title.strip() and specific_clean and specific_clean not in search_titles: + search_titles.append(specific_clean) + + # Apply dash patterns + for pattern in dash_patterns: + dash_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + dash_clean = dash_clean.strip() + if dash_clean != title.strip() and dash_clean and dash_clean not in search_titles: + search_titles.append(dash_clean) + + # 4. Remove any remaining parentheses after specific patterns + for pattern in specific_patterns: + remaining_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + remaining_clean = re.sub(r'\s*\([^)]*\)', '', remaining_clean.strip()) + remaining_clean = remaining_clean.strip() + if remaining_clean != title.strip() and remaining_clean and remaining_clean not in search_titles: + search_titles.append(remaining_clean) + + # 5. Remove unmatched opening parenthesis at end clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip()) clean_title2 = clean_title2.strip() if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles: search_titles.append(clean_title2) - # Remove unmatched closing parenthesis at start + # 6. 
Remove unmatched closing parenthesis at start clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip()) clean_title3 = clean_title3.strip() if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles: search_titles.append(clean_title3) - # Also try with specific karaoke patterns removed - karaoke_patterns = [ - r'\s*\(Karaoke Version\)', - r'\s*\(Karaoke\)', - r'\s*\(Instrumental\)', - r'\s*\(Backing Track\)', - ] - for pattern in karaoke_patterns: - karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - karaoke_clean = karaoke_clean.strip() - if karaoke_clean != title.strip() and karaoke_clean not in search_titles: - search_titles.append(karaoke_clean) + # 7. Try removing extra spaces and normalizing + normalized_title = re.sub(r'\s+', ' ', title.strip()) + if normalized_title != title.strip() and normalized_title not in search_titles: + search_titles.append(normalized_title) - return search_titles + # 8. Apply normalization to all cleaned versions and remove duplicates + normalized_versions = [] + for version in search_titles: + # Normalize spaces (replace multiple spaces with single space) + normalized = re.sub(r'\s+', ' ', version.strip()) + # Remove leading/trailing spaces + normalized = normalized.strip() + if normalized and normalized not in normalized_versions: + normalized_versions.append(normalized) + + return normalized_versions def _parse_collaborators(self, collaborators_string: str) -> List[str]: """ diff --git a/src/cli/main.py b/src/cli/main.py index 1d88bdd..27ad0ff 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -10,6 +10,7 @@ import time import re from pathlib import Path from typing import Dict, Optional, Any, Tuple, List +from datetime import datetime # Import constants from ..config.constants import ( @@ -170,10 +171,10 @@ class MusicBrainzCleaner: import re # Primary collaboration indicators - primary_patterns = ['ft.', 'feat.', 'featuring'] + primary_patterns = ['ft.', 'feat.', 'featuring', 
'ft', 'feat'] # Secondary collaboration indicators (need more careful handling) - secondary_patterns = ['&', 'and'] + secondary_patterns = ['&', 'and', ','] # Check if this is a collaboration is_collaboration = False @@ -366,99 +367,188 @@ class MusicBrainzCleaner: return song, False - def clean_songs_file(self, input_file: Path, output_file: Optional[Path] = None, limit: Optional[int] = None) -> Tuple[Path, List[Dict]]: - try: - # Read input file - with open(input_file, 'r', encoding='utf-8') as f: - songs = json.load(f) - - if not isinstance(songs, list): - print("Error: Input file should contain a JSON array of songs") - return input_file, [] - - # Apply limit if specified - if limit is not None: - songs = songs[:limit] - print(f"⚠️ Limiting processing to first {limit} songs") - - # Determine output path - if output_file is None: - output_file = input_file.parent / f"{input_file.stem}_cleaned.json" - - print(f"Processing {len(songs)} songs...") - print(f"Using {'database' if self.use_database else 'API'} connection") - print(PROGRESS_SEPARATOR) - - # Clean each song - cleaned_songs = [] - failed_songs = [] - success_count = 0 - fail_count = 0 - - for i, song in enumerate(songs, 1): - cleaned_song, success = self.clean_song(song) - cleaned_songs.append(cleaned_song) + def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]: + """ + Process songs from source file and save successful and failed songs to separate files. + This is the main processing method that handles full dataset processing by default. 
+ """ + if not source_file.exists(): + print(f'❌ Source file not found: {source_file}') + return {} + + print('🚀 Starting song processing...') + + # Load songs + with open(source_file, 'r') as f: + all_songs = json.load(f) + + if not isinstance(all_songs, list): + print("Error: Source file should contain a JSON array of songs") + return {} + + # Apply limit if specified + if limit is not None: + all_songs = all_songs[:limit] + print(f"⚠️ Limiting processing to first {limit} songs") + + total_songs = len(all_songs) + print(f'📊 Total songs to process: {total_songs:,}') + print(f'Using {"database" if self.use_database else "API"} connection') + print(PROGRESS_SEPARATOR) + + # Initialize arrays for batch processing + successful_songs = [] + failed_songs = [] + + # Statistics tracking + stats = { + 'total_processed': 0, + 'artists_found': 0, + 'recordings_found': 0, + 'start_time': time.time() + } + + # Process each song + for i, song in enumerate(all_songs, 1): + try: + result = self.clean_song(song) + cleaned_song, success = result - if success: - success_count += 1 - print(f"[{i}/{len(songs)}] ✅ PASS") + artist_found = 'mbid' in cleaned_song + recording_found = 'recording_mbid' in cleaned_song + + # Display progress with counter and status + artist_name = song.get('artist', 'Unknown') + title = song.get('title', 'Unknown') + + if artist_found and recording_found: + stats['artists_found'] += 1 + stats['recordings_found'] += 1 + successful_songs.append(cleaned_song) + print(f'[{i:,} of {total_songs:,}] ✅ PASS: {artist_name} - {title}') else: - fail_count += 1 - print(f"[{i}/{len(songs)}] ❌ FAIL") - # Store failed song info for report - failed_songs.append({ - 'index': i, - 'original_artist': song.get('artist', ''), - 'original_title': song.get('title', ''), - 'cleaned_artist': cleaned_song.get('artist', ''), - 'cleaned_title': cleaned_song.get('title', ''), - 'has_mbid': 'mbid' in cleaned_song, - 'has_recording_mbid': 'recording_mbid' in cleaned_song - }) + # 
Keep the original song in failed_songs array (same format as source) + failed_songs.append(song) + print(f'[{i:,} of {total_songs:,}] ❌ FAIL: {artist_name} - {title}') - # Only add delay for API calls, not database queries - if not self.use_database: - time.sleep(API_REQUEST_DELAY) + stats['total_processed'] += 1 + + # Progress update every 100 songs + if i % 100 == 0: + elapsed = time.time() - stats['start_time'] + rate = i / elapsed if elapsed > 0 else 0 + success_rate = (stats['artists_found'] / i * 100) if i > 0 else 0 + print(f' 📈 Progress: {i:,}/{total_songs:,} ({i/total_songs*100:.1f}%) - ' + f'Success: {success_rate:.1f}% - Rate: {rate:.1f} songs/sec') + + except Exception as e: + print(f' ❌ Error processing song {i}: {e}') + # Keep the original song in failed_songs array + failed_songs.append(song) + stats['total_processed'] += 1 - # Write output file - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(cleaned_songs, f, indent=2, ensure_ascii=False) + # Only add delay for API calls, not database queries + if not self.use_database: + time.sleep(API_REQUEST_DELAY) + + # Determine output file paths + if output_success is None: + output_success = source_file.parent / f"{source_file.stem}-success.json" + if output_failure is None: + output_failure = source_file.parent / f"{source_file.stem}-failure.json" + + # Save successful songs (array format, same as source) + with open(output_success, 'w', encoding='utf-8') as f: + json.dump(successful_songs, f, indent=2, ensure_ascii=False) + + # Save failed songs (array format, same as source) + with open(output_failure, 'w', encoding='utf-8') as f: + json.dump(failed_songs, f, indent=2, ensure_ascii=False) + + # Calculate final statistics + total_time = time.time() - stats['start_time'] + + # Create human-readable text report + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + report_file = source_file.parent / f"processing_report_{timestamp}.txt" + + with open(report_file, 'w', encoding='utf-8') 
as f: + f.write("MusicBrainz Data Cleaner - Processing Report\n") + f.write("=" * 50 + "\n\n") + f.write(f"Source File: {source_file}\n") + f.write(f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write(f"Processing Time: {total_time:.1f} seconds\n\n") - # Generate failure report - report_file = input_file.parent / f"{input_file.stem}_failure_report.json" - with open(report_file, 'w', encoding='utf-8') as f: - json.dump({ - 'summary': { - 'total_songs': len(songs), - 'successful': success_count, - 'failed': fail_count, - 'success_rate': f"{(success_count/len(songs)*100):.1f}%" - }, - 'failed_songs': failed_songs - }, f, indent=2, ensure_ascii=False) + f.write("SUMMARY\n") + f.write("-" * 20 + "\n") + f.write(f"Total Songs Processed: {stats['total_processed']:,}\n") + f.write(f"Successful Songs: {len(successful_songs):,}\n") + f.write(f"Failed Songs: {len(failed_songs):,}\n") + f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n") - print(f"\n{PROGRESS_SEPARATOR}") - print(f"✅ SUCCESS: {success_count} songs") - print(f"❌ FAILED: {fail_count} songs") - print(f"📊 SUCCESS RATE: {(success_count/len(songs)*100):.1f}%") - print(f"💾 CLEANED DATA: {output_file}") - print(f"📋 FAILURE REPORT: {report_file}") + f.write("DETAILED STATISTICS\n") + f.write("-" * 20 + "\n") + f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n") + f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n") + f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n") - return output_file, failed_songs + f.write("OUTPUT FILES\n") + f.write("-" * 20 + "\n") + f.write(f"Successful Songs: {output_success}\n") + f.write(f"Failed Songs: {output_failure}\n") + f.write(f"Report File: {report_file}\n\n") - except 
FileNotFoundError: - print(f"Error: File '{input_file}' not found") - return input_file, [] - except json.JSONDecodeError: - print(f"Error: Invalid JSON in file '{input_file}'") - return input_file, [] - except Exception as e: - print(f"Error processing file: {e}") - return input_file, [] - finally: - # Clean up database connection - if self.use_database and hasattr(self, 'db'): - self.db.disconnect() + if len(failed_songs) > 0: + f.write("FAILED SONGS (First 50)\n") + f.write("-" * 20 + "\n") + for i, song in enumerate(failed_songs[:50], 1): + artist = song.get('artist', 'Unknown') + title = song.get('title', 'Unknown') + f.write(f"{i:3d}. {artist} - {title}\n") + + if len(failed_songs) > 50: + f.write(f"... and {len(failed_songs) - 50} more failed songs\n") + + # Save detailed JSON report for programmatic access + json_report_file = source_file.parent / f"processing_report_{timestamp}.json" + final_stats = { + 'summary': { + 'total_tested': stats['total_processed'], + 'artists_found': stats['artists_found'], + 'recordings_found': stats['recordings_found'], + 'failed_count': len(failed_songs), + 'artist_success_rate': (stats['artists_found'] / stats['total_processed'] * 100) if stats['total_processed'] > 0 else 0, + 'recording_success_rate': (stats['recordings_found'] / stats['total_processed'] * 100) if stats['total_processed'] > 0 else 0, + 'processing_time_seconds': total_time, + 'songs_per_second': stats['total_processed'] / total_time if total_time > 0 else 0 + }, + 'files': { + 'source': str(source_file), + 'successful_songs': str(output_success), + 'failed_songs': str(output_failure), + 'text_report': str(report_file), + 'json_report': str(json_report_file) + } + } + + with open(json_report_file, 'w') as f: + json.dump(final_stats, f, indent=2) + + print(f'\n{PROGRESS_SEPARATOR}') + print(f'🎉 Processing completed!') + print(f'📊 Final Results:') + print(f' ⏱️ Total processing time: {total_time:.1f} seconds') + print(f' 🚀 Average speed: 
{stats["total_processed"] / total_time:.1f} songs/second') + print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)') + print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)') + print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)') + print(f'📄 Files saved:') + print(f' ✅ Successful songs: {output_success}') + print(f' ❌ Failed songs: {output_failure}') + print(f' 📋 Text report: {report_file}') + print(f' 📊 JSON report: {json_report_file}') + + return final_stats def print_help() -> None: @@ -466,25 +556,36 @@ def print_help() -> None: MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz USAGE: - musicbrainz-cleaner [output_file.json] [options] - -ARGUMENTS: - input_file.json JSON file containing array of song objects - output_file.json Optional: Output file for cleaned data + musicbrainz-cleaner [options] OPTIONS: - --help, -h Show this help message - --version, -v Show version information - --test-connection Test connection to MusicBrainz server - --limit N Process only the first N songs (for testing) - --use-api Force use of HTTP API instead of direct database access + --source FILE Source JSON file (default: data/songs.json) + --output-success FILE Output file for successful songs (default: source-success.json) + --output-failure FILE Output file for failed songs (default: source-failure.json) + --limit N Process only the first N songs (default: all songs) + --use-api Force use of HTTP API instead of direct database access + --test-connection Test connection to MusicBrainz server + --help, -h Show this help message + --version, -v Show version information EXAMPLES: - musicbrainz-cleaner songs.json - musicbrainz-cleaner songs.json cleaned_songs.json + # Process all songs with default settings + 
musicbrainz-cleaner + + # Process specific file + musicbrainz-cleaner --source data/my_songs.json + + # Process with custom output files + musicbrainz-cleaner --source data/songs.json --output-success cleaned.json --output-failure failed.json + + # Process only first 1000 songs + musicbrainz-cleaner --limit 1000 + + # Test connection musicbrainz-cleaner --test-connection - musicbrainz-cleaner songs.json --limit 5 - musicbrainz-cleaner songs.json --use-api + + # Force API mode + musicbrainz-cleaner --use-api REQUIREMENTS: - MusicBrainz server running on http://localhost:5001 @@ -501,12 +602,14 @@ PERFORMANCE: def print_version() -> None: version_info = """ -MusicBrainz Data Cleaner v2.0.0 +MusicBrainz Data Cleaner v3.0.0 Enhanced with: - Direct PostgreSQL database access - Fuzzy search for better matching - Improved performance and accuracy +- Separate output files for successful and failed songs +- Detailed progress tracking and reporting Copyright (c) 2024 MusicBrainz Data Cleaner Contributors MIT License - see LICENSE file for details @@ -516,35 +619,89 @@ Built with Python 3.6+ print(version_info) +def parse_arguments(args: List[str]) -> Dict[str, Any]: + """Parse command line arguments into a dictionary""" + parsed = { + 'source': 'data/songs.json', + 'output_success': None, + 'output_failure': None, + 'limit': None, + 'use_api': False, + 'test_connection': False, + 'help': False, + 'version': False + } + + i = 0 + while i < len(args): + arg = args[i] + + if arg in ['--help', '-h', 'help']: + parsed['help'] = True + elif arg in ['--version', '-v', 'version']: + parsed['version'] = True + elif arg == '--test-connection': + parsed['test_connection'] = True + elif arg == '--use-api': + parsed['use_api'] = True + elif arg == '--source': + if i + 1 < len(args) and not args[i + 1].startswith('--'): + parsed['source'] = args[i + 1] + i += 1 + else: + print("Error: --source requires a file path") + sys.exit(ExitCode.USAGE_ERROR) + elif arg == '--output-success': + if i 
+ 1 < len(args) and not args[i + 1].startswith('--'): + parsed['output_success'] = args[i + 1] + i += 1 + else: + print("Error: --output-success requires a file path") + sys.exit(ExitCode.USAGE_ERROR) + elif arg == '--output-failure': + if i + 1 < len(args) and not args[i + 1].startswith('--'): + parsed['output_failure'] = args[i + 1] + i += 1 + else: + print("Error: --output-failure requires a file path") + sys.exit(ExitCode.USAGE_ERROR) + elif arg == '--limit': + if i + 1 < len(args) and not args[i + 1].startswith('--'): + try: + parsed['limit'] = int(args[i + 1]) + if parsed['limit'] <= 0: + print("Error: --limit must be a positive number") + sys.exit(ExitCode.USAGE_ERROR) + except ValueError: + print("Error: --limit requires a valid number") + sys.exit(ExitCode.USAGE_ERROR) + i += 1 + else: + print("Error: --limit requires a number") + sys.exit(ExitCode.USAGE_ERROR) + + i += 1 + + return parsed + + def main() -> int: try: args = sys.argv[1:] + parsed = parse_arguments(args) # Handle help and version flags - if not args or args[0] in ['--help', '-h', 'help']: + if parsed['help']: print_help() return ExitCode.SUCCESS - if args[0] in ['--version', '-v', 'version']: + if parsed['version']: print_version() return ExitCode.SUCCESS - # Check for API flag - use_database = '--use-api' not in args - if not use_database: - print("⚠️ Using HTTP API mode (slower than database access)") - # Handle test connection - if args[0] == '--test-connection': - if use_database: - db = MusicBrainzDatabase() - if db.test_connection(): - print("✅ Connection to MusicBrainz database successful") - return ExitCode.SUCCESS - else: - print("❌ Connection to MusicBrainz database failed") - return ExitCode.ERROR - else: + if parsed['test_connection']: + if parsed['use_api']: api = MusicBrainzAPIClient() if api.test_connection(): print("✅ Connection to MusicBrainz API server successful") @@ -552,10 +709,7 @@ def main() -> int: else: print("❌ Connection to MusicBrainz API server failed") 
return ExitCode.ERROR - - # Check for test connection flag in any position - if '--test-connection' in args: - if use_database: + else: db = MusicBrainzDatabase() if db.test_connection(): print("✅ Connection to MusicBrainz database successful") @@ -563,73 +717,27 @@ def main() -> int: else: print("❌ Connection to MusicBrainz database failed") return ExitCode.ERROR - else: - api = MusicBrainzAPIClient() - if api.test_connection(): - print("✅ Connection to MusicBrainz API server successful") - return ExitCode.SUCCESS - else: - print("❌ Connection to MusicBrainz API server failed") - return ExitCode.ERROR - # Validate input file - if not args: - print("Error: Input file is required") - print("Use --help for usage information") + # Process songs (main functionality) + source_file = Path(parsed['source']) + output_success = Path(parsed['output_success']) if parsed['output_success'] else None + output_failure = Path(parsed['output_failure']) if parsed['output_failure'] else None + + if not source_file.exists(): + print(f"Error: Source file does not exist: {source_file}") return ExitCode.USAGE_ERROR - # Parse limit argument and remove it from args - limit = None - args_to_remove = [] - for i, arg in enumerate(args): - if arg == '--limit': - if i + 1 < len(args) and not args[i + 1].startswith('--'): - try: - limit = int(args[i + 1]) - if limit <= 0: - print("Error: Limit must be a positive number") - return ExitCode.USAGE_ERROR - args_to_remove.extend([i, i + 1]) - except ValueError: - print("Error: --limit requires a valid number") - return ExitCode.USAGE_ERROR - else: - print("Error: --limit requires a number") - return ExitCode.USAGE_ERROR - - # Remove limit arguments and API flag from args - for index in reversed(args_to_remove): - args.pop(index) - - # Remove API flag - args = [arg for arg in args if arg != '--use-api'] - - # Filter out remaining flags to get file arguments - file_args = [arg for arg in args if not arg.startswith('--')] - - if not file_args: - 
print("Error: Input file is required") - print("Use --help for usage information") + if not source_file.is_file(): + print(f"Error: Source path is not a file: {source_file}") return ExitCode.USAGE_ERROR - input_file = Path(file_args[0]) - output_file = Path(file_args[1]) if len(file_args) > 1 else None - - if not input_file.exists(): - print(f"Error: Input file does not exist: {input_file}") - return ExitCode.USAGE_ERROR - - if not input_file.is_file(): - print(f"Error: Input path is not a file: {input_file}") - return ExitCode.USAGE_ERROR - - if input_file.suffix.lower() != '.json': - print(f"Error: Input file must be a JSON file: {input_file}") + if source_file.suffix.lower() != '.json': + print(f"Error: Source file must be a JSON file: {source_file}") return ExitCode.USAGE_ERROR # Process the file - cleaner = MusicBrainzCleaner(use_database=use_database) - result_path, failed_songs = cleaner.clean_songs_file(input_file, output_file, limit) + cleaner = MusicBrainzCleaner(use_database=not parsed['use_api']) + cleaner.process_songs(source_file, output_success, output_failure, parsed['limit']) return ExitCode.SUCCESS