#!/usr/bin/env python3
"""
Command-line interface for MusicBrainz Data Cleaner.
Now uses direct database access with fuzzy search for better performance.
"""

import sys
import json
import time
from pathlib import Path
from typing import Dict, Optional, Any, Tuple, List
from datetime import datetime

# Import constants
from ..config.constants import (
    DEFAULT_MUSICBRAINZ_URL,
    API_REQUEST_DELAY,
    PROGRESS_SEPARATOR,
    SUCCESS_MESSAGES,
    ERROR_MESSAGES,
    ExitCode
)

# Import core components
from ..core.song_processor import SongProcessor
from ..core.factory import DataProviderFactory


def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total`` (0 when ``total`` is not positive)."""
    return part / total * 100 if total > 0 else 0


def _song_label(song: Any) -> Tuple[Any, Any]:
    """Return ``(artist, title)`` for display, using 'Unknown' for missing fields.

    Entries that are not dictionaries (malformed input) yield
    ``('Unknown', 'Unknown')`` instead of raising.
    """
    if isinstance(song, dict):
        return song.get('artist', 'Unknown'), song.get('title', 'Unknown')
    return 'Unknown', 'Unknown'


def _build_text_report(source_file: Path, output_success: Path, output_failure: Path,
                       report_file: Path, stats: Dict[str, Any], success_count: int,
                       failed_songs: List[Any], total_time: float) -> str:
    """Build the human-readable processing report as a single string."""
    total = stats['total_processed']
    speed = total / total_time if total_time > 0 else 0
    rule = "-" * 20

    lines = [
        "MusicBrainz Data Cleaner - Processing Report",
        "=" * 50,
        "",
        f"Source File: {source_file}",
        f"Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Processing Time: {total_time:.1f} seconds",
        "",
        "SUMMARY",
        rule,
        f"Total Songs Processed: {total:,}",
        f"Successful Songs: {success_count:,}",
        f"Failed Songs: {len(failed_songs):,}",
        f"Success Rate: {_percent(success_count, total):.1f}%",
        "",
        "DETAILED STATISTICS",
        rule,
        f"Artists Found: {stats['artists_found']:,}/{total:,} "
        f"({_percent(stats['artists_found'], total):.1f}%)",
        f"Recordings Found: {stats['recordings_found']:,}/{total:,} "
        f"({_percent(stats['recordings_found'], total):.1f}%)",
        f"Processing Speed: {speed:.1f} songs/second",
        "",
        "OUTPUT FILES",
        rule,
        f"Successful Songs: {output_success}",
        f"Failed Songs: {output_failure}",
        f"Report File: {report_file}",
        "",
    ]

    if failed_songs:
        lines.append("FAILED SONGS (First 50)")
        lines.append(rule)
        for position, song in enumerate(failed_songs[:50], 1):
            artist, title = _song_label(song)
            lines.append(f"{position:3d}. {artist} - {title}")
        if len(failed_songs) > 50:
            lines.append(f"... and {len(failed_songs) - 50} more failed songs")

    return "\n".join(lines) + "\n"


class MusicBrainzCleaner:
    """Enhanced MusicBrainz Cleaner with interface-based data access."""

    def __init__(self, use_database: bool = True,
                 base_url: str = DEFAULT_MUSICBRAINZ_URL,
                 force: bool = False):
        """
        Create the data provider and the song processor.

        Args:
            use_database: Passed to the provider factory; also controls whether
                a delay is inserted between songs (only when False).
            base_url: Passed to the provider factory.
            force: When True, existing MBIDs are stripped before processing.
        """
        self.use_database = use_database
        self.base_url = base_url
        self.force = force

        # Create data provider using factory
        self.data_provider = DataProviderFactory.create_provider(use_database, base_url)

        # Initialize centralized song processor with the data provider
        self.song_processor = SongProcessor(self.data_provider)

    def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
        """
        Clean a single song using the centralized song processor.

        The input dictionary is shallow-copied first, so the caller's top-level
        keys are not modified here.

        Returns (cleaned_song, success_status)
        """
        song_copy = song.copy()

        # If force flag is set, remove ALL existing MBIDs to force complete reprocessing
        if self.force:
            song_copy.pop('mbid', None)
            song_copy.pop('recording_mbid', None)

        return self.song_processor.clean_song(song_copy)

    def process_songs(self, source_file: Path,
                      output_success: Optional[Path] = None,
                      output_failure: Optional[Path] = None,
                      limit: Optional[int] = None) -> Dict[str, Any]:
        """
        Process songs from source file and save successful and failed songs to separate files.
        This is the main processing method that handles full dataset processing by default.

        Args:
            source_file: JSON file containing an array of song objects.
            output_success: Destination for songs where both an artist MBID and
                a recording MBID are present after cleaning. Defaults to
                ``<source stem>-success.json`` next to the source file.
            output_failure: Destination for the original (uncleaned) songs that
                did not pass. Defaults to ``<source stem>-failure.json``.
            limit: If given, only the first ``limit`` songs are processed.

        Returns:
            A dictionary with ``summary`` and ``files`` sections, or an empty
            dictionary when the source file is missing, does not contain a JSON
            array, or contains no songs.
        """
        if not source_file.exists():
            print(f'❌ Source file not found: {source_file}')
            return {}

        print('🚀 Starting song processing...')

        # Load songs. The outputs are written as UTF-8, so read with the same
        # encoding rather than relying on the platform default.
        with open(source_file, 'r', encoding='utf-8') as f:
            all_songs = json.load(f)

        if not isinstance(all_songs, list):
            print("Error: Source file should contain a JSON array of songs")
            return {}

        # Apply limit if specified
        if limit is not None:
            all_songs = all_songs[:limit]
            print(f"⚠️ Limiting processing to first {limit} songs")

        total_songs = len(all_songs)
        print(f'📊 Total songs to process: {total_songs:,}')

        # Check if file is empty
        if total_songs == 0:
            print('⚠️ No songs to process - file is empty')
            return {}

        print(f'Using {"database" if self.use_database else "API"} connection')
        if self.force:
            print('🔄 Force mode enabled - ignoring all existing MBIDs')
        print(PROGRESS_SEPARATOR)

        # Initialize arrays for batch processing
        successful_songs = []
        failed_songs = []

        # Statistics tracking
        stats = {
            'total_processed': 0,
            'artists_found': 0,
            'recordings_found': 0,
            'start_time': time.time()
        }

        # Process each song
        for index, song in enumerate(all_songs, 1):
            artist_name, title = _song_label(song)

            # Keep the try body limited to the lookup itself so that a song is
            # never appended to failed_songs (or counted) twice.
            try:
                cleaned_song, _ = self.clean_song(song)
                artist_found = 'mbid' in cleaned_song
                recording_found = 'recording_mbid' in cleaned_song
            except Exception as e:
                # Best-effort batch run: one bad song must not abort the rest.
                print(f' ❌ Error processing song {index}: {e}')
                # Keep the original song in failed_songs array
                failed_songs.append(song)
            else:
                # Count artist and recording matches independently; a song
                # only passes when both are present.
                if artist_found:
                    stats['artists_found'] += 1
                if recording_found:
                    stats['recordings_found'] += 1

                if artist_found and recording_found:
                    successful_songs.append(cleaned_song)
                    print(f'[{index:,} of {total_songs:,}] ✅ PASS: {artist_name} - {title}')
                else:
                    # Keep the original song in failed_songs array (same format as source)
                    failed_songs.append(song)
                    print(f'[{index:,} of {total_songs:,}] ❌ FAIL: {artist_name} - {title}')

            stats['total_processed'] += 1

            # Progress update every 100 songs
            if index % 100 == 0:
                elapsed = time.time() - stats['start_time']
                rate = index / elapsed if elapsed > 0 else 0
                success_rate = _percent(len(successful_songs), index)
                print(f' 📈 Progress: {index:,}/{total_songs:,} ({index/total_songs*100:.1f}%) - '
                      f'Success: {success_rate:.1f}% - Rate: {rate:.1f} songs/sec')

            # Only add delay for API calls, not database queries
            if not self.use_database:
                time.sleep(API_REQUEST_DELAY)

        # Determine output file paths
        if output_success is None:
            output_success = source_file.parent / f"{source_file.stem}-success.json"
        if output_failure is None:
            output_failure = source_file.parent / f"{source_file.stem}-failure.json"

        # Save successful songs (array format, same as source)
        with open(output_success, 'w', encoding='utf-8') as f:
            json.dump(successful_songs, f, indent=2, ensure_ascii=False)

        # Save failed songs (array format, same as source)
        with open(output_failure, 'w', encoding='utf-8') as f:
            json.dump(failed_songs, f, indent=2, ensure_ascii=False)

        # Calculate final statistics
        total_time = time.time() - stats['start_time']
        total_processed = stats['total_processed']
        songs_per_second = total_processed / total_time if total_time > 0 else 0
        artist_rate = _percent(stats['artists_found'], total_processed)
        recording_rate = _percent(stats['recordings_found'], total_processed)
        failure_rate = _percent(len(failed_songs), total_processed)

        # Create human-readable text report
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = source_file.parent / f"processing_report_{timestamp}.txt"
        report_text = _build_text_report(
            source_file, output_success, output_failure, report_file,
            stats, len(successful_songs), failed_songs, total_time
        )
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_text)

        # Save detailed JSON report for programmatic access
        json_report_file = source_file.parent / f"processing_report_{timestamp}.json"
        final_stats = {
            'summary': {
                'total_tested': total_processed,
                'artists_found': stats['artists_found'],
                'recordings_found': stats['recordings_found'],
                'failed_count': len(failed_songs),
                'artist_success_rate': artist_rate,
                'recording_success_rate': recording_rate,
                'processing_time_seconds': total_time,
                'songs_per_second': songs_per_second
            },
            'files': {
                'source': str(source_file),
                'successful_songs': str(output_success),
                'failed_songs': str(output_failure),
                'text_report': str(report_file),
                'json_report': str(json_report_file)
            }
        }

        with open(json_report_file, 'w', encoding='utf-8') as f:
            json.dump(final_stats, f, indent=2)

        print(f'\n{PROGRESS_SEPARATOR}')
        print('🎉 Processing completed!')
        print('📊 Final Results:')
        print(f' ⏱️ Total processing time: {total_time:.1f} seconds')
        print(f' 🚀 Average speed: {songs_per_second:.1f} songs/second')
        print(f' ✅ Artists found: {stats["artists_found"]:,}/{total_processed:,} '
              f'({artist_rate:.1f}%)')
        print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{total_processed:,} '
              f'({recording_rate:.1f}%)')
        print(f' ❌ Failed songs: {len(failed_songs):,} ({failure_rate:.1f}%)')
        print('📄 Files saved:')
        print(f' ✅ Successful songs: {output_success}')
        print(f' ❌ Failed songs: {output_failure}')
        print(f' 📋 Text report: {report_file}')
        print(f' 📊 JSON report: {json_report_file}')

        return final_stats


def print_help() -> None:
    """Print the command-line usage text to standard output."""
    help_text = """
MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz

USAGE:
    musicbrainz-cleaner [options]

OPTIONS:
    --source FILE           Source JSON file (default: data/songs.json)
    --output-success FILE   Output file for successful songs (default: source-success.json)
    --output-failure FILE   Output file for failed songs (default: source-failure.json)
    --limit N               Process only the first N songs (default: all songs)
    --use-api               Force use of HTTP API instead of direct database access
    --force                 Force reprocessing, ignoring all existing MBIDs
    --test-connection       Test connection to MusicBrainz server
    --help, -h              Show this help message
    --version, -v           Show version information

EXAMPLES:
    # Process all songs with default settings
    musicbrainz-cleaner

    # Process specific file
    musicbrainz-cleaner --source data/my_songs.json

    # Process with custom output files
    musicbrainz-cleaner --source data/songs.json --output-success cleaned.json --output-failure failed.json

    # Process only first 1000 songs
    musicbrainz-cleaner --limit 1000

    # Test connection
    musicbrainz-cleaner --test-connection

    # Force API mode
    musicbrainz-cleaner --use-api

    # Force reprocessing (ignore all existing MBIDs)
    musicbrainz-cleaner --force

REQUIREMENTS:
    - MusicBrainz server running on http://localhost:5001
    - PostgreSQL database accessible (for direct database access)
    - Python 3.6+ with requests, psycopg2-binary, and fuzzywuzzy libraries

PERFORMANCE:
    - Direct database access is much faster than API calls
    - Fuzzy search provides better matching for similar names
    - No rate limiting with database access
"""
    print(help_text)


def print_version() -> None:
    """Print version and license information to standard output."""
    version_info = """
MusicBrainz Data Cleaner v3.0.0

Enhanced with:
- Direct PostgreSQL database access
- Fuzzy search for better matching
- Improved performance and accuracy
- Separate output files for successful and failed songs
- Detailed progress tracking and reporting

Copyright (c) 2024 MusicBrainz Data Cleaner Contributors
MIT License - see LICENSE file for details

Built with Python 3.6+
"""
    print(version_info)


def parse_arguments(args: List[str]) -> Dict[str, Any]:
    """Parse command line arguments into a dictionary

    Unrecognized arguments are ignored. An option that requires a value exits
    the process with ``ExitCode.USAGE_ERROR`` when the value is missing (or,
    for ``--limit``, is not a positive integer).
    """
    parsed = {
        'source': 'data/songs.json',
        'output_success': None,
        'output_failure': None,
        'limit': None,
        'use_api': False,
        'test_connection': False,
        'force': False,
        'help': False,
        'version': False
    }

    # Options that take a file path, mapped to their key in ``parsed``.
    path_options = {
        '--source': 'source',
        '--output-success': 'output_success',
        '--output-failure': 'output_failure',
    }

    i = 0
    while i < len(args):
        arg = args[i]
        # A value is present only if a next token exists and is not itself a long option.
        has_value = i + 1 < len(args) and not args[i + 1].startswith('--')

        if arg in ['--help', '-h', 'help']:
            parsed['help'] = True
        elif arg in ['--version', '-v', 'version']:
            parsed['version'] = True
        elif arg == '--test-connection':
            parsed['test_connection'] = True
        elif arg == '--use-api':
            parsed['use_api'] = True
        elif arg == '--force':
            parsed['force'] = True
        elif arg in path_options:
            if not has_value:
                print(f"Error: {arg} requires a file path")
                sys.exit(ExitCode.USAGE_ERROR)
            parsed[path_options[arg]] = args[i + 1]
            i += 1  # consume the value
        elif arg == '--limit':
            if not has_value:
                print("Error: --limit requires a number")
                sys.exit(ExitCode.USAGE_ERROR)
            try:
                parsed['limit'] = int(args[i + 1])
            except ValueError:
                print("Error: --limit requires a valid number")
                sys.exit(ExitCode.USAGE_ERROR)
            if parsed['limit'] <= 0:
                print("Error: --limit must be a positive number")
                sys.exit(ExitCode.USAGE_ERROR)
            i += 1  # consume the value
        i += 1

    return parsed


def main() -> int:
    """Run the command-line interface and return a process exit code."""
    try:
        args = sys.argv[1:]
        parsed = parse_arguments(args)

        # Handle help and version flags
        if parsed['help']:
            print_help()
            return ExitCode.SUCCESS

        if parsed['version']:
            print_version()
            return ExitCode.SUCCESS

        # Handle test connection
        if parsed['test_connection']:
            provider = DataProviderFactory.create_provider(not parsed['use_api'])
            provider_type = "API server" if parsed['use_api'] else "database"
            if provider.test_connection():
                print(f"✅ Connection to MusicBrainz {provider_type} successful")
                return ExitCode.SUCCESS
            print(f"❌ Connection to MusicBrainz {provider_type} failed")
            return ExitCode.ERROR

        # Process songs (main functionality)
        source_file = Path(parsed['source'])
        output_success = Path(parsed['output_success']) if parsed['output_success'] else None
        output_failure = Path(parsed['output_failure']) if parsed['output_failure'] else None

        if not source_file.exists():
            print(f"Error: Source file does not exist: {source_file}")
            return ExitCode.USAGE_ERROR

        if not source_file.is_file():
            print(f"Error: Source path is not a file: {source_file}")
            return ExitCode.USAGE_ERROR

        if source_file.suffix.lower() != '.json':
            print(f"Error: Source file must be a JSON file: {source_file}")
            return ExitCode.USAGE_ERROR

        # Process the file
        cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
        cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])

        return ExitCode.SUCCESS

    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
        return ExitCode.ERROR
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero.
        print(f"Unexpected error: {e}")
        return ExitCode.ERROR


if __name__ == "__main__":
    sys.exit(main())