diff --git a/PRD.md b/PRD.md index 34ef33d..35fa3ee 100644 --- a/PRD.md +++ b/PRD.md @@ -639,4 +639,21 @@ Test files often contain working code snippets that can be adapted: - **ALL developers** must follow this directive - **ALL test files** must be in `src/tests/` - **NO EXCEPTIONS** to this rule -- **IMMEDIATE CORRECTION** required for any violations \ No newline at end of file +- **IMMEDIATE CORRECTION** required for any violations + +## Performance Optimizations + +### Default Artist Sorting +- **Enabled by default**: Songs are automatically sorted by artist name before processing +- **Performance benefits**: + - Better database query efficiency (similar artists processed together) + - Improved caching behavior + - Cleaner log output organization +- **Optional disable**: Use the `--no-sort` flag to preserve the original order +- **User experience**: Most users benefit from sorting, so it's the default + +### Multiple Artist Candidate Search +- **Intelligent artist selection**: Tries multiple artist candidates when the first choice doesn't have the recording +- **Recording-aware prioritization**: Artists with the specific recording are prioritized +- **Fallback strategy**: Up to 5 different artist candidates are tried if needed +- **Comprehensive search**: Searches names, aliases, and fuzzy matches \ No newline at end of file diff --git a/src/cli/main.py b/src/cli/main.py index a14ce6d..d0de05c 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -31,10 +31,11 @@ from ..core.factory import DataProviderFactory class MusicBrainzCleaner: """Enhanced MusicBrainz Cleaner with interface-based data access.""" - def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False): + def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False, no_sort: bool = False): self.use_database = use_database self.base_url = base_url self.force = force + self.no_sort = no_sort # Create data provider using 
factory self.data_provider = DataProviderFactory.create_provider(use_database, base_url) @@ -75,10 +76,19 @@ class MusicBrainzCleaner: print("Error: Source file should contain a JSON array of songs") return {} - # Apply limit if specified + # Sort songs by artist for better performance and organization + original_count = len(all_songs) + if not self.no_sort: + print("🔄 Sorting songs by artist for optimal processing...") + all_songs.sort(key=lambda song: song.get('artist', '').lower()) + print(f"✅ Songs sorted by artist") + else: + print("â„šī¸ Preserving original song order (sorting disabled)") + + # Apply limit if specified (after sorting) if limit is not None: all_songs = all_songs[:limit] - print(f"âš ī¸ Limiting processing to first {limit} songs") + print(f"âš ī¸ Limiting processing to first {limit} songs (from {original_count:,} total)") total_songs = len(all_songs) print(f'📊 Total songs to process: {total_songs:,}') @@ -259,6 +269,10 @@ MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz USAGE: musicbrainz-cleaner [options] +DEFAULT BEHAVIOR: + Songs are automatically sorted by artist name for optimal performance. + Use --no-sort to preserve the original order. 
+ OPTIONS: --source FILE Source JSON file (default: data/songs.json) --output-success FILE Output file for successful songs (default: source-success.json) @@ -266,6 +280,7 @@ OPTIONS: --limit N Process only the first N songs (default: all songs) --use-api Force use of HTTP API instead of direct database access --force Force reprocessing, ignoring all existing MBIDs + --no-sort Disable artist sorting (preserve original order) --test-connection Test connection to MusicBrainz server --help, -h Show this help message --version, -v Show version information @@ -292,6 +307,9 @@ EXAMPLES: # Force reprocessing (ignore all existing MBIDs) musicbrainz-cleaner --force + # Disable artist sorting (preserve original order) + musicbrainz-cleaner --no-sort + REQUIREMENTS: - MusicBrainz server running on http://localhost:5001 - PostgreSQL database accessible (for direct database access) @@ -334,6 +352,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]: 'use_api': False, 'test_connection': False, 'force': False, + 'no_sort': False, 'help': False, 'version': False } @@ -352,6 +371,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]: parsed['use_api'] = True elif arg == '--force': parsed['force'] = True + elif arg == '--no-sort': + parsed['no_sort'] = True elif arg == '--source': if i + 1 < len(args) and not args[i + 1].startswith('--'): parsed['source'] = args[i + 1] @@ -437,7 +458,11 @@ def main() -> int: return ExitCode.USAGE_ERROR # Process the file - cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force']) + cleaner = MusicBrainzCleaner( + use_database=not parsed['use_api'], + force=parsed['force'], + no_sort=parsed['no_sort'] + ) cleaner.process_songs(source_file, output_success, output_failure, parsed['limit']) return ExitCode.SUCCESS