Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-01 15:27:11 -05:00
parent b75418eade
commit f127bac480
2 changed files with 47 additions and 5 deletions

17
PRD.md
View File

@ -640,3 +640,20 @@ Test files often contain working code snippets that can be adapted:
- **ALL test files** must be in `src/tests/`
- **NO EXCEPTIONS** to this rule
- **IMMEDIATE CORRECTION** required for any violations
## Performance Optimizations
### Default Artist Sorting
- **Enabled by default**: Songs are automatically sorted by artist name (case-insensitively) before processing, and before any `--limit` is applied
- **Performance benefits**:
  - Better database query efficiency (songs by the same artist are processed together)
  - Improved caching behavior
  - Cleaner log output organization
- **Optional disable**: Use the `--no-sort` flag to preserve the original order
- **User experience**: Most users benefit from sorting, so it's the default
### Multiple Artist Candidate Search
- **Intelligent artist selection**: Tries multiple artist candidates when the first choice doesn't have the recording
- **Recording-aware prioritization**: Artists with the specific recording are prioritized
- **Fallback strategy**: Up to 5 different artist candidates are tried if needed
- **Comprehensive search**: Searches names, aliases, and fuzzy matches

View File

@ -31,10 +31,11 @@ from ..core.factory import DataProviderFactory
class MusicBrainzCleaner:
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False, no_sort: bool = False):
self.use_database = use_database
self.base_url = base_url
self.force = force
self.no_sort = no_sort
# Create data provider using factory
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
@ -75,10 +76,19 @@ class MusicBrainzCleaner:
print("Error: Source file should contain a JSON array of songs")
return {}
# Apply limit if specified
# Sort songs by artist for better performance and organization
original_count = len(all_songs)
if not self.no_sort:
print("🔄 Sorting songs by artist for optimal processing...")
all_songs.sort(key=lambda song: song.get('artist', '').lower())
print(f"✅ Songs sorted by artist")
else:
print(" Preserving original song order (sorting disabled)")
# Apply limit if specified (after sorting)
if limit is not None:
all_songs = all_songs[:limit]
print(f"⚠️ Limiting processing to first {limit} songs")
print(f"⚠️ Limiting processing to first {limit} songs (from {original_count:,} total)")
total_songs = len(all_songs)
print(f'📊 Total songs to process: {total_songs:,}')
@ -259,6 +269,10 @@ MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz
USAGE:
musicbrainz-cleaner [options]
DEFAULT BEHAVIOR:
Songs are automatically sorted by artist name for optimal performance.
Use --no-sort to preserve the original order.
OPTIONS:
--source FILE Source JSON file (default: data/songs.json)
--output-success FILE Output file for successful songs (default: source-success.json)
@ -266,6 +280,7 @@ OPTIONS:
--limit N Process only the first N songs (default: all songs)
--use-api Force use of HTTP API instead of direct database access
--force Force reprocessing, ignoring all existing MBIDs
--no-sort Disable artist sorting (preserve original order)
--test-connection Test connection to MusicBrainz server
--help, -h Show this help message
--version, -v Show version information
@ -292,6 +307,9 @@ EXAMPLES:
# Force reprocessing (ignore all existing MBIDs)
musicbrainz-cleaner --force
# Disable artist sorting (preserve original order)
musicbrainz-cleaner --no-sort
REQUIREMENTS:
- MusicBrainz server running on http://localhost:5001
- PostgreSQL database accessible (for direct database access)
@ -334,6 +352,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
'use_api': False,
'test_connection': False,
'force': False,
'no_sort': False,
'help': False,
'version': False
}
@ -352,6 +371,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
parsed['use_api'] = True
elif arg == '--force':
parsed['force'] = True
elif arg == '--no-sort':
parsed['no_sort'] = True
elif arg == '--source':
if i + 1 < len(args) and not args[i + 1].startswith('--'):
parsed['source'] = args[i + 1]
@ -437,7 +458,11 @@ def main() -> int:
return ExitCode.USAGE_ERROR
# Process the file
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
cleaner = MusicBrainzCleaner(
use_database=not parsed['use_api'],
force=parsed['force'],
no_sort=parsed['no_sort']
)
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
return ExitCode.SUCCESS