Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
b75418eade
commit
f127bac480
17
PRD.md
17
PRD.md
@ -640,3 +640,20 @@ Test files often contain working code snippets that can be adapted:
|
||||
- **ALL test files** must be in `src/tests/`
|
||||
- **NO EXCEPTIONS** to this rule
|
||||
- **IMMEDIATE CORRECTION** required for any violations
|
||||
|
||||
## Performance Optimizations
|
||||
|
||||
### Default Artist Sorting
|
||||
- **Enabled by default**: Songs are automatically sorted by artist name before processing
|
||||
- **Performance benefits**:
|
||||
- Better database query efficiency (songs by the same artist are processed together)
|
||||
- Improved caching behavior
|
||||
- Cleaner log output organization
|
||||
- **Optional disable**: Use `--no-sort` flag to preserve original order
|
||||
- **User experience**: Most users benefit from sorting, so it's the default
|
||||
|
||||
### Multiple Artist Candidate Search
|
||||
- **Intelligent artist selection**: Tries multiple artist candidates when the first choice doesn't have the recording
|
||||
- **Recording-aware prioritization**: Artists with the specific recording are prioritized
|
||||
- **Fallback strategy**: Up to 5 different artist candidates are tried if needed
|
||||
- **Comprehensive search**: Searches names, aliases, and fuzzy matches
|
||||
@ -31,10 +31,11 @@ from ..core.factory import DataProviderFactory
|
||||
class MusicBrainzCleaner:
|
||||
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
|
||||
|
||||
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
|
||||
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False, no_sort: bool = False):
|
||||
self.use_database = use_database
|
||||
self.base_url = base_url
|
||||
self.force = force
|
||||
self.no_sort = no_sort
|
||||
|
||||
# Create data provider using factory
|
||||
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
|
||||
@ -75,10 +76,19 @@ class MusicBrainzCleaner:
|
||||
print("Error: Source file should contain a JSON array of songs")
|
||||
return {}
|
||||
|
||||
# Apply limit if specified
|
||||
# Sort songs by artist for better performance and organization
|
||||
original_count = len(all_songs)
|
||||
if not self.no_sort:
|
||||
print("🔄 Sorting songs by artist for optimal processing...")
|
||||
all_songs.sort(key=lambda song: song.get('artist', '').lower())
|
||||
print(f"✅ Songs sorted by artist")
|
||||
else:
|
||||
print("ℹ️ Preserving original song order (sorting disabled)")
|
||||
|
||||
# Apply limit if specified (after sorting)
|
||||
if limit is not None:
|
||||
all_songs = all_songs[:limit]
|
||||
print(f"⚠️ Limiting processing to first {limit} songs")
|
||||
print(f"⚠️ Limiting processing to first {limit} songs (from {original_count:,} total)")
|
||||
|
||||
total_songs = len(all_songs)
|
||||
print(f'📊 Total songs to process: {total_songs:,}')
|
||||
@ -259,6 +269,10 @@ MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz
|
||||
USAGE:
|
||||
musicbrainz-cleaner [options]
|
||||
|
||||
DEFAULT BEHAVIOR:
|
||||
Songs are automatically sorted by artist name for optimal performance.
|
||||
Use --no-sort to preserve the original order.
|
||||
|
||||
OPTIONS:
|
||||
--source FILE Source JSON file (default: data/songs.json)
|
||||
--output-success FILE Output file for successful songs (default: source-success.json)
|
||||
@ -266,6 +280,7 @@ OPTIONS:
|
||||
--limit N Process only the first N songs (default: all songs)
|
||||
--use-api Force use of HTTP API instead of direct database access
|
||||
--force Force reprocessing, ignoring all existing MBIDs
|
||||
--no-sort Disable artist sorting (preserve original order)
|
||||
--test-connection Test connection to MusicBrainz server
|
||||
--help, -h Show this help message
|
||||
--version, -v Show version information
|
||||
@ -292,6 +307,9 @@ EXAMPLES:
|
||||
# Force reprocessing (ignore all existing MBIDs)
|
||||
musicbrainz-cleaner --force
|
||||
|
||||
# Disable artist sorting (preserve original order)
|
||||
musicbrainz-cleaner --no-sort
|
||||
|
||||
REQUIREMENTS:
|
||||
- MusicBrainz server running on http://localhost:5001
|
||||
- PostgreSQL database accessible (for direct database access)
|
||||
@ -334,6 +352,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
||||
'use_api': False,
|
||||
'test_connection': False,
|
||||
'force': False,
|
||||
'no_sort': False,
|
||||
'help': False,
|
||||
'version': False
|
||||
}
|
||||
@ -352,6 +371,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
||||
parsed['use_api'] = True
|
||||
elif arg == '--force':
|
||||
parsed['force'] = True
|
||||
elif arg == '--no-sort':
|
||||
parsed['no_sort'] = True
|
||||
elif arg == '--source':
|
||||
if i + 1 < len(args) and not args[i + 1].startswith('--'):
|
||||
parsed['source'] = args[i + 1]
|
||||
@ -437,7 +458,11 @@ def main() -> int:
|
||||
return ExitCode.USAGE_ERROR
|
||||
|
||||
# Process the file
|
||||
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
|
||||
cleaner = MusicBrainzCleaner(
|
||||
use_database=not parsed['use_api'],
|
||||
force=parsed['force'],
|
||||
no_sort=parsed['no_sort']
|
||||
)
|
||||
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
|
||||
|
||||
return ExitCode.SUCCESS
|
||||
|
||||
Loading…
Reference in New Issue
Block a user