Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
b75418eade
commit
f127bac480
19
PRD.md
19
PRD.md
@ -639,4 +639,21 @@ Test files often contain working code snippets that can be adapted:
|
|||||||
- **ALL developers** must follow this directive
|
- **ALL developers** must follow this directive
|
||||||
- **ALL test files** must be in `src/tests/`
|
- **ALL test files** must be in `src/tests/`
|
||||||
- **NO EXCEPTIONS** to this rule
|
- **NO EXCEPTIONS** to this rule
|
||||||
- **IMMEDIATE CORRECTION** required for any violations
|
- **IMMEDIATE CORRECTION** required for any violations
|
||||||
|
|
||||||
|
## Performance Optimizations
|
||||||
|
|
||||||
|
### Default Artist Sorting
|
||||||
|
- **Enabled by default**: Songs are automatically sorted by artist name before processing
|
||||||
|
- **Performance benefits**:
|
||||||
|
- Better database query efficiency (songs by the same artist are processed together)
|
||||||
|
- Improved caching behavior
|
||||||
|
- Cleaner log output organization
|
||||||
|
- **Optional disable**: Use `--no-sort` flag to preserve original order
|
||||||
|
- **User experience**: Most users benefit from sorting, so it's the default
|
||||||
|
|
||||||
|
### Multiple Artist Candidate Search
|
||||||
|
- **Intelligent artist selection**: Tries multiple artist candidates when the first choice doesn't have the recording
|
||||||
|
- **Recording-aware prioritization**: Artists with the specific recording are prioritized
|
||||||
|
- **Fallback strategy**: Up to 5 different artist candidates are tried if needed
|
||||||
|
- **Comprehensive search**: Searches names, aliases, and fuzzy matches
|
||||||
@ -31,10 +31,11 @@ from ..core.factory import DataProviderFactory
|
|||||||
class MusicBrainzCleaner:
|
class MusicBrainzCleaner:
|
||||||
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
|
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
|
||||||
|
|
||||||
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
|
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False, no_sort: bool = False):
|
||||||
self.use_database = use_database
|
self.use_database = use_database
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
self.force = force
|
self.force = force
|
||||||
|
self.no_sort = no_sort
|
||||||
|
|
||||||
# Create data provider using factory
|
# Create data provider using factory
|
||||||
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
|
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
|
||||||
@ -75,10 +76,19 @@ class MusicBrainzCleaner:
|
|||||||
print("Error: Source file should contain a JSON array of songs")
|
print("Error: Source file should contain a JSON array of songs")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# Apply limit if specified
|
# Sort songs by artist for better performance and organization
|
||||||
|
original_count = len(all_songs)
|
||||||
|
if not self.no_sort:
|
||||||
|
print("🔄 Sorting songs by artist for optimal processing...")
|
||||||
|
all_songs.sort(key=lambda song: song.get('artist', '').lower())
|
||||||
|
print(f"✅ Songs sorted by artist")
|
||||||
|
else:
|
||||||
|
print("ℹ️ Preserving original song order (sorting disabled)")
|
||||||
|
|
||||||
|
# Apply limit if specified (after sorting)
|
||||||
if limit is not None:
|
if limit is not None:
|
||||||
all_songs = all_songs[:limit]
|
all_songs = all_songs[:limit]
|
||||||
print(f"⚠️ Limiting processing to first {limit} songs")
|
print(f"⚠️ Limiting processing to first {limit} songs (from {original_count:,} total)")
|
||||||
|
|
||||||
total_songs = len(all_songs)
|
total_songs = len(all_songs)
|
||||||
print(f'📊 Total songs to process: {total_songs:,}')
|
print(f'📊 Total songs to process: {total_songs:,}')
|
||||||
@ -259,6 +269,10 @@ MusicBrainz Data Cleaner - Clean and normalize song data using MusicBrainz
|
|||||||
USAGE:
|
USAGE:
|
||||||
musicbrainz-cleaner [options]
|
musicbrainz-cleaner [options]
|
||||||
|
|
||||||
|
DEFAULT BEHAVIOR:
|
||||||
|
Songs are automatically sorted by artist name for optimal performance.
|
||||||
|
Use --no-sort to preserve the original order.
|
||||||
|
|
||||||
OPTIONS:
|
OPTIONS:
|
||||||
--source FILE Source JSON file (default: data/songs.json)
|
--source FILE Source JSON file (default: data/songs.json)
|
||||||
--output-success FILE Output file for successful songs (default: source-success.json)
|
--output-success FILE Output file for successful songs (default: source-success.json)
|
||||||
@ -266,6 +280,7 @@ OPTIONS:
|
|||||||
--limit N Process only the first N songs (default: all songs)
|
--limit N Process only the first N songs (default: all songs)
|
||||||
--use-api Force use of HTTP API instead of direct database access
|
--use-api Force use of HTTP API instead of direct database access
|
||||||
--force Force reprocessing, ignoring all existing MBIDs
|
--force Force reprocessing, ignoring all existing MBIDs
|
||||||
|
--no-sort Disable artist sorting (preserve original order)
|
||||||
--test-connection Test connection to MusicBrainz server
|
--test-connection Test connection to MusicBrainz server
|
||||||
--help, -h Show this help message
|
--help, -h Show this help message
|
||||||
--version, -v Show version information
|
--version, -v Show version information
|
||||||
@ -292,6 +307,9 @@ EXAMPLES:
|
|||||||
# Force reprocessing (ignore all existing MBIDs)
|
# Force reprocessing (ignore all existing MBIDs)
|
||||||
musicbrainz-cleaner --force
|
musicbrainz-cleaner --force
|
||||||
|
|
||||||
|
# Disable artist sorting (preserve original order)
|
||||||
|
musicbrainz-cleaner --no-sort
|
||||||
|
|
||||||
REQUIREMENTS:
|
REQUIREMENTS:
|
||||||
- MusicBrainz server running on http://localhost:5001
|
- MusicBrainz server running on http://localhost:5001
|
||||||
- PostgreSQL database accessible (for direct database access)
|
- PostgreSQL database accessible (for direct database access)
|
||||||
@ -334,6 +352,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
|||||||
'use_api': False,
|
'use_api': False,
|
||||||
'test_connection': False,
|
'test_connection': False,
|
||||||
'force': False,
|
'force': False,
|
||||||
|
'no_sort': False,
|
||||||
'help': False,
|
'help': False,
|
||||||
'version': False
|
'version': False
|
||||||
}
|
}
|
||||||
@ -352,6 +371,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
|||||||
parsed['use_api'] = True
|
parsed['use_api'] = True
|
||||||
elif arg == '--force':
|
elif arg == '--force':
|
||||||
parsed['force'] = True
|
parsed['force'] = True
|
||||||
|
elif arg == '--no-sort':
|
||||||
|
parsed['no_sort'] = True
|
||||||
elif arg == '--source':
|
elif arg == '--source':
|
||||||
if i + 1 < len(args) and not args[i + 1].startswith('--'):
|
if i + 1 < len(args) and not args[i + 1].startswith('--'):
|
||||||
parsed['source'] = args[i + 1]
|
parsed['source'] = args[i + 1]
|
||||||
@ -437,7 +458,11 @@ def main() -> int:
|
|||||||
return ExitCode.USAGE_ERROR
|
return ExitCode.USAGE_ERROR
|
||||||
|
|
||||||
# Process the file
|
# Process the file
|
||||||
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
|
cleaner = MusicBrainzCleaner(
|
||||||
|
use_database=not parsed['use_api'],
|
||||||
|
force=parsed['force'],
|
||||||
|
no_sort=parsed['no_sort']
|
||||||
|
)
|
||||||
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
|
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
|
||||||
|
|
||||||
return ExitCode.SUCCESS
|
return ExitCode.SUCCESS
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user