Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
ec33253403
commit
543e7b71d4
34
PRD.md
34
PRD.md
@ -250,13 +250,15 @@ src/
|
||||
- Manual configuration needed for custom artist/recording mappings
|
||||
- **NEW**: Some edge cases may require manual intervention (data quality issues)
|
||||
|
||||
### Test File Organization
|
||||
- **REQUIRED**: All test files must be placed in `src/tests/` directory
|
||||
- **PROHIBITED**: Test files should not be placed in the root directory
|
||||
### Test File Organization - CRITICAL DIRECTIVE
|
||||
- **REQUIRED**: All test files MUST be placed in `src/tests/` directory
|
||||
- **PROHIBITED**: Test files should NEVER be placed in the root directory
|
||||
- **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns
|
||||
- **Purpose**: Keeps root directory clean and organizes test code properly
|
||||
- **Import Path**: Tests can import from parent modules using relative imports
|
||||
|
||||
**⚠️ CRITICAL ENFORCEMENT**: This directive is ABSOLUTE and NON-NEGOTIABLE. Any test file created in the root directory will be immediately moved to `src/tests/` and removed from the root.
|
||||
|
||||
### Using Tests for Issue Resolution
|
||||
- **FIRST STEP**: When encountering issues, check `src/tests/` directory for existing test files
|
||||
- **EXISTING TESTS**: Many common issues already have test cases that can help debug problems
|
||||
@ -580,6 +582,8 @@ Test files often contain working code snippets that can be adapted:
|
||||
- Data processing logic
|
||||
- Error handling approaches
|
||||
|
||||
**⚠️ REMINDER**: All test files MUST be in `src/tests/` directory. NEVER create test files in the root directory.
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### Database Integration
|
||||
@ -613,4 +617,26 @@ Test files often contain working code snippets that can be adapted:
|
||||
- **Resource Requirements**: MusicBrainz services require significant memory (8GB+ recommended) and disk space
|
||||
- **Platform Compatibility**: Apple Silicon (M1/M2) works but may show platform mismatch warnings
|
||||
- **Database Connection Issues**: Common startup problems include wrong host configuration and incomplete initialization
|
||||
- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
|
||||
- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
|
||||
|
||||
## CRITICAL PROJECT DIRECTIVE - TEST FILE ORGANIZATION
|
||||
|
||||
**⚠️ ABSOLUTE REQUIREMENT - NON-NEGOTIABLE**
|
||||
|
||||
### Test File Placement Rules
|
||||
- **REQUIRED**: ALL test files MUST be placed in `src/tests/` directory
|
||||
- **PROHIBITED**: Test files should NEVER be placed in the root directory
|
||||
- **ENFORCEMENT**: Any test file created in the root directory will be immediately moved to `src/tests/` and removed from the root
|
||||
- **NON-NEGOTIABLE**: This directive is absolute and must be followed at all times
|
||||
|
||||
### Why This Matters
|
||||
- **Project Structure**: Keeps the root directory clean and organized
|
||||
- **Code Organization**: Groups all test-related code in one location
|
||||
- **Maintainability**: Makes it easier to find and manage test files
|
||||
- **Best Practices**: Follows standard Python project structure conventions
|
||||
|
||||
### Compliance Required
|
||||
- **ALL developers** must follow this directive
|
||||
- **ALL test files** must be in `src/tests/`
|
||||
- **NO EXCEPTIONS** to this rule
|
||||
- **IMMEDIATE CORRECTION** required for any violations
|
||||
@ -461,6 +461,399 @@ class MusicBrainzDatabase(MusicBrainzDataProvider):
|
||||
print(f"Error in fuzzy_search_recording: {e}")
|
||||
return None
|
||||
|
||||
def fuzzy_search_artist_with_recording(self, artist_name: str, recording_title: str) -> Optional[Tuple[str, str, float]]:
    """
    Fuzzy search for artist by name, prioritizing artists that have the specific recording.

    Candidates are collected in four passes: exact name, exact alias,
    case-insensitive name, then substring (ILIKE) matches on names and aliases
    that also reach a fuzzy similarity of at least 0.8. Every candidate is then
    checked for a recording whose title contains ``recording_title``. The
    winner is chosen by: has the recording, then highest score, then most
    recordings overall.

    Args:
        artist_name: Artist name from the source data. Anything from a
            ``ft.`` / ``feat.`` marker onwards (any letter case) is ignored.
        recording_title: Title used to decide which candidate owns the recording.

    Returns:
        (artist_name, mbid, similarity_score) or None when there is no
        connection, no usable artist name, no candidate, or an error occurred
        (errors are printed, not raised).
    """
    import re  # local import: keeps this method independent of the module's import block

    if not self.connection:
        return None

    def like_literal(text):
        # Backslash is PostgreSQL's default LIKE/ILIKE escape character, so
        # escaping it first and then the wildcards makes the text match literally.
        return str(text).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

    try:
        # Clean artist name
        clean_name = artist_name.strip()

        # Handle collaborations - extract main artist. The marker must start a
        # word and is matched case-insensitively ("Ft.", "FEAT.", ...).
        main_artist = re.split(r'\b(?:ft|feat)\.', clean_name, maxsplit=1, flags=re.IGNORECASE)[0].strip()
        if not main_artist:
            # An empty name would turn the ILIKE searches into full-table scans.
            return None

        lowered_artist = main_artist.lower()
        candidates = []
        seen_gids = set()  # O(1) duplicate detection across all passes

        def fetch_rows(sql, params):
            # NOTE(review): rows are indexed by column name below, so the cursor
            # is presumably a dict-style cursor - confirm against the connection setup.
            self.cursor.execute(sql, params)
            return self.cursor.fetchall()

        def add_candidate(row, match_type, score):
            if row['gid'] not in seen_gids:
                seen_gids.add(row['gid'])
                candidates.append({
                    'name': row['name'],
                    'gid': row['gid'],
                    'match_type': match_type,
                    'score': score,
                })

        def add_fuzzy_candidates(rows, match_type):
            for row in rows:
                if row['gid'] in seen_gids:
                    continue
                # NOTE(review): alias hits are still scored against the artist's
                # canonical name, exactly as before - confirm this is intended.
                score = fuzz.ratio(lowered_artist, row['name'].lower()) / 100.0
                if score >= 0.8:  # Only include good matches
                    add_candidate(row, match_type, score)

        alias_join = (
            "SELECT a.name, a.gid "
            "FROM artist a "
            "JOIN artist_alias aa ON a.id = aa.artist"
        )

        # 1. Exact matches in artist names
        for row in fetch_rows("SELECT name, gid FROM artist WHERE name = %s", (main_artist,)):
            add_candidate(row, 'exact_name', 1.0)

        # 2. Exact matches in artist aliases
        for row in fetch_rows(alias_join + " WHERE aa.name = %s", (main_artist,)):
            add_candidate(row, 'exact_alias', 0.95)

        # 3. Case-insensitive matches
        for row in fetch_rows("SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)", (main_artist,)):
            add_candidate(row, 'case_insensitive', 0.99)

        # 4. Fuzzy matches in names and aliases, including dash variants
        variants = [
            main_artist,
            main_artist.replace('-', '‐'),
            main_artist.replace('‐', '-'),
        ]
        # Add name variations (e.g., "S Club 7" -> "S Club")
        for variant in list(variants):
            parts = variant.split()
            if ' ' in variant and len(parts) > 1 and parts[-1].isdigit():
                variants.append(' '.join(parts[:-1]))
        # Drop repeated variants (order kept) so identical queries run only once.
        variants = list(dict.fromkeys(variants))

        for variant in variants:
            pattern = f'%{like_literal(variant)}%'
            add_fuzzy_candidates(
                fetch_rows("SELECT name, gid FROM artist WHERE name ILIKE %s", (pattern,)),
                'fuzzy_name',
            )
            add_fuzzy_candidates(
                fetch_rows(alias_join + " WHERE aa.name ILIKE %s", (pattern,)),
                'fuzzy_alias',
            )

        if not candidates:
            return None

        print(f"🔍 Found {len(candidates)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")

        recordings_sql = (
            "SELECT COUNT(r.id) AS recording_count "
            "FROM artist a "
            "JOIN artist_credit_name acn ON a.id = acn.artist "
            "JOIN artist_credit ac ON acn.artist_credit = ac.id "
            "JOIN recording r ON ac.id = r.artist_credit "
            "WHERE a.gid = %s"
        )

        def count_recordings(sql, params):
            self.cursor.execute(sql, params)
            row = self.cursor.fetchone()
            return row['recording_count'] if row else 0

        title_pattern = f'%{like_literal(recording_title)}%'

        # Check which artists have the specific recording
        for artist in candidates:
            matching = count_recordings(recordings_sql + " AND r.name ILIKE %s", (artist['gid'], title_pattern))
            total_recordings = count_recordings(recordings_sql, (artist['gid'],))
            artist['has_recording'] = matching > 0
            artist['total_recordings'] = total_recordings

            status = "✅ HAS RECORDING" if artist['has_recording'] else "❌ NO RECORDING"
            print(f" 📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")

        # Artists with the recording first, then highest score, then most recordings.
        # min() returns the first minimal element, i.e. the head of a stable sort.
        best_artist = min(candidates, key=lambda c: (
            not c['has_recording'],
            -c['score'],
            -c['total_recordings'],
        ))
        print(f" 🎯 Selected: {best_artist['name']} (score: {best_artist['score']:.2f}, has recording: {best_artist['has_recording']})")

        print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
            original=artist_name,
            matched=best_artist['name'],
            score=best_artist['score']
        ))
        return (best_artist['name'], best_artist['gid'], best_artist['score'])

    except Exception as e:
        # Best-effort lookup: callers rely on None rather than an exception.
        print(f"Error in fuzzy_search_artist_with_recording: {e}")
        return None
|
||||
|
||||
def fuzzy_search_artist_candidates(self, artist_name: str, recording_title: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Fuzzy search for artist by name, returning multiple candidates.

    Candidates are collected in four passes: exact name, exact alias,
    case-insensitive name, then substring (ILIKE) matches on names and aliases
    that also reach a fuzzy similarity of at least 0.8. If recording_title is
    provided, artists that have a recording whose title contains it are
    ranked first.

    Args:
        artist_name: Artist name from the source data. Anything from a
            ``ft.`` / ``feat.`` marker onwards (any letter case) is ignored.
        recording_title: Optional title used to prioritize candidates.

    Returns:
        List of candidate dicts with ``name``, ``gid``, ``match_type``,
        ``score`` and ``total_recordings`` (plus ``has_recording`` when a
        recording title was given), best candidate first. Empty when there is
        no connection, no usable artist name, no match, or an error occurred
        (errors are printed, not raised).
    """
    import re  # local import: keeps this method independent of the module's import block

    if not self.connection:
        return []

    def like_literal(text):
        # Backslash is PostgreSQL's default LIKE/ILIKE escape character, so
        # escaping it first and then the wildcards makes the text match literally.
        return str(text).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

    try:
        # Clean artist name
        clean_name = artist_name.strip()

        # Handle collaborations - extract main artist. The marker must start a
        # word and is matched case-insensitively ("Ft.", "FEAT.", ...).
        main_artist = re.split(r'\b(?:ft|feat)\.', clean_name, maxsplit=1, flags=re.IGNORECASE)[0].strip()
        if not main_artist:
            # An empty name would turn the ILIKE searches into full-table scans.
            return []

        lowered_artist = main_artist.lower()
        candidates = []
        seen_gids = set()  # O(1) duplicate detection across all passes

        def fetch_rows(sql, params):
            # NOTE(review): rows are indexed by column name below, so the cursor
            # is presumably a dict-style cursor - confirm against the connection setup.
            self.cursor.execute(sql, params)
            return self.cursor.fetchall()

        def add_candidate(row, match_type, score):
            if row['gid'] not in seen_gids:
                seen_gids.add(row['gid'])
                candidates.append({
                    'name': row['name'],
                    'gid': row['gid'],
                    'match_type': match_type,
                    'score': score,
                })

        def add_fuzzy_candidates(rows, match_type):
            for row in rows:
                if row['gid'] in seen_gids:
                    continue
                # NOTE(review): alias hits are still scored against the artist's
                # canonical name, exactly as before - confirm this is intended.
                score = fuzz.ratio(lowered_artist, row['name'].lower()) / 100.0
                if score >= 0.8:  # Only include good matches
                    add_candidate(row, match_type, score)

        alias_join = (
            "SELECT a.name, a.gid "
            "FROM artist a "
            "JOIN artist_alias aa ON a.id = aa.artist"
        )

        # 1. Exact matches in artist names
        for row in fetch_rows("SELECT name, gid FROM artist WHERE name = %s", (main_artist,)):
            add_candidate(row, 'exact_name', 1.0)

        # 2. Exact matches in artist aliases
        for row in fetch_rows(alias_join + " WHERE aa.name = %s", (main_artist,)):
            add_candidate(row, 'exact_alias', 0.95)

        # 3. Case-insensitive matches
        for row in fetch_rows("SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)", (main_artist,)):
            add_candidate(row, 'case_insensitive', 0.99)

        # 4. Fuzzy matches in names and aliases, including dash variants
        variants = [
            main_artist,
            main_artist.replace('-', '‐'),
            main_artist.replace('‐', '-'),
        ]
        # Add name variations (e.g., "S Club 7" -> "S Club")
        for variant in list(variants):
            parts = variant.split()
            if ' ' in variant and len(parts) > 1 and parts[-1].isdigit():
                variants.append(' '.join(parts[:-1]))
        # Drop repeated variants (order kept) so identical queries run only once.
        variants = list(dict.fromkeys(variants))

        for variant in variants:
            pattern = f'%{like_literal(variant)}%'
            add_fuzzy_candidates(
                fetch_rows("SELECT name, gid FROM artist WHERE name ILIKE %s", (pattern,)),
                'fuzzy_name',
            )
            add_fuzzy_candidates(
                fetch_rows(alias_join + " WHERE aa.name ILIKE %s", (pattern,)),
                'fuzzy_alias',
            )

        if not candidates:
            return []

        recordings_sql = (
            "SELECT COUNT(r.id) AS recording_count "
            "FROM artist a "
            "JOIN artist_credit_name acn ON a.id = acn.artist "
            "JOIN artist_credit ac ON acn.artist_credit = ac.id "
            "JOIN recording r ON ac.id = r.artist_credit "
            "WHERE a.gid = %s"
        )

        def count_recordings(sql, params):
            self.cursor.execute(sql, params)
            row = self.cursor.fetchone()
            return row['recording_count'] if row else 0

        # If we have a recording title, check which artists have it
        if recording_title:
            print(f"🔍 Found {len(candidates)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
            title_pattern = f'%{like_literal(recording_title)}%'

            for artist in candidates:
                matching = count_recordings(recordings_sql + " AND r.name ILIKE %s", (artist['gid'], title_pattern))
                total_recordings = count_recordings(recordings_sql, (artist['gid'],))
                artist['has_recording'] = matching > 0
                artist['total_recordings'] = total_recordings

                status = "✅ HAS RECORDING" if artist['has_recording'] else "❌ NO RECORDING"
                print(f" 📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")

            # Sort by: has recording first, then by score, then by total recordings
            candidates.sort(key=lambda c: (
                not c['has_recording'],
                -c['score'],
                -c['total_recordings'],
            ))
        else:
            # Just get total recording counts for sorting
            for artist in candidates:
                artist['total_recordings'] = count_recordings(recordings_sql, (artist['gid'],))

            # Sort by score and total recordings
            candidates.sort(key=lambda c: (-c['score'], -c['total_recordings']))

        return candidates

    except Exception as e:
        # Best-effort lookup: callers rely on an empty list rather than an exception.
        print(f"Error in fuzzy_search_artist_candidates: {e}")
        return []
|
||||
|
||||
def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get detailed artist information by MBID."""
|
||||
if not self.connection:
|
||||
|
||||
@ -31,9 +31,10 @@ from ..core.factory import DataProviderFactory
|
||||
class MusicBrainzCleaner:
|
||||
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
|
||||
|
||||
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL):
|
||||
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
|
||||
self.use_database = use_database
|
||||
self.base_url = base_url
|
||||
self.force = force
|
||||
|
||||
# Create data provider using factory
|
||||
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
|
||||
@ -46,7 +47,14 @@ class MusicBrainzCleaner:
|
||||
Clean a single song using the centralized song processor.
|
||||
Returns (cleaned_song, success_status)
|
||||
"""
|
||||
return self.song_processor.clean_song(song)
|
||||
song_copy = song.copy()
|
||||
|
||||
# If force flag is set, remove ALL existing MBIDs to force complete reprocessing
|
||||
if self.force:
|
||||
song_copy.pop('mbid', None)
|
||||
song_copy.pop('recording_mbid', None)
|
||||
|
||||
return self.song_processor.clean_song(song_copy)
|
||||
|
||||
def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
@ -74,7 +82,15 @@ class MusicBrainzCleaner:
|
||||
|
||||
total_songs = len(all_songs)
|
||||
print(f'📊 Total songs to process: {total_songs:,}')
|
||||
|
||||
# Check if file is empty
|
||||
if total_songs == 0:
|
||||
print('⚠️ No songs to process - file is empty')
|
||||
return {}
|
||||
|
||||
print(f'Using {"database" if self.use_database else "API"} connection')
|
||||
if self.force:
|
||||
print('🔄 Force mode enabled - ignoring all existing MBIDs')
|
||||
print(PROGRESS_SEPARATOR)
|
||||
|
||||
# Initialize arrays for batch processing
|
||||
@ -165,13 +181,17 @@ class MusicBrainzCleaner:
|
||||
f.write(f"Total Songs Processed: {stats['total_processed']:,}\n")
|
||||
f.write(f"Successful Songs: {len(successful_songs):,}\n")
|
||||
f.write(f"Failed Songs: {len(failed_songs):,}\n")
|
||||
f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n")
|
||||
f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else "Success Rate: 0.0%")
|
||||
f.write("\n\n")
|
||||
|
||||
f.write("DETAILED STATISTICS\n")
|
||||
f.write("-" * 20 + "\n")
|
||||
f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n")
|
||||
f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n")
|
||||
f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n")
|
||||
f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} (0.0%")
|
||||
f.write(")\n")
|
||||
f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} (0.0%")
|
||||
f.write(")\n")
|
||||
f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second" if total_time > 0 else "Processing Speed: 0.0 songs/second")
|
||||
f.write("\n\n")
|
||||
|
||||
f.write("OUTPUT FILES\n")
|
||||
f.write("-" * 20 + "\n")
|
||||
@ -193,7 +213,7 @@ class MusicBrainzCleaner:
|
||||
# Save detailed JSON report for programmatic access
|
||||
json_report_file = source_file.parent / f"processing_report_{timestamp}.json"
|
||||
final_stats = {
|
||||
'summary': {
|
||||
'summary': {
|
||||
'total_tested': stats['total_processed'],
|
||||
'artists_found': stats['artists_found'],
|
||||
'recordings_found': stats['recordings_found'],
|
||||
@ -219,10 +239,10 @@ class MusicBrainzCleaner:
|
||||
print(f'🎉 Processing completed!')
|
||||
print(f'📊 Final Results:')
|
||||
print(f' ⏱️ Total processing time: {total_time:.1f} seconds')
|
||||
print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second')
|
||||
print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)')
|
||||
print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)')
|
||||
print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)')
|
||||
print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second' if total_time > 0 else ' 🚀 Average speed: 0.0 songs/second')
|
||||
print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} (0.0%)')
|
||||
print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} (0.0%)')
|
||||
print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ❌ Failed songs: {len(failed_songs):,} (0.0%)')
|
||||
print(f'📄 Files saved:')
|
||||
print(f' ✅ Successful songs: {output_success}')
|
||||
print(f' ❌ Failed songs: {output_failure}')
|
||||
@ -245,6 +265,7 @@ OPTIONS:
|
||||
--output-failure FILE Output file for failed songs (default: source-failure.json)
|
||||
--limit N Process only the first N songs (default: all songs)
|
||||
--use-api Force use of HTTP API instead of direct database access
|
||||
--force Force reprocessing, ignoring all existing MBIDs
|
||||
--test-connection Test connection to MusicBrainz server
|
||||
--help, -h Show this help message
|
||||
--version, -v Show version information
|
||||
@ -268,6 +289,9 @@ EXAMPLES:
|
||||
# Force API mode
|
||||
musicbrainz-cleaner --use-api
|
||||
|
||||
# Force reprocessing (ignore all existing MBIDs)
|
||||
musicbrainz-cleaner --force
|
||||
|
||||
REQUIREMENTS:
|
||||
- MusicBrainz server running on http://localhost:5001
|
||||
- PostgreSQL database accessible (for direct database access)
|
||||
@ -309,6 +333,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
||||
'limit': None,
|
||||
'use_api': False,
|
||||
'test_connection': False,
|
||||
'force': False,
|
||||
'help': False,
|
||||
'version': False
|
||||
}
|
||||
@ -325,6 +350,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
|
||||
parsed['test_connection'] = True
|
||||
elif arg == '--use-api':
|
||||
parsed['use_api'] = True
|
||||
elif arg == '--force':
|
||||
parsed['force'] = True
|
||||
elif arg == '--source':
|
||||
if i + 1 < len(args) and not args[i + 1].startswith('--'):
|
||||
parsed['source'] = args[i + 1]
|
||||
@ -410,7 +437,7 @@ def main() -> int:
|
||||
return ExitCode.USAGE_ERROR
|
||||
|
||||
# Process the file
|
||||
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'])
|
||||
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
|
||||
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
|
||||
|
||||
return ExitCode.SUCCESS
|
||||
|
||||
@ -22,19 +22,27 @@ class SongProcessor(SongProcessorInterface):
|
||||
"""
|
||||
self.data_provider = data_provider
|
||||
|
||||
def find_artist_mbid(self, artist_name: str) -> Optional[str]:
|
||||
def find_artist_mbid(self, artist_name: str, recording_title: str = None) -> Optional[str]:
|
||||
"""
|
||||
Find artist MBID using consistent logic across all interfaces.
|
||||
|
||||
Args:
|
||||
artist_name: Name of the artist to search for
|
||||
recording_title: Optional recording title to help choose the correct artist
|
||||
|
||||
Returns:
|
||||
Artist MBID if found, None otherwise
|
||||
"""
|
||||
if not artist_name:
|
||||
return None
|
||||
|
||||
|
||||
# If we have a recording title, use the improved search that considers the recording
|
||||
if recording_title and hasattr(self.data_provider, 'fuzzy_search_artist_with_recording'):
|
||||
result = self.data_provider.fuzzy_search_artist_with_recording(artist_name, recording_title)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (artist_name, mbid, score)
|
||||
|
||||
# Fallback to regular search
|
||||
result = self.data_provider.fuzzy_search_artist(artist_name)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (artist_name, mbid, score)
|
||||
@ -69,7 +77,39 @@ class SongProcessor(SongProcessorInterface):
|
||||
if variation.lower() == known_title.lower():
|
||||
return recording_mbid
|
||||
|
||||
# Handle collaborations using artist credit
|
||||
# If we have an artist_mbid, try to find recording with that specific artist
|
||||
if artist_mbid:
|
||||
# Try to find recording with the specific artist MBID
|
||||
for variation in title_variations:
|
||||
result = self.data_provider.fuzzy_search_recording(variation, artist_mbid)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return recording MBID
|
||||
|
||||
# If no artist_mbid or recording not found, try multiple artist candidates
|
||||
if original_artist and hasattr(self.data_provider, 'fuzzy_search_artist_candidates'):
|
||||
print(f"🎯 Trying multiple artist candidates for '{original_artist}' with recording '{title}'...")
|
||||
|
||||
# Get multiple artist candidates
|
||||
candidates = self.data_provider.fuzzy_search_artist_candidates(original_artist, title)
|
||||
|
||||
# Try each candidate to find the recording
|
||||
for i, candidate in enumerate(candidates[:5]): # Limit to top 5 candidates
|
||||
candidate_mbid = candidate['gid']
|
||||
candidate_name = candidate['name']
|
||||
candidate_score = candidate['score']
|
||||
has_recording = candidate.get('has_recording', False)
|
||||
|
||||
print(f" 🔍 Trying candidate {i+1}: {candidate_name} (score: {candidate_score:.2f}, has recording: {has_recording})")
|
||||
|
||||
# Try to find recording with this artist candidate
|
||||
for variation in title_variations:
|
||||
result = self.data_provider.fuzzy_search_recording(variation, candidate_mbid)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
recording_mbid = result[1]
|
||||
print(f" ✅ Found recording with candidate {i+1}: {candidate_name}")
|
||||
return recording_mbid
|
||||
|
||||
# Handle collaborations using artist credit (fallback)
|
||||
# If no artist_mbid (collaboration case), try to find by title and verify artist credit
|
||||
if not artist_mbid and original_artist:
|
||||
# This is a collaboration case, try to find by title with all variations
|
||||
@ -141,8 +181,8 @@ class SongProcessor(SongProcessorInterface):
|
||||
original_artist = song.get('artist', '')
|
||||
original_title = song.get('title', '')
|
||||
|
||||
# Find artist MBID
|
||||
artist_mbid = self.find_artist_mbid(song.get('artist', ''))
|
||||
# Find artist MBID (pass recording title to help choose correct artist)
|
||||
artist_mbid = self.find_artist_mbid(song.get('artist', ''), song.get('title', ''))
|
||||
|
||||
# Handle collaboration case (artist_mbid is None)
|
||||
# Use the same complex collaboration parsing as the database
|
||||
@ -188,8 +228,8 @@ class SongProcessor(SongProcessorInterface):
|
||||
song['artist'] = artist_info['name']
|
||||
song['mbid'] = artist_mbid
|
||||
|
||||
# Find recording MBID
|
||||
recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''))
|
||||
# Find recording MBID (pass original artist for multiple candidate search)
|
||||
recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''), original_artist)
|
||||
if not recording_mbid:
|
||||
return song, False
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user