Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-01 10:55:50 -05:00
parent ec33253403
commit 543e7b71d4
4 changed files with 509 additions and 23 deletions

34
PRD.md
View File

@ -250,13 +250,15 @@ src/
- Manual configuration needed for custom artist/recording mappings
- **NEW**: Some edge cases may require manual intervention (data quality issues)
### Test File Organization
- **REQUIRED**: All test files must be placed in `src/tests/` directory
- **PROHIBITED**: Test files should not be placed in the root directory
### Test File Organization - CRITICAL DIRECTIVE
- **REQUIRED**: All test files MUST be placed in `src/tests/` directory
- **PROHIBITED**: Test files should NEVER be placed in the root directory
- **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns
- **Purpose**: Keeps root directory clean and organizes test code properly
- **Import Path**: Tests can import from parent modules using relative imports
**⚠️ CRITICAL ENFORCEMENT**: This directive is ABSOLUTE and NON-NEGOTIABLE. Any test files created in the root directory will be immediately moved to `src/tests/` and removed from the root directory.
### Using Tests for Issue Resolution
- **FIRST STEP**: When encountering issues, check `src/tests/` directory for existing test files
- **EXISTING TESTS**: Many common issues already have test cases that can help debug problems
@ -580,6 +582,8 @@ Test files often contain working code snippets that can be adapted:
- Data processing logic
- Error handling approaches
**⚠️ REMINDER**: All test files MUST be in `src/tests/` directory. NEVER create test files in the root directory.
## Lessons Learned
### Database Integration
@ -613,4 +617,26 @@ Test files often contain working code snippets that can be adapted:
- **Resource Requirements**: MusicBrainz services require significant memory (8GB+ recommended) and disk space
- **Platform Compatibility**: Apple Silicon (M1/M2) works but may show platform mismatch warnings
- **Database Connection Issues**: Common startup problems include wrong host configuration and incomplete initialization
- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
## CRITICAL PROJECT DIRECTIVE - TEST FILE ORGANIZATION
**⚠️ ABSOLUTE REQUIREMENT - NON-NEGOTIABLE**
### Test File Placement Rules
- **REQUIRED**: ALL test files MUST be placed in `src/tests/` directory
- **PROHIBITED**: Test files should NEVER be placed in the root directory
- **ENFORCEMENT**: Any test files created in the root directory will be immediately moved to `src/tests/` and removed from the root directory
- **NON-NEGOTIABLE**: This directive is absolute and must be followed at all times
### Why This Matters
- **Project Structure**: Keeps the root directory clean and organized
- **Code Organization**: Groups all test-related code in one location
- **Maintainability**: Makes it easier to find and manage test files
- **Best Practices**: Follows standard Python project structure conventions
### Compliance Required
- **ALL developers** must follow this directive
- **ALL test files** must be in `src/tests/`
- **NO EXCEPTIONS** to this rule
- **IMMEDIATE CORRECTION** required for any violations

View File

@ -461,6 +461,399 @@ class MusicBrainzDatabase(MusicBrainzDataProvider):
print(f"Error in fuzzy_search_recording: {e}")
return None
def fuzzy_search_artist_with_recording(self, artist_name: str, recording_title: str) -> Optional[Tuple[str, str, float]]:
    """
    Fuzzy search for artist by name, prioritizing artists that have the specific recording.

    The candidate lookup, de-duplication, per-artist recording checks and
    ranking are delegated to ``fuzzy_search_artist_candidates`` so the two
    methods cannot drift apart; this method only picks the top-ranked
    candidate and reports it.

    Args:
        artist_name: Artist name as it appears in the song data.
        recording_title: Title used to prefer artists credited on a
            recording with a matching name. When it is empty the candidate
            search ranks by score and recording count only.

    Returns:
        (artist_name, mbid, similarity_score) for the best candidate, or
        None when there is no connection, no candidate was found, or an
        error occurred.
    """
    if not self.connection:
        return None

    try:
        ranked = self.fuzzy_search_artist_candidates(artist_name, recording_title)
        if not ranked:
            return None

        # Candidates come back best-first, so the head of the list wins.
        best_artist = ranked[0]
        # 'has_recording' is only set when a recording title was checked.
        has_recording = best_artist.get('has_recording', False)
        print(f" 🎯 Selected: {best_artist['name']} (score: {best_artist['score']:.2f}, has recording: {has_recording})")
        print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
            original=artist_name,
            matched=best_artist['name'],
            score=best_artist['score']
        ))
        return (best_artist['name'], best_artist['gid'], best_artist['score'])
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller fall back.
        print(f"Error in fuzzy_search_artist_with_recording: {e}")
        return None
def fuzzy_search_artist_candidates(self, artist_name: str, recording_title: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Fuzzy search for artist by name, returning multiple candidates.

    Candidates are collected in four passes: exact artist name (score 1.0),
    exact alias (0.95), case-insensitive artist name (0.99), then an ILIKE
    substring search over several spellings of the name, kept only when
    ``fuzz.ratio`` against the artist name is at least 0.8. Each MBID
    appears once; the first pass that finds it decides its match type and
    score.

    Args:
        artist_name: Artist name as it appears in the song data. Anything
            from the first "ft." / "feat." marker onwards is ignored.
        recording_title: Optional recording title. When given, artists
            credited on a recording whose name contains it are ranked first.

    Returns:
        List of dicts with keys 'name', 'gid', 'match_type', 'score' and
        'total_recordings', plus 'has_recording' when recording_title was
        given, sorted best first. Empty when there is no connection, the
        cleaned name is empty, nothing matched, or a query failed.
    """
    import re  # local import: the module's import block is outside this block

    if not self.connection:
        return []

    def like_pattern(text: str) -> str:
        # Escape LIKE metacharacters (backslash is PostgreSQL's default
        # escape character) so '%' and '_' in names/titles match literally.
        escaped = text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        return f'%{escaped}%'

    count_sql = """
        SELECT COUNT(r.id) as recording_count
        FROM artist a
        JOIN artist_credit_name acn ON a.id = acn.artist
        JOIN artist_credit ac ON acn.artist_credit = ac.id
        JOIN recording r ON ac.id = r.artist_credit
        WHERE a.gid = %s
    """

    def count_recordings(gid, title: Optional[str] = None) -> int:
        # Count recordings credited to the artist, optionally only those
        # whose name contains the given title.
        if title is None:
            self.cursor.execute(count_sql, (gid,))
        else:
            self.cursor.execute(count_sql + " AND r.name ILIKE %s", (gid, like_pattern(title)))
        row = self.cursor.fetchone()
        return row['recording_count'] if row else 0

    try:
        # Handle collaborations - keep only the main artist. The marker is
        # matched case-insensitively and on a word boundary, so "Ft." splits
        # while a name such as "Swift." does not.
        main_artist = re.split(
            r'\b(?:ft|feat)\.', artist_name.strip(), maxsplit=1, flags=re.IGNORECASE
        )[0].strip()
        if not main_artist:
            # An empty name would turn the ILIKE searches into '%%', which
            # matches every artist row.
            return []

        # gid -> candidate; insertion order is kept and the first match wins.
        found: Dict[Any, Dict[str, Any]] = {}

        def remember(row, match_type: str, score: float) -> None:
            found.setdefault(row['gid'], {
                'name': row['name'],
                'gid': row['gid'],
                'match_type': match_type,
                'score': score,
            })

        # 1. Exact matches in artist names
        self.cursor.execute("SELECT name, gid FROM artist WHERE name = %s", (main_artist,))
        for row in self.cursor.fetchall():
            remember(row, 'exact_name', 1.0)

        # 2. Exact matches in artist aliases
        self.cursor.execute(
            """
            SELECT a.name, a.gid
            FROM artist a
            JOIN artist_alias aa ON a.id = aa.artist
            WHERE aa.name = %s
            """,
            (main_artist,),
        )
        for row in self.cursor.fetchall():
            remember(row, 'exact_alias', 0.95)

        # 3. Case-insensitive matches
        self.cursor.execute(
            "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)", (main_artist,)
        )
        for row in self.cursor.fetchall():
            remember(row, 'case_insensitive', 0.99)

        # 4. Fuzzy matches in names and aliases.
        # Dash variants: ASCII hyphen <-> U+2010 HYPHEN (presumably the form
        # stored for many artists - verify against the database), plus the
        # name with hyphens removed. Escapes are used so the characters
        # cannot be lost by an editor or encoding round-trip.
        name_variants = [
            main_artist,
            main_artist.replace('-', '\u2010'),
            main_artist.replace('\u2010', '-'),
            main_artist.replace('-', ''),
        ]
        # Add name variations without a trailing number (e.g., "S Club 7" -> "S Club")
        for variant in list(name_variants):
            parts = variant.split()
            if len(parts) > 1 and parts[-1].isdigit():
                name_variants.append(' '.join(parts[:-1]))
        # Drop empty and repeated variants so each spelling is queried once.
        name_variants = list(dict.fromkeys(v for v in name_variants if v))

        fuzzy_queries = (
            ("SELECT name, gid FROM artist WHERE name ILIKE %s", 'fuzzy_name'),
            (
                """
                SELECT a.name, a.gid
                FROM artist a
                JOIN artist_alias aa ON a.id = aa.artist
                WHERE aa.name ILIKE %s
                """,
                'fuzzy_alias',
            ),
        )
        wanted = main_artist.lower()
        for variant in name_variants:
            pattern = like_pattern(variant)
            for sql, match_type in fuzzy_queries:
                self.cursor.execute(sql, (pattern,))
                for row in self.cursor.fetchall():
                    if row['gid'] in found:
                        continue
                    # NOTE(review): alias hits are scored against the artist's
                    # primary name, not the alias that matched - confirm this
                    # is intended.
                    score = fuzz.ratio(wanted, row['name'].lower()) / 100.0
                    if score >= 0.8:  # Only include good matches
                        remember(row, match_type, score)

        if not found:
            return []

        unique_artists = list(found.values())

        if recording_title:
            print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
            for artist in unique_artists:
                has_recording = count_recordings(artist['gid'], recording_title) > 0
                total_recordings = count_recordings(artist['gid'])
                artist['has_recording'] = has_recording
                artist['total_recordings'] = total_recordings
                status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING"
                print(f" 📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")
            # Sort by: has recording first, then by score, then by total recordings
            unique_artists.sort(key=lambda x: (
                not x['has_recording'],
                -x['score'],
                -x['total_recordings'],
            ))
        else:
            # Just get total recording counts for sorting
            for artist in unique_artists:
                artist['total_recordings'] = count_recordings(artist['gid'])
            unique_artists.sort(key=lambda x: (-x['score'], -x['total_recordings']))

        return unique_artists
    except Exception as e:
        # Best-effort lookup: report the problem and return no candidates.
        print(f"Error in fuzzy_search_artist_candidates: {e}")
        return []
def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]:
"""Get detailed artist information by MBID."""
if not self.connection:

View File

@ -31,9 +31,10 @@ from ..core.factory import DataProviderFactory
class MusicBrainzCleaner:
"""Enhanced MusicBrainz Cleaner with interface-based data access."""
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL):
def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
self.use_database = use_database
self.base_url = base_url
self.force = force
# Create data provider using factory
self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
@ -46,7 +47,14 @@ class MusicBrainzCleaner:
Clean a single song using the centralized song processor.
Returns (cleaned_song, success_status)
"""
return self.song_processor.clean_song(song)
song_copy = song.copy()
# If force flag is set, remove ALL existing MBIDs to force complete reprocessing
if self.force:
song_copy.pop('mbid', None)
song_copy.pop('recording_mbid', None)
return self.song_processor.clean_song(song_copy)
def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]:
"""
@ -74,7 +82,15 @@ class MusicBrainzCleaner:
total_songs = len(all_songs)
print(f'📊 Total songs to process: {total_songs:,}')
# Check if file is empty
if total_songs == 0:
print('⚠️ No songs to process - file is empty')
return {}
print(f'Using {"database" if self.use_database else "API"} connection')
if self.force:
print('🔄 Force mode enabled - ignoring all existing MBIDs')
print(PROGRESS_SEPARATOR)
# Initialize arrays for batch processing
@ -165,13 +181,17 @@ class MusicBrainzCleaner:
f.write(f"Total Songs Processed: {stats['total_processed']:,}\n")
f.write(f"Successful Songs: {len(successful_songs):,}\n")
f.write(f"Failed Songs: {len(failed_songs):,}\n")
f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n")
f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else "Success Rate: 0.0%")
f.write("\n\n")
f.write("DETAILED STATISTICS\n")
f.write("-" * 20 + "\n")
f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n")
f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n")
f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n")
f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} (0.0%")
f.write(")\n")
f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} (0.0%")
f.write(")\n")
f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second" if total_time > 0 else "Processing Speed: 0.0 songs/second")
f.write("\n\n")
f.write("OUTPUT FILES\n")
f.write("-" * 20 + "\n")
@ -193,7 +213,7 @@ class MusicBrainzCleaner:
# Save detailed JSON report for programmatic access
json_report_file = source_file.parent / f"processing_report_{timestamp}.json"
final_stats = {
'summary': {
'summary': {
'total_tested': stats['total_processed'],
'artists_found': stats['artists_found'],
'recordings_found': stats['recordings_found'],
@ -219,10 +239,10 @@ class MusicBrainzCleaner:
print(f'🎉 Processing completed!')
print(f'📊 Final Results:')
print(f' ⏱️ Total processing time: {total_time:.1f} seconds')
print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second')
print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)')
print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)')
print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)')
print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second' if total_time > 0 else ' 🚀 Average speed: 0.0 songs/second')
print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} (0.0%)')
print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} (0.0%)')
print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ❌ Failed songs: {len(failed_songs):,} (0.0%)')
print(f'📄 Files saved:')
print(f' ✅ Successful songs: {output_success}')
print(f' ❌ Failed songs: {output_failure}')
@ -245,6 +265,7 @@ OPTIONS:
--output-failure FILE Output file for failed songs (default: source-failure.json)
--limit N Process only the first N songs (default: all songs)
--use-api Force use of HTTP API instead of direct database access
--force Force reprocessing, ignoring all existing MBIDs
--test-connection Test connection to MusicBrainz server
--help, -h Show this help message
--version, -v Show version information
@ -268,6 +289,9 @@ EXAMPLES:
# Force API mode
musicbrainz-cleaner --use-api
# Force reprocessing (ignore all existing MBIDs)
musicbrainz-cleaner --force
REQUIREMENTS:
- MusicBrainz server running on http://localhost:5001
- PostgreSQL database accessible (for direct database access)
@ -309,6 +333,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
'limit': None,
'use_api': False,
'test_connection': False,
'force': False,
'help': False,
'version': False
}
@ -325,6 +350,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
parsed['test_connection'] = True
elif arg == '--use-api':
parsed['use_api'] = True
elif arg == '--force':
parsed['force'] = True
elif arg == '--source':
if i + 1 < len(args) and not args[i + 1].startswith('--'):
parsed['source'] = args[i + 1]
@ -410,7 +437,7 @@ def main() -> int:
return ExitCode.USAGE_ERROR
# Process the file
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'])
cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
return ExitCode.SUCCESS

View File

@ -22,19 +22,27 @@ class SongProcessor(SongProcessorInterface):
"""
self.data_provider = data_provider
def find_artist_mbid(self, artist_name: str) -> Optional[str]:
def find_artist_mbid(self, artist_name: str, recording_title: str = None) -> Optional[str]:
"""
Find artist MBID using consistent logic across all interfaces.
Args:
artist_name: Name of the artist to search for
recording_title: Optional recording title to help choose the correct artist
Returns:
Artist MBID if found, None otherwise
"""
if not artist_name:
return None
# If we have a recording title, use the improved search that considers the recording
if recording_title and hasattr(self.data_provider, 'fuzzy_search_artist_with_recording'):
result = self.data_provider.fuzzy_search_artist_with_recording(artist_name, recording_title)
if result and isinstance(result, tuple) and len(result) >= 2:
return result[1] # Return MBID from tuple (artist_name, mbid, score)
# Fallback to regular search
result = self.data_provider.fuzzy_search_artist(artist_name)
if result and isinstance(result, tuple) and len(result) >= 2:
return result[1] # Return MBID from tuple (artist_name, mbid, score)
@ -69,7 +77,39 @@ class SongProcessor(SongProcessorInterface):
if variation.lower() == known_title.lower():
return recording_mbid
# Handle collaborations using artist credit
# If we have an artist_mbid, try to find recording with that specific artist
if artist_mbid:
# Try to find recording with the specific artist MBID
for variation in title_variations:
result = self.data_provider.fuzzy_search_recording(variation, artist_mbid)
if result and isinstance(result, tuple) and len(result) >= 2:
return result[1] # Return recording MBID
# If no artist_mbid or recording not found, try multiple artist candidates
if original_artist and hasattr(self.data_provider, 'fuzzy_search_artist_candidates'):
print(f"🎯 Trying multiple artist candidates for '{original_artist}' with recording '{title}'...")
# Get multiple artist candidates
candidates = self.data_provider.fuzzy_search_artist_candidates(original_artist, title)
# Try each candidate to find the recording
for i, candidate in enumerate(candidates[:5]): # Limit to top 5 candidates
candidate_mbid = candidate['gid']
candidate_name = candidate['name']
candidate_score = candidate['score']
has_recording = candidate.get('has_recording', False)
print(f" 🔍 Trying candidate {i+1}: {candidate_name} (score: {candidate_score:.2f}, has recording: {has_recording})")
# Try to find recording with this artist candidate
for variation in title_variations:
result = self.data_provider.fuzzy_search_recording(variation, candidate_mbid)
if result and isinstance(result, tuple) and len(result) >= 2:
recording_mbid = result[1]
print(f" ✅ Found recording with candidate {i+1}: {candidate_name}")
return recording_mbid
# Handle collaborations using artist credit (fallback)
# If no artist_mbid (collaboration case), try to find by title and verify artist credit
if not artist_mbid and original_artist:
# This is a collaboration case, try to find by title with all variations
@ -141,8 +181,8 @@ class SongProcessor(SongProcessorInterface):
original_artist = song.get('artist', '')
original_title = song.get('title', '')
# Find artist MBID
artist_mbid = self.find_artist_mbid(song.get('artist', ''))
# Find artist MBID (pass recording title to help choose correct artist)
artist_mbid = self.find_artist_mbid(song.get('artist', ''), song.get('title', ''))
# Handle collaboration case (artist_mbid is None)
# Use the same complex collaboration parsing as the database
@ -188,8 +228,8 @@ class SongProcessor(SongProcessorInterface):
song['artist'] = artist_info['name']
song['mbid'] = artist_mbid
# Find recording MBID
recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''))
# Find recording MBID (pass original artist for multiple candidate search)
recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''), original_artist)
if not recording_mbid:
return song, False