From 543e7b71d4f58c3ee2ccafff62d88db012701e13 Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Fri, 1 Aug 2025 10:55:50 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- PRD.md | 34 +++- src/api/database.py | 393 +++++++++++++++++++++++++++++++++++++ src/cli/main.py | 51 +++-- src/core/song_processor.py | 54 ++++- 4 files changed, 509 insertions(+), 23 deletions(-) diff --git a/PRD.md b/PRD.md index 260304a..34ef33d 100644 --- a/PRD.md +++ b/PRD.md @@ -250,13 +250,15 @@ src/ - Manual configuration needed for custom artist/recording mappings - **NEW**: Some edge cases may require manual intervention (data quality issues) -### Test File Organization -- **REQUIRED**: All test files must be placed in `src/tests/` directory -- **PROHIBITED**: Test files should not be placed in the root directory +### Test File Organization - CRITICAL DIRECTIVE +- **REQUIRED**: All test files MUST be placed in `src/tests/` directory +- **PROHIBITED**: Test files should NEVER be placed in the root directory - **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns - **Purpose**: Keeps root directory clean and organizes test code properly - **Import Path**: Tests can import from parent modules using relative imports +**⚠️ CRITICAL ENFORCEMENT**: This directive is ABSOLUTE and NON-NEGOTIABLE. Any test files created in the root directory will be immediately removed from the root directory and moved to the correct location. + ### Using Tests for Issue Resolution - **FIRST STEP**: When encountering issues, check `src/tests/` directory for existing test files - **EXISTING TESTS**: Many common issues already have test cases that can help debug problems @@ -580,6 +582,8 @@ Test files often contain working code snippets that can be adapted: - Data processing logic - Error handling approaches +**⚠️ REMINDER**: All test files MUST be in `src/tests/` directory. NEVER create test files in the root directory. 
+ ## Lessons Learned ### Database Integration @@ -613,4 +617,26 @@ Test files often contain working code snippets that can be adapted: - **Resource Requirements**: MusicBrainz services require significant memory (8GB+ recommended) and disk space - **Platform Compatibility**: Apple Silicon (M1/M2) works but may show platform mismatch warnings - **Database Connection Issues**: Common startup problems include wrong host configuration and incomplete initialization -- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly \ No newline at end of file +- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly + +## CRITICAL PROJECT DIRECTIVE - TEST FILE ORGANIZATION + +**⚠️ ABSOLUTE REQUIREMENT - NON-NEGOTIABLE** + +### Test File Placement Rules +- **REQUIRED**: ALL test files MUST be placed in `src/tests/` directory +- **PROHIBITED**: Test files should NEVER be placed in the root directory +- **ENFORCEMENT**: Any test files created in the root directory will be immediately removed from the root directory and moved to the correct location +- **NON-NEGOTIABLE**: This directive is absolute and must be followed at all times + +### Why This Matters +- **Project Structure**: Keeps the root directory clean and organized +- **Code Organization**: Groups all test-related code in one location +- **Maintainability**: Makes it easier to find and manage test files +- **Best Practices**: Follows standard Python project structure conventions + +### Compliance Required +- **ALL developers** must follow this directive +- **ALL test files** must be in `src/tests/` +- **NO EXCEPTIONS** to this rule +- **IMMEDIATE CORRECTION** required for any violations \ No newline at end of file diff --git a/src/api/database.py b/src/api/database.py index d34d43c..d5e4fa8 100644 --- a/src/api/database.py +++ b/src/api/database.py @@ -461,6 +461,399 @@ class MusicBrainzDatabase(MusicBrainzDataProvider): print(f"Error in fuzzy_search_recording: {e}") 
return None + def fuzzy_search_artist_with_recording(self, artist_name: str, recording_title: str) -> Optional[Tuple[str, str, float]]: + """ + Fuzzy search for artist by name, prioritizing artists that have the specific recording. + Returns (artist_name, mbid, similarity_score) or None. + """ + if not self.connection: + return None + + try: + # Clean artist name + clean_name = artist_name.strip() + + # Handle collaborations - extract main artist + if 'ft.' in clean_name.lower(): + main_artist = clean_name.split('ft.')[0].strip() + elif 'feat.' in clean_name.lower(): + main_artist = clean_name.split('feat.')[0].strip() + else: + main_artist = clean_name + + # Find all potential artists (including aliases) + all_artists = [] + + # 1. Exact matches in artist names + query = "SELECT name, gid FROM artist WHERE name = %s" + self.cursor.execute(query, (main_artist,)) + exact_results = self.cursor.fetchall() + for result in exact_results: + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'exact_name', + 'score': 1.0 + }) + + # 2. Exact matches in artist aliases + query = """ + SELECT a.name, a.gid + FROM artist a + JOIN artist_alias aa ON a.id = aa.artist + WHERE aa.name = %s + """ + self.cursor.execute(query, (main_artist,)) + alias_results = self.cursor.fetchall() + for result in alias_results: + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'exact_alias', + 'score': 0.95 + }) + + # 3. Case-insensitive matches + query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)" + self.cursor.execute(query, (main_artist,)) + case_results = self.cursor.fetchall() + for result in case_results: + # Avoid duplicates + if not any(a['gid'] == result['gid'] for a in all_artists): + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'case_insensitive', + 'score': 0.99 + }) + + # 4. 
Fuzzy matches in names and aliases + name_variants = [main_artist] + # Add dash variants + name_variants.extend([ + main_artist.replace('-', '‐'), + main_artist.replace('‐', '-'), + ]) + + # Add name variations (e.g., "S Club 7" -> "S Club") + for variant in name_variants[:]: # Copy list to avoid modification during iteration + if ' ' in variant: + parts = variant.split() + if len(parts) > 1 and parts[-1].isdigit(): + name_variants.append(' '.join(parts[:-1])) + + # Search for fuzzy matches + for variant in name_variants: + # Search in artist names + query = "SELECT name, gid FROM artist WHERE name ILIKE %s" + self.cursor.execute(query, (f'%{variant}%',)) + fuzzy_results = self.cursor.fetchall() + for result in fuzzy_results: + if not any(a['gid'] == result['gid'] for a in all_artists): + score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0 + if score >= 0.8: # Only include good matches + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'fuzzy_name', + 'score': score + }) + + # Search in artist aliases + query = """ + SELECT a.name, a.gid + FROM artist a + JOIN artist_alias aa ON a.id = aa.artist + WHERE aa.name ILIKE %s + """ + self.cursor.execute(query, (f'%{variant}%',)) + fuzzy_alias_results = self.cursor.fetchall() + for result in fuzzy_alias_results: + if not any(a['gid'] == result['gid'] for a in all_artists): + score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0 + if score >= 0.8: # Only include good matches + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'fuzzy_alias', + 'score': score + }) + + if not all_artists: + return None + + # Remove duplicates and get recording information + unique_artists = [] + seen_mbids = set() + for artist in all_artists: + if artist['gid'] not in seen_mbids: + seen_mbids.add(artist['gid']) + unique_artists.append(artist) + + print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking 
for recording '{recording_title}'...") + + # Check which artists have the specific recording + artists_with_recording = [] + for artist in unique_artists: + # Check if this artist has the specific recording + query = """ + SELECT COUNT(r.id) as recording_count + FROM artist a + JOIN artist_credit_name acn ON a.id = acn.artist + JOIN artist_credit ac ON acn.artist_credit = ac.id + JOIN recording r ON ac.id = r.artist_credit + WHERE a.gid = %s AND r.name ILIKE %s + """ + self.cursor.execute(query, (artist['gid'], f'%{recording_title}%')) + result = self.cursor.fetchone() + has_recording = result['recording_count'] > 0 if result else False + + # Also get total recording count + query = """ + SELECT COUNT(r.id) as recording_count + FROM artist a + JOIN artist_credit_name acn ON a.id = acn.artist + JOIN artist_credit ac ON acn.artist_credit = ac.id + JOIN recording r ON ac.id = r.artist_credit + WHERE a.gid = %s + """ + self.cursor.execute(query, (artist['gid'],)) + total_result = self.cursor.fetchone() + total_recordings = total_result['recording_count'] if total_result else 0 + + artist_info = { + **artist, + 'has_recording': has_recording, + 'total_recordings': total_recordings + } + artists_with_recording.append(artist_info) + + status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING" + print(f" 📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}") + + # Prioritize artists that have the specific recording + artists_with_recording.sort(key=lambda x: ( + not x['has_recording'], # Artists with recording first + -x['score'], # Then by score (highest first) + -x['total_recordings'] # Then by total recordings (highest first) + )) + + if artists_with_recording: + best_artist = artists_with_recording[0] + print(f" 🎯 Selected: {best_artist['name']} (score: {best_artist['score']:.2f}, has recording: {best_artist['has_recording']})") + + print(SUCCESS_MESSAGES['fuzzy_match_found'].format( + original=artist_name, + 
matched=best_artist['name'], + score=best_artist['score'] + )) + return (best_artist['name'], best_artist['gid'], best_artist['score']) + + return None + + except Exception as e: + print(f"Error in fuzzy_search_artist_with_recording: {e}") + return None + + def fuzzy_search_artist_candidates(self, artist_name: str, recording_title: str = None) -> List[Dict[str, Any]]: + """ + Fuzzy search for artist by name, returning multiple candidates. + If recording_title is provided, prioritizes artists that have the specific recording. + Returns list of artist candidates with their details. + """ + if not self.connection: + return [] + + try: + # Clean artist name + clean_name = artist_name.strip() + + # Handle collaborations - extract main artist + if 'ft.' in clean_name.lower(): + main_artist = clean_name.split('ft.')[0].strip() + elif 'feat.' in clean_name.lower(): + main_artist = clean_name.split('feat.')[0].strip() + else: + main_artist = clean_name + + # Find all potential artists (including aliases) + all_artists = [] + + # 1. Exact matches in artist names + query = "SELECT name, gid FROM artist WHERE name = %s" + self.cursor.execute(query, (main_artist,)) + exact_results = self.cursor.fetchall() + for result in exact_results: + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'exact_name', + 'score': 1.0 + }) + + # 2. Exact matches in artist aliases + query = """ + SELECT a.name, a.gid + FROM artist a + JOIN artist_alias aa ON a.id = aa.artist + WHERE aa.name = %s + """ + self.cursor.execute(query, (main_artist,)) + alias_results = self.cursor.fetchall() + for result in alias_results: + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'exact_alias', + 'score': 0.95 + }) + + # 3. 
Case-insensitive matches + query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)" + self.cursor.execute(query, (main_artist,)) + case_results = self.cursor.fetchall() + for result in case_results: + # Avoid duplicates + if not any(a['gid'] == result['gid'] for a in all_artists): + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'case_insensitive', + 'score': 0.99 + }) + + # 4. Fuzzy matches in names and aliases + name_variants = [main_artist] + # Add dash variants + name_variants.extend([ + main_artist.replace('-', '‐'), + main_artist.replace('‐', '-'), + ]) + + # Add name variations (e.g., "S Club 7" -> "S Club") + for variant in name_variants[:]: # Copy list to avoid modification during iteration + if ' ' in variant: + parts = variant.split() + if len(parts) > 1 and parts[-1].isdigit(): + name_variants.append(' '.join(parts[:-1])) + + # Search for fuzzy matches + for variant in name_variants: + # Search in artist names + query = "SELECT name, gid FROM artist WHERE name ILIKE %s" + self.cursor.execute(query, (f'%{variant}%',)) + fuzzy_results = self.cursor.fetchall() + for result in fuzzy_results: + if not any(a['gid'] == result['gid'] for a in all_artists): + score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0 + if score >= 0.8: # Only include good matches + all_artists.append({ + 'name': result['name'], + 'gid': result['gid'], + 'match_type': 'fuzzy_name', + 'score': score + }) + + # Search in artist aliases + query = """ + SELECT a.name, a.gid + FROM artist a + JOIN artist_alias aa ON a.id = aa.artist + WHERE aa.name ILIKE %s + """ + self.cursor.execute(query, (f'%{variant}%',)) + fuzzy_alias_results = self.cursor.fetchall() + for result in fuzzy_alias_results: + if not any(a['gid'] == result['gid'] for a in all_artists): + score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0 + if score >= 0.8: # Only include good matches + all_artists.append({ + 'name': result['name'], 
+ 'gid': result['gid'], + 'match_type': 'fuzzy_alias', + 'score': score + }) + + if not all_artists: + return [] + + # Remove duplicates and get recording information + unique_artists = [] + seen_mbids = set() + for artist in all_artists: + if artist['gid'] not in seen_mbids: + seen_mbids.add(artist['gid']) + unique_artists.append(artist) + + # If we have a recording title, check which artists have it + if recording_title: + print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...") + + # Check which artists have the specific recording + for artist in unique_artists: + # Check if this artist has the specific recording + query = """ + SELECT COUNT(r.id) as recording_count + FROM artist a + JOIN artist_credit_name acn ON a.id = acn.artist + JOIN artist_credit ac ON acn.artist_credit = ac.id + JOIN recording r ON ac.id = r.artist_credit + WHERE a.gid = %s AND r.name ILIKE %s + """ + self.cursor.execute(query, (artist['gid'], f'%{recording_title}%')) + result = self.cursor.fetchone() + has_recording = result['recording_count'] > 0 if result else False + + # Also get total recording count + query = """ + SELECT COUNT(r.id) as recording_count + FROM artist a + JOIN artist_credit_name acn ON a.id = acn.artist + JOIN artist_credit ac ON acn.artist_credit = ac.id + JOIN recording r ON ac.id = r.artist_credit + WHERE a.gid = %s + """ + self.cursor.execute(query, (artist['gid'],)) + total_result = self.cursor.fetchone() + total_recordings = total_result['recording_count'] if total_result else 0 + + artist['has_recording'] = has_recording + artist['total_recordings'] = total_recordings + + status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING" + print(f" 📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}") + + # Sort by: has recording first, then by score, then by total recordings + unique_artists.sort(key=lambda x: ( + not x.get('has_recording', False), # Artists 
with recording first + -x['score'], # Then by score (highest first) + -x.get('total_recordings', 0) # Then by total recordings (highest first) + )) + else: + # Just get total recording counts for sorting + for artist in unique_artists: + query = """ + SELECT COUNT(r.id) as recording_count + FROM artist a + JOIN artist_credit_name acn ON a.id = acn.artist + JOIN artist_credit ac ON acn.artist_credit = ac.id + JOIN recording r ON ac.id = r.artist_credit + WHERE a.gid = %s + """ + self.cursor.execute(query, (artist['gid'],)) + total_result = self.cursor.fetchone() + total_recordings = total_result['recording_count'] if total_result else 0 + artist['total_recordings'] = total_recordings + + # Sort by score and total recordings + unique_artists.sort(key=lambda x: (-x['score'], -x['total_recordings'])) + + return unique_artists + + except Exception as e: + print(f"Error in fuzzy_search_artist_candidates: {e}") + return [] + def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]: """Get detailed artist information by MBID.""" if not self.connection: diff --git a/src/cli/main.py b/src/cli/main.py index 9b3e012..a14ce6d 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -31,9 +31,10 @@ from ..core.factory import DataProviderFactory class MusicBrainzCleaner: """Enhanced MusicBrainz Cleaner with interface-based data access.""" - def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL): + def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False): self.use_database = use_database self.base_url = base_url + self.force = force # Create data provider using factory self.data_provider = DataProviderFactory.create_provider(use_database, base_url) @@ -46,7 +47,14 @@ class MusicBrainzCleaner: Clean a single song using the centralized song processor. 
Returns (cleaned_song, success_status) """ - return self.song_processor.clean_song(song) + song_copy = song.copy() + + # If force flag is set, remove ALL existing MBIDs to force complete reprocessing + if self.force: + song_copy.pop('mbid', None) + song_copy.pop('recording_mbid', None) + + return self.song_processor.clean_song(song_copy) def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]: """ @@ -74,7 +82,15 @@ class MusicBrainzCleaner: total_songs = len(all_songs) print(f'📊 Total songs to process: {total_songs:,}') + + # Check if file is empty + if total_songs == 0: + print('⚠️ No songs to process - file is empty') + return {} + print(f'Using {"database" if self.use_database else "API"} connection') + if self.force: + print('🔄 Force mode enabled - ignoring all existing MBIDs') print(PROGRESS_SEPARATOR) # Initialize arrays for batch processing @@ -165,13 +181,17 @@ class MusicBrainzCleaner: f.write(f"Total Songs Processed: {stats['total_processed']:,}\n") f.write(f"Successful Songs: {len(successful_songs):,}\n") f.write(f"Failed Songs: {len(failed_songs):,}\n") - f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n") + f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else "Success Rate: 0.0%") + f.write("\n\n") f.write("DETAILED STATISTICS\n") f.write("-" * 20 + "\n") - f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n") - f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n") - f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n") + f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} 
({(stats['artists_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} (0.0%") + f.write(")\n") + f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} (0.0%") + f.write(")\n") + f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second" if total_time > 0 else "Processing Speed: 0.0 songs/second") + f.write("\n\n") f.write("OUTPUT FILES\n") f.write("-" * 20 + "\n") @@ -193,7 +213,7 @@ class MusicBrainzCleaner: # Save detailed JSON report for programmatic access json_report_file = source_file.parent / f"processing_report_{timestamp}.json" final_stats = { - 'summary': { + 'summary': { 'total_tested': stats['total_processed'], 'artists_found': stats['artists_found'], 'recordings_found': stats['recordings_found'], @@ -219,10 +239,10 @@ class MusicBrainzCleaner: print(f'🎉 Processing completed!') print(f'📊 Final Results:') print(f' ⏱️ Total processing time: {total_time:.1f} seconds') - print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second') - print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)') - print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)') - print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)') + print(f' 🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second' if total_time > 0 else ' 🚀 Average speed: 0.0 songs/second') + print(f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} 
({stats["artists_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} (0.0%)') + print(f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} (0.0%)') + print(f' ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f' ❌ Failed songs: {len(failed_songs):,} (0.0%)') print(f'📄 Files saved:') print(f' ✅ Successful songs: {output_success}') print(f' ❌ Failed songs: {output_failure}') @@ -245,6 +265,7 @@ OPTIONS: --output-failure FILE Output file for failed songs (default: source-failure.json) --limit N Process only the first N songs (default: all songs) --use-api Force use of HTTP API instead of direct database access + --force Force reprocessing, ignoring all existing MBIDs --test-connection Test connection to MusicBrainz server --help, -h Show this help message --version, -v Show version information @@ -268,6 +289,9 @@ EXAMPLES: # Force API mode musicbrainz-cleaner --use-api + # Force reprocessing (ignore all existing MBIDs) + musicbrainz-cleaner --force + REQUIREMENTS: - MusicBrainz server running on http://localhost:5001 - PostgreSQL database accessible (for direct database access) @@ -309,6 +333,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]: 'limit': None, 'use_api': False, 'test_connection': False, + 'force': False, 'help': False, 'version': False } @@ -325,6 +350,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]: parsed['test_connection'] = True elif arg == '--use-api': parsed['use_api'] = True + elif arg == '--force': + parsed['force'] = True elif arg == '--source': if i + 1 < len(args) and not args[i + 1].startswith('--'): parsed['source'] = args[i + 
1] @@ -410,7 +437,7 @@ def main() -> int: return ExitCode.USAGE_ERROR # Process the file - cleaner = MusicBrainzCleaner(use_database=not parsed['use_api']) + cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force']) cleaner.process_songs(source_file, output_success, output_failure, parsed['limit']) return ExitCode.SUCCESS diff --git a/src/core/song_processor.py b/src/core/song_processor.py index 763539a..d50db72 100644 --- a/src/core/song_processor.py +++ b/src/core/song_processor.py @@ -22,19 +22,27 @@ class SongProcessor(SongProcessorInterface): """ self.data_provider = data_provider - def find_artist_mbid(self, artist_name: str) -> Optional[str]: + def find_artist_mbid(self, artist_name: str, recording_title: str = None) -> Optional[str]: """ Find artist MBID using consistent logic across all interfaces. Args: artist_name: Name of the artist to search for + recording_title: Optional recording title to help choose the correct artist Returns: Artist MBID if found, None otherwise """ if not artist_name: return None - + + # If we have a recording title, use the improved search that considers the recording + if recording_title and hasattr(self.data_provider, 'fuzzy_search_artist_with_recording'): + result = self.data_provider.fuzzy_search_artist_with_recording(artist_name, recording_title) + if result and isinstance(result, tuple) and len(result) >= 2: + return result[1] # Return MBID from tuple (artist_name, mbid, score) + + # Fallback to regular search result = self.data_provider.fuzzy_search_artist(artist_name) if result and isinstance(result, tuple) and len(result) >= 2: return result[1] # Return MBID from tuple (artist_name, mbid, score) @@ -69,7 +77,39 @@ class SongProcessor(SongProcessorInterface): if variation.lower() == known_title.lower(): return recording_mbid - # Handle collaborations using artist credit + # If we have an artist_mbid, try to find recording with that specific artist + if artist_mbid: + # Try to find recording 
with the specific artist MBID + for variation in title_variations: + result = self.data_provider.fuzzy_search_recording(variation, artist_mbid) + if result and isinstance(result, tuple) and len(result) >= 2: + return result[1] # Return recording MBID + + # If no artist_mbid or recording not found, try multiple artist candidates + if original_artist and hasattr(self.data_provider, 'fuzzy_search_artist_candidates'): + print(f"🎯 Trying multiple artist candidates for '{original_artist}' with recording '{title}'...") + + # Get multiple artist candidates + candidates = self.data_provider.fuzzy_search_artist_candidates(original_artist, title) + + # Try each candidate to find the recording + for i, candidate in enumerate(candidates[:5]): # Limit to top 5 candidates + candidate_mbid = candidate['gid'] + candidate_name = candidate['name'] + candidate_score = candidate['score'] + has_recording = candidate.get('has_recording', False) + + print(f" 🔍 Trying candidate {i+1}: {candidate_name} (score: {candidate_score:.2f}, has recording: {has_recording})") + + # Try to find recording with this artist candidate + for variation in title_variations: + result = self.data_provider.fuzzy_search_recording(variation, candidate_mbid) + if result and isinstance(result, tuple) and len(result) >= 2: + recording_mbid = result[1] + print(f" ✅ Found recording with candidate {i+1}: {candidate_name}") + return recording_mbid + + # Handle collaborations using artist credit (fallback) # If no artist_mbid (collaboration case), try to find by title and verify artist credit if not artist_mbid and original_artist: # This is a collaboration case, try to find by title with all variations @@ -141,8 +181,8 @@ class SongProcessor(SongProcessorInterface): original_artist = song.get('artist', '') original_title = song.get('title', '') - # Find artist MBID - artist_mbid = self.find_artist_mbid(song.get('artist', '')) + # Find artist MBID (pass recording title to help choose correct artist) + artist_mbid = 
self.find_artist_mbid(song.get('artist', ''), song.get('title', '')) # Handle collaboration case (artist_mbid is None) # Use the same complex collaboration parsing as the database @@ -188,8 +228,8 @@ class SongProcessor(SongProcessorInterface): song['artist'] = artist_info['name'] song['mbid'] = artist_mbid - # Find recording MBID - recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', '')) + # Find recording MBID (pass original artist for multiple candidate search) + recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''), original_artist) if not recording_mbid: return song, False