Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-08-01 10:55:50 -05:00 · 2025-08-01 10:55:50 -05:00 · 543e7b71d4
commit 543e7b71d4
parent ec33253403
4 changed files with 509 additions and 23 deletions
--- a/PRD.md
+++ b/PRD.md
@@ -250,13 +250,15 @@ src/
 - Manual configuration needed for custom artist/recording mappings
 - **NEW**: Some edge cases may require manual intervention (data quality issues)

-### Test File Organization
-- **REQUIRED**: All test files must be placed in `src/tests/` directory
-- **PROHIBITED**: Test files should not be placed in the root directory
+### Test File Organization - CRITICAL DIRECTIVE
+- **REQUIRED**: All test files MUST be placed in `src/tests/` directory
+- **PROHIBITED**: Test files should NEVER be placed in the root directory
 - **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns
 - **Purpose**: Keeps root directory clean and organizes test code properly
 - **Import Path**: Tests can import from parent modules using relative imports

+**⚠️ CRITICAL ENFORCEMENT**: This directive is ABSOLUTE and NON-NEGOTIABLE. Any test files created in the root directory will be immediately removed from the root and relocated to `src/tests/`.
+
 ### Using Tests for Issue Resolution
 - **FIRST STEP**: When encountering issues, check `src/tests/` directory for existing test files
 - **EXISTING TESTS**: Many common issues already have test cases that can help debug problems
@@ -580,6 +582,8 @@ Test files often contain working code snippets that can be adapted:
 - Data processing logic
 - Error handling approaches

+**⚠️ REMINDER**: All test files MUST be in `src/tests/` directory. NEVER create test files in the root directory.
+
 ## Lessons Learned

 ### Database Integration
@@ -613,4 +617,26 @@ Test files often contain working code snippets that can be adapted:
 - **Resource Requirements**: MusicBrainz services require significant memory (8GB+ recommended) and disk space
 - **Platform Compatibility**: Apple Silicon (M1/M2) works but may show platform mismatch warnings
 - **Database Connection Issues**: Common startup problems include wrong host configuration and incomplete initialization
-- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly 
+- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
+
+## CRITICAL PROJECT DIRECTIVE - TEST FILE ORGANIZATION
+
+**⚠️ ABSOLUTE REQUIREMENT - NON-NEGOTIABLE**
+
+### Test File Placement Rules
+- **REQUIRED**: ALL test files MUST be placed in `src/tests/` directory
+- **PROHIBITED**: Test files should NEVER be placed in the root directory
+- **ENFORCEMENT**: Any test files created in the root directory will be immediately removed from the root and relocated to `src/tests/`
+- **NON-NEGOTIABLE**: This directive is absolute and must be followed at all times
+
+### Why This Matters
+- **Project Structure**: Keeps the root directory clean and organized
+- **Code Organization**: Groups all test-related code in one location
+- **Maintainability**: Makes it easier to find and manage test files
+- **Best Practices**: Follows standard Python project structure conventions
+
+### Compliance Required
+- **ALL developers** must follow this directive
+- **ALL test files** must be in `src/tests/`
+- **NO EXCEPTIONS** to this rule
+- **IMMEDIATE CORRECTION** required for any violations 
--- a/src/api/database.py
+++ b/src/api/database.py
@@ -461,6 +461,399 @@ class MusicBrainzDatabase(MusicBrainzDataProvider):
            print(f"Error in fuzzy_search_recording: {e}")
            return None
    
+    def fuzzy_search_artist_with_recording(self, artist_name: str, recording_title: str) -> Optional[Tuple[str, str, float]]:
+        """
+        Fuzzy search for artist by name, prioritizing artists that have the specific recording.
+        Returns (artist_name, mbid, similarity_score) or None.
+        """
+        if not self.connection:
+            return None
+        
+        try:
+            # Clean artist name
+            clean_name = artist_name.strip()
+            
+            # Handle collaborations - extract main artist
+            if 'ft.' in clean_name.lower():
+                main_artist = clean_name.split('ft.')[0].strip()
+            elif 'feat.' in clean_name.lower():
+                main_artist = clean_name.split('feat.')[0].strip()
+            else:
+                main_artist = clean_name
+            
+            # Find all potential artists (including aliases)
+            all_artists = []
+            
+            # 1. Exact matches in artist names
+            query = "SELECT name, gid FROM artist WHERE name = %s"
+            self.cursor.execute(query, (main_artist,))
+            exact_results = self.cursor.fetchall()
+            for result in exact_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_name',
+                    'score': 1.0
+                })
+            
+            # 2. Exact matches in artist aliases
+            query = """
+                SELECT a.name, a.gid
+                FROM artist a
+                JOIN artist_alias aa ON a.id = aa.artist
+                WHERE aa.name = %s
+            """
+            self.cursor.execute(query, (main_artist,))
+            alias_results = self.cursor.fetchall()
+            for result in alias_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_alias',
+                    'score': 0.95
+                })
+            
+            # 3. Case-insensitive matches
+            query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)"
+            self.cursor.execute(query, (main_artist,))
+            case_results = self.cursor.fetchall()
+            for result in case_results:
+                # Avoid duplicates
+                if not any(a['gid'] == result['gid'] for a in all_artists):
+                    all_artists.append({
+                        'name': result['name'],
+                        'gid': result['gid'],
+                        'match_type': 'case_insensitive',
+                        'score': 0.99
+                    })
+            
+            # 4. Fuzzy matches in names and aliases
+            name_variants = [main_artist]
+            # Add dash variants
+            name_variants.extend([
+                main_artist.replace('-', '‐'),
+                main_artist.replace('‐', '-'),
+            ])
+            
+            # Add name variations (e.g., "S Club 7" -> "S Club")
+            for variant in name_variants[:]:  # Copy list to avoid modification during iteration
+                if ' ' in variant:
+                    parts = variant.split()
+                    if len(parts) > 1 and parts[-1].isdigit():
+                        name_variants.append(' '.join(parts[:-1]))
+            
+            # Search for fuzzy matches
+            for variant in name_variants:
+                # Search in artist names
+                query = "SELECT name, gid FROM artist WHERE name ILIKE %s"
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_results = self.cursor.fetchall()
+                for result in fuzzy_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_name',
+                                'score': score
+                            })
+                
+                # Search in artist aliases
+                query = """
+                    SELECT a.name, a.gid 
+                    FROM artist a
+                    JOIN artist_alias aa ON a.id = aa.artist
+                    WHERE aa.name ILIKE %s
+                """
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_alias_results = self.cursor.fetchall()
+                for result in fuzzy_alias_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_alias',
+                                'score': score
+                            })
+            
+            if not all_artists:
+                return None
+            
+            # Remove duplicates and get recording information
+            unique_artists = []
+            seen_mbids = set()
+            for artist in all_artists:
+                if artist['gid'] not in seen_mbids:
+                    seen_mbids.add(artist['gid'])
+                    unique_artists.append(artist)
+            
+            print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
+            
+            # Check which artists have the specific recording
+            artists_with_recording = []
+            for artist in unique_artists:
+                # Check if this artist has the specific recording
+                query = """
+                    SELECT COUNT(r.id) as recording_count
+                    FROM artist a
+                    JOIN artist_credit_name acn ON a.id = acn.artist
+                    JOIN artist_credit ac ON acn.artist_credit = ac.id
+                    JOIN recording r ON ac.id = r.artist_credit
+                    WHERE a.gid = %s AND r.name ILIKE %s
+                """
+                self.cursor.execute(query, (artist['gid'], f'%{recording_title}%'))
+                result = self.cursor.fetchone()
+                has_recording = result['recording_count'] > 0 if result else False
+                
+                # Also get total recording count
+                query = """
+                    SELECT COUNT(r.id) as recording_count
+                    FROM artist a
+                    JOIN artist_credit_name acn ON a.id = acn.artist
+                    JOIN artist_credit ac ON acn.artist_credit = ac.id
+                    JOIN recording r ON ac.id = r.artist_credit
+                    WHERE a.gid = %s
+                """
+                self.cursor.execute(query, (artist['gid'],))
+                total_result = self.cursor.fetchone()
+                total_recordings = total_result['recording_count'] if total_result else 0
+                
+                artist_info = {
+                    **artist,
+                    'has_recording': has_recording,
+                    'total_recordings': total_recordings
+                }
+                artists_with_recording.append(artist_info)
+                
+                status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING"
+                print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")
+            
+            # Prioritize artists that have the specific recording
+            artists_with_recording.sort(key=lambda x: (
+                not x['has_recording'],  # Artists with recording first
+                -x['score'],            # Then by score (highest first)
+                -x['total_recordings']  # Then by total recordings (highest first)
+            ))
+            
+            if artists_with_recording:
+                best_artist = artists_with_recording[0]
+                print(f"  🎯 Selected: {best_artist['name']} (score: {best_artist['score']:.2f}, has recording: {best_artist['has_recording']})")
+                
+                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
+                    original=artist_name,
+                    matched=best_artist['name'],
+                    score=best_artist['score']
+                ))
+                return (best_artist['name'], best_artist['gid'], best_artist['score'])
+            
+            return None
+            
+        except Exception as e:
+            print(f"Error in fuzzy_search_artist_with_recording: {e}")
+            return None
+    
+    def fuzzy_search_artist_candidates(self, artist_name: str, recording_title: str = None) -> List[Dict[str, Any]]:
+        """
+        Fuzzy search for artist by name, returning multiple candidates.
+        If recording_title is provided, prioritizes artists that have the specific recording.
+        Returns list of artist candidates with their details.
+        """
+        if not self.connection:
+            return []
+        
+        try:
+            # Clean artist name
+            clean_name = artist_name.strip()
+            
+            # Handle collaborations - extract main artist
+            if 'ft.' in clean_name.lower():
+                main_artist = clean_name.split('ft.')[0].strip()
+            elif 'feat.' in clean_name.lower():
+                main_artist = clean_name.split('feat.')[0].strip()
+            else:
+                main_artist = clean_name
+            
+            # Find all potential artists (including aliases)
+            all_artists = []
+            
+            # 1. Exact matches in artist names
+            query = "SELECT name, gid FROM artist WHERE name = %s"
+            self.cursor.execute(query, (main_artist,))
+            exact_results = self.cursor.fetchall()
+            for result in exact_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_name',
+                    'score': 1.0
+                })
+            
+            # 2. Exact matches in artist aliases
+            query = """
+                SELECT a.name, a.gid
+                FROM artist a
+                JOIN artist_alias aa ON a.id = aa.artist
+                WHERE aa.name = %s
+            """
+            self.cursor.execute(query, (main_artist,))
+            alias_results = self.cursor.fetchall()
+            for result in alias_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_alias',
+                    'score': 0.95
+                })
+            
+            # 3. Case-insensitive matches
+            query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)"
+            self.cursor.execute(query, (main_artist,))
+            case_results = self.cursor.fetchall()
+            for result in case_results:
+                # Avoid duplicates
+                if not any(a['gid'] == result['gid'] for a in all_artists):
+                    all_artists.append({
+                        'name': result['name'],
+                        'gid': result['gid'],
+                        'match_type': 'case_insensitive',
+                        'score': 0.99
+                    })
+            
+            # 4. Fuzzy matches in names and aliases
+            name_variants = [main_artist]
+            # Add dash variants
+            name_variants.extend([
+                main_artist.replace('-', '‐'),
+                main_artist.replace('‐', '-'),
+            ])
+            
+            # Add name variations (e.g., "S Club 7" -> "S Club")
+            for variant in name_variants[:]:  # Copy list to avoid modification during iteration
+                if ' ' in variant:
+                    parts = variant.split()
+                    if len(parts) > 1 and parts[-1].isdigit():
+                        name_variants.append(' '.join(parts[:-1]))
+            
+            # Search for fuzzy matches
+            for variant in name_variants:
+                # Search in artist names
+                query = "SELECT name, gid FROM artist WHERE name ILIKE %s"
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_results = self.cursor.fetchall()
+                for result in fuzzy_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_name',
+                                'score': score
+                            })
+                
+                # Search in artist aliases
+                query = """
+                    SELECT a.name, a.gid 
+                    FROM artist a
+                    JOIN artist_alias aa ON a.id = aa.artist
+                    WHERE aa.name ILIKE %s
+                """
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_alias_results = self.cursor.fetchall()
+                for result in fuzzy_alias_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_alias',
+                                'score': score
+                            })
+            
+            if not all_artists:
+                return []
+            
+            # Remove duplicates and get recording information
+            unique_artists = []
+            seen_mbids = set()
+            for artist in all_artists:
+                if artist['gid'] not in seen_mbids:
+                    seen_mbids.add(artist['gid'])
+                    unique_artists.append(artist)
+            
+            # If we have a recording title, check which artists have it
+            if recording_title:
+                print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
+                
+                # Check which artists have the specific recording
+                for artist in unique_artists:
+                    # Check if this artist has the specific recording
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s AND r.name ILIKE %s
+                    """
+                    self.cursor.execute(query, (artist['gid'], f'%{recording_title}%'))
+                    result = self.cursor.fetchone()
+                    has_recording = result['recording_count'] > 0 if result else False
+                    
+                    # Also get total recording count
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s
+                    """
+                    self.cursor.execute(query, (artist['gid'],))
+                    total_result = self.cursor.fetchone()
+                    total_recordings = total_result['recording_count'] if total_result else 0
+                    
+                    artist['has_recording'] = has_recording
+                    artist['total_recordings'] = total_recordings
+                    
+                    status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING"
+                    print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")
+                
+                # Sort by: has recording first, then by score, then by total recordings
+                unique_artists.sort(key=lambda x: (
+                    not x.get('has_recording', False),  # Artists with recording first
+                    -x['score'],                       # Then by score (highest first)
+                    -x.get('total_recordings', 0)      # Then by total recordings (highest first)
+                ))
+            else:
+                # Just get total recording counts for sorting
+                for artist in unique_artists:
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s
+                    """
+                    self.cursor.execute(query, (artist['gid'],))
+                    total_result = self.cursor.fetchone()
+                    total_recordings = total_result['recording_count'] if total_result else 0
+                    artist['total_recordings'] = total_recordings
+                
+                # Sort by score and total recordings
+                unique_artists.sort(key=lambda x: (-x['score'], -x['total_recordings']))
+            
+            return unique_artists
+            
+        except Exception as e:
+            print(f"Error in fuzzy_search_artist_candidates: {e}")
+            return []
+    
    def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]:
        """Get detailed artist information by MBID."""
        if not self.connection:
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -31,9 +31,10 @@ from ..core.factory import DataProviderFactory
 class MusicBrainzCleaner:
    """Enhanced MusicBrainz Cleaner with interface-based data access."""
    
-    def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL):
+    def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
        self.use_database = use_database
        self.base_url = base_url
+        self.force = force
        
        # Create data provider using factory
        self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
@@ -46,7 +47,14 @@ class MusicBrainzCleaner:
        Clean a single song using the centralized song processor.
        Returns (cleaned_song, success_status)
        """
-        return self.song_processor.clean_song(song)
+        song_copy = song.copy()
+        
+        # If force flag is set, remove ALL existing MBIDs to force complete reprocessing
+        if self.force:
+            song_copy.pop('mbid', None)
+            song_copy.pop('recording_mbid', None)
+                
+        return self.song_processor.clean_song(song_copy)
    
    def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]:
        """
@@ -74,7 +82,15 @@ class MusicBrainzCleaner:
        
        total_songs = len(all_songs)
        print(f'📊 Total songs to process: {total_songs:,}')
+        
+        # Check if file is empty
+        if total_songs == 0:
+            print('⚠️  No songs to process - file is empty')
+            return {}
+            
        print(f'Using {"database" if self.use_database else "API"} connection')
+        if self.force:
+            print('🔄 Force mode enabled - ignoring all existing MBIDs')
        print(PROGRESS_SEPARATOR)
        
        # Initialize arrays for batch processing
@@ -165,13 +181,17 @@ class MusicBrainzCleaner:
            f.write(f"Total Songs Processed: {stats['total_processed']:,}\n")
            f.write(f"Successful Songs: {len(successful_songs):,}\n")
            f.write(f"Failed Songs: {len(failed_songs):,}\n")
-            f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n")
+            f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else "Success Rate: 0.0%")
+            f.write("\n\n")
            
            f.write("DETAILED STATISTICS\n")
            f.write("-" * 20 + "\n")
-            f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n")
-            f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n")
-            f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n")
+            f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} (0.0%")
+            f.write(")\n")
+            f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} (0.0%")
+            f.write(")\n")
+            f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second" if total_time > 0 else "Processing Speed: 0.0 songs/second")
+            f.write("\n\n")
            
            f.write("OUTPUT FILES\n")
            f.write("-" * 20 + "\n")
@@ -193,7 +213,7 @@ class MusicBrainzCleaner:
        # Save detailed JSON report for programmatic access
        json_report_file = source_file.parent / f"processing_report_{timestamp}.json"
        final_stats = {
-            'summary': {
+                    'summary': {
                'total_tested': stats['total_processed'],
                'artists_found': stats['artists_found'],
                'recordings_found': stats['recordings_found'],
@@ -219,10 +239,10 @@ class MusicBrainzCleaner:
        print(f'🎉 Processing completed!')
        print(f'📊 Final Results:')
        print(f'  ⏱️  Total processing time: {total_time:.1f} seconds')
-        print(f'  🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second')
-        print(f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)')
-        print(f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)')
-        print(f'  ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)')
+        print(f'  🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second' if total_time > 0 else '  🚀 Average speed: 0.0 songs/second')
+        print(f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} (0.0%)')
+        print(f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} (0.0%)')
+        print(f'  ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ❌ Failed songs: {len(failed_songs):,} (0.0%)')
        print(f'📄 Files saved:')
        print(f'  ✅ Successful songs: {output_success}')
        print(f'  ❌ Failed songs: {output_failure}')
@@ -245,6 +265,7 @@ OPTIONS:
    --output-failure FILE      Output file for failed songs (default: source-failure.json)
    --limit N                  Process only the first N songs (default: all songs)
    --use-api                  Force use of HTTP API instead of direct database access
+    --force                    Force reprocessing, ignoring all existing MBIDs
    --test-connection          Test connection to MusicBrainz server
    --help, -h                 Show this help message
    --version, -v              Show version information
@@ -268,6 +289,9 @@ EXAMPLES:
    # Force API mode
    musicbrainz-cleaner --use-api

+    # Force reprocessing (ignore all existing MBIDs)
+    musicbrainz-cleaner --force
+
 REQUIREMENTS:
    - MusicBrainz server running on http://localhost:5001
    - PostgreSQL database accessible (for direct database access)
@@ -309,6 +333,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
        'limit': None,
        'use_api': False,
        'test_connection': False,
+        'force': False,
        'help': False,
        'version': False
    }
@@ -325,6 +350,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
            parsed['test_connection'] = True
        elif arg == '--use-api':
            parsed['use_api'] = True
+        elif arg == '--force':
+            parsed['force'] = True
        elif arg == '--source':
            if i + 1 < len(args) and not args[i + 1].startswith('--'):
                parsed['source'] = args[i + 1]
@@ -410,7 +437,7 @@ def main() -> int:
            return ExitCode.USAGE_ERROR
        
        # Process the file
-        cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'])
+        cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
        cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
        
        return ExitCode.SUCCESS
--- a/src/core/song_processor.py
+++ b/src/core/song_processor.py
@@ -22,19 +22,27 @@ class SongProcessor(SongProcessorInterface):
        """
        self.data_provider = data_provider
    
-    def find_artist_mbid(self, artist_name: str) -> Optional[str]:
+    def find_artist_mbid(self, artist_name: str, recording_title: str = None) -> Optional[str]:
        """
        Find artist MBID using consistent logic across all interfaces.
        
        Args:
            artist_name: Name of the artist to search for
+            recording_title: Optional recording title to help choose the correct artist
            
        Returns:
            Artist MBID if found, None otherwise
        """
        if not artist_name:
            return None
-            
+        
+        # If we have a recording title, use the improved search that considers the recording
+        if recording_title and hasattr(self.data_provider, 'fuzzy_search_artist_with_recording'):
+            result = self.data_provider.fuzzy_search_artist_with_recording(artist_name, recording_title)
+            if result and isinstance(result, tuple) and len(result) >= 2:
+                return result[1]  # Return MBID from tuple (artist_name, mbid, score)
+        
+        # Fallback to regular search
        result = self.data_provider.fuzzy_search_artist(artist_name)
        if result and isinstance(result, tuple) and len(result) >= 2:
            return result[1]  # Return MBID from tuple (artist_name, mbid, score)
@@ -69,7 +77,39 @@ class SongProcessor(SongProcessorInterface):
                    if variation.lower() == known_title.lower():
                        return recording_mbid
        
-        # Handle collaborations using artist credit
+        # If we have an artist_mbid, try to find recording with that specific artist
+        if artist_mbid:
+            # Try to find recording with the specific artist MBID
+            for variation in title_variations:
+                result = self.data_provider.fuzzy_search_recording(variation, artist_mbid)
+                if result and isinstance(result, tuple) and len(result) >= 2:
+                    return result[1]  # Return recording MBID
+        
+        # If no artist_mbid or recording not found, try multiple artist candidates
+        if original_artist and hasattr(self.data_provider, 'fuzzy_search_artist_candidates'):
+            print(f"🎯 Trying multiple artist candidates for '{original_artist}' with recording '{title}'...")
+            
+            # Get multiple artist candidates
+            candidates = self.data_provider.fuzzy_search_artist_candidates(original_artist, title)
+            
+            # Try each candidate to find the recording
+            for i, candidate in enumerate(candidates[:5]):  # Limit to top 5 candidates
+                candidate_mbid = candidate['gid']
+                candidate_name = candidate['name']
+                candidate_score = candidate['score']
+                has_recording = candidate.get('has_recording', False)
+                
+                print(f"  🔍 Trying candidate {i+1}: {candidate_name} (score: {candidate_score:.2f}, has recording: {has_recording})")
+                
+                # Try to find recording with this artist candidate
+                for variation in title_variations:
+                    result = self.data_provider.fuzzy_search_recording(variation, candidate_mbid)
+                    if result and isinstance(result, tuple) and len(result) >= 2:
+                        recording_mbid = result[1]
+                        print(f"  ✅ Found recording with candidate {i+1}: {candidate_name}")
+                        return recording_mbid
+        
+        # Handle collaborations using artist credit (fallback)
        # If no artist_mbid (collaboration case), try to find by title and verify artist credit
        if not artist_mbid and original_artist:
            # This is a collaboration case, try to find by title with all variations
@@ -141,8 +181,8 @@ class SongProcessor(SongProcessorInterface):
        original_artist = song.get('artist', '')
        original_title = song.get('title', '')
        
-        # Find artist MBID
-        artist_mbid = self.find_artist_mbid(song.get('artist', ''))
+        # Find artist MBID (pass recording title to help choose correct artist)
+        artist_mbid = self.find_artist_mbid(song.get('artist', ''), song.get('title', ''))
        
        # Handle collaboration case (artist_mbid is None)
        # Use the same complex collaboration parsing as the database
@@ -188,8 +228,8 @@ class SongProcessor(SongProcessorInterface):
            song['artist'] = artist_info['name']
            song['mbid'] = artist_mbid
        
-        # Find recording MBID
-        recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''))
+        # Find recording MBID (pass original artist for multiple candidate search)
+        recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''), original_artist)
        if not recording_mbid:
            return song, False