From 543e7b71d4f58c3ee2ccafff62d88db012701e13 Mon Sep 17 00:00:00 2001
From: Matt Bruce <mbrucedogs@gmail.com>
Date: Fri, 1 Aug 2025 10:55:50 -0500
Subject: [PATCH] Add recording-aware artist search and --force reprocessing

Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

---
 PRD.md                     |  34 +++-
 src/api/database.py        | 393 +++++++++++++++++++++++++++++++++++++
 src/cli/main.py            |  51 +++--
 src/core/song_processor.py |  54 ++++-
 4 files changed, 509 insertions(+), 23 deletions(-)

diff --git a/PRD.md b/PRD.md
index 260304a..34ef33d 100644
--- a/PRD.md
+++ b/PRD.md
@@ -250,13 +250,15 @@ src/
 - Manual configuration needed for custom artist/recording mappings
 - **NEW**: Some edge cases may require manual intervention (data quality issues)
 
-### Test File Organization
-- **REQUIRED**: All test files must be placed in `src/tests/` directory
-- **PROHIBITED**: Test files should not be placed in the root directory
+### Test File Organization - CRITICAL DIRECTIVE
+- **REQUIRED**: All test files MUST be placed in `src/tests/` directory
+- **PROHIBITED**: Test files should NEVER be placed in the root directory
 - **Naming Convention**: Test files should follow `test_*.py` or `debug_*.py` patterns
 - **Purpose**: Keeps root directory clean and organizes test code properly
 - **Import Path**: Tests can import from parent modules using relative imports
 
+**⚠️ CRITICAL ENFORCEMENT**: This directive is ABSOLUTE and NON-NEGOTIABLE. Any test files created in the root directory will be moved to `src/tests/` immediately and removed from the root.
+
 ### Using Tests for Issue Resolution
 - **FIRST STEP**: When encountering issues, check `src/tests/` directory for existing test files
 - **EXISTING TESTS**: Many common issues already have test cases that can help debug problems
@@ -580,6 +582,8 @@ Test files often contain working code snippets that can be adapted:
 - Data processing logic
 - Error handling approaches
 
+**⚠️ REMINDER**: All test files MUST be in `src/tests/` directory. NEVER create test files in the root directory.
+
 ## Lessons Learned
 
 ### Database Integration
@@ -613,4 +617,26 @@ Test files often contain working code snippets that can be adapted:
 - **Resource Requirements**: MusicBrainz services require significant memory (8GB+ recommended) and disk space
 - **Platform Compatibility**: Apple Silicon (M1/M2) works but may show platform mismatch warnings
 - **Database Connection Issues**: Common startup problems include wrong host configuration and incomplete initialization
-- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly 
\ No newline at end of file
+- **Test Script Logic**: Critical to handle tuple return values from cleaner methods correctly
+
+## CRITICAL PROJECT DIRECTIVE - TEST FILE ORGANIZATION
+
+**⚠️ ABSOLUTE REQUIREMENT - NON-NEGOTIABLE**
+
+### Test File Placement Rules
+- **REQUIRED**: ALL test files MUST be placed in `src/tests/` directory
+- **PROHIBITED**: Test files should NEVER be placed in the root directory
+- **ENFORCEMENT**: Any test files created in the root directory will be moved to `src/tests/` immediately and removed from the root
+- **NON-NEGOTIABLE**: This directive is absolute and must be followed at all times
+
+### Why This Matters
+- **Project Structure**: Keeps the root directory clean and organized
+- **Code Organization**: Groups all test-related code in one location
+- **Maintainability**: Makes it easier to find and manage test files
+- **Best Practices**: Follows standard Python project structure conventions
+
+### Compliance Required
+- **ALL developers** must follow this directive
+- **ALL test files** must be in `src/tests/`
+- **NO EXCEPTIONS** to this rule
+- **IMMEDIATE CORRECTION** required for any violations 
\ No newline at end of file
diff --git a/src/api/database.py b/src/api/database.py
index d34d43c..d5e4fa8 100644
--- a/src/api/database.py
+++ b/src/api/database.py
@@ -461,6 +461,399 @@ class MusicBrainzDatabase(MusicBrainzDataProvider):
             print(f"Error in fuzzy_search_recording: {e}")
             return None
     
+    def fuzzy_search_artist_with_recording(self, artist_name: str, recording_title: str) -> Optional[Tuple[str, str, float]]:
+        """
+        Fuzzy search for artist by name, prioritizing artists that have the specific recording.
+
+        Only the main artist (text before a "ft."/"feat." marker) is searched. Candidates
+        are ranked by: has a recording matching recording_title, score, total recordings.
+
+        Returns (artist_name, mbid, similarity_score) or None.
+        """
+        if not self.connection:
+            return None
+        
+        try:
+            # Keep only the main artist of a collaboration. The split is case-insensitive
+            # so "Ft."/"FEAT." credits are handled too (str.split('ft.') missed them).
+            import re
+            main_artist = re.split(r'\b(?:ft|feat)\.', artist_name.strip(), maxsplit=1, flags=re.IGNORECASE)[0].strip()
+            if not main_artist:
+                return None  # nothing to search; '%%' would match every artist row
+            
+            # Find all potential artists (including aliases)
+            all_artists = []
+            
+            # 1. Exact matches in artist names
+            query = "SELECT name, gid FROM artist WHERE name = %s"
+            self.cursor.execute(query, (main_artist,))
+            exact_results = self.cursor.fetchall()
+            for result in exact_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_name',
+                    'score': 1.0
+                })
+            
+            # 2. Exact matches in artist aliases
+            query = """
+                SELECT a.name, a.gid
+                FROM artist a
+                JOIN artist_alias aa ON a.id = aa.artist
+                WHERE aa.name = %s
+            """
+            self.cursor.execute(query, (main_artist,))
+            alias_results = self.cursor.fetchall()
+            for result in alias_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_alias',
+                    'score': 0.95
+                })
+            
+            # 3. Case-insensitive matches
+            query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)"
+            self.cursor.execute(query, (main_artist,))
+            case_results = self.cursor.fetchall()
+            for result in case_results:
+                # Avoid duplicates
+                if not any(a['gid'] == result['gid'] for a in all_artists):
+                    all_artists.append({
+                        'name': result['name'],
+                        'gid': result['gid'],
+                        'match_type': 'case_insensitive',
+                        'score': 0.99
+                    })
+            
+            # 4. Fuzzy matches in names and aliases
+            name_variants = [main_artist]
+            # Add dash variants
+            name_variants.extend([
+                main_artist.replace('-', '‐'),
+                main_artist.replace('‐', '-'),
+            ])
+            
+            # Add name variations (e.g., "S Club 7" -> "S Club")
+            for variant in name_variants[:]:  # Copy list to avoid modification during iteration
+                if ' ' in variant:
+                    parts = variant.split()
+                    if len(parts) > 1 and parts[-1].isdigit():
+                        name_variants.append(' '.join(parts[:-1]))
+            
+            # Drop repeated variants (order kept) so identical ILIKE scans are not re-run
+            name_variants = list(dict.fromkeys(name_variants))
+            
+            # Search for fuzzy matches
+            for variant in name_variants:
+                # Search in artist names
+                query = "SELECT name, gid FROM artist WHERE name ILIKE %s"
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_results = self.cursor.fetchall()
+                for result in fuzzy_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_name',
+                                'score': score
+                            })
+                
+                # Search in artist aliases (NOTE(review): scored against a.name, not the matched alias)
+                query = """
+                    SELECT a.name, a.gid 
+                    FROM artist a
+                    JOIN artist_alias aa ON a.id = aa.artist
+                    WHERE aa.name ILIKE %s
+                """
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_alias_results = self.cursor.fetchall()
+                for result in fuzzy_alias_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_alias',
+                                'score': score
+                            })
+            
+            if not all_artists:
+                return None
+            
+            # Remove duplicates (the first entry per MBID wins) and get recording information
+            by_mbid = {}
+            for artist in all_artists:
+                by_mbid.setdefault(artist['gid'], artist)
+            unique_artists = list(by_mbid.values())
+            
+            print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
+            
+            # Check which artists have the specific recording
+            artists_with_recording = []
+            for artist in unique_artists:
+                # Check if this artist has the specific recording
+                query = """
+                    SELECT COUNT(r.id) as recording_count
+                    FROM artist a
+                    JOIN artist_credit_name acn ON a.id = acn.artist
+                    JOIN artist_credit ac ON acn.artist_credit = ac.id
+                    JOIN recording r ON ac.id = r.artist_credit
+                    WHERE a.gid = %s AND r.name ILIKE %s
+                """
+                self.cursor.execute(query, (artist['gid'], f'%{recording_title}%'))
+                result = self.cursor.fetchone()
+                has_recording = result['recording_count'] > 0 if result else False
+                
+                # Also get total recording count
+                query = """
+                    SELECT COUNT(r.id) as recording_count
+                    FROM artist a
+                    JOIN artist_credit_name acn ON a.id = acn.artist
+                    JOIN artist_credit ac ON acn.artist_credit = ac.id
+                    JOIN recording r ON ac.id = r.artist_credit
+                    WHERE a.gid = %s
+                """
+                self.cursor.execute(query, (artist['gid'],))
+                total_result = self.cursor.fetchone()
+                total_recordings = total_result['recording_count'] if total_result else 0
+                
+                artist_info = {
+                    **artist,
+                    'has_recording': has_recording,
+                    'total_recordings': total_recordings
+                }
+                artists_with_recording.append(artist_info)
+                
+                status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING"
+                print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")
+            
+            # Prioritize artists that have the specific recording
+            artists_with_recording.sort(key=lambda x: (
+                not x['has_recording'],  # Artists with recording first
+                -x['score'],            # Then by score (highest first)
+                -x['total_recordings']  # Then by total recordings (highest first)
+            ))
+            
+            if artists_with_recording:
+                best_artist = artists_with_recording[0]
+                print(f"  🎯 Selected: {best_artist['name']} (score: {best_artist['score']:.2f}, has recording: {best_artist['has_recording']})")
+                
+                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
+                    original=artist_name,
+                    matched=best_artist['name'],
+                    score=best_artist['score']
+                ))
+                return (best_artist['name'], best_artist['gid'], best_artist['score'])
+            
+            return None
+            
+        except Exception as e:
+            print(f"Error in fuzzy_search_artist_with_recording: {e}")
+            return None
+    
+    def fuzzy_search_artist_candidates(self, artist_name: str, recording_title: str = None) -> List[Dict[str, Any]]:
+        """
+        Fuzzy search for artist by name, returning multiple candidates.
+        If recording_title is provided, prioritizes artists that have the specific recording.
+
+        Only the main artist (text before a "ft."/"feat." marker) is searched.
+
+        Returns a list of candidate dicts (name, gid, match_type, score, total_recordings,
+        plus has_recording when recording_title is given), best first; [] on no match or error.
+        """
+        if not self.connection:
+            return []
+        
+        try:
+            # Keep only the main artist of a collaboration. The split is case-insensitive
+            # so "Ft."/"FEAT." credits are handled too (str.split('ft.') missed them).
+            import re
+            main_artist = re.split(r'\b(?:ft|feat)\.', artist_name.strip(), maxsplit=1, flags=re.IGNORECASE)[0].strip()
+            if not main_artist:
+                return []  # nothing to search; '%%' would match every artist row
+            
+            # Find all potential artists (including aliases)
+            all_artists = []
+            
+            # 1. Exact matches in artist names
+            query = "SELECT name, gid FROM artist WHERE name = %s"
+            self.cursor.execute(query, (main_artist,))
+            exact_results = self.cursor.fetchall()
+            for result in exact_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_name',
+                    'score': 1.0
+                })
+            
+            # 2. Exact matches in artist aliases
+            query = """
+                SELECT a.name, a.gid
+                FROM artist a
+                JOIN artist_alias aa ON a.id = aa.artist
+                WHERE aa.name = %s
+            """
+            self.cursor.execute(query, (main_artist,))
+            alias_results = self.cursor.fetchall()
+            for result in alias_results:
+                all_artists.append({
+                    'name': result['name'],
+                    'gid': result['gid'],
+                    'match_type': 'exact_alias',
+                    'score': 0.95
+                })
+            
+            # 3. Case-insensitive matches
+            query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s)"
+            self.cursor.execute(query, (main_artist,))
+            case_results = self.cursor.fetchall()
+            for result in case_results:
+                # Avoid duplicates
+                if not any(a['gid'] == result['gid'] for a in all_artists):
+                    all_artists.append({
+                        'name': result['name'],
+                        'gid': result['gid'],
+                        'match_type': 'case_insensitive',
+                        'score': 0.99
+                    })
+            
+            # 4. Fuzzy matches in names and aliases
+            name_variants = [main_artist]
+            # Add dash variants
+            name_variants.extend([
+                main_artist.replace('-', '‐'),
+                main_artist.replace('‐', '-'),
+            ])
+            
+            # Add name variations (e.g., "S Club 7" -> "S Club")
+            for variant in name_variants[:]:  # Copy list to avoid modification during iteration
+                if ' ' in variant:
+                    parts = variant.split()
+                    if len(parts) > 1 and parts[-1].isdigit():
+                        name_variants.append(' '.join(parts[:-1]))
+            
+            # Drop repeated variants (order kept) so identical ILIKE scans are not re-run
+            name_variants = list(dict.fromkeys(name_variants))
+            
+            # Search for fuzzy matches
+            for variant in name_variants:
+                # Search in artist names
+                query = "SELECT name, gid FROM artist WHERE name ILIKE %s"
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_results = self.cursor.fetchall()
+                for result in fuzzy_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_name',
+                                'score': score
+                            })
+                
+                # Search in artist aliases (NOTE(review): scored against a.name, not the matched alias)
+                query = """
+                    SELECT a.name, a.gid 
+                    FROM artist a
+                    JOIN artist_alias aa ON a.id = aa.artist
+                    WHERE aa.name ILIKE %s
+                """
+                self.cursor.execute(query, (f'%{variant}%',))
+                fuzzy_alias_results = self.cursor.fetchall()
+                for result in fuzzy_alias_results:
+                    if not any(a['gid'] == result['gid'] for a in all_artists):
+                        score = fuzz.ratio(main_artist.lower(), result['name'].lower()) / 100.0
+                        if score >= 0.8:  # Only include good matches
+                            all_artists.append({
+                                'name': result['name'],
+                                'gid': result['gid'],
+                                'match_type': 'fuzzy_alias',
+                                'score': score
+                            })
+            
+            if not all_artists:
+                return []
+            
+            # Remove duplicates (the first entry per MBID wins) and get recording information
+            by_mbid = {}
+            for artist in all_artists:
+                by_mbid.setdefault(artist['gid'], artist)
+            unique_artists = list(by_mbid.values())
+            
+            # If we have a recording title, check which artists have it
+            if recording_title:
+                print(f"🔍 Found {len(unique_artists)} potential artists for '{main_artist}', checking for recording '{recording_title}'...")
+                
+                # Check which artists have the specific recording
+                for artist in unique_artists:
+                    # Check if this artist has the specific recording
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s AND r.name ILIKE %s
+                    """
+                    self.cursor.execute(query, (artist['gid'], f'%{recording_title}%'))
+                    result = self.cursor.fetchone()
+                    has_recording = result['recording_count'] > 0 if result else False
+                    
+                    # Also get total recording count
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s
+                    """
+                    self.cursor.execute(query, (artist['gid'],))
+                    total_result = self.cursor.fetchone()
+                    total_recordings = total_result['recording_count'] if total_result else 0
+                    
+                    artist['has_recording'] = has_recording
+                    artist['total_recordings'] = total_recordings
+                    
+                    status = "✅ HAS RECORDING" if has_recording else "❌ NO RECORDING"
+                    print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {total_recordings} recordings - {status}")
+                
+                # Sort by: has recording first, then by score, then by total recordings
+                unique_artists.sort(key=lambda x: (
+                    not x.get('has_recording', False),  # Artists with recording first
+                    -x['score'],                       # Then by score (highest first)
+                    -x.get('total_recordings', 0)      # Then by total recordings (highest first)
+                ))
+            else:
+                # Just get total recording counts for sorting
+                for artist in unique_artists:
+                    query = """
+                        SELECT COUNT(r.id) as recording_count
+                        FROM artist a
+                        JOIN artist_credit_name acn ON a.id = acn.artist
+                        JOIN artist_credit ac ON acn.artist_credit = ac.id
+                        JOIN recording r ON ac.id = r.artist_credit
+                        WHERE a.gid = %s
+                    """
+                    self.cursor.execute(query, (artist['gid'],))
+                    total_result = self.cursor.fetchone()
+                    total_recordings = total_result['recording_count'] if total_result else 0
+                    artist['total_recordings'] = total_recordings
+                
+                # Sort by score and total recordings
+                unique_artists.sort(key=lambda x: (-x['score'], -x['total_recordings']))
+            
+            return unique_artists
+            
+        except Exception as e:
+            print(f"Error in fuzzy_search_artist_candidates: {e}")
+            return []
+    
     def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]:
         """Get detailed artist information by MBID."""
         if not self.connection:
diff --git a/src/cli/main.py b/src/cli/main.py
index 9b3e012..a14ce6d 100644
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -31,9 +31,10 @@ from ..core.factory import DataProviderFactory
 class MusicBrainzCleaner:
     """Enhanced MusicBrainz Cleaner with interface-based data access."""
     
-    def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL):
+    def __init__(self, use_database: bool = True, base_url: str = DEFAULT_MUSICBRAINZ_URL, force: bool = False):
         self.use_database = use_database
         self.base_url = base_url
+        self.force = force
         
         # Create data provider using factory
         self.data_provider = DataProviderFactory.create_provider(use_database, base_url)
@@ -46,7 +47,14 @@ class MusicBrainzCleaner:
         Clean a single song using the centralized song processor.
         Returns (cleaned_song, success_status)
         """
-        return self.song_processor.clean_song(song)
+        song_copy = song.copy()
+        
+        # If force flag is set, remove ALL existing MBIDs to force complete reprocessing
+        if self.force:
+            song_copy.pop('mbid', None)
+            song_copy.pop('recording_mbid', None)
+                
+        return self.song_processor.clean_song(song_copy)
     
     def process_songs(self, source_file: Path, output_success: Path = None, output_failure: Path = None, limit: Optional[int] = None) -> Dict[str, Any]:
         """
@@ -74,7 +82,15 @@ class MusicBrainzCleaner:
         
         total_songs = len(all_songs)
         print(f'📊 Total songs to process: {total_songs:,}')
+        
+        # Check if file is empty
+        if total_songs == 0:
+            print('⚠️  No songs to process - file is empty')
+            return {}
+            
         print(f'Using {"database" if self.use_database else "API"} connection')
+        if self.force:
+            print('🔄 Force mode enabled - ignoring all existing MBIDs')
         print(PROGRESS_SEPARATOR)
         
         # Initialize arrays for batch processing
@@ -165,13 +181,17 @@ class MusicBrainzCleaner:
             f.write(f"Total Songs Processed: {stats['total_processed']:,}\n")
             f.write(f"Successful Songs: {len(successful_songs):,}\n")
             f.write(f"Failed Songs: {len(failed_songs):,}\n")
-            f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%\n\n")
+            f.write(f"Success Rate: {(len(successful_songs)/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else "Success Rate: 0.0%")
+            f.write("\n\n")
             
             f.write("DETAILED STATISTICS\n")
             f.write("-" * 20 + "\n")
-            f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%)\n")
-            f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%)\n")
-            f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second\n\n")
+            f.write(f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} ({(stats['artists_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Artists Found: {stats['artists_found']:,}/{stats['total_processed']:,} (0.0%")
+            f.write(")\n")
+            f.write(f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} ({(stats['recordings_found']/stats['total_processed']*100):.1f}%" if stats['total_processed'] > 0 else f"Recordings Found: {stats['recordings_found']:,}/{stats['total_processed']:,} (0.0%")
+            f.write(")\n")
+            f.write(f"Processing Speed: {stats['total_processed'] / total_time:.1f} songs/second" if total_time > 0 else "Processing Speed: 0.0 songs/second")
+            f.write("\n\n")
             
             f.write("OUTPUT FILES\n")
             f.write("-" * 20 + "\n")
@@ -193,7 +213,7 @@ class MusicBrainzCleaner:
         # Save detailed JSON report for programmatic access
         json_report_file = source_file.parent / f"processing_report_{timestamp}.json"
         final_stats = {
-            'summary': {
+                    'summary': {
                 'total_tested': stats['total_processed'],
                 'artists_found': stats['artists_found'],
                 'recordings_found': stats['recordings_found'],
@@ -219,10 +239,10 @@ class MusicBrainzCleaner:
         print(f'🎉 Processing completed!')
         print(f'📊 Final Results:')
         print(f'  ⏱️  Total processing time: {total_time:.1f} seconds')
-        print(f'  🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second')
-        print(f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)')
-        print(f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)')
-        print(f'  ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)')
+        print(f'  🚀 Average speed: {stats["total_processed"] / total_time:.1f} songs/second' if total_time > 0 else '  🚀 Average speed: 0.0 songs/second')
+        print(f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} ({stats["artists_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ✅ Artists found: {stats["artists_found"]:,}/{stats["total_processed"]:,} (0.0%)')
+        print(f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} ({stats["recordings_found"]/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ✅ Recordings found: {stats["recordings_found"]:,}/{stats["total_processed"]:,} (0.0%)')
+        print(f'  ❌ Failed songs: {len(failed_songs):,} ({len(failed_songs)/stats["total_processed"]*100:.1f}%)' if stats["total_processed"] > 0 else f'  ❌ Failed songs: {len(failed_songs):,} (0.0%)')
         print(f'📄 Files saved:')
         print(f'  ✅ Successful songs: {output_success}')
         print(f'  ❌ Failed songs: {output_failure}')
@@ -245,6 +265,7 @@ OPTIONS:
     --output-failure FILE      Output file for failed songs (default: source-failure.json)
     --limit N                  Process only the first N songs (default: all songs)
     --use-api                  Force use of HTTP API instead of direct database access
+    --force                    Force reprocessing, ignoring all existing MBIDs
     --test-connection          Test connection to MusicBrainz server
     --help, -h                 Show this help message
     --version, -v              Show version information
@@ -268,6 +289,9 @@ EXAMPLES:
     # Force API mode
     musicbrainz-cleaner --use-api
 
+    # Force reprocessing (ignore all existing MBIDs)
+    musicbrainz-cleaner --force
+
 REQUIREMENTS:
     - MusicBrainz server running on http://localhost:5001
     - PostgreSQL database accessible (for direct database access)
@@ -309,6 +333,7 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
         'limit': None,
         'use_api': False,
         'test_connection': False,
+        'force': False,
         'help': False,
         'version': False
     }
@@ -325,6 +350,8 @@ def parse_arguments(args: List[str]) -> Dict[str, Any]:
             parsed['test_connection'] = True
         elif arg == '--use-api':
             parsed['use_api'] = True
+        elif arg == '--force':
+            parsed['force'] = True
         elif arg == '--source':
             if i + 1 < len(args) and not args[i + 1].startswith('--'):
                 parsed['source'] = args[i + 1]
@@ -410,7 +437,7 @@ def main() -> int:
             return ExitCode.USAGE_ERROR
         
         # Process the file
-        cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'])
+        cleaner = MusicBrainzCleaner(use_database=not parsed['use_api'], force=parsed['force'])
         cleaner.process_songs(source_file, output_success, output_failure, parsed['limit'])
         
         return ExitCode.SUCCESS
diff --git a/src/core/song_processor.py b/src/core/song_processor.py
index 763539a..d50db72 100644
--- a/src/core/song_processor.py
+++ b/src/core/song_processor.py
@@ -22,19 +22,27 @@ class SongProcessor(SongProcessorInterface):
         """
         self.data_provider = data_provider
     
-    def find_artist_mbid(self, artist_name: str) -> Optional[str]:
+    def find_artist_mbid(self, artist_name: str, recording_title: str = None) -> Optional[str]:
         """
         Find artist MBID using consistent logic across all interfaces.
         
         Args:
             artist_name: Name of the artist to search for
+            recording_title: Optional recording title to help choose the correct artist
             
         Returns:
             Artist MBID if found, None otherwise
         """
         if not artist_name:
             return None
-            
+        
+        # If we have a recording title, use the improved search that considers the recording
+        if recording_title and hasattr(self.data_provider, 'fuzzy_search_artist_with_recording'):
+            result = self.data_provider.fuzzy_search_artist_with_recording(artist_name, recording_title)
+            if result and isinstance(result, tuple) and len(result) >= 2:
+                return result[1]  # Return MBID from tuple (artist_name, mbid, score)
+        
+        # Fall back to the regular search
         result = self.data_provider.fuzzy_search_artist(artist_name)
         if result and isinstance(result, tuple) and len(result) >= 2:
             return result[1]  # Return MBID from tuple (artist_name, mbid, score)
@@ -69,7 +77,39 @@ class SongProcessor(SongProcessorInterface):
                     if variation.lower() == known_title.lower():
                         return recording_mbid
         
-        # Handle collaborations using artist credit
+        # If we have an artist_mbid, try to find recording with that specific artist
+        if artist_mbid:
+            # Try each title variation in order; the first match wins
+            for variation in title_variations:
+                result = self.data_provider.fuzzy_search_recording(variation, artist_mbid)
+                if result and isinstance(result, tuple) and len(result) >= 2:
+                    return result[1]  # Return recording MBID
+        
+        # If no artist_mbid or recording not found, try multiple artist candidates
+        if original_artist and hasattr(self.data_provider, 'fuzzy_search_artist_candidates'):
+            print(f"🎯 Trying multiple artist candidates for '{original_artist}' with recording '{title}'...")
+            
+            # Get multiple artist candidates
+            candidates = self.data_provider.fuzzy_search_artist_candidates(original_artist, title)
+            
+            # Try each candidate to find the recording
+            for i, candidate in enumerate(candidates[:5]):  # Limit to top 5 candidates
+                candidate_mbid = candidate['gid']
+                candidate_name = candidate['name']
+                candidate_score = candidate['score']
+                has_recording = candidate.get('has_recording', False)
+                
+                print(f"  🔍 Trying candidate {i+1}: {candidate_name} (score: {candidate_score:.2f}, has recording: {has_recording})")
+                
+                # Try to find recording with this artist candidate
+                for variation in title_variations:
+                    result = self.data_provider.fuzzy_search_recording(variation, candidate_mbid)
+                    if result and isinstance(result, tuple) and len(result) >= 2:
+                        recording_mbid = result[1]
+                        print(f"  ✅ Found recording with candidate {i+1}: {candidate_name}")
+                        return recording_mbid
+        
+        # Handle collaborations using artist credit (fallback)
         # If no artist_mbid (collaboration case), try to find by title and verify artist credit
         if not artist_mbid and original_artist:
             # This is a collaboration case, try to find by title with all variations
@@ -141,8 +181,8 @@ class SongProcessor(SongProcessorInterface):
         original_artist = song.get('artist', '')
         original_title = song.get('title', '')
         
-        # Find artist MBID
-        artist_mbid = self.find_artist_mbid(song.get('artist', ''))
+        # Find artist MBID (pass recording title to help choose correct artist)
+        artist_mbid = self.find_artist_mbid(song.get('artist', ''), song.get('title', ''))
         
         # Handle collaboration case (artist_mbid is None)
         # Use the same complex collaboration parsing as the database
@@ -188,8 +228,8 @@ class SongProcessor(SongProcessorInterface):
             song['artist'] = artist_info['name']
             song['mbid'] = artist_mbid
         
-        # Find recording MBID
-        recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''))
+        # Find recording MBID (pass original artist for multiple candidate search)
+        recording_mbid = self.find_recording_mbid(artist_mbid, song.get('title', ''), original_artist)
         if not recording_mbid:
             return song, False