Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-08-01 09:46:35 -05:00 · 2025-08-01 09:46:35 -05:00 · 4bbd03eab7
commit 4bbd03eab7
parent 3d8b0165af
2 changed files with 123 additions and 538 deletions
--- a/src/api/database.py
+++ b/src/api/database.py
@@ -13,6 +13,7 @@ from ..config.constants import (
    TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
    SUCCESS_MESSAGES, ERROR_MESSAGES
 )
+from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations


 class MusicBrainzDatabase:
@@ -372,7 +373,7 @@ class MusicBrainzDatabase:
        
        try:
            # Generate title variations using the helper method
-            search_titles = self._generate_title_variations(title)
+            search_titles = generate_title_variations(title)
            
            if artist_mbid:
                # Try to find recording by artist and title with all variations
@@ -418,16 +419,17 @@ class MusicBrainzDatabase:
            else:
                # No artist constraint - search by title only with all variations
                for search_title in search_titles:
-                    # Try exact match first
-                    query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
-                    self.cursor.execute(query, (search_title,))
+                    # Try exact match first (handle both apostrophe types)
+                    search_title_curly = search_title.replace("'", "’")
+                    query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1"
+                    self.cursor.execute(query, (search_title, search_title_curly))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 1.0)
                    
-                    # Try case-insensitive match
-                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
-                    self.cursor.execute(query, (search_title,))
+                    # Try case-insensitive match (handle both apostrophe types)
+                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1"
+                    self.cursor.execute(query, (search_title, search_title_curly))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 0.99)
@@ -437,8 +439,10 @@ class MusicBrainzDatabase:
                best_score = 0
                
                for search_title in search_titles:
-                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
-                    self.cursor.execute(query, (f"%{search_title}%",))
+                    # Handle both apostrophe types in partial search
+                    search_title_curly = search_title.replace("'", "’")
+                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50"
+                    self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%"))
                    recordings = self.cursor.fetchall()
                    
                    for recording in recordings:
@@ -550,374 +554,55 @@ class MusicBrainzDatabase:
            clean_name = artist_name.strip()

            # Parse complex collaboration
-            main_artist, collaborators = self._parse_complex_collaboration(clean_name)
+            main_artist, collaborators = parse_complex_collaboration(clean_name)
            collaboration = len(collaborators) > 0
            
            # Generate title variations using the helper method
-            search_titles = self._generate_title_variations(title)
+            search_titles = generate_title_variations(title)
            
-            # Try to find recording with artist credit using all title variations
+            # Build the expected artist credit string
+            expected_artist_string = main_artist
+            if collaborators:
+                expected_artist_string += " & " + " & ".join(collaborators)
+            
+            # Try to find recording with matching artist credit using all title variations
            for search_title in search_titles:
-                query = """
-                    SELECT ac.id as artist_credit_id, r.gid as recording_mbid
+                # First, find all recordings with matching title (handle both apostrophe types)
+                title_query = """
+                    SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id
                    FROM recording r
-                    JOIN artist_credit ac ON r.artist_credit = ac.id
-                    JOIN artist_credit_name acn ON ac.id = acn.artist_credit
-                    JOIN artist a ON acn.artist = a.id
-                    WHERE a.name = %s AND r.name ILIKE %s
-                    LIMIT 1
+                    WHERE r.name ILIKE %s OR r.name ILIKE %s
                """
-                self.cursor.execute(query, (main_artist, f"%{search_title}%"))
-                result = self.cursor.fetchone()
+                # Create versions with both straight and curly apostrophes
+                search_title_curly = search_title.replace("'", "’").replace("'", "'")
+                self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%"))
+                recordings = self.cursor.fetchall()
                
-                if result:
-                    break
+                for recording in recordings:
+                    # Check if this recording's artist credit matches our expected artist string
+                    artist_query = """
+                        SELECT a.name, acn.join_phrase, acn.position
+                        FROM artist_credit_name acn
+                        JOIN artist a ON acn.artist = a.id
+                        WHERE acn.artist_credit = %s
+                        ORDER BY acn.position
+                    """
+                    self.cursor.execute(artist_query, (recording['artist_credit_id'],))
+                    artists = self.cursor.fetchall()
                    
-            if result:
-                # Get the full artist string for this credit
-                artist_query = """
-                    SELECT a.name, acn.join_phrase, acn.position
-                    FROM artist_credit_name acn
-                    JOIN artist a ON acn.artist = a.id
-                    WHERE acn.artist_credit = %s
-                    ORDER BY acn.position
-                """
-                self.cursor.execute(artist_query, (result['artist_credit_id'],))
-                artists = self.cursor.fetchall()
+                    # Build the actual artist string for this recording
+                    actual_artist_string = ""
+                    for artist in artists:
+                        actual_artist_string += artist['name']
+                        if artist['join_phrase']:
+                            actual_artist_string += artist['join_phrase']
                    
-                # Build the artist string
-                artist_string = ""
-                for artist in artists:
-                    artist_string += artist['name']
-                    if artist['join_phrase']:
-                        artist_string += artist['join_phrase']
-                
-                return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])
+                    # Check if the artist strings match
+                    if actual_artist_string.lower() == expected_artist_string.lower():
+                        return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid'])
            
            return None
            
        except Exception as e:
            print(f"Error in find_artist_credit: {e}")
            return None 
-
-    def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
-        """
-        Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
-        Returns: (main_artist, [collaborators])
-        """
-        import re
-        
-        # Primary collaboration indicators
-        primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
-        
-        # Secondary collaboration indicators (need more careful handling)
-        secondary_patterns = ['&', 'and', ',']
-        
-        # Check if this is a collaboration
-        is_collaboration = False
-        split_pattern = None
-        
-        # Special case: Handle malformed artist names like "ft Jamie Foxx West, Kanye"
-        # This should be "Kanye West ft. Jamie Foxx"
-        if artist_string.lower().startswith(('ft ', 'feat ')):
-            # This is a malformed collaboration string
-            # Try to extract the actual artists from the rest
-            remaining = artist_string[artist_string.find(' ') + 1:].strip()
-            if ',' in remaining:
-                # Split on comma and reverse the order
-                parts = [part.strip() for part in remaining.split(',')]
-                if len(parts) >= 2:
-                    # Assume the last part is the main artist
-                    main_artist = parts[-1].strip()
-                    collaborators = parts[:-1]
-                    return (main_artist, collaborators)
-        
-        for pattern in primary_patterns:
-            if pattern.lower() in artist_string.lower():
-                is_collaboration = True
-                # Find the actual pattern in the original string (case-insensitive)
-                import re
-                match = re.search(re.escape(pattern), artist_string, re.IGNORECASE)
-                if match:
-                    split_pattern = match.group(0)
-                else:
-                    split_pattern = pattern
-                break
-        
-        # If no primary collaboration found, check secondary patterns
-        if not is_collaboration:
-            for pattern in secondary_patterns:
-                if pattern == ',':
-                    # Handle comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj")
-                    if ',' in artist_string:
-                        # Count commas to determine if this is likely a collaboration
-                        comma_count = artist_string.count(',')
-                        if comma_count >= 1:
-                            # Split on comma and treat as collaboration
-                            parts = [part.strip() for part in artist_string.split(',')]
-                            if len(parts) >= 2:
-                                # First artist is main, rest are collaborators
-                                main_artist = parts[0]
-                                collaborators = parts[1:]
-                                return (main_artist, collaborators)
-                else:
-                    # Use whitespace boundaries to avoid splitting within words like "Orlando"
-                    import re
-                    if pattern in ['&', 'and']:
-                        pattern_regex = r'\s' + re.escape(pattern) + r'\s'
-                    else:
-                        pattern_regex = r'\b' + re.escape(pattern) + r'\b'
-                    if re.search(pattern_regex, artist_string, re.IGNORECASE):
-                        # For secondary patterns, be more careful
-                        # Check if this looks like a band name vs collaboration
-                        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
-                        if len(parts) == 2:
-                            part1 = parts[0].strip()
-                            part2 = parts[1].strip()
-                            
-                            # Check if this looks like a band name vs collaboration
-                            # Band names typically have simple, short parts
-                            # Collaborations often have more complex artist names
-                            part1_words = len(part1.split())
-                            part2_words = len(part2.split())
-                            
-                            # Check if this looks like a band name vs collaboration
-                            # Band names typically have simple, short parts
-                            # Collaborations often have more complex artist names
-                            
-                            # If one part has a comma, it's likely a collaboration
-                            if ',' in part1 or ',' in part2:
-                                is_collaboration = True
-                                split_pattern = pattern
-                                break
-                            
-                            # For "&" and "and" separators, check against known band names
-                            # Load band names from JSON file
-                            try:
-                                import json
-                                import os
-                                json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
-                                with open(json_path, 'r') as f:
-                                    known_data = json.load(f)
-                                known_band_names = {name.lower() for name in known_data.get('groups', [])}
-                            except (FileNotFoundError, json.JSONDecodeError, KeyError):
-                                # Fallback to a minimal list if file can't be loaded
-                                known_band_names = {
-                                    "gerry and the pacemakers", "simon & garfunkel", "hall & oates", 
-                                    "brooks & dunn", "the everly brothers"
-                                }
-                            
-                            if artist_string.lower() in known_band_names:
-                                # This is a known band name, not a collaboration
-                                continue
-                            
-                            # For "and" separators, be more conservative
-                            # Many band names use "and" (e.g., "Gerry and the Pacemakers")
-                            if pattern.lower() == 'and':
-                                # Only treat as collaboration if it looks like individual artists
-                                # Band names often have "the" in the second part
-                                if 'the ' in part2.lower():
-                                    # Likely a band name, skip
-                                    continue
-                            
-                            # Likely a collaboration
-                            is_collaboration = True
-                            split_pattern = pattern
-                            break
-        
-        if not is_collaboration:
-            return (artist_string, [])
-        
-        # Split on the pattern using the same regex that was used for detection
-        if split_pattern in ['&', 'and']:
-            # Use whitespace boundary regex for these patterns
-            pattern_regex = r'\s' + re.escape(split_pattern) + r'\s'
-            parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
-        else:
-            # Use simple split for other patterns
-            parts = artist_string.split(split_pattern)
-            
-        if len(parts) < 2:
-            return (artist_string, [])
-        
-        main_artist = parts[0].strip()
-        collaborators_string = split_pattern.join(parts[1:]).strip()
-        
-        # Now parse the collaborators string which might have multiple separators
-        collaborators = self._parse_collaborators(collaborators_string)
-        
-        return (main_artist, collaborators)
-
-    def _generate_title_variations(self, title: str) -> List[str]:
-        """
-        Generate title variations by removing parenthetical content and fixing common issues.
-        Returns list of title variations to try.
-        """
-        import re
-        
-        search_titles = [title.strip()]
-        
-        # Fix common typos and missing apostrophes
-        title_fixes = title.strip()
-        
-        # Fix missing apostrophes in common contractions
-        apostrophe_fixes = [
-            (r'\bDont\b', "Don't"),
-            (r'\bCant\b', "Can't"),
-            (r'\bWont\b', "Won't"),
-            (r'\bArent\b', "Aren't"),
-            (r'\bIsnt\b', "Isn't"),
-            (r'\bWasnt\b', "Wasn't"),
-            (r'\bDidnt\b', "Didn't"),
-            (r'\bDoesnt\b', "Doesn't"),
-            (r'\bHavent\b', "Haven't"),
-            (r'\bHasnt\b', "Hasn't"),
-            (r'\bWouldnt\b', "Wouldn't"),
-            (r'\bCouldnt\b', "Couldn't"),
-            (r'\bShouldnt\b', "Shouldn't"),
-            (r'\bPhunk\b', "Funk"),  # Common typo
-            (r'\bBout\b', "About"),  # Shortened form
-        ]
-        
-        for pattern, replacement in apostrophe_fixes:
-            fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
-            if fixed_title != title_fixes:
-                title_fixes = fixed_title
-                if title_fixes not in search_titles:
-                    search_titles.append(title_fixes)
-        
-        # Comprehensive parentheses removal - try multiple approaches
-        # 1. Remove all complete parentheses (most aggressive)
-        clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
-        clean_title = clean_title.strip()
-        if clean_title != title.strip() and clean_title:
-            search_titles.append(clean_title)
-        
-        # 2. Remove specific common patterns first, then general parentheses
-        specific_patterns = [
-            r'\s*\(Karaoke Version\)',
-            r'\s*\(Karaoke\)',
-            r'\s*\(Instrumental\)',
-            r'\s*\(Backing Track\)',
-            r'\s*\(live [^)]*\)',
-            r'\s*\(Live [^)]*\)',
-            r'\s*\(Acoustic\)',
-            r'\s*\(acoustic\)',
-            r'\s*\(Without Backing Vocals\)',
-            r'\s*\(Without Backing Vocals\)',
-            r'\s*\(Clean\)',
-            r'\s*\(clean\)',
-            r'\s*\(Remix\)',
-            r'\s*\(remix\)',
-            r'\s*\(Radio Edit\)',
-            r'\s*\(radio edit\)',
-            r'\s*\(Extended Mix\)',
-            r'\s*\(extended mix\)',
-            r'\s*\(Single Version\)',
-            r'\s*\(single version\)',
-            r'\s*\(Album Version\)',
-            r'\s*\(album version\)',
-            r'\s*\(Original Mix\)',
-            r'\s*\(original mix\)',
-            r'\s*\(John Lewis Christmas Ad \d+\)',  # Specific pattern from test
-            r'\s*\(from the movie [^)]*\)',
-            r'\s*\(from the [^)]*\)',
-            r'\s*\(feat\. [^)]*\)',
-            r'\s*\(featuring [^)]*\)',
-            r'\s*\(ft\. [^)]*\)',
-            r'\s*\(duet\)',
-            r'\s*\(Duet\)',
-            r'\s*\(Two Semitones Down\)',
-            r'\s*\(Minus Piano\)',
-            r'\s*\(Cut Down\)',
-            r'\s*\(Boone & Speedy Vocals\)',
-            r'\s*\(My Heart Belongs to You\)',
-        ]
-        
-        # 3. Remove dash-separated content (like "Live At the BBC")
-        dash_patterns = [
-            r'\s*-\s*Live [^-]*$',
-            r'\s*-\s*live [^-]*$',
-            r'\s*-\s*Live At [^-]*$',
-            r'\s*-\s*Live At the [^-]*$',
-            r'\s*-\s*Live At the BBC$',
-            r'\s*-\s*Live From [^-]*$',
-            r'\s*-\s*Live In [^-]*$',
-            r'\s*-\s*Live On [^-]*$',
-        ]
-        
-        # Apply specific patterns first
-        for pattern in specific_patterns:
-            specific_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
-            specific_clean = specific_clean.strip()
-            if specific_clean != title.strip() and specific_clean and specific_clean not in search_titles:
-                search_titles.append(specific_clean)
-        
-        # Apply dash patterns
-        for pattern in dash_patterns:
-            dash_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
-            dash_clean = dash_clean.strip()
-            if dash_clean != title.strip() and dash_clean and dash_clean not in search_titles:
-                search_titles.append(dash_clean)
-        
-        # 3. Remove any remaining parentheses after specific patterns
-        for pattern in specific_patterns:
-            remaining_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
-            remaining_clean = re.sub(r'\s*\([^)]*\)', '', remaining_clean.strip())
-            remaining_clean = remaining_clean.strip()
-            if remaining_clean != title.strip() and remaining_clean and remaining_clean not in search_titles:
-                search_titles.append(remaining_clean)
-        
-        # 4. Remove unmatched opening parenthesis at end
-        clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
-        clean_title2 = clean_title2.strip()
-        if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
-            search_titles.append(clean_title2)
-        
-        # 5. Remove unmatched closing parenthesis at start
-        clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
-        clean_title3 = clean_title3.strip()
-        if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
-            search_titles.append(clean_title3)
-        
-        # 6. Try removing extra spaces and normalizing
-        normalized_title = re.sub(r'\s+', ' ', title.strip())
-        if normalized_title != title.strip() and normalized_title not in search_titles:
-            search_titles.append(normalized_title)
-        
-        # 7. Apply normalization to all cleaned versions and remove duplicates
-        normalized_versions = []
-        for version in search_titles:
-            # Normalize spaces (replace multiple spaces with single space)
-            normalized = re.sub(r'\s+', ' ', version.strip())
-            # Remove leading/trailing spaces
-            normalized = normalized.strip()
-            if normalized and normalized not in normalized_versions:
-                normalized_versions.append(normalized)
-        
-        return normalized_versions
-
-    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
-        """
-        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
-        """
-        import re
-        
-        if not collaborators_string:
-            return []
-        
-        # Split on common separators: comma, ampersand, "and"
-        # Use regex to handle multiple separators
-        # Use word boundaries to avoid splitting within words like "Orlando"
-        separators = r'[,&]|\b(?:and)\b'
-        
-        # Split and clean up each part
-        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
-        
-        collaborators = []
-        for part in parts:
-            part = part.strip()
-            if part:  # Skip empty parts
-                collaborators.append(part)
-        
-        return collaborators 
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader
 # Import database and API clients
 from ..api.database import MusicBrainzDatabase
 from ..api.api_client import MusicBrainzAPIClient
+from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations


 class MusicBrainzCleaner:
@@ -81,7 +82,7 @@ class MusicBrainzCleaner:
        
        # Handle collaborations - try to find main artist first
        # Use the same complex collaboration parsing as the database
-        main_artist, collaborators = self._parse_complex_collaboration(clean_name)
+        main_artist, collaborators = parse_complex_collaboration(clean_name)
        has_collaboration = len(collaborators) > 0
        
        if has_collaboration:
@@ -119,38 +120,68 @@ class MusicBrainzCleaner:
        
        return None
    
-    def find_recording_mbid(self, artist_mbid: str, title: str) -> Optional[str]:
-        clean_title = self._clean_title(title)
+    def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
+        # Use the shared title variations logic for comprehensive title cleaning
+        title_variations = generate_title_variations(title)
        
        # Load known recordings data first
        known_recordings = data_loader.load_known_recordings()
        
-        # Check known recordings first
+        # Check known recordings first with all title variations
        for (known_artist_mbid, known_title), recording_mbid in known_recordings.items():
-            if artist_mbid == known_artist_mbid and clean_title.lower() == known_title.lower():
-                return recording_mbid
+            if artist_mbid == known_artist_mbid:
+                for variation in title_variations:
+                    if variation.lower() == known_title.lower():
+                        return recording_mbid
        
        # Handle collaborations using artist credit
        if self.use_database:
-            # If no artist_mbid (collaboration case), try to find by title and original artist name
-            if not artist_mbid:
-                # This is a collaboration case, try to find by title
-                result = self.db.fuzzy_search_recording(clean_title)
-                if result and isinstance(result, tuple) and len(result) >= 2:
-                    return result[1]  # Return MBID from tuple (recording_name, mbid, score)
+            # If no artist_mbid (collaboration case), try to find by title and verify artist credit
+            if not artist_mbid and original_artist:
+                # This is a collaboration case, try to find by title with all variations
+                for variation in title_variations:
+                    # Search for recordings by title first
+                    result = self.db.fuzzy_search_recording(variation)
+                    if result and isinstance(result, tuple) and len(result) >= 2:
+                        recording_mbid = result[1]
+                        
+                        # Verify that this recording has the correct artist credit
+                        recording_info = self.get_recording_info(recording_mbid)
+                        if recording_info and recording_info.get('artist_credit'):
+                            # Check if the artist credit matches our expected collaboration
+                            expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ')
+                            if recording_info['artist_credit'].lower() == expected_artist_string.lower():
+                                return recording_mbid
+                        
+                        # If exact match fails, try partial match
+                        if recording_info and recording_info.get('artist_credit'):
+                            # Check if all artists in the collaboration are present in the recording
+                            main_artist, collaborators = parse_complex_collaboration(original_artist)
+                            recording_artists = recording_info['artist_credit'].lower()
+                            
+                            # Check if main artist is in the recording
+                            if main_artist.lower() in recording_artists:
+                                # Check if at least one collaborator is also present
+                                for collaborator in collaborators:
+                                    if collaborator.lower() in recording_artists:
+                                        return recording_mbid
+                
+                return None
            else:
-                # Regular case with artist_mbid
-                result = self.db.fuzzy_search_recording(clean_title, artist_mbid)
-                if result and isinstance(result, tuple) and len(result) >= 2:
-                    return result[1]  # Return MBID from tuple (recording_name, mbid, score)
+                # Regular case with artist_mbid - try all title variations
+                for variation in title_variations:
+                    result = self.db.fuzzy_search_recording(variation, artist_mbid)
+                    if result and isinstance(result, tuple) and len(result) >= 2:
+                        return result[1]  # Return MBID from tuple (recording_name, mbid, score)
        else:
-            # Fallback to API
-            try:
-                result = self.api.search_recording(clean_title, artist_mbid)
-                if result:
-                    return result['id']
-            except:
-                pass
+            # Fallback to API - try all title variations
+            for variation in title_variations:
+                try:
+                    result = self.api.search_recording(variation, artist_mbid)
+                    if result:
+                        return result['id']
+                except:
+                    pass
        
        return None
    
@ -176,130 +207,6 @@ class MusicBrainzCleaner:
                parts.append(credit['joinphrase'])
        return ''.join(parts).strip()

-    def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
-        """
-        Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
-        Returns: (main_artist, [collaborators])
-        """
-        import re
-        
-        # Primary collaboration indicators
-        primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
-        
-        # Secondary collaboration indicators (need more careful handling)
-        secondary_patterns = ['&', 'and', ',']
-        
-        # Check if this is a collaboration
-        is_collaboration = False
-        split_pattern = None
-        
-        for pattern in primary_patterns:
-            if pattern.lower() in artist_string.lower():
-                is_collaboration = True
-                # Find the actual pattern in the original string (case-insensitive)
-                import re
-                match = re.search(re.escape(pattern), artist_string, re.IGNORECASE)
-                if match:
-                    split_pattern = match.group(0)
-                else:
-                    split_pattern = pattern
-                break
-        
-        # If no primary collaboration found, check secondary patterns
-        if not is_collaboration:
-            for pattern in secondary_patterns:
-                # Use whitespace boundaries for & and and, word boundaries for others
-                if pattern in ['&', 'and']:
-                    pattern_regex = r'\s' + re.escape(pattern) + r'\s'
-                else:
-                    pattern_regex = r'\b' + re.escape(pattern) + r'\b'
-                if re.search(pattern_regex, artist_string, re.IGNORECASE):
-                    # For secondary patterns, be more careful
-                    # Check if this looks like a band name vs collaboration
-                    parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
-                    if len(parts) == 2:
-                        part1 = parts[0].strip()
-                        part2 = parts[1].strip()
-                        
-                        # Check if this looks like a band name vs collaboration
-                        # Band names typically have simple, short parts
-                        # Collaborations often have more complex artist names
-                        part1_words = len(part1.split())
-                        part2_words = len(part2.split())
-                        
-                        # Check if this looks like a band name vs collaboration
-                        # Band names typically have simple, short parts
-                        # Collaborations often have more complex artist names
-                        
-                        # If one part has a comma, it's likely a collaboration
-                        if ',' in part1 or ',' in part2:
-                            is_collaboration = True
-                            split_pattern = pattern
-                            break
-                        
-                        # For "&" separators, be more permissive
-                        # Most "&" separators in music are collaborations, not band names
-                        # Only treat very obvious band names as non-collaborations
-                        if part1_words == 1 and part2_words == 1:
-                            # Check for very obvious band name patterns
-                            obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
-                            if artist_string.lower() in obvious_band_names:
-                                # Likely a band name, skip
-                                continue
-                        
-                        # Likely a collaboration
-                        is_collaboration = True
-                        split_pattern = pattern
-                        break
-        
-        if not is_collaboration:
-            return (artist_string, [])
-        
-        # Split on the pattern using the same regex that was used for detection
-        if split_pattern in ['&', 'and']:
-            # Use whitespace boundary regex for these patterns
-            pattern_regex = r'\s' + re.escape(split_pattern) + r'\s'
-            parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
-        else:
-            # Use simple split for other patterns
-            parts = artist_string.split(split_pattern)
-            
-        if len(parts) < 2:
-            return (artist_string, [])
-        
-        main_artist = parts[0].strip()
-        collaborators_string = split_pattern.join(parts[1:]).strip()
-        
-        # Now parse the collaborators string which might have multiple separators
-        collaborators = self._parse_collaborators(collaborators_string)
-        
-        return (main_artist, collaborators)
-
-    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
-        """
-        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
-        """
-        import re
-        
-        if not collaborators_string:
-            return []
-        
-        # Split on common separators: comma, ampersand, "and"
-        # Use regex to handle multiple separators
-        # Use word boundaries to avoid splitting within words like "Orlando"
-        separators = r'[,&]|\b(?:and)\b'
-        
-        # Split and clean up each part
-        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
-        
-        collaborators = []
-        for part in parts:
-            part = part.strip()
-            if part:  # Skip empty parts
-                collaborators.append(part)
-        
-        return collaborators
-    
    def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
        """
        Clean a single song and return (cleaned_song, success_status)
@ -313,44 +220,37 @@ class MusicBrainzCleaner:
        # Handle collaboration case (artist_mbid is None)
        # Use the same complex collaboration parsing as the database
        artist_name = song.get('artist', '')
-        main_artist, collaborators = self._parse_complex_collaboration(artist_name)
+        main_artist, collaborators = parse_complex_collaboration(artist_name)
        has_collaboration = len(collaborators) > 0
        
        if artist_mbid is None and has_collaboration:
-            # Try to find recording using artist credit approach
-            if self.use_database:
-                result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', ''))
-                if result:
-                    artist_credit_id, artist_string, recording_mbid = result
+            # Try to find recording using the improved find_recording_mbid method
+            recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist)
+            if recording_mbid:
+                # Get recording info to update the song
+                recording_info = self.get_recording_info(recording_mbid)
+                if recording_info:
+                    # Update with the correct artist credit and title
+                    if self.use_database and recording_info.get('artist_credit'):
+                        song['artist'] = recording_info['artist_credit']
+                    elif not self.use_database and recording_info.get('artist-credit'):
+                        artist_string = self._build_artist_string(recording_info['artist-credit'])
+                        if artist_string:
+                            song['artist'] = artist_string
                    
-                    # Update with the correct artist credit
-                    song['artist'] = artist_string
+                    song['title'] = recording_info['title']
                    song['recording_mbid'] = recording_mbid
                    
                    # For collaborations, try to get the main artist's MBID
                    if self.use_database:
-                        main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
+                        main_artist, collaborators = parse_complex_collaboration(song.get('artist', ''))
                        if main_artist:
                            artist_result = self.db.fuzzy_search_artist(main_artist)
                            if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
                                song['mbid'] = artist_result[1]  # Set the main artist's MBID
                    
                    return song, True
-                else:
-                    return song, False
-            else:
-                # Fallback to API method
-                recording_mbid = self.find_recording_mbid(None, song.get('title', ''))
-                if recording_mbid:
-                    recording_info = self.get_recording_info(recording_mbid)
-                    if recording_info and recording_info.get('artist-credit'):
-                        artist_string = self._build_artist_string(recording_info['artist-credit'])
-                        if artist_string:
-                            song['artist'] = artist_string
-                            song['title'] = recording_info['title']
-                            song['recording_mbid'] = recording_mbid
-                            return song, True
-                return song, False
+            return song, False
        
        # Regular case (non-collaboration or collaboration not found)
        if not artist_mbid: