Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-08-01 09:46:35 -05:00 · 2025-08-01 09:46:35 -05:00 · 4bbd03eab7
commit 4bbd03eab7
parent 3d8b0165af
2 changed files with 123 additions and 538 deletions
--- a/src/api/database.py
+++ b/src/api/database.py
@ -13,6 +13,7 @@ from ..config.constants import (
    TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
    SUCCESS_MESSAGES, ERROR_MESSAGES
 )
 from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
 class MusicBrainzDatabase:
@ -372,7 +373,7 @@ class MusicBrainzDatabase:
        try:
            # Generate title variations using the helper method
-            search_titles = self._generate_title_variations(title)
+            search_titles = generate_title_variations(title)
            if artist_mbid:
                # Try to find recording by artist and title with all variations
@ -418,16 +419,17 @@ class MusicBrainzDatabase:
            else:
                # No artist constraint - search by title only with all variations
                for search_title in search_titles:
-                    # Try exact match first
+                    # Try exact match first (handle both apostrophe types)
-                    query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
+                    search_title_curly = search_title.replace("'", "'")
-                    self.cursor.execute(query, (search_title,))
+                    query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1"
                    self.cursor.execute(query, (search_title, search_title_curly))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 1.0)
-                    # Try case-insensitive match
+                    # Try case-insensitive match (handle both apostrophe types)
-                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
+                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1"
-                    self.cursor.execute(query, (search_title,))
+                    self.cursor.execute(query, (search_title, search_title_curly))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 0.99)
@ -437,8 +439,10 @@ class MusicBrainzDatabase:
                best_score = 0
                for search_title in search_titles:
-                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
+                    # Handle both apostrophe types in partial search
-                    self.cursor.execute(query, (f"%{search_title}%",))
+                    search_title_curly = search_title.replace("'", "'")
                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50"
                    self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%"))
                    recordings = self.cursor.fetchall()
                    for recording in recordings:
@ -550,374 +554,55 @@ class MusicBrainzDatabase:
            clean_name = artist_name.strip()
            # Parse complex collaboration
-            main_artist, collaborators = self._parse_complex_collaboration(clean_name)
+            main_artist, collaborators = parse_complex_collaboration(clean_name)
            collaboration = len(collaborators) > 0
            # Generate title variations using the helper method
-            search_titles = self._generate_title_variations(title)
+            search_titles = generate_title_variations(title)
-            # Try to find recording with artist credit using all title variations
+            # Build the expected artist credit string
            expected_artist_string = main_artist
            if collaborators:
                expected_artist_string += " & " + " & ".join(collaborators)
            # Try to find recording with matching artist credit using all title variations
            for search_title in search_titles:
-                query = """
+                # First, find all recordings with matching title (handle both apostrophe types)
-                    SELECT ac.id as artist_credit_id, r.gid as recording_mbid
+                title_query = """
                    SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id
                    FROM recording r
-                    JOIN artist_credit ac ON r.artist_credit = ac.id
+                    WHERE r.name ILIKE %s OR r.name ILIKE %s
                    JOIN artist_credit_name acn ON ac.id = acn.artist_credit
                    JOIN artist a ON acn.artist = a.id
                    WHERE a.name = %s AND r.name ILIKE %s
                    LIMIT 1
                """
-                self.cursor.execute(query, (main_artist, f"%{search_title}%"))
+                # Create versions with both straight and curly apostrophes
-                result = self.cursor.fetchone()
+                search_title_curly = search_title.replace("'", "'").replace("'", "'")
                self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%"))
                recordings = self.cursor.fetchall()
-                if result:
+                for recording in recordings:
-                    break
+                    # Check if this recording's artist credit matches our expected artist string
-            
+                    artist_query = """
-            if result:
+                        SELECT a.name, acn.join_phrase, acn.position
-                # Get the full artist string for this credit
+                        FROM artist_credit_name acn
-                artist_query = """
+                        JOIN artist a ON acn.artist = a.id
-                    SELECT a.name, acn.join_phrase, acn.position
+                        WHERE acn.artist_credit = %s
-                    FROM artist_credit_name acn
+                        ORDER BY acn.position
-                    JOIN artist a ON acn.artist = a.id
+                    """
-                    WHERE acn.artist_credit = %s
+                    self.cursor.execute(artist_query, (recording['artist_credit_id'],))
-                    ORDER BY acn.position
+                    artists = self.cursor.fetchall()
-                """
+                    
-                self.cursor.execute(artist_query, (result['artist_credit_id'],))
+                    # Build the actual artist string for this recording
-                artists = self.cursor.fetchall()
+                    actual_artist_string = ""
-                
+                    for artist in artists:
-                # Build the artist string
+                        actual_artist_string += artist['name']
-                artist_string = ""
+                        if artist['join_phrase']:
-                for artist in artists:
+                            actual_artist_string += artist['join_phrase']
-                    artist_string += artist['name']
+                    
-                    if artist['join_phrase']:
+                    # Check if the artist strings match
-                        artist_string += artist['join_phrase']
+                    if actual_artist_string.lower() == expected_artist_string.lower():
-                
+                        return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid'])
                return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])
            return None
        except Exception as e:
            print(f"Error in find_artist_credit: {e}")
-            return None
+            return None 
    def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
        """
        Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
        Returns: (main_artist, [collaborators])
        """
        import re
        # Primary collaboration indicators
        primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
        # Secondary collaboration indicators (need more careful handling)
        secondary_patterns = ['&', 'and', ',']
        # Check if this is a collaboration
        is_collaboration = False
        split_pattern = None
        # Special case: Handle malformed artist names like "ft Jamie Foxx West, Kanye"
        # This should be "Kanye West ft. Jamie Foxx"
        if artist_string.lower().startswith(('ft ', 'feat ')):
            # This is a malformed collaboration string
            # Try to extract the actual artists from the rest
            remaining = artist_string[artist_string.find(' ') + 1:].strip()
            if ',' in remaining:
                # Split on comma and reverse the order
                parts = [part.strip() for part in remaining.split(',')]
                if len(parts) >= 2:
                    # Assume the last part is the main artist
                    main_artist = parts[-1].strip()
                    collaborators = parts[:-1]
                    return (main_artist, collaborators)
        for pattern in primary_patterns:
            if pattern.lower() in artist_string.lower():
                is_collaboration = True
                # Find the actual pattern in the original string (case-insensitive)
                import re
                match = re.search(re.escape(pattern), artist_string, re.IGNORECASE)
                if match:
                    split_pattern = match.group(0)
                else:
                    split_pattern = pattern
                break
        # If no primary collaboration found, check secondary patterns
        if not is_collaboration:
            for pattern in secondary_patterns:
                if pattern == ',':
                    # Handle comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj")
                    if ',' in artist_string:
                        # Count commas to determine if this is likely a collaboration
                        comma_count = artist_string.count(',')
                        if comma_count >= 1:
                            # Split on comma and treat as collaboration
                            parts = [part.strip() for part in artist_string.split(',')]
                            if len(parts) >= 2:
                                # First artist is main, rest are collaborators
                                main_artist = parts[0]
                                collaborators = parts[1:]
                                return (main_artist, collaborators)
                else:
                    # Use whitespace boundaries to avoid splitting within words like "Orlando"
                    import re
                    if pattern in ['&', 'and']:
                        pattern_regex = r'\s' + re.escape(pattern) + r'\s'
                    else:
                        pattern_regex = r'\b' + re.escape(pattern) + r'\b'
                    if re.search(pattern_regex, artist_string, re.IGNORECASE):
                        # For secondary patterns, be more careful
                        # Check if this looks like a band name vs collaboration
                        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                        if len(parts) == 2:
                            part1 = parts[0].strip()
                            part2 = parts[1].strip()
                            # Check if this looks like a band name vs collaboration
                            # Band names typically have simple, short parts
                            # Collaborations often have more complex artist names
                            part1_words = len(part1.split())
                            part2_words = len(part2.split())
                            # Check if this looks like a band name vs collaboration
                            # Band names typically have simple, short parts
                            # Collaborations often have more complex artist names
                            # If one part has a comma, it's likely a collaboration
                            if ',' in part1 or ',' in part2:
                                is_collaboration = True
                                split_pattern = pattern
                                break
                            # For "&" and "and" separators, check against known band names
                            # Load band names from JSON file
                            try:
                                import json
                                import os
                                json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
                                with open(json_path, 'r') as f:
                                    known_data = json.load(f)
                                known_band_names = {name.lower() for name in known_data.get('groups', [])}
                            except (FileNotFoundError, json.JSONDecodeError, KeyError):
                                # Fallback to a minimal list if file can't be loaded
                                known_band_names = {
                                    "gerry and the pacemakers", "simon & garfunkel", "hall & oates", 
                                    "brooks & dunn", "the everly brothers"
                                }
                            if artist_string.lower() in known_band_names:
                                # This is a known band name, not a collaboration
                                continue
                            # For "and" separators, be more conservative
                            # Many band names use "and" (e.g., "Gerry and the Pacemakers")
                            if pattern.lower() == 'and':
                                # Only treat as collaboration if it looks like individual artists
                                # Band names often have "the" in the second part
                                if 'the ' in part2.lower():
                                    # Likely a band name, skip
                                    continue
                            # Likely a collaboration
                            is_collaboration = True
                            split_pattern = pattern
                            break
        if not is_collaboration:
            return (artist_string, [])
        # Split on the pattern using the same regex that was used for detection
        if split_pattern in ['&', 'and']:
            # Use whitespace boundary regex for these patterns
            pattern_regex = r'\s' + re.escape(split_pattern) + r'\s'
            parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        else:
            # Use simple split for other patterns
            parts = artist_string.split(split_pattern)
        if len(parts) < 2:
            return (artist_string, [])
        main_artist = parts[0].strip()
        collaborators_string = split_pattern.join(parts[1:]).strip()
        # Now parse the collaborators string which might have multiple separators
        collaborators = self._parse_collaborators(collaborators_string)
        return (main_artist, collaborators)
    def _generate_title_variations(self, title: str) -> List[str]:
        """
        Generate title variations by removing parenthetical content and fixing common issues.
        Returns list of title variations to try.
        """
        import re
        search_titles = [title.strip()]
        # Fix common typos and missing apostrophes
        title_fixes = title.strip()
        # Fix missing apostrophes in common contractions
        apostrophe_fixes = [
            (r'\bDont\b', "Don't"),
            (r'\bCant\b', "Can't"),
            (r'\bWont\b', "Won't"),
            (r'\bArent\b', "Aren't"),
            (r'\bIsnt\b', "Isn't"),
            (r'\bWasnt\b', "Wasn't"),
            (r'\bDidnt\b', "Didn't"),
            (r'\bDoesnt\b', "Doesn't"),
            (r'\bHavent\b', "Haven't"),
            (r'\bHasnt\b', "Hasn't"),
            (r'\bWouldnt\b', "Wouldn't"),
            (r'\bCouldnt\b', "Couldn't"),
            (r'\bShouldnt\b', "Shouldn't"),
            (r'\bPhunk\b', "Funk"),  # Common typo
            (r'\bBout\b', "About"),  # Shortened form
        ]
        for pattern, replacement in apostrophe_fixes:
            fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
            if fixed_title != title_fixes:
                title_fixes = fixed_title
                if title_fixes not in search_titles:
                    search_titles.append(title_fixes)
        # Comprehensive parentheses removal - try multiple approaches
        # 1. Remove all complete parentheses (most aggressive)
        clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
        clean_title = clean_title.strip()
        if clean_title != title.strip() and clean_title:
            search_titles.append(clean_title)
        # 2. Remove specific common patterns first, then general parentheses
        specific_patterns = [
            r'\s*\(Karaoke Version\)',
            r'\s*\(Karaoke\)',
            r'\s*\(Instrumental\)',
            r'\s*\(Backing Track\)',
            r'\s*\(live [^)]*\)',
            r'\s*\(Live [^)]*\)',
            r'\s*\(Acoustic\)',
            r'\s*\(acoustic\)',
            r'\s*\(Without Backing Vocals\)',
            r'\s*\(Without Backing Vocals\)',
            r'\s*\(Clean\)',
            r'\s*\(clean\)',
            r'\s*\(Remix\)',
            r'\s*\(remix\)',
            r'\s*\(Radio Edit\)',
            r'\s*\(radio edit\)',
            r'\s*\(Extended Mix\)',
            r'\s*\(extended mix\)',
            r'\s*\(Single Version\)',
            r'\s*\(single version\)',
            r'\s*\(Album Version\)',
            r'\s*\(album version\)',
            r'\s*\(Original Mix\)',
            r'\s*\(original mix\)',
            r'\s*\(John Lewis Christmas Ad \d+\)',  # Specific pattern from test
            r'\s*\(from the movie [^)]*\)',
            r'\s*\(from the [^)]*\)',
            r'\s*\(feat\. [^)]*\)',
            r'\s*\(featuring [^)]*\)',
            r'\s*\(ft\. [^)]*\)',
            r'\s*\(duet\)',
            r'\s*\(Duet\)',
            r'\s*\(Two Semitones Down\)',
            r'\s*\(Minus Piano\)',
            r'\s*\(Cut Down\)',
            r'\s*\(Boone & Speedy Vocals\)',
            r'\s*\(My Heart Belongs to You\)',
        ]
        # 3. Remove dash-separated content (like "Live At the BBC")
        dash_patterns = [
            r'\s*-\s*Live [^-]*$',
            r'\s*-\s*live [^-]*$',
            r'\s*-\s*Live At [^-]*$',
            r'\s*-\s*Live At the [^-]*$',
            r'\s*-\s*Live At the BBC$',
            r'\s*-\s*Live From [^-]*$',
            r'\s*-\s*Live In [^-]*$',
            r'\s*-\s*Live On [^-]*$',
        ]
        # Apply specific patterns first
        for pattern in specific_patterns:
            specific_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
            specific_clean = specific_clean.strip()
            if specific_clean != title.strip() and specific_clean and specific_clean not in search_titles:
                search_titles.append(specific_clean)
        # Apply dash patterns
        for pattern in dash_patterns:
            dash_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
            dash_clean = dash_clean.strip()
            if dash_clean != title.strip() and dash_clean and dash_clean not in search_titles:
                search_titles.append(dash_clean)
        # 3. Remove any remaining parentheses after specific patterns
        for pattern in specific_patterns:
            remaining_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
            remaining_clean = re.sub(r'\s*\([^)]*\)', '', remaining_clean.strip())
            remaining_clean = remaining_clean.strip()
            if remaining_clean != title.strip() and remaining_clean and remaining_clean not in search_titles:
                search_titles.append(remaining_clean)
        # 4. Remove unmatched opening parenthesis at end
        clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
        clean_title2 = clean_title2.strip()
        if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
            search_titles.append(clean_title2)
        # 5. Remove unmatched closing parenthesis at start
        clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
        clean_title3 = clean_title3.strip()
        if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
            search_titles.append(clean_title3)
        # 6. Try removing extra spaces and normalizing
        normalized_title = re.sub(r'\s+', ' ', title.strip())
        if normalized_title != title.strip() and normalized_title not in search_titles:
            search_titles.append(normalized_title)
        # 7. Apply normalization to all cleaned versions and remove duplicates
        normalized_versions = []
        for version in search_titles:
            # Normalize spaces (replace multiple spaces with single space)
            normalized = re.sub(r'\s+', ' ', version.strip())
            # Remove leading/trailing spaces
            normalized = normalized.strip()
            if normalized and normalized not in normalized_versions:
                normalized_versions.append(normalized)
        return normalized_versions
    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
        """
        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
        """
        import re
        if not collaborators_string:
            return []
        # Split on common separators: comma, ampersand, "and"
        # Use regex to handle multiple separators
        # Use word boundaries to avoid splitting within words like "Orlando"
        separators = r'[,&]|\b(?:and)\b'
        # Split and clean up each part
        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
        collaborators = []
        for part in parts:
            part = part.strip()
            if part:  # Skip empty parts
                collaborators.append(part)
        return collaborators 
--- a/src/cli/main.py
+++ b/src/cli/main.py
@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader
 # Import database and API clients
 from ..api.database import MusicBrainzDatabase
 from ..api.api_client import MusicBrainzAPIClient
 from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
 class MusicBrainzCleaner:
@ -81,7 +82,7 @@ class MusicBrainzCleaner:
        # Handle collaborations - try to find main artist first
        # Use the same complex collaboration parsing as the database
-        main_artist, collaborators = self._parse_complex_collaboration(clean_name)
+        main_artist, collaborators = parse_complex_collaboration(clean_name)
        has_collaboration = len(collaborators) > 0
        if has_collaboration:
@ -119,38 +120,68 @@ class MusicBrainzCleaner:
        return None
-    def find_recording_mbid(self, artist_mbid: str, title: str) -> Optional[str]:
+    def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
-        clean_title = self._clean_title(title)
+        # Use the shared title variations logic for comprehensive title cleaning
        title_variations = generate_title_variations(title)
        # Load known recordings data first
        known_recordings = data_loader.load_known_recordings()
-        # Check known recordings first
+        # Check known recordings first with all title variations
        for (known_artist_mbid, known_title), recording_mbid in known_recordings.items():
-            if artist_mbid == known_artist_mbid and clean_title.lower() == known_title.lower():
+            if artist_mbid == known_artist_mbid:
-                return recording_mbid
+                for variation in title_variations:
                    if variation.lower() == known_title.lower():
                        return recording_mbid
        # Handle collaborations using artist credit
        if self.use_database:
-            # If no artist_mbid (collaboration case), try to find by title and original artist name
+            # If no artist_mbid (collaboration case), try to find by title and verify artist credit
-            if not artist_mbid:
+            if not artist_mbid and original_artist:
-                # This is a collaboration case, try to find by title
+                # This is a collaboration case, try to find by title with all variations
-                result = self.db.fuzzy_search_recording(clean_title)
+                for variation in title_variations:
-                if result and isinstance(result, tuple) and len(result) >= 2:
+                    # Search for recordings by title first
-                    return result[1]  # Return MBID from tuple (recording_name, mbid, score)
+                    result = self.db.fuzzy_search_recording(variation)
                    if result and isinstance(result, tuple) and len(result) >= 2:
                        recording_mbid = result[1]
                        # Verify that this recording has the correct artist credit
                        recording_info = self.get_recording_info(recording_mbid)
                        if recording_info and recording_info.get('artist_credit'):
                            # Check if the artist credit matches our expected collaboration
                            expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ')
                            if recording_info['artist_credit'].lower() == expected_artist_string.lower():
                                return recording_mbid
                        # If exact match fails, try partial match
                        if recording_info and recording_info.get('artist_credit'):
                            # Check if all artists in the collaboration are present in the recording
                            main_artist, collaborators = parse_complex_collaboration(original_artist)
                            recording_artists = recording_info['artist_credit'].lower()
                            # Check if main artist is in the recording
                            if main_artist.lower() in recording_artists:
                                # Check if at least one collaborator is also present
                                for collaborator in collaborators:
                                    if collaborator.lower() in recording_artists:
                                        return recording_mbid
                return None
            else:
-                # Regular case with artist_mbid
+                # Regular case with artist_mbid - try all title variations
-                result = self.db.fuzzy_search_recording(clean_title, artist_mbid)
+                for variation in title_variations:
-                if result and isinstance(result, tuple) and len(result) >= 2:
+                    result = self.db.fuzzy_search_recording(variation, artist_mbid)
-                    return result[1]  # Return MBID from tuple (recording_name, mbid, score)
+                    if result and isinstance(result, tuple) and len(result) >= 2:
                        return result[1]  # Return MBID from tuple (recording_name, mbid, score)
        else:
-            # Fallback to API
+            # Fallback to API - try all title variations
-            try:
+            for variation in title_variations:
-                result = self.api.search_recording(clean_title, artist_mbid)
+                try:
-                if result:
+                    result = self.api.search_recording(variation, artist_mbid)
-                    return result['id']
+                    if result:
-            except:
+                        return result['id']
-                pass
+                except:
                    pass
        return None
@ -176,130 +207,6 @@ class MusicBrainzCleaner:
                parts.append(credit['joinphrase'])
        return ''.join(parts).strip()
    def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
        """
        Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
        Returns: (main_artist, [collaborators])
        """
        import re
        # Primary collaboration indicators
        primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
        # Secondary collaboration indicators (need more careful handling)
        secondary_patterns = ['&', 'and', ',']
        # Check if this is a collaboration
        is_collaboration = False
        split_pattern = None
        for pattern in primary_patterns:
            if pattern.lower() in artist_string.lower():
                is_collaboration = True
                # Find the actual pattern in the original string (case-insensitive)
                import re
                match = re.search(re.escape(pattern), artist_string, re.IGNORECASE)
                if match:
                    split_pattern = match.group(0)
                else:
                    split_pattern = pattern
                break
        # If no primary collaboration found, check secondary patterns
        if not is_collaboration:
            for pattern in secondary_patterns:
                # Use whitespace boundaries for & and and, word boundaries for others
                if pattern in ['&', 'and']:
                    pattern_regex = r'\s' + re.escape(pattern) + r'\s'
                else:
                    pattern_regex = r'\b' + re.escape(pattern) + r'\b'
                if re.search(pattern_regex, artist_string, re.IGNORECASE):
                    # For secondary patterns, be more careful
                    # Check if this looks like a band name vs collaboration
                    parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        part1 = parts[0].strip()
                        part2 = parts[1].strip()
                        # Check if this looks like a band name vs collaboration
                        # Band names typically have simple, short parts
                        # Collaborations often have more complex artist names
                        part1_words = len(part1.split())
                        part2_words = len(part2.split())
                        # Check if this looks like a band name vs collaboration
                        # Band names typically have simple, short parts
                        # Collaborations often have more complex artist names
                        # If one part has a comma, it's likely a collaboration
                        if ',' in part1 or ',' in part2:
                            is_collaboration = True
                            split_pattern = pattern
                            break
                        # For "&" separators, be more permissive
                        # Most "&" separators in music are collaborations, not band names
                        # Only treat very obvious band names as non-collaborations
                        if part1_words == 1 and part2_words == 1:
                            # Check for very obvious band name patterns
                            obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
                            if artist_string.lower() in obvious_band_names:
                                # Likely a band name, skip
                                continue
                        # Likely a collaboration
                        is_collaboration = True
                        split_pattern = pattern
                        break
        if not is_collaboration:
            return (artist_string, [])
        # Split on the pattern using the same regex that was used for detection
        if split_pattern in ['&', 'and']:
            # Use whitespace boundary regex for these patterns
            pattern_regex = r'\s' + re.escape(split_pattern) + r'\s'
            parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        else:
            # Use simple split for other patterns
            parts = artist_string.split(split_pattern)
        if len(parts) < 2:
            return (artist_string, [])
        main_artist = parts[0].strip()
        collaborators_string = split_pattern.join(parts[1:]).strip()
        # Now parse the collaborators string which might have multiple separators
        collaborators = self._parse_collaborators(collaborators_string)
        return (main_artist, collaborators)
    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
        """
        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
        """
        import re
        if not collaborators_string:
            return []
        # Split on common separators: comma, ampersand, "and"
        # Use regex to handle multiple separators
        # Use word boundaries to avoid splitting within words like "Orlando"
        separators = r'[,&]|\b(?:and)\b'
        # Split and clean up each part
        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
        collaborators = []
        for part in parts:
            part = part.strip()
            if part:  # Skip empty parts
                collaborators.append(part)
        return collaborators
    def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
        """
        Clean a single song and return (cleaned_song, success_status)
@ -313,44 +220,37 @@ class MusicBrainzCleaner:
        # Handle collaboration case (artist_mbid is None)
        # Use the same complex collaboration parsing as the database
        artist_name = song.get('artist', '')
-        main_artist, collaborators = self._parse_complex_collaboration(artist_name)
+        main_artist, collaborators = parse_complex_collaboration(artist_name)
        has_collaboration = len(collaborators) > 0
        if artist_mbid is None and has_collaboration:
-            # Try to find recording using artist credit approach
+            # Try to find recording using the improved find_recording_mbid method
-            if self.use_database:
+            recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist)
-                result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', ''))
+            if recording_mbid:
-                if result:
+                # Get recording info to update the song
-                    artist_credit_id, artist_string, recording_mbid = result
+                recording_info = self.get_recording_info(recording_mbid)
                if recording_info:
                    # Update with the correct artist credit and title
                    if self.use_database and recording_info.get('artist_credit'):
                        song['artist'] = recording_info['artist_credit']
                    elif not self.use_database and recording_info.get('artist-credit'):
                        artist_string = self._build_artist_string(recording_info['artist-credit'])
                        if artist_string:
                            song['artist'] = artist_string
-                    # Update with the correct artist credit
+                    song['title'] = recording_info['title']
                    song['artist'] = artist_string
                    song['recording_mbid'] = recording_mbid
                    # For collaborations, try to get the main artist's MBID
                    if self.use_database:
-                        main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
+                        main_artist, collaborators = parse_complex_collaboration(song.get('artist', ''))
                        if main_artist:
                            artist_result = self.db.fuzzy_search_artist(main_artist)
                            if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
                                song['mbid'] = artist_result[1]  # Set the main artist's MBID
                    return song, True
-                else:
+            return song, False
                    return song, False
            else:
                # Fallback to API method
                recording_mbid = self.find_recording_mbid(None, song.get('title', ''))
                if recording_mbid:
                    recording_info = self.get_recording_info(recording_mbid)
                    if recording_info and recording_info.get('artist-credit'):
                        artist_string = self._build_artist_string(recording_info['artist-credit'])
                        if artist_string:
                            song['artist'] = artist_string
                            song['title'] = recording_info['title']
                            song['recording_mbid'] = recording_mbid
                            return song, True
                return song, False
        # Regular case (non-collaboration or collaboration not found)
        if not artist_mbid: