Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-07-31 16:24:48 -05:00 · 2025-07-31 16:24:48 -05:00 · dc396a346f
commit dc396a346f
parent 20817a3373
2 changed files with 121 additions and 72 deletions
--- a/src/api/database.py
+++ b/src/api/database.py
@ -345,38 +345,48 @@ class MusicBrainzDatabase:
            return None
        
        try:
-            # Clean title
-            clean_title = title.strip()
+            # Try with full title first, then with parenthetical content removed
+            search_titles = [title.strip()]
            
-            # Remove karaoke patterns
+            # Create version with parenthetical content removed
+            import re
+            # Remove all parenthetical content like (clean), (remix), (karaoke), etc.
+            clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
+            clean_title = clean_title.strip()
+            if clean_title != title.strip():
+                search_titles.append(clean_title)
+            
+            # Also try with specific karaoke patterns removed
            karaoke_patterns = [
                r'\s*\(Karaoke Version\)',
                r'\s*\(Karaoke\)',
                r'\s*\(Instrumental\)',
                r'\s*\(Backing Track\)',
            ]
-            import re
            for pattern in karaoke_patterns:
-                clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
-            clean_title = clean_title.strip()
+                karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
+                karaoke_clean = karaoke_clean.strip()
+                if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
+                    search_titles.append(karaoke_clean)
            
            if artist_mbid:
-                # Try to find recording by artist and title
-                query = """
-                    SELECT r.name, r.gid
-                    FROM recording r
-                    JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
-                    JOIN artist a ON acn.artist = a.id
-                    WHERE a.gid = %s AND r.name ILIKE %s
-                    LIMIT 1
-                """
-                self.cursor.execute(query, (artist_mbid, f"%{clean_title}%"))
-                result = self.cursor.fetchone()
+                # Try to find recording by artist and title with all variations
+                for search_title in search_titles:
+                    query = """
+                        SELECT r.name, r.gid
+                        FROM recording r
+                        JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
+                        JOIN artist a ON acn.artist = a.id
+                        WHERE a.gid = %s AND r.name ILIKE %s
+                        LIMIT 1
+                    """
+                    self.cursor.execute(query, (artist_mbid, f"%{search_title}%"))
+                    result = self.cursor.fetchone()
+                    
+                    if result:
+                        return (result['name'], result['gid'], 1.0)
                
-                if result:
-                    return (result['name'], result['gid'], 1.0)
-                
-                # Try fuzzy search
+                # Try fuzzy search with all title variations
                query = """
                    SELECT r.name, r.gid
                    FROM recording r
@ -392,42 +402,45 @@ class MusicBrainzDatabase:
                best_score = 0
                
                for recording in recordings:
-                    score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
-                    if score > best_score:
-                        best_score = score
-                        best_match = recording
+                    for search_title in search_titles:
+                        score = fuzz.ratio(search_title.lower(), recording['name'].lower())
+                        if score > best_score:
+                            best_score = score
+                            best_match = recording
                
                if best_score >= 80:
                    return (best_match['name'], best_match['gid'], best_score / 100.0)
            else:
-                # No artist constraint - search by title only, but be more specific
-                # Try exact match first
-                query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
-                self.cursor.execute(query, (clean_title,))
-                result = self.cursor.fetchone()
-                if result:
-                    return (result['name'], result['gid'], 1.0)
-                
-                # Try case-insensitive match
-                query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
-                self.cursor.execute(query, (clean_title,))
-                result = self.cursor.fetchone()
-                if result:
-                    return (result['name'], result['gid'], 0.99)
-                
-                # Try partial match
-                query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
-                self.cursor.execute(query, (f"%{clean_title}%",))
-                recordings = self.cursor.fetchall()
+                # No artist constraint - search by title only with all variations
+                for search_title in search_titles:
+                    # Try exact match first
+                    query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
+                    self.cursor.execute(query, (search_title,))
+                    result = self.cursor.fetchone()
+                    if result:
+                        return (result['name'], result['gid'], 1.0)
+                    
+                    # Try case-insensitive match
+                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
+                    self.cursor.execute(query, (search_title,))
+                    result = self.cursor.fetchone()
+                    if result:
+                        return (result['name'], result['gid'], 0.99)
                
+                # Try partial match with all variations
                best_match = None
                best_score = 0
                
-                for recording in recordings:
-                    score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
-                    if score > best_score:
-                        best_score = score
-                        best_match = recording
+                for search_title in search_titles:
+                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
+                    self.cursor.execute(query, (f"%{search_title}%",))
+                    recordings = self.cursor.fetchall()
+                    
+                    for recording in recordings:
+                        score = fuzz.ratio(search_title.lower(), recording['name'].lower())
+                        if score > best_score:
+                            best_score = score
+                            best_match = recording
                
                if best_score >= 80:
                    return (best_match['name'], best_match['gid'], best_score / 100.0)
@ -535,31 +548,46 @@ class MusicBrainzDatabase:
            main_artist, collaborators = self._parse_complex_collaboration(clean_name)
            collaboration = len(collaborators) > 0
            
-            # Clean title
-            clean_title = title.strip()
+            # Try with full title first, then with parenthetical content removed
+            search_titles = [title.strip()]
+            
+            # Create version with parenthetical content removed
+            import re
+            # Remove all parenthetical content like (clean), (remix), (karaoke), etc.
+            clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
+            clean_title = clean_title.strip()
+            if clean_title != title.strip():
+                search_titles.append(clean_title)
+            
+            # Also try with specific karaoke patterns removed
            karaoke_patterns = [
                r'\s*\(Karaoke Version\)',
                r'\s*\(Karaoke\)',
                r'\s*\(Instrumental\)',
                r'\s*\(Backing Track\)',
            ]
-            import re
            for pattern in karaoke_patterns:
-                clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
-            clean_title = clean_title.strip()
+                karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
+                karaoke_clean = karaoke_clean.strip()
+                if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
+                    search_titles.append(karaoke_clean)
            
-            # Try to find recording with artist credit
-            query = """
-                SELECT ac.id as artist_credit_id, r.gid as recording_mbid
-                FROM recording r
-                JOIN artist_credit ac ON r.artist_credit = ac.id
-                JOIN artist_credit_name acn ON ac.id = acn.artist_credit
-                JOIN artist a ON acn.artist = a.id
-                WHERE a.name = %s AND r.name ILIKE %s
-                LIMIT 1
-            """
-            self.cursor.execute(query, (main_artist, f"%{clean_title}%"))
-            result = self.cursor.fetchone()
+            # Try to find recording with artist credit using all title variations
+            for search_title in search_titles:
+                query = """
+                    SELECT ac.id as artist_credit_id, r.gid as recording_mbid
+                    FROM recording r
+                    JOIN artist_credit ac ON r.artist_credit = ac.id
+                    JOIN artist_credit_name acn ON ac.id = acn.artist_credit
+                    JOIN artist a ON acn.artist = a.id
+                    WHERE a.name = %s AND r.name ILIKE %s
+                    LIMIT 1
+                """
+                self.cursor.execute(query, (main_artist, f"%{search_title}%"))
+                result = self.cursor.fetchone()
+                
+                if result:
+                    break
            
            if result:
                # Get the full artist string for this credit
@ -620,10 +648,13 @@ class MusicBrainzDatabase:
        # If no primary collaboration found, check secondary patterns
        if not is_collaboration:
            for pattern in secondary_patterns:
-                if pattern.lower() in artist_string.lower():
+                # Use word boundaries to avoid splitting within words like "Orlando"
+                import re
+                pattern_regex = r'\b' + re.escape(pattern) + r'\b'
+                if re.search(pattern_regex, artist_string, re.IGNORECASE):
                    # For secondary patterns, be more careful
                    # Check if this looks like a band name vs collaboration
-                    parts = artist_string.split(pattern)
+                    parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        part1 = parts[0].strip()
                        part2 = parts[1].strip()
@ -686,7 +717,8 @@ class MusicBrainzDatabase:
        
        # Split on common separators: comma, ampersand, "and"
        # Use regex to handle multiple separators
-        separators = r'[,&]|\band\b'
+        # Use word boundaries to avoid splitting within words like "Orlando"
+        separators = r'[,&]|\b(?:and)\b'
        
        # Split and clean up each part
        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
--- a/src/cli/main.py
+++ b/src/cli/main.py
@ -59,6 +59,11 @@ class MusicBrainzCleaner:
        return name
    
    def _clean_title(self, title: str) -> str:
+        """
+        Clean title by removing karaoke patterns.
+        Note: The database search now tries multiple title variations including
+        the full title and versions with parenthetical content removed.
+        """
        title = title.strip()
        karaoke_patterns = [
            r'\s*\(Karaoke Version\)',
@ -189,10 +194,12 @@ class MusicBrainzCleaner:
        # If no primary collaboration found, check secondary patterns
        if not is_collaboration:
            for pattern in secondary_patterns:
-                if pattern.lower() in artist_string.lower():
+                # Use word boundaries to avoid splitting within words like "Orlando"
+                pattern_regex = r'\b' + re.escape(pattern) + r'\b'
+                if re.search(pattern_regex, artist_string, re.IGNORECASE):
                    # For secondary patterns, be more careful
                    # Check if this looks like a band name vs collaboration
-                    parts = artist_string.split(pattern)
+                    parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        part1 = parts[0].strip()
                        part2 = parts[1].strip()
@ -255,7 +262,8 @@ class MusicBrainzCleaner:
        
        # Split on common separators: comma, ampersand, "and"
        # Use regex to handle multiple separators
-        separators = r'[,&]|\band\b'
+        # Use word boundaries to avoid splitting within words like "Orlando"
+        separators = r'[,&]|\b(?:and)\b'
        
        # Split and clean up each part
        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
@ -292,6 +300,15 @@ class MusicBrainzCleaner:
                    # Update with the correct artist credit
                    song['artist'] = artist_string
                    song['recording_mbid'] = recording_mbid
+                    
+                    # For collaborations, try to get the main artist's MBID
+                    if self.use_database:
+                        main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
+                        if main_artist:
+                            artist_result = self.db.fuzzy_search_artist(main_artist)
+                            if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
+                                song['mbid'] = artist_result[1]  # Set the main artist's MBID
+                    
                    print(f"  ✅ Updated to: {song['artist']} - {song.get('title')}")
                    return song
                else: