diff --git a/src/api/database.py b/src/api/database.py index bbed99e..82673e6 100644 --- a/src/api/database.py +++ b/src/api/database.py @@ -345,38 +345,48 @@ class MusicBrainzDatabase: return None try: - # Clean title - clean_title = title.strip() + # Try with full title first, then with parenthetical content removed + search_titles = [title.strip()] - # Remove karaoke patterns + # Create version with parenthetical content removed + import re + # Remove all parenthetical content like (clean), (remix), (karaoke), etc. + clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) + clean_title = clean_title.strip() + if clean_title != title.strip(): + search_titles.append(clean_title) + + # Also try with specific karaoke patterns removed karaoke_patterns = [ r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)', ] - import re for pattern in karaoke_patterns: - clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE) - clean_title = clean_title.strip() + karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + karaoke_clean = karaoke_clean.strip() + if karaoke_clean != title.strip() and karaoke_clean not in search_titles: + search_titles.append(karaoke_clean) if artist_mbid: - # Try to find recording by artist and title - query = """ - SELECT r.name, r.gid - FROM recording r - JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit - JOIN artist a ON acn.artist = a.id - WHERE a.gid = %s AND r.name ILIKE %s - LIMIT 1 - """ - self.cursor.execute(query, (artist_mbid, f"%{clean_title}%")) - result = self.cursor.fetchone() + # Try to find recording by artist and title with all variations + for search_title in search_titles: + query = """ + SELECT r.name, r.gid + FROM recording r + JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit + JOIN artist a ON acn.artist = a.id + WHERE a.gid = %s AND r.name ILIKE %s + LIMIT 1 + """ + self.cursor.execute(query, (artist_mbid, f"%{search_title}%")) + result = self.cursor.fetchone() + + if result: + return (result['name'], result['gid'], 1.0) - if result: - return (result['name'], result['gid'], 1.0) - - # Try fuzzy search + # Try fuzzy search with all title variations query = """ SELECT r.name, r.gid FROM recording r @@ -392,42 +402,45 @@ class MusicBrainzDatabase: best_score = 0 for recording in recordings: - score = fuzz.ratio(clean_title.lower(), recording['name'].lower()) - if score > best_score: - best_score = score - best_match = recording + for search_title in search_titles: + score = fuzz.ratio(search_title.lower(), recording['name'].lower()) + if score > best_score: + best_score = score + best_match = recording if best_score >= 80: return (best_match['name'], best_match['gid'], best_score / 100.0) else: - # No artist constraint - search by title only, but be more specific - # Try exact match first - query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1" - self.cursor.execute(query, (clean_title,)) - result = self.cursor.fetchone() - if result: - return (result['name'], result['gid'], 1.0) - - # Try case-insensitive match - query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1" - self.cursor.execute(query, (clean_title,)) - result = self.cursor.fetchone() - if result: - return (result['name'], result['gid'], 0.99) - - # Try partial match - query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50" - self.cursor.execute(query, (f"%{clean_title}%",)) - recordings = self.cursor.fetchall() + # No artist constraint - search by title only with all variations + for search_title in search_titles: + # Try exact match first + query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1" + self.cursor.execute(query, (search_title,)) + result = self.cursor.fetchone() + if result: + return (result['name'], result['gid'], 1.0) + + # Try case-insensitive match + query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1" + self.cursor.execute(query, (search_title,)) + result = self.cursor.fetchone() + if result: + return (result['name'], result['gid'], 0.99) + # Try partial match with all variations best_match = None best_score = 0 - for recording in recordings: - score = fuzz.ratio(clean_title.lower(), recording['name'].lower()) - if score > best_score: - best_score = score - best_match = recording + for search_title in search_titles: + query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50" + self.cursor.execute(query, (f"%{search_title}%",)) + recordings = self.cursor.fetchall() + + for recording in recordings: + score = fuzz.ratio(search_title.lower(), recording['name'].lower()) + if score > best_score: + best_score = score + best_match = recording if best_score >= 80: return (best_match['name'], best_match['gid'], best_score / 100.0) @@ -535,31 +548,46 @@ class MusicBrainzDatabase: main_artist, collaborators = self._parse_complex_collaboration(clean_name) collaboration = len(collaborators) > 0 - # Clean title - clean_title = title.strip() + # Try with full title first, then with parenthetical content removed + search_titles = [title.strip()] + + # Create version with parenthetical content removed + import re + # Remove all parenthetical content like (clean), (remix), (karaoke), etc. + clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) + clean_title = clean_title.strip() + if clean_title != title.strip(): + search_titles.append(clean_title) + + # Also try with specific karaoke patterns removed karaoke_patterns = [ r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)', ] - import re for pattern in karaoke_patterns: - clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE) - clean_title = clean_title.strip() + karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + karaoke_clean = karaoke_clean.strip() + if karaoke_clean != title.strip() and karaoke_clean not in search_titles: + search_titles.append(karaoke_clean) - # Try to find recording with artist credit - query = """ - SELECT ac.id as artist_credit_id, r.gid as recording_mbid - FROM recording r - JOIN artist_credit ac ON r.artist_credit = ac.id - JOIN artist_credit_name acn ON ac.id = acn.artist_credit - JOIN artist a ON acn.artist = a.id - WHERE a.name = %s AND r.name ILIKE %s - LIMIT 1 - """ - self.cursor.execute(query, (main_artist, f"%{clean_title}%")) - result = self.cursor.fetchone() + # Try to find recording with artist credit using all title variations + for search_title in search_titles: + query = """ + SELECT ac.id as artist_credit_id, r.gid as recording_mbid + FROM recording r + JOIN artist_credit ac ON r.artist_credit = ac.id + JOIN artist_credit_name acn ON ac.id = acn.artist_credit + JOIN artist a ON acn.artist = a.id + WHERE a.name = %s AND r.name ILIKE %s + LIMIT 1 + """ + self.cursor.execute(query, (main_artist, f"%{search_title}%")) + result = self.cursor.fetchone() + + if result: + break if result: # Get the full artist string for this credit @@ -620,10 +648,13 @@ class MusicBrainzDatabase: # If no primary collaboration found, check secondary patterns if not is_collaboration: for pattern in secondary_patterns: - if pattern.lower() in artist_string.lower(): + # Use word boundaries to avoid splitting within words like "Orlando" + import re + pattern_regex = r'\b' + re.escape(pattern) + r'\b' + if re.search(pattern_regex, artist_string, re.IGNORECASE): # For secondary patterns, be more careful # Check if this looks like a band name vs collaboration - parts = artist_string.split(pattern) + parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) if len(parts) == 2: part1 = parts[0].strip() part2 = parts[1].strip() @@ -686,7 +717,8 @@ class MusicBrainzDatabase: # Split on common separators: comma, ampersand, "and" # Use regex to handle multiple separators - separators = r'[,&]|\band\b' + # Use word boundaries to avoid splitting within words like "Orlando" + separators = r'[,&]|\b(?:and)\b' # Split and clean up each part parts = re.split(separators, collaborators_string, flags=re.IGNORECASE) diff --git a/src/cli/main.py b/src/cli/main.py index 7ae79d6..346306f 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -59,6 +59,11 @@ class MusicBrainzCleaner: return name def _clean_title(self, title: str) -> str: + """ + Clean title by removing karaoke patterns. + Note: The database search now tries multiple title variations including + the full title and versions with parenthetical content removed. + """ title = title.strip() karaoke_patterns = [ r'\s*\(Karaoke Version\)', @@ -189,10 +194,12 @@ class MusicBrainzCleaner: # If no primary collaboration found, check secondary patterns if not is_collaboration: for pattern in secondary_patterns: - if pattern.lower() in artist_string.lower(): + # Use word boundaries to avoid splitting within words like "Orlando" + pattern_regex = r'\b' + re.escape(pattern) + r'\b' + if re.search(pattern_regex, artist_string, re.IGNORECASE): # For secondary patterns, be more careful # Check if this looks like a band name vs collaboration - parts = artist_string.split(pattern) + parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) if len(parts) == 2: part1 = parts[0].strip() part2 = parts[1].strip() @@ -255,7 +262,8 @@ class MusicBrainzCleaner: # Split on common separators: comma, ampersand, "and" # Use regex to handle multiple separators - separators = r'[,&]|\band\b' + # Use word boundaries to avoid splitting within words like "Orlando" + separators = r'[,&]|\b(?:and)\b' # Split and clean up each part parts = re.split(separators, collaborators_string, flags=re.IGNORECASE) @@ -292,6 +300,15 @@ class MusicBrainzCleaner: # Update with the correct artist credit song['artist'] = artist_string song['recording_mbid'] = recording_mbid + + # For collaborations, try to get the main artist's MBID + if self.use_database: + main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', '')) + if main_artist: + artist_result = self.db.fuzzy_search_artist(main_artist) + if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2: + song['mbid'] = artist_result[1] # Set the main artist's MBID + print(f" ✅ Updated to: {song['artist']} - {song.get('title')}") return song else: