Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-07-31 16:24:48 -05:00
parent 20817a3373
commit dc396a346f
2 changed files with 121 additions and 72 deletions

View File

@@ -345,38 +345,48 @@ class MusicBrainzDatabase:
return None
try:
# Clean title
clean_title = title.strip()
# Try with full title first, then with parenthetical content removed
search_titles = [title.strip()]
# Remove karaoke patterns
# Create version with parenthetical content removed
import re
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
clean_title = clean_title.strip()
if clean_title != title.strip():
search_titles.append(clean_title)
# Also try with specific karaoke patterns removed
karaoke_patterns = [
r'\s*\(Karaoke Version\)',
r'\s*\(Karaoke\)',
r'\s*\(Instrumental\)',
r'\s*\(Backing Track\)',
]
import re
for pattern in karaoke_patterns:
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
clean_title = clean_title.strip()
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
karaoke_clean = karaoke_clean.strip()
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
search_titles.append(karaoke_clean)
if artist_mbid:
# Try to find recording by artist and title
query = """
SELECT r.name, r.gid
FROM recording r
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
JOIN artist a ON acn.artist = a.id
WHERE a.gid = %s AND r.name ILIKE %s
LIMIT 1
"""
self.cursor.execute(query, (artist_mbid, f"%{clean_title}%"))
result = self.cursor.fetchone()
# Try to find recording by artist and title with all variations
for search_title in search_titles:
query = """
SELECT r.name, r.gid
FROM recording r
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
JOIN artist a ON acn.artist = a.id
WHERE a.gid = %s AND r.name ILIKE %s
LIMIT 1
"""
self.cursor.execute(query, (artist_mbid, f"%{search_title}%"))
result = self.cursor.fetchone()
if result:
return (result['name'], result['gid'], 1.0)
if result:
return (result['name'], result['gid'], 1.0)
# Try fuzzy search
# Try fuzzy search with all title variations
query = """
SELECT r.name, r.gid
FROM recording r
@@ -392,42 +402,45 @@ class MusicBrainzDatabase:
best_score = 0
for recording in recordings:
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
if score > best_score:
best_score = score
best_match = recording
for search_title in search_titles:
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
if score > best_score:
best_score = score
best_match = recording
if best_score >= 80:
return (best_match['name'], best_match['gid'], best_score / 100.0)
else:
# No artist constraint - search by title only, but be more specific
# Try exact match first
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
self.cursor.execute(query, (clean_title,))
result = self.cursor.fetchone()
if result:
return (result['name'], result['gid'], 1.0)
# Try case-insensitive match
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
self.cursor.execute(query, (clean_title,))
result = self.cursor.fetchone()
if result:
return (result['name'], result['gid'], 0.99)
# Try partial match
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
self.cursor.execute(query, (f"%{clean_title}%",))
recordings = self.cursor.fetchall()
# No artist constraint - search by title only with all variations
for search_title in search_titles:
# Try exact match first
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
self.cursor.execute(query, (search_title,))
result = self.cursor.fetchone()
if result:
return (result['name'], result['gid'], 1.0)
# Try case-insensitive match
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
self.cursor.execute(query, (search_title,))
result = self.cursor.fetchone()
if result:
return (result['name'], result['gid'], 0.99)
# Try partial match with all variations
best_match = None
best_score = 0
for recording in recordings:
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
if score > best_score:
best_score = score
best_match = recording
for search_title in search_titles:
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
self.cursor.execute(query, (f"%{search_title}%",))
recordings = self.cursor.fetchall()
for recording in recordings:
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
if score > best_score:
best_score = score
best_match = recording
if best_score >= 80:
return (best_match['name'], best_match['gid'], best_score / 100.0)
@@ -535,31 +548,46 @@ class MusicBrainzDatabase:
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
collaboration = len(collaborators) > 0
# Clean title
clean_title = title.strip()
# Try with full title first, then with parenthetical content removed
search_titles = [title.strip()]
# Create version with parenthetical content removed
import re
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
clean_title = clean_title.strip()
if clean_title != title.strip():
search_titles.append(clean_title)
# Also try with specific karaoke patterns removed
karaoke_patterns = [
r'\s*\(Karaoke Version\)',
r'\s*\(Karaoke\)',
r'\s*\(Instrumental\)',
r'\s*\(Backing Track\)',
]
import re
for pattern in karaoke_patterns:
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
clean_title = clean_title.strip()
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
karaoke_clean = karaoke_clean.strip()
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
search_titles.append(karaoke_clean)
# Try to find recording with artist credit
query = """
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
FROM recording r
JOIN artist_credit ac ON r.artist_credit = ac.id
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
JOIN artist a ON acn.artist = a.id
WHERE a.name = %s AND r.name ILIKE %s
LIMIT 1
"""
self.cursor.execute(query, (main_artist, f"%{clean_title}%"))
result = self.cursor.fetchone()
# Try to find recording with artist credit using all title variations
for search_title in search_titles:
query = """
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
FROM recording r
JOIN artist_credit ac ON r.artist_credit = ac.id
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
JOIN artist a ON acn.artist = a.id
WHERE a.name = %s AND r.name ILIKE %s
LIMIT 1
"""
self.cursor.execute(query, (main_artist, f"%{search_title}%"))
result = self.cursor.fetchone()
if result:
break
if result:
# Get the full artist string for this credit
@@ -620,10 +648,13 @@ class MusicBrainzDatabase:
# If no primary collaboration found, check secondary patterns
if not is_collaboration:
for pattern in secondary_patterns:
if pattern.lower() in artist_string.lower():
# Use word boundaries to avoid splitting within words like "Orlando"
import re
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
if re.search(pattern_regex, artist_string, re.IGNORECASE):
# For secondary patterns, be more careful
# Check if this looks like a band name vs collaboration
parts = artist_string.split(pattern)
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
if len(parts) == 2:
part1 = parts[0].strip()
part2 = parts[1].strip()
@@ -686,7 +717,8 @@ class MusicBrainzDatabase:
# Split on common separators: comma, ampersand, "and"
# Use regex to handle multiple separators
separators = r'[,&]|\band\b'
# Use word boundaries to avoid splitting within words like "Orlando"
separators = r'[,&]|\b(?:and)\b'
# Split and clean up each part
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)

View File

@@ -59,6 +59,11 @@ class MusicBrainzCleaner:
return name
def _clean_title(self, title: str) -> str:
"""
Clean title by removing karaoke patterns.
Note: The database search now tries multiple title variations including
the full title and versions with parenthetical content removed.
"""
title = title.strip()
karaoke_patterns = [
r'\s*\(Karaoke Version\)',
@@ -189,10 +194,12 @@ class MusicBrainzCleaner:
# If no primary collaboration found, check secondary patterns
if not is_collaboration:
for pattern in secondary_patterns:
if pattern.lower() in artist_string.lower():
# Use word boundaries to avoid splitting within words like "Orlando"
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
if re.search(pattern_regex, artist_string, re.IGNORECASE):
# For secondary patterns, be more careful
# Check if this looks like a band name vs collaboration
parts = artist_string.split(pattern)
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
if len(parts) == 2:
part1 = parts[0].strip()
part2 = parts[1].strip()
@@ -255,7 +262,8 @@ class MusicBrainzCleaner:
# Split on common separators: comma, ampersand, "and"
# Use regex to handle multiple separators
separators = r'[,&]|\band\b'
# Use word boundaries to avoid splitting within words like "Orlando"
separators = r'[,&]|\b(?:and)\b'
# Split and clean up each part
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
@@ -292,6 +300,15 @@ class MusicBrainzCleaner:
# Update with the correct artist credit
song['artist'] = artist_string
song['recording_mbid'] = recording_mbid
# For collaborations, try to get the main artist's MBID
if self.use_database:
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
if main_artist:
artist_result = self.db.fuzzy_search_artist(main_artist)
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
song['mbid'] = artist_result[1] # Set the main artist's MBID
print(f" ✅ Updated to: {song['artist']} - {song.get('title')}")
return song
else: