Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
20817a3373
commit
dc396a346f
@ -345,38 +345,48 @@ class MusicBrainzDatabase:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Clean title
|
||||
clean_title = title.strip()
|
||||
# Try with full title first, then with parenthetical content removed
|
||||
search_titles = [title.strip()]
|
||||
|
||||
# Remove karaoke patterns
|
||||
# Create version with parenthetical content removed
|
||||
import re
|
||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||
clean_title = clean_title.strip()
|
||||
if clean_title != title.strip():
|
||||
search_titles.append(clean_title)
|
||||
|
||||
# Also try with specific karaoke patterns removed
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
r'\s*\(Karaoke\)',
|
||||
r'\s*\(Instrumental\)',
|
||||
r'\s*\(Backing Track\)',
|
||||
]
|
||||
import re
|
||||
for pattern in karaoke_patterns:
|
||||
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
|
||||
clean_title = clean_title.strip()
|
||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||
karaoke_clean = karaoke_clean.strip()
|
||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||
search_titles.append(karaoke_clean)
|
||||
|
||||
if artist_mbid:
|
||||
# Try to find recording by artist and title
|
||||
query = """
|
||||
SELECT r.name, r.gid
|
||||
FROM recording r
|
||||
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE a.gid = %s AND r.name ILIKE %s
|
||||
LIMIT 1
|
||||
"""
|
||||
self.cursor.execute(query, (artist_mbid, f"%{clean_title}%"))
|
||||
result = self.cursor.fetchone()
|
||||
# Try to find recording by artist and title with all variations
|
||||
for search_title in search_titles:
|
||||
query = """
|
||||
SELECT r.name, r.gid
|
||||
FROM recording r
|
||||
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE a.gid = %s AND r.name ILIKE %s
|
||||
LIMIT 1
|
||||
"""
|
||||
self.cursor.execute(query, (artist_mbid, f"%{search_title}%"))
|
||||
result = self.cursor.fetchone()
|
||||
|
||||
if result:
|
||||
return (result['name'], result['gid'], 1.0)
|
||||
|
||||
if result:
|
||||
return (result['name'], result['gid'], 1.0)
|
||||
|
||||
# Try fuzzy search
|
||||
# Try fuzzy search with all title variations
|
||||
query = """
|
||||
SELECT r.name, r.gid
|
||||
FROM recording r
|
||||
@ -392,42 +402,45 @@ class MusicBrainzDatabase:
|
||||
best_score = 0
|
||||
|
||||
for recording in recordings:
|
||||
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = recording
|
||||
for search_title in search_titles:
|
||||
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = recording
|
||||
|
||||
if best_score >= 80:
|
||||
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
||||
else:
|
||||
# No artist constraint - search by title only, but be more specific
|
||||
# Try exact match first
|
||||
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (clean_title,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 1.0)
|
||||
|
||||
# Try case-insensitive match
|
||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
||||
self.cursor.execute(query, (clean_title,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 0.99)
|
||||
|
||||
# Try partial match
|
||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
||||
self.cursor.execute(query, (f"%{clean_title}%",))
|
||||
recordings = self.cursor.fetchall()
|
||||
# No artist constraint - search by title only with all variations
|
||||
for search_title in search_titles:
|
||||
# Try exact match first
|
||||
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (search_title,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 1.0)
|
||||
|
||||
# Try case-insensitive match
|
||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
||||
self.cursor.execute(query, (search_title,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 0.99)
|
||||
|
||||
# Try partial match with all variations
|
||||
best_match = None
|
||||
best_score = 0
|
||||
|
||||
for recording in recordings:
|
||||
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = recording
|
||||
for search_title in search_titles:
|
||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
||||
self.cursor.execute(query, (f"%{search_title}%",))
|
||||
recordings = self.cursor.fetchall()
|
||||
|
||||
for recording in recordings:
|
||||
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = recording
|
||||
|
||||
if best_score >= 80:
|
||||
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
||||
@ -535,31 +548,46 @@ class MusicBrainzDatabase:
|
||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||
collaboration = len(collaborators) > 0
|
||||
|
||||
# Clean title
|
||||
clean_title = title.strip()
|
||||
# Try with full title first, then with parenthetical content removed
|
||||
search_titles = [title.strip()]
|
||||
|
||||
# Create version with parenthetical content removed
|
||||
import re
|
||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||
clean_title = clean_title.strip()
|
||||
if clean_title != title.strip():
|
||||
search_titles.append(clean_title)
|
||||
|
||||
# Also try with specific karaoke patterns removed
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
r'\s*\(Karaoke\)',
|
||||
r'\s*\(Instrumental\)',
|
||||
r'\s*\(Backing Track\)',
|
||||
]
|
||||
import re
|
||||
for pattern in karaoke_patterns:
|
||||
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
|
||||
clean_title = clean_title.strip()
|
||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||
karaoke_clean = karaoke_clean.strip()
|
||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||
search_titles.append(karaoke_clean)
|
||||
|
||||
# Try to find recording with artist credit
|
||||
query = """
|
||||
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
||||
FROM recording r
|
||||
JOIN artist_credit ac ON r.artist_credit = ac.id
|
||||
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE a.name = %s AND r.name ILIKE %s
|
||||
LIMIT 1
|
||||
"""
|
||||
self.cursor.execute(query, (main_artist, f"%{clean_title}%"))
|
||||
result = self.cursor.fetchone()
|
||||
# Try to find recording with artist credit using all title variations
|
||||
for search_title in search_titles:
|
||||
query = """
|
||||
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
||||
FROM recording r
|
||||
JOIN artist_credit ac ON r.artist_credit = ac.id
|
||||
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE a.name = %s AND r.name ILIKE %s
|
||||
LIMIT 1
|
||||
"""
|
||||
self.cursor.execute(query, (main_artist, f"%{search_title}%"))
|
||||
result = self.cursor.fetchone()
|
||||
|
||||
if result:
|
||||
break
|
||||
|
||||
if result:
|
||||
# Get the full artist string for this credit
|
||||
@ -620,10 +648,13 @@ class MusicBrainzDatabase:
|
||||
# If no primary collaboration found, check secondary patterns
|
||||
if not is_collaboration:
|
||||
for pattern in secondary_patterns:
|
||||
if pattern.lower() in artist_string.lower():
|
||||
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||
import re
|
||||
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
|
||||
if re.search(pattern_regex, artist_string, re.IGNORECASE):
|
||||
# For secondary patterns, be more careful
|
||||
# Check if this looks like a band name vs collaboration
|
||||
parts = artist_string.split(pattern)
|
||||
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
|
||||
if len(parts) == 2:
|
||||
part1 = parts[0].strip()
|
||||
part2 = parts[1].strip()
|
||||
@ -686,7 +717,8 @@ class MusicBrainzDatabase:
|
||||
|
||||
# Split on common separators: comma, ampersand, "and"
|
||||
# Use regex to handle multiple separators
|
||||
separators = r'[,&]|\band\b'
|
||||
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||
separators = r'[,&]|\b(?:and)\b'
|
||||
|
||||
# Split and clean up each part
|
||||
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
||||
|
||||
@ -59,6 +59,11 @@ class MusicBrainzCleaner:
|
||||
return name
|
||||
|
||||
def _clean_title(self, title: str) -> str:
|
||||
"""
|
||||
Clean title by removing karaoke patterns.
|
||||
Note: The database search now tries multiple title variations including
|
||||
the full title and versions with parenthetical content removed.
|
||||
"""
|
||||
title = title.strip()
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
@ -189,10 +194,12 @@ class MusicBrainzCleaner:
|
||||
# If no primary collaboration found, check secondary patterns
|
||||
if not is_collaboration:
|
||||
for pattern in secondary_patterns:
|
||||
if pattern.lower() in artist_string.lower():
|
||||
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
|
||||
if re.search(pattern_regex, artist_string, re.IGNORECASE):
|
||||
# For secondary patterns, be more careful
|
||||
# Check if this looks like a band name vs collaboration
|
||||
parts = artist_string.split(pattern)
|
||||
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
|
||||
if len(parts) == 2:
|
||||
part1 = parts[0].strip()
|
||||
part2 = parts[1].strip()
|
||||
@ -255,7 +262,8 @@ class MusicBrainzCleaner:
|
||||
|
||||
# Split on common separators: comma, ampersand, "and"
|
||||
# Use regex to handle multiple separators
|
||||
separators = r'[,&]|\band\b'
|
||||
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||
separators = r'[,&]|\b(?:and)\b'
|
||||
|
||||
# Split and clean up each part
|
||||
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
||||
@ -292,6 +300,15 @@ class MusicBrainzCleaner:
|
||||
# Update with the correct artist credit
|
||||
song['artist'] = artist_string
|
||||
song['recording_mbid'] = recording_mbid
|
||||
|
||||
# For collaborations, try to get the main artist's MBID
|
||||
if self.use_database:
|
||||
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
|
||||
if main_artist:
|
||||
artist_result = self.db.fuzzy_search_artist(main_artist)
|
||||
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
|
||||
song['mbid'] = artist_result[1] # Set the main artist's MBID
|
||||
|
||||
print(f" ✅ Updated to: {song['artist']} - {song.get('title')}")
|
||||
return song
|
||||
else:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user