Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
20817a3373
commit
dc396a346f
@ -345,38 +345,48 @@ class MusicBrainzDatabase:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Clean title
|
# Try with full title first, then with parenthetical content removed
|
||||||
clean_title = title.strip()
|
search_titles = [title.strip()]
|
||||||
|
|
||||||
# Remove karaoke patterns
|
# Create version with parenthetical content removed
|
||||||
|
import re
|
||||||
|
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||||
|
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||||
|
clean_title = clean_title.strip()
|
||||||
|
if clean_title != title.strip():
|
||||||
|
search_titles.append(clean_title)
|
||||||
|
|
||||||
|
# Also try with specific karaoke patterns removed
|
||||||
karaoke_patterns = [
|
karaoke_patterns = [
|
||||||
r'\s*\(Karaoke Version\)',
|
r'\s*\(Karaoke Version\)',
|
||||||
r'\s*\(Karaoke\)',
|
r'\s*\(Karaoke\)',
|
||||||
r'\s*\(Instrumental\)',
|
r'\s*\(Instrumental\)',
|
||||||
r'\s*\(Backing Track\)',
|
r'\s*\(Backing Track\)',
|
||||||
]
|
]
|
||||||
import re
|
|
||||||
for pattern in karaoke_patterns:
|
for pattern in karaoke_patterns:
|
||||||
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
|
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||||
clean_title = clean_title.strip()
|
karaoke_clean = karaoke_clean.strip()
|
||||||
|
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||||
|
search_titles.append(karaoke_clean)
|
||||||
|
|
||||||
if artist_mbid:
|
if artist_mbid:
|
||||||
# Try to find recording by artist and title
|
# Try to find recording by artist and title with all variations
|
||||||
query = """
|
for search_title in search_titles:
|
||||||
SELECT r.name, r.gid
|
query = """
|
||||||
FROM recording r
|
SELECT r.name, r.gid
|
||||||
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
|
FROM recording r
|
||||||
JOIN artist a ON acn.artist = a.id
|
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
|
||||||
WHERE a.gid = %s AND r.name ILIKE %s
|
JOIN artist a ON acn.artist = a.id
|
||||||
LIMIT 1
|
WHERE a.gid = %s AND r.name ILIKE %s
|
||||||
"""
|
LIMIT 1
|
||||||
self.cursor.execute(query, (artist_mbid, f"%{clean_title}%"))
|
"""
|
||||||
result = self.cursor.fetchone()
|
self.cursor.execute(query, (artist_mbid, f"%{search_title}%"))
|
||||||
|
result = self.cursor.fetchone()
|
||||||
|
|
||||||
|
if result:
|
||||||
|
return (result['name'], result['gid'], 1.0)
|
||||||
|
|
||||||
if result:
|
# Try fuzzy search with all title variations
|
||||||
return (result['name'], result['gid'], 1.0)
|
|
||||||
|
|
||||||
# Try fuzzy search
|
|
||||||
query = """
|
query = """
|
||||||
SELECT r.name, r.gid
|
SELECT r.name, r.gid
|
||||||
FROM recording r
|
FROM recording r
|
||||||
@ -392,42 +402,45 @@ class MusicBrainzDatabase:
|
|||||||
best_score = 0
|
best_score = 0
|
||||||
|
|
||||||
for recording in recordings:
|
for recording in recordings:
|
||||||
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
|
for search_title in search_titles:
|
||||||
if score > best_score:
|
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
|
||||||
best_score = score
|
if score > best_score:
|
||||||
best_match = recording
|
best_score = score
|
||||||
|
best_match = recording
|
||||||
|
|
||||||
if best_score >= 80:
|
if best_score >= 80:
|
||||||
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
||||||
else:
|
else:
|
||||||
# No artist constraint - search by title only, but be more specific
|
# No artist constraint - search by title only with all variations
|
||||||
# Try exact match first
|
for search_title in search_titles:
|
||||||
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
# Try exact match first
|
||||||
self.cursor.execute(query, (clean_title,))
|
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
||||||
result = self.cursor.fetchone()
|
self.cursor.execute(query, (search_title,))
|
||||||
if result:
|
result = self.cursor.fetchone()
|
||||||
return (result['name'], result['gid'], 1.0)
|
if result:
|
||||||
|
return (result['name'], result['gid'], 1.0)
|
||||||
# Try case-insensitive match
|
|
||||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
# Try case-insensitive match
|
||||||
self.cursor.execute(query, (clean_title,))
|
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
||||||
result = self.cursor.fetchone()
|
self.cursor.execute(query, (search_title,))
|
||||||
if result:
|
result = self.cursor.fetchone()
|
||||||
return (result['name'], result['gid'], 0.99)
|
if result:
|
||||||
|
return (result['name'], result['gid'], 0.99)
|
||||||
# Try partial match
|
|
||||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
|
||||||
self.cursor.execute(query, (f"%{clean_title}%",))
|
|
||||||
recordings = self.cursor.fetchall()
|
|
||||||
|
|
||||||
|
# Try partial match with all variations
|
||||||
best_match = None
|
best_match = None
|
||||||
best_score = 0
|
best_score = 0
|
||||||
|
|
||||||
for recording in recordings:
|
for search_title in search_titles:
|
||||||
score = fuzz.ratio(clean_title.lower(), recording['name'].lower())
|
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
||||||
if score > best_score:
|
self.cursor.execute(query, (f"%{search_title}%",))
|
||||||
best_score = score
|
recordings = self.cursor.fetchall()
|
||||||
best_match = recording
|
|
||||||
|
for recording in recordings:
|
||||||
|
score = fuzz.ratio(search_title.lower(), recording['name'].lower())
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = recording
|
||||||
|
|
||||||
if best_score >= 80:
|
if best_score >= 80:
|
||||||
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
return (best_match['name'], best_match['gid'], best_score / 100.0)
|
||||||
@ -535,31 +548,46 @@ class MusicBrainzDatabase:
|
|||||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||||
collaboration = len(collaborators) > 0
|
collaboration = len(collaborators) > 0
|
||||||
|
|
||||||
# Clean title
|
# Try with full title first, then with parenthetical content removed
|
||||||
clean_title = title.strip()
|
search_titles = [title.strip()]
|
||||||
|
|
||||||
|
# Create version with parenthetical content removed
|
||||||
|
import re
|
||||||
|
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||||
|
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||||
|
clean_title = clean_title.strip()
|
||||||
|
if clean_title != title.strip():
|
||||||
|
search_titles.append(clean_title)
|
||||||
|
|
||||||
|
# Also try with specific karaoke patterns removed
|
||||||
karaoke_patterns = [
|
karaoke_patterns = [
|
||||||
r'\s*\(Karaoke Version\)',
|
r'\s*\(Karaoke Version\)',
|
||||||
r'\s*\(Karaoke\)',
|
r'\s*\(Karaoke\)',
|
||||||
r'\s*\(Instrumental\)',
|
r'\s*\(Instrumental\)',
|
||||||
r'\s*\(Backing Track\)',
|
r'\s*\(Backing Track\)',
|
||||||
]
|
]
|
||||||
import re
|
|
||||||
for pattern in karaoke_patterns:
|
for pattern in karaoke_patterns:
|
||||||
clean_title = re.sub(pattern, '', clean_title, flags=re.IGNORECASE)
|
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||||
clean_title = clean_title.strip()
|
karaoke_clean = karaoke_clean.strip()
|
||||||
|
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||||
|
search_titles.append(karaoke_clean)
|
||||||
|
|
||||||
# Try to find recording with artist credit
|
# Try to find recording with artist credit using all title variations
|
||||||
query = """
|
for search_title in search_titles:
|
||||||
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
query = """
|
||||||
FROM recording r
|
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
||||||
JOIN artist_credit ac ON r.artist_credit = ac.id
|
FROM recording r
|
||||||
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
JOIN artist_credit ac ON r.artist_credit = ac.id
|
||||||
JOIN artist a ON acn.artist = a.id
|
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
||||||
WHERE a.name = %s AND r.name ILIKE %s
|
JOIN artist a ON acn.artist = a.id
|
||||||
LIMIT 1
|
WHERE a.name = %s AND r.name ILIKE %s
|
||||||
"""
|
LIMIT 1
|
||||||
self.cursor.execute(query, (main_artist, f"%{clean_title}%"))
|
"""
|
||||||
result = self.cursor.fetchone()
|
self.cursor.execute(query, (main_artist, f"%{search_title}%"))
|
||||||
|
result = self.cursor.fetchone()
|
||||||
|
|
||||||
|
if result:
|
||||||
|
break
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
# Get the full artist string for this credit
|
# Get the full artist string for this credit
|
||||||
@ -620,10 +648,13 @@ class MusicBrainzDatabase:
|
|||||||
# If no primary collaboration found, check secondary patterns
|
# If no primary collaboration found, check secondary patterns
|
||||||
if not is_collaboration:
|
if not is_collaboration:
|
||||||
for pattern in secondary_patterns:
|
for pattern in secondary_patterns:
|
||||||
if pattern.lower() in artist_string.lower():
|
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||||
|
import re
|
||||||
|
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
|
||||||
|
if re.search(pattern_regex, artist_string, re.IGNORECASE):
|
||||||
# For secondary patterns, be more careful
|
# For secondary patterns, be more careful
|
||||||
# Check if this looks like a band name vs collaboration
|
# Check if this looks like a band name vs collaboration
|
||||||
parts = artist_string.split(pattern)
|
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
part1 = parts[0].strip()
|
part1 = parts[0].strip()
|
||||||
part2 = parts[1].strip()
|
part2 = parts[1].strip()
|
||||||
@ -686,7 +717,8 @@ class MusicBrainzDatabase:
|
|||||||
|
|
||||||
# Split on common separators: comma, ampersand, "and"
|
# Split on common separators: comma, ampersand, "and"
|
||||||
# Use regex to handle multiple separators
|
# Use regex to handle multiple separators
|
||||||
separators = r'[,&]|\band\b'
|
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||||
|
separators = r'[,&]|\b(?:and)\b'
|
||||||
|
|
||||||
# Split and clean up each part
|
# Split and clean up each part
|
||||||
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
||||||
|
|||||||
@ -59,6 +59,11 @@ class MusicBrainzCleaner:
|
|||||||
return name
|
return name
|
||||||
|
|
||||||
def _clean_title(self, title: str) -> str:
|
def _clean_title(self, title: str) -> str:
|
||||||
|
"""
|
||||||
|
Clean title by removing karaoke patterns.
|
||||||
|
Note: The database search now tries multiple title variations including
|
||||||
|
the full title and versions with parenthetical content removed.
|
||||||
|
"""
|
||||||
title = title.strip()
|
title = title.strip()
|
||||||
karaoke_patterns = [
|
karaoke_patterns = [
|
||||||
r'\s*\(Karaoke Version\)',
|
r'\s*\(Karaoke Version\)',
|
||||||
@ -189,10 +194,12 @@ class MusicBrainzCleaner:
|
|||||||
# If no primary collaboration found, check secondary patterns
|
# If no primary collaboration found, check secondary patterns
|
||||||
if not is_collaboration:
|
if not is_collaboration:
|
||||||
for pattern in secondary_patterns:
|
for pattern in secondary_patterns:
|
||||||
if pattern.lower() in artist_string.lower():
|
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||||
|
pattern_regex = r'\b' + re.escape(pattern) + r'\b'
|
||||||
|
if re.search(pattern_regex, artist_string, re.IGNORECASE):
|
||||||
# For secondary patterns, be more careful
|
# For secondary patterns, be more careful
|
||||||
# Check if this looks like a band name vs collaboration
|
# Check if this looks like a band name vs collaboration
|
||||||
parts = artist_string.split(pattern)
|
parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
part1 = parts[0].strip()
|
part1 = parts[0].strip()
|
||||||
part2 = parts[1].strip()
|
part2 = parts[1].strip()
|
||||||
@ -255,7 +262,8 @@ class MusicBrainzCleaner:
|
|||||||
|
|
||||||
# Split on common separators: comma, ampersand, "and"
|
# Split on common separators: comma, ampersand, "and"
|
||||||
# Use regex to handle multiple separators
|
# Use regex to handle multiple separators
|
||||||
separators = r'[,&]|\band\b'
|
# Use word boundaries to avoid splitting within words like "Orlando"
|
||||||
|
separators = r'[,&]|\b(?:and)\b'
|
||||||
|
|
||||||
# Split and clean up each part
|
# Split and clean up each part
|
||||||
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
||||||
@ -292,6 +300,15 @@ class MusicBrainzCleaner:
|
|||||||
# Update with the correct artist credit
|
# Update with the correct artist credit
|
||||||
song['artist'] = artist_string
|
song['artist'] = artist_string
|
||||||
song['recording_mbid'] = recording_mbid
|
song['recording_mbid'] = recording_mbid
|
||||||
|
|
||||||
|
# For collaborations, try to get the main artist's MBID
|
||||||
|
if self.use_database:
|
||||||
|
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
|
||||||
|
if main_artist:
|
||||||
|
artist_result = self.db.fuzzy_search_artist(main_artist)
|
||||||
|
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
|
||||||
|
song['mbid'] = artist_result[1] # Set the main artist's MBID
|
||||||
|
|
||||||
print(f" ✅ Updated to: {song['artist']} - {song.get('title')}")
|
print(f" ✅ Updated to: {song['artist']} - {song.get('title')}")
|
||||||
return song
|
return song
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user