Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
3d8b0165af
commit
4bbd03eab7
@ -13,6 +13,7 @@ from ..config.constants import (
|
|||||||
TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
|
TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
|
||||||
SUCCESS_MESSAGES, ERROR_MESSAGES
|
SUCCESS_MESSAGES, ERROR_MESSAGES
|
||||||
)
|
)
|
||||||
|
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
|
||||||
|
|
||||||
|
|
||||||
class MusicBrainzDatabase:
|
class MusicBrainzDatabase:
|
||||||
@ -372,7 +373,7 @@ class MusicBrainzDatabase:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate title variations using the helper method
|
# Generate title variations using the helper method
|
||||||
search_titles = self._generate_title_variations(title)
|
search_titles = generate_title_variations(title)
|
||||||
|
|
||||||
if artist_mbid:
|
if artist_mbid:
|
||||||
# Try to find recording by artist and title with all variations
|
# Try to find recording by artist and title with all variations
|
||||||
@ -418,16 +419,17 @@ class MusicBrainzDatabase:
|
|||||||
else:
|
else:
|
||||||
# No artist constraint - search by title only with all variations
|
# No artist constraint - search by title only with all variations
|
||||||
for search_title in search_titles:
|
for search_title in search_titles:
|
||||||
# Try exact match first
|
# Try exact match first (handle both apostrophe types)
|
||||||
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
search_title_curly = search_title.replace("'", "'")
|
||||||
self.cursor.execute(query, (search_title,))
|
query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1"
|
||||||
|
self.cursor.execute(query, (search_title, search_title_curly))
|
||||||
result = self.cursor.fetchone()
|
result = self.cursor.fetchone()
|
||||||
if result:
|
if result:
|
||||||
return (result['name'], result['gid'], 1.0)
|
return (result['name'], result['gid'], 1.0)
|
||||||
|
|
||||||
# Try case-insensitive match
|
# Try case-insensitive match (handle both apostrophe types)
|
||||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1"
|
||||||
self.cursor.execute(query, (search_title,))
|
self.cursor.execute(query, (search_title, search_title_curly))
|
||||||
result = self.cursor.fetchone()
|
result = self.cursor.fetchone()
|
||||||
if result:
|
if result:
|
||||||
return (result['name'], result['gid'], 0.99)
|
return (result['name'], result['gid'], 0.99)
|
||||||
@ -437,8 +439,10 @@ class MusicBrainzDatabase:
|
|||||||
best_score = 0
|
best_score = 0
|
||||||
|
|
||||||
for search_title in search_titles:
|
for search_title in search_titles:
|
||||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
# Handle both apostrophe types in partial search
|
||||||
self.cursor.execute(query, (f"%{search_title}%",))
|
search_title_curly = search_title.replace("'", "'")
|
||||||
|
query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50"
|
||||||
|
self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%"))
|
||||||
recordings = self.cursor.fetchall()
|
recordings = self.cursor.fetchall()
|
||||||
|
|
||||||
for recording in recordings:
|
for recording in recordings:
|
||||||
@ -550,374 +554,55 @@ class MusicBrainzDatabase:
|
|||||||
clean_name = artist_name.strip()
|
clean_name = artist_name.strip()
|
||||||
|
|
||||||
# Parse complex collaboration
|
# Parse complex collaboration
|
||||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
main_artist, collaborators = parse_complex_collaboration(clean_name)
|
||||||
collaboration = len(collaborators) > 0
|
collaboration = len(collaborators) > 0
|
||||||
|
|
||||||
# Generate title variations using the helper method
|
# Generate title variations using the helper method
|
||||||
search_titles = self._generate_title_variations(title)
|
search_titles = generate_title_variations(title)
|
||||||
|
|
||||||
# Try to find recording with artist credit using all title variations
|
# Build the expected artist credit string
|
||||||
|
expected_artist_string = main_artist
|
||||||
|
if collaborators:
|
||||||
|
expected_artist_string += " & " + " & ".join(collaborators)
|
||||||
|
|
||||||
|
# Try to find recording with matching artist credit using all title variations
|
||||||
for search_title in search_titles:
|
for search_title in search_titles:
|
||||||
query = """
|
# First, find all recordings with matching title (handle both apostrophe types)
|
||||||
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
title_query = """
|
||||||
|
SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id
|
||||||
FROM recording r
|
FROM recording r
|
||||||
JOIN artist_credit ac ON r.artist_credit = ac.id
|
WHERE r.name ILIKE %s OR r.name ILIKE %s
|
||||||
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
|
||||||
JOIN artist a ON acn.artist = a.id
|
|
||||||
WHERE a.name = %s AND r.name ILIKE %s
|
|
||||||
LIMIT 1
|
|
||||||
"""
|
"""
|
||||||
self.cursor.execute(query, (main_artist, f"%{search_title}%"))
|
# Create versions with both straight and curly apostrophes
|
||||||
result = self.cursor.fetchone()
|
search_title_curly = search_title.replace("'", "'").replace("'", "'")
|
||||||
|
self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%"))
|
||||||
|
recordings = self.cursor.fetchall()
|
||||||
|
|
||||||
if result:
|
for recording in recordings:
|
||||||
break
|
# Check if this recording's artist credit matches our expected artist string
|
||||||
|
artist_query = """
|
||||||
if result:
|
SELECT a.name, acn.join_phrase, acn.position
|
||||||
# Get the full artist string for this credit
|
FROM artist_credit_name acn
|
||||||
artist_query = """
|
JOIN artist a ON acn.artist = a.id
|
||||||
SELECT a.name, acn.join_phrase, acn.position
|
WHERE acn.artist_credit = %s
|
||||||
FROM artist_credit_name acn
|
ORDER BY acn.position
|
||||||
JOIN artist a ON acn.artist = a.id
|
"""
|
||||||
WHERE acn.artist_credit = %s
|
self.cursor.execute(artist_query, (recording['artist_credit_id'],))
|
||||||
ORDER BY acn.position
|
artists = self.cursor.fetchall()
|
||||||
"""
|
|
||||||
self.cursor.execute(artist_query, (result['artist_credit_id'],))
|
# Build the actual artist string for this recording
|
||||||
artists = self.cursor.fetchall()
|
actual_artist_string = ""
|
||||||
|
for artist in artists:
|
||||||
# Build the artist string
|
actual_artist_string += artist['name']
|
||||||
artist_string = ""
|
if artist['join_phrase']:
|
||||||
for artist in artists:
|
actual_artist_string += artist['join_phrase']
|
||||||
artist_string += artist['name']
|
|
||||||
if artist['join_phrase']:
|
# Check if the artist strings match
|
||||||
artist_string += artist['join_phrase']
|
if actual_artist_string.lower() == expected_artist_string.lower():
|
||||||
|
return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid'])
|
||||||
return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in find_artist_credit: {e}")
|
print(f"Error in find_artist_credit: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"

    The checks below are tried in order and the first one that applies wins:

    1. Malformed input that starts with "ft " / "feat " and contains commas
       (e.g. "ft Jamie Foxx West, Kanye"): the last comma-separated part is
       taken as the main artist, the parts before it as collaborators.
    2. Primary markers ("ft.", "feat.", "featuring", "ft", "feat"), matched
       case-insensitively as standalone tokens.
    3. " & " and then " and " (whitespace on both sides), only when the split
       yields exactly two parts and the name is not recognised as a band.
    4. Plain commas: first part is the main artist, the rest collaborators.

    Args:
        artist_string: Raw artist credit text.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration, in which case main_artist
    is artist_string unchanged.
    """
    import json
    import os
    import re

    def load_known_band_names():
        # Band names come from data/known_artists.json ("groups" key); a
        # minimal built-in list is used if the file can't be loaded.
        try:
            json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
            with open(json_path, 'r', encoding='utf-8') as f:
                known_data = json.load(f)
            return {name.lower() for name in known_data.get('groups', [])}
        except (OSError, ValueError, KeyError, AttributeError):
            return {
                "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
                "brooks & dunn", "the everly brothers"
            }

    # Special case: malformed artist names like "ft Jamie Foxx West, Kanye"
    # (should be "Kanye West ft. Jamie Foxx").
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        parts = [part.strip() for part in remaining.split(',')]
        if len(parts) >= 2:
            # Assume the last part is the main artist
            return (parts[-1], parts[:-1])

    # Primary collaboration indicators, in priority order.
    primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
    for pattern in primary_patterns:
        # The marker must not be glued to a preceding word character, and a
        # marker ending in a letter must not run into a following one.
        # Otherwise the "ft" in "Daft Punk" or "Taylor Swift" would be
        # treated as a "featuring" marker and the name cut in half.
        marker_regex = r'(?<!\w)' + re.escape(pattern)
        if pattern[-1].isalnum():
            marker_regex += r'(?!\w)'
        match = re.search(marker_regex, artist_string, re.IGNORECASE)
        if match:
            # Split on the first marker only; later markers stay inside the
            # collaborator text.
            main_artist = artist_string[:match.start()].strip()
            collaborators_string = artist_string[match.end():].strip()
            return (main_artist, self._parse_collaborators(collaborators_string))

    # Secondary indicators need more care: "&" / "and" are common in band names.
    known_band_names = None  # loaded lazily, at most once per call
    for pattern in ('&', 'and'):
        # Whitespace boundaries avoid splitting within words like "Orlando"
        pattern_regex = r'\s' + re.escape(pattern) + r'\s'
        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            continue
        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side marks a collaboration outright; otherwise
        # apply the band-name checks.
        if ',' not in part1 and ',' not in part2:
            if known_band_names is None:
                known_band_names = load_known_band_names()
            if artist_string.lower() in known_band_names:
                # This is a known band name, not a collaboration
                continue
            # Many band names use "and the ..." (e.g. "Gerry and the Pacemakers")
            if pattern == 'and' and 'the ' in part2.lower():
                continue

        return (part1, self._parse_collaborators(part2))

    # Comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj")
    if ',' in artist_string:
        parts = [part.strip() for part in artist_string.split(',')]
        # First artist is main, rest are collaborators
        return (parts[0], parts[1:])

    return (artist_string, [])
|
|
||||||
|
|
||||||
def _generate_title_variations(self, title: str) -> List[str]:
    """
    Generate title variations by removing parenthetical content and fixing common issues.

    The stripped input title always comes first. Later entries are added in
    this order: contraction/typo fixes, all parentheses removed, known
    parenthetical suffixes removed one at a time, a trailing "- Live ..."
    segment removed, leftovers from unbalanced parentheses, and a
    whitespace-normalised form.

    Args:
        title: Raw song title.

    Returns list of title variations to try, whitespace-normalised, without
    duplicates and without empty strings (so an empty or blank title gives []).
    """
    import re

    base = title.strip()
    search_titles = [base]

    def add(candidate):
        # Keep only new, non-empty candidates; order of discovery is preserved.
        candidate = candidate.strip()
        if candidate and candidate not in search_titles:
            search_titles.append(candidate)

    # Fix missing apostrophes in common contractions (plus two common typos).
    # Fixes accumulate, and every intermediate result is kept as a variation.
    apostrophe_fixes = [
        (r'\bDont\b', "Don't"),
        (r'\bCant\b', "Can't"),
        (r'\bWont\b', "Won't"),
        (r'\bArent\b', "Aren't"),
        (r'\bIsnt\b', "Isn't"),
        (r'\bWasnt\b', "Wasn't"),
        (r'\bDidnt\b', "Didn't"),
        (r'\bDoesnt\b', "Doesn't"),
        (r'\bHavent\b', "Haven't"),
        (r'\bHasnt\b', "Hasn't"),
        (r'\bWouldnt\b', "Wouldn't"),
        (r'\bCouldnt\b', "Couldn't"),
        (r'\bShouldnt\b', "Shouldn't"),
        (r'\bPhunk\b', "Funk"),  # Common typo
        (r'\bBout\b', "About"),  # Shortened form
    ]
    title_fixes = base
    for pattern, replacement in apostrophe_fixes:
        fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            add(title_fixes)

    # 1. Remove all complete parentheses (most aggressive)
    any_parens = r'\s*\([^)]*\)'
    add(re.sub(any_parens, '', base))

    # 2. Remove specific common parenthetical suffixes, one pattern at a time.
    # Matching is case-insensitive, so each phrase is listed once.
    specific_contents = (
        r'Karaoke Version',
        r'Karaoke',
        r'Instrumental',
        r'Backing Track',
        r'live [^)]*',
        r'Acoustic',
        r'Without Backing Vocals',
        r'Clean',
        r'Remix',
        r'Radio Edit',
        r'Extended Mix',
        r'Single Version',
        r'Album Version',
        r'Original Mix',
        r'John Lewis Christmas Ad \d+',  # Specific pattern from test
        r'from the movie [^)]*',
        r'from the [^)]*',
        r'feat\. [^)]*',
        r'featuring [^)]*',
        r'ft\. [^)]*',
        r'duet',
        r'Two Semitones Down',
        r'Minus Piano',
        r'Cut Down',
        r'Boone & Speedy Vocals',
        r'My Heart Belongs to You',
    )
    without_specific = [
        re.sub(r'\s*\(' + content + r'\)', '', base, flags=re.IGNORECASE)
        for content in specific_contents
    ]
    for candidate in without_specific:
        add(candidate)

    # 3. Remove trailing dash-separated live info (like "- Live At the BBC").
    # This one pattern also covers "Live At/From/In/On ...": any such suffix
    # is "Live " followed by dash-free text up to the end of the title.
    add(re.sub(r'\s*-\s*Live [^-]*$', '', base, flags=re.IGNORECASE))

    # 4. Remove any remaining parentheses after each specific removal
    for candidate in without_specific:
        add(re.sub(any_parens, '', candidate.strip()))

    # 5. Remove unmatched opening parenthesis at end
    add(re.sub(r'\s*\([^)]*$', '', base))

    # 6. Remove unmatched closing parenthesis at start. "(" is excluded from
    # the skipped prefix so this only fires when the first ")" really has no
    # opener; otherwise "Song (Live) Extra" would produce "Extra".
    add(re.sub(r'^[^()]*\)\s*', '', base))

    # 7. Collapse runs of whitespace in the original title
    add(re.sub(r'\s+', ' ', base))

    # 8. Apply whitespace normalization to all versions and remove duplicates
    # (dict.fromkeys keeps first-seen order).
    normalized = (re.sub(r'\s+', ' ', version.strip()).strip() for version in search_titles)
    return list(dict.fromkeys(version for version in normalized if version))
|
|
||||||
|
|
||||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
    """
    Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists

    Args:
        collaborators_string: Collaborator text, i.e. what follows the
            collaboration marker in an artist credit.

    Returns:
        The individual names in their original order, each stripped of
        surrounding whitespace; empty pieces are dropped. An empty or None
        input gives [].
    """
    import re

    if not collaborators_string:
        return []

    # Split on comma, ampersand, or the standalone word "and". Word
    # boundaries avoid splitting within words like "Orlando".
    separators = r'[,&]|\b(?:and)\b'
    pieces = re.split(separators, collaborators_string, flags=re.IGNORECASE)

    # Strip each piece and skip the empty ones
    return [piece.strip() for piece in pieces if piece.strip()]
|
|
||||||
242
src/cli/main.py
242
src/cli/main.py
@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader
|
|||||||
# Import database and API clients
|
# Import database and API clients
|
||||||
from ..api.database import MusicBrainzDatabase
|
from ..api.database import MusicBrainzDatabase
|
||||||
from ..api.api_client import MusicBrainzAPIClient
|
from ..api.api_client import MusicBrainzAPIClient
|
||||||
|
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
|
||||||
|
|
||||||
|
|
||||||
class MusicBrainzCleaner:
|
class MusicBrainzCleaner:
|
||||||
@ -81,7 +82,7 @@ class MusicBrainzCleaner:
|
|||||||
|
|
||||||
# Handle collaborations - try to find main artist first
|
# Handle collaborations - try to find main artist first
|
||||||
# Use the same complex collaboration parsing as the database
|
# Use the same complex collaboration parsing as the database
|
||||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
main_artist, collaborators = parse_complex_collaboration(clean_name)
|
||||||
has_collaboration = len(collaborators) > 0
|
has_collaboration = len(collaborators) > 0
|
||||||
|
|
||||||
if has_collaboration:
|
if has_collaboration:
|
||||||
@ -119,38 +120,68 @@ class MusicBrainzCleaner:
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: Optional[str] = None) -> Optional[str]:
    """
    Find the MBID of a recording for the given artist and title.

    Every variation from generate_title_variations(title) is tried, in order,
    against each source:

    1. The known-recordings table from data_loader, matched on artist MBID
       and case-insensitive title.
    2. With self.use_database set:
       - no artist_mbid but an original_artist (collaboration case): search
         by title only and accept a hit only if its artist credit matches
         original_artist, either exactly (with "," and " and " normalised
         to " & ") or partially (main artist plus at least one collaborator
         present in the credit);
       - otherwise: search by title and artist_mbid.
    3. Without the database: the API client, per variation.

    Args:
        artist_mbid: Artist MBID; falsy in the collaboration case.
        title: Raw song title.
        original_artist: Original artist credit text, used to verify
            title-only hits for collaborations.

    Returns:
        The recording MBID, or None when nothing matched.
    """
    # Use the shared title variations logic for comprehensive title cleaning
    title_variations = generate_title_variations(title)

    # Check known recordings first with all title variations
    known_recordings = data_loader.load_known_recordings()
    lowered_variations = {variation.lower() for variation in title_variations}
    for (known_artist_mbid, known_title), recording_mbid in known_recordings.items():
        if artist_mbid == known_artist_mbid and known_title.lower() in lowered_variations:
            return recording_mbid

    if not self.use_database:
        # Fallback to API - try all title variations
        for variation in title_variations:
            try:
                result = self.api.search_recording(variation, artist_mbid)
                if result:
                    return result['id']
            except Exception:
                # Best effort: a failed lookup for one variation must not
                # stop the others. Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
        return None

    if not artist_mbid and original_artist:
        # Collaboration case: search by title, then verify the artist credit.
        # Both values are loop-invariant, so compute them once.
        expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ').lower()
        main_artist, collaborators = parse_complex_collaboration(original_artist)

        for variation in title_variations:
            result = self.db.fuzzy_search_recording(variation)
            if not (result and isinstance(result, tuple) and len(result) >= 2):
                continue
            recording_mbid = result[1]  # tuple is (recording_name, mbid, score)

            recording_info = self.get_recording_info(recording_mbid)
            artist_credit = recording_info.get('artist_credit') if recording_info else None
            if not artist_credit:
                continue
            recording_artists = artist_credit.lower()

            # Exact match on the normalised credit string
            if recording_artists == expected_artist_string:
                return recording_mbid

            # Partial match: main artist and at least one collaborator present
            if main_artist.lower() in recording_artists and any(
                collaborator.lower() in recording_artists for collaborator in collaborators
            ):
                return recording_mbid
        return None

    # Regular case with artist_mbid - try all title variations
    for variation in title_variations:
        result = self.db.fuzzy_search_recording(variation, artist_mbid)
        if result and isinstance(result, tuple) and len(result) >= 2:
            return result[1]  # Return MBID from tuple (recording_name, mbid, score)

    return None
|
||||||
|
|
||||||
@ -176,130 +207,6 @@ class MusicBrainzCleaner:
|
|||||||
parts.append(credit['joinphrase'])
|
parts.append(credit['joinphrase'])
|
||||||
return ''.join(parts).strip()
|
return ''.join(parts).strip()
|
||||||
|
|
||||||
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"

    The checks below are tried in order and the first one that applies wins:

    1. Primary markers ("ft.", "feat.", "featuring", "ft", "feat"), matched
       case-insensitively as standalone tokens.
    2. " & " and then " and " (whitespace on both sides), only when the split
       yields exactly two parts and the name is not an obvious band name.
    3. A comma: text before the first comma is the main artist, the rest is
       parsed as collaborators.

    Args:
        artist_string: Raw artist credit text.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration, in which case main_artist
    is artist_string unchanged.
    """
    import re

    # Primary collaboration indicators, in priority order.
    primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
    for pattern in primary_patterns:
        # The marker must not be glued to a preceding word character, and a
        # marker ending in a letter must not run into a following one.
        # Otherwise the "ft" in "Daft Punk" or "Taylor Swift" would be
        # treated as a "featuring" marker and the name cut in half.
        marker_regex = r'(?<!\w)' + re.escape(pattern)
        if pattern[-1].isalnum():
            marker_regex += r'(?!\w)'
        match = re.search(marker_regex, artist_string, re.IGNORECASE)
        if match:
            # Split on the first marker only; later markers stay inside the
            # collaborator text.
            main_artist = artist_string[:match.start()].strip()
            collaborators_string = artist_string[match.end():].strip()
            return (main_artist, self._parse_collaborators(collaborators_string))

    # Secondary indicators need more care: "&" / "and" also occur in band names.
    # Only very obvious band names are treated as non-collaborations.
    obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
    for pattern in ('&', 'and'):
        # Whitespace boundaries avoid splitting within words
        pattern_regex = r'\s' + re.escape(pattern) + r'\s'
        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            continue
        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side marks a collaboration outright; the band
        # check only applies to one-word names on both sides.
        has_comma = ',' in part1 or ',' in part2
        single_words = len(part1.split()) == 1 and len(part2.split()) == 1
        if not has_comma and single_words and artist_string.lower() in obvious_band_names:
            # Likely a band name, skip
            continue

        return (part1, self._parse_collaborators(part2))

    # Comma-separated artists. A plain "," test is used because a regex like
    # \b,\b cannot match "A, B": there is no word boundary between a comma
    # and the space after it.
    if ',' in artist_string:
        main_artist, _, collaborators_string = artist_string.partition(',')
        return (main_artist.strip(), self._parse_collaborators(collaborators_string.strip()))

    return (artist_string, [])
|
|
||||||
|
|
||||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
|
|
||||||
"""
|
|
||||||
Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
|
|
||||||
if not collaborators_string:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Split on common separators: comma, ampersand, "and"
|
|
||||||
# Use regex to handle multiple separators
|
|
||||||
# Use word boundaries to avoid splitting within words like "Orlando"
|
|
||||||
separators = r'[,&]|\b(?:and)\b'
|
|
||||||
|
|
||||||
# Split and clean up each part
|
|
||||||
parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)
|
|
||||||
|
|
||||||
collaborators = []
|
|
||||||
for part in parts:
|
|
||||||
part = part.strip()
|
|
||||||
if part: # Skip empty parts
|
|
||||||
collaborators.append(part)
|
|
||||||
|
|
||||||
return collaborators
|
|
||||||
|
|
||||||
def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
|
def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
|
||||||
"""
|
"""
|
||||||
Clean a single song and return (cleaned_song, success_status)
|
Clean a single song and return (cleaned_song, success_status)
|
||||||
@ -313,44 +220,37 @@ class MusicBrainzCleaner:
|
|||||||
# Handle collaboration case (artist_mbid is None)
|
# Handle collaboration case (artist_mbid is None)
|
||||||
# Use the same complex collaboration parsing as the database
|
# Use the same complex collaboration parsing as the database
|
||||||
artist_name = song.get('artist', '')
|
artist_name = song.get('artist', '')
|
||||||
main_artist, collaborators = self._parse_complex_collaboration(artist_name)
|
main_artist, collaborators = parse_complex_collaboration(artist_name)
|
||||||
has_collaboration = len(collaborators) > 0
|
has_collaboration = len(collaborators) > 0
|
||||||
|
|
||||||
if artist_mbid is None and has_collaboration:
|
if artist_mbid is None and has_collaboration:
|
||||||
# Try to find recording using artist credit approach
|
# Try to find recording using the improved find_recording_mbid method
|
||||||
if self.use_database:
|
recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist)
|
||||||
result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', ''))
|
if recording_mbid:
|
||||||
if result:
|
# Get recording info to update the song
|
||||||
artist_credit_id, artist_string, recording_mbid = result
|
recording_info = self.get_recording_info(recording_mbid)
|
||||||
|
if recording_info:
|
||||||
|
# Update with the correct artist credit and title
|
||||||
|
if self.use_database and recording_info.get('artist_credit'):
|
||||||
|
song['artist'] = recording_info['artist_credit']
|
||||||
|
elif not self.use_database and recording_info.get('artist-credit'):
|
||||||
|
artist_string = self._build_artist_string(recording_info['artist-credit'])
|
||||||
|
if artist_string:
|
||||||
|
song['artist'] = artist_string
|
||||||
|
|
||||||
# Update with the correct artist credit
|
song['title'] = recording_info['title']
|
||||||
song['artist'] = artist_string
|
|
||||||
song['recording_mbid'] = recording_mbid
|
song['recording_mbid'] = recording_mbid
|
||||||
|
|
||||||
# For collaborations, try to get the main artist's MBID
|
# For collaborations, try to get the main artist's MBID
|
||||||
if self.use_database:
|
if self.use_database:
|
||||||
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
|
main_artist, collaborators = parse_complex_collaboration(song.get('artist', ''))
|
||||||
if main_artist:
|
if main_artist:
|
||||||
artist_result = self.db.fuzzy_search_artist(main_artist)
|
artist_result = self.db.fuzzy_search_artist(main_artist)
|
||||||
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
|
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
|
||||||
song['mbid'] = artist_result[1] # Set the main artist's MBID
|
song['mbid'] = artist_result[1] # Set the main artist's MBID
|
||||||
|
|
||||||
return song, True
|
return song, True
|
||||||
else:
|
return song, False
|
||||||
return song, False
|
|
||||||
else:
|
|
||||||
# Fallback to API method
|
|
||||||
recording_mbid = self.find_recording_mbid(None, song.get('title', ''))
|
|
||||||
if recording_mbid:
|
|
||||||
recording_info = self.get_recording_info(recording_mbid)
|
|
||||||
if recording_info and recording_info.get('artist-credit'):
|
|
||||||
artist_string = self._build_artist_string(recording_info['artist-credit'])
|
|
||||||
if artist_string:
|
|
||||||
song['artist'] = artist_string
|
|
||||||
song['title'] = recording_info['title']
|
|
||||||
song['recording_mbid'] = recording_mbid
|
|
||||||
return song, True
|
|
||||||
return song, False
|
|
||||||
|
|
||||||
# Regular case (non-collaboration or collaboration not found)
|
# Regular case (non-collaboration or collaboration not found)
|
||||||
if not artist_mbid:
|
if not artist_mbid:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user