Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
3d8b0165af
commit
4bbd03eab7
@ -13,6 +13,7 @@ from ..config.constants import (
|
||||
TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
|
||||
SUCCESS_MESSAGES, ERROR_MESSAGES
|
||||
)
|
||||
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
|
||||
|
||||
|
||||
class MusicBrainzDatabase:
|
||||
@ -372,7 +373,7 @@ class MusicBrainzDatabase:
|
||||
|
||||
try:
|
||||
# Generate title variations using the helper method
|
||||
search_titles = self._generate_title_variations(title)
|
||||
search_titles = generate_title_variations(title)
|
||||
|
||||
if artist_mbid:
|
||||
# Try to find recording by artist and title with all variations
|
||||
@ -418,16 +419,17 @@ class MusicBrainzDatabase:
|
||||
else:
|
||||
# No artist constraint - search by title only with all variations
|
||||
for search_title in search_titles:
|
||||
# Try exact match first
|
||||
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (search_title,))
|
||||
# Try exact match first (handle both apostrophe types)
|
||||
search_title_curly = search_title.replace("'", "'")
|
||||
query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (search_title, search_title_curly))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 1.0)
|
||||
|
||||
# Try case-insensitive match
|
||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
|
||||
self.cursor.execute(query, (search_title,))
|
||||
# Try case-insensitive match (handle both apostrophe types)
|
||||
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1"
|
||||
self.cursor.execute(query, (search_title, search_title_curly))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
return (result['name'], result['gid'], 0.99)
|
||||
@ -437,8 +439,10 @@ class MusicBrainzDatabase:
|
||||
best_score = 0
|
||||
|
||||
for search_title in search_titles:
|
||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
|
||||
self.cursor.execute(query, (f"%{search_title}%",))
|
||||
# Handle both apostrophe types in partial search
|
||||
search_title_curly = search_title.replace("'", "'")
|
||||
query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50"
|
||||
self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%"))
|
||||
recordings = self.cursor.fetchall()
|
||||
|
||||
for recording in recordings:
|
||||
@ -550,374 +554,55 @@ class MusicBrainzDatabase:
|
||||
clean_name = artist_name.strip()
|
||||
|
||||
# Parse complex collaboration
|
||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||
main_artist, collaborators = parse_complex_collaboration(clean_name)
|
||||
collaboration = len(collaborators) > 0
|
||||
|
||||
# Generate title variations using the helper method
|
||||
search_titles = self._generate_title_variations(title)
|
||||
search_titles = generate_title_variations(title)
|
||||
|
||||
# Try to find recording with artist credit using all title variations
|
||||
# Build the expected artist credit string
|
||||
expected_artist_string = main_artist
|
||||
if collaborators:
|
||||
expected_artist_string += " & " + " & ".join(collaborators)
|
||||
|
||||
# Try to find recording with matching artist credit using all title variations
|
||||
for search_title in search_titles:
|
||||
query = """
|
||||
SELECT ac.id as artist_credit_id, r.gid as recording_mbid
|
||||
# First, find all recordings with matching title (handle both apostrophe types)
|
||||
title_query = """
|
||||
SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id
|
||||
FROM recording r
|
||||
JOIN artist_credit ac ON r.artist_credit = ac.id
|
||||
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE a.name = %s AND r.name ILIKE %s
|
||||
LIMIT 1
|
||||
WHERE r.name ILIKE %s OR r.name ILIKE %s
|
||||
"""
|
||||
self.cursor.execute(query, (main_artist, f"%{search_title}%"))
|
||||
result = self.cursor.fetchone()
|
||||
# Create versions with both straight and curly apostrophes
|
||||
search_title_curly = search_title.replace("'", "'").replace("'", "'")
|
||||
self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%"))
|
||||
recordings = self.cursor.fetchall()
|
||||
|
||||
if result:
|
||||
break
|
||||
for recording in recordings:
|
||||
# Check if this recording's artist credit matches our expected artist string
|
||||
artist_query = """
|
||||
SELECT a.name, acn.join_phrase, acn.position
|
||||
FROM artist_credit_name acn
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE acn.artist_credit = %s
|
||||
ORDER BY acn.position
|
||||
"""
|
||||
self.cursor.execute(artist_query, (recording['artist_credit_id'],))
|
||||
artists = self.cursor.fetchall()
|
||||
|
||||
if result:
|
||||
# Get the full artist string for this credit
|
||||
artist_query = """
|
||||
SELECT a.name, acn.join_phrase, acn.position
|
||||
FROM artist_credit_name acn
|
||||
JOIN artist a ON acn.artist = a.id
|
||||
WHERE acn.artist_credit = %s
|
||||
ORDER BY acn.position
|
||||
"""
|
||||
self.cursor.execute(artist_query, (result['artist_credit_id'],))
|
||||
artists = self.cursor.fetchall()
|
||||
# Build the actual artist string for this recording
|
||||
actual_artist_string = ""
|
||||
for artist in artists:
|
||||
actual_artist_string += artist['name']
|
||||
if artist['join_phrase']:
|
||||
actual_artist_string += artist['join_phrase']
|
||||
|
||||
# Build the artist string
|
||||
artist_string = ""
|
||||
for artist in artists:
|
||||
artist_string += artist['name']
|
||||
if artist['join_phrase']:
|
||||
artist_string += artist['join_phrase']
|
||||
|
||||
return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])
|
||||
# Check if the artist strings match
|
||||
if actual_artist_string.lower() == expected_artist_string.lower():
|
||||
return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid'])
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in find_artist_credit: {e}")
|
||||
return None
|
||||
|
||||
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer".

    Detection runs in this order and stops at the first rule that applies:

    1. Malformed strings starting with "ft " / "feat " that contain a comma
       (e.g. "ft Jamie Foxx West, Kanye"): the last comma-separated part is
       taken as the main artist, the preceding parts as collaborators.
    2. A primary indicator ("ft.", "feat.", "featuring", "ft", "feat"),
       matched case-insensitively and only as a standalone token, so names
       that merely contain those letters ("Taylor Swift", "Daft Punk") are
       not split.
    3. Exactly one whitespace-delimited "&" or "and". The string is left
       alone when it is a known band name, or when "and" is followed by a
       part containing "the " (e.g. "Gerry and the Pacemakers").
    4. A comma-separated list: first name is the main artist, the remaining
       non-empty names are collaborators.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration.
    """
    import json
    import os
    import re

    def _load_known_band_names():
        # Band names that legitimately contain "&"/"and"; read from the
        # project's data file, falling back to a minimal built-in list.
        json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                known_data = json.load(f)
            return {name.lower() for name in known_data.get('groups', [])}
        except (OSError, ValueError, AttributeError):
            return {
                "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
                "brooks & dunn", "the everly brothers"
            }

    # Special case: malformed artist names like "ft Jamie Foxx West, Kanye".
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        if ',' in remaining:
            # A comma guarantees at least two parts; the last one is
            # assumed to be the main artist.
            parts = [part.strip() for part in remaining.split(',')]
            return (parts[-1], parts[:-1])

    # Primary collaboration indicators, tried in priority order.
    for pattern in ('ft.', 'feat.', 'featuring', 'ft', 'feat'):
        # Require a non-word character (or string edge) before the token,
        # and after it too unless the token already ends with a period.
        # A plain substring test would fire inside "Swift" or "Daft".
        trailing = '' if pattern.endswith('.') else r'(?!\w)'
        match = re.search(r'(?<!\w)' + re.escape(pattern) + trailing, artist_string, re.IGNORECASE)
        if match:
            main_artist = artist_string[:match.start()].strip()
            collaborators_string = artist_string[match.end():].strip()
            # The remainder may still contain several separators.
            return (main_artist, self._parse_collaborators(collaborators_string))

    # Secondary indicators "&" and "and": whitespace on both sides is
    # required so that words like "Orlando" are never split.
    known_band_names = None  # loaded lazily, at most once per call
    for pattern in ('&', 'and'):
        parts = re.split(r'\s' + re.escape(pattern) + r'\s', artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Separator absent, or present more than once: not handled here.
            continue
        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side marks a collaboration outright; otherwise
        # rule out band names first.
        if ',' not in part1 and ',' not in part2:
            if known_band_names is None:
                known_band_names = _load_known_band_names()
            if artist_string.lower() in known_band_names:
                continue
            # Band names often read "<X> and the <Y>".
            if pattern == 'and' and 'the ' in part2.lower():
                continue

        return (part1, self._parse_collaborators(part2))

    # Comma-separated artists (e.g. "Ariana Grande, Normani, Nicki Minaj").
    if ',' in artist_string:
        names = [part.strip() for part in artist_string.split(',')]
        names = [name for name in names if name]  # ignore stray/trailing commas
        if len(names) >= 2:
            return (names[0], names[1:])

    return (artist_string, [])
|
||||
|
||||
def _generate_title_variations(self, title: str) -> List[str]:
    """
    Generate title variations by removing parenthetical content and fixing common issues.

    The stripped original title always comes first, followed by (in order):
    contraction/typo fixes, the title with every "(...)" group removed,
    removals of specific known suffix groups, removal of a trailing
    "- Live ..." segment, and clean-ups for unbalanced parentheses.

    Returns list of title variations to try, whitespace-normalised and
    de-duplicated while preserving order. Empty for a blank title.
    """
    import re

    base_title = title.strip()
    search_titles = [base_title]

    def add_variation(candidate):
        # Record a candidate only if it is non-empty and new.
        candidate = candidate.strip()
        if candidate and candidate != base_title and candidate not in search_titles:
            search_titles.append(candidate)

    # Missing apostrophes in common contractions, plus two common typos.
    # Fixes accumulate, and every intermediate result is kept as a variation.
    contraction_fixes = (
        (r'\bDont\b', "Don't"),
        (r'\bCant\b', "Can't"),
        (r'\bWont\b', "Won't"),
        (r'\bArent\b', "Aren't"),
        (r'\bIsnt\b', "Isn't"),
        (r'\bWasnt\b', "Wasn't"),
        (r'\bDidnt\b', "Didn't"),
        (r'\bDoesnt\b', "Doesn't"),
        (r'\bHavent\b', "Haven't"),
        (r'\bHasnt\b', "Hasn't"),
        (r'\bWouldnt\b', "Wouldn't"),
        (r'\bCouldnt\b', "Couldn't"),
        (r'\bShouldnt\b', "Shouldn't"),
        (r'\bPhunk\b', "Funk"),  # Common typo
        (r'\bBout\b', "About"),  # Shortened form
    )
    fixed_title = base_title
    for pattern, replacement in contraction_fixes:
        fixed_title = re.sub(pattern, replacement, fixed_title, flags=re.IGNORECASE)
        add_variation(fixed_title)

    # 1. Remove all complete parentheses (most aggressive).
    any_parens = r'\s*\([^)]*\)'
    add_variation(re.sub(any_parens, '', base_title))

    # 2. Specific parenthetical groups. All are applied case-insensitively,
    #    so each appears once regardless of capitalisation.
    specific_patterns = (
        r'\s*\(Karaoke Version\)',
        r'\s*\(Karaoke\)',
        r'\s*\(Instrumental\)',
        r'\s*\(Backing Track\)',
        r'\s*\(live [^)]*\)',
        r'\s*\(Acoustic\)',
        r'\s*\(Without Backing Vocals\)',
        r'\s*\(Clean\)',
        r'\s*\(Remix\)',
        r'\s*\(Radio Edit\)',
        r'\s*\(Extended Mix\)',
        r'\s*\(Single Version\)',
        r'\s*\(Album Version\)',
        r'\s*\(Original Mix\)',
        r'\s*\(John Lewis Christmas Ad \d+\)',  # Specific pattern from test
        r'\s*\(from the movie [^)]*\)',
        r'\s*\(from the [^)]*\)',
        r'\s*\(feat\. [^)]*\)',
        r'\s*\(featuring [^)]*\)',
        r'\s*\(ft\. [^)]*\)',
        r'\s*\(duet\)',
        r'\s*\(Two Semitones Down\)',
        r'\s*\(Minus Piano\)',
        r'\s*\(Cut Down\)',
        r'\s*\(Boone & Speedy Vocals\)',
        r'\s*\(My Heart Belongs to You\)',
    )
    for pattern in specific_patterns:
        add_variation(re.sub(pattern, '', base_title, flags=re.IGNORECASE))

    # 3. Trailing dash-separated live annotation (like "- Live At the BBC").
    #    This single pattern subsumes "Live At/From/In/On ..." forms.
    add_variation(re.sub(r'\s*-\s*Live [^-]*$', '', base_title, flags=re.IGNORECASE))

    # 4. Specific group removed first, then any parentheses still left.
    for pattern in specific_patterns:
        partly_clean = re.sub(pattern, '', base_title, flags=re.IGNORECASE)
        add_variation(re.sub(any_parens, '', partly_clean.strip()))

    # 5. Unmatched opening parenthesis running to the end of the title.
    add_variation(re.sub(r'\s*\([^)]*$', '', base_title))

    # 6. Unmatched closing parenthesis at the start. The prefix must not
    #    contain "(" — otherwise a balanced group such as "Song (Live) Extra"
    #    would be cut down to the misleading variation "Extra".
    add_variation(re.sub(r'^[^()]*\)\s*', '', base_title))

    # 7. Collapse runs of whitespace in every variation and drop duplicates,
    #    keeping first-seen order.
    normalized_versions = []
    for version in search_titles:
        normalized = re.sub(r'\s+', ' ', version).strip()
        if normalized and normalized not in normalized_versions:
            normalized_versions.append(normalized)

    return normalized_versions
|
||||
|
||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
    """
    Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists.

    The string is split on commas, ampersands and the standalone word "and"
    (case-insensitive). Each piece is stripped and empty pieces are dropped.

    Returns the artist names in their original order; an empty list for an
    empty or missing string.
    """
    import re

    if not collaborators_string:
        return []

    # Word boundaries around "and" avoid splitting within words like "Orlando".
    separators = r'[,&]|\b(?:and)\b'
    pieces = re.split(separators, collaborators_string, flags=re.IGNORECASE)

    # Strip each piece and skip the empty ones left by adjacent separators.
    return [name for name in (piece.strip() for piece in pieces) if name]
|
||||
242
src/cli/main.py
242
src/cli/main.py
@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader
|
||||
# Import database and API clients
|
||||
from ..api.database import MusicBrainzDatabase
|
||||
from ..api.api_client import MusicBrainzAPIClient
|
||||
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
|
||||
|
||||
|
||||
class MusicBrainzCleaner:
|
||||
@ -81,7 +82,7 @@ class MusicBrainzCleaner:
|
||||
|
||||
# Handle collaborations - try to find main artist first
|
||||
# Use the same complex collaboration parsing as the database
|
||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||
main_artist, collaborators = parse_complex_collaboration(clean_name)
|
||||
has_collaboration = len(collaborators) > 0
|
||||
|
||||
if has_collaboration:
|
||||
@ -119,38 +120,68 @@ class MusicBrainzCleaner:
|
||||
|
||||
return None
|
||||
|
||||
def find_recording_mbid(self, artist_mbid: str, title: str) -> Optional[str]:
|
||||
clean_title = self._clean_title(title)
|
||||
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
|
||||
# Use the shared title variations logic for comprehensive title cleaning
|
||||
title_variations = generate_title_variations(title)
|
||||
|
||||
# Load known recordings data first
|
||||
known_recordings = data_loader.load_known_recordings()
|
||||
|
||||
# Check known recordings first
|
||||
# Check known recordings first with all title variations
|
||||
for (known_artist_mbid, known_title), recording_mbid in known_recordings.items():
|
||||
if artist_mbid == known_artist_mbid and clean_title.lower() == known_title.lower():
|
||||
return recording_mbid
|
||||
if artist_mbid == known_artist_mbid:
|
||||
for variation in title_variations:
|
||||
if variation.lower() == known_title.lower():
|
||||
return recording_mbid
|
||||
|
||||
# Handle collaborations using artist credit
|
||||
if self.use_database:
|
||||
# If no artist_mbid (collaboration case), try to find by title and original artist name
|
||||
if not artist_mbid:
|
||||
# This is a collaboration case, try to find by title
|
||||
result = self.db.fuzzy_search_recording(clean_title)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (recording_name, mbid, score)
|
||||
# If no artist_mbid (collaboration case), try to find by title and verify artist credit
|
||||
if not artist_mbid and original_artist:
|
||||
# This is a collaboration case, try to find by title with all variations
|
||||
for variation in title_variations:
|
||||
# Search for recordings by title first
|
||||
result = self.db.fuzzy_search_recording(variation)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
recording_mbid = result[1]
|
||||
|
||||
# Verify that this recording has the correct artist credit
|
||||
recording_info = self.get_recording_info(recording_mbid)
|
||||
if recording_info and recording_info.get('artist_credit'):
|
||||
# Check if the artist credit matches our expected collaboration
|
||||
expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ')
|
||||
if recording_info['artist_credit'].lower() == expected_artist_string.lower():
|
||||
return recording_mbid
|
||||
|
||||
# If exact match fails, try partial match
|
||||
if recording_info and recording_info.get('artist_credit'):
|
||||
# Check if all artists in the collaboration are present in the recording
|
||||
main_artist, collaborators = parse_complex_collaboration(original_artist)
|
||||
recording_artists = recording_info['artist_credit'].lower()
|
||||
|
||||
# Check if main artist is in the recording
|
||||
if main_artist.lower() in recording_artists:
|
||||
# Check if at least one collaborator is also present
|
||||
for collaborator in collaborators:
|
||||
if collaborator.lower() in recording_artists:
|
||||
return recording_mbid
|
||||
|
||||
return None
|
||||
else:
|
||||
# Regular case with artist_mbid
|
||||
result = self.db.fuzzy_search_recording(clean_title, artist_mbid)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (recording_name, mbid, score)
|
||||
# Regular case with artist_mbid - try all title variations
|
||||
for variation in title_variations:
|
||||
result = self.db.fuzzy_search_recording(variation, artist_mbid)
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (recording_name, mbid, score)
|
||||
else:
|
||||
# Fallback to API
|
||||
try:
|
||||
result = self.api.search_recording(clean_title, artist_mbid)
|
||||
if result:
|
||||
return result['id']
|
||||
except:
|
||||
pass
|
||||
# Fallback to API - try all title variations
|
||||
for variation in title_variations:
|
||||
try:
|
||||
result = self.api.search_recording(variation, artist_mbid)
|
||||
if result:
|
||||
return result['id']
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
@ -176,130 +207,6 @@ class MusicBrainzCleaner:
|
||||
parts.append(credit['joinphrase'])
|
||||
return ''.join(parts).strip()
|
||||
|
||||
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer".

    Detection runs in this order and stops at the first rule that applies:

    1. A primary indicator ("ft.", "feat.", "featuring", "ft", "feat"),
       matched case-insensitively and only as a standalone token, so names
       that merely contain those letters ("Taylor Swift", "Daft Punk") are
       not split.
    2. Exactly one whitespace-delimited "&" or "and", unless the whole
       string is one of a few obvious band names (e.g. "Simon & Garfunkel").
    3. A comma-separated list: first name is the main artist, the remaining
       non-empty names are collaborators.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration.
    """
    import re

    # Primary collaboration indicators, tried in priority order.
    for pattern in ('ft.', 'feat.', 'featuring', 'ft', 'feat'):
        # Require a non-word character (or string edge) before the token,
        # and after it too unless the token already ends with a period.
        # A plain substring test would fire inside "Swift" or "Daft".
        trailing = '' if pattern.endswith('.') else r'(?!\w)'
        match = re.search(r'(?<!\w)' + re.escape(pattern) + trailing, artist_string, re.IGNORECASE)
        if match:
            main_artist = artist_string[:match.start()].strip()
            collaborators_string = artist_string[match.end():].strip()
            # The remainder may still contain several separators.
            return (main_artist, self._parse_collaborators(collaborators_string))

    # Most "&" separators in music are collaborations; only very obvious
    # band names are exempt.
    obvious_band_names = ('simon & garfunkel', 'hall & oates', 'brooks & dunn')

    # Secondary indicators "&" and "and": whitespace on both sides is
    # required so that words like "Orlando" are never split.
    for pattern in ('&', 'and'):
        parts = re.split(r'\s' + re.escape(pattern) + r'\s', artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Separator absent, or present more than once: not handled here.
            continue
        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side marks a collaboration outright; otherwise
        # leave the obvious band names intact.
        if ',' not in part1 and ',' not in part2:
            if artist_string.lower() in obvious_band_names:
                continue

        return (part1, self._parse_collaborators(part2))

    # Comma-separated artists (e.g. "Ariana Grande, Normani, Nicki Minaj").
    # A word-boundary regex around "," never matches "A, B" because the
    # space after the comma is not a word character, so split directly.
    if ',' in artist_string:
        names = [part.strip() for part in artist_string.split(',')]
        names = [name for name in names if name]  # ignore stray/trailing commas
        if len(names) >= 2:
            return (names[0], names[1:])

    return (artist_string, [])
|
||||
|
||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
    """
    Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists.

    The string is split on commas, ampersands and the standalone word "and"
    (case-insensitive). Each piece is stripped and empty pieces are dropped.

    Returns the artist names in their original order; an empty list for an
    empty or missing string.
    """
    import re

    if not collaborators_string:
        return []

    # Word boundaries around "and" avoid splitting within words like "Orlando".
    separators = r'[,&]|\b(?:and)\b'
    pieces = re.split(separators, collaborators_string, flags=re.IGNORECASE)

    # Strip each piece and skip the empty ones left by adjacent separators.
    return [name for name in (piece.strip() for piece in pieces) if name]
|
||||
|
||||
def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
|
||||
"""
|
||||
Clean a single song and return (cleaned_song, success_status)
|
||||
@ -313,44 +220,37 @@ class MusicBrainzCleaner:
|
||||
# Handle collaboration case (artist_mbid is None)
|
||||
# Use the same complex collaboration parsing as the database
|
||||
artist_name = song.get('artist', '')
|
||||
main_artist, collaborators = self._parse_complex_collaboration(artist_name)
|
||||
main_artist, collaborators = parse_complex_collaboration(artist_name)
|
||||
has_collaboration = len(collaborators) > 0
|
||||
|
||||
if artist_mbid is None and has_collaboration:
|
||||
# Try to find recording using artist credit approach
|
||||
if self.use_database:
|
||||
result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', ''))
|
||||
if result:
|
||||
artist_credit_id, artist_string, recording_mbid = result
|
||||
# Try to find recording using the improved find_recording_mbid method
|
||||
recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist)
|
||||
if recording_mbid:
|
||||
# Get recording info to update the song
|
||||
recording_info = self.get_recording_info(recording_mbid)
|
||||
if recording_info:
|
||||
# Update with the correct artist credit and title
|
||||
if self.use_database and recording_info.get('artist_credit'):
|
||||
song['artist'] = recording_info['artist_credit']
|
||||
elif not self.use_database and recording_info.get('artist-credit'):
|
||||
artist_string = self._build_artist_string(recording_info['artist-credit'])
|
||||
if artist_string:
|
||||
song['artist'] = artist_string
|
||||
|
||||
# Update with the correct artist credit
|
||||
song['artist'] = artist_string
|
||||
song['title'] = recording_info['title']
|
||||
song['recording_mbid'] = recording_mbid
|
||||
|
||||
# For collaborations, try to get the main artist's MBID
|
||||
if self.use_database:
|
||||
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', ''))
|
||||
main_artist, collaborators = parse_complex_collaboration(song.get('artist', ''))
|
||||
if main_artist:
|
||||
artist_result = self.db.fuzzy_search_artist(main_artist)
|
||||
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
|
||||
song['mbid'] = artist_result[1] # Set the main artist's MBID
|
||||
|
||||
return song, True
|
||||
else:
|
||||
return song, False
|
||||
else:
|
||||
# Fallback to API method
|
||||
recording_mbid = self.find_recording_mbid(None, song.get('title', ''))
|
||||
if recording_mbid:
|
||||
recording_info = self.get_recording_info(recording_mbid)
|
||||
if recording_info and recording_info.get('artist-credit'):
|
||||
artist_string = self._build_artist_string(recording_info['artist-credit'])
|
||||
if artist_string:
|
||||
song['artist'] = artist_string
|
||||
song['title'] = recording_info['title']
|
||||
song['recording_mbid'] = recording_mbid
|
||||
return song, True
|
||||
return song, False
|
||||
return song, False
|
||||
|
||||
# Regular case (non-collaboration or collaboration not found)
|
||||
if not artist_mbid:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user