Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-01 09:46:35 -05:00
parent 3d8b0165af
commit 4bbd03eab7
2 changed files with 123 additions and 538 deletions

View File

@ -13,6 +13,7 @@ from ..config.constants import (
TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD, TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
SUCCESS_MESSAGES, ERROR_MESSAGES SUCCESS_MESSAGES, ERROR_MESSAGES
) )
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
class MusicBrainzDatabase: class MusicBrainzDatabase:
@ -372,7 +373,7 @@ class MusicBrainzDatabase:
try: try:
# Generate title variations using the helper method # Generate title variations using the helper method
search_titles = self._generate_title_variations(title) search_titles = generate_title_variations(title)
if artist_mbid: if artist_mbid:
# Try to find recording by artist and title with all variations # Try to find recording by artist and title with all variations
@ -418,16 +419,17 @@ class MusicBrainzDatabase:
else: else:
# No artist constraint - search by title only with all variations # No artist constraint - search by title only with all variations
for search_title in search_titles: for search_title in search_titles:
# Try exact match first # Try exact match first (handle both apostrophe types)
query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1" search_title_curly = search_title.replace("'", "'")
self.cursor.execute(query, (search_title,)) query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1"
self.cursor.execute(query, (search_title, search_title_curly))
result = self.cursor.fetchone() result = self.cursor.fetchone()
if result: if result:
return (result['name'], result['gid'], 1.0) return (result['name'], result['gid'], 1.0)
# Try case-insensitive match # Try case-insensitive match (handle both apostrophe types)
query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1" query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1"
self.cursor.execute(query, (search_title,)) self.cursor.execute(query, (search_title, search_title_curly))
result = self.cursor.fetchone() result = self.cursor.fetchone()
if result: if result:
return (result['name'], result['gid'], 0.99) return (result['name'], result['gid'], 0.99)
@ -437,8 +439,10 @@ class MusicBrainzDatabase:
best_score = 0 best_score = 0
for search_title in search_titles: for search_title in search_titles:
query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50" # Handle both apostrophe types in partial search
self.cursor.execute(query, (f"%{search_title}%",)) search_title_curly = search_title.replace("'", "'")
query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50"
self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%"))
recordings = self.cursor.fetchall() recordings = self.cursor.fetchall()
for recording in recordings: for recording in recordings:
@ -550,374 +554,55 @@ class MusicBrainzDatabase:
clean_name = artist_name.strip() clean_name = artist_name.strip()
# Parse complex collaboration # Parse complex collaboration
main_artist, collaborators = self._parse_complex_collaboration(clean_name) main_artist, collaborators = parse_complex_collaboration(clean_name)
collaboration = len(collaborators) > 0 collaboration = len(collaborators) > 0
# Generate title variations using the helper method # Generate title variations using the helper method
search_titles = self._generate_title_variations(title) search_titles = generate_title_variations(title)
# Try to find recording with artist credit using all title variations # Build the expected artist credit string
expected_artist_string = main_artist
if collaborators:
expected_artist_string += " & " + " & ".join(collaborators)
# Try to find recording with matching artist credit using all title variations
for search_title in search_titles: for search_title in search_titles:
query = """ # First, find all recordings with matching title (handle both apostrophe types)
SELECT ac.id as artist_credit_id, r.gid as recording_mbid title_query = """
SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id
FROM recording r FROM recording r
JOIN artist_credit ac ON r.artist_credit = ac.id WHERE r.name ILIKE %s OR r.name ILIKE %s
JOIN artist_credit_name acn ON ac.id = acn.artist_credit
JOIN artist a ON acn.artist = a.id
WHERE a.name = %s AND r.name ILIKE %s
LIMIT 1
""" """
self.cursor.execute(query, (main_artist, f"%{search_title}%")) # Create versions with both straight and curly apostrophes
result = self.cursor.fetchone() search_title_curly = search_title.replace("'", "'").replace("'", "'")
self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%"))
recordings = self.cursor.fetchall()
if result: for recording in recordings:
break # Check if this recording's artist credit matches our expected artist string
artist_query = """
if result: SELECT a.name, acn.join_phrase, acn.position
# Get the full artist string for this credit FROM artist_credit_name acn
artist_query = """ JOIN artist a ON acn.artist = a.id
SELECT a.name, acn.join_phrase, acn.position WHERE acn.artist_credit = %s
FROM artist_credit_name acn ORDER BY acn.position
JOIN artist a ON acn.artist = a.id """
WHERE acn.artist_credit = %s self.cursor.execute(artist_query, (recording['artist_credit_id'],))
ORDER BY acn.position artists = self.cursor.fetchall()
"""
self.cursor.execute(artist_query, (result['artist_credit_id'],)) # Build the actual artist string for this recording
artists = self.cursor.fetchall() actual_artist_string = ""
for artist in artists:
# Build the artist string actual_artist_string += artist['name']
artist_string = "" if artist['join_phrase']:
for artist in artists: actual_artist_string += artist['join_phrase']
artist_string += artist['name']
if artist['join_phrase']: # Check if the artist strings match
artist_string += artist['join_phrase'] if actual_artist_string.lower() == expected_artist_string.lower():
return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid'])
return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])
return None return None
except Exception as e: except Exception as e:
print(f"Error in find_artist_credit: {e}") print(f"Error in find_artist_credit: {e}")
return None return None
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Split an artist credit into its main artist and collaborators.

    Handles strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer".

    Detection order:
      1. Malformed credits that start with "ft "/"feat " and contain a comma
         (e.g. "ft Jamie Foxx West, Kanye"): the last comma-separated part is
         taken as the main artist, the preceding parts as collaborators.
      2. A "ft" / "ft." / "feat" / "feat." / "featuring" marker. The marker
         must stand alone as a word, so names that merely contain those
         letters ("Taylor Swift", "Daft Punk") are not split.
      3. " & " and " and " separators, unless the whole string is a known
         band name or (for "and") the right-hand side contains "the ".
      4. Comma-separated lists: first entry is the main artist.

    Args:
        artist_string: Raw artist credit.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration.
    """
    import json
    import os
    import re

    def load_known_band_names():
        # Band names that contain "&"/"and" but are a single act.
        try:
            json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
            with open(json_path, 'r') as f:
                known_data = json.load(f)
            return {name.lower() for name in known_data.get('groups', [])}
        except (FileNotFoundError, json.JSONDecodeError, KeyError):
            # Fallback to a minimal list if file can't be loaded
            return {
                "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
                "brooks & dunn", "the everly brothers"
            }

    # Special case: malformed artist names like "ft Jamie Foxx West, Kanye".
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        if ',' in remaining:
            parts = [part.strip() for part in remaining.split(',')]
            # Assume the last part is the main artist
            return (parts[-1], parts[:-1])

    # Primary indicators. The look-behind/look-ahead keep the marker from
    # matching inside a longer word (the "ft" in "Swift").
    feature_match = re.search(
        r'(?<!\w)(?:featuring|feat|ft)(?:\.|(?!\w))', artist_string, re.IGNORECASE
    )
    if feature_match:
        main_artist = artist_string[:feature_match.start()].strip()
        collaborators_string = artist_string[feature_match.end():].strip()
        return (main_artist, self._parse_collaborators(collaborators_string))

    # Secondary indicators (need more careful handling)
    known_band_names = None  # loaded lazily, at most once per call
    for pattern in ('&', 'and', ','):
        if pattern == ',':
            # Comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj")
            if ',' in artist_string:
                parts = [part.strip() for part in artist_string.split(',')]
                return (parts[0], parts[1:])
            continue

        # Whitespace boundaries avoid splitting within words like "Orlando"
        pattern_regex = r'\s' + re.escape(pattern) + r'\s'
        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Separator absent, or present more than once: not handled here.
            continue

        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side means a list of artists, hence a collaboration;
        # otherwise apply the band-name heuristics.
        if ',' not in part1 and ',' not in part2:
            if known_band_names is None:
                known_band_names = load_known_band_names()
            if artist_string.lower() in known_band_names:
                # This is a known band name, not a collaboration
                continue
            # Many band names use "and" followed by "the ..."
            # (e.g., "Gerry and the Pacemakers")
            if pattern == 'and' and 'the ' in part2.lower():
                continue

        # The right-hand side may itself contain several separators
        return (part1, self._parse_collaborators(part2))

    return (artist_string, [])
def _generate_title_variations(self, title: str) -> List[str]:
    """
    Generate title variations by removing parenthetical content and fixing common issues.

    The stripped input title always comes first. Further candidates are
    appended in this order: contraction/typo fixes, all parentheses removed,
    individual well-known parentheticals removed, trailing "- Live ..."
    suffix removed, unmatched parenthesis fragments removed, and whitespace
    normalised. Every candidate is derived from the original title (the
    contraction fixes are not combined with the other clean-ups).

    Args:
        title: Raw song title.

    Returns list of title variations to try: whitespace-normalised, unique,
    non-empty, in order of generation. An empty or blank title yields [].
    """
    import re

    base_title = title.strip()
    search_titles = [base_title]

    def add_variation(candidate: str) -> None:
        # Record a candidate once, skipping empties and the unchanged title.
        candidate = candidate.strip()
        if candidate and candidate != base_title and candidate not in search_titles:
            search_titles.append(candidate)

    general_parens = r'\s*\([^)]*\)'

    # Fix missing apostrophes in common contractions, plus a few typos.
    # Matching is case-insensitive; the replacement is inserted as written.
    word_fixes = [
        (r'\bDont\b', "Don't"),
        (r'\bCant\b', "Can't"),
        (r'\bWont\b', "Won't"),
        (r'\bArent\b', "Aren't"),
        (r'\bIsnt\b', "Isn't"),
        (r'\bWasnt\b', "Wasn't"),
        (r'\bDidnt\b', "Didn't"),
        (r'\bDoesnt\b', "Doesn't"),
        (r'\bHavent\b', "Haven't"),
        (r'\bHasnt\b', "Hasn't"),
        (r'\bWouldnt\b', "Wouldn't"),
        (r'\bCouldnt\b', "Couldn't"),
        (r'\bShouldnt\b', "Shouldn't"),
        (r'\bPhunk\b', "Funk"),  # Common typo
        (r'\bBout\b', "About"),  # Shortened form
    ]
    fixed_title = base_title
    for pattern, replacement in word_fixes:
        updated = re.sub(pattern, replacement, fixed_title, flags=re.IGNORECASE)
        if updated != fixed_title:
            fixed_title = updated
            add_variation(fixed_title)

    # 1. Remove all complete parentheses (most aggressive)
    add_variation(re.sub(general_parens, '', base_title))

    # 2. Specific common parentheticals. All are applied case-insensitively,
    #    so each phrase is listed once.
    specific_patterns = [
        r'\s*\(Karaoke Version\)',
        r'\s*\(Karaoke\)',
        r'\s*\(Instrumental\)',
        r'\s*\(Backing Track\)',
        r'\s*\(live [^)]*\)',
        r'\s*\(Acoustic\)',
        r'\s*\(Without Backing Vocals\)',
        r'\s*\(Clean\)',
        r'\s*\(Remix\)',
        r'\s*\(Radio Edit\)',
        r'\s*\(Extended Mix\)',
        r'\s*\(Single Version\)',
        r'\s*\(Album Version\)',
        r'\s*\(Original Mix\)',
        r'\s*\(John Lewis Christmas Ad \d+\)',  # Specific pattern from test
        r'\s*\(from the movie [^)]*\)',
        r'\s*\(from the [^)]*\)',
        r'\s*\(feat\. [^)]*\)',
        r'\s*\(featuring [^)]*\)',
        r'\s*\(ft\. [^)]*\)',
        r'\s*\(duet\)',
        r'\s*\(Two Semitones Down\)',
        r'\s*\(Minus Piano\)',
        r'\s*\(Cut Down\)',
        r'\s*\(Boone & Speedy Vocals\)',
        r'\s*\(My Heart Belongs to You\)',
    ]
    for pattern in specific_patterns:
        add_variation(re.sub(pattern, '', base_title, flags=re.IGNORECASE))

    # 3. Remove a trailing dash-separated live note (like "- Live At the BBC").
    #    This single case-insensitive pattern covers "Live At/From/In/On ...".
    add_variation(re.sub(r'\s*-\s*Live [^-]*$', '', base_title, flags=re.IGNORECASE))

    # 4. Remove any remaining parentheses after each specific pattern.
    #    Only differs from step 1 when parentheses are nested.
    for pattern in specific_patterns:
        partially_cleaned = re.sub(pattern, '', base_title, flags=re.IGNORECASE)
        add_variation(re.sub(general_parens, '', partially_cleaned.strip()))

    # 5. Remove unmatched opening parenthesis at end
    add_variation(re.sub(r'\s*\([^)]*$', '', base_title))

    # 6. Remove unmatched closing parenthesis at start. "(" is excluded from
    #    the prefix so that a properly matched "(...)" is never treated as
    #    unmatched (which would discard the start of the title).
    add_variation(re.sub(r'^[^()]*\)\s*', '', base_title))

    # 7. Try removing extra spaces and normalizing
    add_variation(re.sub(r'\s+', ' ', base_title))

    # 8. Apply normalization to all versions and remove duplicates
    normalized_versions = []
    for version in search_titles:
        normalized = re.sub(r'\s+', ' ', version.strip()).strip()
        if normalized and normalized not in normalized_versions:
            normalized_versions.append(normalized)

    return normalized_versions
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
    """
    Break a collaborator list such as "Ne-Yo, Afrojack & Nayer" into
    individual artist names.

    Names are separated on commas, ampersands and the standalone word
    "and" (any case). Surrounding whitespace is trimmed and empty pieces
    are dropped. An empty input yields an empty list.
    """
    import re

    if not collaborators_string:
        return []

    # The word boundaries keep "and" from splitting names like "Orlando".
    pieces = re.split(r'[,&]|\b(?:and)\b', collaborators_string, flags=re.IGNORECASE)
    trimmed = (piece.strip() for piece in pieces)
    return [name for name in trimmed if name]

View File

@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader
# Import database and API clients # Import database and API clients
from ..api.database import MusicBrainzDatabase from ..api.database import MusicBrainzDatabase
from ..api.api_client import MusicBrainzAPIClient from ..api.api_client import MusicBrainzAPIClient
from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations
class MusicBrainzCleaner: class MusicBrainzCleaner:
@ -81,7 +82,7 @@ class MusicBrainzCleaner:
# Handle collaborations - try to find main artist first # Handle collaborations - try to find main artist first
# Use the same complex collaboration parsing as the database # Use the same complex collaboration parsing as the database
main_artist, collaborators = self._parse_complex_collaboration(clean_name) main_artist, collaborators = parse_complex_collaboration(clean_name)
has_collaboration = len(collaborators) > 0 has_collaboration = len(collaborators) > 0
if has_collaboration: if has_collaboration:
@ -119,38 +120,68 @@ class MusicBrainzCleaner:
return None return None
def find_recording_mbid(self, artist_mbid: str, title: str) -> Optional[str]: def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
clean_title = self._clean_title(title) # Use the shared title variations logic for comprehensive title cleaning
title_variations = generate_title_variations(title)
# Load known recordings data first # Load known recordings data first
known_recordings = data_loader.load_known_recordings() known_recordings = data_loader.load_known_recordings()
# Check known recordings first # Check known recordings first with all title variations
for (known_artist_mbid, known_title), recording_mbid in known_recordings.items(): for (known_artist_mbid, known_title), recording_mbid in known_recordings.items():
if artist_mbid == known_artist_mbid and clean_title.lower() == known_title.lower(): if artist_mbid == known_artist_mbid:
return recording_mbid for variation in title_variations:
if variation.lower() == known_title.lower():
return recording_mbid
# Handle collaborations using artist credit # Handle collaborations using artist credit
if self.use_database: if self.use_database:
# If no artist_mbid (collaboration case), try to find by title and original artist name # If no artist_mbid (collaboration case), try to find by title and verify artist credit
if not artist_mbid: if not artist_mbid and original_artist:
# This is a collaboration case, try to find by title # This is a collaboration case, try to find by title with all variations
result = self.db.fuzzy_search_recording(clean_title) for variation in title_variations:
if result and isinstance(result, tuple) and len(result) >= 2: # Search for recordings by title first
return result[1] # Return MBID from tuple (recording_name, mbid, score) result = self.db.fuzzy_search_recording(variation)
if result and isinstance(result, tuple) and len(result) >= 2:
recording_mbid = result[1]
# Verify that this recording has the correct artist credit
recording_info = self.get_recording_info(recording_mbid)
if recording_info and recording_info.get('artist_credit'):
# Check if the artist credit matches our expected collaboration
expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ')
if recording_info['artist_credit'].lower() == expected_artist_string.lower():
return recording_mbid
# If exact match fails, try partial match
if recording_info and recording_info.get('artist_credit'):
# Check if all artists in the collaboration are present in the recording
main_artist, collaborators = parse_complex_collaboration(original_artist)
recording_artists = recording_info['artist_credit'].lower()
# Check if main artist is in the recording
if main_artist.lower() in recording_artists:
# Check if at least one collaborator is also present
for collaborator in collaborators:
if collaborator.lower() in recording_artists:
return recording_mbid
return None
else: else:
# Regular case with artist_mbid # Regular case with artist_mbid - try all title variations
result = self.db.fuzzy_search_recording(clean_title, artist_mbid) for variation in title_variations:
if result and isinstance(result, tuple) and len(result) >= 2: result = self.db.fuzzy_search_recording(variation, artist_mbid)
return result[1] # Return MBID from tuple (recording_name, mbid, score) if result and isinstance(result, tuple) and len(result) >= 2:
return result[1] # Return MBID from tuple (recording_name, mbid, score)
else: else:
# Fallback to API # Fallback to API - try all title variations
try: for variation in title_variations:
result = self.api.search_recording(clean_title, artist_mbid) try:
if result: result = self.api.search_recording(variation, artist_mbid)
return result['id'] if result:
except: return result['id']
pass except:
pass
return None return None
@ -176,130 +207,6 @@ class MusicBrainzCleaner:
parts.append(credit['joinphrase']) parts.append(credit['joinphrase'])
return ''.join(parts).strip() return ''.join(parts).strip()
def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
    """
    Split an artist credit into its main artist and collaborators.

    Handles strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer".

    Detection order:
      1. A "ft" / "ft." / "feat" / "feat." / "featuring" marker. The marker
         must stand alone as a word, so names that merely contain those
         letters ("Taylor Swift", "Daft Punk") are not split.
      2. A single " & ", " and " or word-adjacent "," separator, unless the
         whole string is one of a few obvious two-word band names.

    Args:
        artist_string: Raw artist credit.

    Returns: (main_artist, [collaborators]). The list is empty when the
    string is not recognised as a collaboration.
    """
    import re

    # Primary collaboration indicators. The look-behind/look-ahead keep the
    # marker from matching inside a longer word (the "ft" in "Swift").
    feature_match = re.search(
        r'(?<!\w)(?:featuring|feat|ft)(?:\.|(?!\w))', artist_string, re.IGNORECASE
    )
    if feature_match:
        main_artist = artist_string[:feature_match.start()].strip()
        collaborators_string = artist_string[feature_match.end():].strip()
        return (main_artist, self._parse_collaborators(collaborators_string))

    # Only treat very obvious band names as non-collaborations
    obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']

    # Secondary collaboration indicators (need more careful handling)
    for pattern in ('&', 'and', ','):
        if pattern == ',':
            # NOTE(review): the word boundaries mean only a comma directly
            # between two word characters ("A,B") matches; "A, B" does not.
            # Kept as-is; confirm whether comma-space lists should split.
            pattern_regex = r'\b,\b'
        else:
            # Whitespace boundaries avoid splitting within words like "Orlando"
            pattern_regex = r'\s' + re.escape(pattern) + r'\s'

        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Separator absent, or present more than once: not handled here.
            continue

        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side means a list of artists, hence a collaboration;
        # otherwise a one-word/one-word pairing may be a band name.
        has_comma = ',' in part1 or ',' in part2
        single_words = len(part1.split()) == 1 and len(part2.split()) == 1
        if not has_comma and single_words and artist_string.lower() in obvious_band_names:
            # Likely a band name, skip
            continue

        # The right-hand side may itself contain several separators
        return (part1, self._parse_collaborators(part2))

    return (artist_string, [])
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
    """
    Turn a collaborator list like "Ne-Yo, Afrojack & Nayer" into a list of
    individual artist names.

    Splits on commas, ampersands and the standalone word "and" (any case),
    trims each piece and discards empty ones. Returns [] for empty input.
    """
    import re

    if not collaborators_string:
        return []

    # Word boundaries around "and" avoid splitting within words like "Orlando".
    separator_regex = r'[,&]|\b(?:and)\b'
    names = []
    for chunk in re.split(separator_regex, collaborators_string, flags=re.IGNORECASE):
        name = chunk.strip()
        if not name:
            continue
        names.append(name)
    return names
def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]: def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]:
""" """
Clean a single song and return (cleaned_song, success_status) Clean a single song and return (cleaned_song, success_status)
@ -313,44 +220,37 @@ class MusicBrainzCleaner:
# Handle collaboration case (artist_mbid is None) # Handle collaboration case (artist_mbid is None)
# Use the same complex collaboration parsing as the database # Use the same complex collaboration parsing as the database
artist_name = song.get('artist', '') artist_name = song.get('artist', '')
main_artist, collaborators = self._parse_complex_collaboration(artist_name) main_artist, collaborators = parse_complex_collaboration(artist_name)
has_collaboration = len(collaborators) > 0 has_collaboration = len(collaborators) > 0
if artist_mbid is None and has_collaboration: if artist_mbid is None and has_collaboration:
# Try to find recording using artist credit approach # Try to find recording using the improved find_recording_mbid method
if self.use_database: recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist)
result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', '')) if recording_mbid:
if result: # Get recording info to update the song
artist_credit_id, artist_string, recording_mbid = result recording_info = self.get_recording_info(recording_mbid)
if recording_info:
# Update with the correct artist credit and title
if self.use_database and recording_info.get('artist_credit'):
song['artist'] = recording_info['artist_credit']
elif not self.use_database and recording_info.get('artist-credit'):
artist_string = self._build_artist_string(recording_info['artist-credit'])
if artist_string:
song['artist'] = artist_string
# Update with the correct artist credit song['title'] = recording_info['title']
song['artist'] = artist_string
song['recording_mbid'] = recording_mbid song['recording_mbid'] = recording_mbid
# For collaborations, try to get the main artist's MBID # For collaborations, try to get the main artist's MBID
if self.use_database: if self.use_database:
main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', '')) main_artist, collaborators = parse_complex_collaboration(song.get('artist', ''))
if main_artist: if main_artist:
artist_result = self.db.fuzzy_search_artist(main_artist) artist_result = self.db.fuzzy_search_artist(main_artist)
if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2: if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2:
song['mbid'] = artist_result[1] # Set the main artist's MBID song['mbid'] = artist_result[1] # Set the main artist's MBID
return song, True return song, True
else: return song, False
return song, False
else:
# Fallback to API method
recording_mbid = self.find_recording_mbid(None, song.get('title', ''))
if recording_mbid:
recording_info = self.get_recording_info(recording_mbid)
if recording_info and recording_info.get('artist-credit'):
artist_string = self._build_artist_string(recording_info['artist-credit'])
if artist_string:
song['artist'] = artist_string
song['title'] = recording_info['title']
song['recording_mbid'] = recording_mbid
return song, True
return song, False
# Regular case (non-collaboration or collaboration not found) # Regular case (non-collaboration or collaboration not found)
if not artist_mbid: if not artist_mbid: