From 4bbd03eab73e4635c6118508bc95b60f4d8ddc5a Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Fri, 1 Aug 2025 09:46:35 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- src/api/database.py | 419 ++++++-------------------------------------- src/cli/main.py | 242 ++++++++----------------- 2 files changed, 123 insertions(+), 538 deletions(-) diff --git a/src/api/database.py b/src/api/database.py index c42e08f..51d2b1a 100644 --- a/src/api/database.py +++ b/src/api/database.py @@ -13,6 +13,7 @@ from ..config.constants import ( TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD, SUCCESS_MESSAGES, ERROR_MESSAGES ) +from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations class MusicBrainzDatabase: @@ -372,7 +373,7 @@ class MusicBrainzDatabase: try: # Generate title variations using the helper method - search_titles = self._generate_title_variations(title) + search_titles = generate_title_variations(title) if artist_mbid: # Try to find recording by artist and title with all variations @@ -418,16 +419,17 @@ class MusicBrainzDatabase: else: # No artist constraint - search by title only with all variations for search_title in search_titles: - # Try exact match first - query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1" - self.cursor.execute(query, (search_title,)) + # Try exact match first (handle both apostrophe types) + search_title_curly = search_title.replace("'", "'") + query = "SELECT name, gid FROM recording WHERE name = %s OR name = %s LIMIT 1" + self.cursor.execute(query, (search_title, search_title_curly)) result = self.cursor.fetchone() if result: return (result['name'], result['gid'], 1.0) - # Try case-insensitive match - query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1" - self.cursor.execute(query, (search_title,)) + # Try case-insensitive match (handle both apostrophe types) + query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) OR LOWER(name) = LOWER(%s) LIMIT 1" + self.cursor.execute(query, (search_title, search_title_curly)) result = self.cursor.fetchone() if result: return (result['name'], result['gid'], 0.99) @@ -437,8 +439,10 @@ class MusicBrainzDatabase: best_score = 0 for search_title in search_titles: - query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50" - self.cursor.execute(query, (f"%{search_title}%",)) + # Handle both apostrophe types in partial search + search_title_curly = search_title.replace("'", "'") + query = "SELECT name, gid FROM recording WHERE name ILIKE %s OR name ILIKE %s LIMIT 50" + self.cursor.execute(query, (f"%{search_title}%", f"%{search_title_curly}%")) recordings = self.cursor.fetchall() for recording in recordings: @@ -550,374 +554,55 @@ class MusicBrainzDatabase: clean_name = artist_name.strip() # Parse complex collaboration - main_artist, collaborators = self._parse_complex_collaboration(clean_name) + main_artist, collaborators = parse_complex_collaboration(clean_name) collaboration = len(collaborators) > 0 # Generate title variations using the helper method - search_titles = self._generate_title_variations(title) + search_titles = generate_title_variations(title) - # Try to find recording with artist credit using all title variations + # Build the expected artist credit string + expected_artist_string = main_artist + if collaborators: + expected_artist_string += " & " + " & ".join(collaborators) + + # Try to find recording with matching artist credit using all title variations for search_title in search_titles: - query = """ - SELECT ac.id as artist_credit_id, r.gid as recording_mbid + # First, find all recordings with matching title (handle both apostrophe types) + title_query = """ + SELECT r.gid as recording_mbid, r.artist_credit as artist_credit_id FROM recording r - JOIN artist_credit ac ON r.artist_credit = ac.id - JOIN artist_credit_name acn ON ac.id = acn.artist_credit - JOIN artist a ON acn.artist = a.id - WHERE a.name = %s AND r.name ILIKE %s - LIMIT 1 + WHERE r.name ILIKE %s OR r.name ILIKE %s """ - self.cursor.execute(query, (main_artist, f"%{search_title}%")) - result = self.cursor.fetchone() + # Create versions with both straight and curly apostrophes + search_title_curly = search_title.replace("'", "'").replace("'", "'") + self.cursor.execute(title_query, (f"%{search_title}%", f"%{search_title_curly}%")) + recordings = self.cursor.fetchall() - if result: - break - - if result: - # Get the full artist string for this credit - artist_query = """ - SELECT a.name, acn.join_phrase, acn.position - FROM artist_credit_name acn - JOIN artist a ON acn.artist = a.id - WHERE acn.artist_credit = %s - ORDER BY acn.position - """ - self.cursor.execute(artist_query, (result['artist_credit_id'],)) - artists = self.cursor.fetchall() - - # Build the artist string - artist_string = "" - for artist in artists: - artist_string += artist['name'] - if artist['join_phrase']: - artist_string += artist['join_phrase'] - - return (str(result['artist_credit_id']), artist_string, result['recording_mbid']) + for recording in recordings: + # Check if this recording's artist credit matches our expected artist string + artist_query = """ + SELECT a.name, acn.join_phrase, acn.position + FROM artist_credit_name acn + JOIN artist a ON acn.artist = a.id + WHERE acn.artist_credit = %s + ORDER BY acn.position + """ + self.cursor.execute(artist_query, (recording['artist_credit_id'],)) + artists = self.cursor.fetchall() + + # Build the actual artist string for this recording + actual_artist_string = "" + for artist in artists: + actual_artist_string += artist['name'] + if artist['join_phrase']: + actual_artist_string += artist['join_phrase'] + + # Check if the artist strings match + if actual_artist_string.lower() == expected_artist_string.lower(): + return (str(recording['artist_credit_id']), actual_artist_string, recording['recording_mbid']) return None except Exception as e: print(f"Error in find_artist_credit: {e}") - return None - - def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]: - """ - Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer" - Returns: (main_artist, [collaborators]) - """ - import re - - # Primary collaboration indicators - primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat'] - - # Secondary collaboration indicators (need more careful handling) - secondary_patterns = ['&', 'and', ','] - - # Check if this is a collaboration - is_collaboration = False - split_pattern = None - - # Special case: Handle malformed artist names like "ft Jamie Foxx West, Kanye" - # This should be "Kanye West ft. Jamie Foxx" - if artist_string.lower().startswith(('ft ', 'feat ')): - # This is a malformed collaboration string - # Try to extract the actual artists from the rest - remaining = artist_string[artist_string.find(' ') + 1:].strip() - if ',' in remaining: - # Split on comma and reverse the order - parts = [part.strip() for part in remaining.split(',')] - if len(parts) >= 2: - # Assume the last part is the main artist - main_artist = parts[-1].strip() - collaborators = parts[:-1] - return (main_artist, collaborators) - - for pattern in primary_patterns: - if pattern.lower() in artist_string.lower(): - is_collaboration = True - # Find the actual pattern in the original string (case-insensitive) - import re - match = re.search(re.escape(pattern), artist_string, re.IGNORECASE) - if match: - split_pattern = match.group(0) - else: - split_pattern = pattern - break - - # If no primary collaboration found, check secondary patterns - if not is_collaboration: - for pattern in secondary_patterns: - if pattern == ',': - # Handle comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj") - if ',' in artist_string: - # Count commas to determine if this is likely a collaboration - comma_count = artist_string.count(',') - if comma_count >= 1: - # Split on comma and treat as collaboration - parts = [part.strip() for part in artist_string.split(',')] - if len(parts) >= 2: - # First artist is main, rest are collaborators - main_artist = parts[0] - collaborators = parts[1:] - return (main_artist, collaborators) - else: - # Use whitespace boundaries to avoid splitting within words like "Orlando" - import re - if pattern in ['&', 'and']: - pattern_regex = r'\s' + re.escape(pattern) + r'\s' - else: - pattern_regex = r'\b' + re.escape(pattern) + r'\b' - if re.search(pattern_regex, artist_string, re.IGNORECASE): - # For secondary patterns, be more careful - # Check if this looks like a band name vs collaboration - parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) - if len(parts) == 2: - part1 = parts[0].strip() - part2 = parts[1].strip() - - # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - part1_words = len(part1.split()) - part2_words = len(part2.split()) - - # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - - # If one part has a comma, it's likely a collaboration - if ',' in part1 or ',' in part2: - is_collaboration = True - split_pattern = pattern - break - - # For "&" and "and" separators, check against known band names - # Load band names from JSON file - try: - import json - import os - json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json') - with open(json_path, 'r') as f: - known_data = json.load(f) - known_band_names = {name.lower() for name in known_data.get('groups', [])} - except (FileNotFoundError, json.JSONDecodeError, KeyError): - # Fallback to a minimal list if file can't be loaded - known_band_names = { - "gerry and the pacemakers", "simon & garfunkel", "hall & oates", - "brooks & dunn", "the everly brothers" - } - - if artist_string.lower() in known_band_names: - # This is a known band name, not a collaboration - continue - - # For "and" separators, be more conservative - # Many band names use "and" (e.g., "Gerry and the Pacemakers") - if pattern.lower() == 'and': - # Only treat as collaboration if it looks like individual artists - # Band names often have "the" in the second part - if 'the ' in part2.lower(): - # Likely a band name, skip - continue - - # Likely a collaboration - is_collaboration = True - split_pattern = pattern - break - - if not is_collaboration: - return (artist_string, []) - - # Split on the pattern using the same regex that was used for detection - if split_pattern in ['&', 'and']: - # Use whitespace boundary regex for these patterns - pattern_regex = r'\s' + re.escape(split_pattern) + r'\s' - parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) - else: - # Use simple split for other patterns - parts = artist_string.split(split_pattern) - - if len(parts) < 2: - return (artist_string, []) - - main_artist = parts[0].strip() - collaborators_string = split_pattern.join(parts[1:]).strip() - - # Now parse the collaborators string which might have multiple separators - collaborators = self._parse_collaborators(collaborators_string) - - return (main_artist, collaborators) - - def _generate_title_variations(self, title: str) -> List[str]: - """ - Generate title variations by removing parenthetical content and fixing common issues. - Returns list of title variations to try. - """ - import re - - search_titles = [title.strip()] - - # Fix common typos and missing apostrophes - title_fixes = title.strip() - - # Fix missing apostrophes in common contractions - apostrophe_fixes = [ - (r'\bDont\b', "Don't"), - (r'\bCant\b', "Can't"), - (r'\bWont\b', "Won't"), - (r'\bArent\b', "Aren't"), - (r'\bIsnt\b', "Isn't"), - (r'\bWasnt\b', "Wasn't"), - (r'\bDidnt\b', "Didn't"), - (r'\bDoesnt\b', "Doesn't"), - (r'\bHavent\b', "Haven't"), - (r'\bHasnt\b', "Hasn't"), - (r'\bWouldnt\b', "Wouldn't"), - (r'\bCouldnt\b', "Couldn't"), - (r'\bShouldnt\b', "Shouldn't"), - (r'\bPhunk\b', "Funk"), # Common typo - (r'\bBout\b', "About"), # Shortened form - ] - - for pattern, replacement in apostrophe_fixes: - fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE) - if fixed_title != title_fixes: - title_fixes = fixed_title - if title_fixes not in search_titles: - search_titles.append(title_fixes) - - # Comprehensive parentheses removal - try multiple approaches - # 1. Remove all complete parentheses (most aggressive) - clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) - clean_title = clean_title.strip() - if clean_title != title.strip() and clean_title: - search_titles.append(clean_title) - - # 2. Remove specific common patterns first, then general parentheses - specific_patterns = [ - r'\s*\(Karaoke Version\)', - r'\s*\(Karaoke\)', - r'\s*\(Instrumental\)', - r'\s*\(Backing Track\)', - r'\s*\(live [^)]*\)', - r'\s*\(Live [^)]*\)', - r'\s*\(Acoustic\)', - r'\s*\(acoustic\)', - r'\s*\(Without Backing Vocals\)', - r'\s*\(Without Backing Vocals\)', - r'\s*\(Clean\)', - r'\s*\(clean\)', - r'\s*\(Remix\)', - r'\s*\(remix\)', - r'\s*\(Radio Edit\)', - r'\s*\(radio edit\)', - r'\s*\(Extended Mix\)', - r'\s*\(extended mix\)', - r'\s*\(Single Version\)', - r'\s*\(single version\)', - r'\s*\(Album Version\)', - r'\s*\(album version\)', - r'\s*\(Original Mix\)', - r'\s*\(original mix\)', - r'\s*\(John Lewis Christmas Ad \d+\)', # Specific pattern from test - r'\s*\(from the movie [^)]*\)', - r'\s*\(from the [^)]*\)', - r'\s*\(feat\. [^)]*\)', - r'\s*\(featuring [^)]*\)', - r'\s*\(ft\. [^)]*\)', - r'\s*\(duet\)', - r'\s*\(Duet\)', - r'\s*\(Two Semitones Down\)', - r'\s*\(Minus Piano\)', - r'\s*\(Cut Down\)', - r'\s*\(Boone & Speedy Vocals\)', - r'\s*\(My Heart Belongs to You\)', - ] - - # 3. Remove dash-separated content (like "Live At the BBC") - dash_patterns = [ - r'\s*-\s*Live [^-]*$', - r'\s*-\s*live [^-]*$', - r'\s*-\s*Live At [^-]*$', - r'\s*-\s*Live At the [^-]*$', - r'\s*-\s*Live At the BBC$', - r'\s*-\s*Live From [^-]*$', - r'\s*-\s*Live In [^-]*$', - r'\s*-\s*Live On [^-]*$', - ] - - # Apply specific patterns first - for pattern in specific_patterns: - specific_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - specific_clean = specific_clean.strip() - if specific_clean != title.strip() and specific_clean and specific_clean not in search_titles: - search_titles.append(specific_clean) - - # Apply dash patterns - for pattern in dash_patterns: - dash_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - dash_clean = dash_clean.strip() - if dash_clean != title.strip() and dash_clean and dash_clean not in search_titles: - search_titles.append(dash_clean) - - # 3. Remove any remaining parentheses after specific patterns - for pattern in specific_patterns: - remaining_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - remaining_clean = re.sub(r'\s*\([^)]*\)', '', remaining_clean.strip()) - remaining_clean = remaining_clean.strip() - if remaining_clean != title.strip() and remaining_clean and remaining_clean not in search_titles: - search_titles.append(remaining_clean) - - # 4. Remove unmatched opening parenthesis at end - clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip()) - clean_title2 = clean_title2.strip() - if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles: - search_titles.append(clean_title2) - - # 5. Remove unmatched closing parenthesis at start - clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip()) - clean_title3 = clean_title3.strip() - if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles: - search_titles.append(clean_title3) - - # 6. Try removing extra spaces and normalizing - normalized_title = re.sub(r'\s+', ' ', title.strip()) - if normalized_title != title.strip() and normalized_title not in search_titles: - search_titles.append(normalized_title) - - # 7. Apply normalization to all cleaned versions and remove duplicates - normalized_versions = [] - for version in search_titles: - # Normalize spaces (replace multiple spaces with single space) - normalized = re.sub(r'\s+', ' ', version.strip()) - # Remove leading/trailing spaces - normalized = normalized.strip() - if normalized and normalized not in normalized_versions: - normalized_versions.append(normalized) - - return normalized_versions - - def _parse_collaborators(self, collaborators_string: str) -> List[str]: - """ - Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists - """ - import re - - if not collaborators_string: - return [] - - # Split on common separators: comma, ampersand, "and" - # Use regex to handle multiple separators - # Use word boundaries to avoid splitting within words like "Orlando" - separators = r'[,&]|\b(?:and)\b' - - # Split and clean up each part - parts = re.split(separators, collaborators_string, flags=re.IGNORECASE) - - collaborators = [] - for part in parts: - part = part.strip() - if part: # Skip empty parts - collaborators.append(part) - - return collaborators \ No newline at end of file + return None \ No newline at end of file diff --git a/src/cli/main.py b/src/cli/main.py index c885142..4411bea 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -28,6 +28,7 @@ from ..utils.data_loader import data_loader # Import database and API clients from ..api.database import MusicBrainzDatabase from ..api.api_client import MusicBrainzAPIClient +from src.utils.artist_title_processing import parse_complex_collaboration, parse_collaborators, generate_title_variations class MusicBrainzCleaner: @@ -81,7 +82,7 @@ class MusicBrainzCleaner: # Handle collaborations - try to find main artist first # Use the same complex collaboration parsing as the database - main_artist, collaborators = self._parse_complex_collaboration(clean_name) + main_artist, collaborators = parse_complex_collaboration(clean_name) has_collaboration = len(collaborators) > 0 if has_collaboration: @@ -119,38 +120,68 @@ class MusicBrainzCleaner: return None - def find_recording_mbid(self, artist_mbid: str, title: str) -> Optional[str]: - clean_title = self._clean_title(title) + def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]: + # Use the shared title variations logic for comprehensive title cleaning + title_variations = generate_title_variations(title) # Load known recordings data first known_recordings = data_loader.load_known_recordings() - # Check known recordings first + # Check known recordings first with all title variations for (known_artist_mbid, known_title), recording_mbid in known_recordings.items(): - if artist_mbid == known_artist_mbid and clean_title.lower() == known_title.lower(): - return recording_mbid + if artist_mbid == known_artist_mbid: + for variation in title_variations: + if variation.lower() == known_title.lower(): + return recording_mbid # Handle collaborations using artist credit if self.use_database: - # If no artist_mbid (collaboration case), try to find by title and original artist name - if not artist_mbid: - # This is a collaboration case, try to find by title - result = self.db.fuzzy_search_recording(clean_title) - if result and isinstance(result, tuple) and len(result) >= 2: - return result[1] # Return MBID from tuple (recording_name, mbid, score) + # If no artist_mbid (collaboration case), try to find by title and verify artist credit + if not artist_mbid and original_artist: + # This is a collaboration case, try to find by title with all variations + for variation in title_variations: + # Search for recordings by title first + result = self.db.fuzzy_search_recording(variation) + if result and isinstance(result, tuple) and len(result) >= 2: + recording_mbid = result[1] + + # Verify that this recording has the correct artist credit + recording_info = self.get_recording_info(recording_mbid) + if recording_info and recording_info.get('artist_credit'): + # Check if the artist credit matches our expected collaboration + expected_artist_string = original_artist.replace(',', ' & ').replace(' and ', ' & ') + if recording_info['artist_credit'].lower() == expected_artist_string.lower(): + return recording_mbid + + # If exact match fails, try partial match + if recording_info and recording_info.get('artist_credit'): + # Check if all artists in the collaboration are present in the recording + main_artist, collaborators = parse_complex_collaboration(original_artist) + recording_artists = recording_info['artist_credit'].lower() + + # Check if main artist is in the recording + if main_artist.lower() in recording_artists: + # Check if at least one collaborator is also present + for collaborator in collaborators: + if collaborator.lower() in recording_artists: + return recording_mbid + + return None else: - # Regular case with artist_mbid - result = self.db.fuzzy_search_recording(clean_title, artist_mbid) - if result and isinstance(result, tuple) and len(result) >= 2: - return result[1] # Return MBID from tuple (recording_name, mbid, score) + # Regular case with artist_mbid - try all title variations + for variation in title_variations: + result = self.db.fuzzy_search_recording(variation, artist_mbid) + if result and isinstance(result, tuple) and len(result) >= 2: + return result[1] # Return MBID from tuple (recording_name, mbid, score) else: - # Fallback to API - try: - result = self.api.search_recording(clean_title, artist_mbid) - if result: - return result['id'] - except: - pass + # Fallback to API - try all title variations + for variation in title_variations: + try: + result = self.api.search_recording(variation, artist_mbid) + if result: + return result['id'] + except: + pass return None @@ -176,130 +207,6 @@ class MusicBrainzCleaner: parts.append(credit['joinphrase']) return ''.join(parts).strip() - def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]: - """ - Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer" - Returns: (main_artist, [collaborators]) - """ - import re - - # Primary collaboration indicators - primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat'] - - # Secondary collaboration indicators (need more careful handling) - secondary_patterns = ['&', 'and', ','] - - # Check if this is a collaboration - is_collaboration = False - split_pattern = None - - for pattern in primary_patterns: - if pattern.lower() in artist_string.lower(): - is_collaboration = True - # Find the actual pattern in the original string (case-insensitive) - import re - match = re.search(re.escape(pattern), artist_string, re.IGNORECASE) - if match: - split_pattern = match.group(0) - else: - split_pattern = pattern - break - - # If no primary collaboration found, check secondary patterns - if not is_collaboration: - for pattern in secondary_patterns: - # Use whitespace boundaries for & and and, word boundaries for others - if pattern in ['&', 'and']: - pattern_regex = r'\s' + re.escape(pattern) + r'\s' - else: - pattern_regex = r'\b' + re.escape(pattern) + r'\b' - if re.search(pattern_regex, artist_string, re.IGNORECASE): - # For secondary patterns, be more careful - # Check if this looks like a band name vs collaboration - parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) - if len(parts) == 2: - part1 = parts[0].strip() - part2 = parts[1].strip() - - # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - part1_words = len(part1.split()) - part2_words = len(part2.split()) - - # Check if this looks like a band name vs collaboration - # Band names typically have simple, short parts - # Collaborations often have more complex artist names - - # If one part has a comma, it's likely a collaboration - if ',' in part1 or ',' in part2: - is_collaboration = True - split_pattern = pattern - break - - # For "&" separators, be more permissive - # Most "&" separators in music are collaborations, not band names - # Only treat very obvious band names as non-collaborations - if part1_words == 1 and part2_words == 1: - # Check for very obvious band name patterns - obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn'] - if artist_string.lower() in obvious_band_names: - # Likely a band name, skip - continue - - # Likely a collaboration - is_collaboration = True - split_pattern = pattern - break - - if not is_collaboration: - return (artist_string, []) - - # Split on the pattern using the same regex that was used for detection - if split_pattern in ['&', 'and']: - # Use whitespace boundary regex for these patterns - pattern_regex = r'\s' + re.escape(split_pattern) + r'\s' - parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE) - else: - # Use simple split for other patterns - parts = artist_string.split(split_pattern) - - if len(parts) < 2: - return (artist_string, []) - - main_artist = parts[0].strip() - collaborators_string = split_pattern.join(parts[1:]).strip() - - # Now parse the collaborators string which might have multiple separators - collaborators = self._parse_collaborators(collaborators_string) - - return (main_artist, collaborators) - - def _parse_collaborators(self, collaborators_string: str) -> List[str]: - """ - Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists - """ - import re - - if not collaborators_string: - return [] - - # Split on common separators: comma, ampersand, "and" - # Use regex to handle multiple separators - # Use word boundaries to avoid splitting within words like "Orlando" - separators = r'[,&]|\b(?:and)\b' - - # Split and clean up each part - parts = re.split(separators, collaborators_string, flags=re.IGNORECASE) - - collaborators = [] - for part in parts: - part = part.strip() - if part: # Skip empty parts - collaborators.append(part) - - return collaborators - def clean_song(self, song: Dict[str, Any]) -> Tuple[Dict[str, Any], bool]: """ Clean a single song and return (cleaned_song, success_status) @@ -313,44 +220,37 @@ class MusicBrainzCleaner: # Handle collaboration case (artist_mbid is None) # Use the same complex collaboration parsing as the database artist_name = song.get('artist', '') - main_artist, collaborators = self._parse_complex_collaboration(artist_name) + main_artist, collaborators = parse_complex_collaboration(artist_name) has_collaboration = len(collaborators) > 0 if artist_mbid is None and has_collaboration: - # Try to find recording using artist credit approach - if self.use_database: - result = self.db.find_artist_credit(song.get('artist', ''), song.get('title', '')) - if result: - artist_credit_id, artist_string, recording_mbid = result + # Try to find recording using the improved find_recording_mbid method + recording_mbid = self.find_recording_mbid(None, song.get('title', ''), original_artist) + if recording_mbid: + # Get recording info to update the song + recording_info = self.get_recording_info(recording_mbid) + if recording_info: + # Update with the correct artist credit and title + if self.use_database and recording_info.get('artist_credit'): + song['artist'] = recording_info['artist_credit'] + elif not self.use_database and recording_info.get('artist-credit'): + artist_string = self._build_artist_string(recording_info['artist-credit']) + if artist_string: + song['artist'] = artist_string - # Update with the correct artist credit - song['artist'] = artist_string + song['title'] = recording_info['title'] song['recording_mbid'] = recording_mbid # For collaborations, try to get the main artist's MBID if self.use_database: - main_artist, collaborators = self._parse_complex_collaboration(song.get('artist', '')) + main_artist, collaborators = parse_complex_collaboration(song.get('artist', '')) if main_artist: artist_result = self.db.fuzzy_search_artist(main_artist) if artist_result and isinstance(artist_result, tuple) and len(artist_result) >= 2: song['mbid'] = artist_result[1] # Set the main artist's MBID return song, True - else: - return song, False - else: - # Fallback to API method - recording_mbid = self.find_recording_mbid(None, song.get('title', '')) - if recording_mbid: - recording_info = self.get_recording_info(recording_mbid) - if recording_info and recording_info.get('artist-credit'): - artist_string = self._build_artist_string(recording_info['artist-credit']) - if artist_string: - song['artist'] = artist_string - song['title'] = recording_info['title'] - song['recording_mbid'] = recording_mbid - return song, True - return song, False + return song, False # Regular case (non-collaboration or collaboration not found) if not artist_mbid: