musicbrainz-cleaner/src/utils/artist_title_processing.py

211 lines
11 KiB
Python

import re
import json
import os
from typing import List, Tuple
# Group names used when data/known_artists.json cannot be read. They contain
# "&" or "and" but must be kept as a single artist name.
_FALLBACK_BAND_NAMES = frozenset({
    "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
    "brooks & dunn", "the everly brothers",
})

# "ft", "ft.", "feat", "feat." or "featuring" as a standalone word. The word
# boundaries keep names such as "Taylor Swift" or "Daft Punk" from matching.
_FEATURING_MARKER = re.compile(r'\b(?:featuring|feat|ft)\b\.?', re.IGNORECASE)


def _load_known_band_names() -> set:
    """Return the lower-cased 'groups' entries of data/known_artists.json.

    Falls back to a small built-in set when the file is missing, unreadable,
    not valid JSON, or not shaped as a mapping with a list of strings.
    """
    json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            known_data = json.load(f)
        return {name.lower() for name in known_data.get('groups', [])}
    except (OSError, ValueError, KeyError, AttributeError, TypeError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return set(_FALLBACK_BAND_NAMES)


def parse_complex_collaboration(artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
    Returns: (main_artist, [collaborators])

    Rules are tried in this order, and the first one that applies wins:

    1. A string starting with "ft " or "feat " that contains commas is treated
       as malformed: the last comma-separated part is the main artist and the
       preceding parts are the collaborators.
    2. A standalone featuring marker (ft, ft., feat, feat., featuring) splits
       the string; the text after the first marker goes to
       parse_collaborators().
    3. A whitespace-delimited "&" or "and" that splits the string into exactly
       two parts marks a collaboration, unless the whole string is a known
       band name or, for "and", the second part contains the word "the".
    4. Otherwise a comma-separated list is split into its first entry and the
       remaining entries.

    If nothing applies, (artist_string, []) is returned unchanged.
    """
    # Special case: malformed strings like "ft Jamie Foxx West, Kanye".
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        parts = [part.strip() for part in remaining.split(',')]
        if len(parts) >= 2:
            # Assume the last part is the main artist.
            return (parts[-1], parts[:-1])

    # Primary indicators: split at the first standalone featuring marker.
    marker = _FEATURING_MARKER.search(artist_string)
    if marker:
        main_artist = artist_string[:marker.start()].strip()
        collaborators_string = artist_string[marker.end():].strip()
        return (main_artist, parse_collaborators(collaborators_string))

    # Secondary indicators: "&" and "and" surrounded by whitespace.
    known_band_names = None  # loaded lazily, at most once per call
    for separator in ('&', 'and'):
        separator_regex = r'\s' + re.escape(separator) + r'\s'
        parts = re.split(separator_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            continue
        part1 = parts[0].strip()
        part2 = parts[1].strip()

        # A comma on either side is taken as a collaboration without
        # consulting the band-name list.
        if ',' not in part1 and ',' not in part2:
            if known_band_names is None:
                known_band_names = _load_known_band_names()
            if artist_string.lower() in known_band_names:
                continue
            # "X and the Ys" is treated as a band name, not a collaboration.
            if separator == 'and' and re.search(r'\bthe\s', part2, re.IGNORECASE):
                continue

        return (part1, parse_collaborators(part2))

    # Comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj").
    if ',' in artist_string:
        parts = [part.strip() for part in artist_string.split(',')]
        # Drop empty names produced by stray or trailing commas.
        return (parts[0], [part for part in parts[1:] if part])

    return (artist_string, [])
def parse_collaborators(collaborators_string: str) -> List[str]:
    """Split a collaborator list into individual names.

    Names are separated by a comma, an ampersand, or the standalone word
    "and" (any letter case). Surrounding whitespace is removed from each name
    and empty pieces are discarded. An empty input yields an empty list.
    """
    names: List[str] = []
    if collaborators_string:
        pieces = re.split(r'[,&]|\b(?:and)\b', collaborators_string, flags=re.IGNORECASE)
        for piece in pieces:
            name = piece.strip()
            if name:
                names.append(name)
    return names
# Contraction repairs used when the shared data loader is unavailable.
# Keys are the broken spellings; values are the corrected text.
_FALLBACK_CONTRACTION_FIXES = {
    "dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
    "wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
    "ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
    "ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
    "arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
    "werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
    "havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
    "couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
    "didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
    "youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
    "whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
    "lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
    "heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
    "shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
    "were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
    "well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
    "itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
    "wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
    "shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
    "mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
    "oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
    "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll",
}

# Additional hardcoded spelling fixes, applied after the contraction fixes.
_ADDITIONAL_TITLE_FIXES = (
    (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About"),
)

# Parenthetical annotations removed one at a time from a title. All patterns
# are applied with re.IGNORECASE, so one spelling per annotation is enough.
_SPECIFIC_TAG_PATTERNS = (
    r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)',
    r'\s*\(live [^)]*\)', r'\s*\(Acoustic\)',
    r'\s*\(Without Backing Vocals\)', r'\s*\(Clean\)', r'\s*\(Remix\)',
    r'\s*\(Radio Edit\)', r'\s*\(Extended Mix\)',
    r'\s*\(Single Version\)', r'\s*\(Album Version\)',
    r'\s*\(Original Mix\)', r'\s*\(John Lewis Christmas Ad \d+\)',
    r'\s*\(from the movie [^)]*\)', r'\s*\(from the [^)]*\)', r'\s*\(feat\. [^)]*\)',
    r'\s*\(featuring [^)]*\)', r'\s*\(ft\. [^)]*\)', r'\s*\(duet\)',
    r'\s*\(Two Semitones Down\)', r'\s*\(Minus Piano\)', r'\s*\(Cut Down\)',
    r'\s*\(Boone & Speedy Vocals\)', r'\s*\(My Heart Belongs to You\)',
)


def _load_contraction_fixes() -> dict:
    """Return the contraction table from the data loader, or the fallback."""
    try:
        from ..utils.data_loader import data_loader
        return data_loader.load_contraction_fixes()
    except Exception:
        # Best effort: any import or loading failure falls back to the
        # hardcoded table. KeyboardInterrupt/SystemExit still propagate.
        return _FALLBACK_CONTRACTION_FIXES


def _strip_parenthetical_tags(text: str) -> List[str]:
    """Return the variants of *text* with parenthetical annotations removed.

    Each specific pattern is applied to *text* on its own, so every match
    yields a separate variant. Only when no specific pattern matched are all
    parenthesised groups removed in one go.
    """
    variants: List[str] = []
    specific_matched = False
    for pattern in _SPECIFIC_TAG_PATTERNS:
        cleaned = re.sub(pattern, '', text, flags=re.IGNORECASE).strip()
        if cleaned == text:
            continue
        specific_matched = True
        if cleaned and cleaned not in variants:
            variants.append(cleaned)
    if not specific_matched:
        cleaned = re.sub(r'\s*\([^)]*\)', '', text).strip()
        if cleaned and cleaned != text:
            variants.append(cleaned)
    return variants


def generate_title_variations(title: str) -> List[str]:
    """Build an ordered, duplicate-free list of search variants for a title.

    The list starts with the stripped title and then adds, in order:

    1. the title with curly apostrophes replaced by straight ones;
    2. each intermediate result of applying the contraction fixes and the
       additional spelling fixes cumulatively (whole words, case-insensitive);
    3. the original title with parenthetical annotations removed;
    4. the corrected title with parenthetical annotations removed.

    Returns:
        The list of candidate titles; the first entry is always title.strip().
    """
    original = title.strip()
    search_titles = [original]

    # Normalize apostrophes (curly to straight). Escapes are used so the
    # characters survive any re-encoding of this file.
    title_fixes = original.replace('\u2019', "'").replace('\u2018', "'")
    if title_fixes != original:
        search_titles.append(title_fixes)

    # Contraction fixes use word boundaries to avoid partial matches; the
    # additional fixes already carry their own boundaries.
    replacements = [
        (r'\b' + re.escape(broken_contraction) + r'\b', fixed_contraction)
        for broken_contraction, fixed_contraction in _load_contraction_fixes().items()
    ]
    replacements.extend(_ADDITIONAL_TITLE_FIXES)
    for pattern, replacement in replacements:
        fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Remove annotations from the original title first, then from the
    # corrected one, using the same rules for both.
    for source in (original, title_fixes):
        for variant in _strip_parenthetical_tags(source):
            if variant not in search_titles:
                search_titles.append(variant)

    return search_titles