"""Utilities for parsing artist collaboration strings and generating
song-title search variations."""
import json
import os
import re
from typing import List, Tuple
def parse_complex_collaboration(artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer".

    Recognizes, in priority order:
      1. malformed strings that *start* with "ft "/"feat " (e.g.
         "ft Jamie Foxx West, Kanye" -> the last comma part is the main artist);
      2. primary markers: "ft.", "feat.", "featuring", "ft", "feat";
      3. secondary separators: "&", "and", "," (with band-name heuristics,
         so e.g. "Simon & Garfunkel" is NOT treated as a collaboration).

    Returns:
        (main_artist, [collaborators]); ``(artist_string, [])`` when the
        string does not look like a collaboration.
    """
    # Primary markers are tried in order: dotted forms first so "ft." is not
    # shadowed by the bare "ft" token.
    primary_patterns = ['ft.', 'feat.', 'featuring', 'ft', 'feat']
    # Secondary separators need band-name checks before splitting.
    secondary_patterns = ['&', 'and', ',']

    is_collaboration = False
    split_pattern = None   # literal text we will rejoin the remainder with
    split_regex = None     # boundary-aware regex used for primary markers

    # Special case: malformed strings such as "ft Jamie Foxx West, Kanye",
    # which should read "Kanye West ft. Jamie Foxx".  Take the last comma
    # part as the main artist and the preceding parts as collaborators.
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        if ',' in remaining:
            parts = [part.strip() for part in remaining.split(',')]
            if len(parts) >= 2:
                return (parts[-1].strip(), parts[:-1])

    # BUG FIX: the original used a plain substring test, so the bare "ft"
    # marker matched inside names like "Taylor Swift" or "Daft Punk" and the
    # name was split mid-word.  Require word boundaries instead.  No trailing
    # \b after a dot: "." is not a word character, so \b there would demand a
    # following word character and "ft." at end-of-segment would never match.
    for pattern in primary_patterns:
        if pattern.endswith('.'):
            pattern_regex = r'\b' + re.escape(pattern)
        else:
            pattern_regex = r'\b' + re.escape(pattern) + r'\b'
        match = re.search(pattern_regex, artist_string, re.IGNORECASE)
        if match:
            is_collaboration = True
            split_pattern = match.group(0)  # keep the original casing
            split_regex = pattern_regex
            break

    # If no primary marker was found, look at the secondary separators.
    if not is_collaboration:
        for pattern in secondary_patterns:
            if pattern == ',':
                # Comma-separated artists, e.g.
                # "Ariana Grande, Normani, Nicki Minaj": first part is the
                # main artist, the rest are collaborators.
                parts = [part.strip() for part in artist_string.split(',')]
                if len(parts) >= 2:
                    return (parts[0], parts[1:])
            else:
                # "&"/"and" must be surrounded by whitespace so we never
                # split inside words ("Sandy", "Randy", ...).
                pattern_regex = r'\s' + re.escape(pattern) + r'\s'
                if not re.search(pattern_regex, artist_string, re.IGNORECASE):
                    continue
                parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                if len(parts) != 2:
                    continue
                part1 = parts[0].strip()
                part2 = parts[1].strip()
                # A comma on either side strongly suggests a collaboration
                # ("Pitbull, Ne-Yo & Afrojack").
                if ',' in part1 or ',' in part2:
                    is_collaboration = True
                    split_pattern = pattern
                    break
                # Check against known band names before treating "&"/"and"
                # as a collaboration separator.
                try:
                    json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
                    with open(json_path, 'r') as f:
                        known_data = json.load(f)
                    known_band_names = {name.lower() for name in known_data.get('groups', [])}
                except (FileNotFoundError, json.JSONDecodeError, KeyError):
                    # Minimal fallback when the data file is unavailable.
                    known_band_names = {
                        "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
                        "brooks & dunn", "the everly brothers"
                    }
                if artist_string.lower() in known_band_names:
                    continue
                # "X and The Y" is almost always a band name, not a collab.
                if pattern.lower() == 'and' and 'the ' in part2.lower():
                    continue
                is_collaboration = True
                split_pattern = pattern
                break

    if not is_collaboration:
        return (artist_string, [])

    # Split with the same regex that detected the marker, then hand the
    # remainder to parse_collaborators() for the fine-grained split.
    if split_pattern in ['&', 'and']:
        pattern_regex = r'\s' + re.escape(split_pattern) + r'\s'
        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
    else:
        parts = re.split(split_regex, artist_string, flags=re.IGNORECASE)
    if len(parts) < 2:
        return (artist_string, [])

    main_artist = parts[0].strip()
    collaborators_string = split_pattern.join(parts[1:]).strip()
    return (main_artist, parse_collaborators(collaborators_string))
def parse_collaborators(collaborators_string: str) -> List[str]:
    """Split a collaborator string on ",", "&", or the word "and".

    Returns the whitespace-trimmed, non-empty artist names in their
    original order; an empty/falsy input yields an empty list.
    """
    if not collaborators_string:
        return []
    # Word boundaries around "and" keep names like "Randy" intact.
    pieces = re.split(r'[,&]|\b(?:and)\b', collaborators_string, flags=re.IGNORECASE)
    names = []
    for piece in pieces:
        name = piece.strip()
        if name:
            names.append(name)
    return names
def generate_title_variations(title: str) -> List[str]:
    """
    Generate alternative search titles for a song title.

    Variations include: apostrophe normalization (curly -> straight),
    contraction repair ("dont" -> "don't"), a couple of stylized-spelling
    fixes ("Phunk" -> "Funk"), and removal of parenthesized qualifiers such
    as "(Karaoke Version)".  The original (stripped) title is always first.

    Returns:
        Ordered, de-duplicated list of candidate search titles.
    """
    search_titles = [title.strip()]
    title_fixes = title.strip()

    # Normalize apostrophes (curly to straight).  BUG FIX: the curly-quote
    # literals were encoding-garbled; use explicit Unicode escapes so the
    # source survives any encoding mishap (\u2019 = RIGHT SINGLE QUOTATION
    # MARK, \u2018 = LEFT SINGLE QUOTATION MARK).
    title_fixes = title_fixes.replace('\u2019', "'").replace('\u2018', "'")
    if title_fixes != title.strip():
        search_titles.append(title_fixes)

    # Load contraction fixes from the project data file; fall back to the
    # hardcoded table when the loader (or its relative import) is missing.
    try:
        from ..utils.data_loader import data_loader
        contraction_fixes = data_loader.load_contraction_fixes()
    except Exception:  # BUG FIX: was a bare `except:` (also swallowed SystemExit)
        contraction_fixes = {
            "dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
            "wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
            "ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
            "ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
            "arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
            "werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
            "havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
            "couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
            "didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
            "youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
            "whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
            "lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
            "heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
            "shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
            "were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
            "well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
            "itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
            "wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
            "shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
            "mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
            "oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
            "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll"
        }

    # Apply contraction fixes cumulatively; each successful fix produces a
    # new candidate title.  Word boundaries avoid partial-word matches.
    for broken_contraction, fixed_contraction in contraction_fixes.items():
        pattern = r'\b' + re.escape(broken_contraction) + r'\b'
        fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Additional hardcoded fixes for stylized spellings.
    additional_fixes = [
        (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
    ]
    for pattern, replacement in additional_fixes:
        fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Remove specific parenthesized qualifiers first; a general
    # parentheses-stripping pass runs only if none of these matched.
    specific_patterns = [
        r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)',
        r'\s*\(live [^)]*\)', r'\s*\(Live [^)]*\)', r'\s*\(Acoustic\)', r'\s*\(acoustic\)',
        r'\s*\(Without Backing Vocals\)', r'\s*\(Clean\)', r'\s*\(clean\)', r'\s*\(Remix\)',
        r'\s*\(Radio Edit\)', r'\s*\(radio edit\)', r'\s*\(Extended Mix\)', r'\s*\(extended mix\)',
        r'\s*\(Single Version\)', r'\s*\(single version\)', r'\s*\(Album Version\)', r'\s*\(album version\)',
        r'\s*\(Original Mix\)', r'\s*\(original mix\)', r'\s*\(John Lewis Christmas Ad \d+\)',
        r'\s*\(from the movie [^)]*\)', r'\s*\(from the [^)]*\)', r'\s*\(feat\. [^)]*\)',
        r'\s*\(featuring [^)]*\)', r'\s*\(ft\. [^)]*\)', r'\s*\(duet\)', r'\s*\(Duet\)',
        r'\s*\(Two Semitones Down\)', r'\s*\(Minus Piano\)', r'\s*\(Cut Down\)',
        r'\s*\(Boone & Speedy Vocals\)', r'\s*\(My Heart Belongs to You\)'
    ]
    # Apply specific patterns against the ORIGINAL title.
    for pattern in specific_patterns:
        specific_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE).strip()
        if specific_clean != title.strip() and specific_clean and specific_clean not in search_titles:
            search_titles.append(specific_clean)

    # Only remove general parentheses when no specific pattern matched
    # (search_titles still holds just the original title).
    if len(search_titles) == 1:
        clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()).strip()
        if clean_title != title.strip() and clean_title:
            search_titles.append(clean_title)

    # Second pass: the same high-value qualifiers, but applied to the
    # contraction-fixed title so fixed variants are also cleaned.
    fixed_title_cleanup_patterns = [
        r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)',
        r'\s*\(live [^)]*\)', r'\s*\(Live [^)]*\)', r'\s*\(Acoustic\)', r'\s*\(acoustic\)',
        r'\s*\(Without Backing Vocals\)', r'\s*\(Clean\)', r'\s*\(clean\)', r'\s*\(Remix\)'
    ]
    for pattern in fixed_title_cleanup_patterns:
        clean_title = re.sub(pattern, '', title_fixes, flags=re.IGNORECASE).strip()
        if clean_title and clean_title not in search_titles:
            search_titles.append(clean_title)

    return search_titles