import json
import os
import re
from functools import lru_cache
from typing import FrozenSet, List, Tuple

# A "featuring" keyword that stands on its own.  The look-behind stops it from
# matching inside ordinary words ("Daft Punk", "Taylor Swift", "Defeater");
# the tail accepts either the abbreviation dot ("ft.", "feat.") or any
# non-word character / end of string.
_FEATURING_RE = re.compile(r'(?<!\w)(?:featuring|feat|ft)(?:\.|(?!\w))', re.IGNORECASE)

# Separators between individual collaborators: comma, ampersand, or the whole
# word "and".
_COLLABORATOR_SEPARATOR_RE = re.compile(r'[,&]|\b(?:and)\b', re.IGNORECASE)

# Group names used when data/known_artists.json cannot be read.
_FALLBACK_BAND_NAMES = frozenset({
    "gerry and the pacemakers",
    "simon & garfunkel",
    "hall & oates",
    "brooks & dunn",
    "the everly brothers",
})

# Contraction repairs used when the project data loader is unavailable.
# Keys are matched as whole words, case-insensitively, in this order.
_FALLBACK_CONTRACTION_FIXES = {
    "dont": "don't", "don t": "don't",
    "cant": "can't", "can t": "can't",
    "wont": "won't", "won t": "won't",
    "im": "I'm", "i m": "I'm",
    "ive": "I've", "i ve": "I've",
    "id": "I'd", "i d": "I'd",
    "ill": "I'll", "i ll": "I'll",
    "isnt": "isn't", "isn t": "isn't",
    "arent": "aren't", "aren t": "aren't",
    "wasnt": "wasn't", "wasn t": "wasn't",
    "werent": "weren't", "weren t": "weren't",
    "hasnt": "hasn't", "hasn t": "hasn't",
    "havent": "haven't", "haven t": "haven't",
    "shouldnt": "shouldn't", "shouldn t": "shouldn't",
    "couldnt": "couldn't", "couldn t": "couldn't",
    "wouldnt": "wouldn't", "wouldn t": "wouldn't",
    "didnt": "didn't", "didn t": "didn't",
    "theyre": "they're", "they re": "they're",
    "youre": "you're", "you re": "you're",
    "whos": "who's", "who s": "who's",
    "whats": "what's", "what s": "what's",
    "thats": "that's", "that s": "that's",
    "lets": "let's", "let s": "let's",
    "theres": "there's", "there s": "there's",
    "heres": "here's", "here s": "here's",
    "hows": "how's", "how s": "how's",
    "shes": "she's", "she s": "she's",
    "hes": "he's", "he s": "he's",
    "were": "we're", "we re": "we're",
    "weve": "we've", "we ve": "we've",
    "well": "we'll", "we ll": "we'll",
    "its": "it's", "it s": "it's",
    "itll": "it'll", "it ll": "it'll",
    "yall": "y'all", "y all": "y'all",
    "wouldve": "would've", "would ve": "would've",
    "couldve": "could've", "could ve": "could've",
    "shouldve": "should've", "should ve": "should've",
    "mightve": "might've", "might ve": "might've",
    "mustve": "must've", "must ve": "must've",
    "maam": "ma'am", "ma am": "ma'am",
    "oclock": "o'clock", "o clock": "o'clock",
    "aint": "ain't", "ain t": "ain't",
    "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll",
}

# Spelling repairs applied after the contraction fixes.
_ADDITIONAL_TITLE_FIXES = (
    (re.compile(r'\bPhunk\b', re.IGNORECASE), "Funk"),
    (re.compile(r'\bBout\b', re.IGNORECASE), "About"),
)

# Parenthesised version tags.  This first group is stripped from both the raw
# title and the spelling-corrected title.  All patterns are compiled with
# re.IGNORECASE, so no separate lower-case variants are needed.
_VERSION_TAG_PATTERNS = (
    r'\s*\(Karaoke Version\)',
    r'\s*\(Karaoke\)',
    r'\s*\(Instrumental\)',
    r'\s*\(Backing Track\)',
    r'\s*\(live [^)]*\)',
    r'\s*\(Acoustic\)',
    r'\s*\(Without Backing Vocals\)',
    r'\s*\(Clean\)',
    r'\s*\(Remix\)',
)

# Further tags that are stripped from the raw title only.
_EXTRA_TAG_PATTERNS = (
    r'\s*\(Radio Edit\)',
    r'\s*\(Extended Mix\)',
    r'\s*\(Single Version\)',
    r'\s*\(Album Version\)',
    r'\s*\(Original Mix\)',
    r'\s*\(John Lewis Christmas Ad \d+\)',
    r'\s*\(from the movie [^)]*\)',
    r'\s*\(from the [^)]*\)',
    r'\s*\(feat\. [^)]*\)',
    r'\s*\(featuring [^)]*\)',
    r'\s*\(ft\. [^)]*\)',
    r'\s*\(duet\)',
    r'\s*\(Two Semitones Down\)',
    r'\s*\(Minus Piano\)',
    r'\s*\(Cut Down\)',
    r'\s*\(Boone & Speedy Vocals\)',
    r'\s*\(My Heart Belongs to You\)',
)

_VERSION_TAG_RES = tuple(re.compile(p, re.IGNORECASE) for p in _VERSION_TAG_PATTERNS)
_ALL_TAG_RES = _VERSION_TAG_RES + tuple(
    re.compile(p, re.IGNORECASE) for p in _EXTRA_TAG_PATTERNS
)

# Any parenthesised group; used only when no specific tag matched.
_ANY_PARENS_RE = re.compile(r'\s*\([^)]*\)')


@lru_cache(maxsize=1)
def _load_known_band_names() -> FrozenSet[str]:
    """Return the lower-cased names of groups that must not be split.

    The names come from the ``groups`` list in ``data/known_artists.json``
    (resolved two directories above this module).  If the file is missing,
    unreadable, not valid JSON, or not shaped as expected, a small built-in
    set is returned instead.  The result is cached, so the file is read at
    most once per process.
    """
    json_path = os.path.join(
        os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json'
    )
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            known_data = json.load(f)
        return frozenset(name.lower() for name in known_data.get('groups', []))
    except (OSError, ValueError, AttributeError, TypeError):
        # OSError: missing/unreadable file; ValueError: bad JSON or encoding;
        # AttributeError/TypeError: JSON that is not a dict with a list of strings.
        return _FALLBACK_BAND_NAMES


def parse_complex_collaboration(artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"

    The checks run in this order and the first one that applies wins:

    1. Malformed strings that *start* with "ft " / "feat " and contain a
       comma: the text after the last comma is taken as the main artist and
       the comma-separated parts before it as collaborators.
    2. A stand-alone featuring keyword ("ft", "ft.", "feat", "feat.",
       "featuring", any case): everything before the first keyword is the
       main artist; everything after is split by parse_collaborators().
    3. A single " & " or " and " separator: treated as a collaboration when
       either side contains a comma.  Otherwise it is treated as a
       collaboration unless the whole string is a known group name or, for
       "and", the right-hand side contains "the " (e.g. "... and the
       Pacemakers").
    4. Commas only: the first item is the main artist, the rest are
       collaborators.

    Args:
        artist_string: Raw artist credit.

    Returns: (main_artist, [collaborators]); the list is empty and
        main_artist is the unchanged input when no collaboration is detected.
    """
    # 1. Malformed "ft X, Main" form.
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        if ',' in remaining:
            parts = [part.strip() for part in remaining.split(',')]
            # Assume the last part is the main artist.
            return (parts[-1], parts[:-1])

    # 2. Featuring keyword.  Matched as a whole word so that names which
    #    merely contain the letters "ft"/"feat" are left intact.
    featuring = _FEATURING_RE.search(artist_string)
    if featuring:
        main_artist = artist_string[:featuring.start()].strip()
        collaborators_string = artist_string[featuring.end():].strip()
        return (main_artist, parse_collaborators(collaborators_string))

    # 3. "&" / "and" surrounded by whitespace, occurring exactly once.
    for separator in ('&', 'and'):
        pattern_regex = r'\s' + re.escape(separator) + r'\s'
        parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            continue
        left = parts[0].strip()
        right = parts[1].strip()

        # A comma on either side marks a list of artists; without one the
        # string may be a band name and needs the extra checks.
        if ',' not in left and ',' not in right:
            if artist_string.lower() in _load_known_band_names():
                continue
            if separator == 'and' and 'the ' in right.lower():
                continue
        return (left, parse_collaborators(right))

    # 4. Comma-separated artists (e.g., "Ariana Grande, Normani, Nicki Minaj").
    if ',' in artist_string:
        parts = [part.strip() for part in artist_string.split(',')]
        return (parts[0], parts[1:])

    return (artist_string, [])


def parse_collaborators(collaborators_string: str) -> List[str]:
    """Split a collaborator list into individual names.

    Names are separated by ",", "&" or the whole word "and" (any case).
    Surrounding whitespace is removed and empty pieces are dropped.

    Args:
        collaborators_string: Text following the featuring keyword or
            separator, e.g. "Ne-Yo, Afrojack & Nayer".

    Returns:
        The names in their original order; an empty list for empty input.
    """
    if not collaborators_string:
        return []
    pieces = _COLLABORATOR_SEPARATOR_RE.split(collaborators_string)
    return [piece.strip() for piece in pieces if piece.strip()]


def generate_title_variations(title: str) -> List[str]:
    """Build a de-duplicated list of search variants for a song title.

    The stripped input title always comes first.  Variants are then appended
    in this order, each only if it is not already present:

    1. the title with curly apostrophes replaced by straight ones;
    2. the title after each contraction repair (e.g. "dont" -> "don't") and
       each additional spelling repair, applied cumulatively;
    3. the raw title with one known parenthesised tag removed (one variant
       per matching tag);
    4. if no known tag matched, the raw title with every parenthesised group
       removed;
    5. the repaired title with one known version tag removed.

    Contraction repairs are loaded from the project's data loader when it can
    be imported; otherwise a built-in table is used.

    Args:
        title: Raw song title.

    Returns:
        Candidate titles, original first.
    """
    original = title.strip()
    search_titles = [original]

    # Normalize apostrophes (curly to straight).  Escapes are used so the
    # characters cannot be damaged by a re-encoding of this file.
    title_fixes = original.replace('\u2018', "'").replace('\u2019', "'")
    if title_fixes != original:
        search_titles.append(title_fixes)

    # Load contraction fixes from the project data loader, if available.
    try:
        from ..utils.data_loader import data_loader
        contraction_fixes = data_loader.load_contraction_fixes()
    except Exception:
        # Best effort: fall back to the built-in table on any loader failure.
        contraction_fixes = _FALLBACK_CONTRACTION_FIXES

    # Apply contraction fixes cumulatively.
    for broken_contraction, fixed_contraction in contraction_fixes.items():
        # Use word boundaries to avoid partial matches.
        pattern = r'\b' + re.escape(broken_contraction) + r'\b'
        # A callable replacement inserts the text literally, so backslashes in
        # loaded data are not interpreted as group references.
        fixed_title = re.sub(
            pattern,
            lambda _m, text=fixed_contraction: text,
            title_fixes,
            flags=re.IGNORECASE,
        )
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Additional hardcoded fixes for edge cases.
    for fix_re, replacement in _ADDITIONAL_TITLE_FIXES:
        fixed_title = fix_re.sub(replacement, title_fixes)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Remove specific tags from the raw title, one pattern at a time.
    specific_matched = False
    for tag_re in _ALL_TAG_RES:
        specific_clean = tag_re.sub('', original).strip()
        if specific_clean == original:
            continue
        specific_matched = True
        if specific_clean and specific_clean not in search_titles:
            search_titles.append(specific_clean)

    # Only remove general parentheses if no specific patterns matched.  An
    # explicit flag is used because earlier spelling variants also grow
    # search_titles and must not suppress this fallback.
    if not specific_matched:
        clean_title = _ANY_PARENS_RE.sub('', original).strip()
        if clean_title != original and clean_title and clean_title not in search_titles:
            search_titles.append(clean_title)

    # Remove version tags from the spelling-corrected title as well.
    for tag_re in _VERSION_TAG_RES:
        clean_title = tag_re.sub('', title_fixes).strip()
        if clean_title and clean_title not in search_titles:
            search_titles.append(clean_title)

    return search_titles