# musicbrainz-cleaner/src/utils/artist_title_processing.py

import re
import json
import os
from typing import List, Tuple

def _load_known_band_names() -> set:
    """Return lower-cased names of groups that must not be split on '&' / 'and'.

    Reads the ``groups`` list from ``data/known_artists.json`` (two directories
    above this module). If the file is missing, unreadable, not valid JSON, or
    not shaped as expected, a small built-in set is returned instead so that
    parsing stays best-effort.
    """
    try:
        json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
        with open(json_path, 'r', encoding='utf-8') as f:
            known_data = json.load(f)
        return {name.lower() for name in known_data.get('groups', [])}
    except (OSError, ValueError, KeyError, AttributeError, TypeError):
        # OSError covers missing/unreadable files; ValueError covers
        # json.JSONDecodeError and decoding errors; AttributeError/TypeError
        # cover JSON whose top level or entries have an unexpected shape.
        return {
            "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
            "brooks & dunn", "the everly brothers"
        }


def parse_complex_collaboration(artist_string: str) -> Tuple[str, List[str]]:
    """
    Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
    Returns: (main_artist, [collaborators])

    Rules are tried in this order; the first that applies wins:

    1. Malformed input starting with "ft " / "feat " and containing a comma
       (e.g. "ft Jamie Foxx West, Kanye"): the text after the last comma is
       taken as the main artist, the preceding comma-separated parts as
       collaborators.
    2. Primary indicators ("ft.", "feat.", "featuring", "ft", "feat", checked in
       that priority order, case-insensitively): everything before the first
       occurrence is the main artist, everything after it is handed to
       ``parse_collaborators``. Indicators only match as whole tokens, so names
       such as "Daft Punk" or "Taylor Swift" are left intact.
    3. Secondary separators "&" then "and" (surrounded by whitespace): used only
       when they split the string into exactly two parts. Unless one part
       contains a comma, the split is skipped for known band names and, for
       "and", when the right-hand part contains "the " (e.g. "... and the ...").
    4. Commas: the first part is the main artist, the rest are collaborators.

    If nothing applies, ``(artist_string, [])`` is returned unchanged.
    """
    # Rule 1: malformed strings that begin with the indicator itself.
    if artist_string.lower().startswith(('ft ', 'feat ')):
        remaining = artist_string[artist_string.find(' ') + 1:].strip()
        if ',' in remaining:
            parts = [part.strip() for part in remaining.split(',')]
            # Assume the last part is the main artist.
            return (parts[-1], parts[:-1])

    # Rule 2: primary indicators, matched as whole tokens.
    for indicator in ('ft.', 'feat.', 'featuring', 'ft', 'feat'):
        # The dotted forms are already terminated by the '.', so only the
        # bare-word forms need a trailing "not a word character" check.
        trailing = '' if indicator.endswith('.') else r'(?!\w)'
        indicator_regex = r'(?<!\w)' + re.escape(indicator) + trailing
        match = re.search(indicator_regex, artist_string, re.IGNORECASE)
        if match:
            main_artist = artist_string[:match.start()].strip()
            collaborators_string = artist_string[match.end():].strip()
            return (main_artist, parse_collaborators(collaborators_string))

    # Rule 3: "&" / "and" as a separator between exactly two parts.
    for separator in ('&', 'and'):
        separator_regex = r'\s' + re.escape(separator) + r'\s'
        parts = re.split(separator_regex, artist_string, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Either no separator, or more than one occurrence: not handled here.
            continue
        first = parts[0].strip()
        second = parts[1].strip()
        # A comma in either part is treated as a clear sign of a collaboration,
        # so the band-name safeguards below are bypassed in that case.
        if ',' not in first and ',' not in second:
            if artist_string.lower() in _load_known_band_names():
                continue
            if separator == 'and' and 'the ' in second.lower():
                continue
        return (first, parse_collaborators(second))

    # Rule 4: plain comma-separated artists
    # (e.g. "Ariana Grande, Normani, Nicki Minaj").
    if ',' in artist_string:
        parts = [part.strip() for part in artist_string.split(',')]
        return (parts[0], parts[1:])

    return (artist_string, [])

def parse_collaborators(collaborators_string: str) -> List[str]:
    """
    Split a collaborator list such as "Ne-Yo, Afrojack & Nayer" into names.
    Names are separated on commas, ampersands and the whole word "and"
    (case-insensitive); surrounding whitespace is trimmed and empty pieces
    are dropped. A falsy input yields an empty list.
    """
    if collaborators_string:
        names = []
        for piece in re.split(r'[,&]|\b(?:and)\b', collaborators_string, flags=re.IGNORECASE):
            trimmed = piece.strip()
            if trimmed:
                names.append(trimmed)
        return names
    return []

# Parenthesised version tags that are stripped from both the original title and
# the contraction-fixed title. All patterns are applied case-insensitively.
_VERSION_TAG_PATTERNS = (
    r'\s*\(Karaoke Version\)', r'\s*\(Karaoke\)', r'\s*\(Instrumental\)', r'\s*\(Backing Track\)',
    r'\s*\(live [^)]*\)', r'\s*\(Acoustic\)',
    r'\s*\(Without Backing Vocals\)', r'\s*\(Clean\)', r'\s*\(Remix\)',
)

# Further parenthesised tags that are stripped from the original title only.
_EXTRA_TAG_PATTERNS = (
    r'\s*\(Radio Edit\)', r'\s*\(Extended Mix\)',
    r'\s*\(Single Version\)', r'\s*\(Album Version\)',
    r'\s*\(Original Mix\)', r'\s*\(John Lewis Christmas Ad \d+\)',
    r'\s*\(from the movie [^)]*\)', r'\s*\(from the [^)]*\)', r'\s*\(feat\. [^)]*\)',
    r'\s*\(featuring [^)]*\)', r'\s*\(ft\. [^)]*\)', r'\s*\(duet\)',
    r'\s*\(Two Semitones Down\)', r'\s*\(Minus Piano\)', r'\s*\(Cut Down\)',
    r'\s*\(Boone & Speedy Vocals\)', r'\s*\(My Heart Belongs to You\)',
)

# Used when the contraction table cannot be loaded through data_loader.
_FALLBACK_CONTRACTION_FIXES = {
    "dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
    "wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
    "ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
    "ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
    "arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
    "werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
    "havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
    "couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
    "didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
    "youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
    "whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
    "lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
    "heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
    "shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
    "were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
    "well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
    "itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
    "wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
    "shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
    "mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
    "oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
    "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll"
}


def generate_title_variations(title: str) -> List[str]:
    """
    Build a list of alternative spellings of a song title to search with.

    The first entry is always the stripped input title. Further distinct
    variants are appended in the order they are produced:

    1. typographic apostrophes replaced by straight ones;
    2. broken contractions repaired ("dont" -> "don't", ...), applied
       cumulatively to the running "fixed" title;
    3. a couple of hard-coded spelling fixes ("Phunk", "Bout");
    4. the original title with one known parenthesised tag removed
       (e.g. "(Karaoke Version)", "(Live ...)", "(feat. ...)");
    5. if no known tag matched, the original title with every parenthesised
       group removed;
    6. the "fixed" title with one known version tag removed.

    Returns: list of title strings, without duplicates added by this function.
    """
    original = title.strip()
    search_titles = [original]

    # Normalize apostrophes (curly to straight). The characters are written as
    # escapes so the literals cannot be flattened into plain quotes again.
    title_fixes = original.replace('\u2018', "'").replace('\u2019', "'")
    if title_fixes != original:
        search_titles.append(title_fixes)

    # Load contraction fixes from the project's data loader, falling back to
    # the built-in table when the import or the load fails for any reason.
    try:
        from ..utils.data_loader import data_loader
        contraction_fixes = data_loader.load_contraction_fixes()
    except Exception:
        contraction_fixes = _FALLBACK_CONTRACTION_FIXES

    # Apply contraction fixes. Several keys ("were", "well", "its", ...) are
    # also ordinary words, which is why results are added as extra variants
    # instead of replacing the original title.
    for broken_contraction, fixed_contraction in contraction_fixes.items():
        # Use word boundaries to avoid partial matches
        pattern = r'\b' + re.escape(broken_contraction) + r'\b'
        fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Additional hardcoded fixes for edge cases
    additional_fixes = [
        (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
    ]
    for pattern, replacement in additional_fixes:
        fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
        if fixed_title != title_fixes:
            title_fixes = fixed_title
            if title_fixes not in search_titles:
                search_titles.append(title_fixes)

    # Remove specific parenthesised tags from the original title, one pattern
    # at a time, remembering whether any of them matched at all.
    specific_matched = False
    for pattern in _VERSION_TAG_PATTERNS + _EXTRA_TAG_PATTERNS:
        specific_clean = re.sub(pattern, '', original, flags=re.IGNORECASE).strip()
        if specific_clean == original:
            continue
        specific_matched = True
        if specific_clean and specific_clean not in search_titles:
            search_titles.append(specific_clean)

    # Only remove general parentheses if no specific patterns matched. This is
    # tracked explicitly: variants added by the apostrophe/contraction steps
    # must not suppress the fallback.
    if not specific_matched:
        clean_title = re.sub(r'\s*\([^)]*\)', '', original).strip()
        if clean_title and clean_title != original and clean_title not in search_titles:
            search_titles.append(clean_title)

    # Finally strip the version tags from the contraction-fixed title as well.
    for pattern in _VERSION_TAG_PATTERNS:
        clean_title = re.sub(pattern, '', title_fixes, flags=re.IGNORECASE).strip()
        if clean_title and clean_title not in search_titles:
            search_titles.append(clean_title)

    return search_titles