musicbrainz-cleaner/src/api/database.py

"""
Direct PostgreSQL database access for MusicBrainz Data Cleaner.
Provides fuzzy search capabilities for artists and recordings.
"""

import psycopg2
import psycopg2.extras
from typing import List, Dict, Optional, Tuple, Any
from fuzzywuzzy import fuzz
from ..config.constants import (
    DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD, DB_CONNECTION_TIMEOUT,
    FUZZY_SEARCH_THRESHOLD, MAX_SEARCH_RESULTS,
    TITLE_SIMILARITY_THRESHOLD, ARTIST_SIMILARITY_THRESHOLD,
    SUCCESS_MESSAGES, ERROR_MESSAGES
)


class MusicBrainzDatabase:
    """Direct PostgreSQL database access for MusicBrainz with fuzzy search."""

    def __init__(self, host: str = DB_HOST, port: int = DB_PORT,
                 database: str = DB_NAME, user: str = DB_USER,
                 password: str = DB_PASSWORD):
        self.host = host
        self.port = port
        self.database = database
        self.user = user
        self.password = password
        self.connection = None
        self.cursor = None

    def connect(self) -> bool:
        """Establish connection to the MusicBrainz database.

        The host given to the constructor is tried first. The Docker Compose
        service name ``'db'`` (which this method used to hard-code, ignoring
        ``self.host``) is kept as a fallback so existing container setups
        keep working.

        Returns:
            True when a connection and a dict-style cursor were created,
            False otherwise (the last error is printed).
        """
        candidate_hosts = [self.host]
        if self.host != 'db':
            candidate_hosts.append('db')  # previous hard-coded Docker service name

        last_error = None
        for host in candidate_hosts:
            connection = None
            try:
                connection = psycopg2.connect(
                    host=host,
                    port=self.port,
                    database=self.database,
                    user=self.user,
                    password=self.password,
                    connect_timeout=DB_CONNECTION_TIMEOUT
                )
                cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            except Exception as e:
                last_error = e
                if connection is not None:
                    # The cursor could not be created: do not leak the open connection.
                    try:
                        connection.close()
                    except Exception:
                        pass  # best effort; the original error is what gets reported
                continue

            # Publish the handles only once both exist, so a failed attempt
            # never leaves a half-initialised object behind.
            self.connection = connection
            self.cursor = cursor
            print(SUCCESS_MESSAGES['db_connected'])
            return True

        print(ERROR_MESSAGES['db_connection_failed'])
        print(f"Connection error: {last_error}")
        return False

    def disconnect(self):
        """Close database connection."""
        if self.cursor:
            self.cursor.close()
        if self.connection:
            self.connection.close()

    def __enter__(self):
        """Context manager entry."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.disconnect()

    def fuzzy_search_artist(self, artist_name: str) -> Optional[Tuple[str, str, float]]:
        """
        Fuzzy search for artist by name.
        Returns (artist_name, mbid, similarity_score) or None.
        """
        if not self.connection:
            return None

        try:
            # Clean artist name
            clean_name = artist_name.strip()

            # Handle collaborations - extract main artist
            if 'ft.' in clean_name.lower():
                main_artist = clean_name.split('ft.')[0].strip()
            elif 'feat.' in clean_name.lower():
                main_artist = clean_name.split('feat.')[0].strip()
            else:
                main_artist = clean_name

            # Try exact match first - but check for multiple matches
            query = "SELECT name, gid FROM artist WHERE name = %s"
            self.cursor.execute(query, (main_artist,))
            results = self.cursor.fetchall()

            if results:
                if len(results) == 1:
                    # Only one exact match, use it
                    result = results[0]
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=1.0
                    ))
                    return (result['name'], result['gid'], 1.0)
                else:
                    # Multiple exact matches - choose the one with most recordings
                    print(f"🔍 Found {len(results)} exact matches for '{main_artist}', checking recording counts...")
                    artists_with_counts = []
                    for artist in results:
                        query = """
                            SELECT COUNT(r.id) as recording_count
                            FROM artist a
                            JOIN artist_credit_name acn ON a.id = acn.artist
                            JOIN artist_credit ac ON acn.artist_credit = ac.id
                            JOIN recording r ON ac.id = r.artist_credit
                            WHERE a.gid = %s
                        """
                        self.cursor.execute(query, (artist['gid'],))
                        count_result = self.cursor.fetchone()
                        recording_count = count_result['recording_count'] if count_result else 0
                        artists_with_counts.append({
                            'name': artist['name'],
                            'gid': artist['gid'],
                            'recording_count': recording_count
                        })
                        print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {recording_count} recordings")

                    # Choose the artist with the most recordings
                    best_artist = max(artists_with_counts, key=lambda x: x['recording_count'])
                    print(f"  🎯 Selected: {best_artist['name']} with {best_artist['recording_count']} recordings")

                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=best_artist['name'],
                        score=1.0
                    ))
                    return (best_artist['name'], best_artist['gid'], 1.0)

            # Try case-insensitive match
            query = "SELECT name, gid FROM artist WHERE LOWER(name) = LOWER(%s) LIMIT 1"
            self.cursor.execute(query, (main_artist,))
            result = self.cursor.fetchone()
            if result:
                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                    original=artist_name,
                    matched=result['name'],
                    score=0.99
                ))
                return (result['name'], result['gid'], 0.99)

            # Try matching with different dash characters
            # Replace regular dash with Unicode dash and vice versa
            dash_variants = [
                main_artist.replace('-', '‐'),  # Regular dash to Unicode dash
                main_artist.replace('‐', '-'),  # Unicode dash to regular dash
            ]

            for variant in dash_variants:
                if variant != main_artist:
                    query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                    self.cursor.execute(query, (variant,))
                    result = self.cursor.fetchone()
                    if result:
                        print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                            original=artist_name,
                            matched=result['name'],
                            score=0.98
                        ))
                        return (result['name'], result['gid'], 0.98)

            # Try matching with "&" vs "and" variations
            if '&' in main_artist:
                and_variant = main_artist.replace('&', 'and')
                query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                self.cursor.execute(query, (and_variant,))
                result = self.cursor.fetchone()
                if result:
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=0.97
                    ))
                    return (result['name'], result['gid'], 0.97)
            elif ' and ' in main_artist.lower():
                ampersand_variant = main_artist.replace(' and ', ' & ')
                query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                self.cursor.execute(query, (ampersand_variant,))
                result = self.cursor.fetchone()
                if result:
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=0.97
                    ))
                    return (result['name'], result['gid'], 0.97)

            # Try fuzzy search
            # Use more specific patterns to avoid false matches
            # Also try different dash characters and name variations
            dash_variants = [
                main_artist,
                main_artist.replace('-', '‐'),  # Regular dash to Unicode dash
                main_artist.replace('‐', '-'),  # Unicode dash to regular dash
            ]

            # Add name variations for common cases like "S Club 7" -> "S Club"
            name_variants = []
            for variant in dash_variants:
                name_variants.append(variant)
                # Handle cases like "S Club 7" -> "S Club"
                if ' ' in variant:
                    parts = variant.split()
                    if len(parts) > 1:
                        # Try without the last part if it's a number (e.g., "S Club 7" -> "S Club")
                        if parts[-1].isdigit():
                            name_variants.append(' '.join(parts[:-1]))

            # First try exact matches with name variations
            for variant in name_variants:
                # Try exact match in artist names
                query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                self.cursor.execute(query, (variant,))
                result = self.cursor.fetchone()
                if result:
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=0.95
                    ))
                    return (result['name'], result['gid'], 0.95)

                            # Try exact match in artist aliases
            query = """
                SELECT a.name, a.gid
                FROM artist a
                JOIN artist_alias aa ON a.id = aa.artist
                WHERE aa.name = %s
                LIMIT 1
            """
            self.cursor.execute(query, (variant,))
            result = self.cursor.fetchone()
            if result:
                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                    original=artist_name,
                    matched=result['name'],
                    score=0.94
                ))
                return (result['name'], result['gid'], 0.94)

            # Try exact match in sort_name field
            query = "SELECT name, gid FROM artist WHERE sort_name = %s LIMIT 1"
            self.cursor.execute(query, (variant,))
            result = self.cursor.fetchone()
            if result:
                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                    original=artist_name,
                    matched=result['name'],
                    score=0.93
                ))
                return (result['name'], result['gid'], 0.93)

            # Then try starts-with patterns for better precision
            patterns = []
            for variant in name_variants:
                patterns.extend([
                    f"{variant}%",   # Starts with (most precise)
                    f" {variant} ",  # Word boundary
                    f"%{variant}%",  # Contains (least precise)
                ])

            all_artists = []
            for pattern in patterns:
                # Search in artist names
                query = "SELECT name, gid FROM artist WHERE name ILIKE %s LIMIT 20"
                self.cursor.execute(query, (pattern,))
                artists = self.cursor.fetchall()
                all_artists.extend(artists)

                # Search in artist aliases
                query = """
                    SELECT a.name, a.gid
                    FROM artist a
                    JOIN artist_alias aa ON a.id = aa.artist
                    WHERE aa.name ILIKE %s
                    LIMIT 20
                """
                self.cursor.execute(query, (pattern,))
                artists = self.cursor.fetchall()
                all_artists.extend(artists)

                # Search in sort_name field
                query = "SELECT name, gid FROM artist WHERE sort_name ILIKE %s LIMIT 20"
                self.cursor.execute(query, (pattern,))
                artists = self.cursor.fetchall()
                all_artists.extend(artists)

            # Remove duplicates based on MBID
            seen_mbids = set()
            unique_artists = []
            for artist in all_artists:
                if artist['gid'] not in seen_mbids:
                    seen_mbids.add(artist['gid'])
                    unique_artists.append(artist)

            # Get recording counts for all artists to prioritize famous ones
            artists_with_counts = []
            print(f"🔍 Found {len(unique_artists)} unique artists, checking recording counts...")
            for artist in unique_artists:
                query = """
                    SELECT COUNT(r.id) as recording_count
                    FROM artist a
                    JOIN artist_credit_name acn ON a.id = acn.artist
                    JOIN artist_credit ac ON acn.artist_credit = ac.id
                    JOIN recording r ON ac.id = r.artist_credit
                    WHERE a.gid = %s
                """
                self.cursor.execute(query, (artist['gid'],))
                result = self.cursor.fetchone()
                recording_count = result['recording_count'] if result else 0
                artists_with_counts.append({
                    'name': artist['name'],
                    'gid': artist['gid'],
                    'recording_count': recording_count
                })
                print(f"  📊 {artist['name']} ({artist['gid'][:8]}...): {recording_count} recordings")

            best_match = None
            best_score = 0
            best_recording_count = 0

            for artist in artists_with_counts:
                score = fuzz.ratio(main_artist.lower(), artist['name'].lower())

                # Only apply strict filtering for very obvious false matches
                # like "Sleazy-E" vs "Eazy-E" where one is clearly a substring of the other
                if (main_artist.lower() in artist['name'].lower() and
                    artist['name'].lower() != main_artist.lower() and
                    len(artist['name']) > len(main_artist) + 2):
                    # This is likely a false match, require very high score
                    if score < 90:
                        continue

                # Prioritize artists with more recordings (famous artists)
                # If scores are close (within 5 points), prefer the one with more recordings
                if (score > best_score or
                    (score >= best_score - 5 and artist['recording_count'] > best_recording_count * 2)):
                    best_score = score
                    best_match = artist
                    best_recording_count = artist['recording_count']
                    print(f"  🎯 New best match: {artist['name']} (score: {score}, recordings: {artist['recording_count']})")

            if best_score >= 80:
                print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                    original=artist_name,
                    matched=best_match['name'],
                    score=best_score / 100.0
                ))
                return (best_match['name'], best_match['gid'], best_score / 100.0)

            return None

        except Exception as e:
            print(f"Error in fuzzy_search_artist: {e}")
            return None

    def fuzzy_search_recording(self, title: str, artist_mbid: Optional[str] = None) -> Optional[Tuple[str, str, float]]:
        """
        Fuzzy search for recording by title and optionally artist MBID.
        Returns (recording_name, mbid, similarity_score) or None.
        """
        if not self.connection:
            return None

        try:
            # Generate title variations using the helper method
            search_titles = self._generate_title_variations(title)

            if artist_mbid:
                # Try to find recording by artist and title with all variations
                for search_title in search_titles:
                    query = """
                        SELECT r.name, r.gid
                        FROM recording r
                        JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
                        JOIN artist a ON acn.artist = a.id
                        WHERE a.gid = %s AND r.name ILIKE %s
                        LIMIT 1
                    """
                    self.cursor.execute(query, (artist_mbid, f"%{search_title}%"))
                    result = self.cursor.fetchone()

                    if result:
                        return (result['name'], result['gid'], 1.0)

                # Try fuzzy search with all title variations
                query = """
                    SELECT r.name, r.gid
                    FROM recording r
                    JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
                    JOIN artist a ON acn.artist = a.id
                    WHERE a.gid = %s
                    LIMIT 100
                """
                self.cursor.execute(query, (artist_mbid,))
                recordings = self.cursor.fetchall()

                best_match = None
                best_score = 0

                for recording in recordings:
                    for search_title in search_titles:
                        score = fuzz.ratio(search_title.lower(), recording['name'].lower())
                        if score > best_score:
                            best_score = score
                            best_match = recording

                if best_score >= 80:
                    return (best_match['name'], best_match['gid'], best_score / 100.0)
            else:
                # No artist constraint - search by title only with all variations
                for search_title in search_titles:
                    # Try exact match first
                    query = "SELECT name, gid FROM recording WHERE name = %s LIMIT 1"
                    self.cursor.execute(query, (search_title,))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 1.0)

                    # Try case-insensitive match
                    query = "SELECT name, gid FROM recording WHERE LOWER(name) = LOWER(%s) LIMIT 1"
                    self.cursor.execute(query, (search_title,))
                    result = self.cursor.fetchone()
                    if result:
                        return (result['name'], result['gid'], 0.99)

                # Try partial match with all variations
                best_match = None
                best_score = 0

                for search_title in search_titles:
                    query = "SELECT name, gid FROM recording WHERE name ILIKE %s LIMIT 50"
                    self.cursor.execute(query, (f"%{search_title}%",))
                    recordings = self.cursor.fetchall()

                    for recording in recordings:
                        score = fuzz.ratio(search_title.lower(), recording['name'].lower())
                        if score > best_score:
                            best_score = score
                            best_match = recording

                if best_score >= 80:
                    return (best_match['name'], best_match['gid'], best_score / 100.0)

            return None

        except Exception as e:
            print(f"Error in fuzzy_search_recording: {e}")
            return None

    def get_artist_info(self, mbid: str) -> Optional[Dict[str, Any]]:
        """Get detailed artist information by MBID."""
        if not self.connection:
            return None

        try:
            query = """
                SELECT name, gid as mbid, sort_name
                FROM artist
                WHERE gid = %s
            """
            self.cursor.execute(query, (mbid,))
            artist = self.cursor.fetchone()

            if artist:
                return dict(artist)
            return None

        except Exception as e:
            print(ERROR_MESSAGES['db_query_failed'].format(error=str(e)))
            return None

    def get_recording_info(self, mbid: str) -> Optional[Dict[str, Any]]:
        """Get recording information by MBID."""
        if not self.connection:
            return None

        try:
            query = """
                SELECT r.name as title, r.gid as mbid, ac.id as artist_credit_id
                FROM recording r
                JOIN artist_credit ac ON r.artist_credit = ac.id
                WHERE r.gid = %s
                LIMIT 1
            """
            self.cursor.execute(query, (mbid,))
            result = self.cursor.fetchone()

            if result:
                # Get artist credit information
                artist_query = """
                    SELECT a.name, acn.join_phrase, acn.position
                    FROM artist_credit_name acn
                    JOIN artist a ON acn.artist = a.id
                    WHERE acn.artist_credit = %s
                    ORDER BY acn.position
                """
                self.cursor.execute(artist_query, (result['artist_credit_id'],))
                artists = self.cursor.fetchall()

                # Build the artist string
                artist_string = ""
                for artist in artists:
                    artist_string += artist['name']
                    if artist['join_phrase']:
                        artist_string += artist['join_phrase']

                return {
                    'title': result['title'],
                    'mbid': result['mbid'],
                    'artist_credit': artist_string
                }

            return None

        except Exception as e:
            print(f"Error in get_recording_info: {e}")
            return None

    def test_connection(self) -> bool:
        """Test database connection."""
        try:
            if not self.connection:
                return self.connect()

            self.cursor.execute("SELECT 1")
            return True
        except Exception as e:
            print(f"Database connection test failed: {e}")
            return False

    def find_artist_credit(self, artist_name: str, title: str) -> Optional[Tuple[str, str, str]]:
        """
        Find artist credit for collaboration.
        Returns (artist_credit_id, artist_string, recording_mbid) or None.
        """
        if not self.connection:
            return None

        try:
            # Clean artist name
            clean_name = artist_name.strip()

            # Parse complex collaboration
            main_artist, collaborators = self._parse_complex_collaboration(clean_name)
            collaboration = len(collaborators) > 0

            # Generate title variations using the helper method
            search_titles = self._generate_title_variations(title)

            # Try to find recording with artist credit using all title variations
            for search_title in search_titles:
                query = """
                    SELECT ac.id as artist_credit_id, r.gid as recording_mbid
                    FROM recording r
                    JOIN artist_credit ac ON r.artist_credit = ac.id
                    JOIN artist_credit_name acn ON ac.id = acn.artist_credit
                    JOIN artist a ON acn.artist = a.id
                    WHERE a.name = %s AND r.name ILIKE %s
                    LIMIT 1
                """
                self.cursor.execute(query, (main_artist, f"%{search_title}%"))
                result = self.cursor.fetchone()

                if result:
                    break

            if result:
                # Get the full artist string for this credit
                artist_query = """
                    SELECT a.name, acn.join_phrase, acn.position
                    FROM artist_credit_name acn
                    JOIN artist a ON acn.artist = a.id
                    WHERE acn.artist_credit = %s
                    ORDER BY acn.position
                """
                self.cursor.execute(artist_query, (result['artist_credit_id'],))
                artists = self.cursor.fetchall()

                # Build the artist string
                artist_string = ""
                for artist in artists:
                    artist_string += artist['name']
                    if artist['join_phrase']:
                        artist_string += artist['join_phrase']

                return (str(result['artist_credit_id']), artist_string, result['recording_mbid'])

            return None

        except Exception as e:
            print(f"Error in find_artist_credit: {e}")
            return None

    def _parse_complex_collaboration(self, artist_string: str) -> Tuple[str, List[str]]:
        """
        Parse complex collaboration strings like "Pitbull ft. Ne-Yo, Afrojack & Nayer"
        Returns: (main_artist, [collaborators])
        """
        import re

        # Primary collaboration indicators
        primary_patterns = ['ft.', 'feat.', 'featuring']

        # Secondary collaboration indicators (need more careful handling)
        secondary_patterns = ['&', 'and']

        # Check if this is a collaboration
        is_collaboration = False
        split_pattern = None

        for pattern in primary_patterns:
            if pattern.lower() in artist_string.lower():
                is_collaboration = True
                # Find the actual pattern in the original string (case-insensitive)
                import re
                match = re.search(re.escape(pattern), artist_string, re.IGNORECASE)
                if match:
                    split_pattern = match.group(0)
                else:
                    split_pattern = pattern
                break

        # If no primary collaboration found, check secondary patterns
        if not is_collaboration:
            for pattern in secondary_patterns:
                # Use word boundaries to avoid splitting within words like "Orlando"
                import re
                pattern_regex = r'\b' + re.escape(pattern) + r'\b'
                if re.search(pattern_regex, artist_string, re.IGNORECASE):
                    # For secondary patterns, be more careful
                    # Check if this looks like a band name vs collaboration
                    parts = re.split(pattern_regex, artist_string, flags=re.IGNORECASE)
                    if len(parts) == 2:
                        part1 = parts[0].strip()
                        part2 = parts[1].strip()

                        # Check if this looks like a band name vs collaboration
                        # Band names typically have simple, short parts
                        # Collaborations often have more complex artist names
                        part1_words = len(part1.split())
                        part2_words = len(part2.split())

                        # Check if this looks like a band name vs collaboration
                        # Band names typically have simple, short parts
                        # Collaborations often have more complex artist names

                        # If one part has a comma, it's likely a collaboration
                        if ',' in part1 or ',' in part2:
                            is_collaboration = True
                            split_pattern = pattern
                            break

                        # For "&" and "and" separators, check against known band names
                        # Load band names from JSON file
                        try:
                            import json
                            import os
                            json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
                            with open(json_path, 'r') as f:
                                known_data = json.load(f)
                            known_band_names = {name.lower() for name in known_data.get('groups', [])}
                        except (FileNotFoundError, json.JSONDecodeError, KeyError):
                            # Fallback to a minimal list if file can't be loaded
                            known_band_names = {
                                "gerry and the pacemakers", "simon & garfunkel", "hall & oates",
                                "brooks & dunn", "the everly brothers"
                            }

                        if artist_string.lower() in known_band_names:
                            # This is a known band name, not a collaboration
                            continue

                        # For "and" separators, be more conservative
                        # Many band names use "and" (e.g., "Gerry and the Pacemakers")
                        if pattern.lower() == 'and':
                            # Only treat as collaboration if it looks like individual artists
                            # Band names often have "the" in the second part
                            if 'the ' in part2.lower():
                                # Likely a band name, skip
                                continue

                        # Likely a collaboration
                        is_collaboration = True
                        split_pattern = pattern
                        break

        if not is_collaboration:
            return (artist_string, [])

        # Split on the pattern
        parts = artist_string.split(split_pattern)
        if len(parts) < 2:
            return (artist_string, [])

        main_artist = parts[0].strip()
        collaborators_string = split_pattern.join(parts[1:]).strip()

        # Now parse the collaborators string which might have multiple separators
        collaborators = self._parse_collaborators(collaborators_string)

        return (main_artist, collaborators)

    def _generate_title_variations(self, title: str) -> List[str]:
        """
        Generate title variations by removing parenthetical content.
        Returns list of title variations to try.
        """
        import re

        search_titles = [title.strip()]

        # Remove complete parentheses (content)
        clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
        clean_title = clean_title.strip()
        if clean_title != title.strip() and clean_title:
            search_titles.append(clean_title)

        # Remove unmatched opening parenthesis at end
        clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
        clean_title2 = clean_title2.strip()
        if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
            search_titles.append(clean_title2)

        # Remove unmatched closing parenthesis at start
        clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
        clean_title3 = clean_title3.strip()
        if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
            search_titles.append(clean_title3)

        # Also try with specific karaoke patterns removed
        karaoke_patterns = [
            r'\s*\(Karaoke Version\)',
            r'\s*\(Karaoke\)',
            r'\s*\(Instrumental\)',
            r'\s*\(Backing Track\)',
        ]
        for pattern in karaoke_patterns:
            karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
            karaoke_clean = karaoke_clean.strip()
            if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
                search_titles.append(karaoke_clean)

        return search_titles

    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
        """
        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
        """
        import re

        if not collaborators_string:
            return []

        # Split on common separators: comma, ampersand, "and"
        # Use regex to handle multiple separators
        # Use word boundaries to avoid splitting within words like "Orlando"
        separators = r'[,&]|\b(?:and)\b'

        # Split and clean up each part
        parts = re.split(separators, collaborators_string, flags=re.IGNORECASE)

        collaborators = []
        for part in parts:
            part = part.strip()
            if part:  # Skip empty parts
                collaborators.append(part)

        return collaborators