diff --git a/data/known_artists.json b/data/known_artists.json index c6b3271..87da653 100644 --- a/data/known_artists.json +++ b/data/known_artists.json @@ -5,10 +5,248 @@ "ft ": "feat. ", "featuring": "feat." }, + "groups": [ + "Abbott & Costello", + "Adam & Joe", + "Al & Smoot", + "Alfie & Owen", + "Allen & Grier", + "Amos 'n' Andy", + "Andrews Sisters", + "AnnenMayKantereit", + "Ashford & Simpson", + "B. J. Thomas & The Triumphs", + "Bachman-Turner Overdrive", + "Baden & Vin", + "Barenaked Ladies", + "Barnum & Bailey", + "Barry, Robin and Maurice Gibb", + "Beavis & Butt-Head", + "Ben & Ben", + "Benny & Joon", + "Bert & Ernie", + "Bill & Gloria Gaither", + "Bill & Ted", + "Bill Haley and His Comets", + "Billy & The Essentials", + "Bob & Earl", + "Bob and Tom", + "Bobby & The Midnites", + "Booker T. & the M.G.'s", + "Brooks & Dunn", + "Brown & Carney", + "Bucky & John Pizzarelli", + "Buddy & Julie Miller", + "Buddy Holly and The Crickets", + "Burt & The Backbeats", + "Captain & Tennille", + "Carly and Lucy Simon", + "Carpenter & Ford", + "Cece & Bebe Winans", + "Chad & Jeremy", + "Charles & Eddie", + "Cheech & Chong", + "Cheryl & Pam Johnson", + "Chris & Cosey", + "Chris & Rich Robinson", + "Climax Blues Band", + "Crosby, Stills & Nash", + "Crosby, Stills, Nash & Young", + "Darrell and Donny", + "Daryl Hall & John Oates", + "Dave & Ansell Collins", + "Dave and Sugar", + "David & David", + "Dawn and Tony Orlando", + "Dean & Britta", + "Dean & Jerry", + "Deke Dickerson & The Ecco-Fonics", + "Dick & Dee Dee", + "Dion and The Belmonts", + "DJ Jazzy Jeff & The Fresh Prince", + "Don and Juan", + "Dr. Hook & The Medicine Show", + "Ed & Steve Masley", + "Eddie and The Hot Rods", + "Emerson, Lake and Palmer", + "Emerson, Lake and Powell", + "England Dan & John Ford Coley", + "Eric & The Good Good Feeling", + "Eric B. & Rakim", + "Ernie & Bert", + "Eurythmics", + "Everly Brothers", + "Fischer-Z", + "Fleming & John", + "Flo & Eddie", + "Foster & Lloyd", + "Fred and Ginger", + "Funk Brothers", + "Gabe & Chloe", + "Gallagher & Lyle", + "Gamble & Huff", + "Gene and Debbe", + "George and Ira Gershwin", + "Gerry and the Pacemakers", + "Gogol Bordello", + "Hall & Oates", + "Hank & Frank", + "Hank and Lewie Wickham", + "Hank Williams Jr. and The Cheatin' Hearts", + "Hankshaw and Fanny", + "Hothouse Flowers", + "Hues Corporation", + "Ike & Tina Turner", + "Jack & Jack", + "James and Bobby Purify", + "Jan & Dean", + "Jane and Herondale", + "Janie and The Greyhounds", + "Jay and The Americans", + "Jay and The Techniques", + "Jermaine & Pia", + "Jerry Lee Lewis and His Pumping Piano", + "Jett & The Blackhearts", + "Jody & The Jerms", + "Joe and Eddie", + "John & Jacob", + "John and Mary", + "John and Yoko", + "Johnnie and Jack", + "Johnny and The Hurricanes", + "Johnny and The Moondogs", + "Jon & Vangelis", + "Jonas Brothers", + "Josh and The Empty Pockets", + "June and Johnny", + "K-Ci & JoJo", + "KC & The Sunshine Band", + "Kate and Anna McGarrigle", + "Kid 'n Play", + "Kiki & Herb", + "Kim & The Cadillacs", + "Kingston Trio", + "Kool & The Gang", + "Ladysmith Black Mambazo", + "Laurie & The Sighs", + "Lee and Sara", + "Les Paul & Mary Ford", + "Loggins & Messina", + "Lulu & The Luvvers", + "Macklemore & Ryan Lewis", + "Manny & The Casanovas", + "Marcy & Zina", + "Mark & Almond", + "Mark & Clark Band", + "Marty & Elayne", + "Max & Mitch", + "Mick & Keith", + "Mickey & Sylvia", + "Mike & The Mechanics", + "Mills Brothers", + "Moe & Joe", + "Monty & The Pythons", + "Mumford & Sons", + "Nate and James", + "Nico & Vinz", + "O'Day & Simone", + "Otis & Carla", + "OutKast", + "Paul & Linda McCartney", + "Paul & Paula", + "Peaches & Herb", + "Peter & Gordon", + "Peter, Paul and Mary", + "Petty & The Heartbreakers", + "Phil & The Firecat", + "Phillips & Drew", + "Pomus & Shuman", + "Punch Brothers", + "Randy and The Rainbows", + "Ray & Anita", + "Red and Kathy", + "Reid & Simone", + "Richie & The Young Lions", + "Ricky and The Rockets", + "Righteous Brothers", + "Rob & Fab", + "Robert and Johnny", + "Rodgers and Hammerstein", + "Rodgers and Hart", + "Roe & Burns", + "Rufus & Chaka Khan", + "Sam & Dave", + "Santo & Johnny", + "Sasha & John Digweed", + "Scott & Aimee", + "Seals & Crofts", + "Sergio & Odair Assad", + "Sha Na Na", + "Shemekia Copeland & The Alligators", + "Shirley and Lee", + "Simon & Garfunkel", + "Solas", + "Sonny & Cher", + "Steeleye Span", + "Steely Dan", + "Steve & Eydie", + "Syd & The Strawbs", + "Tammi & Marvin", + "The Avett Brothers", + "The B-52s", + "The Bacon Brothers", + "The Bellamy Brothers", + "The Blackwood Brothers", + "The Blues Brothers", + "The Carter Family", + "The Chemical Brothers", + "The Chuck Wagon Gang", + "The Clark Sisters", + "The Coasters", + "The Cook Brothers", + "The Delmore Brothers", + "The Dillards", + "The Doobie Brothers", + "The Everly Brothers", + "The Flying Burrito Brothers", + "The Gatlin Brothers", + "The Isley Brothers", + "The Judds", + "The Kalin Twins", + "The Louvin Brothers", + "The Madden Brothers", + "The Neville Brothers", + "The Oak Ridge Boys", + "The Osborne Brothers", + "The Pointer Sisters", + "The Proclaimers", + "The Stanley Brothers", + "The Statler Brothers", + "The Walker Brothers", + "The Wilburn Brothers", + "Thompson Twins", + "Tim & Mollie O'Brien", + "Tom and Jerry", + "Tom and Viv", + "Tom Petty & The Heartbreakers", + "Tommy and The Twilights", + "Tony and The Graduates", + "Tucker & Tanya", + "Vera & The Beaters", + "Vince & Linda", + "Waylon & Willie", + "Wendy & Lisa", + "Wham!", + "Will & The Bushmen", + "Willie & The Poor Boys", + "Wilson Phillips", + "Womack & Womack", + "Zapp & Roger" + ], "metadata": { - "description": "Name variations for artist name normalization", + "description": "Name variations and known band names for artist name normalization", "last_updated": "2024-12-19", - "version": "3.0", - "note": "Removed artists section - now using database search instead" + "version": "3.1", + "note": "Added groups section with 200+ known band names that should not be split as collaborations" } } \ No newline at end of file diff --git a/src/api/database.py b/src/api/database.py index 82673e6..987f3e4 100644 --- a/src/api/database.py +++ b/src/api/database.py @@ -167,6 +167,32 @@ class MusicBrainzDatabase: )) return (result['name'], result['gid'], 0.98) + # Try matching with "&" vs "and" variations + if '&' in main_artist: + and_variant = main_artist.replace('&', 'and') + query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1" + self.cursor.execute(query, (and_variant,)) + result = self.cursor.fetchone() + if result: + print(SUCCESS_MESSAGES['fuzzy_match_found'].format( + original=artist_name, + matched=result['name'], + score=0.97 + )) + return (result['name'], result['gid'], 0.97) + elif ' and ' in main_artist.lower(): + ampersand_variant = main_artist.replace(' and ', ' & ') + query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1" + self.cursor.execute(query, (ampersand_variant,)) + result = self.cursor.fetchone() + if result: + print(SUCCESS_MESSAGES['fuzzy_match_found'].format( + original=artist_name, + matched=result['name'], + score=0.97 + )) + return (result['name'], result['gid'], 0.97) + # Try fuzzy search # Use more specific patterns to avoid false matches # Also try different dash characters and name variations @@ -345,29 +371,8 @@ class MusicBrainzDatabase: return None try: - # Try with full title first, then with parenthetical content removed - search_titles = [title.strip()] - - # Create version with parenthetical content removed - import re - # Remove all parenthetical content like (clean), (remix), (karaoke), etc. - clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) - clean_title = clean_title.strip() - if clean_title != title.strip(): - search_titles.append(clean_title) - - # Also try with specific karaoke patterns removed - karaoke_patterns = [ - r'\s*\(Karaoke Version\)', - r'\s*\(Karaoke\)', - r'\s*\(Instrumental\)', - r'\s*\(Backing Track\)', - ] - for pattern in karaoke_patterns: - karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - karaoke_clean = karaoke_clean.strip() - if karaoke_clean != title.strip() and karaoke_clean not in search_titles: - search_titles.append(karaoke_clean) + # Generate title variations using the helper method + search_titles = self._generate_title_variations(title) if artist_mbid: # Try to find recording by artist and title with all variations @@ -548,29 +553,8 @@ class MusicBrainzDatabase: main_artist, collaborators = self._parse_complex_collaboration(clean_name) collaboration = len(collaborators) > 0 - # Try with full title first, then with parenthetical content removed - search_titles = [title.strip()] - - # Create version with parenthetical content removed - import re - # Remove all parenthetical content like (clean), (remix), (karaoke), etc. - clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) - clean_title = clean_title.strip() - if clean_title != title.strip(): - search_titles.append(clean_title) - - # Also try with specific karaoke patterns removed - karaoke_patterns = [ - r'\s*\(Karaoke Version\)', - r'\s*\(Karaoke\)', - r'\s*\(Instrumental\)', - r'\s*\(Backing Track\)', - ] - for pattern in karaoke_patterns: - karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) - karaoke_clean = karaoke_clean.strip() - if karaoke_clean != title.strip() and karaoke_clean not in search_titles: - search_titles.append(karaoke_clean) + # Generate title variations using the helper method + search_titles = self._generate_title_variations(title) # Try to find recording with artist credit using all title variations for search_title in search_titles: @@ -675,13 +659,32 @@ class MusicBrainzDatabase: split_pattern = pattern break - # For "&" separators, be more permissive - # Most "&" separators in music are collaborations, not band names - # Only treat very obvious band names as non-collaborations - if part1_words == 1 and part2_words == 1: - # Check for very obvious band name patterns - obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn'] - if artist_string.lower() in obvious_band_names: + # For "&" and "and" separators, check against known band names + # Load band names from JSON file + try: + import json + import os + json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json') + with open(json_path, 'r') as f: + known_data = json.load(f) + known_band_names = {name.lower() for name in known_data.get('groups', [])} + except (FileNotFoundError, json.JSONDecodeError, KeyError): + # Fallback to a minimal list if file can't be loaded + known_band_names = { + "gerry and the pacemakers", "simon & garfunkel", "hall & oates", + "brooks & dunn", "the everly brothers" + } + + if artist_string.lower() in known_band_names: + # This is a known band name, not a collaboration + continue + + # For "and" separators, be more conservative + # Many band names use "and" (e.g., "Gerry and the Pacemakers") + if pattern.lower() == 'and': + # Only treat as collaboration if it looks like individual artists + # Band names often have "the" in the second part + if 'the ' in part2.lower(): # Likely a band name, skip continue @@ -706,6 +709,48 @@ class MusicBrainzDatabase: return (main_artist, collaborators) + def _generate_title_variations(self, title: str) -> List[str]: + """ + Generate title variations by removing parenthetical content. + Returns list of title variations to try. + """ + import re + + search_titles = [title.strip()] + + # Remove complete parentheses (content) + clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip()) + clean_title = clean_title.strip() + if clean_title != title.strip() and clean_title: + search_titles.append(clean_title) + + # Remove unmatched opening parenthesis at end + clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip()) + clean_title2 = clean_title2.strip() + if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles: + search_titles.append(clean_title2) + + # Remove unmatched closing parenthesis at start + clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip()) + clean_title3 = clean_title3.strip() + if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles: + search_titles.append(clean_title3) + + # Also try with specific karaoke patterns removed + karaoke_patterns = [ + r'\s*\(Karaoke Version\)', + r'\s*\(Karaoke\)', + r'\s*\(Instrumental\)', + r'\s*\(Backing Track\)', + ] + for pattern in karaoke_patterns: + karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE) + karaoke_clean = karaoke_clean.strip() + if karaoke_clean != title.strip() and karaoke_clean not in search_titles: + search_titles.append(karaoke_clean) + + return search_titles + def _parse_collaborators(self, collaborators_string: str) -> List[str]: """ Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists