Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-07-31 17:14:04 -05:00 · 2025-07-31 17:14:04 -05:00 · eeeb0bfd64
commit eeeb0bfd64
parent dc396a346f
2 changed files with 339 additions and 56 deletions
--- a/data/known_artists.json
+++ b/data/known_artists.json
@ -5,10 +5,248 @@
    "ft ": "feat. ",
    "featuring": "feat."
  },
  "groups": [
    "Abbott & Costello",
    "Adam & Joe",
    "Al & Smoot",
    "Alfie & Owen",
    "Allen & Grier",
    "Amos 'n' Andy",
    "Andrews Sisters",
    "AnnenMayKantereit",
    "Ashford & Simpson",
    "B. J. Thomas & The Triumphs",
    "Bachman-Turner Overdrive",
    "Baden & Vin",
    "Barenaked Ladies",
    "Barnum & Bailey",
    "Barry, Robin and Maurice Gibb",
    "Beavis & Butt-Head",
    "Ben & Ben",
    "Benny & Joon",
    "Bert & Ernie",
    "Bill & Gloria Gaither",
    "Bill & Ted",
    "Bill Haley and His Comets",
    "Billy & The Essentials",
    "Bob & Earl",
    "Bob and Tom",
    "Bobby & The Midnites",
    "Booker T. & the M.G.'s",
    "Brooks & Dunn",
    "Brown & Carney",
    "Bucky & John Pizzarelli",
    "Buddy & Julie Miller",
    "Buddy Holly and The Crickets",
    "Burt & The Backbeats",
    "Captain & Tennille",
    "Carly and Lucy Simon",
    "Carpenter & Ford",
    "Cece & Bebe Winans",
    "Chad & Jeremy",
    "Charles & Eddie",
    "Cheech & Chong",
    "Cheryl & Pam Johnson",
    "Chris & Cosey",
    "Chris & Rich Robinson",
    "Climax Blues Band",
    "Crosby, Stills & Nash",
    "Crosby, Stills, Nash & Young",
    "Darrell and Donny",
    "Daryl Hall & John Oates",
    "Dave & Ansell Collins",
    "Dave and Sugar",
    "David & David",
    "Dawn and Tony Orlando",
    "Dean & Britta",
    "Dean & Jerry",
    "Deke Dickerson & The Ecco-Fonics",
    "Dick & Dee Dee",
    "Dion and The Belmonts",
    "DJ Jazzy Jeff & The Fresh Prince",
    "Don and Juan",
    "Dr. Hook & The Medicine Show",
    "Ed & Steve Masley",
    "Eddie and The Hot Rods",
    "Emerson, Lake and Palmer",
    "Emerson, Lake and Powell",
    "England Dan & John Ford Coley",
    "Eric & The Good Good Feeling",
    "Eric B. & Rakim",
    "Ernie & Bert",
    "Eurythmics",
    "Everly Brothers",
    "Fischer-Z",
    "Fleming & John",
    "Flo & Eddie",
    "Foster & Lloyd",
    "Fred and Ginger",
    "Funk Brothers",
    "Gabe & Chloe",
    "Gallagher & Lyle",
    "Gamble & Huff",
    "Gene and Debbe",
    "George and Ira Gershwin",
    "Gerry and the Pacemakers",
    "Gogol Bordello",
    "Hall & Oates",
    "Hank & Frank",
    "Hank and Lewie Wickham",
    "Hank Williams Jr. and The Cheatin' Hearts",
    "Hankshaw and Fanny",
    "Hothouse Flowers",
    "Hues Corporation",
    "Ike & Tina Turner",
    "Jack & Jack",
    "James and Bobby Purify",
    "Jan & Dean",
    "Jane and Herondale",
    "Janie and The Greyhounds",
    "Jay and The Americans",
    "Jay and The Techniques",
    "Jermaine & Pia",
    "Jerry Lee Lewis and His Pumping Piano",
    "Jett & The Blackhearts",
    "Jody & The Jerms",
    "Joe and Eddie",
    "John & Jacob",
    "John and Mary",
    "John and Yoko",
    "Johnnie and Jack",
    "Johnny and The Hurricanes",
    "Johnny and The Moondogs",
    "Jon & Vangelis",
    "Jonas Brothers",
    "Josh and The Empty Pockets",
    "June and Johnny",
    "K-Ci & JoJo",
    "KC & The Sunshine Band",
    "Kate and Anna McGarrigle",
    "Kid 'n Play",
    "Kiki & Herb",
    "Kim & The Cadillacs",
    "Kingston Trio",
    "Kool & The Gang",
    "Ladysmith Black Mambazo",
    "Laurie & The Sighs",
    "Lee and Sara",
    "Les Paul & Mary Ford",
    "Loggins & Messina",
    "Lulu & The Luvvers",
    "Macklemore & Ryan Lewis",
    "Manny & The Casanovas",
    "Marcy & Zina",
    "Mark & Almond",
    "Mark & Clark Band",
    "Marty & Elayne",
    "Max & Mitch",
    "Mick & Keith",
    "Mickey & Sylvia",
    "Mike & The Mechanics",
    "Mills Brothers",
    "Moe & Joe",
    "Monty & The Pythons",
    "Mumford & Sons",
    "Nate and James",
    "Nico & Vinz",
    "O'Day & Simone",
    "Otis & Carla",
    "OutKast",
    "Paul & Linda McCartney",
    "Paul & Paula",
    "Peaches & Herb",
    "Peter & Gordon",
    "Peter, Paul and Mary",
    "Petty & The Heartbreakers",
    "Phil & The Firecat",
    "Phillips & Drew",
    "Pomus & Shuman",
    "Punch Brothers",
    "Randy and The Rainbows",
    "Ray & Anita",
    "Red and Kathy",
    "Reid & Simone",
    "Richie & The Young Lions",
    "Ricky and The Rockets",
    "Righteous Brothers",
    "Rob & Fab",
    "Robert and Johnny",
    "Rodgers and Hammerstein",
    "Rodgers and Hart",
    "Roe & Burns",
    "Rufus & Chaka Khan",
    "Sam & Dave",
    "Santo & Johnny",
    "Sasha & John Digweed",
    "Scott & Aimee",
    "Seals & Crofts",
    "Sergio & Odair Assad",
    "Sha Na Na",
    "Shemekia Copeland & The Alligators",
    "Shirley and Lee",
    "Simon & Garfunkel",
    "Solas",
    "Sonny & Cher",
    "Steeleye Span",
    "Steely Dan",
    "Steve & Eydie",
    "Syd & The Strawbs",
    "Tammi & Marvin",
    "The Avett Brothers",
    "The B-52s",
    "The Bacon Brothers",
    "The Bellamy Brothers",
    "The Blackwood Brothers",
    "The Blues Brothers",
    "The Carter Family",
    "The Chemical Brothers",
    "The Chuck Wagon Gang",
    "The Clark Sisters",
    "The Coasters",
    "The Cook Brothers",
    "The Delmore Brothers",
    "The Dillards",
    "The Doobie Brothers",
    "The Everly Brothers",
    "The Flying Burrito Brothers",
    "The Gatlin Brothers",
    "The Isley Brothers",
    "The Judds",
    "The Kalin Twins",
    "The Louvin Brothers",
    "The Madden Brothers",
    "The Neville Brothers",
    "The Oak Ridge Boys",
    "The Osborne Brothers",
    "The Pointer Sisters",
    "The Proclaimers",
    "The Stanley Brothers",
    "The Statler Brothers",
    "The Walker Brothers",
    "The Wilburn Brothers",
    "Thompson Twins",
    "Tim & Mollie O'Brien",
    "Tom and Jerry",
    "Tom and Viv",
    "Tom Petty & The Heartbreakers",
    "Tommy and The Twilights",
    "Tony and The Graduates",
    "Tucker & Tanya",
    "Vera & The Beaters",
    "Vince & Linda",
    "Waylon & Willie",
    "Wendy & Lisa",
    "Wham!",
    "Will & The Bushmen",
    "Willie & The Poor Boys",
    "Wilson Phillips",
    "Womack & Womack",
    "Zapp & Roger"
  ],
  "metadata": {
-    "description": "Name variations for artist name normalization",
+    "description": "Name variations and known band names for artist name normalization",
    "last_updated": "2024-12-19",
-    "version": "3.0",
+    "version": "3.1",
-    "note": "Removed artists section - now using database search instead"
+    "note": "Added groups section with 200+ known band names that should not be split as collaborations"
  }
 } 
--- a/src/api/database.py
+++ b/src/api/database.py
@ -167,6 +167,32 @@ class MusicBrainzDatabase:
                        ))
                        return (result['name'], result['gid'], 0.98)
            # Try matching with "&" vs "and" variations
            if '&' in main_artist:
                and_variant = main_artist.replace('&', 'and')
                query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                self.cursor.execute(query, (and_variant,))
                result = self.cursor.fetchone()
                if result:
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=0.97
                    ))
                    return (result['name'], result['gid'], 0.97)
            elif ' and ' in main_artist.lower():
                ampersand_variant = main_artist.replace(' and ', ' & ')
                query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
                self.cursor.execute(query, (ampersand_variant,))
                result = self.cursor.fetchone()
                if result:
                    print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
                        original=artist_name,
                        matched=result['name'],
                        score=0.97
                    ))
                    return (result['name'], result['gid'], 0.97)
            # Try fuzzy search
            # Use more specific patterns to avoid false matches
            # Also try different dash characters and name variations
@ -345,29 +371,8 @@ class MusicBrainzDatabase:
            return None
        try:
-            # Try with full title first, then with parenthetical content removed
+            # Generate title variations using the helper method
-            search_titles = [title.strip()]
+            search_titles = self._generate_title_variations(title)
            # Create version with parenthetical content removed
            import re
            # Remove all parenthetical content like (clean), (remix), (karaoke), etc.
            clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
            clean_title = clean_title.strip()
            if clean_title != title.strip():
                search_titles.append(clean_title)
            # Also try with specific karaoke patterns removed
            karaoke_patterns = [
                r'\s*\(Karaoke Version\)',
                r'\s*\(Karaoke\)',
                r'\s*\(Instrumental\)',
                r'\s*\(Backing Track\)',
            ]
            for pattern in karaoke_patterns:
                karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
                karaoke_clean = karaoke_clean.strip()
                if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
                    search_titles.append(karaoke_clean)
            if artist_mbid:
                # Try to find recording by artist and title with all variations
@ -548,29 +553,8 @@ class MusicBrainzDatabase:
            main_artist, collaborators = self._parse_complex_collaboration(clean_name)
            collaboration = len(collaborators) > 0
-            # Try with full title first, then with parenthetical content removed
+            # Generate title variations using the helper method
-            search_titles = [title.strip()]
+            search_titles = self._generate_title_variations(title)
            # Create version with parenthetical content removed
            import re
            # Remove all parenthetical content like (clean), (remix), (karaoke), etc.
            clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
            clean_title = clean_title.strip()
            if clean_title != title.strip():
                search_titles.append(clean_title)
            # Also try with specific karaoke patterns removed
            karaoke_patterns = [
                r'\s*\(Karaoke Version\)',
                r'\s*\(Karaoke\)',
                r'\s*\(Instrumental\)',
                r'\s*\(Backing Track\)',
            ]
            for pattern in karaoke_patterns:
                karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
                karaoke_clean = karaoke_clean.strip()
                if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
                    search_titles.append(karaoke_clean)
            # Try to find recording with artist credit using all title variations
            for search_title in search_titles:
@ -675,13 +659,32 @@ class MusicBrainzDatabase:
                            split_pattern = pattern
                            break
-                        # For "&" separators, be more permissive
+                        # For "&" and "and" separators, check against known band names
-                        # Most "&" separators in music are collaborations, not band names
+                        # Load band names from JSON file
-                        # Only treat very obvious band names as non-collaborations
+                        try:
-                        if part1_words == 1 and part2_words == 1:
+                            import json
-                            # Check for very obvious band name patterns
+                            import os
-                            obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
+                            json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
-                            if artist_string.lower() in obvious_band_names:
+                            with open(json_path, 'r') as f:
                                known_data = json.load(f)
                            known_band_names = {name.lower() for name in known_data.get('groups', [])}
                        except (FileNotFoundError, json.JSONDecodeError, KeyError):
                            # Fallback to a minimal list if file can't be loaded
                            known_band_names = {
                                "gerry and the pacemakers", "simon & garfunkel", "hall & oates", 
                                "brooks & dunn", "the everly brothers"
                            }
                        if artist_string.lower() in known_band_names:
                            # This is a known band name, not a collaboration
                            continue
                        # For "and" separators, be more conservative
                        # Many band names use "and" (e.g., "Gerry and the Pacemakers")
                        if pattern.lower() == 'and':
                            # Only treat as collaboration if it looks like individual artists
                            # Band names often have "the" in the second part
                            if 'the ' in part2.lower():
                                # Likely a band name, skip
                                continue
@ -706,6 +709,48 @@ class MusicBrainzDatabase:
        return (main_artist, collaborators)
    def _generate_title_variations(self, title: str) -> List[str]:
        """
        Generate title variations by removing parenthetical content.
        Returns list of title variations to try.
        """
        import re
        search_titles = [title.strip()]
        # Remove complete parentheses (content)
        clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
        clean_title = clean_title.strip()
        if clean_title != title.strip() and clean_title:
            search_titles.append(clean_title)
        # Remove unmatched opening parenthesis at end
        clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
        clean_title2 = clean_title2.strip()
        if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
            search_titles.append(clean_title2)
        # Remove unmatched closing parenthesis at start
        clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
        clean_title3 = clean_title3.strip()
        if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
            search_titles.append(clean_title3)
        # Also try with specific karaoke patterns removed
        karaoke_patterns = [
            r'\s*\(Karaoke Version\)',
            r'\s*\(Karaoke\)',
            r'\s*\(Instrumental\)',
            r'\s*\(Backing Track\)',
        ]
        for pattern in karaoke_patterns:
            karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
            karaoke_clean = karaoke_clean.strip()
            if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
                search_titles.append(karaoke_clean)
        return search_titles
    def _parse_collaborators(self, collaborators_string: str) -> List[str]:
        """
        Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists