Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
dc396a346f
commit
eeeb0bfd64
@ -5,10 +5,248 @@
|
||||
"ft ": "feat. ",
|
||||
"featuring": "feat."
|
||||
},
|
||||
"groups": [
|
||||
"Abbott & Costello",
|
||||
"Adam & Joe",
|
||||
"Al & Smoot",
|
||||
"Alfie & Owen",
|
||||
"Allen & Grier",
|
||||
"Amos 'n' Andy",
|
||||
"Andrews Sisters",
|
||||
"AnnenMayKantereit",
|
||||
"Ashford & Simpson",
|
||||
"B. J. Thomas & The Triumphs",
|
||||
"Bachman-Turner Overdrive",
|
||||
"Baden & Vin",
|
||||
"Barenaked Ladies",
|
||||
"Barnum & Bailey",
|
||||
"Barry, Robin and Maurice Gibb",
|
||||
"Beavis & Butt-Head",
|
||||
"Ben & Ben",
|
||||
"Benny & Joon",
|
||||
"Bert & Ernie",
|
||||
"Bill & Gloria Gaither",
|
||||
"Bill & Ted",
|
||||
"Bill Haley and His Comets",
|
||||
"Billy & The Essentials",
|
||||
"Bob & Earl",
|
||||
"Bob and Tom",
|
||||
"Bobby & The Midnites",
|
||||
"Booker T. & the M.G.'s",
|
||||
"Brooks & Dunn",
|
||||
"Brown & Carney",
|
||||
"Bucky & John Pizzarelli",
|
||||
"Buddy & Julie Miller",
|
||||
"Buddy Holly and The Crickets",
|
||||
"Burt & The Backbeats",
|
||||
"Captain & Tennille",
|
||||
"Carly and Lucy Simon",
|
||||
"Carpenter & Ford",
|
||||
"Cece & Bebe Winans",
|
||||
"Chad & Jeremy",
|
||||
"Charles & Eddie",
|
||||
"Cheech & Chong",
|
||||
"Cheryl & Pam Johnson",
|
||||
"Chris & Cosey",
|
||||
"Chris & Rich Robinson",
|
||||
"Climax Blues Band",
|
||||
"Crosby, Stills & Nash",
|
||||
"Crosby, Stills, Nash & Young",
|
||||
"Darrell and Donny",
|
||||
"Daryl Hall & John Oates",
|
||||
"Dave & Ansell Collins",
|
||||
"Dave and Sugar",
|
||||
"David & David",
|
||||
"Dawn and Tony Orlando",
|
||||
"Dean & Britta",
|
||||
"Dean & Jerry",
|
||||
"Deke Dickerson & The Ecco-Fonics",
|
||||
"Dick & Dee Dee",
|
||||
"Dion and The Belmonts",
|
||||
"DJ Jazzy Jeff & The Fresh Prince",
|
||||
"Don and Juan",
|
||||
"Dr. Hook & The Medicine Show",
|
||||
"Ed & Steve Masley",
|
||||
"Eddie and The Hot Rods",
|
||||
"Emerson, Lake and Palmer",
|
||||
"Emerson, Lake and Powell",
|
||||
"England Dan & John Ford Coley",
|
||||
"Eric & The Good Good Feeling",
|
||||
"Eric B. & Rakim",
|
||||
"Ernie & Bert",
|
||||
"Eurythmics",
|
||||
"Everly Brothers",
|
||||
"Fischer-Z",
|
||||
"Fleming & John",
|
||||
"Flo & Eddie",
|
||||
"Foster & Lloyd",
|
||||
"Fred and Ginger",
|
||||
"Funk Brothers",
|
||||
"Gabe & Chloe",
|
||||
"Gallagher & Lyle",
|
||||
"Gamble & Huff",
|
||||
"Gene and Debbe",
|
||||
"George and Ira Gershwin",
|
||||
"Gerry and the Pacemakers",
|
||||
"Gogol Bordello",
|
||||
"Hall & Oates",
|
||||
"Hank & Frank",
|
||||
"Hank and Lewie Wickham",
|
||||
"Hank Williams Jr. and The Cheatin' Hearts",
|
||||
"Hankshaw and Fanny",
|
||||
"Hothouse Flowers",
|
||||
"Hues Corporation",
|
||||
"Ike & Tina Turner",
|
||||
"Jack & Jack",
|
||||
"James and Bobby Purify",
|
||||
"Jan & Dean",
|
||||
"Jane and Herondale",
|
||||
"Janie and The Greyhounds",
|
||||
"Jay and The Americans",
|
||||
"Jay and The Techniques",
|
||||
"Jermaine & Pia",
|
||||
"Jerry Lee Lewis and His Pumping Piano",
|
||||
"Jett & The Blackhearts",
|
||||
"Jody & The Jerms",
|
||||
"Joe and Eddie",
|
||||
"John & Jacob",
|
||||
"John and Mary",
|
||||
"John and Yoko",
|
||||
"Johnnie and Jack",
|
||||
"Johnny and The Hurricanes",
|
||||
"Johnny and The Moondogs",
|
||||
"Jon & Vangelis",
|
||||
"Jonas Brothers",
|
||||
"Josh and The Empty Pockets",
|
||||
"June and Johnny",
|
||||
"K-Ci & JoJo",
|
||||
"KC & The Sunshine Band",
|
||||
"Kate and Anna McGarrigle",
|
||||
"Kid 'n Play",
|
||||
"Kiki & Herb",
|
||||
"Kim & The Cadillacs",
|
||||
"Kingston Trio",
|
||||
"Kool & The Gang",
|
||||
"Ladysmith Black Mambazo",
|
||||
"Laurie & The Sighs",
|
||||
"Lee and Sara",
|
||||
"Les Paul & Mary Ford",
|
||||
"Loggins & Messina",
|
||||
"Lulu & The Luvvers",
|
||||
"Macklemore & Ryan Lewis",
|
||||
"Manny & The Casanovas",
|
||||
"Marcy & Zina",
|
||||
"Mark & Almond",
|
||||
"Mark & Clark Band",
|
||||
"Marty & Elayne",
|
||||
"Max & Mitch",
|
||||
"Mick & Keith",
|
||||
"Mickey & Sylvia",
|
||||
"Mike & The Mechanics",
|
||||
"Mills Brothers",
|
||||
"Moe & Joe",
|
||||
"Monty & The Pythons",
|
||||
"Mumford & Sons",
|
||||
"Nate and James",
|
||||
"Nico & Vinz",
|
||||
"O'Day & Simone",
|
||||
"Otis & Carla",
|
||||
"OutKast",
|
||||
"Paul & Linda McCartney",
|
||||
"Paul & Paula",
|
||||
"Peaches & Herb",
|
||||
"Peter & Gordon",
|
||||
"Peter, Paul and Mary",
|
||||
"Petty & The Heartbreakers",
|
||||
"Phil & The Firecat",
|
||||
"Phillips & Drew",
|
||||
"Pomus & Shuman",
|
||||
"Punch Brothers",
|
||||
"Randy and The Rainbows",
|
||||
"Ray & Anita",
|
||||
"Red and Kathy",
|
||||
"Reid & Simone",
|
||||
"Richie & The Young Lions",
|
||||
"Ricky and The Rockets",
|
||||
"Righteous Brothers",
|
||||
"Rob & Fab",
|
||||
"Robert and Johnny",
|
||||
"Rodgers and Hammerstein",
|
||||
"Rodgers and Hart",
|
||||
"Roe & Burns",
|
||||
"Rufus & Chaka Khan",
|
||||
"Sam & Dave",
|
||||
"Santo & Johnny",
|
||||
"Sasha & John Digweed",
|
||||
"Scott & Aimee",
|
||||
"Seals & Crofts",
|
||||
"Sergio & Odair Assad",
|
||||
"Sha Na Na",
|
||||
"Shemekia Copeland & The Alligators",
|
||||
"Shirley and Lee",
|
||||
"Simon & Garfunkel",
|
||||
"Solas",
|
||||
"Sonny & Cher",
|
||||
"Steeleye Span",
|
||||
"Steely Dan",
|
||||
"Steve & Eydie",
|
||||
"Syd & The Strawbs",
|
||||
"Tammi & Marvin",
|
||||
"The Avett Brothers",
|
||||
"The B-52s",
|
||||
"The Bacon Brothers",
|
||||
"The Bellamy Brothers",
|
||||
"The Blackwood Brothers",
|
||||
"The Blues Brothers",
|
||||
"The Carter Family",
|
||||
"The Chemical Brothers",
|
||||
"The Chuck Wagon Gang",
|
||||
"The Clark Sisters",
|
||||
"The Coasters",
|
||||
"The Cook Brothers",
|
||||
"The Delmore Brothers",
|
||||
"The Dillards",
|
||||
"The Doobie Brothers",
|
||||
"The Everly Brothers",
|
||||
"The Flying Burrito Brothers",
|
||||
"The Gatlin Brothers",
|
||||
"The Isley Brothers",
|
||||
"The Judds",
|
||||
"The Kalin Twins",
|
||||
"The Louvin Brothers",
|
||||
"The Madden Brothers",
|
||||
"The Neville Brothers",
|
||||
"The Oak Ridge Boys",
|
||||
"The Osborne Brothers",
|
||||
"The Pointer Sisters",
|
||||
"The Proclaimers",
|
||||
"The Stanley Brothers",
|
||||
"The Statler Brothers",
|
||||
"The Walker Brothers",
|
||||
"The Wilburn Brothers",
|
||||
"Thompson Twins",
|
||||
"Tim & Mollie O'Brien",
|
||||
"Tom and Jerry",
|
||||
"Tom and Viv",
|
||||
"Tom Petty & The Heartbreakers",
|
||||
"Tommy and The Twilights",
|
||||
"Tony and The Graduates",
|
||||
"Tucker & Tanya",
|
||||
"Vera & The Beaters",
|
||||
"Vince & Linda",
|
||||
"Waylon & Willie",
|
||||
"Wendy & Lisa",
|
||||
"Wham!",
|
||||
"Will & The Bushmen",
|
||||
"Willie & The Poor Boys",
|
||||
"Wilson Phillips",
|
||||
"Womack & Womack",
|
||||
"Zapp & Roger"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Name variations for artist name normalization",
|
||||
"description": "Name variations and known band names for artist name normalization",
|
||||
"last_updated": "2024-12-19",
|
||||
"version": "3.0",
|
||||
"note": "Removed artists section - now using database search instead"
|
||||
"version": "3.1",
|
||||
"note": "Added groups section with 200+ known band names that should not be split as collaborations"
|
||||
}
|
||||
}
|
||||
@ -167,6 +167,32 @@ class MusicBrainzDatabase:
|
||||
))
|
||||
return (result['name'], result['gid'], 0.98)
|
||||
|
||||
# Try matching with "&" vs "and" variations
|
||||
if '&' in main_artist:
|
||||
and_variant = main_artist.replace('&', 'and')
|
||||
query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (and_variant,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
|
||||
original=artist_name,
|
||||
matched=result['name'],
|
||||
score=0.97
|
||||
))
|
||||
return (result['name'], result['gid'], 0.97)
|
||||
elif ' and ' in main_artist.lower():
|
||||
ampersand_variant = main_artist.replace(' and ', ' & ')
|
||||
query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
|
||||
self.cursor.execute(query, (ampersand_variant,))
|
||||
result = self.cursor.fetchone()
|
||||
if result:
|
||||
print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
|
||||
original=artist_name,
|
||||
matched=result['name'],
|
||||
score=0.97
|
||||
))
|
||||
return (result['name'], result['gid'], 0.97)
|
||||
|
||||
# Try fuzzy search
|
||||
# Use more specific patterns to avoid false matches
|
||||
# Also try different dash characters and name variations
|
||||
@ -345,29 +371,8 @@ class MusicBrainzDatabase:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Try with full title first, then with parenthetical content removed
|
||||
search_titles = [title.strip()]
|
||||
|
||||
# Create version with parenthetical content removed
|
||||
import re
|
||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||
clean_title = clean_title.strip()
|
||||
if clean_title != title.strip():
|
||||
search_titles.append(clean_title)
|
||||
|
||||
# Also try with specific karaoke patterns removed
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
r'\s*\(Karaoke\)',
|
||||
r'\s*\(Instrumental\)',
|
||||
r'\s*\(Backing Track\)',
|
||||
]
|
||||
for pattern in karaoke_patterns:
|
||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||
karaoke_clean = karaoke_clean.strip()
|
||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||
search_titles.append(karaoke_clean)
|
||||
# Generate title variations using the helper method
|
||||
search_titles = self._generate_title_variations(title)
|
||||
|
||||
if artist_mbid:
|
||||
# Try to find recording by artist and title with all variations
|
||||
@ -548,29 +553,8 @@ class MusicBrainzDatabase:
|
||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||
collaboration = len(collaborators) > 0
|
||||
|
||||
# Try with full title first, then with parenthetical content removed
|
||||
search_titles = [title.strip()]
|
||||
|
||||
# Create version with parenthetical content removed
|
||||
import re
|
||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||
clean_title = clean_title.strip()
|
||||
if clean_title != title.strip():
|
||||
search_titles.append(clean_title)
|
||||
|
||||
# Also try with specific karaoke patterns removed
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
r'\s*\(Karaoke\)',
|
||||
r'\s*\(Instrumental\)',
|
||||
r'\s*\(Backing Track\)',
|
||||
]
|
||||
for pattern in karaoke_patterns:
|
||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||
karaoke_clean = karaoke_clean.strip()
|
||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||
search_titles.append(karaoke_clean)
|
||||
# Generate title variations using the helper method
|
||||
search_titles = self._generate_title_variations(title)
|
||||
|
||||
# Try to find recording with artist credit using all title variations
|
||||
for search_title in search_titles:
|
||||
@ -675,13 +659,32 @@ class MusicBrainzDatabase:
|
||||
split_pattern = pattern
|
||||
break
|
||||
|
||||
# For "&" separators, be more permissive
|
||||
# Most "&" separators in music are collaborations, not band names
|
||||
# Only treat very obvious band names as non-collaborations
|
||||
if part1_words == 1 and part2_words == 1:
|
||||
# Check for very obvious band name patterns
|
||||
obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
|
||||
if artist_string.lower() in obvious_band_names:
|
||||
# For "&" and "and" separators, check against known band names
|
||||
# Load band names from JSON file
|
||||
try:
|
||||
import json
|
||||
import os
|
||||
json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
|
||||
with open(json_path, 'r') as f:
|
||||
known_data = json.load(f)
|
||||
known_band_names = {name.lower() for name in known_data.get('groups', [])}
|
||||
except (FileNotFoundError, json.JSONDecodeError, KeyError):
|
||||
# Fallback to a minimal list if file can't be loaded
|
||||
known_band_names = {
|
||||
"gerry and the pacemakers", "simon & garfunkel", "hall & oates",
|
||||
"brooks & dunn", "the everly brothers"
|
||||
}
|
||||
|
||||
if artist_string.lower() in known_band_names:
|
||||
# This is a known band name, not a collaboration
|
||||
continue
|
||||
|
||||
# For "and" separators, be more conservative
|
||||
# Many band names use "and" (e.g., "Gerry and the Pacemakers")
|
||||
if pattern.lower() == 'and':
|
||||
# Only treat as collaboration if it looks like individual artists
|
||||
# Band names often have "the" in the second part
|
||||
if 'the ' in part2.lower():
|
||||
# Likely a band name, skip
|
||||
continue
|
||||
|
||||
@ -706,6 +709,48 @@ class MusicBrainzDatabase:
|
||||
|
||||
return (main_artist, collaborators)
|
||||
|
||||
def _generate_title_variations(self, title: str) -> List[str]:
|
||||
"""
|
||||
Generate title variations by removing parenthetical content.
|
||||
Returns list of title variations to try.
|
||||
"""
|
||||
import re
|
||||
|
||||
search_titles = [title.strip()]
|
||||
|
||||
# Remove complete parentheses (content)
|
||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||
clean_title = clean_title.strip()
|
||||
if clean_title != title.strip() and clean_title:
|
||||
search_titles.append(clean_title)
|
||||
|
||||
# Remove unmatched opening parenthesis at end
|
||||
clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
|
||||
clean_title2 = clean_title2.strip()
|
||||
if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
|
||||
search_titles.append(clean_title2)
|
||||
|
||||
# Remove unmatched closing parenthesis at start
|
||||
clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
|
||||
clean_title3 = clean_title3.strip()
|
||||
if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
|
||||
search_titles.append(clean_title3)
|
||||
|
||||
# Also try with specific karaoke patterns removed
|
||||
karaoke_patterns = [
|
||||
r'\s*\(Karaoke Version\)',
|
||||
r'\s*\(Karaoke\)',
|
||||
r'\s*\(Instrumental\)',
|
||||
r'\s*\(Backing Track\)',
|
||||
]
|
||||
for pattern in karaoke_patterns:
|
||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||
karaoke_clean = karaoke_clean.strip()
|
||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||
search_titles.append(karaoke_clean)
|
||||
|
||||
return search_titles
|
||||
|
||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
|
||||
"""
|
||||
Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
|
||||
|
||||
Loading…
Reference in New Issue
Block a user