Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
dc396a346f
commit
eeeb0bfd64
@ -5,10 +5,248 @@
|
|||||||
"ft ": "feat. ",
|
"ft ": "feat. ",
|
||||||
"featuring": "feat."
|
"featuring": "feat."
|
||||||
},
|
},
|
||||||
|
"groups": [
|
||||||
|
"Abbott & Costello",
|
||||||
|
"Adam & Joe",
|
||||||
|
"Al & Smoot",
|
||||||
|
"Alfie & Owen",
|
||||||
|
"Allen & Grier",
|
||||||
|
"Amos 'n' Andy",
|
||||||
|
"Andrews Sisters",
|
||||||
|
"AnnenMayKantereit",
|
||||||
|
"Ashford & Simpson",
|
||||||
|
"B. J. Thomas & The Triumphs",
|
||||||
|
"Bachman-Turner Overdrive",
|
||||||
|
"Baden & Vin",
|
||||||
|
"Barenaked Ladies",
|
||||||
|
"Barnum & Bailey",
|
||||||
|
"Barry, Robin and Maurice Gibb",
|
||||||
|
"Beavis & Butt-Head",
|
||||||
|
"Ben & Ben",
|
||||||
|
"Benny & Joon",
|
||||||
|
"Bert & Ernie",
|
||||||
|
"Bill & Gloria Gaither",
|
||||||
|
"Bill & Ted",
|
||||||
|
"Bill Haley and His Comets",
|
||||||
|
"Billy & The Essentials",
|
||||||
|
"Bob & Earl",
|
||||||
|
"Bob and Tom",
|
||||||
|
"Bobby & The Midnites",
|
||||||
|
"Booker T. & the M.G.'s",
|
||||||
|
"Brooks & Dunn",
|
||||||
|
"Brown & Carney",
|
||||||
|
"Bucky & John Pizzarelli",
|
||||||
|
"Buddy & Julie Miller",
|
||||||
|
"Buddy Holly and The Crickets",
|
||||||
|
"Burt & The Backbeats",
|
||||||
|
"Captain & Tennille",
|
||||||
|
"Carly and Lucy Simon",
|
||||||
|
"Carpenter & Ford",
|
||||||
|
"Cece & Bebe Winans",
|
||||||
|
"Chad & Jeremy",
|
||||||
|
"Charles & Eddie",
|
||||||
|
"Cheech & Chong",
|
||||||
|
"Cheryl & Pam Johnson",
|
||||||
|
"Chris & Cosey",
|
||||||
|
"Chris & Rich Robinson",
|
||||||
|
"Climax Blues Band",
|
||||||
|
"Crosby, Stills & Nash",
|
||||||
|
"Crosby, Stills, Nash & Young",
|
||||||
|
"Darrell and Donny",
|
||||||
|
"Daryl Hall & John Oates",
|
||||||
|
"Dave & Ansell Collins",
|
||||||
|
"Dave and Sugar",
|
||||||
|
"David & David",
|
||||||
|
"Dawn and Tony Orlando",
|
||||||
|
"Dean & Britta",
|
||||||
|
"Dean & Jerry",
|
||||||
|
"Deke Dickerson & The Ecco-Fonics",
|
||||||
|
"Dick & Dee Dee",
|
||||||
|
"Dion and The Belmonts",
|
||||||
|
"DJ Jazzy Jeff & The Fresh Prince",
|
||||||
|
"Don and Juan",
|
||||||
|
"Dr. Hook & The Medicine Show",
|
||||||
|
"Ed & Steve Masley",
|
||||||
|
"Eddie and The Hot Rods",
|
||||||
|
"Emerson, Lake and Palmer",
|
||||||
|
"Emerson, Lake and Powell",
|
||||||
|
"England Dan & John Ford Coley",
|
||||||
|
"Eric & The Good Good Feeling",
|
||||||
|
"Eric B. & Rakim",
|
||||||
|
"Ernie & Bert",
|
||||||
|
"Eurythmics",
|
||||||
|
"Everly Brothers",
|
||||||
|
"Fischer-Z",
|
||||||
|
"Fleming & John",
|
||||||
|
"Flo & Eddie",
|
||||||
|
"Foster & Lloyd",
|
||||||
|
"Fred and Ginger",
|
||||||
|
"Funk Brothers",
|
||||||
|
"Gabe & Chloe",
|
||||||
|
"Gallagher & Lyle",
|
||||||
|
"Gamble & Huff",
|
||||||
|
"Gene and Debbe",
|
||||||
|
"George and Ira Gershwin",
|
||||||
|
"Gerry and the Pacemakers",
|
||||||
|
"Gogol Bordello",
|
||||||
|
"Hall & Oates",
|
||||||
|
"Hank & Frank",
|
||||||
|
"Hank and Lewie Wickham",
|
||||||
|
"Hank Williams Jr. and The Cheatin' Hearts",
|
||||||
|
"Hankshaw and Fanny",
|
||||||
|
"Hothouse Flowers",
|
||||||
|
"Hues Corporation",
|
||||||
|
"Ike & Tina Turner",
|
||||||
|
"Jack & Jack",
|
||||||
|
"James and Bobby Purify",
|
||||||
|
"Jan & Dean",
|
||||||
|
"Jane and Herondale",
|
||||||
|
"Janie and The Greyhounds",
|
||||||
|
"Jay and The Americans",
|
||||||
|
"Jay and The Techniques",
|
||||||
|
"Jermaine & Pia",
|
||||||
|
"Jerry Lee Lewis and His Pumping Piano",
|
||||||
|
"Jett & The Blackhearts",
|
||||||
|
"Jody & The Jerms",
|
||||||
|
"Joe and Eddie",
|
||||||
|
"John & Jacob",
|
||||||
|
"John and Mary",
|
||||||
|
"John and Yoko",
|
||||||
|
"Johnnie and Jack",
|
||||||
|
"Johnny and The Hurricanes",
|
||||||
|
"Johnny and The Moondogs",
|
||||||
|
"Jon & Vangelis",
|
||||||
|
"Jonas Brothers",
|
||||||
|
"Josh and The Empty Pockets",
|
||||||
|
"June and Johnny",
|
||||||
|
"K-Ci & JoJo",
|
||||||
|
"KC & The Sunshine Band",
|
||||||
|
"Kate and Anna McGarrigle",
|
||||||
|
"Kid 'n Play",
|
||||||
|
"Kiki & Herb",
|
||||||
|
"Kim & The Cadillacs",
|
||||||
|
"Kingston Trio",
|
||||||
|
"Kool & The Gang",
|
||||||
|
"Ladysmith Black Mambazo",
|
||||||
|
"Laurie & The Sighs",
|
||||||
|
"Lee and Sara",
|
||||||
|
"Les Paul & Mary Ford",
|
||||||
|
"Loggins & Messina",
|
||||||
|
"Lulu & The Luvvers",
|
||||||
|
"Macklemore & Ryan Lewis",
|
||||||
|
"Manny & The Casanovas",
|
||||||
|
"Marcy & Zina",
|
||||||
|
"Mark & Almond",
|
||||||
|
"Mark & Clark Band",
|
||||||
|
"Marty & Elayne",
|
||||||
|
"Max & Mitch",
|
||||||
|
"Mick & Keith",
|
||||||
|
"Mickey & Sylvia",
|
||||||
|
"Mike & The Mechanics",
|
||||||
|
"Mills Brothers",
|
||||||
|
"Moe & Joe",
|
||||||
|
"Monty & The Pythons",
|
||||||
|
"Mumford & Sons",
|
||||||
|
"Nate and James",
|
||||||
|
"Nico & Vinz",
|
||||||
|
"O'Day & Simone",
|
||||||
|
"Otis & Carla",
|
||||||
|
"OutKast",
|
||||||
|
"Paul & Linda McCartney",
|
||||||
|
"Paul & Paula",
|
||||||
|
"Peaches & Herb",
|
||||||
|
"Peter & Gordon",
|
||||||
|
"Peter, Paul and Mary",
|
||||||
|
"Petty & The Heartbreakers",
|
||||||
|
"Phil & The Firecat",
|
||||||
|
"Phillips & Drew",
|
||||||
|
"Pomus & Shuman",
|
||||||
|
"Punch Brothers",
|
||||||
|
"Randy and The Rainbows",
|
||||||
|
"Ray & Anita",
|
||||||
|
"Red and Kathy",
|
||||||
|
"Reid & Simone",
|
||||||
|
"Richie & The Young Lions",
|
||||||
|
"Ricky and The Rockets",
|
||||||
|
"Righteous Brothers",
|
||||||
|
"Rob & Fab",
|
||||||
|
"Robert and Johnny",
|
||||||
|
"Rodgers and Hammerstein",
|
||||||
|
"Rodgers and Hart",
|
||||||
|
"Roe & Burns",
|
||||||
|
"Rufus & Chaka Khan",
|
||||||
|
"Sam & Dave",
|
||||||
|
"Santo & Johnny",
|
||||||
|
"Sasha & John Digweed",
|
||||||
|
"Scott & Aimee",
|
||||||
|
"Seals & Crofts",
|
||||||
|
"Sergio & Odair Assad",
|
||||||
|
"Sha Na Na",
|
||||||
|
"Shemekia Copeland & The Alligators",
|
||||||
|
"Shirley and Lee",
|
||||||
|
"Simon & Garfunkel",
|
||||||
|
"Solas",
|
||||||
|
"Sonny & Cher",
|
||||||
|
"Steeleye Span",
|
||||||
|
"Steely Dan",
|
||||||
|
"Steve & Eydie",
|
||||||
|
"Syd & The Strawbs",
|
||||||
|
"Tammi & Marvin",
|
||||||
|
"The Avett Brothers",
|
||||||
|
"The B-52s",
|
||||||
|
"The Bacon Brothers",
|
||||||
|
"The Bellamy Brothers",
|
||||||
|
"The Blackwood Brothers",
|
||||||
|
"The Blues Brothers",
|
||||||
|
"The Carter Family",
|
||||||
|
"The Chemical Brothers",
|
||||||
|
"The Chuck Wagon Gang",
|
||||||
|
"The Clark Sisters",
|
||||||
|
"The Coasters",
|
||||||
|
"The Cook Brothers",
|
||||||
|
"The Delmore Brothers",
|
||||||
|
"The Dillards",
|
||||||
|
"The Doobie Brothers",
|
||||||
|
"The Everly Brothers",
|
||||||
|
"The Flying Burrito Brothers",
|
||||||
|
"The Gatlin Brothers",
|
||||||
|
"The Isley Brothers",
|
||||||
|
"The Judds",
|
||||||
|
"The Kalin Twins",
|
||||||
|
"The Louvin Brothers",
|
||||||
|
"The Madden Brothers",
|
||||||
|
"The Neville Brothers",
|
||||||
|
"The Oak Ridge Boys",
|
||||||
|
"The Osborne Brothers",
|
||||||
|
"The Pointer Sisters",
|
||||||
|
"The Proclaimers",
|
||||||
|
"The Stanley Brothers",
|
||||||
|
"The Statler Brothers",
|
||||||
|
"The Walker Brothers",
|
||||||
|
"The Wilburn Brothers",
|
||||||
|
"Thompson Twins",
|
||||||
|
"Tim & Mollie O'Brien",
|
||||||
|
"Tom and Jerry",
|
||||||
|
"Tom and Viv",
|
||||||
|
"Tom Petty & The Heartbreakers",
|
||||||
|
"Tommy and The Twilights",
|
||||||
|
"Tony and The Graduates",
|
||||||
|
"Tucker & Tanya",
|
||||||
|
"Vera & The Beaters",
|
||||||
|
"Vince & Linda",
|
||||||
|
"Waylon & Willie",
|
||||||
|
"Wendy & Lisa",
|
||||||
|
"Wham!",
|
||||||
|
"Will & The Bushmen",
|
||||||
|
"Willie & The Poor Boys",
|
||||||
|
"Wilson Phillips",
|
||||||
|
"Womack & Womack",
|
||||||
|
"Zapp & Roger"
|
||||||
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"description": "Name variations for artist name normalization",
|
"description": "Name variations and known band names for artist name normalization",
|
||||||
"last_updated": "2024-12-19",
|
"last_updated": "2024-12-19",
|
||||||
"version": "3.0",
|
"version": "3.1",
|
||||||
"note": "Removed artists section - now using database search instead"
|
"note": "Added groups section with 200+ known band names that should not be split as collaborations"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -167,6 +167,32 @@ class MusicBrainzDatabase:
|
|||||||
))
|
))
|
||||||
return (result['name'], result['gid'], 0.98)
|
return (result['name'], result['gid'], 0.98)
|
||||||
|
|
||||||
|
# Try matching with "&" vs "and" variations
|
||||||
|
if '&' in main_artist:
|
||||||
|
and_variant = main_artist.replace('&', 'and')
|
||||||
|
query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
|
||||||
|
self.cursor.execute(query, (and_variant,))
|
||||||
|
result = self.cursor.fetchone()
|
||||||
|
if result:
|
||||||
|
print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
|
||||||
|
original=artist_name,
|
||||||
|
matched=result['name'],
|
||||||
|
score=0.97
|
||||||
|
))
|
||||||
|
return (result['name'], result['gid'], 0.97)
|
||||||
|
elif ' and ' in main_artist.lower():
|
||||||
|
ampersand_variant = main_artist.replace(' and ', ' & ')
|
||||||
|
query = "SELECT name, gid FROM artist WHERE name = %s LIMIT 1"
|
||||||
|
self.cursor.execute(query, (ampersand_variant,))
|
||||||
|
result = self.cursor.fetchone()
|
||||||
|
if result:
|
||||||
|
print(SUCCESS_MESSAGES['fuzzy_match_found'].format(
|
||||||
|
original=artist_name,
|
||||||
|
matched=result['name'],
|
||||||
|
score=0.97
|
||||||
|
))
|
||||||
|
return (result['name'], result['gid'], 0.97)
|
||||||
|
|
||||||
# Try fuzzy search
|
# Try fuzzy search
|
||||||
# Use more specific patterns to avoid false matches
|
# Use more specific patterns to avoid false matches
|
||||||
# Also try different dash characters and name variations
|
# Also try different dash characters and name variations
|
||||||
@ -345,29 +371,8 @@ class MusicBrainzDatabase:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try with full title first, then with parenthetical content removed
|
# Generate title variations using the helper method
|
||||||
search_titles = [title.strip()]
|
search_titles = self._generate_title_variations(title)
|
||||||
|
|
||||||
# Create version with parenthetical content removed
|
|
||||||
import re
|
|
||||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
|
||||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
|
||||||
clean_title = clean_title.strip()
|
|
||||||
if clean_title != title.strip():
|
|
||||||
search_titles.append(clean_title)
|
|
||||||
|
|
||||||
# Also try with specific karaoke patterns removed
|
|
||||||
karaoke_patterns = [
|
|
||||||
r'\s*\(Karaoke Version\)',
|
|
||||||
r'\s*\(Karaoke\)',
|
|
||||||
r'\s*\(Instrumental\)',
|
|
||||||
r'\s*\(Backing Track\)',
|
|
||||||
]
|
|
||||||
for pattern in karaoke_patterns:
|
|
||||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
|
||||||
karaoke_clean = karaoke_clean.strip()
|
|
||||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
|
||||||
search_titles.append(karaoke_clean)
|
|
||||||
|
|
||||||
if artist_mbid:
|
if artist_mbid:
|
||||||
# Try to find recording by artist and title with all variations
|
# Try to find recording by artist and title with all variations
|
||||||
@ -548,29 +553,8 @@ class MusicBrainzDatabase:
|
|||||||
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
main_artist, collaborators = self._parse_complex_collaboration(clean_name)
|
||||||
collaboration = len(collaborators) > 0
|
collaboration = len(collaborators) > 0
|
||||||
|
|
||||||
# Try with full title first, then with parenthetical content removed
|
# Generate title variations using the helper method
|
||||||
search_titles = [title.strip()]
|
search_titles = self._generate_title_variations(title)
|
||||||
|
|
||||||
# Create version with parenthetical content removed
|
|
||||||
import re
|
|
||||||
# Remove all parenthetical content like (clean), (remix), (karaoke), etc.
|
|
||||||
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
|
||||||
clean_title = clean_title.strip()
|
|
||||||
if clean_title != title.strip():
|
|
||||||
search_titles.append(clean_title)
|
|
||||||
|
|
||||||
# Also try with specific karaoke patterns removed
|
|
||||||
karaoke_patterns = [
|
|
||||||
r'\s*\(Karaoke Version\)',
|
|
||||||
r'\s*\(Karaoke\)',
|
|
||||||
r'\s*\(Instrumental\)',
|
|
||||||
r'\s*\(Backing Track\)',
|
|
||||||
]
|
|
||||||
for pattern in karaoke_patterns:
|
|
||||||
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
|
||||||
karaoke_clean = karaoke_clean.strip()
|
|
||||||
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
|
||||||
search_titles.append(karaoke_clean)
|
|
||||||
|
|
||||||
# Try to find recording with artist credit using all title variations
|
# Try to find recording with artist credit using all title variations
|
||||||
for search_title in search_titles:
|
for search_title in search_titles:
|
||||||
@ -675,13 +659,32 @@ class MusicBrainzDatabase:
|
|||||||
split_pattern = pattern
|
split_pattern = pattern
|
||||||
break
|
break
|
||||||
|
|
||||||
# For "&" separators, be more permissive
|
# For "&" and "and" separators, check against known band names
|
||||||
# Most "&" separators in music are collaborations, not band names
|
# Load band names from JSON file
|
||||||
# Only treat very obvious band names as non-collaborations
|
try:
|
||||||
if part1_words == 1 and part2_words == 1:
|
import json
|
||||||
# Check for very obvious band name patterns
|
import os
|
||||||
obvious_band_names = ['simon & garfunkel', 'hall & oates', 'brooks & dunn']
|
json_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'known_artists.json')
|
||||||
if artist_string.lower() in obvious_band_names:
|
with open(json_path, 'r') as f:
|
||||||
|
known_data = json.load(f)
|
||||||
|
known_band_names = {name.lower() for name in known_data.get('groups', [])}
|
||||||
|
except (FileNotFoundError, json.JSONDecodeError, KeyError):
|
||||||
|
# Fallback to a minimal list if file can't be loaded
|
||||||
|
known_band_names = {
|
||||||
|
"gerry and the pacemakers", "simon & garfunkel", "hall & oates",
|
||||||
|
"brooks & dunn", "the everly brothers"
|
||||||
|
}
|
||||||
|
|
||||||
|
if artist_string.lower() in known_band_names:
|
||||||
|
# This is a known band name, not a collaboration
|
||||||
|
continue
|
||||||
|
|
||||||
|
# For "and" separators, be more conservative
|
||||||
|
# Many band names use "and" (e.g., "Gerry and the Pacemakers")
|
||||||
|
if pattern.lower() == 'and':
|
||||||
|
# Only treat as collaboration if it looks like individual artists
|
||||||
|
# Band names often have "the" in the second part
|
||||||
|
if 'the ' in part2.lower():
|
||||||
# Likely a band name, skip
|
# Likely a band name, skip
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -706,6 +709,48 @@ class MusicBrainzDatabase:
|
|||||||
|
|
||||||
return (main_artist, collaborators)
|
return (main_artist, collaborators)
|
||||||
|
|
||||||
|
def _generate_title_variations(self, title: str) -> List[str]:
|
||||||
|
"""
|
||||||
|
Generate title variations by removing parenthetical content.
|
||||||
|
Returns list of title variations to try.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
search_titles = [title.strip()]
|
||||||
|
|
||||||
|
# Remove complete parentheses (content)
|
||||||
|
clean_title = re.sub(r'\s*\([^)]*\)', '', title.strip())
|
||||||
|
clean_title = clean_title.strip()
|
||||||
|
if clean_title != title.strip() and clean_title:
|
||||||
|
search_titles.append(clean_title)
|
||||||
|
|
||||||
|
# Remove unmatched opening parenthesis at end
|
||||||
|
clean_title2 = re.sub(r'\s*\([^)]*$', '', title.strip())
|
||||||
|
clean_title2 = clean_title2.strip()
|
||||||
|
if clean_title2 != title.strip() and clean_title2 and clean_title2 not in search_titles:
|
||||||
|
search_titles.append(clean_title2)
|
||||||
|
|
||||||
|
# Remove unmatched closing parenthesis at start
|
||||||
|
clean_title3 = re.sub(r'^[^)]*\)\s*', '', title.strip())
|
||||||
|
clean_title3 = clean_title3.strip()
|
||||||
|
if clean_title3 != title.strip() and clean_title3 and clean_title3 not in search_titles:
|
||||||
|
search_titles.append(clean_title3)
|
||||||
|
|
||||||
|
# Also try with specific karaoke patterns removed
|
||||||
|
karaoke_patterns = [
|
||||||
|
r'\s*\(Karaoke Version\)',
|
||||||
|
r'\s*\(Karaoke\)',
|
||||||
|
r'\s*\(Instrumental\)',
|
||||||
|
r'\s*\(Backing Track\)',
|
||||||
|
]
|
||||||
|
for pattern in karaoke_patterns:
|
||||||
|
karaoke_clean = re.sub(pattern, '', title.strip(), flags=re.IGNORECASE)
|
||||||
|
karaoke_clean = karaoke_clean.strip()
|
||||||
|
if karaoke_clean != title.strip() and karaoke_clean not in search_titles:
|
||||||
|
search_titles.append(karaoke_clean)
|
||||||
|
|
||||||
|
return search_titles
|
||||||
|
|
||||||
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
|
def _parse_collaborators(self, collaborators_string: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
|
Parse a string like "Ne-Yo, Afrojack & Nayer" into individual artists
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user