Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
b794d9dc1c
commit
ec33253403
88
data/contraction_fixes.json
Executable file
88
data/contraction_fixes.json
Executable file
@ -0,0 +1,88 @@
|
|||||||
|
{
|
||||||
|
"dont": "don't",
|
||||||
|
"don t": "don't",
|
||||||
|
"cant": "can't",
|
||||||
|
"can t": "can't",
|
||||||
|
"wont": "won't",
|
||||||
|
"won t": "won't",
|
||||||
|
"im": "I'm",
|
||||||
|
"i m": "I'm",
|
||||||
|
"ive": "I've",
|
||||||
|
"i ve": "I've",
|
||||||
|
"id": "I'd",
|
||||||
|
"i d": "I'd",
|
||||||
|
"ill": "I'll",
|
||||||
|
"i ll": "I'll",
|
||||||
|
"isnt": "isn't",
|
||||||
|
"isn t": "isn't",
|
||||||
|
"arent": "aren't",
|
||||||
|
"aren t": "aren't",
|
||||||
|
"wasnt": "wasn't",
|
||||||
|
"wasn t": "wasn't",
|
||||||
|
"werent": "weren't",
|
||||||
|
"weren t": "weren't",
|
||||||
|
"hasnt": "hasn't",
|
||||||
|
"hasn t": "hasn't",
|
||||||
|
"havent": "haven't",
|
||||||
|
"haven t": "haven't",
|
||||||
|
"shouldnt": "shouldn't",
|
||||||
|
"shouldn t": "shouldn't",
|
||||||
|
"couldnt": "couldn't",
|
||||||
|
"couldn t": "couldn't",
|
||||||
|
"wouldnt": "wouldn't",
|
||||||
|
"wouldn t": "wouldn't",
|
||||||
|
"didnt": "didn't",
|
||||||
|
"didn t": "didn't",
|
||||||
|
"theyre": "they're",
|
||||||
|
"they re": "they're",
|
||||||
|
"youre": "you're",
|
||||||
|
"you re": "you're",
|
||||||
|
"whos": "who's",
|
||||||
|
"who s": "who's",
|
||||||
|
"whats": "what's",
|
||||||
|
"what s": "what's",
|
||||||
|
"thats": "that's",
|
||||||
|
"that s": "that's",
|
||||||
|
"lets": "let's",
|
||||||
|
"let s": "let's",
|
||||||
|
"theres": "there's",
|
||||||
|
"there s": "there's",
|
||||||
|
"heres": "here's",
|
||||||
|
"here s": "here's",
|
||||||
|
"hows": "how's",
|
||||||
|
"how s": "how's",
|
||||||
|
"shes": "she's",
|
||||||
|
"she s": "she's",
|
||||||
|
"hes": "he's",
|
||||||
|
"he s": "he's",
|
||||||
|
"were": "we're",
|
||||||
|
"we re": "we're",
|
||||||
|
"weve": "we've",
|
||||||
|
"we ve": "we've",
|
||||||
|
"well": "we'll",
|
||||||
|
"we ll": "we'll",
|
||||||
|
"its": "it's",
|
||||||
|
"it s": "it's",
|
||||||
|
"itll": "it'll",
|
||||||
|
"it ll": "it'll",
|
||||||
|
"yall": "y'all",
|
||||||
|
"y all": "y'all",
|
||||||
|
"wouldve": "would've",
|
||||||
|
"would ve": "would've",
|
||||||
|
"couldve": "could've",
|
||||||
|
"could ve": "could've",
|
||||||
|
"shouldve": "should've",
|
||||||
|
"should ve": "should've",
|
||||||
|
"mightve": "might've",
|
||||||
|
"might ve": "might've",
|
||||||
|
"mustve": "must've",
|
||||||
|
"must ve": "must've",
|
||||||
|
"maam": "ma'am",
|
||||||
|
"ma am": "ma'am",
|
||||||
|
"oclock": "o'clock",
|
||||||
|
"o clock": "o'clock",
|
||||||
|
"aint": "ain't",
|
||||||
|
"ain t": "ain't",
|
||||||
|
"rocknroll": "rock 'n' roll",
|
||||||
|
"rock n roll": "rock 'n' roll"
|
||||||
|
}
|
||||||
@ -119,14 +119,52 @@ def generate_title_variations(title: str) -> List[str]:
|
|||||||
if title_fixes != title.strip():
|
if title_fixes != title.strip():
|
||||||
search_titles.append(title_fixes)
|
search_titles.append(title_fixes)
|
||||||
|
|
||||||
apostrophe_fixes = [
|
# Load contraction fixes from JSON file
|
||||||
(r'\bDont\b', "Don't"), (r'\bCant\b', "Can't"), (r'\bWont\b', "Won't"),
|
try:
|
||||||
(r'\bArent\b', "Aren't"), (r'\bIsnt\b', "Isn't"), (r'\bWasnt\b', "Wasn't"),
|
from ..utils.data_loader import data_loader
|
||||||
(r'\bDidnt\b', "Didn't"), (r'\bDoesnt\b', "Doesn't"), (r'\bHavent\b', "Haven't"),
|
contraction_fixes = data_loader.load_contraction_fixes()
|
||||||
(r'\bHasnt\b', "Hasn't"), (r'\bWouldnt\b', "Wouldn't"), (r'\bCouldnt\b', "Couldn't"),
|
except:
|
||||||
(r'\bShouldnt\b', "Shouldn't"), (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
|
# Fallback to hardcoded fixes if file loading fails
|
||||||
|
contraction_fixes = {
|
||||||
|
"dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
|
||||||
|
"wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
|
||||||
|
"ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
|
||||||
|
"ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
|
||||||
|
"arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
|
||||||
|
"werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
|
||||||
|
"havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
|
||||||
|
"couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
|
||||||
|
"didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
|
||||||
|
"youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
|
||||||
|
"whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
|
||||||
|
"lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
|
||||||
|
"heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
|
||||||
|
"shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
|
||||||
|
"were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
|
||||||
|
"well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
|
||||||
|
"itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
|
||||||
|
"wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
|
||||||
|
"shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
|
||||||
|
"mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
|
||||||
|
"oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
|
||||||
|
"rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Apply contraction fixes
|
||||||
|
for broken_contraction, fixed_contraction in contraction_fixes.items():
|
||||||
|
# Use word boundaries to avoid partial matches
|
||||||
|
pattern = r'\b' + re.escape(broken_contraction) + r'\b'
|
||||||
|
fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
|
||||||
|
if fixed_title != title_fixes:
|
||||||
|
title_fixes = fixed_title
|
||||||
|
if title_fixes not in search_titles:
|
||||||
|
search_titles.append(title_fixes)
|
||||||
|
|
||||||
|
# Additional hardcoded fixes for edge cases
|
||||||
|
additional_fixes = [
|
||||||
|
(r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
|
||||||
]
|
]
|
||||||
for pattern, replacement in apostrophe_fixes:
|
for pattern, replacement in additional_fixes:
|
||||||
fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
|
fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
|
||||||
if fixed_title != title_fixes:
|
if fixed_title != title_fixes:
|
||||||
title_fixes = fixed_title
|
title_fixes = fixed_title
|
||||||
|
|||||||
@ -65,6 +65,16 @@ class DataLoader:
|
|||||||
|
|
||||||
return self._known_recordings
|
return self._known_recordings
|
||||||
|
|
||||||
|
def load_contraction_fixes(self) -> Dict[str, str]:
    """Load the contraction-fix mapping from ``contraction_fixes.json``.

    The file is read from ``self.data_dir`` and is expected to contain a
    single JSON object mapping a broken contraction (e.g. ``"dont"``) to
    its corrected form (e.g. ``"don't"``).

    Returns:
        The mapping of broken to corrected contractions. An empty dict is
        returned, after printing a warning, when the file is missing, is
        not valid JSON, or does not hold a JSON object. Entries whose
        value is not a string are dropped with a warning so that callers
        can safely use every value as replacement text.
    """
    contractions_file = self.data_dir / "contraction_fixes.json"
    try:
        with open(contractions_file, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not load contraction fixes data: {e}")
        return {}

    # json.load happily returns lists, strings, numbers, ...; only a JSON
    # object satisfies the declared Dict[str, str] return type.
    if not isinstance(loaded, dict):
        print(
            "Warning: Could not load contraction fixes data: "
            f"expected a JSON object, got {type(loaded).__name__}"
        )
        return {}

    # JSON object keys are always strings, so only the values need checking.
    fixes = {
        broken: fixed
        for broken, fixed in loaded.items()
        if isinstance(fixed, str)
    }
    skipped = len(loaded) - len(fixes)
    if skipped:
        print(
            "Warning: Ignored "
            f"{skipped} contraction fix entries with non-string values"
        )
    return fixes
|
||||||
|
|
||||||
def reload_data(self) -> None:
|
def reload_data(self) -> None:
|
||||||
"""Reload data from files (useful for testing or updates)."""
|
"""Reload data from files (useful for testing or updates)."""
|
||||||
self._known_artists = None
|
self._known_artists = None
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user