Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-01 10:08:44 -05:00
parent b794d9dc1c
commit ec33253403
3 changed files with 143 additions and 7 deletions

88
data/contraction_fixes.json Executable file
View File

@ -0,0 +1,88 @@
{
"dont": "don't",
"don t": "don't",
"cant": "can't",
"can t": "can't",
"wont": "won't",
"won t": "won't",
"im": "I'm",
"i m": "I'm",
"ive": "I've",
"i ve": "I've",
"id": "I'd",
"i d": "I'd",
"ill": "I'll",
"i ll": "I'll",
"isnt": "isn't",
"isn t": "isn't",
"arent": "aren't",
"aren t": "aren't",
"wasnt": "wasn't",
"wasn t": "wasn't",
"werent": "weren't",
"weren t": "weren't",
"hasnt": "hasn't",
"hasn t": "hasn't",
"havent": "haven't",
"haven t": "haven't",
"shouldnt": "shouldn't",
"shouldn t": "shouldn't",
"couldnt": "couldn't",
"couldn t": "couldn't",
"wouldnt": "wouldn't",
"wouldn t": "wouldn't",
"didnt": "didn't",
"didn t": "didn't",
"theyre": "they're",
"they re": "they're",
"youre": "you're",
"you re": "you're",
"whos": "who's",
"who s": "who's",
"whats": "what's",
"what s": "what's",
"thats": "that's",
"that s": "that's",
"lets": "let's",
"let s": "let's",
"theres": "there's",
"there s": "there's",
"heres": "here's",
"here s": "here's",
"hows": "how's",
"how s": "how's",
"shes": "she's",
"she s": "she's",
"hes": "he's",
"he s": "he's",
"were": "we're",
"we re": "we're",
"weve": "we've",
"we ve": "we've",
"well": "we'll",
"we ll": "we'll",
"its": "it's",
"it s": "it's",
"itll": "it'll",
"it ll": "it'll",
"yall": "y'all",
"y all": "y'all",
"wouldve": "would've",
"would ve": "would've",
"couldve": "could've",
"could ve": "could've",
"shouldve": "should've",
"should ve": "should've",
"mightve": "might've",
"might ve": "might've",
"mustve": "must've",
"must ve": "must've",
"maam": "ma'am",
"ma am": "ma'am",
"oclock": "o'clock",
"o clock": "o'clock",
"aint": "ain't",
"ain t": "ain't",
"rocknroll": "rock 'n' roll",
"rock n roll": "rock 'n' roll"
}

View File

@ -119,14 +119,52 @@ def generate_title_variations(title: str) -> List[str]:
if title_fixes != title.strip():
search_titles.append(title_fixes)
apostrophe_fixes = [
(r'\bDont\b', "Don't"), (r'\bCant\b', "Can't"), (r'\bWont\b', "Won't"),
(r'\bArent\b', "Aren't"), (r'\bIsnt\b', "Isn't"), (r'\bWasnt\b', "Wasn't"),
(r'\bDidnt\b', "Didn't"), (r'\bDoesnt\b', "Doesn't"), (r'\bHavent\b', "Haven't"),
(r'\bHasnt\b', "Hasn't"), (r'\bWouldnt\b', "Wouldn't"), (r'\bCouldnt\b', "Couldn't"),
(r'\bShouldnt\b', "Shouldn't"), (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
# Load contraction fixes from JSON file
try:
from ..utils.data_loader import data_loader
contraction_fixes = data_loader.load_contraction_fixes()
except:
# Fallback to hardcoded fixes if file loading fails
contraction_fixes = {
"dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
"wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
"ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
"ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
"arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
"werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
"havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
"couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
"didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
"youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
"whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
"lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
"heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
"shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
"were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
"well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
"itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
"wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
"shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
"mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
"oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
"rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll"
}
# Apply contraction fixes
for broken_contraction, fixed_contraction in contraction_fixes.items():
# Use word boundaries to avoid partial matches
pattern = r'\b' + re.escape(broken_contraction) + r'\b'
fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
if fixed_title != title_fixes:
title_fixes = fixed_title
if title_fixes not in search_titles:
search_titles.append(title_fixes)
# Additional hardcoded fixes for edge cases
additional_fixes = [
(r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
]
for pattern, replacement in apostrophe_fixes:
for pattern, replacement in additional_fixes:
fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
if fixed_title != title_fixes:
title_fixes = fixed_title

View File

@ -65,6 +65,16 @@ class DataLoader:
return self._known_recordings
def load_contraction_fixes(self) -> Dict[str, str]:
    """Load the contraction-fix mapping from ``contraction_fixes.json``.

    The file is read from ``self.data_dir`` and is expected to hold a single
    JSON object mapping a broken contraction (e.g. ``"dont"``) to its
    corrected form (e.g. ``"don't"``).

    Returns:
        The parsed mapping, or an empty dict when the file is missing,
        unreadable, not valid UTF-8/JSON, or not a JSON object. A warning
        is printed in each of those cases instead of raising.
    """
    # NOTE(review): failures yield {} rather than an exception, so a caller
    # that only falls back to built-in defaults inside an ``except`` clause
    # will not take that fallback -- confirm this is the intended contract.
    contractions_file = self.data_dir / "contraction_fixes.json"
    try:
        with open(contractions_file, 'r', encoding='utf-8') as f:
            fixes = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers FileNotFoundError as well as permission and
        # is-a-directory errors; ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError raised while reading the file.
        print(f"Warning: Could not load contraction fixes data: {e}")
        return {}
    if not isinstance(fixes, dict):
        # Valid JSON that is not an object (list, string, number, ...) would
        # break callers that iterate over ``.items()``.
        print(
            "Warning: Could not load contraction fixes data: "
            f"expected a JSON object, got {type(fixes).__name__}"
        )
        return {}
    return fixes
def reload_data(self) -> None:
"""Reload data from files (useful for testing or updates)."""
self._known_artists = None