Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
b794d9dc1c
commit
ec33253403
88
data/contraction_fixes.json
Executable file
88
data/contraction_fixes.json
Executable file
@ -0,0 +1,88 @@
|
||||
{
|
||||
"dont": "don't",
|
||||
"don t": "don't",
|
||||
"cant": "can't",
|
||||
"can t": "can't",
|
||||
"wont": "won't",
|
||||
"won t": "won't",
|
||||
"im": "I'm",
|
||||
"i m": "I'm",
|
||||
"ive": "I've",
|
||||
"i ve": "I've",
|
||||
"id": "I'd",
|
||||
"i d": "I'd",
|
||||
"ill": "I'll",
|
||||
"i ll": "I'll",
|
||||
"isnt": "isn't",
|
||||
"isn t": "isn't",
|
||||
"arent": "aren't",
|
||||
"aren t": "aren't",
|
||||
"wasnt": "wasn't",
|
||||
"wasn t": "wasn't",
|
||||
"werent": "weren't",
|
||||
"weren t": "weren't",
|
||||
"hasnt": "hasn't",
|
||||
"hasn t": "hasn't",
|
||||
"havent": "haven't",
|
||||
"haven t": "haven't",
|
||||
"shouldnt": "shouldn't",
|
||||
"shouldn t": "shouldn't",
|
||||
"couldnt": "couldn't",
|
||||
"couldn t": "couldn't",
|
||||
"wouldnt": "wouldn't",
|
||||
"wouldn t": "wouldn't",
|
||||
"didnt": "didn't",
|
||||
"didn t": "didn't",
|
||||
"theyre": "they're",
|
||||
"they re": "they're",
|
||||
"youre": "you're",
|
||||
"you re": "you're",
|
||||
"whos": "who's",
|
||||
"who s": "who's",
|
||||
"whats": "what's",
|
||||
"what s": "what's",
|
||||
"thats": "that's",
|
||||
"that s": "that's",
|
||||
"lets": "let's",
|
||||
"let s": "let's",
|
||||
"theres": "there's",
|
||||
"there s": "there's",
|
||||
"heres": "here's",
|
||||
"here s": "here's",
|
||||
"hows": "how's",
|
||||
"how s": "how's",
|
||||
"shes": "she's",
|
||||
"she s": "she's",
|
||||
"hes": "he's",
|
||||
"he s": "he's",
|
||||
"were": "we're",
|
||||
"we re": "we're",
|
||||
"weve": "we've",
|
||||
"we ve": "we've",
|
||||
"well": "we'll",
|
||||
"we ll": "we'll",
|
||||
"its": "it's",
|
||||
"it s": "it's",
|
||||
"itll": "it'll",
|
||||
"it ll": "it'll",
|
||||
"yall": "y'all",
|
||||
"y all": "y'all",
|
||||
"wouldve": "would've",
|
||||
"would ve": "would've",
|
||||
"couldve": "could've",
|
||||
"could ve": "could've",
|
||||
"shouldve": "should've",
|
||||
"should ve": "should've",
|
||||
"mightve": "might've",
|
||||
"might ve": "might've",
|
||||
"mustve": "must've",
|
||||
"must ve": "must've",
|
||||
"maam": "ma'am",
|
||||
"ma am": "ma'am",
|
||||
"oclock": "o'clock",
|
||||
"o clock": "o'clock",
|
||||
"aint": "ain't",
|
||||
"ain t": "ain't",
|
||||
"rocknroll": "rock 'n' roll",
|
||||
"rock n roll": "rock 'n' roll"
|
||||
}
|
||||
@ -119,14 +119,52 @@ def generate_title_variations(title: str) -> List[str]:
|
||||
if title_fixes != title.strip():
|
||||
search_titles.append(title_fixes)
|
||||
|
||||
apostrophe_fixes = [
|
||||
(r'\bDont\b', "Don't"), (r'\bCant\b', "Can't"), (r'\bWont\b', "Won't"),
|
||||
(r'\bArent\b', "Aren't"), (r'\bIsnt\b', "Isn't"), (r'\bWasnt\b', "Wasn't"),
|
||||
(r'\bDidnt\b', "Didn't"), (r'\bDoesnt\b', "Doesn't"), (r'\bHavent\b', "Haven't"),
|
||||
(r'\bHasnt\b', "Hasn't"), (r'\bWouldnt\b', "Wouldn't"), (r'\bCouldnt\b', "Couldn't"),
|
||||
(r'\bShouldnt\b', "Shouldn't"), (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
|
||||
# Load contraction fixes from JSON file
|
||||
try:
|
||||
from ..utils.data_loader import data_loader
|
||||
contraction_fixes = data_loader.load_contraction_fixes()
|
||||
except:
|
||||
# Fallback to hardcoded fixes if file loading fails
|
||||
contraction_fixes = {
|
||||
"dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
|
||||
"wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
|
||||
"ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
|
||||
"ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
|
||||
"arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
|
||||
"werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
|
||||
"havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
|
||||
"couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
|
||||
"didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
|
||||
"youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
|
||||
"whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
|
||||
"lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
|
||||
"heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
|
||||
"shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
|
||||
"were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
|
||||
"well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
|
||||
"itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
|
||||
"wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
|
||||
"shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
|
||||
"mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
|
||||
"oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
|
||||
"rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll"
|
||||
}
|
||||
|
||||
# Apply contraction fixes
|
||||
for broken_contraction, fixed_contraction in contraction_fixes.items():
|
||||
# Use word boundaries to avoid partial matches
|
||||
pattern = r'\b' + re.escape(broken_contraction) + r'\b'
|
||||
fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
|
||||
if fixed_title != title_fixes:
|
||||
title_fixes = fixed_title
|
||||
if title_fixes not in search_titles:
|
||||
search_titles.append(title_fixes)
|
||||
|
||||
# Additional hardcoded fixes for edge cases
|
||||
additional_fixes = [
|
||||
(r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
|
||||
]
|
||||
for pattern, replacement in apostrophe_fixes:
|
||||
for pattern, replacement in additional_fixes:
|
||||
fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
|
||||
if fixed_title != title_fixes:
|
||||
title_fixes = fixed_title
|
||||
|
||||
@ -65,6 +65,16 @@ class DataLoader:
|
||||
|
||||
return self._known_recordings
|
||||
|
||||
def load_contraction_fixes(self) -> Dict[str, str]:
    """Load the broken-contraction -> corrected-contraction map from JSON.

    Reads ``contraction_fixes.json`` from ``self.data_dir``.

    Returns:
        The mapping parsed from the file, restricted to entries whose
        replacement value is a string. An empty dict is returned (after
        printing a warning) when the file is missing, is not valid JSON,
        or does not hold a JSON object at the top level.
    """
    contractions_file = self.data_dir / "contraction_fixes.json"
    try:
        with open(contractions_file, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Warning: Could not load contraction fixes data: {e}")
        return {}

    # json.load can legitimately return a list, string, number or None.
    # Handing any of those back would break the declared Dict[str, str]
    # contract and blow up in code that iterates the result with .items().
    if not isinstance(loaded, dict):
        print(
            "Warning: Could not load contraction fixes data: "
            f"expected a JSON object, got {type(loaded).__name__}"
        )
        return {}

    # JSON object keys are always strings, so only the values need checking.
    # A non-string replacement cannot be used as regex substitution text.
    fixes = {
        broken: fixed
        for broken, fixed in loaded.items()
        if isinstance(fixed, str)
    }
    skipped = len(loaded) - len(fixes)
    if skipped:
        print(
            f"Warning: Ignored {skipped} contraction fix(es) "
            "with non-string replacements"
        )
    return fixes
|
||||
|
||||
def reload_data(self) -> None:
|
||||
"""Reload data from files (useful for testing or updates)."""
|
||||
self._known_artists = None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user