From ec3325340398f1211033fff3015b2e6ca167bcb8 Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Fri, 1 Aug 2025 10:08:44 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- data/contraction_fixes.json | 88 ++++++++++++++++++++++++++++ src/utils/artist_title_processing.py | 52 +++++++++++++--- src/utils/data_loader.py | 10 ++++ 3 files changed, 143 insertions(+), 7 deletions(-) create mode 100755 data/contraction_fixes.json diff --git a/data/contraction_fixes.json b/data/contraction_fixes.json new file mode 100755 index 0000000..11b1077 --- /dev/null +++ b/data/contraction_fixes.json @@ -0,0 +1,88 @@ +{ + "dont": "don't", + "don t": "don't", + "cant": "can't", + "can t": "can't", + "wont": "won't", + "won t": "won't", + "im": "I'm", + "i m": "I'm", + "ive": "I've", + "i ve": "I've", + "id": "I'd", + "i d": "I'd", + "ill": "I'll", + "i ll": "I'll", + "isnt": "isn't", + "isn t": "isn't", + "arent": "aren't", + "aren t": "aren't", + "wasnt": "wasn't", + "wasn t": "wasn't", + "werent": "weren't", + "weren t": "weren't", + "hasnt": "hasn't", + "hasn t": "hasn't", + "havent": "haven't", + "haven t": "haven't", + "shouldnt": "shouldn't", + "shouldn t": "shouldn't", + "couldnt": "couldn't", + "couldn t": "couldn't", + "wouldnt": "wouldn't", + "wouldn t": "wouldn't", + "didnt": "didn't", + "didn t": "didn't", + "theyre": "they're", + "they re": "they're", + "youre": "you're", + "you re": "you're", + "whos": "who's", + "who s": "who's", + "whats": "what's", + "what s": "what's", + "thats": "that's", + "that s": "that's", + "lets": "let's", + "let s": "let's", + "theres": "there's", + "there s": "there's", + "heres": "here's", + "here s": "here's", + "hows": "how's", + "how s": "how's", + "shes": "she's", + "she s": "she's", + "hes": "he's", + "he s": "he's", + "were": "we're", + "we re": "we're", + "weve": "we've", + "we ve": "we've", + "well": "we'll", + "we ll": "we'll", + "its": "it's", + "it s": "it's", + "itll": "it'll", + "it ll": "it'll", + "yall": "y'all", + "y all": "y'all", + "wouldve": "would've", + "would ve": "would've", + "couldve": "could've", + "could ve": "could've", + "shouldve": "should've", + "should ve": "should've", + "mightve": "might've", + "might ve": "might've", + "mustve": "must've", + "must ve": "must've", + "maam": "ma'am", + "ma am": "ma'am", + "oclock": "o'clock", + "o clock": "o'clock", + "aint": "ain't", + "ain t": "ain't", + "rocknroll": "rock 'n' roll", + "rock n roll": "rock 'n' roll" +} \ No newline at end of file diff --git a/src/utils/artist_title_processing.py b/src/utils/artist_title_processing.py index d305b37..7f966d3 100644 --- a/src/utils/artist_title_processing.py +++ b/src/utils/artist_title_processing.py @@ -119,14 +119,52 @@ def generate_title_variations(title: str) -> List[str]: if title_fixes != title.strip(): search_titles.append(title_fixes) - apostrophe_fixes = [ - (r'\bDont\b', "Don't"), (r'\bCant\b', "Can't"), (r'\bWont\b', "Won't"), - (r'\bArent\b', "Aren't"), (r'\bIsnt\b', "Isn't"), (r'\bWasnt\b', "Wasn't"), - (r'\bDidnt\b', "Didn't"), (r'\bDoesnt\b', "Doesn't"), (r'\bHavent\b', "Haven't"), - (r'\bHasnt\b', "Hasn't"), (r'\bWouldnt\b', "Wouldn't"), (r'\bCouldnt\b', "Couldn't"), - (r'\bShouldnt\b', "Shouldn't"), (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About") + # Load contraction fixes from JSON file + try: + from ..utils.data_loader import data_loader + contraction_fixes = data_loader.load_contraction_fixes() + except: + # Fallback to hardcoded fixes if file loading fails + contraction_fixes = { + "dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't", + "wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm", + "ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd", + "ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't", + "arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't", + "werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't", + "havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't", + "couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't", + "didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're", + "youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's", + "whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's", + "lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's", + "heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's", + "shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's", + "were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've", + "well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's", + "itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all", + "wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've", + "shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've", + "mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am", + "oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't", + "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll" + } + + # Apply contraction fixes + for broken_contraction, fixed_contraction in contraction_fixes.items(): + # Use word boundaries to avoid partial matches + pattern = r'\b' + re.escape(broken_contraction) + r'\b' + fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE) + if fixed_title != title_fixes: + title_fixes = fixed_title + if title_fixes not in search_titles: + search_titles.append(title_fixes) + + # Additional hardcoded fixes for edge cases + additional_fixes = [ + (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About") ] - for pattern, replacement in apostrophe_fixes: + for pattern, replacement in additional_fixes: fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE) if fixed_title != title_fixes: title_fixes = fixed_title diff --git a/src/utils/data_loader.py b/src/utils/data_loader.py index 9f2456f..8987423 100644 --- a/src/utils/data_loader.py +++ b/src/utils/data_loader.py @@ -65,6 +65,16 @@ class DataLoader: return self._known_recordings + def load_contraction_fixes(self) -> Dict[str, str]: + """Load contraction fixes from JSON file.""" + contractions_file = self.data_dir / "contraction_fixes.json" + try: + with open(contractions_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError) as e: + print(f"Warning: Could not load contraction fixes data: {e}") + return {} + def reload_data(self) -> None: """Reload data from files (useful for testing or updates).""" self._known_artists = None