From ec3325340398f1211033fff3015b2e6ca167bcb8 Mon Sep 17 00:00:00 2001
From: Matt Bruce <mbrucedogs@gmail.com>
Date: Fri, 1 Aug 2025 10:08:44 -0500
Subject: [PATCH] Load contraction fixes from data/contraction_fixes.json

Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

---
 data/contraction_fixes.json          | 88 ++++++++++++++++++++++++++++
 src/utils/artist_title_processing.py | 52 +++++++++++++---
 src/utils/data_loader.py             | 10 ++++
 3 files changed, 143 insertions(+), 7 deletions(-)
 create mode 100755 data/contraction_fixes.json

diff --git a/data/contraction_fixes.json b/data/contraction_fixes.json
new file mode 100755
index 0000000..11b1077
--- /dev/null
+++ b/data/contraction_fixes.json
@@ -0,0 +1,90 @@
+{
+  "doesnt": "doesn't",
+  "doesn t": "doesn't",
+  "dont": "don't",
+  "don t": "don't",
+  "cant": "can't",
+  "can t": "can't",
+  "wont": "won't",
+  "won t": "won't",
+  "im": "I'm",
+  "i m": "I'm",
+  "ive": "I've",
+  "i ve": "I've",
+  "id": "I'd",
+  "i d": "I'd",
+  "ill": "I'll",
+  "i ll": "I'll",
+  "isnt": "isn't",
+  "isn t": "isn't",
+  "arent": "aren't",
+  "aren t": "aren't",
+  "wasnt": "wasn't",
+  "wasn t": "wasn't",
+  "werent": "weren't",
+  "weren t": "weren't",
+  "hasnt": "hasn't",
+  "hasn t": "hasn't",
+  "havent": "haven't",
+  "haven t": "haven't",
+  "shouldnt": "shouldn't",
+  "shouldn t": "shouldn't",
+  "couldnt": "couldn't",
+  "couldn t": "couldn't",
+  "wouldnt": "wouldn't",
+  "wouldn t": "wouldn't",
+  "didnt": "didn't",
+  "didn t": "didn't",
+  "theyre": "they're",
+  "they re": "they're",
+  "youre": "you're",
+  "you re": "you're",
+  "whos": "who's",
+  "who s": "who's",
+  "whats": "what's",
+  "what s": "what's",
+  "thats": "that's",
+  "that s": "that's",
+  "lets": "let's",
+  "let s": "let's",
+  "theres": "there's",
+  "there s": "there's",
+  "heres": "here's",
+  "here s": "here's",
+  "hows": "how's",
+  "how s": "how's",
+  "shes": "she's",
+  "she s": "she's",
+  "hes": "he's",
+  "he s": "he's",
+  "were": "we're",
+  "we re": "we're",
+  "weve": "we've",
+  "we ve": "we've",
+  "well": "we'll",
+  "we ll": "we'll",
+  "its": "it's",
+  "it s": "it's",
+  "itll": "it'll",
+  "it ll": "it'll",
+  "yall": "y'all",
+  "y all": "y'all",
+  "wouldve": "would've",
+  "would ve": "would've",
+  "couldve": "could've",
+  "could ve": "could've",
+  "shouldve": "should've",
+  "should ve": "should've",
+  "mightve": "might've",
+  "might ve": "might've",
+  "mustve": "must've",
+  "must ve": "must've",
+  "maam": "ma'am",
+  "ma am": "ma'am",
+  "oclock": "o'clock",
+  "o clock": "o'clock",
+  "aint": "ain't",
+  "ain t": "ain't",
+  "rocknroll": "rock 'n' roll",
+  "rock n roll": "rock 'n' roll"
+}
\ No newline at end of file
diff --git a/src/utils/artist_title_processing.py b/src/utils/artist_title_processing.py
index d305b37..7f966d3 100644
--- a/src/utils/artist_title_processing.py
+++ b/src/utils/artist_title_processing.py
@@ -119,14 +119,52 @@ def generate_title_variations(title: str) -> List[str]:
     if title_fixes != title.strip():
         search_titles.append(title_fixes)
     
-    apostrophe_fixes = [
-        (r'\bDont\b', "Don't"), (r'\bCant\b', "Can't"), (r'\bWont\b', "Won't"),
-        (r'\bArent\b', "Aren't"), (r'\bIsnt\b', "Isn't"), (r'\bWasnt\b', "Wasn't"),
-        (r'\bDidnt\b', "Didn't"), (r'\bDoesnt\b', "Doesn't"), (r'\bHavent\b', "Haven't"),
-        (r'\bHasnt\b', "Hasn't"), (r'\bWouldnt\b', "Wouldn't"), (r'\bCouldnt\b', "Couldn't"),
-        (r'\bShouldnt\b', "Shouldn't"), (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
+    # Load contraction fixes from the JSON data file (the loader returns {} when it cannot read it)
+    try:
+        from ..utils.data_loader import data_loader
+        contraction_fixes = data_loader.load_contraction_fixes()
+    except Exception:
+        contraction_fixes = {}
+    if not contraction_fixes:  # import/load failed or file was empty: use the built-in fixes
+        contraction_fixes = {
+            "dont": "don't", "don t": "don't", "cant": "can't", "can t": "can't",
+            "wont": "won't", "won t": "won't", "im": "I'm", "i m": "I'm",
+            "ive": "I've", "i ve": "I've", "id": "I'd", "i d": "I'd",
+            "ill": "I'll", "i ll": "I'll", "isnt": "isn't", "isn t": "isn't",
+            "arent": "aren't", "aren t": "aren't", "wasnt": "wasn't", "wasn t": "wasn't",
+            "werent": "weren't", "weren t": "weren't", "hasnt": "hasn't", "hasn t": "hasn't",
+            "havent": "haven't", "haven t": "haven't", "shouldnt": "shouldn't", "shouldn t": "shouldn't",
+            "couldnt": "couldn't", "couldn t": "couldn't", "wouldnt": "wouldn't", "wouldn t": "wouldn't",
+            "didnt": "didn't", "didn t": "didn't", "theyre": "they're", "they re": "they're",
+            "youre": "you're", "you re": "you're", "whos": "who's", "who s": "who's",
+            "whats": "what's", "what s": "what's", "thats": "that's", "that s": "that's",
+            "lets": "let's", "let s": "let's", "theres": "there's", "there s": "there's",
+            "heres": "here's", "here s": "here's", "hows": "how's", "how s": "how's",
+            "shes": "she's", "she s": "she's", "hes": "he's", "he s": "he's",
+            "were": "we're", "we re": "we're", "weve": "we've", "we ve": "we've",
+            "well": "we'll", "we ll": "we'll", "its": "it's", "it s": "it's",
+            "itll": "it'll", "it ll": "it'll", "yall": "y'all", "y all": "y'all",
+            "wouldve": "would've", "would ve": "would've", "couldve": "could've", "could ve": "could've",
+            "shouldve": "should've", "should ve": "should've", "mightve": "might've", "might ve": "might've",
+            "mustve": "must've", "must ve": "must've", "maam": "ma'am", "ma am": "ma'am",
+            "oclock": "o'clock", "o clock": "o'clock", "aint": "ain't", "ain t": "ain't",
+            "rocknroll": "rock 'n' roll", "rock n roll": "rock 'n' roll", "doesnt": "doesn't", "doesn t": "doesn't"
+        }
+    
+    # Apply contraction fixes; each title that changes is recorded as a further search variation
+    for broken_contraction, fixed_contraction in contraction_fixes.items():
+        pattern = r'\b' + re.escape(broken_contraction) + r'\b'  # word boundaries avoid partial matches
+        fixed_title = re.sub(pattern, fixed_contraction, title_fixes, flags=re.IGNORECASE)
+        if fixed_title != title_fixes:
+            title_fixes = fixed_title
+            if title_fixes not in search_titles:
+                search_titles.append(title_fixes)
+    
+    # Additional hardcoded fixes for edge cases
+    additional_fixes = [
+        (r'\bPhunk\b', "Funk"), (r'\bBout\b', "About")
     ]
-    for pattern, replacement in apostrophe_fixes:
+    for pattern, replacement in additional_fixes:
         fixed_title = re.sub(pattern, replacement, title_fixes, flags=re.IGNORECASE)
         if fixed_title != title_fixes:
             title_fixes = fixed_title
diff --git a/src/utils/data_loader.py b/src/utils/data_loader.py
index 9f2456f..8987423 100644
--- a/src/utils/data_loader.py
+++ b/src/utils/data_loader.py
@@ -65,6 +65,16 @@ class DataLoader:
         
         return self._known_recordings
     
+    def load_contraction_fixes(self) -> Dict[str, str]:
+        """Load the contraction-fix map from contraction_fixes.json; return {} if it is unreadable or invalid."""
+        contractions_file = self.data_dir / "contraction_fixes.json"
+        try:
+            with open(contractions_file, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (OSError, ValueError) as e:  # OSError: missing/unreadable file; ValueError: bad JSON or bad UTF-8
+            print(f"Warning: Could not load contraction fixes data: {e}")
+            return {}
+    
     def reload_data(self) -> None:
         """Reload data from files (useful for testing or updates)."""
         self._known_artists = None