From c9221a35b3660141fcec6def0611e0643edc28fe Mon Sep 17 00:00:00 2001
From: Matt Bruce <mbrucedogs@gmail.com>
Date: Fri, 1 Aug 2025 08:03:18 -0500
Subject: [PATCH] Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

---
 src/tests/README.md                     |  50 ++++++++-
 src/tests/debug_artist_search.py        |  40 +++++++
 src/tests/musicbrainz_cleaner.py        |  10 ++
 src/tests/process_full_dataset.py       |  42 ++++++++
 src/tests/quick_test_20.py              | 108 +++++++++++++++++++
 src/tests/test_100_random.py            | 133 ++++++++++++++++++++++++
 src/tests/test_collaboration_debug.py   |  43 ++++++++
 src/tests/test_failed_collaborations.py |  62 +++++++++++
 src/tests/test_simple_query.py          |  48 +++++++++
 src/tests/test_title_cleaning.py        |  50 +++++++++
 10 files changed, 585 insertions(+), 1 deletion(-)
 create mode 100644 src/tests/debug_artist_search.py
 create mode 100644 src/tests/musicbrainz_cleaner.py
 create mode 100644 src/tests/process_full_dataset.py
 create mode 100644 src/tests/quick_test_20.py
 create mode 100644 src/tests/test_100_random.py
 create mode 100644 src/tests/test_collaboration_debug.py
 create mode 100644 src/tests/test_failed_collaborations.py
 create mode 100644 src/tests/test_simple_query.py
 create mode 100644 src/tests/test_title_cleaning.py

diff --git a/src/tests/README.md b/src/tests/README.md
index 47c5d5f..0a6bbce 100644
--- a/src/tests/README.md
+++ b/src/tests/README.md
@@ -10,9 +10,24 @@ src/tests/
 ├── integration/    # Integration tests for database and API
 ├── debug/          # Debug scripts and troubleshooting tests
 ├── run_tests.py    # Test runner script
-└── README.md       # This file
+├── README.md       # This file
+├── process_full_dataset.py, musicbrainz_cleaner.py   # Legacy scripts moved from root directory
+└── test_*.py, debug_*.py, quick_test_20.py           # Test files moved from root directory
 ```
 
+### Legacy Scripts (Moved from Root)
+- `process_full_dataset.py` - Legacy script that redirects to new CLI
+- `musicbrainz_cleaner.py` - Legacy entry point script
+
+### Moved Test Files (Moved from Root)
+- `test_title_cleaning.py` - Test title cleaning functionality
+- `test_simple_query.py` - Test simple database queries
+- `debug_artist_search.py` - Debug artist search functionality
+- `test_failed_collaborations.py` - Test failed collaboration cases
+- `test_collaboration_debug.py` - Debug collaboration parsing
+- `test_100_random.py` - Test 100 random songs
+- `quick_test_20.py` - Quick test with 20 songs
+
 ## 🧪 Test Categories
 
 ### Unit Tests (`unit/`)
@@ -44,6 +59,39 @@ src/tests/
 python3 src/tests/run_tests.py
 ```
 
+### Running Moved Test Files
+
+The following test files were moved from the root directory to `src/tests/`:
+
+```bash
+# Run individual moved test files
+python3 src/tests/test_100_random.py
+python3 src/tests/quick_test_20.py
+python3 src/tests/test_title_cleaning.py
+python3 src/tests/test_simple_query.py
+python3 src/tests/debug_artist_search.py
+python3 src/tests/test_failed_collaborations.py
+python3 src/tests/test_collaboration_debug.py
+```
+
+### Running Legacy Scripts
+
+Legacy scripts that redirect to the new CLI:
+
+```bash
+# Legacy full dataset processing (redirects to CLI)
+python3 src/tests/process_full_dataset.py
+
+# Legacy entry point (redirects to CLI)
+python3 src/tests/musicbrainz_cleaner.py
+```
+
+**Note**: These legacy scripts are kept for backward compatibility but the new CLI is preferred:
+```bash
+# Preferred method (new CLI)
+docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main
+```
+
 ### Run Specific Test Categories
 ```bash
 # Run only unit tests
diff --git a/src/tests/debug_artist_search.py b/src/tests/debug_artist_search.py
new file mode 100644
index 0000000..018e85c
--- /dev/null
+++ b/src/tests/debug_artist_search.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""
+Debug script for individual artist searches
+"""
+
+import sys
+sys.path.insert(0, '/app')
+
+from src.api.database import MusicBrainzDatabase
+
+def test_artist_search():
+    """Look up a handful of artist names and print each fuzzy-match result."""
+    database = MusicBrainzDatabase()
+
+    # Main artists taken from the collaboration strings quoted beside them.
+    queries = (
+        "Kanye",  # From "ft Jamie Foxx West, Kanye"
+        "Kanye West",  # What it should be
+        "Ariana Grande",  # From "Ariana Grande, Normani, Nicki Minaj"
+        "SZA",  # From "SZA, Justin Bieber"
+    )
+
+    print("🔍 Testing Individual Artist Searches")
+    print("=" * 50)
+
+    for query in queries:
+        print(f"\n📝 Searching for: '{query}'")
+        try:
+            match = database.fuzzy_search_artist(query)
+            if not match:
+                print(f"  ❌ Not found")
+                continue
+            matched_name, mbid, score = match
+            print(f"  ✅ Found: '{matched_name}' (MBID: {mbid}, Score: {score})")
+        except Exception as e:
+            # Report the failure and move on to the next name.
+            print(f"  ❌ Error: {e}")
+
+if __name__ == "__main__":
+    test_artist_search() 
\ No newline at end of file
diff --git a/src/tests/musicbrainz_cleaner.py b/src/tests/musicbrainz_cleaner.py
new file mode 100644
index 0000000..a2f087b
--- /dev/null
+++ b/src/tests/musicbrainz_cleaner.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""MusicBrainz Data Cleaner - legacy entry point that delegates to src.cli.main."""
+import sys
+from pathlib import Path
+# Located in src/tests/: add the repository root (two levels up) so `src` is importable.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from src.cli.main import main
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/src/tests/process_full_dataset.py b/src/tests/process_full_dataset.py
new file mode 100644
index 0000000..ed0a7c1
--- /dev/null
+++ b/src/tests/process_full_dataset.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Legacy script for full dataset processing.
+This script now redirects to the new CLI command for better functionality.
+"""
+
+import sys
+import subprocess
+from pathlib import Path
+
+def main():
+    """Forward this legacy invocation to the new CLI via docker-compose.
+
+    Prints the equivalent command, then runs it from this script's directory
+    with any extra command-line arguments appended. Exits with status 1 when
+    the command fails or docker-compose cannot be found.
+    """
+    print("🔄 Redirecting to new CLI command...")
+    print("📝 Use: docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --process-full-dataset")
+    print()
+
+    # Fixed CLI invocation, followed by whatever arguments the caller supplied.
+    base_command = [
+        "docker-compose", "run", "--rm", "musicbrainz-cleaner",
+        "python3", "-m", "src.cli.main", "--process-full-dataset",
+    ]
+    command = base_command + sys.argv[1:]
+    working_dir = Path(__file__).parent
+
+    try:
+        subprocess.run(command, cwd=working_dir, check=True)
+    except FileNotFoundError:
+        print("❌ Error: docker-compose not found. Make sure Docker is running.")
+        print("💡 Alternative: Run directly with:")
+        print("   python3 -m src.cli.main --process-full-dataset")
+        sys.exit(1)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running command: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main() 
\ No newline at end of file
diff --git a/src/tests/quick_test_20.py b/src/tests/quick_test_20.py
new file mode 100644
index 0000000..ed6e1d2
--- /dev/null
+++ b/src/tests/quick_test_20.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Quick test script for 20 random songs
+Simple single-threaded approach
+"""
+
+import sys
+import json
+import time
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, '/app')
+from src.cli.main import MusicBrainzCleaner
+
+def main():
+    """Clean up to 20 random songs and print artist/recording match statistics."""
+    import random
+
+    print('🚀 Starting quick test with 20 random songs...')
+
+    # Load songs
+    input_file = Path('data/songs.json')
+    if not input_file.exists():
+        print('❌ songs.json not found')
+        return
+
+    with open(input_file, 'r', encoding='utf-8') as f:
+        all_songs = json.load(f)
+
+    print(f'📊 Total songs available: {len(all_songs):,}')
+
+    # random.sample raises ValueError when asked for more items than exist,
+    # so cap the sample at the size of the dataset.
+    total = min(20, len(all_songs))
+    if total == 0:
+        print('❌ songs.json contains no songs')
+        return
+    sample_songs = random.sample(all_songs, total)
+    print(f'🎯 Testing {total} random songs...')
+
+    # Initialize cleaner
+    cleaner = MusicBrainzCleaner()
+
+    # Process songs
+    found_artists = 0
+    found_recordings = 0
+    failed_songs = []
+    start_time = time.time()
+
+    for i, song in enumerate(sample_songs, 1):
+        artist_name = song.get('artist', 'Unknown')
+        title = song.get('title', 'Unknown')
+        print(f'  [{i:2d}/{total}] Processing: "{artist_name}" - "{title}"')
+        # Fields shared by every failure record, whatever the cause.
+        failure = {'original': song, 'artist_name': artist_name, 'title': title}
+        try:
+            # clean_song returns (song_dict, success_boolean); the MBID keys
+            # live in the dict, so unpack before testing membership.
+            cleaned_song, success = cleaner.clean_song(song)
+            artist_found = 'mbid' in cleaned_song
+            recording_found = 'recording_mbid' in cleaned_song
+        except Exception as e:
+            print(f'      💥 Error: {e}')
+            failed_songs.append({**failure, 'cleaned': {'error': str(e)}, 'error': str(e),
+                                 'artist_found': False, 'recording_found': False})
+            continue
+
+        # Count each lookup on its own so a partial match is still reported.
+        if artist_found:
+            found_artists += 1
+        if recording_found:
+            found_recordings += 1
+
+        if artist_found and recording_found:
+            print('      ✅ Found both artist and recording')
+        else:
+            failed_songs.append({**failure, 'cleaned': cleaned_song, 'success': success,
+                                 'artist_found': artist_found, 'recording_found': recording_found})
+            print(f'      ❌ Artist: {artist_found}, Recording: {recording_found}')
+
+    # Clamp so the speed figure cannot divide by zero on a coarse clock.
+    processing_time = max(time.time() - start_time, 1e-9)
+
+    # Calculate success rates
+    artist_success_rate = found_artists / total * 100
+    recording_success_rate = found_recordings / total * 100
+    failed_rate = len(failed_songs) / total * 100
+
+    print(f'\n📊 Final Results:')
+    print(f'  ⏱️  Processing time: {processing_time:.2f} seconds')
+    print(f'  🚀 Speed: {total/processing_time:.1f} songs/second')
+    print(f'  ✅ Artists found: {found_artists}/{total} ({artist_success_rate:.1f}%)')
+    print(f'  ✅ Recordings found: {found_recordings}/{total} ({recording_success_rate:.1f}%)')
+    print(f'  ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')
+
+    # Show failed songs
+    if failed_songs:
+        print(f'\n🔍 Failed songs:')
+        for i, failed in enumerate(failed_songs, 1):
+            print(f'  [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
+            print(f'      Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
+            if 'error' in failed:
+                print(f'      Error: {failed["error"]}')
+    else:
+        print('\n🎉 All songs processed successfully!')
+if __name__ == '__main__':
+    main() 
\ No newline at end of file
diff --git a/src/tests/test_100_random.py b/src/tests/test_100_random.py
new file mode 100644
index 0000000..101fae6
--- /dev/null
+++ b/src/tests/test_100_random.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+Test script for 100 random songs
+Simple single-threaded approach
+"""
+
+import sys
+import json
+import time
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, '/app')
+from src.cli.main import MusicBrainzCleaner
+
+def main():
+    """Clean up to 100 random songs, print match statistics and save a failure report."""
+    import random
+
+    print('🚀 Starting test with 100 random songs...')
+
+    # Load songs
+    input_file = Path('data/songs.json')
+    if not input_file.exists():
+        print('❌ songs.json not found')
+        return
+
+    with open(input_file, 'r', encoding='utf-8') as f:
+        all_songs = json.load(f)
+
+    print(f'📊 Total songs available: {len(all_songs):,}')
+
+    # random.sample raises ValueError when asked for more items than exist,
+    # so cap the sample at the size of the dataset.
+    total = min(100, len(all_songs))
+    if total == 0:
+        print('❌ songs.json contains no songs')
+        return
+    sample_songs = random.sample(all_songs, total)
+    print(f'🎯 Testing {total} random songs...')
+
+    cleaner = MusicBrainzCleaner()
+
+    # Process songs
+    found_artists = 0
+    found_recordings = 0
+    failed_songs = []
+    start_time = time.time()
+
+    for i, song in enumerate(sample_songs, 1):
+        print(f'  [{i:3d}/{total}] Processing: "{song.get("artist", "Unknown")}" - "{song.get("title", "Unknown")}"')
+        try:
+            # Extract the cleaned song from the tuple (song_dict, success_boolean)
+            cleaned_song, success = cleaner.clean_song(song)
+            artist_found = 'mbid' in cleaned_song
+            recording_found = 'recording_mbid' in cleaned_song
+            # Count each lookup on its own so a partial match is still reported.
+            if artist_found:
+                found_artists += 1
+            if recording_found:
+                found_recordings += 1
+
+            if artist_found and recording_found:
+                print(f'      ✅ Found both artist and recording')
+            else:
+                failed_songs.append({
+                    'original': song,
+                    'cleaned': cleaned_song,
+                    'success': success,
+                    'artist_found': artist_found,
+                    'recording_found': recording_found,
+                    'artist_name': song.get('artist', 'Unknown'),
+                    'title': song.get('title', 'Unknown')
+                })
+                print(f'      ❌ Artist: {artist_found}, Recording: {recording_found}')
+        except Exception as e:
+            print(f'      💥 Error: {e}')
+            failed_songs.append({
+                'original': song,
+                'cleaned': {'error': str(e)},
+                'artist_found': False,
+                'recording_found': False,
+                'artist_name': song.get('artist', 'Unknown'),
+                'title': song.get('title', 'Unknown'),
+                'error': str(e)
+            })
+
+    # Clamp so the speed figures cannot divide by zero on a coarse clock.
+    processing_time = max(time.time() - start_time, 1e-9)
+
+    # Calculate success rates
+    artist_success_rate = found_artists / total * 100
+    recording_success_rate = found_recordings / total * 100
+    failed_rate = len(failed_songs) / total * 100
+
+    print(f'\n📊 Final Results:')
+    print(f'  ⏱️  Processing time: {processing_time:.2f} seconds')
+    print(f'  🚀 Speed: {total/processing_time:.1f} songs/second')
+    print(f'  ✅ Artists found: {found_artists}/{total} ({artist_success_rate:.1f}%)')
+    print(f'  ✅ Recordings found: {found_recordings}/{total} ({recording_success_rate:.1f}%)')
+    print(f'  ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')
+
+    # Save detailed report
+    if failed_songs:
+        report_file = 'data/test_100_results.json'
+        report_data = {
+            'test_summary': {
+                'total_tested': total,
+                'artists_found': found_artists,
+                'recordings_found': found_recordings,
+                'failed_count': len(failed_songs),
+                'artist_success_rate': artist_success_rate,
+                'recording_success_rate': recording_success_rate,
+                'processing_time_seconds': processing_time,
+                'songs_per_second': total/processing_time
+            },
+            'failed_songs': failed_songs
+        }
+        with open(report_file, 'w', encoding='utf-8') as f:
+            json.dump(report_data, f, indent=2)
+        print(f'\n📄 Detailed report saved to: {report_file}')
+        # Show examples of failed songs
+        print(f'\n🔍 Examples of failed songs:')
+        for i, failed in enumerate(failed_songs[:10], 1):
+            print(f'  [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
+            print(f'      Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
+            if 'error' in failed:
+                print(f'      Error: {failed["error"]}')
+    else:
+        print('\n🎉 All songs processed successfully!')
+
+if __name__ == '__main__':
+    main() 
\ No newline at end of file
diff --git a/src/tests/test_collaboration_debug.py b/src/tests/test_collaboration_debug.py
new file mode 100644
index 0000000..41e7a2c
--- /dev/null
+++ b/src/tests/test_collaboration_debug.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Debug script for collaboration parsing issues
+"""
+
+import sys
+sys.path.insert(0, '/app')
+
+from src.api.database import MusicBrainzDatabase
+
+def test_collaboration_parsing():
+    """Print how each sample artist string splits into main artist and collaborators."""
+    database = MusicBrainzDatabase()
+
+    # Test cases from the failed songs
+    artist_strings = (
+        "ft Jamie Foxx West, Kanye",
+        "Ariana Grande, Normani, Nicki Minaj",
+        "SZA, Justin Bieber",
+        "Will.I.Am ft. Britney Spears",  # This should work now
+        "Florida Georgia Line Ft Luke Bryan",  # This should work now
+    )
+
+    print("🔍 Testing Collaboration Parsing")
+    print("=" * 50)
+
+    for raw in artist_strings:
+        print(f"\n📝 Input: '{raw}'")
+
+        try:
+            parsed = database._parse_complex_collaboration(raw)
+            main_artist, collaborators = parsed
+            print(f"  ✅ Main artist: '{main_artist}'")
+            print(f"  ✅ Collaborators: {collaborators}")
+            verdict = ("  🎯 Parsed as collaboration" if collaborators
+                       else "  ❌ Not detected as collaboration")
+            print(verdict)
+        except Exception as e:
+            # Report the failure and carry on with the next input.
+            print(f"  ❌ Error: {e}")
+
+if __name__ == "__main__":
+    test_collaboration_parsing() 
\ No newline at end of file
diff --git a/src/tests/test_failed_collaborations.py b/src/tests/test_failed_collaborations.py
new file mode 100644
index 0000000..d18d3ce
--- /dev/null
+++ b/src/tests/test_failed_collaborations.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""
+Test script for the specific failed collaboration songs
+"""
+
+import sys
+import json
+from pathlib import Path
+
+# Add the src directory to the path
+sys.path.insert(0, '/app')
+from src.cli.main import MusicBrainzCleaner
+
+def main():
+    """Re-run the cleaner on three collaboration songs and print what was matched.
+
+    For every song, reports whether the cleaned result carries an artist MBID
+    (``mbid``) and a recording MBID (``recording_mbid``).
+    """
+    print('🔍 Testing Failed Collaboration Songs...')
+
+    # (artist, title) pairs for the songs that failed earlier.
+    pairs = (
+        ("ft Jamie Foxx West, Kanye", "Gold Digger"),
+        ("Ariana Grande, Normani, Nicki Minaj", "Bad To You"),
+        ("SZA, Justin Bieber", "Snooze (Acoustic)"),
+    )
+    failed_songs = [{"artist": artist, "title": title} for artist, title in pairs]
+
+    # Initialize cleaner
+    cleaner = MusicBrainzCleaner()
+
+    total = len(failed_songs)
+    print(f'🎯 Testing {total} collaboration songs...')
+
+    for position, song in enumerate(failed_songs, 1):
+        print(f'\n  [{position}/{total}] Processing: "{song["artist"]}" - "{song["title"]}"')
+
+        try:
+            # Only the cleaned dict is inspected; the success flag is not used here.
+            cleaned_song, _success = cleaner.clean_song(song)
+            has_artist = 'mbid' in cleaned_song
+            has_recording = 'recording_mbid' in cleaned_song
+
+            # No artist MBID: reported as a miss regardless of the recording lookup.
+            if not has_artist:
+                print(f'      ❌ Neither artist nor recording found')
+                continue
+
+            if has_recording:
+                print(f'      ✅ Found both artist and recording')
+                print(f'      🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})')
+                print(f'      🎯 Recording: {cleaned_song.get("recording_mbid", "None")}')
+            else:
+                print(f'      ⚠️  Artist found but recording not found')
+                print(f'      🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})')
+        except Exception as e:
+            # Report the failure and move on to the next song.
+            print(f'      ❌ Error: {e}')
+
+if __name__ == "__main__":
+    main() 
\ No newline at end of file
diff --git a/src/tests/test_simple_query.py b/src/tests/test_simple_query.py
new file mode 100644
index 0000000..14efdaa
--- /dev/null
+++ b/src/tests/test_simple_query.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Simple database query test
+"""
+
+import sys
+sys.path.insert(0, '/app')
+
+from src.api.database import MusicBrainzDatabase
+
+def test_simple_query():
+    """Run a few direct SQL queries as a database smoke test.
+
+    Connects, prints the number of rows in ``artist``, then looks up
+    "Ariana Grande" and "SZA" by exact name and prints name and MBID.
+    """
+    database = MusicBrainzDatabase()
+
+    connected = database.connect()
+    if not connected:
+        print("❌ Could not connect to database")
+        return
+
+    print("✅ Connected to database")
+
+    try:
+        # Test a simple query; the row is read by column name ('count').
+        database.cursor.execute("SELECT COUNT(*) FROM artist")
+        count_row = database.cursor.fetchone()
+        print(f"📊 Total artists in database: {count_row['count']:,}")
+
+        # Test specific artist queries
+        for name in ("Ariana Grande", "SZA"):
+            database.cursor.execute("SELECT name, gid FROM artist WHERE name = %s LIMIT 1", (name,))
+            row = database.cursor.fetchone()
+            if row:
+                print(f"✅ Found {name}: {row['name']} (MBID: {row['gid']})")
+            else:
+                print(f"❌ {name} not found")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        # Print the full traceback as well to help with debugging.
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    test_simple_query() 
\ No newline at end of file
diff --git a/src/tests/test_title_cleaning.py b/src/tests/test_title_cleaning.py
new file mode 100644
index 0000000..97b0ed9
--- /dev/null
+++ b/src/tests/test_title_cleaning.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""
+Test script for improved title cleaning
+"""
+
+import sys
+sys.path.insert(0, '/app')
+
+from src.api.database import MusicBrainzDatabase
+
+def test_title_cleaning():
+    """Print the title variations generated for a fixed list of sample titles."""
+    database = MusicBrainzDatabase()
+
+    # Test cases from the failed songs
+    sample_titles = (
+        "Do I Wanna Know? - Live At the BBC",
+        "All The Small Things (John Lewis Christmas Ad 2022)",
+        "I Don t F  k With You",
+        "Por Mujeres Como Tu",
+        "Thought You Should Know (Without Backing Vocals)",
+        "It Might Be You (from the movie Tootsie)",
+        "Speedy Gonzales (Boone & Speedy Vocals)",
+        "I'm Telling You Now (Two Semitones Down)",
+        "The ELO Medley 1",
+        "Can't Fight This Feeling (Minus Piano)",
+        "The Look Of Love",
+        "Revolution (Without Backing Vocals)",
+        "Right Here, Right Now (My Heart Belongs to You)",
+        "Hush Hush",
+        "On The Floor",
+        "(I've Had) The Time Of My Life",
+    )
+
+    print("🔍 Testing Improved Title Cleaning")
+    print("=" * 50)
+
+    for original_title in sample_titles:
+        print(f"\n📝 Original: '{original_title}'")
+
+        try:
+            cleaned = database._generate_title_variations(original_title)
+            print(f"  🧹 Cleaned variations ({len(cleaned)}):")
+            for number, text in enumerate(cleaned, 1):
+                print(f"    {number}. '{text}'")
+        except Exception as e:
+            print(f"  ❌ Error: {e}")
+
+if __name__ == "__main__":
+    test_title_cleaning() 
\ No newline at end of file