Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

2025-08-01 08:03:18 -05:00 · 2025-08-01 08:03:18 -05:00 · c9221a35b3
commit c9221a35b3
parent 9124640bf4
10 changed files with 585 additions and 1 deletions
--- a/src/tests/README.md
+++ b/src/tests/README.md
@ -10,9 +10,24 @@ src/tests/
 ├── integration/    # Integration tests for database and API
 ├── debug/          # Debug scripts and troubleshooting tests
 ├── run_tests.py    # Test runner script
-└── README.md       # This file
+├── README.md       # This file
 ├── legacy/         # Legacy scripts moved from root directory
 └── moved/          # Test files moved from root directory
 ```
 ### Legacy Scripts (Moved from Root)
 - `process_full_dataset.py` - Legacy script that redirects to new CLI
 - `musicbrainz_cleaner.py` - Legacy entry point script
 ### Moved Test Files (Moved from Root)
 - `test_title_cleaning.py` - Test title cleaning functionality
 - `test_simple_query.py` - Test simple database queries
 - `debug_artist_search.py` - Debug artist search functionality
 - `test_failed_collaborations.py` - Test failed collaboration cases
 - `test_collaboration_debug.py` - Debug collaboration parsing
 - `test_100_random.py` - Test 100 random songs
 - `quick_test_20.py` - Quick test with 20 songs
 ## 🧪 Test Categories
 ### Unit Tests (`unit/`)
@ -44,6 +59,39 @@ src/tests/
 python3 src/tests/run_tests.py
 ```
 ### Running Moved Test Files
 The following test files were moved from the root directory to `src/tests/`:
 ```bash
 # Run individual moved test files
 python3 src/tests/test_100_random.py
 python3 src/tests/quick_test_20.py
 python3 src/tests/test_title_cleaning.py
 python3 src/tests/test_simple_query.py
 python3 src/tests/debug_artist_search.py
 python3 src/tests/test_failed_collaborations.py
 python3 src/tests/test_collaboration_debug.py
 ```
 ### Running Legacy Scripts
 Legacy scripts that redirect to the new CLI:
 ```bash
 # Legacy full dataset processing (redirects to CLI)
 python3 src/tests/process_full_dataset.py
 # Legacy entry point (redirects to CLI)
 python3 src/tests/musicbrainz_cleaner.py
 ```
 **Note**: These legacy scripts are kept for backward compatibility, but the new CLI is preferred:
 ```bash
 # Preferred method (new CLI)
 docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main
 ```
 ### Run Specific Test Categories
 ```bash
 # Run only unit tests
--- a/src/tests/debug_artist_search.py
+++ b/src/tests/debug_artist_search.py
@ -0,0 +1,40 @@
 #!/usr/bin/env python3
 """
 Debug script for individual artist searches
 """
 import sys
 sys.path.insert(0, '/app')
 from src.api.database import MusicBrainzDatabase
 def test_artist_search():
    """Look up a handful of artist names and print each fuzzy-match result.

    Every name is passed to ``MusicBrainzDatabase.fuzzy_search_artist``. A
    truthy result is unpacked as ``(matched_name, mbid, score)`` and printed;
    a falsy result is reported as not found. An exception raised for one
    name is printed and the loop continues with the next name.
    """
    database = MusicBrainzDatabase()

    # Main artists taken from the collaboration strings being debugged.
    names_to_check = (
        "Kanye",  # From "ft Jamie Foxx West, Kanye"
        "Kanye West",  # What it should be
        "Ariana Grande",  # From "Ariana Grande, Normani, Nicki Minaj"
        "SZA",  # From "SZA, Justin Bieber"
    )

    print("🔍 Testing Individual Artist Searches")
    print("=" * 50)

    for name in names_to_check:
        print(f"\n📝 Searching for: '{name}'")
        try:
            match = database.fuzzy_search_artist(name)
            if not match:
                print("  ❌ Not found")
                continue
            matched_name, mbid, score = match
            print(f"  ✅ Found: '{matched_name}' (MBID: {mbid}, Score: {score})")
        except Exception as exc:
            print(f"  ❌ Error: {exc}")


 if __name__ == "__main__":
    test_artist_search()
--- a/src/tests/musicbrainz_cleaner.py
+++ b/src/tests/musicbrainz_cleaner.py
@ -0,0 +1,10 @@
 #!/usr/bin/env python3
 """
 MusicBrainz Data Cleaner - Entry Point
 Simple entry point that imports from the refactored src structure
 """
 from src.cli.main import main
 if __name__ == "__main__":
    # Raise SystemExit directly instead of calling exit(): the exit() helper
    # is installed by the site module for interactive sessions and is not
    # available when Python is started with -S. SystemExit is always a
    # builtin and carries main()'s return value as the process exit status.
    raise SystemExit(main())
--- a/src/tests/process_full_dataset.py
+++ b/src/tests/process_full_dataset.py
@ -0,0 +1,42 @@
 #!/usr/bin/env python3
 """
 Legacy script for full dataset processing.
 This script now redirects to the new CLI command for better functionality.
 """
 import sys
 import subprocess
 from pathlib import Path
 def main():
    """Forward this legacy invocation to the new CLI through docker-compose.

    Prints a notice showing the replacement command, then runs that command
    with this script's directory as the working directory. Any arguments
    given to this script are appended to the command unchanged.

    Exits with status 1 when the command returns a non-zero status or when
    a ``FileNotFoundError`` is raised while launching it (reported to the
    user as docker-compose being missing).
    """
    print("🔄 Redirecting to new CLI command...")
    print("📝 Use: docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --process-full-dataset")
    print()

    # The command is launched from the directory that holds this script.
    launch_dir = Path(__file__).parent

    base_command = [
        "docker-compose", "run", "--rm", "musicbrainz-cleaner",
        "python3", "-m", "src.cli.main", "--process-full-dataset",
    ]
    # Everything after the script name is passed straight through to the CLI.
    command = base_command + sys.argv[1:]

    try:
        subprocess.run(command, cwd=launch_dir, check=True)
    except FileNotFoundError:
        print("❌ Error: docker-compose not found. Make sure Docker is running.")
        print("💡 Alternative: Run directly with:")
        print("   python3 -m src.cli.main --process-full-dataset")
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"❌ Error running command: {failure}")
        sys.exit(1)


 if __name__ == "__main__":
    main()
--- a/src/tests/quick_test_20.py
+++ b/src/tests/quick_test_20.py
@ -0,0 +1,108 @@
 #!/usr/bin/env python3
 """
 Quick test script for 20 random songs
 Simple single-threaded approach
 """
 import sys
 import json
 import time
 from pathlib import Path
 # Add the src directory to the path
 sys.path.insert(0, '/app')
 from src.cli.main import MusicBrainzCleaner
 def main():
    """Run the cleaner on up to 20 random songs and print a summary.

    Loads ``data/songs.json``, draws a random sample, passes each song to
    ``MusicBrainzCleaner.clean_song`` and prints how many artists and
    recordings were matched, followed by every song that was not fully
    matched. Returns ``None``; all results go to stdout.

    Returns early (after printing a message) when the input file is missing
    or contains no songs.
    """
    import random

    print('🚀 Starting quick test with 20 random songs...')

    # Load songs
    input_file = Path('data/songs.json')
    if not input_file.exists():
        print('❌ songs.json not found')
        return

    with open(input_file, 'r', encoding='utf-8') as f:
        all_songs = json.load(f)

    print(f'📊 Total songs available: {len(all_songs):,}')

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample at what is actually available.
    sample_size = min(20, len(all_songs))
    if sample_size == 0:
        print('❌ No songs available to test')
        return
    sample_songs = random.sample(all_songs, sample_size)
    print(f'🎯 Testing {sample_size} random songs...')

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    # Process songs
    found_artists = 0
    found_recordings = 0
    failed_songs = []

    start_time = time.time()

    for i, song in enumerate(sample_songs, 1):
        artist_name = song.get('artist', 'Unknown')
        title = song.get('title', 'Unknown')
        print(f'  [{i:2d}/{sample_size}] Processing: "{artist_name}" - "{title}"')

        try:
            result = cleaner.clean_song(song)
            # The sibling test scripts unpack clean_song() as a
            # (song_dict, success) tuple. A membership test on that tuple
            # is always False, so take the dict out first; a bare dict is
            # still accepted. NOTE(review): confirm against clean_song.
            cleaned_song = result[0] if isinstance(result, tuple) else result

            artist_found = 'mbid' in cleaned_song
            recording_found = 'recording_mbid' in cleaned_song

            # Count each kind of match on its own so the artist total is
            # not lowered by songs where only the recording was missing.
            if artist_found:
                found_artists += 1
            if recording_found:
                found_recordings += 1

            if artist_found and recording_found:
                print('      ✅ Found both artist and recording')
            else:
                failed_songs.append({
                    'original': song,
                    'cleaned': cleaned_song,
                    'artist_found': artist_found,
                    'recording_found': recording_found,
                    'artist_name': artist_name,
                    'title': title,
                })
                print(f'      ❌ Artist: {artist_found}, Recording: {recording_found}')

        except Exception as e:
            # Best-effort run: record the failure and keep going.
            print(f'      💥 Error: {e}')
            failed_songs.append({
                'original': song,
                'cleaned': {'error': str(e)},
                'artist_found': False,
                'recording_found': False,
                'artist_name': artist_name,
                'title': title,
                'error': str(e),
            })

    processing_time = time.time() - start_time

    # Calculate success rates
    artist_success_rate = found_artists / sample_size * 100
    recording_success_rate = found_recordings / sample_size * 100
    failed_rate = len(failed_songs) / sample_size * 100
    # A zero elapsed time (coarse clock) would otherwise divide by zero.
    songs_per_second = sample_size / processing_time if processing_time > 0 else 0.0

    print('\n📊 Final Results:')
    print(f'  ⏱️  Processing time: {processing_time:.2f} seconds')
    print(f'  🚀 Speed: {songs_per_second:.1f} songs/second')
    print(f'  ✅ Artists found: {found_artists}/{sample_size} ({artist_success_rate:.1f}%)')
    print(f'  ✅ Recordings found: {found_recordings}/{sample_size} ({recording_success_rate:.1f}%)')
    print(f'  ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')

    # Show failed songs
    if failed_songs:
        print('\n🔍 Failed songs:')
        for i, failed in enumerate(failed_songs, 1):
            print(f'  [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
            print(f'      Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
            if 'error' in failed:
                print(f'      Error: {failed["error"]}')
    else:
        print('\n🎉 All songs processed successfully!')


 if __name__ == '__main__':
    main()
--- a/src/tests/test_100_random.py
+++ b/src/tests/test_100_random.py
@ -0,0 +1,133 @@
 #!/usr/bin/env python3
 """
 Test script for 100 random songs
 Simple single-threaded approach
 """
 import sys
 import json
 import time
 from pathlib import Path
 # Add the src directory to the path
 sys.path.insert(0, '/app')
 from src.cli.main import MusicBrainzCleaner
 def main():
    """Run the cleaner on up to 100 random songs and report the results.

    Loads ``data/songs.json``, draws a random sample, passes each song to
    ``MusicBrainzCleaner.clean_song`` (unpacked as ``(song_dict, success)``)
    and prints how many artists and recordings were matched. When at least
    one song was not fully matched, a JSON report is written to
    ``data/test_100_results.json`` and the first ten failures are printed.

    Returns ``None``. Returns early (after printing a message) when the
    input file is missing or contains no songs.
    """
    import random

    print('🚀 Starting test with 100 random songs...')

    # Load songs
    input_file = Path('data/songs.json')
    if not input_file.exists():
        print('❌ songs.json not found')
        return

    with open(input_file, 'r', encoding='utf-8') as f:
        all_songs = json.load(f)

    print(f'📊 Total songs available: {len(all_songs):,}')

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample at what is actually available.
    sample_size = min(100, len(all_songs))
    if sample_size == 0:
        print('❌ No songs available to test')
        return
    sample_songs = random.sample(all_songs, sample_size)
    print(f'🎯 Testing {sample_size} random songs...')

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    # Process songs
    found_artists = 0
    found_recordings = 0
    failed_songs = []

    start_time = time.time()

    for i, song in enumerate(sample_songs, 1):
        artist_name = song.get('artist', 'Unknown')
        title = song.get('title', 'Unknown')
        print(f'  [{i:3d}/{sample_size}] Processing: "{artist_name}" - "{title}"')

        try:
            # clean_song returns a (song_dict, success_boolean) tuple.
            cleaned_song, success = cleaner.clean_song(song)

            artist_found = 'mbid' in cleaned_song
            recording_found = 'recording_mbid' in cleaned_song

            # Count each kind of match on its own so the artist total is
            # not lowered by songs where only the recording was missing.
            if artist_found:
                found_artists += 1
            if recording_found:
                found_recordings += 1

            if artist_found and recording_found:
                print('      ✅ Found both artist and recording')
            else:
                failed_songs.append({
                    'original': song,
                    'cleaned': cleaned_song,
                    'success': success,
                    'artist_found': artist_found,
                    'recording_found': recording_found,
                    'artist_name': artist_name,
                    'title': title,
                })
                print(f'      ❌ Artist: {artist_found}, Recording: {recording_found}')

        except Exception as e:
            # Best-effort run: record the failure and keep going.
            print(f'      💥 Error: {e}')
            failed_songs.append({
                'original': song,
                'cleaned': {'error': str(e)},
                'artist_found': False,
                'recording_found': False,
                'artist_name': artist_name,
                'title': title,
                'error': str(e),
            })

    processing_time = time.time() - start_time

    # Calculate success rates
    artist_success_rate = found_artists / sample_size * 100
    recording_success_rate = found_recordings / sample_size * 100
    failed_rate = len(failed_songs) / sample_size * 100
    # A zero elapsed time (coarse clock) would otherwise divide by zero;
    # 0.0 also keeps the JSON report free of non-standard values.
    songs_per_second = sample_size / processing_time if processing_time > 0 else 0.0

    print('\n📊 Final Results:')
    print(f'  ⏱️  Processing time: {processing_time:.2f} seconds')
    print(f'  🚀 Speed: {songs_per_second:.1f} songs/second')
    print(f'  ✅ Artists found: {found_artists}/{sample_size} ({artist_success_rate:.1f}%)')
    print(f'  ✅ Recordings found: {found_recordings}/{sample_size} ({recording_success_rate:.1f}%)')
    print(f'  ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')

    # Save detailed report
    if failed_songs:
        report_file = 'data/test_100_results.json'
        report_data = {
            'test_summary': {
                'total_tested': sample_size,
                'artists_found': found_artists,
                'recordings_found': found_recordings,
                'failed_count': len(failed_songs),
                'artist_success_rate': artist_success_rate,
                'recording_success_rate': recording_success_rate,
                'processing_time_seconds': processing_time,
                'songs_per_second': songs_per_second,
            },
            'failed_songs': failed_songs,
        }

        with open(report_file, 'w') as f:
            json.dump(report_data, f, indent=2)

        print(f'\n📄 Detailed report saved to: {report_file}')

        # Show examples of failed songs
        print('\n🔍 Examples of failed songs:')
        for i, failed in enumerate(failed_songs[:10], 1):
            print(f'  [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
            print(f'      Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
            if 'error' in failed:
                print(f'      Error: {failed["error"]}')
    else:
        print('\n🎉 All songs processed successfully!')


 if __name__ == '__main__':
    main()
--- a/src/tests/test_collaboration_debug.py
+++ b/src/tests/test_collaboration_debug.py
@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 """
 Debug script for collaboration parsing issues
 """
 import sys
 sys.path.insert(0, '/app')
 from src.api.database import MusicBrainzDatabase
 def test_collaboration_parsing():
    """Print how each sample artist string is split by the collaboration parser.

    Every string is handed to
    ``MusicBrainzDatabase._parse_complex_collaboration`` and the result is
    unpacked as ``(main_artist, collaborators)``. A truthy ``collaborators``
    value is reported as a detected collaboration. An exception raised for
    one input is printed and the loop continues with the next input.
    """
    database = MusicBrainzDatabase()

    # Artist strings taken from the songs that previously failed.
    artist_strings = (
        "ft Jamie Foxx West, Kanye",
        "Ariana Grande, Normani, Nicki Minaj",
        "SZA, Justin Bieber",
        "Will.I.Am ft. Britney Spears",  # This should work now
        "Florida Georgia Line Ft Luke Bryan",  # This should work now
    )

    print("🔍 Testing Collaboration Parsing")
    print("=" * 50)

    for raw_artist in artist_strings:
        print(f"\n📝 Input: '{raw_artist}'")
        try:
            lead, featured = database._parse_complex_collaboration(raw_artist)
            print(f"  ✅ Main artist: '{lead}'")
            print(f"  ✅ Collaborators: {featured}")
            verdict = (
                "  🎯 Parsed as collaboration"
                if featured
                else "  ❌ Not detected as collaboration"
            )
            print(verdict)
        except Exception as exc:
            print(f"  ❌ Error: {exc}")


 if __name__ == "__main__":
    test_collaboration_parsing()
--- a/src/tests/test_failed_collaborations.py
+++ b/src/tests/test_failed_collaborations.py
@ -0,0 +1,62 @@
 #!/usr/bin/env python3
 """
 Test script for the specific failed collaboration songs
 """
 import sys
 import json
 from pathlib import Path
 # Add the src directory to the path
 sys.path.insert(0, '/app')
 from src.cli.main import MusicBrainzCleaner
 def main():
    """Run the cleaner on three known-problematic collaboration songs.

    Each song dict is passed to ``MusicBrainzCleaner.clean_song``, whose
    result is unpacked as ``(cleaned_song, success)``. The presence of the
    ``mbid`` and ``recording_mbid`` keys in ``cleaned_song`` decides which
    outcome is printed. An exception raised for one song is printed and the
    loop continues with the next song. Returns ``None``.
    """
    print('🔍 Testing Failed Collaboration Songs...')

    # Create test songs from the failed ones
    failed_songs = [
        {
            "artist": "ft Jamie Foxx West, Kanye",
            "title": "Gold Digger"
        },
        {
            "artist": "Ariana Grande, Normani, Nicki Minaj",
            "title": "Bad To You"
        },
        {
            "artist": "SZA, Justin Bieber",
            "title": "Snooze (Acoustic)"
        }
    ]

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    print(f'🎯 Testing {len(failed_songs)} collaboration songs...')

    for i, song in enumerate(failed_songs, 1):
        print(f'\n  [{i}/{len(failed_songs)}] Processing: "{song["artist"]}" - "{song["title"]}"')

        try:
            cleaned_song, success = cleaner.clean_song(song)

            artist_found = 'mbid' in cleaned_song
            recording_found = 'recording_mbid' in cleaned_song

            if artist_found and recording_found:
                print('      ✅ Found both artist and recording')
                print(f'      🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})')
                print(f'      🎯 Recording: {cleaned_song.get("recording_mbid", "None")}')
            elif artist_found:
                print('      ⚠️  Artist found but recording not found')
                print(f'      🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})')
            elif recording_found:
                # Without this branch a recording-only match would be
                # reported as "neither found", which is untrue.
                print('      ⚠️  Recording found but artist not found')
                print(f'      🎯 Recording: {cleaned_song.get("recording_mbid", "None")}')
            else:
                print('      ❌ Neither artist nor recording found')

        except Exception as e:
            print(f'      ❌ Error: {e}')


 if __name__ == "__main__":
    main()
--- a/src/tests/test_simple_query.py
+++ b/src/tests/test_simple_query.py
@ -0,0 +1,48 @@
 #!/usr/bin/env python3
 """
 Simple database query test
 """
 import sys
 sys.path.insert(0, '/app')
 from src.api.database import MusicBrainzDatabase
 def test_simple_query():
    """Connect to the database and run a few sanity queries on ``artist``.

    Prints the row count of the ``artist`` table, then looks up
    'Ariana Grande' and 'SZA' by exact name and prints the name and ``gid``
    of the first match. Returns early when ``connect()`` reports failure.
    An exception raised while querying is printed with its traceback.
    """
    database = MusicBrainzDatabase()
    if not database.connect():
        print("❌ Could not connect to database")
        return
    print("✅ Connected to database")

    try:
        # Test a simple query
        database.cursor.execute("SELECT COUNT(*) FROM artist")
        count_row = database.cursor.fetchone()
        # Rows are read by column name -- presumably a dict-style cursor;
        # verify against MusicBrainzDatabase.
        print(f"📊 Total artists in database: {count_row['count']:,}")

        # Exact-name lookups for two known artists, in this order.
        for artist in ('Ariana Grande', 'SZA'):
            database.cursor.execute("SELECT name, gid FROM artist WHERE name = %s LIMIT 1", (artist,))
            row = database.cursor.fetchone()
            if row:
                print(f"✅ Found {artist}: {row['name']} (MBID: {row['gid']})")
            else:
                print(f"❌ {artist} not found")
    except Exception as exc:
        print(f"❌ Error: {exc}")
        import traceback
        traceback.print_exc()


 if __name__ == "__main__":
    test_simple_query()
--- a/src/tests/test_title_cleaning.py
+++ b/src/tests/test_title_cleaning.py
@ -0,0 +1,50 @@
 #!/usr/bin/env python3
 """
 Test script for improved title cleaning
 """
 import sys
 sys.path.insert(0, '/app')
 from src.api.database import MusicBrainzDatabase
 def test_title_cleaning():
    """Print the title variations generated for a fixed list of song titles.

    Every title is passed to
    ``MusicBrainzDatabase._generate_title_variations`` and the returned
    variations are printed as a numbered list. An exception raised for one
    title is printed and the loop continues with the next title.
    """
    database = MusicBrainzDatabase()

    # Titles taken from the songs that previously failed to match.
    sample_titles = (
        "Do I Wanna Know? - Live At the BBC",
        "All The Small Things (John Lewis Christmas Ad 2022)",
        "I Don t F  k With You",
        "Por Mujeres Como Tu",
        "Thought You Should Know (Without Backing Vocals)",
        "It Might Be You (from the movie Tootsie)",
        "Speedy Gonzales (Boone & Speedy Vocals)",
        "I'm Telling You Now (Two Semitones Down)",
        "The ELO Medley 1",
        "Can't Fight This Feeling (Minus Piano)",
        "The Look Of Love",
        "Revolution (Without Backing Vocals)",
        "Right Here, Right Now (My Heart Belongs to You)",
        "Hush Hush",
        "On The Floor",
        "(I've Had) The Time Of My Life",
    )

    print("🔍 Testing Improved Title Cleaning")
    print("=" * 50)

    for original_title in sample_titles:
        print(f"\n📝 Original: '{original_title}'")
        try:
            variations = database._generate_title_variations(original_title)
            print(f"  🧹 Cleaned variations ({len(variations)}):")
            for position, variation in enumerate(variations, start=1):
                print(f"    {position}. '{variation}'")
        except Exception as exc:
            print(f"  ❌ Error: {exc}")


 if __name__ == "__main__":
    test_title_cleaning()