From c9221a35b3660141fcec6def0611e0643edc28fe Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Fri, 1 Aug 2025 08:03:18 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- src/tests/README.md | 50 ++++++++- src/tests/debug_artist_search.py | 40 +++++++ src/tests/musicbrainz_cleaner.py | 10 ++ src/tests/process_full_dataset.py | 42 ++++++++ src/tests/quick_test_20.py | 108 +++++++++++++++++++ src/tests/test_100_random.py | 133 ++++++++++++++++++++++++ src/tests/test_collaboration_debug.py | 43 ++++++++ src/tests/test_failed_collaborations.py | 62 +++++++++++ src/tests/test_simple_query.py | 48 +++++++++ src/tests/test_title_cleaning.py | 50 +++++++++ 10 files changed, 585 insertions(+), 1 deletion(-) create mode 100644 src/tests/debug_artist_search.py create mode 100644 src/tests/musicbrainz_cleaner.py create mode 100644 src/tests/process_full_dataset.py create mode 100644 src/tests/quick_test_20.py create mode 100644 src/tests/test_100_random.py create mode 100644 src/tests/test_collaboration_debug.py create mode 100644 src/tests/test_failed_collaborations.py create mode 100644 src/tests/test_simple_query.py create mode 100644 src/tests/test_title_cleaning.py diff --git a/src/tests/README.md b/src/tests/README.md index 47c5d5f..0a6bbce 100644 --- a/src/tests/README.md +++ b/src/tests/README.md @@ -10,9 +10,24 @@ src/tests/ ├── integration/ # Integration tests for database and API ├── debug/ # Debug scripts and troubleshooting tests ├── run_tests.py # Test runner script -└── README.md # This file +├── README.md # This file +├── legacy/ # Legacy scripts moved from root directory +└── moved/ # Test files moved from root directory ``` +### Legacy Scripts (Moved from Root) +- `process_full_dataset.py` - Legacy script that redirects to new CLI +- `musicbrainz_cleaner.py` - Legacy entry point script + +### Moved Test Files (Moved from Root) +- `test_title_cleaning.py` - Test title cleaning functionality +- `test_simple_query.py` - Test simple database queries +- `debug_artist_search.py` - Debug artist search functionality +- `test_failed_collaborations.py` - Test failed collaboration cases +- `test_collaboration_debug.py` - Debug collaboration parsing +- `test_100_random.py` - Test 100 random songs +- `quick_test_20.py` - Quick test with 20 songs + ## 🧪 Test Categories ### Unit Tests (`unit/`) @@ -44,6 +59,39 @@ src/tests/ python3 src/tests/run_tests.py ``` +### Running Moved Test Files + +The following test files were moved from the root directory to `src/tests/`: + +```bash +# Run individual moved test files +python3 src/tests/test_100_random.py +python3 src/tests/quick_test_20.py +python3 src/tests/test_title_cleaning.py +python3 src/tests/test_simple_query.py +python3 src/tests/debug_artist_search.py +python3 src/tests/test_failed_collaborations.py +python3 src/tests/test_collaboration_debug.py +``` + +### Running Legacy Scripts + +Legacy scripts that redirect to the new CLI: + +```bash +# Legacy full dataset processing (redirects to CLI) +python3 src/tests/process_full_dataset.py + +# Legacy entry point (redirects to CLI) +python3 src/tests/musicbrainz_cleaner.py +``` + +**Note**: These legacy scripts are kept for backward compatibility but the new CLI is preferred: +```bash +# Preferred method (new CLI) +docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main +``` + ### Run Specific Test Categories ```bash # Run only unit tests diff --git a/src/tests/debug_artist_search.py b/src/tests/debug_artist_search.py new file mode 100644 index 0000000..018e85c --- /dev/null +++ b/src/tests/debug_artist_search.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +""" +Debug script for individual artist searches +""" + +import sys +sys.path.insert(0, '/app') + +from src.api.database import MusicBrainzDatabase + +def test_artist_search(): + db = MusicBrainzDatabase() + + # Test the main artists from the collaborations + test_artists = [ + "Kanye", # From "ft Jamie Foxx West, Kanye" + "Kanye West", # What it should be + "Ariana Grande", # From "Ariana Grande, Normani, Nicki Minaj" + "SZA", # From "SZA, Justin Bieber" + ] + + print("🔍 Testing Individual Artist Searches") + print("=" * 50) + + for artist_name in test_artists: + print(f"\n📝 Searching for: '{artist_name}'") + + try: + result = db.fuzzy_search_artist(artist_name) + if result: + matched_name, mbid, score = result + print(f" ✅ Found: '{matched_name}' (MBID: {mbid}, Score: {score})") + else: + print(f" ❌ Not found") + + except Exception as e: + print(f" ❌ Error: {e}") + +if __name__ == "__main__": + test_artist_search() \ No newline at end of file diff --git a/src/tests/musicbrainz_cleaner.py b/src/tests/musicbrainz_cleaner.py new file mode 100644 index 0000000..a2f087b --- /dev/null +++ b/src/tests/musicbrainz_cleaner.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +""" +MusicBrainz Data Cleaner - Entry Point +Simple entry point that imports from the refactored src structure +""" + +from src.cli.main import main + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/src/tests/process_full_dataset.py b/src/tests/process_full_dataset.py new file mode 100644 index 0000000..ed0a7c1 --- /dev/null +++ b/src/tests/process_full_dataset.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Legacy script for full dataset processing. +This script now redirects to the new CLI command for better functionality. +""" + +import sys +import subprocess +from pathlib import Path + +def main(): + print("🔄 Redirecting to new CLI command...") + print("📝 Use: docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --process-full-dataset") + print() + + # Get the current script's directory + script_dir = Path(__file__).parent + + # Build the new command + cmd = [ + "docker-compose", "run", "--rm", "musicbrainz-cleaner", + "python3", "-m", "src.cli.main", "--process-full-dataset" + ] + + # Add any additional arguments passed to this script + if len(sys.argv) > 1: + cmd.extend(sys.argv[1:]) + + try: + # Change to the script directory and run the command + subprocess.run(cmd, cwd=script_dir, check=True) + except subprocess.CalledProcessError as e: + print(f"❌ Error running command: {e}") + sys.exit(1) + except FileNotFoundError: + print("❌ Error: docker-compose not found. Make sure Docker is running.") + print("💡 Alternative: Run directly with:") + print(" python3 -m src.cli.main --process-full-dataset") + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/tests/quick_test_20.py b/src/tests/quick_test_20.py new file mode 100644 index 0000000..ed6e1d2 --- /dev/null +++ b/src/tests/quick_test_20.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick test script for 20 random songs +Simple single-threaded approach +""" + +import sys +import json +import time +from pathlib import Path + +# Add the src directory to the path +sys.path.insert(0, '/app') +from src.cli.main import MusicBrainzCleaner + +def main(): + print('🚀 Starting quick test with 20 random songs...') + + # Load songs + input_file = Path('data/songs.json') + if not input_file.exists(): + print('❌ songs.json not found') + return + + with open(input_file, 'r') as f: + all_songs = json.load(f) + + print(f'📊 Total songs available: {len(all_songs):,}') + + # Take 20 random songs + import random + sample_songs = random.sample(all_songs, 20) + print(f'🎯 Testing 20 random songs...') + + # Initialize cleaner + cleaner = MusicBrainzCleaner() + + # Process songs + found_artists = 0 + found_recordings = 0 + failed_songs = [] + + start_time = time.time() + + for i, song in enumerate(sample_songs, 1): + print(f' [{i:2d}/20] Processing: "{song.get("artist", "Unknown")}" - "{song.get("title", "Unknown")}"') + + try: + result = cleaner.clean_song(song) + + artist_found = 'mbid' in result + recording_found = 'recording_mbid' in result + + if artist_found and recording_found: + found_artists += 1 + found_recordings += 1 + print(f' ✅ Found both artist and recording') + else: + failed_songs.append({ + 'original': song, + 'cleaned': result, + 'artist_found': artist_found, + 'recording_found': recording_found, + 'artist_name': song.get('artist', 'Unknown'), + 'title': song.get('title', 'Unknown') + }) + print(f' ❌ Artist: {artist_found}, Recording: {recording_found}') + + except Exception as e: + print(f' 💥 Error: {e}') + failed_songs.append({ + 'original': song, + 'cleaned': {'error': str(e)}, + 'artist_found': False, + 'recording_found': False, + 'artist_name': song.get('artist', 'Unknown'), + 'title': song.get('title', 'Unknown'), + 'error': str(e) + }) + + end_time = time.time() + processing_time = end_time - start_time + + # Calculate success rates + artist_success_rate = found_artists / 20 * 100 + recording_success_rate = found_recordings / 20 * 100 + failed_rate = len(failed_songs) / 20 * 100 + + print(f'\n📊 Final Results:') + print(f' ⏱️ Processing time: {processing_time:.2f} seconds') + print(f' 🚀 Speed: {20/processing_time:.1f} songs/second') + print(f' ✅ Artists found: {found_artists}/20 ({artist_success_rate:.1f}%)') + print(f' ✅ Recordings found: {found_recordings}/20 ({recording_success_rate:.1f}%)') + print(f' ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)') + + # Show failed songs + if failed_songs: + print(f'\n🔍 Failed songs:') + for i, failed in enumerate(failed_songs, 1): + print(f' [{i}] "{failed["artist_name"]}" - "{failed["title"]}"') + print(f' Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}') + if 'error' in failed: + print(f' Error: {failed["error"]}') + else: + print('\n🎉 All songs processed successfully!') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/tests/test_100_random.py b/src/tests/test_100_random.py new file mode 100644 index 0000000..101fae6 --- /dev/null +++ b/src/tests/test_100_random.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Test script for 100 random songs +Simple single-threaded approach +""" + +import sys +import json +import time +from pathlib import Path + +# Add the src directory to the path +sys.path.insert(0, '/app') +from src.cli.main import MusicBrainzCleaner + +def main(): + print('🚀 Starting test with 100 random songs...') + + # Load songs + input_file = Path('data/songs.json') + if not input_file.exists(): + print('❌ songs.json not found') + return + + with open(input_file, 'r') as f: + all_songs = json.load(f) + + print(f'📊 Total songs available: {len(all_songs):,}') + + # Take 100 random songs + import random + sample_songs = random.sample(all_songs, 100) + print(f'🎯 Testing 100 random songs...') + + # Initialize cleaner + cleaner = MusicBrainzCleaner() + + # Process songs + found_artists = 0 + found_recordings = 0 + failed_songs = [] + + start_time = time.time() + + for i, song in enumerate(sample_songs, 1): + print(f' [{i:3d}/100] Processing: "{song.get("artist", "Unknown")}" - "{song.get("title", "Unknown")}"') + + try: + result = cleaner.clean_song(song) + + # Extract the cleaned song from the tuple (song_dict, success_boolean) + cleaned_song, success = result + + artist_found = 'mbid' in cleaned_song + recording_found = 'recording_mbid' in cleaned_song + + if artist_found and recording_found: + found_artists += 1 + found_recordings += 1 + print(f' ✅ Found both artist and recording') + else: + failed_songs.append({ + 'original': song, + 'cleaned': cleaned_song, + 'success': success, + 'artist_found': artist_found, + 'recording_found': recording_found, + 'artist_name': song.get('artist', 'Unknown'), + 'title': song.get('title', 'Unknown') + }) + print(f' ❌ Artist: {artist_found}, Recording: {recording_found}') + + except Exception as e: + print(f' 💥 Error: {e}') + failed_songs.append({ + 'original': song, + 'cleaned': {'error': str(e)}, + 'artist_found': False, + 'recording_found': False, + 'artist_name': song.get('artist', 'Unknown'), + 'title': song.get('title', 'Unknown'), + 'error': str(e) + }) + + end_time = time.time() + processing_time = end_time - start_time + + # Calculate success rates + artist_success_rate = found_artists / 100 * 100 + recording_success_rate = found_recordings / 100 * 100 + failed_rate = len(failed_songs) / 100 * 100 + + print(f'\n📊 Final Results:') + print(f' ⏱️ Processing time: {processing_time:.2f} seconds') + print(f' 🚀 Speed: {100/processing_time:.1f} songs/second') + print(f' ✅ Artists found: {found_artists}/100 ({artist_success_rate:.1f}%)') + print(f' ✅ Recordings found: {found_recordings}/100 ({recording_success_rate:.1f}%)') + print(f' ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)') + + # Save detailed report + if failed_songs: + report_file = 'data/test_100_results.json' + report_data = { + 'test_summary': { + 'total_tested': 100, + 'artists_found': found_artists, + 'recordings_found': found_recordings, + 'failed_count': len(failed_songs), + 'artist_success_rate': artist_success_rate, + 'recording_success_rate': recording_success_rate, + 'processing_time_seconds': processing_time, + 'songs_per_second': 100/processing_time + }, + 'failed_songs': failed_songs + } + + with open(report_file, 'w') as f: + json.dump(report_data, f, indent=2) + + print(f'\n📄 Detailed report saved to: {report_file}') + + # Show examples of failed songs + print(f'\n🔍 Examples of failed songs:') + for i, failed in enumerate(failed_songs[:10], 1): + print(f' [{i}] "{failed["artist_name"]}" - "{failed["title"]}"') + print(f' Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}') + if 'error' in failed: + print(f' Error: {failed["error"]}') + else: + print('\n🎉 All songs processed successfully!') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/tests/test_collaboration_debug.py b/src/tests/test_collaboration_debug.py new file mode 100644 index 0000000..41e7a2c --- /dev/null +++ b/src/tests/test_collaboration_debug.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Debug script for collaboration parsing issues +""" + +import sys +sys.path.insert(0, '/app') + +from src.api.database import MusicBrainzDatabase + +def test_collaboration_parsing(): + db = MusicBrainzDatabase() + + # Test cases from the failed songs + test_cases = [ + "ft Jamie Foxx West, Kanye", + "Ariana Grande, Normani, Nicki Minaj", + "SZA, Justin Bieber", + "Will.I.Am ft. Britney Spears", # This should work now + "Florida Georgia Line Ft Luke Bryan", # This should work now + ] + + print("🔍 Testing Collaboration Parsing") + print("=" * 50) + + for artist_string in test_cases: + print(f"\n📝 Input: '{artist_string}'") + + try: + main_artist, collaborators = db._parse_complex_collaboration(artist_string) + print(f" ✅ Main artist: '{main_artist}'") + print(f" ✅ Collaborators: {collaborators}") + + if collaborators: + print(f" 🎯 Parsed as collaboration") + else: + print(f" ❌ Not detected as collaboration") + + except Exception as e: + print(f" ❌ Error: {e}") + +if __name__ == "__main__": + test_collaboration_parsing() \ No newline at end of file diff --git a/src/tests/test_failed_collaborations.py b/src/tests/test_failed_collaborations.py new file mode 100644 index 0000000..d18d3ce --- /dev/null +++ b/src/tests/test_failed_collaborations.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +""" +Test script for the specific failed collaboration songs +""" + +import sys +import json +from pathlib import Path + +# Add the src directory to the path +sys.path.insert(0, '/app') +from src.cli.main import MusicBrainzCleaner + +def main(): + print('🔍 Testing Failed Collaboration Songs...') + + # Create test songs from the failed ones + failed_songs = [ + { + "artist": "ft Jamie Foxx West, Kanye", + "title": "Gold Digger" + }, + { + "artist": "Ariana Grande, Normani, Nicki Minaj", + "title": "Bad To You" + }, + { + "artist": "SZA, Justin Bieber", + "title": "Snooze (Acoustic)" + } + ] + + # Initialize cleaner + cleaner = MusicBrainzCleaner() + + print(f'🎯 Testing {len(failed_songs)} collaboration songs...') + + for i, song in enumerate(failed_songs, 1): + print(f'\n [{i}/{len(failed_songs)}] Processing: "{song["artist"]}" - "{song["title"]}"') + + try: + result = cleaner.clean_song(song) + cleaned_song, success = result + + artist_found = 'mbid' in cleaned_song + recording_found = 'recording_mbid' in cleaned_song + + if artist_found and recording_found: + print(f' ✅ Found both artist and recording') + print(f' 🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})') + print(f' 🎯 Recording: {cleaned_song.get("recording_mbid", "None")}') + elif artist_found: + print(f' ⚠️ Artist found but recording not found') + print(f' 🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})') + else: + print(f' ❌ Neither artist nor recording found') + + except Exception as e: + print(f' ❌ Error: {e}') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/tests/test_simple_query.py b/src/tests/test_simple_query.py new file mode 100644 index 0000000..14efdaa --- /dev/null +++ b/src/tests/test_simple_query.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Simple database query test +""" + +import sys +sys.path.insert(0, '/app') + +from src.api.database import MusicBrainzDatabase + +def test_simple_query(): + db = MusicBrainzDatabase() + + if not db.connect(): + print("❌ Could not connect to database") + return + + print("✅ Connected to database") + + # Test a simple query + try: + db.cursor.execute("SELECT COUNT(*) FROM artist") + count = db.cursor.fetchone() + print(f"📊 Total artists in database: {count['count']:,}") + + # Test specific artist query + db.cursor.execute("SELECT name, gid FROM artist WHERE name = %s LIMIT 1", ('Ariana Grande',)) + result = db.cursor.fetchone() + if result: + print(f"✅ Found Ariana Grande: {result['name']} (MBID: {result['gid']})") + else: + print("❌ Ariana Grande not found") + + # Test SZA + db.cursor.execute("SELECT name, gid FROM artist WHERE name = %s LIMIT 1", ('SZA',)) + result = db.cursor.fetchone() + if result: + print(f"✅ Found SZA: {result['name']} (MBID: {result['gid']})") + else: + print("❌ SZA not found") + + except Exception as e: + print(f"❌ Error: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + test_simple_query() \ No newline at end of file diff --git a/src/tests/test_title_cleaning.py b/src/tests/test_title_cleaning.py new file mode 100644 index 0000000..97b0ed9 --- /dev/null +++ b/src/tests/test_title_cleaning.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +""" +Test script for improved title cleaning +""" + +import sys +sys.path.insert(0, '/app') + +from src.api.database import MusicBrainzDatabase + +def test_title_cleaning(): + db = MusicBrainzDatabase() + + # Test cases from the failed songs + test_titles = [ + "Do I Wanna Know? - Live At the BBC", + "All The Small Things (John Lewis Christmas Ad 2022)", + "I Don t F k With You", + "Por Mujeres Como Tu", + "Thought You Should Know (Without Backing Vocals)", + "It Might Be You (from the movie Tootsie)", + "Speedy Gonzales (Boone & Speedy Vocals)", + "I'm Telling You Now (Two Semitones Down)", + "The ELO Medley 1", + "Can't Fight This Feeling (Minus Piano)", + "The Look Of Love", + "Revolution (Without Backing Vocals)", + "Right Here, Right Now (My Heart Belongs to You)", + "Hush Hush", + "On The Floor", + "(I've Had) The Time Of My Life", + ] + + print("🔍 Testing Improved Title Cleaning") + print("=" * 50) + + for title in test_titles: + print(f"\n📝 Original: '{title}'") + + try: + variations = db._generate_title_variations(title) + print(f" 🧹 Cleaned variations ({len(variations)}):") + for i, variation in enumerate(variations, 1): + print(f" {i}. '{variation}'") + + except Exception as e: + print(f" ❌ Error: {e}") + +if __name__ == "__main__": + test_title_cleaning() \ No newline at end of file