Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-01 08:03:18 -05:00
parent 9124640bf4
commit c9221a35b3
10 changed files with 585 additions and 1 deletions

View File

@ -10,9 +10,24 @@ src/tests/
├── integration/ # Integration tests for database and API
├── debug/ # Debug scripts and troubleshooting tests
├── run_tests.py # Test runner script
└── README.md # This file
├── README.md # This file
├── legacy/ # Legacy scripts moved from root directory
└── moved/ # Test files moved from root directory
```
### Legacy Scripts (Moved from Root)
- `process_full_dataset.py` - Legacy script that redirects to new CLI
- `musicbrainz_cleaner.py` - Legacy entry point script
### Moved Test Files (Moved from Root)
- `test_title_cleaning.py` - Test title cleaning functionality
- `test_simple_query.py` - Test simple database queries
- `debug_artist_search.py` - Debug artist search functionality
- `test_failed_collaborations.py` - Test failed collaboration cases
- `test_collaboration_debug.py` - Debug collaboration parsing
- `test_100_random.py` - Test 100 random songs
- `quick_test_20.py` - Quick test with 20 songs
## 🧪 Test Categories
### Unit Tests (`unit/`)
@ -44,6 +59,39 @@ src/tests/
python3 src/tests/run_tests.py
```
### Running Moved Test Files
The following test files were moved from the root directory to `src/tests/`:
```bash
# Run individual moved test files
python3 src/tests/test_100_random.py
python3 src/tests/quick_test_20.py
python3 src/tests/test_title_cleaning.py
python3 src/tests/test_simple_query.py
python3 src/tests/debug_artist_search.py
python3 src/tests/test_failed_collaborations.py
python3 src/tests/test_collaboration_debug.py
```
### Running Legacy Scripts
Legacy scripts that redirect to the new CLI:
```bash
# Legacy full dataset processing (redirects to CLI)
python3 src/tests/process_full_dataset.py
# Legacy entry point (redirects to CLI)
python3 src/tests/musicbrainz_cleaner.py
```
**Note**: These legacy scripts are kept for backward compatibility, but the new CLI is preferred:
```bash
# Preferred method (new CLI)
docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main
```
### Run Specific Test Categories
```bash
# Run only unit tests

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""
Debug script for individual artist searches
"""
import sys
sys.path.insert(0, '/app')
from src.api.database import MusicBrainzDatabase
def test_artist_search():
    """Look up a fixed list of artist names and print each search outcome.

    Each name is handed to ``MusicBrainzDatabase.fuzzy_search_artist``. A
    truthy result is unpacked as ``(matched_name, mbid, score)`` and printed;
    a falsy result is reported as not found. Exceptions are caught per name
    so that one failing lookup does not stop the remaining ones.
    """
    database = MusicBrainzDatabase()

    # Main artists taken from the collaboration strings noted beside each entry.
    names_to_check = (
        "Kanye",  # From "ft Jamie Foxx West, Kanye"
        "Kanye West",  # What it should be
        "Ariana Grande",  # From "Ariana Grande, Normani, Nicki Minaj"
        "SZA",  # From "SZA, Justin Bieber"
    )

    print("🔍 Testing Individual Artist Searches")
    print("=" * 50)

    for name in names_to_check:
        print(f"\n📝 Searching for: '{name}'")
        try:
            match = database.fuzzy_search_artist(name)
            if not match:
                print(" ❌ Not found")
                continue
            matched_name, mbid, score = match
            print(f" ✅ Found: '{matched_name}' (MBID: {mbid}, Score: {score})")
        except Exception as exc:
            print(f" ❌ Error: {exc}")


if __name__ == "__main__":
    test_artist_search()

View File

@ -0,0 +1,10 @@
#!/usr/bin/env python3
"""
MusicBrainz Data Cleaner - Entry Point
Simple entry point that imports from the refactored src structure
"""
import sys

from src.cli.main import main

if __name__ == "__main__":
    # Use sys.exit rather than the bare ``exit`` helper: ``exit`` is injected
    # by the ``site`` module for interactive use and is not guaranteed to
    # exist (e.g. under ``python -S``). The CLI's return value becomes the
    # process exit status.
    sys.exit(main())

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""
Legacy script for full dataset processing.
This script now redirects to the new CLI command for better functionality.
"""
import sys
import subprocess
from pathlib import Path
def main():
    """Forward this legacy invocation to the new CLI via docker-compose.

    Prints a notice showing the replacement command, then runs
    ``docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main
    --process-full-dataset`` with any extra command-line arguments appended.
    The command is started with this script's directory as the working
    directory.

    Exits the process with the child's return code when the command fails,
    or with status 1 when the ``docker-compose`` executable cannot be found.
    """
    print("🔄 Redirecting to new CLI command...")
    print("📝 Use: docker-compose run --rm musicbrainz-cleaner python3 -m src.cli.main --process-full-dataset")
    print()

    # Get the current script's directory
    script_dir = Path(__file__).parent

    # Build the new command. Slicing argv yields an empty list when no extra
    # arguments were given, so no explicit length check is needed.
    cmd = [
        "docker-compose", "run", "--rm", "musicbrainz-cleaner",
        "python3", "-m", "src.cli.main", "--process-full-dataset",
        *sys.argv[1:],
    ]

    try:
        # Change to the script directory and run the command
        subprocess.run(cmd, cwd=script_dir, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running command: {e}")
        # Propagate the child's exit status so callers see the real failure
        # code instead of a generic 1.
        sys.exit(e.returncode)
    except FileNotFoundError:
        print("❌ Error: docker-compose not found. Make sure Docker is running.")
        print("💡 Alternative: Run directly with:")
        print(" python3 -m src.cli.main --process-full-dataset")
        sys.exit(1)


if __name__ == "__main__":
    main()

108
src/tests/quick_test_20.py Normal file
View File

@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
Quick test script for 20 random songs
Simple single-threaded approach
"""
import sys
import json
import time
from pathlib import Path
# Add the src directory to the path
sys.path.insert(0, '/app')
from src.cli.main import MusicBrainzCleaner
def main(sample_size=20):
    """Run the cleaner over a random sample of songs and print a summary.

    Loads ``data/songs.json`` (relative to the current working directory),
    picks ``sample_size`` random entries, passes each one to
    ``MusicBrainzCleaner.clean_song`` and reports how many artists and
    recordings were matched, followed by a list of the songs that were not
    fully matched.

    Args:
        sample_size: Number of random songs to test. Capped at the number of
            songs available in the input file. Defaults to 20.

    Returns:
        None. Returns early if the input file is missing or contains no songs.
    """
    import random

    print(f'🚀 Starting quick test with {sample_size} random songs...')

    # Load songs
    input_file = Path('data/songs.json')
    if not input_file.exists():
        print('❌ songs.json not found')
        return

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        all_songs = json.load(f)

    print(f'📊 Total songs available: {len(all_songs):,}')

    # random.sample raises ValueError when asked for more items than exist.
    total = min(sample_size, len(all_songs))
    if total <= 0:
        print('❌ No songs available to test')
        return

    sample_songs = random.sample(all_songs, total)
    print(f'🎯 Testing {total} random songs...')

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    # Process songs
    found_artists = 0
    found_recordings = 0
    failed_songs = []
    width = len(str(total))

    start_time = time.time()

    for i, song in enumerate(sample_songs, 1):
        print(f' [{i:{width}d}/{total}] Processing: "{song.get("artist", "Unknown")}" - "{song.get("title", "Unknown")}"')

        try:
            result = cleaner.clean_song(song)
            # The sibling test scripts unpack clean_song's result as a
            # (song_dict, success) tuple; testing key membership on the tuple
            # itself would always be False. A bare dict is still accepted.
            cleaned_song = result[0] if isinstance(result, tuple) else result

            artist_found = 'mbid' in cleaned_song
            recording_found = 'recording_mbid' in cleaned_song

            # Count each match independently so the per-category totals stay
            # accurate when only the artist (or only the recording) is found.
            if artist_found:
                found_artists += 1
            if recording_found:
                found_recordings += 1

            if artist_found and recording_found:
                print(f' ✅ Found both artist and recording')
            else:
                failed_songs.append({
                    'original': song,
                    'cleaned': cleaned_song,
                    'artist_found': artist_found,
                    'recording_found': recording_found,
                    'artist_name': song.get('artist', 'Unknown'),
                    'title': song.get('title', 'Unknown')
                })
                print(f' ❌ Artist: {artist_found}, Recording: {recording_found}')

        except Exception as e:
            print(f' 💥 Error: {e}')
            failed_songs.append({
                'original': song,
                'cleaned': {'error': str(e)},
                'artist_found': False,
                'recording_found': False,
                'artist_name': song.get('artist', 'Unknown'),
                'title': song.get('title', 'Unknown'),
                'error': str(e)
            })

    end_time = time.time()
    processing_time = end_time - start_time

    # Calculate success rates
    artist_success_rate = found_artists / total * 100
    recording_success_rate = found_recordings / total * 100
    failed_rate = len(failed_songs) / total * 100
    # Guard against a zero elapsed time on coarse clocks.
    songs_per_second = total / processing_time if processing_time > 0 else 0.0

    print(f'\n📊 Final Results:')
    print(f' ⏱️ Processing time: {processing_time:.2f} seconds')
    print(f' 🚀 Speed: {songs_per_second:.1f} songs/second')
    print(f' ✅ Artists found: {found_artists}/{total} ({artist_success_rate:.1f}%)')
    print(f' ✅ Recordings found: {found_recordings}/{total} ({recording_success_rate:.1f}%)')
    print(f' ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')

    # Show failed songs
    if failed_songs:
        print(f'\n🔍 Failed songs:')
        for i, failed in enumerate(failed_songs, 1):
            print(f' [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
            print(f' Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
            if 'error' in failed:
                print(f' Error: {failed["error"]}')
    else:
        print('\n🎉 All songs processed successfully!')


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Test script for 100 random songs
Simple single-threaded approach
"""
import sys
import json
import time
from pathlib import Path
# Add the src directory to the path
sys.path.insert(0, '/app')
from src.cli.main import MusicBrainzCleaner
def main(sample_size=100):
    """Run the cleaner over a random sample of songs and report the results.

    Loads ``data/songs.json`` (relative to the current working directory),
    picks ``sample_size`` random entries, passes each one to
    ``MusicBrainzCleaner.clean_song`` and prints how many artists and
    recordings were matched. When at least one song was not fully matched, a
    detailed JSON report is written to ``data/test_100_results.json`` and up
    to ten failures are printed.

    Args:
        sample_size: Number of random songs to test. Capped at the number of
            songs available in the input file. Defaults to 100.

    Returns:
        None. Returns early if the input file is missing or contains no songs.
    """
    import random

    print(f'🚀 Starting test with {sample_size} random songs...')

    # Load songs
    input_file = Path('data/songs.json')
    if not input_file.exists():
        print('❌ songs.json not found')
        return

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        all_songs = json.load(f)

    print(f'📊 Total songs available: {len(all_songs):,}')

    # random.sample raises ValueError when asked for more items than exist.
    total = min(sample_size, len(all_songs))
    if total <= 0:
        print('❌ No songs available to test')
        return

    sample_songs = random.sample(all_songs, total)
    print(f'🎯 Testing {total} random songs...')

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    # Process songs
    found_artists = 0
    found_recordings = 0
    failed_songs = []
    width = len(str(total))

    start_time = time.time()

    for i, song in enumerate(sample_songs, 1):
        print(f' [{i:{width}d}/{total}] Processing: "{song.get("artist", "Unknown")}" - "{song.get("title", "Unknown")}"')

        try:
            result = cleaner.clean_song(song)
            # Extract the cleaned song from the tuple (song_dict, success_boolean)
            cleaned_song, success = result

            artist_found = 'mbid' in cleaned_song
            recording_found = 'recording_mbid' in cleaned_song

            # Count each match independently so the per-category totals stay
            # accurate when only the artist (or only the recording) is found.
            if artist_found:
                found_artists += 1
            if recording_found:
                found_recordings += 1

            if artist_found and recording_found:
                print(f' ✅ Found both artist and recording')
            else:
                failed_songs.append({
                    'original': song,
                    'cleaned': cleaned_song,
                    'success': success,
                    'artist_found': artist_found,
                    'recording_found': recording_found,
                    'artist_name': song.get('artist', 'Unknown'),
                    'title': song.get('title', 'Unknown')
                })
                print(f' ❌ Artist: {artist_found}, Recording: {recording_found}')

        except Exception as e:
            print(f' 💥 Error: {e}')
            failed_songs.append({
                'original': song,
                'cleaned': {'error': str(e)},
                'artist_found': False,
                'recording_found': False,
                'artist_name': song.get('artist', 'Unknown'),
                'title': song.get('title', 'Unknown'),
                'error': str(e)
            })

    end_time = time.time()
    processing_time = end_time - start_time

    # Calculate success rates
    artist_success_rate = found_artists / total * 100
    recording_success_rate = found_recordings / total * 100
    failed_rate = len(failed_songs) / total * 100
    # Guard against a zero elapsed time on coarse clocks.
    songs_per_second = total / processing_time if processing_time > 0 else 0.0

    print(f'\n📊 Final Results:')
    print(f' ⏱️ Processing time: {processing_time:.2f} seconds')
    print(f' 🚀 Speed: {songs_per_second:.1f} songs/second')
    print(f' ✅ Artists found: {found_artists}/{total} ({artist_success_rate:.1f}%)')
    print(f' ✅ Recordings found: {found_recordings}/{total} ({recording_success_rate:.1f}%)')
    print(f' ❌ Failed songs: {len(failed_songs)} ({failed_rate:.1f}%)')

    # Save detailed report
    if failed_songs:
        report_file = 'data/test_100_results.json'
        report_data = {
            'test_summary': {
                'total_tested': total,
                'artists_found': found_artists,
                'recordings_found': found_recordings,
                'failed_count': len(failed_songs),
                'artist_success_rate': artist_success_rate,
                'recording_success_rate': recording_success_rate,
                'processing_time_seconds': processing_time,
                'songs_per_second': songs_per_second
            },
            'failed_songs': failed_songs
        }

        with open(report_file, 'w') as f:
            json.dump(report_data, f, indent=2)

        print(f'\n📄 Detailed report saved to: {report_file}')

        # Show examples of failed songs
        print(f'\n🔍 Examples of failed songs:')
        for i, failed in enumerate(failed_songs[:10], 1):
            print(f' [{i}] "{failed["artist_name"]}" - "{failed["title"]}"')
            print(f' Artist found: {failed["artist_found"]}, Recording found: {failed["recording_found"]}')
            if 'error' in failed:
                print(f' Error: {failed["error"]}')
    else:
        print('\n🎉 All songs processed successfully!')


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Debug script for collaboration parsing issues
"""
import sys
sys.path.insert(0, '/app')
from src.api.database import MusicBrainzDatabase
def test_collaboration_parsing():
    """Feed sample artist strings to the collaboration parser and print the results.

    Each string is passed to ``MusicBrainzDatabase._parse_complex_collaboration``
    and the returned pair is unpacked as ``(main_artist, collaborators)``.
    A non-empty ``collaborators`` value is reported as a detected
    collaboration. Exceptions are caught and printed per input.
    """
    database = MusicBrainzDatabase()

    # Test cases from the failed songs
    samples = (
        "ft Jamie Foxx West, Kanye",
        "Ariana Grande, Normani, Nicki Minaj",
        "SZA, Justin Bieber",
        "Will.I.Am ft. Britney Spears",  # This should work now
        "Florida Georgia Line Ft Luke Bryan",  # This should work now
    )

    print("🔍 Testing Collaboration Parsing")
    print("=" * 50)

    for raw_artist in samples:
        print(f"\n📝 Input: '{raw_artist}'")
        try:
            parsed = database._parse_complex_collaboration(raw_artist)
            main_artist, collaborators = parsed
            print(f" ✅ Main artist: '{main_artist}'")
            print(f" ✅ Collaborators: {collaborators}")
            verdict = (
                " 🎯 Parsed as collaboration"
                if collaborators
                else " ❌ Not detected as collaboration"
            )
            print(verdict)
        except Exception as exc:
            print(f" ❌ Error: {exc}")


if __name__ == "__main__":
    test_collaboration_parsing()

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""
Test script for the specific failed collaboration songs
"""
import sys
import json
from pathlib import Path
# Add the src directory to the path
sys.path.insert(0, '/app')
from src.cli.main import MusicBrainzCleaner
def main():
    """Run the cleaner on three known-problematic collaboration songs.

    Each song dict is passed to ``MusicBrainzCleaner.clean_song``; the result
    is unpacked as ``(cleaned_song, success)`` and the presence of the
    ``mbid`` and ``recording_mbid`` keys decides which outcome is printed.
    Exceptions are caught and printed per song.
    """
    print('🔍 Testing Failed Collaboration Songs...')

    # Create test songs from the failed ones
    songs = [
        {"artist": "ft Jamie Foxx West, Kanye", "title": "Gold Digger"},
        {"artist": "Ariana Grande, Normani, Nicki Minaj", "title": "Bad To You"},
        {"artist": "SZA, Justin Bieber", "title": "Snooze (Acoustic)"},
    ]

    # Initialize cleaner
    cleaner = MusicBrainzCleaner()

    print(f'🎯 Testing {len(songs)} collaboration songs...')

    for index, song in enumerate(songs, 1):
        print(f'\n [{index}/{len(songs)}] Processing: "{song["artist"]}" - "{song["title"]}"')

        try:
            cleaned_song, _success = cleaner.clean_song(song)

            has_artist = 'mbid' in cleaned_song
            has_recording = 'recording_mbid' in cleaned_song

            # Without an artist match there is nothing further to report.
            if not has_artist:
                print(' ❌ Neither artist nor recording found')
                continue

            if has_recording:
                print(' ✅ Found both artist and recording')
            else:
                print(' ⚠️ Artist found but recording not found')

            print(f' 🎯 Artist: {cleaned_song.get("artist", "Unknown")} (MBID: {cleaned_song.get("mbid", "None")})')

            if has_recording:
                print(f' 🎯 Recording: {cleaned_song.get("recording_mbid", "None")}')

        except Exception as exc:
            print(f' ❌ Error: {exc}')


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Simple database query test
"""
import sys
sys.path.insert(0, '/app')
from src.api.database import MusicBrainzDatabase
def test_simple_query():
    """Connect to the database and run a few sanity-check queries.

    Prints the row count of the ``artist`` table, then looks up two artists
    by exact name and prints each one's name and ``gid``. If ``connect()``
    returns a falsy value the function reports that and returns. Any
    exception raised by the queries is printed together with its traceback.
    """
    database = MusicBrainzDatabase()

    if not database.connect():
        print("❌ Could not connect to database")
        return

    print("✅ Connected to database")

    # Test a simple query
    try:
        database.cursor.execute("SELECT COUNT(*) FROM artist")
        count_row = database.cursor.fetchone()
        print(f"📊 Total artists in database: {count_row['count']:,}")

        # Test specific artist queries, one exact-name lookup per artist
        for artist in ('Ariana Grande', 'SZA'):
            database.cursor.execute(
                "SELECT name, gid FROM artist WHERE name = %s LIMIT 1",
                (artist,),
            )
            row = database.cursor.fetchone()
            if row:
                print(f"✅ Found {artist}: {row['name']} (MBID: {row['gid']})")
            else:
                print(f"❌ {artist} not found")

    except Exception as exc:
        print(f"❌ Error: {exc}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    test_simple_query()

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""
Test script for improved title cleaning
"""
import sys
sys.path.insert(0, '/app')
from src.api.database import MusicBrainzDatabase
def test_title_cleaning():
    """Print the title variations generated for a fixed list of song titles.

    Each title is passed to ``MusicBrainzDatabase._generate_title_variations``
    and the returned variations are printed as a numbered list. Exceptions
    are caught and printed per title.
    """
    database = MusicBrainzDatabase()

    # Test cases from the failed songs
    sample_titles = (
        "Do I Wanna Know? - Live At the BBC",
        "All The Small Things (John Lewis Christmas Ad 2022)",
        "I Don t F k With You",
        "Por Mujeres Como Tu",
        "Thought You Should Know (Without Backing Vocals)",
        "It Might Be You (from the movie Tootsie)",
        "Speedy Gonzales (Boone & Speedy Vocals)",
        "I'm Telling You Now (Two Semitones Down)",
        "The ELO Medley 1",
        "Can't Fight This Feeling (Minus Piano)",
        "The Look Of Love",
        "Revolution (Without Backing Vocals)",
        "Right Here, Right Now (My Heart Belongs to You)",
        "Hush Hush",
        "On The Floor",
        "(I've Had) The Time Of My Life",
    )

    print("🔍 Testing Improved Title Cleaning")
    print("=" * 50)

    for original_title in sample_titles:
        print(f"\n📝 Original: '{original_title}'")
        try:
            variations = database._generate_title_variations(original_title)
            print(f" 🧹 Cleaned variations ({len(variations)}):")
            for position, variation in enumerate(variations, 1):
                print(f" {position}. '{variation}'")
        except Exception as exc:
            print(f" ❌ Error: {exc}")


if __name__ == "__main__":
    test_title_cleaning()