Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>

This commit is contained in:
Matt Bruce 2025-08-05 08:26:45 -05:00
parent f127bac480
commit fee61a12bc
3 changed files with 422 additions and 1 deletions

View File

@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Command-line interface for managing artist lookup table.
Allows adding new artist variations and searching existing ones.
"""
import sys
import argparse
from pathlib import Path
from typing import List
# Add the src directory to the Python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src"))
from src.utils.artist_lookup import artist_lookup, add_artist_fallback
def main(argv=None):
    """Parse command-line arguments and run the requested lookup-table command.

    The CLI is organised as subcommands (``search``, ``add``, ``list``,
    ``stats``). When no subcommand is given the help text is printed and the
    function returns without doing anything else.

    Args:
        argv: Argument list to parse, without the program name. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the script entry point relies on.
    """
    # The examples must mirror the subcommand syntax defined below; flag-style
    # invocations such as ``--search`` are rejected by the parser.
    epilog = "\n".join([
        "EXAMPLES:",
        "  # Search for an artist in the lookup table",
        '  python3 -m src.cli.artist_lookup_cli search "Destiny\'s Child"',
        "  # Search with a stricter fuzzy-match threshold",
        '  python3 -m src.cli.artist_lookup_cli search "Destiny\'s Child" --min-score 0.8',
        "  # Add a new artist variation",
        '  python3 -m src.cli.artist_lookup_cli add --canonical-name "New Artist" '
        '--mbid "12345678-1234-1234-1234-123456789abc" '
        '--variations "Artist, The Artist, Artist Band"',
        "  # List all artists in the lookup table",
        "  python3 -m src.cli.artist_lookup_cli list",
        "  # Show statistics",
        "  python3 -m src.cli.artist_lookup_cli stats",
    ])
    parser = argparse.ArgumentParser(
        description="MusicBrainz Data Cleaner - Artist Lookup Table Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )

    # Create subparsers for different commands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Search command
    search_parser = subparsers.add_parser('search', help='Search for artists in lookup table')
    search_parser.add_argument('query', help='Artist name to search for')
    search_parser.add_argument('--min-score', type=float, default=0.6,
                               help='Minimum fuzzy match score (0.0 to 1.0, default: 0.6)')

    # Add command
    add_parser = subparsers.add_parser('add', help='Add new artist variation to lookup table')
    add_parser.add_argument('--canonical-name', required=True, help='Canonical artist name')
    add_parser.add_argument('--mbid', required=True, help='MusicBrainz ID')
    add_parser.add_argument('--variations', required=True,
                            help='Comma-separated list of name variations')
    add_parser.add_argument('--notes', default='', help='Optional notes about the artist')

    # List and stats commands take no arguments of their own
    subparsers.add_parser('list', help='List all artists in lookup table')
    subparsers.add_parser('stats', help='Show lookup table statistics')

    # Parse arguments
    args = parser.parse_args(argv)
    if not args.command:
        parser.print_help()
        return

    # Execute commands
    if args.command == 'search':
        search_artists(args.query, args.min_score)
    elif args.command == 'add':
        add_artist(args.canonical_name, args.mbid, args.variations, args.notes)
    elif args.command == 'list':
        list_artists()
    elif args.command == 'stats':
        show_statistics()
def search_artists(query: str, min_score: float):
    """Print every lookup-table entry whose name fuzzy-matches ``query``.

    Args:
        query: Artist name to look for.
        min_score: Lowest fuzzy-match score a hit may have to be reported.
    """
    print(f"🔍 Searching for '{query}' in artist lookup table...")
    print(f"📊 Minimum score: {min_score}")
    print("-" * 60)

    matches = artist_lookup.search_artists(query, min_score)
    if not matches:
        print("❌ No matches found")
        return

    print(f"✅ Found {len(matches)} matches:")
    print()

    # Fields that are only shown when the match actually carries a value.
    optional_fields = (
        ('matched_variation', 'Matched Variation'),
        ('notes', 'Notes'),
    )
    for position, match in enumerate(matches, start=1):
        print(f"{position}. {match['canonical_name']}")
        print(f" MBID: {match['mbid']}")
        print(f" Score: {match['score']:.2f}")
        print(f" Match Type: {match['match_type']}")
        for key, label in optional_fields:
            value = match.get(key)
            if value:
                print(f" {label}: {value}")
        print()
def add_artist(canonical_name: str, mbid: str, variations_str: str, notes: str):
    """Validate the given artist details and store them in the lookup table.

    Nothing is written when the canonical name is blank or when the MBID is
    not a canonically formatted UUID (8-4-4-4-12 hex digits); an error line is
    printed instead.

    Args:
        canonical_name: Canonical artist name used as the table key.
        mbid: MusicBrainz ID of the artist.
        variations_str: Comma-separated name variations; blank items are
            dropped and surrounding whitespace is trimmed.
        notes: Free-form notes stored with the entry.
    """
    import uuid  # local import: only this command needs UUID validation

    print(f" Adding artist variation to lookup table...")
    print(f"📝 Canonical Name: {canonical_name}")
    print(f"🆔 MBID: {mbid}")
    print(f"📋 Notes: {notes}")

    # Parse variations
    trimmed = (part.strip() for part in variations_str.split(','))
    variations = [part for part in trimmed if part]
    print(f"🔄 Variations: {', '.join(variations)}")
    print()

    # Validate before touching the table: a bad entry here would later be
    # handed out as a real artist match.
    if not canonical_name or not canonical_name.strip():
        print("❌ Canonical name must not be empty")
        return
    try:
        mbid = mbid.strip()
        # uuid.UUID also accepts braces, URNs and un-hyphenated hex, so
        # compare against its canonical rendering to require the plain form.
        mbid_is_valid = str(uuid.UUID(mbid)) == mbid.lower()
    except (AttributeError, TypeError, ValueError):
        mbid_is_valid = False
    if not mbid_is_valid:
        print(f"❌ Invalid MBID '{mbid}': expected a UUID such as "
              "12345678-1234-1234-1234-123456789abc")
        return

    # Add to lookup table
    success = add_artist_fallback(canonical_name, mbid, variations, notes)
    if success:
        print("✅ Artist variation added successfully!")
        # Show updated statistics
        stats = artist_lookup.get_statistics()
        print(f"📊 Updated statistics: {stats['total_artists']} artists, {stats['total_variations']} variations")
    else:
        print("❌ Failed to add artist variation")
def list_artists():
    """Print the canonical artist names held in the lookup table, sorted and numbered."""
    names = artist_lookup.list_artists()
    if not names:
        print("📚 Artist lookup table is empty")
        return

    print(f"📚 Found {len(names)} artists in lookup table:")
    print("-" * 60)
    # names is non-empty here, so joining yields exactly one line per artist.
    numbered = (
        f"{position}. {name}"
        for position, name in enumerate(sorted(names), start=1)
    )
    print("\n".join(numbered))
def show_statistics():
    """Print artist/variation totals and, when present, the table's metadata."""
    stats = artist_lookup.get_statistics()

    print("📊 Artist Lookup Table Statistics")
    print("=" * 40)
    print(f"Total Artists: {stats['total_artists']}")
    print(f"Total Variations: {stats['total_variations']}")

    metadata = stats.get('metadata')
    if not metadata:
        return

    # Metadata fields are optional; each has its own placeholder text.
    version = metadata.get('version', 'Unknown')
    last_updated = metadata.get('last_updated', 'Unknown')
    description = metadata.get('description', 'No description')
    print(f"Version: {version}")
    print(f"Last Updated: {last_updated}")
    print(f"Description: {description}")
# Run the CLI only when this file is executed as a script or via ``python -m``,
# not when it is imported.
if __name__ == "__main__":
    main()

View File

@ -7,6 +7,7 @@ between CLI and database interactions to ensure consistency.
from typing import Dict, Optional, Tuple, Any, List from typing import Dict, Optional, Tuple, Any, List
from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations
from ..utils.data_loader import data_loader from ..utils.data_loader import data_loader
from ..utils.artist_lookup import find_artist_fallback
from .interfaces import MusicBrainzDataProvider, SongProcessorInterface from .interfaces import MusicBrainzDataProvider, SongProcessorInterface
@ -47,6 +48,16 @@ class SongProcessor(SongProcessorInterface):
if result and isinstance(result, tuple) and len(result) >= 2: if result and isinstance(result, tuple) and len(result) >= 2:
return result[1] # Return MBID from tuple (artist_name, mbid, score) return result[1] # Return MBID from tuple (artist_name, mbid, score)
# Final fallback: Check the artist lookup table
lookup_result = find_artist_fallback(artist_name, min_score=0.8)
if lookup_result:
print(f"🎯 Lookup table match found: {artist_name}{lookup_result['canonical_name']} (score: {lookup_result['score']:.2f})")
# Store the canonical name for later use in clean_song method
if not hasattr(self, '_lookup_canonical_names'):
self._lookup_canonical_names = {}
self._lookup_canonical_names[artist_name] = lookup_result['canonical_name']
return lookup_result['mbid']
return None return None
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]: def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
@ -178,6 +189,10 @@ class SongProcessor(SongProcessorInterface):
Returns: Returns:
Tuple of (cleaned_song, success_status) Tuple of (cleaned_song, success_status)
""" """
# Clear lookup cache for this song
if hasattr(self, '_lookup_canonical_names'):
self._lookup_canonical_names = {}
original_artist = song.get('artist', '') original_artist = song.get('artist', '')
original_title = song.get('title', '') original_title = song.get('title', '')
@ -225,7 +240,15 @@ class SongProcessor(SongProcessorInterface):
# Get artist info # Get artist info
artist_info = self.get_artist_info(artist_mbid) artist_info = self.get_artist_info(artist_mbid)
if artist_info: if artist_info:
song['artist'] = artist_info['name'] # Check if we have a canonical name from lookup table
original_artist = song.get('artist', '')
if hasattr(self, '_lookup_canonical_names') and original_artist in self._lookup_canonical_names:
# Use the canonical name from lookup table instead of database name
song['artist'] = self._lookup_canonical_names[original_artist]
print(f" 🔄 Updated artist name: {original_artist}{song['artist']}")
else:
# Use the database name
song['artist'] = artist_info['name']
song['mbid'] = artist_mbid song['mbid'] = artist_mbid
# Find recording MBID (pass original artist for multiple candidate search) # Find recording MBID (pass original artist for multiple candidate search)

234
src/utils/artist_lookup.py Normal file
View File

@ -0,0 +1,234 @@
"""
Artist lookup table utilities for MusicBrainz Data Cleaner.
Provides fallback artist matching when database search fails.
"""
import json
import os
from pathlib import Path
from typing import Dict, Optional, List, Any
from fuzzywuzzy import fuzz
class ArtistLookupTable:
    """Fallback lookup of artists by known name variations.

    The table is kept in memory as ``lookup_data`` and persisted as JSON in
    ``lookup_file``. Entries written by this class have the shape::

        {
          "artist_variations": {
            "<canonical name>": {"mbid": ..., "variations": [...], "notes": ...}
          },
          "metadata": {"total_artists": ...}
        }
    """

    def __init__(self, lookup_file: str = "data/artist_lookup.json"):
        """
        Initialize the artist lookup table and load it from disk.

        Args:
            lookup_file: Path to the JSON lookup file
        """
        self.lookup_file = Path(lookup_file)
        self.lookup_data = self._load_lookup_data()

    def _load_lookup_data(self) -> Dict[str, Any]:
        """Load the artist lookup data from the JSON file.

        Returns:
            The parsed JSON document, or an empty table structure when the
            file is missing or cannot be read/parsed.
        """
        try:
            if self.lookup_file.exists():
                with open(self.lookup_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                print(f"📚 Loaded artist lookup table with {len(data.get('artist_variations', {}))} artists")
                return data
            else:
                print(f"⚠️ Artist lookup file not found: {self.lookup_file}")
                return {"artist_variations": {}, "metadata": {}}
        except Exception as e:
            # Deliberately broad: a module-level instance is built at import
            # time, so a bad file must degrade to an empty table, not crash.
            print(f"❌ Error loading artist lookup table: {e}")
            return {"artist_variations": {}, "metadata": {}}

    def _collect_matches(self, query: str, min_score: float) -> List[Dict[str, Any]]:
        """Score ``query`` against every canonical name and variation.

        Returns one dict per name whose score is at least ``min_score``, in
        table order (canonical name first, then its variations). Comparison
        is case-insensitive; scores are ``fuzz.ratio`` scaled to 0.0-1.0.
        """
        needle = query.lower()  # hoisted: identical for every comparison
        table = self.lookup_data.get('artist_variations') or {}
        matches = []
        for canonical_name, artist_info in table.items():
            # Check against canonical name
            score = fuzz.ratio(needle, canonical_name.lower()) / 100.0
            if score >= min_score:
                matches.append({
                    'canonical_name': canonical_name,
                    'mbid': artist_info['mbid'],
                    'score': score,
                    'match_type': 'canonical_name',
                    'notes': artist_info.get('notes', '')
                })
            # Check against variations
            for variation in artist_info.get('variations', []):
                score = fuzz.ratio(needle, variation.lower()) / 100.0
                if score >= min_score:
                    matches.append({
                        'canonical_name': canonical_name,
                        'mbid': artist_info['mbid'],
                        'score': score,
                        'match_type': 'variation',
                        'matched_variation': variation,
                        'notes': artist_info.get('notes', '')
                    })
        return matches

    def find_artist_by_variation(self, artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
        """
        Find the single best-matching artist using fuzzy matching.

        Args:
            artist_name: The artist name to search for
            min_score: Minimum fuzzy match score (0.0 to 1.0)

        Returns:
            Dict with canonical_name, mbid, score, notes and match_type (plus
            matched_variation for variation hits), or None if nothing scores
            at least ``min_score``
        """
        if not artist_name or not self.lookup_data.get('artist_variations'):
            return None

        best_match = None
        best_score = 0
        for match in self._collect_matches(artist_name.strip(), min_score):
            # Strict '>' keeps the earliest entry on ties and never accepts
            # a zero score.
            if match['score'] > best_score:
                best_score = match['score']
                best_match = match
        return best_match

    def add_artist_variation(self, canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
        """
        Add (or replace) an artist entry and persist the table.

        The in-memory table is only changed when the save succeeds; on a
        failed save the previous state is restored so that the return value
        and the table agree.

        Args:
            canonical_name: The canonical artist name
            mbid: The MusicBrainz ID
            variations: List of name variations
            notes: Optional notes about the artist

        Returns:
            True if successfully added, False otherwise
        """
        missing = object()  # sentinel: "key was not present before"
        try:
            table = self.lookup_data.get('artist_variations')
            if not table:
                table = self.lookup_data['artist_variations'] = {}
            metadata = self.lookup_data.setdefault('metadata', {})

            # Remember what is about to be overwritten for the rollback.
            previous_entry = table.get(canonical_name, missing)
            previous_total = metadata.get('total_artists', missing)

            table[canonical_name] = {
                'mbid': mbid,
                # Copy so later changes to the caller's list cannot leak in.
                'variations': list(variations),
                'notes': notes
            }
            # Update metadata
            metadata['total_artists'] = len(table)
        except Exception as e:
            print(f"❌ Error adding artist variation: {e}")
            return False

        if self._save_lookup_data():
            return True

        # Save failed: undo the in-memory change.
        if previous_entry is missing:
            del table[canonical_name]
        else:
            table[canonical_name] = previous_entry
        if previous_total is missing:
            metadata.pop('total_artists', None)
        else:
            metadata['total_artists'] = previous_total
        return False

    def _save_lookup_data(self) -> bool:
        """Save the lookup data back to the JSON file.

        The data is written to a sibling ``.tmp`` file first and then moved
        over the real file, so a failure while serialising cannot leave a
        truncated lookup file behind.

        Returns:
            True when the file was written, False otherwise.
        """
        tmp_file = Path(str(self.lookup_file) + '.tmp')
        try:
            # Ensure directory exists
            self.lookup_file.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(self.lookup_data, f, indent=2, ensure_ascii=False)
            os.replace(tmp_file, self.lookup_file)
            print(f"💾 Saved artist lookup table to {self.lookup_file}")
            return True
        except Exception as e:
            # Broad on purpose: whatever went wrong, report it, remove the
            # partial temp file and leave the existing lookup file untouched.
            try:
                if tmp_file.exists():
                    tmp_file.unlink()
            except OSError:
                pass
            print(f"❌ Error saving artist lookup table: {e}")
            return False

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics about the lookup table.

        Returns:
            Dict with ``total_artists``, ``total_variations`` (each canonical
            name counts as one variation) and the stored ``metadata``.
        """
        table = self.lookup_data.get('artist_variations') or {}
        total_variations = sum(
            len(artist_info.get('variations', [])) + 1  # +1 for canonical name
            for artist_info in table.values()
        )
        return {
            'total_artists': len(table),
            'total_variations': total_variations,
            'metadata': self.lookup_data.get('metadata', {})
        }

    def list_artists(self) -> List[str]:
        """Get a list of all canonical artist names in the lookup table."""
        return list(self.lookup_data.get('artist_variations') or {})

    def search_artists(self, query: str, min_score: float = 0.6) -> List[Dict[str, Any]]:
        """
        Search for artists in the lookup table.

        Args:
            query: Search query; surrounding whitespace is ignored and a
                missing query is treated as an empty string
            min_score: Minimum fuzzy match score

        Returns:
            List of matching artists with scores, best score first. An artist
            appears once per matching name (canonical name or variation).
        """
        results = self._collect_matches((query or "").strip(), min_score)
        # Sort by score (highest first); the sort is stable, so equal scores
        # stay in table order.
        results.sort(key=lambda match: match['score'], reverse=True)
        return results
# Global instance for easy access; the convenience functions below delegate to it.
# NOTE: it is constructed at import time with the default lookup file path, so
# importing this module loads the lookup data and prints a status line.
artist_lookup = ArtistLookupTable()
def find_artist_fallback(artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
    """
    Look up an artist in the shared module-level lookup table.

    Thin wrapper that forwards to ``artist_lookup.find_artist_by_variation``.

    Args:
        artist_name: The artist name to search for
        min_score: Minimum fuzzy match score

    Returns:
        Artist info dict or None if not found
    """
    match = artist_lookup.find_artist_by_variation(artist_name, min_score=min_score)
    return match
def add_artist_fallback(canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
    """
    Store an artist entry in the shared module-level lookup table.

    Thin wrapper that forwards to ``artist_lookup.add_artist_variation``.

    Args:
        canonical_name: The canonical artist name
        mbid: The MusicBrainz ID
        variations: List of name variations
        notes: Optional notes about the artist

    Returns:
        True if successfully added, False otherwise
    """
    added = artist_lookup.add_artist_variation(
        canonical_name,
        mbid,
        variations,
        notes=notes,
    )
    return added