Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
f127bac480
commit
fee61a12bc
164
src/cli/artist_lookup_cli.py
Normal file
164
src/cli/artist_lookup_cli.py
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Command-line interface for managing artist lookup table.
|
||||||
|
Allows adding new artist variations and searching existing ones.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# Put the project root (the directory that contains ``src/``) on sys.path so
# the absolute ``src.…`` import below resolves when this file is executed
# directly, e.g. ``python3 src/cli/artist_lookup_cli.py``. Adding ``src/``
# itself would not make the ``src`` package importable. ``resolve()`` guards
# against a relative ``__file__``.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||||
|
|
||||||
|
from src.utils.artist_lookup import artist_lookup, add_artist_fallback
|
||||||
|
|
||||||
|
|
||||||
|
def _match_score(value: str) -> float:
    """Parse a ``--min-score`` argument and require it to lie in [0.0, 1.0]."""
    try:
        score = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid score: {value!r}") from None
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= score <= 1.0:
        raise argparse.ArgumentTypeError(
            f"score must be between 0.0 and 1.0, got {value}"
        )
    return score


def main():
    """Main CLI function for artist lookup table management.

    Parses ``sys.argv`` and dispatches to one of the sub-commands
    ``search``, ``add``, ``list`` or ``stats``. When no sub-command is
    given the help text is printed.

    Returns:
        Process exit status: ``1`` when the ``add`` command reports a
        failure, ``0`` otherwise. Invalid arguments make ``argparse``
        exit with status 2, as usual.
    """
    parser = argparse.ArgumentParser(
        description="MusicBrainz Data Cleaner - Artist Lookup Table Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must mirror the sub-commands defined below.
        epilog="""
EXAMPLES:
  # Search for an artist in the lookup table
  python3 -m src.cli.artist_lookup_cli search "Destiny's Child"

  # Search with a stricter minimum score
  python3 -m src.cli.artist_lookup_cli search "Destiny's Child" --min-score 0.8

  # Add a new artist variation
  python3 -m src.cli.artist_lookup_cli add --canonical-name "New Artist" --mbid "12345678-1234-1234-1234-123456789abc" --variations "Artist, The Artist, Artist Band"

  # List all artists in the lookup table
  python3 -m src.cli.artist_lookup_cli list

  # Show statistics
  python3 -m src.cli.artist_lookup_cli stats
""",
    )

    # Create subparsers for different commands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Search command
    search_parser = subparsers.add_parser('search', help='Search for artists in lookup table')
    search_parser.add_argument('query', help='Artist name to search for')
    search_parser.add_argument('--min-score', type=_match_score, default=0.6,
                               help='Minimum fuzzy match score (0.0 to 1.0, default: 0.6)')

    # Add command
    add_parser = subparsers.add_parser('add', help='Add new artist variation to lookup table')
    add_parser.add_argument('--canonical-name', required=True, help='Canonical artist name')
    add_parser.add_argument('--mbid', required=True, help='MusicBrainz ID')
    add_parser.add_argument('--variations', required=True,
                            help='Comma-separated list of name variations')
    add_parser.add_argument('--notes', default='', help='Optional notes about the artist')

    # List and stats commands take no arguments of their own
    subparsers.add_parser('list', help='List all artists in lookup table')
    subparsers.add_parser('stats', help='Show lookup table statistics')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 0

    # Execute commands
    if args.command == 'search':
        search_artists(args.query, args.min_score)
    elif args.command == 'add':
        outcome = add_artist(args.canonical_name, args.mbid, args.variations, args.notes)
        # Only an explicit False counts as failure, so an add_artist that
        # returns nothing is still treated as success.
        if outcome is False:
            return 1
    elif args.command == 'list':
        list_artists()
    elif args.command == 'stats':
        show_statistics()
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def search_artists(query: str, min_score: float):
    """Print every lookup-table entry that fuzzy-matches ``query``.

    Args:
        query: Artist name to look for.
        min_score: Minimum fuzzy match score handed to the lookup table.
    """
    print(f"🔍 Searching for '{query}' in artist lookup table...")
    print(f"📊 Minimum score: {min_score}")
    print("-" * 60)

    matches = artist_lookup.search_artists(query, min_score)
    if not matches:
        print("❌ No matches found")
        return

    print(f"✅ Found {len(matches)} matches:")
    print()

    # Fields that are printed only when present and non-empty.
    optional_fields = (
        ('matched_variation', 'Matched Variation'),
        ('notes', 'Notes'),
    )

    for position, match in enumerate(matches, start=1):
        print(f"{position}. {match['canonical_name']}")
        print(f" MBID: {match['mbid']}")
        print(f" Score: {match['score']:.2f}")
        print(f" Match Type: {match['match_type']}")
        for key, label in optional_fields:
            if match.get(key):
                print(f" {label}: {match[key]}")
        print()
|
||||||
|
|
||||||
|
|
||||||
|
def add_artist(canonical_name: str, mbid: str, variations_str: str, notes: str) -> bool:
    """Add an artist and its name variations to the lookup table.

    Echoes what is about to be stored, delegates to ``add_artist_fallback``
    and reports the outcome on stdout.

    Args:
        canonical_name: Canonical artist name.
        mbid: MusicBrainz ID of the artist.
        variations_str: Comma-separated name variations. Surrounding
            whitespace is stripped and empty items are dropped.
        notes: Free-form notes stored with the entry.

    Returns:
        The result reported by ``add_artist_fallback``: True if the entry
        was stored, False otherwise. Returned so callers can turn a failure
        into a non-zero exit status.
    """
    print("➕ Adding artist variation to lookup table...")
    print(f"📝 Canonical Name: {canonical_name}")
    print(f"🆔 MBID: {mbid}")
    print(f"📋 Notes: {notes}")

    # Parse variations
    stripped = (part.strip() for part in variations_str.split(','))
    variations = [name for name in stripped if name]
    print(f"🔄 Variations: {', '.join(variations)}")
    print()

    # Add to lookup table
    success = add_artist_fallback(canonical_name, mbid, variations, notes)
    if not success:
        print("❌ Failed to add artist variation")
        return False

    print("✅ Artist variation added successfully!")

    # Show updated statistics
    stats = artist_lookup.get_statistics()
    print(f"📊 Updated statistics: {stats['total_artists']} artists, {stats['total_variations']} variations")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def list_artists():
    """Print the canonical artist names from the lookup table, sorted and numbered."""
    names = artist_lookup.list_artists()

    if names:
        print(f"📚 Found {len(names)} artists in lookup table:")
        print("-" * 60)
        for number, name in enumerate(sorted(names), start=1):
            print(f"{number}. {name}")
    else:
        print("📚 Artist lookup table is empty")
|
||||||
|
|
||||||
|
|
||||||
|
def show_statistics():
    """Print the totals and, when available, the metadata of the lookup table."""
    stats = artist_lookup.get_statistics()

    print("📊 Artist Lookup Table Statistics")
    print("=" * 40)
    for label, key in (
        ("Total Artists", 'total_artists'),
        ("Total Variations", 'total_variations'),
    ):
        print(f"{label}: {stats[key]}")

    metadata = stats.get('metadata')
    if not metadata:
        return

    # (label, metadata key, text shown when the key is absent)
    metadata_rows = (
        ("Version", 'version', 'Unknown'),
        ("Last Updated", 'last_updated', 'Unknown'),
        ("Description", 'description', 'No description'),
    )
    for label, key, fallback in metadata_rows:
        print(f"{label}: {metadata.get(key, fallback)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s result as the process exit status. A return value
    # of None is treated as success (status 0) by sys.exit.
    sys.exit(main())
|
||||||
@ -7,6 +7,7 @@ between CLI and database interactions to ensure consistency.
|
|||||||
from typing import Dict, Optional, Tuple, Any, List
|
from typing import Dict, Optional, Tuple, Any, List
|
||||||
from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations
|
from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations
|
||||||
from ..utils.data_loader import data_loader
|
from ..utils.data_loader import data_loader
|
||||||
|
from ..utils.artist_lookup import find_artist_fallback
|
||||||
from .interfaces import MusicBrainzDataProvider, SongProcessorInterface
|
from .interfaces import MusicBrainzDataProvider, SongProcessorInterface
|
||||||
|
|
||||||
|
|
||||||
@ -47,6 +48,16 @@ class SongProcessor(SongProcessorInterface):
|
|||||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||||
return result[1] # Return MBID from tuple (artist_name, mbid, score)
|
return result[1] # Return MBID from tuple (artist_name, mbid, score)
|
||||||
|
|
||||||
|
# Final fallback: Check the artist lookup table
|
||||||
|
lookup_result = find_artist_fallback(artist_name, min_score=0.8)
|
||||||
|
if lookup_result:
|
||||||
|
print(f"🎯 Lookup table match found: {artist_name} → {lookup_result['canonical_name']} (score: {lookup_result['score']:.2f})")
|
||||||
|
# Store the canonical name for later use in clean_song method
|
||||||
|
if not hasattr(self, '_lookup_canonical_names'):
|
||||||
|
self._lookup_canonical_names = {}
|
||||||
|
self._lookup_canonical_names[artist_name] = lookup_result['canonical_name']
|
||||||
|
return lookup_result['mbid']
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
|
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
|
||||||
@ -178,6 +189,10 @@ class SongProcessor(SongProcessorInterface):
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (cleaned_song, success_status)
|
Tuple of (cleaned_song, success_status)
|
||||||
"""
|
"""
|
||||||
|
# Clear lookup cache for this song
|
||||||
|
if hasattr(self, '_lookup_canonical_names'):
|
||||||
|
self._lookup_canonical_names = {}
|
||||||
|
|
||||||
original_artist = song.get('artist', '')
|
original_artist = song.get('artist', '')
|
||||||
original_title = song.get('title', '')
|
original_title = song.get('title', '')
|
||||||
|
|
||||||
@ -225,7 +240,15 @@ class SongProcessor(SongProcessorInterface):
|
|||||||
# Get artist info
|
# Get artist info
|
||||||
artist_info = self.get_artist_info(artist_mbid)
|
artist_info = self.get_artist_info(artist_mbid)
|
||||||
if artist_info:
|
if artist_info:
|
||||||
song['artist'] = artist_info['name']
|
# Check if we have a canonical name from lookup table
|
||||||
|
original_artist = song.get('artist', '')
|
||||||
|
if hasattr(self, '_lookup_canonical_names') and original_artist in self._lookup_canonical_names:
|
||||||
|
# Use the canonical name from lookup table instead of database name
|
||||||
|
song['artist'] = self._lookup_canonical_names[original_artist]
|
||||||
|
print(f" 🔄 Updated artist name: {original_artist} → {song['artist']}")
|
||||||
|
else:
|
||||||
|
# Use the database name
|
||||||
|
song['artist'] = artist_info['name']
|
||||||
song['mbid'] = artist_mbid
|
song['mbid'] = artist_mbid
|
||||||
|
|
||||||
# Find recording MBID (pass original artist for multiple candidate search)
|
# Find recording MBID (pass original artist for multiple candidate search)
|
||||||
|
|||||||
234
src/utils/artist_lookup.py
Normal file
234
src/utils/artist_lookup.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
"""
|
||||||
|
Artist lookup table utilities for MusicBrainz Data Cleaner.
|
||||||
|
Provides fallback artist matching when database search fails.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Optional, List, Any
|
||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
|
||||||
|
|
||||||
|
class ArtistLookupTable:
    """Handles artist name variations lookup for fallback matching.

    The table is backed by a JSON document with two top-level keys::

        {
          "artist_variations": {
            "<canonical name>": {
              "mbid": "...",
              "variations": ["...", "..."],
              "notes": "..."
            }
          },
          "metadata": {...}
        }

    Names are compared case-insensitively with ``fuzz.ratio``; scores are
    scaled to the range 0.0-1.0.
    """

    def __init__(self, lookup_file: str = "data/artist_lookup.json"):
        """
        Initialize the artist lookup table.

        Args:
            lookup_file: Path to the JSON lookup file
        """
        self.lookup_file = Path(lookup_file)
        self.lookup_data = self._load_lookup_data()

    @staticmethod
    def _empty_table() -> Dict[str, Any]:
        """Return a fresh, empty lookup structure."""
        return {"artist_variations": {}, "metadata": {}}

    def _load_lookup_data(self) -> Dict[str, Any]:
        """Load the artist lookup data from JSON file.

        Returns:
            The parsed document, or an empty table when the file is
            missing, unreadable, not valid JSON, or not shaped like a
            lookup table. Problems are reported on stdout, never raised.
        """
        try:
            if not self.lookup_file.exists():
                print(f"⚠️ Artist lookup file not found: {self.lookup_file}")
                return self._empty_table()
            with open(self.lookup_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and decoding errors.
            print(f"❌ Error loading artist lookup table: {e}")
            return self._empty_table()

        # Every other method assumes a dict of dicts; reject anything else
        # here instead of failing later in the middle of a lookup.
        if not isinstance(data, dict) or not isinstance(data.get('artist_variations', {}), dict):
            print("❌ Error loading artist lookup table: unexpected JSON structure")
            return self._empty_table()

        print(f"📚 Loaded artist lookup table with {len(data.get('artist_variations', {}))} artists")
        return data

    def _iter_matches(self, name: str, min_score: float):
        """Yield a match dict for every table name scoring at least ``min_score``.

        For each artist the canonical name is compared first, then each
        variation, in table order. Variation matches carry an additional
        ``matched_variation`` key.
        """
        needle = name.lower()
        table = self.lookup_data.get('artist_variations') or {}
        for canonical_name, artist_info in table.items():
            candidates = [(canonical_name, 'canonical_name')]
            candidates.extend(
                (variation, 'variation')
                for variation in artist_info.get('variations', [])
            )
            for candidate, match_type in candidates:
                score = fuzz.ratio(needle, candidate.lower()) / 100.0
                if score < min_score:
                    continue
                match = {
                    'canonical_name': canonical_name,
                    'mbid': artist_info['mbid'],
                    'score': score,
                    'notes': artist_info.get('notes', ''),
                    'match_type': match_type,
                }
                if match_type == 'variation':
                    match['matched_variation'] = candidate
                yield match

    def find_artist_by_variation(self, artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
        """
        Find artist by name variation using fuzzy matching.

        Args:
            artist_name: The artist name to search for
            min_score: Minimum fuzzy match score (0.0 to 1.0)

        Returns:
            The best-scoring match as a dict with ``canonical_name``,
            ``mbid``, ``score``, ``notes``, ``match_type`` and, for
            variation matches, ``matched_variation``; None if nothing
            reaches ``min_score``. On equal scores the first match in
            table order wins.
        """
        if not artist_name or not self.lookup_data.get('artist_variations'):
            return None

        best_match = None
        best_score = 0
        for match in self._iter_matches(artist_name.strip(), min_score):
            # Strictly greater: ties keep the earlier match.
            if match['score'] > best_score:
                best_score = match['score']
                best_match = match
        return best_match

    def add_artist_variation(self, canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
        """
        Add an artist entry to the lookup table and persist it.

        An existing entry with the same canonical name is replaced.
        ``metadata['total_artists']`` is updated to the new artist count.
        The in-memory table is only changed when the file was written
        successfully.

        Args:
            canonical_name: The canonical artist name
            mbid: The MusicBrainz ID
            variations: List of name variations
            notes: Optional notes about the artist

        Returns:
            True if successfully added, False otherwise
        """
        if not canonical_name or not mbid:
            print("❌ Error adding artist variation: canonical name and MBID are required")
            return False

        try:
            # Work on copies so a failed save leaves the current state intact.
            table = dict(self.lookup_data.get('artist_variations') or {})
            table[canonical_name] = {
                'mbid': mbid,
                'variations': list(variations or []),
                'notes': notes,
            }
            metadata = self.lookup_data.get('metadata')
            metadata = dict(metadata) if isinstance(metadata, dict) else {}
            metadata['total_artists'] = len(table)
        except (TypeError, ValueError) as e:
            print(f"❌ Error adding artist variation: {e}")
            return False

        previous = self.lookup_data
        self.lookup_data = {**previous, 'artist_variations': table, 'metadata': metadata}
        if self._save_lookup_data():
            return True

        # Roll back: the entry was not persisted, so it must not linger in memory.
        self.lookup_data = previous
        return False

    def _save_lookup_data(self) -> bool:
        """Save the lookup data back to the JSON file.

        The document is written to a sibling ``<name>.tmp`` file first and
        then moved over the real file, so a failure while serialising does
        not truncate an existing table.

        Returns:
            True on success, False if the data could not be written.
        """
        tmp_path = self.lookup_file.with_name(self.lookup_file.name + ".tmp")
        try:
            # Ensure directory exists
            self.lookup_file.parent.mkdir(parents=True, exist_ok=True)

            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(self.lookup_data, f, indent=2, ensure_ascii=False)
            tmp_path.replace(self.lookup_file)
        except (OSError, TypeError, ValueError) as e:
            print(f"❌ Error saving artist lookup table: {e}")
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup; the temp file may never have been created.
                pass
            return False

        print(f"💾 Saved artist lookup table to {self.lookup_file}")
        return True

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics about the lookup table.

        Returns:
            Dict with ``total_artists``, ``total_variations`` (each
            canonical name counts as one variation) and ``metadata``.
            All three keys are present even when the table is empty.
        """
        table = self.lookup_data.get('artist_variations') or {}
        total_variations = sum(
            len(artist_info.get('variations', [])) + 1  # +1 for canonical name
            for artist_info in table.values()
        )
        return {
            'total_artists': len(table),
            'total_variations': total_variations,
            'metadata': self.lookup_data.get('metadata', {}),
        }

    def list_artists(self) -> List[str]:
        """Get a list of all canonical artist names in the lookup table."""
        return list(self.lookup_data.get('artist_variations') or {})

    def search_artists(self, query: str, min_score: float = 0.6) -> List[Dict[str, Any]]:
        """
        Search for artists in the lookup table.

        Unlike ``find_artist_by_variation`` this returns every match, so an
        artist can appear several times (once per matching name).

        Args:
            query: Search query; surrounding whitespace is ignored
            min_score: Minimum fuzzy match score

        Returns:
            List of matching artists with scores, highest score first.
            Empty when the query is empty or None.
        """
        query_clean = query.strip() if query else ""
        if not query_clean:
            return []

        results = list(self._iter_matches(query_clean, min_score))
        # Sort by score (highest first); the sort is stable, so equal scores
        # stay in table order.
        results.sort(key=lambda match: match['score'], reverse=True)
        return results
|
||||||
|
|
||||||
|
|
||||||
|
# Global instance for easy access
|
||||||
|
# NOTE: constructed at import time with the default lookup file path, so
# importing this module reads that file and prints a status line.
artist_lookup = ArtistLookupTable()
|
||||||
|
|
||||||
|
|
||||||
|
def find_artist_fallback(artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
    """
    Look up an artist in the shared module-level lookup table.

    Thin wrapper around ``artist_lookup.find_artist_by_variation``.

    Args:
        artist_name: The artist name to search for
        min_score: Minimum fuzzy match score

    Returns:
        Artist info dict or None if not found
    """
    match = artist_lookup.find_artist_by_variation(artist_name, min_score)
    return match
|
||||||
|
|
||||||
|
|
||||||
|
def add_artist_fallback(canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
    """
    Store an artist entry in the shared module-level lookup table.

    Thin wrapper around ``artist_lookup.add_artist_variation``.

    Args:
        canonical_name: The canonical artist name
        mbid: The MusicBrainz ID
        variations: List of name variations
        notes: Optional notes about the artist

    Returns:
        True if successfully added, False otherwise
    """
    added = artist_lookup.add_artist_variation(canonical_name, mbid, variations, notes)
    return added
|
||||||
Loading…
Reference in New Issue
Block a user