Signed-off-by: Matt Bruce <mbrucedogs@gmail.com>
This commit is contained in:
parent
f127bac480
commit
fee61a12bc
164
src/cli/artist_lookup_cli.py
Normal file
164
src/cli/artist_lookup_cli.py
Normal file
@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Command-line interface for managing artist lookup table.
|
||||
Allows adding new artist variations and searching existing ones.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
# Make the project importable when this file is executed directly.
# The import below is ``from src.utils...``, which needs the *project root*
# (the directory that contains ``src``) on sys.path; the ``src`` directory
# itself is kept on the path as before for backward compatibility.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
for _extra_path in (str(_PROJECT_ROOT / "src"), str(_PROJECT_ROOT)):
    if _extra_path not in sys.path:
        sys.path.insert(0, _extra_path)

from src.utils.artist_lookup import artist_lookup, add_artist_fallback
|
||||
|
||||
|
||||
def main():
    """Parse command-line arguments and dispatch to the chosen sub-command.

    Sub-commands:
        search QUERY [--min-score S]   fuzzy-search the lookup table
        add --canonical-name N --mbid ID --variations "A, B" [--notes T]
        list                           list all canonical artist names
        stats                          show lookup table statistics

    When no sub-command is given the help text is printed and the function
    returns. Invalid arguments terminate the process through argparse
    (``SystemExit`` with status 2).
    """
    parser = argparse.ArgumentParser(
        description="MusicBrainz Data Cleaner - Artist Lookup Table Manager",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must mirror the sub-commands defined below; they are
        # shown verbatim because of RawDescriptionHelpFormatter.
        epilog="""
EXAMPLES:
  # Search for an artist in the lookup table
  python3 -m src.cli.artist_lookup_cli search "Destiny's Child"

  # Add a new artist variation
  python3 -m src.cli.artist_lookup_cli add --canonical-name "New Artist" --mbid "12345678-1234-1234-1234-123456789abc" --variations "Artist, The Artist, Artist Band"

  # List all artists in the lookup table
  python3 -m src.cli.artist_lookup_cli list

  # Show statistics
  python3 -m src.cli.artist_lookup_cli stats
"""
    )

    # Create subparsers for different commands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Search command
    search_parser = subparsers.add_parser('search', help='Search for artists in lookup table')
    search_parser.add_argument('query', help='Artist name to search for')
    search_parser.add_argument('--min-score', type=float, default=0.6,
                               help='Minimum fuzzy match score (0.0 to 1.0, default: 0.6)')

    # Add command
    add_parser = subparsers.add_parser('add', help='Add new artist variation to lookup table')
    add_parser.add_argument('--canonical-name', required=True, help='Canonical artist name')
    add_parser.add_argument('--mbid', required=True, help='MusicBrainz ID')
    add_parser.add_argument('--variations', required=True,
                            help='Comma-separated list of name variations')
    add_parser.add_argument('--notes', default='', help='Optional notes about the artist')

    # List and stats commands take no arguments of their own
    subparsers.add_parser('list', help='List all artists in lookup table')
    subparsers.add_parser('stats', help='Show lookup table statistics')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Enforce the range promised by the --min-score help text; the chained
    # comparison also rejects NaN.
    if args.command == 'search' and not 0.0 <= args.min_score <= 1.0:
        parser.error("--min-score must be between 0.0 and 1.0")

    # Execute commands
    if args.command == 'search':
        search_artists(args.query, args.min_score)
    elif args.command == 'add':
        add_artist(args.canonical_name, args.mbid, args.variations, args.notes)
    elif args.command == 'list':
        list_artists()
    elif args.command == 'stats':
        show_statistics()
|
||||
|
||||
|
||||
def search_artists(query: str, min_score: float):
    """Print every lookup-table entry whose name fuzzily matches ``query``.

    Args:
        query: Artist name to look for.
        min_score: Lowest fuzzy score (passed through to the lookup table)
            that still counts as a match.
    """
    divider = "-" * 60
    print(f"🔍 Searching for '{query}' in artist lookup table...")
    print(f"📊 Minimum score: {min_score}")
    print(divider)

    matches = artist_lookup.search_artists(query, min_score)

    if matches:
        print(f"✅ Found {len(matches)} matches:")
        print()

        for position, match in enumerate(matches, start=1):
            print(f"{position}. {match['canonical_name']}")
            print(f" MBID: {match['mbid']}")
            print(f" Score: {match['score']:.2f}")
            print(f" Match Type: {match['match_type']}")
            # Optional fields are only shown when present and non-empty.
            for key, label in (('matched_variation', 'Matched Variation'),
                               ('notes', 'Notes')):
                value = match.get(key)
                if value:
                    print(f" {label}: {value}")
            print()
    else:
        print("❌ No matches found")
|
||||
|
||||
|
||||
def add_artist(canonical_name: str, mbid: str, variations_str: str, notes: str):
    """Store an artist entry in the lookup table and report the outcome.

    Args:
        canonical_name: Canonical artist name used as the table key.
        mbid: MusicBrainz ID of the artist.
        variations_str: Comma-separated name variations; blank items are
            dropped and surrounding whitespace is trimmed.
        notes: Free-form notes stored with the entry.
    """
    print("➕ Adding artist variation to lookup table...")
    print(f"📝 Canonical Name: {canonical_name}")
    print(f"🆔 MBID: {mbid}")
    print(f"📋 Notes: {notes}")

    # Split the comma-separated input into clean, non-empty names.
    variations = []
    for piece in variations_str.split(','):
        trimmed = piece.strip()
        if trimmed:
            variations.append(trimmed)
    print(f"🔄 Variations: {', '.join(variations)}")
    print()

    added = add_artist_fallback(canonical_name, mbid, variations, notes)
    if not added:
        print("❌ Failed to add artist variation")
        return

    print("✅ Artist variation added successfully!")

    # Report the table size after the insert.
    summary = artist_lookup.get_statistics()
    print(f"📊 Updated statistics: {summary['total_artists']} artists, {summary['total_variations']} variations")
|
||||
|
||||
|
||||
def list_artists():
    """Print a numbered, alphabetically sorted list of canonical artist names."""
    names = artist_lookup.list_artists()

    if names:
        print(f"📚 Found {len(names)} artists in lookup table:")
        print("-" * 60)

        ordered_names = sorted(names)
        for number, name in enumerate(ordered_names, start=1):
            print(f"{number}. {name}")
    else:
        print("📚 Artist lookup table is empty")
|
||||
|
||||
|
||||
def show_statistics():
    """Print the artist/variation counts and, when available, table metadata."""
    summary = artist_lookup.get_statistics()

    print("📊 Artist Lookup Table Statistics")
    print("=" * 40)
    print(f"Total Artists: {summary['total_artists']}")
    print(f"Total Variations: {summary['total_variations']}")

    # Metadata is optional; nothing more to print when it is missing or empty.
    meta = summary.get('metadata')
    if not meta:
        return

    print(f"Version: {meta.get('version', 'Unknown')}")
    print(f"Last Updated: {meta.get('last_updated', 'Unknown')}")
    print(f"Description: {meta.get('description', 'No description')}")
|
||||
|
||||
|
||||
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@ -7,6 +7,7 @@ between CLI and database interactions to ensure consistency.
|
||||
from typing import Dict, Optional, Tuple, Any, List
|
||||
from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations
|
||||
from ..utils.data_loader import data_loader
|
||||
from ..utils.artist_lookup import find_artist_fallback
|
||||
from .interfaces import MusicBrainzDataProvider, SongProcessorInterface
|
||||
|
||||
|
||||
@ -47,6 +48,16 @@ class SongProcessor(SongProcessorInterface):
|
||||
if result and isinstance(result, tuple) and len(result) >= 2:
|
||||
return result[1] # Return MBID from tuple (artist_name, mbid, score)
|
||||
|
||||
# Final fallback: Check the artist lookup table
|
||||
lookup_result = find_artist_fallback(artist_name, min_score=0.8)
|
||||
if lookup_result:
|
||||
print(f"🎯 Lookup table match found: {artist_name} → {lookup_result['canonical_name']} (score: {lookup_result['score']:.2f})")
|
||||
# Store the canonical name for later use in clean_song method
|
||||
if not hasattr(self, '_lookup_canonical_names'):
|
||||
self._lookup_canonical_names = {}
|
||||
self._lookup_canonical_names[artist_name] = lookup_result['canonical_name']
|
||||
return lookup_result['mbid']
|
||||
|
||||
return None
|
||||
|
||||
def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]:
|
||||
@ -178,6 +189,10 @@ class SongProcessor(SongProcessorInterface):
|
||||
Returns:
|
||||
Tuple of (cleaned_song, success_status)
|
||||
"""
|
||||
# Clear lookup cache for this song
|
||||
if hasattr(self, '_lookup_canonical_names'):
|
||||
self._lookup_canonical_names = {}
|
||||
|
||||
original_artist = song.get('artist', '')
|
||||
original_title = song.get('title', '')
|
||||
|
||||
@ -225,7 +240,15 @@ class SongProcessor(SongProcessorInterface):
|
||||
# Get artist info
|
||||
artist_info = self.get_artist_info(artist_mbid)
|
||||
if artist_info:
|
||||
song['artist'] = artist_info['name']
|
||||
# Check if we have a canonical name from lookup table
|
||||
original_artist = song.get('artist', '')
|
||||
if hasattr(self, '_lookup_canonical_names') and original_artist in self._lookup_canonical_names:
|
||||
# Use the canonical name from lookup table instead of database name
|
||||
song['artist'] = self._lookup_canonical_names[original_artist]
|
||||
print(f" 🔄 Updated artist name: {original_artist} → {song['artist']}")
|
||||
else:
|
||||
# Use the database name
|
||||
song['artist'] = artist_info['name']
|
||||
song['mbid'] = artist_mbid
|
||||
|
||||
# Find recording MBID (pass original artist for multiple candidate search)
|
||||
|
||||
234
src/utils/artist_lookup.py
Normal file
234
src/utils/artist_lookup.py
Normal file
@ -0,0 +1,234 @@
|
||||
"""
|
||||
Artist lookup table utilities for MusicBrainz Data Cleaner.
|
||||
Provides fallback artist matching when database search fails.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, List, Any
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
|
||||
class ArtistLookupTable:
    """Handles artist name variations lookup for fallback matching.

    The table is a JSON document with two top-level keys:

    * ``artist_variations``: maps a canonical artist name to a dict with
      ``mbid``, ``variations`` (list of alternative names) and ``notes``.
    * ``metadata``: free-form information about the table.

    The file is read once when the instance is created and rewritten after
    every successful :meth:`add_artist_variation` call.
    """

    def __init__(self, lookup_file: str = "data/artist_lookup.json"):
        """
        Initialize the artist lookup table.

        Args:
            lookup_file: Path to the JSON lookup file
        """
        self.lookup_file = Path(lookup_file)
        self.lookup_data = self._load_lookup_data()

    @staticmethod
    def _empty_lookup_data() -> Dict[str, Any]:
        """Return a fresh, empty table structure."""
        return {"artist_variations": {}, "metadata": {}}

    def _load_lookup_data(self) -> Dict[str, Any]:
        """Load the artist lookup data from the JSON file.

        Returns:
            The parsed table, or an empty table when the file is missing,
            unreadable, not valid JSON, or not a JSON object.
        """
        try:
            if not self.lookup_file.exists():
                print(f"⚠️ Artist lookup file not found: {self.lookup_file}")
                return self._empty_lookup_data()

            with open(self.lookup_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"❌ Error loading artist lookup table: {e}")
            return self._empty_lookup_data()

        if not isinstance(data, dict):
            print(f"❌ Error loading artist lookup table: expected a JSON object, got {type(data).__name__}")
            return self._empty_lookup_data()

        print(f"📚 Loaded artist lookup table with {len(data.get('artist_variations') or {})} artists")
        return data

    def _iter_matches(self, needle: str, min_score: float):
        """Yield a match dict for every stored name scoring >= ``min_score``.

        ``needle`` must already be normalised (stripped and lower-cased).
        For each artist the canonical name is checked first, then its
        variations in stored order. Entries that are not dicts or have no
        ``mbid`` are skipped because they cannot produce a usable result.
        """
        entries = self.lookup_data.get('artist_variations') or {}
        for canonical_name, artist_info in entries.items():
            if not isinstance(artist_info, dict) or 'mbid' not in artist_info:
                continue

            mbid = artist_info['mbid']
            notes = artist_info.get('notes', '')

            # fuzz.ratio returns 0-100; scale to the 0.0-1.0 range used here.
            score = fuzz.ratio(needle, canonical_name.lower()) / 100.0
            if score >= min_score:
                yield {
                    'canonical_name': canonical_name,
                    'mbid': mbid,
                    'score': score,
                    'match_type': 'canonical_name',
                    'notes': notes,
                }

            for variation in artist_info.get('variations') or []:
                score = fuzz.ratio(needle, variation.lower()) / 100.0
                if score >= min_score:
                    yield {
                        'canonical_name': canonical_name,
                        'mbid': mbid,
                        'score': score,
                        'match_type': 'variation',
                        'matched_variation': variation,
                        'notes': notes,
                    }

    def find_artist_by_variation(self, artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
        """
        Find the single best-scoring artist for a name using fuzzy matching.

        Args:
            artist_name: The artist name to search for
            min_score: Minimum fuzzy match score (0.0 to 1.0)

        Returns:
            Dict with ``canonical_name``, ``mbid``, ``score``, ``notes``,
            ``match_type`` (and ``matched_variation`` for variation matches),
            or None if the name is empty, the table is empty, or nothing
            scores at least ``min_score``. On ties the first entry found wins.
        """
        if not artist_name or not self.lookup_data.get('artist_variations'):
            return None

        needle = artist_name.strip().lower()
        best_match = None
        best_score = 0.0  # a score of exactly 0 never counts as a match

        for match in self._iter_matches(needle, min_score):
            if match['score'] > best_score:
                best_score = match['score']
                best_match = match

        return best_match

    def add_artist_variation(self, canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
        """
        Add an artist entry to the lookup table and persist it.

        An existing entry with the same canonical name is replaced. The
        in-memory table is only changed when the file was written
        successfully, so memory and disk never diverge.

        Args:
            canonical_name: The canonical artist name
            mbid: The MusicBrainz ID
            variations: List of name variations
            notes: Optional notes about the artist

        Returns:
            True if successfully added, False otherwise
        """
        if not canonical_name or not canonical_name.strip():
            print("❌ Error adding artist variation: canonical name is empty")
            return False
        if not mbid or not mbid.strip():
            print("❌ Error adding artist variation: MBID is empty")
            return False

        try:
            # Build the new state on copies so a failed save can be undone.
            entries = dict(self.lookup_data.get('artist_variations') or {})
            entries[canonical_name] = {
                'mbid': mbid,
                'variations': list(variations or []),
                'notes': notes,
            }
            metadata = dict(self.lookup_data.get('metadata') or {})
            metadata['total_artists'] = len(entries)
        except (TypeError, ValueError) as e:
            print(f"❌ Error adding artist variation: {e}")
            return False

        previous_data = self.lookup_data
        self.lookup_data = {
            **previous_data,
            'artist_variations': entries,
            'metadata': metadata,
        }

        if self._save_lookup_data():
            return True

        # Saving failed: restore the state that matches the file on disk.
        self.lookup_data = previous_data
        return False

    def _save_lookup_data(self) -> bool:
        """Save the lookup data back to the JSON file.

        The data is written to a temporary sibling file that then replaces
        the real file, so a failure while writing cannot leave a truncated
        or half-written lookup table behind.

        Returns:
            True if the file was written, False otherwise.
        """
        tmp_path = self.lookup_file.with_name(self.lookup_file.name + ".tmp")
        try:
            # Ensure directory exists
            self.lookup_file.parent.mkdir(parents=True, exist_ok=True)

            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(self.lookup_data, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, self.lookup_file)
        except (OSError, TypeError, ValueError) as e:
            # TypeError/ValueError: data that json cannot serialise.
            print(f"❌ Error saving artist lookup table: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing to clean up
            return False

        print(f"💾 Saved artist lookup table to {self.lookup_file}")
        return True

    def get_statistics(self) -> Dict[str, Any]:
        """Get statistics about the lookup table.

        Returns:
            Dict with ``total_artists``, ``total_variations`` (each artist
            counts its canonical name plus every variation) and ``metadata``.
        """
        entries = self.lookup_data.get('artist_variations') or {}

        total_variations = sum(
            len(artist_info.get('variations') or []) + 1  # +1 for canonical name
            for artist_info in entries.values()
        )

        return {
            'total_artists': len(entries),
            'total_variations': total_variations,
            'metadata': self.lookup_data.get('metadata') or {},
        }

    def list_artists(self) -> List[str]:
        """Get a list of all canonical artist names in the lookup table."""
        return list(self.lookup_data.get('artist_variations') or {})

    def search_artists(self, query: str, min_score: float = 0.6) -> List[Dict[str, Any]]:
        """
        Search for artists in the lookup table.

        Unlike :meth:`find_artist_by_variation` this returns every match,
        so one artist can appear several times (once for its canonical
        name and once per matching variation).

        Args:
            query: Search query; surrounding whitespace is ignored and
                None is treated as an empty query
            min_score: Minimum fuzzy match score

        Returns:
            List of matching artists with scores, highest score first
        """
        needle = (query or "").strip().lower()
        results = list(self._iter_matches(needle, min_score))

        # Sort by score (highest first); the sort is stable, so equal
        # scores keep table order.
        results.sort(key=lambda match: match['score'], reverse=True)
        return results
|
||||
|
||||
|
||||
# Global instance for easy access
|
||||
# Shared module-level table, created at import time. The constructor uses its
# default path "data/artist_lookup.json" (relative, so resolved against the
# current working directory) and prints a status line while loading.
artist_lookup = ArtistLookupTable()
|
||||
|
||||
|
||||
def find_artist_fallback(artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]:
    """
    Look up an artist in the shared module-level lookup table.

    Thin wrapper around ``artist_lookup.find_artist_by_variation``.

    Args:
        artist_name: The artist name to search for
        min_score: Minimum fuzzy match score

    Returns:
        Artist info dict or None if not found
    """
    lookup = artist_lookup.find_artist_by_variation
    return lookup(artist_name, min_score)
|
||||
|
||||
|
||||
def add_artist_fallback(canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool:
    """
    Add an artist entry to the shared module-level lookup table.

    Thin wrapper around ``artist_lookup.add_artist_variation``.

    Args:
        canonical_name: The canonical artist name
        mbid: The MusicBrainz ID
        variations: List of name variations
        notes: Optional notes about the artist

    Returns:
        True if successfully added, False otherwise
    """
    store = artist_lookup.add_artist_variation
    return store(canonical_name, mbid, variations, notes)
|
||||
Loading…
Reference in New Issue
Block a user