From fee61a12bcdfdb75a36035383a9cc81fdabc227e Mon Sep 17 00:00:00 2001 From: Matt Bruce Date: Tue, 5 Aug 2025 08:26:45 -0500 Subject: [PATCH] Signed-off-by: Matt Bruce --- src/cli/artist_lookup_cli.py | 164 ++++++++++++++++++++++++ src/core/song_processor.py | 25 +++- src/utils/artist_lookup.py | 234 +++++++++++++++++++++++++++++++++++ 3 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 src/cli/artist_lookup_cli.py create mode 100644 src/utils/artist_lookup.py diff --git a/src/cli/artist_lookup_cli.py b/src/cli/artist_lookup_cli.py new file mode 100644 index 0000000..5039fe7 --- /dev/null +++ b/src/cli/artist_lookup_cli.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Command-line interface for managing artist lookup table. +Allows adding new artist variations and searching existing ones. +""" + +import sys +import argparse +from pathlib import Path +from typing import List + +# Add the src directory to the Python path +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + +from src.utils.artist_lookup import artist_lookup, add_artist_fallback + + +def main(): + """Main CLI function for artist lookup table management.""" + parser = argparse.ArgumentParser( + description="MusicBrainz Data Cleaner - Artist Lookup Table Manager", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +EXAMPLES: + # Search for an artist in the lookup table + python3 -m src.cli.artist_lookup_cli --search "Destiny's Child" + + # Add a new artist variation + python3 -m src.cli.artist_lookup_cli --add "New Artist" --mbid "12345678-1234-1234-1234-123456789abc" --variations "Artist, The Artist, Artist Band" + + # List all artists in the lookup table + python3 -m src.cli.artist_lookup_cli --list + + # Show statistics + python3 -m src.cli.artist_lookup_cli --stats + """ + ) + + # Create subparsers for different commands + subparsers = parser.add_subparsers(dest='command', help='Available commands') + + # Search command + search_parser = subparsers.add_parser('search', help='Search for artists in lookup table') + search_parser.add_argument('query', help='Artist name to search for') + search_parser.add_argument('--min-score', type=float, default=0.6, + help='Minimum fuzzy match score (0.0 to 1.0, default: 0.6)') + + # Add command + add_parser = subparsers.add_parser('add', help='Add new artist variation to lookup table') + add_parser.add_argument('--canonical-name', required=True, help='Canonical artist name') + add_parser.add_argument('--mbid', required=True, help='MusicBrainz ID') + add_parser.add_argument('--variations', required=True, + help='Comma-separated list of name variations') + add_parser.add_argument('--notes', default='', help='Optional notes about the artist') + + # List command + list_parser = subparsers.add_parser('list', help='List all artists in lookup table') + + # Stats command + stats_parser = subparsers.add_parser('stats', help='Show lookup table statistics') + + # Parse arguments + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + # Execute commands + if args.command == 'search': + search_artists(args.query, args.min_score) + elif args.command == 'add': + add_artist(args.canonical_name, args.mbid, args.variations, args.notes) + elif args.command == 'list': + list_artists() + elif args.command == 'stats': + show_statistics() + + +def search_artists(query: str, min_score: float): + """Search for artists in the lookup table.""" + print(f"🔍 Searching for '{query}' in artist lookup table...") + print(f"📊 Minimum score: {min_score}") + print("-" * 60) + + results = artist_lookup.search_artists(query, min_score) + + if not results: + print("❌ No matches found") + return + + print(f"✅ Found {len(results)} matches:") + print() + + for i, result in enumerate(results, 1): + print(f"{i}. {result['canonical_name']}") + print(f" MBID: {result['mbid']}") + print(f" Score: {result['score']:.2f}") + print(f" Match Type: {result['match_type']}") + if result.get('matched_variation'): + print(f" Matched Variation: {result['matched_variation']}") + if result.get('notes'): + print(f" Notes: {result['notes']}") + print() + + +def add_artist(canonical_name: str, mbid: str, variations_str: str, notes: str): + """Add a new artist variation to the lookup table.""" + print(f"➕ Adding artist variation to lookup table...") + print(f"📝 Canonical Name: {canonical_name}") + print(f"🆔 MBID: {mbid}") + print(f"📋 Notes: {notes}") + + # Parse variations + variations = [v.strip() for v in variations_str.split(',') if v.strip()] + print(f"🔄 Variations: {', '.join(variations)}") + print() + + # Add to lookup table + success = add_artist_fallback(canonical_name, mbid, variations, notes) + + if success: + print("✅ Artist variation added successfully!") + + # Show updated statistics + stats = artist_lookup.get_statistics() + print(f"📊 Updated statistics: {stats['total_artists']} artists, {stats['total_variations']} variations") + else: + print("❌ Failed to add artist variation") + + +def list_artists(): + """List all artists in the lookup table.""" + artists = artist_lookup.list_artists() + + if not artists: + print("📚 Artist lookup table is empty") + return + + print(f"📚 Found {len(artists)} artists in lookup table:") + print("-" * 60) + + for i, artist in enumerate(sorted(artists), 1): + print(f"{i}. {artist}") + + +def show_statistics(): + """Show lookup table statistics.""" + stats = artist_lookup.get_statistics() + + print("📊 Artist Lookup Table Statistics") + print("=" * 40) + print(f"Total Artists: {stats['total_artists']}") + print(f"Total Variations: {stats['total_variations']}") + + if stats.get('metadata'): + metadata = stats['metadata'] + print(f"Version: {metadata.get('version', 'Unknown')}") + print(f"Last Updated: {metadata.get('last_updated', 'Unknown')}") + print(f"Description: {metadata.get('description', 'No description')}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/core/song_processor.py b/src/core/song_processor.py index d50db72..c66ced1 100644 --- a/src/core/song_processor.py +++ b/src/core/song_processor.py @@ -7,6 +7,7 @@ between CLI and database interactions to ensure consistency. from typing import Dict, Optional, Tuple, Any, List from ..utils.artist_title_processing import parse_complex_collaboration, generate_title_variations from ..utils.data_loader import data_loader +from ..utils.artist_lookup import find_artist_fallback from .interfaces import MusicBrainzDataProvider, SongProcessorInterface @@ -47,6 +48,16 @@ class SongProcessor(SongProcessorInterface): if result and isinstance(result, tuple) and len(result) >= 2: return result[1] # Return MBID from tuple (artist_name, mbid, score) + # Final fallback: Check the artist lookup table + lookup_result = find_artist_fallback(artist_name, min_score=0.8) + if lookup_result: + print(f"🎯 Lookup table match found: {artist_name} → {lookup_result['canonical_name']} (score: {lookup_result['score']:.2f})") + # Store the canonical name for later use in clean_song method + if not hasattr(self, '_lookup_canonical_names'): + self._lookup_canonical_names = {} + self._lookup_canonical_names[artist_name] = lookup_result['canonical_name'] + return lookup_result['mbid'] + return None def find_recording_mbid(self, artist_mbid: str, title: str, original_artist: str = None) -> Optional[str]: @@ -178,6 +189,10 @@ class SongProcessor(SongProcessorInterface): Returns: Tuple of (cleaned_song, success_status) """ + # Clear lookup cache for this song + if hasattr(self, '_lookup_canonical_names'): + self._lookup_canonical_names = {} + original_artist = song.get('artist', '') original_title = song.get('title', '') @@ -225,7 +240,15 @@ class SongProcessor(SongProcessorInterface): # Get artist info artist_info = self.get_artist_info(artist_mbid) if artist_info: - song['artist'] = artist_info['name'] + # Check if we have a canonical name from lookup table + original_artist = song.get('artist', '') + if hasattr(self, '_lookup_canonical_names') and original_artist in self._lookup_canonical_names: + # Use the canonical name from lookup table instead of database name + song['artist'] = self._lookup_canonical_names[original_artist] + print(f" 🔄 Updated artist name: {original_artist} → {song['artist']}") + else: + # Use the database name + song['artist'] = artist_info['name'] song['mbid'] = artist_mbid # Find recording MBID (pass original artist for multiple candidate search) diff --git a/src/utils/artist_lookup.py b/src/utils/artist_lookup.py new file mode 100644 index 0000000..c17990a --- /dev/null +++ b/src/utils/artist_lookup.py @@ -0,0 +1,234 @@ +""" +Artist lookup table utilities for MusicBrainz Data Cleaner. +Provides fallback artist matching when database search fails. +""" + +import json +import os +from pathlib import Path +from typing import Dict, Optional, List, Any +from fuzzywuzzy import fuzz + + +class ArtistLookupTable: + """Handles artist name variations lookup for fallback matching.""" + + def __init__(self, lookup_file: str = "data/artist_lookup.json"): + """ + Initialize the artist lookup table. + + Args: + lookup_file: Path to the JSON lookup file + """ + self.lookup_file = Path(lookup_file) + self.lookup_data = self._load_lookup_data() + + def _load_lookup_data(self) -> Dict[str, Any]: + """Load the artist lookup data from JSON file.""" + try: + if self.lookup_file.exists(): + with open(self.lookup_file, 'r', encoding='utf-8') as f: + data = json.load(f) + print(f"📚 Loaded artist lookup table with {len(data.get('artist_variations', {}))} artists") + return data + else: + print(f"⚠️ Artist lookup file not found: {self.lookup_file}") + return {"artist_variations": {}, "metadata": {}} + except Exception as e: + print(f"❌ Error loading artist lookup table: {e}") + return {"artist_variations": {}, "metadata": {}} + + def find_artist_by_variation(self, artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]: + """ + Find artist by name variation using fuzzy matching. + + Args: + artist_name: The artist name to search for + min_score: Minimum fuzzy match score (0.0 to 1.0) + + Returns: + Artist info dict with mbid, name, and score, or None if not found + """ + if not artist_name or not self.lookup_data.get('artist_variations'): + return None + + artist_name_clean = artist_name.strip() + best_match = None + best_score = 0 + + for canonical_name, artist_info in self.lookup_data['artist_variations'].items(): + # Check against canonical name + score = fuzz.ratio(artist_name_clean.lower(), canonical_name.lower()) / 100.0 + if score > best_score and score >= min_score: + best_score = score + best_match = { + 'canonical_name': canonical_name, + 'mbid': artist_info['mbid'], + 'score': score, + 'notes': artist_info.get('notes', ''), + 'match_type': 'canonical_name' + } + + # Check against variations + for variation in artist_info.get('variations', []): + score = fuzz.ratio(artist_name_clean.lower(), variation.lower()) / 100.0 + if score > best_score and score >= min_score: + best_score = score + best_match = { + 'canonical_name': canonical_name, + 'mbid': artist_info['mbid'], + 'score': score, + 'notes': artist_info.get('notes', ''), + 'match_type': 'variation', + 'matched_variation': variation + } + + return best_match + + def add_artist_variation(self, canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool: + """ + Add a new artist variation to the lookup table. + + Args: + canonical_name: The canonical artist name + mbid: The MusicBrainz ID + variations: List of name variations + notes: Optional notes about the artist + + Returns: + True if successfully added, False otherwise + """ + try: + if not self.lookup_data.get('artist_variations'): + self.lookup_data['artist_variations'] = {} + + self.lookup_data['artist_variations'][canonical_name] = { + 'mbid': mbid, + 'variations': variations, + 'notes': notes + } + + # Update metadata + if 'metadata' not in self.lookup_data: + self.lookup_data['metadata'] = {} + + self.lookup_data['metadata']['total_artists'] = len(self.lookup_data['artist_variations']) + + return self._save_lookup_data() + + except Exception as e: + print(f"❌ Error adding artist variation: {e}") + return False + + def _save_lookup_data(self) -> bool: + """Save the lookup data back to the JSON file.""" + try: + # Ensure directory exists + self.lookup_file.parent.mkdir(parents=True, exist_ok=True) + + with open(self.lookup_file, 'w', encoding='utf-8') as f: + json.dump(self.lookup_data, f, indent=2, ensure_ascii=False) + + print(f"💾 Saved artist lookup table to {self.lookup_file}") + return True + + except Exception as e: + print(f"❌ Error saving artist lookup table: {e}") + return False + + def get_statistics(self) -> Dict[str, Any]: + """Get statistics about the lookup table.""" + if not self.lookup_data.get('artist_variations'): + return {'total_artists': 0, 'total_variations': 0} + + total_variations = sum( + len(artist_info.get('variations', [])) + 1 # +1 for canonical name + for artist_info in self.lookup_data['artist_variations'].values() + ) + + return { + 'total_artists': len(self.lookup_data['artist_variations']), + 'total_variations': total_variations, + 'metadata': self.lookup_data.get('metadata', {}) + } + + def list_artists(self) -> List[str]: + """Get a list of all canonical artist names in the lookup table.""" + return list(self.lookup_data.get('artist_variations', {}).keys()) + + def search_artists(self, query: str, min_score: float = 0.6) -> List[Dict[str, Any]]: + """ + Search for artists in the lookup table. + + Args: + query: Search query + min_score: Minimum fuzzy match score + + Returns: + List of matching artists with scores + """ + results = [] + query_lower = query.lower() + + for canonical_name, artist_info in self.lookup_data.get('artist_variations', {}).items(): + # Check canonical name + score = fuzz.ratio(query_lower, canonical_name.lower()) / 100.0 + if score >= min_score: + results.append({ + 'canonical_name': canonical_name, + 'mbid': artist_info['mbid'], + 'score': score, + 'match_type': 'canonical_name', + 'notes': artist_info.get('notes', '') + }) + + # Check variations + for variation in artist_info.get('variations', []): + score = fuzz.ratio(query_lower, variation.lower()) / 100.0 + if score >= min_score: + results.append({ + 'canonical_name': canonical_name, + 'mbid': artist_info['mbid'], + 'score': score, + 'match_type': 'variation', + 'matched_variation': variation, + 'notes': artist_info.get('notes', '') + }) + + # Sort by score (highest first) + results.sort(key=lambda x: x['score'], reverse=True) + return results + + +# Global instance for easy access +artist_lookup = ArtistLookupTable() + + +def find_artist_fallback(artist_name: str, min_score: float = 0.8) -> Optional[Dict[str, Any]]: + """ + Convenience function to find artist using the lookup table. + + Args: + artist_name: The artist name to search for + min_score: Minimum fuzzy match score + + Returns: + Artist info dict or None if not found + """ + return artist_lookup.find_artist_by_variation(artist_name, min_score) + + +def add_artist_fallback(canonical_name: str, mbid: str, variations: List[str], notes: str = "") -> bool: + """ + Convenience function to add artist variation to lookup table. + + Args: + canonical_name: The canonical artist name + mbid: The MusicBrainz ID + variations: List of name variations + notes: Optional notes about the artist + + Returns: + True if successfully added, False otherwise + """ + return artist_lookup.add_artist_variation(canonical_name, mbid, variations, notes) \ No newline at end of file