musicbrainz-cleaner/src/utils/data_loader.py

87 lines
3.6 KiB
Python

"""
Data loader utility for MusicBrainz Data Cleaner.
Loads known artist and recording MBIDs from JSON files.
"""
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class DataLoader:
    """Loads and manages known artist and recording data from JSON files.

    All loaders are best-effort: if a file is missing, unreadable, not valid
    JSON, or does not have the expected structure, a warning is printed and
    an empty mapping is returned instead of raising.

    Known artists and known recordings are cached on the instance after the
    first successful or failed load; call :meth:`reload_data` to refresh
    them. Name variations and contraction fixes are read from disk on every
    call.
    """

    def __init__(self, data_dir: str = "data"):
        """Create a loader that reads its JSON files from *data_dir*.

        No file is opened here; files are read when a ``load_*`` method is
        called.
        """
        self.data_dir = Path(data_dir)
        # ``None`` means "not loaded yet"; an empty dict means "loaded, but
        # nothing usable was found".
        self._known_artists: Optional[Dict[str, str]] = None
        self._known_recordings: Optional[Dict[Tuple[str, str], str]] = None

    def _read_json_object(self, filename: str, label: str) -> Dict[str, object]:
        """Return the top-level JSON object stored in ``data_dir/filename``.

        Prints a warning mentioning *label* and returns ``{}`` when the file
        cannot be read or parsed, or when its top-level value is not an
        object.
        """
        path = self.data_dir / filename
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError covers missing files, permission problems and
            # directories; ValueError covers json.JSONDecodeError and
            # UnicodeDecodeError.
            print(f"Warning: Could not load {label} data: {e}")
            return {}
        if not isinstance(data, dict):
            print(
                f"Warning: Could not load {label} data: expected a JSON "
                f"object in {path}, got {type(data).__name__}"
            )
            return {}
        return data

    def _get_section(self, data: Dict[str, object], key: str,
                     expected_type: type, label: str):
        """Return ``data[key]`` when it is an *expected_type* instance.

        A missing key yields an empty ``expected_type()`` silently; a value
        of the wrong type yields an empty ``expected_type()`` with a warning.
        """
        if key not in data:
            return expected_type()
        value = data[key]
        if not isinstance(value, expected_type):
            print(
                f"Warning: Ignoring '{key}' in {label} data: expected "
                f"{expected_type.__name__}, got {type(value).__name__}"
            )
            return expected_type()
        return value

    def load_known_artists(self) -> Dict[str, str]:
        """Load known artist MBIDs from ``known_artists.json``.

        Returns the ``artists`` mapping of the file, or ``{}`` when it is
        unavailable. The result is cached until :meth:`reload_data`.
        """
        if self._known_artists is None:
            data = self._read_json_object("known_artists.json", "known artists")
            self._known_artists = self._get_section(
                data, 'artists', dict, "known artists"
            )
        return self._known_artists

    def load_name_variations(self) -> Dict[str, str]:
        """Load name variations from ``known_artists.json``.

        Returns the ``name_variations`` mapping of the file, or ``{}`` when
        it is unavailable. Not cached: the file is read on every call.
        """
        data = self._read_json_object("known_artists.json", "name variations")
        return self._get_section(data, 'name_variations', dict, "name variations")

    def load_known_recordings(self) -> Dict[Tuple[str, str], str]:
        """Load known recording MBIDs from ``known_recordings.json``.

        The file's ``recordings`` list is converted into a mapping keyed by
        ``(artist_mbid, title)`` whose values are recording MBIDs. Entries
        that are not objects, or that lack a truthy ``artist_mbid``,
        ``title`` or ``recording_mbid``, are skipped. The result is cached
        until :meth:`reload_data`.
        """
        if self._known_recordings is None:
            data = self._read_json_object(
                "known_recordings.json", "known recordings"
            )
            entries = self._get_section(
                data, 'recordings', list, "known recordings"
            )
            # Build into a local dict and publish it only when complete, so
            # a failure part-way through never leaves a half-filled cache.
            known: Dict[Tuple[str, str], str] = {}
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                artist_mbid = entry.get('artist_mbid')
                title = entry.get('title')
                recording_mbid = entry.get('recording_mbid')
                if artist_mbid and title and recording_mbid:
                    known[(artist_mbid, title)] = recording_mbid
            self._known_recordings = known
        return self._known_recordings

    def load_contraction_fixes(self) -> Dict[str, str]:
        """Load contraction fixes from ``contraction_fixes.json``.

        Returns the file's top-level JSON object, or ``{}`` when it is
        unavailable. Not cached: the file is read on every call.
        """
        return self._read_json_object(
            "contraction_fixes.json", "contraction fixes"
        )

    def reload_data(self) -> None:
        """Reload data from files (useful for testing or updates).

        Drops the cached artists and recordings and loads both again
        immediately.
        """
        self._known_artists = None
        self._known_recordings = None
        self.load_known_artists()
        self.load_known_recordings()
# Global instance for easy access.
# Built with the default data directory ("data"), a relative path that is
# resolved against the current working directory when a file is opened.
# Constructing the instance reads nothing; JSON files are only read when one
# of the load_* methods is called.
data_loader = DataLoader()