87 lines
3.6 KiB
Python
87 lines
3.6 KiB
Python
"""
Data loader utility for MusicBrainz Data Cleaner.

Loads known artist and recording MBIDs, artist name variations, and
contraction fixes from JSON files.
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
class DataLoader:
    """Loads and manages known artist and recording data from JSON files.

    All files are read from ``data_dir``. Every loader is best-effort: a
    missing, unreadable, or malformed file produces a warning on stdout and
    an empty result instead of an exception. Known artists and known
    recordings are cached after the first load; name variations and
    contraction fixes are re-read from disk on every call.
    """

    def __init__(self, data_dir: str = "data"):
        """Remember the data directory; no file is read until a loader runs.

        Args:
            data_dir: Directory that contains the JSON data files.
        """
        self.data_dir = Path(data_dir)
        # None means "not loaded yet"; an empty dict is a valid cached result.
        self._known_artists: Optional[Dict[str, str]] = None
        self._known_recordings: Optional[Dict[Tuple[str, str], str]] = None

    def _read_json_object(self, filename: str, label: str) -> Dict:
        """Return the top-level JSON object stored in ``data_dir / filename``.

        Args:
            filename: Name of the file inside ``data_dir``.
            label: Human-readable name used in the warning message.

        Returns:
            The parsed object, or an empty dict (after printing a warning)
            when the file cannot be read, cannot be parsed, or does not hold
            a JSON object at the top level.
        """
        path = self.data_dir / filename
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError covers missing/unreadable files; ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: Could not load {label} data: {e}")
            return {}
        if not isinstance(data, dict):
            # Callers use .get() on the result, so anything else is unusable.
            print(f"Warning: Could not load {label} data: expected a JSON object in {path}")
            return {}
        return data

    def _read_mapping_section(self, filename: str, key: str, label: str) -> Dict:
        """Return the dict stored under *key* in the given JSON file.

        A missing or ``null`` section yields an empty dict silently; a
        section of any other non-dict type yields an empty dict with a
        warning.
        """
        section = self._read_json_object(filename, label).get(key)
        if section is None:
            return {}
        if not isinstance(section, dict):
            print(f"Warning: Could not load {label} data: '{key}' is not a JSON object")
            return {}
        return section

    def load_known_artists(self) -> Dict[str, str]:
        """Load known artist MBIDs from ``known_artists.json`` (cached).

        Returns:
            The ``artists`` mapping from the file, or an empty dict when the
            file or the section is unavailable.
        """
        if self._known_artists is None:
            self._known_artists = self._read_mapping_section(
                "known_artists.json", 'artists', "known artists"
            )
        return self._known_artists

    def load_name_variations(self) -> Dict[str, str]:
        """Load name variations from ``known_artists.json`` (not cached).

        Returns:
            The ``name_variations`` mapping from the file, or an empty dict
            when the file or the section is unavailable.
        """
        return self._read_mapping_section(
            "known_artists.json", 'name_variations', "name variations"
        )

    def load_known_recordings(self) -> Dict[Tuple[str, str], str]:
        """Load known recording MBIDs from ``known_recordings.json`` (cached).

        The file stores a ``recordings`` list of objects; each object with
        non-empty string ``artist_mbid``, ``title`` and ``recording_mbid``
        becomes one ``(artist_mbid, title) -> recording_mbid`` entry.
        Incomplete or malformed entries are skipped.

        Returns:
            The lookup table, or an empty dict when the file is unavailable.
        """
        if self._known_recordings is None:
            data = self._read_json_object("known_recordings.json", "known recordings")
            entries = data.get('recordings')
            if not isinstance(entries, list):
                entries = []

            # Build into a local first so a failure can never leave a
            # half-filled cache behind.
            known: Dict[Tuple[str, str], str] = {}
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                fields = (
                    entry.get('artist_mbid'),
                    entry.get('title'),
                    entry.get('recording_mbid'),
                )
                # Require non-empty strings: keeps the key hashable and
                # matches the declared Dict[Tuple[str, str], str] type.
                if all(isinstance(value, str) and value for value in fields):
                    artist_mbid, title, recording_mbid = fields
                    known[(artist_mbid, title)] = recording_mbid
            self._known_recordings = known
        return self._known_recordings

    def load_contraction_fixes(self) -> Dict[str, str]:
        """Load contraction fixes from ``contraction_fixes.json`` (not cached).

        Returns:
            The file's top-level JSON object, or an empty dict when the file
            is unavailable or does not contain an object.
        """
        return self._read_json_object("contraction_fixes.json", "contraction fixes")

    def reload_data(self) -> None:
        """Reload data from files (useful for testing or updates).

        Drops the cached artists and recordings and reads both files again.
        """
        self._known_artists = None
        self._known_recordings = None
        self.load_known_artists()
        self.load_known_recordings()
|
|
|
|
|
|
# Global instance for easy access: callers can import `data_loader` instead of
# constructing their own DataLoader. It uses the default "data" directory, and
# constructing it reads no files -- data is loaded on the first loader call.
data_loader = DataLoader()