KaraokeMerge/cli/playlist_validator.py

350 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Playlist validation module for the Karaoke Song Library Cleanup Tool.
Validates playlist songs against the song library using exact and fuzzy matching.
"""
import json
import os
from typing import Dict, List, Any, Tuple, Optional
from collections import defaultdict
import difflib
# fuzzywuzzy is an optional dependency: record whether it could be imported
# so that fuzzy matching can be skipped when it is not installed.
try:
    from fuzzywuzzy import fuzz
    FUZZY_AVAILABLE = True
except ImportError:
    FUZZY_AVAILABLE = False
from utils import (
normalize_artist_title,
extract_channel_from_path,
get_file_extension,
parse_multi_artist,
clean_artist_name,
validate_song_data
)
from matching import SongMatcher
class PlaylistValidator:
    """Validate playlist songs against the song library.

    On construction the song library is read from ``songs.json`` inside
    ``data_dir`` and indexed into two dictionaries:

    * ``exact_lookup`` -- keyed by
      ``normalize_artist_title(artist, title, False)``; used for exact matches.
    * ``artist_title_lookup`` -- keyed by the lower-cased string
      ``"<artist> - <title>"``; scanned linearly by the fuzzy matcher.

    Playlists are read from (and, when applying changes, written back to)
    ``songList.json`` in the same directory.
    """

    # File names, relative to ``data_dir``.
    SONG_LIBRARY_FILE = 'songs.json'
    PLAYLISTS_FILE = 'songList.json'

    def __init__(self, config: Dict[str, Any], data_dir: str = "../data"):
        """Load the song library and build the lookup tables.

        Args:
            config: Parsed configuration; ``config['matching']['fuzzy_threshold']``
                (default ``0.8``) is the minimum similarity for a fuzzy match.
            data_dir: Directory containing ``songs.json`` and ``songList.json``.

        Raises:
            ValueError: If the song library could not be loaded or is empty.
        """
        self.config = config
        self.data_dir = data_dir
        self.song_matcher = SongMatcher(config, data_dir)
        self.fuzzy_threshold = config.get('matching', {}).get('fuzzy_threshold', 0.8)

        self.all_songs = self._load_all_songs()
        if not self.all_songs:
            # Name the file that is actually read by _load_all_songs().
            raise ValueError(
                f"Could not load song library from {self.SONG_LIBRARY_FILE}"
            )

        self._build_lookup_tables()

    def _load_all_songs(self) -> List[Dict[str, Any]]:
        """Load the song library from ``songs.json``.

        Returns:
            The parsed JSON content, or an empty list if the file could not be
            read or parsed (the error is printed, not raised).
        """
        all_songs_path = os.path.join(self.data_dir, self.SONG_LIBRARY_FILE)
        try:
            with open(all_songs_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # Deliberately best-effort: __init__ turns the empty result into
            # a ValueError for the caller.
            print(f"Error loading song library: {e}")
            return []

    def _build_lookup_tables(self):
        """Index ``self.all_songs`` for exact and fuzzy matching.

        Songs rejected by ``validate_song_data`` are skipped. A song whose
        artist string yields several artists from ``parse_multi_artist`` is
        indexed once per artist.
        """
        self.exact_lookup: Dict[Any, List[Dict[str, Any]]] = {}
        self.artist_title_lookup: Dict[str, List[Dict[str, Any]]] = {}

        for song in self.all_songs:
            if not validate_song_data(song):
                continue

            # Fall back to the raw artist when cleaning yields nothing.
            cleaned_artist = clean_artist_name(song['artist']) or song['artist']
            # Fall back to the single cleaned artist when no split is found.
            artists = parse_multi_artist(cleaned_artist) or [cleaned_artist]
            title = song['title']

            for artist in artists:
                exact_key = normalize_artist_title(artist, title, False)
                self.exact_lookup.setdefault(exact_key, []).append(song)

                fuzzy_key = f"{artist.lower()} - {title.lower()}"
                self.artist_title_lookup.setdefault(fuzzy_key, []).append(song)

    def find_exact_match(self, artist: str, title: str) -> List[Dict[str, Any]]:
        """Return all library songs whose normalized artist/title key matches.

        Returns:
            The list of matching songs; an empty list when there is no match.
        """
        normalized_key = normalize_artist_title(artist, title, False)
        return self.exact_lookup.get(normalized_key, [])

    def find_fuzzy_matches(
        self, artist: str, title: str, threshold: Optional[float] = None
    ) -> List[Tuple[Dict[str, Any], float]]:
        """Find library songs whose "artist - title" string resembles the query.

        Args:
            artist: Artist name from the playlist.
            title: Song title from the playlist.
            threshold: Minimum similarity in the range 0-1; defaults to
                ``self.fuzzy_threshold`` when ``None``.

        Returns:
            ``(song, similarity)`` tuples sorted by similarity, highest first.
            Always empty when fuzzywuzzy is not installed.
        """
        if not FUZZY_AVAILABLE:
            return []
        if threshold is None:
            threshold = self.fuzzy_threshold

        query = f"{artist.lower()} - {title.lower()}"
        matches = []
        for key, songs in self.artist_title_lookup.items():
            # fuzz.ratio scores 0-100; scale to 0-1 to compare with threshold.
            similarity = fuzz.ratio(query, key) / 100.0
            if similarity >= threshold:
                # Several files may share one key; let the matcher pick one.
                best_song, _ = self.song_matcher.select_best_song(songs, artist, title)
                matches.append((best_song, similarity))

        matches.sort(key=lambda match: match[1], reverse=True)
        return matches

    def validate_playlist(self, playlist: Dict[str, Any], dry_run: bool = True) -> Dict[str, Any]:
        """Validate a single playlist against the song library.

        Each playlist song is classified as an exact match, a fuzzy match
        (flagged for manual review) or missing. Songs without an artist or a
        title are reported as missing without any lookup.

        Args:
            playlist: Playlist dict; ``title`` and ``songs`` keys are read.
            dry_run: Accepted for interface compatibility; validation never
                modifies any file, so the value is not used.

        Returns:
            A dict with the playlist title, the total song count, the three
            result lists and a ``summary`` of counters.
        """
        songs = playlist.get('songs', [])
        results = {
            'playlist_title': playlist.get('title', 'Unknown Playlist'),
            'total_songs': len(songs),
            'exact_matches': [],
            'fuzzy_matches': [],
            'missing_songs': [],
            'summary': {
                'exact_match_count': 0,
                'fuzzy_match_count': 0,
                'missing_count': 0,
                'needs_manual_review': 0
            }
        }
        summary = results['summary']

        for song in songs:
            artist = song.get('artist', '')
            title = song.get('title', '')
            position = song.get('position', 0)

            if not artist or not title:
                results['missing_songs'].append({
                    'position': position,
                    'artist': artist,
                    'title': title,
                    'reason': 'Missing artist or title'
                })
                summary['missing_count'] += 1
                continue

            # Exact matching is tried first; fuzzy matching is the fallback.
            exact_matches = self.find_exact_match(artist, title)
            if exact_matches:
                best_song, _ = self.song_matcher.select_best_song(exact_matches, artist, title)
                results['exact_matches'].append({
                    'position': position,
                    'playlist_artist': artist,
                    'playlist_title': title,
                    'found_song': best_song,
                    'match_type': 'exact'
                })
                summary['exact_match_count'] += 1
                continue

            fuzzy_matches = self.find_fuzzy_matches(artist, title)
            if fuzzy_matches:
                # The list is sorted, so the first entry is the closest match.
                best_fuzzy_song, similarity = fuzzy_matches[0]
                results['fuzzy_matches'].append({
                    'position': position,
                    'playlist_artist': artist,
                    'playlist_title': title,
                    'found_song': best_fuzzy_song,
                    'similarity': similarity,
                    'match_type': 'fuzzy',
                    'needs_manual_review': True
                })
                summary['fuzzy_match_count'] += 1
                summary['needs_manual_review'] += 1
                continue

            results['missing_songs'].append({
                'position': position,
                'artist': artist,
                'title': title,
                'reason': 'No matches found'
            })
            summary['missing_count'] += 1

        return results

    def validate_all_playlists(self, dry_run: bool = True) -> Dict[str, Any]:
        """Validate every playlist in ``songList.json``.

        Args:
            dry_run: Forwarded to :meth:`validate_playlist`.

        Returns:
            A dict with ``total_playlists``, the per-playlist results and an
            ``overall_summary`` of summed counters. An empty dict is returned
            when the playlist file cannot be loaded (the error is printed).
        """
        playlists_path = os.path.join(self.data_dir, self.PLAYLISTS_FILE)
        try:
            with open(playlists_path, 'r', encoding='utf-8') as f:
                playlists = json.load(f)
        except Exception as e:
            print(f"Error loading playlists: {e}")
            return {}

        all_results = {
            'total_playlists': len(playlists),
            'playlist_results': [],
            'overall_summary': {
                'total_songs': 0,
                'exact_matches': 0,
                'fuzzy_matches': 0,
                'missing_songs': 0,
                'needs_manual_review': 0
            }
        }
        overall = all_results['overall_summary']

        for playlist in playlists:
            result = self.validate_playlist(playlist, dry_run)
            all_results['playlist_results'].append(result)

            summary = result['summary']
            overall['total_songs'] += result['total_songs']
            overall['exact_matches'] += summary['exact_match_count']
            overall['fuzzy_matches'] += summary['fuzzy_match_count']
            overall['missing_songs'] += summary['missing_count']
            overall['needs_manual_review'] += summary['needs_manual_review']

        return all_results

    def update_playlist_song(self, playlist_index: int, song_position: int,
                             new_artist: str, new_title: str, dry_run: bool = True) -> bool:
        """Replace the artist/title of one playlist song in ``songList.json``.

        Args:
            playlist_index: Zero-based index of the playlist in the file.
                Negative or out-of-range values are rejected.
            song_position: Value of the song's ``position`` field.
            new_artist: Corrected artist name.
            new_title: Corrected song title.
            dry_run: When true, only print the intended change.

        Returns:
            ``True`` if the song was found and, unless ``dry_run``, the file
            was saved; ``False`` on a load/save error, an invalid index or an
            unknown position.
        """
        playlists_path = os.path.join(self.data_dir, self.PLAYLISTS_FILE)
        try:
            with open(playlists_path, 'r', encoding='utf-8') as f:
                playlists = json.load(f)
        except Exception as e:
            print(f"Error loading playlists: {e}")
            return False

        # Reject negative indexes too: they would otherwise silently address
        # a playlist from the end of the list or raise IndexError.
        if not 0 <= playlist_index < len(playlists):
            print(f"Invalid playlist index: {playlist_index}")
            return False

        playlist = playlists[playlist_index]
        playlist_title = playlist.get('title', 'Unknown Playlist')

        # Only the first song carrying the requested position is updated.
        target = next(
            (song for song in playlist.get('songs', [])
             if song.get('position') == song_position),
            None,
        )
        if target is None:
            print(f"Song with position {song_position} not found in playlist")
            return False

        if dry_run:
            # Entries that need fixing may lack artist/title, so use .get().
            print(f"DRY RUN: Would update playlist '{playlist_title}' song {song_position}")
            print(f" From: {target.get('artist', '')} - {target.get('title', '')}")
            print(f" To: {new_artist} - {new_title}")
            return True

        target['artist'] = new_artist
        target['title'] = new_title
        try:
            with open(playlists_path, 'w', encoding='utf-8') as f:
                json.dump(playlists, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Error saving playlists: {e}")
            return False

        print(f"Updated playlist '{playlist_title}' song {song_position}")
        return True
def main():
    """Command-line entry point: validate playlists and print a summary.

    Validates either one playlist (``--playlist-index``) or every playlist in
    ``songList.json`` under ``--data-dir``. With ``--output`` the result dict
    of the run is also written to a JSON file. Errors while loading the
    configuration, the song library or the playlists are printed and end the
    run without a traceback.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Validate playlists against song library')
    parser.add_argument('--config', default='../config/config.json', help='Configuration file path')
    parser.add_argument('--data-dir', default='../data', help='Data directory path')
    parser.add_argument('--dry-run', action='store_true', default=True, help='Dry run mode (default)')
    parser.add_argument('--apply', action='store_true', help='Apply changes (disable dry run)')
    parser.add_argument('--playlist-index', type=int, help='Validate specific playlist by index')
    parser.add_argument('--output', help='Output results to JSON file')
    args = parser.parse_args()

    # Load configuration (UTF-8, like every other JSON file in this module).
    try:
        with open(args.config, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except Exception as e:
        print(f"Error loading config: {e}")
        return

    # The constructor raises ValueError when the song library is unusable.
    try:
        validator = PlaylistValidator(config, args.data_dir)
    except ValueError as e:
        print(f"Error: {e}")
        return

    # --dry-run is always true by default; only --apply turns it off.
    dry_run = not args.apply

    if args.playlist_index is not None:
        # Validate specific playlist
        playlists_path = os.path.join(args.data_dir, 'songList.json')
        try:
            with open(playlists_path, 'r', encoding='utf-8') as f:
                playlists = json.load(f)
        except Exception as e:
            print(f"Error loading playlists: {e}")
            return

        # Negative values would index from the end or raise IndexError.
        if not 0 <= args.playlist_index < len(playlists):
            print(f"Invalid playlist index: {args.playlist_index}")
            return

        results = validator.validate_playlist(playlists[args.playlist_index], dry_run)
        summary = results['summary']
        print(f"\nPlaylist: {results['playlist_title']}")
        print(f"Total songs: {results['total_songs']}")
        print(f"Exact matches: {summary['exact_match_count']}")
        print(f"Fuzzy matches: {summary['fuzzy_match_count']}")
        print(f"Missing: {summary['missing_count']}")
        print(f"Need manual review: {summary['needs_manual_review']}")
    else:
        # Validate all playlists
        results = validator.validate_all_playlists(dry_run)
        if not results:
            # An empty dict means the playlist file could not be loaded; the
            # validator has already printed the reason.
            return

        overall = results['overall_summary']
        print("\nPlaylist Validation Results:")
        print(f"Total playlists: {results['total_playlists']}")
        print(f"Total songs: {overall['total_songs']}")
        print(f"Exact matches: {overall['exact_matches']}")
        print(f"Fuzzy matches: {overall['fuzzy_matches']}")
        print(f"Missing: {overall['missing_songs']}")
        print(f"Need manual review: {overall['needs_manual_review']}")

    # Written for both modes so --output is never silently ignored.
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"\nResults saved to: {args.output}")
        except Exception as e:
            print(f"Error saving results: {e}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()