# KaraokeMerge/cli/playlist_validator.py

#!/usr/bin/env python3
"""
Playlist validation module for the Karaoke Song Library Cleanup Tool.
Validates playlist songs against the song library using exact and fuzzy matching.
"""

import json
import os
from typing import Dict, List, Any, Tuple, Optional
from collections import defaultdict
import difflib

try:
    from fuzzywuzzy import fuzz
    FUZZY_AVAILABLE = True
except ImportError:
    FUZZY_AVAILABLE = False

from utils import (
    normalize_artist_title,
    extract_channel_from_path,
    get_file_extension,
    parse_multi_artist,
    clean_artist_name,
    validate_song_data
)

from matching import SongMatcher


class PlaylistValidator:
    """Validate playlist songs against the song library.

    On construction the library is read from ``songs.json`` inside
    ``data_dir`` and indexed two ways: by the key produced by
    ``normalize_artist_title`` (used for exact matching) and by a lower-cased
    ``"artist - title"`` string (used for fuzzy matching).  Playlists are read
    from ``songList.json`` in the same directory.

    Fuzzy matching relies on the optional ``fuzzywuzzy`` package; when it is
    not installed, ``find_fuzzy_matches`` always returns an empty list.
    """

    # File names, relative to ``data_dir``.
    SONG_LIBRARY_FILE = 'songs.json'
    PLAYLISTS_FILE = 'songList.json'

    def __init__(self, config: Dict[str, Any], data_dir: str = "../data"):
        """Load the song library and build the lookup tables.

        Args:
            config: Parsed configuration.  ``config['matching']['fuzzy_threshold']``
                (default 0.8) is the minimum similarity, on a 0-1 scale,
                for a fuzzy match to be reported.
            data_dir: Directory containing the song library and playlists.

        Raises:
            ValueError: If the song library cannot be read or is empty.
        """
        self.config = config
        self.data_dir = data_dir
        self.song_matcher = SongMatcher(config, data_dir)
        self.fuzzy_threshold = config.get('matching', {}).get('fuzzy_threshold', 0.8)

        # Load song library
        self.all_songs = self._load_all_songs()
        if not self.all_songs:
            # Name the file that is actually read, so the message is actionable.
            raise ValueError(
                f"Could not load song library from {self.SONG_LIBRARY_FILE}"
            )

        # Create lookup dictionaries for faster matching
        self._build_lookup_tables()

    def _load_all_songs(self) -> List[Dict[str, Any]]:
        """Load the song library from songs.json.

        Returns:
            The parsed JSON content, or an empty list if the file cannot be
            read or decoded (the error is printed, not raised).
        """
        all_songs_path = os.path.join(self.data_dir, self.SONG_LIBRARY_FILE)
        try:
            with open(all_songs_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and bad UTF-8.
            print(f"Error loading song library: {e}")
            return []

    def _load_playlists(self) -> Optional[List[Dict[str, Any]]]:
        """Load the playlists from songList.json.

        Returns:
            The parsed JSON content, or ``None`` if the file cannot be read
            or decoded (the error is printed, not raised).
        """
        playlists_path = os.path.join(self.data_dir, self.PLAYLISTS_FILE)
        try:
            with open(playlists_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error loading playlists: {e}")
            return None

    def _build_lookup_tables(self):
        """Build lookup tables for faster exact matching.

        Populates ``self.exact_lookup`` (normalized key -> songs) and
        ``self.artist_title_lookup`` (lower-cased "artist - title" -> songs).
        Songs rejected by ``validate_song_data`` are skipped.  A song credited
        to several artists is indexed once per artist.
        """
        self.exact_lookup = {}
        self.artist_title_lookup = {}

        for song in self.all_songs:
            if not validate_song_data(song):
                continue

            # Fall back to the raw artist when cleaning yields nothing.
            cleaned_artist = clean_artist_name(song['artist']) or song['artist']

            # Handle multi-artist songs (after cleaning)
            artists = parse_multi_artist(cleaned_artist) or [cleaned_artist]

            title = song['title']
            title_lower = title.lower()

            for artist in artists:
                # Exact-match key
                normalized_key = normalize_artist_title(artist, title, False)
                self.exact_lookup.setdefault(normalized_key, []).append(song)

                # Artist-title key used as the fuzzy-matching candidate string
                artist_title_key = f"{artist.lower()} - {title_lower}"
                self.artist_title_lookup.setdefault(artist_title_key, []).append(song)

    def find_exact_match(self, artist: str, title: str) -> List[Dict[str, Any]]:
        """Find exact matches for artist/title combination.

        The query is normalized the same way as the keys of the exact lookup
        table.

        Returns:
            The library songs stored under the normalized key; an empty list
            (never ``None``) when there is no match.
        """
        normalized_key = normalize_artist_title(artist, title, False)
        return self.exact_lookup.get(normalized_key, [])

    def find_fuzzy_matches(self, artist: str, title: str,
                           threshold: Optional[float] = None) -> List[Tuple[Dict[str, Any], float]]:
        """Find fuzzy matches for artist/title combination.

        Args:
            artist: Artist name from the playlist.
            title: Song title from the playlist.
            threshold: Minimum similarity on a 0-1 scale; defaults to
                ``self.fuzzy_threshold``.

        Returns:
            ``(song, similarity)`` tuples sorted by similarity, highest
            first.  Each candidate key contributes the one song chosen by
            ``SongMatcher.select_best_song``.  Empty when ``fuzzywuzzy`` is
            not installed.
        """
        if not FUZZY_AVAILABLE:
            return []

        if threshold is None:
            threshold = self.fuzzy_threshold

        query = f"{artist.lower()} - {title.lower()}"
        matches = []

        for key, songs in self.artist_title_lookup.items():
            # fuzz.ratio is on a 0-100 scale; thresholds are on 0-1.
            similarity = fuzz.ratio(query, key) / 100.0
            if similarity >= threshold:
                # Get the best song from this group using existing priority logic
                best_song, _ = self.song_matcher.select_best_song(songs, artist, title)
                matches.append((best_song, similarity))

        # Sort by similarity score (highest first)
        matches.sort(key=lambda match: match[1], reverse=True)
        return matches

    def validate_playlist(self, playlist: Dict[str, Any], dry_run: bool = True) -> Dict[str, Any]:
        """Validate a single playlist against the song library.

        Every playlist entry ends up in exactly one of three buckets: an
        exact match, a fuzzy match (always flagged for manual review), or
        missing.  Entries without an artist or a title are reported as
        missing without being looked up.

        Args:
            playlist: Playlist dict; ``title`` and ``songs`` are read, and a
                missing or null ``songs`` is treated as an empty playlist.
            dry_run: Accepted for interface compatibility; this method only
                reads data and does not use the flag.

        Returns:
            A dict with ``playlist_title``, ``total_songs``, the three result
            lists (``exact_matches``, ``fuzzy_matches``, ``missing_songs``)
            and a ``summary`` of counts.
        """
        # ``or []`` also covers an explicit ``"songs": null`` in the JSON.
        songs = playlist.get('songs') or []
        summary = {
            'exact_match_count': 0,
            'fuzzy_match_count': 0,
            'missing_count': 0,
            'needs_manual_review': 0
        }
        results = {
            'playlist_title': playlist.get('title', 'Unknown Playlist'),
            'total_songs': len(songs),
            'exact_matches': [],
            'fuzzy_matches': [],
            'missing_songs': [],
            'summary': summary
        }

        for song in songs:
            artist = song.get('artist', '')
            title = song.get('title', '')
            position = song.get('position', 0)

            if not artist or not title:
                results['missing_songs'].append({
                    'position': position,
                    'artist': artist,
                    'title': title,
                    'reason': 'Missing artist or title'
                })
                summary['missing_count'] += 1
                continue

            # Try exact match first
            exact_matches = self.find_exact_match(artist, title)
            if exact_matches:
                # Get the best song using existing priority logic
                best_song, _ = self.song_matcher.select_best_song(exact_matches, artist, title)
                results['exact_matches'].append({
                    'position': position,
                    'playlist_artist': artist,
                    'playlist_title': title,
                    'found_song': best_song,
                    'match_type': 'exact'
                })
                summary['exact_match_count'] += 1
                continue

            # Fall back to fuzzy matching; only the top candidate is reported.
            fuzzy_matches = self.find_fuzzy_matches(artist, title)
            if fuzzy_matches:
                best_fuzzy_song, similarity = fuzzy_matches[0]
                results['fuzzy_matches'].append({
                    'position': position,
                    'playlist_artist': artist,
                    'playlist_title': title,
                    'found_song': best_fuzzy_song,
                    'similarity': similarity,
                    'match_type': 'fuzzy',
                    'needs_manual_review': True
                })
                summary['fuzzy_match_count'] += 1
                summary['needs_manual_review'] += 1
                continue

            results['missing_songs'].append({
                'position': position,
                'artist': artist,
                'title': title,
                'reason': 'No matches found'
            })
            summary['missing_count'] += 1

        return results

    def validate_all_playlists(self, dry_run: bool = True) -> Dict[str, Any]:
        """Validate all playlists in songList.json.

        Args:
            dry_run: Passed through to ``validate_playlist``.

        Returns:
            A dict with ``total_playlists``, the per-playlist results under
            ``playlist_results`` and an ``overall_summary`` of summed counts.
            An empty dict if the playlists file cannot be loaded.
        """
        playlists = self._load_playlists()
        if playlists is None:
            return {}

        overall = {
            'total_songs': 0,
            'exact_matches': 0,
            'fuzzy_matches': 0,
            'missing_songs': 0,
            'needs_manual_review': 0
        }
        all_results = {
            'total_playlists': len(playlists),
            'playlist_results': [],
            'overall_summary': overall
        }

        for playlist in playlists:
            result = self.validate_playlist(playlist, dry_run)
            all_results['playlist_results'].append(result)

            # Update overall summary
            summary = result['summary']
            overall['total_songs'] += result['total_songs']
            overall['exact_matches'] += summary['exact_match_count']
            overall['fuzzy_matches'] += summary['fuzzy_match_count']
            overall['missing_songs'] += summary['missing_count']
            overall['needs_manual_review'] += summary['needs_manual_review']

        return all_results

    def update_playlist_song(self, playlist_index: int, song_position: int,
                           new_artist: str, new_title: str, dry_run: bool = True) -> bool:
        """Update a playlist song with corrected artist/title.

        Args:
            playlist_index: Zero-based index into the playlists list.
                Negative values are rejected.
            song_position: Value of the song's ``position`` field.
            new_artist: Replacement artist.
            new_title: Replacement title.
            dry_run: When true, only print the change that would be made.

        Returns:
            ``True`` if the song was found and, unless ``dry_run``, the
            playlists file was rewritten; ``False`` if the file could not be
            loaded or saved, the index is out of range, or no song has the
            given position.
        """
        playlists = self._load_playlists()
        if playlists is None:
            return False

        # A negative index would otherwise silently address a playlist
        # counted from the end of the list.
        if not 0 <= playlist_index < len(playlists):
            print(f"Invalid playlist index: {playlist_index}")
            return False

        playlist = playlists[playlist_index]
        playlist_title = playlist.get('title', 'Unknown Playlist')

        # Find the first song with the requested position
        target = next(
            (entry for entry in playlist.get('songs') or []
             if entry.get('position') == song_position),
            None,
        )
        if target is None:
            print(f"Song with position {song_position} not found in playlist")
            return False

        if dry_run:
            print(f"DRY RUN: Would update playlist '{playlist_title}' song {song_position}")
            print(f"  From: {target.get('artist', '')} - {target.get('title', '')}")
            print(f"  To:   {new_artist} - {new_title}")
            return True

        target['artist'] = new_artist
        target['title'] = new_title

        # Save the updated playlists
        playlists_path = os.path.join(self.data_dir, self.PLAYLISTS_FILE)
        try:
            with open(playlists_path, 'w', encoding='utf-8') as f:
                json.dump(playlists, f, indent=2, ensure_ascii=False)
        except (OSError, ValueError) as e:
            print(f"Error saving playlists: {e}")
            return False

        print(f"Updated playlist '{playlist_title}' song {song_position}")
        return True


def _save_results(output_path: str, results: Dict[str, Any]) -> None:
    """Write *results* as JSON to *output_path*, printing (not raising) failures."""
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
    except (OSError, TypeError, ValueError) as e:
        print(f"Error saving results: {e}")
        return
    print(f"\nResults saved to: {output_path}")


def main():
    """Main function for CLI usage.

    Parses the command line, builds a ``PlaylistValidator`` and prints a
    validation summary for either one playlist (``--playlist-index``) or all
    of them.  With ``--output`` the result dict is also written as JSON.
    Every failure is reported on stdout and ends the run early; nothing is
    raised to the caller.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Validate playlists against song library')
    parser.add_argument('--config', default='../config/config.json', help='Configuration file path')
    parser.add_argument('--data-dir', default='../data', help='Data directory path')
    # NOTE: --dry-run is kept for command-line compatibility only; it is
    # always true, and the effective mode is decided by --apply below.
    parser.add_argument('--dry-run', action='store_true', default=True, help='Dry run mode (default)')
    parser.add_argument('--apply', action='store_true', help='Apply changes (disable dry run)')
    parser.add_argument('--playlist-index', type=int, help='Validate specific playlist by index')
    parser.add_argument('--output', help='Output results to JSON file')

    args = parser.parse_args()

    # Load configuration
    try:
        with open(args.config, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        print(f"Error loading config: {e}")
        return

    # Create validator; the constructor raises ValueError when the song
    # library cannot be loaded.
    try:
        validator = PlaylistValidator(config, args.data_dir)
    except ValueError as e:
        print(f"Error creating validator: {e}")
        return

    # Determine dry run mode
    dry_run = not args.apply

    if args.playlist_index is not None:
        # Validate specific playlist
        playlists_path = os.path.join(args.data_dir, 'songList.json')
        try:
            with open(playlists_path, 'r', encoding='utf-8') as f:
                playlists = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error loading playlists: {e}")
            return

        # Negative values are rejected too: they would otherwise select a
        # playlist counted from the end of the list.
        if not 0 <= args.playlist_index < len(playlists):
            print(f"Invalid playlist index: {args.playlist_index}")
            return

        result = validator.validate_playlist(playlists[args.playlist_index], dry_run)
        summary = result['summary']
        print(f"\nPlaylist: {result['playlist_title']}")
        print(f"Total songs: {result['total_songs']}")
        print(f"Exact matches: {summary['exact_match_count']}")
        print(f"Fuzzy matches: {summary['fuzzy_match_count']}")
        print(f"Missing: {summary['missing_count']}")
        print(f"Need manual review: {summary['needs_manual_review']}")

        if args.output:
            _save_results(args.output, result)

    else:
        # Validate all playlists
        results = validator.validate_all_playlists(dry_run)
        if not results:
            # An empty dict means the playlists file could not be loaded;
            # the validator has already printed the reason.
            return

        overall = results['overall_summary']
        print("\nPlaylist Validation Results:")
        print(f"Total playlists: {results['total_playlists']}")
        print(f"Total songs: {overall['total_songs']}")
        print(f"Exact matches: {overall['exact_matches']}")
        print(f"Fuzzy matches: {overall['fuzzy_matches']}")
        print(f"Missing: {overall['missing_songs']}")
        print(f"Need manual review: {overall['needs_manual_review']}")

        if args.output:
            _save_results(args.output, results)


if __name__ == '__main__':
    main()