# KaraokeMerge/cli/utils.py

"""
Utility functions for the Karaoke Song Library Cleanup Tool.
"""
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional


def load_json_file(file_path: str) -> Any:
    """Load and parse a JSON file.

    The file is decoded as UTF-8; a leading byte-order mark (as written by
    some Windows editors) is accepted and skipped.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file content is not valid JSON. The original
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM; with
        # plain 'utf-8' a BOM reaches json.load and is rejected as invalid.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in {file_path}: {e}") from e


def save_json_file(data: Any, file_path: str, indent: int = 2) -> None:
    """Save data to a JSON file, creating missing parent directories.

    Args:
        data: Any JSON-serializable object.
        file_path: Destination path. May be a bare filename, in which case
            the file is written to the current working directory.
        indent: Indentation width passed to ``json.dump``.

    The file is written as UTF-8 with non-ASCII characters kept verbatim
    (``ensure_ascii=False``).
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)


def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of ``file_path``, including the dot.

    An empty string is returned when the path has no extension.
    """
    _root, suffix = os.path.splitext(file_path)
    return suffix.lower()


def get_base_filename(file_path: str) -> str:
    """Return ``file_path`` with its final extension removed.

    The directory part is kept, so a CDG file and an MP3 file in the same
    folder with the same name produce the same value and can be paired.
    """
    stem, _suffix = os.path.splitext(file_path)
    return stem


def find_mp3_pairs(songs: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Sort songs into CDG/MP3 pairs, standalone MP4s and standalone MP3s.

    Each song is classified by the extension of its 'path' entry. A CDG and
    an MP3 whose paths are equal apart from the extension form a pair, stored
    as [cdg_song, mp3_song]. An unmatched CDG or an unmatched MP3 is reported
    under 'standalone_mp3' (CDGs first, then MP3s). Songs with any other
    extension are left out of the result. When several songs share the same
    base path and extension, only the last one seen is kept.

    Returns a dict with keys: 'pairs', 'standalone_mp4', 'standalone_mp3'
    """
    cdg_by_base = {}
    mp3_by_base = {}
    mp4_songs = []

    # Bucket every song by extension; CDG/MP3 are indexed by base path
    for entry in songs:
        suffix = get_file_extension(entry['path'])
        stem = get_base_filename(entry['path'])

        if suffix == '.mp4':
            mp4_songs.append(entry)
        elif suffix == '.cdg':
            cdg_by_base[stem] = entry
        elif suffix == '.mp3':
            mp3_by_base[stem] = entry

    # A CDG with a matching MP3 is a pair; everything else stands alone
    paired = [
        [cdg, mp3_by_base[stem]]
        for stem, cdg in cdg_by_base.items()
        if stem in mp3_by_base
    ]
    unmatched_cdg = [
        cdg for stem, cdg in cdg_by_base.items() if stem not in mp3_by_base
    ]
    unmatched_mp3 = [
        mp3 for stem, mp3 in mp3_by_base.items() if stem not in cdg_by_base
    ]

    return {
        'pairs': paired,
        'standalone_mp4': mp4_songs,
        'standalone_mp3': unmatched_cdg + unmatched_mp3
    }


def normalize_artist_title(artist: str, title: str, case_sensitive: bool = False) -> str:
    """Build a normalized "artist|title" key for matching songs.

    Both values are lower-cased (unless ``case_sensitive`` is set), have every
    character that is neither a word character nor whitespace replaced by a
    space, are trimmed, and have whitespace runs collapsed to one space.
    """
    normalized_fields = []
    for text in (artist, title):
        if not case_sensitive:
            text = text.lower()
        # Punctuation becomes a space so "AC/DC" and "AC DC" compare equal
        text = re.sub(r'[^\w\s]', ' ', text).strip()
        normalized_fields.append(re.sub(r'\s+', ' ', text))

    return "|".join(normalized_fields)


def extract_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract the channel name of an MP4 file from its path.

    Resolution order:

    1. The first entry of ``channel_priorities`` that occurs anywhere in the
       path (case-insensitive substring match) is returned exactly as
       configured.
    2. Otherwise the folder directly below the first folder named ``MP4``
       (case-insensitive), or ``'MP4 Root'`` when the file sits directly in
       that folder.
    3. Otherwise the folder that contains the file.

    Both backslash and forward slash are treated as path separators, and
    empty path components (leading or doubled separators) are ignored.

    Args:
        file_path: Path of the song file.
        channel_priorities: Configured channel folder names, highest
            priority first.

    Returns:
        The channel name, or ``None`` when the file is not an ``.mp4``, when
        no channel priorities are configured, or when the path has no folder
        component to fall back on.
    """
    if not file_path.lower().endswith('.mp4'):
        return None

    if not channel_priorities:
        return None

    # Configured channel names win; a plain substring test is all that the
    # escaped-regex search amounted to.
    path_lower = file_path.lower()
    for channel in channel_priorities:
        if channel.lower() in path_lower:
            return channel

    parts = [part for part in re.split(r'[\\/]', file_path) if part]
    # The last component is the file itself (the path ends with '.mp4');
    # everything before it is a folder, whether or not its name has a dot.
    folders = parts[:-1]

    # MP4/ChannelName/song.mp4 -> ChannelName; MP4/song.mp4 -> 'MP4 Root'
    for index, folder in enumerate(folders):
        if folder.lower() == 'mp4':
            if index + 1 < len(folders):
                return folders[index + 1]
            return 'MP4 Root'

    # No MP4 folder in the path: fall back to the containing folder
    return folders[-1] if folders else None


def extract_consolidated_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract channel information with consolidated MP3/CDG handling.

    Every ``.mp3`` or ``.cdg`` file (extension compared case-insensitively)
    is reported under the single channel ``'MP3'``, regardless of the folder
    it lives in. All other files are resolved by
    ``extract_channel_from_path``, which handles MP4 files and returns
    ``None`` for anything else.

    Args:
        file_path: Path of the song file.
        channel_priorities: Configured channel folder names, highest
            priority first; only used for MP4 files.

    Returns:
        ``'MP3'`` for MP3/CDG files, otherwise whatever
        ``extract_channel_from_path`` returns for the same arguments.
    """
    # All MP3/CDG content is consolidated, so the folder layout is irrelevant
    if file_path.lower().endswith(('.mp3', '.cdg')):
        return 'MP3'

    # MP4 rules live in one place instead of being duplicated here
    return extract_channel_from_path(file_path, channel_priorities)


# Matches a feature/collaboration keyword and everything after it. The
# keyword must be a standalone word: preceded by whitespace or a
# comma/semicolon/slash/opening bracket, and followed by whitespace, a dot or
# the end of the string. Without these boundaries "ft" would match inside
# "Daft" and "with" inside "Withers".
_FEATURE_SUFFIX_RE = re.compile(
    r'(?:\s*[,;/(\[]\s*|\s+)'
    r'(?:featuring|feat|ft|with|presents)'
    r'(?:[\s.].*)?$',
    flags=re.IGNORECASE,
)


def clean_artist_name(artist_string: str) -> str:
    """Clean an artist name by removing features, collaborations, etc.

    Everything from the first standalone ``feat``/``feat.``/``ft``/``ft.``/
    ``featuring``/``with``/``presents`` keyword onwards is dropped, including
    a separator or opening bracket directly in front of it. Afterwards one
    trailing comma/semicolon/slash is removed and whitespace is normalized.

    Args:
        artist_string: Raw artist text, e.g. ``"Drake (feat. Rihanna)"``.

    Returns:
        The primary artist name, or ``""`` for empty input.
    """
    if not artist_string:
        return ""

    cleaned_artist = _FEATURE_SUFFIX_RE.sub('', artist_string)

    # Clean up any trailing separator that might be left
    cleaned_artist = re.sub(r'\s*[,;/]\s*$', '', cleaned_artist)

    # Clean up extra whitespace
    return re.sub(r'\s+', ' ', cleaned_artist).strip()


# Delimiters between artists. Symbol delimiters (& , ; /) may be written
# with or without surrounding spaces. Word delimiters must be standalone
# words surrounded by whitespace so that "and" inside "Brandy" or "ft"
# inside "Daft" does not split a name; "feat"/"ft" may end with a dot.
_ARTIST_DELIMITER_RE = re.compile(
    r'\s*[&,;/]\s*'
    r'|\s+(?:and|featuring)\s+'
    r'|\s+(?:feat|ft)(?:\.\s*|\s+)',
    flags=re.IGNORECASE,
)


def parse_multi_artist(artist_string: str) -> List[str]:
    """Parse multi-artist strings with various delimiters.

    Recognized delimiters are ``&``, ``,``, ``;``, ``/`` and the standalone
    words ``and``, ``feat``/``feat.``, ``ft``/``ft.`` and ``featuring``
    (matched case-insensitively).

    Args:
        artist_string: Raw artist text, e.g. ``"A feat. B & C"``.

    Returns:
        The individual artist names in order of appearance, stripped of
        surrounding whitespace, with empty pieces removed. An empty input
        yields an empty list.
    """
    if not artist_string:
        return []

    pieces = (piece.strip() for piece in _ARTIST_DELIMITER_RE.split(artist_string))
    return [piece for piece in pieces if piece]


def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    Zero is rendered as "0B". Any other value is scaled down by factors of
    1024 through B, KB, MB and GB (GB is the largest unit used) and printed
    with one decimal place, e.g. "1.5KB".
    """
    if size_bytes == 0:
        return "0B"

    value = size_bytes
    unit = "B"
    # Step up one unit at a time while the value still fills the next one
    for larger_unit in ("KB", "MB", "GB"):
        if value >= 1024:
            value = value / 1024.0
            unit = larger_unit
        else:
            break

    return f"{value:.1f}{unit}"


def validate_song_data(song: Dict[str, Any]) -> bool:
    """Check that a song has a non-empty 'artist', 'title' and 'path'.

    Returns True only when all three keys are present and their values are
    truthy; a missing key or an empty/None value makes the song invalid.
    """
    for field in ('artist', 'title', 'path'):
        if field not in song or not song[field]:
            return False
    return True