"""
Utility functions for the Karaoke Song Library Cleanup Tool.
"""

import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional

# Anything that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Accept both Windows and POSIX separators when splitting library paths.
_PATH_SEPARATOR_RE = re.compile(r'[\\/]')

# A feature/collaboration keyword and everything after it. The \b anchors
# keep the keywords from matching inside ordinary names ("Daft", "Swift",
# "Withers").
_FEATURE_TAIL_RE = re.compile(
    r'\s*\b(?:featuring|feat|ft|with|presents)\b\.?.*$',
    re.IGNORECASE,
)
_TRAILING_SEPARATOR_RE = re.compile(r'\s*[,;/]\s*$')

# Delimiters between artists. Word delimiters are \b-anchored so that names
# such as "Brandy" or "Daft Punk" are not split in the middle.
_ARTIST_DELIMITER_RE = re.compile(
    r'\s*(?:\b(?:featuring|feat|ft)\b\.?|\band\b|[&,;/])\s*',
    re.IGNORECASE,
)


def load_json_file(file_path: str) -> Any:
    """Load and parse a JSON file.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file content is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in {file_path}: {e}") from e


def save_json_file(data: Any, file_path: str, indent: int = 2) -> None:
    """Save data to a JSON file, creating parent directories as needed.

    Args:
        data: JSON-serializable value to write.
        file_path: Destination path; written as UTF-8.
        indent: Indentation width passed to ``json.dump``.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)


def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of ``file_path``, including the dot."""
    return os.path.splitext(file_path)[1].lower()


def get_base_filename(file_path: str) -> str:
    """Return ``file_path`` without its extension, for CDG/MP3 pairing."""
    return os.path.splitext(file_path)[0]


def find_mp3_pairs(songs: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Group songs into CDG/MP3 pairs and standalone files.

    Songs are matched on their path without extension. Files whose extension
    is not ``.cdg``, ``.mp3`` or ``.mp4`` are left out of the result, and a
    later song with the same base path and extension replaces an earlier one.

    Args:
        songs: Song dicts; each must have a ``'path'`` key.

    Returns:
        A dict with keys ``'pairs'`` (list of ``[cdg_song, mp3_song]``),
        ``'standalone_mp4'`` and ``'standalone_mp3'`` (unpaired CDG files
        first, then unpaired MP3 files).
    """
    cdg_by_base: Dict[str, Dict[str, Any]] = {}
    mp3_by_base: Dict[str, Dict[str, Any]] = {}
    standalone_mp4 = []

    for song in songs:
        path = song['path']
        extension = get_file_extension(path)
        if extension == '.mp4':
            standalone_mp4.append(song)
        elif extension == '.cdg':
            cdg_by_base[get_base_filename(path)] = song
        elif extension == '.mp3':
            mp3_by_base[get_base_filename(path)] = song

    pairs = []
    standalone_mp3 = []
    for base_name, cdg_song in cdg_by_base.items():
        if base_name in mp3_by_base:
            pairs.append([cdg_song, mp3_by_base[base_name]])
        else:
            # CDG without MP3 - treated as a standalone MP3
            standalone_mp3.append(cdg_song)

    standalone_mp3.extend(
        mp3_song
        for base_name, mp3_song in mp3_by_base.items()
        if base_name not in cdg_by_base
    )

    return {
        'pairs': pairs,
        'standalone_mp4': standalone_mp4,
        'standalone_mp3': standalone_mp3,
    }


def _normalize_text(text: str) -> str:
    """Replace punctuation with spaces, then trim and collapse whitespace."""
    without_punctuation = _PUNCTUATION_RE.sub(' ', text).strip()
    return _WHITESPACE_RE.sub(' ', without_punctuation)


def normalize_artist_title(artist: str, title: str, case_sensitive: bool = False) -> str:
    """Normalize artist and title for consistent matching.

    Args:
        artist: Artist name.
        title: Song title.
        case_sensitive: When False (default) both values are lower-cased.

    Returns:
        ``"<artist>|<title>"`` with punctuation replaced by spaces and
        whitespace collapsed.
    """
    if not case_sensitive:
        artist = artist.lower()
        title = title.lower()
    return f"{_normalize_text(artist)}|{_normalize_text(title)}"


def _channel_from_folders(parts: List[str]) -> Optional[str]:
    """Derive a channel name from the folder components of an MP4 path."""
    # MP4 folder structure: MP4/ChannelName/song.mp4
    for index, part in enumerate(parts[:-1]):
        if part.lower() == 'mp4':
            following = parts[index + 1]
            # A dot marks a file name, i.e. the file sits directly in MP4.
            return following if '.' not in following else 'MP4 Root'

    if len(parts) < 2:
        return None

    # Otherwise use the folder containing the file, or the one above it
    # when that component looks like a file name.
    parent_folder = parts[-2]
    if '.' not in parent_folder:
        return parent_folder
    if len(parts) >= 3 and '.' not in parts[-3]:
        return parts[-3]
    return None


def extract_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract channel information from file path based on configured folder names.

    Args:
        file_path: Path of the song file; ``\\`` and ``/`` separators are
            both accepted.
        channel_priorities: Configured channel names, checked in order.

    Returns:
        The first configured channel whose name occurs (case-insensitively)
        anywhere in the path; otherwise a name derived from the folder
        structure (``'MP4 Root'`` for files directly in an ``MP4`` folder).
        ``None`` for non-MP4 files, when no priorities are configured, or
        when no folder can be determined.
    """
    path_lower = file_path.lower()
    if not path_lower.endswith('.mp4'):
        return None
    if not channel_priorities:
        return None

    for channel in channel_priorities:
        if channel.lower() in path_lower:
            return channel

    return _channel_from_folders(_PATH_SEPARATOR_RE.split(file_path))


def extract_consolidated_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract channel information with consolidated MP3/CDG handling.

    Every ``.mp3`` or ``.cdg`` file is reported under the single channel
    ``'MP3'``, whatever its folder. All other files follow the rules of
    ``extract_channel_from_path``.
    """
    if file_path.lower().endswith(('.mp3', '.cdg')):
        return 'MP3'
    return extract_channel_from_path(file_path, channel_priorities)


def clean_artist_name(artist_string: str) -> str:
    """Clean artist name by removing features, collaborations, etc.

    Everything from the first standalone ``feat``/``ft``/``featuring``/
    ``with``/``presents`` keyword (case-insensitive, optional trailing dot)
    to the end of the string is dropped, along with one dangling
    ``,``/``;``/``/`` separator. Keywords are matched as whole words only.

    Args:
        artist_string: Raw artist string.

    Returns:
        The primary artist name, or ``""`` for empty input. A string that
        starts with a keyword also cleans to ``""``.
    """
    if not artist_string:
        return ""

    cleaned_artist = _FEATURE_TAIL_RE.sub('', artist_string)
    # Drop a separator left dangling by the removal ("A, feat. B" -> "A,").
    cleaned_artist = _TRAILING_SEPARATOR_RE.sub('', cleaned_artist)
    return _WHITESPACE_RE.sub(' ', cleaned_artist).strip()


def parse_multi_artist(artist_string: str) -> List[str]:
    """Parse multi-artist strings with various delimiters.

    Splits on ``feat``/``ft``/``featuring`` (optional trailing dot), ``and``,
    ``&``, ``,``, ``;`` and ``/``. Word delimiters are case-insensitive and
    only match as whole words.

    Args:
        artist_string: Raw artist string.

    Returns:
        The individual artist names in order of appearance, stripped, with
        empty entries removed; ``[]`` for empty input.
    """
    if not artist_string:
        return []

    pieces = _ARTIST_DELIMITER_RE.split(artist_string)
    return [piece.strip() for piece in pieces if piece.strip()]


def format_file_size(size_bytes: int) -> str:
    """Format file size in human readable format.

    Args:
        size_bytes: Size in bytes.

    Returns:
        ``"0B"`` for zero, otherwise the size with one decimal place in the
        largest fitting unit among B, KB, MB and GB (1024-based).
    """
    if size_bytes == 0:
        return "0B"

    units = ["B", "KB", "MB", "GB"]
    size = size_bytes
    unit_index = 0
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024.0
        unit_index += 1

    return f"{size:.1f}{units[unit_index]}"


def validate_song_data(song: Dict[str, Any]) -> bool:
    """Validate that a song object has required fields.

    Returns:
        True when ``'artist'``, ``'title'`` and ``'path'`` are all present
        with truthy values.
    """
    required_fields = ['artist', 'title', 'path']
    return all(field in song and song[field] for field in required_fields)