KaraokeMerge/cli/utils.py

266 lines
9.1 KiB
Python

"""
Utility functions for the Karaoke Song Library Cleanup Tool.
"""
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional
def load_json_file(file_path: str) -> Any:
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file content is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError as e:
        # Chain explicitly so the original OS-level error stays attached as __cause__.
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except json.JSONDecodeError as e:
        # Re-raised as a plain ValueError with the file name; the decoder
        # error (line/column details) remains reachable through __cause__.
        raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
def save_json_file(data: Any, file_path: str, indent: int = 2) -> None:
    """Serialize ``data`` as JSON and write it to ``file_path`` (UTF-8).

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than escaped.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path; may be a bare file name.
        indent: Indentation width passed to ``json.dump``.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)
def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of ``file_path``, including the dot.

    An empty string is returned when the path has no extension.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()
def get_base_filename(file_path: str) -> str:
    """Return ``file_path`` with its extension removed.

    The directory part is kept, so a CDG file and an MP3 file that sit next to
    each other under the same name produce the same value.
    """
    stem, _ = os.path.splitext(file_path)
    return stem
def find_mp3_pairs(songs: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Split songs into CDG/MP3 pairs and standalone files.

    Songs are matched by their path without the extension. The extension is
    compared case-insensitively; the rest of the path is compared as-is. When
    several songs share the same extension-less path and type, the last one
    seen wins. Files that are not ``.cdg``, ``.mp3`` or ``.mp4`` are ignored.

    Args:
        songs: Song dicts; each must have a ``'path'`` key.

    Returns:
        A dict with three keys:
        ``'pairs'`` - list of ``[cdg_song, mp3_song]`` lists,
        ``'standalone_mp4'`` - all MP4 songs in input order,
        ``'standalone_mp3'`` - unpaired CDG songs followed by unpaired MP3 songs.
    """
    # One lookup per audio-side extension, keyed by the extension-less path.
    by_stem: Dict[str, Dict[str, Dict[str, Any]]] = {'.cdg': {}, '.mp3': {}}
    videos = []

    for entry in songs:
        stem, suffix = os.path.splitext(entry['path'])
        suffix = suffix.lower()
        if suffix == '.mp4':
            videos.append(entry)
        elif suffix in by_stem:
            by_stem[suffix][stem] = entry

    graphics = by_stem['.cdg']
    audio = by_stem['.mp3']

    # A CDG with a matching MP3 forms a pair (treated as MP3 downstream).
    matched = [
        [cdg_entry, audio[stem]]
        for stem, cdg_entry in graphics.items()
        if stem in audio
    ]

    # Unpaired CDGs come first, then unpaired MP3s.
    lonely = [
        cdg_entry for stem, cdg_entry in graphics.items() if stem not in audio
    ]
    lonely.extend(
        mp3_entry for stem, mp3_entry in audio.items() if stem not in graphics
    )

    return {
        'pairs': matched,
        'standalone_mp4': videos,
        'standalone_mp3': lonely
    }
def normalize_artist_title(artist: str, title: str, case_sensitive: bool = False) -> str:
    """Build a normalized ``"artist|title"`` key for matching songs.

    Each part is lower-cased (unless ``case_sensitive`` is true), has every
    character that is neither a word character nor whitespace replaced by a
    space, is stripped, and has runs of whitespace collapsed to one space.

    Args:
        artist: Artist name.
        title: Song title.
        case_sensitive: Keep the original letter case when true.

    Returns:
        The two cleaned parts joined with ``|``.
    """
    def _clean(text: str) -> str:
        # Fold case first so the key is case-insensitive by default.
        if not case_sensitive:
            text = text.lower()
        # Punctuation becomes a space so "AC/DC" and "AC DC" end up identical.
        without_punctuation = re.sub(r'[^\w\s]', ' ', text).strip()
        return re.sub(r'\s+', ' ', without_punctuation)

    return f"{_clean(artist)}|{_clean(title)}"
# Matches either path separator so Windows-style and POSIX-style paths are split alike.
_PATH_SEPARATOR_RE = re.compile(r'[\\/]')


def _fallback_channel_from_parts(parts: List[str]) -> Optional[str]:
    """Derive a channel name from path components when no configured channel matched."""
    # Preferred layout: MP4/ChannelName/song.mp4 -> the component right after "MP4".
    for index, part in enumerate(parts[:-1]):
        if part.lower() == 'mp4':
            next_part = parts[index + 1]
            # A dot in the next component is taken to mean "this is the file
            # itself", i.e. the file sits directly in the MP4 folder.
            # NOTE(review): a channel folder whose name contains a dot is
            # therefore reported as 'MP4 Root' - confirm this is intended.
            return next_part if '.' not in next_part else 'MP4 Root'

    # No MP4 folder in the path: fall back to the folder holding the file.
    if len(parts) >= 2:
        parent_folder = parts[-2]
        if '.' not in parent_folder:
            return parent_folder
        # The parent looks like a file name; try one level further up.
        if len(parts) >= 3 and '.' not in parts[-3]:
            return parts[-3]
    return None


def extract_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract the channel of an MP4 file from its path.

    A configured channel name that occurs anywhere in the path
    (case-insensitive substring match) wins, in the order given. Otherwise the
    channel is taken from the folder structure: the folder following an
    ``MP4`` folder, ``'MP4 Root'`` when the file sits directly inside it, or
    else the folder containing the file. Both ``\\`` and ``/`` are accepted
    as path separators.

    Args:
        file_path: Path of the song file.
        channel_priorities: Configured channel folder names, highest priority
            first.

    Returns:
        The channel name, or ``None`` when the file is not an MP4, no channel
        priorities are configured, or no channel can be derived.
    """
    path_lower = file_path.lower()
    if not path_lower.endswith('.mp4'):
        return None
    if not channel_priorities:
        return None

    # Configured channels take precedence over anything inferred from folders.
    for channel in channel_priorities:
        if channel.lower() in path_lower:
            return channel

    return _fallback_channel_from_parts(_PATH_SEPARATOR_RE.split(file_path))


def extract_consolidated_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract the channel of a song, consolidating all MP3/CDG files under ``'MP3'``.

    Args:
        file_path: Path of the song file.
        channel_priorities: Configured channel folder names, highest priority
            first; only used for MP4 files.

    Returns:
        ``'MP3'`` for any ``.mp3`` or ``.cdg`` file regardless of its folder;
        for other files, whatever ``extract_channel_from_path`` returns.
    """
    if file_path.lower().endswith(('.mp3', '.cdg')):
        return 'MP3'
    # MP4 files (and everything else) follow the regular channel logic.
    return extract_channel_from_path(file_path, channel_priorities)
# Word delimiters (featuring / feat / ft / and) must stand alone as words so
# names such as "Swift", "Brandy" or "Defeat" are not cut apart; symbol
# delimiters (& , ; /) split wherever they occur. Whitespace around a
# delimiter is consumed with it. "featuring" is listed with "feat" under one
# word-bounded group so the shorter form cannot match inside the longer one.
_ARTIST_DELIMITER_RE = re.compile(
    r'\s*(?:\b(?:featuring|feat|ft)\b\.?|\band\b|[&,;/])\s*',
    re.IGNORECASE,
)


def parse_multi_artist(artist_string: str) -> List[str]:
    """Split a multi-artist credit into the individual artist names.

    Recognized delimiters are ``feat``/``feat.``, ``ft``/``ft.``,
    ``featuring`` and ``and`` (as whole words, any letter case) plus the
    symbols ``&``, ``,``, ``;`` and ``/``.

    Args:
        artist_string: Raw artist credit, e.g. ``"A feat. B & C"``.

    Returns:
        The artist names in order of appearance, stripped of surrounding
        whitespace and with empty fragments removed. An empty input yields an
        empty list.
    """
    if not artist_string:
        return []

    fragments = _ARTIST_DELIMITER_RE.split(artist_string)
    return [name for name in (fragment.strip() for fragment in fragments) if name]
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (GB) is reached, then printed with one decimal place followed by the
    unit, e.g. ``"1.5KB"``. Exactly zero is rendered as ``"0B"``.
    """
    if size_bytes == 0:
        return "0B"

    units = ("B", "KB", "MB", "GB")
    value = size_bytes
    # Walk up the units; the last one is the catch-all for anything larger.
    for unit in units[:-1]:
        if value >= 1024:
            value /= 1024.0
        else:
            return f"{value:.1f}{unit}"
    return f"{value:.1f}{units[-1]}"
def validate_song_data(song: Dict[str, Any]) -> bool:
    """Check that a song has a non-empty artist, title and path.

    Returns:
        ``True`` when ``'artist'``, ``'title'`` and ``'path'`` are all present
        with truthy values, ``False`` otherwise.
    """
    for field in ('artist', 'title', 'path'):
        # A missing key and an empty/falsy value are both treated as invalid.
        if field not in song or not song[field]:
            return False
    return True