310 lines
11 KiB
Python
310 lines
11 KiB
Python
"""
|
|
Utility functions for the Karaoke Song Library Cleanup Tool.
|
|
"""
|
|
import json
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
|
|
def load_json_file(file_path: str) -> Any:
    """Load and parse a JSON file.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file content is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError as e:
        # Chain explicitly so the original errno/filename stay reachable
        # through __cause__ instead of being reported as an error that
        # happened "during handling" of another one.
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in {file_path}: {e}") from e
|
|
|
|
|
|
def save_json_file(data: Any, file_path: str, indent: int = 2) -> None:
    """Save data to a JSON file.

    Missing parent directories are created first. The file is written as
    UTF-8 with non-ASCII characters kept verbatim (``ensure_ascii=False``).

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path; may be a bare filename.
        indent: Indentation width passed to ``json.dump``.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)
|
|
|
|
|
|
def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of ``file_path``, dot included.

    An empty string is returned when the path has no extension.
    """
    _stem, suffix = os.path.splitext(file_path)
    return suffix.lower()
|
|
|
|
|
|
def get_base_filename(file_path: str) -> str:
    """Return ``file_path`` with its extension removed.

    The directory portion is kept, so a CDG file and an MP3 file that sit
    side by side with the same name yield the same value.
    """
    stem, _suffix = os.path.splitext(file_path)
    return stem
|
|
|
|
|
|
def find_mp3_pairs(songs: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Sort songs into CDG/MP3 pairs and standalone files.

    Each song's ``'path'`` is split into (path-without-extension, extension).
    A ``.cdg`` and a ``.mp3`` entry sharing the same path-without-extension
    form a pair. Entries with any other extension than ``.cdg``, ``.mp3`` or
    ``.mp4`` are ignored.

    Returns a dict with keys:
      'pairs'          - list of ``[cdg_song, mp3_song]`` lists
      'standalone_mp4' - every ``.mp4`` song, in input order
      'standalone_mp3' - unpaired ``.cdg`` songs followed by unpaired
                         ``.mp3`` songs
    """
    cdg_by_stem = {}
    mp3_by_stem = {}
    mp4_songs = []

    for entry in songs:
        stem, suffix = os.path.splitext(entry['path'])
        suffix = suffix.lower()
        if suffix == '.mp4':
            mp4_songs.append(entry)
        elif suffix == '.cdg':
            cdg_by_stem[stem] = entry
        elif suffix == '.mp3':
            mp3_by_stem[stem] = entry

    # A CDG whose stem also has an MP3 forms a pair.
    matched = [
        [cdg_entry, mp3_by_stem[stem]]
        for stem, cdg_entry in cdg_by_stem.items()
        if stem in mp3_by_stem
    ]

    # Leftovers on either side are reported together as standalone MP3s,
    # lone CDGs first.
    lone_cdg = [
        cdg_entry
        for stem, cdg_entry in cdg_by_stem.items()
        if stem not in mp3_by_stem
    ]
    lone_mp3 = [
        mp3_entry
        for stem, mp3_entry in mp3_by_stem.items()
        if stem not in cdg_by_stem
    ]

    return {
        'pairs': matched,
        'standalone_mp4': mp4_songs,
        'standalone_mp3': lone_cdg + lone_mp3,
    }
|
|
|
|
|
|
def normalize_artist_title(artist: str, title: str, case_sensitive: bool = False) -> str:
    """Build a normalized ``"artist|title"`` key for consistent matching.

    Both values are lower-cased unless ``case_sensitive`` is true, every
    character that is neither a word character nor whitespace is replaced
    by a space, and whitespace runs are collapsed to a single space.
    """
    def _tidy(text: str) -> str:
        # Punctuation becomes a space first, so "AC/DC" ends up as "ac dc".
        spaced = re.sub(r'[^\w\s]', ' ', text).strip()
        return re.sub(r'\s+', ' ', spaced)

    if not case_sensitive:
        artist = artist.lower()
        title = title.lower()

    return '|'.join((_tidy(artist), _tidy(title)))
|
|
|
|
|
|
def extract_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract channel information from file path based on configured folder names.

    Only ``.mp4`` paths are considered, and only when at least one channel
    priority is configured; in every other case ``None`` is returned.

    Resolution order:
      1. The first entry of ``channel_priorities`` that occurs anywhere in
         the path (case-insensitive substring match) is returned exactly as
         configured.
      2. If the path contains a folder named ``MP4`` (any case), the folder
         directly below it is returned, or ``'MP4 Root'`` when the file sits
         directly inside the ``MP4`` folder.
      3. Otherwise the folder containing the file is returned.

    Args:
        file_path: Path of the media file. Both ``\\`` and ``/`` are accepted
            as separators.
        channel_priorities: Configured channel folder names, highest
            priority first.

    Returns:
        The channel name, ``'MP4 Root'``, or ``None`` when nothing applies.
    """
    if not channel_priorities or not file_path.lower().endswith('.mp4'):
        return None

    # Plain substring test; no regex machinery is needed for a literal.
    path_lower = file_path.lower()
    for channel in channel_priorities:
        if channel.lower() in path_lower:
            return channel

    # Split on either separator and drop empty components produced by
    # leading or doubled separators.
    parts = [part for part in re.split(r'[\\/]', file_path) if part]
    # The last component is the file itself; everything before it is a
    # folder. Deciding by position (rather than by "contains a dot") keeps
    # folder names such as "Vol. 2" from being mistaken for files.
    folders = parts[:-1]

    for index, folder in enumerate(folders):
        if folder.lower() == 'mp4':
            if index + 1 < len(folders):
                return folders[index + 1]
            return 'MP4 Root'

    return folders[-1] if folders else None
|
|
|
|
|
|
def extract_consolidated_channel_from_path(file_path: str, channel_priorities: Optional[List[str]] = None) -> Optional[str]:
    """Extract channel information with consolidated MP3/CDG handling.

    Every ``.mp3`` or ``.cdg`` path (extension compared case-insensitively)
    is reported under the single channel ``'MP3'``, regardless of the folder
    it lives in. All other paths are delegated to
    ``extract_channel_from_path`` so the MP4 channel rules exist in one
    place only.

    Args:
        file_path: Path of the media file.
        channel_priorities: Configured channel folder names, highest
            priority first; only used for non-MP3/CDG paths.

    Returns:
        ``'MP3'`` for MP3/CDG files, otherwise whatever
        ``extract_channel_from_path`` returns for the same arguments.
    """
    # Whether or not the file sits under a "CDG" folder, the consolidated
    # channel is the same, so no folder scan is needed.
    if file_path.lower().endswith(('.mp3', '.cdg')):
        return 'MP3'

    return extract_channel_from_path(file_path, channel_priorities)
|
|
|
|
|
|
def clean_artist_name(artist_string: str) -> str:
    """Clean artist name by removing features, collaborations, etc.

    Everything from the first feature marker to the end of the string is
    dropped. Recognized markers (case-insensitive) are ``feat``, ``ft``,
    ``featuring``, ``with`` and ``presents``, optionally followed by a dot.
    A marker only counts when it is a whole word preceded by whitespace or
    by one of ``, ; / ( [``, so names that merely contain those letters
    ("Taylor Swift", "Daft Punk", "Bill Withers") are left intact.

    Args:
        artist_string: Raw artist text; may be empty or ``None``.

    Returns:
        The primary artist with trailing separators removed and whitespace
        collapsed, or ``""`` for empty input.
    """
    if not artist_string:
        return ""

    # Leading part: required whitespace or a separator/opening bracket, which
    # doubles as the left word boundary. Trailing \b stops "with" from
    # matching the start of "Withers".
    feature_suffix = (
        r'(?:\s+|\s*[,;/(\[]\s*)'
        r'(?:featuring|feat|ft|with|presents)\b\.?.*$'
    )
    cleaned_artist = re.sub(feature_suffix, '', artist_string, flags=re.IGNORECASE)

    # Clean up any trailing separators that might be left
    cleaned_artist = re.sub(r'\s*[,;/]\s*$', '', cleaned_artist)

    # Clean up extra whitespace
    return re.sub(r'\s+', ' ', cleaned_artist).strip()
|
|
|
|
|
|
def parse_multi_artist(artist_string: str) -> List[str]:
    """Parse multi-artist strings with various delimiters.

    The string is split on ``&``, ``,``, ``;``, ``/`` and on the whole
    words ``feat``, ``ft``, ``featuring`` (optionally followed by a dot)
    and ``and``. Word delimiters are matched case-insensitively and only at
    word boundaries, so names such as "Sandra", "Brandy" or "Taylor Swift"
    are not cut apart.

    Args:
        artist_string: Raw artist text; may be empty or ``None``.

    Returns:
        The individual artist names in their original order, stripped of
        surrounding whitespace, with empty fragments removed.
    """
    if not artist_string:
        return []

    # One non-capturing alternation, so re.split() returns only the artist
    # fragments and never the delimiters themselves.
    delimiter = (
        r'\s*(?:'
        r'\b(?:featuring|feat|ft)\b\.?'
        r'|\band\b'
        r'|[&,;/]'
        r')\s*'
    )
    fragments = re.split(delimiter, artist_string, flags=re.IGNORECASE)
    return [name.strip() for name in fragments if name.strip()]
|
|
|
|
|
|
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit (GB) is reached, then printed with one decimal place.
    Exactly zero is rendered as ``"0B"``.
    """
    if size_bytes == 0:
        return "0B"

    units = ("B", "KB", "MB", "GB")
    amount = size_bytes
    for unit in units[:-1]:
        if amount >= 1024:
            amount /= 1024.0
            continue
        return f"{amount:.1f}{unit}"

    # Still at or above 1024 after every smaller unit: stay in the largest one.
    return f"{amount:.1f}{units[-1]}"
|
|
|
|
|
|
def validate_song_data(song: Dict[str, Any]) -> bool:
    """Report whether a song has a non-empty artist, title and path.

    Returns ``False`` as soon as one of the three keys is missing or maps
    to a falsy value, ``True`` otherwise.
    """
    for key in ('artist', 'title', 'path'):
        if key not in song or not song[key]:
            return False
    return True