KaraokeVideoDownloader/karaoke_downloader/file_utils.py

240 lines
6.9 KiB
Python

"""
File utilities for filename sanitization, path operations, and file validation.
Centralizes common file operations to eliminate code duplication.
"""
import re
from pathlib import Path
from typing import List, Optional, Tuple
# Constants for filename operations.
# A generated filename longer than DEFAULT_FILENAME_LENGTH_LIMIT characters is
# rebuilt from the artist cut to DEFAULT_ARTIST_LENGTH_LIMIT characters and the
# title cut to DEFAULT_TITLE_LENGTH_LIMIT characters.
DEFAULT_FILENAME_LENGTH_LIMIT = 100
DEFAULT_ARTIST_LENGTH_LIMIT = 30
DEFAULT_TITLE_LENGTH_LIMIT = 60
# Characters that are invalid in Windows filenames (both path separators
# included); they are removed from text before it is used in a filename.
INVALID_FILENAME_CHARS = ["?", ":", "*", '"', "<", ">", "|", "/", "\\"]
def sanitize_filename(
    artist: str, title: str, max_length: int = DEFAULT_FILENAME_LENGTH_LIMIT
) -> str:
    """
    Create a safe ``"<artist> - <title>.mp4"`` filename from artist and title.

    The title loses every "(From " marker, every ")", has " - " collapsed to a
    single space, and then loses colons, quotes, dots and all characters in
    INVALID_FILENAME_CHARS. The artist loses quotes and the characters in
    INVALID_FILENAME_CHARS. Both parts are stripped of surrounding whitespace.

    If the result is longer than ``max_length``, the artist and title are cut
    to DEFAULT_ARTIST_LENGTH_LIMIT and DEFAULT_TITLE_LENGTH_LIMIT characters.
    If that is still too long (only possible for a ``max_length`` smaller than
    those limits allow), the title and then the artist are shortened further
    so the limit is honoured.

    Args:
        artist: Song artist name
        title: Song title
        max_length: Maximum filename length (default: 100)

    Returns:
        Sanitized filename string. It is never shorter than the fixed
        " - " separator plus ".mp4" extension, even for a smaller
        ``max_length``.
    """
    separator = " - "
    extension = ".mp4"
    invalid = set(INVALID_FILENAME_CHARS)
    title_drop = invalid | {":", "'", '"', "."}
    artist_drop = invalid | {"'", '"'}

    # The multi-character rewrites depend on each other's output, so they run
    # first and in this exact order; the single-character removals that follow
    # are independent of one another and are applied in a single pass.
    safe_title = title.replace("(From ", "").replace(")", "").replace(" - ", " ")
    safe_title = "".join(ch for ch in safe_title if ch not in title_drop).strip()
    safe_artist = "".join(ch for ch in artist if ch not in artist_drop).strip()

    filename = f"{safe_artist}{separator}{safe_title}{extension}"
    if len(filename) <= max_length:
        return filename

    short_artist = safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]
    short_title = safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]

    # The fixed limits alone do not guarantee max_length; trim whatever still
    # overflows, taking it from the title before touching the artist.
    overflow = (
        len(short_artist)
        + len(separator)
        + len(short_title)
        + len(extension)
        - max_length
    )
    if overflow > 0:
        title_cut = min(overflow, len(short_title))
        short_title = short_title[: len(short_title) - title_cut]
        artist_cut = min(overflow - title_cut, len(short_artist))
        short_artist = short_artist[: len(short_artist) - artist_cut]

    return f"{short_artist}{separator}{short_title}{extension}"
def generate_possible_filenames(
    artist: str, title: str, channel_name: str
) -> List[str]:
    """
    Generate possible filename patterns for different download modes.

    The title is cleaned with sanitize_title_for_filenames(). The artist loses
    quotes and every character in INVALID_FILENAME_CHARS, the same artist
    cleanup sanitize_filename() applies, so a candidate never contains a path
    separator or a character Windows rejects. ``channel_name`` is used as
    given.

    Args:
        artist: Song artist name
        title: Song title
        channel_name: Channel name

    Returns:
        List of three candidate filenames, in this order: songlist mode,
        latest-per-channel mode, channel videos mode
    """
    safe_title = sanitize_title_for_filenames(title)
    artist_drop = set(INVALID_FILENAME_CHARS) | {"'", '"'}
    safe_artist = "".join(ch for ch in artist if ch not in artist_drop).strip()
    return [
        f"{safe_artist} - {safe_title}.mp4",  # Songlist mode
        f"{channel_name} - {safe_title}.mp4",  # Latest-per-channel mode
        f"{safe_artist} - {safe_title} (Karaoke Version).mp4",  # Channel videos mode
    ]
def sanitize_title_for_filenames(title: str) -> str:
    """
    Clean a song title so it can be embedded in a filename.

    Every character listed in INVALID_FILENAME_CHARS and every dot is removed,
    then leading and trailing whitespace is stripped.

    Args:
        title: Song title

    Returns:
        Sanitized title string
    """
    unwanted = set(INVALID_FILENAME_CHARS)
    unwanted.add(".")
    kept = [ch for ch in title if ch not in unwanted]
    return "".join(kept).strip()
def check_file_exists_with_patterns(
    downloads_dir: Path, channel_name: str, artist: str, title: str
) -> Tuple[bool, Optional[Path]]:
    """
    Check if a file exists using multiple possible filename patterns.

    Each pattern from generate_possible_filenames() is looked up inside
    ``downloads_dir / channel_name``, first under its exact name and then with
    " (2)" through " (9)" inserted before the extension. A candidate only
    counts when it is a regular, non-empty file. Patterns longer than
    DEFAULT_FILENAME_LENGTH_LIMIT are replaced by the truncated
    "<artist> - <title>.mp4" form.

    Args:
        downloads_dir: Base downloads directory
        channel_name: Channel name
        artist: Song artist
        title: Song title

    Returns:
        Tuple of (exists, file_path) where file_path is None if not found
    """
    extension = ".mp4"
    channel_dir = downloads_dir / channel_name

    def _is_nonempty_file(path: Path) -> bool:
        # A directory that happens to carry the name, or a path that cannot be
        # inspected at all, is treated as "not downloaded" instead of matching
        # or raising.
        try:
            return path.is_file() and path.stat().st_size > 0
        except OSError:
            return False

    already_checked = set()
    for filename in generate_possible_filenames(artist, title, channel_name):
        if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT:
            # Apply length limits if needed. The artist is cleaned the same
            # way sanitize_filename() cleans it before truncating.
            artist_drop = set(INVALID_FILENAME_CHARS) | {"'", '"'}
            safe_artist = "".join(
                ch for ch in artist if ch not in artist_drop
            ).strip()
            safe_title = sanitize_title_for_filenames(title)
            filename = (
                f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - "
                f"{safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}{extension}"
            )

        # Several over-long patterns collapse to the same truncated name;
        # probing the filesystem for it once is enough.
        if filename in already_checked:
            continue
        already_checked.add(filename)

        # Check for exact filename match
        file_path = channel_dir / filename
        if _is_nonempty_file(file_path):
            return True, file_path

        # Check for files with (2), (3), etc. suffixes. Only the trailing
        # extension is cut off; a ".mp4" elsewhere in the name is kept.
        base_name = filename[: -len(extension)]
        for suffix in range(2, 10):  # Check up to (9)
            suffixed_path = channel_dir / f"{base_name} ({suffix}){extension}"
            if _is_nonempty_file(suffixed_path):
                return True, suffixed_path

    return False, None
def get_unique_filename(
    downloads_dir: Path, channel_name: str, artist: str, title: str
) -> Tuple[Path, bool]:
    """
    Get the download path for a song, reusing an existing file when there is one.

    The target path is ``downloads_dir / channel_name / sanitize_filename(...)``.
    Existing files are searched with check_file_exists_with_patterns(); when
    that finds nothing, the target path itself is checked as well, because
    sanitize_filename() cleans titles differently from the patterns that
    function probes. When a file is found, a notice is printed to stdout.

    Args:
        downloads_dir: Base downloads directory
        channel_name: Channel name
        artist: Song artist
        title: Song title

    Returns:
        Tuple of (file_path, is_existing) where is_existing indicates if a file already exists
    """
    file_path = downloads_dir / channel_name / sanitize_filename(artist, title)

    # Check if file already exists
    found, match = check_file_exists_with_patterns(
        downloads_dir, channel_name, artist, title
    )
    if not (found and match):
        match = None
        try:
            # Same criterion as the pattern check: a non-empty file.
            if file_path.is_file() and file_path.stat().st_size > 0:
                match = file_path
        except OSError:
            # A target that cannot be inspected is treated as not present.
            match = None

    if match is not None:
        print(f"📁 File already exists: {match.name}")
        return match, True
    return file_path, False
def ensure_directory_exists(directory: Path) -> None:
    """
    Create a directory, together with any missing parents, if it is not there yet.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Directory path to ensure exists
    """
    directory.mkdir(exist_ok=True, parents=True)
def is_valid_mp4_file(file_path: Path) -> bool:
    """
    Check if a file is a valid MP4 file.

    This is a lightweight check: the name must end in ".mp4" (case-insensitive)
    and bytes 4-7 of the file must be the ASCII tag "ftyp". Missing, empty,
    too-short or unreadable files are reported as invalid rather than raising.

    Args:
        file_path: Path to the file to check

    Returns:
        True if file is a valid MP4, False otherwise
    """
    # Check file extension first; it needs no filesystem access.
    if file_path.suffix.lower() != ".mp4":
        return False

    # Basic MP4 header check: "ftyp" is expected at byte offsets 4-7, so the
    # first 8 bytes are enough. A missing file, a directory or a permission
    # problem surfaces as OSError here; an empty file yields a short read.
    try:
        with open(file_path, "rb") as f:
            header = f.read(8)
    except OSError:
        return False

    return len(header) >= 8 and header[4:8] == b"ftyp"
def cleanup_temp_files(file_path: Path) -> None:
    """
    Clean up temporary files created by yt-dlp.

    For each temporary extension (.info.json, .meta, .webp, .jpg, .png) two
    sibling paths are removed if present: the path with its last suffix
    replaced by the extension, and the path with the extension appended to the
    full name. The second form is what makes an extension-less base such as
    "Dr. Dre - Song" work, where pathlib would otherwise treat everything
    after the first dot as a suffix. Removal is best-effort: files that are
    missing or cannot be deleted are skipped silently.

    Args:
        file_path: Base file path (without extension); a path that still
            carries its media extension is handled as well
    """
    temp_extensions = [".info.json", ".meta", ".webp", ".jpg", ".png"]
    for ext in temp_extensions:
        candidates = (
            file_path.with_suffix(ext),
            file_path.with_name(file_path.name + ext),
        )
        for temp_file in candidates:
            try:
                temp_file.unlink()
            except OSError:
                pass  # Ignore cleanup errors (includes "file not found")