260 lines
11 KiB
Python
260 lines
11 KiB
Python
"""
Channel-specific parsing utilities for extracting artist and title from video titles.

This module handles the different title formats used by various karaoke channels,
providing channel-specific parsing rules to extract artist and title information
correctly for ID3 tagging and filename generation.
"""
|
||
|
||
import json
|
||
import re
|
||
from typing import Dict, List, Optional, Tuple, Any
|
||
from pathlib import Path
|
||
|
||
from karaoke_downloader.data_path_manager import get_data_path_manager
|
||
|
||
|
||
class ChannelParser:
    """Handles channel-specific parsing of video titles to extract artist and title.

    The parser reads a JSON configuration with two top-level keys:

    * ``"channels"``: a list of channel entries. Each entry may carry
      ``"name"``, ``"url"`` and ``"parsing_rules"``.
    * ``"global_parsing_settings"``: fallback settings used when a channel is
      unknown or declares an unrecognised format.
    """

    # Runs of spaces that separate artist from title, tried in this order.
    # Written as ``" " * n`` so the exact widths cannot be lost to whitespace
    # collapsing. A 4-space entry would be redundant: any run of four spaces
    # already contains the 3-space pattern.
    _SPACE_PATTERNS: Tuple[str, ...] = (" " * 3, " " * 2)

    # Dash separators used as a fallback: regular dash, en dash, and a dash
    # without a trailing space.
    _DASH_PATTERNS: Tuple[str, ...] = (" - ", " – ", " -")

    def __init__(self, channels_file: Optional[str] = None):
        """Initialize the parser with channel configuration.

        Args:
            channels_file: Path to the channels JSON file. When ``None`` the
                path is obtained from the data path manager.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
        """
        if channels_file is None:
            channels_file = str(get_data_path_manager().get_channels_json_path())
        self.channels_file = Path(channels_file)
        self.channels_config = self._load_channels_config()

    def _load_channels_config(self) -> Dict[str, Any]:
        """Load the channels configuration from the JSON file.

        Returns:
            The decoded JSON document.

        Raises:
            FileNotFoundError: If ``self.channels_file`` does not exist.
        """
        if not self.channels_file.exists():
            raise FileNotFoundError(f"Channels configuration file not found: {self.channels_file}")

        with open(self.channels_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_channel_config(self, channel_name: str) -> Optional[Dict[str, Any]]:
        """Get the configuration for a specific channel.

        Args:
            channel_name: The channel name to look up (exact match).

        Returns:
            The first channel entry whose ``"name"`` equals ``channel_name``,
            or ``None`` when no entry matches.
        """
        for channel in self.channels_config.get("channels", []):
            # ``.get`` so one entry without a name cannot break the lookup
            # of every other channel.
            if channel.get("name") == channel_name:
                return channel
        return None

    def extract_artist_title(self, video_title: str, channel_name: str) -> Tuple[str, str]:
        """
        Extract artist and title from a video title using channel-specific parsing rules.

        Args:
            video_title: The full video title from YouTube
            channel_name: The name of the channel (must match config)

        Returns:
            Tuple of (artist, title). The artist is an empty string when the
            title cannot be split; the title is then the stripped input.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            # Unknown channel: fall back to the global settings.
            return self._fallback_parse(video_title)

        # ``or {}`` tolerates an explicit null for "parsing_rules".
        parsing_rules = channel_config.get("parsing_rules") or {}
        format_type = parsing_rules.get("format", "artist_title_separator")

        if format_type == "artist_title_separator":
            return self._parse_artist_title_separator(video_title, parsing_rules)
        if format_type == "artist_title_spaces":
            return self._parse_artist_title_spaces(video_title, parsing_rules)
        if format_type == "title_artist_pipe":
            return self._parse_title_artist_pipe(video_title, parsing_rules)
        return self._fallback_parse(video_title)

    def _parse_artist_title_separator(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist - Title' or 'Title - Artist'.

        Reads ``"separator"`` (default ``" - "``), ``"artist_first"`` (default
        ``True``) and ``"title_cleanup"`` from ``rules``. The title is split on
        the first occurrence of the separator, and the ``"title_cleanup"``
        rules are applied to both halves.

        Returns:
            Tuple of (artist, title), or ``("", stripped title)`` when the
            separator is empty or absent.
        """
        separator = rules.get("separator", " - ")
        # An empty separator would make str.partition raise ValueError.
        if not separator or separator not in video_title:
            return "", video_title.strip()

        first, _, second = video_title.partition(separator)
        cleanup_rules = rules.get("title_cleanup", {})
        first_clean = self._cleanup_title(first, cleanup_rules)
        second_clean = self._cleanup_title(second, cleanup_rules)

        if rules.get("artist_first", True):
            return first_clean, second_clean
        return second_clean, first_clean

    def _parse_artist_title_spaces(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist   Title' (separated by multiple spaces).

        The title is split on the LAST occurrence of a multi-space run so that
        artist strings containing commas (e.g. "Artist1, Artist2") stay whole.
        Dash separators are tried as a fallback for inconsistently formatted
        titles. ``"title_cleanup"`` rules are applied to the title part only.

        Returns:
            Tuple of (artist, title), or ``("", stripped title)`` when no
            pattern matches.
        """
        # Strip first so leading/trailing whitespace is never taken for the
        # artist/title gap.
        stripped = video_title.strip()

        for patterns in (self._SPACE_PATTERNS, self._DASH_PATTERNS):
            split = self._split_on_last(stripped, patterns)
            if split is not None:
                artist, title_part = split
                title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
                return artist, title

        # If no pattern matches, return empty artist and full title.
        return "", stripped

    def _parse_title_artist_pipe(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Title | Artist'.

        Reads ``"separator"`` (default ``" | "``) from ``rules``, splits on
        its first occurrence, then applies ``"title_cleanup"`` to the title and
        ``"artist_cleanup"`` to the artist.

        Returns:
            Tuple of (artist, title), or ``("", stripped title)`` when the
            separator is empty or absent.
        """
        separator = rules.get("separator", " | ")
        # An empty separator would make str.partition raise ValueError.
        if not separator or separator not in video_title:
            return "", video_title.strip()

        title_part, _, artist_part = video_title.partition(separator)
        title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
        artist = self._cleanup_title(artist_part, rules.get("artist_cleanup", {}))
        return artist, title

    def _cleanup_title(self, text: str, cleanup_rules: Dict[str, Any]) -> str:
        """Apply cleanup rules to remove suffixes and normalize text.

        Only the ``"remove_suffix"`` rule is handled: the first entry of its
        ``"suffixes"`` list that the text ends with is removed.

        Returns:
            The stripped text, minus at most one configured suffix.
        """
        cleaned = text.strip()
        if not cleanup_rules:
            return cleaned

        # ``or {}`` tolerates an explicit null for "remove_suffix".
        suffix_rule = cleanup_rules.get("remove_suffix") or {}
        return self._strip_first_suffix(cleaned, suffix_rule.get("suffixes", []))

    def _fallback_parse(self, video_title: str) -> Tuple[str, str]:
        """Fallback parsing using global settings.

        Reads ``"fallback_format"``, ``"fallback_separator"`` and
        ``"common_suffixes"`` from ``"global_parsing_settings"``. Only the
        ``"artist_title_separator"`` format is supported.

        Returns:
            Tuple of (artist, title), or ``("", stripped title)`` when the
            title cannot be split.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        fallback_format = global_settings.get("fallback_format", "artist_title_separator")
        fallback_separator = global_settings.get("fallback_separator", " - ")

        if (
            fallback_format == "artist_title_separator"
            and fallback_separator
            and fallback_separator in video_title
        ):
            artist, _, title = video_title.partition(fallback_separator)
            # Apply global suffix cleanup to the title only.
            title = self._strip_first_suffix(
                title.strip(), global_settings.get("common_suffixes", [])
            )
            return artist.strip(), title

        # If all else fails, return empty artist and full title.
        return "", video_title.strip()

    def is_playlist_title(self, video_title: str, channel_name: str) -> bool:
        """Check if a video title appears to be a playlist rather than a single song.

        Uses the channel's own ``"playlist_indicators"`` when it has any.
        Otherwise (unknown channel or no indicators) the global rules apply.
        Matching is a case-insensitive substring test.
        """
        channel_config = self.get_channel_config(channel_name)
        playlist_indicators: List[str] = []
        if channel_config:
            parsing_rules = channel_config.get("parsing_rules") or {}
            playlist_indicators = parsing_rules.get("playlist_indicators", [])

        if not playlist_indicators:
            return self._is_playlist_by_global_rules(video_title)
        return self._contains_indicator(video_title, playlist_indicators)

    def _is_playlist_by_global_rules(self, video_title: str) -> bool:
        """Check if title is a playlist using the global ``"playlist_indicators"``."""
        global_settings = self.channels_config.get("global_parsing_settings", {})
        return self._contains_indicator(
            video_title, global_settings.get("playlist_indicators", [])
        )

    def get_all_channel_names(self) -> List[str]:
        """Get a list of all configured channel names (entries without a name are skipped)."""
        return [
            channel["name"]
            for channel in self.channels_config.get("channels", [])
            if "name" in channel
        ]

    def get_channel_url(self, channel_name: str) -> Optional[str]:
        """Get the URL for a specific channel, or ``None`` if the channel is unknown."""
        channel_config = self.get_channel_config(channel_name)
        return channel_config.get("url") if channel_config else None

    @staticmethod
    def _split_on_last(text: str, patterns: Tuple[str, ...]) -> Optional[Tuple[str, str]]:
        """Split ``text`` on the last occurrence of the first pattern it contains.

        Returns:
            The stripped (before, after) pair, or ``None`` if no pattern occurs.
        """
        for pattern in patterns:
            index = text.rfind(pattern)
            if index != -1:
                return text[:index].strip(), text[index + len(pattern):].strip()
        return None

    @staticmethod
    def _strip_first_suffix(text: str, suffixes: List[str]) -> str:
        """Remove the first matching suffix from ``text`` and strip the result.

        Empty suffixes are ignored: every string ends with ``""`` and
        ``text[:-0]`` would wipe the whole text.
        """
        for suffix in suffixes:
            if suffix and text.endswith(suffix):
                return text[:-len(suffix)].strip()
        return text

    @staticmethod
    def _contains_indicator(video_title: str, indicators: List[str]) -> bool:
        """Return True if any non-empty indicator occurs in the title, ignoring case."""
        title_upper = video_title.upper()
        # Skip empty indicators: "" is a substring of every title.
        return any(
            indicator and indicator.upper() in title_upper for indicator in indicators
        )
|
||
|
||
|
||
# Convenience function for backward compatibility
|
||
def extract_artist_title(video_title: str, channel_name: str, channels_file: Optional[str] = None) -> Tuple[str, str]:
    """
    Convenience function to extract artist and title from a video title.

    A new ``ChannelParser`` is built on every call, so the configuration file
    is re-read each time.

    Args:
        video_title: The full video title from YouTube
        channel_name: The name of the channel
        channels_file: Path to the channels configuration file. When ``None``
            the path is obtained from the data path manager.

    Returns:
        Tuple of (artist, title)
    """
    if channels_file is None:
        channels_file = str(get_data_path_manager().get_channels_json_path())
    parser = ChannelParser(channels_file)
    return parser.extract_artist_title(video_title, channel_name)