"""
Channel-specific parsing utilities for extracting artist and title from video titles.

This module handles the different title formats used by various karaoke channels,
providing channel-specific parsing rules to extract artist and title information
correctly for ID3 tagging and filename generation.
"""

import json
import re
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path

from karaoke_downloader.data_path_manager import get_data_path_manager


class ChannelParser:
    """Handles channel-specific parsing of video titles to extract artist and title."""

    # Runs of spaces used as the artist/title delimiter by the
    # "artist_title_spaces" format: 3, 2 and 4 spaces, tried in that order.
    # (Any title containing 4 spaces also contains 3, so the last entry is
    # subsumed by the first; it is kept to document the accepted widths.)
    _SPACE_PATTERNS = ("   ", "  ", "    ")

    # Dash delimiters tried when no run of spaces is found:
    # regular dash, en dash, dash without a trailing space.
    _DASH_PATTERNS = (" - ", " – ", " -")

    def __init__(self, channels_file: Optional[str] = None):
        """Initialize the parser with channel configuration.

        Args:
            channels_file: Path to the channels JSON file. When None, the path
                is obtained from get_data_path_manager().get_channels_json_path().

        Raises:
            FileNotFoundError: If the configuration file does not exist.
        """
        if channels_file is None:
            channels_file = str(get_data_path_manager().get_channels_json_path())
        self.channels_file = Path(channels_file)
        self.channels_config = self._load_channels_config()

    def _load_channels_config(self) -> Dict[str, Any]:
        """Load the channels configuration from JSON file.

        Returns:
            The parsed JSON content of self.channels_file.

        Raises:
            FileNotFoundError: If self.channels_file does not exist.
        """
        if not self.channels_file.exists():
            raise FileNotFoundError(f"Channels configuration file not found: {self.channels_file}")

        with open(self.channels_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_channel_config(self, channel_name: str) -> Optional[Dict[str, Any]]:
        """Get the configuration for a specific channel.

        Args:
            channel_name: Name to look up; compared for exact equality with
                each entry's "name" value.

        Returns:
            The first matching entry of the "channels" list, or None.
        """
        for channel in self.channels_config.get("channels", []):
            if channel["name"] == channel_name:
                return channel
        return None

    def extract_artist_title(self, video_title: str, channel_name: str) -> Tuple[str, str]:
        """
        Extract artist and title from a video title using channel-specific parsing rules.

        Args:
            video_title: The full video title from YouTube
            channel_name: The name of the channel (must match config)

        Returns:
            Tuple of (artist, title) - both may be empty strings if parsing fails
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            # Unknown channel: fall back to the global settings.
            return self._fallback_parse(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        format_type = parsing_rules.get("format", "artist_title_separator")

        if format_type == "artist_title_separator":
            return self._parse_artist_title_separator(video_title, parsing_rules)
        if format_type == "artist_title_spaces":
            return self._parse_artist_title_spaces(video_title, parsing_rules)
        if format_type == "title_artist_pipe":
            return self._parse_title_artist_pipe(video_title, parsing_rules)
        # Unrecognised format name: fall back to the global settings.
        return self._fallback_parse(video_title)

    @staticmethod
    def _split_on_last(text: str, patterns) -> Optional[Tuple[str, str]]:
        """Split text at the last occurrence of the first pattern it contains.

        Patterns are tried in order; the first one found anywhere in text wins
        and the split happens at its LAST occurrence.

        Returns:
            (left, right), both stripped, or None when no pattern occurs.
        """
        for pattern in patterns:
            index = text.rfind(pattern)
            if index != -1:
                return text[:index].strip(), text[index + len(pattern):].strip()
        return None

    @staticmethod
    def _strip_first_suffix(text: str, suffixes) -> str:
        """Remove the first suffix in suffixes that text ends with.

        Only one suffix is removed. Empty suffixes are skipped: every string
        "ends with" the empty string and text[:-0] is empty, so honouring one
        would erase the whole text.
        """
        for suffix in suffixes:
            if suffix and text.endswith(suffix):
                return text[:-len(suffix)].strip()
        return text

    @staticmethod
    def _contains_any_indicator(video_title: str, indicators) -> bool:
        """Return True if any non-empty indicator occurs in the title, ignoring case."""
        title_upper = video_title.upper()
        # Empty indicators are ignored; "" is a substring of every title.
        return any(
            indicator and indicator.upper() in title_upper
            for indicator in indicators
        )

    def _parse_artist_title_separator(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist - Title' or 'Title - Artist'.

        Reads "separator" (default " - "), "artist_first" (default True) and
        "title_cleanup" from rules. The title is split at the FIRST occurrence
        of the separator, and the "title_cleanup" rules are applied to both
        halves.

        Returns:
            (artist, title); ("", stripped title) when the separator is empty
            or does not occur in the title.
        """
        separator = rules.get("separator", " - ")
        artist_first = rules.get("artist_first", True)

        # An empty/None separator cannot be split on; treat it as "not found".
        if not separator or separator not in video_title:
            return "", video_title.strip()

        first, _, second = video_title.partition(separator)

        # _cleanup_title strips surrounding whitespace itself.
        cleanup_rules = rules.get("title_cleanup", {})
        first_clean = self._cleanup_title(first, cleanup_rules)
        second_clean = self._cleanup_title(second, cleanup_rules)

        if artist_first:
            return first_clean, second_clean
        return second_clean, first_clean

    def _parse_artist_title_spaces(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist   Title' (multiple spaces).

        The title is split at the last occurrence of a run of spaces
        (_SPACE_PATTERNS); if there is none, dash delimiters (_DASH_PATTERNS)
        are tried as a fallback for inconsistently formatted titles. Splitting
        at the LAST occurrence keeps artist strings such as "Artist1, Artist2"
        in one piece. Only "title_cleanup" is read from rules, and it is
        applied to the title half only.

        Returns:
            (artist, title); ("", stripped title) when no delimiter is found.
        """
        halves = self._split_on_last(video_title, self._SPACE_PATTERNS)
        if halves is None:
            halves = self._split_on_last(video_title, self._DASH_PATTERNS)
        if halves is None:
            # If no pattern matches, return empty artist and full title.
            return "", video_title.strip()

        artist, title_part = halves
        title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
        return artist, title

    def _parse_title_artist_pipe(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Title | Artist'.

        Reads "separator" (default " | "), "title_cleanup" and "artist_cleanup"
        from rules. The title is split at the FIRST occurrence of the separator.

        Returns:
            (artist, title); ("", stripped title) when the separator is empty
            or does not occur in the title.
        """
        separator = rules.get("separator", " | ")

        # An empty/None separator cannot be split on; treat it as "not found".
        if not separator or separator not in video_title:
            return "", video_title.strip()

        title_part, _, artist_part = video_title.partition(separator)

        title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
        artist = self._cleanup_title(artist_part, rules.get("artist_cleanup", {}))
        return artist, title

    def _cleanup_title(self, text: str, cleanup_rules: Dict[str, Any]) -> str:
        """Apply cleanup rules to remove suffixes and normalize text.

        The text is always stripped of surrounding whitespace. If the rules
        contain "remove_suffix", the first entry of its "suffixes" list that
        the text ends with is removed (at most one suffix per call).
        """
        cleaned = text.strip()
        if not cleanup_rules:
            return cleaned

        remove_suffix = cleanup_rules.get("remove_suffix")
        if remove_suffix:
            cleaned = self._strip_first_suffix(cleaned, remove_suffix.get("suffixes", []))
        return cleaned

    def _fallback_parse(self, video_title: str) -> Tuple[str, str]:
        """Fallback parsing using global settings.

        Reads "fallback_format" (default "artist_title_separator"),
        "fallback_separator" (default " - ") and "common_suffixes" from the
        "global_parsing_settings" section of the configuration. Only the
        "artist_title_separator" format is handled.

        Returns:
            (artist, title); ("", stripped title) when the title cannot be split.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        fallback_format = global_settings.get("fallback_format", "artist_title_separator")
        fallback_separator = global_settings.get("fallback_separator", " - ")

        if (
            fallback_format == "artist_title_separator"
            and fallback_separator
            and fallback_separator in video_title
        ):
            artist, _, title = video_title.partition(fallback_separator)
            # Apply global suffix cleanup to the title half only.
            title = self._strip_first_suffix(
                title.strip(), global_settings.get("common_suffixes", [])
            )
            return artist.strip(), title

        # If all else fails, return empty artist and full title.
        return "", video_title.strip()

    def is_playlist_title(self, video_title: str, channel_name: str) -> bool:
        """Check if a video title appears to be a playlist rather than a single song.

        Uses the channel's own "playlist_indicators" when the channel is
        configured and the list is non-empty; otherwise the global indicators.
        Matching is a case-insensitive substring test.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            return self._is_playlist_by_global_rules(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        playlist_indicators = parsing_rules.get("playlist_indicators", [])
        if not playlist_indicators:
            return self._is_playlist_by_global_rules(video_title)

        return self._contains_any_indicator(video_title, playlist_indicators)

    def _is_playlist_by_global_rules(self, video_title: str) -> bool:
        """Check if title is a playlist using global rules.

        Reads "playlist_indicators" from "global_parsing_settings"; returns
        False when none are configured.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        playlist_indicators = global_settings.get("playlist_indicators", [])
        return self._contains_any_indicator(video_title, playlist_indicators)

    def get_all_channel_names(self) -> List[str]:
        """Get a list of all configured channel names."""
        return [channel["name"] for channel in self.channels_config.get("channels", [])]

    def get_channel_url(self, channel_name: str) -> Optional[str]:
        """Get the URL for a specific channel, or None if the channel or its "url" is missing."""
        channel_config = self.get_channel_config(channel_name)
        return channel_config.get("url") if channel_config else None


# Convenience function for backward compatibility
def extract_artist_title(video_title: str, channel_name: str, channels_file: Optional[str] = None) -> Tuple[str, str]:
    """
    Convenience function to extract artist and title from a video title.

    A new ChannelParser is created, and the configuration file re-read, on
    every call; build one ChannelParser and reuse it when parsing many titles.

    Args:
        video_title: The full video title from YouTube
        channel_name: The name of the channel
        channels_file: Path to the channels configuration file; when None,
            ChannelParser resolves the default path.

    Returns:
        Tuple of (artist, title)
    """
    # ChannelParser.__init__ resolves a None channels_file to the default path.
    parser = ChannelParser(channels_file)
    return parser.extract_artist_title(video_title, channel_name)