# KaraokeVideoDownloader/karaoke_downloader/channel_parser.py

"""
Channel-specific parsing utilities for extracting artist and title from video titles.

This module handles the different title formats used by various karaoke channels,
providing channel-specific parsing rules to extract artist and title information
correctly for ID3 tagging and filename generation.
"""

import json
import re
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path

from karaoke_downloader.data_path_manager import get_data_path_manager


class ChannelParser:
    """Handles channel-specific parsing of video titles to extract artist and title.

    Parsing rules come from a JSON configuration file. The keys this class
    reads are:

    * ``"channels"``: a list of dicts, each with ``"name"``, an optional
      ``"url"`` and an optional ``"parsing_rules"`` dict (``"format"``,
      ``"separator"``, ``"artist_first"``, ``"title_cleanup"``,
      ``"artist_cleanup"``, ``"playlist_indicators"``).
    * ``"global_parsing_settings"``: a dict used when a channel is unknown or
      has no rule of its own (``"fallback_format"``, ``"fallback_separator"``,
      ``"common_suffixes"``, ``"playlist_indicators"``).
    """

    # Separators tried, in this order, by the "artist_title_spaces" format.
    # A four-space entry would be dead code: any title containing four spaces
    # also contains three, and the three-space separator is tried first.
    _SPACE_SEPARATORS = ("   ", "  ")
    # Regular dash, en dash, dash without a trailing space.
    _DASH_SEPARATORS = (" - ", " – ", " -")

    def __init__(self, channels_file: Optional[str] = None):
        """Initialize the parser with channel configuration.

        Args:
            channels_file: Path to the channels JSON file. When ``None`` the
                path is obtained from ``get_data_path_manager()``.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
        """
        if channels_file is None:
            channels_file = str(get_data_path_manager().get_channels_json_path())
        self.channels_file = Path(channels_file)
        self.channels_config = self._load_channels_config()

    def _load_channels_config(self) -> Dict[str, Any]:
        """Load the channels configuration from JSON file.

        Raises:
            FileNotFoundError: If ``self.channels_file`` does not exist.
        """
        if not self.channels_file.exists():
            raise FileNotFoundError(f"Channels configuration file not found: {self.channels_file}")

        with open(self.channels_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_channel_config(self, channel_name: str) -> Optional[Dict[str, Any]]:
        """Get the configuration for a specific channel.

        Returns:
            The first channel entry whose ``"name"`` equals ``channel_name``,
            or ``None`` when there is no such entry.
        """
        for channel in self.channels_config.get("channels", []):
            # Entries without a "name" key are skipped instead of raising
            # KeyError, so one malformed entry cannot break every lookup.
            if "name" in channel and channel["name"] == channel_name:
                return channel
        return None

    def extract_artist_title(self, video_title: str, channel_name: str) -> Tuple[str, str]:
        """
        Extract artist and title from a video title using channel-specific parsing rules.

        Args:
            video_title: The full video title from YouTube
            channel_name: The name of the channel (must match config)

        Returns:
            Tuple of (artist, title). When no separator is found the artist is
            an empty string and the title is the stripped input.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            # Unknown channel: use the global settings.
            return self._fallback_parse(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        format_type = parsing_rules.get("format", "artist_title_separator")

        if format_type == "artist_title_separator":
            return self._parse_artist_title_separator(video_title, parsing_rules)
        if format_type == "artist_title_spaces":
            return self._parse_artist_title_spaces(video_title, parsing_rules)
        if format_type == "title_artist_pipe":
            return self._parse_title_artist_pipe(video_title, parsing_rules)
        # Unrecognized format name: use the global settings.
        return self._fallback_parse(video_title)

    @staticmethod
    def _strip_first_suffix(text: str, suffixes: List[str]) -> str:
        """Remove the first suffix in ``suffixes`` that ``text`` ends with.

        Only one suffix is removed; the result is stripped of surrounding
        whitespace. ``text`` is returned unchanged when nothing matches.
        """
        for suffix in suffixes:
            # Skip empty entries: every string ends with "", and text[:-0]
            # is "", so an empty suffix would erase the whole text.
            if suffix and text.endswith(suffix):
                return text[:-len(suffix)].strip()
        return text

    @staticmethod
    def _split_on_last(text: str, separator: str) -> Tuple[str, str]:
        """Split ``text`` at the last ``separator``; both halves are stripped.

        The caller must have checked that ``separator`` occurs in ``text``.
        """
        index = text.rfind(separator)
        return text[:index].strip(), text[index + len(separator):].strip()

    @staticmethod
    def _contains_indicator(video_title: str, indicators: List[str]) -> bool:
        """Return True if any indicator occurs in the title, ignoring case."""
        title_upper = video_title.upper()
        return any(indicator.upper() in title_upper for indicator in indicators)

    def _parse_artist_title_separator(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist - Title' or 'Title - Artist'.

        The title is split at the FIRST occurrence of ``rules["separator"]``
        (default ``" - "``). ``rules["artist_first"]`` (default True) selects
        which half is the artist. The ``"title_cleanup"`` rules are applied to
        both halves.
        """
        separator = rules.get("separator", " - ")
        artist_first = rules.get("artist_first", True)

        # An empty separator cannot be split on (str.split raises ValueError),
        # so it is treated the same as "separator not found".
        if not separator or separator not in video_title:
            return "", video_title.strip()

        first, second = video_title.split(separator, 1)
        cleanup_rules = rules.get("title_cleanup", {})
        first_clean = self._cleanup_title(first, cleanup_rules)
        second_clean = self._cleanup_title(second, cleanup_rules)

        if artist_first:
            return first_clean, second_clean
        return second_clean, first_clean

    def _parse_artist_title_spaces(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist   Title' (multiple spaces).

        Separators are tried in a fixed order: three spaces, two spaces, then
        the dash variants as a fallback for inconsistently formatted titles.
        The first separator present wins and the title is split at its LAST
        occurrence, so an artist list such as 'Artist1,  Artist2' stays whole
        in the artist part. Only the title part goes through
        ``"title_cleanup"``; the artist part is returned as-is (stripped).

        The ``"separator"`` and ``"multi_artist_separator"`` rule keys are not
        consulted here.
        """
        cleanup_rules = rules.get("title_cleanup", {})

        for pattern in self._SPACE_SEPARATORS + self._DASH_SEPARATORS:
            if pattern in video_title:
                artist, title_part = self._split_on_last(video_title, pattern)
                return artist, self._cleanup_title(title_part, cleanup_rules)

        # If no pattern matches, return empty artist and full title
        return "", video_title.strip()

    def _parse_title_artist_pipe(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Title | Artist'.

        The title is split at the FIRST occurrence of ``rules["separator"]``
        (default ``" | "``). The left half is cleaned with ``"title_cleanup"``
        and the right half with ``"artist_cleanup"``.
        """
        separator = rules.get("separator", " | ")

        # Empty separator is treated as "not found" (str.split would raise).
        if not separator or separator not in video_title:
            return "", video_title.strip()

        title_part, artist_part = video_title.split(separator, 1)

        title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
        artist = self._cleanup_title(artist_part, rules.get("artist_cleanup", {}))

        return artist, title

    def _cleanup_title(self, text: str, cleanup_rules: Dict[str, Any]) -> str:
        """Apply cleanup rules to remove suffixes and normalize text.

        The only rule handled is ``"remove_suffix"``, whose ``"suffixes"``
        list is checked in order; the first matching suffix is removed. The
        text is always stripped of surrounding whitespace.
        """
        cleaned = text.strip()
        if not cleanup_rules:
            return cleaned

        if "remove_suffix" in cleanup_rules:
            suffixes = cleanup_rules["remove_suffix"].get("suffixes", [])
            cleaned = self._strip_first_suffix(cleaned, suffixes)

        return cleaned

    def _fallback_parse(self, video_title: str) -> Tuple[str, str]:
        """Fallback parsing using global settings.

        Only the ``"artist_title_separator"`` fallback format is implemented:
        the title is split at the first ``"fallback_separator"`` (default
        ``" - "``) and the first matching entry of ``"common_suffixes"`` is
        removed from the title half. Anything else yields ``("", title)``.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        fallback_format = global_settings.get("fallback_format", "artist_title_separator")
        fallback_separator = global_settings.get("fallback_separator", " - ")

        if (
            fallback_format == "artist_title_separator"
            and fallback_separator  # empty separator cannot be split on
            and fallback_separator in video_title
        ):
            artist_part, title_part = video_title.split(fallback_separator, 1)
            title = self._strip_first_suffix(
                title_part.strip(), global_settings.get("common_suffixes", [])
            )
            return artist_part.strip(), title

        # If all else fails, return empty artist and full title
        return "", video_title.strip()

    def is_playlist_title(self, video_title: str, channel_name: str) -> bool:
        """Check if a video title appears to be a playlist rather than a single song.

        The channel's own ``"playlist_indicators"`` are used when the channel
        is configured and the list is non-empty; otherwise the global
        indicators are used. Matching is a case-insensitive substring test.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            return self._is_playlist_by_global_rules(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        playlist_indicators = parsing_rules.get("playlist_indicators", [])

        if not playlist_indicators:
            return self._is_playlist_by_global_rules(video_title)

        return self._contains_indicator(video_title, playlist_indicators)

    def _is_playlist_by_global_rules(self, video_title: str) -> bool:
        """Check if title is a playlist using global rules."""
        global_settings = self.channels_config.get("global_parsing_settings", {})
        playlist_indicators = global_settings.get("playlist_indicators", [])
        return self._contains_indicator(video_title, playlist_indicators)

    def get_all_channel_names(self) -> List[str]:
        """Get a list of all configured channel names.

        Entries without a ``"name"`` key are omitted.
        """
        return [
            channel["name"]
            for channel in self.channels_config.get("channels", [])
            if "name" in channel
        ]

    def get_channel_url(self, channel_name: str) -> Optional[str]:
        """Get the URL for a specific channel, or None if unknown or unset."""
        channel_config = self.get_channel_config(channel_name)
        return channel_config.get("url") if channel_config else None


# Convenience function for backward compatibility
def extract_artist_title(video_title: str, channel_name: str, channels_file: Optional[str] = None) -> Tuple[str, str]:
    """
    Convenience function to extract artist and title from a video title.

    A new ChannelParser is constructed on every call, so the channels
    configuration file is read each time.

    Args:
        video_title: The full video title from YouTube
        channel_name: The name of the channel
        channels_file: Path to the channels configuration file. When None,
            the path is obtained from get_data_path_manager().

    Returns:
        Tuple of (artist, title)
    """
    # The docstring must be the first statement of the function; placed after
    # this check it would be a discarded expression and __doc__ would be None.
    if channels_file is None:
        channels_file = str(get_data_path_manager().get_channels_json_path())
    parser = ChannelParser(channels_file)
    return parser.extract_artist_title(video_title, channel_name)