KaraokeVideoDownloader/karaoke_downloader/channel_parser.py

254 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Channel-specific parsing utilities for extracting artist and title from video titles.
This module handles the different title formats used by various karaoke channels,
providing channel-specific parsing rules to extract artist and title information
correctly for ID3 tagging and filename generation.
"""
import json
import re
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
class ChannelParser:
    """Extract artist and title from video titles using per-channel rules.

    The rules are read from a JSON configuration file. The code reads a
    top-level ``"channels"`` list (entries with ``"name"``, optional ``"url"``
    and optional ``"parsing_rules"``) and an optional
    ``"global_parsing_settings"`` mapping used when a channel is unknown or
    its format is not recognised.
    """

    # Runs of spaces tried, in priority order, when artist and title are
    # separated by whitespace only. Built with ``" " * n`` so the widths stay
    # unambiguous even if the file is re-flowed by a tool that collapses
    # whitespace. (The 4-space entry is kept for fidelity with the documented
    # "3, 2, 4 spaces" order; any 4-space run already contains a 3-space run.)
    _SPACE_PATTERNS: Tuple[str, ...] = (" " * 3, " " * 2, " " * 4)

    # Dash separators tried when no multi-space run is present: regular dash,
    # en dash (written as an escape to avoid an ambiguous literal character),
    # and a dash without a trailing space.
    _DASH_PATTERNS: Tuple[str, ...] = (" - ", " \u2013 ", " -")

    def __init__(self, channels_file: str = "data/channels.json"):
        """Initialize the parser with channel configuration.

        Args:
            channels_file: Path to the JSON channel configuration file.

        Raises:
            FileNotFoundError: If ``channels_file`` does not exist.
        """
        self.channels_file = Path(channels_file)
        self.channels_config = self._load_channels_config()

    def _load_channels_config(self) -> Dict[str, Any]:
        """Load the channels configuration from the JSON file.

        Returns:
            The decoded JSON document.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
        """
        if not self.channels_file.exists():
            raise FileNotFoundError(f"Channels configuration file not found: {self.channels_file}")
        with open(self.channels_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_channel_config(self, channel_name: str) -> Optional[Dict[str, Any]]:
        """Return the configuration entry whose ``"name"`` equals ``channel_name``.

        Returns:
            The first matching channel entry, or ``None`` if there is none.
        """
        for channel in self.channels_config.get("channels", []):
            if channel["name"] == channel_name:
                return channel
        return None

    def extract_artist_title(self, video_title: str, channel_name: str) -> Tuple[str, str]:
        """Extract artist and title from a video title using channel rules.

        Args:
            video_title: The full video title from YouTube.
            channel_name: The name of the channel (must match config).

        Returns:
            Tuple of (artist, title). The artist is an empty string when the
            title could not be split.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            # Unknown channel: fall back to the global settings.
            return self._fallback_parse(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        format_type = parsing_rules.get("format", "artist_title_separator")

        if format_type == "artist_title_separator":
            return self._parse_artist_title_separator(video_title, parsing_rules)
        if format_type == "artist_title_spaces":
            return self._parse_artist_title_spaces(video_title, parsing_rules)
        if format_type == "title_artist_pipe":
            return self._parse_title_artist_pipe(video_title, parsing_rules)
        # Unrecognised format name: use the global fallback.
        return self._fallback_parse(video_title)

    def _parse_artist_title_separator(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist - Title' or 'Title - Artist'.

        The title is split on the first occurrence of ``rules["separator"]``
        (default ``" - "``). ``rules["artist_first"]`` (default ``True``)
        selects which side is the artist. The ``"title_cleanup"`` rules are
        applied to both sides.

        Returns:
            Tuple of (artist, title); ``("", stripped title)`` when the
            separator is empty or absent.
        """
        separator = rules.get("separator", " - ")
        artist_first = rules.get("artist_first", True)

        # An empty separator would make the split raise ValueError; treat it
        # the same as "separator not found".
        if not separator:
            return "", video_title.strip()

        left, found, right = video_title.partition(separator)
        if not found:
            return "", video_title.strip()

        cleanup_rules = rules.get("title_cleanup", {})
        left_clean = self._cleanup_title(left, cleanup_rules)
        right_clean = self._cleanup_title(right, cleanup_rules)

        if artist_first:
            return left_clean, right_clean
        return right_clean, left_clean

    @staticmethod
    def _split_on_last(text: str, patterns: Tuple[str, ...]) -> Optional[Tuple[str, str]]:
        """Split ``text`` on the last occurrence of the first matching pattern.

        Patterns are tried in order; the first one present in ``text`` wins.

        Returns:
            ``(before, after)`` with both parts stripped, or ``None`` when no
            pattern occurs in ``text``.
        """
        for pattern in patterns:
            index = text.rfind(pattern)
            if index != -1:
                return text[:index].strip(), text[index + len(pattern):].strip()
        return None

    def _parse_artist_title_spaces(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist   Title' (separated by a run of spaces).

        The split happens at the LAST run of multiple spaces so that artist
        strings containing commas (e.g. ``"Artist1, Artist2"``) are kept
        intact as the artist. If no multi-space run exists, dash separators
        are tried as a fallback for inconsistently formatted titles. Only the
        title part is passed through the ``"title_cleanup"`` rules.

        Returns:
            Tuple of (artist, title); ``("", stripped title)`` when no
            separator pattern is found.
        """
        # Strip first so leading/trailing whitespace is never mistaken for
        # the artist/title separator.
        stripped = video_title.strip()

        for patterns in (self._SPACE_PATTERNS, self._DASH_PATTERNS):
            parts = self._split_on_last(stripped, patterns)
            if parts is not None:
                artist, title_part = parts
                title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
                return artist, title

        # If no pattern matches, return empty artist and full title.
        return "", stripped

    def _parse_title_artist_pipe(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Title | Artist'.

        The title is split on the first occurrence of ``rules["separator"]``
        (default ``" | "``). ``"title_cleanup"`` rules are applied to the
        title and ``"artist_cleanup"`` rules to the artist.

        Returns:
            Tuple of (artist, title); ``("", stripped title)`` when the
            separator is empty or absent.
        """
        separator = rules.get("separator", " | ")

        # Guard against an empty separator, which str.partition rejects.
        if not separator:
            return "", video_title.strip()

        title_part, found, artist_part = video_title.partition(separator)
        if not found:
            return "", video_title.strip()

        title = self._cleanup_title(title_part, rules.get("title_cleanup", {}))
        artist = self._cleanup_title(artist_part, rules.get("artist_cleanup", {}))
        return artist, title

    @staticmethod
    def _strip_first_suffix(text: str, suffixes: List[str]) -> str:
        """Remove the first suffix in ``suffixes`` that ``text`` ends with.

        Empty suffixes are ignored: every string "ends with" ``""`` and a
        naive ``text[:-0]`` slice would erase the whole text.

        Returns:
            ``text`` without the matched suffix (stripped), or ``text``
            unchanged when nothing matches.
        """
        for suffix in suffixes:
            if suffix and text.endswith(suffix):
                return text[:len(text) - len(suffix)].strip()
        return text

    def _cleanup_title(self, text: str, cleanup_rules: Dict[str, Any]) -> str:
        """Apply cleanup rules to remove suffixes and normalize text.

        Only the ``"remove_suffix"`` rule is handled: the first entry of its
        ``"suffixes"`` list that the text ends with is removed.

        Returns:
            The stripped text, with at most one configured suffix removed.
        """
        cleaned = text.strip()
        if not cleanup_rules:
            return cleaned

        if "remove_suffix" in cleanup_rules:
            suffixes = cleanup_rules["remove_suffix"].get("suffixes", [])
            cleaned = self._strip_first_suffix(cleaned, suffixes)
        return cleaned

    def _fallback_parse(self, video_title: str) -> Tuple[str, str]:
        """Parse a title using the ``"global_parsing_settings"`` section.

        Only the ``"artist_title_separator"`` fallback format is implemented:
        the title is split on the first ``"fallback_separator"`` (default
        ``" - "``) and the first matching entry of ``"common_suffixes"`` is
        removed from the title part.

        Returns:
            Tuple of (artist, title); ``("", stripped title)`` when the title
            cannot be split.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        fallback_format = global_settings.get("fallback_format", "artist_title_separator")
        fallback_separator = global_settings.get("fallback_separator", " - ")

        # The separator must be non-empty: str.partition rejects "".
        if fallback_format == "artist_title_separator" and fallback_separator:
            artist_part, found, title_part = video_title.partition(fallback_separator)
            if found:
                title = self._strip_first_suffix(
                    title_part.strip(),
                    global_settings.get("common_suffixes", []),
                )
                return artist_part.strip(), title

        # If all else fails, return empty artist and full title.
        return "", video_title.strip()

    @staticmethod
    def _contains_indicator(video_title: str, indicators: List[str]) -> bool:
        """Return True if any indicator occurs in the title, ignoring case."""
        title_upper = video_title.upper()
        return any(indicator.upper() in title_upper for indicator in indicators)

    def is_playlist_title(self, video_title: str, channel_name: str) -> bool:
        """Check if a video title appears to be a playlist rather than a song.

        Uses the channel's ``"playlist_indicators"`` when the channel is
        configured and the list is non-empty; otherwise the global indicators.
        Matching is a case-insensitive substring test.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            return self._is_playlist_by_global_rules(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        playlist_indicators = parsing_rules.get("playlist_indicators", [])
        if not playlist_indicators:
            return self._is_playlist_by_global_rules(video_title)

        return self._contains_indicator(video_title, playlist_indicators)

    def _is_playlist_by_global_rules(self, video_title: str) -> bool:
        """Check if title is a playlist using the global indicator list."""
        global_settings = self.channels_config.get("global_parsing_settings", {})
        playlist_indicators = global_settings.get("playlist_indicators", [])
        return self._contains_indicator(video_title, playlist_indicators)

    def get_all_channel_names(self) -> List[str]:
        """Get a list of all configured channel names, in config order."""
        return [channel["name"] for channel in self.channels_config.get("channels", [])]

    def get_channel_url(self, channel_name: str) -> Optional[str]:
        """Get the URL for a specific channel.

        Returns:
            The channel's ``"url"`` value, or ``None`` when the channel is not
            configured or has no URL.
        """
        channel_config = self.get_channel_config(channel_name)
        return channel_config.get("url") if channel_config else None
# Convenience function for backward compatibility
def extract_artist_title(video_title: str, channel_name: str, channels_file: str = "data/channels.json") -> Tuple[str, str]:
    """
    Module-level shortcut for ``ChannelParser.extract_artist_title``.

    A new ``ChannelParser`` is built from ``channels_file`` on every call, so
    the configuration file is read each time this function is invoked.

    Args:
        video_title: The full video title from YouTube
        channel_name: The name of the channel
        channels_file: Path to the channels configuration file

    Returns:
        Tuple of (artist, title)
    """
    return ChannelParser(channels_file).extract_artist_title(video_title, channel_name)