Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>
This commit is contained in:
parent
c864af7794
commit
24a6a37efd
@ -7966,5 +7966,69 @@
|
|||||||
"channel": "@KaraokeOnVEVO",
|
"channel": "@KaraokeOnVEVO",
|
||||||
"marked_at": "2025-07-28T08:08:58.214983",
|
"marked_at": "2025-07-28T08:08:58.214983",
|
||||||
"reason": "already_on_server"
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"zara larsson_midnight sun": {
|
||||||
|
"artist": "Zara Larsson",
|
||||||
|
"title": "Midnight Sun",
|
||||||
|
"video_title": "Zara Larsson Midnight Sun",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:12.805038",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"imagine dragons, j i d_enemy": {
|
||||||
|
"artist": "Imagine Dragons, J I D",
|
||||||
|
"title": "Enemy",
|
||||||
|
"video_title": "Imagine Dragons, J I D Enemy",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:12.822951",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"jonas blue, why don't we_don't wake me up": {
|
||||||
|
"artist": "Jonas Blue, Why Don't We",
|
||||||
|
"title": "Don't Wake Me Up",
|
||||||
|
"video_title": "Jonas Blue, Why Don't We Don't Wake Me Up",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:12.844018",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"rex orange county_pluto projector": {
|
||||||
|
"artist": "Rex Orange County",
|
||||||
|
"title": "Pluto Projector",
|
||||||
|
"video_title": "Rex Orange County Pluto Projector",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:12.858730",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"charlie puth_light switch": {
|
||||||
|
"artist": "Charlie Puth",
|
||||||
|
"title": "Light Switch",
|
||||||
|
"video_title": "Charlie Puth Light Switch",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:12.878327",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"the rolling stones_(i can't get no) satisfaction": {
|
||||||
|
"artist": "The Rolling Stones",
|
||||||
|
"title": "(I Can't Get No) Satisfaction",
|
||||||
|
"video_title": "(I Can't Get No) Satisfaction - The Rolling Stones KARAOKE Without Backing Vocals",
|
||||||
|
"channel": "@VocalStarKaraoke",
|
||||||
|
"marked_at": "2025-07-28T09:18:13.023146",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"lauren spencer smith_fingers crossed": {
|
||||||
|
"artist": "Lauren Spencer Smith",
|
||||||
|
"title": "Fingers Crossed",
|
||||||
|
"video_title": "Lauren Spencer Smith Fingers Crossed",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:20:07.067847",
|
||||||
|
"reason": "already_on_server"
|
||||||
|
},
|
||||||
|
"tems_crazy tings": {
|
||||||
|
"artist": "Tems",
|
||||||
|
"title": "Crazy Tings",
|
||||||
|
"video_title": "Tems Crazy Tings",
|
||||||
|
"channel": "@sing2karaoke",
|
||||||
|
"marked_at": "2025-07-28T09:20:07.089571",
|
||||||
|
"reason": "already_on_server"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
254
karaoke_downloader/channel_parser.py
Normal file
254
karaoke_downloader/channel_parser.py
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
"""
|
||||||
|
Channel-specific parsing utilities for extracting artist and title from video titles.
|
||||||
|
|
||||||
|
This module handles the different title formats used by various karaoke channels,
|
||||||
|
providing channel-specific parsing rules to extract artist and title information
|
||||||
|
correctly for ID3 tagging and filename generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class ChannelParser:
    """Handles channel-specific parsing of video titles to extract artist and title.

    The parser is driven by a JSON configuration with two top-level keys that
    this class reads: ``"channels"`` (a list of per-channel dicts holding
    ``"name"``, ``"url"`` and ``"parsing_rules"``) and
    ``"global_parsing_settings"`` (fallback rules used when a channel is not
    configured or its format is unknown).
    """

    # Runs of spaces that separate artist from title in "Artist   Title"
    # style names, in priority order. They are spelled as multiplications so
    # the widths cannot be silently collapsed by whitespace normalisation.
    # A run of four or more spaces always contains a three-space run, so no
    # separate wider pattern is needed.
    _SPACE_RUN_PATTERNS = (" " * 3, " " * 2)

    # Dash separators tried when no space run is present: hyphen, en dash,
    # and a hyphen that is missing its trailing space.
    _DASH_PATTERNS = (" - ", " \u2013 ", " -")

    def __init__(self, channels_file: str = "data/channels.json"):
        """Initialize the parser and load the channel configuration.

        Args:
            channels_file: Path to the JSON channel configuration file.

        Raises:
            FileNotFoundError: If ``channels_file`` does not exist.
        """
        self.channels_file = Path(channels_file)
        self.channels_config = self._load_channels_config()

    def _load_channels_config(self) -> Dict[str, Any]:
        """Load the channels configuration from the JSON file.

        Returns:
            The decoded JSON document.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
        """
        if not self.channels_file.exists():
            raise FileNotFoundError(f"Channels configuration file not found: {self.channels_file}")

        with open(self.channels_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_channel_config(self, channel_name: str) -> Optional[Dict[str, Any]]:
        """Get the configuration for a specific channel.

        Args:
            channel_name: Name to look up; compared for equality against each
                configured channel's ``"name"`` value.

        Returns:
            The first matching channel dict, or None when nothing matches.
        """
        for channel in self.channels_config.get("channels", []):
            # .get() so one entry without a "name" cannot break the lookup
            # of every other channel.
            if channel.get("name") == channel_name:
                return channel
        return None

    def extract_artist_title(self, video_title: str, channel_name: str) -> Tuple[str, str]:
        """
        Extract artist and title from a video title using channel-specific parsing rules.

        Args:
            video_title: The full video title from YouTube
            channel_name: The name of the channel (must match config)

        Returns:
            Tuple of (artist, title). The artist is an empty string when the
            title could not be split; the title then holds the stripped input.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            # Unknown channel: fall back to the global settings.
            return self._fallback_parse(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        format_type = parsing_rules.get("format", "artist_title_separator")

        if format_type == "artist_title_separator":
            return self._parse_artist_title_separator(video_title, parsing_rules)
        if format_type == "artist_title_spaces":
            return self._parse_artist_title_spaces(video_title, parsing_rules)
        if format_type == "title_artist_pipe":
            return self._parse_title_artist_pipe(video_title, parsing_rules)
        return self._fallback_parse(video_title)

    @staticmethod
    def _split_on_last(text: str, patterns) -> Optional[Tuple[str, str]]:
        """Split ``text`` at the last occurrence of the first pattern it contains.

        Patterns are tried in order; the first one present in ``text`` wins.

        Returns:
            ``(before, after)`` with both halves stripped, or None when no
            pattern occurs in ``text``.
        """
        for pattern in patterns:
            index = text.rfind(pattern)
            if index != -1:
                return text[:index].strip(), text[index + len(pattern):].strip()
        return None

    @staticmethod
    def _strip_first_suffix(text: str, suffixes) -> str:
        """Remove the first suffix in ``suffixes`` that ``text`` ends with.

        Empty suffixes are skipped: every string "ends with" the empty string
        and slicing by ``-0`` would otherwise discard the whole text.
        """
        for suffix in suffixes:
            if suffix and text.endswith(suffix):
                return text[:len(text) - len(suffix)].strip()
        return text

    @staticmethod
    def _contains_any_indicator(video_title: str, indicators) -> bool:
        """Return True when any indicator occurs in the title, ignoring case."""
        title_upper = video_title.upper()
        return any(indicator.upper() in title_upper for indicator in indicators)

    def _parse_artist_title_separator(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist - Title' or 'Title - Artist'.

        The separator comes from ``rules["separator"]`` (default ``" - "``) and
        the order from ``rules["artist_first"]`` (default True). Both halves
        are cleaned with the ``"title_cleanup"`` rules.
        """
        separator = rules.get("separator", " - ")
        artist_first = rules.get("artist_first", True)

        # An empty separator cannot split anything (str.partition would raise).
        if not separator or separator not in video_title:
            return "", video_title.strip()

        first, _, second = video_title.partition(separator)
        cleanup_rules = rules.get("title_cleanup", {})
        first_clean = self._cleanup_title(first.strip(), cleanup_rules)
        second_clean = self._cleanup_title(second.strip(), cleanup_rules)

        if artist_first:
            return first_clean, second_clean
        return second_clean, first_clean

    def _parse_artist_title_spaces(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Artist   Title' (artist and title separated by a run of spaces).

        The title is split at the LAST multi-space run so that artist strings
        containing commas or other punctuation stay intact; a multi-artist
        string such as "Artist1, Artist2" is returned whole. When no space run
        is present, dash separators are tried as a fallback for inconsistently
        formatted titles. Only the title half is passed through the
        ``"title_cleanup"`` rules.
        """
        cleanup_rules = rules.get("title_cleanup", {})

        # Space runs take priority over dashes.
        for patterns in (self._SPACE_RUN_PATTERNS, self._DASH_PATTERNS):
            halves = self._split_on_last(video_title, patterns)
            if halves is not None:
                artist, raw_title = halves
                return artist, self._cleanup_title(raw_title, cleanup_rules)

        # If no pattern matches, return empty artist and full title
        return "", video_title.strip()

    def _parse_title_artist_pipe(self, video_title: str, rules: Dict[str, Any]) -> Tuple[str, str]:
        """Parse format: 'Title | Artist'.

        The title half is cleaned with ``"title_cleanup"`` and the artist half
        with ``"artist_cleanup"``.
        """
        separator = rules.get("separator", " | ")

        # An empty separator cannot split anything (str.partition would raise).
        if not separator or separator not in video_title:
            return "", video_title.strip()

        title_part, _, artist_part = video_title.partition(separator)
        title = self._cleanup_title(title_part.strip(), rules.get("title_cleanup", {}))
        artist = self._cleanup_title(artist_part.strip(), rules.get("artist_cleanup", {}))
        return artist, title

    def _cleanup_title(self, text: str, cleanup_rules: Dict[str, Any]) -> str:
        """Apply cleanup rules to remove suffixes and normalize text.

        Args:
            text: The raw artist or title fragment.
            cleanup_rules: Rule dict; only ``"remove_suffix"`` (a dict with a
                ``"suffixes"`` list) is interpreted here.

        Returns:
            The stripped text with at most one configured suffix removed.
        """
        cleaned = text.strip()
        if not cleanup_rules or "remove_suffix" not in cleanup_rules:
            return cleaned

        suffixes = cleanup_rules["remove_suffix"].get("suffixes", [])
        return self._strip_first_suffix(cleaned, suffixes)

    def _fallback_parse(self, video_title: str) -> Tuple[str, str]:
        """Fallback parsing using the ``"global_parsing_settings"`` section.

        Splits on ``"fallback_separator"`` (default ``" - "``) when
        ``"fallback_format"`` is ``"artist_title_separator"`` and removes the
        first matching entry of ``"common_suffixes"`` from the title.
        """
        global_settings = self.channels_config.get("global_parsing_settings", {})
        fallback_format = global_settings.get("fallback_format", "artist_title_separator")
        fallback_separator = global_settings.get("fallback_separator", " - ")

        if (
            fallback_format == "artist_title_separator"
            and fallback_separator
            and fallback_separator in video_title
        ):
            artist, _, title = video_title.partition(fallback_separator)
            title = self._strip_first_suffix(
                title.strip(), global_settings.get("common_suffixes", [])
            )
            return artist.strip(), title

        # If all else fails, return empty artist and full title
        return "", video_title.strip()

    def is_playlist_title(self, video_title: str, channel_name: str) -> bool:
        """Check if a video title appears to be a playlist rather than a single song.

        Uses the channel's own ``"playlist_indicators"`` when the channel is
        configured and the list is non-empty; otherwise the global indicators
        are used. Matching is a case-insensitive substring test.
        """
        channel_config = self.get_channel_config(channel_name)
        if not channel_config:
            return self._is_playlist_by_global_rules(video_title)

        parsing_rules = channel_config.get("parsing_rules", {})
        playlist_indicators = parsing_rules.get("playlist_indicators", [])
        if not playlist_indicators:
            return self._is_playlist_by_global_rules(video_title)

        return self._contains_any_indicator(video_title, playlist_indicators)

    def _is_playlist_by_global_rules(self, video_title: str) -> bool:
        """Check if title is a playlist using the global ``"playlist_indicators"``."""
        global_settings = self.channels_config.get("global_parsing_settings", {})
        playlist_indicators = global_settings.get("playlist_indicators", [])
        return self._contains_any_indicator(video_title, playlist_indicators)

    def get_all_channel_names(self) -> List[str]:
        """Get a list of all configured channel names (entries without a name are skipped)."""
        return [
            channel["name"]
            for channel in self.channels_config.get("channels", [])
            if "name" in channel
        ]

    def get_channel_url(self, channel_name: str) -> Optional[str]:
        """Get the URL for a specific channel, or None when it is unknown or has no URL."""
        channel_config = self.get_channel_config(channel_name)
        return channel_config.get("url") if channel_config else None
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience function for backward compatibility
|
||||||
|
def extract_artist_title(video_title: str, channel_name: str, channels_file: str = "data/channels.json") -> Tuple[str, str]:
    """
    Module-level shortcut kept for backward compatibility.

    Builds a ChannelParser from ``channels_file`` and delegates to its
    ``extract_artist_title`` method.

    Args:
        video_title: The full video title from YouTube
        channel_name: The name of the channel
        channels_file: Path to the channels configuration file

    Returns:
        Tuple of (artist, title)
    """
    return ChannelParser(channels_file).extract_artist_title(video_title, channel_name)
|
||||||
@ -1,17 +1,95 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Karaoke Video Downloader CLI
|
||||||
|
Command-line interface for the karaoke video downloader.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import json
|
from typing import List
|
||||||
|
|
||||||
|
from karaoke_downloader.channel_parser import ChannelParser
|
||||||
|
from karaoke_downloader.config_manager import AppConfig
|
||||||
from karaoke_downloader.downloader import KaraokeDownloader
|
from karaoke_downloader.downloader import KaraokeDownloader
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
|
DEFAULT_LATEST_PER_CHANNEL_LIMIT = 10
|
||||||
DEFAULT_FUZZY_THRESHOLD = 85
|
DEFAULT_FUZZY_THRESHOLD = 85
|
||||||
DEFAULT_LATEST_PER_CHANNEL_LIMIT = 5
|
|
||||||
DEFAULT_DISPLAY_LIMIT = 10
|
|
||||||
DEFAULT_CACHE_DURATION_HOURS = 24
|
def load_channels_from_json(channels_file: str = "data/channels.json") -> List[str]:
    """
    Read the channel URLs out of a JSON channel configuration.

    Any failure (unreadable file, malformed entry) is reported on stdout and
    results in an empty list rather than an exception.

    Args:
        channels_file: Path to the channels.json file

    Returns:
        List of channel URLs, in configuration order
    """
    urls: List[str] = []
    try:
        config = ChannelParser(channels_file).channels_config
        for entry in config.get("channels", []):
            urls.append(entry["url"])
    except Exception as e:
        print(f"❌ Error loading channels from {channels_file}: {e}")
        return []
    return urls
|
||||||
|
|
||||||
|
|
||||||
|
def load_channels_from_text(channels_file: str = "data/channels.txt") -> List[str]:
    """
    Read channel URLs from the legacy one-URL-per-line text format.

    Blank lines and lines whose first non-space character is ``#`` are
    ignored. Any failure is reported on stdout and yields an empty list.

    Args:
        channels_file: Path to the channels.txt file

    Returns:
        List of channel URLs, in file order
    """
    try:
        with open(channels_file, "r", encoding="utf-8") as handle:
            stripped_lines = (raw.strip() for raw in handle)
            # Consumed inside the ``with`` block, while the file is still open.
            return [
                entry
                for entry in stripped_lines
                if entry and not entry.startswith("#")
            ]
    except Exception as e:
        print(f"❌ Error loading channels from {channels_file}: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def load_channels(channel_file: str = None) -> List[str]:
    """
    Resolve the list of channel URLs from a JSON or text channel file.

    With an explicit ``channel_file`` the loader is picked by extension
    (``.json`` -> JSON loader, anything else -> text loader). Without one,
    ``data/channels.json`` is preferred and ``data/channels.txt`` is the
    legacy fallback.

    Args:
        channel_file: Path to the channel file (optional)

    Returns:
        List of channel URLs; empty when no channel file could be found
    """
    if channel_file:
        # Caller named a file: choose the loader from its extension.
        loader = (
            load_channels_from_json
            if channel_file.endswith('.json')
            else load_channels_from_text
        )
        return loader(channel_file)

    # No file given: try JSON first, then fall back to text.
    json_file = "data/channels.json"
    txt_file = "data/channels.txt"

    if os.path.exists(json_file):
        print(f"📋 Using new JSON format: {json_file}")
        return load_channels_from_json(json_file)

    if os.path.exists(txt_file):
        print(f"📋 Using legacy text format: {txt_file}")
        return load_channels_from_text(txt_file)

    print("❌ No channel file found. Please create data/channels.json or data/channels.txt")
    return []
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@ -282,17 +360,16 @@ Examples:
|
|||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
# --- END NEW ---
|
# --- END NEW ---
|
||||||
|
|
||||||
# --- NEW: If no URL or file is provided, but --songlist-only is set, use all channels in data/channels.txt ---
|
# --- NEW: If no URL or file is provided, but --songlist-only is set, use all channels ---
|
||||||
if (args.songlist_only or args.songlist_focus) and not args.url and not args.file:
|
if (args.songlist_only or args.songlist_focus) and not args.url and not args.file:
|
||||||
channels_file = Path("data/channels.txt")
|
channel_urls = load_channels()
|
||||||
if channels_file.exists():
|
if channel_urls:
|
||||||
args.file = str(channels_file)
|
|
||||||
print(
|
print(
|
||||||
"📋 No URL or --file provided, defaulting to all channels in data/channels.txt for songlist mode."
|
"📋 No URL or --file provided, defaulting to all configured channels for songlist mode."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
"❌ No URL, --file, or data/channels.txt found. Please provide a channel URL or a file with channel URLs."
|
"❌ No URL, --file, or channel configuration found. Please provide a channel URL or create data/channels.json."
|
||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
# --- END NEW ---
|
# --- END NEW ---
|
||||||
@ -388,17 +465,11 @@ Examples:
|
|||||||
print(f" ... and {len(tracking) - 10} more")
|
print(f" ... and {len(tracking) - 10} more")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
elif args.songlist_only or args.songlist_focus:
|
elif args.songlist_only or args.songlist_focus:
|
||||||
# Use provided file or default to data/channels.txt
|
# Use provided file or default to channels configuration
|
||||||
channel_file = args.file if args.file else "data/channels.txt"
|
channel_urls = load_channels(args.file)
|
||||||
if not os.path.exists(channel_file):
|
if not channel_urls:
|
||||||
print(f"❌ Channel file not found: {channel_file}")
|
print(f"❌ No channels found in configuration")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
with open(channel_file, "r", encoding="utf-8") as f:
|
|
||||||
channel_urls = [
|
|
||||||
line.strip()
|
|
||||||
for line in f
|
|
||||||
if line.strip() and not line.strip().startswith("#")
|
|
||||||
]
|
|
||||||
limit = args.limit if args.limit else None
|
limit = args.limit if args.limit else None
|
||||||
success = downloader.download_songlist_across_channels(
|
success = downloader.download_songlist_across_channels(
|
||||||
channel_urls,
|
channel_urls,
|
||||||
@ -412,17 +483,11 @@ Examples:
|
|||||||
max_channel_workers=args.channel_workers,
|
max_channel_workers=args.channel_workers,
|
||||||
)
|
)
|
||||||
elif args.latest_per_channel:
|
elif args.latest_per_channel:
|
||||||
# Use provided file or default to data/channels.txt
|
# Use provided file or default to channels configuration
|
||||||
channel_file = args.file if args.file else "data/channels.txt"
|
channel_urls = load_channels(args.file)
|
||||||
if not os.path.exists(channel_file):
|
if not channel_urls:
|
||||||
print(f"❌ Channel file not found: {channel_file}")
|
print(f"❌ No channels found in configuration")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
with open(channel_file, "r", encoding="utf-8") as f:
|
|
||||||
channel_urls = [
|
|
||||||
line.strip()
|
|
||||||
for line in f
|
|
||||||
if line.strip() and not line.strip().startswith("#")
|
|
||||||
]
|
|
||||||
limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT
|
limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT
|
||||||
force_refresh_download_plan = (
|
force_refresh_download_plan = (
|
||||||
args.force_download_plan if hasattr(args, "force_download_plan") else False
|
args.force_download_plan if hasattr(args, "force_download_plan") else False
|
||||||
@ -448,17 +513,11 @@ Examples:
|
|||||||
else:
|
else:
|
||||||
# Default behavior: download from channels (equivalent to --latest-per-channel)
|
# Default behavior: download from channels (equivalent to --latest-per-channel)
|
||||||
print("🎯 No specific mode specified, defaulting to download from channels")
|
print("🎯 No specific mode specified, defaulting to download from channels")
|
||||||
channel_file = args.file if args.file else "data/channels.txt"
|
channel_urls = load_channels(args.file)
|
||||||
if not os.path.exists(channel_file):
|
if not channel_urls:
|
||||||
print(f"❌ Channel file not found: {channel_file}")
|
print(f"❌ No channels found in configuration")
|
||||||
print("Please provide a channel URL or ensure data/channels.txt exists")
|
print("Please provide a channel URL or create data/channels.json")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
with open(channel_file, "r", encoding="utf-8") as f:
|
|
||||||
channel_urls = [
|
|
||||||
line.strip()
|
|
||||||
for line in f
|
|
||||||
if line.strip() and not line.strip().startswith("#")
|
|
||||||
]
|
|
||||||
limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT
|
limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT
|
||||||
force_refresh_download_plan = (
|
force_refresh_download_plan = (
|
||||||
args.force_download_plan if hasattr(args, "force_download_plan") else False
|
args.force_download_plan if hasattr(args, "force_download_plan") else False
|
||||||
|
|||||||
@ -297,9 +297,10 @@ class DownloadPipeline:
|
|||||||
video_title = video.get("title", "")
|
video_title = video.get("title", "")
|
||||||
|
|
||||||
# Extract artist and title from video title
|
# Extract artist and title from video title
|
||||||
from karaoke_downloader.id3_utils import extract_artist_title
|
from karaoke_downloader.channel_parser import ChannelParser
|
||||||
|
|
||||||
artist, title = extract_artist_title(video_title)
|
channel_parser = ChannelParser()
|
||||||
|
artist, title = channel_parser.extract_artist_title(video_title, channel_name)
|
||||||
|
|
||||||
print(f" ({i}/{total}) Processing: {artist} - {title}")
|
print(f" ({i}/{total}) Processing: {artist} - {title}")
|
||||||
|
|
||||||
|
|||||||
@ -17,17 +17,16 @@ from karaoke_downloader.cache_manager import (
|
|||||||
load_cached_plan,
|
load_cached_plan,
|
||||||
save_plan_cache,
|
save_plan_cache,
|
||||||
)
|
)
|
||||||
# Import all fuzzy matching functions including the enhanced extract_artist_title
|
# Import all fuzzy matching functions
|
||||||
# This ensures consistent parsing across all modules and supports multiple video title formats
|
|
||||||
from karaoke_downloader.fuzzy_matcher import (
|
from karaoke_downloader.fuzzy_matcher import (
|
||||||
create_song_key,
|
create_song_key,
|
||||||
create_video_key,
|
create_video_key,
|
||||||
extract_artist_title,
|
|
||||||
get_similarity_function,
|
get_similarity_function,
|
||||||
is_exact_match,
|
is_exact_match,
|
||||||
is_fuzzy_match,
|
is_fuzzy_match,
|
||||||
normalize_title,
|
normalize_title,
|
||||||
)
|
)
|
||||||
|
from karaoke_downloader.channel_parser import ChannelParser
|
||||||
from karaoke_downloader.youtube_utils import get_channel_info
|
from karaoke_downloader.youtube_utils import get_channel_info
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
@ -127,10 +126,11 @@ def _scan_channel_for_matches(
|
|||||||
video_matches = []
|
video_matches = []
|
||||||
|
|
||||||
# Pre-process video titles for efficient matching
|
# Pre-process video titles for efficient matching
|
||||||
|
channel_parser = ChannelParser()
|
||||||
if fuzzy_match:
|
if fuzzy_match:
|
||||||
# For fuzzy matching, create normalized video keys
|
# For fuzzy matching, create normalized video keys
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
# Find best match among remaining songs
|
# Find best match among remaining songs
|
||||||
@ -162,7 +162,7 @@ def _scan_channel_for_matches(
|
|||||||
else:
|
else:
|
||||||
# For exact matching, use direct key comparison
|
# For exact matching, use direct key comparison
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
if video_key in song_keys:
|
if video_key in song_keys:
|
||||||
@ -241,10 +241,11 @@ def build_download_plan(
|
|||||||
video_matches = []
|
video_matches = []
|
||||||
|
|
||||||
# Pre-process video titles for efficient matching
|
# Pre-process video titles for efficient matching
|
||||||
|
channel_parser = ChannelParser()
|
||||||
if fuzzy_match:
|
if fuzzy_match:
|
||||||
# For fuzzy matching, create normalized video keys
|
# For fuzzy matching, create normalized video keys
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
# Find best match among remaining songs (thread-safe)
|
# Find best match among remaining songs (thread-safe)
|
||||||
@ -283,7 +284,7 @@ def build_download_plan(
|
|||||||
else:
|
else:
|
||||||
# For exact matching, use direct key comparison
|
# For exact matching, use direct key comparison
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
with song_lookup_lock:
|
with song_lookup_lock:
|
||||||
@ -345,10 +346,11 @@ def build_download_plan(
|
|||||||
video_matches = [] # Initialize video_matches for this channel
|
video_matches = [] # Initialize video_matches for this channel
|
||||||
|
|
||||||
# Pre-process video titles for efficient matching
|
# Pre-process video titles for efficient matching
|
||||||
|
channel_parser = ChannelParser()
|
||||||
if fuzzy_match:
|
if fuzzy_match:
|
||||||
# For fuzzy matching, create normalized video keys
|
# For fuzzy matching, create normalized video keys
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
# Find best match among remaining songs
|
# Find best match among remaining songs
|
||||||
@ -381,7 +383,7 @@ def build_download_plan(
|
|||||||
else:
|
else:
|
||||||
# For exact matching, use direct key comparison
|
# For exact matching, use direct key comparison
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
v_artist, v_title = channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
video_key = create_song_key(v_artist, v_title)
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
if video_key in song_keys:
|
if video_key in song_keys:
|
||||||
|
|||||||
@ -32,7 +32,8 @@ from karaoke_downloader.fuzzy_matcher import (
|
|||||||
is_exact_match,
|
is_exact_match,
|
||||||
is_fuzzy_match,
|
is_fuzzy_match,
|
||||||
)
|
)
|
||||||
from karaoke_downloader.id3_utils import add_id3_tags, extract_artist_title
|
from karaoke_downloader.id3_utils import add_id3_tags
|
||||||
|
from karaoke_downloader.channel_parser import ChannelParser
|
||||||
from karaoke_downloader.server_manager import (
|
from karaoke_downloader.server_manager import (
|
||||||
check_and_mark_server_duplicate,
|
check_and_mark_server_duplicate,
|
||||||
is_song_marked_as_server_duplicate,
|
is_song_marked_as_server_duplicate,
|
||||||
@ -105,6 +106,9 @@ class KaraokeDownloader:
|
|||||||
# Load server songs for availability checking
|
# Load server songs for availability checking
|
||||||
self.server_songs = load_server_songs()
|
self.server_songs = load_server_songs()
|
||||||
|
|
||||||
|
# Initialize channel parser for title parsing
|
||||||
|
self.channel_parser = ChannelParser()
|
||||||
|
|
||||||
# Parallel download settings
|
# Parallel download settings
|
||||||
self.enable_parallel_downloads = False
|
self.enable_parallel_downloads = False
|
||||||
self.parallel_workers = 3
|
self.parallel_workers = 3
|
||||||
@ -220,7 +224,7 @@ class KaraokeDownloader:
|
|||||||
matches = []
|
matches = []
|
||||||
similarity = get_similarity_function()
|
similarity = get_similarity_function()
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
artist, title = extract_artist_title(video["title"])
|
artist, title = self.channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
key = create_song_key(artist, title)
|
key = create_song_key(artist, title)
|
||||||
if fuzzy_match:
|
if fuzzy_match:
|
||||||
# Fuzzy match against all songlist keys
|
# Fuzzy match against all songlist keys
|
||||||
@ -702,7 +706,7 @@ class KaraokeDownloader:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Extract artist and title for tracking
|
# Extract artist and title for tracking
|
||||||
artist, title_clean = extract_artist_title(title)
|
artist, title_clean = self.channel_parser.extract_artist_title(title, channel_name)
|
||||||
|
|
||||||
task = DownloadTask(
|
task = DownloadTask(
|
||||||
video_id=video_id,
|
video_id=video_id,
|
||||||
@ -810,7 +814,7 @@ class KaraokeDownloader:
|
|||||||
)
|
)
|
||||||
filename = f"{channel_name} - {safe_title}.mp4"
|
filename = f"{channel_name} - {safe_title}.mp4"
|
||||||
# Extract artist and title for tracking
|
# Extract artist and title for tracking
|
||||||
artist, title_clean = extract_artist_title(title)
|
artist, title_clean = self.channel_parser.extract_artist_title(title, channel_name)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f" ({v_idx+1}/{len(videos)}) Processing: {artist} - {title_clean}"
|
f" ({v_idx+1}/{len(videos)}) Processing: {artist} - {title_clean}"
|
||||||
@ -942,7 +946,7 @@ class KaraokeDownloader:
|
|||||||
# Pre-filter videos to exclude known duplicates before processing
|
# Pre-filter videos to exclude known duplicates before processing
|
||||||
pre_filtered_videos = []
|
pre_filtered_videos = []
|
||||||
for video in available_videos:
|
for video in available_videos:
|
||||||
artist, title = extract_artist_title(video["title"])
|
artist, title = self.channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
song_key = create_song_key(artist, title)
|
song_key = create_song_key(artist, title)
|
||||||
if song_key not in known_duplicate_keys:
|
if song_key not in known_duplicate_keys:
|
||||||
pre_filtered_videos.append(video)
|
pre_filtered_videos.append(video)
|
||||||
@ -960,7 +964,7 @@ class KaraokeDownloader:
|
|||||||
break # We have enough videos for this channel
|
break # We have enough videos for this channel
|
||||||
|
|
||||||
videos_checked += 1
|
videos_checked += 1
|
||||||
artist, title = extract_artist_title(video["title"])
|
artist, title = self.channel_parser.extract_artist_title(video["title"], channel_name)
|
||||||
|
|
||||||
# Check if should skip this song during planning phase
|
# Check if should skip this song during planning phase
|
||||||
should_skip, reason, filtered_count = self._should_skip_song(
|
should_skip, reason, filtered_count = self._should_skip_song(
|
||||||
|
|||||||
@ -54,12 +54,19 @@ def sanitize_filename(
|
|||||||
)
|
)
|
||||||
safe_artist = safe_artist.strip()
|
safe_artist = safe_artist.strip()
|
||||||
|
|
||||||
# Create filename
|
# Create filename - handle empty artist case
|
||||||
filename = f"{safe_artist} - {safe_title}.mp4"
|
if not safe_artist or safe_artist.strip() == "":
|
||||||
|
# If no artist, just use the title
|
||||||
|
filename = f"{safe_title}.mp4"
|
||||||
|
else:
|
||||||
|
filename = f"{safe_artist} - {safe_title}.mp4"
|
||||||
|
|
||||||
# Limit filename length if needed
|
# Limit filename length if needed
|
||||||
if len(filename) > max_length:
|
if len(filename) > max_length:
|
||||||
filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
if not safe_artist or safe_artist.strip() == "":
|
||||||
|
filename = f"{safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
||||||
|
else:
|
||||||
|
filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
@ -81,11 +88,19 @@ def generate_possible_filenames(
|
|||||||
safe_title = sanitize_title_for_filenames(title)
|
safe_title = sanitize_title_for_filenames(title)
|
||||||
safe_artist = artist.replace("'", "").replace('"', "").strip()
|
safe_artist = artist.replace("'", "").replace('"', "").strip()
|
||||||
|
|
||||||
return [
|
# Handle empty artist case
|
||||||
f"{safe_artist} - {safe_title}.mp4", # Songlist mode
|
if not safe_artist or safe_artist.strip() == "":
|
||||||
f"{channel_name} - {safe_title}.mp4", # Latest-per-channel mode
|
return [
|
||||||
f"{safe_artist} - {safe_title} (Karaoke Version).mp4", # Channel videos mode
|
f"{safe_title}.mp4", # Songlist mode (no artist)
|
||||||
]
|
f"{channel_name} - {safe_title}.mp4", # Latest-per-channel mode
|
||||||
|
f"{safe_title} (Karaoke Version).mp4", # Channel videos mode (no artist)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
return [
|
||||||
|
f"{safe_artist} - {safe_title}.mp4", # Songlist mode
|
||||||
|
f"{channel_name} - {safe_title}.mp4", # Latest-per-channel mode
|
||||||
|
f"{safe_artist} - {safe_title} (Karaoke Version).mp4", # Channel videos mode
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def sanitize_title_for_filenames(title: str) -> str:
|
def sanitize_title_for_filenames(title: str) -> str:
|
||||||
@ -131,7 +146,10 @@ def check_file_exists_with_patterns(
|
|||||||
# Apply length limits if needed
|
# Apply length limits if needed
|
||||||
safe_artist = artist.replace("'", "").replace('"', "").strip()
|
safe_artist = artist.replace("'", "").replace('"', "").strip()
|
||||||
safe_title = sanitize_title_for_filenames(title)
|
safe_title = sanitize_title_for_filenames(title)
|
||||||
filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
if not safe_artist or safe_artist.strip() == "":
|
||||||
|
filename = f"{safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
||||||
|
else:
|
||||||
|
filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
|
||||||
|
|
||||||
# Check for exact filename match
|
# Check for exact filename match
|
||||||
file_path = channel_dir / filename
|
file_path = channel_dir / filename
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user