# Listing metadata carried over from the source viewer: 1228 lines, 50 KiB, Python.
import hashlib
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
import json
|
|
import subprocess
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from karaoke_downloader.cache_manager import (
|
|
delete_plan_cache,
|
|
get_download_plan_cache_file,
|
|
load_cached_plan,
|
|
save_plan_cache,
|
|
)
|
|
from karaoke_downloader.channel_manager import (
|
|
download_from_file,
|
|
reset_channel_downloads,
|
|
)
|
|
from karaoke_downloader.config_manager import get_config_manager, load_config
|
|
from karaoke_downloader.data_path_manager import get_data_path_manager
|
|
from karaoke_downloader.download_pipeline import DownloadPipeline
|
|
from karaoke_downloader.download_planner import build_download_plan
|
|
from karaoke_downloader.error_utils import handle_yt_dlp_error, log_error
|
|
from karaoke_downloader.file_utils import ensure_directory_exists, sanitize_filename
|
|
from karaoke_downloader.fuzzy_matcher import (
|
|
create_song_key,
|
|
create_video_key,
|
|
get_similarity_function,
|
|
is_exact_match,
|
|
is_fuzzy_match,
|
|
)
|
|
from karaoke_downloader.id3_utils import add_id3_tags
|
|
from karaoke_downloader.channel_parser import ChannelParser
|
|
from karaoke_downloader.server_manager import (
|
|
check_and_mark_server_duplicate,
|
|
is_song_marked_as_server_duplicate,
|
|
is_song_on_server,
|
|
load_server_duplicates_tracking,
|
|
load_server_songs,
|
|
)
|
|
from karaoke_downloader.song_validator import create_song_validator
|
|
from karaoke_downloader.songlist_manager import (
|
|
is_songlist_song_downloaded,
|
|
load_songlist,
|
|
load_songlist_tracking,
|
|
mark_songlist_song_downloaded,
|
|
normalize_title,
|
|
save_songlist_tracking,
|
|
)
|
|
from karaoke_downloader.tracking_manager import FormatType, SongStatus, TrackingManager
|
|
from karaoke_downloader.video_downloader import (
|
|
download_video_and_track,
|
|
execute_download_plan,
|
|
is_valid_mp4,
|
|
cleanup_cache,
|
|
)
|
|
from karaoke_downloader.parallel_downloader import (
|
|
ParallelDownloader,
|
|
DownloadTask,
|
|
create_parallel_downloader,
|
|
)
|
|
from karaoke_downloader.youtube_utils import get_channel_info, get_playlist_info
|
|
from karaoke_downloader.manual_video_manager import is_manual_channel, get_manual_channel_info, get_manual_videos_for_channel
|
|
|
|
# ---------------------------------------------------------------------------
# Module-level defaults
# ---------------------------------------------------------------------------

# Default for the ``fuzzy_threshold`` parameter of the download methods.
DEFAULT_FUZZY_THRESHOLD: int = 85

# Cache lifetime in days.
DEFAULT_CACHE_EXPIRATION_DAYS: int = 1

# How many unmatched songs are listed before the output is truncated.
DEFAULT_DISPLAY_LIMIT: int = 10

# Directory (relative to the working directory) holding songlist_tracking.json.
DATA_DIR: Path = Path("data")
|
|
|
|
|
|
class KaraokeDownloader:
|
|
def __init__(self):
    """Set up configuration, directories, tracking state and runtime flags.

    Everything is obtained from the project's manager/loader helpers; the
    constructor takes no arguments.
    """
    # --- configuration -----------------------------------------------------
    manager = get_config_manager()
    self.config_manager = manager
    self.config = manager.load_config()

    # --- paths ---------------------------------------------------------------
    # yt_dlp_path is kept exactly as configured (a string used for command
    # parsing); the two folders become Path objects.
    self.yt_dlp_path = self.config.yt_dlp_path
    self.downloads_dir = Path(self.config.folder_structure.downloads_dir)
    self.logs_dir = Path(self.config.folder_structure.logs_dir)
    for directory in (self.downloads_dir, self.logs_dir):
        ensure_directory_exists(directory)

    # --- download tracking ---------------------------------------------------
    paths = get_data_path_manager()
    self.tracker = TrackingManager(
        tracking_file=paths.get_karaoke_tracking_path(),
        cache_dir=paths.get_channel_cache_dir(),
    )

    # Validator that centralises the "should this song be skipped" decisions.
    self.song_validator = create_song_validator(self.tracker, self.downloads_dir)

    # --- songlist tracking ---------------------------------------------------
    self.songlist_tracking_file = DATA_DIR / "songlist_tracking.json"
    self.songlist_tracking = load_songlist_tracking(str(self.songlist_tracking_file))

    # Songs known to be on the server, used for availability checks.
    self.server_songs = load_server_songs()

    # Parser that splits video titles into artist / title.
    self.channel_parser = ChannelParser()

    # --- runtime flags (defaults; callers may overwrite them) ----------------
    self.enable_parallel_downloads = False
    self.parallel_workers = 3

    self.songlist_focus_titles = None
    self.songlist_only = False
    self.use_songlist_priority = True

    self.dry_run = False
    self.download_limit = None
    self.force_download = False

    # Default songlist file, as reported by the data path manager.
    self.songlist_file_path = str(get_data_path_manager().get_songlist_path())
|
|
|
|
def _load_config(self):
|
|
"""Load configuration using the config manager."""
|
|
return self.config_manager.load_config()
|
|
|
|
def _should_skip_song(
|
|
self,
|
|
artist,
|
|
title,
|
|
channel_name,
|
|
video_id,
|
|
video_title,
|
|
server_songs=None,
|
|
server_duplicates_tracking=None,
|
|
):
|
|
"""
|
|
Check if a song should be skipped using the centralized SongValidator.
|
|
|
|
Returns:
|
|
tuple: (should_skip, reason, total_filtered)
|
|
"""
|
|
return self.song_validator.should_skip_song(
|
|
artist,
|
|
title,
|
|
channel_name,
|
|
video_id,
|
|
video_title,
|
|
server_songs,
|
|
server_duplicates_tracking,
|
|
self.force_download,
|
|
)
|
|
|
|
def _mark_song_failed(self, artist, title, video_id, channel_name, error_message):
|
|
"""
|
|
Mark a song as failed in tracking using the SongValidator.
|
|
"""
|
|
self.song_validator.mark_song_failed(
|
|
artist, title, video_id, channel_name, error_message
|
|
)
|
|
|
|
def _handle_download_failure(
|
|
self, artist, title, video_id, channel_name, error_type, error_details=""
|
|
):
|
|
"""
|
|
Handle download failures using the SongValidator.
|
|
|
|
Args:
|
|
artist: Song artist
|
|
title: Song title
|
|
video_id: YouTube video ID
|
|
channel_name: Channel name
|
|
error_type: Type of error (e.g., "yt-dlp failed", "file verification failed")
|
|
error_details: Additional error details
|
|
"""
|
|
self.song_validator.handle_download_failure(
|
|
artist, title, video_id, channel_name, error_type, error_details
|
|
)
|
|
|
|
def download_channel_videos(
    self,
    url,
    force_refresh=False,
    fuzzy_match=False,
    fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
    force_download=False,
    dry_run=False,
):
    """Collect the videos of one channel and hand them to the songlist matcher.

    Manual channels are read from the manual video manager; every other URL
    goes through the tracking manager's (cached) channel listing. In both
    cases the resulting list is passed to ``_process_videos_for_download``.

    Returns:
        False when the channel yields no videos, otherwise the result of
        ``_process_videos_for_download``.
    """
    from karaoke_downloader.manual_video_manager import (
        is_manual_channel,
        get_manual_channel_info,
        get_manual_videos_for_channel,
    )

    if is_manual_channel(url):
        channel_name, _channel_id = get_manual_channel_info(url)
        print(f"\n🎬 Downloading from manual channel: {channel_name} ({url})")

        entries = get_manual_videos_for_channel(channel_name)
        if not entries:
            print("⚠️ No manual videos found. Skipping.")
            return False

        # Reduce each manual entry to the three keys the pipeline expects.
        available_videos = [
            {
                "title": entry.get("title", ""),
                "id": entry.get("id", ""),
                "url": entry.get("url", ""),
            }
            for entry in entries
        ]
        print(f"📋 Found {len(available_videos)} manual videos")
    else:
        channel_name, _channel_id = get_channel_info(url)
        print(f"\n🎬 Downloading from channel: {channel_name} ({url})")

        # The tracking manager serves the listing from its cache unless a
        # refresh is forced.
        available_videos = self.tracker.get_channel_video_list(
            url, str(self.yt_dlp_path), force_refresh=force_refresh
        )
        if not available_videos:
            print("⚠️ No videos found for this channel. Skipping.")
            return False
        print(f"📋 Found {len(available_videos)} videos from channel")

    return self._process_videos_for_download(
        available_videos,
        channel_name,
        force_refresh,
        fuzzy_match,
        fuzzy_threshold,
        force_download,
        dry_run,
    )
|
|
|
|
def download_all_channel_videos(
    self,
    url,
    force_refresh=False,
    force_download=False,
    limit=None,
    dry_run=False,
):
    """Collect every video of one channel and queue all of them for download.

    Manual channels are read from the manual video manager; other URLs use the
    tracking manager's (cached) channel listing. The list is then passed to
    ``_process_all_videos_for_download``, which does no songlist matching.

    Returns:
        False when the channel yields no videos, otherwise the result of
        ``_process_all_videos_for_download``.
    """
    from karaoke_downloader.manual_video_manager import (
        is_manual_channel,
        get_manual_channel_info,
        get_manual_videos_for_channel,
    )

    if is_manual_channel(url):
        channel_name, _channel_id = get_manual_channel_info(url)
        print(f"\n🎬 Downloading ALL videos from manual channel: {channel_name} ({url})")

        entries = get_manual_videos_for_channel(channel_name)
        if not entries:
            print("⚠️ No manual videos found. Skipping.")
            return False

        # Reduce each manual entry to the three keys the pipeline expects.
        available_videos = [
            {
                "title": entry.get("title", ""),
                "id": entry.get("id", ""),
                "url": entry.get("url", ""),
            }
            for entry in entries
        ]
        print(f"📋 Found {len(available_videos)} manual videos")
    else:
        channel_name, _channel_id = get_channel_info(url)
        print(f"\n🎬 Downloading ALL videos from channel: {channel_name} ({url})")

        # Served from the tracking manager's cache unless a refresh is forced.
        available_videos = self.tracker.get_channel_video_list(
            url, str(self.yt_dlp_path), force_refresh=force_refresh
        )
        if not available_videos:
            print("⚠️ No videos found for this channel. Skipping.")
            return False
        print(f"📋 Found {len(available_videos)} videos from channel")

    return self._process_all_videos_for_download(
        available_videos,
        channel_name,
        force_refresh,
        force_download,
        limit,
        dry_run,
    )
|
|
|
|
def _process_all_videos_for_download(self, available_videos, channel_name, force_refresh=False, force_download=False, limit=None, dry_run=False):
    """Plan and run the download of every video in ``available_videos``.

    No songlist matching is done. Unless ``force_download`` is set, a video is
    left out when:

    * its target file already exists under ``downloads_dir/channel_name``,
    * it is listed in the songlist and the server entry is flagged ``is_mp4``,
    * the tracker reports it as already downloaded, or
    * ``check_and_mark_server_duplicate`` reports a server duplicate.

    Args:
        available_videos: Video dicts; the ``"id"`` and ``"title"`` keys are read.
        channel_name: Channel the videos belong to; also the output sub-folder.
        force_refresh: Ignore a cached download plan and build a new one.
        force_download: Bypass all skip checks listed above.
        limit: Optional maximum number of videos to plan.
        dry_run: Forwarded to ``execute_unified_download_workflow``.

    Returns:
        True when nothing new needs downloading, otherwise the success flag
        returned by ``execute_unified_download_workflow``.
    """
    print(f"🔍 Processing {len(available_videos)} videos for download (no songlist filtering)...")

    # Server state used for the availability checks below.
    server_songs = load_server_songs()
    server_duplicates_tracking = load_server_duplicates_tracking()

    # "Artist - Title" keys of every song in the songlist.
    songlist = load_songlist(self.songlist_file_path)
    songlist_artists_titles = {
        f"{song['artist']} - {song['title']}" for song in songlist or []
    }

    videos_to_download = []
    skipped_count = 0

    for video in available_videos:
        video_title = video["title"]
        video_id = video["id"]

        artist, extracted_title = self.channel_parser.extract_artist_title(video_title, channel_name)
        if not artist and not extracted_title:
            # Title could not be parsed: fall back to the raw video title.
            artist = ""
            extracted_title = video_title

        # NOTE(review): unlike the other download modes this name is not run
        # through sanitize_filename - confirm that is intended.
        if artist and extracted_title:
            filename = f"{artist} - {extracted_title}.mp4"
        else:
            filename = f"{video_title}.mp4"
        song_key = f"{artist} - {extracted_title}"

        if not force_download:
            if (self.downloads_dir / channel_name / filename).exists():
                print(f" ⏭️ Skipping existing file: {filename}")
                skipped_count += 1
                continue

            if song_key in songlist_artists_titles:
                # server_songs is keyed "<artist lowercased>_<normalized title>"
                # (the key format download_songlist_across_channels uses), so
                # looking it up with the "Artist - Title" key never matched.
                artist_part = (artist or "").lower()
                title_part = normalize_title(extracted_title or "")
                server_key = f"{artist_part}_{title_part}"
                if server_key in server_songs:
                    if server_songs[server_key].get("is_mp4", False):
                        print(f" ⏭️ Skipping song in songs.json (MP4): {song_key}")
                        skipped_count += 1
                        continue
                    print(f" 📥 Found MP3 version in songs.json, will download MP4: {song_key}")
                else:
                    print(f" 📥 Found song in songlist, will download MP4: {song_key}")

            if self.tracker.is_song_downloaded(artist, extracted_title, channel_name, video_id):
                print(f" ⏭️ Skipping already tracked: {song_key}")
                skipped_count += 1
                continue

            if check_and_mark_server_duplicate(
                server_songs,
                server_duplicates_tracking,
                artist,
                extracted_title,
                video_title,
                channel_name,
            ):
                print(f" ⏭️ Skipping server duplicate: {song_key}")
                skipped_count += 1
                continue

        videos_to_download.append((video, artist, extracted_title, filename))

    print(f"📊 Found {len(videos_to_download)} videos to download (skipped {skipped_count})")

    if limit is not None:
        original_count = len(videos_to_download)
        videos_to_download = videos_to_download[:limit]
        print(f"🎯 Limited to first {limit} videos (was {original_count} total)")

    if not videos_to_download:
        print("🎵 No new videos to download.")
        return True

    # The cache file name is derived from the parameters that shape the plan.
    plan_kwargs = {
        "channel": channel_name,
        "total_videos": len(available_videos),
        "force_download": force_download,
    }
    if limit:
        plan_kwargs["limit"] = limit
    cache_file = get_download_plan_cache_file("all_videos", **plan_kwargs)

    cached_plan, _ = load_cached_plan(cache_file)
    if cached_plan and not force_refresh:
        # Resume from the plan stored by an earlier run.
        print(f"\n📋 Found existing download plan cache: {cache_file.name}")
        print(f" 🎬 Cached videos to download: {len(cached_plan)}")
        download_plan = cached_plan
    else:
        download_plan = [
            {
                "video_id": video["id"],
                "artist": artist,
                "title": title,
                "filename": filename,
                "channel_name": channel_name,
                "video_title": video.get("title", ""),
                "force_download": force_download,
            }
            for video, artist, title, filename in videos_to_download
        ]
        # All-videos mode has no unmatched songs, hence the empty list.
        save_plan_cache(cache_file, download_plan, [])

    # Report the plan that is actually executed (it may come from the cache).
    print("\n📋 Download Plan Summary:")
    print(f" 📺 Channel: {channel_name}")
    print(f" 🎬 Total videos to download: {len(download_plan)}")
    print(f" ⏭️ Videos skipped: {skipped_count}")
    if limit:
        print(f" 🎯 Limit applied: {limit} videos")
    print(f" 📁 Output directory: downloads/{channel_name}/")
    print(f" 💾 Download plan cached to: {cache_file.name}")

    _downloaded_count, success = self.execute_unified_download_workflow(
        download_plan=download_plan,
        cache_file=cache_file,
        limit=limit,
        show_progress=True,
        dry_run=dry_run,
    )
    return success
|
|
|
|
def download_songlist_across_channels(
|
|
self,
|
|
channel_urls,
|
|
limit=None,
|
|
force_refresh_download_plan=False,
|
|
fuzzy_match=False,
|
|
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
|
force_download=False,
|
|
show_pagination=False,
|
|
parallel_channels=False,
|
|
max_channel_workers=3,
|
|
dry_run=False,
|
|
):
|
|
"""
|
|
Download songs from the songlist across multiple channels.
|
|
"""
|
|
# Set force download flag
|
|
self.force_download = force_download
|
|
|
|
# Apply songlist focus filtering if specified
|
|
if self.songlist_focus_titles:
|
|
# Load the raw songlist data to filter by playlist titles
|
|
songlist_file = Path(self.songlist_file_path)
|
|
if not songlist_file.exists():
|
|
print(f"⚠️ Songlist file not found: {self.songlist_file_path}")
|
|
return False
|
|
|
|
try:
|
|
with open(songlist_file, "r", encoding="utf-8") as f:
|
|
raw_data = json.load(f)
|
|
|
|
# Filter playlists by title
|
|
focused_playlists = []
|
|
print(f"🔍 Looking for playlists: {self.songlist_focus_titles}")
|
|
print(f"🔍 Available playlists in songList.json:")
|
|
for i, playlist in enumerate(raw_data[:5]): # Show first 5 playlists
|
|
print(f" {i+1}. '{playlist.get('title', 'NO TITLE')}'")
|
|
if len(raw_data) > 5:
|
|
print(f" ... and {len(raw_data) - 5} more playlists")
|
|
|
|
for playlist in raw_data:
|
|
playlist_title = playlist.get("title", "")
|
|
if playlist_title in self.songlist_focus_titles:
|
|
focused_playlists.append(playlist)
|
|
print(f"✅ Found matching playlist: '{playlist_title}'")
|
|
|
|
if not focused_playlists:
|
|
print(
|
|
f"⚠️ No playlists found matching the specified titles: {', '.join(self.songlist_focus_titles)}"
|
|
)
|
|
return False
|
|
|
|
# Flatten the focused playlists into songs
|
|
focused_songs = []
|
|
seen = set()
|
|
for playlist in focused_playlists:
|
|
if "songs" in playlist:
|
|
for song in playlist["songs"]:
|
|
if "artist" in song and "title" in song:
|
|
artist = song["artist"].strip()
|
|
title = song["title"].strip()
|
|
key = f"{artist.lower()}_{title.lower()}"
|
|
if key in seen:
|
|
continue
|
|
seen.add(key)
|
|
focused_songs.append(
|
|
{
|
|
"artist": artist,
|
|
"title": title,
|
|
"position": song.get("position", 0),
|
|
}
|
|
)
|
|
|
|
songlist = focused_songs
|
|
print(
|
|
f"\n🎯 Songlist focus mode: {len(focused_songs)} songs from {len(focused_playlists)} playlists selected"
|
|
)
|
|
print(f"🎯 Focused playlists: {', '.join(self.songlist_focus_titles)}")
|
|
|
|
except (json.JSONDecodeError, FileNotFoundError) as e:
|
|
print(f"⚠️ Could not load songlist for filtering from {self.songlist_file_path}: {e}")
|
|
return False
|
|
else:
|
|
# Load songlist normally (flattened from all playlists)
|
|
songlist = load_songlist(self.songlist_file_path)
|
|
if not songlist:
|
|
print("⚠️ No songlist loaded. Skipping.")
|
|
return False
|
|
|
|
# Filter for songs not yet downloaded
|
|
if self.force_download:
|
|
undownloaded = songlist
|
|
print(f"\n🎯 {len(songlist)} total unique songs in songlist.")
|
|
print(f"💪 Force mode enabled - will download all songs regardless of existing status")
|
|
else:
|
|
undownloaded = [
|
|
s
|
|
for s in songlist
|
|
if not is_songlist_song_downloaded(
|
|
self.songlist_tracking, s["artist"], s["title"]
|
|
)
|
|
]
|
|
print(f"\n🎯 {len(songlist)} total unique songs in songlist.")
|
|
print(f"\n🎯 {len(undownloaded)} unique songlist songs to download.")
|
|
|
|
# Load server songs and duplicates tracking for availability checking
|
|
server_songs = load_server_songs()
|
|
server_duplicates_tracking = load_server_duplicates_tracking()
|
|
|
|
# Initialize counters for logging
|
|
server_available_mp4 = 0
|
|
server_available_other = 0
|
|
marked_duplicates = 0
|
|
|
|
# Further filter out songs already on server or marked as duplicates
|
|
if self.force_download:
|
|
not_on_server = undownloaded
|
|
print(f"💪 Force mode enabled - will download all songs regardless of server status")
|
|
else:
|
|
not_on_server = []
|
|
|
|
for song in undownloaded:
|
|
artist, title = song["artist"], song["title"]
|
|
|
|
# Check if already marked as server duplicate
|
|
if is_song_marked_as_server_duplicate(
|
|
server_duplicates_tracking, artist, title
|
|
):
|
|
marked_duplicates += 1
|
|
continue
|
|
|
|
# Check if already on server and mark for future skipping
|
|
if check_and_mark_server_duplicate(
|
|
server_songs,
|
|
server_duplicates_tracking,
|
|
artist,
|
|
title,
|
|
f"{artist} - {title}",
|
|
"songlist",
|
|
):
|
|
server_available_mp4 += 1
|
|
continue
|
|
|
|
# Check if song is on server but in different format (MP3/CDG)
|
|
key = f"{artist.lower()}_{normalize_title(title)}"
|
|
if key in server_songs:
|
|
song_info = server_songs[key]
|
|
if song_info.get("is_mp3", False) or song_info.get("is_cdg", False):
|
|
server_available_other += 1
|
|
print(f"🎵 Found {artist} - {title} on server as {song_info.get('is_mp3', False) and 'MP3' or 'CDG'} format, will download video version")
|
|
|
|
not_on_server.append(song)
|
|
|
|
# Apply limit to undownloaded list before logging
|
|
# This ensures that only the specified number of songs are processed and logged,
|
|
# providing accurate counts when using --limit
|
|
if limit is not None:
|
|
original_count = len(not_on_server)
|
|
not_on_server = not_on_server[:limit]
|
|
print(f"\n🎯 Limited to first {limit} songs (was {original_count} total)")
|
|
|
|
undownloaded = not_on_server
|
|
|
|
# Now log the counts based on the limited list
|
|
if server_available_mp4 > 0:
|
|
print(
|
|
f"\n🎵 {server_available_mp4} songs already available as MP4 on server, skipping."
|
|
)
|
|
if server_available_other > 0:
|
|
# Only count songs that are in the limited list
|
|
limited_server_other = sum(1 for song in not_on_server
|
|
if f"{song['artist'].lower()}_{normalize_title(song['title'])}" in server_songs)
|
|
if limited_server_other > 0:
|
|
print(
|
|
f"\n🎵 {limited_server_other} songs found on server as MP3/CDG, will download video versions."
|
|
)
|
|
if marked_duplicates > 0:
|
|
print(
|
|
f"\n🏷️ {marked_duplicates} songs previously marked as server duplicates, skipping."
|
|
)
|
|
|
|
print(f"\n🎯 {len(undownloaded)} songs need to be downloaded.")
|
|
if not undownloaded:
|
|
print("🎵 All songlist songs already downloaded.")
|
|
return True
|
|
|
|
# --- Download plan building (same for both normal and focus modes) ---
|
|
# --- Download plan cache logic ---
|
|
plan_mode = "songlist"
|
|
# Include only parameters that affect the plan generation (exclude limit since it only affects execution)
|
|
plan_kwargs = {
|
|
"channels": len(channel_urls),
|
|
"fuzzy": fuzzy_match,
|
|
"threshold": fuzzy_threshold,
|
|
}
|
|
# Add channel URLs hash to ensure same channels = same cache
|
|
channels_hash = hashlib.md5(
|
|
"|".join(sorted(channel_urls)).encode()
|
|
).hexdigest()[:8]
|
|
plan_kwargs["channels_hash"] = channels_hash
|
|
cache_file = get_download_plan_cache_file(plan_mode, **plan_kwargs)
|
|
use_cache = False
|
|
download_plan, unmatched = load_cached_plan(cache_file)
|
|
if (
|
|
not force_refresh_download_plan
|
|
and download_plan is not None
|
|
and unmatched is not None
|
|
):
|
|
use_cache = True
|
|
print(f"\n📋 Using cached download plan from: {cache_file}")
|
|
if not use_cache:
|
|
print(f"\n🔍 Pre-scanning {len(channel_urls)} channels for matches...")
|
|
print(f"🔍 Scanning {len(undownloaded)} songs against all channels...")
|
|
download_plan, unmatched = build_download_plan(
|
|
channel_urls,
|
|
undownloaded,
|
|
self.tracker,
|
|
self.yt_dlp_path,
|
|
fuzzy_match=fuzzy_match,
|
|
fuzzy_threshold=fuzzy_threshold,
|
|
show_pagination=show_pagination,
|
|
parallel_channels=parallel_channels,
|
|
max_channel_workers=max_channel_workers,
|
|
)
|
|
save_plan_cache(cache_file, download_plan, unmatched)
|
|
print(f"💾 Download plan cached to: {cache_file}")
|
|
print(
|
|
f"\n📊 Download plan ready: {len(download_plan)} songs will be downloaded."
|
|
)
|
|
print(f"❌ {len(unmatched)} songs could not be found in any channel.")
|
|
if unmatched:
|
|
print("Unmatched songs:")
|
|
for song in unmatched[:DEFAULT_DISPLAY_LIMIT]:
|
|
print(f" - {song['artist']} - {song['title']}")
|
|
if len(unmatched) > DEFAULT_DISPLAY_LIMIT:
|
|
print(f" ...and {len(unmatched)-DEFAULT_DISPLAY_LIMIT} more.")
|
|
|
|
# --- Download phase ---
|
|
downloaded_count, success = self.execute_unified_download_workflow(
|
|
download_plan=download_plan,
|
|
cache_file=cache_file,
|
|
limit=limit,
|
|
dry_run=dry_run,
|
|
)
|
|
return success
|
|
|
|
def download_latest_per_channel(
    self,
    channel_urls,
    limit=None,
    force_refresh_download_plan=False,
    fuzzy_match=False,
    fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
    force_download=False,
    dry_run=False,
):
    """Download the first ``limit`` listed videos of every channel.

    The channel listing is assumed to be newest-first, so the leading
    ``limit`` entries are treated as the latest videos.

    Args:
        channel_urls: List of channel URLs to process.
        limit: Number of videos to take per channel; a falsy value takes all.
        force_refresh_download_plan: Accepted but not used by this method.
        fuzzy_match: Accepted but not used by this method.
        fuzzy_threshold: Accepted but not used by this method.
        force_download: Stored on each plan entry and in the cache key.
        dry_run: Forwarded to ``execute_unified_download_workflow``.

    Returns:
        bool: False when no videos were found at all, otherwise the success
        flag of ``execute_unified_download_workflow``.
    """
    import hashlib

    from karaoke_downloader.cache_manager import get_download_plan_cache_file, save_plan_cache

    channel_total = len(channel_urls)
    print(f"\n🎬 Downloading latest {limit} videos from {channel_total} channels")

    download_plan = []
    total_videos_found = 0

    for position, channel_url in enumerate(channel_urls, 1):
        print(f"\n🚦 Processing channel {position}/{len(channel_urls)}: {channel_url}")

        channel_name, _channel_id = get_channel_info(channel_url)
        print(f" ✅ Channel: {channel_name}")

        available_videos = self.tracker.get_channel_video_list(
            channel_url, str(self.yt_dlp_path), force_refresh=False
        )
        if not available_videos:
            print(f" ⚠️ No videos found for {channel_name}")
            continue

        print(f" 📊 Found {len(available_videos)} videos")

        if limit:
            latest_videos = available_videos[:limit]
        else:
            latest_videos = available_videos
        print(f" 🎯 Processing latest {len(latest_videos)} videos")

        for video in latest_videos:
            video_title = video["title"]
            video_id = video["id"]

            artist, extracted_title = self.channel_parser.extract_artist_title(video_title, channel_name)
            if not (artist or extracted_title):
                # Unparseable title: keep the raw video title.
                artist, extracted_title = "", video_title

            entry = {
                "video_id": video_id,
                "artist": artist,
                "title": extracted_title,
                "filename": sanitize_filename(artist, extracted_title),
                "channel_name": channel_name,
                "video_title": video_title,
                "force_download": force_download,
            }
            download_plan.append(entry)
            total_videos_found += 1

    print(f"\n📋 Download plan created: {total_videos_found} videos from {len(channel_urls)} channels")

    if not download_plan:
        print("❌ No videos to download")
        return False

    # Short hash of the sorted URLs so the same channel set maps to the same
    # cache file.
    url_digest = hashlib.md5("|".join(sorted(channel_urls)).encode()).hexdigest()
    plan_kwargs = {
        "channels": len(channel_urls),
        "limit_per_channel": limit,
        "force_download": force_download,
        "channels_hash": url_digest[:8],
    }
    cache_file = get_download_plan_cache_file("latest_per_channel", **plan_kwargs)

    # This mode has no unmatched songs, hence the empty list.
    save_plan_cache(cache_file, download_plan, [])
    print(f"💾 Download plan cached to: {cache_file.name}")

    # limit=None: the per-channel limit was already applied while planning.
    _downloaded_count, success = self.execute_unified_download_workflow(
        download_plan=download_plan,
        cache_file=cache_file,
        limit=None,
        show_progress=True,
        dry_run=dry_run,
    )
    return success
|
|
|
|
def _process_videos_for_download(self, available_videos, channel_name, force_refresh=False, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD, force_download=False, dry_run=False):
|
|
"""Process videos for download (used for both manual and regular channels)."""
|
|
songlist = load_songlist(self.songlist_file_path)
|
|
|
|
# For manual videos with force_download, bypass songlist filtering
|
|
if force_download and channel_name == "@ManualVideos":
|
|
print(f"💪 Force mode enabled - downloading all {len(available_videos)} manual videos")
|
|
|
|
# Download all videos directly without songlist filtering
|
|
success_count = 0
|
|
for i, video in enumerate(available_videos, 1):
|
|
title = video["title"]
|
|
video_id = video["id"]
|
|
|
|
print(f"\n⬇️ Downloading {i}/{len(available_videos)}: {title}")
|
|
print(f" 🎬 Video: {title} ({channel_name})")
|
|
|
|
# Create filename from title
|
|
artist, extracted_title = self.channel_parser.extract_artist_title(title, channel_name)
|
|
if not artist and not extracted_title:
|
|
# Fallback: use the full title as filename
|
|
filename = sanitize_filename("", title)
|
|
else:
|
|
filename = sanitize_filename(artist, extracted_title)
|
|
|
|
# Download the video
|
|
success = self._download_single_video(video, channel_name, filename, force_download=True)
|
|
if success:
|
|
success_count += 1
|
|
|
|
print(f"\n✅ Downloaded {success_count}/{len(available_videos)} manual videos")
|
|
return success_count > 0
|
|
|
|
# Regular songlist-based processing
|
|
if not songlist:
|
|
print("⚠️ No songlist loaded. Skipping.")
|
|
return False
|
|
|
|
# Load server songs and duplicates tracking for availability checking
|
|
server_songs = load_server_songs()
|
|
server_duplicates_tracking = load_server_duplicates_tracking()
|
|
|
|
limit = getattr(self.config, "limit", 1)
|
|
|
|
# Normalize songlist for matching
|
|
normalized_songlist = {
|
|
create_song_key(s["artist"], s["title"]): s for s in songlist
|
|
}
|
|
matches = []
|
|
similarity = get_similarity_function()
|
|
|
|
print(f"🔍 Scanning {len(available_videos)} videos for songlist matches...")
|
|
|
|
for video in available_videos:
|
|
title = video["title"]
|
|
video_id = video["id"]
|
|
|
|
# Extract artist and title using channel parser
|
|
artist, extracted_title = self.channel_parser.extract_artist_title(title, channel_name)
|
|
|
|
if not artist and not extracted_title:
|
|
continue
|
|
|
|
song_key = create_song_key(artist, extracted_title)
|
|
|
|
# Check for exact matches first
|
|
if song_key in normalized_songlist:
|
|
song_data = normalized_songlist[song_key]
|
|
matches.append({
|
|
"video": video,
|
|
"song": song_data,
|
|
"match_type": "exact",
|
|
"match_score": 100.0,
|
|
"artist": artist,
|
|
"title": extracted_title
|
|
})
|
|
print(f" ✅ Exact match: {artist} - {extracted_title}")
|
|
continue
|
|
|
|
# Check for fuzzy matches if enabled
|
|
if fuzzy_match:
|
|
best_match = None
|
|
best_score = 0
|
|
|
|
for song_key, song_data in normalized_songlist.items():
|
|
score = similarity(f"{artist} {extracted_title}", f"{song_data['artist']} {song_data['title']}")
|
|
if score > best_score and score >= fuzzy_threshold:
|
|
best_score = score
|
|
best_match = song_data
|
|
|
|
if best_match:
|
|
matches.append({
|
|
"video": video,
|
|
"song": best_match,
|
|
"match_type": "fuzzy",
|
|
"match_score": best_score,
|
|
"artist": artist,
|
|
"title": extracted_title
|
|
})
|
|
print(f" 🎯 Fuzzy match ({best_score:.1f}%): {artist} - {extracted_title} -> {best_match['artist']} - {best_match['title']}")
|
|
|
|
print(f"📊 Found {len(matches)} matches out of {len(available_videos)} videos")
|
|
|
|
if not matches:
|
|
print("❌ No matches found in songlist")
|
|
return False
|
|
|
|
# Sort matches by score (exact matches first, then by fuzzy score)
|
|
matches.sort(key=lambda x: (x["match_type"] != "exact", -x["match_score"]))
|
|
|
|
# Limit downloads
|
|
if limit:
|
|
matches = matches[:limit]
|
|
print(f"🎯 Limiting to {len(matches)} downloads")
|
|
|
|
# Convert matches to a download plan
|
|
download_plan = []
|
|
for match in matches:
|
|
video = match["video"]
|
|
song = match["song"]
|
|
artist = match["artist"]
|
|
title = match["title"]
|
|
video_id = video["id"]
|
|
|
|
# Create filename
|
|
filename = sanitize_filename(artist, title)
|
|
output_path = self.downloads_dir / channel_name / filename
|
|
|
|
# Add to download plan
|
|
download_plan.append({
|
|
"video_id": video_id,
|
|
"artist": artist,
|
|
"title": title,
|
|
"filename": filename,
|
|
"channel_name": channel_name,
|
|
"video_title": video["title"],
|
|
"force_download": force_download
|
|
})
|
|
|
|
# Use the unified download workflow
|
|
downloaded_count, success = self.execute_unified_download_workflow(
|
|
download_plan=download_plan,
|
|
cache_file=None, # No specific cache file for this mode
|
|
limit=limit,
|
|
show_progress=True,
|
|
dry_run=dry_run,
|
|
)
|
|
|
|
return success
|
|
|
|
def _download_single_video(self, video, channel_name, filename, force_download=False):
    """Download a single video using the download pipeline.

    Args:
        video: Mapping that provides at least the "id" and "title" keys.
        channel_name: Channel name handed to the title parser and the pipeline.
        filename: Not used by this method; kept so existing callers keep working.
        force_download: Not used by this method; kept so existing callers keep working.

    Returns:
        The value returned by ``DownloadPipeline.execute_pipeline``; it is
        treated as a truthy/falsy success flag here.
    """
    video_id = video["id"]
    video_title = video["title"]

    # Extract artist and title for tracking
    artist, extracted_title = self.channel_parser.extract_artist_title(video_title, channel_name)
    if not artist and not extracted_title:
        # Fallback: the parser produced nothing usable, so track the video
        # under its full title with an empty artist.
        artist = ""
        extracted_title = video_title

    # A fresh pipeline is built for every call, wired to this instance's
    # configuration and tracking state.
    pipeline = DownloadPipeline(
        yt_dlp_path=str(self.yt_dlp_path),
        config=self.config,
        downloads_dir=self.downloads_dir,
        songlist_tracking=self.songlist_tracking,
        tracker=self.tracker,
    )

    success = pipeline.execute_pipeline(
        video_id=video_id,
        artist=artist,
        title=extracted_title,
        channel_name=channel_name,
        video_title=video_title,
    )

    if success:
        print(f"✅ Successfully downloaded: {video_title}")
    else:
        print(f"❌ Failed to download: {video_title}")

    return success
|
|
|
|
def execute_unified_download_workflow(
    self,
    download_plan,
    cache_file=None,
    limit=None,
    show_progress=True,
    dry_run=False,
):
    """
    Run a prepared download plan: preview it (dry run) or execute it.

    Args:
        download_plan: List of download items with video_id, artist, title,
            channel_name, video_title.
        cache_file: Optional cache file for progress tracking. Its ``.name``
            is printed when progress output is enabled.
        limit: Optional limit on number of downloads. Only a positive value
            truncates the plan; ``None``, ``0`` and negative values mean
            "no limit".
        show_progress: Whether to show progress information.
        dry_run: If True, only show the plan without downloading.

    Returns:
        tuple: (downloaded_count, success). An empty plan yields ``(0, True)``;
        a dry run yields ``(len(plan), True)``; otherwise the result of the
        sequential or parallel executor is returned unchanged.
    """
    if not download_plan:
        print("📋 No videos to download in plan")
        return 0, True

    # A negative limit used to slice items off the tail of the plan and
    # report a negative total; treat it like "no limit" instead.
    if limit is not None and limit > 0:
        download_plan = download_plan[:limit]
    # Count after truncation so the announced total always matches the plan.
    total_to_download = len(download_plan)

    if show_progress:
        print(f"\n🎬 Starting downloads: {total_to_download} videos")
        # NOTE: the directory shown is a fixed label, not self.downloads_dir.
        print(" 📁 Output directory: downloads/")
        if cache_file:
            print(f" 💾 Progress tracking: {cache_file.name}")

    # Handle dry-run mode
    if dry_run:
        print("\n🔍 DRY RUN MODE - No downloads will be performed")
        print("📋 Download plan preview:")
        print(f" 📊 Total videos in plan: {len(download_plan)}")
        print(" 📁 Output directory: downloads/")

        # Show first few items as preview
        preview_count = min(5, len(download_plan))
        print(f"\n📋 Preview of first {preview_count} videos:")
        for i, item in enumerate(download_plan[:preview_count], 1):
            print(f" {i:2d}. {item['artist']} - {item['title']} ({item['channel_name']})")

        if len(download_plan) > preview_count:
            print(f" ... and {len(download_plan) - preview_count} more videos")

        print(f"\n✅ Dry run completed - {len(download_plan)} videos would be downloaded")
        return len(download_plan), True

    # Choose execution method based on parallel settings
    if self.enable_parallel_downloads:
        return self._execute_parallel_downloads(download_plan, cache_file, show_progress)
    return self._execute_sequential_downloads(download_plan, cache_file, show_progress)
|
|
|
|
def _execute_sequential_downloads(self, download_plan, cache_file, show_progress):
    """Execute downloads sequentially using the download pipeline.

    Args:
        download_plan: List of plan items (dicts with video_id, artist, title,
            channel_name, video_title). When ``cache_file`` is given, each
            attempted item is removed from this list in place.
        cache_file: Optional plan cache. When truthy, the remaining plan is
            saved after every attempt and the cache is deleted once the plan
            is empty.
        show_progress: Whether to print per-item and summary progress lines.

    Returns:
        tuple: (success_count, success), where ``success`` is True when at
        least one item downloaded successfully.
    """
    # Hoisted out of the loop: these were re-imported on every iteration.
    from karaoke_downloader.cache_manager import delete_plan_cache, save_plan_cache

    success_count = 0

    # Iterate over a snapshot. The live list shrinks while progress is cached,
    # and looping over a list that is being mutated skips every other item.
    pending_items = list(download_plan)
    total_to_download = len(pending_items)

    # Create download pipeline
    pipeline = DownloadPipeline(
        yt_dlp_path=str(self.yt_dlp_path),
        config=self.config,
        downloads_dir=self.downloads_dir,
        songlist_tracking=self.songlist_tracking,
        tracker=self.tracker,
    )

    for i, plan_item in enumerate(pending_items, 1):
        if show_progress:
            print(f"\n⬇️ Downloading {i}/{total_to_download}: {plan_item['artist']} - {plan_item['title']}")
            print(f" 🎬 Video: {plan_item['video_title']} ({plan_item['channel_name']})")

        success = pipeline.execute_pipeline(
            video_id=plan_item["video_id"],
            artist=plan_item["artist"],
            title=plan_item["title"],
            channel_name=plan_item["channel_name"],
            video_title=plan_item["video_title"],
        )

        if success:
            success_count += 1
            if show_progress:
                print(f"✅ Successfully downloaded: {plan_item['artist']} - {plan_item['title']}")
        elif show_progress:
            print(f"❌ Failed to download: {plan_item['artist']} - {plan_item['title']}")

        # Update cache if provided
        if cache_file:
            # The item is dropped from the cached plan after the attempt,
            # whether or not the download succeeded.
            download_plan.remove(plan_item)
            save_plan_cache(cache_file, download_plan, [])  # No unmatched for unified workflow

            if not download_plan:  # All downloads completed
                delete_plan_cache(cache_file)
                if show_progress:
                    print("🗑️ All downloads completed, deleted download plan cache.")

    if show_progress:
        print(f"\n🎉 Download complete! {success_count}/{total_to_download} videos downloaded successfully")

    return success_count, success_count > 0
|
|
|
|
def _execute_parallel_downloads(self, download_plan, cache_file, show_progress):
    """Execute downloads in parallel using the parallel downloader.

    Args:
        download_plan: List of plan items (dicts with video_id, artist, title,
            channel_name, video_title); each becomes one ``DownloadTask``.
        cache_file: Not read by this method; accepted so the signature matches
            the sequential executor.
        show_progress: Forwarded to the downloader and used to gate the final
            summary line.

    Returns:
        tuple: (success_count, success), where ``success`` is True when at
        least one result reports ``success``.
    """
    # Single function-scope import; the task class was previously re-imported
    # on every loop iteration.
    from karaoke_downloader.parallel_downloader import (
        DownloadTask,
        create_parallel_downloader,
    )

    # Create parallel downloader
    parallel_downloader = create_parallel_downloader(
        yt_dlp_path=str(self.yt_dlp_path),
        config=self.config,
        downloads_dir=self.downloads_dir,
        max_workers=self.parallel_workers,
        songlist_tracking=self.songlist_tracking,
        tracker=self.tracker,
    )

    # Convert download plan to tasks
    tasks = [
        DownloadTask(
            video_id=item["video_id"],
            artist=item["artist"],
            title=item["title"],
            channel_name=item["channel_name"],
            video_title=item["video_title"],
        )
        for item in download_plan
    ]

    # Add tasks to the downloader queue
    parallel_downloader.add_download_tasks(tasks)

    # Execute parallel downloads
    results = parallel_downloader.execute_downloads(show_progress=show_progress)

    # Count successes
    success_count = sum(1 for result in results if result.success)
    total_to_download = len(tasks)

    if show_progress:
        print(f"\n🎉 Parallel download complete! {success_count}/{total_to_download} videos downloaded successfully")

    return success_count, success_count > 0
|
|
|
|
|
|
def reset_songlist_all():
    """Delete all files tracked in songlist_tracking.json, clear songlist_tracking.json, and remove songlist songs from karaoke_tracking.json.

    Both file locations come from the data path manager. Files that cannot be
    deleted are reported and skipped. A karaoke-tracking song is removed when
    the key ``"<artist lowercased>_<normalized title>"`` built from it is
    present in the songlist tracking data that was loaded before clearing.
    """
    # Load songlist tracking
    data_path_manager = get_data_path_manager()
    songlist_tracking_file = data_path_manager.get_songlist_tracking_path()
    karaoke_tracking_file = data_path_manager.get_karaoke_tracking_path()

    if songlist_tracking_file.exists():
        with open(songlist_tracking_file, "r", encoding="utf-8") as f:
            tracking = json.load(f)
    else:
        tracking = {}

    # Delete all files tracked
    for entry in tracking.values():
        file_path = entry.get("file_path")
        if not file_path:
            continue
        p = Path(file_path)
        try:
            if p.exists():
                p.unlink()
                print(f"🗑️ Deleted: {p}")
        except OSError as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"⚠️ Could not delete {p}: {e}")

    # Clear songlist_tracking.json
    songlist_tracking_file.write_text("{}", encoding="utf-8")
    print("🧹 Cleared songlist_tracking.json")

    # Remove songlist songs from karaoke_tracking.json
    if karaoke_tracking_file.exists():
        with open(karaoke_tracking_file, "r", encoding="utf-8") as f:
            karaoke_data = json.load(f)

        song_keys_to_remove = []
        for song_id, song in karaoke_data.get("songs", {}).items():
            # A JSON null comes back as None even with a default supplied,
            # so coerce to "" before calling string methods on it.
            artist = song.get("artist", "") or ""
            title = song.get("title", song.get("name", "")) or ""
            key = f"{artist.lower()}_{normalize_title(title)}"
            if key in tracking:
                song_keys_to_remove.append(song_id)

        # Deletion happens in a second pass so the dict is not resized
        # while it is being iterated.
        for song_id in song_keys_to_remove:
            del karaoke_data["songs"][song_id]

        with open(karaoke_tracking_file, "w", encoding="utf-8") as f:
            json.dump(karaoke_data, f, indent=2, ensure_ascii=False)
        print(
            f"🧹 Removed {len(song_keys_to_remove)} songlist songs from karaoke_tracking.json"
        )

    print("✅ Global songlist reset complete.")
|
|
|
|
# For brevity, the rest of the class methods should be copied here from the original download_karaoke.py,
|
|
# updating all references to use the new karaoke_downloader.* imports as needed.
|