Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>
This commit is contained in:
parent
2c63bf809b
commit
ea07188739
@ -105,6 +105,11 @@ Examples:
|
||||
action="store_true",
|
||||
help="Generate a report of songs that couldn't be found in any channel (runs after downloads)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--show-pagination",
|
||||
action="store_true",
|
||||
help="Show page-by-page progress when downloading channel video lists (slower but more detailed)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--songlist-only",
|
||||
action="store_true",
|
||||
@ -237,6 +242,7 @@ Examples:
|
||||
if args.songlist_focus:
|
||||
downloader.songlist_focus_titles = args.songlist_focus
|
||||
downloader.songlist_only = True # Enable songlist-only mode when focusing
|
||||
args.songlist_only = True # Also set the args flag to ensure CLI logic works
|
||||
print(
|
||||
f"🎯 Songlist focus mode enabled for playlists: {', '.join(args.songlist_focus)}"
|
||||
)
|
||||
@ -260,12 +266,12 @@ Examples:
|
||||
# --- END NEW ---
|
||||
|
||||
# --- NEW: If no URL or file is provided, but --songlist-only is set, use all channels in data/channels.txt ---
|
||||
if args.songlist_only and not args.url and not args.file:
|
||||
if (args.songlist_only or args.songlist_focus) and not args.url and not args.file:
|
||||
channels_file = Path("data/channels.txt")
|
||||
if channels_file.exists():
|
||||
args.file = str(channels_file)
|
||||
print(
|
||||
"📋 No URL or --file provided, defaulting to all channels in data/channels.txt for songlist-only mode."
|
||||
"📋 No URL or --file provided, defaulting to all channels in data/channels.txt for songlist mode."
|
||||
)
|
||||
else:
|
||||
print(
|
||||
@ -377,22 +383,14 @@ Examples:
|
||||
if line.strip() and not line.strip().startswith("#")
|
||||
]
|
||||
limit = args.limit if args.limit else None
|
||||
force_refresh_download_plan = (
|
||||
args.force_download_plan if hasattr(args, "force_download_plan") else False
|
||||
)
|
||||
fuzzy_match = args.fuzzy_match if hasattr(args, "fuzzy_match") else False
|
||||
fuzzy_threshold = (
|
||||
args.fuzzy_threshold
|
||||
if hasattr(args, "fuzzy_threshold")
|
||||
else DEFAULT_FUZZY_THRESHOLD
|
||||
)
|
||||
success = downloader.download_songlist_across_channels(
|
||||
channel_urls,
|
||||
limit=limit,
|
||||
force_refresh_download_plan=force_refresh_download_plan,
|
||||
fuzzy_match=fuzzy_match,
|
||||
fuzzy_threshold=fuzzy_threshold,
|
||||
limit=args.limit,
|
||||
force_refresh_download_plan=args.force_download_plan if hasattr(args, "force_download_plan") else False,
|
||||
fuzzy_match=args.fuzzy_match,
|
||||
fuzzy_threshold=args.fuzzy_threshold,
|
||||
force_download=args.force,
|
||||
show_pagination=args.show_pagination,
|
||||
)
|
||||
elif args.latest_per_channel:
|
||||
# Use provided file or default to data/channels.txt
|
||||
|
||||
@ -81,6 +81,7 @@ def build_download_plan(
|
||||
yt_dlp_path,
|
||||
fuzzy_match=False,
|
||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||
show_pagination=False,
|
||||
):
|
||||
"""
|
||||
For each song in undownloaded, scan all channels for a match.
|
||||
@ -108,7 +109,7 @@ def build_download_plan(
|
||||
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
||||
print(f" 🔍 Fetching video list from channel...")
|
||||
available_videos = tracker.get_channel_video_list(
|
||||
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False
|
||||
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||
)
|
||||
print(
|
||||
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
|
||||
|
||||
@ -88,9 +88,9 @@ class KaraokeDownloader:
|
||||
|
||||
# Initialize tracking
|
||||
tracking_file = DATA_DIR / "karaoke_tracking.json"
|
||||
cache_file = DATA_DIR / "channel_cache.json"
|
||||
cache_dir = DATA_DIR / "channel_cache"
|
||||
self.tracker = TrackingManager(
|
||||
tracking_file=tracking_file, cache_file=cache_file
|
||||
tracking_file=tracking_file, cache_dir=cache_dir
|
||||
)
|
||||
|
||||
# Initialize song validator
|
||||
@ -316,6 +316,7 @@ class KaraokeDownloader:
|
||||
fuzzy_match=False,
|
||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||
force_download=False,
|
||||
show_pagination=False,
|
||||
):
|
||||
"""
|
||||
Download songs from the songlist across multiple channels.
|
||||
@ -508,6 +509,7 @@ class KaraokeDownloader:
|
||||
self.yt_dlp_path,
|
||||
fuzzy_match=fuzzy_match,
|
||||
fuzzy_threshold=fuzzy_threshold,
|
||||
show_pagination=show_pagination,
|
||||
)
|
||||
save_plan_cache(cache_file, download_plan, unmatched)
|
||||
print(f"💾 Download plan cached to: {cache_file}")
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
import threading
|
||||
from enum import Enum
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
class SongStatus(str, Enum):
|
||||
NOT_DOWNLOADED = "NOT_DOWNLOADED"
|
||||
@ -26,45 +26,127 @@ class TrackingManager:
|
||||
def __init__(
|
||||
self,
|
||||
tracking_file="data/karaoke_tracking.json",
|
||||
cache_file="data/channel_cache.json",
|
||||
cache_dir="data/channel_cache",
|
||||
):
|
||||
self.tracking_file = Path(tracking_file)
|
||||
self.cache_file = Path(cache_file)
|
||||
self.data = {"playlists": {}, "songs": {}}
|
||||
self.cache = {}
|
||||
self._lock = threading.Lock()
|
||||
self._load()
|
||||
self._load_cache()
|
||||
self.cache_dir = Path(cache_dir)
|
||||
|
||||
# Ensure cache directory exists
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.data = self._load()
|
||||
print(f"📊 Tracking manager initialized with {len(self.data.get('songs', {}))} tracked songs")
|
||||
|
||||
def _load(self):
    """Load tracking data from the JSON tracking file.

    Returns:
        dict: The parsed tracking data, always containing the "songs" and
        "playlists" keys.  A fresh structure (with a "last_updated"
        timestamp) is returned when the file is missing, cannot be read,
        is not valid JSON, or does not hold a JSON object.
    """
    if self.tracking_file.exists():
        try:
            with open(self.tracking_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            loaded = None

        if isinstance(loaded, dict):
            # The rest of the class indexes these keys directly.
            loaded.setdefault("songs", {})
            loaded.setdefault("playlists", {})
            return loaded

        print(f"⚠️ Corrupted tracking file, creating new one")

    return {"songs": {}, "playlists": {}, "last_updated": datetime.now().isoformat()}
|
||||
|
||||
def _save(self):
    """Save tracking data to the JSON tracking file.

    Stamps ``self.data["last_updated"]`` with the current time, makes sure
    the parent directory exists, and writes the data exactly once.  The
    JSON is written to a temporary sibling file and then renamed over the
    tracking file, so an interrupted write does not leave a truncated
    tracking file behind.
    """
    self.data["last_updated"] = datetime.now().isoformat()
    self.tracking_file.parent.mkdir(parents=True, exist_ok=True)

    tmp_file = self.tracking_file.with_name(self.tracking_file.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(self.data, f, indent=2, ensure_ascii=False)
    # Path.replace overwrites the destination with a single rename.
    tmp_file.replace(self.tracking_file)
|
||||
|
||||
def force_save(self):
    """Write the tracking data to disk right away by delegating to ``_save``."""
    self._save()
|
||||
|
||||
def _load_cache(self):
|
||||
if self.cache_file.exists():
|
||||
try:
|
||||
with open(self.cache_file, "r", encoding="utf-8") as f:
|
||||
self.cache = json.load(f)
|
||||
except Exception:
|
||||
self.cache = {}
|
||||
def _get_channel_cache_file(self, channel_id: str) -> Path:
    """Return the path of the JSON cache file that belongs to ``channel_id``.

    Each of the characters ``< > : " / \\ | ? *`` in the channel ID is
    replaced with an underscore before the ID is used as a file name
    inside ``self.cache_dir``.
    """
    unsafe_chars = '<>:"/\\|?*'
    file_stem = "".join("_" if ch in unsafe_chars else ch for ch in channel_id)
    return self.cache_dir / f"{file_stem}.json"
|
||||
|
||||
def save_cache(self):
|
||||
with open(self.cache_file, "w", encoding="utf-8") as f:
|
||||
json.dump(self.cache, f, indent=2, ensure_ascii=False)
|
||||
def _load_channel_cache(self, channel_id: str) -> List[Dict[str, str]]:
    """Load the cached video list for a specific channel.

    Args:
        channel_id: Channel identifier used to locate the cache file.

    Returns:
        The list stored under the "videos" key of the channel's cache
        file.  An empty list is returned when the file is missing, cannot
        be read, is not valid JSON, or is not shaped like
        ``{"videos": [...]}``.
    """
    cache_file = self._get_channel_cache_file(channel_id)
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No cache yet for this channel; not an error.
        return []
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        data = None

    videos = data.get('videos', []) if isinstance(data, dict) else None
    if not isinstance(videos, list):
        print(f" ⚠️ Corrupted cache file for {channel_id}, will recreate")
        return []
    return videos
|
||||
|
||||
def _save_channel_cache(self, channel_id: str, videos: List[Dict[str, str]]):
    """Save the video list cache for a specific channel.

    The cache file stores the channel ID, the video list, the time of the
    save and the number of videos.  The cache directory is created if it
    is missing, and the JSON is written to a temporary sibling file that
    is then renamed over the cache file so an interrupted write does not
    leave a truncated cache behind.

    Args:
        channel_id: Channel identifier used to locate the cache file.
        videos: Video entries to store under the "videos" key.
    """
    cache_file = self._get_channel_cache_file(channel_id)
    payload = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos),
    }

    cache_file.parent.mkdir(parents=True, exist_ok=True)
    # The ".tmp" suffix keeps the temporary file out of "*.json" globs.
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    tmp_file.replace(cache_file)
|
||||
|
||||
def _clear_channel_cache(self, channel_id: str):
    """Delete the cache file of a specific channel, if there is one.

    The file is removed directly and a missing file is treated as
    "nothing to clear", so a file that disappears between a check and the
    delete cannot raise.

    Args:
        channel_id: Channel identifier used to locate the cache file.
    """
    cache_file = self._get_channel_cache_file(channel_id)
    try:
        cache_file.unlink()
    except FileNotFoundError:
        return
    print(f" 🗑️ Cleared cache file: {cache_file.name}")
|
||||
|
||||
def get_cache_info(self):
    """Get information about all channel cache files.

    Cache files are visited in sorted path order so the report is stable
    across runs.  Files that cannot be read or parsed are reported on
    stdout and left out of "channels", but still count towards
    "total_channels".

    Returns:
        dict with:
            "total_channels": number of ``*.json`` files in the cache dir.
            "total_videos": number of cached videos across readable files.
            "channels": one dict per readable file with the keys
                "channel", "videos", "last_updated" and "file".
            "total_cached_videos", "cache_duration_hours", "last_updated":
                keys of the previous return shape, kept so callers written
                against it keep working.
    """
    cache_files = sorted(self.cache_dir.glob("*.json"))
    total_videos = 0
    cache_info = []
    latest_update = None

    for cache_file in cache_files:
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            video_count = len(data.get('videos', []))
            last_updated = data.get('last_updated', 'Unknown')
            channel = data.get('channel_id', cache_file.stem)
        except (OSError, ValueError, AttributeError, TypeError) as e:
            # ValueError: invalid JSON / bad encoding; AttributeError and
            # TypeError: JSON that is not shaped like a cache file.
            print(f"⚠️ Error reading cache file {cache_file.name}: {e}")
            continue

        total_videos += video_count
        cache_info.append({
            'channel': channel,
            'videos': video_count,
            'last_updated': last_updated,
            'file': cache_file.name,
        })
        # ISO-8601 timestamps written by this class sort chronologically
        # as plain strings.
        if isinstance(last_updated, str) and last_updated != 'Unknown':
            if latest_update is None or last_updated > latest_update:
                latest_update = last_updated

    return {
        'total_channels': len(cache_files),
        'total_videos': total_videos,
        'channels': cache_info,
        # Legacy keys from the previous return shape.
        'total_cached_videos': total_videos,
        'cache_duration_hours': 24,  # placeholder value reported previously
        'last_updated': latest_update,
    }
|
||||
|
||||
def clear_channel_cache(self, channel_id=None):
    """Clear cache for a specific channel or all channels.

    Args:
        channel_id: Channel whose cache file should be removed.  ``None``
            (or any other falsy value) and the literal string ``"all"``
            both clear every ``*.json`` file in the cache directory;
            ``"all"`` is accepted because the previous implementation of
            this method treated it that way.
    """
    if channel_id and channel_id != "all":
        self._clear_channel_cache(channel_id)
        print(f"🗑️ Cleared cache for channel: {channel_id}")
        return

    # Snapshot the listing first so files are not deleted while the
    # directory is still being iterated.
    cache_files = list(self.cache_dir.glob("*.json"))
    removed = 0
    for cache_file in cache_files:
        try:
            cache_file.unlink()
        except FileNotFoundError:
            # Already gone; nothing left to clear for this file.
            continue
        removed += 1
    print(f"🗑️ Cleared all {removed} channel cache files")
|
||||
|
||||
def set_cache_duration(self, hours):
    """Accept a cache duration in hours; currently a placeholder that does nothing."""
    return None
|
||||
|
||||
def export_playlist_report(self, playlist_id):
    """Export a report for a specific playlist.

    Args:
        playlist_id: Key of the playlist in ``self.data["playlists"]``.

    Returns:
        str: A JSON document with the playlist entry under "playlist" and
        the result of ``get_playlist_songs(playlist_id)`` under "songs",
        or the message ``"Playlist '<id>' not found."`` when the playlist
        is not tracked.
    """
    playlist = self.data.get("playlists", {}).get(playlist_id)
    if not playlist:
        return f"Playlist '{playlist_id}' not found."

    report = {"playlist": playlist, "songs": self.get_playlist_songs(playlist_id)}
    return json.dumps(report, indent=2, ensure_ascii=False)
|
||||
|
||||
def get_statistics(self):
|
||||
"""Get statistics about tracked songs."""
|
||||
total_songs = len(self.data["songs"])
|
||||
downloaded_songs = sum(
|
||||
1
|
||||
@ -102,11 +184,13 @@ class TrackingManager:
|
||||
}
|
||||
|
||||
def get_playlist_songs(self, playlist_id):
    """Get songs for a specific playlist.

    Args:
        playlist_id: Playlist identifier to match against each tracked
            song's "playlist_id" field.

    Returns:
        list: The song entries from ``self.data["songs"]`` whose
        "playlist_id" equals ``playlist_id``.  Entries without a
        "playlist_id" field are skipped instead of raising ``KeyError``.
    """
    return [
        song
        for song in self.data["songs"].values()
        if "playlist_id" in song and song["playlist_id"] == playlist_id
    ]
|
||||
|
||||
def get_failed_songs(self, playlist_id=None):
|
||||
"""Get failed songs, optionally filtered by playlist."""
|
||||
if playlist_id:
|
||||
return [
|
||||
s
|
||||
@ -118,6 +202,7 @@ class TrackingManager:
|
||||
]
|
||||
|
||||
def get_partial_downloads(self, playlist_id=None):
|
||||
"""Get partial downloads, optionally filtered by playlist."""
|
||||
if playlist_id:
|
||||
return [
|
||||
s
|
||||
@ -129,7 +214,7 @@ class TrackingManager:
|
||||
]
|
||||
|
||||
def cleanup_orphaned_files(self, downloads_dir):
|
||||
# Remove tracking entries for files that no longer exist
|
||||
"""Remove tracking entries for files that no longer exist."""
|
||||
orphaned = []
|
||||
for song_id, song in list(self.data["songs"].items()):
|
||||
file_path = song.get("file_path")
|
||||
@ -139,51 +224,17 @@ class TrackingManager:
|
||||
self.force_save()
|
||||
return orphaned
|
||||
|
||||
def get_cache_info(self):
|
||||
total_channels = len(self.cache)
|
||||
total_cached_videos = sum(len(v) for v in self.cache.values())
|
||||
cache_duration_hours = 24 # default
|
||||
last_updated = None
|
||||
return {
|
||||
"total_channels": total_channels,
|
||||
"total_cached_videos": total_cached_videos,
|
||||
"cache_duration_hours": cache_duration_hours,
|
||||
"last_updated": last_updated,
|
||||
}
|
||||
|
||||
def clear_channel_cache(self, channel_id=None):
|
||||
if channel_id is None or channel_id == "all":
|
||||
self.cache = {}
|
||||
else:
|
||||
self.cache.pop(channel_id, None)
|
||||
self.save_cache()
|
||||
|
||||
def set_cache_duration(self, hours):
|
||||
# Placeholder for cache duration logic
|
||||
pass
|
||||
|
||||
def export_playlist_report(self, playlist_id):
|
||||
playlist = self.data["playlists"].get(playlist_id)
|
||||
if not playlist:
|
||||
return f"Playlist '{playlist_id}' not found."
|
||||
songs = self.get_playlist_songs(playlist_id)
|
||||
report = {"playlist": playlist, "songs": songs}
|
||||
return json.dumps(report, indent=2, ensure_ascii=False)
|
||||
|
||||
def is_song_downloaded(self, artist, title, channel_name=None, video_id=None):
|
||||
"""
|
||||
Check if a song has already been downloaded by this system.
|
||||
Returns True if the song exists in tracking with DOWNLOADED or CONVERTED status.
|
||||
Check if a song has already been downloaded.
|
||||
Returns True if the song exists in tracking with DOWNLOADED status.
|
||||
"""
|
||||
# If we have video_id and channel_name, try direct key lookup first (most efficient)
|
||||
if video_id and channel_name:
|
||||
song_key = f"{video_id}@{channel_name}"
|
||||
if song_key in self.data["songs"]:
|
||||
song_data = self.data["songs"][song_key]
|
||||
if song_data.get("status") in [
|
||||
SongStatus.DOWNLOADED,
|
||||
SongStatus.CONVERTED,
|
||||
]:
|
||||
if song_data.get("status") == SongStatus.DOWNLOADED:
|
||||
return True
|
||||
|
||||
# Fallback to content search (for cases where we don't have video_id)
|
||||
@ -191,19 +242,14 @@ class TrackingManager:
|
||||
# Check if this song matches the artist and title
|
||||
if song_data.get("artist") == artist and song_data.get("title") == title:
|
||||
# Check if it's marked as downloaded
|
||||
if song_data.get("status") in [
|
||||
SongStatus.DOWNLOADED,
|
||||
SongStatus.CONVERTED,
|
||||
]:
|
||||
if song_data.get("status") == SongStatus.DOWNLOADED:
|
||||
return True
|
||||
# Also check the video title field which might contain the song info
|
||||
video_title = song_data.get("video_title", "")
|
||||
if video_title and artist in video_title and title in video_title:
|
||||
if song_data.get("status") in [
|
||||
SongStatus.DOWNLOADED,
|
||||
SongStatus.CONVERTED,
|
||||
]:
|
||||
if song_data.get("status") == SongStatus.DOWNLOADED:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def is_file_exists(self, file_path):
|
||||
@ -283,65 +329,248 @@ class TrackingManager:
|
||||
self._save()
|
||||
|
||||
def get_channel_video_list(
    self, channel_url, yt_dlp_path="downloader/yt-dlp.exe", force_refresh=False, show_pagination=False
):
    """
    Return a list of videos (dicts with 'title' and 'id') for the channel, using cache if available unless force_refresh is True.

    Args:
        channel_url: YouTube channel URL
        yt_dlp_path: Path to yt-dlp executable
        force_refresh: Force refresh cache even if available
        show_pagination: Show page-by-page progress (slower but more detailed)

    Returns:
        list: The cached or freshly fetched video entries, or an empty
        list when no channel ID can be extracted from ``channel_url``.
    """
    from karaoke_downloader.youtube_utils import get_channel_info

    channel_name, channel_id = get_channel_info(channel_url)

    if not channel_id:
        print(f" ❌ Could not extract channel ID from URL: {channel_url}")
        return []

    print(f" 🔍 Channel: {channel_name} (ID: {channel_id})")

    # Check if we have cached data for this channel
    if not force_refresh:
        cached_videos = self._load_channel_cache(channel_id)
        if cached_videos:
            # Spot-check the first 10 entries for IDs that look like titles
            # instead of proper YouTube IDs.
            corrupted_id = None
            for video in cached_videos[:10]:
                video_id = video.get("id", "")
                if video_id and (len(video_id) > 20 or " " in video_id or "Lyrics" in video_id):
                    corrupted_id = video_id
                    break

            if corrupted_id is None:
                print(f" 📋 Using cached video list ({len(cached_videos)} videos)")
                return cached_videos

            # Drop the bad cache and fall through to a fresh fetch.
            print(f" ⚠️ Detected corrupted video ID in cache: '{corrupted_id}'")
            print(f" 🧹 Clearing corrupted cache for {channel_id}")
            self._clear_channel_cache(channel_id)

    # Choose fetch method based on show_pagination flag
    if show_pagination:
        return self._fetch_videos_with_pagination(channel_url, channel_id, yt_dlp_path)
    return self._fetch_videos_flat_playlist(channel_url, channel_id, yt_dlp_path)
|
||||
|
||||
def _fetch_videos_with_pagination(self, channel_url, channel_id, yt_dlp_path):
    """Fetch videos showing page-by-page progress.

    Runs yt-dlp once per page of 50 playlist entries and parses each
    ``title|id|url`` output line.  Lines are split from the right so a
    ``|`` inside a video title does not shift the ID column.

    The result is written to the channel cache only when the listing ran
    to its end (or hit the page limit).  When a page times out or yt-dlp
    fails, the videos fetched so far are still returned but not cached,
    so a transient failure does not leave a truncated list in the cache.

    Args:
        channel_url: YouTube channel URL passed to yt-dlp.
        channel_id: Channel identifier used for the cache file.
        yt_dlp_path: Path to the yt-dlp executable.

    Returns:
        list: Dicts with 'title' and 'id', or an empty list when no valid
        video was found.
    """
    print(f" 🌐 Fetching video list from YouTube (page-by-page mode)...")
    print(f" 📡 Channel URL: {channel_url}")

    import subprocess

    all_videos = []
    page = 1
    videos_per_page = 50  # YouTube typically shows 50 videos per page
    fetch_complete = True

    while True:
        print(f" 📄 Fetching page {page}...")

        # Fetch one page at a time
        cmd = [
            yt_dlp_path,
            "--flat-playlist",
            "--print",
            "%(title)s|%(id)s|%(url)s",
            "--playlist-start",
            str((page - 1) * videos_per_page + 1),
            "--playlist-end",
            str(page * videos_per_page),
            channel_url,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
        except subprocess.TimeoutExpired:
            print(f" ⚠️ Page {page} timed out, stopping")
            fetch_complete = False
            break
        except subprocess.CalledProcessError as e:
            print(f" ❌ Error fetching page {page}: {e}")
            fetch_complete = False
            break

        lines = result.stdout.strip().splitlines()
        if not lines:
            print(f" ✅ No more videos found on page {page}")
            break

        print(f" 📊 Page {page}: Found {len(lines)} videos")

        page_videos = []
        invalid_count = 0

        for line in lines:
            # Split from the right: the ID and URL never contain "|",
            # but the title may.
            parts = line.rsplit("|", 2)
            if len(parts) < 2:
                continue
            title, video_id = parts[0].strip(), parts[1].strip()

            # Validate that video_id looks like a proper YouTube ID
            if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
                page_videos.append({"title": title, "id": video_id})
            else:
                invalid_count += 1
                if invalid_count <= 3:  # Show first 3 invalid IDs per page
                    print(f" ⚠️ Invalid ID: '{video_id}' for '{title}'")

        if invalid_count > 3:
            print(f" ⚠️ ... and {invalid_count - 3} more invalid IDs on this page")

        all_videos.extend(page_videos)
        print(f" ✅ Page {page}: Added {len(page_videos)} valid videos (total: {len(all_videos)})")

        # If we got fewer videos than expected, we're probably at the end
        if len(lines) < videos_per_page:
            print(f" 🏁 Reached end of channel (last page had {len(lines)} videos)")
            break

        page += 1

        # Safety check to prevent infinite loops
        if page > 200:  # Max 200 pages (10,000 videos)
            print(f" ⚠️ Reached maximum page limit (200 pages), stopping")
            break

    if not all_videos:
        print(f" ❌ No valid videos found")
        return []

    print(f" 🎉 Channel download complete!")
    print(f" 📊 Total videos fetched: {len(all_videos)}")

    if fetch_complete:
        # Save to individual channel cache file
        self._save_channel_cache(channel_id, all_videos)
        print(f" 💾 Saved cache to: {self._get_channel_cache_file(channel_id).name}")

    return all_videos
|
||||
|
||||
def _fetch_videos_flat_playlist(self, channel_url, channel_id, yt_dlp_path):
    """Fetch all videos using flat playlist (faster but less detailed progress).

    First probes the channel with a one-entry yt-dlp call (informational
    only; its outcome is printed and otherwise ignored), then lists the
    whole channel in a single yt-dlp call with a 5 minute timeout and
    parses each ``title|id|url`` output line.  Lines are split from the
    right so a ``|`` inside a video title does not shift the ID column.
    Valid entries are saved to the channel's cache file.

    Args:
        channel_url: YouTube channel URL passed to yt-dlp.
        channel_id: Channel identifier used for the cache file.
        yt_dlp_path: Path to the yt-dlp executable.

    Returns:
        list: Dicts with 'title' and 'id', or an empty list when yt-dlp
        times out, fails, or yields no valid video.
    """
    # Fetch with yt-dlp
    print(" 🌐 Fetching video list from YouTube (this may take a while)...")
    print(f" 📡 Channel URL: {channel_url}")

    import subprocess

    # Probe the channel with a single entry before the full listing.
    count_cmd = [
        yt_dlp_path,
        "--flat-playlist",
        "--print",
        "%(title)s",
        "--playlist-end",
        "1",  # Just get first video to test
        channel_url,
    ]

    try:
        print(" 🔍 Testing channel access...")
        test_result = subprocess.run(count_cmd, capture_output=True, text=True, timeout=30)
        if test_result.returncode == 0:
            print(" ✅ Channel is accessible")
        else:
            print(f" ⚠️ Channel test failed: {test_result.stderr}")
    except subprocess.TimeoutExpired:
        print(" ⚠️ Channel test timed out")
    except (OSError, subprocess.SubprocessError) as e:
        print(f" ⚠️ Channel test error: {e}")

    # Now fetch all videos with progress indicators
    cmd = [
        yt_dlp_path,
        "--flat-playlist",
        "--print",
        "%(title)s|%(id)s|%(url)s",
        "--verbose",  # Add verbose output to see what's happening
        channel_url,
    ]

    try:
        print(f" 🔧 Running yt-dlp command: {' '.join(cmd)}")
        print(" 📥 Starting video list download...")
        # Single run, bounded by a timeout.
        result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
    except subprocess.TimeoutExpired:
        print("❌ yt-dlp timed out after 5 minutes - channel may be too large")
        return []
    except subprocess.CalledProcessError as e:
        print(f"❌ yt-dlp failed to fetch playlist for cache: {e}")
        print(f" 📄 stderr: {e.stderr}")
        return []

    lines = result.stdout.strip().splitlines()

    print(f" 📄 Raw output lines: {len(lines)}")
    print(" 📊 Download completed successfully!")

    # Show some sample lines to understand the format
    if lines:
        print(" 📋 Sample output format:")
        for i, line in enumerate(lines[:3]):
            print(f" Line {i+1}: {line[:100]}...")
        if len(lines) > 3:
            print(f" ... and {len(lines) - 3} more lines")

    videos = []
    invalid_count = 0

    print(f" 🔍 Processing {len(lines)} video entries...")

    for i, line in enumerate(lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f" 📊 Processing line {i}/{len(lines)}... ({i/len(lines)*100:.1f}%)")

        # Split from the right: the ID and URL never contain "|", but the
        # title may.
        parts = line.rsplit("|", 2)
        if len(parts) < 2:
            continue
        title, video_id = parts[0].strip(), parts[1].strip()

        # Validate that video_id looks like a proper YouTube ID
        if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
            videos.append({"title": title, "id": video_id})
        else:
            invalid_count += 1
            if invalid_count <= 5:  # Only show first 5 invalid IDs
                print(f" ⚠️ Skipping invalid video ID: '{video_id}' for title: '{title}'")
            elif invalid_count == 6:
                # The final count is only known after the loop and is
                # printed in the summary below.
                print(" ⚠️ ... further invalid IDs are not listed individually")

    if not videos:
        print(" ❌ No valid videos found after parsing")
        return []

    print(f" ✅ Parsed {len(videos)} valid videos from YouTube")
    print(f" ⚠️ Skipped {invalid_count} invalid video IDs")

    # Save to individual channel cache file
    self._save_channel_cache(channel_id, videos)
    print(f" 💾 Saved cache to: {self._get_channel_cache_file(channel_id).name}")

    return videos
|
||||
|
||||
@ -106,6 +106,10 @@ def download_single_video(
|
||||
print(f"⬇️ Downloading: {artist} - {title} -> {output_path}")
|
||||
|
||||
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
||||
|
||||
# Debug: Show the video_id and URL being used
|
||||
print(f"🔍 DEBUG: video_id = '{video_id}'")
|
||||
print(f"🔍 DEBUG: video_url = '{video_url}'")
|
||||
|
||||
# Build command using centralized utility
|
||||
cmd = build_yt_dlp_command(yt_dlp_path, video_url, output_path, config)
|
||||
@ -255,7 +259,7 @@ def execute_download_plan(
|
||||
video_id = item["video_id"]
|
||||
video_title = item["video_title"]
|
||||
|
||||
print(f"\n⬇️ Downloading {len(download_plan) - idx} of {total_to_download}:")
|
||||
print(f"\n⬇️ Downloading {downloaded_count + 1} of {total_to_download}:")
|
||||
print(f" 📋 Songlist: {artist} - {title}")
|
||||
print(f" 🎬 Video: {video_title} ({channel_name})")
|
||||
if "match_score" in item:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user