Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>
This commit is contained in:
parent
ea07188739
commit
f914d54067
122
build_cache_from_raw.py
Normal file
122
build_cache_from_raw.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Script to build channel cache from raw yt-dlp output file.
|
||||||
|
This uses the fixed parsing logic to handle titles with | characters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def parse_raw_output_file(raw_file_path):
    """Parse a saved raw yt-dlp output file and extract valid videos.

    Each data line is expected to look like
    ``  123: <title>|<video_id>|https://www.youtube.com/watch?v=<video_id>``
    (the leading ``NNN:`` prefix is written by the raw-dump code). The video
    ID is recovered from the watch URL, which cannot contain user-controlled
    ``|`` characters, so titles that themselves contain ``|`` parse correctly.

    Args:
        raw_file_path: Path (or str) to the raw output file (UTF-8).

    Returns:
        list[dict]: one ``{"title": ..., "id": ...}`` dict per valid video.
    """
    videos = []
    invalid_count = 0

    print(f"🔍 Parsing raw output file: {raw_file_path}")

    with open(raw_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip header/comment lines (starting with '#') and blank lines.
    data_lines = [line for line in lines if not line.strip().startswith('#') and line.strip()]

    print(f"📄 Found {len(data_lines)} data lines to process")

    # Hoist regex compilation out of the per-line loop (the re module caches
    # patterns, but the per-call cache lookup is still avoidable work).
    line_no_re = re.compile(r'^\s*\d+:\s*')
    watch_url_re = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')

    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{len(data_lines)}... ({i/len(data_lines)*100:.1f}%)")

        # Remove line number prefix (e.g., "  1234: ")
        line = line_no_re.sub('', line.strip())

        # Extract the video ID from the watch URL yt-dlp emits; this is
        # robust against titles containing '|' characters.
        url_match = watch_url_re.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= 5:  # Only show the first few problem lines
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == 6:
                print("⚠️ ... suppressing further invalid-line messages")
            continue

        video_id = url_match.group(1)

        # Title is everything before the first occurrence of the bare ID
        # (the "title|id|url" layout places the ID before the URL).
        title = line[:line.find(video_id)].rstrip('|').strip()

        # The URL regex already guarantees an 11-char [A-Za-z0-9_-] ID, so
        # length/charset re-checks are redundant (the old isalnum() re-check
        # even rejected valid IDs made only of '-'/'_'). Keep the keyword
        # blacklist that guards against title fragments mis-parsed as IDs.
        if (
            "Lyrics" not in video_id
            and "KARAOKE" not in video_id.upper()
            and "Vocal" not in video_id
            and "Guide" not in video_id
        ):
            videos.append({"title": title, "id": video_id})
        else:
            invalid_count += 1
            if invalid_count <= 5:  # Only show first 5 invalid IDs
                print(f"⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
            elif invalid_count == 6:
                print("⚠️ ... suppressing further invalid-ID messages")

    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")

    return videos
|
||||||
|
|
||||||
|
def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Write the parsed video list to a JSON cache file.

    Args:
        channel_id: Channel identifier; a sanitized form becomes the filename.
        videos: List of ``{"title", "id"}`` dicts to cache.
        cache_dir: Directory for cache files; created if missing.

    Returns:
        Path of the written cache file.
    """
    target_dir = Path(cache_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Replace characters that are illegal in Windows filenames.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = target_dir / f"{safe_name}.json"

    payload = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos),
    }

    # Serialize first, then write in one shot (equivalent to json.dump).
    cache_file.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file
|
||||||
|
|
||||||
|
def main():
    """Build the channel cache for @VocalStarKaraoke from its raw dump file."""
    channel_id = "@VocalStarKaraoke"
    raw_file_path = Path("data/channel_cache/@VocalStarKaraoke_raw_output.txt")

    # Bail out early if the raw dump is missing.
    if not raw_file_path.exists():
        print(f"❌ Raw output file not found: {raw_file_path}")
        return

    # Parse the raw output into {"title", "id"} dicts.
    videos = parse_raw_output_file(raw_file_path)
    if not videos:
        print("❌ No valid videos found")
        return

    # Persist the parsed list as the channel's JSON cache.
    cache_file = save_cache_file(channel_id, videos)

    for message in (
        f"🎉 Cache build complete!",
        f"📊 Total videos in cache: {len(videos)}",
        f"📁 Cache file: {cache_file}",
    ):
        print(message)


if __name__ == "__main__":
    main()
|
||||||
236590
data/channel_cache.json
236590
data/channel_cache.json
File diff suppressed because it is too large
Load Diff
@ -110,6 +110,17 @@ Examples:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Show page-by-page progress when downloading channel video lists (slower but more detailed)",
|
help="Show page-by-page progress when downloading channel video lists (slower but more detailed)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--parallel-channels",
|
||||||
|
action="store_true",
|
||||||
|
help="Enable parallel channel scanning for faster channel processing (scans multiple channels simultaneously)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--channel-workers",
|
||||||
|
type=int,
|
||||||
|
default=3,
|
||||||
|
help="Number of parallel channel scanning workers (default: 3, max: 10)",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--songlist-only",
|
"--songlist-only",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@ -183,7 +194,7 @@ Examples:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--parallel",
|
"--parallel",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Enable parallel downloads for improved speed",
|
help="Enable parallel downloads for improved speed (3-5x faster for large batches)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--workers",
|
"--workers",
|
||||||
@ -209,6 +220,11 @@ Examples:
|
|||||||
print("❌ Error: --workers must be between 1 and 10")
|
print("❌ Error: --workers must be between 1 and 10")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Validate channel workers argument
|
||||||
|
if args.channel_workers < 1 or args.channel_workers > 10:
|
||||||
|
print("❌ Error: --channel-workers must be between 1 and 10")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
yt_dlp_path = Path("downloader/yt-dlp.exe")
|
yt_dlp_path = Path("downloader/yt-dlp.exe")
|
||||||
if not yt_dlp_path.exists():
|
if not yt_dlp_path.exists():
|
||||||
print("❌ Error: yt-dlp.exe not found in downloader/ directory")
|
print("❌ Error: yt-dlp.exe not found in downloader/ directory")
|
||||||
@ -391,6 +407,8 @@ Examples:
|
|||||||
fuzzy_threshold=args.fuzzy_threshold,
|
fuzzy_threshold=args.fuzzy_threshold,
|
||||||
force_download=args.force,
|
force_download=args.force,
|
||||||
show_pagination=args.show_pagination,
|
show_pagination=args.show_pagination,
|
||||||
|
parallel_channels=args.parallel_channels,
|
||||||
|
max_channel_workers=args.channel_workers,
|
||||||
)
|
)
|
||||||
elif args.latest_per_channel:
|
elif args.latest_per_channel:
|
||||||
# Use provided file or default to data/channels.txt
|
# Use provided file or default to data/channels.txt
|
||||||
|
|||||||
@ -3,10 +3,13 @@ Download plan building utilities.
|
|||||||
Handles pre-scanning channels and building download plans.
|
Handles pre-scanning channels and building download plans.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import concurrent.futures
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from karaoke_downloader.cache_manager import (
|
from karaoke_downloader.cache_manager import (
|
||||||
delete_plan_cache,
|
delete_plan_cache,
|
||||||
@ -16,12 +19,13 @@ from karaoke_downloader.cache_manager import (
|
|||||||
)
|
)
|
||||||
from karaoke_downloader.fuzzy_matcher import (
|
from karaoke_downloader.fuzzy_matcher import (
|
||||||
create_song_key,
|
create_song_key,
|
||||||
extract_artist_title,
|
create_video_key,
|
||||||
get_similarity_function,
|
get_similarity_function,
|
||||||
is_exact_match,
|
is_exact_match,
|
||||||
is_fuzzy_match,
|
is_fuzzy_match,
|
||||||
normalize_title,
|
normalize_title,
|
||||||
)
|
)
|
||||||
|
from karaoke_downloader.id3_utils import extract_artist_title
|
||||||
from karaoke_downloader.youtube_utils import get_channel_info
|
from karaoke_downloader.youtube_utils import get_channel_info
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
@ -29,6 +33,7 @@ DEFAULT_FILENAME_LENGTH_LIMIT = 100
|
|||||||
DEFAULT_ARTIST_LENGTH_LIMIT = 30
|
DEFAULT_ARTIST_LENGTH_LIMIT = 30
|
||||||
DEFAULT_TITLE_LENGTH_LIMIT = 60
|
DEFAULT_TITLE_LENGTH_LIMIT = 60
|
||||||
DEFAULT_FUZZY_THRESHOLD = 85
|
DEFAULT_FUZZY_THRESHOLD = 85
|
||||||
|
DEFAULT_DISPLAY_LIMIT = 10
|
||||||
|
|
||||||
|
|
||||||
def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str = None) -> str:
|
def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str = None) -> str:
|
||||||
@ -74,6 +79,111 @@ def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str
|
|||||||
return str(report_file)
|
return str(report_file)
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_channel_for_matches(
|
||||||
|
channel_url,
|
||||||
|
channel_name,
|
||||||
|
channel_id,
|
||||||
|
song_keys,
|
||||||
|
song_lookup,
|
||||||
|
fuzzy_match,
|
||||||
|
fuzzy_threshold,
|
||||||
|
show_pagination,
|
||||||
|
yt_dlp_path,
|
||||||
|
tracker,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Scan a single channel for matches (used in parallel processing).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel_url: URL of the channel to scan
|
||||||
|
channel_name: Name of the channel
|
||||||
|
channel_id: ID of the channel
|
||||||
|
song_keys: Set of song keys to match against
|
||||||
|
song_lookup: Dictionary mapping song keys to song data
|
||||||
|
fuzzy_match: Whether to use fuzzy matching
|
||||||
|
fuzzy_threshold: Threshold for fuzzy matching
|
||||||
|
show_pagination: Whether to show pagination progress
|
||||||
|
yt_dlp_path: Path to yt-dlp executable
|
||||||
|
tracker: Tracking manager instance
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of video matches found in this channel
|
||||||
|
"""
|
||||||
|
print(f"\n🚦 Scanning channel: {channel_name} ({channel_url})")
|
||||||
|
|
||||||
|
# Get channel info if not provided
|
||||||
|
if not channel_name or not channel_id:
|
||||||
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
|
||||||
|
# Fetch video list from channel
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f" 📊 Channel has {len(available_videos)} videos to scan")
|
||||||
|
|
||||||
|
video_matches = []
|
||||||
|
|
||||||
|
# Pre-process video titles for efficient matching
|
||||||
|
if fuzzy_match:
|
||||||
|
# For fuzzy matching, create normalized video keys
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
# Find best match among remaining songs
|
||||||
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
for song_key in song_keys:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
|
if best_match:
|
||||||
|
song = song_lookup[best_match]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
song_keys.remove(best_match)
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
if video_key in song_keys:
|
||||||
|
song = song_lookup[video_key]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[video_key]
|
||||||
|
song_keys.remove(video_key)
|
||||||
|
|
||||||
|
print(f" ✅ Found {len(video_matches)} matches in {channel_name}")
|
||||||
|
return video_matches
|
||||||
|
|
||||||
|
|
||||||
def build_download_plan(
|
def build_download_plan(
|
||||||
channel_urls,
|
channel_urls,
|
||||||
undownloaded,
|
undownloaded,
|
||||||
@ -82,6 +192,8 @@ def build_download_plan(
|
|||||||
fuzzy_match=False,
|
fuzzy_match=False,
|
||||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||||
show_pagination=False,
|
show_pagination=False,
|
||||||
|
parallel_channels=False,
|
||||||
|
max_channel_workers=3,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
For each song in undownloaded, scan all channels for a match.
|
For each song in undownloaded, scan all channels for a match.
|
||||||
@ -102,85 +214,198 @@ def build_download_plan(
|
|||||||
song_keys.add(key)
|
song_keys.add(key)
|
||||||
song_lookup[key] = song
|
song_lookup[key] = song
|
||||||
|
|
||||||
for i, channel_url in enumerate(channel_urls, 1):
|
if parallel_channels:
|
||||||
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}")
|
print(f"🚀 Running parallel channel scanning with {max_channel_workers} workers.")
|
||||||
print(f" 🔍 Getting channel info...")
|
|
||||||
channel_name, channel_id = get_channel_info(channel_url)
|
# Create a thread-safe copy of song data for parallel processing
|
||||||
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
import threading
|
||||||
print(f" 🔍 Fetching video list from channel...")
|
song_keys_lock = threading.Lock()
|
||||||
available_videos = tracker.get_channel_video_list(
|
song_lookup_lock = threading.Lock()
|
||||||
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
|
||||||
)
|
def scan_channel_safe(channel_url):
|
||||||
print(
|
"""Thread-safe channel scanning function."""
|
||||||
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
|
print(f"\n🚦 Scanning channel: {channel_url}")
|
||||||
)
|
|
||||||
matches_this_channel = 0
|
# Get channel info
|
||||||
video_matches = [] # Initialize video_matches for this channel
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
||||||
|
|
||||||
|
# Fetch video list from channel
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
print(f" 📊 Channel has {len(available_videos)} videos to scan")
|
||||||
|
|
||||||
|
video_matches = []
|
||||||
|
|
||||||
|
# Pre-process video titles for efficient matching
|
||||||
|
if fuzzy_match:
|
||||||
|
# For fuzzy matching, create normalized video keys
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
# Pre-process video titles for efficient matching
|
# Find best match among remaining songs (thread-safe)
|
||||||
if fuzzy_match:
|
best_match = None
|
||||||
# For fuzzy matching, create normalized video keys
|
best_score = 0
|
||||||
for video in available_videos:
|
with song_keys_lock:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
available_song_keys = list(song_keys) # Copy for iteration
|
||||||
video_key = create_song_key(v_artist, v_title)
|
|
||||||
|
for song_key in available_song_keys:
|
||||||
|
with song_lookup_lock:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
# Find best match among remaining songs
|
if best_match:
|
||||||
best_match = None
|
with song_lookup_lock:
|
||||||
best_score = 0
|
if best_match in song_lookup: # Double-check it's still available
|
||||||
for song_key in song_keys:
|
song = song_lookup[best_match]
|
||||||
if song_key in song_lookup: # Only check unmatched songs
|
video_matches.append(
|
||||||
score = get_similarity_function()(song_key, video_key)
|
{
|
||||||
if score >= fuzzy_threshold and score > best_score:
|
"artist": song["artist"],
|
||||||
best_score = score
|
"title": song["title"],
|
||||||
best_match = song_key
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
with song_keys_lock:
|
||||||
|
song_keys.discard(best_match)
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
if best_match:
|
with song_lookup_lock:
|
||||||
song = song_lookup[best_match]
|
if video_key in song_keys and video_key in song_lookup:
|
||||||
video_matches.append(
|
song = song_lookup[video_key]
|
||||||
{
|
video_matches.append(
|
||||||
"artist": song["artist"],
|
{
|
||||||
"title": song["title"],
|
"artist": song["artist"],
|
||||||
"channel_name": channel_name,
|
"title": song["title"],
|
||||||
"channel_url": channel_url,
|
"channel_name": channel_name,
|
||||||
"video_id": video["id"],
|
"channel_url": channel_url,
|
||||||
"video_title": video["title"],
|
"video_id": video["id"],
|
||||||
"match_score": best_score,
|
"video_title": video["title"],
|
||||||
}
|
"match_score": 100,
|
||||||
)
|
}
|
||||||
# Remove matched song from future consideration
|
)
|
||||||
del song_lookup[best_match]
|
# Remove matched song from future consideration
|
||||||
song_keys.remove(best_match)
|
del song_lookup[video_key]
|
||||||
matches_this_channel += 1
|
with song_keys_lock:
|
||||||
else:
|
song_keys.discard(video_key)
|
||||||
# For exact matching, use direct key comparison
|
|
||||||
for video in available_videos:
|
print(f" ✅ Found {len(video_matches)} matches in {channel_name}")
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
return video_matches
|
||||||
video_key = create_song_key(v_artist, v_title)
|
|
||||||
|
# Execute parallel channel scanning
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_channel_workers) as executor:
|
||||||
|
# Submit all channel scanning tasks
|
||||||
|
future_to_channel = {
|
||||||
|
executor.submit(scan_channel_safe, channel_url): channel_url
|
||||||
|
for channel_url in channel_urls
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process results as they complete
|
||||||
|
for future in concurrent.futures.as_completed(future_to_channel):
|
||||||
|
channel_url = future_to_channel[future]
|
||||||
|
try:
|
||||||
|
video_matches = future.result()
|
||||||
|
plan.extend(video_matches)
|
||||||
|
channel_name, _ = get_channel_info(channel_url)
|
||||||
|
channel_match_counts[channel_name] = len(video_matches)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ Error processing channel {channel_url}: {e}")
|
||||||
|
channel_name, _ = get_channel_info(channel_url)
|
||||||
|
channel_match_counts[channel_name] = 0
|
||||||
|
else:
|
||||||
|
for i, channel_url in enumerate(channel_urls, 1):
|
||||||
|
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}")
|
||||||
|
print(f" 🔍 Getting channel info...")
|
||||||
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
||||||
|
print(f" 🔍 Fetching video list from channel...")
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
|
||||||
|
)
|
||||||
|
matches_this_channel = 0
|
||||||
|
video_matches = [] # Initialize video_matches for this channel
|
||||||
|
|
||||||
if video_key in song_keys:
|
# Pre-process video titles for efficient matching
|
||||||
song = song_lookup[video_key]
|
if fuzzy_match:
|
||||||
video_matches.append(
|
# For fuzzy matching, create normalized video keys
|
||||||
{
|
for video in available_videos:
|
||||||
"artist": song["artist"],
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
"title": song["title"],
|
video_key = create_song_key(v_artist, v_title)
|
||||||
"channel_name": channel_name,
|
|
||||||
"channel_url": channel_url,
|
|
||||||
"video_id": video["id"],
|
|
||||||
"video_title": video["title"],
|
|
||||||
"match_score": 100,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
# Remove matched song from future consideration
|
|
||||||
del song_lookup[video_key]
|
|
||||||
song_keys.remove(video_key)
|
|
||||||
matches_this_channel += 1
|
|
||||||
|
|
||||||
# Add matches to plan
|
# Find best match among remaining songs
|
||||||
plan.extend(video_matches)
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
for song_key in song_keys:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
# Print match count once per channel
|
if best_match:
|
||||||
channel_match_counts[channel_name] = matches_this_channel
|
song = song_lookup[best_match]
|
||||||
print(f" → Found {matches_this_channel} songlist matches in this channel.")
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
song_keys.remove(best_match)
|
||||||
|
matches_this_channel += 1
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
if video_key in song_keys:
|
||||||
|
song = song_lookup[video_key]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[video_key]
|
||||||
|
song_keys.remove(video_key)
|
||||||
|
matches_this_channel += 1
|
||||||
|
|
||||||
|
# Add matches to plan
|
||||||
|
plan.extend(video_matches)
|
||||||
|
|
||||||
|
# Print match count once per channel
|
||||||
|
channel_match_counts[channel_name] = matches_this_channel
|
||||||
|
print(f" → Found {matches_this_channel} songlist matches in this channel.")
|
||||||
|
|
||||||
# Remaining unmatched songs
|
# Remaining unmatched songs
|
||||||
unmatched = list(song_lookup.values())
|
unmatched = list(song_lookup.values())
|
||||||
|
|||||||
@ -317,6 +317,8 @@ class KaraokeDownloader:
|
|||||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||||
force_download=False,
|
force_download=False,
|
||||||
show_pagination=False,
|
show_pagination=False,
|
||||||
|
parallel_channels=False,
|
||||||
|
max_channel_workers=3,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Download songs from the songlist across multiple channels.
|
Download songs from the songlist across multiple channels.
|
||||||
@ -510,6 +512,8 @@ class KaraokeDownloader:
|
|||||||
fuzzy_match=fuzzy_match,
|
fuzzy_match=fuzzy_match,
|
||||||
fuzzy_threshold=fuzzy_threshold,
|
fuzzy_threshold=fuzzy_threshold,
|
||||||
show_pagination=show_pagination,
|
show_pagination=show_pagination,
|
||||||
|
parallel_channels=parallel_channels,
|
||||||
|
max_channel_workers=max_channel_workers,
|
||||||
)
|
)
|
||||||
save_plan_cache(cache_file, download_plan, unmatched)
|
save_plan_cache(cache_file, download_plan, unmatched)
|
||||||
print(f"💾 Download plan cached to: {cache_file}")
|
print(f"💾 Download plan cached to: {cache_file}")
|
||||||
|
|||||||
@ -359,9 +359,18 @@ class TrackingManager:
|
|||||||
corrupted = False
|
corrupted = False
|
||||||
|
|
||||||
# Check if any video IDs look like titles instead of proper YouTube IDs
|
# Check if any video IDs look like titles instead of proper YouTube IDs
|
||||||
for video in cached_videos[:10]: # Check first 10 videos
|
for video in cached_videos[:20]: # Check first 20 videos
|
||||||
video_id = video.get("id", "")
|
video_id = video.get("id", "")
|
||||||
if video_id and (len(video_id) > 20 or " " in video_id or "Lyrics" in video_id):
|
# More comprehensive validation - YouTube IDs should be 11 characters and contain only alphanumeric, hyphens, and underscores
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) != 11 or
|
||||||
|
not video_id.replace('-', '').replace('_', '').isalnum() or
|
||||||
|
" " in video_id or
|
||||||
|
"Lyrics" in video_id or
|
||||||
|
"KARAOKE" in video_id.upper() or
|
||||||
|
"Vocal" in video_id or
|
||||||
|
"Guide" in video_id
|
||||||
|
):
|
||||||
print(f" ⚠️ Detected corrupted video ID in cache: '{video_id}'")
|
print(f" ⚠️ Detected corrupted video ID in cache: '{video_id}'")
|
||||||
corrupted = True
|
corrupted = True
|
||||||
break
|
break
|
||||||
@ -389,7 +398,7 @@ class TrackingManager:
|
|||||||
|
|
||||||
all_videos = []
|
all_videos = []
|
||||||
page = 1
|
page = 1
|
||||||
videos_per_page = 50 # YouTube typically shows 50 videos per page
|
videos_per_page = 200 # YouTube/yt-dlp supports up to 200 videos per page, reducing API calls and errors
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print(f" 📄 Fetching page {page}...")
|
print(f" 📄 Fetching page {page}...")
|
||||||
@ -408,9 +417,26 @@ class TrackingManager:
|
|||||||
]
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
# Increased timeout to 180 seconds for larger pages (200 videos)
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=180)
|
||||||
lines = result.stdout.strip().splitlines()
|
lines = result.stdout.strip().splitlines()
|
||||||
|
|
||||||
|
# Save raw output for debugging (for each page)
|
||||||
|
raw_output_file = self._get_channel_cache_file(channel_id).parent / f"{channel_id}_raw_output_page{page}.txt"
|
||||||
|
try:
|
||||||
|
with open(raw_output_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(f"# Raw yt-dlp output for {channel_id} - Page {page}\n")
|
||||||
|
f.write(f"# Channel URL: {channel_url}\n")
|
||||||
|
f.write(f"# Command: {' '.join(cmd)}\n")
|
||||||
|
f.write(f"# Timestamp: {datetime.now().isoformat()}\n")
|
||||||
|
f.write(f"# Total lines: {len(lines)}\n")
|
||||||
|
f.write("#" * 80 + "\n\n")
|
||||||
|
for i, line in enumerate(lines, 1):
|
||||||
|
f.write(f"{i:6d}: {line}\n")
|
||||||
|
print(f" 💾 Saved raw output to: {raw_output_file.name}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ Could not save raw output: {e}")
|
||||||
|
|
||||||
if not lines:
|
if not lines:
|
||||||
print(f" ✅ No more videos found on page {page}")
|
print(f" ✅ No more videos found on page {page}")
|
||||||
break
|
break
|
||||||
@ -421,17 +447,38 @@ class TrackingManager:
|
|||||||
invalid_count = 0
|
invalid_count = 0
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
parts = line.split("|")
|
if not line.strip():
|
||||||
if len(parts) >= 2:
|
continue
|
||||||
title, video_id = parts[0].strip(), parts[1].strip()
|
|
||||||
|
# More robust parsing that handles titles with | characters
|
||||||
# Validate that video_id looks like a proper YouTube ID
|
# Extract video ID directly from the URL that yt-dlp provides
|
||||||
if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
|
|
||||||
page_videos.append({"title": title, "id": video_id})
|
# Find the URL and extract video ID from it
|
||||||
else:
|
url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', line)
|
||||||
invalid_count += 1
|
if not url_match:
|
||||||
if invalid_count <= 3: # Show first 3 invalid IDs per page
|
continue
|
||||||
print(f" ⚠️ Invalid ID: '{video_id}' for '{title}'")
|
|
||||||
|
# Extract video ID directly from the URL
|
||||||
|
video_id = url_match.group(1)
|
||||||
|
|
||||||
|
# Extract title (everything before the video ID in the line)
|
||||||
|
title = line[:line.find(video_id)].rstrip('|').strip()
|
||||||
|
|
||||||
|
# Validate video ID
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) == 11 and
|
||||||
|
video_id.replace('-', '').replace('_', '').isalnum() and
|
||||||
|
" " not in video_id and
|
||||||
|
"Lyrics" not in video_id and
|
||||||
|
"KARAOKE" not in video_id.upper() and
|
||||||
|
"Vocal" not in video_id and
|
||||||
|
"Guide" not in video_id
|
||||||
|
):
|
||||||
|
page_videos.append({"title": title, "id": video_id})
|
||||||
|
else:
|
||||||
|
invalid_count += 1
|
||||||
|
if invalid_count <= 3: # Show first 3 invalid IDs per page
|
||||||
|
print(f" ⚠️ Invalid ID: '{video_id}' for '{title[:50]}...'")
|
||||||
|
|
||||||
if invalid_count > 3:
|
if invalid_count > 3:
|
||||||
print(f" ⚠️ ... and {invalid_count - 3} more invalid IDs on this page")
|
print(f" ⚠️ ... and {invalid_count - 3} more invalid IDs on this page")
|
||||||
@ -447,8 +494,8 @@ class TrackingManager:
|
|||||||
page += 1
|
page += 1
|
||||||
|
|
||||||
# Safety check to prevent infinite loops
|
# Safety check to prevent infinite loops
|
||||||
if page > 200: # Max 200 pages (10,000 videos)
|
if page > 50: # Max 50 pages (10,000 videos with 200 per page)
|
||||||
print(f" ⚠️ Reached maximum page limit (200 pages), stopping")
|
print(f" ⚠️ Reached maximum page limit (50 pages), stopping")
|
||||||
break
|
break
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
@ -457,6 +504,9 @@ class TrackingManager:
|
|||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f" ❌ Error fetching page {page}: {e}")
|
print(f" ❌ Error fetching page {page}: {e}")
|
||||||
break
|
break
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print(f" ⏹️ User interrupted, stopping at page {page}")
|
||||||
|
break
|
||||||
|
|
||||||
if not all_videos:
|
if not all_videos:
|
||||||
print(f" ❌ No valid videos found")
|
print(f" ❌ No valid videos found")
|
||||||
@ -520,6 +570,22 @@ class TrackingManager:
|
|||||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
|
||||||
lines = result.stdout.strip().splitlines()
|
lines = result.stdout.strip().splitlines()
|
||||||
|
|
||||||
|
# Save raw output for debugging
|
||||||
|
raw_output_file = self._get_channel_cache_file(channel_id).parent / f"{channel_id}_raw_output.txt"
|
||||||
|
try:
|
||||||
|
with open(raw_output_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(f"# Raw yt-dlp output for {channel_id}\n")
|
||||||
|
f.write(f"# Channel URL: {channel_url}\n")
|
||||||
|
f.write(f"# Command: {' '.join(cmd)}\n")
|
||||||
|
f.write(f"# Timestamp: {datetime.now().isoformat()}\n")
|
||||||
|
f.write(f"# Total lines: {len(lines)}\n")
|
||||||
|
f.write("#" * 80 + "\n\n")
|
||||||
|
for i, line in enumerate(lines, 1):
|
||||||
|
f.write(f"{i:6d}: {line}\n")
|
||||||
|
print(f" 💾 Saved raw output to: {raw_output_file.name}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ Could not save raw output: {e}")
|
||||||
|
|
||||||
print(f" 📄 Raw output lines: {len(lines)}")
|
print(f" 📄 Raw output lines: {len(lines)}")
|
||||||
print(f" 📊 Download completed successfully!")
|
print(f" 📊 Download completed successfully!")
|
||||||
|
|
||||||
@ -540,19 +606,42 @@ class TrackingManager:
|
|||||||
if i % 1000 == 0 and i > 0: # Progress indicator every 1000 lines
|
if i % 1000 == 0 and i > 0: # Progress indicator every 1000 lines
|
||||||
print(f" 📊 Processing line {i}/{len(lines)}... ({i/len(lines)*100:.1f}%)")
|
print(f" 📊 Processing line {i}/{len(lines)}... ({i/len(lines)*100:.1f}%)")
|
||||||
|
|
||||||
parts = line.split("|")
|
# More robust parsing that handles titles with | characters
|
||||||
if len(parts) >= 2:
|
# Extract video ID directly from the URL that yt-dlp provides
|
||||||
title, video_id = parts[0].strip(), parts[1].strip()
|
|
||||||
|
# Find the URL and extract video ID from it
|
||||||
# Validate that video_id looks like a proper YouTube ID
|
url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', line)
|
||||||
if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
|
if not url_match:
|
||||||
videos.append({"title": title, "id": video_id})
|
invalid_count += 1
|
||||||
else:
|
if invalid_count <= 5:
|
||||||
invalid_count += 1
|
print(f" ⚠️ Skipping line with no URL: '{line[:100]}...'")
|
||||||
if invalid_count <= 5: # Only show first 5 invalid IDs
|
elif invalid_count == 6:
|
||||||
print(f" ⚠️ Skipping invalid video ID: '{video_id}' for title: '{title}'")
|
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid lines")
|
||||||
elif invalid_count == 6:
|
continue
|
||||||
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid IDs")
|
|
||||||
|
# Extract video ID directly from the URL
|
||||||
|
video_id = url_match.group(1)
|
||||||
|
|
||||||
|
# Extract title (everything before the video ID in the line)
|
||||||
|
title = line[:line.find(video_id)].rstrip('|').strip()
|
||||||
|
|
||||||
|
# Validate video ID
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) == 11 and
|
||||||
|
video_id.replace('-', '').replace('_', '').isalnum() and
|
||||||
|
" " not in video_id and
|
||||||
|
"Lyrics" not in video_id and
|
||||||
|
"KARAOKE" not in video_id.upper() and
|
||||||
|
"Vocal" not in video_id and
|
||||||
|
"Guide" not in video_id
|
||||||
|
):
|
||||||
|
videos.append({"title": title, "id": video_id})
|
||||||
|
else:
|
||||||
|
invalid_count += 1
|
||||||
|
if invalid_count <= 5: # Only show first 5 invalid IDs
|
||||||
|
print(f" ⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
|
||||||
|
elif invalid_count == 6:
|
||||||
|
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid IDs")
|
||||||
|
|
||||||
if not videos:
|
if not videos:
|
||||||
print(f" ❌ No valid videos found after parsing")
|
print(f" ❌ No valid videos found after parsing")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user