Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>
This commit is contained in:
parent
ea07188739
commit
f914d54067
122
build_cache_from_raw.py
Normal file
122
build_cache_from_raw.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Script to build channel cache from raw yt-dlp output file.
|
||||||
|
This uses the fixed parsing logic to handle titles with | characters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def parse_raw_output_file(raw_file_path):
    """Parse a saved raw yt-dlp output file and extract valid videos.

    Each data line is expected to look like
    ``  123: <title>|<video_id>|https://www.youtube.com/watch?v=<video_id>``
    (the leading ``NNN:`` prefix is written by the raw-dump code). The video
    ID is recovered from the watch URL, which cannot contain user-controlled
    ``|`` characters, so titles that themselves contain ``|`` parse correctly.

    Args:
        raw_file_path: Path (or str) to the raw output file (UTF-8).

    Returns:
        list[dict]: one ``{"title": ..., "id": ...}`` dict per valid video.
    """
    videos = []
    invalid_count = 0

    print(f"🔍 Parsing raw output file: {raw_file_path}")

    with open(raw_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip header/comment lines (starting with '#') and blank lines.
    data_lines = [line for line in lines if not line.strip().startswith('#') and line.strip()]

    print(f"📄 Found {len(data_lines)} data lines to process")

    # Hoist regex compilation out of the per-line loop (the re module caches
    # patterns, but the per-call cache lookup is still avoidable work).
    line_no_re = re.compile(r'^\s*\d+:\s*')
    watch_url_re = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')

    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{len(data_lines)}... ({i/len(data_lines)*100:.1f}%)")

        # Remove line number prefix (e.g., "  1234: ")
        line = line_no_re.sub('', line.strip())

        # Extract the video ID from the watch URL yt-dlp emits; this is
        # robust against titles containing '|' characters.
        url_match = watch_url_re.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= 5:  # Only show the first few problem lines
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == 6:
                print("⚠️ ... suppressing further invalid-line messages")
            continue

        video_id = url_match.group(1)

        # Title is everything before the first occurrence of the bare ID
        # (the "title|id|url" layout places the ID before the URL).
        title = line[:line.find(video_id)].rstrip('|').strip()

        # The URL regex already guarantees an 11-char [A-Za-z0-9_-] ID, so
        # length/charset re-checks are redundant (the old isalnum() re-check
        # even rejected valid IDs made only of '-'/'_'). Keep the keyword
        # blacklist that guards against title fragments mis-parsed as IDs.
        if (
            "Lyrics" not in video_id
            and "KARAOKE" not in video_id.upper()
            and "Vocal" not in video_id
            and "Guide" not in video_id
        ):
            videos.append({"title": title, "id": video_id})
        else:
            invalid_count += 1
            if invalid_count <= 5:  # Only show first 5 invalid IDs
                print(f"⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
            elif invalid_count == 6:
                print("⚠️ ... suppressing further invalid-ID messages")

    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")

    return videos
|
||||||
|
|
||||||
|
def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Write the parsed video list to a JSON cache file.

    Args:
        channel_id: Channel identifier; a sanitized form becomes the filename.
        videos: List of ``{"title", "id"}`` dicts to cache.
        cache_dir: Directory for cache files; created if missing.

    Returns:
        Path of the written cache file.
    """
    target_dir = Path(cache_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Replace characters that are illegal in Windows filenames.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = target_dir / f"{safe_name}.json"

    payload = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos),
    }

    # Serialize first, then write in one shot (equivalent to json.dump).
    cache_file.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file
|
||||||
|
|
||||||
|
def main():
    """Build the channel cache for @VocalStarKaraoke from its raw dump file."""
    channel_id = "@VocalStarKaraoke"
    raw_file_path = Path("data/channel_cache/@VocalStarKaraoke_raw_output.txt")

    # Bail out early if the raw dump is missing.
    if not raw_file_path.exists():
        print(f"❌ Raw output file not found: {raw_file_path}")
        return

    # Parse the raw output into {"title", "id"} dicts.
    videos = parse_raw_output_file(raw_file_path)
    if not videos:
        print("❌ No valid videos found")
        return

    # Persist the parsed list as the channel's JSON cache.
    cache_file = save_cache_file(channel_id, videos)

    for message in (
        f"🎉 Cache build complete!",
        f"📊 Total videos in cache: {len(videos)}",
        f"📁 Cache file: {cache_file}",
    ):
        print(message)


if __name__ == "__main__":
    main()
|
||||||
236590
data/channel_cache.json
236590
data/channel_cache.json
File diff suppressed because it is too large
Load Diff
@ -110,6 +110,17 @@ Examples:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Show page-by-page progress when downloading channel video lists (slower but more detailed)",
|
help="Show page-by-page progress when downloading channel video lists (slower but more detailed)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--parallel-channels",
|
||||||
|
action="store_true",
|
||||||
|
help="Enable parallel channel scanning for faster channel processing (scans multiple channels simultaneously)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--channel-workers",
|
||||||
|
type=int,
|
||||||
|
default=3,
|
||||||
|
help="Number of parallel channel scanning workers (default: 3, max: 10)",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--songlist-only",
|
"--songlist-only",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@ -183,7 +194,7 @@ Examples:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--parallel",
|
"--parallel",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Enable parallel downloads for improved speed",
|
help="Enable parallel downloads for improved speed (3-5x faster for large batches)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--workers",
|
"--workers",
|
||||||
@ -209,6 +220,11 @@ Examples:
|
|||||||
print("❌ Error: --workers must be between 1 and 10")
|
print("❌ Error: --workers must be between 1 and 10")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Validate channel workers argument
|
||||||
|
if args.channel_workers < 1 or args.channel_workers > 10:
|
||||||
|
print("❌ Error: --channel-workers must be between 1 and 10")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
yt_dlp_path = Path("downloader/yt-dlp.exe")
|
yt_dlp_path = Path("downloader/yt-dlp.exe")
|
||||||
if not yt_dlp_path.exists():
|
if not yt_dlp_path.exists():
|
||||||
print("❌ Error: yt-dlp.exe not found in downloader/ directory")
|
print("❌ Error: yt-dlp.exe not found in downloader/ directory")
|
||||||
@ -391,6 +407,8 @@ Examples:
|
|||||||
fuzzy_threshold=args.fuzzy_threshold,
|
fuzzy_threshold=args.fuzzy_threshold,
|
||||||
force_download=args.force,
|
force_download=args.force,
|
||||||
show_pagination=args.show_pagination,
|
show_pagination=args.show_pagination,
|
||||||
|
parallel_channels=args.parallel_channels,
|
||||||
|
max_channel_workers=args.channel_workers,
|
||||||
)
|
)
|
||||||
elif args.latest_per_channel:
|
elif args.latest_per_channel:
|
||||||
# Use provided file or default to data/channels.txt
|
# Use provided file or default to data/channels.txt
|
||||||
|
|||||||
@ -3,10 +3,13 @@ Download plan building utilities.
|
|||||||
Handles pre-scanning channels and building download plans.
|
Handles pre-scanning channels and building download plans.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import concurrent.futures
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from karaoke_downloader.cache_manager import (
|
from karaoke_downloader.cache_manager import (
|
||||||
delete_plan_cache,
|
delete_plan_cache,
|
||||||
@ -16,12 +19,13 @@ from karaoke_downloader.cache_manager import (
|
|||||||
)
|
)
|
||||||
from karaoke_downloader.fuzzy_matcher import (
|
from karaoke_downloader.fuzzy_matcher import (
|
||||||
create_song_key,
|
create_song_key,
|
||||||
extract_artist_title,
|
create_video_key,
|
||||||
get_similarity_function,
|
get_similarity_function,
|
||||||
is_exact_match,
|
is_exact_match,
|
||||||
is_fuzzy_match,
|
is_fuzzy_match,
|
||||||
normalize_title,
|
normalize_title,
|
||||||
)
|
)
|
||||||
|
from karaoke_downloader.id3_utils import extract_artist_title
|
||||||
from karaoke_downloader.youtube_utils import get_channel_info
|
from karaoke_downloader.youtube_utils import get_channel_info
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
@ -29,6 +33,7 @@ DEFAULT_FILENAME_LENGTH_LIMIT = 100
|
|||||||
DEFAULT_ARTIST_LENGTH_LIMIT = 30
|
DEFAULT_ARTIST_LENGTH_LIMIT = 30
|
||||||
DEFAULT_TITLE_LENGTH_LIMIT = 60
|
DEFAULT_TITLE_LENGTH_LIMIT = 60
|
||||||
DEFAULT_FUZZY_THRESHOLD = 85
|
DEFAULT_FUZZY_THRESHOLD = 85
|
||||||
|
DEFAULT_DISPLAY_LIMIT = 10
|
||||||
|
|
||||||
|
|
||||||
def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str = None) -> str:
|
def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str = None) -> str:
|
||||||
@ -74,6 +79,111 @@ def generate_unmatched_report(unmatched: List[Dict[str, Any]], report_path: str
|
|||||||
return str(report_file)
|
return str(report_file)
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_channel_for_matches(
|
||||||
|
channel_url,
|
||||||
|
channel_name,
|
||||||
|
channel_id,
|
||||||
|
song_keys,
|
||||||
|
song_lookup,
|
||||||
|
fuzzy_match,
|
||||||
|
fuzzy_threshold,
|
||||||
|
show_pagination,
|
||||||
|
yt_dlp_path,
|
||||||
|
tracker,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Scan a single channel for matches (used in parallel processing).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel_url: URL of the channel to scan
|
||||||
|
channel_name: Name of the channel
|
||||||
|
channel_id: ID of the channel
|
||||||
|
song_keys: Set of song keys to match against
|
||||||
|
song_lookup: Dictionary mapping song keys to song data
|
||||||
|
fuzzy_match: Whether to use fuzzy matching
|
||||||
|
fuzzy_threshold: Threshold for fuzzy matching
|
||||||
|
show_pagination: Whether to show pagination progress
|
||||||
|
yt_dlp_path: Path to yt-dlp executable
|
||||||
|
tracker: Tracking manager instance
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of video matches found in this channel
|
||||||
|
"""
|
||||||
|
print(f"\n🚦 Scanning channel: {channel_name} ({channel_url})")
|
||||||
|
|
||||||
|
# Get channel info if not provided
|
||||||
|
if not channel_name or not channel_id:
|
||||||
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
|
||||||
|
# Fetch video list from channel
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f" 📊 Channel has {len(available_videos)} videos to scan")
|
||||||
|
|
||||||
|
video_matches = []
|
||||||
|
|
||||||
|
# Pre-process video titles for efficient matching
|
||||||
|
if fuzzy_match:
|
||||||
|
# For fuzzy matching, create normalized video keys
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
# Find best match among remaining songs
|
||||||
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
for song_key in song_keys:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
|
if best_match:
|
||||||
|
song = song_lookup[best_match]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
song_keys.remove(best_match)
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
if video_key in song_keys:
|
||||||
|
song = song_lookup[video_key]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[video_key]
|
||||||
|
song_keys.remove(video_key)
|
||||||
|
|
||||||
|
print(f" ✅ Found {len(video_matches)} matches in {channel_name}")
|
||||||
|
return video_matches
|
||||||
|
|
||||||
|
|
||||||
def build_download_plan(
|
def build_download_plan(
|
||||||
channel_urls,
|
channel_urls,
|
||||||
undownloaded,
|
undownloaded,
|
||||||
@ -82,6 +192,8 @@ def build_download_plan(
|
|||||||
fuzzy_match=False,
|
fuzzy_match=False,
|
||||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||||
show_pagination=False,
|
show_pagination=False,
|
||||||
|
parallel_channels=False,
|
||||||
|
max_channel_workers=3,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
For each song in undownloaded, scan all channels for a match.
|
For each song in undownloaded, scan all channels for a match.
|
||||||
@ -102,85 +214,198 @@ def build_download_plan(
|
|||||||
song_keys.add(key)
|
song_keys.add(key)
|
||||||
song_lookup[key] = song
|
song_lookup[key] = song
|
||||||
|
|
||||||
for i, channel_url in enumerate(channel_urls, 1):
|
if parallel_channels:
|
||||||
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}")
|
print(f"🚀 Running parallel channel scanning with {max_channel_workers} workers.")
|
||||||
print(f" 🔍 Getting channel info...")
|
|
||||||
channel_name, channel_id = get_channel_info(channel_url)
|
# Create a thread-safe copy of song data for parallel processing
|
||||||
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
import threading
|
||||||
print(f" 🔍 Fetching video list from channel...")
|
song_keys_lock = threading.Lock()
|
||||||
available_videos = tracker.get_channel_video_list(
|
song_lookup_lock = threading.Lock()
|
||||||
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
|
||||||
)
|
def scan_channel_safe(channel_url):
|
||||||
print(
|
"""Thread-safe channel scanning function."""
|
||||||
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
|
print(f"\n🚦 Scanning channel: {channel_url}")
|
||||||
)
|
|
||||||
matches_this_channel = 0
|
# Get channel info
|
||||||
video_matches = [] # Initialize video_matches for this channel
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
||||||
|
|
||||||
|
# Fetch video list from channel
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
print(f" 📊 Channel has {len(available_videos)} videos to scan")
|
||||||
|
|
||||||
|
video_matches = []
|
||||||
|
|
||||||
|
# Pre-process video titles for efficient matching
|
||||||
|
if fuzzy_match:
|
||||||
|
# For fuzzy matching, create normalized video keys
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
# Pre-process video titles for efficient matching
|
# Find best match among remaining songs (thread-safe)
|
||||||
if fuzzy_match:
|
best_match = None
|
||||||
# For fuzzy matching, create normalized video keys
|
best_score = 0
|
||||||
for video in available_videos:
|
with song_keys_lock:
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
available_song_keys = list(song_keys) # Copy for iteration
|
||||||
video_key = create_song_key(v_artist, v_title)
|
|
||||||
|
for song_key in available_song_keys:
|
||||||
|
with song_lookup_lock:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
# Find best match among remaining songs
|
if best_match:
|
||||||
best_match = None
|
with song_lookup_lock:
|
||||||
best_score = 0
|
if best_match in song_lookup: # Double-check it's still available
|
||||||
for song_key in song_keys:
|
song = song_lookup[best_match]
|
||||||
if song_key in song_lookup: # Only check unmatched songs
|
video_matches.append(
|
||||||
score = get_similarity_function()(song_key, video_key)
|
{
|
||||||
if score >= fuzzy_threshold and score > best_score:
|
"artist": song["artist"],
|
||||||
best_score = score
|
"title": song["title"],
|
||||||
best_match = song_key
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
with song_keys_lock:
|
||||||
|
song_keys.discard(best_match)
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
if best_match:
|
with song_lookup_lock:
|
||||||
song = song_lookup[best_match]
|
if video_key in song_keys and video_key in song_lookup:
|
||||||
video_matches.append(
|
song = song_lookup[video_key]
|
||||||
{
|
video_matches.append(
|
||||||
"artist": song["artist"],
|
{
|
||||||
"title": song["title"],
|
"artist": song["artist"],
|
||||||
"channel_name": channel_name,
|
"title": song["title"],
|
||||||
"channel_url": channel_url,
|
"channel_name": channel_name,
|
||||||
"video_id": video["id"],
|
"channel_url": channel_url,
|
||||||
"video_title": video["title"],
|
"video_id": video["id"],
|
||||||
"match_score": best_score,
|
"video_title": video["title"],
|
||||||
}
|
"match_score": 100,
|
||||||
)
|
}
|
||||||
# Remove matched song from future consideration
|
)
|
||||||
del song_lookup[best_match]
|
# Remove matched song from future consideration
|
||||||
song_keys.remove(best_match)
|
del song_lookup[video_key]
|
||||||
matches_this_channel += 1
|
with song_keys_lock:
|
||||||
else:
|
song_keys.discard(video_key)
|
||||||
# For exact matching, use direct key comparison
|
|
||||||
for video in available_videos:
|
print(f" ✅ Found {len(video_matches)} matches in {channel_name}")
|
||||||
v_artist, v_title = extract_artist_title(video["title"])
|
return video_matches
|
||||||
video_key = create_song_key(v_artist, v_title)
|
|
||||||
|
# Execute parallel channel scanning
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_channel_workers) as executor:
|
||||||
|
# Submit all channel scanning tasks
|
||||||
|
future_to_channel = {
|
||||||
|
executor.submit(scan_channel_safe, channel_url): channel_url
|
||||||
|
for channel_url in channel_urls
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process results as they complete
|
||||||
|
for future in concurrent.futures.as_completed(future_to_channel):
|
||||||
|
channel_url = future_to_channel[future]
|
||||||
|
try:
|
||||||
|
video_matches = future.result()
|
||||||
|
plan.extend(video_matches)
|
||||||
|
channel_name, _ = get_channel_info(channel_url)
|
||||||
|
channel_match_counts[channel_name] = len(video_matches)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ Error processing channel {channel_url}: {e}")
|
||||||
|
channel_name, _ = get_channel_info(channel_url)
|
||||||
|
channel_match_counts[channel_name] = 0
|
||||||
|
else:
|
||||||
|
for i, channel_url in enumerate(channel_urls, 1):
|
||||||
|
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}")
|
||||||
|
print(f" 🔍 Getting channel info...")
|
||||||
|
channel_name, channel_id = get_channel_info(channel_url)
|
||||||
|
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
|
||||||
|
print(f" 🔍 Fetching video list from channel...")
|
||||||
|
available_videos = tracker.get_channel_video_list(
|
||||||
|
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False, show_pagination=show_pagination
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
|
||||||
|
)
|
||||||
|
matches_this_channel = 0
|
||||||
|
video_matches = [] # Initialize video_matches for this channel
|
||||||
|
|
||||||
if video_key in song_keys:
|
# Pre-process video titles for efficient matching
|
||||||
song = song_lookup[video_key]
|
if fuzzy_match:
|
||||||
video_matches.append(
|
# For fuzzy matching, create normalized video keys
|
||||||
{
|
for video in available_videos:
|
||||||
"artist": song["artist"],
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
"title": song["title"],
|
video_key = create_song_key(v_artist, v_title)
|
||||||
"channel_name": channel_name,
|
|
||||||
"channel_url": channel_url,
|
|
||||||
"video_id": video["id"],
|
|
||||||
"video_title": video["title"],
|
|
||||||
"match_score": 100,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
# Remove matched song from future consideration
|
|
||||||
del song_lookup[video_key]
|
|
||||||
song_keys.remove(video_key)
|
|
||||||
matches_this_channel += 1
|
|
||||||
|
|
||||||
# Add matches to plan
|
# Find best match among remaining songs
|
||||||
plan.extend(video_matches)
|
best_match = None
|
||||||
|
best_score = 0
|
||||||
|
for song_key in song_keys:
|
||||||
|
if song_key in song_lookup: # Only check unmatched songs
|
||||||
|
score = get_similarity_function()(song_key, video_key)
|
||||||
|
if score >= fuzzy_threshold and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_match = song_key
|
||||||
|
|
||||||
# Print match count once per channel
|
if best_match:
|
||||||
channel_match_counts[channel_name] = matches_this_channel
|
song = song_lookup[best_match]
|
||||||
print(f" → Found {matches_this_channel} songlist matches in this channel.")
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": best_score,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[best_match]
|
||||||
|
song_keys.remove(best_match)
|
||||||
|
matches_this_channel += 1
|
||||||
|
else:
|
||||||
|
# For exact matching, use direct key comparison
|
||||||
|
for video in available_videos:
|
||||||
|
v_artist, v_title = extract_artist_title(video["title"])
|
||||||
|
video_key = create_song_key(v_artist, v_title)
|
||||||
|
|
||||||
|
if video_key in song_keys:
|
||||||
|
song = song_lookup[video_key]
|
||||||
|
video_matches.append(
|
||||||
|
{
|
||||||
|
"artist": song["artist"],
|
||||||
|
"title": song["title"],
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_url": channel_url,
|
||||||
|
"video_id": video["id"],
|
||||||
|
"video_title": video["title"],
|
||||||
|
"match_score": 100,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# Remove matched song from future consideration
|
||||||
|
del song_lookup[video_key]
|
||||||
|
song_keys.remove(video_key)
|
||||||
|
matches_this_channel += 1
|
||||||
|
|
||||||
|
# Add matches to plan
|
||||||
|
plan.extend(video_matches)
|
||||||
|
|
||||||
|
# Print match count once per channel
|
||||||
|
channel_match_counts[channel_name] = matches_this_channel
|
||||||
|
print(f" → Found {matches_this_channel} songlist matches in this channel.")
|
||||||
|
|
||||||
# Remaining unmatched songs
|
# Remaining unmatched songs
|
||||||
unmatched = list(song_lookup.values())
|
unmatched = list(song_lookup.values())
|
||||||
|
|||||||
@ -317,6 +317,8 @@ class KaraokeDownloader:
|
|||||||
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
|
||||||
force_download=False,
|
force_download=False,
|
||||||
show_pagination=False,
|
show_pagination=False,
|
||||||
|
parallel_channels=False,
|
||||||
|
max_channel_workers=3,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Download songs from the songlist across multiple channels.
|
Download songs from the songlist across multiple channels.
|
||||||
@ -510,6 +512,8 @@ class KaraokeDownloader:
|
|||||||
fuzzy_match=fuzzy_match,
|
fuzzy_match=fuzzy_match,
|
||||||
fuzzy_threshold=fuzzy_threshold,
|
fuzzy_threshold=fuzzy_threshold,
|
||||||
show_pagination=show_pagination,
|
show_pagination=show_pagination,
|
||||||
|
parallel_channels=parallel_channels,
|
||||||
|
max_channel_workers=max_channel_workers,
|
||||||
)
|
)
|
||||||
save_plan_cache(cache_file, download_plan, unmatched)
|
save_plan_cache(cache_file, download_plan, unmatched)
|
||||||
print(f"💾 Download plan cached to: {cache_file}")
|
print(f"💾 Download plan cached to: {cache_file}")
|
||||||
|
|||||||
@ -359,9 +359,18 @@ class TrackingManager:
|
|||||||
corrupted = False
|
corrupted = False
|
||||||
|
|
||||||
# Check if any video IDs look like titles instead of proper YouTube IDs
|
# Check if any video IDs look like titles instead of proper YouTube IDs
|
||||||
for video in cached_videos[:10]: # Check first 10 videos
|
for video in cached_videos[:20]: # Check first 20 videos
|
||||||
video_id = video.get("id", "")
|
video_id = video.get("id", "")
|
||||||
if video_id and (len(video_id) > 20 or " " in video_id or "Lyrics" in video_id):
|
# More comprehensive validation - YouTube IDs should be 11 characters and contain only alphanumeric, hyphens, and underscores
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) != 11 or
|
||||||
|
not video_id.replace('-', '').replace('_', '').isalnum() or
|
||||||
|
" " in video_id or
|
||||||
|
"Lyrics" in video_id or
|
||||||
|
"KARAOKE" in video_id.upper() or
|
||||||
|
"Vocal" in video_id or
|
||||||
|
"Guide" in video_id
|
||||||
|
):
|
||||||
print(f" ⚠️ Detected corrupted video ID in cache: '{video_id}'")
|
print(f" ⚠️ Detected corrupted video ID in cache: '{video_id}'")
|
||||||
corrupted = True
|
corrupted = True
|
||||||
break
|
break
|
||||||
@ -389,7 +398,7 @@ class TrackingManager:
|
|||||||
|
|
||||||
all_videos = []
|
all_videos = []
|
||||||
page = 1
|
page = 1
|
||||||
videos_per_page = 50 # YouTube typically shows 50 videos per page
|
videos_per_page = 200 # YouTube/yt-dlp supports up to 200 videos per page, reducing API calls and errors
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print(f" 📄 Fetching page {page}...")
|
print(f" 📄 Fetching page {page}...")
|
||||||
@ -408,9 +417,26 @@ class TrackingManager:
|
|||||||
]
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
|
# Increased timeout to 180 seconds for larger pages (200 videos)
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=180)
|
||||||
lines = result.stdout.strip().splitlines()
|
lines = result.stdout.strip().splitlines()
|
||||||
|
|
||||||
|
# Save raw output for debugging (for each page)
|
||||||
|
raw_output_file = self._get_channel_cache_file(channel_id).parent / f"{channel_id}_raw_output_page{page}.txt"
|
||||||
|
try:
|
||||||
|
with open(raw_output_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(f"# Raw yt-dlp output for {channel_id} - Page {page}\n")
|
||||||
|
f.write(f"# Channel URL: {channel_url}\n")
|
||||||
|
f.write(f"# Command: {' '.join(cmd)}\n")
|
||||||
|
f.write(f"# Timestamp: {datetime.now().isoformat()}\n")
|
||||||
|
f.write(f"# Total lines: {len(lines)}\n")
|
||||||
|
f.write("#" * 80 + "\n\n")
|
||||||
|
for i, line in enumerate(lines, 1):
|
||||||
|
f.write(f"{i:6d}: {line}\n")
|
||||||
|
print(f" 💾 Saved raw output to: {raw_output_file.name}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ Could not save raw output: {e}")
|
||||||
|
|
||||||
if not lines:
|
if not lines:
|
||||||
print(f" ✅ No more videos found on page {page}")
|
print(f" ✅ No more videos found on page {page}")
|
||||||
break
|
break
|
||||||
@ -421,17 +447,38 @@ class TrackingManager:
|
|||||||
invalid_count = 0
|
invalid_count = 0
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
parts = line.split("|")
|
if not line.strip():
|
||||||
if len(parts) >= 2:
|
continue
|
||||||
title, video_id = parts[0].strip(), parts[1].strip()
|
|
||||||
|
# More robust parsing that handles titles with | characters
|
||||||
# Validate that video_id looks like a proper YouTube ID
|
# Extract video ID directly from the URL that yt-dlp provides
|
||||||
if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
|
|
||||||
page_videos.append({"title": title, "id": video_id})
|
# Find the URL and extract video ID from it
|
||||||
else:
|
url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', line)
|
||||||
invalid_count += 1
|
if not url_match:
|
||||||
if invalid_count <= 3: # Show first 3 invalid IDs per page
|
continue
|
||||||
print(f" ⚠️ Invalid ID: '{video_id}' for '{title}'")
|
|
||||||
|
# Extract video ID directly from the URL
|
||||||
|
video_id = url_match.group(1)
|
||||||
|
|
||||||
|
# Extract title (everything before the video ID in the line)
|
||||||
|
title = line[:line.find(video_id)].rstrip('|').strip()
|
||||||
|
|
||||||
|
# Validate video ID
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) == 11 and
|
||||||
|
video_id.replace('-', '').replace('_', '').isalnum() and
|
||||||
|
" " not in video_id and
|
||||||
|
"Lyrics" not in video_id and
|
||||||
|
"KARAOKE" not in video_id.upper() and
|
||||||
|
"Vocal" not in video_id and
|
||||||
|
"Guide" not in video_id
|
||||||
|
):
|
||||||
|
page_videos.append({"title": title, "id": video_id})
|
||||||
|
else:
|
||||||
|
invalid_count += 1
|
||||||
|
if invalid_count <= 3: # Show first 3 invalid IDs per page
|
||||||
|
print(f" ⚠️ Invalid ID: '{video_id}' for '{title[:50]}...'")
|
||||||
|
|
||||||
if invalid_count > 3:
|
if invalid_count > 3:
|
||||||
print(f" ⚠️ ... and {invalid_count - 3} more invalid IDs on this page")
|
print(f" ⚠️ ... and {invalid_count - 3} more invalid IDs on this page")
|
||||||
@ -447,8 +494,8 @@ class TrackingManager:
|
|||||||
page += 1
|
page += 1
|
||||||
|
|
||||||
# Safety check to prevent infinite loops
|
# Safety check to prevent infinite loops
|
||||||
if page > 200: # Max 200 pages (10,000 videos)
|
if page > 50: # Max 50 pages (10,000 videos with 200 per page)
|
||||||
print(f" ⚠️ Reached maximum page limit (200 pages), stopping")
|
print(f" ⚠️ Reached maximum page limit (50 pages), stopping")
|
||||||
break
|
break
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
@ -457,6 +504,9 @@ class TrackingManager:
|
|||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f" ❌ Error fetching page {page}: {e}")
|
print(f" ❌ Error fetching page {page}: {e}")
|
||||||
break
|
break
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print(f" ⏹️ User interrupted, stopping at page {page}")
|
||||||
|
break
|
||||||
|
|
||||||
if not all_videos:
|
if not all_videos:
|
||||||
print(f" ❌ No valid videos found")
|
print(f" ❌ No valid videos found")
|
||||||
@ -520,6 +570,22 @@ class TrackingManager:
|
|||||||
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
|
||||||
lines = result.stdout.strip().splitlines()
|
lines = result.stdout.strip().splitlines()
|
||||||
|
|
||||||
|
# Save raw output for debugging
|
||||||
|
raw_output_file = self._get_channel_cache_file(channel_id).parent / f"{channel_id}_raw_output.txt"
|
||||||
|
try:
|
||||||
|
with open(raw_output_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(f"# Raw yt-dlp output for {channel_id}\n")
|
||||||
|
f.write(f"# Channel URL: {channel_url}\n")
|
||||||
|
f.write(f"# Command: {' '.join(cmd)}\n")
|
||||||
|
f.write(f"# Timestamp: {datetime.now().isoformat()}\n")
|
||||||
|
f.write(f"# Total lines: {len(lines)}\n")
|
||||||
|
f.write("#" * 80 + "\n\n")
|
||||||
|
for i, line in enumerate(lines, 1):
|
||||||
|
f.write(f"{i:6d}: {line}\n")
|
||||||
|
print(f" 💾 Saved raw output to: {raw_output_file.name}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠️ Could not save raw output: {e}")
|
||||||
|
|
||||||
print(f" 📄 Raw output lines: {len(lines)}")
|
print(f" 📄 Raw output lines: {len(lines)}")
|
||||||
print(f" 📊 Download completed successfully!")
|
print(f" 📊 Download completed successfully!")
|
||||||
|
|
||||||
@ -540,19 +606,42 @@ class TrackingManager:
|
|||||||
if i % 1000 == 0 and i > 0: # Progress indicator every 1000 lines
|
if i % 1000 == 0 and i > 0: # Progress indicator every 1000 lines
|
||||||
print(f" 📊 Processing line {i}/{len(lines)}... ({i/len(lines)*100:.1f}%)")
|
print(f" 📊 Processing line {i}/{len(lines)}... ({i/len(lines)*100:.1f}%)")
|
||||||
|
|
||||||
parts = line.split("|")
|
# More robust parsing that handles titles with | characters
|
||||||
if len(parts) >= 2:
|
# Extract video ID directly from the URL that yt-dlp provides
|
||||||
title, video_id = parts[0].strip(), parts[1].strip()
|
|
||||||
|
# Find the URL and extract video ID from it
|
||||||
# Validate that video_id looks like a proper YouTube ID
|
url_match = re.search(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})', line)
|
||||||
if video_id and (len(video_id) <= 20 and " " not in video_id and "Lyrics" not in video_id):
|
if not url_match:
|
||||||
videos.append({"title": title, "id": video_id})
|
invalid_count += 1
|
||||||
else:
|
if invalid_count <= 5:
|
||||||
invalid_count += 1
|
print(f" ⚠️ Skipping line with no URL: '{line[:100]}...'")
|
||||||
if invalid_count <= 5: # Only show first 5 invalid IDs
|
elif invalid_count == 6:
|
||||||
print(f" ⚠️ Skipping invalid video ID: '{video_id}' for title: '{title}'")
|
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid lines")
|
||||||
elif invalid_count == 6:
|
continue
|
||||||
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid IDs")
|
|
||||||
|
# Extract video ID directly from the URL
|
||||||
|
video_id = url_match.group(1)
|
||||||
|
|
||||||
|
# Extract title (everything before the video ID in the line)
|
||||||
|
title = line[:line.find(video_id)].rstrip('|').strip()
|
||||||
|
|
||||||
|
# Validate video ID
|
||||||
|
if video_id and (
|
||||||
|
len(video_id) == 11 and
|
||||||
|
video_id.replace('-', '').replace('_', '').isalnum() and
|
||||||
|
" " not in video_id and
|
||||||
|
"Lyrics" not in video_id and
|
||||||
|
"KARAOKE" not in video_id.upper() and
|
||||||
|
"Vocal" not in video_id and
|
||||||
|
"Guide" not in video_id
|
||||||
|
):
|
||||||
|
videos.append({"title": title, "id": video_id})
|
||||||
|
else:
|
||||||
|
invalid_count += 1
|
||||||
|
if invalid_count <= 5: # Only show first 5 invalid IDs
|
||||||
|
print(f" ⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
|
||||||
|
elif invalid_count == 6:
|
||||||
|
print(f" ⚠️ ... and {len(lines) - i - 1} more invalid IDs")
|
||||||
|
|
||||||
if not videos:
|
if not videos:
|
||||||
print(f" ❌ No valid videos found after parsing")
|
print(f" ❌ No valid videos found after parsing")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user