# KaraokeVideoDownloader/build_cache_from_raw.py
# (paste metadata: 122 lines, 4.3 KiB, Python)
#!/usr/bin/env python3
"""
Script to build channel cache from raw yt-dlp output file.
This uses the fixed parsing logic to handle titles with | characters.
"""
import json
import re
from datetime import datetime
from pathlib import Path
def parse_raw_output_file(raw_file_path):
    """Parse a raw yt-dlp output file and extract valid videos.

    Each data line is expected to hold a video title followed by a full
    YouTube watch URL.  Titles may themselves contain ``|`` characters, so
    the video ID is taken from the URL rather than by field splitting.

    Args:
        raw_file_path: Path to the raw output text file (str or Path).

    Returns:
        list[dict]: Entries of the form ``{"title": str, "id": str}``.
    """
    videos = []
    invalid_count = 0
    print(f"🔍 Parsing raw output file: {raw_file_path}")
    with open(raw_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Skip header lines (lines starting with #) and blank lines.
    data_lines = [line for line in lines if not line.strip().startswith('#') and line.strip()]
    print(f"📄 Found {len(data_lines)} data lines to process")
    # Capture the 11-char video ID straight from the watch URL; this is
    # unambiguous even when the title contains '|' separators.  Compile
    # once outside the loop rather than per line.
    url_pattern = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')
    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{len(data_lines)}... ({i/len(data_lines)*100:.1f}%)")
        # Remove a leading line-number prefix (e.g. " 1234: ") if present.
        line = re.sub(r'^\s*\d+:\s*', '', line.strip())
        url_match = url_pattern.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= 5:  # Only show the first few bad lines
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == 6:
                print(f"⚠️ ... and {len(data_lines) - i - 1} more invalid lines")
            continue
        video_id = url_match.group(1)
        # BUG FIX: slice at the START of the URL, not at the first occurrence
        # of the ID string — the ID sits inside the URL, so the old
        # `line[:line.find(video_id)]` kept the whole
        # "https://www.youtube.com/watch?v=" prefix inside the title.
        title = line[:url_match.start()].strip().rstrip('|').strip()
        # The regex already guarantees an 11-char [A-Za-z0-9_-] ID, so every
        # URL match yields a valid entry.  The old substring checks
        # ("Lyrics"/"KARAOKE"/"Vocal"/"Guide" not in the ID) were meant for
        # titles and would wrongly reject legitimate IDs containing those
        # letter runs, so they are dropped.
        videos.append({"title": title, "id": video_id})
    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")
    return videos
def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Write the parsed video list for *channel_id* to a JSON cache file.

    The channel ID is sanitized for use as a filename and the cache
    directory is created on demand.

    Args:
        channel_id: Channel identifier (e.g. "@SomeChannel").
        videos: List of video dicts to store.
        cache_dir: Directory in which to place the cache file.

    Returns:
        Path: The cache file that was written.
    """
    target_dir = Path(cache_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Replace characters that are illegal in filenames on common platforms.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = target_dir / (safe_name + ".json")
    payload = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos),
    }
    with open(cache_file, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file
def main():
    """Entry point: build the @VocalStarKaraoke cache from its raw dump file."""
    raw_path = Path("data/channel_cache/@VocalStarKaraoke_raw_output.txt")
    # Guard clause: nothing to do without the raw dump.
    if not raw_path.exists():
        print(f"❌ Raw output file not found: {raw_path}")
        return
    # Parse first; bail out early if nothing usable was found.
    parsed = parse_raw_output_file(raw_path)
    if not parsed:
        print("❌ No valid videos found")
        return
    written = save_cache_file("@VocalStarKaraoke", parsed)
    print("🎉 Cache build complete!")
    print(f"📊 Total videos in cache: {len(parsed)}")
    print(f"📁 Cache file: {written}")


if __name__ == "__main__":
    main()