#!/usr/bin/env python3
"""
Script to build channel cache from raw yt-dlp output file.
This uses the fixed parsing logic to handle titles with | characters.
"""

import json
import re
from datetime import datetime
from pathlib import Path

# Canonical YouTube watch URL; group 1 captures the 11-character video ID.
# Compiled once here instead of per data line inside the parse loop.
_WATCH_URL_RE = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')
# Leading "   1234: " line-number prefix prepended to each raw line.
_LINE_PREFIX_RE = re.compile(r'^\s*\d+:\s*')


def parse_raw_output_file(raw_file_path):
    """Parse the raw output file and extract valid videos.

    Each data line is expected to hold a video title followed by a YouTube
    watch URL.  The video ID is taken from the URL itself, which stays
    unambiguous even when the title contains '|' characters.

    Args:
        raw_file_path: Path (str or Path) to the raw yt-dlp output file.

    Returns:
        list[dict]: Entries of the form {"title": str, "id": str}.
    """
    videos = []
    invalid_count = 0

    print(f"🔍 Parsing raw output file: {raw_file_path}")

    with open(raw_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip header lines (lines starting with #) and blank lines.
    data_lines = [line for line in lines if not line.strip().startswith('#') and line.strip()]
    print(f"📄 Found {len(data_lines)} data lines to process")

    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{len(data_lines)}... ({i/len(data_lines)*100:.1f}%)")

        # Remove line number prefix (e.g., "  1234: ")
        line = _LINE_PREFIX_RE.sub('', line.strip())

        # Anchor parsing on the URL that yt-dlp provides; the video ID is
        # extracted from it rather than split out of '|'-delimited fields.
        url_match = _WATCH_URL_RE.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= 5:
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == 6:
                print(f"⚠️ ... and {len(data_lines) - i - 1} more invalid lines")
            continue

        # Extract video ID directly from the URL
        video_id = url_match.group(1)

        # Title is everything before the video ID's first appearance.
        # BUG FIX: the old code sliced at line.find(video_id) alone, which —
        # when the ID only appears inside the URL — left the prefix
        # "https://www.youtube.com/watch?v=" glued onto the title, and its
        # rstrip('|') never fired past a trailing space.  Cut at whichever
        # comes first (a standalone ID field or the URL itself) and strip
        # trailing pipes AND spaces together.
        cut = min(line.find(video_id), url_match.start())
        title = line[:cut].rstrip('| ').strip()

        # NOTE(review): the capture regex already guarantees an 11-char
        # [A-Za-z0-9_-] ID (the old length/isalnum/space checks were thus
        # provably redundant and have been dropped).  The keyword checks
        # below look as if they were meant for titles rather than IDs —
        # kept as-is to preserve existing filtering behaviour; confirm intent.
        if (
            "Lyrics" not in video_id
            and "KARAOKE" not in video_id.upper()
            and "Vocal" not in video_id
            and "Guide" not in video_id
        ):
            videos.append({"title": title, "id": video_id})
        else:
            invalid_count += 1
            if invalid_count <= 5:  # Only show first 5 invalid IDs
                print(f"⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
            elif invalid_count == 6:
                print(f"⚠️ ... and {len(data_lines) - i - 1} more invalid IDs")

    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")
    return videos


def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Save the parsed videos to a cache file.

    Args:
        channel_id: Channel identifier (e.g. "@VocalStarKaraoke"); filesystem-
            unsafe characters are replaced with '_' for the filename only.
        videos: List of {"title": ..., "id": ...} dicts to persist.
        cache_dir: Directory for cache files; created if missing.

    Returns:
        Path: The written cache file.
    """
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Sanitize channel ID for filename
    safe_channel_id = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = cache_dir / f"{safe_channel_id}.json"

    data = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos)
    }

    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file


def main():
    """Main function to build cache from raw output."""
    raw_file_path = Path("data/channel_cache/@VocalStarKaraoke_raw_output.txt")

    if not raw_file_path.exists():
        print(f"❌ Raw output file not found: {raw_file_path}")
        return

    # Parse the raw output file
    videos = parse_raw_output_file(raw_file_path)

    if not videos:
        print("❌ No valid videos found")
        return

    # Save to cache file
    channel_id = "@VocalStarKaraoke"
    cache_file = save_cache_file(channel_id, videos)

    print(f"🎉 Cache build complete!")
    print(f"📊 Total videos in cache: {len(videos)}")
    print(f"📁 Cache file: {cache_file}")


if __name__ == "__main__":
    main()