# KaraokeVideoDownloader/build_cache_from_raw.py
# (paste metadata: 122 lines, 4.3 KiB, Python)
#!/usr/bin/env python3
"""
Script to build channel cache from raw yt-dlp output file.
This uses the fixed parsing logic to handle titles with | characters.
"""
import json
import re
from datetime import datetime
from pathlib import Path
def parse_raw_output_file(raw_file_path):
    """Parse a raw yt-dlp output file and extract valid videos.

    Each data line is expected to hold a video title followed by a full
    YouTube watch URL.  Titles may themselves contain ``|`` characters, so
    the video ID is taken from the URL rather than by field splitting.

    Args:
        raw_file_path: Path to the raw output text file (str or Path).

    Returns:
        list[dict]: Entries of the form ``{"title": str, "id": str}``.
    """
    videos = []
    invalid_count = 0
    print(f"🔍 Parsing raw output file: {raw_file_path}")
    with open(raw_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Skip header lines (lines starting with #) and blank lines.
    data_lines = [line for line in lines if not line.strip().startswith('#') and line.strip()]
    print(f"📄 Found {len(data_lines)} data lines to process")
    # Capture the 11-char video ID straight from the watch URL; this is
    # unambiguous even when the title contains '|' separators.  Compile
    # once outside the loop rather than per line.
    url_pattern = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')
    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{len(data_lines)}... ({i/len(data_lines)*100:.1f}%)")
        # Remove a leading line-number prefix (e.g. " 1234: ") if present.
        line = re.sub(r'^\s*\d+:\s*', '', line.strip())
        url_match = url_pattern.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= 5:  # Only show the first few bad lines
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == 6:
                print(f"⚠️ ... and {len(data_lines) - i - 1} more invalid lines")
            continue
        video_id = url_match.group(1)
        # BUG FIX: slice at the START of the URL, not at the first occurrence
        # of the ID string — the ID sits inside the URL, so the old
        # `line[:line.find(video_id)]` kept the whole
        # "https://www.youtube.com/watch?v=" prefix inside the title.
        title = line[:url_match.start()].strip().rstrip('|').strip()
        # The regex already guarantees an 11-char [A-Za-z0-9_-] ID, so every
        # URL match yields a valid entry.  The old substring checks
        # ("Lyrics"/"KARAOKE"/"Vocal"/"Guide" not in the ID) were meant for
        # titles and would wrongly reject legitimate IDs containing those
        # letter runs, so they are dropped.
        videos.append({"title": title, "id": video_id})
    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")
    return videos
def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Write the parsed video list for *channel_id* to a JSON cache file.

    The channel ID is sanitized for use as a filename and the cache
    directory is created on demand.

    Args:
        channel_id: Channel identifier (e.g. "@SomeChannel").
        videos: List of video dicts to store.
        cache_dir: Directory in which to place the cache file.

    Returns:
        Path: The cache file that was written.
    """
    target_dir = Path(cache_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Replace characters that are illegal in filenames on common platforms.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = target_dir / (safe_name + ".json")
    payload = {
        'channel_id': channel_id,
        'videos': videos,
        'last_updated': datetime.now().isoformat(),
        'video_count': len(videos),
    }
    with open(cache_file, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file
def main():
    """Entry point: build the @VocalStarKaraoke cache from its raw dump file."""
    raw_path = Path("data/channel_cache/@VocalStarKaraoke_raw_output.txt")
    # Guard clause: nothing to do without the raw dump.
    if not raw_path.exists():
        print(f"❌ Raw output file not found: {raw_path}")
        return
    # Parse first; bail out early if nothing usable was found.
    parsed = parse_raw_output_file(raw_path)
    if not parsed:
        print("❌ No valid videos found")
        return
    written = save_cache_file("@VocalStarKaraoke", parsed)
    print("🎉 Cache build complete!")
    print(f"📊 Total videos in cache: {len(parsed)}")
    print(f"📁 Cache file: {written}")


if __name__ == "__main__":
    main()