122 lines
4.3 KiB
Python
122 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Build a channel cache from a raw yt-dlp output file.

Video titles may contain '|' characters, so each video ID is taken from the
watch URL on the line instead of being located by splitting on '|'.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Leading "<number>: " prefix on a raw line (e.g. " 1234: ").
# NOTE(review): a title that itself starts with "<digits>:" would lose that
# prefix too when the line carries no line number -- confirm the raw format.
_LINE_NUMBER_PREFIX = re.compile(r'^\s*\d+:\s*')

# Watch URL as it appears on each data line; group 1 is the 11-character video ID.
_WATCH_URL = re.compile(r'https://www\.youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})')

# IDs containing these words are rejected (case-sensitive match); presumably
# they indicate a title fragment captured where an ID was expected.
_REJECTED_ID_WORDS = ("Lyrics", "Vocal", "Guide")

# Only the first few skipped lines are echoed individually.
_MAX_WARNINGS = 5
_WARNINGS_SUPPRESSED = "⚠️ ... further invalid-line warnings suppressed"


def _looks_like_video_id(video_id):
    """Return True when *video_id* passes the sanity filters for parsed IDs."""
    return (
        len(video_id) == 11
        # An ID made only of '-' / '_' leaves an empty string, which fails isalnum().
        and video_id.replace('-', '').replace('_', '').isalnum()
        and " " not in video_id
        and "KARAOKE" not in video_id.upper()
        and not any(word in video_id for word in _REJECTED_ID_WORDS)
    )


def parse_raw_output_file(raw_file_path):
    """Parse the raw output file and extract valid videos.

    Blank lines and lines starting with '#' are ignored. For every remaining
    line an optional "<number>: " prefix is removed, the video ID is read from
    the YouTube watch URL on the line, and the title is the text that precedes
    the ID column (or the URL, when the line has no separate ID column).

    Args:
        raw_file_path: Path of the UTF-8 text file to parse.

    Returns:
        A list of ``{"title": str, "id": str}`` dicts, in file order, one per
        line that has a watch URL and an ID passing the sanity filters.
    """
    videos = []
    invalid_count = 0

    print(f"🔍 Parsing raw output file: {raw_file_path}")

    with open(raw_file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (raw.strip() for raw in f)
        # Skip header lines (lines starting with #) and blank lines.
        data_lines = [s for s in stripped_lines if s and not s.startswith('#')]

    total = len(data_lines)
    print(f"📄 Found {total} data lines to process")

    for i, line in enumerate(data_lines):
        if i % 1000 == 0 and i > 0:  # Progress indicator every 1000 lines
            print(f"📊 Processing line {i}/{total}... ({i/total*100:.1f}%)")

        line = _LINE_NUMBER_PREFIX.sub('', line)

        url_match = _WATCH_URL.search(line)
        if not url_match:
            invalid_count += 1
            if invalid_count <= _MAX_WARNINGS:
                print(f"⚠️ Skipping line with no URL: '{line[:100]}...'")
            elif invalid_count == _MAX_WARNINGS + 1:
                # The number of *remaining* lines says nothing about how many
                # of them are invalid, so only announce the suppression.
                print(_WARNINGS_SUPPRESSED)
            continue

        video_id = url_match.group(1)

        # Cut the title at the ID column, i.e. the last occurrence of the ID
        # before the URL. Searching from the right keeps titles that happen to
        # contain the ID text intact; when there is no ID column at all the
        # title simply ends where the URL begins.
        id_column = line.rfind(video_id, 0, url_match.start())
        title_end = id_column if id_column != -1 else url_match.start()
        title = line[:title_end].rstrip('|').strip()

        if _looks_like_video_id(video_id):
            videos.append({"title": title, "id": video_id})
            continue

        invalid_count += 1
        if invalid_count <= _MAX_WARNINGS:  # Only show the first few invalid IDs
            print(f"⚠️ Skipping invalid video ID: '{video_id}' for title: '{title[:50]}...'")
        elif invalid_count == _MAX_WARNINGS + 1:
            print(_WARNINGS_SUPPRESSED)

    print(f"✅ Parsed {len(videos)} valid videos from raw output")
    print(f"⚠️ Skipped {invalid_count} invalid video IDs")

    return videos
|
|
|
|
def save_cache_file(channel_id, videos, cache_dir="data/channel_cache"):
    """Write the parsed videos for a channel to a JSON cache file.

    Args:
        channel_id: Channel identifier; stored verbatim in the JSON and, with
            the characters ``<>:"/\\|?*`` replaced by ``_``, used as the file
            name stem.
        videos: List of video entries to store under the ``videos`` key.
        cache_dir: Directory that receives the file; created if missing.

    Returns:
        The ``Path`` of the JSON file that was written.
    """
    target_dir = Path(cache_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Swap out characters that are not allowed in file names on common platforms.
    file_stem = re.sub(r'[<>:"/\\|?*]', '_', channel_id)
    cache_file = target_dir / f"{file_stem}.json"

    payload = {}
    payload['channel_id'] = channel_id
    payload['videos'] = videos
    payload['last_updated'] = datetime.now().isoformat()
    payload['video_count'] = len(videos)

    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with cache_file.open('w', encoding='utf-8') as handle:
        handle.write(serialized)

    print(f"💾 Saved cache to: {cache_file.name}")
    return cache_file
|
|
|
|
def main(channel_id="@VocalStarKaraoke", raw_file_path=None, cache_dir="data/channel_cache"):
    """Build the JSON cache for a channel from its raw output file.

    Called without arguments this processes
    ``data/channel_cache/@VocalStarKaraoke_raw_output.txt`` and saves the
    result for ``@VocalStarKaraoke`` in ``data/channel_cache``.

    Args:
        channel_id: Channel identifier passed to ``save_cache_file``.
        raw_file_path: Raw output file to parse. When ``None`` it defaults to
            ``<cache_dir>/<channel_id>_raw_output.txt`` so the file and the
            channel cannot drift apart.
        cache_dir: Directory passed to ``save_cache_file`` and used for the
            default raw file location.

    Returns:
        None. Progress and errors are reported on stdout; a missing input
        file or an empty parse result ends the run early.
    """
    if raw_file_path is None:
        raw_file_path = Path(cache_dir) / f"{channel_id}_raw_output.txt"
    else:
        raw_file_path = Path(raw_file_path)

    if not raw_file_path.exists():
        print(f"❌ Raw output file not found: {raw_file_path}")
        return

    # Parse the raw output file
    videos = parse_raw_output_file(raw_file_path)
    if not videos:
        print("❌ No valid videos found")
        return

    # Save to cache file
    cache_file = save_cache_file(channel_id, videos, cache_dir=cache_dir)

    print("🎉 Cache build complete!")
    print(f"📊 Total videos in cache: {len(videos)}")
    print(f"📁 Cache file: {cache_file}")
|
|
|
|
# Run the cache build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()