Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>

This commit is contained in:
mbrucedogs 2025-07-27 12:38:35 -05:00
parent 7090fad1fd
commit 2c63bf809b
5 changed files with 170 additions and 137 deletions

1
PRD.md
View File

@ -177,6 +177,7 @@ KaroakeVideoDownloader/
- `--songlist-priority`: Prioritize songlist songs in download queue
- `--songlist-only`: Download only songs from the songlist
- `--songlist-focus <PLAYLIST_TITLE1> <PLAYLIST_TITLE2>...`: Focus on specific playlists by title (e.g., `--songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"`)
- `--songlist-file <FILE_PATH>`: Custom songlist file path (default: data/songList.json); used for songlist downloads with or without `--songlist-focus`, and by `--generate-unmatched-report`
- `--force`: **Force download from channels, bypassing all existing file checks and re-downloading if necessary**
- `--songlist-status`: Show songlist download progress
- `--limit <N>`: Limit number of downloads (enables fast mode with early exit)

View File

@ -112,6 +112,11 @@ python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"
```
### Focus on Specific Playlists from Custom File
```bash
python download_karaoke.py --songlist-focus "CCKaraoke" --songlist-file "data/my_custom_songlist.json"
```
### Force Download from Channels (Bypass All Existing File Checks)
```bash
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --force
@ -255,6 +260,7 @@ KaroakeVideoDownloader/
- `--songlist-priority`: Prioritize songlist songs in download queue
- `--songlist-only`: Download only songs from the songlist
- `--songlist-focus <PLAYLIST_TITLE1> <PLAYLIST_TITLE2>...`: Focus on specific playlists by title (e.g., `--songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"`)
- `--songlist-file <FILE_PATH>`: Custom songlist file path (default: data/songList.json); used for songlist downloads with or without `--songlist-focus`, and by `--generate-unmatched-report`
- `--songlist-status`: Show songlist download progress
- `--limit <N>`: Limit number of downloads (enables fast mode with early exit)
- `--resolution <720p|1080p|...>`: Override resolution
@ -292,6 +298,9 @@ python download_karaoke.py --songlist-only
# Focused fuzzy matching (target specific playlists with flexible matching)
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --fuzzy-match --fuzzy-threshold 80 --limit 10
# Focus on specific playlists from a custom file
python download_karaoke.py --songlist-focus "CCKaraoke" --songlist-file "data/my_custom_songlist.json" --limit 10
# Force download with fuzzy matching (bypass all existing file checks)
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --force --fuzzy-match --fuzzy-threshold 80 --limit 10

View File

@ -65,6 +65,12 @@ python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --limit 5
# Focus on specific playlists with parallel processing
python download_karaoke.py --parallel --workers 3 --songlist-focus "2025 - Apple Top 50" --limit 5
# Focus on specific playlists from a custom songlist file
python download_karaoke.py --songlist-focus "CCKaraoke" --songlist-file "data/my_custom_songlist.json"
# Focus on specific playlists from a custom file with force mode
python download_karaoke.py --songlist-focus "CCKaraoke" --songlist-file "data/my_custom_songlist.json" --force
# Force download from channels regardless of existing files or server duplicates
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --force
@ -82,15 +88,30 @@ python download_karaoke.py --songlist-status
## 📊 UNMATCHED SONGS REPORTS
# Generate report of songs that couldn't be found in any channel
# Generate report of songs that couldn't be found in any channel (standalone)
python download_karaoke.py --generate-unmatched-report
# Generate report with fuzzy matching enabled
# Generate report with fuzzy matching enabled (standalone)
python download_karaoke.py --generate-unmatched-report --fuzzy-match --fuzzy-threshold 85
# Generate report using a specific channel file
# Generate report using a specific channel file (standalone)
python download_karaoke.py --generate-unmatched-report --file data/my_channels.txt
# Generate report from a custom songlist file (standalone)
python download_karaoke.py --generate-unmatched-report --songlist-file "data/my_custom_songlist.json"
# Generate report with focus on specific playlists from a custom file (standalone)
python download_karaoke.py --songlist-focus "CCKaraoke" --songlist-file "data/my_custom_songlist.json" --generate-unmatched-report
# Download songs AND generate unmatched report (additive feature)
python download_karaoke.py --songlist-only --limit 10 --generate-unmatched-report
# Download with fuzzy matching AND generate unmatched report
python download_karaoke.py --songlist-only --fuzzy-match --fuzzy-threshold 85 --limit 10 --generate-unmatched-report
# Download from specific playlists AND generate unmatched report
python download_karaoke.py --songlist-focus "CCKaraoke" --limit 10 --generate-unmatched-report
# Generate report with custom fuzzy threshold
python download_karaoke.py --generate-unmatched-report --fuzzy-match --fuzzy-threshold 80

View File

@ -103,7 +103,7 @@ Examples:
parser.add_argument(
"--generate-unmatched-report",
action="store_true",
help="Generate a report of songs that couldn't be found in any channel",
help="Generate a report of songs that couldn't be found in any channel (runs after downloads)",
)
parser.add_argument(
"--songlist-only",
@ -116,6 +116,11 @@ Examples:
metavar="PLAYLIST_TITLE",
help='Focus on specific playlists by title (e.g., --songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100")',
)
parser.add_argument(
"--songlist-file",
metavar="FILE_PATH",
help="Custom songlist file path to use with --songlist-focus (default: data/songList.json)",
)
parser.add_argument(
"--force",
action="store_true",
@ -235,6 +240,9 @@ Examples:
print(
f"🎯 Songlist focus mode enabled for playlists: {', '.join(args.songlist_focus)}"
)
if args.songlist_file:
downloader.songlist_file_path = args.songlist_file
print(f"📁 Using custom songlist file: {args.songlist_file}")
if args.force:
downloader.force_download = True
print("💪 Force mode enabled - will download regardless of existing files or server duplicates")
@ -337,134 +345,6 @@ Examples:
elif args.cleanup:
orphaned = downloader.tracker.cleanup_orphaned_files(downloader.downloads_dir)
print(f"🧹 Cleaned up {len(orphaned)} orphaned tracking entries")
sys.exit(0)
elif args.generate_unmatched_report:
from karaoke_downloader.download_planner import generate_unmatched_report
from karaoke_downloader.songlist_manager import load_songlist
print("🔍 Generating unmatched songs report...")
# Load songlist based on focus mode
if args.songlist_focus:
# Load focused playlists
songlist_file = Path("data/songList.json")
if not songlist_file.exists():
print("⚠️ Songlist file not found: data/songList.json")
sys.exit(1)
try:
with open(songlist_file, "r", encoding="utf-8") as f:
raw_data = json.load(f)
# Filter playlists by title
focused_playlists = []
print(f"🔍 Looking for playlists: {args.songlist_focus}")
print(f"🔍 Available playlists in songList.json:")
for i, playlist in enumerate(raw_data[:5]): # Show first 5 playlists
print(f" {i+1}. '{playlist.get('title', 'NO TITLE')}'")
if len(raw_data) > 5:
print(f" ... and {len(raw_data) - 5} more playlists")
for playlist in raw_data:
playlist_title = playlist.get("title", "")
if playlist_title in args.songlist_focus:
focused_playlists.append(playlist)
print(f"✅ Found matching playlist: '{playlist_title}'")
if not focused_playlists:
print(
f"⚠️ No playlists found matching the specified titles: {', '.join(args.songlist_focus)}"
)
sys.exit(1)
# Flatten the focused playlists into songs
focused_songs = []
seen = set()
for playlist in focused_playlists:
if "songs" in playlist:
for song in playlist["songs"]:
if "artist" in song and "title" in song:
artist = song["artist"].strip()
title = song["title"].strip()
key = f"{artist.lower()}_{title.lower()}"
if key in seen:
continue
seen.add(key)
focused_songs.append(
{
"artist": artist,
"title": title,
"position": song.get("position", 0),
}
)
songlist = focused_songs
print(
f"\n🎯 Songlist focus mode: {len(focused_songs)} songs from {len(focused_playlists)} playlists selected"
)
print(f"🎯 Focused playlists: {', '.join(args.songlist_focus)}")
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load songlist for filtering: {e}")
sys.exit(1)
else:
# Load all songs from songlist
songlist = load_songlist()
if not songlist:
print("❌ No songlist found. Please ensure data/songList.json exists.")
sys.exit(1)
# Load channel URLs
channel_file = args.file if args.file else "data/channels.txt"
if not os.path.exists(channel_file):
print(f"❌ Channel file not found: {channel_file}")
sys.exit(1)
with open(channel_file, "r", encoding='utf-8') as f:
channel_urls = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
print(f"📋 Analyzing {len(songlist)} songs against {len(channel_urls)} channels...")
# Build download plan to get unmatched songs
from karaoke_downloader.download_planner import build_download_plan
fuzzy_match = args.fuzzy_match if hasattr(args, "fuzzy_match") else False
fuzzy_threshold = (
args.fuzzy_threshold
if hasattr(args, "fuzzy_threshold")
else DEFAULT_FUZZY_THRESHOLD
)
try:
download_plan, unmatched = build_download_plan(
channel_urls,
songlist,
downloader.tracker,
downloader.yt_dlp_path,
fuzzy_match=fuzzy_match,
fuzzy_threshold=fuzzy_threshold,
)
if unmatched:
report_file = generate_unmatched_report(unmatched)
print(f"\n📋 Unmatched songs report generated successfully!")
print(f"📁 Report saved to: {report_file}")
print(f"📊 Summary: {len(download_plan)} songs found, {len(unmatched)} songs not found")
print(f"\n🔍 First 10 unmatched songs:")
for i, song in enumerate(unmatched[:10], 1):
print(f" {i:2d}. {song['artist']} - {song['title']}")
if len(unmatched) > 10:
print(f" ... and {len(unmatched) - 10} more songs")
else:
print(f"\n✅ All {len(songlist)} songs were found in the channels!")
except Exception as e:
print(f"❌ Error generating report: {e}")
sys.exit(1)
sys.exit(0)
elif args.songlist_status:
songlist = downloader._load_songlist()
@ -551,6 +431,127 @@ Examples:
else:
parser.print_help()
sys.exit(1)
# Generate unmatched report if requested (additive feature)
if args.generate_unmatched_report:
from karaoke_downloader.download_planner import generate_unmatched_report, build_download_plan
from karaoke_downloader.songlist_manager import load_songlist
print("\n🔍 Generating unmatched songs report...")
# Load songlist based on focus mode
if args.songlist_focus:
# Load focused playlists
songlist_file_path = args.songlist_file if args.songlist_file else "data/songList.json"
songlist_file = Path(songlist_file_path)
if not songlist_file.exists():
print(f"⚠️ Songlist file not found: {songlist_file_path}")
else:
try:
with open(songlist_file, "r", encoding="utf-8") as f:
raw_data = json.load(f)
# Filter playlists by title
focused_playlists = []
for playlist in raw_data:
playlist_title = playlist.get("title", "")
if playlist_title in args.songlist_focus:
focused_playlists.append(playlist)
if focused_playlists:
# Flatten the focused playlists into songs
focused_songs = []
seen = set()
for playlist in focused_playlists:
if "songs" in playlist:
for song in playlist["songs"]:
if "artist" in song and "title" in song:
artist = song["artist"].strip()
title = song["title"].strip()
key = f"{artist.lower()}_{title.lower()}"
if key in seen:
continue
seen.add(key)
focused_songs.append(
{
"artist": artist,
"title": title,
"position": song.get("position", 0),
}
)
songlist = focused_songs
else:
print(f"⚠️ No playlists found matching: {', '.join(args.songlist_focus)}")
songlist = []
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load songlist for report: {e}")
songlist = []
else:
# Load all songs from songlist
songlist_path = args.songlist_file if args.songlist_file else "data/songList.json"
songlist = load_songlist(songlist_path)
if songlist:
# Load channel URLs
channel_file = args.file if args.file else "data/channels.txt"
if os.path.exists(channel_file):
with open(channel_file, "r", encoding='utf-8') as f:
channel_urls = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
print(f"📋 Analyzing {len(songlist)} songs against {len(channel_urls)} channels...")
# Build download plan to get unmatched songs
fuzzy_match = args.fuzzy_match if hasattr(args, "fuzzy_match") else False
fuzzy_threshold = (
args.fuzzy_threshold
if hasattr(args, "fuzzy_threshold")
else DEFAULT_FUZZY_THRESHOLD
)
try:
download_plan, unmatched = build_download_plan(
channel_urls,
songlist,
downloader.tracker,
downloader.yt_dlp_path,
fuzzy_match=fuzzy_match,
fuzzy_threshold=fuzzy_threshold,
)
if unmatched:
report_file = generate_unmatched_report(unmatched)
print(f"\n📋 Unmatched songs report generated successfully!")
print(f"📁 Report saved to: {report_file}")
print(f"📊 Summary: {len(download_plan)} songs found, {len(unmatched)} songs not found")
print(f"\n🔍 First 10 unmatched songs:")
for i, song in enumerate(unmatched[:10], 1):
print(f" {i:2d}. {song['artist']} - {song['title']}")
if len(unmatched) > 10:
print(f" ... and {len(unmatched) - 10} more songs")
else:
print(f"\n✅ All {len(songlist)} songs were found in the channels!")
except Exception as e:
print(f"❌ Error generating report: {e}")
else:
print(f"❌ Channel file not found: {channel_file}")
else:
print("❌ No songlist available for report generation")
# If no download command was specified but generate-unmatched-report was used, exit here
if not any([args.songlist_only, args.songlist_focus, args.latest_per_channel, args.url]):
if args.generate_unmatched_report:
sys.exit(0)
else:
parser.print_help()
sys.exit(1)
downloader.tracker.force_save()
if success:
print("\n🎤 All downloads completed successfully!")

View File

@ -115,6 +115,7 @@ class KaraokeDownloader:
self.use_songlist_priority = True
self.download_limit = None
self.force_download = False
self.songlist_file_path = "data/songList.json" # Default songlist file path
def _load_config(self):
"""Load configuration using the config manager."""
@ -183,7 +184,7 @@ class KaraokeDownloader:
"""Download videos from a channel or playlist URL, respecting songlist-only and limit flags. Supports fuzzy matching."""
channel_name, channel_id = get_channel_info(url)
print(f"\n🎬 Downloading from channel: {channel_name} ({url})")
songlist = load_songlist()
songlist = load_songlist(self.songlist_file_path)
if not songlist:
print("⚠️ No songlist loaded. Skipping.")
return False
@ -325,9 +326,9 @@ class KaraokeDownloader:
# Apply songlist focus filtering if specified
if self.songlist_focus_titles:
# Load the raw songlist data to filter by playlist titles
songlist_file = Path("data/songList.json")
songlist_file = Path(self.songlist_file_path)
if not songlist_file.exists():
print("⚠️ Songlist file not found: data/songList.json")
print(f"⚠️ Songlist file not found: {self.songlist_file_path}")
return False
try:
@ -383,11 +384,11 @@ class KaraokeDownloader:
print(f"🎯 Focused playlists: {', '.join(self.songlist_focus_titles)}")
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load songlist for filtering: {e}")
print(f"⚠️ Could not load songlist for filtering from {self.songlist_file_path}: {e}")
return False
else:
# Load songlist normally (flattened from all playlists)
songlist = load_songlist()
songlist = load_songlist(self.songlist_file_path)
if not songlist:
print("⚠️ No songlist loaded. Skipping.")
return False