From 84088b442483f9a2f171a072694638ccf0e8e6a4 Mon Sep 17 00:00:00 2001 From: mbrucedogs Date: Fri, 25 Jul 2025 08:35:12 -0500 Subject: [PATCH] Signed-off-by: mbrucedogs --- PRD.md | 37 +- README.md | 32 +- data/server_duplicates_tracking.json | 338 +++++++++++ karaoke_downloader/cache_manager.py | 76 +++ karaoke_downloader/channel_manager.py | 93 +++ karaoke_downloader/cli.py | 22 +- karaoke_downloader/config_manager.py | 77 +++ karaoke_downloader/download_planner.py | 129 +++++ karaoke_downloader/downloader.py | 763 ++++++++++--------------- karaoke_downloader/fuzzy_matcher.py | 87 +++ karaoke_downloader/server_manager.py | 86 +++ karaoke_downloader/songlist_manager.py | 38 +- karaoke_downloader/tracking_manager.py | 99 ++++ karaoke_downloader/video_downloader.py | 327 +++++++++++ 14 files changed, 1719 insertions(+), 485 deletions(-) create mode 100644 data/server_duplicates_tracking.json create mode 100644 karaoke_downloader/cache_manager.py create mode 100644 karaoke_downloader/channel_manager.py create mode 100644 karaoke_downloader/config_manager.py create mode 100644 karaoke_downloader/download_planner.py create mode 100644 karaoke_downloader/fuzzy_matcher.py create mode 100644 karaoke_downloader/server_manager.py create mode 100644 karaoke_downloader/video_downloader.py diff --git a/PRD.md b/PRD.md index 3f48e56..21f46b4 100644 --- a/PRD.md +++ b/PRD.md @@ -1,8 +1,27 @@ -# 🎤 Karaoke Video Downloader – PRD (v2.2) +# 🎤 Karaoke Video Downloader – PRD (v3.1) ## ✅ Overview -A Python-based Windows CLI tool to download karaoke videos from YouTube channels/playlists using `yt-dlp.exe`, with advanced tracking, songlist prioritization, and flexible configuration. +A Python-based Windows CLI tool to download karaoke videos from YouTube channels/playlists using `yt-dlp.exe`, with advanced tracking, songlist prioritization, and flexible configuration. 
The codebase has been refactored into a modular architecture for improved maintainability and separation of concerns. + +--- + +## 🏗️ Architecture +The codebase has been refactored into focused modules: + +- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions +- **`download_planner.py`**: Download plan building and channel scanning (optimized) +- **`cache_manager.py`**: Cache operations and file I/O management +- **`video_downloader.py`**: Core video download execution and orchestration +- **`channel_manager.py`**: Channel and file management operations +- **`downloader.py`**: Main orchestrator and CLI interface + +### Benefits of Modular Architecture: +- **Single Responsibility**: Each module has a focused purpose +- **Testability**: Individual components can be tested separately +- **Maintainability**: Easier to find and fix issues +- **Reusability**: Components can be used independently +- **Robustness**: Better error handling and interruption recovery --- @@ -12,6 +31,7 @@ A Python-based Windows CLI tool to download karaoke videos from YouTube channels - Avoid re-downloading the same videos (robust tracking). - Prioritize and track a custom songlist across channels. - Allow flexible, user-friendly configuration. +- Provide robust interruption handling and progress recovery. --- @@ -71,6 +91,8 @@ python download_karaoke.py --clear-cache SingKingKaraoke - ✅ **Deduplication across channels**: Ensures the same song (by artist + normalized title) is not downloaded more than once, even if it appears in multiple channels. Tracks unique keys and skips duplicates. - ✅ **Fuzzy matching**: Optionally use fuzzy string matching for songlist-to-video matching with configurable threshold (0-100, default 85). Uses rapidfuzz if available, falls back to difflib. - ✅ **Default channel file**: If no --file is specified for songlist-only or latest-per-channel modes, automatically uses data/channels.txt as the default channel list. 
+- ✅ **Robust interruption handling**: Progress is saved after each download, and files are checked for existence before downloading to prevent re-downloads if the process is interrupted. +- ✅ **Optimized scanning performance**: High-performance channel scanning with O(n×m) complexity, pre-processed lookups, and early termination for faster matching of large songlists and channels. --- @@ -78,8 +100,13 @@ python download_karaoke.py --clear-cache SingKingKaraoke ``` KaroakeVideoDownloader/ ├── karaoke_downloader/ # All core Python code and utilities -│ ├── downloader.py # Main downloader class +│ ├── downloader.py # Main orchestrator and CLI interface │ ├── cli.py # CLI entry point +│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions +│ ├── download_planner.py # Download plan building and channel scanning (optimized) +│ ├── cache_manager.py # Cache operations and file I/O management +│ ├── video_downloader.py # Core video download execution and orchestration +│ ├── channel_manager.py # Channel and file management operations │ ├── id3_utils.py # ID3 tagging helpers │ ├── songlist_manager.py # Songlist logic │ ├── youtube_utils.py # YouTube helpers @@ -140,6 +167,8 @@ KaroakeVideoDownloader/ - **Deduplication across channels:** Tracks unique song keys (artist + normalized title) to ensure the same song is not downloaded from multiple channels, even if it appears in more than one channel's video list. - **Fuzzy matching:** Uses string similarity algorithms to find approximate matches between songlist entries and video titles, tolerating minor differences, typos, or extra words like "Karaoke" or "Official Video". - **Default channel file:** For songlist-only and latest-per-channel modes, if no --file is specified, automatically uses data/channels.txt as the default channel list, reducing the need to specify the file path repeatedly. 
+- **Robust interruption handling:** Progress is saved after each download, and files are checked for existence before downloading to prevent re-downloads if the process is interrupted. +- **Optimized scanning algorithm:** High-performance channel scanning with O(n×m) complexity, pre-processed song lookups using sets and dictionaries, and early termination for faster matching of large songlists and channels. --- @@ -149,3 +178,5 @@ KaroakeVideoDownloader/ - [ ] Download scheduling and retry logic - [ ] More granular status reporting - [ ] Parallel downloads for improved speed +- [ ] Unit tests for all modules +- [ ] Integration tests for end-to-end workflows diff --git a/README.md b/README.md index 9c1add4..3cf922b 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,25 @@ A Python-based Windows CLI tool to download karaoke videos from YouTube channels - 🧹 **Automatic Cleanup**: Removes extra yt-dlp files - 📈 **Real-Time Progress**: Detailed console and log output - 🧹 **Reset/Clear Channel**: Reset all tracking and files for a channel, or clear channel cache via CLI -- 🗂️ **Latest-per-channel download**: Download the latest N videos from each channel in a single batch, with a per-channel download plan, robust resume, and unique plan cache. Use --latest-per-channel and --limit N. +- 🗂️ **Latest-per-channel download**: Download the latest N videos from each channel in a single batch, with server deduplication, fuzzy matching support, per-channel download plan, robust resume, and unique plan cache. Use --latest-per-channel and --limit N. 
- 🧩 **Fuzzy Matching**: Optionally use fuzzy string matching for songlist-to-video matching (with --fuzzy-match, requires rapidfuzz for best results) - ⚡ **Fast Mode with Early Exit**: When a limit is set, scans channels and songs in order, downloads immediately when a match is found, and stops as soon as the limit is reached with successful downloads - 🔄 **Deduplication Across Channels**: Ensures the same song is not downloaded from multiple channels, even if it appears in more than one channel's video list - 📋 **Default Channel File**: Automatically uses data/channels.txt as the default channel list for songlist modes (no need to specify --file every time) +- 🛡️ **Robust Interruption Handling**: Progress is saved after each download, preventing re-downloads if the process is interrupted +- ⚡ **Optimized Scanning**: High-performance channel scanning with O(n×m) complexity, pre-processed lookups, and early termination for faster matching +- 🏷️ **Server Duplicates Tracking**: Automatically checks against local songs.json file and marks duplicates for future skipping, preventing re-downloads of songs already on the server + +## 🏗️ Architecture +The codebase has been refactored into a modular architecture for better maintainability and separation of concerns: + +- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions +- **`download_planner.py`**: Download plan building and channel scanning (optimized) +- **`cache_manager.py`**: Cache operations and file I/O management +- **`server_manager.py`**: Server songs loading and server duplicates tracking +- **`video_downloader.py`**: Core video download execution and orchestration +- **`channel_manager.py`**: Channel and file management operations +- **`downloader.py`**: Main orchestrator and CLI interface ## 📋 Requirements - **Windows 10/11** @@ -48,6 +62,11 @@ python download_karaoke.py --songlist-only --limit 10 --fuzzy-match --fuzzy-thre python download_karaoke.py 
--latest-per-channel --limit 5 ``` +### Download Latest N Videos Per Channel (with fuzzy matching) +```bash +python download_karaoke.py --latest-per-channel --limit 5 --fuzzy-match --fuzzy-threshold 85 +``` + ### Prioritize Songlist in Download Queue ```bash python download_karaoke.py --songlist-priority @@ -101,14 +120,21 @@ python download_karaoke.py --clear-cache all ## 🛠️ Tracking & Caching - **data/karaoke_tracking.json**: Tracks all downloads, statuses, and formats - **data/songlist_tracking.json**: Tracks global songlist download progress +- **data/server_duplicates_tracking.json**: Tracks songs found to be duplicates on the server for future skipping - **data/channel_cache.json**: Caches channel video lists for performance ## 📂 Folder Structure ``` KaroakeVideoDownloader/ ├── karaoke_downloader/ # All core Python code and utilities -│ ├── downloader.py # Main downloader class +│ ├── downloader.py # Main orchestrator and CLI interface │ ├── cli.py # CLI entry point +│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions +│ ├── download_planner.py # Download plan building and channel scanning (optimized) +│ ├── cache_manager.py # Cache operations and file I/O management +│ ├── server_manager.py # Server songs loading and server duplicates tracking +│ ├── video_downloader.py # Core video download execution and orchestration +│ ├── channel_manager.py # Channel and file management operations │ ├── id3_utils.py # ID3 tagging helpers │ ├── songlist_manager.py # Songlist logic │ ├── youtube_utils.py # YouTube helpers @@ -147,6 +173,7 @@ KaroakeVideoDownloader/ - `--reset-channel `: **Reset all tracking and files for a channel** - `--reset-songlist`: **When used with --reset-channel, also reset songlist songs for this channel** - `--clear-cache `: **Clear channel video cache for a specific channel or all** +- `--clear-server-duplicates`: 
**Clear server duplicates tracking (allows re-checking songs against server)** - `--latest-per-channel`: **Download the latest N videos from each channel (use with --limit)** - `--fuzzy-match`: Enable fuzzy matching for songlist-to-video matching (uses rapidfuzz if available) - `--fuzzy-threshold `: Fuzzy match threshold (0-100, default 85) @@ -166,6 +193,7 @@ python download_karaoke.py --songlist-only python download_karaoke.py --reset-channel SingKingKaraoke python download_karaoke.py --reset-channel SingKingKaraoke --reset-songlist python download_karaoke.py --clear-cache all +python download_karaoke.py --clear-server-duplicates ``` ## 🏷️ ID3 Tagging diff --git a/data/server_duplicates_tracking.json b/data/server_duplicates_tracking.json new file mode 100644 index 0000000..8944839 --- /dev/null +++ b/data/server_duplicates_tracking.json @@ -0,0 +1,338 @@ +{ + "little richard_long tall sally": { + "artist": "Little Richard", + "title": "Long Tall Sally", + "video_title": "Little Richard - Long Tall Sally (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-24T22:11:47.738475", + "reason": "already_on_server" + }, + "lobo_me and you and a dog named boo": { + "artist": "Lobo", + "title": "Me And You And A Dog Named Boo", + "video_title": "Lobo - Me And You And A Dog Named Boo (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T07:26:38.285721", + "reason": "already_on_server" + }, + "royal teens_short shorts": { + "artist": "Royal Teens", + "title": "Short Shorts", + "video_title": "Royal Teens - Short Shorts (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T07:26:38.286537", + "reason": "already_on_server" + }, + "traveling wilburys_end of the line": { + "artist": "Traveling Wilburys", + "title": "End Of The Line", + "video_title": "Traveling Wilburys - End Of The Line (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T07:52:51.215910", + "reason": "already_on_server" + }, + "george 
jones_a picture of me (without you)": { + "artist": "George Jones", + "title": "A Picture Of Me (Without You)", + "video_title": "George Jones - A Picture Of Me (Without You) (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:02:17.408739", + "reason": "already_on_server" + }, + "lola young_messy": { + "artist": "Lola Young", + "title": "Messy", + "video_title": "Lola Young - Messy (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:02:17.429626", + "reason": "already_on_server" + }, + "gigi perez_sailor song": { + "artist": "Gigi Perez", + "title": "Sailor Song", + "video_title": "Gigi Perez - Sailor Song (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:02:17.431932", + "reason": "already_on_server" + }, + "sum 41_fat lip": { + "artist": "Sum 41", + "title": "Fat Lip", + "video_title": "Sum 41 - Fat Lip (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:02:17.434162", + "reason": "already_on_server" + }, + "the verve_bitter sweet symphony": { + "artist": "The Verve", + "title": "Bitter Sweet Symphony", + "video_title": "The Verve - Bitter Sweet Symphony (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:02:17.436617", + "reason": "already_on_server" + }, + "lionel richie_all night long": { + "artist": "Lionel Richie", + "title": "All Night Long", + "video_title": "Lionel Richie - All Night Long (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:02:17.440237", + "reason": "already_on_server" + }, + "kenny rogers_the gambler": { + "artist": "Kenny Rogers", + "title": "The Gambler", + "video_title": "Kenny Rogers - The Gambler (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:06:39.419631", + "reason": "already_on_server" + }, + "rod stewart_maggie may": { + "artist": "Rod Stewart", + "title": "Maggie May", + "video_title": "Rod Stewart - Maggie May (Karaoke 
Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:06:39.422101", + "reason": "already_on_server" + }, + "tom jones_it's not unusual": { + "artist": "Tom Jones", + "title": "It's Not Unusual", + "video_title": "Tom Jones - It's Not Unusual (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:06:39.424447", + "reason": "already_on_server" + }, + "morgan wallen_i got better": { + "artist": "Morgan Wallen", + "title": "I Got Better", + "video_title": "Morgan Wallen - I Got Better (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.590485", + "reason": "already_on_server" + }, + "ella langley_weren't for the wind": { + "artist": "Ella Langley", + "title": "weren't for the wind", + "video_title": "Ella Langley - weren't for the wind (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.593194", + "reason": "already_on_server" + }, + "bell biv devoe_poison": { + "artist": "Bell Biv Devoe", + "title": "Poison", + "video_title": "Bell Biv Devoe - Poison (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.607400", + "reason": "already_on_server" + }, + "morgan wallen_superman": { + "artist": "Morgan Wallen", + "title": "Superman", + "video_title": "Morgan Wallen - Superman (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.620085", + "reason": "already_on_server" + }, + "the fray_look after you": { + "artist": "The Fray", + "title": "Look After You", + "video_title": "The Fray - Look After You (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.634792", + "reason": "already_on_server" + }, + "justin bieber_one less lonely girl": { + "artist": "Justin Bieber", + "title": "One Less Lonely Girl", + "video_title": "Justin Bieber - One Less Lonely Girl (Karaoke Version)", + "channel": "SingKingKaraoke", + "marked_at": "2025-07-25T08:33:25.639304", + 
"reason": "already_on_server" + }, + "the beatles_all my loving": { + "artist": "The Beatles", + "title": "All My Loving", + "video_title": "The Beatles - All My Loving (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.743418", + "reason": "already_on_server" + }, + "james taylor_sweet baby james": { + "artist": "James Taylor", + "title": "Sweet Baby James", + "video_title": "James Taylor - Sweet Baby James (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.746800", + "reason": "already_on_server" + }, + "phil collins_sussudio": { + "artist": "Phil Collins", + "title": "Sussudio", + "video_title": "Phil Collins - Sussudio (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.749990", + "reason": "already_on_server" + }, + "avril lavigne_things i'll never say": { + "artist": "Avril Lavigne", + "title": "Things I'll Never Say", + "video_title": "Avril Lavigne - Things I'll Never Say (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.766538", + "reason": "already_on_server" + }, + "def leppard_bringin' on the heartbreak": { + "artist": "Def Leppard", + "title": "Bringin' On The Heartbreak", + "video_title": "Def Leppard - Bringin' On The Heartbreak (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.793929", + "reason": "already_on_server" + }, + "no doubt_rock steady": { + "artist": "No Doubt", + "title": "Rock Steady", + "video_title": "No Doubt - Rock Steady (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.797153", + "reason": "already_on_server" + }, + "ultravox_vienna": { + "artist": "Ultravox", + "title": "Vienna", + "video_title": "Ultravox - Vienna (Karaoke)", + "channel": "KaraokeOnVEVO", + "marked_at": "2025-07-25T08:33:25.798966", + "reason": "already_on_server" + }, + "nickelback_far away": { + "artist": "Nickelback", + "title": "Far Away", + "video_title": "Nickelback - Far Away (Karaoke 
Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.835135", + "reason": "already_on_server" + }, + "lana del rey_diet mountain dew": { + "artist": "Lana Del Rey", + "title": "Diet Mountain Dew", + "video_title": "Lana Del Rey - Diet Mountain Dew (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.837998", + "reason": "already_on_server" + }, + "poison_every rose has its thorn": { + "artist": "Poison", + "title": "Every Rose Has Its Thorn", + "video_title": "Poison - Every Rose Has Its Thorn (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.841689", + "reason": "already_on_server" + }, + "adele_hometown glory": { + "artist": "Adele", + "title": "Hometown Glory", + "video_title": "Adele - Hometown Glory (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.850667", + "reason": "already_on_server" + }, + "lorde_green light": { + "artist": "Lorde", + "title": "Green Light", + "video_title": "Lorde - Green Light (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.856011", + "reason": "already_on_server" + }, + "the isley brothers_shout": { + "artist": "The Isley Brothers", + "title": "Shout", + "video_title": "The Isley Brothers - Shout (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.861753", + "reason": "already_on_server" + }, + "tate mcrae_sports car": { + "artist": "Tate McRae", + "title": "Sports Car", + "video_title": "Tate McRae - Sports Car (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.864819", + "reason": "already_on_server" + }, + "myles smith_stargazing": { + "artist": "Myles Smith", + "title": "Stargazing", + "video_title": "Myles Smith - Stargazing (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.876345", + "reason": "already_on_server" + }, + "belinda carlisle_heaven 
is a place on earth": { + "artist": "Belinda Carlisle", + "title": "Heaven Is A Place On Earth", + "video_title": "Belinda Carlisle - Heaven Is A Place On Earth (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.883470", + "reason": "already_on_server" + }, + "r.e.m._losing my religion": { + "artist": "R.E.M.", + "title": "Losing My Religion", + "video_title": "R.E.M. - Losing My Religion (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.888733", + "reason": "already_on_server" + }, + "bad bunny_dtmf": { + "artist": "Bad Bunny", + "title": "DtMF", + "video_title": "Bad Bunny - DtMF (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.898975", + "reason": "already_on_server" + }, + "lady gaga_judas": { + "artist": "Lady Gaga", + "title": "Judas", + "video_title": "Lady Gaga - Judas (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.900680", + "reason": "already_on_server" + }, + "lisa_money": { + "artist": "Lisa", + "title": "Money", + "video_title": "Lisa - Money (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.902196", + "reason": "already_on_server" + }, + "alex warren_ordinary": { + "artist": "Alex Warren", + "title": "Ordinary", + "video_title": "Alex Warren - Ordinary (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.906505", + "reason": "already_on_server" + }, + "nickelback_how you remind me": { + "artist": "Nickelback", + "title": "How You Remind Me", + "video_title": "Nickelback - How You Remind Me (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": "2025-07-25T08:33:25.908105", + "reason": "already_on_server" + }, + "green day_american idiot": { + "artist": "Green Day", + "title": "American Idiot", + "video_title": "Green Day - American Idiot (Karaoke Version)", + "channel": "StingrayKaraoke", + "marked_at": 
"2025-07-25T08:33:25.909641", + "reason": "already_on_server" + } +} \ No newline at end of file diff --git a/karaoke_downloader/cache_manager.py b/karaoke_downloader/cache_manager.py new file mode 100644 index 0000000..9ef2c17 --- /dev/null +++ b/karaoke_downloader/cache_manager.py @@ -0,0 +1,76 @@ +""" +Cache management utilities for download plans. +Handles caching, loading, and cleanup of download plan data. +""" + +import json +import hashlib +from pathlib import Path +from datetime import datetime, timedelta + +# Constants +DEFAULT_CACHE_EXPIRATION_DAYS = 1 +DEFAULT_CACHE_FILENAME_LENGTH_LIMIT = 200 # Increased from 60 +DEFAULT_CACHE_FILENAME_PREFIX_LENGTH = 100 # Increased from 40 + +def get_download_plan_cache_file(mode, **kwargs): + """Generate a unique cache filename based on mode and key parameters.""" + parts = [f"plan_{mode}"] + + # Handle parameters in a more readable way + for k, v in sorted(kwargs.items()): + if k == "channels_hash": + # Use a shorter version of the hash for readability + parts.append(f"hash{v[:8]}") + else: + parts.append(f"{k}{v}") + + base = "_".join(parts) + + # Hash for safety if string is still too long + if len(base) > DEFAULT_CACHE_FILENAME_LENGTH_LIMIT: + base = base[:DEFAULT_CACHE_FILENAME_PREFIX_LENGTH] + "_" + hashlib.md5(base.encode()).hexdigest()[:8] + + return Path(f"data/{base}.json") + +def load_cached_plan(cache_file, max_age_days=DEFAULT_CACHE_EXPIRATION_DAYS): + """Load a cached download plan if it exists and is not expired.""" + if not cache_file.exists(): + return None, None + + try: + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + cache_time = datetime.fromisoformat(cache_data.get('timestamp')) + if datetime.now() - cache_time < timedelta(days=max_age_days): + print(f"๐Ÿ—‚๏ธ Using cached download plan from {cache_time} ({cache_file.name}).") + return cache_data['download_plan'], cache_data['unmatched'] + except Exception as e: + print(f"โš ๏ธ Could not load download plan 
cache: {e}") + + return None, None + +def save_plan_cache(cache_file, download_plan, unmatched): + """Save a download plan to cache.""" + if download_plan: + cache_data = { + 'timestamp': datetime.now().isoformat(), + 'download_plan': download_plan, + 'unmatched': unmatched + } + with open(cache_file, 'w', encoding='utf-8') as f: + json.dump(cache_data, f, indent=2, ensure_ascii=False) + print(f"๐Ÿ—‚๏ธ Saved new download plan cache: {cache_file.name}") + else: + if cache_file.exists(): + cache_file.unlink() + print(f"๐Ÿ—‚๏ธ No matches found, not saving download plan cache.") + +def delete_plan_cache(cache_file): + """Delete a download plan cache file.""" + if cache_file.exists(): + try: + cache_file.unlink() + print(f"๐Ÿ—‘๏ธ Deleted download plan cache: {cache_file.name}") + except Exception as e: + print(f"โš ๏ธ Could not delete download plan cache: {e}") \ No newline at end of file diff --git a/karaoke_downloader/channel_manager.py b/karaoke_downloader/channel_manager.py new file mode 100644 index 0000000..be09454 --- /dev/null +++ b/karaoke_downloader/channel_manager.py @@ -0,0 +1,93 @@ +import os +from pathlib import Path +from karaoke_downloader.songlist_manager import ( + save_songlist_tracking, is_songlist_song_downloaded, normalize_title +) + +def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file, channel_name, reset_songlist=False, delete_files=False): + """ + Reset all tracking and optionally files for a channel. + If reset_songlist is False, songlist songs are preserved (tracking and files). + If reset_songlist is True, songlist songs for this channel are also reset/deleted. 
+ """ + print(f"\n๐Ÿ”„ Resetting channel: {channel_name} (reset_songlist={reset_songlist}, delete_files={delete_files})") + # Find channel_id from channel_name + channel_id = None + for pid, playlist in tracker.data.get('playlists', {}).items(): + if playlist['name'] == channel_name or pid == channel_name: + channel_id = pid + break + if not channel_id: + print(f"โŒ Channel '{channel_name}' not found in tracking.") + return + # Get all songs for this channel + songs_to_reset = [] + for song_id, song in tracker.data.get('songs', {}).items(): + if song['playlist_id'] == channel_id: + # Check if this is a songlist song + artist, title = song.get('artist', ''), song.get('title', song.get('name', '')) + key = f"{artist.lower()}_{normalize_title(title)}" + is_songlist = key in songlist_tracking + if is_songlist and not reset_songlist: + continue # skip songlist songs if not resetting them + songs_to_reset.append((song_id, song, is_songlist)) + # Reset tracking and optionally delete files + files_preserved = 0 + files_deleted = 0 + for song_id, song, is_songlist in songs_to_reset: + # Remove from main tracking + tracker.data['songs'][song_id]['status'] = 'NOT_DOWNLOADED' + tracker.data['songs'][song_id]['formats'] = {} + tracker.data['songs'][song_id]['last_error'] = '' + tracker.data['songs'][song_id]['download_attempts'] = 0 + tracker.data['songs'][song_id]['last_updated'] = None + # Remove from songlist tracking if needed + if is_songlist and reset_songlist: + artist, title = song.get('artist', ''), song.get('title', song.get('name', '')) + key = f"{artist.lower()}_{normalize_title(title)}" + if key in songlist_tracking: + del songlist_tracking[key] + # Delete file if requested + if delete_files: + file_path = song.get('file_path') + if file_path: + try: + p = Path(file_path) + if p.exists(): + p.unlink() + files_deleted += 1 + else: + files_preserved += 1 + except Exception as e: + print(f"โš ๏ธ Could not delete file {file_path}: {e}") + # Remove all 
songlist_tracking entries for this channel if reset_songlist is True + if reset_songlist: + keys_to_remove = [k for k, v in songlist_tracking.items() if v.get('channel') == channel_name] + for k in keys_to_remove: + del songlist_tracking[k] + # Save changes + tracker.force_save() + save_songlist_tracking(songlist_tracking, str(songlist_tracking_file)) + print(f"โœ… Reset {len(songs_to_reset)} songs for channel '{channel_name}'.") + if delete_files: + print(f" Files deleted: {files_deleted}, files preserved: {files_preserved}") + if not reset_songlist: + print(f" Songlist songs were preserved.") + +def download_from_file(self, file_path, force_refresh=False): + file = Path(file_path) + if not file.exists(): + print(f"โŒ File not found: {file_path}") + return False + with open(file, "r", encoding="utf-8") as f: + urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")] + if not urls: + print(f"โŒ No URLs found in {file_path}") + return False + all_success = True + for url in urls: + print(f"\nโžก๏ธ Processing: {url}") + success = self.download_channel_videos(url, force_refresh=force_refresh) + if not success: + all_success = False + return all_success \ No newline at end of file diff --git a/karaoke_downloader/cli.py b/karaoke_downloader/cli.py index eb6e410..d1d360b 100644 --- a/karaoke_downloader/cli.py +++ b/karaoke_downloader/cli.py @@ -4,6 +4,12 @@ from pathlib import Path from karaoke_downloader.downloader import KaraokeDownloader import os +# Constants +DEFAULT_FUZZY_THRESHOLD = 85 +DEFAULT_LATEST_PER_CHANNEL_LIMIT = 5 +DEFAULT_DISPLAY_LIMIT = 10 +DEFAULT_CACHE_DURATION_HOURS = 24 + def main(): parser = argparse.ArgumentParser( description="Karaoke Video Downloader - Download YouTube playlists and channel videos for karaoke", @@ -35,6 +41,7 @@ Examples: parser.add_argument('--reset-channel', metavar='CHANNEL_NAME', help='Reset all tracking and files for a channel') parser.add_argument('--reset-songlist', action='store_true', 
help='When used with --reset-channel, also reset songlist songs for this channel') parser.add_argument('--reset-songlist-all', action='store_true', help='Reset all songlist tracking and delete all songlist-downloaded files (global)') + parser.add_argument('--clear-server-duplicates', action='store_true', help='Clear server duplicates tracking (allows re-checking songs against server)') parser.add_argument('--version', '-v', action='version', version='Karaoke Playlist Downloader v1.0') parser.add_argument('--force-download-plan', action='store_true', help='Force refresh the download plan cache (re-scan all channels for matches)') parser.add_argument('--latest-per-channel', action='store_true', help='Download the latest N videos from each channel (use with --limit)') @@ -101,6 +108,13 @@ Examples: print('โœ… All songlist tracking and files have been reset.') sys.exit(0) + if args.clear_server_duplicates: + from karaoke_downloader.server_manager import save_server_duplicates_tracking + save_server_duplicates_tracking({}) + print('โœ… Server duplicates tracking has been cleared.') + print('โ„น๏ธ Songs will be re-checked against the server on next run.') + sys.exit(0) + if args.status: stats = downloader.tracker.get_statistics() print("๐ŸŽค Karaoke Downloader Status") @@ -169,7 +183,7 @@ Examples: limit = args.limit if args.limit else None force_refresh_download_plan = args.force_download_plan if hasattr(args, 'force_download_plan') else False fuzzy_match = args.fuzzy_match if hasattr(args, 'fuzzy_match') else False - fuzzy_threshold = args.fuzzy_threshold if hasattr(args, 'fuzzy_threshold') else 90 + fuzzy_threshold = args.fuzzy_threshold if hasattr(args, 'fuzzy_threshold') else DEFAULT_FUZZY_THRESHOLD success = downloader.download_songlist_across_channels(channel_urls, limit=limit, force_refresh_download_plan=force_refresh_download_plan, fuzzy_match=fuzzy_match, fuzzy_threshold=fuzzy_threshold) elif args.latest_per_channel: # Use provided file or default to 
data/channels.txt @@ -179,9 +193,11 @@ Examples: sys.exit(1) with open(channel_file, "r", encoding="utf-8") as f: channel_urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")] - limit = args.limit if args.limit else 5 + limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT force_refresh_download_plan = args.force_download_plan if hasattr(args, 'force_download_plan') else False - success = downloader.download_latest_per_channel(channel_urls, limit=limit, force_refresh_download_plan=force_refresh_download_plan) + fuzzy_match = args.fuzzy_match if hasattr(args, 'fuzzy_match') else False + fuzzy_threshold = args.fuzzy_threshold if hasattr(args, 'fuzzy_threshold') else DEFAULT_FUZZY_THRESHOLD + success = downloader.download_latest_per_channel(channel_urls, limit=limit, force_refresh_download_plan=force_refresh_download_plan, fuzzy_match=fuzzy_match, fuzzy_threshold=fuzzy_threshold) elif args.url: success = downloader.download_channel_videos(args.url, force_refresh=args.refresh) else: diff --git a/karaoke_downloader/config_manager.py b/karaoke_downloader/config_manager.py new file mode 100644 index 0000000..4920c92 --- /dev/null +++ b/karaoke_downloader/config_manager.py @@ -0,0 +1,77 @@ +""" +Configuration management utilities. +Handles loading and managing application configuration. 
+""" + +import json +from pathlib import Path + +DATA_DIR = Path("data") + +def load_config(): + """Load configuration from data/config.json or return defaults.""" + config_file = DATA_DIR / "config.json" + if config_file.exists(): + try: + with open(config_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f"Warning: Could not load config.json: {e}") + + return get_default_config() + +def get_default_config(): + """Get the default configuration.""" + return { + "download_settings": { + "format": "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best", + "preferred_resolution": "720p", + "audio_format": "mp3", + "audio_quality": "0", + "subtitle_language": "en", + "subtitle_format": "srt", + "write_metadata": False, + "write_thumbnail": False, + "write_description": False, + "write_annotations": False, + "write_comments": False, + "write_subtitles": False, + "embed_metadata": False, + "add_metadata": False, + "continue_downloads": True, + "no_overwrites": True, + "ignore_errors": True, + "no_warnings": False + }, + "folder_structure": { + "downloads_dir": "downloads", + "logs_dir": "logs", + "tracking_file": str(DATA_DIR / "karaoke_tracking.json") + }, + "logging": { + "level": "INFO", + "format": "%(asctime)s - %(levelname)s - %(message)s", + "include_console": True, + "include_file": True + }, + "yt_dlp_path": "downloader/yt-dlp.exe" + } + +def save_config(config): + """Save configuration to data/config.json.""" + config_file = DATA_DIR / "config.json" + config_file.parent.mkdir(exist_ok=True) + + try: + with open(config_file, 'w', encoding='utf-8') as f: + json.dump(config, f, indent=2, ensure_ascii=False) + return True + except Exception as e: + print(f"Error saving config: {e}") + return False + +def update_config(updates): + """Update configuration with new values.""" + config = load_config() + config.update(updates) + return save_config(config) \ No newline at end of file diff 
--git a/karaoke_downloader/download_planner.py b/karaoke_downloader/download_planner.py new file mode 100644 index 0000000..5e5d410 --- /dev/null +++ b/karaoke_downloader/download_planner.py @@ -0,0 +1,129 @@ +""" +Download plan building utilities. +Handles pre-scanning channels and building download plans. +""" + +from karaoke_downloader.youtube_utils import get_channel_info +from karaoke_downloader.fuzzy_matcher import ( + is_fuzzy_match, + is_exact_match, + create_song_key, + extract_artist_title, + get_similarity_function +) +from karaoke_downloader.cache_manager import ( + get_download_plan_cache_file, + load_cached_plan, + save_plan_cache, + delete_plan_cache +) + +# Constants +DEFAULT_FILENAME_LENGTH_LIMIT = 100 +DEFAULT_ARTIST_LENGTH_LIMIT = 30 +DEFAULT_TITLE_LENGTH_LIMIT = 60 +DEFAULT_FUZZY_THRESHOLD = 85 + +def build_download_plan(channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD): + """ + For each song in undownloaded, scan all channels for a match. + Use fuzzy matching if enabled. 
+ Return (download_plan, unmatched_songs): + - download_plan: list of dicts {artist, title, channel_name, channel_url, video_id, video_title, match_score} + - unmatched_songs: list of songs not found in any channel + """ + plan = [] + unmatched = [] + channel_match_counts = {} + + # Pre-process songlist for O(1) lookups + song_keys = set() + song_lookup = {} + for song in undownloaded: + key = create_song_key(song['artist'], song['title']) + song_keys.add(key) + song_lookup[key] = song + + for i, channel_url in enumerate(channel_urls, 1): + channel_name, channel_id = get_channel_info(channel_url) + print(f"\n๐Ÿšฆ Starting channel {i}/{len(channel_urls)}: {channel_name} ({channel_url})") + available_videos = tracker.get_channel_video_list( + channel_url, + yt_dlp_path=str(yt_dlp_path), + force_refresh=False + ) + print(f" ๐Ÿ“Š Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs") + matches_this_channel = 0 + + # Pre-process video titles for efficient matching + if fuzzy_match: + # For fuzzy matching, create normalized video keys + video_matches = [] + for video in available_videos: + v_artist, v_title = extract_artist_title(video['title']) + video_key = create_song_key(v_artist, v_title) + + # Find best match among remaining songs + best_match = None + best_score = 0 + for song_key in song_keys: + if song_key in song_lookup: # Only check unmatched songs + score = get_similarity_function()(song_key, video_key) + if score >= fuzzy_threshold and score > best_score: + best_score = score + best_match = song_key + + if best_match: + song = song_lookup[best_match] + video_matches.append({ + 'artist': song['artist'], + 'title': song['title'], + 'channel_name': channel_name, + 'channel_url': channel_url, + 'video_id': video['id'], + 'video_title': video['title'], + 'match_score': best_score + }) + # Remove matched song from future consideration + del song_lookup[best_match] + song_keys.remove(best_match) + matches_this_channel += 1 + 
else: + # For exact matching, use direct key comparison + for video in available_videos: + v_artist, v_title = extract_artist_title(video['title']) + video_key = create_song_key(v_artist, v_title) + + if video_key in song_keys: + song = song_lookup[video_key] + video_matches.append({ + 'artist': song['artist'], + 'title': song['title'], + 'channel_name': channel_name, + 'channel_url': channel_url, + 'video_id': video['id'], + 'video_title': video['title'], + 'match_score': 100 + }) + # Remove matched song from future consideration + del song_lookup[video_key] + song_keys.remove(video_key) + matches_this_channel += 1 + + # Add matches to plan + plan.extend(video_matches) + + # Print match count once per channel + channel_match_counts[channel_name] = matches_this_channel + print(f" โ†’ Found {matches_this_channel} songlist matches in this channel.") + + # Remaining unmatched songs + unmatched = list(song_lookup.values()) + + # Print summary table + print("\n๐Ÿ“Š Channel match summary:") + for channel, count in channel_match_counts.items(): + print(f" {channel}: {count} matches") + print(f" TOTAL: {sum(channel_match_counts.values())} matches across {len(channel_match_counts)} channels.") + + return plan, unmatched \ No newline at end of file diff --git a/karaoke_downloader/downloader.py b/karaoke_downloader/downloader.py index aecd75d..d50fa9f 100644 --- a/karaoke_downloader/downloader.py +++ b/karaoke_downloader/downloader.py @@ -9,12 +9,30 @@ from karaoke_downloader.tracking_manager import TrackingManager, SongStatus, For from karaoke_downloader.id3_utils import add_id3_tags, extract_artist_title from karaoke_downloader.songlist_manager import ( load_songlist, load_songlist_tracking, save_songlist_tracking, - is_songlist_song_downloaded, mark_songlist_song_downloaded, normalize_title, - load_server_songs, is_song_on_server + is_songlist_song_downloaded, mark_songlist_song_downloaded, normalize_title +) +from karaoke_downloader.server_manager import ( + 
load_server_songs, is_song_on_server, load_server_duplicates_tracking, + check_and_mark_server_duplicate, is_song_marked_as_server_duplicate ) from karaoke_downloader.youtube_utils import get_channel_info, get_playlist_info +from karaoke_downloader.fuzzy_matcher import get_similarity_function, is_fuzzy_match, is_exact_match, create_song_key, create_video_key import logging import hashlib +from karaoke_downloader.download_planner import build_download_plan +from karaoke_downloader.cache_manager import ( + get_download_plan_cache_file, load_cached_plan, save_plan_cache, delete_plan_cache +) +from karaoke_downloader.video_downloader import download_video_and_track, is_valid_mp4, execute_download_plan +from karaoke_downloader.channel_manager import reset_channel_downloads, download_from_file + +# Constants +DEFAULT_FUZZY_THRESHOLD = 85 +DEFAULT_CACHE_EXPIRATION_DAYS = 1 +DEFAULT_FILENAME_LENGTH_LIMIT = 100 +DEFAULT_ARTIST_LENGTH_LIMIT = 30 +DEFAULT_TITLE_LENGTH_LIMIT = 60 +DEFAULT_DISPLAY_LIMIT = 10 DATA_DIR = Path("data") @@ -75,95 +93,87 @@ class KaraokeDownloader: "yt_dlp_path": "downloader/yt-dlp.exe" } - def reset_channel_downloads(self, channel_name, reset_songlist=False, delete_files=False): + def _should_skip_song(self, artist, title, channel_name, video_id, video_title, server_songs=None, server_duplicates_tracking=None): """ - Reset all tracking and optionally files for a channel. - If reset_songlist is False, songlist songs are preserved (tracking and files). - If reset_songlist is True, songlist songs for this channel are also reset/deleted. + Centralized method to check if a song should be skipped. + Performs four checks in order: + 1. Already downloaded (tracking) + 2. File exists on filesystem + 3. Already on server + 4. 
Previously failed download (bad file) + + Returns: + tuple: (should_skip, reason, total_filtered) """ - print(f"\n๐Ÿ”„ Resetting channel: {channel_name} (reset_songlist={reset_songlist}, delete_files={delete_files})") - # Find channel_id from channel_name - channel_id = None - for pid, playlist in self.tracker.data.get('playlists', {}).items(): - if playlist['name'] == channel_name or pid == channel_name: - channel_id = pid - break - if not channel_id: - print(f"โŒ Channel '{channel_name}' not found in tracking.") - return - # Get all songs for this channel - songs_to_reset = [] - for song_id, song in self.tracker.data.get('songs', {}).items(): - if song['playlist_id'] == channel_id: - # Check if this is a songlist song - artist, title = song.get('artist', ''), song.get('title', song.get('name', '')) - key = f"{artist.lower()}_{normalize_title(title)}" - is_songlist = key in self.songlist_tracking - if is_songlist and not reset_songlist: - continue # skip songlist songs if not resetting them - songs_to_reset.append((song_id, song, is_songlist)) - # Reset tracking and optionally delete files - files_preserved = 0 - files_deleted = 0 - for song_id, song, is_songlist in songs_to_reset: - # Remove from main tracking - self.tracker.data['songs'][song_id]['status'] = 'NOT_DOWNLOADED' - self.tracker.data['songs'][song_id]['formats'] = {} - self.tracker.data['songs'][song_id]['last_error'] = '' - self.tracker.data['songs'][song_id]['download_attempts'] = 0 - self.tracker.data['songs'][song_id]['last_updated'] = None - # Remove from songlist tracking if needed - if is_songlist and reset_songlist: - artist, title = song.get('artist', ''), song.get('title', song.get('name', '')) - key = f"{artist.lower()}_{normalize_title(title)}" - if key in self.songlist_tracking: - del self.songlist_tracking[key] - # Delete file if requested - if delete_files: - file_path = song.get('file_path') - if file_path: - try: - p = Path(file_path) - if p.exists(): - p.unlink() - files_deleted += 
1 - else: - files_preserved += 1 - except Exception as e: - print(f"โš ๏ธ Could not delete file {file_path}: {e}") - # --- FIX: Remove all songlist_tracking entries for this channel if reset_songlist is True --- - if reset_songlist: - keys_to_remove = [k for k, v in self.songlist_tracking.items() if v.get('channel') == channel_name] - for k in keys_to_remove: - del self.songlist_tracking[k] - # Save changes - self.tracker.force_save() - save_songlist_tracking(self.songlist_tracking, str(self.songlist_tracking_file)) - print(f"โœ… Reset {len(songs_to_reset)} songs for channel '{channel_name}'.") - if delete_files: - print(f" Files deleted: {files_deleted}, files preserved: {files_preserved}") - if not reset_songlist: - print(f" Songlist songs were preserved.") + total_filtered = 0 + + # Check 1: Already downloaded by this system + if self.tracker.is_song_downloaded(artist, title, channel_name, video_id): + return True, "already downloaded", total_filtered + + # Check 2: File already exists on filesystem + # Generate the expected filename based on the download mode context + safe_title = title + invalid_chars = ['?', ':', '*', '"', '<', '>', '|', '/', '\\'] + for char in invalid_chars: + safe_title = safe_title.replace(char, "") + safe_title = safe_title.replace("...", "").replace("..", "").replace(".", "").strip() + + # Try different filename patterns that might exist + possible_filenames = [ + f"{artist} - {safe_title}.mp4", # Songlist mode + f"{channel_name} - {safe_title}.mp4", # Latest-per-channel mode + f"{artist} - {safe_title} (Karaoke Version).mp4" # Channel videos mode + ] + + for filename in possible_filenames: + if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT: + # Apply length limits if needed + safe_artist = artist.replace("'", "").replace('"', "").strip() + filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4" + + output_path = self.downloads_dir / channel_name / filename + if output_path.exists() 
and output_path.stat().st_size > 0: + return True, "file exists", total_filtered + + # Check 3: Already on server (if server data provided) + if server_songs is not None and server_duplicates_tracking is not None: + from karaoke_downloader.server_manager import check_and_mark_server_duplicate + if check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, artist, title, video_title, channel_name): + total_filtered += 1 + return True, "on server", total_filtered + + # Check 4: Previously failed download (bad file) + if self.tracker.is_song_failed(artist, title, channel_name, video_id): + return True, "previously failed", total_filtered + + return False, None, total_filtered + + def _mark_song_failed(self, artist, title, video_id, channel_name, error_message): + """ + Centralized method to mark a song as failed in tracking. + """ + self.tracker.mark_song_failed(artist, title, video_id, channel_name, error_message) + print(f"๐Ÿท๏ธ Marked song as failed: {artist} - {title}") + + def _handle_download_failure(self, artist, title, video_id, channel_name, error_type, error_details=""): + """ + Centralized method to handle download failures. 
+ + Args: + artist: Song artist + title: Song title + video_id: YouTube video ID + channel_name: Channel name + error_type: Type of error (e.g., "yt-dlp failed", "file verification failed") + error_details: Additional error details + """ + error_msg = f"{error_type}" + if error_details: + error_msg += f": {error_details}" + self._mark_song_failed(artist, title, video_id, channel_name, error_msg) - def download_from_file(self, file_path, force_refresh=False): - file = Path(file_path) - if not file.exists(): - print(f"โŒ File not found: {file_path}") - return False - with open(file, "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")] - if not urls: - print(f"โŒ No URLs found in {file_path}") - return False - all_success = True - for url in urls: - print(f"\nโžก๏ธ Processing: {url}") - success = self.download_channel_videos(url, force_refresh=force_refresh) - if not success: - all_success = False - return all_success - - def download_channel_videos(self, url, force_refresh=False, fuzzy_match=False, fuzzy_threshold=90): + def download_channel_videos(self, url, force_refresh=False, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD): """Download videos from a channel or playlist URL, respecting songlist-only and limit flags. Supports fuzzy matching.""" channel_name, channel_id = get_channel_info(url) print(f"\n๐ŸŽฌ Downloading from channel: {channel_name} ({url})") @@ -171,6 +181,11 @@ class KaraokeDownloader: if not songlist: print("โš ๏ธ No songlist loaded. 
Skipping.") return False + + # Load server songs and duplicates tracking for availability checking + server_songs = load_server_songs() + server_duplicates_tracking = load_server_duplicates_tracking() + limit = self.config.get('limit', 1) cmd = [ str(self.yt_dlp_path), @@ -191,21 +206,14 @@ class KaraokeDownloader: title, video_id = parts[0].strip(), parts[1].strip() available_videos.append({'title': title, 'id': video_id}) # Normalize songlist for matching - try: - from rapidfuzz import fuzz - def similarity(a, b): - return fuzz.ratio(a, b) - except ImportError: - import difflib - def similarity(a, b): - return int(difflib.SequenceMatcher(None, a, b).ratio() * 100) normalized_songlist = { - f"{s['artist'].lower()}_{normalize_title(s['title'])}": s for s in songlist + create_song_key(s['artist'], s['title']): s for s in songlist } matches = [] + similarity = get_similarity_function() for video in available_videos: artist, title = extract_artist_title(video['title']) - key = f"{artist.lower()}_{normalize_title(title)}" + key = create_song_key(artist, title) if fuzzy_match: # Fuzzy match against all songlist keys best_score = 0 @@ -216,15 +224,26 @@ class KaraokeDownloader: best_score = score best_song = song if best_score >= fuzzy_threshold and best_song: + # Check if already downloaded or on server if not is_songlist_song_downloaded(self.songlist_tracking, best_song['artist'], best_song['title']): - matches.append((video, best_song)) - print(f" โ†’ Fuzzy match: {artist} - {title} <-> {best_song['artist']} - {best_song['title']} (score: {best_score})") + # Check if already marked as server duplicate + if not is_song_marked_as_server_duplicate(server_duplicates_tracking, best_song['artist'], best_song['title']): + # Check if already on server and mark for future skipping + if not check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, best_song['artist'], best_song['title'], video['title'], channel_name): + matches.append((video, best_song)) + 
print(f" โ†’ Fuzzy match: {artist} - {title} <-> {best_song['artist']} - {best_song['title']} (score: {best_score})") if len(matches) >= limit: break else: if key in normalized_songlist: - if not is_songlist_song_downloaded(self.songlist_tracking, artist, title): - matches.append((video, normalized_songlist[key])) + song = normalized_songlist[key] + # Check if already downloaded or on server + if not is_songlist_song_downloaded(self.songlist_tracking, song['artist'], song['title']): + # Check if already marked as server duplicate + if not is_song_marked_as_server_duplicate(server_duplicates_tracking, song['artist'], song['title']): + # Check if already on server and mark for future skipping + if not check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, song['artist'], song['title'], video['title'], channel_name): + matches.append((video, song)) if len(matches) >= limit: break if not matches: @@ -247,12 +266,18 @@ class KaraokeDownloader: subprocess.run(cmd, check=True) except subprocess.CalledProcessError as e: print(f"โŒ yt-dlp failed: {e}") + # Mark song as failed in tracking immediately + self._handle_download_failure(artist, title, video['id'], channel_name, "yt-dlp failed", str(e)) continue if not output_path.exists() or output_path.stat().st_size == 0: print(f"โŒ Download failed or file is empty: {output_path}") + # Mark song as failed in tracking immediately + self._handle_download_failure(artist, title, video['id'], channel_name, "Download failed", "file does not exist or is empty") continue - if not self._is_valid_mp4(output_path): + if not is_valid_mp4(output_path): print(f"โŒ File is not a valid MP4: {output_path}") + # Mark song as failed in tracking immediately + self._handle_download_failure(artist, title, video['id'], channel_name, "Download failed", "file is not a valid MP4") continue add_id3_tags(output_path, f"{artist} - {title} (Karaoke Version)", channel_name) mark_songlist_song_downloaded(self.songlist_tracking, artist, 
title, channel_name, output_path) @@ -260,107 +285,7 @@ class KaraokeDownloader: print(f"๐ŸŽ‰ All post-processing complete for: {output_path}") return True - def build_download_plan(self, channel_urls, undownloaded, fuzzy_match=False, fuzzy_threshold=90): - """ - For each song in undownloaded, scan all channels for a match. - Use fuzzy matching if enabled. - Return (download_plan, unmatched_songs): - - download_plan: list of dicts {artist, title, channel_name, channel_url, video_id, video_title} - - unmatched_songs: list of songs not found in any channel - """ - try: - from rapidfuzz import fuzz - def similarity(a, b): - return fuzz.ratio(a, b) - except ImportError: - import difflib - def similarity(a, b): - return int(difflib.SequenceMatcher(None, a, b).ratio() * 100) - plan = [] - unmatched = [] - channel_match_counts = {} - for channel_url in channel_urls: - channel_name, channel_id = get_channel_info(channel_url) - print(f"\n๐Ÿšฆ Starting channel: {channel_name} ({channel_url})") - available_videos = self.tracker.get_channel_video_list( - channel_url, - yt_dlp_path=str(self.yt_dlp_path), - force_refresh=False - ) - matches_this_channel = 0 - channel_fuzzy_matches = [] # For optional top-N reporting - for song in undownloaded: - artist, title = song['artist'], song['title'] - found = False - song_key = f"{artist.lower()}_{normalize_title(title)}" - for video in available_videos: - v_artist, v_title = extract_artist_title(video['title']) - video_key = f"{v_artist.lower()}_{normalize_title(v_title)}" - if fuzzy_match: - score = similarity(song_key, video_key) - if score >= fuzzy_threshold: - if not any(p['artist'] == artist and p['title'] == title for p in plan): - plan.append({ - 'artist': artist, - 'title': title, - 'channel_name': channel_name, - 'channel_url': channel_url, - 'video_id': video['id'], - 'video_title': video['title'], - 'match_score': score - }) - # print(f" โ†’ Match: \"{artist} - {title}\" <-> \"{video['title']}\" (score: {score})") - 
matches_this_channel += 1 - found = True - break - else: - if (normalize_title(v_artist) == normalize_title(artist) and normalize_title(v_title) == normalize_title(title)) or \ - (normalize_title(video['title']) == normalize_title(f"{artist} - {title}")): - if not any(p['artist'] == artist and p['title'] == title for p in plan): - plan.append({ - 'artist': artist, - 'title': title, - 'channel_name': channel_name, - 'channel_url': channel_url, - 'video_id': video['id'], - 'video_title': video['title'], - 'match_score': 100 - }) - # print(f" โ†’ Match: \"{artist} - {title}\" <-> \"{video['title']}\" (exact)") - matches_this_channel += 1 - found = True - break - # Don't break here; keep looking for all matches in this channel - channel_match_counts[channel_name] = matches_this_channel - print(f" โ†’ Found {matches_this_channel} songlist matches in this channel.") - # Optionally, print top 3 fuzzy matches for review - # if fuzzy_match and channel_fuzzy_matches: - # top_matches = sorted(channel_fuzzy_matches, key=lambda x: -x[3])[:3] - # for a, t, vt, s in top_matches: - # print(f" Top match: {a} - {t} <-> {vt} (score: {s})") - # Now find unmatched songs - for song in undownloaded: - if not any(p['artist'] == song['artist'] and p['title'] == song['title'] for p in plan): - unmatched.append(song) - # Print summary table - print("\n๐Ÿ“Š Channel match summary:") - for channel, count in channel_match_counts.items(): - print(f" {channel}: {count} matches") - print(f" TOTAL: {sum(channel_match_counts.values())} matches across {len(channel_match_counts)} channels.") - return plan, unmatched - - def get_download_plan_cache_file(self, mode, **kwargs): - """Generate a unique cache filename based on mode and key parameters.""" - parts = [f"plan_{mode}"] - for k, v in sorted(kwargs.items()): - parts.append(f"{k}{v}") - base = "_".join(parts) - # Hash for safety if string is long - if len(base) > 60: - base = base[:40] + "_" + hashlib.md5(base.encode()).hexdigest() - return 
Path(f"data/{base}.json") - - def download_songlist_across_channels(self, channel_urls, limit=None, force_refresh_download_plan=False, fuzzy_match=False, fuzzy_threshold=90): + def download_songlist_across_channels(self, channel_urls, limit=None, force_refresh_download_plan=False, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD): """ For each song in the songlist, try each channel in order and download from the first channel where it is found. Download up to 'limit' songs, skipping any that cannot be found, until the limit is reached or all possible matches are exhausted. @@ -371,35 +296,52 @@ class KaraokeDownloader: return False # Filter for songs not yet downloaded undownloaded = [s for s in songlist if not is_songlist_song_downloaded(self.songlist_tracking, s['artist'], s['title'])] - print(f"๐ŸŽฏ {len(songlist)} total unique songs in songlist.") - print(f"๐ŸŽฏ {len(undownloaded)} unique songlist songs to download.") - # Further filter out songs already on server - not_on_server = [s for s in undownloaded if not is_song_on_server(self.server_songs, s['artist'], s['title'])] - server_available = len(undownloaded) - len(not_on_server) + print(f"\n๐ŸŽฏ {len(songlist)} total unique songs in songlist.") + print(f"\n๐ŸŽฏ {len(undownloaded)} unique songlist songs to download.") + + # Load server songs and duplicates tracking for availability checking + server_songs = load_server_songs() + server_duplicates_tracking = load_server_duplicates_tracking() + + # Further filter out songs already on server or marked as duplicates + not_on_server = [] + server_available = 0 + marked_duplicates = 0 + + for song in undownloaded: + artist, title = song['artist'], song['title'] + + # Check if already marked as server duplicate + if is_song_marked_as_server_duplicate(server_duplicates_tracking, artist, title): + marked_duplicates += 1 + continue + + # Check if already on server and mark for future skipping + if check_and_mark_server_duplicate(server_songs, 
server_duplicates_tracking, artist, title, f"{artist} - {title}", "songlist"): + server_available += 1 + continue + + not_on_server.append(song) + if server_available > 0: - print(f"๐ŸŽต {server_available} songs already available on server, skipping.") + print(f"\n๐ŸŽต {server_available} songs already available on server, skipping.") + if marked_duplicates > 0: + print(f"\n๐Ÿท๏ธ {marked_duplicates} songs previously marked as server duplicates, skipping.") + undownloaded = not_on_server - print(f"๐ŸŽฏ {len(undownloaded)} songs need to be downloaded.") + print(f"\n๐ŸŽฏ {len(undownloaded)} songs need to be downloaded.") if not undownloaded: print("๐ŸŽต All songlist songs already downloaded.") return True # --- FAST MODE: Early exit and deduplication if limit is set --- if limit is not None: print("\nโšก Fast mode enabled: will stop as soon as limit is reached with successful downloads.") - try: - from rapidfuzz import fuzz - def similarity(a, b): - return fuzz.ratio(a, b) - except ImportError: - import difflib - def similarity(a, b): - return int(difflib.SequenceMatcher(None, a, b).ratio() * 100) + similarity = get_similarity_function() downloaded_count = 0 unique_keys = set() total_attempted = 0 for channel_url in channel_urls: channel_name, channel_id = get_channel_info(channel_url) - print(f"\n๐Ÿšฆ Starting channel: {channel_name} ({channel_url})") available_videos = self.tracker.get_channel_video_list( channel_url, yt_dlp_path=str(self.yt_dlp_path), @@ -407,22 +349,28 @@ class KaraokeDownloader: ) for song in undownloaded: artist, title = song['artist'], song['title'] - key = f"{artist.lower()}_{normalize_title(title)}" + key = create_song_key(artist, title) if key in unique_keys: continue # Already downloaded or queued + + # Check if should skip this song during planning phase + should_skip, reason, _ = self._should_skip_song( + artist, title, channel_name, None, f"{artist} - {title}", + server_songs, server_duplicates_tracking + ) + if should_skip: + continue 
+ found = False for video in available_videos: v_artist, v_title = extract_artist_title(video['title']) - video_key = f"{v_artist.lower()}_{normalize_title(v_title)}" + video_key = create_song_key(v_artist, v_title) if fuzzy_match: score = similarity(key, video_key) if score >= fuzzy_threshold: - print(f" โ†’ Match: \"{artist} - {title}\" <-> \"{video['title']}\" (score: {score})") found = True else: - if (normalize_title(v_artist) == normalize_title(artist) and normalize_title(v_title) == normalize_title(title)) or \ - (normalize_title(video['title']) == normalize_title(f"{artist} - {title}")): - print(f" โ†’ Match: \"{artist} - {title}\" <-> \"{video['title']}\" (exact)") + if is_exact_match(artist, title, video['title']): found = True if found: print(f"\nโฌ‡๏ธ Downloading {downloaded_count+1} of {limit}:") @@ -441,8 +389,18 @@ class KaraokeDownloader: safe_artist = safe_artist.strip() filename = f"{safe_artist} - {safe_title}.mp4" # Call the actual download function (simulate the same as in the plan loop) - success = self._download_video_and_track( - channel_name, channel_url, video['id'], video['title'], artist, title, filename + success = download_video_and_track( + self.yt_dlp_path, + self.config, + self.downloads_dir, + self.songlist_tracking, + channel_name, + channel_url, + video['id'], + video['title'], + artist, + title, + filename ) total_attempted += 1 if success: @@ -459,195 +417,89 @@ class KaraokeDownloader: if downloaded_count < limit: print(f"โš ๏ธ Only {downloaded_count} songs were downloaded. 
Some may not have been found or downloads failed.") return True + # --- ORIGINAL FULL PLAN MODE (no limit) --- - # Removed per-song printout for cleaner output - # print("๐Ÿ” Songs to search for:") - # for song in undownloaded: - # print(f" - {song['artist']} - {song['title']}") # --- Download plan cache logic --- plan_mode = "songlist" - plan_kwargs = {"limit": limit or "all", "channels": len(channel_urls)} - cache_file = self.get_download_plan_cache_file(plan_mode, **plan_kwargs) + # Include all parameters that affect the plan generation + plan_kwargs = { + "limit": limit or "all", + "channels": len(channel_urls), + "fuzzy": fuzzy_match, + "threshold": fuzzy_threshold + } + # Add channel URLs hash to ensure same channels = same cache + channels_hash = hashlib.md5("|".join(sorted(channel_urls)).encode()).hexdigest()[:8] + plan_kwargs["channels_hash"] = channels_hash + cache_file = get_download_plan_cache_file(plan_mode, **plan_kwargs) use_cache = False - if not force_refresh_download_plan and cache_file.exists(): - try: - with open(cache_file, 'r', encoding='utf-8') as f: - cache_data = json.load(f) - cache_time = datetime.fromisoformat(cache_data.get('timestamp')) - if datetime.now() - cache_time < timedelta(days=1): - print(f"๐Ÿ—‚๏ธ Using cached download plan from {cache_time} ({cache_file.name}).") - download_plan = cache_data['download_plan'] - unmatched = cache_data['unmatched'] - use_cache = True - except Exception as e: - print(f"โš ๏ธ Could not load download plan cache: {e}") + download_plan, unmatched = load_cached_plan(cache_file) + if not force_refresh_download_plan and download_plan is not None: + use_cache = True if not use_cache: - print("\n๐Ÿ”Ž Pre-scanning channels for matches...") - download_plan, unmatched = self.build_download_plan(channel_urls, undownloaded, fuzzy_match=fuzzy_match, fuzzy_threshold=fuzzy_threshold) - if download_plan: - cache_data = { - 'timestamp': datetime.now().isoformat(), - 'download_plan': download_plan, - 'unmatched': 
unmatched - } - with open(cache_file, 'w', encoding='utf-8') as f: - json.dump(cache_data, f, indent=2, ensure_ascii=False) - print(f"๐Ÿ—‚๏ธ Saved new download plan cache: {cache_file.name}") - else: - if cache_file.exists(): - cache_file.unlink() - print(f"๐Ÿ—‚๏ธ No matches found, not saving download plan cache.") + print("\n๐Ÿ” Pre-scanning channels for matches...") + download_plan, unmatched = build_download_plan( + channel_urls, + undownloaded, + self.tracker, + self.yt_dlp_path, + fuzzy_match=fuzzy_match, + fuzzy_threshold=fuzzy_threshold + ) + save_plan_cache(cache_file, download_plan, unmatched) print(f"\n๐Ÿ“Š Download plan ready: {len(download_plan)} songs will be downloaded.") print(f"โŒ {len(unmatched)} songs could not be found in any channel.") if unmatched: print("Unmatched songs:") - for song in unmatched[:10]: + for song in unmatched[:DEFAULT_DISPLAY_LIMIT]: print(f" - {song['artist']} - {song['title']}") - if len(unmatched) > 10: - print(f" ...and {len(unmatched)-10} more.") + if len(unmatched) > DEFAULT_DISPLAY_LIMIT: + print(f" ...and {len(unmatched)-DEFAULT_DISPLAY_LIMIT} more.") # --- Download phase --- - downloaded_count = 0 - total_to_download = limit if limit is not None else len(download_plan) - for idx, item in enumerate(download_plan): - if limit is not None and downloaded_count >= limit: - break - artist = item['artist'] - title = item['title'] - channel_name = item['channel_name'] - channel_url = item['channel_url'] - video_id = item['video_id'] - video_title = item['video_title'] - print(f"\nโฌ‡๏ธ Downloading {idx+1} of {total_to_download}:") - print(f" ๐Ÿ“‹ Songlist: {artist} - {title}") - print(f" ๐ŸŽฌ Video: {video_title} ({channel_name})") - if 'match_score' in item: - print(f" ๐ŸŽฏ Match Score: {item['match_score']:.1f}%") - # --- Existing download logic here, using channel_name, video_id, etc. 
--- - # (Copy the download logic from the previous loop, using these variables) - # Create a shorter, safer filename - do this ONCE and use consistently - safe_title = title.replace("(From ", "").replace(")", "").replace(" - ", " ").replace(":", "").replace("'", "").replace('"', "") - safe_artist = artist.replace("'", "").replace('"', "") - # Remove all Windows-invalid characters - invalid_chars = ['?', ':', '*', '"', '<', '>', '|', '/', '\\'] - for char in invalid_chars: - safe_title = safe_title.replace(char, "") - safe_artist = safe_artist.replace(char, "") - # Also remove any other potentially problematic characters - safe_title = safe_title.replace("...", "").replace("..", "").replace(".", "").strip() - safe_artist = safe_artist.strip() - - filename = f"{safe_artist} - {safe_title}.mp4" - # Limit filename length to avoid Windows path issues - if len(filename) > 100: - filename = f"{safe_artist[:30]} - {safe_title[:60]}.mp4" - - output_path = self.downloads_dir / channel_name / filename - output_path.parent.mkdir(parents=True, exist_ok=True) - print(f"โฌ‡๏ธ Downloading: {artist} - {title} -> {output_path}") - video_url = f"https://www.youtube.com/watch?v={video_id}" - dlp_cmd = [ - str(self.yt_dlp_path), - "--no-check-certificates", - "--ignore-errors", - "--no-warnings", - "-o", str(output_path), - "-f", self.config["download_settings"]["format"], - video_url - ] - print(f"๐Ÿ”ง Running command: {' '.join(dlp_cmd)}") - print(f"๐Ÿ“บ Resolution settings: {self.config.get('download_settings', {}).get('preferred_resolution', 'Unknown')}") - print(f"๐ŸŽฌ Format string: {self.config.get('download_settings', {}).get('format', 'Unknown')}") - - # Debug: Show available formats (optional) - if self.config.get('debug_show_formats', False): - print(f"๐Ÿ” Checking available formats for: {video_url}") - format_cmd = [ - str(self.yt_dlp_path), - "--list-formats", - video_url - ] - try: - format_result = subprocess.run(format_cmd, capture_output=True, text=True, timeout=30) 
- print(f"๐Ÿ“‹ Available formats:\n{format_result.stdout}") - except Exception as e: - print(f"โš ๏ธ Could not check formats: {e}") - try: - result = subprocess.run(dlp_cmd, capture_output=True, text=True, check=True) - print(f"โœ… yt-dlp completed successfully") - print(f"๐Ÿ“„ yt-dlp stdout: {result.stdout}") - except subprocess.CalledProcessError as e: - print(f"โŒ yt-dlp failed with exit code {e.returncode}") - print(f"โŒ yt-dlp stderr: {e.stderr}") - continue - if not output_path.exists(): - print(f"โŒ Download failed: file does not exist: {output_path}") - # Check if yt-dlp saved it somewhere else - possible_files = list(output_path.parent.glob("*.mp4")) - if possible_files: - print(f"๐Ÿ” Found these files in the directory: {[f.name for f in possible_files]}") - # Look for a file that matches our pattern (artist - title) - artist_part = safe_artist.lower() - title_part = safe_title.lower() - for file in possible_files: - file_lower = file.stem.lower() - if artist_part in file_lower and any(word in file_lower for word in title_part.split()): - print(f"๐ŸŽฏ Found matching file: {file.name}") - output_path = file - break - else: - print(f"โŒ No matching file found for: {artist} - {title}") - continue - else: - continue - if output_path.stat().st_size == 0: - print(f"โŒ Download failed: file is empty (0 bytes): {output_path}") - continue - # TEMP: Skipping MP4 validation for debugging - # if not self._is_valid_mp4(output_path): - # print(f"โŒ File is not a valid MP4: {output_path}") - # continue - add_id3_tags(output_path, f"{artist} - {title} (Karaoke Version)", channel_name) - mark_songlist_song_downloaded(self.songlist_tracking, artist, title, channel_name, output_path) - print(f"โœ… Downloaded and tracked: {artist} - {title}") - print(f"๐ŸŽ‰ All post-processing complete for: {output_path}") - downloaded_count += 1 - # After each download, if this was the last song, delete the cache - if idx + 1 == total_to_download: - if cache_file.exists(): - try: - 
cache_file.unlink() - print(f"๐Ÿ—‘๏ธ Deleted download plan cache after last song downloaded: {cache_file.name}") - except Exception as e: - print(f"โš ๏ธ Could not delete download plan cache: {e}") - print(f"๐ŸŽ‰ Downloaded {downloaded_count} songlist songs.") - print(f"๐Ÿ“Š Summary: Processed {len(channel_urls)} channels, found {downloaded_count} songs, {len(unmatched)} songs not found.") - # Delete the download plan cache if all planned downloads are done - if cache_file.exists(): - try: - cache_file.unlink() - print(f"๐Ÿ—‘๏ธ Deleted download plan cache after completion: {cache_file.name}") - except Exception as e: - print(f"โš ๏ธ Could not delete download plan cache: {e}") - return True + downloaded_count, success = execute_download_plan( + download_plan=download_plan, + unmatched=unmatched, + cache_file=cache_file, + config=self.config, + yt_dlp_path=self.yt_dlp_path, + downloads_dir=self.downloads_dir, + songlist_tracking=self.songlist_tracking, + limit=limit + ) + return success - def download_latest_per_channel(self, channel_urls, limit=5, force_refresh_download_plan=False): + def download_latest_per_channel(self, channel_urls, limit=5, force_refresh_download_plan=False, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD): """ Download the latest N videos from each channel in channel_urls. - Pre-scan all channels for their latest N videos. + - Check against local songs file to avoid duplicates. - Build a per-channel download plan and cache it. - Resume robustly if interrupted (removes each channel from the plan as it completes). - Deletes the plan cache when all channels are done. 
""" + # Load server songs for availability checking + server_songs = load_server_songs() + server_duplicates_tracking = load_server_duplicates_tracking() + plan_mode = "latest_per_channel" - plan_kwargs = {"limit": limit, "channels": len(channel_urls)} - cache_file = self.get_download_plan_cache_file(plan_mode, **plan_kwargs) + # Include all parameters that affect the plan generation + plan_kwargs = { + "limit": limit, + "channels": len(channel_urls), + "fuzzy": fuzzy_match, + "threshold": fuzzy_threshold + } + # Add channel URLs hash to ensure same channels = same cache + channels_hash = hashlib.md5("|".join(sorted(channel_urls)).encode()).hexdigest()[:8] + plan_kwargs["channels_hash"] = channels_hash + cache_file = get_download_plan_cache_file(plan_mode, **plan_kwargs) use_cache = False if not force_refresh_download_plan and cache_file.exists(): try: with open(cache_file, 'r', encoding='utf-8') as f: plan_data = json.load(f) cache_time = datetime.fromisoformat(plan_data.get('timestamp')) - if datetime.now() - cache_time < timedelta(days=1): + if datetime.now() - cache_time < timedelta(days=DEFAULT_CACHE_EXPIRATION_DAYS): print(f"๐Ÿ—‚๏ธ Using cached latest-per-channel plan from {cache_time} ({cache_file.name}).") channel_plans = plan_data['channel_plans'] use_cache = True @@ -656,6 +508,10 @@ class KaraokeDownloader: if not use_cache: print("\n๐Ÿ”Ž Pre-scanning all channels for latest videos...") channel_plans = [] + total_found = 0 + total_filtered = 0 + total_marked = 0 + for channel_url in channel_urls: channel_name, channel_id = get_channel_info(channel_url) print(f"\n๐Ÿšฆ Starting channel: {channel_name} ({channel_url})") @@ -664,14 +520,58 @@ class KaraokeDownloader: yt_dlp_path=str(self.yt_dlp_path), force_refresh=False ) - # Sort by upload order (assume yt-dlp returns in order, or sort by id if available) - latest_videos = available_videos[:limit] - print(f" โ†’ Found {len(latest_videos)} latest videos for this channel.") + print(f" โ†’ Found 
{len(available_videos)} total videos for this channel.") + + # Pre-filter: Create a set of known duplicate keys for O(1) lookup + known_duplicate_keys = set() + for song_key in server_duplicates_tracking.keys(): + known_duplicate_keys.add(song_key) + + # Pre-filter videos to exclude known duplicates before processing + pre_filtered_videos = [] + for video in available_videos: + artist, title = extract_artist_title(video['title']) + song_key = create_song_key(artist, title) + if song_key not in known_duplicate_keys: + pre_filtered_videos.append(video) + + print(f" โ†’ After pre-filtering: {len(pre_filtered_videos)} videos not previously marked as duplicates.") + + # Process videos until we reach the limit for this channel + filtered_videos = [] + videos_checked = 0 + + for video in pre_filtered_videos: + if len(filtered_videos) >= limit: + break # We have enough videos for this channel + + videos_checked += 1 + artist, title = extract_artist_title(video['title']) + + # Check if should skip this song during planning phase + should_skip, reason, filtered_count = self._should_skip_song( + artist, title, channel_name, video['id'], video['title'], + server_songs, server_duplicates_tracking + ) + if should_skip: + total_filtered += 1 + if reason == "on server": + total_marked += filtered_count + continue + + filtered_videos.append(video) + + print(f" โ†’ After processing: {len(filtered_videos)} videos to download (checked {videos_checked} videos, filtered out {videos_checked - len(filtered_videos)} already on server).") + total_found += len(filtered_videos) + channel_plans.append({ 'channel_name': channel_name, 'channel_url': channel_url, - 'videos': latest_videos + 'videos': filtered_videos }) + + print(f"\n๐Ÿ“Š Summary: {total_found} videos to download across {len(channel_plans)} channels (filtered out {total_filtered} already on server, marked {total_marked} new duplicates for future skipping).") + plan_data = { 'timestamp': datetime.now().isoformat(), 'channel_plans': 
channel_plans @@ -696,8 +596,9 @@ class KaraokeDownloader: safe_title = safe_title.replace(char, "") safe_title = safe_title.replace("...", "").replace("..", "").replace(".", "").strip() filename = f"{channel_name} - {safe_title}.mp4" - if len(filename) > 100: - filename = f"{channel_name[:30]} - {safe_title[:60]}.mp4" + # Limit filename length to avoid Windows path issues + if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT: + filename = f"{channel_name[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4" output_path = self.downloads_dir / channel_name / filename output_path.parent.mkdir(parents=True, exist_ok=True) print(f" ({v_idx+1}/{len(videos)}) Downloading: {title} -> {output_path}") @@ -717,11 +618,27 @@ class KaraokeDownloader: except subprocess.CalledProcessError as e: print(f" โŒ yt-dlp failed with exit code {e.returncode}") print(f" โŒ yt-dlp stderr: {e.stderr}") + # Mark song as failed in tracking immediately + artist, title_clean = extract_artist_title(title) + self._handle_download_failure(artist, title_clean, video_id, channel_name, "yt-dlp failed", f"exit code {e.returncode}: {e.stderr}") continue if not output_path.exists() or output_path.stat().st_size == 0: print(f" โŒ Download failed or file is empty: {output_path}") + # Mark song as failed in tracking immediately + artist, title_clean = extract_artist_title(title) + self._handle_download_failure(artist, title_clean, video_id, channel_name, "Download failed", "file does not exist or is empty") continue + + # Extract artist and title for tracking + artist, title_clean = extract_artist_title(title) + + # Add ID3 tags add_id3_tags(output_path, title, channel_name) + + # Mark as downloaded in tracking system + file_size = output_path.stat().st_size if output_path.exists() else None + self.tracker.mark_song_downloaded(artist, title_clean, video_id, channel_name, output_path, file_size) + print(f" โœ… Downloaded and tagged: {title}") # After channel is done, remove it from 
the plan and update cache channel_plans[idx]['videos'] = [] @@ -738,58 +655,6 @@ class KaraokeDownloader: print(f"๐ŸŽ‰ All latest videos downloaded for all channels!") return True - def _is_valid_mp4(self, file_path): - """Check if the file is a valid MP4 using ffprobe, if available.""" - try: - cmd = ["ffprobe", "-v", "error", "-select_streams", "v:0", "-show_entries", "stream=codec_name", "-of", "default=noprint_wrappers=1:nokey=1", str(file_path)] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - return "mp4" in result.stdout or "h264" in result.stdout or "hevc" in result.stdout - except Exception: - # If ffprobe is not available, skip the check - return True - - def _download_video_and_track(self, channel_name, channel_url, video_id, video_title, artist, title, filename): - """ - Helper to download a single video and track its status. - Returns True if successful, False otherwise. - """ - output_path = self.downloads_dir / channel_name / filename - output_path.parent.mkdir(parents=True, exist_ok=True) - print(f"โฌ‡๏ธ Downloading: {artist} - {title} -> {output_path}") - video_url = f"https://www.youtube.com/watch?v={video_id}" - dlp_cmd = [ - str(self.yt_dlp_path), - "--no-check-certificates", - "--ignore-errors", - "--no-warnings", - "-o", str(output_path), - "-f", self.config["download_settings"]["format"], - video_url - ] - try: - result = subprocess.run(dlp_cmd, capture_output=True, text=True, check=True) - print(f"โœ… yt-dlp completed successfully") - print(f"๐Ÿ“„ yt-dlp stdout: {result.stdout}") - except subprocess.CalledProcessError as e: - print(f"โŒ yt-dlp failed with exit code {e.returncode}") - print(f"โŒ yt-dlp stderr: {e.stderr}") - return False - if not output_path.exists(): - print(f"โŒ Download failed: file does not exist: {output_path}") - return False - if output_path.stat().st_size == 0: - print(f"โŒ Download failed: file is empty (0 bytes): {output_path}") - return False - # TEMP: Skipping MP4 validation for 
debugging - # if not self._is_valid_mp4(output_path): - # print(f"โŒ File is not a valid MP4: {output_path}") - # return False - add_id3_tags(output_path, f"{artist} - {title} (Karaoke Version)", channel_name) - mark_songlist_song_downloaded(self.songlist_tracking, artist, title, channel_name, output_path) - print(f"โœ… Downloaded and tracked: {artist} - {title}") - print(f"๐ŸŽ‰ All post-processing complete for: {output_path}") - return True - def reset_songlist_all(): """Delete all files tracked in songlist_tracking.json, clear songlist_tracking.json, and remove songlist songs from karaoke_tracking.json.""" import json diff --git a/karaoke_downloader/fuzzy_matcher.py b/karaoke_downloader/fuzzy_matcher.py new file mode 100644 index 0000000..09a37e6 --- /dev/null +++ b/karaoke_downloader/fuzzy_matcher.py @@ -0,0 +1,87 @@ +""" +Fuzzy matching utilities for songlist-to-video matching. +Handles similarity calculations and match validation. +""" + +def get_similarity_function(): + """ + Get the best available similarity function. + Returns rapidfuzz if available, otherwise falls back to difflib. 
+ """ + try: + from rapidfuzz import fuzz + def similarity(a, b): + return fuzz.ratio(a, b) + return similarity + except ImportError: + import difflib + def similarity(a, b): + return int(difflib.SequenceMatcher(None, a, b).ratio() * 100) + return similarity + +def normalize_title(title): + """Normalize a title for comparison.""" + normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip() + return " ".join(normalized.split()).lower() + +def extract_artist_title(video_title): + """Extract artist and title from video title.""" + if " - " in video_title: + parts = video_title.split(" - ", 1) + return parts[0].strip(), parts[1].strip() + return "", video_title + +def create_song_key(artist, title): + """Create a normalized key for song comparison.""" + return f"{artist.lower()}_{normalize_title(title)}" + +def create_video_key(video_title): + """Create a normalized key for video comparison.""" + artist, title = extract_artist_title(video_title) + return f"{artist.lower()}_{normalize_title(title)}" + +def is_fuzzy_match(songlist_artist, songlist_title, video_title, threshold=90): + """ + Check if a songlist entry matches a video title using fuzzy matching. + + Args: + songlist_artist: Artist from songlist + songlist_title: Title from songlist + video_title: YouTube video title + threshold: Minimum similarity score (0-100) + + Returns: + tuple: (is_match, score) where is_match is boolean and score is the similarity score + """ + similarity = get_similarity_function() + + song_key = create_song_key(songlist_artist, songlist_title) + video_key = create_video_key(video_title) + + score = similarity(song_key, video_key) + is_match = score >= threshold + + return is_match, score + +def is_exact_match(songlist_artist, songlist_title, video_title): + """ + Check if a songlist entry exactly matches a video title. 
+ + Args: + songlist_artist: Artist from songlist + songlist_title: Title from songlist + video_title: YouTube video title + + Returns: + bool: True if exact match, False otherwise + """ + v_artist, v_title = extract_artist_title(video_title) + + # Check artist and title separately + artist_match = normalize_title(v_artist) == normalize_title(songlist_artist) + title_match = normalize_title(v_title) == normalize_title(songlist_title) + + # Also check if video title matches "artist - title" format + full_title_match = normalize_title(video_title) == normalize_title(f"{songlist_artist} - {songlist_title}") + + return (artist_match and title_match) or full_title_match \ No newline at end of file diff --git a/karaoke_downloader/server_manager.py b/karaoke_downloader/server_manager.py new file mode 100644 index 0000000..dcd838a --- /dev/null +++ b/karaoke_downloader/server_manager.py @@ -0,0 +1,86 @@ +""" +Server management utilities. +Handles server songs loading and server duplicates tracking. 
+""" + +import json +from pathlib import Path +from datetime import datetime + +def load_server_songs(songs_path="data/songs.json"): + """Load the list of songs already available on the server.""" + songs_file = Path(songs_path) + if not songs_file.exists(): + print(f"โš ๏ธ Server songs file not found: {songs_path}") + return set() + try: + with open(songs_file, 'r', encoding='utf-8') as f: + data = json.load(f) + server_songs = set() + for song in data: + if "artist" in song and "title" in song: + artist = song["artist"].strip() + title = song["title"].strip() + key = f"{artist.lower()}_{normalize_title(title)}" + server_songs.add(key) + print(f"๐Ÿ“‹ Loaded {len(server_songs)} songs from server (songs.json)") + return server_songs + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f"โš ๏ธ Could not load server songs: {e}") + return set() + +def is_song_on_server(server_songs, artist, title): + """Check if a song is already available on the server.""" + key = f"{artist.lower()}_{normalize_title(title)}" + return key in server_songs + +def load_server_duplicates_tracking(tracking_path="data/server_duplicates_tracking.json"): + """Load the tracking of songs found to be duplicates on the server.""" + tracking_file = Path(tracking_path) + if not tracking_file.exists(): + return {} + try: + with open(tracking_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f"โš ๏ธ Could not load server duplicates tracking: {e}") + return {} + +def save_server_duplicates_tracking(tracking, tracking_path="data/server_duplicates_tracking.json"): + """Save the tracking of songs found to be duplicates on the server.""" + try: + with open(tracking_path, 'w', encoding='utf-8') as f: + json.dump(tracking, f, indent=2, ensure_ascii=False) + except Exception as e: + print(f"โš ๏ธ Could not save server duplicates tracking: {e}") + +def is_song_marked_as_server_duplicate(tracking, artist, title): + """Check 
if a song has been marked as a server duplicate.""" + key = f"{artist.lower()}_{normalize_title(title)}" + return key in tracking + +def mark_song_as_server_duplicate(tracking, artist, title, video_title, channel_name): + """Mark a song as a server duplicate for future skipping.""" + key = f"{artist.lower()}_{normalize_title(title)}" + tracking[key] = { + "artist": artist, + "title": title, + "video_title": video_title, + "channel": channel_name, + "marked_at": datetime.now().isoformat(), + "reason": "already_on_server" + } + save_server_duplicates_tracking(tracking) + +def check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, artist, title, video_title, channel_name): + """Check if a song is on server and mark it as duplicate if so. Returns True if it's a duplicate.""" + if is_song_on_server(server_songs, artist, title): + if not is_song_marked_as_server_duplicate(server_duplicates_tracking, artist, title): + mark_song_as_server_duplicate(server_duplicates_tracking, artist, title, video_title, channel_name) + return True + return False + +def normalize_title(title): + """Normalize a title for consistent key generation.""" + normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip() + return " ".join(normalized.split()).lower() \ No newline at end of file diff --git a/karaoke_downloader/songlist_manager.py b/karaoke_downloader/songlist_manager.py index 995c48d..09a372b 100644 --- a/karaoke_downloader/songlist_manager.py +++ b/karaoke_downloader/songlist_manager.py @@ -1,6 +1,15 @@ +""" +Songlist management utilities. +Handles songlist loading, tracking, and songlist-specific operations. 
+""" + import json from pathlib import Path from datetime import datetime +from karaoke_downloader.server_manager import ( + load_server_songs, is_song_on_server, load_server_duplicates_tracking, + check_and_mark_server_duplicate, is_song_marked_as_server_duplicate +) def load_songlist(songlist_path="data/songList.json"): songlist_file = Path(songlist_path) @@ -68,31 +77,4 @@ def mark_songlist_song_downloaded(tracking, artist, title, channel_name, file_pa "file_path": str(file_path), "downloaded_at": datetime.now().isoformat() } - save_songlist_tracking(tracking) - -def load_server_songs(songs_path="data/songs.json"): - """Load the list of songs already available on the server.""" - songs_file = Path(songs_path) - if not songs_file.exists(): - print(f"โš ๏ธ Server songs file not found: {songs_path}") - return set() - try: - with open(songs_file, 'r', encoding='utf-8') as f: - data = json.load(f) - server_songs = set() - for song in data: - if "artist" in song and "title" in song: - artist = song["artist"].strip() - title = song["title"].strip() - key = f"{artist.lower()}_{normalize_title(title)}" - server_songs.add(key) - print(f"๐Ÿ“‹ Loaded {len(server_songs)} songs from server (songs.json)") - return server_songs - except (json.JSONDecodeError, FileNotFoundError) as e: - print(f"โš ๏ธ Could not load server songs: {e}") - return set() - -def is_song_on_server(server_songs, artist, title): - """Check if a song is already available on the server.""" - key = f"{artist.lower()}_{normalize_title(title)}" - return key in server_songs \ No newline at end of file + save_songlist_tracking(tracking) \ No newline at end of file diff --git a/karaoke_downloader/tracking_manager.py b/karaoke_downloader/tracking_manager.py index 7b590ad..598b0ba 100644 --- a/karaoke_downloader/tracking_manager.py +++ b/karaoke_downloader/tracking_manager.py @@ -135,6 +135,105 @@ class TrackingManager: } return json.dumps(report, indent=2, ensure_ascii=False) + def is_song_downloaded(self, 
artist, title, channel_name=None, video_id=None): + """ + Check if a song has already been downloaded by this system. + Returns True if the song exists in tracking with DOWNLOADED or CONVERTED status. + """ + # If we have video_id and channel_name, try direct key lookup first (most efficient) + if video_id and channel_name: + song_key = f"{video_id}@{channel_name}" + if song_key in self.data['songs']: + song_data = self.data['songs'][song_key] + if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]: + return True + + # Fallback to content search (for cases where we don't have video_id) + for song_id, song_data in self.data['songs'].items(): + # Check if this song matches the artist and title + if song_data.get('artist') == artist and song_data.get('title') == title: + # Check if it's marked as downloaded + if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]: + return True + # Also check the video title field which might contain the song info + video_title = song_data.get('video_title', '') + if video_title and artist in video_title and title in video_title: + if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]: + return True + return False + + def is_file_exists(self, file_path): + """ + Check if a file already exists on the filesystem. + """ + return Path(file_path).exists() + + def is_song_failed(self, artist, title, channel_name=None, video_id=None): + """ + Check if a song has previously failed to download. + Returns True if the song exists in tracking with FAILED status. 
+ """ + # If we have video_id and channel_name, try direct key lookup first (most efficient) + if video_id and channel_name: + song_key = f"{video_id}@{channel_name}" + if song_key in self.data['songs']: + song_data = self.data['songs'][song_key] + if song_data.get('status') == SongStatus.FAILED: + return True + + # Fallback to content search (for cases where we don't have video_id) + for song_id, song_data in self.data['songs'].items(): + # Check if this song matches the artist and title + if song_data.get('artist') == artist and song_data.get('title') == title: + # Check if it's marked as failed + if song_data.get('status') == SongStatus.FAILED: + return True + # Also check the video title field which might contain the song info + video_title = song_data.get('video_title', '') + if video_title and artist in video_title and title in video_title: + if song_data.get('status') == SongStatus.FAILED: + return True + return False + + def mark_song_downloaded(self, artist, title, video_id, channel_name, file_path, file_size=None): + """ + Mark a song as downloaded in the tracking system. + """ + # Use the existing tracking structure: video_id@channel_name + song_key = f"{video_id}@{channel_name}" + + self.data['songs'][song_key] = { + 'artist': artist, + 'title': title, + 'video_id': video_id, + 'channel_name': channel_name, + 'video_title': f"{artist} - {title}", + 'file_path': str(file_path), + 'file_size': file_size, + 'status': SongStatus.DOWNLOADED, + 'last_updated': datetime.now().isoformat() + } + self._save() + + def mark_song_failed(self, artist, title, video_id, channel_name, error_message=None): + """ + Mark a song as failed in the tracking system. 
+ """ + # Use the existing tracking structure: video_id@channel_name + song_key = f"{video_id}@{channel_name}" + + self.data['songs'][song_key] = { + 'artist': artist, + 'title': title, + 'video_id': video_id, + 'channel_name': channel_name, + 'video_title': f"{artist} - {title}", + 'status': SongStatus.FAILED, + 'error_message': error_message, + 'last_updated': datetime.now().isoformat() + } + self._save() + def get_channel_video_list(self, channel_url, yt_dlp_path="downloader/yt-dlp.exe", force_refresh=False): """ Return a list of videos (dicts with 'title' and 'id') for the channel, using cache if available unless force_refresh is True. diff --git a/karaoke_downloader/video_downloader.py b/karaoke_downloader/video_downloader.py new file mode 100644 index 0000000..95c76b9 --- /dev/null +++ b/karaoke_downloader/video_downloader.py @@ -0,0 +1,327 @@ +""" +Core video download logic and file validation. +Handles the actual downloading and post-processing of videos. +""" + +import subprocess +from pathlib import Path +from karaoke_downloader.id3_utils import add_id3_tags +from karaoke_downloader.songlist_manager import mark_songlist_song_downloaded +from karaoke_downloader.download_planner import save_plan_cache + +# Constants +DEFAULT_FILENAME_LENGTH_LIMIT = 100 +DEFAULT_ARTIST_LENGTH_LIMIT = 30 +DEFAULT_TITLE_LENGTH_LIMIT = 60 +DEFAULT_FORMAT_CHECK_TIMEOUT = 30 + +def sanitize_filename(artist, title): + """ + Create a safe filename from artist and title. + Removes invalid characters and limits length. 
+ """ + # Create a shorter, safer filename + safe_title = title.replace("(From ", "").replace(")", "").replace(" - ", " ").replace(":", "").replace("'", "").replace('"', "") + safe_artist = artist.replace("'", "").replace('"', "") + + # Remove all Windows-invalid characters + invalid_chars = ['?', ':', '*', '"', '<', '>', '|', '/', '\\'] + for char in invalid_chars: + safe_title = safe_title.replace(char, "") + safe_artist = safe_artist.replace(char, "") + + # Also remove any other potentially problematic characters + safe_title = safe_title.replace("...", "").replace("..", "").replace(".", "").strip() + safe_artist = safe_artist.strip() + + filename = f"{safe_artist} - {safe_title}.mp4" + + # Limit filename length to avoid Windows path issues + if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT: + filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4" + + return filename + +def is_valid_mp4(file_path): + """ + Check if a file is a valid MP4 file. + Uses ffprobe if available, otherwise checks file extension and size. + """ + if not file_path.exists(): + return False + + # Check file size + if file_path.stat().st_size == 0: + return False + + # Try to use ffprobe for validation + try: + import subprocess + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(file_path)], + capture_output=True, + text=True, + check=True + ) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + # If ffprobe is not available, just check the extension and size + return file_path.suffix.lower() == '.mp4' and file_path.stat().st_size > 0 + +def download_video_and_track(yt_dlp_path, config, downloads_dir, songlist_tracking, + channel_name, channel_url, video_id, video_title, + artist, title, filename): + """ + Download a single video and track its status. + Returns True if successful, False otherwise. 
+ """ + output_path = downloads_dir / channel_name / filename + return download_single_video( + output_path, video_id, config, yt_dlp_path, + artist, title, channel_name, songlist_tracking + ) + +def download_single_video(output_path, video_id, config, yt_dlp_path, + artist, title, channel_name, songlist_tracking): + """Download a single video and handle post-processing.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + print(f"โฌ‡๏ธ Downloading: {artist} - {title} -> {output_path}") + + video_url = f"https://www.youtube.com/watch?v={video_id}" + dlp_cmd = [ + str(yt_dlp_path), + "--no-check-certificates", + "--ignore-errors", + "--no-warnings", + "-o", str(output_path), + "-f", config["download_settings"]["format"], + video_url + ] + + print(f"๐Ÿ”ง Running command: {' '.join(dlp_cmd)}") + print(f"๐Ÿ“บ Resolution settings: {config.get('download_settings', {}).get('preferred_resolution', 'Unknown')}") + print(f"๐ŸŽฌ Format string: {config.get('download_settings', {}).get('format', 'Unknown')}") + + # Debug: Show available formats (optional) + if config.get('debug_show_formats', False): + show_available_formats(yt_dlp_path, video_url) + + try: + result = subprocess.run(dlp_cmd, capture_output=True, text=True, check=True) + print(f"โœ… yt-dlp completed successfully") + print(f"๐Ÿ“„ yt-dlp stdout: {result.stdout}") + except subprocess.CalledProcessError as e: + print(f"โŒ yt-dlp failed with exit code {e.returncode}") + print(f"โŒ yt-dlp stderr: {e.stderr}") + # Mark song as failed in tracking + error_msg = f"yt-dlp failed with exit code {e.returncode}: {e.stderr}" + _mark_song_failed_standalone(artist, title, video_id, channel_name, error_msg) + return False + + # Verify download + if not verify_download(output_path, artist, title, video_id, channel_name): + return False + + # Post-processing + add_id3_tags(output_path, f"{artist} - {title} (Karaoke Version)", channel_name) + mark_songlist_song_downloaded(songlist_tracking, artist, title, channel_name, 
def _mark_song_failed_standalone(artist, title, video_id, channel_name, error_message):
    """Record a failed download for a song through a fresh TrackingManager."""
    # Imported inside the function rather than at module level.
    from karaoke_downloader.tracking_manager import TrackingManager

    TrackingManager().mark_song_failed(artist, title, video_id, channel_name, error_message)
    print(f"๐Ÿท๏ธ Marked song as failed: {artist} - {title}")


def show_available_formats(yt_dlp_path, video_url):
    """
    Print the formats yt-dlp reports for a video (debugging aid).

    Any error while running the listing is reported on stdout and swallowed,
    so this helper never interrupts a download.
    """
    print(f"๐Ÿ” Checking available formats for: {video_url}")
    listing_cmd = [str(yt_dlp_path), "--list-formats", video_url]
    try:
        listing = subprocess.run(
            listing_cmd,
            capture_output=True,
            text=True,
            timeout=DEFAULT_FORMAT_CHECK_TIMEOUT,
        )
        print(f"๐Ÿ“‹ Available formats:\n{listing.stdout}")
    except Exception as err:
        print(f"โš ๏ธ Could not check formats: {err}")
def verify_download(output_path, artist, title, video_id=None, channel_name=None):
    """
    Verify that a download produced a usable, non-empty file.

    If ``output_path`` is missing, the ``*.mp4`` files in the same directory
    are scanned for a name that contains the artist and at least one word of
    the title (case-insensitive); the first such file is accepted in its
    place. The accepted file must then be larger than zero bytes.

    Every failure path records the song as failed, provided both
    ``video_id`` and ``channel_name`` were supplied.

    Args:
        output_path: Path where the download was expected.
        artist: Artist name used for the fallback filename match.
        title: Song title used for the fallback filename match.
        video_id: Optional video id, needed to record a failure.
        channel_name: Optional channel name, needed to record a failure.

    Returns:
        bool: True if a non-empty file was found, False otherwise.
    """
    def _record_failure(reason):
        # Tracking needs both identifiers; without them only the bool is returned.
        if video_id and channel_name:
            _mark_song_failed_standalone(artist, title, video_id, channel_name, reason)

    if not output_path.exists():
        print(f"โŒ Download failed: file does not exist: {output_path}")
        # Check if yt-dlp saved it somewhere else in the same directory.
        possible_files = list(output_path.parent.glob("*.mp4"))
        if not possible_files:
            _record_failure("Download failed: file does not exist")
            return False

        print(f"๐Ÿ” Found these files in the directory: {[f.name for f in possible_files]}")
        artist_part = artist.lower()
        title_words = title.lower().split()
        match = None
        for candidate in possible_files:
            stem = candidate.stem.lower()
            if artist_part in stem and any(word in stem for word in title_words):
                match = candidate
                break

        if match is None:
            print(f"โŒ No matching file found for: {artist} - {title}")
            _record_failure("Download failed: file does not exist and no matching file found")
            return False

        print(f"๐ŸŽฏ Found matching file: {match.name}")
        # NOTE(review): only the local name is rebound; callers keep the path
        # they passed in, so they do not learn about the substitute file.
        output_path = match

    if output_path.stat().st_size == 0:
        print(f"โŒ Download failed: file is empty (0 bytes): {output_path}")
        # Previously this path skipped failure tracking, unlike the
        # missing-file paths above.
        _record_failure("Download failed: file is empty (0 bytes)")
        return False

    # NOTE: container-level validation (is_valid_mp4) is not applied here.
    return True
def execute_download_plan(download_plan, unmatched, cache_file, config, yt_dlp_path,
                          downloads_dir, songlist_tracking, limit=None):
    """
    Execute a download plan with progress tracking and cache management.

    Each successfully downloaded item is removed from ``download_plan`` (the
    caller's list is mutated) and the remaining plan is written to the cache
    so an interrupted run can resume. The cache file is deleted once the
    loop finishes.

    Args:
        download_plan: List of download items (dicts with 'artist', 'title',
            'channel_name', 'video_id', 'video_title' and optionally
            'match_score').
        unmatched: List of unmatched songs (saved with the cache, counted in
            the summary).
        cache_file: Path to the cache file for progress tracking.
        config: Configuration dictionary.
        yt_dlp_path: Path to the yt-dlp executable.
        downloads_dir: Directory for downloads.
        songlist_tracking: Songlist tracking data.
        limit: Optional limit on the number of successful downloads.

    Returns:
        tuple: (downloaded_count, success)
    """
    downloaded_count = 0
    total_to_download = limit if limit is not None else len(download_plan)

    # Walk a snapshot so completed items can be removed from the live list.
    for position, item in enumerate(list(download_plan), start=1):
        if limit is not None and downloaded_count >= limit:
            break

        artist = item['artist']
        title = item['title']
        channel_name = item['channel_name']
        video_id = item['video_id']
        video_title = item['video_title']

        print(f"\nโฌ‡๏ธ Downloading {position} of {total_to_download}:")
        print(f" ๐Ÿ“‹ Songlist: {artist} - {title}")
        print(f" ๐ŸŽฌ Video: {video_title} ({channel_name})")
        if 'match_score' in item:
            print(f" ๐ŸŽฏ Match Score: {item['match_score']:.1f}%")

        # Create filename
        filename = sanitize_filename(artist, title)
        output_path = downloads_dir / channel_name / filename

        # Download the file
        success = download_single_video(
            output_path, video_id, config, yt_dlp_path,
            artist, title, channel_name, songlist_tracking
        )

        if success:
            downloaded_count += 1
            # Remove the item itself: snapshot positions stop matching the
            # live list after the first removal, so popping by index would
            # drop the wrong entry or raise IndexError.
            download_plan.remove(item)
            save_plan_cache(cache_file, download_plan, unmatched)
            print(f"๐Ÿ—‘๏ธ Removed completed item from download plan. {len(download_plan)} items remaining.")

        # Delete cache if all items are complete
        if len(download_plan) == 0:
            cleanup_cache(cache_file)

    print(f"๐ŸŽ‰ Downloaded {downloaded_count} songlist songs.")
    print(f"๐Ÿ“Š Summary: Found {downloaded_count} songs, {len(unmatched)} songs not found.")

    # Final cleanup: the cache is removed whether or not items remain.
    cleanup_cache(cache_file)

    return downloaded_count, True


def cleanup_cache(cache_file):
    """
    Delete the download plan cache file if it exists.

    Deletion errors are reported on stdout and not raised.
    """
    if cache_file.exists():
        try:
            cache_file.unlink()
            print(f"๐Ÿ—‘๏ธ Deleted download plan cache: {cache_file.name}")
        except Exception as e:
            print(f"โš ๏ธ Could not delete download plan cache: {e}")
def should_skip_song_standalone(artist, title, channel_name, video_id, video_title, downloads_dir, tracker=None, server_songs=None, server_duplicates_tracking=None):
    """
    Standalone function to check if a song should be skipped.

    Performs four checks in order and stops at the first hit:
    1. Already downloaded (tracking) - if tracker provided
    2. File exists on filesystem (non-empty file under
       ``downloads_dir / channel_name``)
    3. Already on server - if both server arguments are provided
    4. Previously failed download (bad file) - if tracker provided

    Args:
        artist: Song artist.
        title: Song title.
        channel_name: Channel name, also the download sub-directory.
        video_id: Video id passed to the tracker checks.
        video_title: Video title passed to the server duplicate check.
        downloads_dir: Base Path of the downloads directory.
        tracker: Optional tracker offering ``is_song_downloaded`` and
            ``is_song_failed``.
        server_songs: Optional server song data.
        server_duplicates_tracking: Optional server duplicate tracking data.

    Returns:
        tuple: (should_skip, reason, total_filtered); ``reason`` is None when
        the song should not be skipped, and ``total_filtered`` is 1 only for
        a server duplicate.
    """
    total_filtered = 0

    # Check 1: Already downloaded by this system (if tracker provided)
    if tracker and tracker.is_song_downloaded(artist, title, channel_name, video_id):
        return True, "already downloaded", total_filtered

    # Check 2: File already exists on filesystem
    safe_title = title
    invalid_chars = ['?', ':', '*', '"', '<', '>', '|', '/', '\\']
    for char in invalid_chars:
        safe_title = safe_title.replace(char, "")
    safe_title = safe_title.replace("...", "").replace("..", "").replace(".", "").strip()

    # Try the filename patterns that might exist. The first entry is the
    # name the songlist download path writes via sanitize_filename; without
    # it, titles containing quotes or parentheses were never found on disk.
    possible_filenames = [
        sanitize_filename(artist, title),                  # Name produced at download time
        f"{artist} - {safe_title}.mp4",                    # Songlist mode
        f"{channel_name} - {safe_title}.mp4",              # Latest-per-channel mode
        f"{artist} - {safe_title} (Karaoke Version).mp4"   # Channel videos mode
    ]

    channel_dir = downloads_dir / channel_name
    for filename in possible_filenames:
        if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT:
            # Apply length limits if needed
            safe_artist = artist.replace("'", "").replace('"', "").strip()
            filename = f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]} - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"

        output_path = channel_dir / filename
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, "file exists", total_filtered

    # Check 3: Already on server (if server data provided)
    if server_songs is not None and server_duplicates_tracking is not None:
        from karaoke_downloader.server_manager import check_and_mark_server_duplicate
        if check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, artist, title, video_title, channel_name):
            total_filtered += 1
            return True, "on server", total_filtered

    # Check 4: Previously failed download (bad file) - if tracker provided
    if tracker and tracker.is_song_failed(artist, title, channel_name, video_id):
        return True, "previously failed", total_filtered

    return False, None, total_filtered