Compare commits

...

10 Commits

31 changed files with 20667 additions and 1365 deletions

18
.flake8 Normal file
View File

@@ -0,0 +1,18 @@
[flake8]
max-line-length = 88
extend-ignore =
E203
E501
W503
W504
exclude =
.git,
__pycache__,
.venv,
.mypy_cache,
build,
dist,
*.egg-info,
per-file-ignores =
__init__.py:F401
max-complexity = 10

127
PRD.md
View File

@@ -1,27 +1,48 @@
# 🎤 Karaoke Video Downloader PRD (v3.1)
# 🎤 Karaoke Video Downloader PRD (v3.4)
## ✅ Overview
A Python-based Windows CLI tool to download karaoke videos from YouTube channels/playlists using `yt-dlp.exe`, with advanced tracking, songlist prioritization, and flexible configuration. The codebase has been refactored into a modular architecture for improved maintainability and separation of concerns.
A Python-based Windows CLI tool to download karaoke videos from YouTube channels/playlists using `yt-dlp.exe`, with advanced tracking, songlist prioritization, and flexible configuration. The codebase has been comprehensively refactored into a modular architecture with centralized utilities for improved maintainability, error handling, and code reuse.
---
## 🏗️ Architecture
The codebase has been refactored into focused modules:
The codebase has been refactored into focused modules with centralized utilities:
- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions
- **`download_planner.py`**: Download plan building and channel scanning (optimized)
- **`cache_manager.py`**: Cache operations and file I/O management
- **`video_downloader.py`**: Core video download execution and orchestration
- **`channel_manager.py`**: Channel and file management operations
### Core Modules:
- **`downloader.py`**: Main orchestrator and CLI interface
- **`video_downloader.py`**: Core video download execution and orchestration
- **`tracking_manager.py`**: Download tracking and status management
- **`download_planner.py`**: Download plan building and channel scanning
- **`cache_manager.py`**: Cache operations and file I/O management
- **`channel_manager.py`**: Channel and file management operations
- **`songlist_manager.py`**: Songlist operations and tracking
- **`server_manager.py`**: Server song availability checking
- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions
### Benefits of Modular Architecture:
### Utility Modules (v3.2):
- **`youtube_utils.py`**: Centralized YouTube operations and yt-dlp command generation
- **`error_utils.py`**: Standardized error handling and formatting
- **`download_pipeline.py`**: Abstracted download → verify → tag → track pipeline
- **`id3_utils.py`**: ID3 tagging utilities
- **`config_manager.py`**: Configuration management
- **`resolution_cli.py`**: Resolution checking utilities
- **`tracking_cli.py`**: Tracking management CLI
### New Utility Modules (v3.3):
- **`file_utils.py`**: Centralized file operations, filename sanitization, and file validation
- **`song_validator.py`**: Centralized song validation logic for checking if songs should be downloaded
### Benefits of Enhanced Modular Architecture:
- **Single Responsibility**: Each module has a focused purpose
- **Centralized Utilities**: Common operations (file operations, song validation, yt-dlp commands, error handling) are centralized
- **Reduced Duplication**: Eliminated ~150 lines of code duplication across modules
- **Testability**: Individual components can be tested separately
- **Maintainability**: Easier to find and fix issues
- **Reusability**: Components can be used independently
- **Robustness**: Better error handling and interruption recovery
- **Consistency**: Standardized error messages and processing pipelines
- **Type Safety**: Comprehensive type hints across all new modules
---
@@ -79,6 +100,7 @@ python download_karaoke.py --clear-cache SingKingKaraoke
- ✅ Configurable download resolution and yt-dlp options (`data/config.json`)
- ✅ Songlist integration: prioritize and track custom songlists
- ✅ Songlist-only mode: download only songs from the songlist
- ✅ Songlist focus mode: download only songs from specific playlists by title
- ✅ Global songlist tracking to avoid duplicates across channels
- ✅ ID3 tagging for artist/title in MP4 files (mutagen)
- ✅ Real-time progress and detailed logging
@@ -93,6 +115,13 @@ python download_karaoke.py --clear-cache SingKingKaraoke
- ✅ **Default channel file**: If no --file is specified for songlist-only or latest-per-channel modes, automatically uses data/channels.txt as the default channel list.
- ✅ **Robust interruption handling**: Progress is saved after each download, and files are checked for existence before downloading to prevent re-downloads if the process is interrupted.
- ✅ **Optimized scanning performance**: High-performance channel scanning with O(n×m) complexity, pre-processed lookups, and early termination for faster matching of large songlists and channels.
- ✅ **Centralized yt-dlp command generation**: Standardized command building and execution across all download operations
- ✅ **Enhanced error handling**: Structured exception hierarchy with consistent error messages and formatting
- ✅ **Abstracted download pipeline**: Reusable download → verify → tag → track process for consistent processing
- ✅ **Reduced code duplication**: Eliminated duplicate code across modules through centralized utilities
- ✅ **Centralized file operations**: Single source of truth for filename sanitization, file validation, and path operations
- ✅ **Centralized song validation**: Unified logic for checking if songs should be downloaded across all modules
- ✅ **Enhanced configuration management**: Structured configuration with dataclasses, type safety, and validation
---
@@ -102,15 +131,21 @@ KaroakeVideoDownloader/
├── karaoke_downloader/ # All core Python code and utilities
│ ├── downloader.py # Main orchestrator and CLI interface
│ ├── cli.py # CLI entry point
│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions
│ ├── download_planner.py # Download plan building and channel scanning (optimized)
│ ├── cache_manager.py # Cache operations and file I/O management
│ ├── video_downloader.py # Core video download execution and orchestration
│ ├── tracking_manager.py # Download tracking and status management
│ ├── download_planner.py # Download plan building and channel scanning
│ ├── cache_manager.py # Cache operations and file I/O management
│ ├── channel_manager.py # Channel and file management operations
│ ├── id3_utils.py # ID3 tagging helpers
│ ├── songlist_manager.py # Songlist logic
│ ├── youtube_utils.py # YouTube helpers
│ ├── tracking_manager.py # Tracking logic
│ ├── songlist_manager.py # Songlist operations and tracking
│ ├── server_manager.py # Server song availability checking
│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions
│ ├── youtube_utils.py # Centralized YouTube operations and yt-dlp commands
│ ├── error_utils.py # Standardized error handling and formatting
│ ├── download_pipeline.py # Abstracted download → verify → tag → track pipeline
│ ├── id3_utils.py # ID3 tagging utilities
│ ├── config_manager.py # Configuration management with dataclasses
│ ├── file_utils.py # Centralized file operations and filename handling
│ ├── song_validator.py # Centralized song validation logic
│ ├── check_resolution.py # Resolution checker utility
│ ├── resolution_cli.py # Resolution config CLI
│ └── tracking_cli.py # Tracking management CLI
@@ -140,6 +175,7 @@ KaroakeVideoDownloader/
- `--file <data/channels.txt>`: Download from a list of channels (optional, defaults to data/channels.txt for songlist modes)
- `--songlist-priority`: Prioritize songlist songs in download queue
- `--songlist-only`: Download only songs from the songlist
- `--songlist-focus <PLAYLIST_TITLE1> <PLAYLIST_TITLE2>...`: Focus on specific playlists by title (e.g., `--songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"`)
- `--songlist-status`: Show songlist download progress
- `--limit <N>`: Limit number of downloads (enables fast mode with early exit)
- `--resolution <720p|1080p|...>`: Override resolution
@@ -151,6 +187,8 @@ KaroakeVideoDownloader/
- `--latest-per-channel`: **Download the latest N videos from each channel (use with --limit)**
- `--fuzzy-match`: **Enable fuzzy matching for songlist-to-video matching (uses rapidfuzz if available)**
- `--fuzzy-threshold <N>`: **Fuzzy match threshold (0-100, default 85)**
- `--parallel`: **Enable parallel downloads for improved speed**
- `--workers <N>`: **Number of parallel download workers (1-10, default: 3)**
---
@@ -161,6 +199,46 @@ KaroakeVideoDownloader/
- **ID3 Tagging:** Artist/title extracted from video title and embedded in MP4 files.
- **Cleanup:** Extra files from yt-dlp (e.g., `.info.json`) are automatically removed after download.
- **Reset/Clear:** Use `--reset-channel` to reset all tracking and files for a channel (optionally including songlist songs with `--reset-songlist`). Use `--clear-cache` to clear cached video lists for a channel or all channels.
## 🔧 Refactoring Improvements (v3.3)
The codebase has been comprehensively refactored to improve maintainability and reduce code duplication. Recent improvements have enhanced reliability, performance, and code organization:
### **New Utility Modules (v3.3)**
- **`file_utils.py`**: Centralized file operations, filename sanitization, and file validation
- `sanitize_filename()`: Create safe filenames from artist/title
- `generate_possible_filenames()`: Generate filename patterns for different modes
- `check_file_exists_with_patterns()`: Check for existing files using multiple patterns
- `is_valid_mp4_file()`: Validate MP4 files with header checking
- `cleanup_temp_files()`: Remove temporary yt-dlp files
- `ensure_directory_exists()`: Safe directory creation
- **`song_validator.py`**: Centralized song validation logic
- `SongValidator` class: Unified logic for checking if songs should be downloaded
- `should_skip_song()`: Comprehensive validation with multiple criteria
- `mark_song_failed()`: Consistent failure tracking
- `handle_download_failure()`: Standardized error handling
- **Enhanced `config_manager.py`**: Robust configuration management with dataclasses
- `ConfigManager` class: Type-safe configuration loading and caching
- `DownloadSettings`, `FolderStructure`, `LoggingConfig` dataclasses
- Configuration validation and merging with defaults
- Dynamic resolution updates
### **Benefits Achieved**
- **Eliminated Code Duplication**: ~150 lines of duplicate code removed across modules
- **Centralized File Operations**: Single source of truth for filename handling and file validation
- **Unified Song Validation**: Consistent logic for checking if songs should be downloaded
- **Enhanced Type Safety**: Comprehensive type hints across all new modules
- **Improved Configuration Management**: Structured configuration with validation and caching
- **Better Error Handling**: Consistent patterns via centralized utilities
- **Enhanced Maintainability**: Changes to file operations or song validation only require updates in one place
- **Improved Testability**: Modular components can be tested independently
- **Better Developer Experience**: Clear function signatures and comprehensive documentation
### **Previous Improvements (v3.2)**
- **Centralized yt-dlp Command Generation**: Standardized command building and execution across all download operations
- **Enhanced Error Handling**: Structured exception hierarchy with consistent error messages and formatting
- **Abstracted Download Pipeline**: Reusable download → verify → tag → track process for consistent processing
- **Download plan pre-scan:** Before downloading, the tool scans all channels for songlist matches, builds a download plan, and prints stats (matches, unmatched, per-channel breakdown). The plan is cached for 1 day and reused unless --force-download-plan is set.
- **Latest-per-channel plan:** Download the latest N videos from each channel, with a per-channel plan and robust resume. Each channel is removed from the plan as it completes. Plan cache is deleted when all channels are done.
- **Fast mode with early exit:** When a limit is set, the tool scans channels and songs in order, downloads immediately when a match is found, and stops as soon as the limit is reached with successful downloads. This provides much faster performance for small limits compared to the full pre-scan approach.
@@ -169,6 +247,18 @@ KaroakeVideoDownloader/
- **Default channel file:** For songlist-only and latest-per-channel modes, if no --file is specified, automatically uses data/channels.txt as the default channel list, reducing the need to specify the file path repeatedly.
- **Robust interruption handling:** Progress is saved after each download, and files are checked for existence before downloading to prevent re-downloads if the process is interrupted.
- **Optimized scanning algorithm:** High-performance channel scanning with O(n×m) complexity, pre-processed song lookups using sets and dictionaries, and early termination for faster matching of large songlists and channels.
- **Enhanced cache management:** Improved channel cache key handling for better cache hit rates and reduced YouTube API calls.
- **Robust download plan execution:** Fixed index management in download plan execution to prevent errors during interrupted downloads.
### **New Parallel Download System (v3.4)**
- **Parallel downloader module:** `parallel_downloader.py` provides thread-safe concurrent download management
- **Configurable concurrency:** Use `--parallel --workers N` to enable parallel downloads with N workers (1-10)
- **Thread-safe operations:** All tracking, caching, and progress operations are thread-safe
- **Real-time progress tracking:** Shows active downloads, completion status, and overall progress
- **Automatic retry mechanism:** Failed downloads are automatically retried with reduced concurrency
- **Backward compatibility:** Sequential downloads remain the default when `--parallel` is not used
- **Performance improvements:** Significantly faster downloads for large batches (3-5x speedup with 3-5 workers)
- **Integrated with all modes:** Works with both songlist-across-channels and latest-per-channel download modes
---
@@ -177,6 +267,9 @@ KaroakeVideoDownloader/
- [ ] More advanced song matching (multi-language)
- [ ] Download scheduling and retry logic
- [ ] More granular status reporting
- [ ] Parallel downloads for improved speed
- [x] **Parallel downloads for improved speed** ✅ **COMPLETED**
- [ ] Unit tests for all modules
- [ ] Integration tests for end-to-end workflows
- [ ] Plugin system for custom file operations
- [ ] Advanced configuration UI
- [ ] Real-time download progress visualization

203
README.md
View File

@@ -20,17 +20,64 @@ A Python-based Windows CLI tool to download karaoke videos from YouTube channels
- 🛡️ **Robust Interruption Handling**: Progress is saved after each download, preventing re-downloads if the process is interrupted
- ⚡ **Optimized Scanning**: High-performance channel scanning with O(n×m) complexity, pre-processed lookups, and early termination for faster matching
- 🏷️ **Server Duplicates Tracking**: Automatically checks against local songs.json file and marks duplicates for future skipping, preventing re-downloads of songs already on the server
- ⚡ **Parallel Downloads**: Enable concurrent downloads with `--parallel --workers N` for significantly faster batch downloads (3-5x speedup)
## 🏗️ Architecture
The codebase has been refactored into a modular architecture for better maintainability and separation of concerns:
The codebase has been comprehensively refactored into a modular architecture with centralized utilities for improved maintainability, error handling, and code reuse:
- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions
- **`download_planner.py`**: Download plan building and channel scanning (optimized)
- **`cache_manager.py`**: Cache operations and file I/O management
- **`server_manager.py`**: Server songs loading and server duplicates tracking
- **`video_downloader.py`**: Core video download execution and orchestration
- **`channel_manager.py`**: Channel and file management operations
### Core Modules:
- **`downloader.py`**: Main orchestrator and CLI interface
- **`video_downloader.py`**: Core video download execution and orchestration
- **`tracking_manager.py`**: Download tracking and status management
- **`download_planner.py`**: Download plan building and channel scanning
- **`cache_manager.py`**: Cache operations and file I/O management
- **`channel_manager.py`**: Channel and file management operations
- **`songlist_manager.py`**: Songlist operations and tracking
- **`server_manager.py`**: Server song availability checking
- **`fuzzy_matcher.py`**: Fuzzy matching logic and similarity functions
### Utility Modules (v3.2):
- **`youtube_utils.py`**: Centralized YouTube operations and yt-dlp command generation
- **`error_utils.py`**: Standardized error handling and formatting
- **`download_pipeline.py`**: Abstracted download → verify → tag → track pipeline
- **`id3_utils.py`**: ID3 tagging utilities
- **`config_manager.py`**: Configuration management
- **`resolution_cli.py`**: Resolution checking utilities
- **`tracking_cli.py`**: Tracking management CLI
### New Utility Modules (v3.3):
- **`parallel_downloader.py`**: Parallel download management with thread-safe operations
- `ParallelDownloader` class: Manages concurrent downloads with configurable workers
- `DownloadTask` and `DownloadResult` dataclasses: Structured task and result management
- Thread-safe progress tracking and error handling
- Automatic retry mechanism for failed downloads
- **`file_utils.py`**: Centralized file operations, filename sanitization, and file validation
- `sanitize_filename()`: Create safe filenames from artist/title
- `generate_possible_filenames()`: Generate filename patterns for different modes
- `check_file_exists_with_patterns()`: Check for existing files using multiple patterns
- `is_valid_mp4_file()`: Validate MP4 files with header checking
- `cleanup_temp_files()`: Remove temporary yt-dlp files
- `ensure_directory_exists()`: Safe directory creation
- **`song_validator.py`**: Centralized song validation logic
- `SongValidator` class: Unified logic for checking if songs should be downloaded
- `should_skip_song()`: Comprehensive validation with multiple criteria
- `mark_song_failed()`: Consistent failure tracking
- `handle_download_failure()`: Standardized error handling
- **Enhanced `config_manager.py`**: Robust configuration management with dataclasses
- `ConfigManager` class: Type-safe configuration loading and caching
- `DownloadSettings`, `FolderStructure`, `LoggingConfig` dataclasses
- Configuration validation and merging with defaults
- Dynamic resolution updates
### Benefits:
- **Centralized Utilities**: Common operations (file operations, song validation, yt-dlp commands, error handling) are centralized
- **Reduced Duplication**: Eliminated ~150 lines of code duplication across modules
- **Consistency**: Standardized error messages and processing pipelines
- **Maintainability**: Changes isolated to specific modules
- **Testability**: Modular components can be tested independently
- **Type Safety**: Comprehensive type hints across all new modules
## 📋 Requirements
- **Windows 10/11**
@@ -42,6 +89,8 @@ The codebase has been refactored into a modular architecture for better maintain
## 🚀 Quick Start
> **💡 Pro Tip**: For a complete list of all available commands, see `commands.txt` - you can copy/paste any command directly into your terminal!
### Download a Channel
```bash
python download_karaoke.py https://www.youtube.com/@SingKingKaraoke/videos
@@ -52,6 +101,16 @@ python download_karaoke.py https://www.youtube.com/@SingKingKaraoke/videos
python download_karaoke.py --songlist-only --limit 5
```
### Download with Parallel Processing
```bash
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
```
### Focus on Specific Playlists by Title
```bash
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"
```
### Download with Fuzzy Matching
```bash
python download_karaoke.py --songlist-only --limit 10 --fuzzy-match --fuzzy-threshold 85
@@ -107,13 +166,26 @@ python download_karaoke.py --clear-cache all
- Place your prioritized song list in `data/songList.json` (see example format below).
- The tool will match and prioritize these songs across all available channel videos.
- Use `--songlist-only` to download only these songs, or `--songlist-priority` to prioritize them in the queue.
- Use `--songlist-focus` to download only songs from specific playlists by title (e.g., `--songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"`).
- Download progress for the songlist is tracked globally in `data/songlist_tracking.json`.
#### Example `data/songList.json`
```json
[
{ "artist": "Taylor Swift", "title": "Cruel Summer" },
{ "artist": "Billie Eilish", "title": "Happier Than Ever" }
{
"title": "2025 - Apple Top 50",
"songs": [
{ "artist": "Kendrick Lamar & SZA", "title": "luther", "position": 1 },
{ "artist": "Kendrick Lamar", "title": "Not Like Us", "position": 2 }
]
},
{
"title": "2024 - Billboard Hot 100",
"songs": [
{ "artist": "Taylor Swift", "title": "Cruel Summer", "position": 1 },
{ "artist": "Billie Eilish", "title": "Happier Than Ever", "position": 2 }
]
}
]
```
@@ -126,19 +198,25 @@ python download_karaoke.py --clear-cache all
## 📂 Folder Structure
```
KaroakeVideoDownloader/
├── commands.txt # Complete CLI commands reference (copy/paste ready)
├── karaoke_downloader/ # All core Python code and utilities
│ ├── downloader.py # Main orchestrator and CLI interface
│ ├── cli.py # CLI entry point
│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions
│ ├── download_planner.py # Download plan building and channel scanning (optimized)
│ ├── cache_manager.py # Cache operations and file I/O management
│ ├── server_manager.py # Server songs loading and server duplicates tracking
│ ├── video_downloader.py # Core video download execution and orchestration
│ ├── tracking_manager.py # Download tracking and status management
│ ├── download_planner.py # Download plan building and channel scanning
│ ├── cache_manager.py # Cache operations and file I/O management
│ ├── channel_manager.py # Channel and file management operations
│ ├── id3_utils.py # ID3 tagging helpers
│ ├── songlist_manager.py # Songlist logic
│ ├── youtube_utils.py # YouTube helpers
│ ├── tracking_manager.py # Tracking logic
│ ├── songlist_manager.py # Songlist operations and tracking
│ ├── server_manager.py # Server song availability checking
│ ├── fuzzy_matcher.py # Fuzzy matching logic and similarity functions
│ ├── youtube_utils.py # Centralized YouTube operations and yt-dlp commands
│ ├── error_utils.py # Standardized error handling and formatting
│ ├── download_pipeline.py # Abstracted download → verify → tag → track pipeline
│ ├── id3_utils.py # ID3 tagging utilities
│ ├── config_manager.py # Configuration management with dataclasses
│ ├── file_utils.py # Centralized file operations and filename handling
│ ├── song_validator.py # Centralized song validation logic
│ ├── check_resolution.py # Resolution checker utility
│ ├── resolution_cli.py # Resolution config CLI
│ └── tracking_cli.py # Tracking management CLI
@@ -163,9 +241,14 @@ KaroakeVideoDownloader/
```
## 🚦 CLI Options
> **📋 Complete Command Reference**: See `commands.txt` for all available commands with examples - perfect for copy/paste!
### Key Options:
- `--file <data/channels.txt>`: Download from a list of channels (optional, defaults to data/channels.txt for songlist modes)
- `--songlist-priority`: Prioritize songlist songs in download queue
- `--songlist-only`: Download only songs from the songlist
- `--songlist-focus <PLAYLIST_TITLE1> <PLAYLIST_TITLE2>...`: Focus on specific playlists by title (e.g., `--songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"`)
- `--songlist-status`: Show songlist download progress
- `--limit <N>`: Limit number of downloads (enables fast mode with early exit)
- `--resolution <720p|1080p|...>`: Override resolution
@@ -177,14 +260,22 @@ KaroakeVideoDownloader/
- `--latest-per-channel`: **Download the latest N videos from each channel (use with --limit)**
- `--fuzzy-match`: Enable fuzzy matching for songlist-to-video matching (uses rapidfuzz if available)
- `--fuzzy-threshold <N>`: Fuzzy match threshold (0-100, default 85)
- `--parallel`: Enable parallel downloads for improved speed
- `--workers <N>`: Number of parallel download workers (1-10, default: 3)
## 📝 Example Usage
> **💡 For complete examples**: See `commands.txt` for all command variations with explanations!
```bash
# Fast mode with fuzzy matching (no need to specify --file)
python download_karaoke.py --songlist-only --limit 10 --fuzzy-match --fuzzy-threshold 85
# Latest videos per channel
python download_karaoke.py --latest-per-channel --limit 5
# Parallel downloads for faster processing
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
# Latest videos per channel with parallel downloads
python download_karaoke.py --parallel --workers 3 --latest-per-channel --limit 5
# Traditional full scan (no limit)
python download_karaoke.py --songlist-only
@@ -206,6 +297,80 @@ python download_karaoke.py --clear-server-duplicates
- All options are in `data/config.json` (format, resolution, metadata, etc.)
- You can edit this file or use CLI flags to override
## 📋 Command Reference File
**`commands.txt`** contains a comprehensive list of all CLI commands with explanations. This file is designed for easy copy/paste usage and includes:
- All basic download commands
- Songlist operations
- Latest-per-channel downloads
- Cache and tracking management
- Reset and cleanup operations
- Advanced combinations
- Common workflows
- Troubleshooting commands
> **🔄 Maintenance Note**: The `commands.txt` file should be kept up to date with any CLI changes. When adding new command-line options or modifying existing ones, update this file to reflect all available commands and their usage.
## 🔧 Refactoring Improvements (v3.3)
The codebase has been comprehensively refactored to improve maintainability and reduce code duplication. Recent improvements have enhanced reliability, performance, and code organization:
### **New Utility Modules (v3.3)**
- **`file_utils.py`**: Centralized file operations, filename sanitization, and file validation
- `sanitize_filename()`: Create safe filenames from artist/title
- `generate_possible_filenames()`: Generate filename patterns for different modes
- `check_file_exists_with_patterns()`: Check for existing files using multiple patterns
- `is_valid_mp4_file()`: Validate MP4 files with header checking
- `cleanup_temp_files()`: Remove temporary yt-dlp files
- `ensure_directory_exists()`: Safe directory creation
- **`song_validator.py`**: Centralized song validation logic
- `SongValidator` class: Unified logic for checking if songs should be downloaded
- `should_skip_song()`: Comprehensive validation with multiple criteria
- `mark_song_failed()`: Consistent failure tracking
- `handle_download_failure()`: Standardized error handling
- **Enhanced `config_manager.py`**: Robust configuration management with dataclasses
- `ConfigManager` class: Type-safe configuration loading and caching
- `DownloadSettings`, `FolderStructure`, `LoggingConfig` dataclasses
- Configuration validation and merging with defaults
- Dynamic resolution updates
### **Benefits Achieved**
- **Eliminated Code Duplication**: ~150 lines of duplicate code removed across modules
- **Centralized File Operations**: Single source of truth for filename handling and file validation
- **Unified Song Validation**: Consistent logic for checking if songs should be downloaded
- **Enhanced Type Safety**: Comprehensive type hints across all new modules
- **Improved Configuration Management**: Structured configuration with validation and caching
- **Better Error Handling**: Consistent patterns via centralized utilities
- **Enhanced Maintainability**: Changes to file operations or song validation only require updates in one place
- **Improved Testability**: Modular components can be tested independently
- **Better Developer Experience**: Clear function signatures and comprehensive documentation
### **New Parallel Download System (v3.4)**
- **Parallel downloader module:** `parallel_downloader.py` provides thread-safe concurrent download management
- **Configurable concurrency:** Use `--parallel --workers N` to enable parallel downloads with N workers (1-10)
- **Thread-safe operations:** All tracking, caching, and progress operations are thread-safe
- **Real-time progress tracking:** Shows active downloads, completion status, and overall progress
- **Automatic retry mechanism:** Failed downloads are automatically retried with reduced concurrency
- **Backward compatibility:** Sequential downloads remain the default when `--parallel` is not used
- **Performance improvements:** Significantly faster downloads for large batches (3-5x speedup with 3-5 workers)
- **Integrated with all modes:** Works with both songlist-across-channels and latest-per-channel download modes
### **Previous Improvements (v3.2)**
- **Centralized yt-dlp Command Generation**: Standardized command building and execution across all download operations
- **Enhanced Error Handling**: Structured exception hierarchy with consistent error messages and formatting
- **Abstracted Download Pipeline**: Reusable download → verify → tag → track process for consistent processing
- **Download plan pre-scan:** Before downloading, the tool scans all channels for songlist matches, builds a download plan, and prints stats (matches, unmatched, per-channel breakdown). The plan is cached for 1 day and reused unless --force-download-plan is set.
- **Latest-per-channel plan:** Download the latest N videos from each channel, with a per-channel plan and robust resume. Each channel is removed from the plan as it completes. Plan cache is deleted when all channels are done.
- **Fast mode with early exit:** When a limit is set, the tool scans channels and songs in order, downloads immediately when a match is found, and stops as soon as the limit is reached with successful downloads. This provides much faster performance for small limits compared to the full pre-scan approach.
- **Deduplication across channels:** Tracks unique song keys (artist + normalized title) to ensure the same song is not downloaded from multiple channels, even if it appears in more than one channel's video list.
- **Fuzzy matching:** Uses string similarity algorithms to find approximate matches between songlist entries and video titles, tolerating minor differences, typos, or extra words like "Karaoke" or "Official Video".
- **Default channel file:** For songlist-only and latest-per-channel modes, if no --file is specified, automatically uses data/channels.txt as the default channel list, reducing the need to specify the file path repeatedly.
- **Robust interruption handling:** Progress is saved after each download, and files are checked for existence before downloading to prevent re-downloads if the process is interrupted.
- **Optimized scanning algorithm:** High-performance channel scanning with O(n×m) complexity, pre-processed song lookups using sets and dictionaries, and early termination for faster matching of large songlists and channels.
- **Enhanced cache management:** Improved channel cache key handling for better cache hit rates and reduced YouTube API calls.
- **Robust download plan execution:** Fixed index management in download plan execution to prevent errors during interrupted downloads.
## 🐞 Troubleshooting
- Ensure `yt-dlp.exe` is in the `downloader/` folder
- Check `logs/` for error details

242
commands.txt Normal file
View File

@ -0,0 +1,242 @@
# 🎤 Karaoke Video Downloader - CLI Commands Reference
# Copy and paste these commands into your terminal
# Updated: v3.4 (includes parallel downloads and all refactoring improvements)
## 📥 BASIC DOWNLOADS
# Download a single channel
python download_karaoke.py https://www.youtube.com/@SingKingKaraoke/videos
# Download from a file containing multiple channel URLs
python download_karaoke.py --file data/channels.txt
# Download with custom resolution (480p, 720p, 1080p, 1440p, 2160p)
python download_karaoke.py --resolution 1080p https://www.youtube.com/@SingKingKaraoke/videos
# Limit number of downloads (fast mode with early exit)
python download_karaoke.py --limit 10 https://www.youtube.com/@SingKingKaraoke/videos
# Enable parallel downloads for faster processing (3-5x speedup)
python download_karaoke.py --parallel --workers 5 --limit 10 https://www.youtube.com/@SingKingKaraoke/videos
## 🎵 SONGLIST OPERATIONS
# Download only songs from your songlist (uses data/channels.txt by default)
python download_karaoke.py --songlist-only
# Download only songlist songs with limit
python download_karaoke.py --songlist-only --limit 5
# Download songlist songs with fuzzy matching (more flexible matching)
python download_karaoke.py --songlist-only --fuzzy-match --limit 10
# Download songlist songs with custom fuzzy threshold (0-100, default 90)
python download_karaoke.py --songlist-only --fuzzy-match --fuzzy-threshold 85 --limit 10
# Download songlist songs with parallel processing (much faster)
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
# Download songlist songs with parallel processing and fuzzy matching
python download_karaoke.py --parallel --workers 5 --songlist-only --fuzzy-match --fuzzy-threshold 85 --limit 10
# Focus on specific playlists by title (download only songs from these playlists)
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100"
# Focus on specific playlists with fuzzy matching
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --fuzzy-match --fuzzy-threshold 85
# Focus on specific playlists with limit
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --limit 5
# Focus on specific playlists with parallel processing
python download_karaoke.py --parallel --workers 3 --songlist-focus "2025 - Apple Top 50" --limit 5
# Prioritize songlist songs in download queue (default behavior)
python download_karaoke.py --songlist-priority https://www.youtube.com/@SingKingKaraoke/videos
# Disable songlist prioritization
python download_karaoke.py --no-songlist-priority https://www.youtube.com/@SingKingKaraoke/videos
# Show songlist download status and statistics
python download_karaoke.py --songlist-status
## ⚡ PARALLEL DOWNLOADS (v3.4)
# Basic parallel downloads (3-5x faster than sequential)
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
# Parallel downloads with different worker counts
python download_karaoke.py --parallel --workers 3 --songlist-only --limit 10 # Conservative
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10 # Balanced
python download_karaoke.py --parallel --workers 8 --songlist-only --limit 10 # Aggressive
# Parallel downloads for latest-per-channel mode
python download_karaoke.py --parallel --workers 3 --latest-per-channel --limit 5
# Parallel downloads with fuzzy matching
python download_karaoke.py --parallel --workers 5 --songlist-only --fuzzy-match --fuzzy-threshold 85 --limit 10
# Parallel downloads with custom resolution
python download_karaoke.py --parallel --workers 5 --resolution 1080p --songlist-only --limit 10
## 🗂️ LATEST-PER-CHANNEL DOWNLOADS
# Download latest 5 videos from each channel
python download_karaoke.py --latest-per-channel --limit 5
# Download latest videos with fuzzy matching
python download_karaoke.py --latest-per-channel --limit 5 --fuzzy-match --fuzzy-threshold 85
# Download latest videos with parallel processing (much faster)
python download_karaoke.py --parallel --workers 3 --latest-per-channel --limit 5
# Download latest videos with parallel processing and fuzzy matching
python download_karaoke.py --parallel --workers 3 --latest-per-channel --limit 5 --fuzzy-match --fuzzy-threshold 85
# Download latest videos from specific channels file
python download_karaoke.py --latest-per-channel --limit 5 --file data/channels.txt
## 🔄 CACHE & TRACKING MANAGEMENT
# Show download status and statistics
python download_karaoke.py --status
# Show channel cache information
python download_karaoke.py --cache-info
# Clear cache for a specific channel
python download_karaoke.py --clear-cache SingKingKaraoke
# Clear cache for all channels
python download_karaoke.py --clear-cache all
# Set cache duration (in hours)
python download_karaoke.py --cache-duration 48
# Force refresh channel cache (ignore cached data)
python download_karaoke.py --refresh https://www.youtube.com/@SingKingKaraoke/videos
# Force refresh download plan cache (re-scan all channels for matches)
python download_karaoke.py --force-download-plan --songlist-only
# Clear server duplicates tracking (allows re-checking songs against server)
python download_karaoke.py --clear-server-duplicates
## 🧹 RESET & CLEANUP OPERATIONS
# Reset all tracking and files for a specific channel
python download_karaoke.py --reset-channel SingKingKaraoke
# Reset channel and also reset songlist songs for this channel
python download_karaoke.py --reset-channel SingKingKaraoke --reset-songlist
# Reset all songlist tracking and delete all songlist-downloaded files (GLOBAL)
python download_karaoke.py --reset-songlist-all
# Clean up orphaned tracking entries
python download_karaoke.py --cleanup
## 📊 REPORTS & SYNC
# Generate detailed report for a specific playlist
python download_karaoke.py --report PLAYLIST_ID
# Only sync playlist without downloading (update tracking)
python download_karaoke.py --sync https://www.youtube.com/@SingKingKaraoke/videos
# Show version information
python download_karaoke.py --version
## 🎯 ADVANCED COMBINATIONS
# Fast songlist download with fuzzy matching and high quality
python download_karaoke.py --songlist-only --limit 20 --fuzzy-match --fuzzy-threshold 85 --resolution 1080p
# Latest videos per channel with fuzzy matching
python download_karaoke.py --latest-per-channel --limit 3 --fuzzy-match --fuzzy-threshold 90 --file data/channels.txt
# Force refresh everything and download songlist
python download_karaoke.py --songlist-only --force-download-plan --refresh --limit 10
# High-quality download with custom cache duration
python download_karaoke.py --resolution 1080p --cache-duration 72 --limit 5 https://www.youtube.com/@SingKingKaraoke/videos
## 📋 COMMON WORKFLOWS
# 1. Quick songlist download (most common)
python download_karaoke.py --songlist-only --limit 10
# 1b. Fast parallel songlist download (3-5x faster)
python download_karaoke.py --parallel --workers 5 --songlist-only --limit 10
# 1c. Focus on specific playlists (fast targeted download)
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --limit 5
# 2. Latest videos from all channels
python download_karaoke.py --latest-per-channel --limit 5
# 2b. Fast parallel latest videos download
python download_karaoke.py --parallel --workers 3 --latest-per-channel --limit 5
# 3. High-quality single channel download
python download_karaoke.py --resolution 1080p --limit 20 https://www.youtube.com/@SingKingKaraoke/videos
# 4. Fuzzy matching for better song discovery
python download_karaoke.py --songlist-only --fuzzy-match --fuzzy-threshold 80 --limit 15
# 4b. Fast parallel fuzzy matching
python download_karaoke.py --parallel --workers 5 --songlist-only --fuzzy-match --fuzzy-threshold 80 --limit 15
# 4c. Focused fuzzy matching (target specific playlists with flexible matching)
python download_karaoke.py --songlist-focus "2025 - Apple Top 50" --fuzzy-match --fuzzy-threshold 80 --limit 10
# 5. Reset and start fresh
python download_karaoke.py --reset-channel SingKingKaraoke --reset-songlist
# 6. Check status and clear cache if needed
python download_karaoke.py --status
python download_karaoke.py --clear-cache all
## 🔧 TROUBLESHOOTING COMMANDS
# Check if everything is working
python download_karaoke.py --version
# Force refresh everything
python download_karaoke.py --force-download-plan --refresh --clear-cache all
# Reset everything and start fresh
python download_karaoke.py --reset-songlist-all
python download_karaoke.py --clear-server-duplicates
## 📝 NOTES
# Default files used:
# - data/channels.txt (default channel list for songlist modes)
# - data/songList.json (your prioritized song list)
# - data/config.json (download settings)
# Resolution options: 480p, 720p (default), 1080p, 1440p, 2160p
# Fuzzy threshold: 0-100 (higher = more strict matching, default 90)
# The system automatically:
# - Uses data/channels.txt if no --file specified in songlist modes
# - Caches channel data for 24 hours (configurable)
# - Tracks all downloads in JSON files
# - Avoids re-downloading existing files
# - Checks for server duplicates
# For best performance:
# - Use --parallel --workers 5 for 3-5x faster downloads
# - Use --limit for faster downloads
# - Use --fuzzy-match for better song discovery
# - Use --refresh sparingly (forces re-scan)
# - Clear cache if you encounter issues
# Parallel download tips:
# - Start with --workers 3 for conservative approach
# - Use --workers 5 for balanced performance
# - Use --workers 8-10 only on fast connections
# - Monitor system resources during parallel downloads
# - Reduce workers if you experience connection issues

View File

@ -29967,7 +29967,7 @@
},
"settings": {
"cache_duration_hours": 168,
"last_updated": "2025-07-23T20:17:15.426193"
"last_updated": "2025-07-24T20:17:15.426193"
},
"@SingKingKaraoke": [
{

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,259 @@
[
{
"title": "2025 - Apple Music Top 50",
"songs": [
{
"position": 1,
"title": "luther",
"artist": "Kendrick Lamar & SZA"
},
{
"position": 2,
"title": "Not Like Us",
"artist": "Kendrick Lamar"
},
{
"position": 3,
"title": "30 For 30",
"artist": "SZA"
},
{
"position": 4,
"title": "I'm The Problem",
"artist": "Morgan Wallen"
},
{
"position": 5,
"title": "NOKIA",
"artist": "Drake"
},
{
"position": 6,
"title": "DtMF",
"artist": "Bad Bunny"
},
{
"position": 7,
"title": "Burning Blue",
"artist": "Mariah the Scientist"
},
{
"position": 8,
"title": "What I Want",
"artist": "Morgan Wallen & Tate McRae"
},
{
"position": 9,
"title": "GIMME A HUG",
"artist": "Drake"
},
{
"position": 10,
"title": "EVIL J0RDAN",
"artist": "Playboi Carti"
},
{
"position": 11,
"title": "What Did I Miss",
"artist": "Drake"
},
{
"position": 12,
"title": "Dum, Dumb, and Dumber",
"artist": "Lil Baby, Young Thug & Future"
},
{
"position": 13,
"title": "DAISIES",
"artist": "Justin Bieber"
},
{
"position": 14,
"title": "ALL I CAN TAKE",
"artist": "Justin Bieber"
},
{
"position": 15,
"title": "BAILE INoLVIDABLE",
"artist": "Bad Bunny"
},
{
"position": 16,
"title": "Just In Case",
"artist": "Morgan Wallen"
},
{
"position": 17,
"title": "Blue Strips",
"artist": "Jessie Murph"
},
{
"position": 18,
"title": "All The Way",
"artist": "BigXthaPlug & Bailey Zimmerman"
},
{
"position": 19,
"title": "I Ain't Comin' Back",
"artist": "Morgan Wallen & Post Malone"
},
{
"position": 20,
"title": "Superman",
"artist": "Morgan Wallen"
},
{
"position": 21,
"title": "CN TOWER",
"artist": "PARTYNEXTDOOR & Drake"
},
{
"position": 22,
"title": "Outside",
"artist": "Cardi B"
},
{
"position": 23,
"title": "KICK OUT",
"artist": "Travis Scott"
},
{
"position": 24,
"title": "RATHER LIE",
"artist": "Playboi Carti"
},
{
"position": 25,
"title": "Listen Up",
"artist": "Lil Baby"
},
{
"position": 26,
"title": "Smile",
"artist": "Morgan Wallen"
},
{
"position": 27,
"title": "tv off",
"artist": "Kendrick Lamar"
},
{
"position": 28,
"title": "I Got Better",
"artist": "Morgan Wallen"
},
{
"position": 29,
"title": "Cry For Me",
"artist": "The Weeknd"
},
{
"position": 30,
"title": "NUEVAYoL",
"artist": "Bad Bunny"
},
{
"position": 31,
"title": "By Myself",
"artist": "Lil Baby & Rylo Rodriguez"
},
{
"position": 32,
"title": "DUMBO",
"artist": "Travis Scott"
},
{
"position": 33,
"title": "Crazy Train",
"artist": "Ozzy Osbourne"
},
{
"position": 34,
"title": "Courtesy of the Red, White and Blue",
"artist": "Toby Keith"
},
{
"position": 35,
"title": "I'm A Little Crazy",
"artist": "Morgan Wallen"
},
{
"position": 36,
"title": "20 Cigarettes",
"artist": "Morgan Wallen"
},
{
"position": 37,
"title": "VOY A LLeVARTE PA PR",
"artist": "Bad Bunny"
},
{
"position": 38,
"title": "SOMETHING ABOUT YOU",
"artist": "PARTYNEXTDOOR & Drake"
},
{
"position": 39,
"title": "RATHER LIE",
"artist": "Playboi Carti & The Weeknd"
},
{
"position": 40,
"title": "GO BABY",
"artist": "Justin Bieber"
},
{
"position": 41,
"title": "F U 2x",
"artist": "Lil Baby"
},
{
"position": 42,
"title": "Vanish Mode",
"artist": "Lil Durk"
},
{
"position": 43,
"title": "CHAMPAIN & VACAY",
"artist": "Travis Scott, Don Toliver & Waka Flocka Flame"
},
{
"position": 44,
"title": "Die With A Smile",
"artist": "Lady Gaga & Bruno Mars"
},
{
"position": 45,
"title": "SOMEBODY LOVES ME",
"artist": "PARTYNEXTDOOR & Drake"
},
{
"position": 46,
"title": "squabble up",
"artist": "Kendrick Lamar"
},
{
"position": 47,
"title": "MOTH BALLS",
"artist": "PARTYNEXTDOOR & Drake"
},
{
"position": 48,
"title": "GOOD CREDIT",
"artist": "Playboi Carti & Kendrick Lamar"
},
{
"position": 49,
"title": "WAY IT IS",
"artist": "Justin Bieber & Gunna"
},
{
"position": 50,
"title": "They Want To Be You",
"artist": "Lil Durk"
}
]
},
{
"songs": [
{

118
fix_code_quality.py Normal file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Script to fix code quality issues in the karaoke downloader codebase.
This script addresses:
1. Unused imports (F401)
2. F-string missing placeholders (F541)
3. Unused variables (F841)
4. Missing type annotations
"""
import re
import subprocess
from pathlib import Path
from typing import List, Set
def fix_unused_imports(file_path: Path) -> None:
    """Report which imported names are actually referenced in a file's body.

    Fixes three defects in the original detection:
    - import lines were included in the usage scan, so the import statement
      itself counted as a "use" and every import appeared used;
    - ``from x import y as z`` recorded ``y`` instead of the bound alias ``z``;
    - ``import a.b`` recorded ``a.b`` instead of the bound top-level name ``a``.

    Args:
        file_path: Path to the Python source file to analyze.

    Note: despite the historical name, this function only *reports*; it does
    not rewrite the file.
    """
    content = file_path.read_text(encoding='utf-8')
    lines = content.split('\n')

    imported_names: Set[str] = set()
    body_lines: List[str] = []

    from_import_pattern = re.compile(r'^from\s+\S+\s+import\s+(.+)$')
    plain_import_pattern = re.compile(r'^import\s+(.+)$')

    for line in lines:
        stripped = line.strip()
        match = from_import_pattern.match(stripped)
        if match:
            for item in match.group(1).split(','):
                item = item.strip()
                # `import y as z` binds `z`; usage must be checked against the alias.
                if ' as ' in item:
                    item = item.split(' as ')[1].strip()
                imported_names.add(item)
            continue
        match = plain_import_pattern.match(stripped)
        if match:
            for item in match.group(1).split(','):
                item = item.strip()
                if ' as ' in item:
                    item = item.split(' as ')[1].strip()
                else:
                    # `import a.b` binds only the top-level package name `a`.
                    item = item.split('.')[0]
                imported_names.add(item)
            continue
        body_lines.append(line)

    # Search only non-import lines, with word boundaries, so the import
    # statement itself (or a longer identifier) never counts as a use.
    body = '\n'.join(body_lines)
    actually_used: Set[str] = {
        name for name in imported_names
        if re.search(r'\b' + re.escape(name) + r'\b', body)
    }
    print(f"Used imports in {file_path.name}: {actually_used}")
def fix_f_string_placeholders(file_path: Path) -> None:
    """Report f-strings that contain no ``{...}`` placeholders (flake8 F541).

    Handles both quote styles (``f"..."`` and ``f'...'``) and the uppercase
    ``F`` prefix; the original pattern only matched double-quoted ``f"..."``.

    Args:
        file_path: Path to the Python source file to analyze.
    """
    content = file_path.read_text(encoding='utf-8')
    # Group 2 holds a double-quoted literal body, group 3 a single-quoted one.
    f_string_pattern = re.compile(r"""[fF]("([^"\\]*)"|'([^'\\]*)')""")
    for lineno, line in enumerate(content.split('\n'), start=1):
        for match in f_string_pattern.finditer(line):
            literal = match.group(2) if match.group(2) is not None else match.group(3)
            if not re.search(r'\{[^}]*\}', literal):
                print(f"F541: Line {lineno} in {file_path.name}: f-string missing placeholders")
                print(f"    {line.strip()}")
def fix_unused_variables(file_path: Path) -> None:
    """Report simple assignments whose target is never referenced afterwards (F841).

    Fixes two defects in the original heuristic:
    - ``^\\s*(\\w+)\\s*=`` also matched comparisons, so ``x == y`` flagged ``x``
      as an assignment; a lookahead now rejects ``==``;
    - the bare substring check treated ``val`` as used when only ``value``
      appeared later; a word-boundary search avoids that false negative.

    Args:
        file_path: Path to the Python source file to analyze.
    """
    content = file_path.read_text(encoding='utf-8')
    lines = content.split('\n')
    # `=` must be a plain assignment, not the first char of `==`.
    var_pattern = re.compile(r'^\s*(\w+)\s*=(?!=)\s*')
    for i, line in enumerate(lines):
        match = var_pattern.match(line)
        if not match:
            continue
        var_name = match.group(1)
        remaining_content = '\n'.join(lines[i + 1:])
        # Word boundaries prevent `name` matching inside longer identifiers.
        if not re.search(r'\b' + re.escape(var_name) + r'\b', remaining_content):
            print(f"F841: Line {i+1} in {file_path.name}: unused variable '{var_name}'")
def main():
    """Entry point: scan every Python module in karaoke_downloader/ and print findings."""
    package_dir = Path("karaoke_downloader")
    if not package_dir.exists():
        print("Error: karaoke_downloader directory not found")
        return

    print("🔍 Analyzing code quality issues...")
    print("=" * 50)

    # Run each quality check in turn against every module in the package.
    for module_path in package_dir.glob("*.py"):
        print(f"\n📁 {module_path.name}:")
        fix_unused_imports(module_path)
        fix_f_string_placeholders(module_path)
        fix_unused_variables(module_path)

    print("\n" + "=" * 50)
    print("✅ Analysis complete!")
# Allow running this analysis script directly: `python fix_code_quality.py`.
if __name__ == "__main__":
    main()

View File

@ -1,4 +1 @@
"""karaoke_downloader package: core modules for the Karaoke Video Downloader CLI tool."""

View File

@ -3,20 +3,22 @@ Cache management utilities for download plans.
Handles caching, loading, and cleanup of download plan data.
"""
import json
import hashlib
from pathlib import Path
import json
from datetime import datetime, timedelta
from pathlib import Path
# Constants
DEFAULT_CACHE_EXPIRATION_DAYS = 1
DEFAULT_CACHE_FILENAME_LENGTH_LIMIT = 200 # Increased from 60
DEFAULT_CACHE_FILENAME_PREFIX_LENGTH = 100 # Increased from 40
def get_download_plan_cache_file(mode, **kwargs):
"""Generate a unique cache filename based on mode and key parameters."""
parts = [f"plan_{mode}"]
# Handle parameters in a more readable way
for k, v in sorted(kwargs.items()):
if k == "channels_hash":
@ -24,47 +26,56 @@ def get_download_plan_cache_file(mode, **kwargs):
parts.append(f"hash{v[:8]}")
else:
parts.append(f"{k}{v}")
base = "_".join(parts)
# Hash for safety if string is still too long
if len(base) > DEFAULT_CACHE_FILENAME_LENGTH_LIMIT:
base = base[:DEFAULT_CACHE_FILENAME_PREFIX_LENGTH] + "_" + hashlib.md5(base.encode()).hexdigest()[:8]
base = (
base[:DEFAULT_CACHE_FILENAME_PREFIX_LENGTH]
+ "_"
+ hashlib.md5(base.encode()).hexdigest()[:8]
)
return Path(f"data/{base}.json")
def load_cached_plan(cache_file, max_age_days=DEFAULT_CACHE_EXPIRATION_DAYS):
"""Load a cached download plan if it exists and is not expired."""
if not cache_file.exists():
return None, None
try:
with open(cache_file, 'r', encoding='utf-8') as f:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
cache_time = datetime.fromisoformat(cache_data.get('timestamp'))
cache_time = datetime.fromisoformat(cache_data.get("timestamp"))
if datetime.now() - cache_time < timedelta(days=max_age_days):
print(f"🗂️ Using cached download plan from {cache_time} ({cache_file.name}).")
return cache_data['download_plan'], cache_data['unmatched']
print(
f"🗂️ Using cached download plan from {cache_time} ({cache_file.name})."
)
return cache_data["download_plan"], cache_data["unmatched"]
except Exception as e:
print(f"⚠️ Could not load download plan cache: {e}")
return None, None
def save_plan_cache(cache_file, download_plan, unmatched):
"""Save a download plan to cache."""
if download_plan:
cache_data = {
'timestamp': datetime.now().isoformat(),
'download_plan': download_plan,
'unmatched': unmatched
"timestamp": datetime.now().isoformat(),
"download_plan": download_plan,
"unmatched": unmatched,
}
with open(cache_file, 'w', encoding='utf-8') as f:
with open(cache_file, "w", encoding="utf-8") as f:
json.dump(cache_data, f, indent=2, ensure_ascii=False)
print(f"🗂️ Saved new download plan cache: {cache_file.name}")
else:
if cache_file.exists():
cache_file.unlink()
print(f"🗂️ No matches found, not saving download plan cache.")
print("🗂️ No matches found, not saving download plan cache.")
def delete_plan_cache(cache_file):
"""Delete a download plan cache file."""
@ -73,4 +84,4 @@ def delete_plan_cache(cache_file):
cache_file.unlink()
print(f"🗑️ Deleted download plan cache: {cache_file.name}")
except Exception as e:
print(f"⚠️ Could not delete download plan cache: {e}")
print(f"⚠️ Could not delete download plan cache: {e}")

View File

@ -1,20 +1,34 @@
import os
from pathlib import Path
from karaoke_downloader.songlist_manager import (
save_songlist_tracking, is_songlist_song_downloaded, normalize_title
is_songlist_song_downloaded,
normalize_title,
save_songlist_tracking,
)
def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file, channel_name, reset_songlist=False, delete_files=False):
def reset_channel_downloads(
tracker,
songlist_tracking,
songlist_tracking_file,
channel_name,
reset_songlist=False,
delete_files=False,
):
"""
Reset all tracking and optionally files for a channel.
If reset_songlist is False, songlist songs are preserved (tracking and files).
If reset_songlist is True, songlist songs for this channel are also reset/deleted.
"""
print(f"\n🔄 Resetting channel: {channel_name} (reset_songlist={reset_songlist}, delete_files={delete_files})")
print(
f"\n🔄 Resetting channel: {channel_name} (reset_songlist={reset_songlist}, delete_files={delete_files})"
)
# Find channel_id from channel_name
channel_id = None
for pid, playlist in tracker.data.get('playlists', {}).items():
if playlist['name'] == channel_name or pid == channel_name:
for pid, playlist in tracker.data.get("playlists", {}).items():
if playlist["name"] == channel_name or pid == channel_name:
channel_id = pid
break
if not channel_id:
@ -22,10 +36,12 @@ def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file,
return
# Get all songs for this channel
songs_to_reset = []
for song_id, song in tracker.data.get('songs', {}).items():
if song['playlist_id'] == channel_id:
for song_id, song in tracker.data.get("songs", {}).items():
if song["playlist_id"] == channel_id:
# Check if this is a songlist song
artist, title = song.get('artist', ''), song.get('title', song.get('name', ''))
artist, title = song.get("artist", ""), song.get(
"title", song.get("name", "")
)
key = f"{artist.lower()}_{normalize_title(title)}"
is_songlist = key in songlist_tracking
if is_songlist and not reset_songlist:
@ -36,20 +52,22 @@ def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file,
files_deleted = 0
for song_id, song, is_songlist in songs_to_reset:
# Remove from main tracking
tracker.data['songs'][song_id]['status'] = 'NOT_DOWNLOADED'
tracker.data['songs'][song_id]['formats'] = {}
tracker.data['songs'][song_id]['last_error'] = ''
tracker.data['songs'][song_id]['download_attempts'] = 0
tracker.data['songs'][song_id]['last_updated'] = None
tracker.data["songs"][song_id]["status"] = "NOT_DOWNLOADED"
tracker.data["songs"][song_id]["formats"] = {}
tracker.data["songs"][song_id]["last_error"] = ""
tracker.data["songs"][song_id]["download_attempts"] = 0
tracker.data["songs"][song_id]["last_updated"] = None
# Remove from songlist tracking if needed
if is_songlist and reset_songlist:
artist, title = song.get('artist', ''), song.get('title', song.get('name', ''))
artist, title = song.get("artist", ""), song.get(
"title", song.get("name", "")
)
key = f"{artist.lower()}_{normalize_title(title)}"
if key in songlist_tracking:
del songlist_tracking[key]
# Delete file if requested
if delete_files:
file_path = song.get('file_path')
file_path = song.get("file_path")
if file_path:
try:
p = Path(file_path)
@ -62,7 +80,9 @@ def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file,
print(f"⚠️ Could not delete file {file_path}: {e}")
# Remove all songlist_tracking entries for this channel if reset_songlist is True
if reset_songlist:
keys_to_remove = [k for k, v in songlist_tracking.items() if v.get('channel') == channel_name]
keys_to_remove = [
k for k, v in songlist_tracking.items() if v.get("channel") == channel_name
]
for k in keys_to_remove:
del songlist_tracking[k]
# Save changes
@ -72,7 +92,8 @@ def reset_channel_downloads(tracker, songlist_tracking, songlist_tracking_file,
if delete_files:
print(f" Files deleted: {files_deleted}, files preserved: {files_preserved}")
if not reset_songlist:
print(f" Songlist songs were preserved.")
print(" Songlist songs were preserved.")
def download_from_file(self, file_path, force_refresh=False):
file = Path(file_path)
@ -80,7 +101,11 @@ def download_from_file(self, file_path, force_refresh=False):
print(f"❌ File not found: {file_path}")
return False
with open(file, "r", encoding="utf-8") as f:
urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")]
urls = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
if not urls:
print(f"❌ No URLs found in {file_path}")
return False
@ -90,4 +115,4 @@ def download_from_file(self, file_path, force_refresh=False):
success = self.download_channel_videos(url, force_refresh=force_refresh)
if not success:
all_success = False
return all_success
return all_success

View File

@ -3,122 +3,138 @@
Script to check the actual resolution of downloaded MP4 files.
"""
import subprocess
import json
from pathlib import Path
import sys
import json
import subprocess
from pathlib import Path
def get_video_info_ffprobe(file_path):
"""Get video information using ffprobe."""
try:
cmd = [
'ffprobe',
'-v', 'quiet',
'-print_format', 'json',
'-show_streams',
str(file_path)
"ffprobe",
"-v",
"quiet",
"-print_format",
"json",
"-show_streams",
str(file_path),
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
data = json.loads(result.stdout)
# Find video stream
for stream in data.get('streams', []):
if stream.get('codec_type') == 'video':
width = stream.get('width')
height = stream.get('height')
codec = stream.get('codec_name')
bitrate = stream.get('bit_rate')
for stream in data.get("streams", []):
if stream.get("codec_type") == "video":
width = stream.get("width")
height = stream.get("height")
codec = stream.get("codec_name")
bitrate = stream.get("bit_rate")
return {
'width': width,
'height': height,
'codec': codec,
'bitrate': bitrate,
'resolution': f"{width}x{height}" if width and height else "Unknown"
"width": width,
"height": height,
"codec": codec,
"bitrate": bitrate,
"resolution": (
f"{width}x{height}" if width and height else "Unknown"
),
}
return None
except (subprocess.CalledProcessError, json.JSONDecodeError, FileNotFoundError) as e:
except (
subprocess.CalledProcessError,
json.JSONDecodeError,
FileNotFoundError,
) as e:
return None
def get_video_info_python(file_path):
"""Get video information using Python libraries (fallback)."""
try:
import cv2
cap = cv2.VideoCapture(str(file_path))
if cap.isOpened():
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return {
'width': width,
'height': height,
'codec': 'Unknown',
'bitrate': None,
'resolution': f"{width}x{height}"
"width": width,
"height": height,
"codec": "Unknown",
"bitrate": None,
"resolution": f"{width}x{height}",
}
except ImportError:
pass
try:
from moviepy.editor import VideoFileClip
clip = VideoFileClip(str(file_path))
width, height = clip.size
clip.close()
return {
'width': width,
'height': height,
'codec': 'Unknown',
'bitrate': None,
'resolution': f"{width}x{height}"
"width": width,
"height": height,
"codec": "Unknown",
"bitrate": None,
"resolution": f"{width}x{height}",
}
except ImportError:
pass
return None
def check_resolutions(downloads_dir="downloads"):
"""Check resolutions of all MP4 files in the downloads directory."""
downloads_path = Path(downloads_dir)
if not downloads_path.exists():
print(f"❌ Downloads directory '{downloads_dir}' not found!")
return
mp4_files = list(downloads_path.rglob("*.mp4"))
if not mp4_files:
print(f"❌ No MP4 files found in '{downloads_dir}'!")
return
print(f"🔍 Checking resolution of {len(mp4_files)} MP4 files...")
print("=" * 80)
resolutions = {}
total_files = 0
successful_checks = 0
for mp4_file in sorted(mp4_files):
total_files += 1
relative_path = mp4_file.relative_to(downloads_path)
# Try ffprobe first, then Python libraries
info = get_video_info_ffprobe(mp4_file)
if not info:
info = get_video_info_python(mp4_file)
if info:
successful_checks += 1
resolution = info['resolution']
resolution = info["resolution"]
resolutions[resolution] = resolutions.get(resolution, 0) + 1
# Determine if it's 720p or not
width, height = info['width'], info['height']
is_720p = (width == 1280 and height == 720) or (width == 720 and height == 1280)
width, height = info["width"], info["height"]
is_720p = (width == 1280 and height == 720) or (
width == 720 and height == 1280
)
status = "✅ 720p" if is_720p else "❌ Not 720p"
print(f"{status} | {resolution:>12} | {relative_path}")
else:
print(f"❓ Unknown | {'Unknown':>12} | {relative_path}")
print("=" * 80)
print(f"📊 Summary:")
print(f" Total files checked: {total_files}")
@ -126,12 +142,17 @@ def check_resolutions(downloads_dir="downloads"):
print(f" Failed to analyze: {total_files - successful_checks}")
print()
print("📈 Resolution breakdown:")
for resolution, count in sorted(resolutions.items(), key=lambda x: x[1], reverse=True):
for resolution, count in sorted(
resolutions.items(), key=lambda x: x[1], reverse=True
):
percentage = (count / successful_checks) * 100
is_720p = "1280x720" in resolution or "720x1280" in resolution
status = "✅ 720p" if is_720p else "❌ Other"
print(f" {status} | {resolution:>12} | {count:>3} files ({percentage:>5.1f}%)")
print(
f" {status} | {resolution:>12} | {count:>3} files ({percentage:>5.1f}%)"
)
def main():
"""Main function."""
@ -139,8 +160,9 @@ def main():
downloads_dir = sys.argv[1]
else:
downloads_dir = "downloads"
check_resolutions(downloads_dir)
if __name__ == "__main__":
main()
main()

View File

@ -1,8 +1,10 @@
import sys
import argparse
from pathlib import Path
from karaoke_downloader.downloader import KaraokeDownloader
import os
import sys
from pathlib import Path
from karaoke_downloader.downloader import KaraokeDownloader
# Constants
DEFAULT_FUZZY_THRESHOLD = 85
@ -10,6 +12,7 @@ DEFAULT_LATEST_PER_CHANNEL_LIMIT = 5
DEFAULT_DISPLAY_LIMIT = 10
DEFAULT_CACHE_DURATION_HOURS = 24
def main():
parser = argparse.ArgumentParser(
description="Karaoke Video Downloader - Download YouTube playlists and channel videos for karaoke",
@ -20,35 +23,160 @@ Examples:
python download_karaoke.py https://www.youtube.com/@SingKingKaraoke/videos
python download_karaoke.py --file data/channels.txt
python download_karaoke.py --reset-channel SingKingKaraoke --delete-files
"""
""",
)
parser.add_argument(
"url", nargs="?", help="YouTube playlist or channel URL to download"
)
parser.add_argument(
"--file",
"-f",
help="Text file containing playlist or channel URLs (one per line)",
)
parser.add_argument(
"--status",
"-s",
action="store_true",
help="Show download status and statistics",
)
parser.add_argument(
"--report",
"-r",
metavar="PLAYLIST_ID",
help="Generate a detailed report for a specific playlist",
)
parser.add_argument(
"--sync",
"--sync-only",
action="store_true",
help="Only sync playlist without downloading (update tracking)",
)
parser.add_argument(
"--cleanup", action="store_true", help="Clean up orphaned tracking entries"
)
parser.add_argument(
"--resolution",
"--res",
choices=["480p", "720p", "1080p", "1440p", "2160p"],
default="720p",
help="Preferred video resolution (default: 720p)",
)
parser.add_argument(
"--limit",
"-l",
type=int,
help="Limit the number of videos to download (e.g., --limit 10)",
)
parser.add_argument(
"--refresh",
"--force-refresh",
action="store_true",
help="Force refresh channel cache (ignore cached data)",
)
parser.add_argument(
"--cache-info", action="store_true", help="Show channel cache information"
)
parser.add_argument(
"--clear-cache",
metavar="CHANNEL_ID",
nargs="?",
const="all",
help="Clear cache for specific channel or all channels (use --clear-cache all)",
)
parser.add_argument(
"--cache-duration",
type=int,
metavar="HOURS",
help="Set cache duration in hours (default: 24)",
)
parser.add_argument(
"--songlist-priority",
action="store_true",
help="Prioritize downloads based on data/songList.json (default: enabled)",
)
parser.add_argument(
"--no-songlist-priority",
action="store_true",
help="Disable songlist prioritization",
)
parser.add_argument(
"--songlist-only",
action="store_true",
help="Only download songs that are in the songlist (skip all others)",
)
parser.add_argument(
"--songlist-focus",
nargs="+",
metavar="PLAYLIST_TITLE",
help='Focus on specific playlists by title (e.g., --songlist-focus "2025 - Apple Top 50" "2024 - Billboard Hot 100")',
)
parser.add_argument(
"--songlist-status",
action="store_true",
help="Show songlist download status and statistics",
)
parser.add_argument(
"--reset-channel",
metavar="CHANNEL_NAME",
help="Reset all tracking and files for a channel",
)
parser.add_argument(
"--reset-songlist",
action="store_true",
help="When used with --reset-channel, also reset songlist songs for this channel",
)
parser.add_argument(
"--reset-songlist-all",
action="store_true",
help="Reset all songlist tracking and delete all songlist-downloaded files (global)",
)
parser.add_argument(
"--clear-server-duplicates",
action="store_true",
help="Clear server duplicates tracking (allows re-checking songs against server)",
)
parser.add_argument(
"--version", "-v", action="version", version="Karaoke Playlist Downloader v1.0"
)
parser.add_argument(
"--force-download-plan",
action="store_true",
help="Force refresh the download plan cache (re-scan all channels for matches)",
)
parser.add_argument(
"--latest-per-channel",
action="store_true",
help="Download the latest N videos from each channel (use with --limit)",
)
parser.add_argument(
"--fuzzy-match",
action="store_true",
help="Enable fuzzy matching for songlist-to-video matching (uses rapidfuzz if available)",
)
parser.add_argument(
"--fuzzy-threshold",
type=int,
default=90,
help="Fuzzy match threshold (0-100, default 90)",
)
parser.add_argument(
"--parallel",
action="store_true",
help="Enable parallel downloads for improved speed",
)
parser.add_argument(
"--workers",
type=int,
default=3,
help="Number of parallel download workers (default: 3, max: 10)",
)
parser.add_argument('url', nargs='?', help='YouTube playlist or channel URL to download')
parser.add_argument('--file', '-f', help='Text file containing playlist or channel URLs (one per line)')
parser.add_argument('--status', '-s', action='store_true', help='Show download status and statistics')
parser.add_argument('--report', '-r', metavar='PLAYLIST_ID', help='Generate a detailed report for a specific playlist')
parser.add_argument('--sync', '--sync-only', action='store_true', help='Only sync playlist without downloading (update tracking)')
parser.add_argument('--cleanup', action='store_true', help='Clean up orphaned tracking entries')
parser.add_argument('--resolution', '--res', choices=['480p', '720p', '1080p', '1440p', '2160p'], default='720p', help='Preferred video resolution (default: 720p)')
parser.add_argument('--limit', '-l', type=int, help='Limit the number of videos to download (e.g., --limit 10)')
parser.add_argument('--refresh', '--force-refresh', action='store_true', help='Force refresh channel cache (ignore cached data)')
parser.add_argument('--cache-info', action='store_true', help='Show channel cache information')
parser.add_argument('--clear-cache', metavar='CHANNEL_ID', nargs='?', const='all', help='Clear cache for specific channel or all channels (use --clear-cache all)')
parser.add_argument('--cache-duration', type=int, metavar='HOURS', help='Set cache duration in hours (default: 24)')
parser.add_argument('--songlist-priority', action='store_true', help='Prioritize downloads based on data/songList.json (default: enabled)')
parser.add_argument('--no-songlist-priority', action='store_true', help='Disable songlist prioritization')
parser.add_argument('--songlist-only', action='store_true', help='Only download songs that are in the songlist (skip all others)')
parser.add_argument('--songlist-status', action='store_true', help='Show songlist download status and statistics')
parser.add_argument('--reset-channel', metavar='CHANNEL_NAME', help='Reset all tracking and files for a channel')
parser.add_argument('--reset-songlist', action='store_true', help='When used with --reset-channel, also reset songlist songs for this channel')
parser.add_argument('--reset-songlist-all', action='store_true', help='Reset all songlist tracking and delete all songlist-downloaded files (global)')
parser.add_argument('--clear-server-duplicates', action='store_true', help='Clear server duplicates tracking (allows re-checking songs against server)')
parser.add_argument('--version', '-v', action='version', version='Karaoke Playlist Downloader v1.0')
parser.add_argument('--force-download-plan', action='store_true', help='Force refresh the download plan cache (re-scan all channels for matches)')
parser.add_argument('--latest-per-channel', action='store_true', help='Download the latest N videos from each channel (use with --limit)')
parser.add_argument('--fuzzy-match', action='store_true', help='Enable fuzzy matching for songlist-to-video matching (uses rapidfuzz if available)')
parser.add_argument('--fuzzy-threshold', type=int, default=90, help='Fuzzy match threshold (0-100, default 90)')
args = parser.parse_args()
# Validate workers argument
if args.workers < 1 or args.workers > 10:
print("❌ Error: --workers must be between 1 and 10")
sys.exit(1)
yt_dlp_path = Path("downloader/yt-dlp.exe")
if not yt_dlp_path.exists():
print("❌ Error: yt-dlp.exe not found in downloader/ directory")
@ -56,6 +184,16 @@ Examples:
sys.exit(1)
downloader = KaraokeDownloader()
# Set parallel download options
if args.parallel:
downloader.enable_parallel_downloads = True
downloader.parallel_workers = args.workers
print(f"⚡ Parallel downloads enabled with {args.workers} workers")
else:
downloader.enable_parallel_downloads = False
print("🐌 Sequential downloads enabled")
if args.limit:
downloader.download_limit = args.limit
print(f"🎯 Download limit set to: {args.limit} videos")
@ -68,25 +206,22 @@ Examples:
if args.songlist_only:
downloader.songlist_only = True
print("🎯 Songlist-only mode enabled (will only download songlist songs)")
if args.resolution != '720p':
resolution_map = {
'480p': '480',
'720p': '720',
'1080p': '1080',
'1440p': '1440',
'2160p': '2160'
}
height = resolution_map[args.resolution]
downloader.config["download_settings"]["format"] = f"best[height<={height}][ext=mp4]/best[height<={height}]/best[ext=mp4]/best"
downloader.config["download_settings"]["preferred_resolution"] = args.resolution
print(f"🎬 Using resolution: {args.resolution}")
if args.songlist_focus:
downloader.songlist_focus_titles = args.songlist_focus
downloader.songlist_only = True # Enable songlist-only mode when focusing
print(
f"🎯 Songlist focus mode enabled for playlists: {', '.join(args.songlist_focus)}"
)
if args.resolution != "720p":
downloader.config_manager.update_resolution(args.resolution)
# --- NEW: Reset channel CLI command ---
if args.reset_channel:
downloader.reset_channel_downloads(
args.reset_channel,
reset_songlist=args.reset_songlist,
delete_files=True # Always delete files if they exist
delete_files=True, # Always delete files if they exist
)
sys.exit(0)
# --- END NEW ---
@ -96,23 +231,29 @@ Examples:
channels_file = Path("data/channels.txt")
if channels_file.exists():
args.file = str(channels_file)
print("📋 No URL or --file provided, defaulting to all channels in data/channels.txt for songlist-only mode.")
print(
"📋 No URL or --file provided, defaulting to all channels in data/channels.txt for songlist-only mode."
)
else:
print("❌ No URL, --file, or data/channels.txt found. Please provide a channel URL or a file with channel URLs.")
print(
"❌ No URL, --file, or data/channels.txt found. Please provide a channel URL or a file with channel URLs."
)
sys.exit(1)
# --- END NEW ---
if args.reset_songlist_all:
from karaoke_downloader.downloader import reset_songlist_all
reset_songlist_all()
print('✅ All songlist tracking and files have been reset.')
print("✅ All songlist tracking and files have been reset.")
sys.exit(0)
if args.clear_server_duplicates:
from karaoke_downloader.server_manager import save_server_duplicates_tracking
save_server_duplicates_tracking({})
print('✅ Server duplicates tracking has been cleared.')
print(' Songs will be re-checked against the server on next run.')
print("✅ Server duplicates tracking has been cleared.")
print(" Songs will be re-checked against the server on next run.")
sys.exit(0)
if args.status:
@ -137,7 +278,7 @@ Examples:
print(f"Last Updated: {cache_info['last_updated']}")
sys.exit(0)
elif args.clear_cache:
if args.clear_cache == 'all':
if args.clear_cache == "all":
downloader.tracker.clear_channel_cache()
print("🧹 Cleared all channel caches")
else:
@ -168,23 +309,41 @@ Examples:
if tracking:
print(f"\n📁 Downloaded songs:")
for key, info in list(tracking.items())[:10]:
print(f"{info['artist']} - {info['title']} (from {info['channel']})")
print(
f"{info['artist']} - {info['title']} (from {info['channel']})"
)
if len(tracking) > 10:
print(f" ... and {len(tracking) - 10} more")
sys.exit(0)
elif args.songlist_only:
elif args.songlist_only or args.songlist_focus:
# Use provided file or default to data/channels.txt
channel_file = args.file if args.file else "data/channels.txt"
if not os.path.exists(channel_file):
print(f"❌ Channel file not found: {channel_file}")
sys.exit(1)
with open(channel_file, "r", encoding="utf-8") as f:
channel_urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")]
channel_urls = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
limit = args.limit if args.limit else None
force_refresh_download_plan = args.force_download_plan if hasattr(args, 'force_download_plan') else False
fuzzy_match = args.fuzzy_match if hasattr(args, 'fuzzy_match') else False
fuzzy_threshold = args.fuzzy_threshold if hasattr(args, 'fuzzy_threshold') else DEFAULT_FUZZY_THRESHOLD
success = downloader.download_songlist_across_channels(channel_urls, limit=limit, force_refresh_download_plan=force_refresh_download_plan, fuzzy_match=fuzzy_match, fuzzy_threshold=fuzzy_threshold)
force_refresh_download_plan = (
args.force_download_plan if hasattr(args, "force_download_plan") else False
)
fuzzy_match = args.fuzzy_match if hasattr(args, "fuzzy_match") else False
fuzzy_threshold = (
args.fuzzy_threshold
if hasattr(args, "fuzzy_threshold")
else DEFAULT_FUZZY_THRESHOLD
)
success = downloader.download_songlist_across_channels(
channel_urls,
limit=limit,
force_refresh_download_plan=force_refresh_download_plan,
fuzzy_match=fuzzy_match,
fuzzy_threshold=fuzzy_threshold,
)
elif args.latest_per_channel:
# Use provided file or default to data/channels.txt
channel_file = args.file if args.file else "data/channels.txt"
@ -192,14 +351,32 @@ Examples:
print(f"❌ Channel file not found: {channel_file}")
sys.exit(1)
with open(channel_file, "r", encoding="utf-8") as f:
channel_urls = [line.strip() for line in f if line.strip() and not line.strip().startswith("#")]
channel_urls = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
limit = args.limit if args.limit else DEFAULT_LATEST_PER_CHANNEL_LIMIT
force_refresh_download_plan = args.force_download_plan if hasattr(args, 'force_download_plan') else False
fuzzy_match = args.fuzzy_match if hasattr(args, 'fuzzy_match') else False
fuzzy_threshold = args.fuzzy_threshold if hasattr(args, 'fuzzy_threshold') else DEFAULT_FUZZY_THRESHOLD
success = downloader.download_latest_per_channel(channel_urls, limit=limit, force_refresh_download_plan=force_refresh_download_plan, fuzzy_match=fuzzy_match, fuzzy_threshold=fuzzy_threshold)
force_refresh_download_plan = (
args.force_download_plan if hasattr(args, "force_download_plan") else False
)
fuzzy_match = args.fuzzy_match if hasattr(args, "fuzzy_match") else False
fuzzy_threshold = (
args.fuzzy_threshold
if hasattr(args, "fuzzy_threshold")
else DEFAULT_FUZZY_THRESHOLD
)
success = downloader.download_latest_per_channel(
channel_urls,
limit=limit,
force_refresh_download_plan=force_refresh_download_plan,
fuzzy_match=fuzzy_match,
fuzzy_threshold=fuzzy_threshold,
)
elif args.url:
success = downloader.download_channel_videos(args.url, force_refresh=args.refresh)
success = downloader.download_channel_videos(
args.url, force_refresh=args.refresh
)
else:
parser.print_help()
sys.exit(1)
@ -209,4 +386,4 @@ Examples:
sys.exit(0)
else:
print("\n⚠️ Some downloads may have failed. Check the logs for details.")
sys.exit(1)
sys.exit(1)

View File

@ -1,77 +1,323 @@
"""
Configuration management utilities.
Handles loading and managing application configuration.
Configuration management utilities for the karaoke downloader.
Provides centralized configuration loading, validation, and management.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional, Union
DATA_DIR = Path("data")
# Default configuration values
# Built-in fallback used when data/config.json is missing or unparseable;
# a user-supplied config file is deep-merged over these defaults by
# ConfigManager._merge_configs.
DEFAULT_CONFIG = {
    "download_settings": {
        # yt-dlp format selector: prefer MP4 at <=720p, then degrade gracefully.
        "format": "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best",
        "preferred_resolution": "720p",
        "audio_format": "mp3",
        "audio_quality": "0",  # "0" = best available audio quality in yt-dlp
        "subtitle_language": "en",
        "subtitle_format": "srt",
        # All optional side-outputs (metadata, thumbnails, descriptions,
        # subtitles, ...) are disabled by default.
        "write_metadata": False,
        "write_thumbnail": False,
        "write_description": False,
        "write_annotations": False,
        "write_comments": False,
        "write_subtitles": False,
        "embed_metadata": False,
        "add_metadata": False,
        # Resume partial downloads and never clobber existing files.
        "continue_downloads": True,
        "no_overwrites": True,
        "ignore_errors": True,  # keep going when individual videos fail
        "no_warnings": False,
    },
    "folder_structure": {
        "downloads_dir": "downloads",
        "logs_dir": "logs",
        "tracking_file": "data/karaoke_tracking.json",  # JSON download-tracking file
    },
    "logging": {
        "level": "INFO",
        "format": "%(asctime)s - %(levelname)s - %(message)s",
        "include_console": True,
        "include_file": True,
    },
    "yt_dlp_path": "downloader/yt-dlp.exe",
}
def load_config():
    """Load configuration from data/config.json or return the defaults.

    Returns:
        The parsed configuration dict, or ``get_default_config()`` when the
        file is missing or cannot be read/parsed.
    """
    config_file = DATA_DIR / "config.json"
    if config_file.exists():
        try:
            with open(config_file, "r", encoding="utf-8") as f:
                return json.load(f)
        # OSError also covers PermissionError and races where the file
        # disappears between the exists() check and open(), not just
        # FileNotFoundError.
        except (json.JSONDecodeError, OSError) as e:
            print(f"Warning: Could not load config.json: {e}")
    # Fall through to defaults when the file is absent or unreadable,
    # so callers always receive a usable configuration.
    return get_default_config()
# Resolution mapping for CLI arguments
# Maps CLI resolution labels (e.g. "720p") to the pixel-height strings that
# are interpolated into yt-dlp format selectors (see
# ConfigManager.update_resolution).
RESOLUTION_MAP = {
    "480p": "480",
    "720p": "720",
    "1080p": "1080",
    "1440p": "1440",
    "2160p": "2160",
}
def get_default_config():
    """Build and return the built-in default configuration dict."""
    download_settings = {
        "format": "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best",
        "preferred_resolution": "720p",
        "audio_format": "mp3",
        "audio_quality": "0",
        "subtitle_language": "en",
        "subtitle_format": "srt",
        "write_metadata": False,
        "write_thumbnail": False,
        "write_description": False,
        "write_annotations": False,
        "write_comments": False,
        "write_subtitles": False,
        "embed_metadata": False,
        "add_metadata": False,
        "continue_downloads": True,
        "no_overwrites": True,
        "ignore_errors": True,
        "no_warnings": False,
    }
    folder_structure = {
        "downloads_dir": "downloads",
        "logs_dir": "logs",
        "tracking_file": str(DATA_DIR / "karaoke_tracking.json"),
    }
    logging_settings = {
        "level": "INFO",
        "format": "%(asctime)s - %(levelname)s - %(message)s",
        "include_console": True,
        "include_file": True,
    }
    return {
        "download_settings": download_settings,
        "folder_structure": folder_structure,
        "logging": logging_settings,
        "yt_dlp_path": "downloader/yt-dlp.exe",
    }
def save_config(config):
    """Persist ``config`` to data/config.json.

    Creates the data directory when needed. Returns True on success and
    False (after printing the error) on any failure — a failed save is
    treated as non-fatal by callers.
    """
    target = DATA_DIR / "config.json"
    target.parent.mkdir(exist_ok=True)
    try:
        serialized = json.dumps(config, indent=2, ensure_ascii=False)
        target.write_text(serialized, encoding="utf-8")
        return True
    except Exception as e:
        print(f"Error saving config: {e}")
        return False
@dataclass
class DownloadSettings:
"""Configuration for download settings."""
def update_config(updates):
    """Shallow-merge ``updates`` into the stored config and save it.

    Returns:
        True when the merged configuration was written successfully.
    """
    merged = {**load_config(), **updates}
    return save_config(merged)
format: str = "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best"
outtmpl: str = "%(title)s_720p.%(ext)s"
merge_output_format: str = "mp4"
noplaylist: bool = True
postprocessors: list = None
preferred_resolution: str = "720p"
audio_format: str = "mp3"
audio_quality: str = "0"
subtitle_language: str = "en"
subtitle_format: str = "srt"
write_metadata: bool = False
write_thumbnail: bool = False
write_description: bool = False
writedescription: bool = False
write_annotations: bool = False
writeannotations: bool = False
write_comments: bool = False
writecomments: bool = False
write_subtitles: bool = False
writesubtitles: bool = False
writeinfojson: bool = False
writethumbnail: bool = False
embed_metadata: bool = False
add_metadata: bool = False
continue_downloads: bool = True
continuedl: bool = True
no_overwrites: bool = True
nooverwrites: bool = True
ignore_errors: bool = True
ignoreerrors: bool = True
no_warnings: bool = False
def __post_init__(self):
"""Initialize default values for complex fields."""
if self.postprocessors is None:
self.postprocessors = [
{
"key": "FFmpegExtractAudio",
"preferredcodec": "mp3",
"preferredquality": "0",
}
]
@dataclass
class FolderStructure:
    """Configuration for folder structure (paths relative to the project root)."""

    downloads_dir: str = "downloads"  # root folder for downloaded video files
    logs_dir: str = "logs"  # folder for log output
    tracking_file: str = "data/karaoke_tracking.json"  # JSON download-tracking file
@dataclass
class LoggingConfig:
    """Configuration for logging."""

    level: str = "INFO"  # logging level name, e.g. "INFO" or "DEBUG"
    format: str = "%(asctime)s - %(levelname)s - %(message)s"  # logging-module format string
    include_console: bool = True  # presumably toggles console log output — confirm in logging setup
    include_file: bool = True  # presumably toggles file log output — confirm in logging setup
@dataclass
class AppConfig:
    """Main application configuration, assembled by ConfigManager."""

    download_settings: DownloadSettings = field(default_factory=DownloadSettings)
    folder_structure: FolderStructure = field(default_factory=FolderStructure)
    logging: LoggingConfig = field(default_factory=LoggingConfig)
    yt_dlp_path: str = "downloader/yt-dlp.exe"  # path to the yt-dlp executable
    # Internal bookkeeping set by ConfigManager._create_config_from_dict;
    # not serialized by ConfigManager.save_config.
    _config_file: Optional[Path] = None
    _last_modified: Optional[datetime] = None
class ConfigManager:
    """
    Manages application configuration with loading, validation, and caching.

    The configuration is read from a JSON file, deep-merged over
    DEFAULT_CONFIG, and cached; the file is re-read only when its
    modification time changes or a reload is forced.
    """

    def __init__(self, config_file: Union[str, Path] = "data/config.json"):
        """
        Initialize the configuration manager.

        Args:
            config_file: Path to the configuration file
        """
        self.config_file = Path(config_file)
        self._config: Optional[AppConfig] = None
        self._last_modified: Optional[datetime] = None

    def load_config(self, force_reload: bool = False) -> AppConfig:
        """
        Load configuration from file with caching.

        Args:
            force_reload: Force reload even if file hasn't changed

        Returns:
            AppConfig instance
        """
        if not force_reload and self._config is not None:
            if not self.config_file.exists():
                # Fix: with no file on disk the cached (default) config can
                # never be stale — previously this case fell through and
                # rebuilt the config on every call.
                return self._config
            current_mtime = datetime.fromtimestamp(self.config_file.stat().st_mtime)
            if self._last_modified is not None and current_mtime <= self._last_modified:
                return self._config

        config_data = self._load_config_file()
        self._config = self._create_config_from_dict(config_data)
        # Record the file's own mtime (not "now") so the staleness check
        # above compares like with like; fall back to now without a file.
        if self.config_file.exists():
            self._last_modified = datetime.fromtimestamp(
                self.config_file.stat().st_mtime
            )
        else:
            self._last_modified = datetime.now()
        return self._config

    def _load_config_file(self) -> Dict[str, Any]:
        """
        Load configuration from file with fallback to defaults.

        Returns:
            Configuration dictionary
        """
        if self.config_file.exists():
            try:
                with open(self.config_file, "r", encoding="utf-8") as f:
                    file_config = json.load(f)
                # User values override defaults key-by-key (deep merge).
                return self._merge_configs(DEFAULT_CONFIG, file_config)
            # OSError also covers PermissionError and the file vanishing
            # between exists() and open().
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: Could not load config.json: {e}")
                print("Using default configuration.")
        return DEFAULT_CONFIG.copy()

    def _merge_configs(
        self, default: Dict[str, Any], user: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Recursively merge user configuration over defaults.

        Args:
            default: Default configuration
            user: User configuration

        Returns:
            Merged configuration (a new dict; inputs are not mutated)
        """
        merged = default.copy()
        for key, value in user.items():
            if (
                key in merged
                and isinstance(merged[key], dict)
                and isinstance(value, dict)
            ):
                merged[key] = self._merge_configs(merged[key], value)
            else:
                merged[key] = value
        return merged

    def _create_config_from_dict(self, config_data: Dict[str, Any]) -> AppConfig:
        """
        Create an AppConfig from a configuration dictionary.

        Args:
            config_data: Configuration dictionary

        Returns:
            AppConfig instance
        """
        download_settings = DownloadSettings(**config_data.get("download_settings", {}))
        folder_structure = FolderStructure(**config_data.get("folder_structure", {}))
        logging_config = LoggingConfig(**config_data.get("logging", {}))
        return AppConfig(
            download_settings=download_settings,
            folder_structure=folder_structure,
            logging=logging_config,
            yt_dlp_path=config_data.get("yt_dlp_path", "downloader/yt-dlp.exe"),
            _config_file=self.config_file,
        )

    def update_resolution(self, resolution: str) -> None:
        """
        Update the download format string for the given resolution.

        Unknown resolutions are ignored (the CLI constrains its choices to
        the keys of RESOLUTION_MAP).

        Args:
            resolution: Resolution string (e.g., "720p", "1080p")
        """
        if self._config is None:
            self.load_config()
        if resolution in RESOLUTION_MAP:
            height = RESOLUTION_MAP[resolution]
            format_str = (
                f"best[height<={height}][ext=mp4]"
                f"/best[height<={height}]/best[ext=mp4]/best"
            )
            self._config.download_settings.format = format_str
            self._config.download_settings.preferred_resolution = resolution
            print(f"🎬 Using resolution: {resolution}")

    def get_config(self) -> AppConfig:
        """
        Get the current configuration, loading it on first use.

        Returns:
            AppConfig instance
        """
        if self._config is None:
            return self.load_config()
        return self._config

    def save_config(self) -> None:
        """
        Save the current configuration to the config file.

        A no-op when no configuration has been loaded yet.
        """
        if self._config is None:
            return
        config_dict = {
            "download_settings": self._config.download_settings.__dict__,
            "folder_structure": self._config.folder_structure.__dict__,
            "logging": self._config.logging.__dict__,
            "yt_dlp_path": self._config.yt_dlp_path,
        }
        # Ensure the target directory exists before writing.
        self.config_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.config_file, "w", encoding="utf-8") as f:
            json.dump(config_dict, f, indent=2, ensure_ascii=False)
        print(f"Configuration saved to {self.config_file}")
# Global configuration manager instance
_config_manager: Optional[ConfigManager] = None


def get_config_manager() -> ConfigManager:
    """Return the shared ConfigManager singleton, creating it on first use."""
    global _config_manager
    manager = _config_manager
    if manager is None:
        manager = ConfigManager()
        _config_manager = manager
    return manager
def load_config(force_reload: bool = False) -> AppConfig:
    """Convenience wrapper: load configuration via the global manager.

    Args:
        force_reload: Force reload even if the file hasn't changed.

    Returns:
        The current AppConfig instance.
    """
    manager = get_config_manager()
    return manager.load_config(force_reload)

View File

@ -0,0 +1,299 @@
"""
Download pipeline that abstracts the complete download → verify → tag → track process.
"""
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from karaoke_downloader.config_manager import AppConfig
from karaoke_downloader.error_utils import (
handle_file_validation_error,
handle_yt_dlp_error,
log_error,
)
from karaoke_downloader.id3_utils import add_id3_tags
from karaoke_downloader.songlist_manager import mark_songlist_song_downloaded
from karaoke_downloader.video_downloader import is_valid_mp4, sanitize_filename
from karaoke_downloader.youtube_utils import (
build_yt_dlp_command,
execute_yt_dlp_command,
show_available_formats,
)
class DownloadPipeline:
    """
    Handles the complete download pipeline: download → verify → tag → track.

    Failures in the download/verify steps abort the pipeline and are
    reported through the optional tracker; tag and track problems are
    logged but never fail an otherwise good download.
    """

    def __init__(
        self,
        yt_dlp_path: str,
        config: Union[AppConfig, Dict[str, Any]],
        downloads_dir: Path,
        songlist_tracking: Optional[Dict] = None,
        tracker=None,
    ):
        """
        Args:
            yt_dlp_path: Path to the yt-dlp executable.
            config: Application configuration (AppConfig or raw dict).
            downloads_dir: Root directory for downloaded files.
            songlist_tracking: Optional songlist-tracking dict to update.
            tracker: Optional tracker exposing mark_song_downloaded /
                mark_song_failed.
        """
        self.yt_dlp_path = yt_dlp_path
        self.config = config
        self.downloads_dir = downloads_dir
        self.songlist_tracking = songlist_tracking or {}
        self.tracker = tracker

    def execute_pipeline(
        self,
        video_id: str,
        artist: str,
        title: str,
        channel_name: str,
        video_title: Optional[str] = None,
    ) -> bool:
        """
        Execute the complete download pipeline for a single video.

        Args:
            video_id: YouTube video ID
            artist: Artist name
            title: Song title
            channel_name: Channel name
            video_title: Original video title (optional)

        Returns:
            True if successful, False otherwise
        """
        try:
            # Step 1: Prepare file path
            filename = sanitize_filename(artist, title)
            output_path = self.downloads_dir / channel_name / filename

            # Step 2: Download video
            if not self._download_video(
                video_id, output_path, artist, title, channel_name
            ):
                return False

            # Step 3: Verify download. Verification may resolve to a
            # differently named file that yt-dlp actually wrote; the
            # returned path — not output_path — must be used downstream
            # (previously the resolved path was lost, so tagging/tracking
            # operated on a nonexistent file).
            verified_path = self._verify_download(
                output_path, artist, title, video_id, channel_name
            )
            if verified_path is None:
                return False

            # Step 4: Add ID3 tags
            if not self._add_tags(verified_path, artist, title, channel_name):
                return False

            # Step 5: Track download
            if not self._track_download(
                verified_path, artist, title, video_id, channel_name
            ):
                return False

            print(f"✅ Pipeline completed successfully: {artist} - {title}")
            return True
        except Exception as e:
            print(f"❌ Pipeline failed for {artist} - {title}: {e}")
            # Record the failure so the song can be retried or reported.
            if self.tracker:
                self.tracker.mark_song_failed(
                    artist, title, video_id, channel_name, f"Pipeline failed: {e}"
                )
            return False

    def _download_video(
        self,
        video_id: str,
        output_path: Path,
        artist: str,
        title: str,
        channel_name: str,
    ) -> bool:
        """Step 2: Download the video with yt-dlp; returns True on success."""
        output_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"⬇️ Downloading: {artist} - {title} -> {output_path}")
        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # Build the yt-dlp invocation via the shared helper so all callers
        # use identical format/output options.
        cmd = build_yt_dlp_command(
            self.yt_dlp_path, video_url, output_path, self.config
        )
        print(f"🔧 Running command: {' '.join(cmd)}")
        print(
            f"📺 Resolution settings: {self.config.download_settings.preferred_resolution}"
        )
        print(f"🎬 Format string: {self.config.download_settings.format}")

        # Optional debug aid: list the formats YouTube offers for this video.
        if (
            hasattr(self.config, "debug_show_formats")
            and self.config.debug_show_formats
        ):
            show_available_formats(video_url, self.yt_dlp_path)

        try:
            result = execute_yt_dlp_command(cmd)
            print("✅ yt-dlp completed successfully")
            print(f"📄 yt-dlp stdout: {result.stdout}")
            return True
        except subprocess.CalledProcessError as e:
            error = handle_yt_dlp_error(e, artist, title, video_id)
            log_error(error)
            if self.tracker:
                self.tracker.mark_song_failed(
                    artist, title, video_id, channel_name, error.message
                )
            return False

    def _verify_download(
        self,
        output_path: Path,
        artist: str,
        title: str,
        video_id: str,
        channel_name: str,
    ) -> Optional[Path]:
        """Step 3: Verify the download and return the actual file path.

        Returns the verified path — which may differ from ``output_path``
        when yt-dlp saved the file under another name — or None on failure.
        """
        if not output_path.exists():
            print(f"❌ Download failed: file does not exist: {output_path}")
            # Maybe yt-dlp saved it under a slightly different name.
            possible_files = list(output_path.parent.glob("*.mp4"))
            if not possible_files:
                if self.tracker:
                    self.tracker.mark_song_failed(
                        artist,
                        title,
                        video_id,
                        channel_name,
                        "Download failed: file does not exist",
                    )
                return None

            print(
                f"🔍 Found these files in the directory: {[f.name for f in possible_files]}"
            )
            # Accept a file whose stem contains the artist and any title word.
            artist_part = artist.lower()
            title_part = title.lower()
            for file in possible_files:
                file_lower = file.stem.lower()
                if artist_part in file_lower and any(
                    word in file_lower for word in title_part.split()
                ):
                    print(f"🎯 Found matching file: {file.name}")
                    output_path = file
                    break
            else:
                print(f"❌ No matching file found for: {artist} - {title}")
                if self.tracker:
                    self.tracker.mark_song_failed(
                        artist,
                        title,
                        video_id,
                        channel_name,
                        "Download failed: file does not exist and no matching file found",
                    )
                return None

        # Validate the container before declaring success.
        if not is_valid_mp4(output_path):
            error = handle_file_validation_error(
                "File is not a valid MP4",
                output_path,
                artist,
                title,
                video_id,
                channel_name,
            )
            log_error(error)
            if self.tracker:
                self.tracker.mark_song_failed(
                    artist, title, video_id, channel_name, error.message
                )
            return None

        print(f"✅ Download verified: {output_path}")
        return output_path

    def _add_tags(
        self, output_path: Path, artist: str, title: str, channel_name: str
    ) -> bool:
        """Step 4: Add ID3 tags; tag failures are logged but never fatal."""
        try:
            add_id3_tags(
                output_path, f"{artist} - {title} (Karaoke Version)", channel_name
            )
            print(f"🏷️ Added ID3 tags: {artist} - {title}")
        except Exception as e:
            # Best-effort: a bad tag write should not discard a good download.
            print(f"⚠️ Failed to add ID3 tags: {e}")
        return True

    def _track_download(
        self,
        output_path: Path,
        artist: str,
        title: str,
        video_id: str,
        channel_name: str,
    ) -> bool:
        """Step 5: Record the download; tracking failures are never fatal."""
        try:
            # __init__ guarantees songlist_tracking is a dict, so this
            # guard is defensive only.
            if self.songlist_tracking is not None:
                mark_songlist_song_downloaded(
                    self.songlist_tracking, artist, title, channel_name, output_path
                )
            if self.tracker is not None:
                file_size = output_path.stat().st_size if output_path.exists() else None
                self.tracker.mark_song_downloaded(
                    artist, title, video_id, channel_name, output_path, file_size
                )
            print(f"📊 Tracked download: {artist} - {title}")
        except Exception as e:
            # Best-effort: tracking problems should not fail the pipeline.
            print(f"⚠️ Failed to track download: {e}")
        return True

    def batch_execute(
        self,
        videos: List[Dict[str, Any]],
        channel_name: str,
        limit: Optional[int] = None,
    ) -> Tuple[int, int]:
        """
        Execute the pipeline for multiple videos.

        Args:
            videos: List of video dictionaries with 'id', 'title', etc.
            channel_name: Channel name
            limit: Optional limit on number of videos to process

        Returns:
            Tuple of (successful_downloads, total_attempted)
        """
        # Hoisted out of the per-video loop (was re-imported every iteration).
        from karaoke_downloader.id3_utils import extract_artist_title

        if limit:
            videos = videos[:limit]

        successful = 0
        total = len(videos)
        for i, video in enumerate(videos, 1):
            video_id = video["id"]
            video_title = video.get("title", "")
            # Derive artist/title from the raw video title.
            artist, title = extract_artist_title(video_title)
            print(f" ({i}/{total}) Processing: {artist} - {title}")
            if self.execute_pipeline(video_id, artist, title, channel_name, video_title):
                successful += 1
            else:
                print(f" ❌ Failed to process: {artist} - {title}")
        return successful, total

View File

@ -3,20 +3,20 @@ Download plan building utilities.
Handles pre-scanning channels and building download plans.
"""
from karaoke_downloader.youtube_utils import get_channel_info
from karaoke_downloader.fuzzy_matcher import (
is_fuzzy_match,
is_exact_match,
create_song_key,
extract_artist_title,
get_similarity_function
)
from karaoke_downloader.cache_manager import (
delete_plan_cache,
get_download_plan_cache_file,
load_cached_plan,
save_plan_cache,
delete_plan_cache
)
from karaoke_downloader.fuzzy_matcher import (
create_song_key,
extract_artist_title,
get_similarity_function,
is_exact_match,
is_fuzzy_match,
)
from karaoke_downloader.youtube_utils import get_channel_info
# Constants
DEFAULT_FILENAME_LENGTH_LIMIT = 100
@ -24,7 +24,15 @@ DEFAULT_ARTIST_LENGTH_LIMIT = 30
DEFAULT_TITLE_LENGTH_LIMIT = 60
DEFAULT_FUZZY_THRESHOLD = 85
def build_download_plan(channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD):
def build_download_plan(
channel_urls,
undownloaded,
tracker,
yt_dlp_path,
fuzzy_match=False,
fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD,
):
"""
For each song in undownloaded, scan all channels for a match.
Use fuzzy matching if enabled.
@ -35,34 +43,37 @@ def build_download_plan(channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_
plan = []
unmatched = []
channel_match_counts = {}
# Pre-process songlist for O(1) lookups
song_keys = set()
song_lookup = {}
for song in undownloaded:
key = create_song_key(song['artist'], song['title'])
key = create_song_key(song["artist"], song["title"])
song_keys.add(key)
song_lookup[key] = song
for i, channel_url in enumerate(channel_urls, 1):
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}")
print(f" 🔍 Getting channel info...")
channel_name, channel_id = get_channel_info(channel_url)
print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_name} ({channel_url})")
print(f" ✅ Channel info: {channel_name} (ID: {channel_id})")
print(f" 🔍 Fetching video list from channel...")
available_videos = tracker.get_channel_video_list(
channel_url,
yt_dlp_path=str(yt_dlp_path),
force_refresh=False
channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False
)
print(
f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs"
)
print(f" 📊 Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs")
matches_this_channel = 0
video_matches = [] # Initialize video_matches for this channel
# Pre-process video titles for efficient matching
if fuzzy_match:
# For fuzzy matching, create normalized video keys
video_matches = []
for video in available_videos:
v_artist, v_title = extract_artist_title(video['title'])
v_artist, v_title = extract_artist_title(video["title"])
video_key = create_song_key(v_artist, v_title)
# Find best match among remaining songs
best_match = None
best_score = 0
@ -72,18 +83,20 @@ def build_download_plan(channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_
if score >= fuzzy_threshold and score > best_score:
best_score = score
best_match = song_key
if best_match:
song = song_lookup[best_match]
video_matches.append({
'artist': song['artist'],
'title': song['title'],
'channel_name': channel_name,
'channel_url': channel_url,
'video_id': video['id'],
'video_title': video['title'],
'match_score': best_score
})
video_matches.append(
{
"artist": song["artist"],
"title": song["title"],
"channel_name": channel_name,
"channel_url": channel_url,
"video_id": video["id"],
"video_title": video["title"],
"match_score": best_score,
}
)
# Remove matched song from future consideration
del song_lookup[best_match]
song_keys.remove(best_match)
@ -91,39 +104,43 @@ def build_download_plan(channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_
else:
# For exact matching, use direct key comparison
for video in available_videos:
v_artist, v_title = extract_artist_title(video['title'])
v_artist, v_title = extract_artist_title(video["title"])
video_key = create_song_key(v_artist, v_title)
if video_key in song_keys:
song = song_lookup[video_key]
video_matches.append({
'artist': song['artist'],
'title': song['title'],
'channel_name': channel_name,
'channel_url': channel_url,
'video_id': video['id'],
'video_title': video['title'],
'match_score': 100
})
video_matches.append(
{
"artist": song["artist"],
"title": song["title"],
"channel_name": channel_name,
"channel_url": channel_url,
"video_id": video["id"],
"video_title": video["title"],
"match_score": 100,
}
)
# Remove matched song from future consideration
del song_lookup[video_key]
song_keys.remove(video_key)
matches_this_channel += 1
# Add matches to plan
plan.extend(video_matches)
# Print match count once per channel
channel_match_counts[channel_name] = matches_this_channel
print(f" → Found {matches_this_channel} songlist matches in this channel.")
# Remaining unmatched songs
unmatched = list(song_lookup.values())
# Print summary table
print("\n📊 Channel match summary:")
for channel, count in channel_match_counts.items():
print(f" {channel}: {count} matches")
print(f" TOTAL: {sum(channel_match_counts.values())} matches across {len(channel_match_counts)} channels.")
return plan, unmatched
print(
f" TOTAL: {sum(channel_match_counts.values())} matches across {len(channel_match_counts)} channels."
)
return plan, unmatched

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,206 @@
"""
Error handling and formatting utilities for consistent error messages across the application.
"""
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional
class DownloadError(Exception):
    """Base class for errors raised while downloading.

    Carries the human-readable message plus a machine-friendly error
    category and optional free-form details for tracking/logging.
    """

    def __init__(
        self,
        message: str,
        error_type: str = "download_error",
        details: Optional[str] = None,
    ):
        # Keep the raw pieces available to handlers before handing the
        # display message to Exception.
        self.message = message
        self.error_type = error_type
        self.details = details
        super().__init__(message)
class YtDlpError(DownloadError):
    """Error raised when the yt-dlp subprocess fails.

    Records the process exit code and captured stderr alongside the
    base DownloadError fields.
    """

    def __init__(
        self,
        message: str,
        exit_code: Optional[int] = None,
        stderr: Optional[str] = None,
    ):
        self.exit_code = exit_code
        self.stderr = stderr
        # Fold the subprocess specifics into the generic details field.
        detail_text = f"Exit code: {exit_code}, Stderr: {stderr}"
        super().__init__(message, "yt_dlp_error", detail_text)
class FileValidationError(DownloadError):
    """Error raised when a downloaded file fails validation.

    Remembers the offending path so callers can inspect or remove it.
    """

    def __init__(self, message: str, file_path: Optional[Path] = None):
        self.file_path = file_path
        detail_text = f"File: {file_path}"
        super().__init__(message, "file_validation_error", detail_text)
def format_error_message(
    error_type: str,
    artist: str,
    title: str,
    video_id: Optional[str] = None,
    channel_name: Optional[str] = None,
    details: Optional[str] = None,
) -> str:
    """
    Build the standard one-line error string used for tracking and logging.

    Args:
        error_type: Type of error (e.g., "yt-dlp failed", "file verification failed")
        artist: Artist name
        title: Song title
        video_id: YouTube video ID (optional)
        channel_name: Channel name (optional)
        details: Additional error details (optional)

    Returns:
        Formatted error message
    """
    # Assemble the optional segments in a fixed order, then join once.
    parts = [f"{error_type}: {artist} - {title}"]
    if video_id:
        parts.append(f" (Video ID: {video_id})")
    if channel_name:
        parts.append(f" (Channel: {channel_name})")
    if details:
        parts.append(f" - {details}")
    return "".join(parts)
def handle_yt_dlp_error(
    exception: subprocess.CalledProcessError,
    artist: str,
    title: str,
    video_id: Optional[str] = None,
    channel_name: Optional[str] = None,
) -> YtDlpError:
    """
    Convert a yt-dlp subprocess failure into a standardized YtDlpError.

    Args:
        exception: The CalledProcessError from subprocess.run
        artist: Artist name
        title: Song title
        video_id: YouTube video ID (optional)
        channel_name: Channel name (optional)

    Returns:
        YtDlpError with formatted message
    """
    # Surface both the exit code and captured stderr in the details.
    detail = f"exit code {exception.returncode}: {exception.stderr}"
    message = format_error_message(
        "yt-dlp failed", artist, title, video_id, channel_name, detail
    )
    return YtDlpError(
        message, exit_code=exception.returncode, stderr=exception.stderr
    )
def handle_file_validation_error(
    message: str,
    file_path: Path,
    artist: str,
    title: str,
    video_id: Optional[str] = None,
    channel_name: Optional[str] = None,
) -> FileValidationError:
    """
    Convert a file validation failure into a standardized FileValidationError.

    Args:
        message: Error message
        file_path: Path to the file that failed validation
        artist: Artist name
        title: Song title
        video_id: YouTube video ID (optional)
        channel_name: Channel name (optional)

    Returns:
        FileValidationError with formatted message
    """
    # Append the offending path so logs identify the file directly.
    detail = f"{message} - File: {file_path}"
    formatted = format_error_message(
        "file validation failed", artist, title, video_id, channel_name, detail
    )
    return FileValidationError(formatted, file_path)
def log_error(error: DownloadError, logger=None) -> None:
    """
    Log an error with consistent formatting.

    Args:
        error: DownloadError instance
        logger: Optional logger instance; falls back to print when absent
    """
    # Choose the sink once so the message and its details always land
    # in the same place.
    emit = logger.error if logger else print
    emit(f"{error.message}")
    if error.details:
        emit(f" Details: {error.details}")
def create_error_context(
    artist: str,
    title: str,
    video_id: Optional[str] = None,
    channel_name: Optional[str] = None,
    file_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Create a context dictionary for error reporting.

    Args:
        artist: Artist name
        title: Song title
        video_id: YouTube video ID (optional)
        channel_name: Channel name (optional)
        file_path: File path (optional)

    Returns:
        Dictionary with error context; optional keys appear only when set
    """
    context: Dict[str, Any] = {
        "artist": artist,
        "title": title,
        # Placeholder; a real timestamp could be recorded here later.
        "timestamp": None,
    }
    optional_fields = {
        "video_id": video_id,
        "channel_name": channel_name,
        "file_path": str(file_path) if file_path else None,
    }
    for key, value in optional_fields.items():
        if value:
            context[key] = value
    return context

View File

@ -0,0 +1,200 @@
"""
File utilities for filename sanitization, path operations, and file validation.
Centralizes common file operations to eliminate code duplication.
"""
import re
from pathlib import Path
from typing import List, Optional, Tuple
# Constants for filename operations
DEFAULT_FILENAME_LENGTH_LIMIT = 100
DEFAULT_ARTIST_LENGTH_LIMIT = 30
DEFAULT_TITLE_LENGTH_LIMIT = 60
# Windows invalid characters
INVALID_FILENAME_CHARS = ["?", ":", "*", '"', "<", ">", "|", "/", "\\"]
def sanitize_filename(
    artist: str, title: str, max_length: int = DEFAULT_FILENAME_LENGTH_LIMIT
) -> str:
    """
    Create a safe "<artist> - <title>.mp4" filename.

    Args:
        artist: Song artist name
        title: Song title
        max_length: Maximum filename length (default: 100)

    Returns:
        Sanitized filename string
    """
    # Strip phrasing/punctuation that commonly appears in video titles.
    cleaned_title = title
    for old, new in (
        ("(From ", ""),
        (")", ""),
        (" - ", " "),
        (":", ""),
        ("'", ""),
        ('"', ""),
    ):
        cleaned_title = cleaned_title.replace(old, new)
    cleaned_artist = artist.replace("'", "").replace('"', "").strip()
    # Drop characters Windows forbids in filenames.
    for bad in INVALID_FILENAME_CHARS:
        cleaned_title = cleaned_title.replace(bad, "")
        cleaned_artist = cleaned_artist.replace(bad, "")
    # Periods (including ellipses) are removed entirely from the title.
    for dots in ("...", "..", "."):
        cleaned_title = cleaned_title.replace(dots, "")
    cleaned_title = cleaned_title.strip()
    cleaned_artist = cleaned_artist.strip()
    result = f"{cleaned_artist} - {cleaned_title}.mp4"
    if len(result) > max_length:
        # Fall back to hard-truncated components when the name is too long.
        result = (
            f"{cleaned_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]}"
            f" - {cleaned_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
        )
    return result
def generate_possible_filenames(
    artist: str, title: str, channel_name: str
) -> List[str]:
    """
    List the filename variants the different download modes may produce.

    Args:
        artist: Song artist name
        title: Song title
        channel_name: Channel name

    Returns:
        List of possible filename patterns
    """
    clean_title = sanitize_title_for_filenames(title)
    clean_artist = artist.replace("'", "").replace('"', "").strip()
    return [
        # Songlist mode.
        f"{clean_artist} - {clean_title}.mp4",
        # Latest-per-channel mode.
        f"{channel_name} - {clean_title}.mp4",
        # Channel videos mode.
        f"{clean_artist} - {clean_title} (Karaoke Version).mp4",
    ]
def sanitize_title_for_filenames(title: str) -> str:
    """
    Sanitize a title for use inside a filename.

    Removes Windows-invalid characters and all periods, then trims
    surrounding whitespace.

    Args:
        title: Song title

    Returns:
        Sanitized title string
    """
    result = title
    for bad in INVALID_FILENAME_CHARS:
        result = result.replace(bad, "")
    for dots in ("...", "..", "."):
        result = result.replace(dots, "")
    return result.strip()
def check_file_exists_with_patterns(
    downloads_dir: Path, channel_name: str, artist: str, title: str
) -> Tuple[bool, Optional[Path]]:
    """
    Look for an existing non-empty download under any known filename pattern.

    Args:
        downloads_dir: Base downloads directory
        channel_name: Channel name
        artist: Song artist
        title: Song title

    Returns:
        Tuple of (exists, file_path) where file_path is None if not found
    """
    channel_dir = downloads_dir / channel_name
    for candidate in generate_possible_filenames(artist, title, channel_name):
        if len(candidate) > DEFAULT_FILENAME_LENGTH_LIMIT:
            # Over-long names are stored with truncated components instead.
            short_artist = artist.replace("'", "").replace('"', "").strip()
            short_title = sanitize_title_for_filenames(title)
            candidate = (
                f"{short_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]}"
                f" - {short_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
            )
        path = channel_dir / candidate
        # Zero-byte files are treated as missing (aborted downloads).
        if path.exists() and path.stat().st_size > 0:
            return True, path
    return False, None
def ensure_directory_exists(directory: Path) -> None:
    """
    Ensure a directory exists, creating it if necessary.

    Args:
        directory: Directory path to ensure exists
    """
    # parents=True creates intermediate dirs; exist_ok makes this idempotent.
    directory.mkdir(parents=True, exist_ok=True)
def is_valid_mp4_file(file_path: Path) -> bool:
    """
    Heuristically verify that a file is a valid MP4.

    Checks existence, non-zero size, a ".mp4" extension, and the ISO
    base-media "ftyp" marker at bytes 4-8 of the header.

    Args:
        file_path: Path to the file to check

    Returns:
        True if file is a valid MP4, False otherwise
    """
    if not file_path.exists() or file_path.stat().st_size == 0:
        return False
    if file_path.suffix.lower() != ".mp4":
        return False
    try:
        with open(file_path, "rb") as handle:
            header = handle.read(8)
    except (IOError, OSError):
        return False
    # MP4 containers open with a 4-byte size field followed by "ftyp".
    return len(header) >= 8 and header[4:8] == b"ftyp"
def cleanup_temp_files(file_path: Path) -> None:
    """
    Best-effort removal of yt-dlp side files next to a download.

    Args:
        file_path: Base file path (without extension)
    """
    # Sidecar extensions yt-dlp may leave alongside the video.
    for ext in (".info.json", ".meta", ".webp", ".jpg", ".png"):
        sidecar = file_path.with_suffix(ext)
        if sidecar.exists():
            try:
                sidecar.unlink()
            except (IOError, OSError):
                # Cleanup is best-effort; leave stragglers in place.
                pass

View File

@ -3,6 +3,7 @@ Fuzzy matching utilities for songlist-to-video matching.
Handles similarity calculations and match validation.
"""
def get_similarity_function():
"""
Get the best available similarity function.
@ -10,20 +11,26 @@ def get_similarity_function():
"""
try:
from rapidfuzz import fuzz
def similarity(a, b):
return fuzz.ratio(a, b)
return similarity
except ImportError:
import difflib
def similarity(a, b):
return int(difflib.SequenceMatcher(None, a, b).ratio() * 100)
return similarity
def normalize_title(title):
    """Lower-case a title, dropping karaoke markers and extra whitespace."""
    # Strip the karaoke suffixes channels commonly append.
    stripped = title.replace("(Karaoke Version)", "")
    stripped = stripped.replace("(Karaoke)", "").strip()
    # Collapse internal whitespace runs, then normalise case.
    return " ".join(stripped.split()).lower()
def extract_artist_title(video_title):
"""Extract artist and title from video title."""
if " - " in video_title:
@ -31,57 +38,63 @@ def extract_artist_title(video_title):
return parts[0].strip(), parts[1].strip()
return "", video_title
def create_song_key(artist, title):
    """Build the canonical "<artist>_<title>" comparison key for a song."""
    return "_".join((artist.lower(), normalize_title(title)))
def create_video_key(video_title):
    """Build the canonical comparison key for a YouTube video title."""
    # Mirror create_song_key's "<artist>_<title>" shape so the two are
    # directly comparable.
    artist, title = extract_artist_title(video_title)
    return "_".join((artist.lower(), normalize_title(title)))
def is_fuzzy_match(songlist_artist, songlist_title, video_title, threshold=90):
    """
    Fuzzy-compare a songlist entry against a YouTube video title.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title
        threshold: Minimum similarity score (0-100)

    Returns:
        tuple: (is_match, score) where is_match is boolean and score is
        the similarity score
    """
    compare = get_similarity_function()
    score = compare(
        create_song_key(songlist_artist, songlist_title),
        create_video_key(video_title),
    )
    return score >= threshold, score
def is_exact_match(songlist_artist, songlist_title, video_title):
"""
Check if a songlist entry exactly matches a video title.
Args:
songlist_artist: Artist from songlist
songlist_title: Title from songlist
video_title: YouTube video title
Returns:
bool: True if exact match, False otherwise
"""
v_artist, v_title = extract_artist_title(video_title)
# Check artist and title separately
artist_match = normalize_title(v_artist) == normalize_title(songlist_artist)
title_match = normalize_title(v_title) == normalize_title(songlist_title)
# Also check if video title matches "artist - title" format
full_title_match = normalize_title(video_title) == normalize_title(f"{songlist_artist} - {songlist_title}")
return (artist_match and title_match) or full_title_match
full_title_match = normalize_title(video_title) == normalize_title(
f"{songlist_artist} - {songlist_title}"
)
return (artist_match and title_match) or full_title_match

View File

@ -6,8 +6,11 @@ try:
except ImportError:
MUTAGEN_AVAILABLE = False
def extract_artist_title(video_title):
title = video_title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
title = (
video_title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
)
if " - " in title:
parts = title.split(" - ", 1)
if len(parts) == 2:
@ -16,6 +19,7 @@ def extract_artist_title(video_title):
return artist, song_title
return "Unknown Artist", title
def add_id3_tags(file_path, video_title, channel_name):
if not MUTAGEN_AVAILABLE:
print("⚠️ mutagen not available - skipping ID3 tagging")
@ -23,11 +27,11 @@ def add_id3_tags(file_path, video_title, channel_name):
try:
artist, title = extract_artist_title(video_title)
mp4 = MP4(str(file_path))
mp4['\xa9nam'] = title
mp4['\xa9ART'] = artist
mp4['\xa9alb'] = f"{channel_name} Karaoke"
mp4['\xa9gen'] = "Karaoke"
mp4["\xa9nam"] = title
mp4["\xa9ART"] = artist
mp4["\xa9alb"] = f"{channel_name} Karaoke"
mp4["\xa9gen"] = "Karaoke"
mp4.save()
print(f"📝 Added ID3 tags: Artist='{artist}', Title='{title}'")
except Exception as e:
print(f"⚠️ Could not add ID3 tags: {e}")
print(f"⚠️ Could not add ID3 tags: {e}")

View File

@ -0,0 +1,376 @@
"""
Parallel download management for concurrent video downloads.
Handles thread-safe operations, progress tracking, and error handling.
"""
import concurrent.futures
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, field
from karaoke_downloader.config_manager import AppConfig
from karaoke_downloader.download_pipeline import DownloadPipeline
from karaoke_downloader.tracking_manager import TrackingManager
@dataclass
class DownloadTask:
    """A single queued download with its priority/retry bookkeeping."""

    video_id: str
    artist: str
    title: str
    channel_name: str
    video_title: Optional[str] = None
    # Higher number = higher priority in the queue.
    priority: int = 0
    retry_count: int = 0
    max_retries: int = 3
    created_at: float = field(default_factory=time.time)

    def __post_init__(self):
        # A caller-supplied 0 means "unset"; stamp it with the current time.
        if not self.created_at:
            self.created_at = time.time()
@dataclass
class DownloadResult:
    """Result of a download operation (one per attempted task)."""

    task: DownloadTask
    success: bool
    # Set only when the attempt raised an exception.
    error_message: Optional[str] = None
    # On-disk location of the finished file; None when not successful.
    file_path: Optional[Path] = None
    # Wall-clock seconds spent on this attempt.
    download_time: float = 0.0
    # Size in bytes of the finished file, when it could be stat'ed.
    file_size: Optional[int] = None
class ParallelDownloader:
    """
    Manages parallel downloads with thread-safe operations and progress tracking.

    Tasks are queued via add_download_task(s) and executed concurrently by
    execute_downloads() on a ThreadPoolExecutor; failed tasks are retried
    (with halved concurrency) up to each task's max_retries.
    """

    def __init__(
        self,
        yt_dlp_path: str,
        config: Union[AppConfig, Dict[str, Any]],
        downloads_dir: Path,
        max_workers: int = 3,
        songlist_tracking: Optional[Dict] = None,
        tracker: Optional[TrackingManager] = None,
    ):
        """
        Initialize the parallel downloader.

        Args:
            yt_dlp_path: Path to yt-dlp executable
            config: Configuration object or dictionary
            downloads_dir: Base downloads directory
            max_workers: Maximum number of concurrent downloads
            songlist_tracking: Optional songlist tracking data
            tracker: Optional tracking manager
        """
        self.yt_dlp_path = yt_dlp_path
        self.config = config
        self.downloads_dir = downloads_dir
        self.max_workers = max_workers
        self.songlist_tracking = songlist_tracking or {}
        self.tracker = tracker
        # Thread-safe state management: all counters below are guarded by
        # this single non-reentrant lock.
        self._lock = threading.Lock()
        self._active_downloads = 0
        self._completed_downloads = 0
        self._failed_downloads = 0
        self._total_downloads = 0
        self._start_time = None
        # Progress tracking
        self._progress_callbacks = []
        self._download_queue = []
        self._results = []
        # Create download pipeline shared by all worker threads.
        # NOTE(review): assumes DownloadPipeline.execute_pipeline is safe
        # to call from multiple threads concurrently — confirm.
        self.pipeline = DownloadPipeline(
            yt_dlp_path=yt_dlp_path,
            config=config,
            downloads_dir=downloads_dir,
            songlist_tracking=songlist_tracking,
            tracker=tracker,
        )

    def add_progress_callback(self, callback) -> None:
        """Add a progress callback function."""
        with self._lock:
            self._progress_callbacks.append(callback)

    def _notify_progress(self, message: str, **kwargs) -> None:
        """Notify all progress callbacks.

        Callbacks run while the lock is held, so they must not call back
        into this object (the lock is non-reentrant).
        """
        with self._lock:
            for callback in self._progress_callbacks:
                try:
                    callback(message, **kwargs)
                except Exception as e:
                    # A broken callback must not abort the download.
                    print(f"⚠️ Progress callback error: {e}")

    def add_download_task(self, task: DownloadTask) -> None:
        """Add a download task to the queue."""
        with self._lock:
            self._download_queue.append(task)
            self._total_downloads += 1

    def add_download_tasks(self, tasks: List[DownloadTask]) -> None:
        """Add multiple download tasks to the queue."""
        with self._lock:
            self._download_queue.extend(tasks)
            self._total_downloads += len(tasks)

    def _download_single_task(self, task: DownloadTask) -> DownloadResult:
        """Execute a single download task.

        Runs on a worker thread: counters are updated under the lock and
        progress notifications are emitted after releasing it.
        """
        start_time = time.time()
        try:
            with self._lock:
                self._active_downloads += 1
            # Counter reads in the kwargs below happen outside the lock,
            # so reported values are best-effort snapshots.
            self._notify_progress(
                "Starting download",
                task=task,
                active_downloads=self._active_downloads,
                total_downloads=self._total_downloads
            )
            # Execute the download pipeline
            success = self.pipeline.execute_pipeline(
                video_id=task.video_id,
                artist=task.artist,
                title=task.title,
                channel_name=task.channel_name,
                video_title=task.video_title,
            )
            download_time = time.time() - start_time
            # Determine file path if successful.
            # NOTE(review): re-derives "<artist> - <title>.mp4" here rather
            # than asking the pipeline; confirm it matches the pipeline's
            # sanitized filename or file_size will stay None.
            file_path = None
            file_size = None
            if success:
                filename = f"{task.artist} - {task.title}.mp4"
                file_path = self.downloads_dir / task.channel_name / filename
                if file_path.exists():
                    file_size = file_path.stat().st_size
            result = DownloadResult(
                task=task,
                success=success,
                file_path=file_path,
                download_time=download_time,
                file_size=file_size,
            )
            with self._lock:
                if success:
                    self._completed_downloads += 1
                else:
                    self._failed_downloads += 1
                self._active_downloads -= 1
            self._notify_progress(
                "Download completed" if success else "Download failed",
                result=result,
                active_downloads=self._active_downloads,
                completed_downloads=self._completed_downloads,
                failed_downloads=self._failed_downloads,
                total_downloads=self._total_downloads
            )
            return result
        except Exception as e:
            # Any exception from the pipeline becomes a failed result
            # rather than propagating out of the worker thread.
            download_time = time.time() - start_time
            with self._lock:
                self._failed_downloads += 1
                self._active_downloads -= 1
            result = DownloadResult(
                task=task,
                success=False,
                error_message=str(e),
                download_time=download_time,
            )
            self._notify_progress(
                "Download error",
                result=result,
                active_downloads=self._active_downloads,
                completed_downloads=self._completed_downloads,
                failed_downloads=self._failed_downloads,
                total_downloads=self._total_downloads
            )
            return result

    def _retry_failed_downloads(self, failed_results: List[DownloadResult]) -> List[DownloadResult]:
        """Retry failed downloads up to their max retry count."""
        retry_tasks = []
        for result in failed_results:
            if result.task.retry_count < result.task.max_retries:
                result.task.retry_count += 1
                retry_tasks.append(result.task)
        if not retry_tasks:
            return []
        print(f"🔄 Retrying {len(retry_tasks)} failed downloads...")
        # Execute retries with reduced concurrency to avoid overwhelming the system
        retry_workers = max(1, self.max_workers // 2)
        with concurrent.futures.ThreadPoolExecutor(max_workers=retry_workers) as executor:
            future_to_task = {
                executor.submit(self._download_single_task, task): task
                for task in retry_tasks
            }
            retry_results = []
            for future in concurrent.futures.as_completed(future_to_task):
                result = future.result()
                retry_results.append(result)
        return retry_results

    def execute_downloads(self, show_progress: bool = True) -> List[DownloadResult]:
        """
        Execute all queued downloads in parallel.

        Args:
            show_progress: Whether to show progress information

        Returns:
            List of download results (retries produce additional entries)
        """
        if not self._download_queue:
            print("📭 No downloads queued.")
            return []
        # Sort tasks by priority (higher priority first)
        with self._lock:
            self._download_queue.sort(key=lambda x: x.priority, reverse=True)
            tasks = self._download_queue.copy()
            self._download_queue.clear()
        self._start_time = time.time()
        self._results = []
        print(f"🚀 Starting parallel downloads with {self.max_workers} workers...")
        print(f"📋 Total tasks: {len(tasks)}")
        # Progress display thread
        progress_thread = None
        if show_progress:
            progress_thread = threading.Thread(
                target=self._progress_display_loop,
                daemon=True
            )
            progress_thread.start()
        try:
            # Execute downloads in parallel
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                future_to_task = {
                    executor.submit(self._download_single_task, task): task
                    for task in tasks
                }
                for future in concurrent.futures.as_completed(future_to_task):
                    result = future.result()
                    self._results.append(result)
            # Handle retries for failed downloads
            failed_results = [r for r in self._results if not r.success]
            if failed_results:
                retry_results = self._retry_failed_downloads(failed_results)
                self._results.extend(retry_results)
        finally:
            # Stop progress display.
            # NOTE(review): _progress_display_loop sets _stop_progress = False
            # when it starts, so a very fast run can race this flag; the
            # daemon thread and 1s join timeout bound the damage — confirm.
            if progress_thread and progress_thread.is_alive():
                self._stop_progress = True
                progress_thread.join(timeout=1)
        # Final summary
        total_time = time.time() - self._start_time
        successful = len([r for r in self._results if r.success])
        failed = len([r for r in self._results if not r.success])
        print(f"\n🎉 Parallel downloads completed!")
        print(f" ✅ Successful: {successful}")
        print(f" ❌ Failed: {failed}")
        print(f" ⏱️ Total time: {total_time:.1f}s")
        print(f" 📊 Average time per download: {total_time/len(tasks):.1f}s")
        return self._results

    def _progress_display_loop(self) -> None:
        """Display progress updates in a separate thread (1s refresh)."""
        self._stop_progress = False
        while not self._stop_progress:
            # Snapshot counters under the lock; render outside it.
            with self._lock:
                active = self._active_downloads
                completed = self._completed_downloads
                failed = self._failed_downloads
                total = self._total_downloads
            if total > 0:
                progress = (completed + failed) / total * 100
                print(f"\r📊 Progress: {progress:.1f}% | Active: {active} | Completed: {completed} | Failed: {failed} | Total: {total}", end="", flush=True)
            time.sleep(1)
        print()  # New line after progress display

    def get_stats(self) -> Dict[str, Any]:
        """Get current download statistics (snapshot taken under the lock)."""
        with self._lock:
            return {
                "active_downloads": self._active_downloads,
                "completed_downloads": self._completed_downloads,
                "failed_downloads": self._failed_downloads,
                "total_downloads": self._total_downloads,
                "queued_downloads": len(self._download_queue),
                "elapsed_time": time.time() - self._start_time if self._start_time else 0,
            }
def create_parallel_downloader(
    yt_dlp_path: str,
    config: Union[AppConfig, Dict[str, Any]],
    downloads_dir: Path,
    max_workers: int = 3,
    songlist_tracking: Optional[Dict] = None,
    tracker: Optional[TrackingManager] = None,
) -> ParallelDownloader:
    """
    Factory function to create a parallel downloader instance.

    Thin convenience wrapper: forwards every argument unchanged to the
    ParallelDownloader constructor.

    Args:
        yt_dlp_path: Path to yt-dlp executable
        config: Configuration object or dictionary
        downloads_dir: Base downloads directory
        max_workers: Maximum number of concurrent downloads
        songlist_tracking: Optional songlist tracking data
        tracker: Optional tracking manager

    Returns:
        ParallelDownloader instance
    """
    return ParallelDownloader(
        yt_dlp_path=yt_dlp_path,
        config=config,
        downloads_dir=downloads_dir,
        max_workers=max_workers,
        songlist_tracking=songlist_tracking,
        tracker=tracker,
    )

View File

@ -4,19 +4,20 @@ Resolution Update Utility for Karaoke Playlist Downloader
Easily update the preferred video resolution in the configuration.
"""
import json
import argparse
import json
from pathlib import Path
def update_resolution(resolution):
"""Update the resolution in config.json file."""
config_file = Path("config.json")
# Load existing config or create default
if config_file.exists():
try:
with open(config_file, 'r', encoding='utf-8') as f:
with open(config_file, "r", encoding="utf-8") as f:
config = json.load(f)
except json.JSONDecodeError:
print("❌ Error: Invalid config.json file")
@ -42,52 +43,56 @@ def update_resolution(resolution):
"continue_downloads": True,
"no_overwrites": True,
"ignore_errors": True,
"no_warnings": False
"no_warnings": False,
},
"folder_structure": {
"downloads_dir": "downloads",
"logs_dir": "logs",
"tracking_file": "karaoke_tracking.json"
"tracking_file": "karaoke_tracking.json",
},
"logging": {
"level": "INFO",
"format": "%(asctime)s - %(levelname)s - %(message)s",
"include_console": True,
"include_file": True
"include_file": True,
},
"yt_dlp_path": "downloader/yt-dlp.exe"
"yt_dlp_path": "downloader/yt-dlp.exe",
}
# Resolution mapping
resolution_map = {
'480p': '480',
'720p': '720',
'1080p': '1080',
'1440p': '1440',
'2160p': '2160'
"480p": "480",
"720p": "720",
"1080p": "1080",
"1440p": "1440",
"2160p": "2160",
}
if resolution not in resolution_map:
print(f"❌ Error: Invalid resolution '{resolution}'")
print(f"Valid options: {', '.join(resolution_map.keys())}")
return False
height = resolution_map[resolution]
old_resolution = config["download_settings"].get("preferred_resolution", "720p")
# Update the format string
config["download_settings"]["format"] = f"best[height<={height}][ext=mp4]/best[height<={height}]/best[ext=mp4]/best"
config["download_settings"][
"format"
] = f"best[height<={height}][ext=mp4]/best[height<={height}]/best[ext=mp4]/best"
config["download_settings"]["preferred_resolution"] = resolution
# Save the updated config
try:
with open(config_file, 'w', encoding='utf-8') as f:
with open(config_file, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2, ensure_ascii=False)
print(f"✅ Successfully updated resolution from {old_resolution} to {resolution}")
print(
f"✅ Successfully updated resolution from {old_resolution} to {resolution}"
)
print(f"📝 Format string: {config['download_settings']['format']}")
return True
except Exception as e:
print(f"❌ Error saving config: {e}")
return False
@ -96,21 +101,25 @@ def update_resolution(resolution):
def show_current_resolution():
"""Show the current resolution setting."""
config_file = Path("config.json")
if not config_file.exists():
print("📝 No config.json file found. Using default 720p resolution.")
return
try:
with open(config_file, 'r', encoding='utf-8') as f:
with open(config_file, "r", encoding="utf-8") as f:
config = json.load(f)
current_resolution = config["download_settings"].get("preferred_resolution", "720p")
current_format = config["download_settings"].get("format", "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best")
current_resolution = config["download_settings"].get(
"preferred_resolution", "720p"
)
current_format = config["download_settings"].get(
"format", "best[height<=720][ext=mp4]/best[height<=720]/best[ext=mp4]/best"
)
print(f"🎬 Current resolution: {current_resolution}")
print(f"📝 Format string: {current_format}")
except json.JSONDecodeError:
print("❌ Error: Invalid config.json file")
except Exception as e:
@ -126,23 +135,22 @@ Examples:
python update_resolution.py --show
python update_resolution.py --resolution 1080p
python update_resolution.py --resolution 720p
"""
""",
)
parser.add_argument(
'--resolution', '-r',
choices=['480p', '720p', '1080p', '1440p', '2160p'],
help='Set the preferred video resolution'
"--resolution",
"-r",
choices=["480p", "720p", "1080p", "1440p", "2160p"],
help="Set the preferred video resolution",
)
parser.add_argument(
'--show', '-s',
action='store_true',
help='Show current resolution setting'
"--show", "-s", action="store_true", help="Show current resolution setting"
)
args = parser.parse_args()
if args.show:
show_current_resolution()
elif args.resolution:
@ -152,4 +160,4 @@ Examples:
if __name__ == "__main__":
main()
main()

View File

@ -4,8 +4,9 @@ Handles server songs loading and server duplicates tracking.
"""
import json
from pathlib import Path
from datetime import datetime
from pathlib import Path
def load_server_songs(songs_path="data/songs.json"):
"""Load the list of songs already available on the server."""
@ -14,7 +15,7 @@ def load_server_songs(songs_path="data/songs.json"):
print(f"⚠️ Server songs file not found: {songs_path}")
return set()
try:
with open(songs_file, 'r', encoding='utf-8') as f:
with open(songs_file, "r", encoding="utf-8") as f:
data = json.load(f)
server_songs = set()
for song in data:
@ -29,36 +30,45 @@ def load_server_songs(songs_path="data/songs.json"):
print(f"⚠️ Could not load server songs: {e}")
return set()
def is_song_on_server(server_songs, artist, title):
"""Check if a song is already available on the server."""
key = f"{artist.lower()}_{normalize_title(title)}"
return key in server_songs
def load_server_duplicates_tracking(tracking_path="data/server_duplicates_tracking.json"):
def load_server_duplicates_tracking(
tracking_path="data/server_duplicates_tracking.json",
):
"""Load the tracking of songs found to be duplicates on the server."""
tracking_file = Path(tracking_path)
if not tracking_file.exists():
return {}
try:
with open(tracking_file, 'r', encoding='utf-8') as f:
with open(tracking_file, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load server duplicates tracking: {e}")
return {}
def save_server_duplicates_tracking(tracking, tracking_path="data/server_duplicates_tracking.json"):
def save_server_duplicates_tracking(
tracking, tracking_path="data/server_duplicates_tracking.json"
):
"""Save the tracking of songs found to be duplicates on the server."""
try:
with open(tracking_path, 'w', encoding='utf-8') as f:
with open(tracking_path, "w", encoding="utf-8") as f:
json.dump(tracking, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"⚠️ Could not save server duplicates tracking: {e}")
def is_song_marked_as_server_duplicate(tracking, artist, title):
"""Check if a song has been marked as a server duplicate."""
key = f"{artist.lower()}_{normalize_title(title)}"
return key in tracking
def mark_song_as_server_duplicate(tracking, artist, title, video_title, channel_name):
"""Mark a song as a server duplicate for future skipping."""
key = f"{artist.lower()}_{normalize_title(title)}"
@ -68,19 +78,27 @@ def mark_song_as_server_duplicate(tracking, artist, title, video_title, channel_
"video_title": video_title,
"channel": channel_name,
"marked_at": datetime.now().isoformat(),
"reason": "already_on_server"
"reason": "already_on_server",
}
save_server_duplicates_tracking(tracking)
def check_and_mark_server_duplicate(server_songs, server_duplicates_tracking, artist, title, video_title, channel_name):
def check_and_mark_server_duplicate(
server_songs, server_duplicates_tracking, artist, title, video_title, channel_name
):
"""Check if a song is on server and mark it as duplicate if so. Returns True if it's a duplicate."""
if is_song_on_server(server_songs, artist, title):
if not is_song_marked_as_server_duplicate(server_duplicates_tracking, artist, title):
mark_song_as_server_duplicate(server_duplicates_tracking, artist, title, video_title, channel_name)
if not is_song_marked_as_server_duplicate(
server_duplicates_tracking, artist, title
):
mark_song_as_server_duplicate(
server_duplicates_tracking, artist, title, video_title, channel_name
)
return True
return False
def normalize_title(title):
"""Normalize a title for consistent key generation."""
normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
return " ".join(normalized.split()).lower()
return " ".join(normalized.split()).lower()

View File

@ -0,0 +1,158 @@
"""
Song validation utilities for checking if songs should be downloaded.
Centralizes song validation logic to eliminate code duplication.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from karaoke_downloader.file_utils import check_file_exists_with_patterns
from karaoke_downloader.tracking_manager import TrackingManager
class SongValidator:
"""
Centralized song validation logic for checking if songs should be downloaded.
"""
def __init__(self, tracker: TrackingManager, downloads_dir: Path):
"""
Initialize the song validator.
Args:
tracker: Tracking manager instance
downloads_dir: Base downloads directory
"""
self.tracker = tracker
self.downloads_dir = downloads_dir
def should_skip_song(
self,
artist: str,
title: str,
channel_name: str,
video_id: Optional[str] = None,
video_title: Optional[str] = None,
server_songs: Optional[Dict[str, Any]] = None,
server_duplicates_tracking: Optional[Dict[str, Any]] = None,
) -> Tuple[bool, Optional[str], int]:
"""
Check if a song should be skipped based on multiple criteria.
Performs checks in order:
1. Already downloaded (tracking)
2. File exists on filesystem
3. Already on server
4. Previously failed download (bad file)
Args:
artist: Song artist name
title: Song title
channel_name: Channel name
video_id: YouTube video ID (optional)
video_title: YouTube video title (optional)
server_songs: Server songs data (optional)
server_duplicates_tracking: Server duplicates tracking (optional)
Returns:
Tuple of (should_skip, reason, total_filtered)
"""
total_filtered = 0
# Check 1: Already downloaded by this system
if self.tracker.is_song_downloaded(artist, title, channel_name, video_id):
return True, "already downloaded", total_filtered
# Check 2: File already exists on filesystem
file_exists, _ = check_file_exists_with_patterns(
self.downloads_dir, channel_name, artist, title
)
if file_exists:
return True, "file exists", total_filtered
# Check 3: Already on server (if server data provided)
if server_songs is not None and server_duplicates_tracking is not None:
from karaoke_downloader.server_manager import (
check_and_mark_server_duplicate,
)
if check_and_mark_server_duplicate(
server_songs,
server_duplicates_tracking,
artist,
title,
video_title,
channel_name,
):
total_filtered += 1
return True, "on server", total_filtered
# Check 4: Previously failed download (bad file)
if self.tracker.is_song_failed(artist, title, channel_name, video_id):
return True, "previously failed", total_filtered
return False, None, total_filtered
def mark_song_failed(
self,
artist: str,
title: str,
video_id: Optional[str],
channel_name: str,
error_message: str,
) -> None:
"""
Mark a song as failed in tracking.
Args:
artist: Song artist name
title: Song title
video_id: YouTube video ID (optional)
channel_name: Channel name
error_message: Error message to record
"""
self.tracker.mark_song_failed(
artist, title, video_id, channel_name, error_message
)
print(f"🏷️ Marked song as failed: {artist} - {title}")
def handle_download_failure(
self,
artist: str,
title: str,
video_id: Optional[str],
channel_name: str,
error_type: str,
error_details: str = "",
) -> None:
"""
Handle download failures with consistent error formatting.
Args:
artist: Song artist name
title: Song title
video_id: YouTube video ID (optional)
channel_name: Channel name
error_type: Type of error (e.g., "yt-dlp failed", "file verification failed")
error_details: Additional error details
"""
error_msg = f"{error_type}"
if error_details:
error_msg += f": {error_details}"
self.mark_song_failed(artist, title, video_id, channel_name, error_msg)
def create_song_validator(
tracker: TrackingManager, downloads_dir: Path
) -> SongValidator:
"""
Factory function to create a song validator instance.
Args:
tracker: Tracking manager instance
downloads_dir: Base downloads directory
Returns:
SongValidator instance
"""
return SongValidator(tracker, downloads_dir)

View File

@ -4,20 +4,25 @@ Handles songlist loading, tracking, and songlist-specific operations.
"""
import json
from pathlib import Path
from datetime import datetime
from pathlib import Path
from karaoke_downloader.server_manager import (
load_server_songs, is_song_on_server, load_server_duplicates_tracking,
check_and_mark_server_duplicate, is_song_marked_as_server_duplicate
check_and_mark_server_duplicate,
is_song_marked_as_server_duplicate,
is_song_on_server,
load_server_duplicates_tracking,
load_server_songs,
)
def load_songlist(songlist_path="data/songList.json"):
songlist_file = Path(songlist_path)
if not songlist_file.exists():
print(f"⚠️ Songlist file not found: {songlist_path}")
return []
try:
with open(songlist_file, 'r', encoding='utf-8') as f:
with open(songlist_file, "r", encoding="utf-8") as f:
data = json.load(f)
all_songs = []
seen = set()
@ -31,43 +36,50 @@ def load_songlist(songlist_path="data/songList.json"):
if key in seen:
continue
seen.add(key)
all_songs.append({
"artist": artist,
"title": title,
"position": song.get("position", 0)
})
all_songs.append(
{
"artist": artist,
"title": title,
"position": song.get("position", 0),
}
)
print(f"📋 Loaded {len(all_songs)} unique songs from songlist (deduplicated)")
return all_songs
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load songlist: {e}")
return []
def normalize_title(title):
normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
return " ".join(normalized.split()).lower()
def load_songlist_tracking(tracking_path="data/songlist_tracking.json"):
tracking_file = Path(tracking_path)
if not tracking_file.exists():
return {}
try:
with open(tracking_file, 'r', encoding='utf-8') as f:
with open(tracking_file, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, FileNotFoundError) as e:
print(f"⚠️ Could not load songlist tracking: {e}")
return {}
def save_songlist_tracking(tracking, tracking_path="data/songlist_tracking.json"):
try:
with open(tracking_path, 'w', encoding='utf-8') as f:
with open(tracking_path, "w", encoding="utf-8") as f:
json.dump(tracking, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"⚠️ Could not save songlist tracking: {e}")
def is_songlist_song_downloaded(tracking, artist, title):
key = f"{artist.lower()}_{normalize_title(title)}"
return key in tracking
def mark_songlist_song_downloaded(tracking, artist, title, channel_name, file_path):
key = f"{artist.lower()}_{normalize_title(title)}"
tracking[key] = {
@ -75,6 +87,6 @@ def mark_songlist_song_downloaded(tracking, artist, title, channel_name, file_pa
"title": title,
"channel": channel_name,
"file_path": str(file_path),
"downloaded_at": datetime.now().isoformat()
"downloaded_at": datetime.now().isoformat(),
}
save_songlist_tracking(tracking)
save_songlist_tracking(tracking)

View File

@ -5,16 +5,17 @@ Provides tools to manage and analyze the tracking database.
"""
import argparse
import sys
import json
from pathlib import Path
from tracking_manager import TrackingManager, SongStatus
import sys
from tracking_manager import SongStatus, TrackingManager
def show_statistics(tracker):
"""Show overall statistics."""
stats = tracker.get_statistics()
print("🎤 Karaoke Downloader Statistics")
print("=" * 50)
print(f"📊 Total Songs: {stats['total_songs']}")
@ -24,23 +25,23 @@ def show_statistics(tracker):
print(f"⚠️ Partial Downloads: {stats['partial_songs']}")
print(f"💾 Total Size: {stats['total_size_mb']} MB")
print(f"🕒 Last Updated: {stats['last_updated']}")
if stats['total_songs'] > 0:
success_rate = (stats['downloaded_songs'] / stats['total_songs']) * 100
if stats["total_songs"] > 0:
success_rate = (stats["downloaded_songs"] / stats["total_songs"]) * 100
print(f"📈 Success Rate: {success_rate:.1f}%")
def list_playlists(tracker):
"""List all playlists with their statistics."""
playlists = tracker.data['playlists']
playlists = tracker.data["playlists"]
if not playlists:
print("📭 No playlists found in tracking database.")
return
print("📋 Playlists in Database")
print("=" * 50)
for playlist_id, playlist in playlists.items():
print(f"\n🎵 {playlist['name']}")
print(f" ID: {playlist_id}")
@ -54,13 +55,13 @@ def list_playlists(tracker):
def show_playlist_details(tracker, playlist_id):
"""Show detailed information about a specific playlist."""
if playlist_id not in tracker.data['playlists']:
if playlist_id not in tracker.data["playlists"]:
print(f"❌ Playlist '{playlist_id}' not found in tracking database.")
return
playlist = tracker.data['playlists'][playlist_id]
playlist = tracker.data["playlists"][playlist_id]
songs = tracker.get_playlist_songs(playlist_id)
print(f"🎵 Playlist Details: {playlist['name']}")
print("=" * 50)
print(f"ID: {playlist_id}")
@ -70,7 +71,7 @@ def show_playlist_details(tracker, playlist_id):
print(f"Failed: {playlist['failed_songs']}")
print(f"Added: {playlist['added_date']}")
print(f"Last Synced: {playlist['last_synced'] or 'Never'}")
print(f"\n📝 Songs:")
for i, song in enumerate(songs, 1):
status_icon = {
@ -79,15 +80,15 @@ def show_playlist_details(tracker, playlist_id):
SongStatus.DOWNLOADING: "",
SongStatus.PARTIAL: "⚠️",
SongStatus.FAILED: "",
SongStatus.NOT_DOWNLOADED: "⏸️"
}.get(song['status'], "")
formats = ", ".join(song['formats'].keys()) if song['formats'] else "None"
SongStatus.NOT_DOWNLOADED: "⏸️",
}.get(song["status"], "")
formats = ", ".join(song["formats"].keys()) if song["formats"] else "None"
print(f" {i:2d}. {status_icon} {song['title']}")
print(f" Video ID: {song['video_id']}")
print(f" Status: {song['status']}")
print(f" Formats: {formats}")
if song['last_error']:
if song["last_error"]:
print(f" Error: {song['last_error']}")
print()
@ -101,16 +102,19 @@ def show_failed_songs(tracker, playlist_id=None):
return
print(f"❌ Failed Songs in Playlist: {playlist_id}")
else:
songs = [song for song in tracker.data['songs'].values()
if song['status'] == SongStatus.FAILED]
songs = [
song
for song in tracker.data["songs"].values()
if song["status"] == SongStatus.FAILED
]
if not songs:
print("✅ No failed songs found in any playlist.")
return
print("❌ All Failed Songs")
print("=" * 50)
for song in songs:
playlist_name = tracker.data['playlists'][song['playlist_id']]['name']
playlist_name = tracker.data["playlists"][song["playlist_id"]]["name"]
print(f"\n🎵 {song['title']}")
print(f" Playlist: {playlist_name}")
print(f" Video ID: {song['video_id']}")
@ -128,16 +132,19 @@ def show_partial_downloads(tracker, playlist_id=None):
return
print(f"⚠️ Partial Downloads in Playlist: {playlist_id}")
else:
songs = [song for song in tracker.data['songs'].values()
if song['status'] == SongStatus.PARTIAL]
songs = [
song
for song in tracker.data["songs"].values()
if song["status"] == SongStatus.PARTIAL
]
if not songs:
print("✅ No partial downloads found in any playlist.")
return
print("⚠️ All Partial Downloads")
print("=" * 50)
for song in songs:
playlist_name = tracker.data['playlists'][song['playlist_id']]['name']
playlist_name = tracker.data["playlists"][song["playlist_id"]]["name"]
print(f"\n🎵 {song['title']}")
print(f" Playlist: {playlist_name}")
print(f" Video ID: {song['video_id']}")
@ -148,11 +155,11 @@ def show_partial_downloads(tracker, playlist_id=None):
def cleanup_orphaned_entries(tracker, downloads_dir):
"""Clean up orphaned tracking entries."""
orphaned = tracker.cleanup_orphaned_files(downloads_dir)
if orphaned:
print(f"🧹 Cleaned up {len(orphaned)} orphaned tracking entries:")
for song_id in orphaned:
song = tracker.data['songs'].get(song_id)
song = tracker.data["songs"].get(song_id)
if song:
print(f" - {song['title']} (ID: {song['video_id']})")
else:
@ -162,7 +169,7 @@ def cleanup_orphaned_entries(tracker, downloads_dir):
def export_database(tracker, output_file):
"""Export the tracking database to a JSON file."""
try:
with open(output_file, 'w', encoding='utf-8') as f:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(tracker.data, f, indent=2, ensure_ascii=False)
print(f"💾 Database exported to: {output_file}")
except Exception as e:
@ -182,62 +189,50 @@ Examples:
python manage_tracking.py --partial
python manage_tracking.py --cleanup
python manage_tracking.py --export backup.json
"""
""",
)
parser.add_argument(
'--stats', '--statistics',
action='store_true',
help='Show overall statistics'
"--stats", "--statistics", action="store_true", help="Show overall statistics"
)
parser.add_argument(
'--list-playlists',
action='store_true',
help='List all playlists in the database'
"--list-playlists",
action="store_true",
help="List all playlists in the database",
)
parser.add_argument(
'--playlist',
metavar='PLAYLIST_ID',
help='Show detailed information about a specific playlist'
"--playlist",
metavar="PLAYLIST_ID",
help="Show detailed information about a specific playlist",
)
parser.add_argument("--failed", action="store_true", help="Show all failed songs")
parser.add_argument(
'--failed',
action='store_true',
help='Show all failed songs'
"--partial", action="store_true", help="Show all partial downloads"
)
parser.add_argument(
'--partial',
action='store_true',
help='Show all partial downloads'
"--cleanup", action="store_true", help="Clean up orphaned tracking entries"
)
parser.add_argument(
'--cleanup',
action='store_true',
help='Clean up orphaned tracking entries'
"--export", metavar="FILE", help="Export tracking database to JSON file"
)
parser.add_argument(
'--export',
metavar='FILE',
help='Export tracking database to JSON file'
"--tracking-file",
default="karaoke_tracking.json",
help="Path to tracking database file (default: karaoke_tracking.json)",
)
parser.add_argument(
'--tracking-file',
default='karaoke_tracking.json',
help='Path to tracking database file (default: karaoke_tracking.json)'
)
args = parser.parse_args()
# Initialize tracking manager
tracker = TrackingManager(args.tracking_file)
# Process commands
if args.stats:
show_statistics(tracker)
@ -260,4 +255,4 @@ Examples:
if __name__ == "__main__":
main()
main()

View File

@ -1,8 +1,10 @@
import json
from pathlib import Path
from enum import Enum
from datetime import datetime
import threading
from enum import Enum
import json
from datetime import datetime
from pathlib import Path
class SongStatus(str, Enum):
NOT_DOWNLOADED = "NOT_DOWNLOADED"
@ -13,19 +15,22 @@ class SongStatus(str, Enum):
CONVERTING = "CONVERTING"
CONVERTED = "CONVERTED"
class FormatType(str, Enum):
MP4 = "MP4"
MP3 = "MP3"
OTHER = "OTHER"
class TrackingManager:
def __init__(self, tracking_file="data/karaoke_tracking.json", cache_file="data/channel_cache.json"):
def __init__(
self,
tracking_file="data/karaoke_tracking.json",
cache_file="data/channel_cache.json",
):
self.tracking_file = Path(tracking_file)
self.cache_file = Path(cache_file)
self.data = {
"playlists": {},
"songs": {}
}
self.data = {"playlists": {}, "songs": {}}
self.cache = {}
self._lock = threading.Lock()
self._load()
@ -34,14 +39,14 @@ class TrackingManager:
def _load(self):
if self.tracking_file.exists():
try:
with open(self.tracking_file, 'r', encoding='utf-8') as f:
with open(self.tracking_file, "r", encoding="utf-8") as f:
self.data = json.load(f)
except Exception:
self.data = {"playlists": {}, "songs": {}}
def _save(self):
with self._lock:
with open(self.tracking_file, 'w', encoding='utf-8') as f:
with open(self.tracking_file, "w", encoding="utf-8") as f:
json.dump(self.data, f, indent=2, ensure_ascii=False)
def force_save(self):
@ -50,23 +55,42 @@ class TrackingManager:
def _load_cache(self):
if self.cache_file.exists():
try:
with open(self.cache_file, 'r', encoding='utf-8') as f:
with open(self.cache_file, "r", encoding="utf-8") as f:
self.cache = json.load(f)
except Exception:
self.cache = {}
def save_cache(self):
with open(self.cache_file, 'w', encoding='utf-8') as f:
with open(self.cache_file, "w", encoding="utf-8") as f:
json.dump(self.cache, f, indent=2, ensure_ascii=False)
def get_statistics(self):
total_songs = len(self.data['songs'])
downloaded_songs = sum(1 for s in self.data['songs'].values() if s['status'] in [SongStatus.DOWNLOADED, SongStatus.CONVERTED])
failed_songs = sum(1 for s in self.data['songs'].values() if s['status'] == SongStatus.FAILED)
partial_songs = sum(1 for s in self.data['songs'].values() if s['status'] == SongStatus.PARTIAL)
total_playlists = len(self.data['playlists'])
total_size_mb = sum(s.get('file_size', 0) for s in self.data['songs'].values() if s.get('file_size')) / (1024*1024)
last_updated = max((s.get('last_updated') for s in self.data['songs'].values() if s.get('last_updated')), default=None)
total_songs = len(self.data["songs"])
downloaded_songs = sum(
1
for s in self.data["songs"].values()
if s["status"] in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]
)
failed_songs = sum(
1 for s in self.data["songs"].values() if s["status"] == SongStatus.FAILED
)
partial_songs = sum(
1 for s in self.data["songs"].values() if s["status"] == SongStatus.PARTIAL
)
total_playlists = len(self.data["playlists"])
total_size_mb = sum(
s.get("file_size", 0)
for s in self.data["songs"].values()
if s.get("file_size")
) / (1024 * 1024)
last_updated = max(
(
s.get("last_updated")
for s in self.data["songs"].values()
if s.get("last_updated")
),
default=None,
)
return {
"total_songs": total_songs,
"downloaded_songs": downloaded_songs,
@ -74,30 +98,44 @@ class TrackingManager:
"partial_songs": partial_songs,
"total_playlists": total_playlists,
"total_size_mb": round(total_size_mb, 2),
"last_updated": last_updated
"last_updated": last_updated,
}
def get_playlist_songs(self, playlist_id):
return [s for s in self.data['songs'].values() if s['playlist_id'] == playlist_id]
return [
s for s in self.data["songs"].values() if s["playlist_id"] == playlist_id
]
def get_failed_songs(self, playlist_id=None):
if playlist_id:
return [s for s in self.data['songs'].values() if s['playlist_id'] == playlist_id and s['status'] == SongStatus.FAILED]
return [s for s in self.data['songs'].values() if s['status'] == SongStatus.FAILED]
return [
s
for s in self.data["songs"].values()
if s["playlist_id"] == playlist_id and s["status"] == SongStatus.FAILED
]
return [
s for s in self.data["songs"].values() if s["status"] == SongStatus.FAILED
]
def get_partial_downloads(self, playlist_id=None):
if playlist_id:
return [s for s in self.data['songs'].values() if s['playlist_id'] == playlist_id and s['status'] == SongStatus.PARTIAL]
return [s for s in self.data['songs'].values() if s['status'] == SongStatus.PARTIAL]
return [
s
for s in self.data["songs"].values()
if s["playlist_id"] == playlist_id and s["status"] == SongStatus.PARTIAL
]
return [
s for s in self.data["songs"].values() if s["status"] == SongStatus.PARTIAL
]
def cleanup_orphaned_files(self, downloads_dir):
# Remove tracking entries for files that no longer exist
orphaned = []
for song_id, song in list(self.data['songs'].items()):
file_path = song.get('file_path')
for song_id, song in list(self.data["songs"].items()):
file_path = song.get("file_path")
if file_path and not Path(file_path).exists():
orphaned.append(song_id)
del self.data['songs'][song_id]
del self.data["songs"][song_id]
self.force_save()
return orphaned
@ -110,11 +148,11 @@ class TrackingManager:
"total_channels": total_channels,
"total_cached_videos": total_cached_videos,
"cache_duration_hours": cache_duration_hours,
"last_updated": last_updated
"last_updated": last_updated,
}
def clear_channel_cache(self, channel_id=None):
if channel_id is None or channel_id == 'all':
if channel_id is None or channel_id == "all":
self.cache = {}
else:
self.cache.pop(channel_id, None)
@ -125,14 +163,11 @@ class TrackingManager:
pass
def export_playlist_report(self, playlist_id):
playlist = self.data['playlists'].get(playlist_id)
playlist = self.data["playlists"].get(playlist_id)
if not playlist:
return f"Playlist '{playlist_id}' not found."
songs = self.get_playlist_songs(playlist_id)
report = {
"playlist": playlist,
"songs": songs
}
report = {"playlist": playlist, "songs": songs}
return json.dumps(report, indent=2, ensure_ascii=False)
def is_song_downloaded(self, artist, title, channel_name=None, video_id=None):
@ -143,31 +178,40 @@ class TrackingManager:
# If we have video_id and channel_name, try direct key lookup first (most efficient)
if video_id and channel_name:
song_key = f"{video_id}@{channel_name}"
if song_key in self.data['songs']:
song_data = self.data['songs'][song_key]
if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]:
if song_key in self.data["songs"]:
song_data = self.data["songs"][song_key]
if song_data.get("status") in [
SongStatus.DOWNLOADED,
SongStatus.CONVERTED,
]:
return True
# Fallback to content search (for cases where we don't have video_id)
for song_id, song_data in self.data['songs'].items():
for song_id, song_data in self.data["songs"].items():
# Check if this song matches the artist and title
if song_data.get('artist') == artist and song_data.get('title') == title:
if song_data.get("artist") == artist and song_data.get("title") == title:
# Check if it's marked as downloaded
if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]:
if song_data.get("status") in [
SongStatus.DOWNLOADED,
SongStatus.CONVERTED,
]:
return True
# Also check the video title field which might contain the song info
video_title = song_data.get('video_title', '')
video_title = song_data.get("video_title", "")
if video_title and artist in video_title and title in video_title:
if song_data.get('status') in [SongStatus.DOWNLOADED, SongStatus.CONVERTED]:
if song_data.get("status") in [
SongStatus.DOWNLOADED,
SongStatus.CONVERTED,
]:
return True
return False
def is_file_exists(self, file_path):
"""
Check if a file already exists on the filesystem.
"""
return Path(file_path).exists()
def is_song_failed(self, artist, title, channel_name=None, video_id=None):
"""
Check if a song has previously failed to download.
@ -176,94 +220,128 @@ class TrackingManager:
# If we have video_id and channel_name, try direct key lookup first (most efficient)
if video_id and channel_name:
song_key = f"{video_id}@{channel_name}"
if song_key in self.data['songs']:
song_data = self.data['songs'][song_key]
if song_data.get('status') == SongStatus.FAILED:
if song_key in self.data["songs"]:
song_data = self.data["songs"][song_key]
if song_data.get("status") == SongStatus.FAILED:
return True
# Fallback to content search (for cases where we don't have video_id)
for song_id, song_data in self.data['songs'].items():
for song_id, song_data in self.data["songs"].items():
# Check if this song matches the artist and title
if song_data.get('artist') == artist and song_data.get('title') == title:
if song_data.get("artist") == artist and song_data.get("title") == title:
# Check if it's marked as failed
if song_data.get('status') == SongStatus.FAILED:
if song_data.get("status") == SongStatus.FAILED:
return True
# Also check the video title field which might contain the song info
video_title = song_data.get('video_title', '')
video_title = song_data.get("video_title", "")
if video_title and artist in video_title and title in video_title:
if song_data.get('status') == SongStatus.FAILED:
if song_data.get("status") == SongStatus.FAILED:
return True
return False
def mark_song_downloaded(self, artist, title, video_id, channel_name, file_path, file_size=None):
def mark_song_downloaded(
self, artist, title, video_id, channel_name, file_path, file_size=None
):
"""
Mark a song as downloaded in the tracking system.
"""
# Use the existing tracking structure: video_id@channel_name
song_key = f"{video_id}@{channel_name}"
self.data['songs'][song_key] = {
'artist': artist,
'title': title,
'video_id': video_id,
'channel_name': channel_name,
'video_title': f"{artist} - {title}",
'file_path': str(file_path),
'file_size': file_size,
'status': SongStatus.DOWNLOADED,
'last_updated': datetime.now().isoformat()
self.data["songs"][song_key] = {
"artist": artist,
"title": title,
"video_id": video_id,
"channel_name": channel_name,
"video_title": f"{artist} - {title}",
"file_path": str(file_path),
"file_size": file_size,
"status": SongStatus.DOWNLOADED,
"last_updated": datetime.now().isoformat(),
}
self._save()
def mark_song_failed(self, artist, title, video_id, channel_name, error_message=None):
def mark_song_failed(
self, artist, title, video_id, channel_name, error_message=None
):
"""
Mark a song as failed in the tracking system.
"""
# Use the existing tracking structure: video_id@channel_name
song_key = f"{video_id}@{channel_name}"
self.data['songs'][song_key] = {
'artist': artist,
'title': title,
'video_id': video_id,
'channel_name': channel_name,
'video_title': f"{artist} - {title}",
'status': SongStatus.FAILED,
'error_message': error_message,
'last_updated': datetime.now().isoformat()
self.data["songs"][song_key] = {
"artist": artist,
"title": title,
"video_id": video_id,
"channel_name": channel_name,
"video_title": f"{artist} - {title}",
"status": SongStatus.FAILED,
"error_message": error_message,
"last_updated": datetime.now().isoformat(),
}
self._save()
def get_channel_video_list(self, channel_url, yt_dlp_path="downloader/yt-dlp.exe", force_refresh=False):
def get_channel_video_list(
self, channel_url, yt_dlp_path="downloader/yt-dlp.exe", force_refresh=False
):
"""
Return a list of videos (dicts with 'title' and 'id') for the channel, using cache if available unless force_refresh is True.
"""
channel_name, channel_id = None, None
from karaoke_downloader.youtube_utils import get_channel_info
channel_name, channel_id = get_channel_info(channel_url)
cache_key = channel_id or channel_url
# Try multiple possible cache keys
possible_keys = [
channel_id, # The extracted channel ID
channel_url, # The full URL
channel_name, # The extracted channel name
]
cache_key = None
for key in possible_keys:
if key and key in self.cache:
cache_key = key
break
if not cache_key:
cache_key = channel_id or channel_url # Use as fallback for new entries
print(f" 🔍 Trying cache keys: {possible_keys}")
print(f" 🔍 Selected cache key: '{cache_key}'")
if not force_refresh and cache_key in self.cache:
print(
f" 📋 Using cached video list ({len(self.cache[cache_key])} videos)"
)
return self.cache[cache_key]
else:
print(f" ❌ Cache miss for all keys")
# Fetch with yt-dlp
print(f" 🌐 Fetching video list from YouTube (this may take a while)...")
import subprocess
cmd = [
yt_dlp_path,
'--flat-playlist',
'--print', '%(title)s|%(id)s|%(url)s',
channel_url
"--flat-playlist",
"--print",
"%(title)s|%(id)s|%(url)s",
channel_url,
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
lines = result.stdout.strip().splitlines()
videos = []
for line in lines:
parts = line.split('|')
parts = line.split("|")
if len(parts) >= 2:
title, video_id = parts[0].strip(), parts[1].strip()
videos.append({'title': title, 'id': video_id})
videos.append({"title": title, "id": video_id})
self.cache[cache_key] = videos
self.save_cache()
return videos
except subprocess.CalledProcessError as e:
print(f"❌ yt-dlp failed to fetch playlist for cache: {e}")
return []
return []

View File

@ -5,152 +5,162 @@ Handles the actual downloading and post-processing of videos.
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
from karaoke_downloader.config_manager import AppConfig
from karaoke_downloader.download_planner import save_plan_cache
from karaoke_downloader.error_utils import (
handle_file_validation_error,
handle_yt_dlp_error,
log_error,
)
from karaoke_downloader.file_utils import (
cleanup_temp_files,
ensure_directory_exists,
is_valid_mp4_file,
sanitize_filename,
)
from karaoke_downloader.id3_utils import add_id3_tags
from karaoke_downloader.songlist_manager import mark_songlist_song_downloaded
from karaoke_downloader.download_planner import save_plan_cache
from karaoke_downloader.youtube_utils import (
build_yt_dlp_command,
execute_yt_dlp_command,
show_available_formats,
)
# Constants
DEFAULT_FILENAME_LENGTH_LIMIT = 100
DEFAULT_ARTIST_LENGTH_LIMIT = 30
DEFAULT_TITLE_LENGTH_LIMIT = 60
DEFAULT_FORMAT_CHECK_TIMEOUT = 30


def sanitize_filename(artist, title):
    """
    Create a Windows-safe "<artist> - <title>.mp4" filename.

    Strips characters that are invalid in Windows filenames, removes noisy
    punctuation (quotes, parentheses, dots, "(From ..." phrases), and
    truncates overly long names to avoid MAX_PATH problems.

    Args:
        artist: Song artist name.
        title: Song title.

    Returns:
        A sanitized ".mp4" filename string.
    """
    # Drop the "(From " prefix phrase and dash separators in the title.
    safe_title = title.replace("(From ", "").replace(" - ", " ")
    safe_artist = artist
    # Remove Windows-invalid characters plus quotes and BOTH parentheses.
    # (The previous version stripped ")" but left "(" behind, producing
    # unbalanced parentheses in filenames.)
    for char in '?:*"<>|/\\\'()':
        safe_title = safe_title.replace(char, "")
        safe_artist = safe_artist.replace(char, "")
    # Removing every "." also covers the old "..." and ".." passes.
    safe_title = safe_title.replace(".", "").strip()
    safe_artist = safe_artist.strip()
    filename = f"{safe_artist} - {safe_title}.mp4"
    # Truncate to keep paths comfortably under Windows limits.
    if len(filename) > DEFAULT_FILENAME_LENGTH_LIMIT:
        filename = (
            f"{safe_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]}"
            f" - {safe_title[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
        )
    return filename
def is_valid_mp4(file_path: Path) -> bool:
    """
    Check if a file is a valid MP4 file.

    Delegates to the centralized ``file_utils.is_valid_mp4_file`` helper so
    validation logic lives in one place. (The diff residue here contained two
    interleaved definitions plus a stray old function header; this keeps the
    new delegating implementation.)

    Args:
        file_path: Path to the file to check.

    Returns:
        True if file is a valid MP4, False otherwise.
    """
    return is_valid_mp4_file(file_path)
def download_video_and_track(
    yt_dlp_path,
    config,
    downloads_dir,
    songlist_tracking,
    channel_name,
    channel_url,
    video_id,
    video_title,
    artist,
    title,
    filename,
):
    """
    Download a single video and track its status.

    Thin wrapper that resolves the output path under the channel directory
    and delegates to :func:`download_single_video`. (The original span
    contained duplicated old/new argument lines that made the call invalid;
    this keeps a single, clean argument list.)

    Returns:
        True if successful, False otherwise.
    """
    output_path = downloads_dir / channel_name / filename
    return download_single_video(
        output_path,
        video_id,
        config,
        yt_dlp_path,
        artist,
        title,
        channel_name,
        songlist_tracking,
    )
def download_single_video(
    output_path: Path,
    video_id: str,
    config: Union[AppConfig, Dict[str, Any]],
    yt_dlp_path: str,
    artist: str,
    title: str,
    channel_name: str,
    songlist_tracking: Dict[str, Any],
) -> bool:
    """
    Download a single video and handle post-processing.

    Builds the yt-dlp command via the centralized youtube_utils helpers,
    records failures through error_utils, and on success verifies the file,
    adds ID3 tags, marks the songlist entry downloaded, and cleans up
    temporary files. (The original span interleaved the old inline
    ``dlp_cmd``/``subprocess.run`` code with the new refactored calls; this
    keeps only the refactored version.)

    Args:
        output_path: Output file path
        video_id: YouTube video ID
        config: Configuration object with download settings
        yt_dlp_path: Path to yt-dlp executable
        artist: Song artist name
        title: Song title
        channel_name: Channel name
        songlist_tracking: Songlist tracking data

    Returns:
        True if successful, False otherwise
    """
    ensure_directory_exists(output_path.parent)
    print(f"⬇️ Downloading: {artist} - {title} -> {output_path}")
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    # Build command using centralized utility
    cmd = build_yt_dlp_command(yt_dlp_path, video_url, output_path, config)
    print(f"🔧 Running command: {' '.join(cmd)}")
    print(f"📺 Resolution settings: {config.download_settings.preferred_resolution}")
    print(f"🎬 Format string: {config.download_settings.format}")
    # Debug: Show available formats (optional)
    if hasattr(config, "debug_show_formats") and config.debug_show_formats:
        show_available_formats(video_url, yt_dlp_path)
    try:
        result = execute_yt_dlp_command(cmd)
        print(f"✅ yt-dlp completed successfully")
        print(f"📄 yt-dlp stdout: {result.stdout}")
    except subprocess.CalledProcessError as e:
        # Route the failure through the centralized error utilities, then
        # record it in tracking so the song is not retried blindly.
        error = handle_yt_dlp_error(e, artist, title, video_id, channel_name)
        log_error(error)
        _mark_song_failed_standalone(
            artist, title, video_id, channel_name, error.message
        )
        return False
    # Verify download
    if not verify_download(output_path, artist, title, video_id, channel_name):
        return False
    # Post-processing
    add_id3_tags(output_path, f"{artist} - {title} (Karaoke Version)", channel_name)
    mark_songlist_song_downloaded(
        songlist_tracking, artist, title, channel_name, output_path
    )
    # Clean up temporary files
    cleanup_temp_files(output_path.with_suffix(""))
    print(f"✅ Downloaded and tracked: {artist} - {title}")
    print(f"🎉 All post-processing complete for: {output_path}")
    return True
def _mark_song_failed_standalone(artist, title, video_id, channel_name, error_message):
    """Record a failed download in tracking without a shared tracker instance."""
    # Imported lazily to avoid a circular import with tracking_manager.
    from karaoke_downloader.tracking_manager import TrackingManager

    TrackingManager().mark_song_failed(
        artist, title, video_id, channel_name, error_message
    )
    print(f"🏷️ Marked song as failed: {artist} - {title}")
def show_available_formats(yt_dlp_path, video_url):
    """Print yt-dlp's format listing for a video (debug helper); best-effort."""
    print(f"🔍 Checking available formats for: {video_url}")
    listing_cmd = [str(yt_dlp_path), "--list-formats", video_url]
    try:
        listing = subprocess.run(
            listing_cmd,
            capture_output=True,
            text=True,
            timeout=DEFAULT_FORMAT_CHECK_TIMEOUT,
        )
        print(f"📋 Available formats:\n{listing.stdout}")
    except Exception as e:
        # Never fail the caller over a debug listing.
        print(f"⚠️ Could not check formats: {e}")
# Note: show_available_formats is now imported from youtube_utils
def verify_download(output_path, artist, title, video_id=None, channel_name=None):
"""Verify that the download was successful."""
@ -159,13 +169,17 @@ def verify_download(output_path, artist, title, video_id=None, channel_name=None
# Check if yt-dlp saved it somewhere else
possible_files = list(output_path.parent.glob("*.mp4"))
if possible_files:
print(f"🔍 Found these files in the directory: {[f.name for f in possible_files]}")
print(
f"🔍 Found these files in the directory: {[f.name for f in possible_files]}"
)
# Look for a file that matches our pattern (artist - title)
artist_part = artist.lower()
title_part = title.lower()
for file in possible_files:
file_lower = file.stem.lower()
if artist_part in file_lower and any(word in file_lower for word in title_part.split()):
if artist_part in file_lower and any(
word in file_lower for word in title_part.split()
):
print(f"🎯 Found matching file: {file.name}")
output_path = file
break
@ -174,31 +188,44 @@ def verify_download(output_path, artist, title, video_id=None, channel_name=None
# Mark song as failed if we have the required info
if video_id and channel_name:
error_msg = f"Download failed: file does not exist and no matching file found"
_mark_song_failed_standalone(artist, title, video_id, channel_name, error_msg)
_mark_song_failed_standalone(
artist, title, video_id, channel_name, error_msg
)
return False
else:
# Mark song as failed if we have the required info
if video_id and channel_name:
error_msg = f"Download failed: file does not exist"
_mark_song_failed_standalone(artist, title, video_id, channel_name, error_msg)
_mark_song_failed_standalone(
artist, title, video_id, channel_name, error_msg
)
return False
if output_path.stat().st_size == 0:
print(f"❌ Download failed: file is empty (0 bytes): {output_path}")
return False
# Optional MP4 validation
# if not is_valid_mp4(output_path):
# print(f"❌ File is not a valid MP4: {output_path}")
# return False
return True
def execute_download_plan(
    download_plan,
    unmatched,
    cache_file,
    config,
    yt_dlp_path,
    downloads_dir,
    songlist_tracking,
    limit=None,
):
    """
    Execute a download plan with progress tracking and cache management.

    Args:
        download_plan: List of download items to process (mutated in place:
            completed items are removed so the cache reflects progress)
        unmatched: List of unmatched songs
        cache_file: Path to the download-plan cache file
        config: Application configuration
        yt_dlp_path: Path to the yt-dlp executable
        downloads_dir: Directory for downloads
        songlist_tracking: Songlist tracking data
        limit: Optional limit on number of downloads

    Returns:
        tuple: (downloaded_count, success)
    """
    downloaded_count = 0
    total_to_download = limit if limit is not None else len(download_plan)
    # Iterate over a snapshot so completed items can be removed from the live
    # plan without index shifting, while preserving the plan's priority order.
    # (Reverse index iteration would process the plan back-to-front, and its
    # "len - idx" progress counter printed "1 of N" for every successful item
    # because the list shrank in step with the index.)
    for position, item in enumerate(list(download_plan), start=1):
        if limit is not None and downloaded_count >= limit:
            break
        artist = item["artist"]
        title = item["title"]
        channel_name = item["channel_name"]
        channel_url = item["channel_url"]
        video_id = item["video_id"]
        video_title = item["video_title"]
        print(f"\n⬇️ Downloading {position} of {total_to_download}:")
        print(f" 📋 Songlist: {artist} - {title}")
        print(f" 🎬 Video: {video_title} ({channel_name})")
        if "match_score" in item:
            print(f" 🎯 Match Score: {item['match_score']:.1f}%")
        # Create filename
        filename = sanitize_filename(artist, title)
        output_path = downloads_dir / channel_name / filename
        # Download the file
        success = download_single_video(
            output_path,
            video_id,
            config,
            yt_dlp_path,
            artist,
            title,
            channel_name,
            songlist_tracking,
        )
        if success:
            downloaded_count += 1
            # Remove completed item from the live plan and persist progress.
            download_plan.remove(item)
            save_plan_cache(cache_file, download_plan, unmatched)
            print(
                f"🗑️ Removed completed item from download plan. {len(download_plan)} items remaining."
            )
            # Delete cache if all items are complete
            if len(download_plan) == 0:
                cleanup_cache(cache_file)
    print(f"🎉 Downloaded {downloaded_count} songlist songs.")
    print(
        f"📊 Summary: Found {downloaded_count} songs, {len(unmatched)} songs not found."
    )
    # Final cleanup
    cleanup_cache(cache_file)
    return downloaded_count, True
def cleanup_cache(cache_file):
    """
    Delete the download-plan cache file if it exists.

    Deletion is best-effort: the file may vanish between the ``exists()``
    check and ``unlink()``, or be locked on Windows, so errors are reported
    but never raised. (Restores the ``try:`` line lost at the diff cut and
    drops the duplicated failure print.)

    Args:
        cache_file: Path to the cache file.
    """
    if cache_file.exists():
        try:
            cache_file.unlink()
            print(f"🗑️ Deleted download plan cache: {cache_file.name}")
        except Exception as e:
            print(f"⚠️ Could not delete download plan cache: {e}")
def should_skip_song_standalone(artist, title, channel_name, video_id, video_title, downloads_dir, tracker=None, server_songs=None, server_duplicates_tracking=None):
    """
    Decide whether a song should be skipped rather than downloaded.

    Runs four checks in order:
    1. Already downloaded per tracking - if tracker provided
    2. An output file already exists on the filesystem
    3. Already on the server - if server data provided
    4. Previously failed download (bad file) - if tracker provided

    Returns:
        tuple: (should_skip, reason, total_filtered)
    """
    filtered = 0

    # Check 1: tracking already records a successful download.
    if tracker and tracker.is_song_downloaded(artist, title, channel_name, video_id):
        return True, "already downloaded", filtered

    # Check 2: a matching file exists on disk. Recreate the filenames the
    # different download modes would have produced for this song.
    cleaned = title
    for bad in '?:*"<>|/\\':
        cleaned = cleaned.replace(bad, "")
    cleaned = cleaned.replace(".", "").strip()
    candidates = (
        f"{artist} - {cleaned}.mp4",                     # Songlist mode
        f"{channel_name} - {cleaned}.mp4",               # Latest-per-channel mode
        f"{artist} - {cleaned} (Karaoke Version).mp4",   # Channel videos mode
    )
    for name in candidates:
        if len(name) > DEFAULT_FILENAME_LENGTH_LIMIT:
            # Apply the same truncation the downloader would have used.
            short_artist = artist.replace("'", "").replace('"', "").strip()
            name = (
                f"{short_artist[:DEFAULT_ARTIST_LENGTH_LIMIT]}"
                f" - {cleaned[:DEFAULT_TITLE_LENGTH_LIMIT]}.mp4"
            )
        candidate_path = downloads_dir / channel_name / name
        if candidate_path.exists() and candidate_path.stat().st_size > 0:
            return True, "file exists", filtered

    # Check 3: already present on the server.
    if server_songs is not None and server_duplicates_tracking is not None:
        from karaoke_downloader.server_manager import check_and_mark_server_duplicate

        if check_and_mark_server_duplicate(
            server_songs, server_duplicates_tracking, artist, title, video_title, channel_name
        ):
            filtered += 1
            return True, "on server", filtered

    # Check 4: a previous download attempt failed (bad file).
    if tracker and tracker.is_song_failed(artist, title, channel_name, video_id):
        return True, "previously failed", filtered

    return False, None, filtered
# Note: should_skip_song_standalone function has been removed and replaced with SongValidator class
# Use karaoke_downloader.song_validator.create_song_validator() instead

View File

@ -1,15 +1,138 @@
import re
"""
YouTube utilities for channel info, playlist info, and yt-dlp command generation.
"""
import json
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from karaoke_downloader.config_manager import AppConfig
def get_channel_info(channel_url):
    """Parse a channel URL into a filesystem-safe name and its @handle id."""
    if '@' not in channel_url:
        # No handle in the URL: nothing to parse.
        return "unknown_channel", "unknown_channel"
    handle = channel_url.split('@')[1].split('/')[0]
    # Replace Windows-invalid characters so the name is safe as a directory.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', handle)
    return safe_name, f"@{handle}"
def get_playlist_info(playlist_url):
    """Derive (name, id) for a playlist URL; playlists reuse the channel URL scheme."""
    return get_channel_info(playlist_url)
def get_channel_info(
    channel_url: str, yt_dlp_path: str = "downloader/yt-dlp.exe"
) -> tuple[str, str]:
    """Return (channel_name, channel_id) parsed from a channel URL.

    The URL is parsed directly for speed; yt-dlp is not invoked, and
    ``yt_dlp_path`` is accepted only for interface compatibility.
    """

    def _segment_after(marker: str) -> str:
        # Path segment immediately following the marker, e.g. the handle
        # after "/@" or the id after "/channel/".
        return channel_url.split(marker)[1].split("/")[0]

    try:
        # Display name: prefer the @handle, then the raw channel id.
        if "/@" in channel_url:
            channel_name = "@" + _segment_after("/@")
        elif "/channel/" in channel_url:
            channel_name = _segment_after("/channel/")
        else:
            channel_name = "Unknown"
        # Cache-key id: prefer the raw channel id, then the @handle.
        if "/channel/" in channel_url:
            channel_id = _segment_after("/channel/")
        elif "/@" in channel_url:
            channel_id = "@" + _segment_after("/@")
        else:
            channel_id = channel_url
        return channel_name, channel_id
    except Exception as e:
        print(f"❌ Failed to get channel info: {e}")
        return "Unknown", channel_url
def get_playlist_info(
    playlist_url: str, yt_dlp_path: str = "downloader/yt-dlp.exe"
) -> List[Dict[str, Any]]:
    """Fetch flat-playlist metadata via yt-dlp, one dict per video.

    Returns an empty list if yt-dlp exits with a non-zero status.
    """
    cmd = [yt_dlp_path, "--dump-json", "--flat-playlist", playlist_url]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to get playlist info: {e}")
        return []
    # yt-dlp emits one JSON object per line.
    return [
        json.loads(line)
        for line in proc.stdout.strip().split("\n")
        if line.strip()
    ]
def build_yt_dlp_command(
    yt_dlp_path: str,
    video_url: str,
    output_path: Path,
    config: Union[AppConfig, Dict[str, Any]],
    additional_args: Optional[List[str]] = None,
) -> List[str]:
    """
    Build a standardized yt-dlp command with consistent arguments.

    Args:
        yt_dlp_path: Path to yt-dlp executable
        video_url: YouTube video URL
        output_path: Output file path
        config: AppConfig instance or plain dict with download settings
        additional_args: Optional additional arguments to append

    Returns:
        List of command arguments for subprocess.run
    """
    # Honor the declared Union type: a plain dict config previously raised
    # AttributeError because only attribute access was implemented.
    if isinstance(config, dict):
        download_format = config["download_settings"]["format"]
    else:
        download_format = config.download_settings.format
    cmd = [
        str(yt_dlp_path),
        "--no-check-certificates",
        "--ignore-errors",
        "--no-warnings",
        "-o",
        str(output_path),
        "-f",
        download_format,
        video_url,
    ]
    # Add any additional arguments
    if additional_args:
        cmd.extend(additional_args)
    return cmd
def execute_yt_dlp_command(
    cmd: List[str], timeout: Optional[int] = None
) -> subprocess.CompletedProcess:
    """
    Run a yt-dlp command, capturing text output and raising on failure.

    Args:
        cmd: Full command list (executable first).
        timeout: Optional timeout in seconds.

    Returns:
        CompletedProcess with captured stdout/stderr.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        subprocess.TimeoutExpired: If the command times out.
    """
    run_options = {
        "capture_output": True,
        "text": True,
        "check": True,
        "timeout": timeout,
    }
    return subprocess.run(cmd, **run_options)
def show_available_formats(
    video_url: str, yt_dlp_path: str = "downloader/yt-dlp.exe", timeout: int = 30
) -> None:
    """
    Print yt-dlp's format listing for a video (debugging utility).

    Best-effort: any failure is reported, never raised.

    Args:
        video_url: YouTube video URL
        yt_dlp_path: Path to yt-dlp executable
        timeout: Timeout in seconds
    """
    print(f"🔍 Checking available formats for: {video_url}")
    listing_cmd = [str(yt_dlp_path), "--list-formats", video_url]
    try:
        listing = subprocess.run(
            listing_cmd, capture_output=True, text=True, timeout=timeout
        )
        print(f"📋 Available formats:\n{listing.stdout}")
    except Exception as e:
        print(f"⚠️ Could not check formats: {e}")

112
pyproject.toml Normal file
View File

@ -0,0 +1,112 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "karaoke-downloader"
version = "3.3.0"
description = "A Python-based Windows CLI tool to download karaoke videos from YouTube channels/playlists"
authors = [{name = "TeamHearse"}]
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"mutagen",
"rapidfuzz",
]
[project.optional-dependencies]
dev = [
"mypy",
"flake8",
"black",
"isort",
"pytest",
"pytest-cov",
]
[tool.black]
line-length = 88
target-version = ['py39']
include = '\.pyi?$'
extend-exclude = '''
/(
# directories
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| build
| dist
)/
'''
[tool.isort]
profile = "black"
multi_line_output = 3
line_length = 88
known_first_party = ["karaoke_downloader"]
known_third_party = ["mutagen", "rapidfuzz"]
sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
[tool.mypy]
python_version = "3.9"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
warn_unreachable = true
strict_equality = true
show_error_codes = true
[[tool.mypy.overrides]]
module = [
    "mutagen.*",
    "rapidfuzz.*",
    "cv2.*",
]
ignore_missing_imports = true
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
"--strict-markers",
"--strict-config",
"--cov=karaoke_downloader",
"--cov-report=term-missing",
"--cov-report=html",
"--cov-report=xml",
]
[tool.coverage.run]
source = ["karaoke_downloader"]
omit = [
"*/tests/*",
"*/test_*",
"*/__pycache__/*",
]
[tool.coverage.report]
exclude_lines = [
"pragma: no cover",
"def __repr__",
"if self.debug:",
"if settings.DEBUG",
"raise AssertionError",
"raise NotImplementedError",
"if 0:",
"if __name__ == .__main__.:",
"class .*\\bProtocol\\):",
"@(abc\\.)?abstractmethod",
]