""" Download plan building utilities. Handles pre-scanning channels and building download plans. """ from karaoke_downloader.cache_manager import ( delete_plan_cache, get_download_plan_cache_file, load_cached_plan, save_plan_cache, ) from karaoke_downloader.fuzzy_matcher import ( create_song_key, extract_artist_title, get_similarity_function, is_exact_match, is_fuzzy_match, ) from karaoke_downloader.youtube_utils import get_channel_info # Constants DEFAULT_FILENAME_LENGTH_LIMIT = 100 DEFAULT_ARTIST_LENGTH_LIMIT = 30 DEFAULT_TITLE_LENGTH_LIMIT = 60 DEFAULT_FUZZY_THRESHOLD = 85 def build_download_plan( channel_urls, undownloaded, tracker, yt_dlp_path, fuzzy_match=False, fuzzy_threshold=DEFAULT_FUZZY_THRESHOLD, ): """ For each song in undownloaded, scan all channels for a match. Use fuzzy matching if enabled. Return (download_plan, unmatched_songs): - download_plan: list of dicts {artist, title, channel_name, channel_url, video_id, video_title, match_score} - unmatched_songs: list of songs not found in any channel """ plan = [] unmatched = [] channel_match_counts = {} # Pre-process songlist for O(1) lookups song_keys = set() song_lookup = {} for song in undownloaded: key = create_song_key(song["artist"], song["title"]) song_keys.add(key) song_lookup[key] = song for i, channel_url in enumerate(channel_urls, 1): print(f"\n🚦 Starting channel {i}/{len(channel_urls)}: {channel_url}") print(f" šŸ” Getting channel info...") channel_name, channel_id = get_channel_info(channel_url) print(f" āœ… Channel info: {channel_name} (ID: {channel_id})") print(f" šŸ” Fetching video list from channel...") available_videos = tracker.get_channel_video_list( channel_url, yt_dlp_path=str(yt_dlp_path), force_refresh=False ) print( f" šŸ“Š Channel has {len(available_videos)} videos to scan against {len(undownloaded)} songlist songs" ) matches_this_channel = 0 video_matches = [] # Initialize video_matches for this channel # Pre-process video titles for efficient matching if fuzzy_match: # For fuzzy matching, create normalized video keys for video in available_videos: v_artist, v_title = extract_artist_title(video["title"]) video_key = create_song_key(v_artist, v_title) # Find best match among remaining songs best_match = None best_score = 0 for song_key in song_keys: if song_key in song_lookup: # Only check unmatched songs score = get_similarity_function()(song_key, video_key) if score >= fuzzy_threshold and score > best_score: best_score = score best_match = song_key if best_match: song = song_lookup[best_match] video_matches.append( { "artist": song["artist"], "title": song["title"], "channel_name": channel_name, "channel_url": channel_url, "video_id": video["id"], "video_title": video["title"], "match_score": best_score, } ) # Remove matched song from future consideration del song_lookup[best_match] song_keys.remove(best_match) matches_this_channel += 1 else: # For exact matching, use direct key comparison for video in available_videos: v_artist, v_title = extract_artist_title(video["title"]) video_key = create_song_key(v_artist, v_title) if video_key in song_keys: song = song_lookup[video_key] video_matches.append( { "artist": song["artist"], "title": song["title"], "channel_name": channel_name, "channel_url": channel_url, "video_id": video["id"], "video_title": video["title"], "match_score": 100, } ) # Remove matched song from future consideration del song_lookup[video_key] song_keys.remove(video_key) matches_this_channel += 1 # Add matches to plan plan.extend(video_matches) # Print match count once per channel channel_match_counts[channel_name] = matches_this_channel print(f" → Found {matches_this_channel} songlist matches in this channel.") # Remaining unmatched songs unmatched = list(song_lookup.values()) # Print summary table print("\nšŸ“Š Channel match summary:") for channel, count in channel_match_counts.items(): print(f" {channel}: {count} matches") print( f" TOTAL: {sum(channel_match_counts.values())} matches across {len(channel_match_counts)} channels." ) return plan, unmatched