From cf8f4e2be940b7ab5bc5119ff9dac6c3dcd90ef4 Mon Sep 17 00:00:00 2001
From: mbrucedogs
Date: Thu, 24 Jul 2025 10:30:42 -0500
Subject: [PATCH] Signed-off-by: mbrucedogs

---
Reviewer notes (text in this section is ignored by git-am):

* Restored the line structure of the patch, which had been flattened.
* data/deduplicate_songlist_tracking.py: a null "downloaded_at", "artist"
  or "title" value no longer raises; the file now ends with a newline.
* data/songlist_report.py: downloaded songs are matched by a key built from
  each entry's artist/title (raw tracking key as fallback) instead of the
  raw tracking key alone; entries without artist/title no longer raise
  KeyError in the sample listing; the file now ends with a newline.
* Context-line indentation in the two karaoke_downloader hunks was
  reconstructed from the Python block structure; check it against the tree
  before applying.
* The "index" hashes are unchanged and no longer describe the revised new
  files.

 data/deduplicate_songlist_tracking.py  | 44 ++++++++++++
 data/songlist_report.py                | 90 +++++++++++++++++++++++++
 karaoke_downloader/downloader.py       |  3 +-
 karaoke_downloader/songlist_manager.py | 13 ++--
 4 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 data/deduplicate_songlist_tracking.py
 create mode 100644 data/songlist_report.py

diff --git a/data/deduplicate_songlist_tracking.py b/data/deduplicate_songlist_tracking.py
new file mode 100644
index 0000000..aa69ccc
--- /dev/null
+++ b/data/deduplicate_songlist_tracking.py
@@ -0,0 +1,44 @@
+"""Collapse duplicate entries in the songlist tracking file.
+
+Entries are re-keyed by lower-cased artist plus normalized title. When two
+entries share a key, the one with the later ``downloaded_at`` value is kept.
+"""
+import json
+from pathlib import Path
+
+
+def normalize_title(title):
+    """Return *title* lower-cased with karaoke suffixes and extra spaces removed."""
+    normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
+    return " ".join(normalized.split()).lower()
+
+
+def _downloaded_at(entry):
+    """Return the entry's ``downloaded_at`` value, or "" when missing or null."""
+    return entry.get("downloaded_at") or ""
+
+
+def deduplicate_songlist_tracking(tracking_path="data/songlist_tracking.json"):
+    """Rewrite *tracking_path* in place with one entry per artist/title key."""
+    tracking_file = Path(tracking_path)
+    if not tracking_file.exists():
+        print(f"File not found: {tracking_path}")
+        return
+    with open(tracking_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    deduped = {}
+    for entry in data.values():
+        artist = (entry.get("artist") or "").strip()
+        title = (entry.get("title") or "").strip()
+        key = f"{artist.lower()}_{normalize_title(title)}"
+        # Keep the latest entry (by downloaded_at). A null or missing timestamp
+        # compares as "", so it never raises and never displaces a dated entry.
+        if key not in deduped or _downloaded_at(entry) > _downloaded_at(deduped[key]):
+            deduped[key] = entry
+    with open(tracking_file, 'w', encoding='utf-8') as f:
+        json.dump(deduped, f, indent=2, ensure_ascii=False)
+    print(f"Deduplicated {len(data)} -> {len(deduped)} entries in {tracking_path}")
+
+
+if __name__ == "__main__":
+    deduplicate_songlist_tracking()
diff --git a/data/songlist_report.py b/data/songlist_report.py
new file mode 100644
index 0000000..2ff1e0a
--- /dev/null
+++ b/data/songlist_report.py
@@ -0,0 +1,90 @@
+"""Report songlist progress: raw, unique, downloaded and remaining songs."""
+import json
+from pathlib import Path
+
+
+def normalize_title(title):
+    """Return *title* lower-cased with karaoke suffixes and extra spaces removed."""
+    normalized = title.replace("(Karaoke Version)", "").replace("(Karaoke)", "").strip()
+    return " ".join(normalized.split()).lower()
+
+
+def song_key(artist, title):
+    """Return the artist/title key used to match songlist and tracking entries."""
+    return f"{artist.strip().lower()}_{normalize_title(title)}"
+
+
+def load_songlist(songlist_path="data/songList.json"):
+    """Return every song that has an artist and a title in *songlist_path*.
+
+    Returns an empty list when the file does not exist.
+    """
+    songlist_file = Path(songlist_path)
+    if not songlist_file.exists():
+        print(f"⚠️ Songlist file not found: {songlist_path}")
+        return []
+    with open(songlist_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    all_songs = []
+    for category in data:
+        if "songs" in category:
+            for song in category["songs"]:
+                if "artist" in song and "title" in song:
+                    all_songs.append({
+                        "artist": song["artist"].strip(),
+                        "title": song["title"].strip(),
+                        "position": song.get("position", 0)
+                    })
+    return all_songs
+
+
+def load_songlist_tracking(tracking_path="data/songlist_tracking.json"):
+    """Return the tracking mapping from *tracking_path*, or {} if it is absent."""
+    tracking_file = Path(tracking_path)
+    if not tracking_file.exists():
+        return {}
+    with open(tracking_file, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def main():
+    """Load the songlist and tracking data and print the progress report."""
+    all_songs = load_songlist()
+    print(f"Total songs in songList.json (raw): {len(all_songs)}")
+
+    # Deduplicate
+    unique_songs = {}
+    for song in all_songs:
+        unique_songs.setdefault(song_key(song["artist"], song["title"]), song)
+    print(f"Unique songs (by artist+title): {len(unique_songs)}")
+
+    # Downloaded songs. Match on keys derived from each entry's artist/title:
+    # the raw tracking keys may not be in normalized form. Entries without an
+    # artist or title fall back to their raw key.
+    tracking = load_songlist_tracking()
+    downloaded = {}
+    for raw_key, entry in tracking.items():
+        artist, title = entry.get("artist"), entry.get("title")
+        key = song_key(artist, title) if artist and title else raw_key
+        downloaded.setdefault(key, entry)
+    print(f"Downloaded songs: {len(downloaded)}")
+
+    # Songs left to download
+    left_to_download = [song for key, song in unique_songs.items() if key not in downloaded]
+    print(f"Songs left to download: {len(left_to_download)}")
+
+    print("\nSample unique songs:")
+    for song in list(unique_songs.values())[:10]:
+        print(f" - {song['artist']} - {song['title']}")
+
+    print("\nSample downloaded songs:")
+    for entry in list(downloaded.values())[:10]:
+        print(f" - {entry.get('artist', '')} - {entry.get('title', '')}")
+
+    print("\nSample songs left to download:")
+    for song in left_to_download[:10]:
+        print(f" - {song['artist']} - {song['title']}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/karaoke_downloader/downloader.py b/karaoke_downloader/downloader.py
index c5853de..3906337 100644
--- a/karaoke_downloader/downloader.py
+++ b/karaoke_downloader/downloader.py
@@ -253,7 +253,8 @@ class KaraokeDownloader:
             return False
         # Filter for songs not yet downloaded
         undownloaded = [s for s in songlist if not is_songlist_song_downloaded(self.songlist_tracking, s['artist'], s['title'])]
-        print(f"🎯 {len(undownloaded)} songlist songs to download.")
+        print(f"🎯 {len(songlist)} total unique songs in songlist.")
+        print(f"🎯 {len(undownloaded)} unique songlist songs to download.")
         if not undownloaded:
             print("🎵 All songlist songs already downloaded.")
             return True
diff --git a/karaoke_downloader/songlist_manager.py b/karaoke_downloader/songlist_manager.py
index 898e2d4..fd7ea85 100644
--- a/karaoke_downloader/songlist_manager.py
+++ b/karaoke_downloader/songlist_manager.py
@@ -11,16 +11,23 @@ def load_songlist(songlist_path="data/songList.json"):
         with open(songlist_file, 'r', encoding='utf-8') as f:
             data = json.load(f)
         all_songs = []
+        seen = set()
         for category in data:
             if "songs" in category:
                 for song in category["songs"]:
                     if "artist" in song and "title" in song:
+                        artist = song["artist"].strip()
+                        title = song["title"].strip()
+                        key = f"{artist.lower()}_{normalize_title(title)}"
+                        if key in seen:
+                            continue
+                        seen.add(key)
                         all_songs.append({
-                            "artist": song["artist"].strip(),
-                            "title": song["title"].strip(),
+                            "artist": artist,
+                            "title": title,
                             "position": song.get("position", 0)
                         })
-        print(f"📋 Loaded {len(all_songs)} songs from songlist")
+        print(f"📋 Loaded {len(all_songs)} unique songs from songlist (deduplicated)")
         return all_songs
     except (json.JSONDecodeError, FileNotFoundError) as e:
         print(f"⚠️ Could not load songlist: {e}")