Signed-off-by: mbrucedogs <mbrucedogs@gmail.com>
This commit is contained in:
parent
820a885f9f
commit
cf8f4e2be9
28
data/deduplicate_songlist_tracking.py
Normal file
28
data/deduplicate_songlist_tracking.py
Normal file
@ -0,0 +1,28 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def normalize_title(title):
    """Return a canonical, lowercase form of a song title for matching.

    The literal markers "(Karaoke Version)" and "(Karaoke)" are removed
    (the match is case-sensitive), runs of whitespace are collapsed to a
    single space, surrounding whitespace is dropped, and the result is
    lowercased.
    """
    cleaned = title
    for marker in ("(Karaoke Version)", "(Karaoke)"):
        cleaned = cleaned.replace(marker, "")
    # split() with no arguments discards leading/trailing whitespace and
    # collapses internal runs, so no separate strip() is required.
    words = cleaned.split()
    return " ".join(words).lower()
|
||||
|
||||
def deduplicate_songlist_tracking(tracking_path="data/songlist_tracking.json"):
    """Collapse duplicate entries in the songlist tracking file, in place.

    Entries are grouped under the key
    ``"<artist lowercased>_<normalized title>"``. When several entries share
    a key, the one with the greatest ``downloaded_at`` value is kept; an
    entry without a timestamp never replaces one that is already kept. The
    file is then rewritten with the surviving entries stored under those
    keys, and a summary line is printed.

    Args:
        tracking_path: Path to the tracking JSON file. The file is expected
            to hold a JSON object whose values are entry objects.

    Returns:
        None. If the file does not exist, a message is printed and nothing
        is written.
    """
    tracking_file = Path(tracking_path)
    if not tracking_file.exists():
        print(f"File not found: {tracking_path}")
        return

    with open(tracking_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    deduped = {}
    for entry in data.values():
        # `or ""` covers both a missing field and one stored as JSON null,
        # which would otherwise fail on .strip().
        artist = (entry.get("artist") or "").strip()
        title = (entry.get("title") or "").strip()
        key = f"{artist.lower()}_{normalize_title(title)}"

        kept = deduped.get(key)
        if kept is None:
            deduped[key] = entry
            continue

        # Always keep the latest entry (by downloaded_at). A kept entry
        # whose timestamp is missing, empty, or null is treated as older
        # than any timestamped candidate instead of being compared directly
        # (str > None raises TypeError).
        candidate_ts = entry.get("downloaded_at")
        kept_ts = kept.get("downloaded_at")
        if candidate_ts and (not kept_ts or candidate_ts > kept_ts):
            deduped[key] = entry

    with open(tracking_file, 'w', encoding='utf-8') as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)

    print(f"Deduplicated {len(data)} -> {len(deduped)} entries in {tracking_path}")
|
||||
|
||||
# Script entry point: deduplicate the default tracking file only when this
# module is executed directly, not when it is imported.
if __name__ == "__main__":
    deduplicate_songlist_tracking()
|
||||
71
data/songlist_report.py
Normal file
71
data/songlist_report.py
Normal file
@ -0,0 +1,71 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def normalize_title(title):
    """Build the comparison form of a title used in artist/title keys.

    Removes the exact substrings "(Karaoke Version)" and "(Karaoke)",
    trims the ends, squeezes any whitespace run down to one space, and
    lowercases the result.
    """
    without_version_tag = title.replace("(Karaoke Version)", "")
    without_tags = without_version_tag.replace("(Karaoke)", "")
    collapsed = " ".join(without_tags.strip().split())
    return collapsed.lower()
|
||||
|
||||
def load_songlist(songlist_path="data/songList.json"):
    """Read the songlist JSON file and flatten it into a list of songs.

    The file is iterated as a sequence of categories. Every item under a
    category's "songs" key that has both "artist" and "title" contributes
    one dict with the stripped artist, the stripped title, and its
    "position" (0 when absent). Categories without "songs" and songs
    lacking either field are ignored.

    Args:
        songlist_path: Path to the songlist JSON file.

    Returns:
        A list of ``{"artist", "title", "position"}`` dicts in file order,
        or an empty list (after printing a warning) if the file is missing.
    """
    source = Path(songlist_path)
    if not source.exists():
        print(f"⚠️ Songlist file not found: {songlist_path}")
        return []

    with source.open('r', encoding='utf-8') as handle:
        categories = json.load(handle)

    flattened = []
    for category in categories:
        if "songs" not in category:
            continue
        for song in category["songs"]:
            if "artist" not in song or "title" not in song:
                continue
            flattened.append(
                {
                    "artist": song["artist"].strip(),
                    "title": song["title"].strip(),
                    "position": song.get("position", 0),
                }
            )
    return flattened
|
||||
|
||||
def load_songlist_tracking(tracking_path="data/songlist_tracking.json"):
    """Return the parsed contents of the tracking JSON file.

    Args:
        tracking_path: Path to the tracking JSON file.

    Returns:
        Whatever the file decodes to, or an empty dict when the file does
        not exist.
    """
    source = Path(tracking_path)
    if source.exists():
        with source.open('r', encoding='utf-8') as handle:
            return json.load(handle)
    return {}
|
||||
|
||||
def main():
    """Print a summary of songlist size, downloads, and remaining songs.

    Loads the songlist and the tracking data from their default paths,
    deduplicates the songlist by ``"<artist lowercased>_<normalized title>"``
    (first occurrence wins), treats every tracking key as a downloaded song,
    and prints the counts followed by up to ten sample rows per group.
    """

    def print_sample(header, rows):
        # Shared formatter for the three sample listings below.
        print(header)
        for row in rows:
            print(f" - {row['artist']} - {row['title']}")

    songs = load_songlist()
    print(f"Total songs in songList.json (raw): {len(songs)}")

    # Deduplicate: setdefault keeps the first song seen for each key.
    unique_songs = {}
    for song in songs:
        song_key = f"{song['artist'].lower()}_{normalize_title(song['title'])}"
        unique_songs.setdefault(song_key, song)
    print(f"Unique songs (by artist+title): {len(unique_songs)}")

    # Downloaded songs
    tracking = load_songlist_tracking()
    downloaded_keys = set(tracking.keys())
    print(f"Downloaded songs: {len(downloaded_keys)}")

    # Songs left to download
    left_to_download = []
    for song_key, song in unique_songs.items():
        if song_key in downloaded_keys:
            continue
        left_to_download.append(song)
    print(f"Songs left to download: {len(left_to_download)}")

    print_sample("\nSample unique songs:", list(unique_songs.values())[:10])
    # The sample order follows set iteration order, so it is arbitrary.
    print_sample(
        "\nSample downloaded songs:",
        [tracking[song_key] for song_key in list(downloaded_keys)[:10]],
    )
    print_sample("\nSample songs left to download:", left_to_download[:10])
|
||||
|
||||
# Script entry point: print the report only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
@ -253,7 +253,8 @@ class KaraokeDownloader:
|
||||
return False
|
||||
# Filter for songs not yet downloaded
|
||||
undownloaded = [s for s in songlist if not is_songlist_song_downloaded(self.songlist_tracking, s['artist'], s['title'])]
|
||||
print(f"🎯 {len(undownloaded)} songlist songs to download.")
|
||||
print(f"🎯 {len(songlist)} total unique songs in songlist.")
|
||||
print(f"🎯 {len(undownloaded)} unique songlist songs to download.")
|
||||
if not undownloaded:
|
||||
print("🎵 All songlist songs already downloaded.")
|
||||
return True
|
||||
|
||||
@ -11,16 +11,23 @@ def load_songlist(songlist_path="data/songList.json"):
|
||||
with open(songlist_file, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
all_songs = []
|
||||
seen = set()
|
||||
for category in data:
|
||||
if "songs" in category:
|
||||
for song in category["songs"]:
|
||||
if "artist" in song and "title" in song:
|
||||
artist = song["artist"].strip()
|
||||
title = song["title"].strip()
|
||||
key = f"{artist.lower()}_{normalize_title(title)}"
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
all_songs.append({
|
||||
"artist": song["artist"].strip(),
|
||||
"title": song["title"].strip(),
|
||||
"artist": artist,
|
||||
"title": title,
|
||||
"position": song.get("position", 0)
|
||||
})
|
||||
print(f"📋 Loaded {len(all_songs)} songs from songlist")
|
||||
print(f"📋 Loaded {len(all_songs)} unique songs from songlist (deduplicated)")
|
||||
return all_songs
|
||||
except (json.JSONDecodeError, FileNotFoundError) as e:
|
||||
print(f"⚠️ Could not load songlist: {e}")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user