KaraokeVideoDownloader/utilities/deduplicate_songlist_tracking.py

28 lines
1.2 KiB
Python

import json
from pathlib import Path
def normalize_title(title):
    """Return a canonical form of a song title for duplicate detection.

    The title is lower-cased, the markers "(karaoke version)" and
    "(karaoke)" are removed, and every run of whitespace is collapsed to
    a single space with leading/trailing whitespace dropped.

    Lower-casing happens *before* the markers are removed so that
    variants such as "(karaoke version)" or "(KARAOKE)" are stripped as
    well, not only the exact capitalisation "(Karaoke Version)".

    Args:
        title: The raw title string.

    Returns:
        The normalized, lower-case title.
    """
    lowered = title.lower()
    # Remove the longer marker first; the two markers never overlap, but
    # this keeps the intent obvious.
    for marker in ("(karaoke version)", "(karaoke)"):
        lowered = lowered.replace(marker, "")
    # split() with no argument drops leading/trailing whitespace and
    # collapses internal runs, so no separate strip() is needed.
    return " ".join(lowered.split())
def deduplicate_songlist_tracking(tracking_path="data/songlist_tracking.json"):
    """Remove duplicate songs from the tracking JSON file, rewriting it in place.

    The file is loaded as a JSON object and its values are treated as
    entry objects. Entries are grouped by a composite key built from the
    lower-cased artist and the normalized title (see ``normalize_title``).
    Within a group, an entry replaces the one kept so far only when its
    ``downloaded_at`` value is truthy and compares strictly greater, so
    on ties or missing values the first entry seen is kept.

    NOTE(review): ``downloaded_at`` values are compared with ``>`` as
    stored; this presumably relies on them being sortable timestamp
    strings (e.g. ISO 8601) -- confirm against the writer of this file.

    The rewritten file is keyed by the composite key, not by the keys
    the entries had in the input file.

    Args:
        tracking_path: Path to the tracking JSON file.

    Returns:
        None. Prints a "File not found" message and returns early when
        the file does not exist; otherwise prints a before/after count.
    """
    tracking_file = Path(tracking_path)
    if not tracking_file.exists():
        print(f"File not found: {tracking_path}")
        return

    with open(tracking_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    deduped = {}
    for entry in data.values():
        # `or ""` covers an explicit JSON null as well as a missing key;
        # .get(key, "") alone would return None for null and crash on strip().
        artist = (entry.get("artist") or "").strip()
        title = (entry.get("title") or "").strip()
        key = f"{artist.lower()}_{normalize_title(title)}"

        if key not in deduped:
            deduped[key] = entry
            continue

        # Always keep the latest entry (by downloaded_at). A null stamp on
        # the kept entry is treated like a missing one instead of raising
        # TypeError in the comparison.
        stamp = entry.get("downloaded_at")
        kept_stamp = deduped[key].get("downloaded_at") or ""
        if stamp and stamp > kept_stamp:
            deduped[key] = entry

    # Serialize before opening the file for writing: opening with 'w'
    # truncates immediately, so a serialization error must not be able to
    # leave the tracking file empty or half-written.
    payload = json.dumps(deduped, indent=2, ensure_ascii=False)
    with open(tracking_file, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"Deduplicated {len(data)} -> {len(deduped)} entries in {tracking_path}")
# Script entry point: when run directly (not imported), deduplicate the
# tracking file at the function's default path.
if __name__ == "__main__":
    deduplicate_songlist_tracking()