KaraokeVideoDownloader/karaoke_downloader/fuzzy_matcher.py

101 lines
2.9 KiB
Python

"""
Fuzzy matching utilities for songlist-to-video matching.
Handles similarity calculations and match validation.
"""
def get_similarity_function():
    """
    Pick a string-similarity scorer based on which libraries can be imported.

    Returns:
        callable: ``similarity(a, b)``. When ``rapidfuzz`` is importable the
        scorer returns ``fuzz.ratio(a, b)`` unchanged; otherwise it returns
        ``difflib.SequenceMatcher(None, a, b).ratio()`` multiplied by 100
        and truncated to an ``int``.
    """
    try:
        from rapidfuzz import fuzz
    except ImportError:
        # rapidfuzz is optional; the standard library provides the fallback.
        import difflib

        def similarity(a, b):
            matcher = difflib.SequenceMatcher(None, a, b)
            return int(matcher.ratio() * 100)
    else:
        def similarity(a, b):
            return fuzz.ratio(a, b)

    return similarity
def normalize_title(title):
    """Normalize a title for comparison.

    The title is lowercased, the "(Karaoke Version)" and "(Karaoke)" markers
    are removed regardless of their letter case, and every run of whitespace
    is collapsed to a single space with leading/trailing whitespace dropped.

    Args:
        title: Raw title string.

    Returns:
        str: The normalized, lowercase title.
    """
    # Lowercase before stripping markers so that variants such as
    # "(karaoke)" or "(KARAOKE VERSION)" are removed as well.
    lowered = title.lower()
    for marker in ("(karaoke version)", "(karaoke)"):
        lowered = lowered.replace(marker, "")
    # split()/join collapses internal whitespace and trims both ends.
    return " ".join(lowered.split())
def extract_artist_title(video_title):
    """Split a video title into ``(artist, title)`` at the first " - ".

    Args:
        video_title: Video title, e.g. "Artist - Song".

    Returns:
        tuple: ``(artist, title)`` with surrounding whitespace stripped from
        both parts. When the title contains no " - " separator the artist is
        an empty string and the title is returned unchanged.
    """
    separator = " - "
    if separator not in video_title:
        return "", video_title
    # Only the first separator splits; later ones remain part of the title.
    artist, title = video_title.split(separator, 1)
    return artist.strip(), title.strip()
def create_song_key(artist, title):
    """Build the comparison key for a songlist entry.

    Args:
        artist: Artist name from the songlist.
        title: Song title from the songlist.

    Returns:
        str: The lowercased artist and the normalized title joined by "_".
    """
    artist_part = artist.lower()
    title_part = normalize_title(title)
    return "_".join((artist_part, title_part))
def create_video_key(video_title):
    """Build the comparison key for a video title.

    The title is first split into artist and song title with
    ``extract_artist_title``.

    Args:
        video_title: Video title to derive the key from.

    Returns:
        str: The lowercased artist and the normalized song title joined by "_".
    """
    performer, song = extract_artist_title(video_title)
    return "_".join((performer.lower(), normalize_title(song)))
def is_fuzzy_match(songlist_artist, songlist_title, video_title, threshold=90):
    """
    Decide whether a songlist entry and a video title are similar enough.

    The songlist entry and the video title are each reduced to a comparison
    key, and the two keys are scored with the function returned by
    ``get_similarity_function``.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title
        threshold: Minimum similarity score (0-100)

    Returns:
        tuple: (is_match, score) where is_match is True when score is at
        least ``threshold`` and score is the similarity score
    """
    scorer = get_similarity_function()
    expected_key = create_song_key(songlist_artist, songlist_title)
    candidate_key = create_video_key(video_title)
    score = scorer(expected_key, candidate_key)
    return score >= threshold, score
def is_exact_match(songlist_artist, songlist_title, video_title):
    """
    Decide whether a video title corresponds exactly to a songlist entry.

    All comparisons are made on values passed through ``normalize_title``.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title

    Returns:
        bool: True when the artist and title extracted from the video title
        both equal the songlist values, or when the whole video title equals
        "<artist> - <title>" built from the songlist entry; False otherwise
    """
    video_artist, video_song = extract_artist_title(video_title)

    # First strategy: compare the extracted artist and title piece by piece.
    same_artist = normalize_title(video_artist) == normalize_title(songlist_artist)
    same_title = normalize_title(video_song) == normalize_title(songlist_title)
    if same_artist and same_title:
        return True

    # Second strategy: compare the complete video title to "artist - title".
    expected_full = normalize_title(f"{songlist_artist} - {songlist_title}")
    return normalize_title(video_title) == expected_full