# KaraokeVideoDownloader/karaoke_downloader/fuzzy_matcher.py
#
# 138 lines
# 4.4 KiB
# Python

"""
Fuzzy matching utilities for songlist-to-video matching.
Handles similarity calculations and match validation.
"""
def get_similarity_function():
    """
    Return a callable that scores how alike two strings are.

    The returned ``similarity(a, b)`` wraps ``rapidfuzz.fuzz.ratio`` when
    rapidfuzz can be imported. Otherwise it wraps
    ``difflib.SequenceMatcher``: the ratio is multiplied by 100 and
    truncated to an int.
    """
    try:
        from rapidfuzz import fuzz
    except ImportError:
        # rapidfuzz is optional; fall back to the standard library.
        import difflib

        def similarity(a, b):
            matcher = difflib.SequenceMatcher(None, a, b)
            return int(matcher.ratio() * 100)
    else:
        def similarity(a, b):
            return fuzz.ratio(a, b)

    return similarity
def normalize_title(title):
    """
    Normalize a title for comparison.

    Lowercases the text, removes the "(karaoke version)" and "(karaoke)"
    markers however they were capitalized, and collapses every run of
    whitespace into a single space (leading/trailing whitespace is dropped).

    Args:
        title: Raw title string.

    Returns:
        str: The lowercased, marker-free, whitespace-collapsed title.
    """
    # Lowercase first so marker removal is case-insensitive: the result is
    # lowercased anyway, and titles spell the marker in many ways
    # ("(KARAOKE)", "(karaoke version)", ...).
    cleaned = title.lower()
    for marker in ("(karaoke version)", "(karaoke)"):
        cleaned = cleaned.replace(marker, "")
    # split() with no argument also trims the ends, so no strip() is needed.
    return " ".join(cleaned.split())
def extract_artist_title(video_title):
    """
    Split a video title into an ``(artist, title)`` pair.

    Formats are tried in this order; the first one that applies wins:

    1. ``"<first> - <second>"``: the text before the first " - " is
       returned as the artist and the text after it as the title.
    2. ``"Title Karaoke | Artist Karaoke Version"``: the text before the
       first " | " is the title (with "Karaoke" removed) and the text after
       it is the artist (with "Karaoke Version" removed).
    3. ``"Title Artist KARAOKE"``: a rough heuristic. The single word just
       before the karaoke marker word is taken as the artist, and all words
       before that as the title. Multi-word artists are therefore split
       incorrectly.

    Args:
        video_title: Video title string to parse.

    Returns:
        tuple: ``(artist, title)``. When no format is recognized the artist
        is ``""`` and the title is ``video_title`` unchanged.
    """
    # Format 1: the part before the first " - " is returned as the artist.
    if " - " in video_title:
        first, _, second = video_title.partition(" - ")
        return first.strip(), second.strip()

    has_marker = "karaoke" in video_title.lower()

    # Format 2: "Title Karaoke | Artist Karaoke Version".
    if " | " in video_title and has_marker:
        title_part, _, artist_part = video_title.partition(" | ")
        title = title_part.strip().replace("Karaoke", "").strip()
        artist = artist_part.strip().replace("Karaoke Version", "").strip()
        return artist, title

    # Format 3: "Title Artist KARAOKE".
    if has_marker:
        words = video_title.split()
        for i, word in enumerate(words):
            # At least one title word and one artist word must come before
            # the marker; an earlier marker word is skipped.
            if i >= 2 and "karaoke" in word.lower():
                # Only the word just before the marker is the artist. The
                # marker and anything after it are not part of the name.
                return words[i - 1], " ".join(words[: i - 1])

    # Nothing recognizable: empty artist, full title.
    return "", video_title
def create_song_key(artist, title):
    """
    Build the comparison key for a songlist entry.

    The key is the lowercased artist and the ``normalize_title`` form of the
    title, joined by an underscore.
    """
    artist_part = artist.lower()
    title_part = normalize_title(title)
    return "{}_{}".format(artist_part, title_part)
def create_video_key(video_title):
    """
    Build the comparison key for a video title.

    The title is parsed with ``extract_artist_title``. The key is then the
    lowercased artist and the ``normalize_title`` form of the title, joined
    by an underscore.
    """
    parsed_artist, parsed_title = extract_artist_title(video_title)
    artist_part = parsed_artist.lower()
    title_part = normalize_title(parsed_title)
    return "{}_{}".format(artist_part, title_part)
def is_fuzzy_match(songlist_artist, songlist_title, video_title, threshold=90):
    """
    Decide whether a songlist entry fuzzily matches a video title.

    Both sides are reduced to comparison keys (``create_song_key`` and
    ``create_video_key``), and the keys are scored with the function
    returned by ``get_similarity_function``.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title
        threshold: Minimum similarity score (0-100) that counts as a match

    Returns:
        tuple: (is_match, score) where is_match is True when score is at
        least ``threshold`` and score is the similarity score
    """
    score_fn = get_similarity_function()
    score = score_fn(
        create_song_key(songlist_artist, songlist_title),
        create_video_key(video_title),
    )
    return score >= threshold, score
def is_exact_match(songlist_artist, songlist_title, video_title):
    """
    Decide whether a songlist entry exactly matches a video title.

    All comparisons are made on ``normalize_title`` output. The entry
    matches when either:

    * the artist and title parsed from the video title by
      ``extract_artist_title`` both equal the songlist artist and title, or
    * the whole video title equals "<songlist_artist> - <songlist_title>".

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title

    Returns:
        bool: True if exact match, False otherwise
    """
    video_artist, video_name = extract_artist_title(video_title)

    # First check: parsed artist and parsed title must both agree.
    parsed_pair = (normalize_title(video_artist), normalize_title(video_name))
    wanted_pair = (
        normalize_title(songlist_artist),
        normalize_title(songlist_title),
    )
    if parsed_pair == wanted_pair:
        return True

    # Second check: the unparsed video title in "artist - title" form.
    combined = f"{songlist_artist} - {songlist_title}"
    return normalize_title(video_title) == normalize_title(combined)