138 lines
4.4 KiB
Python
138 lines
4.4 KiB
Python
"""
Fuzzy matching utilities for songlist-to-video matching.

Handles similarity calculations and match validation.
"""
|
|
|
|
|
|
def get_similarity_function():
    """
    Pick a string-similarity scorer based on what is installed.

    When ``rapidfuzz`` can be imported, the scorer delegates to
    ``rapidfuzz.fuzz.ratio``. Otherwise it falls back to the ratio of a
    ``difflib.SequenceMatcher``, scaled by 100 and truncated to an int.

    Returns:
        callable: ``similarity(a, b)`` producing a score for two strings.
    """
    try:
        from rapidfuzz import fuzz
    except ImportError:
        from difflib import SequenceMatcher

        def similarity(a, b):
            ratio = SequenceMatcher(None, a, b).ratio()
            return int(ratio * 100)

        return similarity

    def similarity(a, b):
        return fuzz.ratio(a, b)

    return similarity
|
|
|
|
|
|
def normalize_title(title):
    """
    Normalize a title for comparison.

    The title is lower-cased, the "(Karaoke Version)" and "(Karaoke)"
    markers are removed in any capitalization, and runs of whitespace
    are collapsed to single spaces with leading/trailing whitespace removed.

    Args:
        title: Raw title string.

    Returns:
        str: The normalized, lower-case title.
    """
    # Lower-case before stripping the markers so that variants such as
    # "(KARAOKE)" or "(karaoke version)" are removed as well.
    lowered = title.lower()
    for marker in ("(karaoke version)", "(karaoke)"):
        lowered = lowered.replace(marker, "")
    return " ".join(lowered.split())
|
|
|
|
|
|
def extract_artist_title(video_title):
    """
    Extract artist and title from a video title.

    The following formats are tried in order:

    1. "Artist - Title": split on the first " - ".
    2. "Title Karaoke | Artist Karaoke Version": split on the first " | ";
       the karaoke markers are removed in any capitalization.
    3. "Title Artist KARAOKE": heuristic that treats the single word right
       before the karaoke marker as the artist and everything before that
       word as the title. The marker itself (and anything after it) is
       not included in the artist.

    Args:
        video_title: Raw video title.

    Returns:
        tuple: (artist, title). When no format applies, artist is "" and
        title is the unmodified video_title.
    """
    import re

    # "Artist - Title": the part before the first " - " is the artist.
    if " - " in video_title:
        artist, title = video_title.split(" - ", 1)
        return artist.strip(), title.strip()

    has_karaoke = "karaoke" in video_title.lower()

    # "Title Karaoke | Artist Karaoke Version"
    if has_karaoke and " | " in video_title:
        title_part, artist_part = video_title.split(" | ", 1)
        # The branch guard is case-insensitive, so the cleanup must be too.
        title = re.sub(r"karaoke", "", title_part.strip(), flags=re.IGNORECASE).strip()
        artist = re.sub(
            r"karaoke version", "", artist_part.strip(), flags=re.IGNORECASE
        ).strip()
        return artist, title

    # "Title Artist KARAOKE"
    if has_karaoke:
        words = video_title.split()
        for i, word in enumerate(words):
            # At least one title word and one artist word must precede the marker.
            if i >= 2 and "karaoke" in word.lower():
                title = " ".join(words[:i - 1])
                # Only the word before the marker is the artist; the marker
                # word itself must not leak into the artist name.
                artist = words[i - 1]
                return artist, title

    # Unparseable: empty artist and the full, unmodified title.
    return "", video_title
|
|
|
|
|
|
def create_song_key(artist, title):
    """
    Build the comparison key for a songlist entry.

    Args:
        artist: Artist name; it is lower-cased.
        title: Song title; it is passed through normalize_title.

    Returns:
        str: The lower-cased artist and the normalized title joined by "_".
    """
    artist_part = artist.lower()
    title_part = normalize_title(title)
    return "_".join((artist_part, title_part))
|
|
|
|
|
|
def create_video_key(video_title):
    """
    Build the comparison key for a video title.

    The artist and title are first parsed out of the video title with
    extract_artist_title.

    Args:
        video_title: Raw video title.

    Returns:
        str: The lower-cased parsed artist and the normalized parsed title
        joined by "_".
    """
    parsed_artist, parsed_title = extract_artist_title(video_title)
    artist_part = parsed_artist.lower()
    title_part = normalize_title(parsed_title)
    return "_".join((artist_part, title_part))
|
|
|
|
|
|
def is_fuzzy_match(songlist_artist, songlist_title, video_title, threshold=90):
    """
    Decide whether a songlist entry fuzzily matches a video title.

    Both sides are reduced to comparison keys (create_song_key and
    create_video_key) and scored with the function returned by
    get_similarity_function.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title
        threshold: Minimum similarity score (0-100)

    Returns:
        tuple: (is_match, score) where is_match is True when score is at
        least threshold, and score is the similarity score.
    """
    scorer = get_similarity_function()
    score = scorer(
        create_song_key(songlist_artist, songlist_title),
        create_video_key(video_title),
    )
    return score >= threshold, score
|
|
|
|
|
|
def is_exact_match(songlist_artist, songlist_title, video_title):
    """
    Decide whether a songlist entry exactly matches a video title.

    A match is reported when either the artist and title parsed from the
    video title both equal the songlist values after normalization, or
    the whole normalized video title equals the normalized
    "artist - title" string built from the songlist entry.

    Args:
        songlist_artist: Artist from songlist
        songlist_title: Title from songlist
        video_title: YouTube video title

    Returns:
        bool: True if exact match, False otherwise
    """
    video_artist, video_song = extract_artist_title(video_title)

    # Compare the parsed components individually.
    same_artist = normalize_title(video_artist) == normalize_title(songlist_artist)
    same_title = normalize_title(video_song) == normalize_title(songlist_title)

    # Compare the whole video title against the "artist - title" rendering.
    combined = f"{songlist_artist} - {songlist_title}"
    same_full = normalize_title(video_title) == normalize_title(combined)

    if same_full:
        return True
    return same_artist and same_title
|