""" Reporting and output generation for the Karaoke Song Library Cleanup Tool. """ from typing import Dict, List, Any from collections import defaultdict, Counter from utils import format_file_size, get_file_extension, extract_channel_from_path class ReportGenerator: """Generates reports and statistics for the karaoke cleanup process.""" def __init__(self, config: Dict[str, Any]): self.config = config self.verbose = config.get('output', {}).get('verbose', False) self.include_reasons = config.get('output', {}).get('include_reasons', True) self.channel_priorities = config.get('channel_priorities', []) def analyze_skip_patterns(self, skip_songs: List[Dict[str, Any]]) -> Dict[str, Any]: """Analyze patterns in the skip list to understand duplicate distribution.""" analysis = { 'total_skipped': len(skip_songs), 'file_type_distribution': defaultdict(int), 'channel_distribution': defaultdict(int), 'duplicate_reasons': defaultdict(int), 'kept_vs_skipped_channels': defaultdict(lambda: {'kept': 0, 'skipped': 0}), 'folder_patterns': defaultdict(int), 'artist_duplicate_counts': defaultdict(int), 'title_duplicate_counts': defaultdict(int) } for skip_song in skip_songs: # File type analysis ext = get_file_extension(skip_song['path']) analysis['file_type_distribution'][ext] += 1 # Channel analysis for MP4s if ext == '.mp4': channel = extract_channel_from_path(skip_song['path'], self.channel_priorities) if channel: analysis['channel_distribution'][channel] += 1 analysis['kept_vs_skipped_channels'][channel]['skipped'] += 1 # Reason analysis reason = skip_song.get('reason', 'unknown') analysis['duplicate_reasons'][reason] += 1 # Folder pattern analysis path_parts = skip_song['path'].split('\\') if len(path_parts) > 1: folder = path_parts[-2] # Second to last part (folder name) analysis['folder_patterns'][folder] += 1 # Artist/Title duplicate counts artist = skip_song.get('artist', 'Unknown') title = skip_song.get('title', 'Unknown') analysis['artist_duplicate_counts'][artist] += 1 analysis['title_duplicate_counts'][title] += 1 return analysis def analyze_channel_optimization(self, stats: Dict[str, Any], skip_analysis: Dict[str, Any]) -> Dict[str, Any]: """Analyze channel priorities and suggest optimizations.""" analysis = { 'current_priorities': self.channel_priorities.copy(), 'priority_effectiveness': {}, 'suggested_priorities': [], 'unused_channels': [], 'missing_channels': [] } # Analyze effectiveness of current priorities for channel in self.channel_priorities: kept_count = stats['channel_breakdown'].get(channel, 0) skipped_count = skip_analysis['kept_vs_skipped_channels'].get(channel, {}).get('skipped', 0) total_count = kept_count + skipped_count if total_count > 0: effectiveness = kept_count / total_count analysis['priority_effectiveness'][channel] = { 'kept': kept_count, 'skipped': skipped_count, 'total': total_count, 'effectiveness': effectiveness } # Find channels not in current priorities all_channels = set(stats['channel_breakdown'].keys()) used_channels = set(self.channel_priorities) analysis['unused_channels'] = list(all_channels - used_channels) # Suggest priority order based on effectiveness if analysis['priority_effectiveness']: sorted_channels = sorted( analysis['priority_effectiveness'].items(), key=lambda x: x[1]['effectiveness'], reverse=True ) analysis['suggested_priorities'] = [channel for channel, _ in sorted_channels] return analysis def generate_enhanced_summary_report(self, stats: Dict[str, Any], skip_analysis: Dict[str, Any]) -> str: """Generate an enhanced summary report with detailed statistics.""" report = [] report.append("=" * 80) report.append("ENHANCED KARAOKE SONG LIBRARY ANALYSIS REPORT") report.append("=" * 80) report.append("") # Basic statistics report.append("📊 BASIC STATISTICS") report.append("-" * 40) report.append(f"Total songs processed: {stats['total_songs']:,}") report.append(f"Unique songs found: {stats['unique_songs']:,}") report.append(f"Duplicates identified: {stats['duplicates_found']:,}") report.append(f"Groups with duplicates: {stats['groups_with_duplicates']:,}") if stats['duplicates_found'] > 0: duplicate_percentage = (stats['duplicates_found'] / stats['total_songs']) * 100 report.append(f"Duplicate rate: {duplicate_percentage:.1f}%") report.append("") # File type analysis report.append("📁 FILE TYPE ANALYSIS") report.append("-" * 40) total_files = sum(stats['file_type_breakdown'].values()) for ext, count in sorted(stats['file_type_breakdown'].items()): percentage = (count / total_files) * 100 skipped_count = skip_analysis['file_type_distribution'].get(ext, 0) kept_count = count - skipped_count report.append(f"{ext}: {count:,} total ({percentage:.1f}%) - {kept_count:,} kept, {skipped_count:,} skipped") report.append("") # Channel analysis if stats['channel_breakdown']: report.append("🎵 CHANNEL ANALYSIS") report.append("-" * 40) for channel, count in sorted(stats['channel_breakdown'].items()): skipped_count = skip_analysis['kept_vs_skipped_channels'].get(channel, {}).get('skipped', 0) kept_count = count - skipped_count effectiveness = (kept_count / count * 100) if count > 0 else 0 report.append(f"{channel}: {count:,} total - {kept_count:,} kept ({effectiveness:.1f}%), {skipped_count:,} skipped") report.append("") # Skip pattern analysis report.append("🗑️ SKIP PATTERN ANALYSIS") report.append("-" * 40) report.append(f"Total files to skip: {skip_analysis['total_skipped']:,}") # Top folders with most skips top_folders = sorted(skip_analysis['folder_patterns'].items(), key=lambda x: x[1], reverse=True)[:10] if top_folders: report.append("Top folders with most duplicates:") for folder, count in top_folders: report.append(f" {folder}: {count:,} files") report.append("") # Duplicate reasons if skip_analysis['duplicate_reasons']: report.append("Duplicate reasons:") for reason, count in skip_analysis['duplicate_reasons'].items(): percentage = (count / skip_analysis['total_skipped']) * 100 report.append(f" {reason}: {count:,} ({percentage:.1f}%)") report.append("") report.append("=" * 80) return "\n".join(report) def generate_channel_optimization_report(self, channel_analysis: Dict[str, Any]) -> str: """Generate a report with channel priority optimization suggestions.""" report = [] report.append("🔧 CHANNEL PRIORITY OPTIMIZATION ANALYSIS") report.append("=" * 80) report.append("") # Current priorities report.append("📋 CURRENT PRIORITIES") report.append("-" * 40) for i, channel in enumerate(channel_analysis['current_priorities'], 1): effectiveness = channel_analysis['priority_effectiveness'].get(channel, {}) if effectiveness: report.append(f"{i}. {channel} - {effectiveness['effectiveness']:.1%} effectiveness " f"({effectiveness['kept']:,} kept, {effectiveness['skipped']:,} skipped)") else: report.append(f"{i}. {channel} - No data available") report.append("") # Effectiveness analysis if channel_analysis['priority_effectiveness']: report.append("📈 EFFECTIVENESS ANALYSIS") report.append("-" * 40) for channel, data in sorted(channel_analysis['priority_effectiveness'].items(), key=lambda x: x[1]['effectiveness'], reverse=True): report.append(f"{channel}: {data['effectiveness']:.1%} effectiveness " f"({data['kept']:,} kept, {data['skipped']:,} skipped)") report.append("") # Suggested optimizations if channel_analysis['suggested_priorities']: report.append("💡 SUGGESTED OPTIMIZATIONS") report.append("-" * 40) report.append("Recommended priority order based on effectiveness:") for i, channel in enumerate(channel_analysis['suggested_priorities'], 1): report.append(f"{i}. {channel}") report.append("") # Unused channels if channel_analysis['unused_channels']: report.append("🔍 UNUSED CHANNELS") report.append("-" * 40) report.append("Channels found in your library but not in current priorities:") for channel in channel_analysis['unused_channels']: report.append(f" - {channel}") report.append("") report.append("=" * 80) return "\n".join(report) def generate_duplicate_pattern_report(self, skip_analysis: Dict[str, Any]) -> str: """Generate a report analyzing duplicate patterns.""" report = [] report.append("🔄 DUPLICATE PATTERN ANALYSIS") report.append("=" * 80) report.append("") # Most duplicated artists top_artists = sorted(skip_analysis['artist_duplicate_counts'].items(), key=lambda x: x[1], reverse=True)[:20] if top_artists: report.append("🎤 ARTISTS WITH MOST DUPLICATES") report.append("-" * 40) for artist, count in top_artists: report.append(f"{artist}: {count:,} duplicate files") report.append("") # Most duplicated titles top_titles = sorted(skip_analysis['title_duplicate_counts'].items(), key=lambda x: x[1], reverse=True)[:20] if top_titles: report.append("🎵 TITLES WITH MOST DUPLICATES") report.append("-" * 40) for title, count in top_titles: report.append(f"{title}: {count:,} duplicate files") report.append("") # File type duplicate patterns report.append("📁 DUPLICATE PATTERNS BY FILE TYPE") report.append("-" * 40) for ext, count in sorted(skip_analysis['file_type_distribution'].items()): percentage = (count / skip_analysis['total_skipped']) * 100 report.append(f"{ext}: {count:,} files ({percentage:.1f}% of all duplicates)") report.append("") # Channel duplicate patterns if skip_analysis['channel_distribution']: report.append("🎵 DUPLICATE PATTERNS BY CHANNEL") report.append("-" * 40) for channel, count in sorted(skip_analysis['channel_distribution'].items(), key=lambda x: x[1], reverse=True): percentage = (count / skip_analysis['total_skipped']) * 100 report.append(f"{channel}: {count:,} files ({percentage:.1f}% of all duplicates)") report.append("") report.append("=" * 80) return "\n".join(report) def generate_actionable_insights_report(self, stats: Dict[str, Any], skip_analysis: Dict[str, Any], channel_analysis: Dict[str, Any]) -> str: """Generate actionable insights and recommendations.""" report = [] report.append("💡 ACTIONABLE INSIGHTS & RECOMMENDATIONS") report.append("=" * 80) report.append("") # Space savings duplicate_percentage = (stats['duplicates_found'] / stats['total_songs']) * 100 report.append("💾 STORAGE OPTIMIZATION") report.append("-" * 40) report.append(f"• {duplicate_percentage:.1f}% of your library consists of duplicates") report.append(f"• Removing {stats['duplicates_found']:,} duplicate files will significantly reduce storage") report.append(f"• This represents a major opportunity for library cleanup") report.append("") # Channel priority recommendations if channel_analysis['suggested_priorities']: report.append("🎯 CHANNEL PRIORITY RECOMMENDATIONS") report.append("-" * 40) report.append("Consider updating your channel priorities to:") for i, channel in enumerate(channel_analysis['suggested_priorities'][:5], 1): report.append(f"{i}. Prioritize '{channel}' (highest effectiveness)") if channel_analysis['unused_channels']: report.append("") report.append("Add these channels to your priorities:") for channel in channel_analysis['unused_channels'][:5]: report.append(f"• '{channel}'") report.append("") # File type insights report.append("📁 FILE TYPE INSIGHTS") report.append("-" * 40) mp4_count = stats['file_type_breakdown'].get('.mp4', 0) mp3_count = stats['file_type_breakdown'].get('.mp3', 0) if mp4_count > 0: mp4_percentage = (mp4_count / stats['total_songs']) * 100 report.append(f"• {mp4_percentage:.1f}% of your library is MP4 format (highest quality)") if mp3_count > 0: report.append("• You have MP3 files (including CDG/MP3 pairs) - the tool correctly handles them") # Most problematic areas top_folders = sorted(skip_analysis['folder_patterns'].items(), key=lambda x: x[1], reverse=True)[:5] if top_folders: report.append("") report.append("🔍 AREAS NEEDING ATTENTION") report.append("-" * 40) report.append("Folders with the most duplicates:") for folder, count in top_folders: report.append(f"• '{folder}': {count:,} duplicate files") report.append("") report.append("=" * 80) return "\n".join(report) def generate_summary_report(self, stats: Dict[str, Any]) -> str: """Generate a summary report of the cleanup process.""" report = [] report.append("=" * 60) report.append("KARAOKE SONG LIBRARY CLEANUP SUMMARY") report.append("=" * 60) report.append("") # Basic statistics report.append(f"Total songs processed: {stats['total_songs']:,}") report.append(f"Unique songs found: {stats['unique_songs']:,}") report.append(f"Duplicates identified: {stats['duplicates_found']:,}") report.append(f"Groups with duplicates: {stats['groups_with_duplicates']:,}") report.append("") # File type breakdown report.append("FILE TYPE BREAKDOWN:") for ext, count in sorted(stats['file_type_breakdown'].items()): percentage = (count / stats['total_songs']) * 100 report.append(f" {ext}: {count:,} ({percentage:.1f}%)") report.append("") # Channel breakdown (for MP4s) if stats['channel_breakdown']: report.append("MP4 CHANNEL BREAKDOWN:") for channel, count in sorted(stats['channel_breakdown'].items()): report.append(f" {channel}: {count:,}") report.append("") # Duplicate statistics if stats['duplicates_found'] > 0: duplicate_percentage = (stats['duplicates_found'] / stats['total_songs']) * 100 report.append(f"DUPLICATE ANALYSIS:") report.append(f" Duplicate rate: {duplicate_percentage:.1f}%") report.append(f" Space savings potential: Significant") report.append("") report.append("=" * 60) return "\n".join(report) def generate_channel_priority_report(self, stats: Dict[str, Any], channel_priorities: List[str]) -> str: """Generate a report about channel priority matching.""" report = [] report.append("CHANNEL PRIORITY ANALYSIS") report.append("=" * 60) report.append("") # Count songs with and without defined channel priorities total_mp4s = sum(count for ext, count in stats['file_type_breakdown'].items() if ext == '.mp4') songs_with_priority = sum(stats['channel_breakdown'].values()) songs_without_priority = total_mp4s - songs_with_priority report.append(f"MP4 files with defined channel priorities: {songs_with_priority:,}") report.append(f"MP4 files without defined channel priorities: {songs_without_priority:,}") report.append("") if songs_without_priority > 0: report.append("Note: Songs without defined channel priorities will be marked for manual review.") report.append("Consider adding their folder names to the channel_priorities configuration.") report.append("") # Show channel priority order report.append("Channel Priority Order (highest to lowest):") for i, channel in enumerate(channel_priorities, 1): report.append(f" {i}. {channel}") report.append("") return "\n".join(report) def generate_duplicate_details(self, duplicate_info: List[Dict[str, Any]]) -> str: """Generate detailed report of duplicate groups.""" if not duplicate_info: return "No duplicates found." report = [] report.append("DETAILED DUPLICATE ANALYSIS") report.append("=" * 60) report.append("") for i, group in enumerate(duplicate_info, 1): report.append(f"Group {i}: {group['artist']} - {group['title']}") report.append(f" Total versions: {group['total_versions']}") report.append(" Versions:") for version in group['versions']: status = "✓ KEEP" if version['will_keep'] else "✗ SKIP" channel_info = f" ({version['channel']})" if version['channel'] else "" report.append(f" {status} {version['priority_rank']}. {version['path']}{channel_info}") report.append("") return "\n".join(report) def generate_skip_list_summary(self, skip_songs: List[Dict[str, Any]]) -> str: """Generate a summary of the skip list.""" if not skip_songs: return "No songs marked for skipping." report = [] report.append("SKIP LIST SUMMARY") report.append("=" * 60) report.append("") # Group by reason reasons = {} for skip_song in skip_songs: reason = skip_song.get('reason', 'unknown') if reason not in reasons: reasons[reason] = [] reasons[reason].append(skip_song) for reason, songs in reasons.items(): report.append(f"{reason.upper()} ({len(songs)} songs):") for song in songs[:10]: # Show first 10 report.append(f" {song['artist']} - {song['title']}") report.append(f" Path: {song['path']}") if 'kept_version' in song: report.append(f" Kept: {song['kept_version']}") report.append("") if len(songs) > 10: report.append(f" ... and {len(songs) - 10} more") report.append("") return "\n".join(report) def generate_config_summary(self, config: Dict[str, Any]) -> str: """Generate a summary of the current configuration.""" report = [] report.append("CURRENT CONFIGURATION") report.append("=" * 60) report.append("") # Channel priorities report.append("Channel Priorities (MP4 files):") for i, channel in enumerate(config.get('channel_priorities', [])): report.append(f" {i + 1}. {channel}") report.append("") # Matching settings matching = config.get('matching', {}) report.append("Matching Settings:") report.append(f" Case sensitive: {matching.get('case_sensitive', False)}") report.append(f" Fuzzy matching: {matching.get('fuzzy_matching', False)}") if matching.get('fuzzy_matching'): report.append(f" Fuzzy threshold: {matching.get('fuzzy_threshold', 0.8)}") report.append("") # Output settings output = config.get('output', {}) report.append("Output Settings:") report.append(f" Verbose mode: {output.get('verbose', False)}") report.append(f" Include reasons: {output.get('include_reasons', True)}") report.append("") return "\n".join(report) def generate_progress_report(self, current: int, total: int, message: str = "") -> str: """Generate a progress report.""" percentage = (current / total) * 100 if total > 0 else 0 bar_length = 30 filled_length = int(bar_length * current // total) bar = '█' * filled_length + '-' * (bar_length - filled_length) progress_line = f"\r[{bar}] {percentage:.1f}% ({current:,}/{total:,})" if message: progress_line += f" - {message}" return progress_line def print_report(self, report_type: str, data: Any) -> None: """Print a formatted report to console.""" if report_type == "summary": print(self.generate_summary_report(data)) elif report_type == "duplicates": if self.verbose: print(self.generate_duplicate_details(data)) elif report_type == "skip_summary": print(self.generate_skip_list_summary(data)) elif report_type == "config": print(self.generate_config_summary(data)) else: print(f"Unknown report type: {report_type}") def save_report_to_file(self, report_content: str, file_path: str) -> None: """Save a report to a text file.""" import os os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, 'w', encoding='utf-8') as f: f.write(report_content) print(f"Report saved to: {file_path}") def generate_detailed_duplicate_analysis(self, skip_songs: List[Dict[str, Any]], best_songs: List[Dict[str, Any]]) -> str: """Generate a detailed analysis showing specific songs and their duplicate versions.""" report = [] report.append("=" * 100) report.append("DETAILED DUPLICATE ANALYSIS - WHAT'S ACTUALLY HAPPENING") report.append("=" * 100) report.append("") # Group skip songs by artist/title to show duplicates together duplicate_groups = {} for skip_song in skip_songs: artist = skip_song.get('artist', 'Unknown') title = skip_song.get('title', 'Unknown') key = f"{artist} - {title}" if key not in duplicate_groups: duplicate_groups[key] = { 'artist': artist, 'title': title, 'skipped_versions': [], 'kept_version': skip_song.get('kept_version', 'Unknown') } duplicate_groups[key]['skipped_versions'].append({ 'path': skip_song['path'], 'reason': skip_song.get('reason', 'duplicate') }) # Sort by number of duplicates (most duplicates first) sorted_groups = sorted(duplicate_groups.items(), key=lambda x: len(x[1]['skipped_versions']), reverse=True) report.append(f"📊 FOUND {len(duplicate_groups)} SONGS WITH DUPLICATES") report.append("") # Show top 20 most duplicated songs report.append("🎵 TOP 20 MOST DUPLICATED SONGS:") report.append("-" * 80) for i, (key, group) in enumerate(sorted_groups[:20], 1): num_duplicates = len(group['skipped_versions']) report.append(f"{i:2d}. {key}") report.append(f" 📁 KEPT: {group['kept_version']}") report.append(f" 🗑️ SKIPPING {num_duplicates} duplicate(s):") for j, version in enumerate(group['skipped_versions'][:5], 1): # Show first 5 report.append(f" {j}. {version['path']}") if num_duplicates > 5: report.append(f" ... and {num_duplicates - 5} more") report.append("") # Show some examples of different duplicate patterns report.append("🔍 DUPLICATE PATTERNS EXAMPLES:") report.append("-" * 80) # Find examples of different duplicate scenarios mp4_vs_mp4 = [] mp4_vs_cdg_mp3 = [] same_channel_duplicates = [] for key, group in sorted_groups: skipped_paths = [v['path'] for v in group['skipped_versions']] kept_path = group['kept_version'] # Check for MP4 vs MP4 duplicates if (kept_path.endswith('.mp4') and any(p.endswith('.mp4') for p in skipped_paths)): mp4_vs_mp4.append(key) # Check for MP4 vs CDG/MP3 duplicates if (kept_path.endswith('.mp4') and any(p.endswith('.mp3') or p.endswith('.cdg') for p in skipped_paths)): mp4_vs_cdg_mp3.append(key) # Check for same channel duplicates kept_channel = self._extract_channel(kept_path) if kept_channel and any(self._extract_channel(p) == kept_channel for p in skipped_paths): same_channel_duplicates.append(key) report.append("📁 MP4 vs MP4 Duplicates (different channels):") for song in mp4_vs_mp4[:5]: report.append(f" • {song}") report.append("") report.append("🎵 MP4 vs MP3 Duplicates (format differences):") for song in mp4_vs_cdg_mp3[:5]: report.append(f" • {song}") report.append("") report.append("🔄 Same Channel Duplicates (exact duplicates):") for song in same_channel_duplicates[:5]: report.append(f" • {song}") report.append("") # Show file type distribution in duplicates report.append("📊 DUPLICATE FILE TYPE BREAKDOWN:") report.append("-" * 80) file_types = {'mp4': 0, 'mp3': 0} for group in duplicate_groups.values(): for version in group['skipped_versions']: path = version['path'].lower() if path.endswith('.mp4'): file_types['mp4'] += 1 elif path.endswith('.mp3') or path.endswith('.cdg'): file_types['mp3'] += 1 total_duplicates = sum(file_types.values()) for file_type, count in file_types.items(): percentage = (count / total_duplicates * 100) if total_duplicates > 0 else 0 report.append(f" {file_type.upper()}: {count:,} files ({percentage:.1f}%)") report.append("") report.append("=" * 100) return "\n".join(report) def _extract_channel(self, path: str) -> str: """Extract channel name from path for analysis.""" for channel in self.channel_priorities: if channel.lower() in path.lower(): return channel return None