#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Validation script for SUR project.
Computes metrics and evaluates performance of recognition systems.
"""

import os
import argparse

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def load_results(file_path, include_probs=False):
    """
    Load results from a result file

    Args:
        file_path: Path to the result file
        include_probs: Whether to include the log probabilities

    Returns:
        Dictionary mapping filenames to labels (and log probs if requested)
    """
    results = {}

    try:
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    filename = parts[0]
                    label = int(parts[1])

                    if include_probs and len(parts) >= 33:
                        log_probs = [float(p) for p in parts[2:33]]
                        results[filename] = (label, log_probs)
                    else:
                        results[filename] = label
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return {}

    print(f"Loaded {len(results)} results from {file_path}")
    return results
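
# Expected line format of a result file, inferred from the parsing above (the
# count of 31 log probabilities comes from parts[2:33] and is an assumption
# about the number of target classes; labels appear to be 1-indexed, see the
# probs[label - 1] lookup in calculate_confidence_metrics below):
#
#   <segment_name> <hard_decision 1..31> <log_prob_class_1> ... <log_prob_class_31>
#
# Lines with only two fields are accepted as hard decisions without scores.
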

def load_ground_truth(file_path):
    """
    Load ground truth labels from a file

    Args:
        file_path: Path to the ground truth file

    Returns:
        Dictionary mapping filenames to labels
    """
    ground_truth = {}

    try:
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    filename = parts[0]
                    label = int(parts[1])
                    ground_truth[filename] = label
    except Exception as e:
        print(f"Error loading ground truth {file_path}: {e}")
        return {}

    print(f"Loaded {len(ground_truth)} ground truth labels from {file_path}")
    return ground_truth


def calculate_accuracy(predictions, ground_truth):
    """
    Calculate accuracy against ground truth

    Args:
        predictions: Dictionary mapping filenames to predicted labels
        ground_truth: Dictionary mapping filenames to true labels

    Returns:
        Accuracy, number of correct predictions, total predictions
    """
    if not ground_truth or not predictions:
        return 0, 0, 0

    common_files = set(predictions.keys()).intersection(set(ground_truth.keys()))
    if not common_files:
        return 0, 0, 0

    true_labels = [ground_truth[f] for f in common_files]
    pred_labels = [predictions[f] for f in common_files]

    correct = sum(1 for t, p in zip(true_labels, pred_labels) if t == p)
    accuracy = correct / len(common_files)

    return accuracy, correct, len(common_files)


def calculate_class_accuracies(predictions, ground_truth):
    """
    Calculate per-class accuracies

    Args:
        predictions: Dictionary mapping filenames to predicted labels
        ground_truth: Dictionary mapping filenames to true labels

    Returns:
        Dictionary mapping class IDs to (accuracy, count)
    """
    if not ground_truth or not predictions:
        return {}

    common_files = set(predictions.keys()).intersection(set(ground_truth.keys()))
    if not common_files:
        return {}

    # Group by true class
    class_results = {}
    for f in common_files:
        true_label = ground_truth[f]
        pred_label = predictions[f]

        if true_label not in class_results:
            class_results[true_label] = {'correct': 0, 'total': 0}

        class_results[true_label]['total'] += 1
        if true_label == pred_label:
            class_results[true_label]['correct'] += 1

    # Calculate accuracies
    class_accuracies = {}
    for class_id, results in class_results.items():
        accuracy = results['correct'] / results['total'] if results['total'] > 0 else 0
        class_accuracies[class_id] = (accuracy, results['total'])

    return class_accuracies
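
# Illustrative example of the two accuracy helpers (hypothetical segment names
# and labels, chosen only to show the return shapes):
#
#   preds = {'seg_01': 3, 'seg_02': 5, 'seg_03': 5}
#   truth = {'seg_01': 3, 'seg_02': 4, 'seg_03': 5}
#   calculate_accuracy(preds, truth)          -> approx. (0.667, 2, 3)
#   calculate_class_accuracies(preds, truth)  -> {3: (1.0, 1), 4: (0.0, 1), 5: (1.0, 1)}
#
# Per-class accuracies are keyed by the true label, so class 4 counts the one
# segment whose ground truth is 4 even though it was predicted as 5.
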

def analyze_modality_agreement(audio_results, image_results, fusion_results, ground_truth=None):
    """
    Analyze agreement between modalities and fusion

    Args:
        audio_results: Dictionary mapping filenames to audio predictions
        image_results: Dictionary mapping filenames to image predictions
        fusion_results: Dictionary mapping filenames to fusion predictions
        ground_truth: Optional dictionary of ground truth labels

    Returns:
        Dictionary with agreement statistics
    """
    common_files = set(audio_results.keys()) & set(image_results.keys()) & set(fusion_results.keys())
    if ground_truth:
        common_files = common_files & set(ground_truth.keys())

    total = len(common_files)
    if total == 0:
        return {}

    stats = {
        'total': total,
        'audio_image_agree': 0,
        'fusion_follows_audio': 0,
        'fusion_follows_image': 0,
        'fusion_differs_from_both': 0,
        'audio_correct': 0,
        'image_correct': 0,
        'fusion_correct': 0,
        'only_audio_correct': 0,
        'only_image_correct': 0,
        'only_fusion_correct': 0,
        'all_correct': 0,
        'none_correct': 0
    }

    for f in common_files:
        audio_pred = audio_results[f]
        image_pred = image_results[f]
        fusion_pred = fusion_results[f]

        # Agreement between modalities
        if audio_pred == image_pred:
            stats['audio_image_agree'] += 1

        # Which modality the fusion follows
        if fusion_pred == audio_pred and fusion_pred != image_pred:
            stats['fusion_follows_audio'] += 1
        elif fusion_pred == image_pred and fusion_pred != audio_pred:
            stats['fusion_follows_image'] += 1
        elif fusion_pred != audio_pred and fusion_pred != image_pred:
            stats['fusion_differs_from_both'] += 1

        # Correctness if ground truth available
        if ground_truth:
            true_label = ground_truth[f]
            audio_correct = (audio_pred == true_label)
            image_correct = (image_pred == true_label)
            fusion_correct = (fusion_pred == true_label)

            if audio_correct:
                stats['audio_correct'] += 1
            if image_correct:
                stats['image_correct'] += 1
            if fusion_correct:
                stats['fusion_correct'] += 1

            if audio_correct and not image_correct and not fusion_correct:
                stats['only_audio_correct'] += 1
            elif image_correct and not audio_correct and not fusion_correct:
                stats['only_image_correct'] += 1
            elif fusion_correct and not audio_correct and not image_correct:
                stats['only_fusion_correct'] += 1
            elif audio_correct and image_correct and fusion_correct:
                stats['all_correct'] += 1
            elif not audio_correct and not image_correct and not fusion_correct:
                stats['none_correct'] += 1

    # Convert counts to percentages
    for key in list(stats.keys()):
        if key != 'total':
            stats[f'{key}_pct'] = (stats[key] / total) * 100

    return stats


def calculate_confidence_metrics(results_with_probs):
    """
    Calculate confidence-based metrics for predictions

    Args:
        results_with_probs: Dictionary mapping filenames to (label, log_probs) tuples

    Returns:
        Dictionary of confidence metrics
    """
    if not results_with_probs:
        return {}

    winning_margins = []
    entropies = []
    confidences = []

    for filename, entry in results_with_probs.items():
        # Entries parsed from short lines carry only the label, not a
        # (label, log_probs) tuple; skip them instead of crashing on unpacking.
        if not isinstance(entry, tuple):
            continue
        label, log_probs = entry

        # Convert to probabilities and normalize
        probs = np.exp(log_probs)
        probs_sum = np.sum(probs)
        if probs_sum > 0:
            probs = probs / probs_sum

        # Winning probability
        winning_prob = probs[label - 1]  # -1 because labels are 1-indexed
        confidences.append(winning_prob)

        # Margin between top two probabilities
        sorted_probs = sorted(probs, reverse=True)
        margin = sorted_probs[0] - sorted_probs[1] if len(sorted_probs) > 1 else sorted_probs[0]
        winning_margins.append(margin)

        # Entropy (uncertainty measure)
        valid_probs = probs[probs > 0]  # Avoid log(0)
        entropy = -np.sum(valid_probs * np.log(valid_probs))
        entropies.append(entropy)

    # No entry carried log probabilities: nothing to aggregate
    if not confidences:
        return {}

    return {
        'avg_confidence': np.mean(confidences),
        'med_confidence': np.median(confidences),
        'min_confidence': np.min(confidences),
        'max_confidence': np.max(confidences),
        'avg_margin': np.mean(winning_margins),
        'avg_entropy': np.mean(entropies),
    }
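
# Worked example for a single 3-class entry (illustrative numbers only):
# normalized probs [0.7, 0.2, 0.1] with a 1-indexed label of 1 give
# confidence = 0.7, margin = 0.7 - 0.2 = 0.5, and entropy
# -(0.7*ln 0.7 + 0.2*ln 0.2 + 0.1*ln 0.1) ~= 0.80 nats.
# A larger margin and a lower entropy both indicate a more decisive prediction.
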

def plot_confusion_matrix(predictions, ground_truth, title, output_file=None):
    """
    Plot and save a confusion matrix

    Args:
        predictions: Dictionary mapping filenames to predicted labels
        ground_truth: Dictionary mapping filenames to true labels
        title: Title for the plot
        output_file: Optional path to save the plot
    """
    if not ground_truth or not predictions:
        print(f"Cannot create confusion matrix for {title}: Missing data")
        return

    common_files = set(predictions.keys()).intersection(set(ground_truth.keys()))
    if not common_files:
        print(f"Cannot create confusion matrix for {title}: No common files")
        return

    true_labels = [ground_truth[f] for f in common_files]
    pred_labels = [predictions[f] for f in common_files]

    # Get unique classes
    classes = sorted(set(true_labels) | set(pred_labels))

    # Create confusion matrix
    cm = confusion_matrix(true_labels, pred_labels, labels=classes)

    # Normalize by row (true labels)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)  # Replace NaNs with 0

    # Plot
    plt.figure(figsize=(10, 8))
    plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'Normalized Confusion Matrix - {title}')
    plt.colorbar()

    # Add labels
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()

    # Save if output file specified
    if output_file:
        plt.savefig(output_file)
        print(f"Saved confusion matrix to {output_file}")
    else:
        plt.show()

    plt.close()


def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Validate and analyze recognition results")
    parser.add_argument("--audio_file", type=str, default="audio_gmm_results.txt",
                        help="Path to audio recognition results file")
    parser.add_argument("--image_file", type=str, default="image_svm_results.txt",
                        help="Path to image recognition results file")
    parser.add_argument("--fusion_file", type=str, default="fusion_results.txt",
                        help="Path to fusion results file")
    parser.add_argument("--ground_truth", type=str, default=None,
                        help="Path to ground truth labels file (if available)")
    parser.add_argument("--output_dir", type=str, default="../results/validation_results",
                        help="Directory to save validation results")
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    # Load results
    audio_results = load_results(args.audio_file)
    image_results = load_results(args.image_file)
    fusion_results = load_results(args.fusion_file)

    # Load results with probabilities for confidence analysis
    audio_results_with_probs = load_results(args.audio_file, include_probs=True)
    image_results_with_probs = load_results(args.image_file, include_probs=True)
    fusion_results_with_probs = load_results(args.fusion_file, include_probs=True)

    # Load ground truth if available
    ground_truth = None
    if args.ground_truth and os.path.exists(args.ground_truth):
        ground_truth = load_ground_truth(args.ground_truth)

    # Basic statistics
    print("\n===== Basic Statistics =====")
    print(f"Audio results: {len(audio_results)} files")
    print(f"Image results: {len(image_results)} files")
    print(f"Fusion results: {len(fusion_results)} files")

    if ground_truth:
        print("\n===== Accuracy Against Ground Truth =====")
        audio_acc, audio_correct, audio_total = calculate_accuracy(audio_results, ground_truth)
        image_acc, image_correct, image_total = calculate_accuracy(image_results, ground_truth)
        fusion_acc, fusion_correct, fusion_total = calculate_accuracy(fusion_results, ground_truth)

        print(f"Audio accuracy: {audio_acc:.4f} ({audio_correct}/{audio_total})")
        print(f"Image accuracy: {image_acc:.4f} ({image_correct}/{image_total})")
        print(f"Fusion accuracy: {fusion_acc:.4f} ({fusion_correct}/{fusion_total})")

        # Per-class accuracies
        audio_class_acc = calculate_class_accuracies(audio_results, ground_truth)
        image_class_acc = calculate_class_accuracies(image_results, ground_truth)
        fusion_class_acc = calculate_class_accuracies(fusion_results, ground_truth)

        print("\n===== Per-Class Accuracies =====")
        print("Class\tAudio\tImage\tFusion\tCount")
        print("-" * 50)

        all_class_ids = sorted(set(audio_class_acc) | set(image_class_acc) | set(fusion_class_acc))
        for class_id in all_class_ids:
            audio_acc_str = f"{audio_class_acc[class_id][0]:.4f}" if class_id in audio_class_acc else "N/A"
            image_acc_str = f"{image_class_acc[class_id][0]:.4f}" if class_id in image_class_acc else "N/A"
            fusion_acc_str = f"{fusion_class_acc[class_id][0]:.4f}" if class_id in fusion_class_acc else "N/A"
            count = max(
                audio_class_acc.get(class_id, (0, 0))[1],
                image_class_acc.get(class_id, (0, 0))[1],
                fusion_class_acc.get(class_id, (0, 0))[1]
            )
            print(f"{class_id}\t{audio_acc_str}\t{image_acc_str}\t{fusion_acc_str}\t{count}")

        # Create confusion matrices
        if args.output_dir:
            plot_confusion_matrix(
                audio_results, ground_truth, "Audio",
                os.path.join(args.output_dir, "audio_confusion_matrix.png")
            )
            plot_confusion_matrix(
                image_results, ground_truth, "Image",
                os.path.join(args.output_dir, "image_confusion_matrix.png")
            )
            plot_confusion_matrix(
                fusion_results, ground_truth, "Fusion",
                os.path.join(args.output_dir, "fusion_confusion_matrix.png")
            )

    # Analyze modality agreement
    agreement_stats = analyze_modality_agreement(audio_results, image_results, fusion_results, ground_truth)
    if agreement_stats:
        print("\n===== Modality Agreement Analysis =====")
        print(f"Total files analyzed: {agreement_stats['total']}")
        print(f"Audio and image agree: {agreement_stats['audio_image_agree']} ({agreement_stats.get('audio_image_agree_pct', 0):.2f}%)")
        print(f"Fusion follows audio: {agreement_stats['fusion_follows_audio']} ({agreement_stats.get('fusion_follows_audio_pct', 0):.2f}%)")
        print(f"Fusion follows image: {agreement_stats['fusion_follows_image']} ({agreement_stats.get('fusion_follows_image_pct', 0):.2f}%)")
        print(f"Fusion differs from both: {agreement_stats['fusion_differs_from_both']} ({agreement_stats.get('fusion_differs_from_both_pct', 0):.2f}%)")

        if ground_truth:
            print("\n===== Correctness Analysis =====")
            print(f"Audio correct: {agreement_stats['audio_correct']} ({agreement_stats.get('audio_correct_pct', 0):.2f}%)")
            print(f"Image correct: {agreement_stats['image_correct']} ({agreement_stats.get('image_correct_pct', 0):.2f}%)")
            print(f"Fusion correct: {agreement_stats['fusion_correct']} ({agreement_stats.get('fusion_correct_pct', 0):.2f}%)")
            print(f"Only audio correct: {agreement_stats['only_audio_correct']} ({agreement_stats.get('only_audio_correct_pct', 0):.2f}%)")
            print(f"Only image correct: {agreement_stats['only_image_correct']} ({agreement_stats.get('only_image_correct_pct', 0):.2f}%)")
            print(f"Only fusion correct: {agreement_stats['only_fusion_correct']} ({agreement_stats.get('only_fusion_correct_pct', 0):.2f}%)")
            print(f"All modalities correct: {agreement_stats['all_correct']} ({agreement_stats.get('all_correct_pct', 0):.2f}%)")
            print(f"No modality correct: {agreement_stats['none_correct']} ({agreement_stats.get('none_correct_pct', 0):.2f}%)")
    # Confidence analysis
    print("\n===== Confidence Analysis =====")
    audio_conf = calculate_confidence_metrics(audio_results_with_probs)
    image_conf = calculate_confidence_metrics(image_results_with_probs)
    fusion_conf = calculate_confidence_metrics(fusion_results_with_probs)

    if audio_conf:
        print(f"Audio average confidence: {audio_conf['avg_confidence']:.4f}")
        print(f"Audio average margin: {audio_conf['avg_margin']:.4f}")
        print(f"Audio average entropy: {audio_conf['avg_entropy']:.4f}")

    if image_conf:
        print(f"Image average confidence: {image_conf['avg_confidence']:.4f}")
        print(f"Image average margin: {image_conf['avg_margin']:.4f}")
        print(f"Image average entropy: {image_conf['avg_entropy']:.4f}")

    if fusion_conf:
        print(f"Fusion average confidence: {fusion_conf['avg_confidence']:.4f}")
        print(f"Fusion average margin: {fusion_conf['avg_margin']:.4f}")
        print(f"Fusion average entropy: {fusion_conf['avg_entropy']:.4f}")

    print("\n===== Validation Complete =====")


if __name__ == "__main__":
    main()
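
# Example invocation (a sketch: the script and ground-truth filenames are
# illustrative placeholders; the result-file names and output directory are
# the argparse defaults defined above):
#
#   python validation.py --audio_file audio_gmm_results.txt \
#       --image_file image_svm_results.txt --fusion_file fusion_results.txt \
#       --ground_truth eval_labels.txt --output_dir ../results/validation_results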