#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Face recognition system for SUR project
Part (a) - Image-based person identification
"""

import os
import glob
import pickle
import argparse

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from skimage.feature import hog
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix


# Function to extract features from PNG files (adapted from demo PNG2FEA)
def png2fea(filename):
    """
    Convert PNG image to feature vector using HOG (Histogram of Oriented Gradients)

    Args:
        filename: Path to the PNG file

    Returns:
        Feature vector
    """
    try:
        # Load image
        image = Image.open(filename).convert('L')  # Convert to grayscale

        # Resize to consistent dimensions for HOG
        image = image.resize((80, 80), Image.LANCZOS)
        img_array = np.array(image)

        # Extract HOG features
        hog_features = hog(
            img_array,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            visualize=False
        )
        return hog_features
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None


def load_data(data_dir="SUR_projekt2024-2025", subset="train",
              use_dev_for_training=False, dev_ratio=0.5):
    """
    Load PNG files from data directory and extract features

    Args:
        data_dir: Base directory containing the data
        subset: 'train' or 'dev' subset
        use_dev_for_training: Whether to use a portion of dev data for training
        dev_ratio: Portion of dev data to use for training (0.0-1.0)

    Returns:
        features: Feature vectors
        labels: Class labels
        filenames: Original filenames
    """
    features = []
    labels = []
    filenames = []

    # Path to the subset directory
    subset_dir = os.path.join(data_dir, subset)

    # Check if directory exists
    if not os.path.exists(subset_dir):
        raise FileNotFoundError(f"Directory {subset_dir} not found")

    # Loop through each class directory (1-31)
    for class_id in range(1, 32):
        class_dir = os.path.join(subset_dir, str(class_id))

        # Skip if class directory doesn't exist
        if not os.path.exists(class_dir):
            print(f"Warning: Class directory {class_dir} not found")
            continue

        # Get all PNG files in the class directory
        png_files = glob.glob(os.path.join(class_dir, "*.png"))
        print(f"Processing class {class_id}, found {len(png_files)} PNG files")

        # Process each PNG file
        for png_file in png_files:
            # Extract features
            feature_vector = png2fea(png_file)
            if feature_vector is not None:
                features.append(feature_vector)
                labels.append(class_id)
                filenames.append(os.path.basename(png_file))

    # Add a portion of dev data if requested and if we're loading train data
    if use_dev_for_training and subset == "train":
        dev_dir = os.path.join(data_dir, "dev")
        if os.path.exists(dev_dir):
            print(f"Adding {dev_ratio*100:.0f}% of dev data to training...")
            for class_id in range(1, 32):
                class_dir = os.path.join(dev_dir, str(class_id))
                if not os.path.exists(class_dir):
                    continue

                # Get all PNG files in the dev class directory
                png_files = glob.glob(os.path.join(class_dir, "*.png"))

                # Calculate how many files to use for training
                num_train_files = max(1, int(len(png_files) * dev_ratio))

                # Select files for training (we'll use the first portion)
                train_files = png_files[:num_train_files]
                print(f"Adding {len(train_files)}/{len(png_files)} dev files for class {class_id}")

                # Process each training file from dev
                for png_file in train_files:
                    feature_vector = png2fea(png_file)
                    if feature_vector is not None:
                        features.append(feature_vector)
                        labels.append(class_id)
filenames.append("dev_" + os.path.basename(png_file)) else: print(f"Dev directory {dev_dir} not found, using only train data.") return np.array(features), np.array(labels), filenames def train_model(features, labels): """ Train SVM model for face recognition Args: features: Feature vectors labels: Class labels Returns: Trained model, scaler """ # Split data into training and validation sets X_train, X_val, y_train, y_val = train_test_split( features, labels, test_size=0.2, random_state=42, stratify=labels ) # Standardize features scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_val_scaled = scaler.transform(X_val) # Define parameter grid for SVM param_grid = { 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1], 'kernel': ['rbf'] } # Initialize SVM classifier svm = SVC(probability=True, random_state=42) # Perform grid search to find best parameters print("Performing grid search for SVM parameters...") grid_search = GridSearchCV( svm, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2 ) grid_search.fit(X_train_scaled, y_train) # Get best model best_model = grid_search.best_estimator_ print(f"Best parameters: {grid_search.best_params_}") # Evaluate on validation set y_pred = best_model.predict(X_val_scaled) accuracy = accuracy_score(y_val, y_pred) print(f"Validation accuracy: {accuracy:.4f}") # Plot confusion matrix cm = confusion_matrix(y_val, y_pred) plt.figure(figsize=(10, 8)) plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) plt.title('Confusion Matrix') plt.colorbar() plt.xlabel('Predicted Label') plt.ylabel('True Label') plt.savefig('../results/face_confusion_matrix.png') # Train final model on all data print("Training final model on all data...") X_all_scaled = scaler.transform(features) best_model.fit(X_all_scaled, labels) return best_model, scaler def predict(model, scaler, features): """ Make predictions with trained model Args: model: Trained SVM model scaler: Feature scaler features: Feature vectors Returns: Predictions, log probabilities """ # Scale features features_scaled = scaler.transform(features) # Get class predictions predictions = model.predict(features_scaled) # Get log probabilities log_probs = model.predict_log_proba(features_scaled) return predictions, log_probs def process_test_files(model, scaler, test_dir): """ Process test files and generate predictions Args: model: Trained model scaler: Feature scaler test_dir: Directory containing test files Returns: results: List of results (filename, prediction, log_probs) """ results = [] # Get all PNG files in test directory (recursive) png_files = glob.glob(os.path.join(test_dir, "**", "*.png"), recursive=True) print(f"Found {len(png_files)} PNG files for testing") for png_file in png_files: # Extract features feature_vector = png2fea(png_file) if feature_vector is not None: # Make prediction pred, log_probs = predict(model, scaler, feature_vector.reshape(1, -1)) # Get filename without extension filename = os.path.basename(png_file).rsplit('.', 1)[0] # Store result results.append((filename, pred[0], log_probs[0])) return results def save_results(results, output_file="image_svm_results.txt"): """ Save results to output file Args: results: List of results (filename, prediction, log_probs) output_file: Output filename """ with open(output_file, 'w') as f: for filename, pred, log_probs in results: # Format: filename prediction log_prob_1 log_prob_2 ... 
            line = f"{filename} {pred}"
            for lp in log_probs:
                line += f" {lp:.6f}"
            f.write(line + "\n")

    print(f"Results saved to {output_file}")


def save_model(model, scaler, output_file="face_model.pkl"):
    """
    Save model and scaler to file

    Args:
        model: Trained model
        scaler: Feature scaler
        output_file: Output filename
    """
    with open(output_file, 'wb') as f:
        pickle.dump((model, scaler), f)
    print(f"Model saved to {output_file}")


def load_model(input_file="face_model.pkl"):
    """
    Load model and scaler from file

    Args:
        input_file: Input filename

    Returns:
        model, scaler
    """
    with open(input_file, 'rb') as f:
        model, scaler = pickle.load(f)
    return model, scaler


def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Face recognition system")
    parser.add_argument("--data_dir", type=str, default="SUR_projekt2024-2025",
                        help="Base directory containing the data")
    parser.add_argument("--mode", type=str, choices=["train", "test", "train_test"],
                        default="train_test", help="Mode of operation")
    parser.add_argument("--model_file", type=str, default="face_model.pkl",
                        help="File to save/load model")
    parser.add_argument("--test_dir", type=str, default="SUR_projekt2024-2025/dev",
                        help="Directory containing test files")
    parser.add_argument("--output_file", type=str, default="image_svm_results.txt",
                        help="Output file for test results")
    parser.add_argument("--use_dev_for_training", action="store_true",
                        help="Use a portion of dev data for training")
    parser.add_argument("--dev_ratio", type=float, default=0.5,
                        help="Portion of dev data to use for training (0.0-1.0)")
    parser.add_argument("--remaining_dev_file", type=str, default="remaining_dev_image.txt",
                        help="File to save remaining dev files not used for training")
    args = parser.parse_args()

    if args.mode in ["train", "train_test"]:
        # Load data
        print("Loading training data...")
        features, labels, filenames = load_data(
            args.data_dir, "train",
            use_dev_for_training=args.use_dev_for_training,
            dev_ratio=args.dev_ratio
        )

        # If we're using dev data for training, create a file with remaining dev files for validation
        if args.use_dev_for_training:
            create_remaining_dev_files(args.data_dir, args.dev_ratio, args.remaining_dev_file)

        # Train model
        print("Training model...")
        model, scaler = train_model(features, labels)

        # Save model
        save_model(model, scaler, args.model_file)

    if args.mode in ["test", "train_test"]:
        if args.mode == "test":
            # Load model
            print("Loading model...")
            model, scaler = load_model(args.model_file)

        # Process test files
        print("Processing test files...")
        # If we're using part of dev for training and want to test on the remaining dev files
        if args.use_dev_for_training and os.path.exists(args.remaining_dev_file):
            print(f"Testing on remaining dev files listed in {args.remaining_dev_file}")
            results = process_remaining_dev_files(model, scaler, args.remaining_dev_file, args.data_dir)
        else:
            results = process_test_files(model, scaler, args.test_dir)

        # Save results
        save_results(results, args.output_file)


def create_remaining_dev_files(data_dir, dev_ratio, output_file):
    """
    Create a file listing the remaining dev files not used for training

    Args:
        data_dir: Base directory containing the data
        dev_ratio: Portion of dev data used for training
        output_file: File to save the list of remaining dev files
    """
    print("Creating list of remaining dev files for validation...")
    dev_dir = os.path.join(data_dir, "dev")
    remaining_files = []

    for class_id in range(1, 32):
        class_dir = os.path.join(dev_dir, str(class_id))
        if not os.path.exists(class_dir):
            continue

        # Get all PNG files in the dev class directory
        png_files = glob.glob(os.path.join(class_dir, "*.png"))

        # Calculate how many files were used for training
        num_train_files = max(1, int(len(png_files) * dev_ratio))

        # Select files not used for training
        test_files = png_files[num_train_files:]

        for test_file in test_files:
            # Store class and file path
            remaining_files.append((class_id, test_file))

    # Save the list of remaining files
    with open(output_file, 'w') as f:
        for class_id, file_path in remaining_files:
            f.write(f"{class_id},{file_path}\n")

    print(f"Saved list of {len(remaining_files)} remaining dev files to {output_file}")


def process_remaining_dev_files(model, scaler, file_list, data_dir):
    """
    Process remaining dev files for testing

    Args:
        model: Trained model
        scaler: Feature scaler
        file_list: File containing list of remaining dev files
        data_dir: Base directory containing the data

    Returns:
        List of (filename, prediction, log_probs) tuples
    """
    results = []

    # Read the list of remaining dev files (split only on the first comma,
    # in case a file path itself contains one)
    with open(file_list, 'r') as f:
        remaining_files = [line.strip().split(',', 1) for line in f]

    print(f"Found {len(remaining_files)} files for testing")

    # Process each file
    for class_id_str, file_path in remaining_files:
        # Extract features
        feature_vector = png2fea(file_path)
        if feature_vector is not None:
            # Scale features
            feature_vector_scaled = scaler.transform(feature_vector.reshape(1, -1))

            # Make prediction
            pred = model.predict(feature_vector_scaled)[0]
            log_probs = model.predict_log_proba(feature_vector_scaled)[0]

            # Filename without extension
            filename = os.path.basename(file_path).rsplit('.', 1)[0]

            # Store results
            results.append((filename, pred, log_probs))

    return results


if __name__ == "__main__":
    main()
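# ---------------------------------------------------------------------------
# Example invocation (a sketch; "face_recognition.py" is a placeholder for
# whatever this file is saved as, and the paths assume the default
# SUR_projekt2024-2025 layout used by the defaults above):
#
#   python face_recognition.py --mode train_test \
#       --data_dir SUR_projekt2024-2025 \
#       --use_dev_for_training --dev_ratio 0.5 \
#       --output_file image_svm_results.txt
#
# This trains the HOG + SVM model (holding back half of each dev class for
# validation) and writes one line per test file: the file name, the hard
# decision, and the 31 per-class log-probabilities.
# ---------------------------------------------------------------------------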