"""
Initial setup: I cut the first 1.25 seconds from each recording and then trained a GMM on all
features of a class. The model for each class had 16 components and a diagonal covariance_type.
It reached around 52-56% accuracy. Simply changing the number of components changed the
accuracy: with 8 components (ca. 48%) and with 128 components (52%) it decreased, but with
32 components (53-60%) and 64 components (53-63%) it increased.
I also tried switching from a diagonal to a full covariance matrix.
I started with the following runs:
number of components | acc %
64                   | 44
16                   | 61
32                   | 53
8                    | 63
4                    | 61

"""

from sklearn.mixture import GaussianMixture
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank # pip install python_speech_features
 
from sklearn.mixture import GaussianMixture
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Root folders of the training and development recordings. The functions below
# list each folder with os.listdir and treat every entry as one class directory.
TRAIN_DIR = "./Separate_data/train/sounds"
DEV_DIR = "./Separate_data/dev/sounds"
# Number of mixture components per class model (passed to train_gmm in the main block).
N_COMPONENTS = 4
# covariance_type handed to GaussianMixture in train_gmm; 'diag' is the alternative.
COVA_TYPE = 'full' # 'diag'

def load_audio(src_folder):
    """Load MFCC features for every recording, grouped by class.

    Every sub-directory of ``src_folder`` is treated as one class and every
    file inside it is read as a WAV recording. Classes and recordings are
    visited in sorted order. Entries of ``src_folder`` that are not
    directories (stray files such as ``.DS_Store``) are skipped instead of
    crashing ``os.listdir``.

    Features are extracted the same way ``evaluate_models`` does it (leading
    samples dropped only when the clip is long enough, ``nfft=2048``), so the
    returned features are comparable with the ones scored at evaluation time.

    Args:
        src_folder: Directory containing one sub-directory per class.

    Returns:
        A tuple ``(all_classes_mfcc_feats, class_labels)``. The first item
        has one entry per class, each a list with one MFCC array per
        recording; the second item lists the class names in the same order.
    """
    all_classes_mfcc_feats = []
    class_labels = []

    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        if not os.path.isdir(class_dir):
            # Only directories represent classes.
            continue

        class_mfcc_feats = []
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)

            # Drop the first 20000 samples (1.25 s at 16 kHz). Clips that are
            # not longer than that are kept whole; slicing them would leave
            # an empty signal.
            if len(audio_sig) > 20000:
                audio_sig = audio_sig[20000:]

            class_mfcc_feats.append(mfcc(audio_sig, freq_sampling, nfft=2048))

        class_labels.append(person_class)
        all_classes_mfcc_feats.append(class_mfcc_feats)

    return all_classes_mfcc_feats, class_labels

def train_gmm(src_folder, n_components):
    """Train one Gaussian mixture model per class on pooled MFCC frames.

    Every sub-directory of ``src_folder`` is treated as one class and every
    file inside it is read as a WAV recording. The MFCC frames of all
    recordings of a class are stacked and a single ``GaussianMixture`` with
    ``covariance_type=COVA_TYPE`` is fitted on them.

    Entries of ``src_folder`` that are not directories are skipped, and
    features are extracted exactly as in ``evaluate_models`` (leading samples
    dropped only when the clip is long enough, ``nfft=2048``) so the models
    are scored on the same kind of features they were trained on.

    Args:
        src_folder: Directory containing one sub-directory per class.
        n_components: Number of mixture components of each class model.

    Returns:
        Dict mapping class name to its fitted ``GaussianMixture``. Classes
        without any recording are omitted.
    """
    class_features = {}  # class name -> fitted GMM

    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        if not os.path.isdir(class_dir):
            # Only directories represent classes.
            continue

        class_mfcc_feats = []
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)

            # Drop the first 20000 samples (1.25 s at 16 kHz). Clips that are
            # not longer than that are kept whole; slicing them would leave
            # an empty signal.
            if len(audio_sig) > 20000:
                audio_sig = audio_sig[20000:]

            class_mfcc_feats.append(mfcc(audio_sig, freq_sampling, nfft=2048))

        if not class_mfcc_feats:
            # Nothing to fit for a class without recordings.
            continue

        # Pool the frames of all recordings into one (frames, coefficients) matrix.
        class_mfcc_feats = np.vstack(class_mfcc_feats)
        print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}")

        gmm = GaussianMixture(n_components=n_components, covariance_type=COVA_TYPE)
        gmm.fit(class_mfcc_feats)
        class_features[person_class] = gmm

    return class_features

# Train one GMM per class:
#trained_models = train_gmm(TRAIN_DIR, n_components=N_COMPONENTS)

# To classify a single signal (same feature extraction as evaluate_models):
#test_features = mfcc(test_audio[20000:], 16000, nfft=2048)
#scores = {name: model.score(test_features) for name, model in trained_models.items()}
#predicted_class = max(scores.items(), key=lambda x: x[1])[0]

def evaluate_models(models, test_dir):
    """Classify every recording under ``test_dir`` with the trained models.

    Every sub-directory of ``test_dir`` is treated as one class whose name is
    the true label of the recordings inside it. Entries that are not
    directories are skipped. Each recording is scored against every model and
    assigned to the class whose model returns the highest ``score`` (for a
    scikit-learn ``GaussianMixture`` that is the average per-frame
    log-likelihood).

    A recording that cannot be read or scored is reported on stdout and left
    out of the results; evaluation continues with the next file.

    Args:
        models: Dict mapping class name to a fitted model with a ``score``
            method, as returned by ``train_gmm``.
        test_dir: Directory containing one sub-directory per class.

    Returns:
        A tuple ``(true_labels, pred_labels)`` of equally long lists, one
        entry per successfully processed recording.
    """
    true_labels = []
    pred_labels = []

    for person_class in sorted(os.listdir(test_dir)):
        class_dir = os.path.join(test_dir, person_class)
        if not os.path.isdir(class_dir):
            # Only directories represent classes.
            continue

        for audio_record in sorted(os.listdir(class_dir)):
            # Built outside the try so the error message can always name the file.
            audio_record_pth = os.path.join(class_dir, audio_record)
            try:
                freq_sampling, audio_sig = wavfile.read(audio_record_pth)

                # Drop the first 20000 samples unless the clip is too short.
                audio_sig = audio_sig[20000:] if len(audio_sig) > 20000 else audio_sig
                test_features = mfcc(audio_sig, freq_sampling, nfft=2048)

                # Score against all models and keep the best one.
                scores = {name: model.score(test_features) for name, model in models.items()}
                predicted_class = max(scores, key=scores.get)
            except Exception as e:
                # Deliberate best effort: one bad file must not abort the run.
                print(f"Error processing {audio_record_pth}: {e}")
                continue

            true_labels.append(person_class)
            pred_labels.append(predicted_class)

    return true_labels, pred_labels

def plot_confusion_matrix(true_labels, pred_labels, classes):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    The matrix is computed with ``classes`` as the label order; the same
    names are used as tick labels on both axes (x: predicted, y: true).

    Args:
        true_labels: Ground-truth class name of each evaluated recording.
        pred_labels: Predicted class name of each evaluated recording.
        classes: Class names fixing the row/column order of the matrix.
    """
    matrix = confusion_matrix(true_labels, pred_labels, labels=classes)

    plt.figure(figsize=(10, 8))
    heatmap_options = {
        "annot": True,       # write the count into every cell
        "fmt": 'd',          # counts are integers
        "cmap": 'Blues',
        "xticklabels": classes,
        "yticklabels": classes,
    }
    sns.heatmap(matrix, **heatmap_options)

    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Main execution: train one GMM per class, evaluate on the development set,
# then print a classification report and show the confusion matrix.
if __name__ == "__main__":
    # Train models
    print("Training GMM models...")
    trained_models = train_gmm(TRAIN_DIR, n_components=N_COMPONENTS)

    # Evaluate on development set
    print("\nEvaluating on development set...")
    true_labels, pred_labels = evaluate_models(trained_models, DEV_DIR)

    # Class names in a fixed (sorted) order, shared by the report and the plot
    classes = sorted(trained_models)

    # Print classification report.
    # labels= must accompany target_names=: otherwise scikit-learn derives the
    # label set from the data and raises ValueError when the dev labels and
    # predictions do not cover exactly the trained classes.
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels, labels=classes,
                                target_names=classes, zero_division=0))

    # Plot confusion matrix
    plot_confusion_matrix(true_labels, pred_labels, classes)

    # Example of classifying a single file (same feature extraction as evaluate_models)
    # test_file = "path_to_test_file.wav"
    # freq_sampling, audio_sig = wavfile.read(test_file)
    # test_features = mfcc(audio_sig[20000:], freq_sampling, nfft=2048)
    # scores = {name: model.score(test_features) for name, model in trained_models.items()}
    # predicted_class = max(scores.items(), key=lambda x: x[1])[0]
    # print(f"\nPredicted class: {predicted_class}")