"""
Test nfft - The audio is split into short overlapping windows
FFT is applied to rach step and it controls how many points the FFT computes
Up to now I used default 512 value and the classifier has 77% accuracy.
Now I try 1024 and 2048.
default acc=77%
1024    acc=81%
2048    acc=76%
2048 likely causes over-smoothing in the frequency domain, blurring out speaker specific details.
"""

from sklearn.mixture import GaussianMixture
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank, delta # pip install python_speech_features
 
from sklearn.mixture import GaussianMixture
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

import librosa # pip install librosa

from sklearn.preprocessing import StandardScaler

TRAIN_DIR = "./Separate_data/train/sounds"
DEV_DIR = "./Separate_data/dev/sounds"
NUM_CEPS = 20
NFFT = 1024

def load_audio(src_folder):

    all_classes_mfcc_feats = []
    class_labels = []
    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        
        class_mfcc_feats = []
        class_labels.append(person_class)

        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)

            audio_sig = audio_sig[20000:] # cut first 2 seconds

            mfcc_feats = mfcc(audio_sig, freq_sampling, numcep=NUM_CEPS, appendEnergy=False) # extract mfcc
            
            class_mfcc_feats.append(mfcc_feats) # add them to all mfcc for this class
        
        all_classes_mfcc_feats.append(class_mfcc_feats)
    return all_classes_mfcc_feats, class_labels

def get_feats(audio_sig, freq_sampling):
    mfcc_feats = mfcc(audio_sig, freq_sampling, numcep=NUM_CEPS, appendEnergy=False, nfft=NFFT) # extract mfcc
    #delta_feats = delta(mfcc_feats, 2)
    #delta_delta_feats = delta(delta_feats, 2)
    #combined_feats = np.hstack((mfcc_feats, delta_feats, delta_delta_feats))
    return mfcc_feats

def compute_global_scaler(src_folder):
    all_feats = []
    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)
            audio_sig = audio_sig[20000:]
            combined_feats = get_feats(audio_sig, freq_sampling)
            all_feats.append(combined_feats)
    all_feats = np.vstack(all_feats)
    scaler = StandardScaler()
    scaler.fit(all_feats)
    return scaler

def train_gmm(src_folder, scaler):

    class_features = {} # store features per class

    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        
        class_mfcc_feats = []

        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)

            audio_sig = audio_sig[20000:] # cut first 2 seconds

            combined_feats = get_feats(audio_sig, freq_sampling)
            # add simple augumentation
            #stretched = librosa.effects.time_stretch(audio_sig, rate=0.9)
            #mfcc_feats = np.vstack([mfcc_feats, mfcc(stretched[20000:], freq_sampling)]) #, nfft=2048)])

            class_mfcc_feats.append(combined_feats) # add them to all mfcc for this class

        if class_mfcc_feats:
            class_mfcc_feats = np.vstack(class_mfcc_feats)
            print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}")
            
            # Normalize features
            normalized_feats = scaler.transform(class_mfcc_feats)
            
            # train GMM for this class
            gmm = GaussianMixture(n_components=8, 
                                covariance_type='full',
                                max_iter=500,
                                n_init=3,
                                random_state=42)
            gmm.fit(normalized_feats)
            
            # Store GMM and its scaler
            class_features[person_class] = (gmm, scaler)

    return class_features

# train GMM
#trained_models = load_audio_train_gmm(TRAIN_DIR)

# To use for classification:
#test_features = mfcc(test_audio[20000:], 16000)
#scores = {name: model.score(test_features) for name, model in trained_models.items()}
#predicted_class = max(scores.items(), key=lambda x: x[1])[0]

def evaluate_models(models, test_dir, scaler):
    """Evaluate trained GMM models on test data"""
    true_labels = []
    pred_labels = []
    
    for person_class in sorted(os.listdir(test_dir)):
        class_dir = os.path.join(test_dir, person_class)
        
        for audio_record in sorted(os.listdir(class_dir)):
            try:
                audio_record_pth = os.path.join(class_dir, audio_record)
                freq_sampling, audio_sig = wavfile.read(audio_record_pth)
                
                audio_sig = audio_sig[20000:] if len(audio_sig) > 20000 else audio_sig
                test_features = get_feats(audio_sig, freq_sampling)
                
                # Score against all models
                scores = {
                    name: gmm.score(scaler.transform(test_features))
                    for name, (gmm, scaler) in models.items()
                }
                predicted_class = max(scores.items(), key=lambda x: x[1])[0]
                
                true_labels.append(person_class)
                pred_labels.append(predicted_class)
                
            except Exception as e:
                print(f"Error processing {audio_record_pth}: {str(e)}")
                continue
    
    return true_labels, pred_labels

def plot_confusion_matrix(true_labels, pred_labels, classes):
    """Plot confusion matrix"""
    cm = confusion_matrix(true_labels, pred_labels, labels=classes)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# Main execution
if __name__ == "__main__":

    # Prepare scaler
    print("Computing global scaler...")
    scaler = compute_global_scaler(TRAIN_DIR)

    # Train models
    print("Training GMM models...")
    trained_models = train_gmm(TRAIN_DIR, scaler)
    
    # Evaluate on development set
    print("\nEvaluating on development set...")
    true_labels, pred_labels = evaluate_models(trained_models, DEV_DIR, scaler)
    
    # Get unique class names
    classes = sorted(list(trained_models.keys()))
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels, target_names=classes, zero_division=0))
    
    # Plot confusion matrix
    plot_confusion_matrix(true_labels, pred_labels, classes)
    plt.show()
    
    # Example of classifying a single file
    # test_file = "path_to_test_file.wav"
    # freq_sampling, audio_sig = wavfile.read(test_file)
    # test_features = mfcc(audio_sig[20000:], freq_sampling)
    # scores = {name: model.score(test_features) for name, model in trained_models.items()}
    # predicted_class = max(scores.items(), key=lambda x: x[1])[0]
    # print(f"\nPredicted class: {predicted_class}")