"""Train one diagonal-covariance GMM per speaker on MFCC features and score eval WAVs."""

import os
import pickle
import sys

import librosa
import numpy as np
from sklearn.mixture import GaussianMixture

MODELS_DIR = "models_gmm"
TEST_DIR = "dev"      # folder with dev WAVs (one subfolder per speaker)
TRAIN_DIR = "train"   # folder with training WAVs (one subfolder per speaker)
PEOPLE_NUMBER = 31    # number of speakers in the dataset


def extract_mfcc(wav_path):
    """Load a WAV and return MFCC + delta + delta-delta features.

    The file is resampled to 16 kHz mono and leading/trailing silence is
    trimmed before feature extraction.

    Returns:
        np.ndarray of shape (T, 39): 13 MFCCs stacked with their first and
        second derivatives, transposed to frames-first layout.
    """
    signal, sr = librosa.load(wav_path, sr=16000, mono=True)
    signal, _ = librosa.effects.trim(signal, top_db=20)  # remove silence
    mfcc = librosa.feature.mfcc(
        y=signal, sr=sr, n_mfcc=13,
        n_fft=512, hop_length=240, win_length=400, n_mels=23,
    )
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Stack MFCC + delta + delta2 -> (39, T), then transpose to (T, 39).
    combined = np.vstack([mfcc, delta, delta2])
    return combined.T


def wav16khz2mfcc(dir_name, n_mfcc=13):
    """Return a list of feature matrices, one per .wav file in dir_name.

    Files are processed in sorted order for reproducibility.

    Args:
        dir_name: directory containing .wav files.
        n_mfcc: unused; kept for backward compatibility (extract_mfcc
            fixes the MFCC count at 13).
    """
    features = []
    for file in sorted(os.listdir(dir_name)):
        if file.endswith('.wav'):
            features.append(extract_mfcc(os.path.join(dir_name, file)))
    return features


def train_gmm_models(is_final=True, train_root=TRAIN_DIR, dev_root=TEST_DIR,
                     output_dir=MODELS_DIR, n_components=32):
    """Fit and pickle one GMM per speaker.

    Args:
        is_final: when True, pool dev data with training data (used for the
            final model after hyperparameters are fixed).
        train_root: root folder of per-speaker training subfolders.
        dev_root: root folder of per-speaker dev subfolders.
        output_dir: where gmm_XX.pkl files are written (created if missing).
        n_components: number of Gaussian mixture components per speaker.
    """
    os.makedirs(output_dir, exist_ok=True)
    for person_id in range(1, PEOPLE_NUMBER + 1):
        train_features = wav16khz2mfcc(os.path.join(train_root, str(person_id)))
        if is_final:
            # Final models also consume the dev set for extra data.
            dev_features = wav16khz2mfcc(os.path.join(dev_root, str(person_id)))
            all_features = train_features + dev_features
        else:
            all_features = train_features

        if not all_features:
            print(f'No WAV files found for person {person_id}')
            continue

        all_feats = np.vstack(all_features)
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type='diag',
            reg_covar=1e-3,   # regularize to avoid singular covariances
            max_iter=200,
        )
        gmm.fit(all_feats)

        model_path = os.path.join(output_dir, f'gmm_{person_id:02d}.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(gmm, f)


def load_models():
    """Unpickle and return the per-speaker GMMs in speaker-id order (1..N)."""
    models = []
    for i in range(1, PEOPLE_NUMBER + 1):
        path = os.path.join(MODELS_DIR, f"gmm_{i:02d}.pkl")
        with open(path, "rb") as f:
            models.append(pickle.load(f))
    return models


def predict(mfcc, models):
    """Score one utterance against every model.

    Returns:
        (best_label, scores): 1-based index of the highest-scoring model,
        and the list of per-model average log-likelihoods.
    """
    scores = [model.score(mfcc) for model in models]
    return np.argmax(scores) + 1, scores


def evaluate_all(EVAL_DIR='eval', OUTPUT_FILE='voice_results.txt'):
    """Score every .wav under EVAL_DIR and write '<name> <label> <scores...>' lines."""
    models = load_models()
    with open(OUTPUT_FILE, "w") as out_f:
        for root, dirs, files in os.walk(EVAL_DIR):
            for file in sorted(files):
                if file.endswith(".wav"):
                    wav_path = os.path.join(root, file)
                    name = os.path.splitext(file)[0]
                    mfcc = extract_mfcc(wav_path)
                    pred_label, scores = predict(mfcc, models)
                    line = f"{name} {pred_label} " + " ".join(f"{s:.2f}" for s in scores)
                    out_f.write(line + "\n")


if __name__ == "__main__":
    # Three CLI arguments -> argv has length 4. (The original checked for 3
    # and then crashed reading sys.argv[3].)
    if len(sys.argv) != 4:
        print("Usage: python3 train_image.py [test_dir] [results_file] [is_final]")
        sys.exit(1)
    # Parse the flag explicitly: any non-empty string (even "False") is
    # truthy, so passing sys.argv[3] straight through could never disable it.
    final_flag = sys.argv[3].strip().lower() in ("1", "true", "yes")
    train_gmm_models(final_flag)
    evaluate_all(sys.argv[1], sys.argv[2])