"""Train one diagonal-covariance GMM per speaker on MFCC features and score eval WAVs."""

import os
import pickle
import sys

import librosa
import numpy as np
from sklearn.mixture import GaussianMixture

MODELS_DIR = "models_gmm"
TEST_DIR = "dev"      # folder with dev WAVs (one subfolder per speaker)
TRAIN_DIR = "train"   # folder with training WAVs (one subfolder per speaker)
PEOPLE_NUMBER = 31    # number of speakers in the dataset


def extract_mfcc(wav_path):
    """Load a WAV and return MFCC + delta + delta-delta features.

    The file is resampled to 16 kHz mono and leading/trailing silence is
    trimmed before feature extraction.

    Returns:
        np.ndarray of shape (T, 39): 13 MFCCs stacked with their first and
        second derivatives, transposed to frames-first layout.
    """
    signal, sr = librosa.load(wav_path, sr=16000, mono=True)
    signal, _ = librosa.effects.trim(signal, top_db=20)  # remove silence
    mfcc = librosa.feature.mfcc(
        y=signal, sr=sr, n_mfcc=13,
        n_fft=512, hop_length=240, win_length=400, n_mels=23,
    )
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Stack MFCC + delta + delta2 -> (39, T), then transpose to (T, 39).
    combined = np.vstack([mfcc, delta, delta2])
    return combined.T


def wav16khz2mfcc(dir_name, n_mfcc=13):
    """Return a list of feature matrices, one per .wav file in dir_name.

    Files are processed in sorted order for reproducibility.

    Args:
        dir_name: directory containing .wav files.
        n_mfcc: unused; kept for backward compatibility (extract_mfcc
            fixes the MFCC count at 13).
    """
    features = []
    for file in sorted(os.listdir(dir_name)):
        if file.endswith('.wav'):
            features.append(extract_mfcc(os.path.join(dir_name, file)))
    return features


def train_gmm_models(is_final=True, train_root=TRAIN_DIR, dev_root=TEST_DIR,
                     output_dir=MODELS_DIR, n_components=32):
    """Fit and pickle one GMM per speaker.

    Args:
        is_final: when True, pool dev data with training data (used for the
            final model after hyperparameters are fixed).
        train_root: root folder of per-speaker training subfolders.
        dev_root: root folder of per-speaker dev subfolders.
        output_dir: where gmm_XX.pkl files are written (created if missing).
        n_components: number of Gaussian mixture components per speaker.
    """
    os.makedirs(output_dir, exist_ok=True)
    for person_id in range(1, PEOPLE_NUMBER + 1):
        train_features = wav16khz2mfcc(os.path.join(train_root, str(person_id)))
        if is_final:
            # Final models also consume the dev set for extra data.
            dev_features = wav16khz2mfcc(os.path.join(dev_root, str(person_id)))
            all_features = train_features + dev_features
        else:
            all_features = train_features

        if not all_features:
            print(f'No WAV files found for person {person_id}')
            continue

        all_feats = np.vstack(all_features)
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type='diag',
            reg_covar=1e-3,   # regularize to avoid singular covariances
            max_iter=200,
        )
        gmm.fit(all_feats)

        model_path = os.path.join(output_dir, f'gmm_{person_id:02d}.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(gmm, f)


def load_models():
    """Unpickle and return the per-speaker GMMs in speaker-id order (1..N)."""
    models = []
    for i in range(1, PEOPLE_NUMBER + 1):
        path = os.path.join(MODELS_DIR, f"gmm_{i:02d}.pkl")
        with open(path, "rb") as f:
            models.append(pickle.load(f))
    return models


def predict(mfcc, models):
    """Score one utterance against every model.

    Returns:
        (best_label, scores): 1-based index of the highest-scoring model,
        and the list of per-model average log-likelihoods.
    """
    scores = [model.score(mfcc) for model in models]
    return np.argmax(scores) + 1, scores


def evaluate_all(EVAL_DIR='eval', OUTPUT_FILE='voice_results.txt'):
    """Score every .wav under EVAL_DIR and write '<name> <label> <scores...>' lines."""
    models = load_models()
    with open(OUTPUT_FILE, "w") as out_f:
        for root, dirs, files in os.walk(EVAL_DIR):
            for file in sorted(files):
                if file.endswith(".wav"):
                    wav_path = os.path.join(root, file)
                    name = os.path.splitext(file)[0]
                    mfcc = extract_mfcc(wav_path)
                    pred_label, scores = predict(mfcc, models)
                    line = f"{name} {pred_label} " + " ".join(f"{s:.2f}" for s in scores)
                    out_f.write(line + "\n")


if __name__ == "__main__":
    # Three CLI arguments -> argv has length 4. (The original checked for 3
    # and then crashed reading sys.argv[3].)
    if len(sys.argv) != 4:
        print("Usage: python3 train_image.py [test_dir] [results_file] [is_final]")
        sys.exit(1)
    # Parse the flag explicitly: any non-empty string (even "False") is
    # truthy, so passing sys.argv[3] straight through could never disable it.
    final_flag = sys.argv[3].strip().lower() in ("1", "true", "yes")
    train_gmm_models(final_flag)
    evaluate_all(sys.argv[1], sys.argv[2])