import os import numpy as np import librosa from sklearn.mixture import GaussianMixture from glob import glob import random import joblib # ___ CONFIGURATION ___ TRAIN_DIR = "merged_data" # Path to train directory DEV_DIR = "dev" # Path to dev directory (test) OUTPUT_MODEL_DIR = "models/voice/gmm" N_MFCC = 12 GMM_COMPONENTS = 16 SILENCE_THRESHOLD = 17 SPEAKER_COUNT = 31 INIT_TIME = 1.0 # Initial duration of every record without speech AUGMENTATION_COUNT = 2 # number of augmentations per file def augment_audio(y, sr): """Randomly choose one type of augmentation and its value and apply it to the current voice record.""" aug_type = random.choice(['pitch', 'stretch', 'noise']) if aug_type == 'pitch': # Soft pitch shift: +-0.5 semitones steps = random.uniform(-0.5, 0.5) y = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps) elif aug_type == 'stretch': # Soft time stretch: 0.98x to 1.02x speed rate = random.uniform(0.98, 1.02) y = librosa.effects.time_stretch(y, rate=rate) elif aug_type == 'noise': # Very low noise noise_amp = 0.001 * np.random.uniform() * np.amax(y) y = y + noise_amp * np.random.normal(size=y.shape) return y def silence_crop(y, sr): """Removes silent sections from the voice record.""" y = y[int(INIT_TIME * sr):] # Detect non-silent intervals intervals = librosa.effects.split(y, top_db=SILENCE_THRESHOLD) # Concatenate non-silent intervals (remove silent intervals) y_nonsilent = np.concatenate([y[start:end] for start, end in intervals]) return y_nonsilent def mean_normalize(features): """Perform mean normalization on MFCC features.""" mean = np.mean(features, axis=0) return features - mean def extract_mfcc(file_path, n_mfcc=N_MFCC, augment=False): """Extract MFCC from the voice record.""" y, sr = librosa.load(file_path, sr=None) if augment: y = augment_audio(y, sr) y_cropped = silence_crop(y, sr) mfcc = librosa.feature.mfcc(y=y_cropped, sr=sr, n_mfcc=n_mfcc) features = mfcc.T # Centers the features around zero per dimension. features = mean_normalize(features) return features def load_files_by_speaker(base_dir): """Load .wav files by speaker (subfolders)""" speaker_files = {} for i in range(1, SPEAKER_COUNT + 1): speaker_id = str(i) # Get .wav filenames according to the speakers subfolder with data path_pattern = os.path.join(base_dir, speaker_id, "*.wav") files = sorted(glob(path_pattern)) if files: speaker_files[speaker_id] = files return speaker_files def train_gmm(features, n_components=GMM_COMPONENTS): """Train a Gaussian Mixture Model (GMM) on the given MFCC feature set""" gmm = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=200, reg_covar=1e-3) gmm.fit(features) return gmm def classify(features, gmm_models): """Classify the speaker by comparing MFCC features to trained GMMs.""" scores = {sid: model.score(features) for sid, model in gmm_models.items()} predicted = max(scores, key=scores.get) return predicted, scores def evaluate_model(gmm_models): """Evaluate trained GMM speaker models on the test dataset.""" print("\nModel evaluation:") test_data = load_files_by_speaker(DEV_DIR) correct = 0 total = 0 for true_speaker_id, file_list in test_data.items(): for file in file_list: features = extract_mfcc(file) predicted, scores = classify(features, gmm_models) print(f"Test file: {os.path.basename(file)} | True: {true_speaker_id} | Predicted: {predicted}") if predicted == true_speaker_id: correct += 1 total += 1 accuracy = correct / total if total > 0 else 0 print(f"\nFinal Accuracy: {accuracy * 100:.2f}%") # ___ MAIN SCRIPT ___ if __name__ == "__main__": train_data = load_files_by_speaker(TRAIN_DIR) # Train models gmm_models = {} for speaker_id, file_list in train_data.items(): print(f"Training GMM for speaker {speaker_id}...") all_features = [] for file_path in file_list: # Original features = extract_mfcc(file_path, augment=False) all_features.append(features) # Augmented versions (conditionally) for _ in range(AUGMENTATION_COUNT): features_aug = extract_mfcc(file_path, augment=True) all_features.append(features_aug) # train gmm for current speaker all_features = np.vstack(all_features) gmm = train_gmm(all_features) gmm_models[speaker_id] = gmm #evaluate_model(gmm_models) # Save trained models for predicting os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True) for speaker_id, model in gmm_models.items(): joblib.dump(model, f"{OUTPUT_MODEL_DIR}/gmm_speaker_{speaker_id}.joblib") print("Trained GMM model saved.")