""" Test nfft - The audio is split into short overlapping windows FFT is applied to rach step and it controls how many points the FFT computes Up to now I used default 512 value and the classifier has 77% accuracy. Now I try 1024 and 2048. default acc=77% 1024 acc=81% 2048 acc=76% 2048 likely causes over-smoothing in the frequency domain, blurring out speaker specific details. """ from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank, delta # pip install python_speech_features from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank from sklearn.metrics import confusion_matrix, classification_report import seaborn as sns import librosa # pip install librosa from sklearn.preprocessing import StandardScaler TRAIN_DIR = "./Separate_data/train/sounds" DEV_DIR = "./Separate_data/dev/sounds" NUM_CEPS = 20 NFFT = 1024 def load_audio(src_folder): all_classes_mfcc_feats = [] class_labels = [] for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) class_mfcc_feats = [] class_labels.append(person_class) for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] # cut first 2 seconds mfcc_feats = mfcc(audio_sig, freq_sampling, numcep=NUM_CEPS, appendEnergy=False) # extract mfcc class_mfcc_feats.append(mfcc_feats) # add them to all mfcc for this class all_classes_mfcc_feats.append(class_mfcc_feats) return all_classes_mfcc_feats, class_labels def get_feats(audio_sig, freq_sampling): mfcc_feats = mfcc(audio_sig, freq_sampling, numcep=NUM_CEPS, appendEnergy=False, nfft=NFFT) # extract mfcc #delta_feats = delta(mfcc_feats, 2) #delta_delta_feats = delta(delta_feats, 2) #combined_feats = np.hstack((mfcc_feats, delta_feats, delta_delta_feats)) return mfcc_feats def compute_global_scaler(src_folder): all_feats = [] for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] combined_feats = get_feats(audio_sig, freq_sampling) all_feats.append(combined_feats) all_feats = np.vstack(all_feats) scaler = StandardScaler() scaler.fit(all_feats) return scaler def train_gmm(src_folder, scaler): class_features = {} # store features per class for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) class_mfcc_feats = [] for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] # cut first 2 seconds combined_feats = get_feats(audio_sig, freq_sampling) # add simple augumentation #stretched = librosa.effects.time_stretch(audio_sig, rate=0.9) #mfcc_feats = np.vstack([mfcc_feats, mfcc(stretched[20000:], freq_sampling)]) #, nfft=2048)]) class_mfcc_feats.append(combined_feats) # add them to all mfcc for this class if class_mfcc_feats: class_mfcc_feats = np.vstack(class_mfcc_feats) print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}") # Normalize features 
def train_gmm(src_folder, scaler):
    class_features = {}  # store the fitted GMM (and its scaler) per class
    for person_class in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, person_class)
        class_mfcc_feats = []
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            freq_sampling, audio_sig = wavfile.read(audio_record_pth)
            audio_sig = audio_sig[20000:]  # cut the first 20000 samples (~2 s of lead-in)
            combined_feats = get_feats(audio_sig, freq_sampling)
            # add simple augmentation (see the hedged sketch at the end of the file)
            #stretched = librosa.effects.time_stretch(audio_sig, rate=0.9)
            #mfcc_feats = np.vstack([mfcc_feats, mfcc(stretched[20000:], freq_sampling)])  #, nfft=2048)])
            class_mfcc_feats.append(combined_feats)  # collect all MFCCs for this class
        if class_mfcc_feats:
            class_mfcc_feats = np.vstack(class_mfcc_feats)
            print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}")
            # normalize features with the global scaler
            normalized_feats = scaler.transform(class_mfcc_feats)
            # train a GMM for this class
            gmm = GaussianMixture(n_components=8, covariance_type='full',
                                  max_iter=500, n_init=3, random_state=42)
            gmm.fit(normalized_feats)
            # store the GMM together with its scaler
            class_features[person_class] = (gmm, scaler)
    return class_features


# train GMM
#trained_models = train_gmm(TRAIN_DIR, scaler)

# To use for classification:
#test_features = get_feats(test_audio[20000:], 16000)
#scores = {name: gmm.score(scaler.transform(test_features))
#          for name, (gmm, scaler) in trained_models.items()}
#predicted_class = max(scores.items(), key=lambda x: x[1])[0]


def evaluate_models(models, test_dir, scaler):
    """Evaluate the trained GMM models on test data."""
    true_labels = []
    pred_labels = []
    for person_class in sorted(os.listdir(test_dir)):
        class_dir = os.path.join(test_dir, person_class)
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            try:
                freq_sampling, audio_sig = wavfile.read(audio_record_pth)
                audio_sig = audio_sig[20000:] if len(audio_sig) > 20000 else audio_sig
                test_features = get_feats(audio_sig, freq_sampling)
                # score against all models with the global scaler
                # (each stored scaler is the same object anyway)
                scores = {
                    name: gmm.score(scaler.transform(test_features))
                    for name, (gmm, _) in models.items()
                }
                predicted_class = max(scores.items(), key=lambda x: x[1])[0]
                true_labels.append(person_class)
                pred_labels.append(predicted_class)
            except Exception as e:
                print(f"Error processing {audio_record_pth}: {e}")
                continue
    return true_labels, pred_labels


def plot_confusion_matrix(true_labels, pred_labels, classes):
    """Plot the confusion matrix."""
    cm = confusion_matrix(true_labels, pred_labels, labels=classes)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()


# Main execution
if __name__ == "__main__":
    # prepare the scaler
    print("Computing global scaler...")
    scaler = compute_global_scaler(TRAIN_DIR)

    # train the models
    print("Training GMM models...")
    trained_models = train_gmm(TRAIN_DIR, scaler)

    # evaluate on the development set
    print("\nEvaluating on development set...")
    true_labels, pred_labels = evaluate_models(trained_models, DEV_DIR, scaler)

    # get the unique class names
    classes = sorted(trained_models.keys())

    # print the classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels,
                                target_names=classes, zero_division=0))

    # plot the confusion matrix (plot_confusion_matrix calls plt.show() itself)
    plot_confusion_matrix(true_labels, pred_labels, classes)

    # Example of classifying a single file
    # test_file = "path_to_test_file.wav"
    # freq_sampling, audio_sig = wavfile.read(test_file)
    # test_features = get_feats(audio_sig[20000:], freq_sampling)
    # scores = {name: gmm.score(scaler.transform(test_features))
    #           for name, (gmm, _) in trained_models.items()}
    # predicted_class = max(scores.items(), key=lambda x: x[1])[0]
    # print(f"\nPredicted class: {predicted_class}")
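
    # Hedged sketch of the time-stretch augmentation commented out in
    # train_gmm(): librosa.effects.time_stretch expects float audio, so the
    # raw 16-bit PCM signal would have to be converted first (the rate=0.9
    # and the /32768 scaling are illustrative assumptions, not values
    # validated in this experiment; the signal is already trimmed, so no
    # second [20000:] cut is needed).
    # y = audio_sig.astype(np.float32) / 32768.0   # int16 PCM -> [-1.0, 1.0]
    # stretched = librosa.effects.time_stretch(y, rate=0.9)
    # aug_feats = get_feats(stretched, freq_sampling)
    # class_mfcc_feats.append(aug_feats)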