"""GMM classification of audio recordings from MFCC features.

One Gaussian mixture model is trained per class on the MFCC frames of all
recordings of that class; a recording is assigned to the class whose model
gives it the highest average log-likelihood.

Experiment notes
----------------
Initial setup. I cut the first 1.25 seconds from each record and then
trained a GMM on all features from a class. The model for each class had
16 components and a diagonal covariance_type. It had around 52-56% accuracy.

Simply changing the number of components changed the accuracy. For
8 components (circa 48%) and for 128 components (52%) the accuracy
decreased, but for 32 components (53-60%) and 64 components (53-63%) the
accuracy increased.

I also tried to switch from a diagonal to a full covariance matrix:

    number of components | acc %
    ---------------------+------
                      64 | 44
                      16 | 61
                      32 | 53
                       8 | 63
                       4 | 61
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from python_speech_features import mfcc, logfbank  # pip install python_speech_features
from scipy.io import wavfile
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.mixture import GaussianMixture

TRAIN_DIR = "./Separate_data/train/sounds"
DEV_DIR = "./Separate_data/dev/sounds"
N_COMPONENTS = 4
COVA_TYPE = 'full'  # 'diag'

# Number of leading samples dropped from every recording
# (20000 samples is 1.25 s if the recordings are sampled at 16 kHz).
SKIP_SAMPLES = 20000

# FFT size handed to mfcc(). The same value is used for training and for
# evaluation so the models are scored on features extracted exactly the way
# they were trained.
NFFT = 2048


def _iter_class_dirs(src_folder):
    """Yield ``(class_name, class_dir)`` for each sub-directory, sorted by name."""
    for class_name in sorted(os.listdir(src_folder)):
        class_dir = os.path.join(src_folder, class_name)
        # Stray files next to the class folders are not classes.
        if os.path.isdir(class_dir):
            yield class_name, class_dir


def _extract_mfcc(audio_path):
    """Read one WAV file and return its MFCC feature matrix (one row per frame)."""
    freq_sampling, audio_sig = wavfile.read(audio_path)
    # Drop the leading samples, but keep recordings that are too short to
    # trim rather than reducing them to an empty signal.
    if len(audio_sig) > SKIP_SAMPLES:
        audio_sig = audio_sig[SKIP_SAMPLES:]
    return mfcc(audio_sig, freq_sampling, nfft=NFFT)


def load_audio(src_folder):
    """Extract MFCC features for every recording under ``src_folder``.

    ``src_folder`` must contain one sub-directory per class, each holding
    that class's WAV files.

    Returns:
        A tuple ``(all_classes_mfcc_feats, class_labels)``.
        ``all_classes_mfcc_feats[i]`` is the list of per-recording MFCC
        arrays for the class named ``class_labels[i]``.
    """
    all_classes_mfcc_feats = []
    class_labels = []
    for person_class, class_dir in _iter_class_dirs(src_folder):
        class_labels.append(person_class)
        all_classes_mfcc_feats.append([
            _extract_mfcc(os.path.join(class_dir, audio_record))
            for audio_record in sorted(os.listdir(class_dir))
        ])
    return all_classes_mfcc_feats, class_labels


def train_gmm(src_folder, n_components, covariance_type=None, random_state=None):
    """Train one GaussianMixture per class found under ``src_folder``.

    Args:
        src_folder: Directory with one sub-directory of WAV files per class.
        n_components: Number of mixture components of every model.
        covariance_type: Covariance type passed to GaussianMixture;
            ``None`` (default) uses the module-level ``COVA_TYPE``.
        random_state: Seed passed to GaussianMixture. Leave as ``None`` for
            the unseeded behaviour, or set it to make runs repeatable.

    Returns:
        Dict mapping class name to its fitted GaussianMixture. Classes
        without any recordings are skipped.
    """
    if covariance_type is None:
        covariance_type = COVA_TYPE

    class_models = {}
    for person_class, class_dir in _iter_class_dirs(src_folder):
        per_record_feats = [
            _extract_mfcc(os.path.join(class_dir, audio_record))
            for audio_record in sorted(os.listdir(class_dir))
        ]
        if not per_record_feats:
            continue

        # Pool the frames of all recordings: the GMM models the class as a
        # whole, not individual recordings.
        class_mfcc_feats = np.vstack(per_record_feats)
        print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}")

        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            random_state=random_state,
        )
        gmm.fit(class_mfcc_feats)
        class_models[person_class] = gmm
    return class_models


def classify_file(models, audio_path):
    """Return the name of the model that best explains one WAV file.

    Args:
        models: Dict mapping class name to a fitted GaussianMixture.
        audio_path: Path of the WAV file to classify.

    Returns:
        The key of ``models`` whose model gives the highest ``score`` (average
        per-frame log-likelihood) on the file's MFCC features.
    """
    features = _extract_mfcc(audio_path)
    scores = {name: model.score(features) for name, model in models.items()}
    return max(scores, key=scores.get)


def evaluate_models(models, test_dir):
    """Evaluate trained GMM models on test data.

    Args:
        models: Dict mapping class name to a fitted GaussianMixture.
        test_dir: Directory with one sub-directory of WAV files per class.

    Returns:
        A tuple ``(true_labels, pred_labels)`` of equal-length lists with one
        entry per successfully processed recording. Recordings that fail to
        load or score are reported on stdout and left out.
    """
    true_labels = []
    pred_labels = []
    for person_class, class_dir in _iter_class_dirs(test_dir):
        for audio_record in sorted(os.listdir(class_dir)):
            audio_record_pth = os.path.join(class_dir, audio_record)
            try:
                predicted_class = classify_file(models, audio_record_pth)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                print(f"Error processing {audio_record_pth}: {str(e)}")
                continue
            true_labels.append(person_class)
            pred_labels.append(predicted_class)
    return true_labels, pred_labels


def plot_confusion_matrix(true_labels, pred_labels, classes):
    """Plot the confusion matrix as an annotated heatmap.

    Args:
        true_labels: Ground-truth class name per recording.
        pred_labels: Predicted class name per recording.
        classes: Class names fixing the row/column order of the matrix.
    """
    cm = confusion_matrix(true_labels, pred_labels, labels=classes)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()


def main():
    """Train on TRAIN_DIR, evaluate on DEV_DIR and report the results."""
    print("Training GMM models...")
    trained_models = train_gmm(TRAIN_DIR, n_components=N_COMPONENTS)

    print("\nEvaluating on development set...")
    true_labels, pred_labels = evaluate_models(trained_models, DEV_DIR)
    if not true_labels:
        print("No recordings could be evaluated.")
        return

    classes = sorted(trained_models)

    # `labels` pins the rows of the report to the trained classes so that
    # `target_names` stays aligned even when a class is absent from the
    # development set.
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels, labels=classes,
                                target_names=classes, zero_division=0))

    plot_confusion_matrix(true_labels, pred_labels, classes)

    # To classify a single file:
    #     print(classify_file(trained_models, "path_to_test_file.wav"))


if __name__ == "__main__":
    main()