""" No PCA. Again tried to increase NUM_CEPS = 24 got 77% and with decreased NUM_CEPS = 16 got 68% accuracy. So I set the NUM_CEPS back to 20 and I tried to remove silent parts of the records. This helpe :-) With top db settings 15db I got 81% accuracy 20db I got 84% accuracy 25db I got 87% accuracy 30db I got 84% accuracy So I will keep the silence removal set to top 25db. """ import os os.environ["OMP_NUM_THREADS"] = "1" from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank, delta # pip install python_speech_features from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank from sklearn.metrics import confusion_matrix, classification_report import seaborn as sns import librosa # pip install librosa from sklearn.preprocessing import StandardScaler TRAIN_DIR = "./Separate_data/train/sounds" DEV_DIR = "./Separate_data/dev/sounds" NUM_CEPS = 20 NFFT = 1024 SILENCE_TOP_DB = 25 def load_audio(src_folder): all_classes_mfcc_feats = [] class_labels = [] for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) class_mfcc_feats = [] class_labels.append(person_class) for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] # cut first 2 seconds mfcc_feats = mfcc(audio_sig, freq_sampling, numcep=NUM_CEPS, appendEnergy=False) # extract mfcc class_mfcc_feats.append(mfcc_feats) # add them to all mfcc for this class all_classes_mfcc_feats.append(class_mfcc_feats) return all_classes_mfcc_feats, class_labels def remove_silence(audio_sig, top_db=SILENCE_TOP_DB): intervals = librosa.effects.split(audio_sig, top_db=top_db) non_silent = [audio_sig[start:end] for start, end in intervals] return np.concatenate(non_silent) def get_feats(audio_sig, freq_sampling): audio_sig_no_silence = remove_silence(audio_sig) mfcc_feats = mfcc(audio_sig_no_silence, freq_sampling, numcep=NUM_CEPS, appendEnergy=False, nfft=NFFT) # extract mfcc #delta_feats = delta(mfcc_feats, 2) #delta_delta_feats = delta(delta_feats, 2) #combined_feats = np.hstack((mfcc_feats, delta_feats, delta_delta_feats)) return mfcc_feats def compute_global_scaler(src_folder): all_feats = [] for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] combined_feats = get_feats(audio_sig, freq_sampling) all_feats.append(combined_feats) all_feats = np.vstack(all_feats) scaler = StandardScaler() scaler.fit(all_feats) return scaler def train_gmm(src_folder, scaler): class_features = {} # store features per class for person_class in sorted(os.listdir(src_folder)): class_dir = os.path.join(src_folder, person_class) class_mfcc_feats = [] for audio_record in sorted(os.listdir(class_dir)): audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] # cut first 2 seconds combined_feats = get_feats(audio_sig, freq_sampling) # add simple augumentation #stretched = librosa.effects.time_stretch(audio_sig, rate=0.9) 
#mfcc_feats = np.vstack([mfcc_feats, mfcc(stretched[20000:], freq_sampling)]) #, nfft=2048)]) class_mfcc_feats.append(combined_feats) # add them to all mfcc for this class if class_mfcc_feats: class_mfcc_feats = np.vstack(class_mfcc_feats) print(f"\nTotal frames for {person_class}: {class_mfcc_feats.shape[0]}") # Normalize features normalized_feats = scaler.transform(class_mfcc_feats) # train GMM for this class gmm = GaussianMixture(n_components=8, covariance_type='full', max_iter=500, n_init=3, random_state=42) gmm.fit(normalized_feats) # Store GMM and its scaler class_features[person_class] = (gmm, scaler) return class_features # train GMM #trained_models = load_audio_train_gmm(TRAIN_DIR) # To use for classification: #test_features = mfcc(test_audio[20000:], 16000) #scores = {name: model.score(test_features) for name, model in trained_models.items()} #predicted_class = max(scores.items(), key=lambda x: x[1])[0] def evaluate_models(models, test_dir, scaler): """Evaluate trained GMM models on test data""" true_labels = [] pred_labels = [] for person_class in sorted(os.listdir(test_dir)): class_dir = os.path.join(test_dir, person_class) for audio_record in sorted(os.listdir(class_dir)): try: audio_record_pth = os.path.join(class_dir, audio_record) freq_sampling, audio_sig = wavfile.read(audio_record_pth) audio_sig = audio_sig[20000:] if len(audio_sig) > 20000 else audio_sig test_features = get_feats(audio_sig, freq_sampling) # Score against all models scores = { name: gmm.score(scaler.transform(test_features)) for name, (gmm, scaler) in models.items() } predicted_class = max(scores.items(), key=lambda x: x[1])[0] true_labels.append(person_class) pred_labels.append(predicted_class) except Exception as e: print(f"Error processing {audio_record_pth}: {str(e)}") continue return true_labels, pred_labels def plot_confusion_matrix(true_labels, pred_labels, classes): """Plot confusion matrix""" cm = confusion_matrix(true_labels, pred_labels, labels=classes) plt.figure(figsize=(10, 8)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes) plt.xlabel('Predicted') plt.ylabel('True') plt.title('Confusion Matrix') plt.show() # Main execution if __name__ == "__main__": # Prepare scaler print("Computing global scaler...") scaler = compute_global_scaler(TRAIN_DIR) # Train models print("Training GMM models...") trained_models = train_gmm(TRAIN_DIR, scaler) # Evaluate on development set print("\nEvaluating on development set...") true_labels, pred_labels = evaluate_models(trained_models, DEV_DIR, scaler) # Get unique class names classes = sorted(list(trained_models.keys())) # Print classification report print("\nClassification Report:") print(classification_report(true_labels, pred_labels, target_names=classes, zero_division=0)) # Plot confusion matrix plot_confusion_matrix(true_labels, pred_labels, classes) plt.show() # Example of classifying a single file # test_file = "path_to_test_file.wav" # freq_sampling, audio_sig = wavfile.read(test_file) # test_features = mfcc(audio_sig[20000:], freq_sampling) # scores = {name: model.score(test_features) for name, model in trained_models.items()} # predicted_class = max(scores.items(), key=lambda x: x[1])[0] # print(f"\nPredicted class: {predicted_class}")
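    # A minimal sketch (not part of the original experiment) for eyeballing the
    # 15/20/25/30 dB comparison from the docstring without retraining: it shows
    # how much of one recording survives remove_silence() at each threshold.
    # The WAV path below is hypothetical; point it at any dataset recording.
    # wav_path = "./Separate_data/train/sounds/<person>/<record>.wav"
    # _, sig = wavfile.read(wav_path)
    # sig = sig[20000:]  # skip the recording onset, as in training
    # for db in (15, 20, 25, 30):
    #     kept = remove_silence(sig, top_db=db)
    #     print(f"top_db={db}: kept {len(kept) / len(sig):.1%} of samples")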