""" From v9 which performed best on the jack knife in v11 70% accuracy Now I try to apply cepstral mean normalization which normalizes MFCCs accross recordings by removing channel/microphone bias. I got 90% accuracy. """ import os os.environ["OMP_NUM_THREADS"] = "1" from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank, delta # pip install python_speech_features from sklearn.mixture import GaussianMixture import os import numpy as np import matplotlib.pyplot as plt from scipy.io import wavfile from python_speech_features import mfcc, logfbank from sklearn.metrics import confusion_matrix, classification_report import seaborn as sns import librosa # pip install librosa from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline import joblib NUM_CEPS = 20 NFFT = 1024 SILENCE_TOP_DB = 25 EVAL_DIR = "SUR_projekt2024-2025_eval/eval/" # load model trained_models = joblib.load("audio_gmm_model.pkl") def remove_silence(audio_sig, top_db=SILENCE_TOP_DB): intervals = librosa.effects.split(audio_sig, top_db=top_db) non_silent = [audio_sig[start:end] for start, end in intervals] return np.concatenate(non_silent) def get_feats(audio_sig, freq_sampling): audio_sig_no_silence = remove_silence(audio_sig) mfcc_feats = mfcc(audio_sig_no_silence, freq_sampling, numcep=20, appendEnergy=False, nfft=1024) mfcc_feats -= np.mean(mfcc_feats, axis=0, keepdims=True) return mfcc_feats def predict_class(wav_path): fs, audio = wavfile.read(wav_path) audio = audio[20000:] # Trim initial 2 seconds feats = get_feats(audio, fs) scores = {} best_score = float("-inf") best_class = None for person_class, (gmm, scaler) in trained_models.items(): norm_feats = scaler.transform(feats) score = gmm.score(norm_feats) scores[person_class] = score #.append({"score" : score, "class" : person_class}) if score > best_score: best_score = score best_class = person_class return best_class, scores def process_wav(wav_pth): result, scores = predict_class(wav_path=wav_pth) sorted_log_probs = [] for c in range(1,32): sorted_log_probs.append(scores[str(c)]) return result, sorted_log_probs with open("audio_gmm", "w", encoding="ascii", errors="ignore") as resFile: for wav_file in sorted(os.listdir(EVAL_DIR)): if not wav_file.lower().endswith(".wav"): continue # skip non wav wav_file_path = os.path.join(EVAL_DIR, wav_file) res_class, sorted_log_probs = process_wav(wav_file_path) file_name = wav_file.replace(".wav", "") one_wav_res = f"{file_name} {res_class} " + ' '.join(f"{score}" for score in sorted_log_probs) resFile.write(one_wav_res + "\n")