# If True, the WAV files will be read and their features saved to the CSV files.
# As this is the most time-consuming step, enable it only if you don't have the CSV files yet.
CREATE_CSV_FILES = False
# Defines the names of the CSV files
TRAIN_CSV_FILE = "train.csv"
TEST_CSV_FILE = "test.csv"
MORE_TRAIN_CSV_FILE = "more_train.csv"
MORE_TEST_CSV_FILE = "more_test.csv"
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
import librosa
import csv
import os
def extractWavFeatures(soundFilesFolder, csvFileName):
    print("The features of the files in the folder " + soundFilesFolder + " will be saved to " + csvFileName)
    header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
    for i in range(1, 21):
        header += f' mfcc{i}'
    header += ' label'
    header = header.split()
    print('CSV Header: ', header)
    with open(csvFileName, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for filename in os.listdir(soundFilesFolder):
            filePath = f'{soundFilesFolder}/{filename}'
            y, sr = librosa.load(filePath, mono=True, duration=30)
            # remove leading and trailing silence
            y, _ = librosa.effects.trim(y)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            rmse = librosa.feature.rms(y=y)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)
            # one row per file: the mean of each feature over time; note that the
            # 'label' column is never filled in (it is dropped during preprocessing)
            to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
            for e in mfcc:
                to_append += f' {np.mean(e)}'
            writer.writerow(to_append.split())
    print("End of extractWavFeatures")
if CREATE_CSV_FILES:
    extractWavFeatures("../data/recordings/train", TRAIN_CSV_FILE)
    extractWavFeatures("../data/recordings/test", TEST_CSV_FILE)
    extractWavFeatures("../data/recordings/moreSpeakersTrain", MORE_TRAIN_CSV_FILE)
    extractWavFeatures("../data/recordings/moreSpeakersTest", MORE_TEST_CSV_FILE)
    print("CSV files are created")
else:
    print("CSV file creation is skipped")
# Reading a dataset and converting each file name to the corresponding speaker number
import pandas as pd
def preProcessData(csvFileName):
    print(csvFileName + " will be preprocessed")
    data = pd.read_csv(csvFileName)
    # we have six speakers:
    # 0: Jackson
    # 1: Nicolas
    # 2: Theo
    # 3: Ankur
    # 4: Caroline
    # 5: Rodolfo
    filenameArray = data['filename']
    speakerArray = []
    for i in range(len(filenameArray)):
        # the speaker's initial is the third character of the file name,
        # e.g. "0_jackson_12.wav" -> 'j'
        initial = filenameArray[i][2]
        if initial == "j":
            speaker = 0
        elif initial == "n":
            speaker = 1
        elif initial == "t":
            speaker = 2
        elif initial == "a":
            speaker = 3
        elif initial == "c":
            speaker = 4
        elif initial == "r":
            speaker = 5
        else:
            speaker = 6  # any other initial is treated as an unknown speaker
        speakerArray.append(speaker)
    # integer labels, as expected by sparse categorical cross-entropy later on
    data['number'] = speakerArray
    # Dropping unnecessary columns ('label' is empty, since extractWavFeatures
    # never writes a value for it)
    data = data.drop(['filename'], axis=1)
    data = data.drop(['label'], axis=1)
    data = data.drop(['chroma_stft'], axis=1)
    print("Preprocessing is finished")
    print(data.shape)
    print(data.head())
    return data
trainData = preProcessData(TRAIN_CSV_FILE)
testData = preProcessData(TEST_CSV_FILE)
moreTrainData = preProcessData(MORE_TRAIN_CSV_FILE)
moreTestData = preProcessData(MORE_TEST_CSV_FILE)
# Splitting the dataset into training, validation and testing datasets
from sklearn.model_selection import train_test_split
X = np.array(trainData.iloc[:, :-1], dtype = float)
y = trainData.iloc[:, -1]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_test = np.array(testData.iloc[:, :-1], dtype = float)
y_test = testData.iloc[:, -1]
print("Y from training data:", y_train.shape)
print("Y from validation data:", y_val.shape)
print("Y from test data:", y_test.shape)
#Normalizing the dataset
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
print("X from training data", X_train.shape)
print("X from validation data", X_val.shape)
print("X from test data", X_test.shape)
#Creating a Model
from keras import models
from keras import layers
import keras
# model 1
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='softmax'))
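# Optional: inspect the architecture. The output layer has 10 units although
# only labels 0-6 occur in this dataset; sparse categorical cross-entropy still
# works, the surplus units simply stay unused.
model.summary()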
# Learning Process of a model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# simple early stopping
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
#Train with early stopping to avoid overfitting
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    epochs=50,
                    batch_size=128,
                    callbacks=[es])
# plot training history
from matplotlib import pyplot
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='validation')
pyplot.legend()
pyplot.show()
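# Accuracy curves (a sketch): older Keras versions store the metric under
# 'acc'/'val_acc' instead of 'accuracy'/'val_accuracy', so pick whichever exists.
accKey = 'accuracy' if 'accuracy' in history.history else 'acc'
pyplot.plot(history.history[accKey], label='train')
pyplot.plot(history.history['val_' + accKey], label='validation')
pyplot.legend()
pyplot.show()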
def getSpeaker(speaker):
    speaker = str(speaker)
    if speaker == "0":
        return "Jackson"
    elif speaker == "1":
        return "Nicolas"
    elif speaker == "2":
        return "Theo"
    elif speaker == "3":
        return "Ankur"
    elif speaker == "4":
        return "Caroline"
    elif speaker == "5":
        return "Rodolfo"
    else:
        return "Unknown"
def printPrediction(X_data, y_data, printDigit):
    print('\n# Generate predictions')
    for i in range(len(y_data)):
        prediction = getSpeaker(model.predict_classes(X_data[i:i+1])[0])
        speaker = getSpeaker(y_data[i])
        if printDigit:
            print("Number={0:d}, y={1:10s}- prediction={2:10s}- match={3}".format(i, speaker, prediction, speaker == prediction))
        else:
            print("y={0:10s}- prediction={1:10s}- match={2}".format(speaker, prediction, speaker == prediction))
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
def report(X_data, y_data):
    # Confusion matrix and classification report
    Y_pred = model.predict_classes(X_data)
    y_test_num = y_data.astype(np.int64)
    conf_mt = confusion_matrix(y_test_num, Y_pred)
    print(conf_mt)
    plt.matshow(conf_mt)
    plt.show()
    print('\nClassification Report')
    target_names = ["Jackson", "Nicolas", "Theo", "Ankur", "Caroline", "Rodolfo", "Unknown"]
    print(classification_report(y_test_num, Y_pred, labels=list(range(len(target_names))), target_names=target_names))
print('\n# TEST DATA #\n')
score = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
# Prediction
printPrediction(X_test[0:10], y_test[0:10], False)
print("Classification Report for Test Data\n")
report(X_test, y_test)
# Splitting the extended dataset (original + additional speakers) into training, validation and testing datasets
from sklearn.model_selection import train_test_split
fullTrainData = pd.concat([trainData, moreTrainData], ignore_index=True)
X = np.array(fullTrainData.iloc[:, :-1], dtype = float)
y = fullTrainData.iloc[:, -1]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_test = np.array(testData.iloc[:, :-1], dtype = float)
y_test = testData.iloc[:, -1]
X_more_test = np.array(moreTestData.iloc[:, :-1], dtype = float)
y_more_test = moreTestData.iloc[:, -1]
print("Y from training data:", y_train.shape)
print("Y from validation data:", y_val.shape)
print("Y from test data:", y_test.shape)
print("Y from other speakers test data:", y_more_test.shape)
#Normalizing the dataset
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_more_test = scaler.transform(X_more_test)
print("X from training data", X_train.shape)
print("X from validation data", X_val.shape)
print("X from test data", X_test.shape)
print("X from other speakers test data", X_more_test.shape)
#Creating a Model
from keras import models
from keras import layers
import keras
# model 2: same architecture as model 1, retrained from scratch on the extended dataset
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='softmax'))
# Learning Process of a model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# simple early stopping
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
#Train with early stopping to avoid overfitting
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    epochs=50,
                    batch_size=128,
                    callbacks=[es])
# plot training history
from matplotlib import pyplot
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='validation')
pyplot.legend()
pyplot.show()
print('\n# TEST DATA #\n')
score = model.evaluate(X_test, y_test)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
# Prediction
printPrediction(X_test[0:10], y_test[0:10], False)
print('\n# OTHER SPEAKERS DATA #\n')
score = model.evaluate(X_more_test, y_more_test)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))
# Prediction
printPrediction(X_more_test[0:10], y_more_test[0:10], False)
print("Classification Report for Test Data\n")
report(X_test, y_test)
print("Classification Report for Other Speakers\n")
report(X_more_test, y_more_test)
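# Persisting the final model (a sketch; the file name is arbitrary). The saved
# file stores architecture and weights together and can be reloaded later with
# keras.models.load_model.
model.save("speaker_recognition.h5")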