Preprocess and MLmodel

mail@pastecode.io avatar
unknown
python
2 months ago
3.7 kB
3
Indexable
Never
import os
import re
import pandas as pd
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import train_test_split

# Location of a saved model; not referenced by the code visible here
# (presumably meant for the commented-out TFLite conversion below - confirm).
saved_model_path = 'path/to/saved/model'
# Output file for a converted .tflite model; also not referenced by the code visible here.
tflite_model_path = 'path/where/to/save/my_model.tflite'
# Directory that is scanned for the recorded .csv files used as training data.
recorded_data_path = 'C:/Users/Jonny/Desktop/TestingSpin'


# # Converts my TensorFlow model to a TFLite version for use in a Java environment
# converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
# tflite_model = converter.convert()
#
# # Saves the TFLite model to the configured path.
# with open(tflite_model_path, 'wb') as f:
#     f.write(tflite_model)


def label_encoding(motionname):
    """Encode a motion name as a binary class label.

    Args:
        motionname: Name of the motion, or ``None`` when no name is available.

    Returns:
        1 if ``motionname`` equals 'spin' ignoring case, otherwise 0
        (including when ``motionname`` is ``None``).
    """
    # A missing name cannot be the motion being trained for; treat it as the
    # negative class instead of failing on ``None.lower()``.
    if motionname is None:
        return 0
    return 1 if motionname.lower() == 'spin' else 0


# Organize data for learning
feature_data_frames = []
label_data_frames = []
# Defined up front so these names exist even when no recording is loaded.
data_frame = None
label = None
motion_name = None

# Filenames of the app's feature data look like "<motion name>_<number>.csv".
feature_file_pattern = re.compile(r'^([\w\s]+)?_\d+\.csv$')

# First pass: choose the recordings to load and find the longest row across ALL of them.
# Padding every file to one shared width gives every dataframe the same column list, which
# pd.concat needs here because the 'sensorvalues' column name is repeated.
recordings = []
max_length = 0
# Sorted so the row order does not depend on the order of the directory listing.
for filename in sorted(os.listdir(recorded_data_path)):
    match = feature_file_pattern.match(filename)
    if not match:
        continue

    # The name group of the pattern is optional, so e.g. "_1.csv" matches without a motion name;
    # such a file cannot be labelled and is skipped.
    motion_name = match.group(1)
    if motion_name is None:
        continue
    label = label_encoding(motion_name)

    file_path = os.path.join(recorded_data_path, filename)

    # Count the comma-separated fields of every line without keeping the rows in memory.
    # UTF-8 is what pd.read_csv assumes by default for the second pass.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_length = max((len(line.strip().split(',')) for line in file), default=0)

    # A file without a single line has nothing to load.
    if file_length == 0:
        continue

    max_length = max(max_length, file_length)
    recordings.append((file_path, label))

# Column 0 is the sensor name, column 1 the timestamp, all remaining columns are sensor values.
# NOTE(review): 'sensorname' is later passed to the classifier as a feature - confirm it is numeric.
column_names = ['sensorname' if i == 0 else 'timestamp' if i == 1 else 'sensorvalues'
                for i in range(max_length)]

# Second pass: read every recording with the shared width and build its features and labels.
for file_path, label in recordings:
    # names=range(max_length) makes pandas pad shorter rows with NaN up to the shared width.
    # The real column names are assigned afterwards because they contain duplicates.
    data_frame = pd.read_csv(file_path, header=None, names=range(max_length))
    data_frame.columns = column_names

    # Fills the blank spaces with 0.0 for rows with blank spaces
    padded_data_frame = data_frame.fillna(0.0)

    # One label per feature row, all equal to the label derived from the filename.
    data_frame_label = pd.DataFrame({'label': [label] * len(padded_data_frame)})

    feature_data_frames.append(padded_data_frame)
    label_data_frames.append(data_frame_label)

# Concatenating the list of dataframes into one big dataframe, preserving the pre-existing indexes
if not feature_data_frames:
    # pd.concat would only report "No objects to concatenate"; name the actual cause instead.
    raise ValueError(f"No matching .csv recordings were found in {recorded_data_path!r}")
concat_feature = pd.concat(feature_data_frames)
concat_label = pd.concat(label_data_frames)


# Splitting the padded feature and label data into test and train cases.
# The rows are stored file by file and every file carries a single label, so an unshuffled
# split can leave one part with rows of only one label; shuffle=False also turned
# random_state into a no-op. Shuffling with stratification keeps the label ratio the same
# in both parts, and random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    concat_feature, concat_label, test_size=0.2, random_state=29, shuffle=True, stratify=concat_label)
print(x_train)
print(y_train)
print(x_test)
print(y_test)

# Flattens the labels to be 1 Dimensional.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


# Fit a support vector classifier with a linear kernel on the training data,
# then predict the labels of the test data.
clf = svm.SVC(kernel='linear').fit(x_train, y_train)
y_prediction = clf.predict(x_test)

# Print each score of the predictions measured against the true test labels.
for score_name, score_function in (
        ("Accuracy:", metrics.accuracy_score),
        ("Precision:", metrics.precision_score),
        ("Recall:", metrics.recall_score),
):
    print(score_name, score_function(y_test, y_prediction))
Leave a Comment