Untitled
unknown
plain_text
a year ago
3.5 kB
3
Indexable
Never
import os
import pickle as pkl
from ast import literal_eval

import numpy as np
import pandas as pd

import Lindel
from Lindel import Predictor

import utils

# Geometry of the 60bp window extracted around a matched test sequence.
# NOTE(review): the meanings below are inferred from the arithmetic only —
# confirm against the layout of the 200bp design file.
_SEARCH_START = 39  # matches are only searched for from this index onward
_CUT_OFFSET = 17    # presumably the cut-site offset inside the matched sequence
_FLANK = 30         # bases kept on each side of the centre (2 * 30 = 60bp)

_TEST_SET_PATH = 'data_course/Lindel_test.txt'
_GUIDES_PATH = 'data_course/algient_NHEJ_guides_final.txt'


def find_seq_and_replace(data_set, test_seq, sequences_60bp, test_data_point):
    """
    Locates a test sequence in a dataset and swaps it for its 60bp context.

    The first entry of ``data_set`` that contains ``test_seq`` at index 39 or
    later is used. The window ``[pos + 17 - 30, pos + 17 + 30)`` of that entry
    is appended to ``sequences_60bp`` and written into ``test_data_point[0]``.
    If the window runs past the end of the entry, the slice is simply shorter
    than 60 characters.

    Args:
        data_set (List[str]): The sequences to search through.
        test_seq (str): The sequence to search for.
        sequences_60bp (List[str]): Output list; the extracted window is
            appended to it (mutated in place).
        test_data_point (List): Row whose first element is replaced by the
            extracted window (mutated in place).

    Returns:
        bool: True if the sequence was found and replaced, False otherwise.
    """
    for seq in data_set:
        pos = seq.find(test_seq, _SEARCH_START)
        if pos < 0:
            continue
        # pos >= 39, so the window start (pos - 13) can never be negative.
        centre = pos + _CUT_OFFSET
        seq_60_bp = seq[centre - _FLANK:centre + _FLANK]
        sequences_60bp.append(seq_60_bp)
        test_data_point[0] = seq_60_bp
        return True
    return False


def get_test_set_60bp(test_path=_TEST_SET_PATH, guides_path=_GUIDES_PATH):
    """
    Reads the test set and replaces each test sequence by its 60bp context.

    Every non-blank line of ``test_path`` is split on whitespace: the first
    token is kept as the sequence string and the remaining tokens are parsed
    as floats. Each sequence is then looked up, in order, in the first three
    collections returned by ``utils.read_200bp_sequences(guides_path)``; the
    first hit replaces the sequence with its 60bp window.

    Args:
        test_path (str): Whitespace-separated test file.
        guides_path (str): File handed to ``utils.read_200bp_sequences``.

    Returns:
        List[List]: One row per data point, ``[sequence, float, float, ...]``.
        Rows whose sequence was not found in any collection keep their
        original sequence string.
    """
    test_data = []
    with open(test_path) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank (e.g. trailing) lines carry no data point.
                continue
            test_data.append([tokens[0]] + [float(tok) for tok in tokens[1:]])

    seventy_k, homing_design, mh1_200bp, _, _, _ = utils.read_200bp_sequences(
        guides_path)

    # Only collected because find_seq_and_replace appends to it; not returned.
    sequences_60bp = []
    for test_data_point in test_data:
        test_seq = test_data_point[0]
        # Search order matters: the first collection containing the sequence wins.
        for data_set in (seventy_k, homing_design, mh1_200bp):
            if find_seq_and_replace(data_set, test_seq, sequences_60bp,
                                    test_data_point):
                break

    # Original author's note: 440 data points in Lindel_test.
    return test_data


def mse(x, y):
    """
    Calculates the mean squared error between two arrays.

    Args:
        x : The first array.
        y : The second array.

    Returns:
        The mean of the element-wise squared difference. Both inputs must
        support ``-``, ``**`` and expose ``.mean()`` (e.g. numpy arrays).
    """
    return ((x - y) ** 2).mean()


def main():
    """Runs the predictor on every test sequence and saves the predictions."""
    test_data = get_test_set_60bp()

    (model_del_array_weights, model_del_array_biases,
     model_ratio_array_weights, model_ratio_array_biases,
     model_ins_array_weights, model_ins_array_biases) = utils.get_weights_biases()
    # The predictor receives the arrays in ratio, deletion, insertion order,
    # which differs from the order utils.get_weights_biases returns them in.
    weights_biases = [model_ratio_array_weights, model_ratio_array_biases,
                      model_del_array_weights, model_del_array_biases,
                      model_ins_array_weights, model_ins_array_biases]

    # NOTE: pickle executes arbitrary code on load; this file ships inside the
    # installed Lindel package, so it is only as trustworthy as that package.
    prereq_path = os.path.join(Lindel.__path__[0], 'model_prereq.pkl')
    with open(prereq_path, 'rb') as prereq_file:
        prerequesites = pkl.load(prereq_file)

    predictions = []
    for test_point in test_data:
        test_seq = test_point[0]
        frequencies_hat, _ = Predictor.gen_prediction(
            test_seq, weights_biases, prerequesites)
        predictions.append(frequencies_hat)

    # np.save appends the '.npy' extension, producing 'lindel_output.npy'.
    np.save('lindel_output', np.array(predictions))


if __name__ == '__main__':
    main()