Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
3.5 kB
3
Indexable
Never
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle as pkl
import utils
import Lindel
from Lindel import Predictor
import os

def find_seq_and_replace(data_set, test_seq, sequences_60bp, test_data_point):
    """
    Locate ``test_seq`` in ``data_set`` and record a window around the hit.

    Each entry of ``data_set`` is searched for ``test_seq`` starting at
    index 39. For the first entry that contains it, the slice running from
    30 characters before to 30 characters after ``hit + 17`` is appended to
    ``sequences_60bp`` and written into ``test_data_point[0]``. The slice is
    shorter than 60 characters when the entry ends before the window does.

    Args:
        data_set: Iterable of strings to search through.
        test_seq (str): The sequence to search for.
        sequences_60bp (list): Output list; the extracted window is appended.
        test_data_point (list): Row whose first element is overwritten with
            the extracted window.

    Returns:
        bool: True if a match was found (both lists were updated),
        False otherwise (nothing was modified).
    """
    for candidate in data_set:
        hit = candidate.find(test_seq, 39)
        if hit < 0:
            # Not present at or after index 39 in this entry; try the next.
            continue

        # NOTE(review): offset 17 presumably marks the cut site inside the
        # matched sequence - confirm against the data description.
        centre = hit + 17
        window = candidate[centre - 30:centre + 30]

        sequences_60bp.append(window)
        test_data_point[0] = window
        return True

    return False


def get_test_set_60bp():
    """
    Load the Lindel test set and swap each test sequence for its 60bp window.

    Every non-blank line of ``data_course/Lindel_test.txt`` is split on
    whitespace: the first token is kept as the sequence string and all
    remaining tokens are converted to ``float``. Each sequence is then looked
    up, in order, in the three sequence collections returned first by
    ``utils.read_200bp_sequences``; on the first hit ``find_seq_and_replace``
    overwrites the row's first element with the extracted window.

    Rows whose sequence is not found in any collection are returned
    unchanged (they keep the original sequence string).

    Returns:
        list[list]: One row per data line, shaped
        ``[sequence, float, float, ...]``.

    Raises:
        ValueError: If a token after the first one is not a valid float.
    """
    test_data = []
    with open('data_course/Lindel_test.txt') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record and would otherwise raise IndexError.
                continue
            test_data.append([tokens[0]] + [float(tok) for tok in tokens[1:]])

    seventy_k, homing_design, mh1_200bp, _, _, _ = utils.read_200bp_sequences(
        'data_course/algient_NHEJ_guides_final.txt')
    # Collected by find_seq_and_replace as a side effect; not returned.
    sequences_60bp = []

    for test_data_point in test_data:
        test_seq = test_data_point[0]
        # Search the collections in priority order and stop at the first hit.
        for library in (seventy_k, homing_design, mh1_200bp):
            if find_seq_and_replace(library, test_seq, sequences_60bp, test_data_point):
                break

    return test_data


def mse(x, y):
    """
    Compute the mean squared error between two arrays.

    Args:
        x: The first array.
        y: The second array.

    Returns:
        The mean of the element-wise squared differences ``(x - y) ** 2``,
        obtained through the ``.mean()`` method of that result.
    """
    residual = x - y
    squared_error = residual ** 2
    return squared_error.mean()


if __name__ == '__main__':
    # Build the test set once; the result is the only thing needed from it,
    # so a second, discarded call would just repeat all the file reading and
    # sequence searching.
    test_data = get_test_set_60bp()

    (model_del_array_weights, model_del_array_biases,
     model_ratio_array_weights, model_ratio_array_biases,
     model_ins_array_weights, model_ins_array_biases) = utils.get_weights_biases()
    # Re-ordered to ratio, deletion, insertion before being handed to the
    # predictor (utils returns them as deletion, ratio, insertion).
    weights_biases = [model_ratio_array_weights, model_ratio_array_biases,
                      model_del_array_weights, model_del_array_biases,
                      model_ins_array_weights, model_ins_array_biases]

    # Load the prerequisites shipped inside the installed Lindel package.
    # The context manager closes the file handle once unpickling is done.
    # NOTE: pickle executes arbitrary code on load; only use a trusted file.
    prereq_path = os.path.join(Lindel.__path__[0], 'model_prereq.pkl')
    with open(prereq_path, 'rb') as prereq_file:
        prerequesites = pkl.load(prereq_file)

    predictions = []

    for test_point in test_data:
        test_seq = test_point[0]
        # Only the first returned value is kept; the second is unused here.
        frequencies_hat, _ = Predictor.gen_prediction(test_seq, weights_biases, prerequesites)
        predictions.append(frequencies_hat)

    # np.save appends the '.npy' extension to the given file name.
    np.save('lindel_output', np.array(predictions))