Untitled

 avatar
unknown
python
3 years ago
9.5 kB
6
Indexable
#==========
#start of def

def s(Si,Tj):
    if(Si==Tj):
        return 0
    else:
        return 1

#end of def
#==========
#start of def

def edit_distance(S,T):
    m=len(S)
    n=len(T)
    D=[[0 for j in range(n+1)]for i in range(m+1)]
    D[0][0]= 0
    for i in range(1,m+1):
        D[i][0]=i
    for j in range(1,n+1):
        D[0][j]=j
    for i in range(1,m+1):
        for j in range(1,n+1):
            D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1])
    return D[m][n]
#Complexity of this is O(nm)

#end of def
#==========
#start of def

def convert_edit_sequence_to_alignment(S,T,pointers):
    m=len(S)
    n=len(T)
    i=m
    j=n
    str1=''
    str2=''
    str_match=''
    while(i>0 or j>0):
        if(pointers[i][j]=='D'):
            str1=S[i-1]+str1
            str2=T[j-1]+str2
            if(S[i-1]==T[j-1]):
                str_match='M'+str_match
            else:
                str_match='R'+str_match
            i-=1
            j-=1
        elif(pointers[i][j]=='L'):
            str1=' '+str1
            str2=T[j-1]+str2
            str_match='I'+str_match
            j-=1
        else:
            str1=S[i-1]+str1
            str2=' '+str2
            str_match='D'+str_match
            i-=1
    print(str_match)
    print(str1)
    print(str2)
    return str1, str2
        

#end of def 
#==========
#start of def

def convert_alignment_into_steps(str1, str2):
    pos=1
    step=1
    for i in range(len(str1)):
        if(step>50):
            break
        if(str1[i]==' '):
            print("Step "+ str(step)+ " : Insert "+ str(str2[i]))
            step+=1
            pos+=1
        elif(str2[i]==' '):
            print("Step "+ str(step)+ " : Delete "+ str(str1[i]))
            step+=1
        elif(str1[i]!=str2[i]):
            print("Step "+ str(step)+ " : Replace "+ str(str1[i])+" with "+str(str2[i]))
            step+=1
            pos+=1
        else:
            print("Step "+ str(step)+" : Match "+ str(str1[i]))
            step+=1
            pos+=1

#end of def 
#==========
#start of def

def edit_distance_with_pointer(S,T):
    m=len(S)
    n=len(T)
    D=[[0 for j in range(n+1)]for i in range(m+1)]
    pointers=[['' for j in range(n+1)] for i in range(m+1)]
    D[0][0]= 0
    for i in range(m+1):
        D[i][0]=i
        pointers[i][0]='U'
    for j in range(1,n+1):
        D[0][j]=j
        pointers[0][j]='L'
    for i in range(1,m+1):
        for j in range(1,n+1):
            D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1]) 
            if(i==1 and j==1):
                pointers[i][j]='D'
            elif(D[i][j]==D[i-1][j]+1):
                pointers[i][j]='U'
            elif(D[i][j]==D[i][j-1]+1):
                pointers[i][j]='L'
            else:
                pointers[i][j]='D'
    print(D[m][n])
    print("=====")
    str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
    print("=====")
    convert_alignment_into_steps(str1, str2)
    


#end of def 
#==========
#start of def

def ParseProteinsTxt():
    '''
    This is a simple function to parse the Fasta files and return the amino acid sequence it reads.
    '''
    file_location='II-9-3-2022-proteins.txt'
    dict_of_sequences={}
    sequence_curr=""
    curr_sequence_name=""
    with open(file_location, "r") as fh:
        for line in fh:
            if(line.startswith("Protein")):
                curr_line=''.join(list(line))
                curr_sequence_name=curr_line[-2]
                dict_of_sequences[curr_sequence_name]=""
            elif(line.startswith("#")):
                if(sequence_curr!=""):
                    dict_of_sequences[curr_sequence_name]=sequence_curr
                sequence_curr=""
                curr_sequence_name=""
            else:
                seq_on_line = ''.join(list(line))
                seq_on_line=seq_on_line[:-1]
                sequence_curr+=seq_on_line
    return dict_of_sequences

#end of def 
#==========
#start of def

def ParseBLOSUM():
    file_location='II-9-3-2022-blosum.txt'
    blosum_matrix={}
    with open(file_location, "r") as fh:
        count=0
        aa_list=""
        for line in fh:
                if(count==0):
                    seq_on_line = ''.join(list(line.split(' ')))
                    seq_on_line=seq_on_line[:-1]
                    aa_list=seq_on_line
                else:
                    seq_on_line = ''.join(list(line))
                    seq_on_line=seq_on_line[:-1]
                    if(len(seq_on_line)>0):
                        nums_temp_=line[1:-1].split(' ')
                        nums_temp=[]
                        for curr_rem in nums_temp_:
                            if(len(curr_rem)!=0):
                                nums_temp.append(curr_rem)
                        nums=[int(n) for n in nums_temp]
                        curr_AA_row=seq_on_line[0]
                        for i in range(len(aa_list)): 
                            blosum_matrix[(curr_AA_row,aa_list[i])]=nums[i]
                count+=1
    return blosum_matrix

#end of def
#==========
#start of def

def new_s(Si,Tj):
    return blosum_matrix[(Si,Tj)]

#end of def
#==========
#start of def

def max_score_with_scoring_matrix(S,T):
    m=len(S)
    n=len(T)
    D=[['' for j in range(n+1)]for i in range(m+1)]
    pointers=[['' for j in range(n+1)] for i in range(m+1)]
    D[0][0]= 0
    for i in range(m+1):
        D[i][0]=i*-8
        pointers[i][0]='U'
    for j in range(1,n+1):
        D[0][j]=j*-8
        pointers[0][j]='L'
    for i in range(1,m+1):
        for j in range(1,n+1):
            D[i][j]=max(max(D[i][j-1]-8,D[i-1][j]-8),new_s(S[i-1], T[j-1])+D[i-1][j-1]) 
            if(i==1 and j==1):
                pointers[i][j]='D'
            elif(D[i][j]==D[i-1][j]-8):
                pointers[i][j]='U'
            elif(D[i][j]==D[i][j-1]-8):
                pointers[i][j]='L'
            else:
                pointers[i][j]='D'
    print(D[m][n])
    print("=====")
    str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
    print("=====")
    convert_alignment_into_steps(str1, str2)

#Complexity of this is O(nm)   


#end of def
#==========
#start of def

def scoring_for_gaps(S,T,u, flag_for_printing):
    '''
    V(i,j)=max(E(i,j), F(i,j), G(i,j))
    E(i,j)=max_{0<=k<j}(V(i,k) + w(j-k))
    F(i,j)=max_{0<=k<i}(V(k,j) + w(i-k))
    G(i,j)=V(i-1,j-1)+s(Si,Tj)

    normally: O(mn(n+m))
    w(l)={0; l=0 \\ u; l>=1}
    u<0
    E(i,j)=max_{0<=k<=j-1}(V(i,k)+w(j-k))=max(E(i,j-1), V(i,j-1)+w(1)) (only deletions)
    F(i,j)=u+max_{0<=k<=i-1}(V(k,j)+w(i-k))=max(F(i-1,j), V(i-1,j)+w(1)) (only insertions)
    G(i,j)=V(i-1,j-1)+s(Si,Tj) (replacement on last character)
    V(i,j)=max(E(i,j), F(i,j), G(i,j))
    V(0,0)=0
    E(0,0)=0
    F(0,0)=0
    V(i,0)=u(i>0)
    V(0,j)=u(j>0)
    E(i,0)=u (i>0)
    F(0,j)=u(j>0)
    E(0,j)=0 (j>0)
    F(i,0)=0 (i>0)
    '''
    m=len(S)
    n=len(T)
    V=[[0 for j in range(n+1)] for i in range(m+1)]
    E=[[0 for j in range(n+1)]for i in range(m+1)]
    F=[[0 for j in range(n+1)]for i in range(m+1)]
    G=[[0 for j in range(n+1)]for i in range(m+1)]
    pointers=[['' for j in range(n+1)] for i in range(m+1)]
    for i in range(m+1):
        V[i][0]=u
        E[i][0]=u
        pointers[i][0]='U'
    for j in range(n+1):
        V[0][j]=u
        F[0][j]=u
        pointers[0][j]='L'
    V[0][0]=0
    for i in range(1,m+1):
        for j in range(1,n+1):
            E[i][j]=max(E[i][j-1], V[i][j-1]+u)
            F[i][j]=max(F[i-1][j], V[i-1][j]+u)
            G[i][j]=V[i-1][j-1]+new_s(S[i-1],T[j-1])
            V[i][j]=max(E[i][j], F[i][j], G[i][j])
            if(V[i][j]==G[i][j]):
                pointers[i][j]='D'
            elif(V[i][j]==E[i][j]):
                pointers[i][j]='L'
            else:
                pointers[i][j]='U'
    if(flag_for_printing):
        print(V[m][n])
        print("=====")
        str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
        print("=====")
        convert_alignment_into_steps(str1, str2)
    return V[m][n]
#end of def
#==========
#start of def

def new_new_s(Si,Tj):
    if(Si==Tj):
        return 1
    else:
        return -1

#end of def
#==========
#start of def

def scoring_for_gaps_new(S,T,u, flag_for_printing):
    m=len(S)
    n=len(T)
    V=[[0 for j in range(n+1)] for i in range(m+1)]
    E=[[0 for j in range(n+1)]for i in range(m+1)]
    F=[[0 for j in range(n+1)]for i in range(m+1)]
    G=[[0 for j in range(n+1)]for i in range(m+1)]
    for i in range(m+1):
        V[i][0]=u
        E[i][0]=u
    for j in range(n+1):
        V[0][j]=u
        F[0][j]=u
    V[0][0]=0
    for i in range(1,m+1):
        for j in range(1,n+1):
            E[i][j]=max(E[i][j-1], V[i][j-1]+u)
            F[i][j]=max(F[i-1][j], V[i-1][j]+u)
            G[i][j]=V[i-1][j-1]+new_new_s(S[i-1],T[j-1])
            V[i][j]=max(E[i][j], F[i][j], G[i][j])
    return V[m][n]
#end of def
#==========
#start of def

def montecarloestimation(N, n, u, p_):
    Ui=['' for x in range(N)]
    Vi=['' for x in range(N)]
    V_gap=[0 for x in range(N)]
    rng = np.random.default_rng()
    for i in range(N):
        pred=rng.random()
        U=rng.choice(['a','b'],n, p=[p_,1-p_])
        Ui[i]=''.join(U)
        V=rng.choice(['a','b'],n, p=[p_,1-p_])
        Vi[i]=''.join(V)
        V_gap[i]=scoring_for_gaps_new(Ui[i], Vi[i], u, False)
    mean=sum(V_gap)/N
    return (mean/n)

#end of def
Editor is loading...