# Untitled

unknown
python
a year ago
9.5 kB
2
Indexable
Never
```#==========
#start of def

def s(Si,Tj):
if(Si==Tj):
return 0
else:
return 1

#end of def
#==========
#start of def

def edit_distance(S,T):
m=len(S)
n=len(T)
D=[[0 for j in range(n+1)]for i in range(m+1)]
D[0][0]= 0
for i in range(1,m+1):
D[i][0]=i
for j in range(1,n+1):
D[0][j]=j
for i in range(1,m+1):
for j in range(1,n+1):
D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1])
return D[m][n]
#Complexity of this is O(nm)

#end of def
#==========
#start of def

def convert_edit_sequence_to_alignment(S,T,pointers):
m=len(S)
n=len(T)
i=m
j=n
str1=''
str2=''
str_match=''
while(i>0 or j>0):
if(pointers[i][j]=='D'):
str1=S[i-1]+str1
str2=T[j-1]+str2
if(S[i-1]==T[j-1]):
str_match='M'+str_match
else:
str_match='R'+str_match
i-=1
j-=1
elif(pointers[i][j]=='L'):
str1=' '+str1
str2=T[j-1]+str2
str_match='I'+str_match
j-=1
else:
str1=S[i-1]+str1
str2=' '+str2
str_match='D'+str_match
i-=1
print(str_match)
print(str1)
print(str2)
return str1, str2

#end of def
#==========
#start of def

def convert_alignment_into_steps(str1, str2):
pos=1
step=1
for i in range(len(str1)):
if(step>50):
break
if(str1[i]==' '):
print("Step "+ str(step)+ " : Insert "+ str(str2[i]))
step+=1
pos+=1
elif(str2[i]==' '):
print("Step "+ str(step)+ " : Delete "+ str(str1[i]))
step+=1
elif(str1[i]!=str2[i]):
print("Step "+ str(step)+ " : Replace "+ str(str1[i])+" with "+str(str2[i]))
step+=1
pos+=1
else:
print("Step "+ str(step)+" : Match "+ str(str1[i]))
step+=1
pos+=1

#end of def
#==========
#start of def

def edit_distance_with_pointer(S,T):
m=len(S)
n=len(T)
D=[[0 for j in range(n+1)]for i in range(m+1)]
pointers=[['' for j in range(n+1)] for i in range(m+1)]
D[0][0]= 0
for i in range(m+1):
D[i][0]=i
pointers[i][0]='U'
for j in range(1,n+1):
D[0][j]=j
pointers[0][j]='L'
for i in range(1,m+1):
for j in range(1,n+1):
D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1])
if(i==1 and j==1):
pointers[i][j]='D'
elif(D[i][j]==D[i-1][j]+1):
pointers[i][j]='U'
elif(D[i][j]==D[i][j-1]+1):
pointers[i][j]='L'
else:
pointers[i][j]='D'
print(D[m][n])
print("=====")
str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
print("=====")
convert_alignment_into_steps(str1, str2)

#end of def
#==========
#start of def

def ParseProteinsTxt():
'''
This is a simple function to parse the Fasta files and return the amino acid sequence it reads.
'''
file_location='II-9-3-2022-proteins.txt'
dict_of_sequences={}
sequence_curr=""
curr_sequence_name=""
with open(file_location, "r") as fh:
for line in fh:
if(line.startswith("Protein")):
curr_line=''.join(list(line))
curr_sequence_name=curr_line[-2]
dict_of_sequences[curr_sequence_name]=""
elif(line.startswith("#")):
if(sequence_curr!=""):
dict_of_sequences[curr_sequence_name]=sequence_curr
sequence_curr=""
curr_sequence_name=""
else:
seq_on_line = ''.join(list(line))
seq_on_line=seq_on_line[:-1]
sequence_curr+=seq_on_line
return dict_of_sequences

#end of def
#==========
#start of def

def ParseBLOSUM():
file_location='II-9-3-2022-blosum.txt'
blosum_matrix={}
with open(file_location, "r") as fh:
count=0
aa_list=""
for line in fh:
if(count==0):
seq_on_line = ''.join(list(line.split(' ')))
seq_on_line=seq_on_line[:-1]
aa_list=seq_on_line
else:
seq_on_line = ''.join(list(line))
seq_on_line=seq_on_line[:-1]
if(len(seq_on_line)>0):
nums_temp_=line[1:-1].split(' ')
nums_temp=[]
for curr_rem in nums_temp_:
if(len(curr_rem)!=0):
nums_temp.append(curr_rem)
nums=[int(n) for n in nums_temp]
curr_AA_row=seq_on_line[0]
for i in range(len(aa_list)):
blosum_matrix[(curr_AA_row,aa_list[i])]=nums[i]
count+=1
return blosum_matrix

#end of def
#==========
#start of def

def new_s(Si,Tj):
return blosum_matrix[(Si,Tj)]

#end of def
#==========
#start of def

def max_score_with_scoring_matrix(S,T):
m=len(S)
n=len(T)
D=[['' for j in range(n+1)]for i in range(m+1)]
pointers=[['' for j in range(n+1)] for i in range(m+1)]
D[0][0]= 0
for i in range(m+1):
D[i][0]=i*-8
pointers[i][0]='U'
for j in range(1,n+1):
D[0][j]=j*-8
pointers[0][j]='L'
for i in range(1,m+1):
for j in range(1,n+1):
D[i][j]=max(max(D[i][j-1]-8,D[i-1][j]-8),new_s(S[i-1], T[j-1])+D[i-1][j-1])
if(i==1 and j==1):
pointers[i][j]='D'
elif(D[i][j]==D[i-1][j]-8):
pointers[i][j]='U'
elif(D[i][j]==D[i][j-1]-8):
pointers[i][j]='L'
else:
pointers[i][j]='D'
print(D[m][n])
print("=====")
str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
print("=====")
convert_alignment_into_steps(str1, str2)

#Complexity of this is O(nm)

#end of def
#==========
#start of def

def scoring_for_gaps(S,T,u, flag_for_printing):
'''
V(i,j)=max(E(i,j), F(i,j), G(i,j))
E(i,j)=max_{0<=k<j}(V(i,k) + w(j-k))
F(i,j)=max_{0<=k<i}(V(k,j) + w(i-k))
G(i,j)=V(i-1,j-1)+s(Si,Tj)

normally: O(mn(n+m))
w(l)={0; l=0 \\ u; l>=1}
u<0
E(i,j)=max_{0<=k<=j-1}(V(i,k)+w(j-k))=max(E(i,j-1), V(i,j-1)+w(1)) (only deletions)
F(i,j)=u+max_{0<=k<=i-1}(V(k,j)+w(i-k))=max(F(i-1,j), V(i-1,j)+w(1)) (only insertions)
G(i,j)=V(i-1,j-1)+s(Si,Tj) (replacement on last character)
V(i,j)=max(E(i,j), F(i,j), G(i,j))
V(0,0)=0
E(0,0)=0
F(0,0)=0
V(i,0)=u(i>0)
V(0,j)=u(j>0)
E(i,0)=u (i>0)
F(0,j)=u(j>0)
E(0,j)=0 (j>0)
F(i,0)=0 (i>0)
'''
m=len(S)
n=len(T)
V=[[0 for j in range(n+1)] for i in range(m+1)]
E=[[0 for j in range(n+1)]for i in range(m+1)]
F=[[0 for j in range(n+1)]for i in range(m+1)]
G=[[0 for j in range(n+1)]for i in range(m+1)]
pointers=[['' for j in range(n+1)] for i in range(m+1)]
for i in range(m+1):
V[i][0]=u
E[i][0]=u
pointers[i][0]='U'
for j in range(n+1):
V[0][j]=u
F[0][j]=u
pointers[0][j]='L'
V[0][0]=0
for i in range(1,m+1):
for j in range(1,n+1):
E[i][j]=max(E[i][j-1], V[i][j-1]+u)
F[i][j]=max(F[i-1][j], V[i-1][j]+u)
G[i][j]=V[i-1][j-1]+new_s(S[i-1],T[j-1])
V[i][j]=max(E[i][j], F[i][j], G[i][j])
if(V[i][j]==G[i][j]):
pointers[i][j]='D'
elif(V[i][j]==E[i][j]):
pointers[i][j]='L'
else:
pointers[i][j]='U'
if(flag_for_printing):
print(V[m][n])
print("=====")
str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers)
print("=====")
convert_alignment_into_steps(str1, str2)
return V[m][n]
#end of def
#==========
#start of def

def new_new_s(Si,Tj):
if(Si==Tj):
return 1
else:
return -1

#end of def
#==========
#start of def

def scoring_for_gaps_new(S,T,u, flag_for_printing):
m=len(S)
n=len(T)
V=[[0 for j in range(n+1)] for i in range(m+1)]
E=[[0 for j in range(n+1)]for i in range(m+1)]
F=[[0 for j in range(n+1)]for i in range(m+1)]
G=[[0 for j in range(n+1)]for i in range(m+1)]
for i in range(m+1):
V[i][0]=u
E[i][0]=u
for j in range(n+1):
V[0][j]=u
F[0][j]=u
V[0][0]=0
for i in range(1,m+1):
for j in range(1,n+1):
E[i][j]=max(E[i][j-1], V[i][j-1]+u)
F[i][j]=max(F[i-1][j], V[i-1][j]+u)
G[i][j]=V[i-1][j-1]+new_new_s(S[i-1],T[j-1])
V[i][j]=max(E[i][j], F[i][j], G[i][j])
return V[m][n]
#end of def
#==========
#start of def

def montecarloestimation(N, n, u, p_):
Ui=['' for x in range(N)]
Vi=['' for x in range(N)]
V_gap=[0 for x in range(N)]
rng = np.random.default_rng()
for i in range(N):
pred=rng.random()
U=rng.choice(['a','b'],n, p=[p_,1-p_])
Ui[i]=''.join(U)
V=rng.choice(['a','b'],n, p=[p_,1-p_])
Vi[i]=''.join(V)
V_gap[i]=scoring_for_gaps_new(Ui[i], Vi[i], u, False)
mean=sum(V_gap)/N
return (mean/n)

#end of def```