Untitled
unknown
python
3 years ago
9.5 kB
6
Indexable
#========== #start of def def s(Si,Tj): if(Si==Tj): return 0 else: return 1 #end of def #========== #start of def def edit_distance(S,T): m=len(S) n=len(T) D=[[0 for j in range(n+1)]for i in range(m+1)] D[0][0]= 0 for i in range(1,m+1): D[i][0]=i for j in range(1,n+1): D[0][j]=j for i in range(1,m+1): for j in range(1,n+1): D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1]) return D[m][n] #Complexity of this is O(nm) #end of def #========== #start of def def convert_edit_sequence_to_alignment(S,T,pointers): m=len(S) n=len(T) i=m j=n str1='' str2='' str_match='' while(i>0 or j>0): if(pointers[i][j]=='D'): str1=S[i-1]+str1 str2=T[j-1]+str2 if(S[i-1]==T[j-1]): str_match='M'+str_match else: str_match='R'+str_match i-=1 j-=1 elif(pointers[i][j]=='L'): str1=' '+str1 str2=T[j-1]+str2 str_match='I'+str_match j-=1 else: str1=S[i-1]+str1 str2=' '+str2 str_match='D'+str_match i-=1 print(str_match) print(str1) print(str2) return str1, str2 #end of def #========== #start of def def convert_alignment_into_steps(str1, str2): pos=1 step=1 for i in range(len(str1)): if(step>50): break if(str1[i]==' '): print("Step "+ str(step)+ " : Insert "+ str(str2[i])) step+=1 pos+=1 elif(str2[i]==' '): print("Step "+ str(step)+ " : Delete "+ str(str1[i])) step+=1 elif(str1[i]!=str2[i]): print("Step "+ str(step)+ " : Replace "+ str(str1[i])+" with "+str(str2[i])) step+=1 pos+=1 else: print("Step "+ str(step)+" : Match "+ str(str1[i])) step+=1 pos+=1 #end of def #========== #start of def def edit_distance_with_pointer(S,T): m=len(S) n=len(T) D=[[0 for j in range(n+1)]for i in range(m+1)] pointers=[['' for j in range(n+1)] for i in range(m+1)] D[0][0]= 0 for i in range(m+1): D[i][0]=i pointers[i][0]='U' for j in range(1,n+1): D[0][j]=j pointers[0][j]='L' for i in range(1,m+1): for j in range(1,n+1): D[i][j]=min(min(D[i][j-1]+1,D[i-1][j]+1),s(S[i-1], T[j-1])+D[i-1][j-1]) if(i==1 and j==1): pointers[i][j]='D' elif(D[i][j]==D[i-1][j]+1): pointers[i][j]='U' elif(D[i][j]==D[i][j-1]+1): pointers[i][j]='L' else: pointers[i][j]='D' print(D[m][n]) print("=====") str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers) print("=====") convert_alignment_into_steps(str1, str2) #end of def #========== #start of def def ParseProteinsTxt(): ''' This is a simple function to parse the Fasta files and return the amino acid sequence it reads. ''' file_location='II-9-3-2022-proteins.txt' dict_of_sequences={} sequence_curr="" curr_sequence_name="" with open(file_location, "r") as fh: for line in fh: if(line.startswith("Protein")): curr_line=''.join(list(line)) curr_sequence_name=curr_line[-2] dict_of_sequences[curr_sequence_name]="" elif(line.startswith("#")): if(sequence_curr!=""): dict_of_sequences[curr_sequence_name]=sequence_curr sequence_curr="" curr_sequence_name="" else: seq_on_line = ''.join(list(line)) seq_on_line=seq_on_line[:-1] sequence_curr+=seq_on_line return dict_of_sequences #end of def #========== #start of def def ParseBLOSUM(): file_location='II-9-3-2022-blosum.txt' blosum_matrix={} with open(file_location, "r") as fh: count=0 aa_list="" for line in fh: if(count==0): seq_on_line = ''.join(list(line.split(' '))) seq_on_line=seq_on_line[:-1] aa_list=seq_on_line else: seq_on_line = ''.join(list(line)) seq_on_line=seq_on_line[:-1] if(len(seq_on_line)>0): nums_temp_=line[1:-1].split(' ') nums_temp=[] for curr_rem in nums_temp_: if(len(curr_rem)!=0): nums_temp.append(curr_rem) nums=[int(n) for n in nums_temp] curr_AA_row=seq_on_line[0] for i in range(len(aa_list)): blosum_matrix[(curr_AA_row,aa_list[i])]=nums[i] count+=1 return blosum_matrix #end of def #========== #start of def def new_s(Si,Tj): return blosum_matrix[(Si,Tj)] #end of def #========== #start of def def max_score_with_scoring_matrix(S,T): m=len(S) n=len(T) D=[['' for j in range(n+1)]for i in range(m+1)] pointers=[['' for j in range(n+1)] for i in range(m+1)] D[0][0]= 0 for i in range(m+1): D[i][0]=i*-8 pointers[i][0]='U' for j in range(1,n+1): D[0][j]=j*-8 pointers[0][j]='L' for i in range(1,m+1): for j in range(1,n+1): D[i][j]=max(max(D[i][j-1]-8,D[i-1][j]-8),new_s(S[i-1], T[j-1])+D[i-1][j-1]) if(i==1 and j==1): pointers[i][j]='D' elif(D[i][j]==D[i-1][j]-8): pointers[i][j]='U' elif(D[i][j]==D[i][j-1]-8): pointers[i][j]='L' else: pointers[i][j]='D' print(D[m][n]) print("=====") str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers) print("=====") convert_alignment_into_steps(str1, str2) #Complexity of this is O(nm) #end of def #========== #start of def def scoring_for_gaps(S,T,u, flag_for_printing): ''' V(i,j)=max(E(i,j), F(i,j), G(i,j)) E(i,j)=max_{0<=k<j}(V(i,k) + w(j-k)) F(i,j)=max_{0<=k<i}(V(k,j) + w(i-k)) G(i,j)=V(i-1,j-1)+s(Si,Tj) normally: O(mn(n+m)) w(l)={0; l=0 \\ u; l>=1} u<0 E(i,j)=max_{0<=k<=j-1}(V(i,k)+w(j-k))=max(E(i,j-1), V(i,j-1)+w(1)) (only deletions) F(i,j)=u+max_{0<=k<=i-1}(V(k,j)+w(i-k))=max(F(i-1,j), V(i-1,j)+w(1)) (only insertions) G(i,j)=V(i-1,j-1)+s(Si,Tj) (replacement on last character) V(i,j)=max(E(i,j), F(i,j), G(i,j)) V(0,0)=0 E(0,0)=0 F(0,0)=0 V(i,0)=u(i>0) V(0,j)=u(j>0) E(i,0)=u (i>0) F(0,j)=u(j>0) E(0,j)=0 (j>0) F(i,0)=0 (i>0) ''' m=len(S) n=len(T) V=[[0 for j in range(n+1)] for i in range(m+1)] E=[[0 for j in range(n+1)]for i in range(m+1)] F=[[0 for j in range(n+1)]for i in range(m+1)] G=[[0 for j in range(n+1)]for i in range(m+1)] pointers=[['' for j in range(n+1)] for i in range(m+1)] for i in range(m+1): V[i][0]=u E[i][0]=u pointers[i][0]='U' for j in range(n+1): V[0][j]=u F[0][j]=u pointers[0][j]='L' V[0][0]=0 for i in range(1,m+1): for j in range(1,n+1): E[i][j]=max(E[i][j-1], V[i][j-1]+u) F[i][j]=max(F[i-1][j], V[i-1][j]+u) G[i][j]=V[i-1][j-1]+new_s(S[i-1],T[j-1]) V[i][j]=max(E[i][j], F[i][j], G[i][j]) if(V[i][j]==G[i][j]): pointers[i][j]='D' elif(V[i][j]==E[i][j]): pointers[i][j]='L' else: pointers[i][j]='U' if(flag_for_printing): print(V[m][n]) print("=====") str1, str2 = convert_edit_sequence_to_alignment(S,T,pointers) print("=====") convert_alignment_into_steps(str1, str2) return V[m][n] #end of def #========== #start of def def new_new_s(Si,Tj): if(Si==Tj): return 1 else: return -1 #end of def #========== #start of def def scoring_for_gaps_new(S,T,u, flag_for_printing): m=len(S) n=len(T) V=[[0 for j in range(n+1)] for i in range(m+1)] E=[[0 for j in range(n+1)]for i in range(m+1)] F=[[0 for j in range(n+1)]for i in range(m+1)] G=[[0 for j in range(n+1)]for i in range(m+1)] for i in range(m+1): V[i][0]=u E[i][0]=u for j in range(n+1): V[0][j]=u F[0][j]=u V[0][0]=0 for i in range(1,m+1): for j in range(1,n+1): E[i][j]=max(E[i][j-1], V[i][j-1]+u) F[i][j]=max(F[i-1][j], V[i-1][j]+u) G[i][j]=V[i-1][j-1]+new_new_s(S[i-1],T[j-1]) V[i][j]=max(E[i][j], F[i][j], G[i][j]) return V[m][n] #end of def #========== #start of def def montecarloestimation(N, n, u, p_): Ui=['' for x in range(N)] Vi=['' for x in range(N)] V_gap=[0 for x in range(N)] rng = np.random.default_rng() for i in range(N): pred=rng.random() U=rng.choice(['a','b'],n, p=[p_,1-p_]) Ui[i]=''.join(U) V=rng.choice(['a','b'],n, p=[p_,1-p_]) Vi[i]=''.join(V) V_gap[i]=scoring_for_gaps_new(Ui[i], Vi[i], u, False) mean=sum(V_gap)/N return (mean/n) #end of def
Editor is loading...