labp4.py
unknown
python
2 years ago
6.6 kB
9
Indexable
### Your task is to finish the 'upgma' function below. You may use as much of the existing code as you please.
### If you use all existing code, it should be enough to enter code under each "Write code here:" statement.
import numpy as np
# Replace these with your names
authors = ['Alexander De Rosa', 'Jasin Aman Ali']
# 'dist_matr' is set
dist_matr = np.array([[0, 0.1, 0.2, 0.3],
[0.1, 0, 0.2, 0.3],
[0.2, 0.2, 0, 0.3],
[0.3, 0.3, 0.3, 0]])
# 'names_list' is set
names_list = ['S1', 'S2', 'S3', 'S4']
# This allows testing the code by running the script in the terminal: $ python3 labp4.py
# Keeping these unchanged, the output should be '(S4,(S3,(S1,S2)));' or '(S4,(S3,(S2,S1)));' or corresponding.
def upgma(dist_matr, names_list):
# 'names_list' is copied into 'nwk_list'. To start with, 'nwk_list' will contain one sequence name per element,
# but it will be updated in each iteration of the while-loop below.
nwk_list = names_list
# 'cluster_list' is generated. This is a list of lists. Each element represents a cluster, as a list of the
# indices of the sequences of the cluster (0 for 'S1', 1 for 'S2', etc). To start with, each element has just one
# index in it, but it will be updated in each iteration of the while-loop below.
cluster_list = []
for i in range(len(names_list)):
cluster_list.append([i])
# The orginal 'dist_matr' is copied into 'updated_dist_matr'. The idea is to keep 'dist_matr' unchanged
# but make changes to 'updated_dist_matr' in each iteration of the while-loop below.
updated_dist_matr = dist_matr
# while-loop that in each iteration merges the pair of clusters with smallest average distance between objects
# until there is only one cluster.
while(len(cluster_list) > 1):
### Find the pair of clusters in 'updated_dist_matr' with the smallest distance (we call these "winning clusters"
# in the comments below), and get their indices in 'updated_dist_matr'.
# One way could be to go through each row / column combination of 'updated_dist_matr' with a nested for loop.
# Write code here:
min_i = 0
min_j = 1
min_val = updated_dist_matr[0, 1]
for i in range(len(updated_dist_matr)):
for j in range(i+1, len(updated_dist_matr)):
if updated_dist_matr[i, j] < min_val:
min_val = updated_dist_matr[i, j]
min_i = i
min_j = j
### Update 'cluster_list'.
# Make a list 'in_new_cluster' by combining the lists of sequence indices (from 'cluster_list') of the two "winning clusters".
# Write code here:
in_new_cluster = cluster_list[min_i] + cluster_list[min_j]
# Remove the elements of the two "winning clusters" in 'cluster_list' (if done in two steps, it may be wise to remove
# the later element first, to not screw up the indexing).
# Write code here:
del cluster_list[max(min_i, min_j)] # remove later element first
del cluster_list[min(min_i, min_j)]
# Add the 'in_new_cluster' list as an element to the end of 'cluster_list'.
# Write code here:
cluster_list.append(in_new_cluster)
### Updaate 'nwk_list'.
# Combine the two "winning clusters" of 'nwk_list' in one string, separating them with a "," and adding a "(" before
# and a ")" after. For example, combining 'S1' and 'S2' should give '(S1,S2)'.
# Write code here:
new_cluster_nwk = '(' + nwk_list[min_i] + ',' + nwk_list[min_j] + ')'
# Remove the elements of of the two "winning clusters" in 'nwk_list' (if done in two steps, it may be wise to remove
# the later element first, to not screw up the indexing).
# Write code here:
del nwk_list[max(min_i, min_j)] # remove later element first
del nwk_list[min(min_i, min_j)]
# Add the 'new_cluster_nwk' string as an element to the end of 'nwk_list'.
# Write code here:
nwk_list.append(new_cluster_nwk)
### Update 'updated_dist_matr'.
# Remove the two rows and columns of 'updated_dist_matr' that correspond to the two "winning clusters".
# Write code here:
updated_dist_matr = np.delete(updated_dist_matr, [min_i, min_j], axis=0) # remove rows
updated_dist_matr = np.delete(updated_dist_matr, [min_i, min_j], axis=1) # remove columns
# Add a new row of zeros and a new column of zeros to the "end" of 'updated_dist_matr' (the zeros will be replaced below).
# We've done it for you since it's a bit tricky.
updated_dist_matr = np.append(updated_dist_matr, np.zeros((1, updated_dist_matr.shape[1])), 0) # add row with zeros to the end (bottom)
updated_dist_matr = np.append(updated_dist_matr, np.zeros((updated_dist_matr.shape[0], 1)), 1) # add column with zeros to the end (right)
# Calculate the distance between the new cluster and each other cluster, using the average method,
# and insert these distances in the last row and column of 'updated_dist_matr'.
# The indices of the sequences of the new cluster are in 'in_new_cluster'. One approach could be to go through each element
# of 'cluster_list', except the last, to get the indices of the sequences of the other clusters, and calculate the average of
# these sequences' distances to the sequences of the new cluster, using the values in the original 'dist_matr'.
# Write code here:
for i in range(len(updated_dist_matr)-1):
dist_sum = 0
count = 0
for x in in_new_cluster:
for y in cluster_list[i]:
dist_sum += dist_matr[x, y]
count += 1
avg_dist = dist_sum / count
updated_dist_matr[i, -1] = avg_dist
updated_dist_matr[-1, i] = avg_dist
## After the while loop, all clusters should have been merged into one, and only one element should exist in 'nwk_list'.
# We convert this to a string and add a ";" to the end, to get proper Newick format. This string is then returned.
nwk = nwk_list[0] + ';'
return (nwk)
# Test code for the upgma function.
# Will only be executed if this file is run directly
# e.g. by running the command "python3 labp4.py"
if __name__ == "__main__":
nwk = upgma(dist_matr, names_list)
print(nwk)
Editor is loading...