Untitled
unknown
python
4 years ago
4.4 kB
53
Indexable
import pickle
from sklearn.cluster import DBSCAN
from operator import itemgetter
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Load the pickled (serialized) match information. Get this using the Riot API.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only point these paths at files you generated yourself.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left open for the lifetime of the script.
with open('./all_traits-trait_data_points.pkl', 'rb') as trait_file:
    pickled = pickle.load(trait_file)
# Element 0 holds the collection of trait names, element 1 the per-player
# trait data points that get vectorised further down.
tdp = pickled[1]
all_traits = sorted(pickled[0])
with open('./participant_info_vec.pkl', 'rb') as participant_file:
    participant_info = pickle.load(participant_file)
# Create a dictionary from trait to coordinate, following the sorted order of
# the trait names, e.g. Chemtech -> 5, Bruiser -> 4 (not actual numbers).
trait_to_index = {trait: i for i, trait in enumerate(all_traits)}
# Accumulators that the later sections of the script fill in.
units_vec = []
placement_vec = []
units_to_index = dict()
all_units = set()


def _encode_traits(traits):
    """Return one data point as a vector with one coordinate per known trait.

    Each coordinate holds the 'num_units' value (as a float) of the trait
    mapped to that position by trait_to_index; traits that do not occur in
    the data point stay at 0.
    """
    encoded = np.zeros(len(all_traits))
    for entry in traits:
        encoded[trait_to_index[entry['name']]] = float(entry['num_units'])
    return encoded


# Convert each datapoint into a vector [a b c d e ...] where a is the number
# of units with the trait at coordinate 1, b the number at coordinate 2, etc.
tdp_vec = np.array([_encode_traits(data_point) for data_point in tdp])
# First pass over the players: record each placement and collect every unit
# id that shows up on any board.
for player in participant_info:
    placement_vec.append(player['placement'])
    all_units.update(piece['character_id'] for piece in player['units'])
# Give every unit a stable coordinate: its position in the sorted id list.
all_units = sorted(all_units)
units_to_index.update((name, pos) for pos, name in enumerate(all_units))
# Second pass: convert each player's units into a vector.
# 0 indicates unit not present, 1 indicates unit present.
for player in participant_info:
    presence = np.zeros(len(all_units))
    for piece in player['units']:
        presence[units_to_index[piece['character_id']]] = 1
    units_vec.append(presence)
# Embed the trait vectors in two dimensions with t-SNE (PCA initialisation).
# `cutoff` currently spans every data point, so the slice keeps all rows.
cutoff = len(tdp_vec)
tsne = TSNE(n_components=2, init='pca')
X_embedded = tsne.fit_transform(np.array(tdp_vec[0:cutoff]))
# Cluster the embedded points with DBSCAN.
db = DBSCAN(eps=3.3, min_samples=10).fit(X_embedded)
labels = db.labels_
# Boolean mask that is True exactly at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# The label -1 is treated as noise, so it does not count as a cluster.
n_clusters_ = len(set(labels)) - int(-1 in labels)
n_noise_ = list(labels).count(-1)
# Per-cluster buckets; one empty list per non-noise label is added below and
# filled in by the grouping step that follows the plot.
label_to_units = []
label_placements = []
unique_labels = set(labels)
# Colour map: one evenly spaced tab20 colour per distinct label.
colors = [plt.cm.tab20(each) for each in np.linspace(0, 1, len(unique_labels))]
# Colour/label each point by its DBSCAN classification.
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    else:
        label_to_units.append([])
        label_placements.append([])
    class_member_mask = labels == k
    face = tuple(col)
    # Core samples are drawn first and carry the legend entry; the remaining
    # members of the label are drawn the same way but without a legend label.
    for in_subset, legend_kwargs in (
        (core_samples_mask, {"label": k}),
        (~core_samples_mask, {}),
    ):
        xy = X_embedded[class_member_mask & in_subset]
        plt.plot(
            xy[:, 0],
            xy[:, 1],
            "o",
            markerfacecolor=face,
            markeredgecolor="k",
            **legend_kwargs,
        )
plt.legend()
# Group units/placements by cluster, leaving out points labelled as noise (-1).
# NOTE(review): this pairs row i of the trait data with entry i of
# participant_info, i.e. it assumes both pickles list players in the same
# order - confirm against the code that produced them.
for idx in np.flatnonzero(labels != -1):
    cluster = labels[idx]
    label_to_units[cluster].append(units_vec[idx])
    label_placements[cluster].append(placement_vec[idx])
# Print the most used units and the average placement for each cluster.
N = 15
# Iterate over the buckets that were actually created (one per non-noise
# label). Using len(unique_labels) - 1 here assumed a noise label was always
# present and silently dropped the last cluster whenever DBSCAN found no noise.
for i in range(len(label_to_units)):
    cluster_units = label_to_units[i]
    # Skip clusters < 20 data points
    if len(cluster_units) < 20:
        continue
    # Average number of times each unit is played in this cluster, i.e. the
    # fraction of the cluster's players that fielded it.
    vec = np.mean(cluster_units, axis=0)
    # Get the top N most played units for the cluster. argpartition requires
    # kth < len(vec), so when there are N or fewer units every unit is kept.
    if N < len(vec):
        result_args = np.argpartition(-vec, N)[:N]
    else:
        result_args = np.arange(len(vec))
    # Header line: cluster id, mean placement, number of players in the cluster.
    print(i, np.mean(label_placements[i]), len(label_placements[i]))
    # Read each share through the same index as its unit name so the two can
    # never get out of step, then list the units from most to least played.
    ranked = sorted(
        ((all_units[x], vec[x]) for x in result_args),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for unit_name, share in ranked:
        # The first five characters of the id are dropped for display
        # (presumably a set prefix such as "TFT5_" - confirm against the data).
        print(unit_name[5:], str(round(share * 100, 2)) + '%')
print(len(tdp_vec))
plt.show()
#for i in range(len(unique_labels)-1):
# plt.hist(label_placements[i], bins = 8)
# plt.show()
Editor is loading...