Untitled

mail@pastecode.io avatar
unknown
python
2 years ago
4.4 kB
32
Indexable
Never
import pickle
from sklearn.cluster import DBSCAN
from operator import itemgetter
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

#Some pickled (serialized) files which contain match information. Get this using Riot API.
#NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
#The first pickle is indexed as [0] -> collection of trait names, [1] -> trait data points.
#Files are opened with `with` so the handles are closed as soon as loading finishes.
with open('./all_traits-trait_data_points.pkl','rb') as f:
    pickled = pickle.load(f)
tdp = pickled[1]
all_traits = list(pickled[0])
with open('./participant_info_vec.pkl','rb') as f:
    participant_info = pickle.load(f)

all_traits = sorted(all_traits)

#Map every trait name to its position in the sorted trait list; that position is
#the trait's coordinate in the vectors built later, e.g. Chemtech -> 5, Bruiser -> 4
#(not actual numbers).
trait_to_index = {trait: i for i, trait in enumerate(all_traits)}

#Accumulators that the loops further down append to.
tdp_vec, units_vec, placement_vec = [], [], []

#Unit name -> vector coordinate, and the set of every unit name encountered.
units_to_index = {}
all_units = set()

#Turn every data point into a trait-count vector: the coordinate that trait_to_index
#assigns to a trait holds that trait's 'num_units'; traits missing from the data
#point stay at 0.
n_traits = len(all_traits)
trait_rows = []
for data_point in tdp:
    row = np.zeros(n_traits)
    for trait in data_point:
        row[trait_to_index[trait['name']]] = float(trait['num_units'])
    trait_rows.append(row)
tdp_vec = np.array(trait_rows)

#Record each participant's placement and collect the id of every unit anyone fielded
for part in participant_info:
    placement_vec.append(part['placement'])
    all_units.update(unit['character_id'] for unit in part['units'])

#Freeze the collected unit ids into a sorted list and map each id to its position,
#which becomes that unit's coordinate in the unit vectors.
all_units = sorted(all_units)
units_to_index = {unit: i for i, unit in enumerate(all_units)}

#Build one presence vector per participant: 1 at the coordinate of every unit the
#participant has, 0 everywhere else.
n_units = len(all_units)
for part in participant_info:
    presence = np.zeros(n_units)
    for unit in part['units']:
        presence[units_to_index[unit['character_id']]] = 1
    units_vec.append(presence)

#Embed the trait vectors into 2 dimensions with t-SNE (PCA initialisation).
#cutoff is the number of leading data points to embed; currently all of them.
cutoff = len(tdp_vec)
tsne = TSNE(n_components=2, init='pca')
X_embedded = tsne.fit_transform(np.array(tdp_vec[:cutoff]))

#Cluster the 2-D embedding with DBSCAN; points labelled -1 are noise
db = DBSCAN(eps=3.3, min_samples=10).fit(X_embedded)
labels = db.labels_
unique_labels = set(labels)

#Boolean mask that is True exactly at the core samples
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

#Cluster count excludes the noise label when it is present
noise_present = -1 in labels
n_clusters_ = len(unique_labels) - (1 if noise_present else 0)
n_noise_ = list(labels).count(-1)

#Per-cluster buckets of unit vectors / placements, populated further down
label_to_units = []
label_placements = []

#One tab20 colour per label, sampled evenly across the colour map
colors = [plt.cm.tab20(each) for each in np.linspace(0, 1, len(unique_labels))]

#Scatter every label's points in its colour; each real cluster also gets an
#empty bucket in label_to_units / label_placements for the grouping step.
for k, col in zip(unique_labels, colors):
    if k != -1:
        label_to_units.append([])
        label_placements.append([])
    else:
        # Black used for noise.
        col = [0, 0, 0, 1]

    face = tuple(col)
    class_member_mask = labels == k

    #Core samples of this label (these carry the legend entry)
    core_xy = X_embedded[class_member_mask & core_samples_mask]
    plt.plot(core_xy[:, 0], core_xy[:, 1], "o",
             markerfacecolor=face, markeredgecolor="k", label=k)

    #Remaining (non-core) members of this label
    other_xy = X_embedded[class_member_mask & ~core_samples_mask]
    plt.plot(other_xy[:, 0], other_xy[:, 1], "o",
             markerfacecolor=face, markeredgecolor="k")
plt.legend()

#Drop every non-noise data point's unit vector and placement into its cluster's bucket
for idx, cluster in enumerate(labels):
    if cluster != -1:
        label_to_units[cluster].append(units_vec[idx])
        label_placements[cluster].append(placement_vec[idx])

#Print the most used units and the placement statistics for each cluster
N = 15
#Never request more units than exist: np.argpartition(-vec, N) raised whenever
#there were N or fewer distinct units.
top_n = min(N, len(all_units))
#label_to_units has exactly one bucket per real cluster (noise excluded), so iterate
#over the buckets themselves. The previous range(len(unique_labels)-1) silently
#dropped the last cluster whenever DBSCAN produced no noise points.
for i, (cluster_units, cluster_placements) in enumerate(zip(label_to_units, label_placements)):
    #Skip clusters < 20 data points
    if len(cluster_units) < 20:
        continue
    #Fraction of this cluster's data points that contain each unit
    vec = np.mean(cluster_units, axis=0)
    #Indices of the top_n most played units, most played first (stable, so ties
    #keep the alphabetical order of all_units)
    result_args = np.argsort(-vec, kind='stable')[:top_n]
    #Cluster id, mean placement, cluster size
    print(i, np.mean(cluster_placements), len(cluster_placements))
    for x in result_args:
        #[5:] drops the first five characters of the unit id
        #(presumably a set prefix such as "TFT6_" -- TODO confirm)
        print(all_units[x][5:], str(round(vec[x] * 100,2) ) + '%')

print(len(tdp_vec))
plt.show()
#for i in range(len(unique_labels)-1):
#    plt.hist(label_placements[i], bins = 8)
#    plt.show()