Untitled

mail@pastecode.io avatar
unknown
python
6 months ago
2.2 kB
1
Indexable
Never
    def fit(self, data: pd.DataFrame, animate=False, colour_map=None):
        """Cluster the rows of ``data`` into ``self.k`` groups with k-means.

        The centres are initialised with ``self.k`` rows sampled from ``data``
        (seeded by ``self.seed``). The method then alternates between assigning
        every row to its nearest centre, as measured by ``self.distance``, and
        moving every centre to the mean of its assigned rows, until a full pass
        changes no assignment.

        Side effects: stores ``data`` on ``self.data`` and the current centres
        on ``self.c`` (a DataFrame whose index ``0..k-1`` is the cluster id).

        Args:
            data: One row per instance. Rows are tracked by index label, so
                labels are assumed to be unique -- TODO confirm with callers.
            animate: When true, ``self._savefig`` is called after every
                assignment pass and once more after convergence.
            colour_map: Passed through to ``self._savefig``; must be truthy
                when ``animate`` is set.

        Returns:
            A Series indexed like ``data`` holding each row's cluster id.

        Raises:
            ValueError: If ``animate`` is set without a ``colour_map``.
        """
        if animate and not colour_map:
            raise ValueError('Argument colourmap has to be specified if animate is set to true.')

        self.data = data

        # Seed the centres with k sampled rows, relabelled 0..k-1 so that a
        # centre's index label doubles as its cluster id.
        self.c: pd.DataFrame = data.sample(self.k, random_state=self.seed)
        self.c.index = range(self.k)

        assignments = {}  # index label of a row -> id of its current cluster
        converged = False
        loops = 1

        while not converged:
            converged = True

            # Step 1: assign every instance to its closest centre.
            for label, instance in data.iterrows():
                closest_cluster, min_d = None, float('inf')
                for j, centre in self.c.iterrows():
                    d = self.distance(instance, centre)
                    # Strict '<' keeps the first centre on ties.
                    if d < min_d:
                        closest_cluster, min_d = j, d

                # -1 stands for "not assigned yet", so the first pass never
                # counts as converged.
                if assignments.get(label, -1) != closest_cluster:
                    converged = False
                assignments[label] = closest_cluster

            if animate:
                self._savefig(data, assignments, f'{loops}_{time.time()}.png', colour_map)

            # Step 2: move each centre to the mean of its members.
            # Group the labels once instead of rescanning them per cluster.
            members = defaultdict(list)
            for label, cluster_id in assignments.items():
                members[cluster_id].append(label)

            centres = []
            for j in range(self.k):
                if members[j]:
                    # Assignments are keyed by index *label*, so select with
                    # .loc; positional .iloc breaks on any non-default index.
                    centres.append(data.loc[members[j]].mean())
                else:
                    # An empty cluster has no mean; keep its previous centre
                    # rather than letting it turn into NaN.
                    centres.append(self.c.loc[j])

            # Build a fresh frame instead of writing into self.c in place:
            # this avoids assigning float means into integer-typed columns.
            self.c = pd.DataFrame(centres)
            self.c.index = range(self.k)

            loops += 1

        if animate:
            self._savefig(data, assignments, f'{loops}_{time.time()}.png', colour_map)

        return pd.Series([assignments[x] for x in data.index], index=data.index)
Leave a Comment