# Untitled

unknown
plain_text
10 days ago
2.5 kB
0
Indexable
Never
def compute_emd(x, y):
    """Return the Earth Mover's Distance between two 1-D vectors of values.

    Every entry of ``x`` and ``y`` is treated as a unit-mass point on the real
    line. The result is the minimum total ``|x_i - y_j|`` cost of a one-to-one
    matching between the two sets of points. The total is NOT divided by the
    number of matched pairs.

    Args:
        x: 1-D array-like of real numbers.
        y: 1-D array-like of real numbers.

    Returns:
        The minimum matching cost as a NumPy float64 scalar. When the inputs
        have different lengths only ``min(len(x), len(y))`` pairs are matched
        and the surplus entries of the longer input are ignored, because that
        is how ``linear_sum_assignment`` treats a rectangular cost matrix.

    Raises:
        ValueError: If either input is not one-dimensional.
    """
    # Cast to float64 before subtracting: unsigned-integer inputs (a common
    # dtype for histograms) would otherwise wrap around instead of going
    # negative, producing huge bogus costs.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    if x.ndim != 1 or y.ndim != 1:
        raise ValueError(
            "compute_emd expects 1-D inputs, got shapes "
            f"{x.shape} and {y.shape}"
        )

    if x.size == y.size:
        # On the real line with equally many points on both sides, pairing the
        # k-th smallest of x with the k-th smallest of y is an optimal
        # assignment, so an O(n log n) sort replaces the O(n^3) solver.
        return np.abs(np.sort(x) - np.sort(y)).sum()

    # Unequal lengths: keep the general rectangular assignment.
    cost_matrix = np.abs(x[:, np.newaxis] - y[np.newaxis, :])
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    return cost_matrix[row_ind, col_ind].sum()

def _pairwise_emd(points, centroids):
    """Return the (n_points, n_centroids) matrix of EMD values between rows.

    Rows of both inputs have the same length, so the optimal 1-D matching is
    the sorted pairing: sort every row once and sum absolute differences.
    This yields the same value ``compute_emd`` produces for equal-length
    vectors (up to float rounding) without solving an assignment per pair.
    """
    sorted_points = np.sort(points, axis=1)
    sorted_centroids = np.sort(centroids, axis=1)
    dist = np.empty((sorted_points.shape[0], sorted_centroids.shape[0]))
    for j, centroid in enumerate(sorted_centroids):
        # Accumulate in float64 even when the features are float32.
        dist[:, j] = np.abs(sorted_points - centroid).sum(
            axis=1, dtype=np.float64
        )
    return dist


def _cluster_density(assignments, assigned_dist, k, temperature):
    """Return the per-cluster concentration estimate, rescaled to mean ``temperature``."""
    counts = np.bincount(assignments, minlength=k)
    # NOTE(review): the square root looks like a carry-over from a pipeline
    # that worked with squared L2 distances; EMD values are not squared.
    # Behavior is kept as-is -- confirm that it is intended.
    root_sums = np.bincount(
        assignments, weights=np.sqrt(assigned_dist), minlength=k
    )

    density = np.zeros(k)
    populated = counts > 1
    density[populated] = (
        root_sums[populated] / counts[populated]
    ) / np.log(counts[populated] + 10)

    # Clusters with at most one member get the largest observed density.
    dmax = density.max()
    density[~populated] = dmax

    # Clip extreme values to the 10th-90th percentile range.
    density = density.clip(
        np.percentile(density, 10), np.percentile(density, 90)
    )

    mean_density = density.mean()
    if mean_density == 0:
        # Degenerate case (e.g. every cluster has <= 1 member): dividing by
        # the mean would yield NaN, so fall back to a uniform temperature.
        return np.full(k, temperature, dtype=np.float64)
    return temperature * density / mean_density


def run_kmeans_emd(x, args):
    """Cluster ``x`` with faiss k-means, then reassign samples by EMD.

    For every entry of ``args.num_cluster`` a faiss k-means (L2, on GPU) is
    trained on ``x``. Each sample is then assigned to the centroid with the
    smallest Earth Mover's Distance, and a per-cluster density is derived
    from the EMD of each sample to its assigned centroid.

    Args:
        x: 2-D feature array of shape (n_samples, dim). Presumably a float32,
            C-contiguous NumPy array, as faiss training expects -- confirm at
            the call site.
        args: Namespace-like object providing ``num_cluster`` (iterable of
            cluster counts), ``gpu`` (device id used for the faiss index) and
            ``temperature`` (target mean of the density values).

    Returns:
        A dict with keys ``'im2cluster'``, ``'centroids'`` and ``'density'``.
        Each value is a list with one CUDA tensor per entry of
        ``args.num_cluster``: the sample-to-cluster indices (long), the
        L2-normalized centroids, and the per-cluster densities.
    """
    print('performing kmeans clustering with EMD distance')
    results = {'im2cluster': [], 'centroids': [], 'density': []}

    dim = x.shape[1]

    # The GPU resources and index configuration do not depend on the cluster
    # count, so build them once and share them across iterations.
    res = faiss.StandardGpuResources()
    cfg = faiss.GpuIndexFlatConfig()
    cfg.useFloat16 = False
    cfg.device = args.gpu

    for seed, num_cluster in enumerate(args.num_cluster):
        k = int(num_cluster)
        clus = faiss.Clustering(dim, k)
        clus.verbose = True
        clus.niter = 20
        clus.nredo = 5
        # The position in args.num_cluster doubles as the random seed.
        clus.seed = seed
        clus.max_points_per_centroid = 1000
        clus.min_points_per_centroid = 10

        index = faiss.GpuIndexFlatL2(res, dim, cfg)
        clus.train(x, index)

        # Compute EMD by hand and reassign every sample to its nearest
        # centroid under that distance.
        centroids = faiss.vector_to_array(clus.centroids).reshape(k, dim)
        dist = _pairwise_emd(x, centroids)
        assignments = np.argmin(dist, axis=1)
        assigned_dist = dist[np.arange(dist.shape[0]), assignments]

        density = _cluster_density(
            assignments, assigned_dist, k, args.temperature
        )

        centroids = torch.Tensor(centroids).cuda()
        centroids = F.normalize(centroids, p=2, dim=1)

        im2cluster = torch.as_tensor(assignments, dtype=torch.long).cuda()
        density = torch.Tensor(density).cuda()

        results['centroids'].append(centroids)
        results['density'].append(density)
        results['im2cluster'].append(im2cluster)

    return results