In this post we will present how to cluster embedding vectors, and how to visualize the results.
We start with simple random tensors that represent the embeddings, and plot them using t-SNE, which reduces the 768 features to a 2-dimensional representation.
To make t-SNE work nicely, we will insert a predetermined variance into the data.
import random
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Seed both random number generators used to build the synthetic data so the
# generated embeddings are the same on every run.
random.seed(42)
torch.manual_seed(42)

# Build 100 groups of 20 vectors with 768 features each (2000 rows in total).
# Every group is shifted by its own random offset drawn from [0, 0.5]; this is
# the variance inserted into the data so that t-SNE has structure to show.
# All groups are collected first and concatenated once along the row axis.
embeddings = torch.cat(
    [torch.rand(20, 768) + random.uniform(0, 0.5) for _ in range(100)]
)

# Reduce the 768 features to 2 dimensions for plotting. t-SNE is stochastic,
# so its random state is fixed as well to keep the figure reproducible.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(embeddings)
x, y = tsne_result[:, 0], tsne_result[:, 1]

plt.clf()
plt.scatter(x, y, s=2)
# No legend here: the scatter has no label, so a legend would be empty and
# matplotlib would only emit a warning.
plt.savefig('embeddings.pdf')
The plotted graph is:
Next, we use k-means to cluster the vectors into 10 clusters, and display the resulting clustering.
# Number of clusters k-means is asked to find.
cluster_amount = 10
kmeans_model = KMeans(n_clusters=cluster_amount, random_state=0)
# Cluster the original 768-feature vectors (not the 2-D projection) and keep
# the predicted cluster id of every sample as a plain Python list.
corpus_clusters = kmeans_model.fit_predict(embeddings).tolist()

# Array view of the labels so each cluster's points can be selected with a
# single boolean mask instead of being concatenated one point at a time.
cluster_ids = np.asarray(corpus_clusters)

plt.clf()
for cluster in range(cluster_amount):
    # Rows of the t-SNE projection whose sample was assigned to this cluster.
    cluster_points = tsne_result[cluster_ids == cluster]
    if len(cluster_points) == 0:
        # Nothing to draw (and no legend entry) for a cluster with no samples.
        continue
    x = cluster_points[:, 0]
    y = cluster_points[:, 1]
    # One scatter call per cluster so each gets its own colour and legend label.
    plt.scatter(x, y, s=2, label=cluster)
plt.legend()
plt.savefig("clustered.pdf")
And the plotted clustering is:
No comments:
Post a Comment