-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Expand file tree
/
Copy pathk_medoids.py
More file actions
107 lines (86 loc) · 3.13 KB
/
k_medoids.py
File metadata and controls
107 lines (86 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com)
Requirements:
- numpy
- matplotlib
- scikit-learn (pairwise_distances, and the iris dataset used by the demo)
Python:
- 3.6+ (the code uses f-strings)
Inputs:
- X: 2D numpy array of features
- k: number of clusters
Usage:
1. Define k and X
2. Create initial medoids:
initial_medoids = get_initial_medoids(X, k, seed=0)
3. Run kmedoids:
medoids, cluster_assignment = kmedoids(
X, k, initial_medoids, maxiter=100, verbose=True
)
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
def get_initial_medoids(data, k, seed=None):
    """Draw k distinct rows of ``data`` at random to use as starting medoids.

    Args:
        data: 2D numpy array with one point per row.
        k: number of medoids to draw.
        seed: seed handed to ``np.random.default_rng``; the same seed yields
            the same selection.

    Returns:
        2D numpy array made of the k selected rows of ``data``.
    """
    generator = np.random.default_rng(seed)
    # Sampling row indices without replacement guarantees k different rows
    # (the rows themselves may still hold equal values if data has duplicates).
    chosen_rows = generator.choice(data.shape[0], size=k, replace=False)
    return data[chosen_rows, :]
def assign_clusters(data, medoids):
    """Label every point with the index of its closest medoid.

    Args:
        data: 2D numpy array with one point per row.
        medoids: 2D array with one medoid per row.

    Returns:
        1D numpy array; entry j is the row index in ``medoids`` of the medoid
        with the smallest Euclidean distance to ``data[j]``.
    """
    point_to_medoid = pairwise_distances(data, medoids, metric="euclidean")
    # argmin reports the first minimum, so ties go to the lowest medoid index.
    return point_to_medoid.argmin(axis=1)
def revise_medoids(data, k, cluster_assignment, previous_medoids=None):
    """Recompute the medoid of each cluster.

    The medoid of a cluster is the member whose summed distance to all
    members of the same cluster is smallest.

    Args:
        data: 2D numpy array with one point per row.
        k: number of cluster labels to process (labels 0 .. k-1).
        cluster_assignment: 1D array holding the cluster label of every row
            of ``data``.
        previous_medoids: optional array with one medoid per label. When
            given, a cluster without members keeps ``previous_medoids[i]``,
            so the result always has k rows and row i belongs to label i.

    Returns:
        numpy array of medoids. When ``previous_medoids`` is omitted, empty
        clusters are skipped (the historical behaviour): the result can then
        have fewer than k rows, and its row positions no longer line up with
        the labels in ``cluster_assignment``.
    """
    new_medoids = []
    for i in range(k):
        members = data[cluster_assignment == i]
        if len(members) == 0:
            # An empty cluster has no candidate medoid. Keep the old one when
            # the caller supplied it so label i still maps to row i.
            if previous_medoids is not None:
                new_medoids.append(previous_medoids[i])
            continue
        # Total distance from each member to every member of the cluster.
        total_distances = np.sum(pairwise_distances(members, members), axis=1)
        new_medoids.append(members[np.argmin(total_distances)])
    return np.array(new_medoids)
def compute_heterogeneity(data, k, medoids, cluster_assignment):
    """Sum the squared distances between every point and its cluster medoid.

    Args:
        data: 2D numpy array with one point per row.
        k: number of cluster labels to process (labels 0 .. k-1).
        medoids: sequence of medoids indexed by cluster label.
        cluster_assignment: 1D array holding the cluster label of every row
            of ``data``.

    Returns:
        The accumulated squared distance; 0.0 when every cluster is empty.
        Distances come from ``pairwise_distances`` with its default metric.
    """
    total = 0.0
    for label in range(k):
        cluster_points = data[cluster_assignment == label]
        # Clusters without members contribute nothing to the total.
        if len(cluster_points) > 0:
            to_medoid = pairwise_distances(cluster_points, [medoids[label]])
            total += (to_medoid ** 2).sum()
    return total
def _restore_empty_cluster_medoids(previous, revised, cluster_assignment, k):
    """Re-insert the previous medoid of every cluster that has no members."""
    occupied = np.bincount(cluster_assignment, minlength=k)[:k] > 0
    if len(previous) != k or np.count_nonzero(occupied) != len(revised):
        # Row counts do not line up with k (for example initial_medoids did
        # not have k rows), so rows cannot be matched to labels. Fall back to
        # the revised medoids untouched.
        return revised
    restored = np.array(previous, dtype=np.result_type(previous, revised))
    restored[occupied] = revised
    return restored


def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False):
    """Cluster ``data`` around k medoids by alternating assignment and update.

    Each iteration assigns every point to its nearest medoid and then
    recomputes each cluster's medoid. Iteration stops as soon as the
    assignment equals the one from the previous iteration, or after
    ``maxiter`` iterations.

    Args:
        data: 2D numpy array with one point per row.
        k: number of clusters.
        initial_medoids: 2D numpy array of starting medoids, one per row
            (see ``get_initial_medoids``). It is not modified.
        maxiter: maximum number of iterations.
        verbose: when True, print how many points changed cluster after each
            iteration from the second one on.

    Returns:
        Tuple ``(medoids, cluster_assignment)``. If ``maxiter`` runs out
        before convergence, the labels are the ones computed against the
        medoids from before the final revision. With ``maxiter <= 0`` the
        initial medoids and their assignment are returned.
    """
    medoids = initial_medoids.copy()
    cluster_assignment = None
    prev_assignment = None
    for itr in range(maxiter):
        cluster_assignment = assign_clusters(data, medoids)
        revised = revise_medoids(data, k, cluster_assignment)
        if len(revised) == len(medoids):
            medoids = revised
        else:
            # revise_medoids returned no row for clusters that lost all their
            # members (possible when two medoids have identical coordinates).
            # Keep the previous medoid for those labels so that row i keeps
            # describing label i and the array keeps k rows.
            medoids = _restore_empty_cluster_medoids(
                medoids, revised, cluster_assignment, k
            )
        if (
            prev_assignment is not None
            and (prev_assignment == cluster_assignment).all()
        ):
            break
        if verbose and prev_assignment is not None:
            changed = np.sum(prev_assignment != cluster_assignment)
            print(f"Iteration {itr}: {changed} points changed clusters")
        # assign_clusters builds a new array on every call, so no copy needed.
        prev_assignment = cluster_assignment
    if cluster_assignment is None:
        # The loop never ran (maxiter <= 0); label points by the initial
        # medoids instead of failing on an unbound variable.
        cluster_assignment = assign_clusters(data, medoids)
    return medoids, cluster_assignment
# Optional plotting
def plot_clusters(data, medoids, cluster_assignment):
    """Show a 3D scatter plot of the points and their medoids.

    Only columns 0, 1 and 2 of ``data`` and ``medoids`` are drawn. Points are
    coloured by cluster label; medoids appear as large red crosses.

    Args:
        data: 2D numpy array with at least three columns.
        medoids: 2D numpy array with at least three columns.
        cluster_assignment: per-point labels used as colour values.
    """
    ax = plt.axes(projection="3d")

    px, py, pz = data[:, 0], data[:, 1], data[:, 2]
    ax.scatter(px, py, pz, c=cluster_assignment, cmap="viridis")

    mx, my, mz = medoids[:, 0], medoids[:, 1], medoids[:, 2]
    ax.scatter(mx, my, mz, c="red", s=100, marker="x")

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_zlabel("Z")
    ax.set_title("3D K-Medoids Clustering")
    plt.show()
# Optional test
if __name__ == "__main__":
    # Demo run: cluster the iris feature matrix into three groups and plot it.
    from sklearn import datasets

    X = datasets.load_iris()["data"]
    k = 3
    medoids = get_initial_medoids(X, k, seed=0)
    medoids, clusters = kmedoids(
        X,
        k,
        medoids,
        maxiter=50,
        verbose=True,
    )
    # plot_clusters draws columns 0-2 of X and of the medoids.
    plot_clusters(X, medoids, clusters)