K-Means++是一种用于初始化K-Means聚类的方法,它的目的是通过选择合理的初始点来优化K-Means聚类的性能。
K-Means算法的基本流程是:
K-Means++算法的基本思路是:
下面是一段python代码的实例:
import random
import math
class KMeansPlusPlus:
def __init__(self, data_points, K, max_iterations):
self.data_points = data_points
self.K = K
self.max_iterations = max_iterations
self.random = random.Random()
def cluster(self):
# Initialize centroids list
centroids = []
# Randomly select first centroid
centroids.append(self.data_points[self.random.randint(0, len(self.data_points))].centroid)
# Select remaining centroids
for i in range(1, self.K):
# Calculate distance of each data point to nearest centroid
distances = [dp.distance_to_nearest_centroid(centroids) for dp in self.data_points]
# Convert distances to probability distribution
sum_distances = sum(distances)
probabilities = [d / sum_distances for d in distances]
# Randomly select a new centroid from probability distribution
r = self.random.random()
cumulative_probability = 0
for j, p in enumerate(probabilities):
cumulative_probability += p
if r <= cumulative_probability:
centroids.append(self.data_points[j].centroid)
break
# Run K-Means algorithm
kmeans = KMeans(self.data_points, centroids, self.max_iterations)
return kmeans.cluster()
class DataPoint:
def __init__(self, coordinates):
self.coordinates = coordinates
self.centroid = None
def distance_to_nearest_centroid(self, centroids):
min_distance = float("inf")
for centroid in centroids:
distance = self.euclidean_distance(centroid.coordinates)
if distance < min_distance:
min_distance = distance
return min_distance
def euclidean_distance(self, coordinates):
sum_squared_distance = 0
for i in range(len(self.coordinates)):
sum_squared_distance += math.pow(self.coordinates[i] - coordinates[i], 2)
return math.sqrt(sum_squared_distance)
class Centroid:
def __init__(self, coordinates):
self.coordinates = coordinates
self.data_points = []
def update_coordinates(self):
num_data_points = len(self.data_points)
new_coordinates = [0] * len(self.coordinates)
for data_point in self.data_points:
for i in range(len(new_coordinates)):
new_coordinates[i] += data_point.coordinates[i]
for i in range(len(new_coordinates)):
new_coordinates[i] /= num_data_points
self.coordinates = new_coordinates
class KMeans:
def __init__(self, data_points, centroids, max_iterations):
self.data_points = data_points
self.centroids = centroids
self.max_iterations = max_iterations
def cluster(self):
for _ in range(self.max_iterations):
# Clear data points belonging to each centroid
for centroid in self.centroids:
centroid.data_points.clear()
# Assign each data point to nearest centroid
for data_point in self.data_points:
min_distance = float("inf")
nearest_centroid = None
for centroid in self.centroids:
distance = data_point.euclidean_distance(centroid.coordinates)
if distance < min_distance:
min_distance = distance
nearest_centroid = centroid
nearest_centroid.data_points.append(data_point)
data_point.centroid = nearest_centroid
# Update centroid coordinates
for centroid in self.centroids:
centroid.update_coordinates()
return self.centroids