
Commit b311d34

Clean up of k-Means Clustering.
1 parent a72a812 commit b311d34

File tree

5 files changed: +168 −147 lines changed


K-Means/KMeans.swift

+62 −61

@@ -6,74 +6,75 @@
 import Foundation
 
 class KMeans {
-  var numCenters:Int
-  var convergeDist:Double
-
-  init(numCenters:Int, convergeDist:Double) {
-    self.numCenters = numCenters
-    self.convergeDist = convergeDist
+  let numCenters: Int
+  let convergeDist: Double
+
+  init(numCenters: Int, convergeDist: Double) {
+    self.numCenters = numCenters
+    self.convergeDist = convergeDist
+  }
+
+  private func nearestCenter(x: Vector, centers: [Vector]) -> Int {
+    var nearestDist = DBL_MAX
+    var minIndex = 0
+
+    for (idx, c) in centers.enumerate() {
+      let dist = x.distTo(c)
+      if dist < nearestDist {
+        minIndex = idx
+        nearestDist = dist
+      }
     }
+    return minIndex
+  }
+
+  func findCenters(points: [Vector]) -> [Vector] {
+    var centerMoveDist = 0.0
+    let zeros = [Double](count: points[0].length, repeatedValue: 0.0)
 
-  private func nearestCenter(x: Vector, Centers: [Vector]) -> Int {
-    var nearestDist = DBL_MAX
-    var minIndex = 0;
-
-    for (idx, c) in Centers.enumerate() {
-      let dist = x.distTo(c)
-      if dist < nearestDist {
-        minIndex = idx
-        nearestDist = dist
-      }
-    }
-    return minIndex
-  }
+    var kCenters = reservoirSample(points, k: numCenters)
 
-  func findCenters(points: [Vector]) -> [Vector] {
-    var centerMoveDist = 0.0
-    let zeros = [Double](count: points[0].length, repeatedValue: 0.0)
-
-    var kCenters = reservoirSample(points, k: numCenters)
-
-    repeat {
-      var cnts = [Double](count: numCenters, repeatedValue: 0.0)
-      var newCenters = [Vector](count:numCenters, repeatedValue: Vector(d:zeros))
-
-      for p in points {
-        let c = nearestCenter(p, Centers: kCenters)
-        cnts[c]++
-        newCenters[c] += p
-      }
-
-      for idx in 0..<numCenters {
-        newCenters[idx] /= cnts[idx]
-      }
-
-      centerMoveDist = 0.0
-      for idx in 0..<numCenters {
-        centerMoveDist += kCenters[idx].distTo(newCenters[idx])
-      }
-
-      kCenters = newCenters
-    } while(centerMoveDist > convergeDist)
-    return kCenters
-  }
+    repeat {
+      var cnts = [Double](count: numCenters, repeatedValue: 0.0)
+      var newCenters = [Vector](count:numCenters, repeatedValue: Vector(d:zeros))
+
+      for p in points {
+        let c = nearestCenter(p, centers: kCenters)
+        cnts[c] += 1
+        newCenters[c] += p
+      }
+
+      for idx in 0..<numCenters {
+        newCenters[idx] /= cnts[idx]
+      }
+
+      centerMoveDist = 0.0
+      for idx in 0..<numCenters {
+        centerMoveDist += kCenters[idx].distTo(newCenters[idx])
+      }
+
+      kCenters = newCenters
+    } while centerMoveDist > convergeDist
+
+    return kCenters
+  }
 }
 
 // Pick k random elements from samples
 func reservoirSample<T>(samples:[T], k:Int) -> [T] {
-  var result = [T]()
-
-  // Fill the result array with first k elements
-  for i in 0..<k {
-    result.append(samples[i])
-  }
-  // randomly replace elements from remaining pool
-  for i in (k+1)..<samples.count {
-    let j = random()%(i+1)
-    if j < k {
-      result[j] = samples[i]
-    }
+  var result = [T]()
+
+  // Fill the result array with first k elements
+  for i in 0..<k {
+    result.append(samples[i])
+  }
+  // randomly replace elements from remaining pool
+  for i in (k+1)..<samples.count {
+    let j = random() % (i+1)
+    if j < k {
+      result[j] = samples[i]
     }
-  return result
+  }
+  return result
 }
 
K-Means/README.markdown

+101 −0

@@ -0,0 +1,101 @@

# k-Means Clustering

Goal: Partition data into **k** clusters based on nearest means.

The idea behind k-Means is to take data that has no formal classification and determine whether there are any natural clusters (groups of related objects) within it.

k-Means assumes that there are **k centers** within the data. The data points closest to these *centroids* become classified, or grouped, together. k-Means doesn't tell you what each group represents, but it assists in finding what clusters potentially exist.

## The algorithm

The k-Means algorithm is really quite simple at its core:

1. Choose **k** random points to be the initial centers.
2. Repeat the following two steps until the *centroids* reach convergence:
    1. Assign each point to its nearest *centroid*.
    2. Update each *centroid* to the mean of the points assigned to it.

Convergence is reached when none of the *centroids* moves more than the convergence distance during an update step.

This brings about a few of the parameters that are required for k-Means:

- **k**: The number of *centroids* to attempt to locate.
- **convergence distance**: If every center moves less than this distance during an update step, the algorithm is considered to have converged.
- **distance function**: There are a number of distance functions that can be used, but most commonly the Euclidean distance function is adequate, though it can lead to convergence not being reached in higher dimensionality. A minimal sketch of such a function follows this list.
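For illustration, this is what a Euclidean distance function might look like. It is a minimal sketch under the assumption that points are plain `[Double]` arrays; the code in this commit instead wraps points in a `Vector` type with a `distTo(_:)` method.

```swift
import Foundation

// Sketch of Euclidean distance between two equal-length points.
// Assumption: points are plain [Double] arrays rather than the
// repository's Vector type.
func euclidean(v1: [Double], _ v2: [Double]) -> Double {
  var sum = 0.0
  for (a, b) in zip(v1, v2) {
    // accumulate the squared difference in each dimension
    sum += (a - b) * (a - b)
  }
  return sqrt(sum)
}
```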
This is what the algorithm would look like in Swift:

```swift
func kMeans(numCenters: Int, convergeDist: Double, points: [Vector]) -> [Vector] {
  var centerMoveDist = 0.0
  let zeros = [Double](count: points[0].length, repeatedValue: 0.0)

  var kCenters = reservoirSample(points, k: numCenters)

  repeat {
    var cnts = [Double](count: numCenters, repeatedValue: 0.0)
    var newCenters = [Vector](count: numCenters, repeatedValue: Vector(d: zeros))

    for p in points {
      let c = nearestCenter(p, centers: kCenters)
      cnts[c] += 1
      newCenters[c] += p
    }

    for idx in 0..<numCenters {
      newCenters[idx] /= cnts[idx]
    }

    centerMoveDist = 0.0
    for idx in 0..<numCenters {
      centerMoveDist += euclidean(kCenters[idx], newCenters[idx])
    }

    kCenters = newCenters
  } while centerMoveDist > convergeDist

  return kCenters
}
```
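As a usage sketch, assuming the `KMeans` class and the `Vector(d:)` initializer from this commit (`reservoirSample` and `nearestCenter` are defined in KMeans.swift above), with made-up point values:

```swift
// Cluster six 2-dimensional points into two groups. The point values
// here are illustrative only.
let points = [
  Vector(d: [1.0, 1.0]), Vector(d: [1.5, 2.0]), Vector(d: [2.0, 1.5]),  // lower-left group
  Vector(d: [8.0, 8.5]), Vector(d: [8.5, 9.0]), Vector(d: [9.0, 8.0])   // upper-right group
]

let kmm = KMeans(numCenters: 2, convergeDist: 0.001)
let centers = kmm.findCenters(points)
// centers should now hold two vectors, one near the mean of each group
```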
## Example

These examples are contrived to show the exact nature of k-Means and finding clusters. The clusters are very easily identified by human eyes: we see there is one in the lower left corner, one in the upper right corner, and maybe one in the middle.

In all of these examples the squares represent the data points and the stars represent the *centroids*.

#### Good clustering

This first example shows k-Means finding all three clusters:

![Good Clustering](Images/k_means_good.png)

The selection of initial centroids found the lower left cluster (indicated by red) and did pretty well on the center and upper right clusters.

#### Bad clustering

The next two examples highlight the unpredictability of k-Means and show that it does not always find the best clustering.

![Bad Clustering 1](Images/k_means_bad1.png)

As you can see here, the initial *centroids* were all a little too close together, and the blue one never quite made it to a good place. By adjusting the convergence distance we should be able to improve the result.

![Bad Clustering 2](Images/k_means_bad2.png)

In this example, the blue cluster never managed to separate from the red cluster, and as such it got stuck in the lower region.
## Performance

The first thing to recognize is that finding an optimal k-Means clustering is an NP-hard problem. The selection of the initial *centroids* has a big effect on how the resulting clusters turn out, which means that finding an exact solution is unlikely -- even in 2-dimensional space.

As seen from the steps above, the complexity really isn't that bad -- it is often considered to be on the order of **O(kndi)**, where **k** is the number of *centroids*, **n** is the number of **d**-dimensional vectors, and **i** is the number of iterations for convergence.
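To make the formula concrete with made-up numbers: clustering **n** = 10,000 points of dimension **d** = 2 into **k** = 3 clusters over **i** = 20 iterations is on the order of 3 × 10,000 × 2 × 20 = 1,200,000 basic distance operations.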
The amount of data has a big linear effect on the running time of k-Means, but tuning how far you want the *centroids* to converge can have a big impact on how many iterations are done. As a general rule, **k** should be relatively small compared to the number of vectors.

Oftentimes as more data is added, certain points may lie on the boundary between two *centroids*; those centroids would then continue to bounce back and forth, and the **convergence** distance would need to be tuned to prevent that.

## See Also

[k-Means Clustering on Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)

*Written by John Gill*

K-Means/README.md

-83
This file was deleted.

K-Means/Tests/Tests.xcodeproj/project.pbxproj

+3 −2

@@ -16,8 +16,8 @@
 		B80894DB1C852CFA0018730E /* KMeans.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = KMeans.swift; path = ../KMeans.swift; sourceTree = SOURCE_ROOT; };
 		B80894E01C852D100018730E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
 		B80894E31C852D100018730E /* KMeansTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KMeansTests.swift; sourceTree = SOURCE_ROOT; };
-		B80894E51C852D100018730E /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; name = Info.plist; path = ../Info.plist; sourceTree = "<group>"; };
-		B80894E91C852DA00018730E /* Vector.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = Vector.swift; path = ../Vector.swift; sourceTree = "<group>"; };
+		B80894E51C852D100018730E /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; path = Info.plist; sourceTree = "<group>"; };
+		B80894E91C852DA00018730E /* Vector.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Vector.swift; sourceTree = SOURCE_ROOT; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -254,6 +254,7 @@
 				B80894E81C852D100018730E /* Release */,
 			);
 			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
 		};
 /* End XCConfigurationList section */
 };

README.markdown

+2 −1

@@ -99,7 +99,8 @@ Bad sorting algorithms (don't use these!):
 - Statistics
 
 ### Machine learning
-- [k-Means](K-Means/). Unsupervised classifier that partitions data into k clusters.
+
+- [k-Means Clustering](K-Means/). Unsupervised classifier that partitions data into *k* clusters.
 - k-Nearest Neighbors
 - Linear Regression
 - Logistic Regression
