# 用户地理位置的聚类算法实现-演道网

## 2. 用户地理位置信息的的聚类实现

xy = numpy.array([[116.455788, 39.920767], [116.456065, 39.920965], [116.452312, 39.92304], [116.421385, 39.989539],
[116.455685, 39.92069], [116.455876, 39.920845], [116.455973, 39.920902], [116.455645, 39.920657],
[116.456022, 39.920934], [116.455685, 39.920691], [116.456023, 39.920671], [116.45596, 39.920864],
[116.455522, 39.920856], [116.455276, 39.920407], [116.455799, 39.920867],
[116.455349, 39.920425], [116.45511, 39.920377], [116.455318, 39.920442], [116.455298, 39.920474],
[116.455839, 39.920636], [116.455979, 39.921168], [116.454281, 39.920006], [116.45598, 39.920612],
[116.45388, 39.919584], [116.455474, 39.920737], [116.456009, 39.920641], [116.455439, 39.920574],
[116.455759, 39.920841], [116.455838, 39.920644], [116.455983, 39.920847],
[116.459803, 39.922041], [116.456029, 39.92088], [116.455539, 39.920603], [116.455989, 39.920851],
[116.455719, 39.920789], [116.45601, 39.92082], [116.456229, 39.920564], [116.455906, 39.920771],
[116.456248, 39.920868], [116.455805, 39.920544], [116.455896, 39.920758], [116.43692, 39.926767],
[116.454672, 39.92024], [116.454813, 39.917848], [116.381415, 40.00875], [116.422925, 39.980757],
[116.422849, 39.9808], [116.38107, 40.009217], [116.456078, 39.920747], [116.455242, 39.919515],
[116.455615, 39.920533], [116.422092, 39.991104], [116.454847, 39.917724],
[116.456686, 39.924316], [116.45575, 39.920642], [116.456713, 39.924413], [116.455846, 39.920828],
[116.422108, 39.991098], [116.422075, 39.991139], [118.775572, 31.97337], [118.776968, 31.97392],
[118.778187, 31.973121], [118.775695, 31.973254], [118.775302, 31.973807],
[118.776303, 31.973692], [118.777541, 31.973439], [118.776196, 31.973489],
[116.448944, 39.926799], [116.45487, 39.917804], [116.455762, 39.920645], [116.456146, 39.920441],
[116.455857, 39.920043], [116.455458, 39.920826], [116.455533, 39.920791],
[116.455426, 39.920896], [116.45566, 39.920811], [116.455696, 39.920621], [116.453667, 39.9259],
[116.466606, 39.886322], [116.455917, 39.92062]])

### 2.1 基于Kmeans的聚类实现

res, idx = kmeans2(numpy.array(zip(xy[:, 0], xy[:, 1], z)), 2, iter=20, minit='points')

res, idx = kmeans2(numpy.array(zip(xy[:, 0], xy[:, 1], z)), 3, iter=20, minit='points')

### 2.2 基于DBSCAN的聚类实现

DBSCAN算法的重点是选取的聚合半径参数和聚合所需指定的MinPts数目。

def haversine(lonlat1, lonlat2):
lat1, lon1 = lonlat1
lat2, lon2 = lonlat2
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
c = 2 * asin(sqrt(a))
r = 6371  # Radius of earth in kilometers. Use 3956 for miles
return c * r

def clustering_by_dbscan():
......
distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v))))

db = DBSCAN(eps=2, min_samples=5, metric='precomputed')
y_db = db.fit_predict(distance_matrix)
X['cluster'] = y_db
......
plt.scatter(X['lat'], X['lng'], c=X['cluster'])
plt.show()

## 3.基于DBSCAN和Kmeans的混合算法实现

def clustering_by_dbscan_and_kmeans2():
X = pd.DataFrame(
{"lat": [39.920767, 39.920965, 39.92304, 39.989539, 39.92069, 39.920845, 39.920902, 39.920657, 39.920934,
39.920691, 39.920671, 39.920864, 39.920856, 39.920407, 39.920867, 39.920425, 39.920377, 39.920442,
39.920474, 39.920636, 39.921168, 39.920006, 39.920612, 39.919584, 39.920737, 39.920641, 39.920574,
39.920841, 39.920644, 39.920847, 39.922041, 39.92088, 39.920603, 39.920851, 39.920789, 39.92082,
39.920564, 39.920771, 39.920868, 39.920544, 39.920758, 39.926767, 39.92024, 39.917848, 40.00875,
39.980757, 39.9808, 40.009217, 39.920747, 39.919515, 39.920533, 39.991104, 39.917724, 39.924316,
39.920642, 39.924413, 39.920828, 39.991098, 39.991139, 31.97337, 31.97392, 31.973121, 31.973254,
31.973807, 31.973692, 31.973439, 31.973489, 39.926799, 39.917804, 39.920645, 39.920441, 39.920043,
39.920826, 39.920791, 39.920896, 39.920811, 39.920621, 39.9259, 39.886322, 39.92062],
"lng": [116.455788, 116.456065, 116.452312, 116.421385, 116.455685, 116.455876, 116.455973, 116.455645,
116.456022, 116.455685, 116.456023, 116.45596, 116.455522, 116.455276, 116.455799, 116.455349,
116.45511, 116.455318, 116.455298, 116.455839, 116.455979, 116.454281, 116.45598, 116.45388,
116.455474, 116.456009, 116.455439, 116.455759, 116.455838, 116.455983, 116.459803, 116.456029,
116.455539, 116.455989, 116.455719, 116.45601, 116.456229, 116.455906, 116.456248, 116.455805,
116.455896, 116.43692, 116.454672, 116.454813, 116.381415, 116.422925, 116.422849, 116.38107,
116.456078, 116.455242, 116.455615, 116.422092, 116.454847, 116.456686, 116.45575, 116.456713,
116.455846, 116.422108, 116.422075, 118.775572, 118.776968, 118.778187, 118.775695, 118.775302,
118.776303, 118.777541, 118.776196, 116.448944, 116.45487, 116.455762, 116.456146, 116.455857,
116.455458, 116.455533, 116.455426, 116.45566, 116.455696, 116.453667, 116.466606, 116.455917]
})
distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v))))

db = DBSCAN(eps=2, min_samples=5, metric='precomputed')
y_db = db.fit_predict(distance_matrix)

X['cluster'] = y_db

results = {}
for i in X.values:
if i[2] not in results.keys():
results[i[2]] = [[i[1], i[0]]]
else:
if results[i[2]]:
results[i[2]].append([i[1], i[0]])
else:
results[i[2]] = [[i[1], i[0]]]
print "DBSCAN output: ", len(results), results.keys()
print "KMeans calc center as below: "
for k in results.keys():
xy = numpy.array(results[k])
z = numpy.sin(xy[:, 1] - 0.2 * xy[:, 1])

z = whiten(z)

res, idx = kmeans2(numpy.array(zip(xy[:, 0], xy[:, 1], z)), 1, iter=20, minit='points')