# Geospatial Data Clustering
```python
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint, Polygon
from geopy.geocoders import Nominatim
from geopy.point import Point
import geopandas as gpd
from sklearn.preprocessing import StandardScaler, minmax_scale
```

## 1. Data Loading

The dataset contains 100 million records; each record is three-dimensional: (longitude, latitude, creation time).
```python
path = r"GPS.csv"
# Columns 1-4 hold the customer id, longitude, latitude, and creation time;
# the creation-time column is parsed as datetime.
df = pd.read_csv(path, index_col=0, usecols=[1, 2, 3, 4], parse_dates=[3])
# Drop zero latitudes and points west of 73.3°E (outside the area of interest),
# then remove duplicates.
df = df[(df.latitude != 0) & (df.longitude > 73.3)].drop_duplicates()
# Keep only users (the index is the customer id) with more than 30 records.
df_sort = df.groupby(by=df.index).count().sort_values(by="longitude", ascending=False)
dfIndex = df_sort[df_sort.longitude > 30].index
dftest = df.loc[dfIndex]
dftest.head()
```
| custorm_id | latitude | longitude | create_time |
|-----------:|---------:|----------:|:------------|
| 206301 | 39.842956 | 117.633808 | 2016-08-19 20:27:00 |
| 206301 | 39.842868 | 117.633726 | 2016-08-19 20:11:00 |
| 206301 | 39.842754 | 117.633703 | 2016-08-19 19:50:00 |
| 206301 | 39.842852 | 117.633790 | 2016-08-19 19:38:00 |
| 206301 | 39.842839 | 117.633791 | 2016-08-19 18:56:00 |
## 2. Data Transformation

Reverse-geocode each latitude/longitude pair into a city and province.
```python
def parse_city(latlng):
    # Reverse-geocode a (latitude, longitude) pair to a city and province.
    try:
        locations = geolocator.reverse(Point(latlng), timeout=10)
        loc = locations.raw[u'address']
        if u'state_district' in loc:
            city = loc[u'state_district'].split('/')[0]
        else:
            city = loc[u'county'].split('/')[0]
    except Exception as e:
        print(e)
        city = None
    try:
        state = loc[u'state']
    except Exception as e:
        print(e)
        state = None
    return city, state


def parse_state(latlng):
    # Reverse-geocode a (latitude, longitude) pair to a province name only.
    try:
        locations = geolocator.reverse(Point(latlng), timeout=10)
        state = locations.raw[u'address'][u'state']
    except Exception as e:
        print(e)
        state = None
    return state


# Recent geopy versions require an explicit user_agent for Nominatim;
# the agent string itself is arbitrary.
geolocator = Nominatim(user_agent="geo_clustering")
# geopy's Point expects (latitude, longitude) order, so select latitude first.
latlngs = df.loc[:, ['latitude', 'longitude']].values
# parse_city returns a (city, state) tuple; keep only the city here.
df['city'] = [parse_city(latlng)[0] for latlng in latlngs]
df['state'] = [parse_state(latlng) for latlng in latlngs]
```
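One practical caveat, not part of the original code: Nominatim's public endpoint asks for at most about one request per second, which matters at this data volume. geopy ships a `RateLimiter` wrapper that spaces out calls; a minimal sketch, assuming the `geolocator` and `Point` defined above:

```python
from geopy.extra.rate_limiter import RateLimiter

# Space consecutive reverse-geocoding calls at least one second apart,
# per Nominatim's usage policy, retrying a couple of times on errors.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1, max_retries=2)
location = reverse(Point((39.842956, 117.633808)), timeout=10)
print(location.raw['address'].get('state'))
```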
## 3. Cluster Analysis

Run DBSCAN with the haversine metric, which operates on coordinates in radians; an eps of 10 km is therefore expressed as 10 / 6371.0088 ≈ 0.00157 radians, where 6371.0088 is the Earth's mean radius in kilometers.
```python
coords = dftest[['longitude', 'latitude']].to_numpy()  # as_matrix() was removed from pandas
kms_per_radian = 6371.0088  # mean Earth radius in km
epsilon = 10 / kms_per_radian  # 10 km neighborhood radius, in radians
db = DBSCAN(eps=epsilon, min_samples=80, algorithm='ball_tree',
            metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))  # includes the noise label -1
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))
```

Output: `Number of clusters: 110`
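As a quick sanity check (an addition, not in the original notebook), the size of each cluster can be tallied directly from `cluster_labels`; label `-1` is DBSCAN's noise bucket:

```python
from collections import Counter

# Tally points per DBSCAN label; -1 collects the points that were
# not assigned to any cluster.
label_counts = Counter(cluster_labels)
print('noise points:', label_counts.get(-1, 0))
print('largest clusters:', label_counts.most_common(5))
```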
### 3.1 Cluster Center Points
```python
def get_centermost_point(cluster):
    # Return the cluster member closest to the cluster's centroid.
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    # great_circle expects (latitude, longitude); coords are stored as
    # (longitude, latitude), so swap both arguments.
    centermost_point = min(cluster,
                           key=lambda point: great_circle((point[1], point[0]),
                                                          (centroid[1], centroid[0])).m)
    return tuple(centermost_point)


# The trailing series entry is empty because range(num_clusters) overshoots
# by one when the label set includes the noise label -1, so drop it.
centermost_points = clusters[:-1].map(get_centermost_point)
```
### 3.2 Raw Records at Cluster Centers
```python
lons, lats = zip(*centermost_points)
rep_points = pd.DataFrame({'lon': lons, 'lat': lats})
# Look up the original row for each representative point; exact float
# equality is safe here because these coordinates came verbatim from dftest.
rs = rep_points.apply(
    lambda row: dftest[(dftest['latitude'] == row['lat']) &
                       (dftest['longitude'] == row['lon'])].iloc[0],
    axis=1)
```
## 4. Data Visualization

### 4.1 Two-Dimensional Scatter Plot
```python
fig, ax = plt.subplots(figsize=[10, 6])
rs_scatter = ax.scatter(rs['longitude'], rs['latitude'], c='#99cc99',
                        edgecolor='None', alpha=0.7, s=120)
df_scatter = ax.scatter(df['longitude'], df['latitude'], c='k', alpha=0.9, s=3)
ax.set_title('Full data set vs DBSCAN reduced set')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend([df_scatter, rs_scatter], ['Full set', 'Reduced set'], loc='upper right')
```
### 4.2 Bounding the Coordinate Range
```python
margin_width = 0
lon_range = [rs['longitude'].min() - margin_width, rs['longitude'].max() + margin_width]
lat_range = [rs['latitude'].min() - margin_width, rs['latitude'].max() + margin_width]
# Axis-aligned bounding rectangle of the cluster centers.
spatial_extent = Polygon([(lon_range[0], lat_range[0]),
                          (lon_range[0], lat_range[1]),
                          (lon_range[1], lat_range[1]),
                          (lon_range[1], lat_range[0])])
```
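As a minor aside, shapely's `box` helper builds the same axis-aligned rectangle from the two corner coordinates:

```python
from shapely.geometry import box

# box(minx, miny, maxx, maxy) yields the same rectangle as the
# four-corner Polygon above.
spatial_extent = box(lon_range[0], lat_range[0], lon_range[1], lat_range[1])
```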
### 4.3 Map of China's Provincial Administrative Regions
```python
# In Python 3 the path literal is already str, so no .decode('utf-8') is needed.
file_path = r'E:\workpalce\Python\regular_python\IP_location_wordpress-master\date\Lambert\省级行政区.shp'
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
china_map = gpd.read_file(file_path)
china_map = china_map.to_crs(world.crs)  # reproject to the world map's CRS
china_map.plot(color='white', figsize=(20, 20))
```
### 4.4 Clusters in Geographic Coordinates
```python
import matplotlib.pylab as pylab

pylab.rcParams['figure.figsize'] = 20, 20
# Keep only the provinces that intersect the clusters' bounding rectangle.
china_map = china_map[china_map['geometry'].intersects(spatial_extent)]
fig = plt.figure()
# Scale the figure to the geographic aspect ratio (4 degrees per inch).
ydimension = int((lat_range[1] - lat_range[0]) / 4)
xdimension = int((lon_range[1] - lon_range[0]) / 4)
fig.set_size_inches(xdimension, ydimension)
china_map.plot(cmap='binary', alpha=0)  # 'colormap=' is the old spelling of 'cmap='
# Tint and size each cluster marker by its membership count,
# min-max scaled to [0, 1].
scal = minmax_scale([len(line) for line in clusters[:-1]])
rs_scatter = plt.scatter(x=rs['longitude'], y=rs['latitude'], c=scal + 10,
                         edgecolor='white', alpha=.9, s=scal * 6000)
```
## 5. Per-User TopN Locations

Running DBSCAN on each individual's location history yields the TopN places where that user is most frequently active. Here a 5 km activity radius is treated as one "co-occurrence region", and a location must be visited at least twice to qualify as such a region. Sample code follows; in practice, since the data volume is large, multithreading or a distributed setup can speed things up (see the parallel sketch at the end of this section).
```python
def getPersonlMost(dft):
    # Cluster one user's records and return the top 3 clusters by size,
    # each represented by its centermost point.
    coords = dft[['longitude', 'latitude']].to_numpy()
    kms_per_radian = 6371.0088
    epsilon = 5 / kms_per_radian  # 5 km "co-occurrence region" radius
    db = DBSCAN(eps=epsilon, min_samples=2, algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
    clusters = pd.Series([line for line in clusters if len(line) > 0])
    sorted_cluster = sorted([(get_centermost_point(line), len(line)) for line in clusters],
                            key=lambda x: x[1], reverse=True)[:3]
    return sorted_cluster
```
```python
path = "GPSA.csv"
df = pd.read_csv(path, index_col=0, usecols=[1, 2, 3, 4],
                 names=['custorm_id', 'longitude', 'latitude', 'createtime'])
df = df[(df.latitude != 0) & (df.longitude > 73.3)].drop_duplicates()
df_sort = df.groupby(by=df.index).count().sort_values(by="longitude", ascending=False)
dfIndex = df_sort[df_sort.longitude > 5].index  # users with more than 5 records
dftest = df.loc[dfIndex].dropna()
dftest.to_csv("deftest.csv")

TopN = []
dftest = pd.read_csv("deftest.csv", index_col=0).drop_duplicates().dropna()
cnt = 0
for line in dftest.index.unique():
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)  # progress indicator
    dfs = dftest.loc[line]
    cc = getPersonlMost(dfs)
    TopN.append(cc)
peronalMost = pd.DataFrame(TopN, index=dftest.index.unique(),
                           columns=["mostly", 'secondly', 'merely'])
peronalMost.to_csv("personal_most.csv")
```
## Summary

This article applied simple clustering to location data, revealing both the overall distribution of the population and the handful of regions each individual frequents. Geolocation data can in fact yield a great deal of personal information: the economic profile of the areas a person frequents gives a rough estimate of their economic status, while the size and dispersion of their activity range hints at behavioral and psychological traits. This article analyzed only the spatial dimension; for various reasons the temporal dimension was left unexplored, and that is a natural next step, since spatio-temporal analysis is one of the most active areas in data mining today. Whether for building personal credit systems or analyzing user behavior, and with the rise of the Internet of Things, the data streaming from its vast numbers of sensors is likewise indexed by time and space. Such data might be used to predict earthquakes, tsunamis, forest fires, or water pollution, or applied to smart homes, smart grids, intelligent transportation, and other corners of the smart city. Time passes, but it leaves wealth behind; space keeps shifting, yet we can still trace it to its source.