代码注释:机器学习实战第10章 利用K-均值聚类算法对未标注数据分组

    xiaoxiao2021-03-25  131

    写在开头的话:在学习《机器学习实战》的过程中发现书中很多代码并没有注释,这对新入门的同学是一个挑战,特此贴出我对代码做出的注释,仅供参考,欢迎指正。

    1、K-均值聚类算法

    # Section 1: k-means clustering (Machine Learning in Action, chapter 10).
from numpy import *
import numpy as np


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats.

    Args:
        fileName: Path of the file to read; every non-blank line holds
            tab-separated numeric fields.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the handle even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            # Build a real list: on Python 3 map() would only give a lazy
            # iterator, which cannot be turned into a numeric matrix later.
            dataMat.append([float(field) for field in line.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors of equal shape."""
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(np.power(diff, 2)))


def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D data (matrix, array or nested list), one sample per row.
        k: Number of centroids to generate.

    Returns:
        A (k, n) matrix whose column j is drawn uniformly between the minimum
        and maximum of column j of dataSet.
    """
    dataSet = np.asmatrix(dataSet)
    n = dataSet.shape[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        low = float(column.min())
        span = float(column.max()) - low
        centroids[:, j] = low + span * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with k-means.

    Args:
        dataSet: 2-D data (matrix, array or nested list), one sample per row.
        k: Number of clusters.
        distMeas: Callable taking two 1 x n matrix rows and returning their
            distance.
        createCent: Callable (dataSet, k) returning the k initial centroids.

    Returns:
        A tuple (centroids, clusterAssment). centroids is a (k, n) matrix.
        clusterAssment is an (m, 2) matrix: column 0 holds the index of the
        cluster each sample belongs to, column 1 the squared distance to the
        centroid it was compared against.
    """
    dataSet = np.asmatrix(dataSet)
    m = dataSet.shape[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible cluster index so that the first pass always
    # counts as a change; starting at 0 would end the loop after one pass
    # whenever every sample initially lands in cluster 0.
    clusterAssment[:, 0] = -1
    centroids = np.asmatrix(createCent(dataSet, k), dtype=float)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: move every centroid to the mean of its members.
        for cent in range(k):
            members = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if members.shape[0] > 0:
                # axis=0 averages down the rows, i.e. per feature column.
                centroids[cent, :] = np.mean(members, axis=0)
            # An empty cluster keeps its previous centroid; averaging an
            # empty selection would turn it into NaN.
    return centroids, clusterAssment

    2、二分K-均值聚类算法

    # Section 2: bisecting k-means clustering.
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster the rows of dataSet with bisecting k-means.

    All samples start in one cluster. While fewer than k clusters exist,
    every cluster is tentatively split in two with kMeans and the split
    giving the lowest total squared error is kept.

    Args:
        dataSet: 2-D data (matrix, array or nested list), one sample per row.
        k: Desired number of clusters.
        distMeas: Callable taking two 1 x n matrix rows and returning their
            distance.

    Returns:
        A tuple (centroids, clusterAssment). centroids is a matrix with one
        row per cluster; it has fewer than k rows when no remaining cluster
        holds at least two samples. clusterAssment is an (m, 2) matrix:
        column 0 is the cluster index of each sample, column 1 its squared
        distance to that cluster's centroid.
    """
    dataSet = asmatrix(dataSet)
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # The initial single cluster is centred on the mean of the whole data set.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = inf
        bestCentToSplit = None
        for i in range(len(centList)):
            memberRows = nonzero(clusterAssment[:, 0].A == i)[0]
            if len(memberRows) < 2:
                # A cluster with fewer than two samples cannot be bisected.
                continue
            ptsInCurrCluster = dataSet[memberRows, :]
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # Error of cluster i after the split ...
            sseSplit = splitClustAss[:, 1].sum()
            # ... plus the untouched error of every other cluster.
            otherRows = nonzero(clusterAssment[:, 0].A != i)[0]
            sseNotSplit = clusterAssment[otherRows, 1].sum()
            print("sseSplit, and notSplit:  %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # Nothing left to split; stop instead of looping forever.
            break
        # Relabel the two halves. Sub-cluster 1 must be renamed first: if
        # sub-cluster 0 were renamed to bestCentToSplit == 1 beforehand, the
        # two halves would be merged by the second relabelling.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the bestCentToSplit is:  %s" % (bestCentToSplit,))
        print("the len of bestClustAss is:  %s" % (len(bestClustAss),))
        # Store centroids as plain lists so the final matrix is 2-D.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite the assignments of the cluster that was split.
        splitRows = nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0]
        clusterAssment[splitRows, :] = bestClustAss
    return asmatrix(centList), clusterAssment

    3、对地图上的点进行聚类

    # Section 3: clustering points on a map.
import urllib
import json
from time import sleep

import matplotlib
import matplotlib.pyplot as plt

try:  # Python 3 locations
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2 fallback
    from urllib import urlencode
    from urllib import urlopen


def geoGrab(stAddress, city):
    """Geocode an address through the Yahoo geocode web service.

    NOTE(review): the original author reports that this server no longer
    exists, so the request is expected to fail.
    NOTE(review): the application id is hard-coded in source.

    Args:
        stAddress: Street address.
        city: City name.

    Returns:
        The decoded JSON response.
    """
    apiStem = 'http://where.yahooapis.com/geocode?'
    params = {}
    params['flags'] = 'J'  # ask for a JSON response
    params['appid'] = 'ppp68N8t'
    params['location'] = '%s %s' % (stAddress, city)
    # Encode the dictionary as a URL query string.
    url_params = urlencode(params)
    yahooApi = apiStem + url_params
    print(yahooApi)
    c = urlopen(yahooApi)
    try:
        return json.loads(c.read().decode('utf-8'))
    finally:
        c.close()


def massPlaceFind(fileName):
    """Geocode every place listed in fileName and write them to places.txt.

    Each input line is tab-separated; fields 1 and 2 are passed to geoGrab
    as address and city. Successful lookups are written as the original
    line followed by latitude and longitude. One second is slept between
    requests.

    Args:
        fileName: Path of the tab-separated input file.
    """
    with open(fileName) as source, open('places.txt', 'w') as fw:
        for line in source:
            line = line.strip()
            lineArr = line.split('\t')
            retDict = geoGrab(lineArr[1], lineArr[2])
            if retDict['ResultSet']['Error'] == 0:
                lat = float(retDict['ResultSet']['Result'][0]['latitude'])
                lng = float(retDict['ResultSet']['Result'][0]['longitude'])
                print("%s\t%f\t%f" % (lineArr[0], lat, lng))
                fw.write('%s\t%f\t%f\n' % (line, lat, lng))
            else:
                print("error fetching")
            sleep(1)


def distSLC(vecA, vecB):
    """Return the great-circle distance between two points.

    Uses the spherical law of cosines with a sphere radius of 6371.0
    (Earth's mean radius in kilometres).

    Args:
        vecA: 1 x 2 matrix holding (longitude, latitude) in degrees.
        vecB: 1 x 2 matrix holding (longitude, latitude) in degrees.

    Returns:
        The distance along the sphere's surface.
    """
    a = sin(vecA[0, 1] * pi / 180) * sin(vecB[0, 1] * pi / 180)
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    cosAngle = a + b
    # Rounding can push the cosine marginally outside [-1, 1] (typically for
    # identical points), which would make arccos return NaN.
    if cosAngle > 1.0:
        cosAngle = 1.0
    elif cosAngle < -1.0:
        cosAngle = -1.0
    return arccos(cosAngle) * 6371.0


def clusterClubs(numClust=5):
    """Cluster the places in places.txt and plot them over Portland.png.

    Args:
        numClust: Number of clusters requested from biKmeans.
    """
    # Each entry is [longitude, latitude], read from columns 4 and 3.
    datList = []
    with open('places.txt') as places:
        for line in places:
            lineArr = line.split('\t')
            datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = asmatrix(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    # Bottom axes: the map image, drawn without ticks.
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    # Top axes: frameless overlay carrying the data points.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        # Cycle through the marker list when there are more clusters than markers.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    # Cluster centres are drawn as large plus signs.
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0], marker='+', s=300)
    plt.show()

    转载请注明原文地址: https://ju.6miu.com/read-3259.html

    最新回复(0)