机器学习实战---读书笔记: 第2章 k-近邻算法

    xiaoxiao2021-03-25  8

    内容来源于书《机器学习实战》

    # -*- coding: utf-8 -*-
"""Reading notes for "Machine Learning in Action", chapter 2: k-nearest neighbors.

Key points:

1. kNN procedure
   1) Compute the distance from the query point to every training point.
   2) Sort the distances in ascending order.
   3) Take the k closest points; the class that occurs most often among
      them becomes the class of the query point.

2. Trade-offs
   Pros: simple, learns directly from the actual data.
   Cons: the whole data set has to be kept around, so it costs memory and
   time. With m training samples, n test samples and k features per sample
   the running time is O(n * m * k).
   Feature values must be normalized first:
       newValue = (oldValue - min) / (max - min)
   ``dataSet.min(0)`` returns the minimum of every column as a 1-D array.

3. ``numpy.tile(A, reps)`` repeats A. ``tile([0, 1], (2, 3))`` repeats
   ``[0, 1]`` twice along the rows and three times along the columns:
       [[0, 1, 0, 1, 0, 1],
        [0, 1, 0, 1, 0, 1]]

4. ``arr.shape[0]`` is the number of rows. A 2-D array literal is a list of
   lists, i.e. it needs the outer square brackets:
       group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
   ``zeros((rows, cols))`` takes the shape as a tuple and returns an array
   of zeros with ``rows`` rows and ``cols`` columns.

5. ``diffmat ** 2`` squares every element. ``m.sum(axis=1)`` adds up each
   row (``axis=0`` would add up each column). ``distances.argsort()``
   returns the indices that sort the values in ascending order.
   ``sorted(d.items(), key=itemgetter(1), reverse=True)`` returns the
   (key, value) pairs ordered by value, largest first.
   In ``returnMat[index, :] = listFromLine[0:3]`` the ``:`` is a full slice.
"""

import codecs
import operator
from operator import itemgetter
from os import listdir

# Explicit names instead of ``from numpy import *`` / ``from os import *``:
# the star imports shadowed builtins (``open`` became ``os.open``;
# ``sum``/``min``/``max`` became the numpy versions).
from numpy import array, asarray, shape, tile, zeros


def createDataSet():
    """Return a tiny demo training set.

    Returns:
        (group, labels): ``group`` is a 4x2 array of sample points and
        ``labels`` the list with the class name of each row.
    """
    # A 2-D array is built from a list of lists (note the outer brackets).
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbors.

    Args:
        inX: the query point; one value per feature column of ``dataSet``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: sequence holding the class label of each row of ``dataSet``.
        k: number of nearest neighbors that vote.

    Returns:
        The label with the most votes. On a tie, the label whose first
        voter is closest to ``inX`` wins.

    Raises:
        ValueError: if ``k`` is not between 1 and the number of samples.
    """
    dataSet = asarray(dataSet)
    rows = dataSet.shape[0]
    if not 1 <= k <= rows:
        raise ValueError("k must be between 1 and %d, got %r" % (rows, k))

    # Euclidean distance from inX to every row; broadcasting stretches inX
    # across the rows, which is what tile(inX, (rows, 1)) did explicitly.
    diffSet = dataSet - asarray(inX)
    distanceSet = (diffSet ** 2).sum(axis=1) ** 0.5

    # Indices of the samples ordered from nearest to farthest.
    indexs = distanceSet.argsort()

    # Count the labels of the k nearest samples. The dict remembers the
    # order in which labels were first seen (nearest first).
    labelToCount = {}
    for index in indexs[:k]:
        label = labels[index]
        labelToCount[label] = labelToCount.get(label, 0) + 1

    # max() returns the first maximal entry, the same winner a stable
    # descending sort by count would put first.
    return max(labelToCount.items(), key=itemgetter(1))[0]


def testClassify0():
    """Classify the point [0, 0] against the demo data set and print the label."""
    group, labels = createDataSet()
    label = classify0([0, 0], group, labels, 3)
    print(label)
#归一化特征值,主要是获取每一列最小值,每一列最大值, newValue = (oldValue - min) / (max - min) def autoNorm(dataSet): minVals = dataSet.min(0) maxVals = dataSet.max(0) #初始化一个0矩阵 normalArray = zeros(shape(dataSet)) rangeVals = maxVals - minVals rows = dataSet.shape[0] #记录归一化结果值 normalSet = dataSet - tile(minVals , (rows , 1)) normalSet = normalSet / tile(rangeVals , (rows , 1)) #print(maxVals) return normalSet , rangeVals ,minVals ''' 将文本记录转换为numpy,返回特征数组和标签数组 cols是特征的个数(不含分类结果这一属性) ''' def file2matrix(filename , cols): fr = codecs.open(filename , "r" , "utf-8") lines = fr.readlines() rows = len(lines) #初始化全为0的特征矩阵,注意矩阵创建提供的是元组 featureMatrix = zeros((rows , cols)) #读取每一行,初始化特征矩阵 labels = [] index = 0 for line in lines: line = line.strip() #注意,去除空格等 words = line.split('\t') # : 表示完全切片 featureMatrix[index , : ] = words[0 : cols] #将类别转化为整型数 labels.append(int(words[-1])) index += 1 return featureMatrix , labels def dataingClassTest(): testRatio = 0.10 fileName = "datingTestSet2.txt" featureMatrix , labels = file2matrix(fileName , 3) normalDataset , ranges , minVals = autoNorm(featureMatrix) ''' print(normalDataset) print(ranges) print(minVals) ''' rows = normalDataset.shape[0] errCount = 0.0 testNums = int(testRatio * rows) #对每行数据进行分类 for i in range(testNums): # classify0(inX , dataSet , labels , k) result = classify0(normalDataset[i , :] , normalDataset[ testNums : rows , : ] , labels[ testNums : rows] , 3 ) #print("current classify result is: %d, real result is : %d" % (result , labels[i])) if result != labels[i]: errCount += 1 print("error rate is : %f" % (errCount / float(rows))) def classifyPerson(): resultList = ['not at all like' , 'like' , 'very like'] times = float(input("time consumed on games: ")) miles = float(input("miles per year: ")) icecream = float(input("icecream consumed per year: ")) datatingDataMat , datingLabels = file2matrix('datingTestSet2.txt' ,3) normMat , ranges , minVals = autoNorm(datatingDataMat) inArr = array([times ,miles , icecream ]) result = classify0( (inArr 
- minVals) / ranges , normMat , datingLabels , 3) print("you will %s the person" % (resultList[result - 1])) def testMin(): dataSet = array( [ [0.8,400,0.5] , [12,134000,0.9] ,[0,20000,1.1], [67,32000 , 0.1] ]) minVals= dataSet.min(0) maxVals = dataSet.max(0) print(minVals) print(maxVals) #将图像格式处理为一个向量,把32*3二进制图像矩阵转换为1*1024的向量 def img2vector(filename): resultVector = zeros((1,1024)) fr = codecs.open(filename , "r" , "utf-8") for i in range(32): line = fr.readline() for j in range(32): resultVector[0 , 32 * i + j] = int(line[j]) return resultVector #进行手写数字识别系统测试,主要读取每一个文件,转换为1*1024的矩阵,最终得到m * 1024,m为训练集的个数;然后对测试集进行处理,分析结果 def handwritingClassTest(): labels = [] trainingFiles = listdir("trainingDigits") rows = len(trainingFiles) trainDatas = zeros((rows , 1024)) #读取并生成训练集 for i in range(rows): fileName = trainingFiles[i] name = fileName.split(".")[0] label = name.split("_")[0] labels.append(label) realName = "trainingDigits/%s" % (fileName) trainDatas[i, :] = img2vector(realName) #对测试集进行训练 errCount = 0 testFiles = listdir("testDigits") testNum = len(testFiles) for i in range(testNum): fileName = testFiles[i] name = fileName.split(".")[0] realLabel = name.split("_")[0] realName = "testDigits/%s" % (fileName) testVector = img2vector(realName) label = classify0(testVector , trainDatas , labels , 3) #print("real label: %s , classify label: %s" % (realLabel , label)) if realLabel != label: errCount += 1 print("total error num is: %d ,error rate is: %f" % (errCount , errCount / testNum)) def testImg2Vector(): testVector = img2vector("testDigits/0_13.txt") print(testVector[0 , 0:31]) print(testVector[0 , 32:63]) if __name__ == "__main__": testClassify0() #testMin() dataingClassTest() #classifyPerson() #testImg2Vector() handwritingClassTest()

    转载请注明原文地址: https://ju.6miu.com/read-114210.html

    最新回复(0)