python-knn(简版理解)

    xiaoxiao2021-11-03  59

    代码来自《机器学习实战》(Machine Learning in Action)一书。

    #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Minimal k-nearest-neighbours (kNN) classifier and a hold-out test driver.

The listing is meant to be saved as a single module (e.g. ``knn.py``).
Importing it only defines the functions; running it as a script executes
``datingClassTest()`` on ``Data/datingTestSet2.txt``.

The code runs unchanged on Python 2.7 and Python 3.
"""
import operator

import numpy as np


def creatDataSet():
    """Return a tiny hard-coded training set.

    Returns:
        group: ``(4, 2)`` float array of sample points.
        labels: list of four class labels (``'A'`` / ``'B'``), one per row.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# Correctly spelled alias; the original (misspelled) name is kept for callers.
createDataSet = creatDataSet


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distance is Euclidean.

    Args:
        inX: the sample to classify, a single row with one value per feature.
        dataSet: training samples, one row per sample (NumPy array).
        labels: class label of each row of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    # Broadcasting subtracts inX from every training row at once.
    diffMat = np.asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)    # row-wise sum of squares
    distances = sqDistances ** 0.5              # Euclidean distance
    sortedDistIndices = distances.argsort()     # indices, nearest first

    classCount = {}                             # label -> number of votes
    for i in range(k):
        voteLabel = labels[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # dict.items() exists on both Python 2 and 3 (iteritems() is 2-only).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Read a tab-separated text file into a feature matrix and label list.

    Each non-blank line must contain at least three numeric feature columns;
    the last column is an integer class label.

    Args:
        filename: path of the file to read.

    Returns:
        returnMat: ``(n, 3)`` float array holding the first three columns.
        classLabelVector: list of ``n`` integer labels (last column).
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline) carry no sample; skip them.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))   # [-1] is the last column
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value becomes ``(x - min) / (max - min)`` of its column. A constant
    column (``max == min``) is mapped to all zeros instead of NaN.

    Args:
        dataSet: 2-D NumPy array, one row per sample.

    Returns:
        normDataSet: the normalised data, same shape as ``dataSet``.
        ranges: per-column ``max - min`` (the true ranges, zeros included).
        minVals: per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1.0 where the range is zero to avoid 0/0; the float literal
    # also forces true division for integer input on Python 2.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


def datingClassTest(filename='Data/datingTestSet2.txt', hoRatio=0.1, k=3):
    """Run a hold-out evaluation of ``classify0`` on a data file.

    The first ``hoRatio`` fraction of the rows is used as the test set and
    the remaining rows as the training set. Every prediction and the final
    error rate are printed.

    Args:
        filename: data file understood by ``file2matrix``.
        hoRatio: fraction of the rows held out for testing.
        k: number of neighbours passed to ``classify0``.

    Returns:
        A tuple ``(errorRate, classifierResultAll)``: the fraction of
        misclassified test rows and the list of predicted labels.

    Raises:
        ValueError: if the hold-out set would be empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)              # size of the test set
    if numTestVecs < 1:
        raise ValueError("hold-out set is empty: %d rows with hoRatio=%r"
                         % (m, hoRatio))

    errorCount = 0.0
    classifierResultAll = []
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        classifierResultAll.append(classifierResult)
        print("the classifier came back with :%d, the real answer is : %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1

    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is : %f" % errorRate)
    return errorRate, classifierResultAll


if __name__ == '__main__':
    datingClassTest()
    转载请注明原文地址: https://ju.6miu.com/read-677977.html

    最新回复(0)