kNN概述
cd C:\Users\exuejwa\Documents\mlia\machinelearninginaction\Ch02
C:\Users\exuejwa\Documents\mlia\machinelearninginaction\Ch02
from numpy
import *
import operator
from os
import listdir
def createDataSet():
    """Build the four-point toy data set used to demonstrate kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array holding one
        sample point per row and ``labels`` is a list with the class label
        ('A' or 'B') of the corresponding row.
    """
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# Build the toy sample points and their labels for the classify0 call below.
group, labels = createDataSet()
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (a sequence or array that can be
            tiled against the rows of ``dataSet``).
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]

    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Row indices ordered from the nearest to the farthest training sample.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and Python 3 (iteritems() is 2-only).
    # sorted() is stable, so among tied labels the one that comes first in
    # the dict's iteration order wins.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
运行结果
# Classify the origin against the toy data set using its 3 nearest neighbours.
classify0([0, 0], group, labels, 3)
'B'
示例:使用kNN改进约会网站的配对效果
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric fields followed by
    an integer class label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three fields of every
        line and ``classLabelVector`` is the list of ``n`` integer labels.

    Raises:
        ValueError: If a field cannot be converted to a number or a line has
            fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        # Convert explicitly rather than relying on implicit str->float casts.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Load the dating data set, then show its first ten feature rows.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
datingDataMat[:10]
array([[ 4.09200000e+04, 8.32697600e+00, 9.53952000e-01],
[ 1.44880000e+04, 7.15346900e+00, 1.67390400e+00],
[ 2.60520000e+04, 1.44187100e+00, 8.05124000e-01],
[ 7.51360000e+04, 1.31473940e+01, 4.28964000e-01],
[ 3.83440000e+04, 1.66978800e+00, 1.34296000e-01],
[ 7.29930000e+04, 1.01417400e+01, 1.03295500e+00],
[ 3.59480000e+04, 6.83079200e+00, 1.21319200e+00],
[ 4.26660000e+04, 1.32763690e+01, 5.43880000e-01],
[ 6.74970000e+04, 8.63157700e+00, 7.49278000e-01],
[ 3.54830000e+04, 1.22731690e+01, 1.50805300e+00]])
# Show the labels that belong to the first ten rows.
datingLabels[:10]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3]
import matplotlib
import matplotlib.pyplot as plt

# Scatter columns 1 and 2 of the dating data against each other. The third
# and fourth positional arguments of scatter() are marker size and colour,
# so both are scaled by the class label to tell the classes apart.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           15.0 * array(datingLabels), 15.0 * array(datingLabels))
<matplotlib.collections.PathCollection at 0x799a908>
# Display the figure built above.
plt.show()
归一化数值
newValue = (oldValue - min) / (max - min),其中 min 和 max 分别是该特征(列)在数据集中的最小值和最大值,归一化后的取值落在 0 到 1 之间。
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range 0..1.

    Each value is mapped with ``(value - column_min) / (column_max -
    column_min)``.

    Args:
        dataSet: 2-D array-like of numeric samples, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is the
        rescaled float array with the same shape as ``dataSet``, ``ranges`` is
        the per-column ``max - min`` and ``minVals`` the per-column minimum.
        A column whose values are all equal has a range of 0 and is mapped to
        0.0 instead of NaN.
    """
    # Work in floating point so integer input is not floor-divided on Python 2.
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 to avoid 0/0; their numerator is already 0.
    safeRanges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Normalize the dating features, then echo the normalized matrix.
normMat, ranges, minVals = autoNorm(datingDataMat)
normMat
array([[ 0.44832535, 0.39805139, 0.56233353],
[ 0.15873259, 0.34195467, 0.98724416],
[ 0.28542943, 0.06892523, 0.47449629],
...,
[ 0.29115949, 0.50910294, 0.51079493],
[ 0.52711097, 0.43665451, 0.4290048 ],
[ 0.47940793, 0.3768091 , 0.78571804]])
ranges
array([ 9.12730000e+04, 2.09193490e+01, 1.69436100e+00])
minVals
array([ 0. , 0. , 0.001156])
def datingClassTest():
    """Estimate the kNN error rate on the dating data with a hold-out split.

    Loads ``datingTestSet2.txt``, normalizes it, uses the first 10% of the
    rows as test samples and the remaining rows as training samples, then
    classifies every test sample with ``k = 3``. The prediction for each test
    sample and the overall error rate are printed; nothing is returned.
    """
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the training set; row i is the test sample.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With fewer than ten rows there are no test samples; report 0 instead of
    # raising ZeroDivisionError.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
运行结果
# Run the hold-out evaluation; its printed output follows.
datingClassTest()
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
def classifyPerson():
    """Interactively classify one person using the dating data set.

    Prompts for three numeric features on standard input, normalizes them
    with the minimum and range of ``datingTestSet2.txt`` and classifies the
    result with ``k = 3``. The verdict is printed; nothing is returned.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine(
        "frequent flier miles earned per year?"))
    iceCream = float(readLine(
        "liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Order here differs from the prompt order: miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges,
                                 normMat, datingLabels, 3)

    # Assumes the labels in the data file are 1, 2 or 3 so that label - 1
    # indexes resultList -- TODO confirm against the data file.
    verdict = resultList[classifierResult - 1]
    # The extra " " reproduces the separator the original two-argument print
    # statement put between its arguments.
    print("You will probably like this person: " + " " + verdict)
运行结果
# Start the interactive classifier; the prompts and answers follow.
classifyPerson()
percentage of time spent playing video games?10
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.5
You will probably like this person: in small doses
示例:手写识别系统
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines of the file are
    converted with ``int`` and stored row after row.

    Args:
        filename: Path of the text image to read.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * i + j`` holds character
        ``j`` of line ``i``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
# Load one test digit and show the first 31 values of its first image row.
testVector = img2vector('testDigits/0_13.txt')
testVector[0, :31]
array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0.])
# Show the first 31 values of the second image row (offsets 32..62).
testVector[0][32:63]
array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0.])
def _digitFromFileName(fileNameStr):
    """Return the integer before the first '_' of a name such as '0_13.txt'."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``trainingDigits`` becomes one training row; its label is
    the integer before the first underscore of the file name. Every file in
    ``testDigits`` is then classified with ``k = 3``. Each prediction, the
    number of errors and the error rate are printed; nothing is returned.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    # An empty test directory would otherwise raise ZeroDivisionError.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)
运行结果
# Run the digit-recognition evaluation; its (abridged) output follows.
handwritingClassTest()
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 0, the real answer is: 0
......
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the total number of errors is: 11
the total error rate is: 0.011628