#!/usr/bin/python2
#-*-coding:UTF-8-*-
from numpy import *

# Throughout this PCA implementation every data record is assumed to carry
# no label column; labels must be stored separately before calling in.


def centralize(dataSet):
    """Subtract the per-column mean from every row of ``dataSet``.

    Args:
        dataSet: 2-D array-like with one record per row.

    Returns:
        A tuple ``(centered, meanRow)``: the centred copy of the data and
        the column means that were subtracted. The input is not modified.
    """
    meanRow = mean(dataSet, axis=0)
    # The subtraction result must be kept: the previous code evaluated
    # ``dataSet-meanRow`` and threw it away, returning uncentred data.
    centered = asanyarray(dataSet) - meanRow
    return centered, meanRow


def getCov(dataSet):
    """Return the covariance matrix of ``dataSet``.

    ``rowvar=0`` tells ``cov`` that each column is a variable and each row
    is an observation.
    """
    return cov(dataSet, rowvar=0)


def getEigValsVects(covMat):
    """Return ``(eigVals, eigVects)`` of ``covMat`` via ``linalg.eig``.

    ``pca`` passes a matrix object here; column ``i`` of ``eigVects`` is
    the eigenvector belonging to ``eigVals[i]``.
    """
    eigVals, eigVects = linalg.eig(covMat)
    return eigVals, eigVects


def selectEigValsVects(eigVals, eigVects, num):
    """Pick the ``num`` largest eigenvalues and their eigenvectors.

    Args:
        eigVals: 1-D array of eigenvalues.
        eigVects: 2-D array/matrix whose columns match ``eigVals``.
        num: how many leading components to keep.

    Returns:
        ``(selectVals, selectVects)`` ordered from the largest eigenvalue
        downwards.
    """
    # argsort is ascending, so reverse it before taking the first ``num``.
    selectIndices = argsort(eigVals)[::-1][:num]
    selectVals = eigVals[selectIndices]
    selectVects = eigVects[:, selectIndices]
    return selectVals, selectVects


def percent2Num(eigVals, percent):
    """Return how many leading eigenvalues reach ``percent`` of their sum.

    Args:
        eigVals: eigenvalues in any order.
        percent: fraction of the total eigenvalue sum to retain (e.g. 0.95).

    Returns:
        The smallest count of largest-first eigenvalues whose running sum
        is at least ``percent`` times the total. If that threshold is never
        reached (e.g. because of rounding when ``percent`` is 1.0), the
        number of eigenvalues is returned instead of ``None``.
    """
    sortedEigVals = sort(eigVals)[::-1]
    threshold = percent * sum(sortedEigVals)
    runningSum = 0.0
    for count, val in enumerate(sortedEigVals, 1):
        runningSum += val
        if runningSum >= threshold:
            return count
    # Threshold not reached: keep every component.
    return len(sortedEigVals)


def pca(dataSet, percent=0.95):
    """Project ``dataSet`` onto its leading principal components.

    Args:
        dataSet: 2-D array-like, one record per row, without labels.
        percent: fraction of the total eigenvalue sum the kept components
            must reach. Defaults to 0.95, the value that used to be
            hard-coded regardless of the argument.

    Returns:
        A matrix with one row per record and one column per kept
        component, computed from the mean-centred data.
    """
    centered, meanRow = centralize(dataSet)
    covSet = getCov(centered)
    eigVals, eigVects = getEigValsVects(asmatrix(covSet))
    valNum = percent2Num(eigVals, percent)
    selectVals, selectVects = selectEigValsVects(eigVals, eigVects, valNum)
    # asmatrix makes ``*`` a matrix product and keeps the matrix return type.
    newDataSet = asmatrix(centered) * selectVects
    return newDataSet