根据单词是否出现,把词向量中对应位置的值累加上该文档的标签:
def set_of_words2_vec(vocabList, label, inputSet):
    """Build a label-weighted occurrence vector for one document.

    For every word of ``inputSet`` that appears in ``vocabList``, ``label`` is
    added at the position(s) of that word in the returned vector.

    :param vocabList: sequence of vocabulary words (must be hashable).
    :param label: value added for each occurrence of a known word.
    :param inputSet: iterable of the words of one document.
    :return: list with ``len(vocabList)`` entries.
    """
    # Map each word to every index it occupies so that one dictionary lookup
    # per input word replaces a full scan of the vocabulary; duplicate
    # vocabulary entries are all still updated.
    positions = {}
    for index, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, []).append(index)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        for index in positions.get(word, ()):
            returnVec[index] += label
    return returnVec


# Learn from the data: estimate the per-label word probability distribution.
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-label word log-likelihoods and label priors.

    :param trainMatrix: list of vectors produced by ``set_of_words2_vec``,
        each one weighted by the label of its own document.
    :param trainCategory: label of every document, parallel to
        ``trainMatrix``.  Labels must be non-zero because each vector is
        divided by its label to recover plain word counts.
    :return: ``(logLikelihoods, priors)``; both are dicts keyed by
        ``str(label)``.  ``logLikelihoods[key]`` is an array holding the log
        of the smoothed word probabilities, ``priors[key]`` is the fraction
        of training documents that carry the label.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    labelSet = list(set(trainCategory))

    pAbusive = {}
    for label in labelSet:
        matching = [category for category in trainCategory if category == label]
        pAbusive[str(label)] = len(matching) / float(numTrainDocs)

    # Smoothing: every word count starts at 1 and every denominator at 2, so
    # a word never seen with a label does not get probability zero.
    pNumber = {}
    pDenom = {}
    for label in labelSet:
        pNumber[str(label)] = ones(numWords)
        pDenom[str(label)] = 2.0

    for i in range(numTrainDocs):
        key = str(trainCategory[i])
        # Undo the label weighting applied by set_of_words2_vec.  Dividing by
        # a float keeps the result identical on Python 2 and Python 3.
        divisor = float(trainCategory[i])
        pNumber[key] += [value / divisor for value in trainMatrix[i]]
        pDenom[key] += sum(trainMatrix[i]) / divisor

    ret = {}
    for label in labelSet:
        key = str(label)
        # classifyNB adds these values to log(prior), so they have to be log
        # probabilities; returning the raw probabilities mixed the two scales.
        ret[key] = log(pNumber[key] / pDenom[key])
    return ret, pAbusive


# Score the test words against every label and pick the most probable label;
# that label is the rating of the word group.
def classifyNB(vec2Classify, pVec, pClass, trainCategory):
    """Return the most probable label for one document vector.

    :param vec2Classify: array of word counts (built with label ``1``).
    :param pVec: per-label word log-likelihood arrays from ``trainNB0``.
    :param pClass: per-label priors from ``trainNB0``.
    :param trainCategory: training labels; their distinct values are the
        candidate labels.
    :return: the winning label converted to ``float``.
    """
    labelSet = list(set(trainCategory))
    p = {}
    for label in labelSet:
        key = str(label)
        p[key] = sum(vec2Classify * pVec[key]) + log(pClass[key])
    # The first label with the highest score wins.
    best = max(p.items(), key=lambda item: item[1])
    return float(best[0])


# The following runs the classifier on a hand-written sample document.
def testingNB():
    """Train on the full data set and classify one fixed sample entry."""
    dataSet, labels = load_data_set()
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    testEntry = ['学习', '很棒', '真不错']
    testEntry = list(set(testEntry))
    thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
    result = classifyNB(thisDoc, pV, pAb, labels)
    # Single-argument print produces the same text on Python 2 and Python 3.
    print('%s classified as:  %s' % (testEntry, result))


def test(number):
    """Check the algorithm with a random hold-out split.

    :param number: fraction of the samples to hold out as the test set.
    :return: None; misclassifications and the error rate are printed.
    """
    dataSet, labels = load_data_set()
    test_number = int(len(dataSet) * number)
    testSet = []
    for i in range(test_number):
        randIndex = int(random.uniform(0, len(dataSet)))
        testSet.append([dataSet[randIndex], labels[randIndex]])
        # Remove the sample so that it is not used for training as well.
        del dataSet[randIndex]
        del labels[randIndex]

    # Training
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)

    # Testing
    errorCount = 0
    for row in testSet:
        testEntry = list(set(row[0]))
        thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
        ret = classifyNB(thisDoc, pV, pAb, labels)
        if ret != row[1]:
            print('classification error %s %s' % (row[1], ret))
            errorCount += 1

    # A very small fraction can produce an empty test set; report 0.0 then
    # instead of dividing by zero.
    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    print('the error rate is:  %s' % errorRate)


test(0.1)
# testingNB()

# Author's note: sadly the test results were far from ideal.  Is it that
# Chinese text cannot be tokenised this way, or is some detail wrong?  Advice
# from experts is welcome; take this as a learning exercise.
# (trainNB0 now returns log-likelihoods, which the log-space score in
# classifyNB requires.)
# Sample project download: http://download.csdn.net/detail/u010154424/9602826