对读者借书的书目进行关联规则处理,通过关联规则来查找读者借书之间的关系. 首先获取读者证号、索书号列表,由于部分数据的索书号为空,或者出现异常值,所以需要对索书号进行数据清洗:
1、数据清洗
""" 数据要求:read_num, book_id """ pf = pd.read_csv('new_data.csv', encoding='gbk') # print pf.head() """ 处理缺失数据、异常数据 """ data = pf[['read_num', 'book_id']].copy() print data print len(data) # 182508 print '-----------------------------------------' # 删除空值 data = data.dropna() # dropna()函数返回一个包含非空数据和索引值的Series print data print len(data) # 182427 print '-----------------------------------------' # 重复判断 data_is_duplicate = data.duplicated() # print data_is_duplicate print '-----------------------------------------' # 去除重复 data = data.drop_duplicates() print data print len(data) print '-----------------------------------------' # 异常值处理,去除book_id中的异常值,由于book_id的值全部为大写字母 data = data[(data['book_id'] >= 'A') & (data['book_id'] <= 'Z')] print data print len(data)2、标签算法 数据集中的索书号全部都是英文字母,在进行数据分析的过程中必须全部转化为数字,所以这里使用标签算法将所有字母全部转换为数字,便于数据分析的处理。
""" 算法:获取标签 """ def add_label(s): l = [] m = [] for i in range(len(s)): if i == 0: m = [] l = [1] else: m.append(s[i - 1]) if s[i] in m: if m.index(s[i]) == 0: l.append(1) else: l.append(l[m.index(s[i])]) else: l.append(max(l) + 1) return l """ 注意:索书号的首位都是大写字母,出现非大写字母时,就将那一条数据删除,减少误差 """ read_num = data['read_num'].tolist() book_id = data['book_id'].tolist() book_id = add_label(book_id) # print book_id[:10] # print read_num[:10] # 转换成二维数组 new_aprior = [] new_aprior.append(read_num) new_aprior.append(book_id) m = np.array(new_aprior).T # print 'm:', m # print m[1][0] # print list(m[1])3、多值处理 数据处理的过程中,机器学习算法的应用必须满足一定的条件,对于算法的输入数据必须满足格式要求,对于关联规则而言,就要将每个顾客的购物放到同一个list中,才能对所有顾客的购物篮进行关联规则算法的应用。
""" 算法:多值处理 """ # 字典多值处理 res = {} # 多值字典 for item in m: k = item[0] if not res.has_key(k): # 给定的键在字典中,就返回true,否则返回false res[item[0]] = [] res[item[0]].append(item[1]) # print res print len(res) # print res # 将字典的值全部加到一个新的list中 new = [] for item in res: if res.has_key(item): new.append(res.get(item)) # 获取键值 # print new # 得到了每个同学借书的书目 (list) new_array = np.array(new) print new_array print '---------------------------------------------------------------'4、关联规则 关联规则算法的使用,在最小支持度和最小置信度的选取上一定要进行衡量,值选择太大了,就会导致没有结果产生,太小了,产生的结果就没有任何意义了;这里最小支持度为0.1,最小置信度为0.2
支持度:一个项集在全部事务中出现的概率,例如 A、B 两件商品被同时购买(A、B 两本书被同时借出)的概率;最小支持度是为支持度预先设定的阈值。 置信度:在购买 A 商品(借出 A 书)的前提下,再购买 B 商品(借出 B 书)的条件概率;最小置信度是为置信度预先设定的阈值。
""" 对学生的借书的书目进行关联规则处理,通过关联规则来查找学生借书之间的关系 """ # def loadDataSet(): # 加载数据集 # return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] def createC1(dataSet): # 构建所有候选项集的集合,数据中有哪些项 C1 = [] for transaction in dataSet: for item in transaction: if not [item] in C1: C1.append([item]) # C1是列表,对于每一项进行添加,[1,3,4,2,5] C1.sort() return map(frozenset, C1) def scanD(D, Ck, minSupport): # 由候选项集生成符合最小支持度的项集L。参数分别为数据集、候选项集列表,最小支持度 ssCnt = {} for tid in D: # 对于数据集里的每一条记录 for can in Ck: # 每个候选项集can if can.issubset(tid): # 若是候选集can是作为记录的子集,那么其值+1,对其计数 if not ssCnt.has_key(can): # ssCnt[can] = ssCnt.get(can,0)+1一句可破,没有的时候为0,加上1,有的时候用get取出,加1 ssCnt[can] = 1 else: ssCnt[can] += 1 numItems = float(len(D)) retList = [] supportData = {} for key in ssCnt: support = ssCnt[key] / numItems # 除以总的记录条数,即为其支持度 if support >= minSupport: retList.insert(0, key) # 超过最小支持度的项集,将其记录下来。 supportData[key] = support return retList, supportData def aprioriGen(Lk, k): # 创建符合置信度的项集Ck, retList = [] lenLk = len(Lk) for i in range(lenLk): for j in range(i + 1, lenLk): # k=3时,[:k-2]即取[0],对{0,1},{0,2},{1,2}这三个项集来说,L1=0,L2=0,将其合并得{0,1,2},当L1=0,L2=1不添加, L1 = list(Lk[i])[:k - 2] L2 = list(Lk[j])[:k - 2] L1.sort() L2.sort() if L1 == L2: retList.append(Lk[i] | Lk[j]) return retList def apriori(dataSet, minSupport=0.1): # 最小支持度 C1 = createC1(dataSet) D = map(set, dataSet) L1, supportData = scanD(D, C1, minSupport) L = [L1] # L将包含满足最小支持度,即经过筛选的所有频繁n项集,这里添加频繁1项集 k = 2 while (len(L[k - 2]) > 0): # k=2开始,由频繁1项集生成频繁2项集,直到下一个打的项集为空 Ck = aprioriGen(L[k - 2], k) Lk, supK = scanD(D, Ck, minSupport) supportData.update(supK) # supportData为字典,存放每个项集的支持度,并以更新的方式加入新的supK L.append(Lk) k += 1 return L, supportData # dataSet = loadDataSet() # 加载数据集 dataSet = new print dataSet C1 = createC1(dataSet) # 候选项集的获取,候选项集是数据集中项的集合 print "所有候选1项集C1:\n", C1 # [1,2,3,4,5] D = map(set, dataSet) print "数据集D:\n", D # 数据集格式:[set(1,3,4), set(2,3,5), set(1,2,3,5), set(2,5)] L1, supportData0 = scanD(D, C1, 0.1) # 符合最小支持度的频繁1项集L1 print "符合最小支持度的频繁1项集L1:\n", L1 L, 
suppData = apriori(dataSet) # 所有符合最小支持度的项集L print "所有符合最小支持度的项集L:\n", L print "频繁2项集:\n", aprioriGen(L[0], 2) L, suppData = apriori(dataSet, minSupport=0.1) print "所有符合最小支持度为0.1的项集L:\n", L L, suppData = apriori(dataSet, minSupport=0.2) print "所有符合最小支持度为0.2的项集L:\n", L print '-----------------------------------------------------------' def generateRules(L, supportData, minConf=0.1): # 最小置信度 bigRuleList = [] # 规则存放在bigRuleList列表中 for i in range(1, len(L)): for freqSet in L[i]: # L0是频繁1项集,没关联规则 H1 = [frozenset([item]) for item in freqSet] # H1存放频繁i项集的某个频繁项集的单个元素集合,频繁3项集的{0,1,2}的{{0},{1},{2} if i > 1: rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) # 从频繁3项集开始,从置信度算出关联规则 else: calcConf(freqSet, H1, supportData, bigRuleList, minConf) # 对频繁2项集,计算置信度 return bigRuleList def calcConf(freqSet, H, supportData, br1, minConf=0.1): # 计算置信度函数,最小置信度 prunedH = [] for conseq in H: conf = supportData[freqSet] / supportData[ freqSet - conseq] # conf({2}) = s({{0},{1},{2}})/s({{0},{1},{2}}-{2}) if conf >= minConf: print freqSet - conseq, "——>", conseq, "conf:", conf # 那么有{{0},{1}}——>{{2}} br1.append((freqSet - conseq, conseq, conf)) prunedH.append(conseq) return prunedH def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.1): m = len(H[0]) # m,频繁m项集 if (len(freqSet)) > (m + 1): Hmp1 = aprioriGen(H, m + 1) # 由H,创建m+1项集 Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf) # 保留符合置信度的m+1项集,Hmp1 = prunedH if (len(Hmp1) > 1): rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf) L, suppData = apriori(dataSet, minSupport=0.1) # 符合最小支持度的项集 rules = generateRules(L, suppData, minConf=0.2) # 最小置信度 print rules print '------------------------------------------------------------------------' rules = generateRules(L, suppData, minConf=0.5) print rulesOutput:
符合最小支持度的频繁1项集L1: [frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])] 所有符合最小支持度的项集L: [[frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])], [frozenset([10, 6]), frozenset([9, 6]), frozenset([6, 7])], []] 频繁2项集: [frozenset([12, 6]), frozenset([4, 12]), frozenset([12, 14]), frozenset([8, 12]), frozenset([11, 12]), frozenset([12, 13]), frozenset([10, 12]), frozenset([12, 5]), frozenset([12, 7]), frozenset([9, 12]), frozenset([4, 6]), frozenset([14, 6]), frozenset([8, 6]), frozenset([11, 6]), frozenset([13, 6]), frozenset([10, 6]), frozenset([5, 6]), frozenset([6, 7]), frozenset([9, 6]), frozenset([4, 14]), frozenset([8, 4]), frozenset([11, 4]), frozenset([4, 13]), frozenset([10, 4]), frozenset([4, 5]), frozenset([4, 7]), frozenset([9, 4]), frozenset([8, 14]), frozenset([11, 14]), frozenset([13, 14]), frozenset([10, 14]), frozenset([5, 14]), frozenset([14, 7]), frozenset([9, 14]), frozenset([8, 11]), frozenset([8, 13]), frozenset([8, 10]), frozenset([8, 5]), frozenset([8, 7]), frozenset([8, 9]), frozenset([11, 13]), frozenset([10, 11]), frozenset([11, 5]), frozenset([11, 7]), frozenset([9, 11]), frozenset([10, 13]), frozenset([13, 5]), frozenset([13, 7]), frozenset([9, 13]), frozenset([10, 5]), frozenset([10, 7]), frozenset([9, 10]), frozenset([5, 7]), frozenset([9, 5]), frozenset([9, 7])] 所有符合最小支持度为0.1的项集L: [[frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])], [frozenset([10, 6]), frozenset([9, 6]), frozenset([6, 7])], []] 所有符合最小支持度为0.2的项集L: [[frozenset([6]), frozenset([10]), frozenset([7]), frozenset([9])], []] ----------------------------------------------------------- frozenset([6]) 
——> frozenset([10]) conf: 0.283821263482 frozenset([10]) ——> frozenset([6]) conf: 0.518143459916 frozenset([6]) ——> frozenset([9]) conf: 0.263790446841 frozenset([9]) ——> frozenset([6]) conf: 0.522269676632 frozenset([7]) ——> frozenset([6]) conf: 0.658473105842 frozenset([6]) ——> frozenset([7]) conf: 0.350847457627 [(frozenset([6]), frozenset([10]), 0.28382126348228043), (frozenset([10]), frozenset([6]), 0.5181434599156117), (frozenset([6]), frozenset([9]), 0.2637904468412943), (frozenset([9]), frozenset([6]), 0.5222696766320928), (frozenset([7]), frozenset([6]), 0.6584731058415269), (frozenset([6]), frozenset([7]), 0.3508474576271186)] ------------------------------------------------------------------------ frozenset([10]) ——> frozenset([6]) conf: 0.518143459916 frozenset([9]) ——> frozenset([6]) conf: 0.522269676632 frozenset([7]) ——> frozenset([6]) conf: 0.658473105842 [(frozenset([10]), frozenset([6]), 0.5181434599156117), (frozenset([9]), frozenset([6]), 0.5222696766320928), (frozenset([7]), frozenset([6]), 0.6584731058415269)]