首先我的原始数据是这样的,关于爬虫请看http://blog.csdn.net/jemila/article/details/61196863
我的数据链接:http://pan.baidu.com/s/1hskNlEO 密码:dxv5
加载以下模块
import os import jieba import sys reload(sys) sys.setdefaultencoding("utf-8") from langconv import *
from langconv import *加载这个模块是为了简繁转换
加载停止词
import io

# Load the stop-word list: a GBK-encoded text file with one word per line.
# io.open decodes while reading, and the with-block guarantees the handle is
# closed even if reading fails.  `f` stays bound afterwards, so a later
# f.close() on the already-closed handle is a harmless no-op.
with io.open(r'C:/Users/user/Desktop/stopword.txt', encoding='gbk') as f:
    # Drop the line terminator so entries compare equal to segmented tokens.
    stopwords = [line.replace("\n", "") for line in f.readlines()]

# Next: define the word-segmentation function.
def sent2word(sentence):
    """Segment a sentence into words and drop stop words.

    The sentence is cut with ``jieba.cut``; every token found in the
    module-level ``stopwords`` collection is discarded, and each remaining
    token is passed through ``Converter('zh-hans').convert`` (the langconv
    simplified/traditional conversion).

    Returns:
        A list with the converted, non-stop-word tokens in their original
        order (empty when nothing is left).
    """
    newSent = []
    for word in jieba.cut(sentence):
        if word in stopwords:
            continue
        # Only byte strings need decoding; calling .decode() on text that is
        # already unicode only worked because of sys.setdefaultencoding.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        newSent.append(Converter('zh-hans').convert(word))
    return newSent

# Next: define a helper that creates a new folder.
def mkdir(path):
    """Create the directory ``path`` (including parents) if it is missing.

    Surrounding whitespace and trailing backslashes are removed from
    ``path`` before it is used.

    Returns:
        True when the directory was created by this call, False when the
        path already existed.

    Raises:
        OSError: if the directory could not be created for any reason other
            than the path already existing.
    """
    import os

    path = path.strip().rstrip("\\")
    try:
        # Create first and report afterwards, so the success message is only
        # printed when the directory really exists; this also avoids the
        # check-then-create race of testing os.path.exists beforehand.
        os.makedirs(path)
    except OSError:
        if not os.path.exists(path):
            raise
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True

# Next: define a list-to-text helper.  Segmenting a sentence returns a list,
# and it has to be turned back into text before it can be saved to a new file.
def list_to_str(list):
    """Concatenate the items of ``list``, each preceded by one space.

    ``['a', 'b']`` becomes ``' a b'``; an empty input gives ``''``.

    The parameter keeps its original name (which shadows the builtin) so
    that existing keyword callers continue to work.
    """
    # A single join replaces the index loop with repeated string
    # concatenation; the produced text is identical.
    return ''.join(' ' + item for item in list)

# Next: walk the lyrics folders, read every file and segment it.  A line that
# contains a colon carries the lyricist/composer credits and is not used;
# lines mentioning "纯音乐" (instrumental) are dropped as well.
import io

# Input and output roots.  The original listed 'C:/Users/user/...' but then
# read and wrote under 'C:/Users/yunfang/...'; every path is now derived from
# these two roots so the folders that are listed, read and written agree
# (and match the lyrics_result location that the later merge step reads).
test = u'C:/Users/user/Desktop/lyrics/'
p = test
result_root = u'C:/Users/user/Desktop/lyrics_result/'

list_dir = os.listdir(p)
for j in list_dir:
    # One sub-folder per artist, one lyrics file per song.
    path_rude = u'%s%s' % (p, j)
    list_dir_dir = os.listdir(path_rude)
    mkpath = u'%s%s' % (result_root, j)
    if list_dir_dir:
        # Create the artist's output folder once instead of once per song.
        mkdir(mkpath)
    for y in list_dir_dir:
        path2_rude = u'%s/%s' % (path_rude, y)
        print(u"正在打开 %s 文件夹的 %s 文件" % (j, y))
        segmented = []
        # Decode explicitly as UTF-8 instead of relying on implicit
        # str/unicode coercion; the with-block closes the file on errors too.
        with io.open(path2_rude, encoding='utf-8') as f_test:
            for line in f_test:
                # Skip credit lines (they contain a colon) and lines that
                # mark the track as instrumental.
                if u":" in line or u"纯音乐" in line:
                    continue
                result = sent2word(line)
                if result:
                    segmented.append(list_to_str(result))
        # Same layout as before: every kept line is preceded by a newline.
        word = u''.join(u'\n' + text for text in segmented)
        with io.open(u'%s/%s' % (mkpath, y), 'w', encoding='utf-8') as f_result:
            f_result.write(word)
        print(u"正在关闭 %s 文件夹的 %s 文件" % (j, y))

# Close the stop-word file handle opened earlier (no-op if already closed).
f.close()
提取部分,想到TF-IDF和主题模型LDA。首先我做了TF-IDF,发现不太适合用来分析,因为TF-IDF=词频*逆文档频率。词频表示文章中某个词出现的次数;逆文档频率表示如果某个词比较少见,但是它在某篇文章中多次出现,那么它很可能就反映了这篇文章的特性,也就更有可能揭示这篇文章的话题所在。于是,当一个词在某段歌词中出现的频率极高(跟分词有关系),而在其他歌词中出现的频率极低时,就会被挖掘成关键词。对于12个歌手,运用TF-IDF的结果如下。比如"一只",在低苦艾的《不叫鸟》中出现了无数次,然而在其他歌词中出现的频率竟然极低。再比如"ll",其实出自草东没有派对的《等》,歌词是"I'll wait I'll wait I'll keep on waiting, I'll wait I'll wait I'll wait for nothing",分词结果变成"I ll wait I ll wait I ll keep on waiting, I ll wait I ll wait I ll wait for nothing",于是出现了"ll"这种尴尬的分词。
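| 歌手 | 关键词 | TF-IDF权重 |
| --- | --- | --- |
| 低苦艾 | 一只 | 0.326554546 |
| 反光镜 | 瞬间 | 0.230418286 |
| 周云蓬 | 呼吸 | 0.406213645 |
| 好妹妹乐队 | 再见 | 0.372984749 |
| 海龟先生 | the | 0.563802482 |
| 窦唯 | 结束 | 0.352856921 |
| 草东没有派对 | ll | 0.508105904 |
| 赵雷 | 北京 | 0.335062454 |
| 逃跑计划 | 再见 | 0.45489933 |
| 郝云 | 生活 | 0.268754262 |
| 陈粒 | 一步 | 0.218289286 |
| 马頔 | 亲爱 | 0.28473938 |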
算法部分写下来,免得以后忘了。不过文件夹也是够多了,我是新手,代码写得也不是很简洁……先生成一个concat文件夹,用来放置每个歌手分词后的所有文件合并而成的txt文件,并以歌手名字命名。
import io

# Step 1: merge every artist's segmented lyric files into a single
# <artist>.txt inside the lyrics_concat folder.
path = u"C:/Users/user/Desktop/lyrics_result/"
concat_path = u"C:/Users/user/Desktop/lyrics_concat/"
# The output folder was never created by the original script; make sure it
# exists so opening the merged files for writing cannot fail on a fresh run.
if not os.path.isdir(concat_path):
    os.makedirs(concat_path)

listdir = os.listdir(path)
for i in listdir:
    # Use the folder name as listed.  The original used i.split(".")[0],
    # which pointed at the wrong folder for any artist name containing a dot.
    path_way = u"%s%s/" % (path, i)
    pieces = []
    for j in os.listdir(path_way):
        # The with-block closes each source file (the original never did).
        with io.open(u"%s%s" % (path_way, j), encoding='utf-8') as src:
            # Every line is prefixed with one space, as before.
            pieces.extend(u' ' + line for line in src)
    print(u"this is  %s" % i)
    with io.open(u'%s%s.txt' % (concat_path, i), 'w', encoding='utf-8') as f_result:
        f_result.write(u''.join(pieces))

# Step 2: read the merged TXT files back and build the input for the
# vectorizer.
path = concat_path
listdir = os.listdir(path)
arr = []
for i in listdir:
    print(i)
    with io.open(u"%s%s" % (path, i), encoding='utf-8') as src:
        # Strip every line and keep the non-empty ones.  The original test
        # `oneline is not u" "` compared identity rather than value, so the
        # later empty-string filter was what actually dropped blank lines.
        stripped = [line.strip() for line in src]
    arr.append([line for line in stripped if line])

# arr holds one sub-list of lines per merged file; flatten each sub-list into
# a single space-separated string so word_list has one document per artist.
word_list = [u''.join(u' ' + line for line in lines) for lines in arr]

# Step 3: TF-IDF.  `name` holds the vocabulary terms and `weight` the
# document-by-term TF-IDF matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=3 keeps only terms that occur in at least 3 documents (a document
# frequency threshold, not a raw word count).
vectorizer = TfidfVectorizer(min_df=3)
# Fit once and reuse the result; the original ran fit_transform twice.
weight = vectorizer.fit_transform(word_list).toarray()
# Newer scikit-learn releases replaced get_feature_names with
# get_feature_names_out; use whichever is available and keep `name` a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    name = list(vectorizer.get_feature_names_out())
else:
    name = vectorizer.get_feature_names()