Installation
Python version
NLTK requires Python 2.7 or 3.4+.
Installing with pip
pip install -U nltk
Installing the NLTK data
import nltk
nltk.download()
from nltk.corpus import brown
brown.words()
If the data cannot be found after downloading, set the NLTK_DATA environment variable to the directory that contains the data.
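Alternatively, the data search path can be extended at runtime. A minimal sketch, assuming the data was downloaded to /path/to/nltk_data (hypothetical path, replace it with the directory chosen in nltk.download()):

import nltk
# Hypothetical location -- point this at your actual download directory
nltk.data.path.append("/path/to/nltk_data")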
The Text object
from nltk.book import *
text1.concordance('monstrous')
text1.similar('monstrous')
text1.common_contexts(['monstrous', 'gamesome'])
text4.dispersion_plot(['freedom', 'America'])
text1.count('monstrous')
text1.collocations()
The FreqDist object
import nltk
from nltk.book import *
'''
Create a FreqDist object. FreqDist inherits from dict:
its keys are the words and its values are the total number of occurrences of each word.
The FreqDist constructor accepts any list (a small standalone example follows the block below).
'''
fdist1 = FreqDist(text1)
fdist1.plot(10)
fdist1.tabulate(15)
fdist1.most_common(15)
fdist1.hapaxes()
fdist1.max()
words = set(text1)
long_words = [w for w in words if len(w) > 7 and fdist1[w] > 7]
print(sorted(long_words))
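Since FreqDist behaves like a dict and its constructor accepts any list, it can also be built directly from a plain Python list. A minimal sketch:

from nltk import FreqDist

fdist = FreqDist(['a', 'b', 'a', 'c', 'a'])
print(fdist['a'])            # dict-style lookup: 'a' occurs 3 times
print(fdist.most_common(2))  # the two most frequent samples with their counts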
Chinese word segmentation
The Stanford Chinese segmenter supports POS tagging, named entity recognition, and syntactic parsing. Download the latest segmenter jar and the SLF4J jar.
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar="stanford-segmenter-3.7.0.jar",
    path_to_slf4j="slf4j-simple-1.7.25.jar",
    path_to_sihan_corpora_dict="./data",
    path_to_model="./data/pku.gz",
    path_to_dict="./data/dict-chris6.ser.gz"
)
sentence = u"这是斯坦福中文分词器测试"
print(segmenter.segment(sentence))
print(segmenter.segment_file("test.simp.utf8"))
Corpora
import nltk
from nltk.corpus import gutenberg

gutenberg.fileids()
emma = gutenberg.words("austen-emma.txt")
emma_str = gutenberg.raw("austen-emma.txt")
emma_sents = gutenberg.sents("austen-emma.txt")
print(emma_sents)

from nltk.corpus import webtext
print(webtext.fileids())

from nltk.corpus import inaugural
print(inaugural.fileids())

from nltk.corpus import nps_chat
print(nps_chat.fileids())
chat_room = nps_chat.posts('10-19-30s_705posts.xml')
print(chat_room)

from nltk.corpus import brown
print(brown.categories())
print(brown.fileids(['news', 'lore']))
ca02 = brown.words(fileids='ca02')
print('ca02: ', ca02)

from nltk.corpus import reuters
print(reuters.categories())
Conditional frequency distributions
import nltk
from nltk.corpus import brown

pairs = [(genre, word)
         for genre in brown.categories()
         for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(pairs)
print(cfd.conditions())

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
cfd.plot(conditions=genres, samples=modals)

sent = ['I', 'am', 'a', 'good', 'man']
print(list(nltk.bigrams(sent)))

text = brown.words(categories='news')
bigrams_words = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams_words)
fd = cfd['can']
fd.plot(10)
Part-of-speech tagging
POS tag sets
Commonly referenced Chinese POS tag sets:
- the POS code table of the PFR People's Daily annotated corpus (《PFR人民日报标注语料库》)
- the tag set of the "Specification for Corpus Processing of Modern Chinese: Word Segmentation and POS Tagging" (《现代汉语语料库加工规范——词语切分与词性标注》)
- the ICTCLAS 3.0 Chinese POS tag set (Institute of Computing Technology)
- the HanLP POS tag set
- the BosonNLP POS tags
- the jieba (结巴) segmentation tags
import nltk

words = nltk.word_tokenize('And now for something completely different')
print(words)
word_tag = nltk.pos_tag(words)
print(word_tag)

from nltk.corpus import brown
words_tag = brown.tagged_words(categories='news')
print(words_tag[:10])
tagged_sents = brown.tagged_sents(categories='news')
print(tagged_sents)

from nltk.corpus import sinica_treebank
print(sinica_treebank.fileids())
words = sinica_treebank.words('parsed')
print(words[:40])
words_tag = sinica_treebank.tagged_words('parsed')
print(words_tag[:40])

words_tag = sinica_treebank.tagged_words('parsed')
tag_fd = nltk.FreqDist(tag for (word, tag) in words_tag)
tag_fd.tabulate(5)
Building a POS tagger
import nltk

raw = "You are a good man, but i don't love you!"
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
tagged_words = default_tagger.tag(tokens)
print(tagged_words)

from nltk.corpus import brown
tagged_sents = brown.tagged_sents(categories='news')
print(default_tagger.evaluate(tagged_sents))

fd = nltk.FreqDist(brown.words(categories='news'))
most_common_pairs = fd.most_common(100)
most_common_words = [i[0] for i in most_common_pairs]
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
likely_tags = dict((word, cfd[word].max()) for word in most_common_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
tagged_sents = brown.tagged_sents(categories='news')
print(baseline_tagger.evaluate(tagged_sents))

raw = "You are a good man, but i don't love you!"
tokens = nltk.word_tokenize(raw)
print(baseline_tagger.tag(tokens))

baseline_tagger2 = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
tagged_sents = brown.tagged_sents(categories='news')
print(baseline_tagger2.evaluate(tagged_sents))

fd = nltk.FreqDist(brown.words(categories='news'))
most_common_pairs = fd.most_common(500)
most_common_words = [i[0] for i in most_common_pairs]
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
likely_tags = dict((word, cfd[word].max()) for word in most_common_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
tagged_sents = brown.tagged_sents(categories='news')
print(baseline_tagger.evaluate(tagged_sents))
Unigram tagger
import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')
unigram_tagger = nltk.UnigramTagger(train=tagged_sents)
print(unigram_tagger.evaluate(tagged_sents))

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train=train_sets)
print(unigram_tagger.evaluate(train_sets))
print(unigram_tagger.evaluate(test_sets))
Bigram tagger
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]
bigram_tagger = nltk.BigramTagger(train=train_sets)
print(bigram_tagger.evaluate(train_sets))
print(bigram_tagger.evaluate(test_sets))
A bigram tagger considers a word together with the tag of the preceding word. When it meets a word in a context it never saw during training, it cannot assign a tag; the following words then also have unseen contexts and cannot be tagged either. This is why the bigram tagger's accuracy on the test set is very low.
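A minimal sketch of this cascading failure, reusing the bigram_tagger trained above on a sentence it has not seen (the exact output depends on the training data):

sent = ['The', 'population', 'of', 'the', 'Congo', 'is', 'growing']
print(bigram_tagger.tag(sent))
# Once one word is tagged None, every following (previous-tag, word) context is
# also unseen, so the rest of the sentence typically comes back as None as well.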
Combining taggers
Combine them in the following way:
- Try tagging the word with the bigram tagger.
- If the bigram tagger cannot find a tag, try the unigram tagger.
- If the unigram tagger cannot find a tag either, fall back to the default tagger.
import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sets = tagged_sents[:size]
test_sets = tagged_sents[size:]

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train=train_sets, backoff=t0)
t2 = nltk.BigramTagger(train=train_sets, backoff=t1)
print(t2.evaluate(train_sets))
print(t2.evaluate(test_sets))
Gender classification of Chinese nicknames
import nltk
import random
from nltk.classify import apply_features
from nltk.corpus import PlaintextCorpusReader

names_corpus = PlaintextCorpusReader('./', ['female.txt', 'male.txt'])
all_names = names_corpus.words()
ch_freq = nltk.FreqDist(ch.lower() for name in all_names for ch in name)
ch_freq_most = ch_freq.most_common(1000)
ch_features = [ch for (ch, count) in ch_freq_most]
print(ch_freq_most)


def name_features(name):
    """
    Extract features from a name.
    :param name: the name
    :return: the feature dict for the name
    """
    name_chs = set([ch.lower() for ch in name])
    features = {}
    for ch in ch_features:
        features['contain(%s)' % ch] = (ch in name_chs)
    return features


female_names = [(name, 'female') for name in names_corpus.words('female.txt')]
male_names = [(name, 'male') for name in names_corpus.words('male.txt')]
total_names = female_names + male_names
random.shuffle(total_names)

train_set_size = int(len(total_names) * 0.6)
train_names = total_names[:train_set_size]
test_names = total_names[train_set_size:]
train_set = apply_features(name_features, train_names, True)
test_set = apply_features(name_features, test_names, True)

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, train_set))
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(20)

for (name, tag) in test_names:
    guess = classifier.classify(name_features(name))
    if guess != tag:
        print(tag, guess, name)
Regular expression chunker
Chunking groups one or more consecutive words into a chunk. It is a basic technique used for entity recognition.
Noun phrase chunking
Noun phrase chunking is also called NP-chunking. One of the most useful sources of information for NP-chunking is part-of-speech tags, so we usually run a POS tagger before chunking.
Symbol  Meaning               Example
S       sentence              the man walked
NP      noun phrase           a dog
VP      verb phrase           saw a park
PP      prepositional phrase  with a telescope
Det     determiner            the
N       noun                  dog
V       verb                  walked
P       preposition           in
Regular expression chunker
import nltk

text = "Lucy let down her long golden hair"
sentence = nltk.word_tokenize(text)
sentence_tag = nltk.pos_tag(sentence)
print(sentence_tag)

grammar = r"""
    NP: {<DT|PRP\$>?<JJ>*<NN>}
        {<NNP>+}
"""
cp = nltk.RegexpParser(grammar)
tree = cp.parse(sentence_tag)
tree.draw()
Chinking
A chink is a sequence of tokens that should be excluded from a chunk. In the grammar below, the first rule puts the whole sentence into one NP chunk, and the second rule (written with reversed braces }...{) removes sequences of VBD or IN tokens from it.

import nltk

text = "the little yellow dog barked at the cat"
sentence = nltk.word_tokenize(text)
sentence_tag = nltk.pos_tag(sentence)
print(sentence_tag)

grammar = r"""
    NP: {<.*>+}
        }<VBD|IN>+{
"""
cp = nltk.RegexpParser(grammar)
tree = cp.parse(sentence_tag)
tree.draw()
Evaluating chunkers
The standard way of representing chunks in text files is the IOB tagging scheme: I (inside a chunk), O (outside any chunk), B (begin a chunk).
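For illustration only, a hypothetical sentence fragment in the word/POS/IOB column format (NP chunks only; not taken from the corpus used below):

he        PRP  B-NP
accepted  VBD  O
the       DT   B-NP
position  NN   I-NP
of        IN   O
vice      NN   B-NP
chairman  NN   I-NP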
The conll2000 corpus
import nltk
from nltk.corpus import conll2000

test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
tags = nltk.chunk.tree2conlltags(test_sents[0])
print(tags)
Evaluating a chunker with the conll2000 corpus
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
print(cp.evaluate(test_sents))
Building a chunker from a unigram tagger
class UnigramChunker(nltk.ChunkParserI):
    """
    A unigram chunker.
    It learns, from the training sentences, the most likely chunk tag for each
    POS tag and then uses that information to chunk new sentences.
    """

    def __init__(self, train_sents):
        """
        Constructor.
        :param train_sents: a list of chunked Tree objects
        """
        train_data = []
        for sent in train_sents:
            conlltags = nltk.chunk.tree2conlltags(sent)
            ti_list = [(t, i) for w, t, i in conlltags]
            train_data.append(ti_list)
        self.__tagger = nltk.UnigramTagger(train_data)

    def parse(self, tokens):
        """
        Chunk a sentence.
        :param tokens: a list of (word, POS tag) pairs
        :return: a Tree object
        """
        tags = [tag for (word, tag) in tokens]
        ti_list = self.__tagger.tag(tags)
        iob_tags = [iob_tag for (tag, iob_tag) in ti_list]
        conlltags = [(word, pos, iob_tag)
                     for ((word, pos), iob_tag) in zip(tokens, iob_tags)]
        return nltk.chunk.conlltags2tree(conlltags)


test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))