7 从文本提取信息

Posted 2021-01-03 nxf-rabbit75

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了7 从文本提取信息相关的知识，希望对你有一定的参考价值。

一、信息提取

信息提取结构

import nltk
def ie_proprocess(document):
    """Run the first three stages of an information-extraction pipeline.

    The raw text is split into sentences, every sentence is split into word
    tokens, and every token list is part-of-speech tagged.

    Args:
        document: The raw text, as a single string.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``nltk.pos_tag`` for that sentence's tokens.
    """
    sentences = nltk.sent_tokenize(document)  # sentence segmenter
    sentences = [nltk.word_tokenize(sent) for sent in sentences]  # word tokenizer
    sentences = [nltk.pos_tag(sent) for sent in sentences]  # part-of-speech tagger
    # Hand the tagged sentences back so that callers can actually use them.
    return sentences

二、分块

名词短语分块(NP-chunking NP-分块) 寻找单独名词短语对应的块

# Example of an NP chunker driven by a regular-expression grammar.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]
# Each <...> is one tag pattern: an optional determiner, any number of
# adjectives, then a noun.
grammer = "NP:{<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)
result = cp.parse(sentence)
print(result)
result.draw()  # draws the chunk tree graphically
# Printed result:
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))

# Chunking with regular expressions: an NP grammar made of two rules.
# Rule 1 (its comment lives inside the grammar text): an optional determiner
# or possessive pronoun, any adjectives, then a noun.  The "$" of the PP$ tag
# is escaped because a bare "$" is a regex end anchor and would never match
# the literal tag.
# Rule 2: one or more proper nouns (tag NNP).
grammer = r'''NP:{<DT|PP\$>?<JJ>*<NN>}   #匹配一个可选的限定词或所有格代名词
            {<NNP>+} '''
cp = nltk.RegexpParser(grammer)
sentence = [("Rapunzel","NNP"),("let","VBD"),("down", "RP"),("her","PP$"),("long","JJ"),("golden","JJ"),("hair","NN")]
print(cp.parse(sentence))

# A rule that matches two consecutive nouns, applied to three consecutive
# nouns, chunks only the first two.
nouns = [
    ("money", "NN"),
    ("market", "NN"),
    ("fund", "NN"),
]
grammar = "NP: {<NN><NN>}"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
# Output: (S (NP money/NN market/NN) fund/NN)

# Exploring a text corpus: a chunker can extract every phrase in a tagged
# corpus that matches a particular sequence of part-of-speech tags.
cp = nltk.RegexpParser('CHUNK : {<V.*><TO><V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        # Keep only the subtrees created by the CHUNK rule.
        if subtree.label() == 'CHUNK':
            print(subtree)
# Sample of the printed output:
# (CHUNK combined/VBN to/TO achieve/VB)
# (CHUNK continue/VB to/TO place/VB)

def find_chunks(chunk):
    """Print every ``CHUNK`` subtree found in the tagged Brown corpus.

    Args:
        chunk: Grammar string for ``nltk.RegexpParser``.  Only subtrees
            labelled ``CHUNK`` are printed, so the rule should use that
            label, e.g. ``'CHUNK : {<V.*><TO><V.*>}'``.
    """
    cp = nltk.RegexpParser(chunk)
    for sent in nltk.corpus.brown.tagged_sents():
        for subtree in cp.parse(sent).subtrees():
            if subtree.label() == 'CHUNK':
                print(subtree)

缝隙

为不包括在大块中的标识符序列定义一个缝隙

加缝隙是从块中去除一个标识符序列的过程，分为三种情况：标识符序列贯穿整块、标识符序列出现在块中间、标识符序列出现在块的边缘。

# Chinking: first chunk every token into one NP, then carve the runs of
# VBD/IN tokens back out of it.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]
grammar = r"""
NP:
   {<.*>+}
   }<VBD|IN>+{"""
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
# Printed result:
# (S
#   (NP the/DT little/JJ yellow/JJ dog/NN)
#   barked/VBD
#   at/IN
#   (NP the/DT cat/NN))

分块的表示：标记与树状图

IOB 标记：I（inside，内部）、O（outside，外部）或 B（begin，开始）

B 标志着一个标识符是某个块的开始；块内的后续标识符被标记为 I，其他所有标识符被标记为 O。

B 和 I 标记以块类型作为后缀，如 B-NP、I-NP。

NLTK 用树状图作为分块的内部表示，但也提供了这些树状图与 IOB 格式之间相互转换的方法。

三、开发和评估分块器

如何评估分块器？

from nltk.corpus import conll2000
# Print the chunk tree at index 99 of the CoNLL-2000 training file.
print(conll2000.chunked_sents('train.txt')[99])
# (S
#   (PP Over/IN)
#   (NP a/DT cup/NN)
#   (PP of/IN)
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   (VP told/VBD)
#   (NP his/PRP$ story/NN)
#   ./.)

# The chunking corpus contains three chunk types: NP, VP and PP chunks.
# chunk_types=['NP'] keeps only the NP chunks.
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
# (S
#   Over/IN
#   (NP a/DT cup/NN)
#   of/IN
#   (NP coffee/NN)
#   ,/,
#   (NP Mr./NNP Stone/NNP)
#   told/VBD
#   (NP his/PRP$ story/NN)
#   ./.)

简单评估和基准

# Baseline: an empty grammar creates no chunks at all.
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
# Recorded output:
# ChunkParse score:
#     IOB Accuracy:  43.4%%
#     Precision:      0.0%%
#     Recall:         0.0%%
#     F-Measure:      0.0%%

# Naive chunker: group any run of tags that start with C, D, J, N or P.
grammer = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammer)
# NOTE(review): chunk_types is not restricted to ['NP'] here, so this NP-only
# chunker is scored against every chunk type in the file -- confirm that this
# is intended.
test_sents = conll2000.chunked_sents('test.txt')
print(cp.evaluate(test_sents))
# Recorded output:
# ChunkParse score:
#     IOB Accuracy:  62.5%%
#     Precision:     70.6%%
#     Recall:        38.5%%
#     F-Measure:     49.8%%

使用unigram标注器对名词短语分块

class UnigramChunker(nltk.ChunkParserI):
    """Chunker that predicts each token's IOB chunk tag from its POS tag alone.

    A ``nltk.UnigramTagger`` is trained on (POS tag, chunk tag) pairs and then
    used to "tag" the POS tags of new sentences.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        Args:
            train_sents: Iterable of chunk trees accepted by
                ``nltk.chunk.tree2conlltags``.
        """
        train_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Drop the word; only the (POS tag, chunk tag) pair is learned.
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk one POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        pos_only = [pos for (_word, pos) in sentence]
        iob_tags = [iob for (_pos, iob) in self.tagger.tag(pos_only)]

        # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
        conlltags = [
            (word, pos, iob) for (word, pos), iob in zip(sentence, iob_tags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree

# Train the unigram chunker on the NP chunks of the training file and score
# it on the NP chunks of the test file.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
# Recorded output:
# ChunkParse score:
#     IOB Accuracy:  92.9%%
#     Precision:     79.9%%
#     Recall:        86.8%%
#     F-Measure:     83.2%%

# Collect every distinct POS tag that occurs in the training trees, sorted.
postags = sorted({pos for sent in train_sents for (_word, pos) in sent.leaves()})
print(postags)
# ['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']

# Show which chunk tag the unigram chunker's tagger assigns to each POS tag.
print(unigram_chunker.tagger.tag(postags))
# [('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'),...]

# Build a bigram chunker.
class BigramChunker(nltk.ChunkParserI):
    """Chunker that predicts IOB chunk tags from POS tags with a bigram tagger.

    A ``nltk.BigramTagger`` is trained on (POS tag, chunk tag) pairs and then
    used to "tag" the POS tags of new sentences.
    """

    def __init__(self, train_sents):
        """Train the underlying bigram tagger.

        Args:
            train_sents: Iterable of chunk trees accepted by
                ``nltk.chunk.tree2conlltags``.
        """
        train_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Drop the word; only the (POS tag, chunk tag) pair is learned.
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        """Chunk one POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        pos_only = [pos for (_word, pos) in sentence]
        iob_tags = [iob for (_pos, iob) in self.tagger.tag(pos_only)]

        # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
        conlltags = [
            (word, pos, iob) for (word, pos), iob in zip(sentence, iob_tags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree

# Train the bigram chunker and score it on the test sentences.
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
# Recorded output:
# ChunkParse score:
#     IOB Accuracy:  93.3%%
#     Precision:     82.3%%
#     Recall:        86.8%%
#     F-Measure:     84.5%%

# Training a classifier-based chunker:
# noun-phrase chunking with a consecutive classifier
# (maxent = maximum entropy).
def npchunk_features(sentence, i, history):
    """Build the feature dict for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, pos)`` pairs.
        i: Index of the token to describe.
        history: Chunk tags assigned so far; not used by this feature set.

    Returns:
        A dict holding only the current token's POS tag under ``"pos"``.
    """
    _word, pos_tag = sentence[i]
    features = {"pos": pos_tag}
    return features

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Assign chunk tags token by token with a maximum-entropy classifier."""

    def __init__(self, train_sents):
        """Train the classifier.

        Args:
            train_sents: Iterable of sentences, each a sequence of
                ``((word, pos), chunk_tag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # NOTE(review): algorithm='megam' presumably needs the external MEGAM
        # optimizer to be installed and configured for NLTK -- confirm.
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0
        )  # maximum entropy

    def tag(self, sentence):
        """Tag every token of ``sentence`` with a chunk tag.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            A list of ``((word, pos), chunk_tag)`` pairs.  A real list is
            returned rather than a one-shot ``zip`` iterator, so the result
            can be indexed and iterated more than once.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps a ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Convert the training trees to tagger input and train the tagger.

        Args:
            train_sents: Iterable of chunk trees accepted by
                ``nltk.chunk.tree2conlltags``.
        """
        tagged_sents = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Regroup (word, pos, chunk) as ((word, pos), chunk).
            tagged_sents.append([((w, t), c) for (w, t, c) in triples])
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk one POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree``.
        """
        conlltags = [
            (w, t, c) for ((w, t), c) in self.tagger.tag(sentence)
        ]
        return nltk.chunk.conlltags2tree(conlltags)
        
# chunker = ConsecutiveNPChunker(train_sents)
# print(chunker.evaluate(test_sents))
#有时词性标记不足以确定一个句子应如何分块

四、语言结构中的递归

用级联分块器构建嵌套结构

只需创建一个包含递归规则的多级的分块语法，就可以建立任意深度的分块结构

# Cascaded chunker: four stages (NP, PP, VP, CLAUSE) applied in order.
sentence = [
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
grammar = r"""
   NP: {<DT|JJ|NN.*>+}
   PP: {<IN><NP>}
   VP: {<VB.*><NP|PP|CLAUSE>+$}
   CLAUSE: {<NP><VP>}
"""

# Single pass over the stages.
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
# (S
#   (NP Mary/NN)
#   saw/VBD   # the VP headed by "saw" is not recognized
#   (CLAUSE
#     (NP the/DT cat/NN)
#     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))

# Same grammar with loop=2.
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
# (S
#   (CLAUSE
#     (NP Mary/NN)
#     (VP
#       saw/VBD
#       (CLAUSE
#         (NP the/DT cat/NN)
#         (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))

树状图

在nltk中，创建树状图，方法是给节点添加标签和一个子链表

# Build a tree bottom-up: each node takes a label and a list of children.
tree1 = nltk.Tree('NP', ['Alice'])
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
# print(tree4[1])
tree4.draw()  # draws the tree graphically

树遍历

使用递归函数来遍历树是标准的做法

def traverse(t):
    """Print a bracketed rendering of a tree by walking it recursively.

    Every printed token is followed by a single space, so labels and leaves
    never run together in the output.

    Args:
        t: Either a tree node (an iterable of children that has a
            ``label()`` method) or a leaf (any object without ``label()``).
    """
    try:
        label = t.label()
    except AttributeError:
        # No label() method: t is a leaf, print it as-is.
        print(t, end=" ")
        return
    # t is an inner node: print its label, then its children.
    print("(", label, end=" ")
    for child in t:
        traverse(child)
    print(")", end=" ")
import nltk
# Parse the bracketed string with Tree.fromstring; the Tree constructor itself
# expects a node label plus a list of children, not a bracketed string.
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)

五、命名实体识别

# Run the named-entity chunker on the Treebank tagged sentence at index 22.
sent = nltk.corpus.treebank.tagged_sents()[22]
# With binary=True named entities are only labelled NE; otherwise the
# classifier adds a type label to each entity.
for binary_only in (True, False):
    print(nltk.ne_chunk(sent, binary=binary_only))

六、关系抽取

只要文本中的命名实体被识别，我们就可以提取它们之间存在的关系。

方法之一是首先寻找所有(X, a, Y)形式的三元组，其中X和Y是指定类型的命名实体，a表示X和Y之间关系的字符串

import re
# Look for ORG ... "in" ... LOC relations.  The \b word boundaries make "in"
# match only as a whole word, and the negative lookahead skips cases where
# "in" is followed by a word ending in "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))

from nltk.corpus import conll2002
# Verbose regex: a form of the Dutch verbs "zijn" or "worden", then anything,
# then the preposition "van".  The explanatory notes inside the pattern are
# regex comments (re.VERBOSE) and are kept exactly as they were.
vnv = """
 (
 is/V| #3rdsing present and
 was/V| #past forms of the verb zijn (‘be‘)
 werd/V| #and also present
 wordt/V #pastof worden(‘become‘)
 )
 .* #followed byanything
 van/Prep #followed byvan(‘of‘)
 """
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    # The corpus name is spelled with two letter "l"s: 'conll2002'.
    for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        print(nltk.sem.clause(r, relsym="VAN"))

以上是关于7 从文本提取信息的主要内容，如果未能解决你的问题，请参考以下文章

7 从文本提取信息

从单个按钮从多个片段中提取数据

从 HTML 正文中提取文本片段（在 .NET 中）

使用 PyPDF2 从 PDF 文件中提取文本

了解 BitTorrent 片段输出

全文搜索用于提取文本片段（返回预期文本及其周围）