NLTK 对文本结构进行语法分析代码示例
Posted 光英的记忆
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了NLTK 对文本结构进行语法分析代码示例相关的知识,希望对你有一定的参考价值。
import re

import nltk
from nltk import CFG
from nltk.chunk.regexp import *
from nltk.parse.stanford import StanfordParser

# One-time NLTK resource downloads (uncomment on first run):
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('ieer')
# --- Syntactic parsing with a toy context-free grammar ---
# Non-terminals: S = entire sentence, NP = noun phrase, VP = verb phrase,
# V = verb, N = noun, Det = determiner.
toy_grammar = nltk.CFG.fromstring(
"""
S -> NP VP
VP -> V NP
V -> "eats" | "drinks"
NP -> Det N
Det -> "a" | "an" | "the"
N -> "president" |"Obama" |"apple"| "coke"
""")
# Show the grammar's production rules.
print(toy_grammar.productions())
# --- Regular-expression (chunk) grammar parser ---
# Chunk grammar (each pattern must be wrapped in {} for RegexpParser):
#   NP -> Det? Adj* Noun*
#   PP -> P NP
#   VP -> V (NP|PP)*
reg_parser = RegexpParser('''
        NP: {<DT>? <JJ>* <NN>*}
        P: {<IN>}
        V: {<V.*>}
        PP: {<P> <NP>}
        VP: {<V> <NP|PP>*}
''')
test_sent = "Mr. Obama played a big role in the Health insurance bill"
# POS-tag the tokenized sentence, then chunk it with the grammar above.
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
parsed_out = reg_parser.parse(test_sent_pos)
print(parsed_out)
# --- Dependency parsing (needs the Stanford parser jars; kept disabled) ---
# english_parser = StanfordParser('stanford-parser.jar', 'stanfordparser-3.4-models.jar')
# english_parser.raw_parse_sents("this is the english parser test")

# --- Chunking with hand-written chunk rules ---
test_sent = ("The prime minister announced he had asked the chief "
             "government whip, Philip Ruddock, to call a special party room meeting for "
             "9am on Monday to consider the spill motion.")
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
# Verb phrases: an optional VB* tag, one or more VB* tags, an optional pronoun.
rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<PRP>)?', 'Chunk VPs')
parser_vp = RegexpChunkParser([rule_vp], chunk_label='VP')
print(parser_vp.parse(test_sent_pos))
# Noun phrases: optional determiner/adverb, adjectives/cardinals, one or more nouns.
rule_np = ChunkRule(r'(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+', 'Chunk NPs')
parser_np = RegexpChunkParser([rule_np], chunk_label="NP")
print(parser_np.parse(test_sent_pos))
# --- Named-entity recognition over text read from a file ---
# Use a context manager so the file handle is always closed.
with open('test.txt') as f:
    text = f.read()
sentences = nltk.sent_tokenize(text)  # sentence segmentation of the file text
print(sentences)
token_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]  # word tokenization
tagged_sentences = [nltk.pos_tag(sentence) for sentence in token_sentences]  # POS tagging
for sent in tagged_sentences:
    print(nltk.ne_chunk(sent))  # named-entity chunking
# --- Information extraction: ORG-in-LOC relations from the IEER corpus ---
# Match the word "in" (word boundaries), excluding gerund contexts such as
# "succeeding in" where "in" is followed by an "-ing" word.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer',
                                     pattern=IN):
        print(nltk.sem.rtuple(rel))
以上是关于NLTK 对文本结构进行语法分析代码示例的主要内容,如果未能解决你的问题,请参考以下文章