NLP_命名实体识别CRF++使用流程
Posted YWP_2016
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了NLP_命名实体识别CRF++使用流程相关的知识,希望对你有一定的参考价值。
重要参考
用CRF做命名实体识别(一) - 简书 (jianshu.com)https://www.jianshu.com/p/12f2cdd86679(8条消息) 【windows下CRF++的安装与使用】_feng_zhiyu的博客-CSDN博客_crf++安装https://blog.csdn.net/feng_zhiyu/article/details/80793316
代码实践
- B, M, E, S 格式:B表示实体首字,M表示实体中字,E表示实体尾字,S表示单字
- 注意:各种编码/解码细节
生成训练/测试数据
- 生成训练数据/测试数据均为适合CRF++的格式
# -*- coding: utf8 -*-
import sys
home_dir = "D:\\Desktop\\新Asian-Elephant\\毕业\\CRF\\CRF++-0.58\\YWP\\\\199801\\\\"
def splitWord(words):
uni = words.encode('utf-8').decode('utf-8')
li = list()
for u in uni:
li.append(str(u).encode('utf-8'))
return li
# 4 tag: B, M, E, S
def get4Tag(li):
length = len(li)
# print length
if length == 1:
return ['S']
elif length == 2:
return ['B', 'E']
elif length > 2:
li = list()
li.append('B')
for i in range(0, length - 2):
li.append('M')
li.append('E')
return li
def saveDataFile(trainobj, testobj, isTest, word, handle, tag):
if isTest:
saveTrainFile(testobj, word, handle, tag)
else:
saveTrainFile(trainobj, word, handle, tag)
def saveTrainFile(fiobj, word, handle, tag):
if len(word) > 0:
wordli = splitWord(word)
tag == '4'
tagli = get4Tag(wordli)
for i in range(0, len(wordli)):
w = wordli[i]
h = handle
t = tagli[i]
w=w.decode('utf-8')
fiobj.write(str(w) + '\\t' + h + '\\t' + t + '\\n')
else:
# print 'New line'
fiobj.write('\\n')
# B,M,M1,M2,M3,E,S
def convertTag(tag):
fiobj = open(home_dir + 'people-daily.txt', 'r')
trainobj = open(home_dir + 'train.data', 'w',encoding='UTF-8')
testobj = open(home_dir + 'test.data', 'w',encoding='UTF-8')
arr = fiobj.readlines()
i = 0
for a in arr:
i += 1
a = a.strip('\\r\\n\\t ')
if a == "": continue
words = a.split(" ")
test = False
if i % 10 == 0:
test = True
for word in words:
# print "---->", word
word = word.strip('\\t ')
if len(word) > 0:
i1 = word.find('[')
if i1 >= 0:
word = word[i1 + 1:]
i2 = word.find(']')
if i2 > 0:
w = word[:i2]
word_hand = word.split('/')
# print "----",word
#print("word_hand[0]:",word_hand[0])
#print("word_hand[1]:", word_hand[1])
#print('word_hand:',word_hand)
#print('len(word_hand):',len(word_hand))
w, h = word_hand[0],word_hand[1] #w, h = word_hand
# print w,h
if h == 'nr': # ren min
# print 'NR',w
if w.find('·') >= 0:
tmpArr = w.split('·')
for tmp in tmpArr:
saveDataFile(trainobj, testobj, test, tmp, h, tag)
continue
if h != 'm':
saveDataFile(trainobj, testobj, test, w, h, tag)
if h == 'w':
saveDataFile(trainobj, testobj, test, "", "", tag) # split
trainobj.flush()
testobj.flush()
# sys.argv[0]表示代码本身文件路径
# Sys.argv[ ]其实就是一个列表,里边的项为用户输入的参数,关键就是要明白这参数是从程序外部输入的
if __name__ == '__main__':
#tag = sys.argv[0]
convertTag(4)
创建特征模板
- 创建由指定特征组成的模板→存至template文件
模型训练与测试
简易版
- 命令行下,训练模型→model
crf_learn -a MIRA template train.data model
- 命令行下,评估模型
crf_test -m model test.data >> output.txt
完整版
- 命令行下,训练模型→model
- template:模板文件,train.data:生成的训练数据,4_model :模型
crf_learn -f 3 -c 4.0 template train.data 4_model > 4_train.rst
crf_test -m 4_model test.data > 4_test.rst
评估模型
import sys
if __name__ == "__main__":
try:
file = open(sys.argv[1], "r",encoding='UTF-8')
except:
print
("result file is not specified, or open failed!")
sys.exit()
wc_of_test = 0
wc_of_gold = 0
wc_of_correct = 0
flag = True
for l in file:
if l == '\\n': continue
_, _, g, r = l.strip().split()
if r != g:
flag = False
if r in ('E', 'S'):
wc_of_test += 1
if flag:
wc_of_correct += 1
flag = True
if g in ('E', 'S'):
wc_of_gold += 1
print("WordCount from test result:", wc_of_test)
print("WordCount from golden data:", wc_of_gold)
print("WordCount of correct segs :", wc_of_correct)
# 查全率
P = wc_of_correct / float(wc_of_test)
# 查准率,召回率
R = wc_of_correct / float(wc_of_gold)
print("P = %f, R = %f, F-score = %f" % (P, R, (2 * P * R) / (P + R)))
- 命令行下,运行评估模型的文件F-value.py
python F-value.py 4_test.rst
以上是关于NLP_命名实体识别CRF++使用流程的主要内容,如果未能解决你的问题,请参考以下文章
NLP作业三:用BiLSTM+CRF实现中文命名实体识别(TensorFlow入门)代码+报告