自然语言处理(NLP)基于BiLSTM的关系抽取

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了自然语言处理(NLP)基于BiLSTM的关系抽取相关的知识,希望对你有一定的参考价值。

【自然语言处理(NLP)】基于BiLSTM的关系抽取


(文章目录)


前言

(一)、任务描述

关系抽取是信息抽取的重要子任务,其主要目的是将非结构化或半结构化描述的自然语言文本转化成结构化数据,关系抽取主要负责对文本中抽取出的实体对进行关系分类,即抽取实体间的语义关系。

关系抽取涉及三元组,即(实体1,实体2,关系),其中实体1又称为头实体,实体2又称为尾实体,在建模关系分类时,一般会分别获得实体对所处上下文的特征,然后与实体对的特征进行融合,获得整体的样本特征,然后进行分类。


(二)、环境配置

本示例使用BiLSTM实现关系抽取,代码运行的环境配置如下:Python版本为3.7,PaddlePaddle版本为2.0.2,操作平台为AI Studio。

# Quick environment check: print the installed PaddlePaddle version
# (the tutorial targets PaddlePaddle 2.0.2 on Python 3.7).
import paddle
import numpy as np
import matplotlib.pyplot as plt
print(paddle.__version__)

输出结果如下图1所示:


一、数据准备

加载库,读取数据,观察数据统计特征


(一)、导入相关包

import json
from functools import partial

import matplotlib.pyplot as plt
import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.layers import LinearChainCrf, ViterbiDecoder

(二)、数据预处理


# Load the SemEval relation-extraction splits (one JSON record per line).
# NOTE(review): the original string literals were stripped when this listing
# was scraped; the file names below are reconstructed — TODO confirm against
# the actual data/data78726 directory.
# Use context managers so the file handles are closed (the original leaked them).
with open('data/data78726/semeval_train.txt') as f:
    train_data = [json.loads(line.strip()) for line in f]
with open('data/data78726/semeval_val.txt') as f:
    val_data = [json.loads(line.strip()) for line in f]
with open('data/data78726/semeval_test.txt') as f:
    test_data = [json.loads(line.strip()) for line in f]
print(train_data[0])
print('train_data:', len(train_data))
print('val_data:', len(val_data))
print('test_data:', len(test_data))
# train_data: 6507
# val_data: 1493
# test_data: 2717

# Relation name -> integer class id, and the inverse mapping for decoding.
with open('data/data78726/semeval_rel2id.json') as f:
    rel2id = json.loads(f.read())
id2rel = {v: k for k, v in rel2id.items()}
num_classes = len(rel2id)
print(num_classes)
print(rel2id)

输出结果如下图2所示:


(三)、确定词表

构建词表

# Build the vocabulary from training + validation tokens.
# Index 0 is reserved for <pad>, index 1 for <unk>.
vocab = {}
vocab['<pad>'], vocab['<unk>'] = 0, 1
idx = 2
for line in train_data + val_data:
    for w in line['token']:
        if w not in vocab:
            vocab[w] = idx
            idx += 1
# Also register single lowercase letters so entity strings can fall back
# to character-level ids (see word2id when maxlen < 10).
chars = 'abcdefghijklmnopqrstuvwxyz'
for char in chars:
    if char not in vocab:
        vocab[char] = idx
        idx += 1

vocab_size = len(vocab)
maxlen = 30  # fixed sentence length used throughout
print(vocab_size)

输出结果如下图3所示:


(四)、转换数据格式

# Convert the raw record format into parallel lists.
def txt_to_list(datas):
    """Split a list of JSON records into parallel lists.

    Each record has the form
    {'token': [...], 'h': {'name': ...}, 't': {'name': ...}, 'relation': ...}.
    Returns (sentences, head_entities, tail_entities, relation_labels).
    """
    sents, e1, e2, y = [], [], [], []
    for line in datas:
        sents.append(line['token'])
        e1.append(line['h']['name'])
        e2.append(line['t']['name'])
        y.append(line['relation'])
    return sents, e1, e2, y

# Convert token sequences into fixed-length id sequences.
dic = {}  # histogram of raw sequence lengths (used offline to pick maxlen)
def word2id(datas, maxlen=5):
    """Map each sequence in *datas* to a fixed-length list of vocab ids.

    If maxlen < 10 the inputs are treated as entity strings: maxlen is
    raised to 10 and each item is split into characters. Unknown tokens
    map to 1 (<unk>); sequences are truncated to maxlen and right-padded
    with 0 (<pad>). Returns a list of np.ndarray, one per input.
    """
    res = []
    if maxlen < 10:
        maxlen = 10
        datas = [list(data) for data in datas]  # char-level for entities
    for data in datas:
        # record the pre-truncation length distribution
        dic[len(data)] = dic.get(len(data), 0) + 1
        line = [vocab.get(c, 1) for c in data][:maxlen]
        line = line + [0] * (maxlen - len(line))  # pad to fixed length
        res.append(np.array(line))
    return res

# Split each dataset into sentence / head-entity / tail-entity / label lists.
train_sents,train_e1,train_e2,train_y = txt_to_list(train_data)
val_sents,val_e1,val_e2,val_y = txt_to_list(val_data)
test_sents,test_e1,test_e2,test_y = txt_to_list(test_data)




# Convert to fixed-length id matrices; Stack() collates the per-sample
# arrays into a single ndarray. Sentences use maxlen (=30); entities pass
# maxlen=2 (< 10), which triggers character-level splitting in word2id.
train_idx = Stack()(word2id(train_sents,maxlen))
val_idx =  Stack()(word2id(val_sents,maxlen))
test_idx =  Stack()(word2id(test_sents,maxlen))

train_e1_idx = Stack()(word2id(train_e1,2))
val_e1_idx =  Stack()(word2id(val_e1,2))
test_e1_idx =  Stack()(word2id(test_e1,2))

train_e2_idx = Stack()(word2id(train_e2,2))
val_e2_idx =  Stack()(word2id(val_e2,2))
test_e2_idx =  Stack()(word2id(test_e2,2))

# Map relation names to integer class ids.
train_yid = [rel2id[c] for c in train_y]
val_yid = [rel2id[c]  for c in val_y]
test_yid = [rel2id[c]  for c in test_y]
# Count how many test samples carry class id 18
# (presumably the "Other"/no-relation class — TODO confirm against rel2id).
print(len(test_yid),sum([1 if x==18 else 0 for x in test_yid]))
# print(set(train_yid),set(val_yid),set(test_yid))
# Length-distribution inspection (via `dic`) used offline to choose maxlen:
# lens = list(dic.keys())
# lens.sort()
# ccc= 0
# for k in lens:
#     ccc += dic[k]
#     print(k,dic[k],ccc/(len(train_y)+len(val_y)+len(test_y)))
# 29 186 0.9048241112251563
# 40 43 0.9810581319399085

class MyDataset(paddle.io.Dataset):
    """Map-style dataset over samples of the form
    [sentence_ids, head_entity_ids, tail_entity_ids, label]."""

    def __init__(self, data):
        super(MyDataset, self).__init__()
        self.data = data

    def __getitem__(self, index):
        # Unpack one sample: (sentence ids, e1 ids, e2 ids, class label).
        sample = self.data[index]
        return sample[0], sample[1], sample[2], sample[3]

    def __len__(self):
        return len(self.data)

    def get_labels(self):
        # 19 relation classes, reported as strings.
        return [str(c) for c in range(19)]

#  ([train_sents,train_e1,train_e2],y)
 
batch_size = 64
use_gpu = True  # NOTE(review): assigned but never used in this listing

# Wrap the id matrices as datasets; only training data is shuffled.
train = MyDataset([[x,e1,e2,y] for x,e1,e2,y in zip(train_idx,train_e1_idx,train_e2_idx,train_yid)])
val = MyDataset([[x,e1,e2,y] for x,e1,e2,y in zip(val_idx,val_e1_idx,val_e2_idx,val_yid)])
test = MyDataset([(x,e1,e2,y) for x,e1,e2,y in zip(test_idx,test_e1_idx,test_e2_idx,test_yid)])
print(train[0])
train_loader = paddle.io.DataLoader(train, batch_size=batch_size, shuffle=True)
val_loader = paddle.io.DataLoader(val, batch_size=batch_size, shuffle=False)
test_loader = paddle.io.DataLoader(test, batch_size=batch_size, shuffle=False)
# print(train_loader[0])

输出结果如下图4所示:


二、网络构建

class BiLSTMWithCRF(nn.Layer):
    """Two-layer BiLSTM encoder with a linear-chain CRF decoder.

    NOTE(review): this class appears to be copied from a sequence-labeling
    (NER) example and is not used by the relation-extraction Model below.
    LinearChainCrf / ViterbiDecoder are provided by paddlenlp.layers.
    """

    def __init__(self,
                 emb_size,
                 hidden_size,
                 word_num,
                 label_num,
                 use_w2v_emb=False):
        super(BiLSTMWithCRF, self).__init__()
        self.word_emb = nn.Embedding(word_num, emb_size)
        # 'bidirectional' was stripped of quotes in the scraped listing.
        self.lstm = nn.LSTM(emb_size,
                            hidden_size,
                            num_layers=2,
                            direction='bidirectional')
        # +2 output slots for the CRF's BOS / EOS tags.
        self.fc = nn.Linear(hidden_size * 2, label_num + 2)
        self.crf = LinearChainCrf(label_num)
        self.decoder = ViterbiDecoder(self.crf.transitions)

    def forward(self, x, lens):
        """Return (emission scores, sequence lengths, Viterbi predictions)."""
        embs = self.word_emb(x)
        output, _ = self.lstm(embs)
        output = self.fc(output)
        _, pred = self.decoder(output, lens)
        return output, lens, pred

# Define the model network and its loss.
# NOTE(review): word_vocab and label_vocab are not defined anywhere in this
# listing — these two lines belong to the copied NER example and will raise
# NameError if executed as-is.
network = BiLSTMWithCRF(300, 300, len(word_vocab), len(label_vocab))
model = paddle.Model(network)

三、模型定义

定义网络结构后,需要配置优化器、损失函数、评价指标。


class Model(nn.Layer):
    """Relation classifier: BiLSTM over the sentence + two entity encoders.

    The sentence is encoded by a bidirectional LSTM; the head and tail
    entity character sequences are each encoded by a unidirectional LSTM.
    The sentence representation (forward state at the last step, backward
    state at the first step) is concatenated with both entity final states
    and classified into num_classes relations.

    NOTE(review): dropout_rate is accepted for interface compatibility but
    never applied in this implementation.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, dropout_rate,
                 fc_hidden_size, num_layers=2):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.emb = paddle.nn.Embedding(vocab_size, embedding_size)
        # 'bidirectional' was stripped of quotes in the scraped listing.
        self.lstm = nn.LSTM(embedding_size,
                            hidden_size,
                            num_layers=num_layers,
                            direction='bidirectional')
        self.lstm_e1 = nn.LSTM(embedding_size,
                               hidden_size,
                               num_layers=1)
        self.lstm_e2 = nn.LSTM(embedding_size,
                               hidden_size,
                               num_layers=1)
        # sentence (2 * hidden) + e1 (hidden) + e2 (hidden)
        self.fc = nn.Linear(hidden_size * 4, fc_hidden_size)
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)
        # self.loss = nn.CrossEntropyLoss()

    def forward(self, text, e1, e2, y=None):
        """Return unnormalized class logits of shape [batch, num_classes]."""
        emb_text = self.emb(text)
        emb_e1 = self.emb(e1)
        emb_e2 = self.emb(e2)
        # lstm output: [batch_size, time_steps, num_directions * hidden_size]
        r1, (_, _) = self.lstm(emb_text)
        r2, (_, _) = self.lstm_e1(emb_e1)
        r3, (_, _) = self.lstm_e2(emb_e2)
        # Forward direction's last step + backward direction's first step,
        # plus the final step of each entity encoder.
        f = paddle.concat([r1[:, -1, :self.hidden_size],
                           r1[:, 0, self.hidden_size:],
                           r2[:, -1, :],
                           r3[:, -1, :]], axis=-1)
        fc_out = paddle.tanh(self.fc(f))
        logits = self.output_layer(fc_out)
        # probs = F.softmax(logits, axis=-1)

        return logits



四、模型训练配置

# Hyper-parameters for the relation classifier.
embedding_size=128
hidden_size=256
dropout_rate=0.1
fc_hidden_size=128
num_layers=2
model= Model(vocab_size,embedding_size,hidden_size,dropout_rate,fc_hidden_size,num_layers)
# Adam with a fixed learning rate; cross-entropy over the relation classes.
optimizer = paddle.optimizer.Adam(
        parameters=model.parameters(), learning_rate=5e-4)

loss_func = paddle.nn.CrossEntropyLoss() 

五、训练模型

# Training loop: 5 epochs over the shuffled training loader, logging
# loss/accuracy every 30 steps and checkpointing after each epoch.
epoches = 5
steps = 0
total_loss = []
total_acc = []
Iters = []
for i in range(epoches):
    for data in train_loader:
        x, e1, e2, y = data
        steps += 1
        logits = model(x, e1, e2)
        pred = paddle.argmax(logits, axis=-1)
        # batch accuracy from the argmax predictions
        acc = sum(pred.numpy() == y.numpy()) / len(y)
        loss = loss_func(logits, y)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        if steps % 30 == 0:
            Iters.append(steps)
            total_loss.append(loss.numpy()[0])
            total_acc.append(acc)
            # NOTE(review): the format string was stripped of its quotes in
            # the scraped listing; reconstructed here.
            print('epo: {}, step: {}, loss is: {}, acc is: {}'
                  .format(i, steps, loss.numpy(), acc))

    # Save a checkpoint after every epoch (filename reconstructed).
    paddle.save(model.state_dict(), 'model_{}.pdparams'.format(i))


def draw_process(title, color, iters, data, label):
    """Render a single training curve: *data* versus *iters*.

    The curve color and its legend/axis label are passed in so the same
    helper serves both the loss and the accuracy plots.
    """
    plt.plot(iters, data, color=color, label=label)
    plt.title(title, fontsize=24)
    plt.xlabel("iter", fontsize=20)
    plt.ylabel(label, fontsize=20)
    plt.legend()
    plt.grid()
    plt.show()

    
# Plot the loss/accuracy curves collected during training.
# ("trainning" is a typo, but it is a runtime plot label, so it is preserved.)
draw_process("trainning loss","red",Iters,total_loss,"trainning loss")
draw_process("trainning acc","green",Iters,total_acc,"trainning acc")
      

输出结果如下图5、6、7所示:


六、验证模型


# Evaluate on the validation set: collect argmax predictions and compute
# plain accuracy against the gold labels.
model.eval()
# To evaluate a saved checkpoint instead, load its parameters first:
# model = Model(vocab_size,embedding_size,hidden_size,dropout_rate,fc_hidden_size,num_layers)
# params = paddle.load('model_9.pdparams')
# model.set_state_dict(params)
preds = []
y = []
for data in val_loader:
    x, e1, e2, y_ = data
    steps += 1  # NOTE(review): leftover from the training loop; has no effect here
    logits = model(x, e1, e2)
    pred = paddle.argmax(logits, axis=-1)
    y += list(y_.numpy())
    preds += list(pred.numpy())
cc = [1 if a == b else 0 for a, b in zip(preds, y)]
acc = sum(cc) / len(cc)
print('测试准确率为:', acc)

输出结果如下图8所示:


(七)、模型测试

# Run the whole test set in one forward pass and print a few samples
# with their gold and predicted relation labels.
model.eval()

x = paddle.to_tensor(test_idx)
e1 = paddle.to_tensor(test_e1_idx)
e2 = paddle.to_tensor(test_e2_idx)
y = paddle.to_tensor(test_yid)
logits = model(x, e1, e2)
pred = paddle.argmax(logits, axis=-1)
# test_sents, test_e1, test_e2, test_y hold the human-readable samples
for i in range(5):
    print('-' * 30)
    print('上下文:', ' '.join(test_sents[i]))
    print('实体1: ', test_e1[i])
    print('实体2: ', test_e2[i])
    # labels presumably look like "Relation-Name(e1,e2)"; keep only the
    # part before the parenthesis — TODO confirm against rel2id keys
    print('真实标签:', test_y[i].split('(')[0])
    print('预测标签:', id2rel[pred[i].numpy()[0]].split('(')[0])

输出结果如下图9所示:


总结

本系列文章内容为根据清华社出版的《自然语言处理实践》所作的相关笔记和感悟,其中代码均为基于百度飞桨开发,若有任何侵权和不妥之处,请私信于我,定积极配合处理,看到必回!!!

最后,引用本次活动的一句话,来作为文章的结语~( ̄▽ ̄~)~:

【**学习的最大理由是想摆脱平庸,早一天就多一份人生的精彩;迟一天就多一天平庸的困扰。**】

以上是关于自然语言处理(NLP)基于BiLSTM的关系抽取的主要内容,如果未能解决你的问题,请参考以下文章

自然语言处理之基于biLSTM的pytorch立场检测实现

《自然语言处理实战入门》 ---- NLP方向:面试笔试题集

NLP领域的ImageNet时代到来:词嵌入「已死」,语言模型当立

斯坦福Introduction to NLP:第十讲关系抽取

知识图谱关系抽取

自然语言处理(NLP)基于ERNIE语言模型的文本语义匹配