Chinese Sentiment Analysis with GloVe + LSTM

Posted by xyli09


I recently experimented with Chinese sentiment analysis.

The main ingredients are GloVe word vectors and an LSTM. The corpus is the ChnSentiCorp Chinese hotel review dataset.

1. Train GloVe to obtain word vectors (300-dimensional here), using jieba for word segmentation and the Chinese Wikipedia as the training corpus (see the corpus-preparation sketch after this list).

2. Clean the hotel review corpus and segment it with jieba, then convert the segmented words into their word-vector representations.

3. Train an LSTM network on the vectorized reviews.

The final accuracy is around 91%.
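As a sketch of step 1 (not shown in the original post), here is a minimal way to prepare a segmented corpus for GloVe training. It assumes a plain-text Chinese Wikipedia dump at wiki_zhs.txt (a hypothetical path) and writes one jieba-segmented, space-separated line per input line; the vectors themselves would then be trained with the standard Stanford GloVe toolkit on the segmented output.

# Hypothetical corpus-preparation sketch; file names are assumptions.
import jieba

with open('wiki_zhs.txt', encoding='utf-8') as fin, \
        open('wiki_zhs_seg.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        if line:
            # GloVe expects whitespace-separated tokens, one document per line
            fout.write(' '.join(jieba.cut(line)) + '\n')
# Then run the GloVe toolkit on wiki_zhs_seg.txt with a vector size of 300
# to produce zhs_wiki_glove.vectors.300d.txt.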

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018

@author: xyli
处理酒店评价语料数据,
分词,并转化为Glove向量
"""
import os
import re
import jieba
import numpy as np

from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential
# from Attention_layer import Attention_layer  # custom layer, only needed for the commented-out experiments below


def loadGLoveModel(filename):
    """Load GloVe vectors: each line is a word followed by its float components."""
    embeddings_index = {}
    f = open(filename, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

def word2Glovec(List, model):
    """Map each token to its GloVe vector; unknown tokens get the zero vector."""
    vec = []
    insert = np.zeros(300, dtype='float32')  # 300 is the vector dimensionality
    for w in List:
        v = model.get(w)
        if v is None:
            vec.append(insert)
        else:
            vec.append(v)
    return vec

def clean_str(string):
    """
    String cleaning for the dataset.
    The original stripped backslashes, quotes, line breaks, and a set of
    English and Chinese punctuation characters one substitution at a time;
    the exact Chinese characters were lost in transcription, so a
    representative set is used here.
    """
    string = re.sub(r"[\\\"]", "", string)   # backslashes and quotes
    string = re.sub(r"[\r\n]", "", string)   # line breaks
    string = re.sub(r"[,.()]", "", string)   # English punctuation
    string = re.sub(r"[,。!?、:;“”‘’()《》]", "", string)  # Chinese punctuation
    return string.strip()

def fitList(List, n):
    """Pad or truncate a token list to exactly n tokens."""
    L = len(List)
    # Padding token; word2Glovec maps it to the zero vector if it is
    # absent from the GloVe vocabulary.
    insert = '!'
    if L < n:
        List += [insert] * (n - L)
    elif L > n:
        List = List[:n]
    return List

def readData(filename):
    """Read one review file, clean it, and segment it with jieba."""
    with open(filename, 'rb') as f:
        data = f.read()
        data = data.decode('gb18030', 'ignore')
        data = clean_str(data)
        seg_list = jieba.cut(data)  # accurate mode (jieba's default)
    segList = []
    for s in seg_list:
        s = clean_str(s)
        segList.append(s)
    return segList
        
def loadData():
    """Load negative and positive reviews with one-hot labels."""
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    commentList = []
    rootdir = Corpus_DIR + DIR[0]
    filelist = os.listdir(rootdir)  # everything in the directory
    labelList = [[0.0, 1.0] for i in range(0, len(filelist))]  # negative label
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)

    rootdir = Corpus_DIR + DIR[1]
    filelist = os.listdir(rootdir)  # everything in the directory
    labelList2 = [[1.0, 0.0] for i in range(0, len(filelist))]  # positive label
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)
    labelList += labelList2
    return commentList, labelList

if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')  # load the GloVe vectors
    countList = []
    commentVecList = []
    n = 100  # each review is padded/truncated to 100 tokens
    for c in List:
        countList.append(len(c))  # review lengths, kept for inspection
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2

    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)
    data = commentVecList[indices]
    labels = labelList[indices]

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
#    model.add(Activation('relu'))  # activation layer
#    model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
#    model.add(Attention_layer())
#    model.add(Activation('relu'))  # activation layer
    model.add(Dropout(0.3))  # randomly drop units during training
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))  # randomly drop units during training
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)
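For completeness, here is a hedged sketch (not in the original post) of scoring a new review with the trained network. It reuses clean_str, fitList, word2Glovec, gloveModel, n, and model from the script above; the sample sentence is purely illustrative.

# Hypothetical inference sketch; assumes the training script above has run.
sample = '房间很干净,服务也很好'  # illustrative review text
tokens = [clean_str(t) for t in jieba.cut(clean_str(sample))]
x = np.array([word2Glovec(fitList(tokens, n), gloveModel)])
# Output order follows the label encoding above: [P(positive), P(negative)]
print(model.predict(x))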
   
    

This post is still a work in progress...
