gensim中的word2vec

Posted

一骑绝尘

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了gensim中的word2vec相关的知识,希望对你有一定的参考价值。

安装gensim前需要先装好python、numpy、scipy,可以通过pip list检查是否已安装
开始安装gensim

    sudo pip install gensim

 

参考文档:http://www.jianshu.com/p/6d542ff65b1e
    http://kexue.fm/archives/4316/

文档http://www.jianshu.com/p/6d542ff65b1e上的两个python程序有错误, 我已经改正,内容见python文件

对文件编码格式处理
cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt
分词

python word_segment.py corpus.txt corpus_seg.txt



训练模型

python train_word2vec_model.py seg_title_keyword_abstracts.txt agriculture.model agriculture.vector
python train_word2vec_model.py seg_qa.txt qa.model qa.vector

我的训练模型中的参数也和文档不一样,见train_word2vec_model.py的26行
 model = Word2Vec(LineSentence(inp),size=400,window=5, sg=1, hs=1,min_count=5,workers=multiprocessing.cpu_count())

 

dir_seg.py文件

# -*- encoding:utf-8 -*-
import jieba  # 导入jieba模块
import re
import os

# Load the custom user dictionary, then remove the entry '延长' from jieba's vocabulary.
jieba.load_userdict("/home/hadoop/ext.dic")
jieba.del_word('延长')
import jieba.posseg as pseg

# Segment every file in a folder and collect the results in a single output file.
def splitSentence(fold, outputFile):
    """Segment each file under ``fold`` with jieba and append the result to ``outputFile``.

    Every input line is decoded as UTF-8 (undecodable bytes are dropped),
    stripped, cleaned with the punctuation/digit pattern below, cut into words
    by jieba and written as one space-separated UTF-8 line.

    Args:
        fold: Path of the directory whose files are segmented.  If it is not
            a directory, a message is printed and nothing else happens.
        outputFile: Path of the output file.  It is opened in append mode, so
            content that is already there is kept.
    """
    import io  # local import: unicode-aware open() that works on Python 2 and 3

    if not os.path.isdir(fold):
        print('不是一个文件夹,请检查!')
        return

    # NOTE(review): the unescaped ``:-【`` in the first character class is a
    # code-point range (U+003A..U+3010), so Latin letters and many symbols in
    # between are stripped too.  Kept exactly as in the original -- confirm the
    # intent before narrowing it.
    # The ``~@#`` run replaces a "[email protected]" placeholder (it looks like
    # an e-mail obfuscation artifact) whose ']' closed the second class early.
    junk = re.compile(
        u"[0-9\\s+.!/_,$%^*()?;;:-【】+\"']+"
        u"|[+——!,;:。?、~@#¥%……&*()]+")

    # Stop-word filtering existed only as commented-out code and is not applied.
    jieba.enable_parallel(4)  # parallel segmentation; enabling it once is enough

    # The output is opened once for the whole run instead of once per input file.
    with io.open(outputFile, 'a', encoding='utf-8') as fout:
        for name in os.listdir(fold):
            path = os.path.join(fold, name)
            # errors='ignore' mirrors the original decode('utf-8', 'ignore').
            with io.open(path, 'r', encoding='utf-8', errors='ignore') as fin:
                for eachLine in fin:
                    cleaned = junk.sub(u'', eachLine.strip())
                    words = jieba.cut(cleaned)
                    # One segmented line per input line, words separated by spaces.
                    fout.write(u' '.join(words).strip() + u'\n')
            print(name + ' ' + ' 分词结束!')
    print('分词完成!')


# Segment every file under the article folder into one output file.
splitSentence('/home/hadoop/article', '/home/hadoop/tt.txt')

 

word_segment.py文件

# -*- coding: utf-8 -*-
# word_segment.py用于语料分词

import logging
import os.path
import sys
import re
import jieba

# Python 2 only: reload(sys) brings back sys.setdefaultencoding (it is removed
# from the module at interpreter startup) so that the implicit str/unicode
# conversion codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Strip the <content> and </content> tags with a regular expression.
def reTest(content):
  """Return ``content`` with every ``<content>`` and ``</content>`` tag removed."""
  return re.sub('<content>|</content>', '', content)

if __name__ == '__main__':
  # Command-line entry point: segment <input> line by line into <output>.
  program = os.path.basename(sys.argv[0])
  logger = logging.getLogger(program)
  logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
  logging.root.setLevel(level=logging.INFO)
  logger.info("running %s" % ' '.join(sys.argv))

  # check and process input arguments
  if len(sys.argv) < 3:
    # This module has no docstring, so the original
    # ``globals()['__doc__'] % locals()`` raised TypeError (None % dict)
    # instead of printing help.  Print an explicit usage line instead.
    print("usage: python %s <input_corpus> <output_segmented>" % program)
    sys.exit(1)
  inp, outp = sys.argv[1:3]
  space = " "
  i = 0  # number of lines processed; stays 0 for an empty input file

  # ``with`` closes both files even if segmentation raises part-way through.
  with open(inp) as finput, open(outp, 'w') as foutput:
    for i, line in enumerate(finput, 1):
      line_seg = jieba.cut(reTest(line))
      foutput.write(space.join(line_seg))
      if i % 1000 == 0:
        logger.info("Saved " + str(i) + " articles_seg")

  logger.info("Finished Saved " + str(i) + " articles")



 

# -*- coding: utf-8 -*-
# train_word2vec_model.py用于训练模型

import logging
import os.path
import sys
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
  # Command-line entry point: train a Word2Vec model on a segmented corpus.
  program = os.path.basename(sys.argv[0])
  logger = logging.getLogger(program)

  logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
  logging.root.setLevel(level=logging.INFO)
  logger.info("running %s" % ' '.join(sys.argv))

  if len(sys.argv) < 4:
    # This module has no docstring, so the original
    # ``globals()['__doc__'] % locals()`` raised TypeError (None % dict)
    # instead of printing help.  Print an explicit usage line instead.
    print("usage: python %s <segmented_corpus> <model_out> <vector_out>" % program)
    sys.exit(1)

  # inp: one segmented sentence per line; outp: saved model; outp2: text vectors.
  inp, outp, outp2 = sys.argv[1:4]

  # Per the gensim Word2Vec documentation: size is the vector dimensionality,
  # sg=1 selects skip-gram, hs=1 hierarchical softmax, and min_count=5 drops
  # words seen fewer than five times.  One worker thread is used per CPU.
  # NOTE(review): gensim 4 renamed ``size`` to ``vector_size`` -- confirm the
  # installed version before upgrading.
  model = Word2Vec(LineSentence(inp), size=400, window=5, sg=1, hs=1,
                   min_count=5, workers=multiprocessing.cpu_count())

  model.save(outp)
  model.wv.save_word2vec_format(outp2, binary=False)

 

模型的运用(保存为simular.py,后面求关键词的脚本通过 from simular import similarmodel 导入):
# coding:utf-8
import sys
# Python 2 only: reload(sys) brings back sys.setdefaultencoding (it is removed
# from the module at interpreter startup) so that the implicit str/unicode
# conversion codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )
from gensim.models import Word2Vec
import logging,gensim,os
import pandas as pd

# Load the trained model (shared by similarmodel below)
model = Word2Vec.load('/home/hadoop/agriculture.model')

class similarmodel:
    """Looks up related words for a keyword in the module-level ``model``."""

    def get_simiar_keyword(self, word):
        """Return the words most similar to ``word`` as one string.

        Every similar word is followed by ';' (so the result looks like
        ``'a;b;'``).  An empty string is returned when the lookup fails
        (presumably e.g. when ``word`` is missing from the model vocabulary).

        Args:
            word: The keyword to look up via ``model.most_similar``.
        """
        try:
            neighbours = model.most_similar(word)
        except Exception:
            # Best-effort lookup: any failure yields an empty result.  Unlike
            # the original ``except BaseException``, KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return ''
        # Index [0] of each entry is the word; the rest of the entry is dropped.
        return ''.join(item[0] + ';' for item in neighbours)

# Demo: print the words related to '水稻'.
# NOTE: these are top-level statements, so they also run when this module is imported.
si = similarmodel()
print(si.get_simiar_keyword(u'水稻'))

 

求top105的关键词

# coding:utf-8

import xlsxwriter

import sys
import csv
import re
import collections as col

from simular import similarmodel



# Python 2 only: reload(sys) brings back sys.setdefaultencoding (it is removed
# from the module at interpreter startup) so that the implicit str/unicode
# conversion codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Extract the top keywords.
# Matches one or more consecutive characters in the range U+4E00..U+9FA5.
zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')


def contain_zh(word):
    """Return the first ``zh_pattern`` match found in ``word``, or None.

    The result is truthy exactly when ``word`` contains at least one character
    in the range U+4E00..U+9FA5.
    """
    return zh_pattern.search(word)


# Input CSV (expected to be UTF-8 encoded) and the xlsx workbook to produce.
filename = '/home/hadoop/caas_casdd_utf8.csv'
filedistination = '/home/hadoop/top100keyword.xlsx'


def getfromcsv(csvfile, txtfile):
    """Write the 105 most frequent Chinese keywords of a CSV file to an xlsx sheet.

    The ``keyword`` column of every row is split on the separator characters
    below; only pieces for which ``contain_zh`` finds a match are counted.
    The sheet gets a header row followed by one row per keyword holding the
    keyword, its count and the string returned by
    ``similarmodel.get_simiar_keyword`` for it.

    Args:
        csvfile: Path of the CSV file to read.  It must be UTF-8 encoded and
            have a ``keyword`` column.
        txtfile: Path of the xlsx workbook to create.
    """
    # The original ignored both parameters in favour of the module-level paths
    # and also opened the destination in text 'w' mode without writing to it;
    # the workbook is now written only by xlsxwriter, at ``txtfile``.
    splitter = re.compile(u'[;,;,;,/“”、。()()]')

    list_keyword = []
    with open(csvfile, 'r') as readf:
        for record in csv.DictReader(readf):
            temp = record['keyword']
            if isinstance(temp, bytes):
                # Python 2's csv module yields byte strings; decode them once here.
                temp = temp.decode('utf-8')
            list_keyword.extend(w for w in splitter.split(temp) if contain_zh(w))

    # Total number of keyword occurrences that were kept.
    print(len(list_keyword))

    top_keywords = col.Counter(list_keyword).most_common(105)

    workbook = xlsxwriter.Workbook(txtfile)  # create the workbook
    worksheet = workbook.add_worksheet()  # create the sheet

    # Header row: keyword / frequency / related words.
    worksheet.write('A1', u'关键词')
    worksheet.write('B1', u'词频')
    worksheet.write('C1', u'相关词')

    si = similarmodel()

    # Data rows start at row index 1, right below the header.
    for row, (keyword, count) in enumerate(top_keywords, 1):
        related = si.get_simiar_keyword(keyword)  # looked up once, used twice
        worksheet.write(row, 0, keyword)
        worksheet.write(row, 1, count)
        worksheet.write(row, 2, related)
        print(related)

    workbook.close()



# Run the extraction with the module-level input and output paths.
getfromcsv(filename, filedistination)

 

以上是关于gensim中的word2vec的主要内容,如果未能解决你的问题,请参考以下文章

使用 Gensim 获取三元组的问题

如何从 gensim 模型中的 Doc2Vec 相似度分数访问文档详细信息?

使用 gensim 了解 LDA 实现

gensim使用方法以及例子

使用 Gensim 获得 LDA 模型的最佳主题数量的最佳方法是啥?

gensim Doc2Vec vs tensorflow Doc2Vec