安装gensim前需要先安装python、numpy和scipy,可通过pip list检查是否已安装
开始安装gensim
sudo pip install gensim
参考文档:http://www.jianshu.com/p/6d542ff65b1e
http://kexue.fm/archives/4316/
文档http://www.jianshu.com/p/6d542ff65b1e上的两个python程序有错误, 我已经改正,内容见python文件
对文件编码格式处理
cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>" > corpus.txt
分词
python word_segment.py corpus.txt corpus_seg.txt
训练模型
python train_word2vec_model.py seg_title_keyword_abstracts.txt agriculture.model agriculture.vector
python train_word2vec_model.py seg_qa.txt qa.model qa.vector
我的训练模型中的参数也和文档不一样,见train_word2vec_model.py的26行
model = Word2Vec(LineSentence(inp),size=400,window=5, sg=1, hs=1,min_count=5,workers=multiprocessing.cpu_count())
dir_seg.py文件
# -*- encoding:utf-8 -*-
import jieba # 导入jieba模块
import re
import os
# Load the custom user dictionary, then drop one entry from jieba's vocabulary
# so that it is no longer emitted as a single token.
jieba.load_userdict("/home/hadoop/ext.dic")
jieba.del_word('延长')
import jieba.posseg as pseg
#对文件夹下的所有文件进行分词,分词结果存入一个文件中
def splitSentence(fold, outputFile):
    """Segment every file in a folder with jieba and append the result to one file.

    Written for Python 2: each input line is read as a byte string, stripped,
    decoded from UTF-8 (undecodable bytes are dropped), cleaned of digits and
    punctuation, segmented with jieba, and appended to ``outputFile`` as one
    line of space-separated tokens encoded as UTF-8.

    Stop-word filtering is not applied; every token jieba yields is kept.

    Args:
        fold: Path of the folder whose files are segmented.
        outputFile: Path of the file the segmented lines are appended to.

    Returns:
        None. When ``fold`` is not a directory a message is printed and
        nothing is written.
    """
    if not os.path.isdir(fold):
        print('不是一个文件夹,请检查!')
        return

    # Characters removed before segmentation, compiled once for all lines.
    # - "~@" replaces an e-mail-obfuscation placeholder found in the scraped
    #   copy of this pattern; presumably the original text -- verify.
    # - The full-width forms at the end of the second class were presumably
    #   lost to ASCII normalisation in the scraped copy; they are added back
    #   next to the ASCII forms so nothing previously removed is now kept.
    # NOTE(review): inside the first class ":-【" is a character RANGE
    # (U+003A..U+3010), so ASCII letters are removed as well. Kept as-is to
    # preserve the existing corpus; confirm whether that is intended.
    noise = re.compile(
        u"[0-9\\s+\\.\\!\\/_,$%^*()?;;:-【】+\"']+"
        u"|[+——!,;:。?、~@#¥%……&*()!,;:?()]+"
    )

    # Parallel segmentation with 4 worker processes; enabled once instead of
    # once per input file.
    jieba.enable_parallel(4)

    # Append mode: results of earlier runs in the same file are preserved.
    with open(outputFile, 'a') as fout:
        for f in os.listdir(fold):
            path = os.path.join(fold, f)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text; skip them.
                continue
            with open(path, 'r') as fin:
                for eachLine in fin:
                    # Trim surrounding whitespace, then work in unicode.
                    line = eachLine.strip().decode('utf-8', 'ignore')
                    words = jieba.cut(noise.sub(u"", line))
                    segmented = u' '.join(words).strip()
                    fout.write(segmented.encode('utf-8') + '\n')
            print(f + '  分词结束!')
    print('分词完成!')
# Segment every file under the article folder into a single output file.
splitSentence('/home/hadoop/article', '/home/hadoop/tt.txt')
word_segment.py文件
# -*- coding: utf-8 -*-
# word_segment.py用于语料分词
import logging
import os.path
import sys
import re
import jieba
reload(sys)
sys.setdefaultencoding( "utf-8" )
# 先用正则将<content>和</content>去掉
def reTest(content):
    """Return *content* with every ``<content>`` and ``</content>`` tag removed.

    Args:
        content: One line of the corpus.

    Returns:
        The same text without the opening and closing ``content`` tags; all
        other characters (including a trailing newline) are left untouched.
    """
    return re.sub('<content>|</content>', '', content)
if __name__ == '__main__':
    # Script entry point: segment <input corpus> line by line with jieba and
    # write the space-joined tokens to <output file>.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        # This module has no docstring, so formatting globals()['__doc__']
        # raised TypeError instead of showing help; print an explicit usage.
        print("usage: python %s <input_corpus> <output_segmented>" % program)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    i = 0
    # Both files are closed even if segmentation fails part-way through.
    with open(inp) as finput, open(outp, 'w') as foutput:
        for line in finput:
            line_seg = jieba.cut(reTest(line))
            # The line's own newline is not stripped beforehand, so each
            # input line stays on its own output line.
            foutput.write(' '.join(line_seg))
            i += 1
            if i % 1000 == 0:
                logger.info("Saved " + str(i) + " articles_seg")
    logger.info("Finished Saved " + str(i) + " articles")
train_word2vec_model.py文件
# -*- coding: utf-8 -*-
# train_word2vec_model.py用于训练模型
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
if __name__ == '__main__':
    # Script entry point: train a Word2Vec model on a segmented corpus (one
    # sentence per line, tokens separated by spaces) and save both the model
    # and its vectors in word2vec text format.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 4:
        # This module has no docstring, so formatting globals()['__doc__']
        # raised TypeError instead of showing help; print an explicit usage.
        print("usage: python %s <segmented_corpus> <model_out> <vector_out>"
              % program)
        sys.exit(1)
    inp, outp, outp2 = sys.argv[1:4]

    # Skip-gram (sg=1) with hierarchical softmax (hs=1), 400-dimensional
    # vectors, context window 5, ignoring words seen fewer than 5 times.
    # NOTE(review): `size=` is the spelling used by gensim releases before
    # 4.0 (later renamed `vector_size`); confirm the installed version.
    model = Word2Vec(
        LineSentence(inp),
        size=400,
        window=5,
        sg=1,
        hs=1,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    model.save(outp)
    model.wv.save_word2vec_format(outp2, binary=False)
模型的运用(simular.py文件,后面求关键词的脚本会从该文件导入similarmodel):
# coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
from gensim.models import Word2Vec
import logging,gensim,os
import pandas as pd
#模型的加载
# Trained model shared by every similarmodel lookup; loaded once at import.
model = Word2Vec.load('/home/hadoop/agriculture.model')
class similarmodel:
    """Look up related words in the module-level Word2Vec ``model``."""

    def get_simiar_keyword(self, word):
        """Return the words most similar to *word* as one ';'-terminated string.

        Args:
            word: The query word.

        Returns:
            Each similar word followed by ``';'`` (for example ``'a;b;'``),
            in the order the model returns them, or ``''`` when the lookup
            fails (for instance because the word is not in the vocabulary).
        """
        try:
            neighbours = model.most_similar(word)
        except Exception:
            # Best-effort lookup: any failure yields an empty result, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return ''
        # Each entry is a (word, score) pair; only the word is kept.
        return ''.join(entry[0] + ';' for entry in neighbours)
if __name__ == '__main__':
    # Demo lookup. Guarded so that it runs only when this file is executed
    # directly, not when similarmodel is imported by another script.
    si = similarmodel()
    print(si.get_simiar_keyword(u'水稻'))
求top105的关键词
# coding:utf-8
import xlsxwriter
import sys
import csv
import re
import collections as col
from simular import similarmodel
reload(sys)
sys.setdefaultencoding( "utf-8" )
#提取top100的关键词
# One or more consecutive characters in the range U+4E00..U+9FA5.
zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')


def contain_zh(word):
    """Search *word* for a run of Chinese characters.

    Args:
        word: Unicode text to inspect.

    Returns:
        The match object for the first run of characters in
        U+4E00..U+9FA5, or ``None`` when *word* contains none. The result is
        therefore truthy exactly when *word* contains Chinese characters.
    """
    return zh_pattern.search(word)
# Input CSV (read as UTF-8) and the Excel workbook that receives the result.
filename = '/home/hadoop/caas_casdd_utf8.csv'
filedistination = '/home/hadoop/top100keyword.xlsx'
def getfromcsv(csvfile, txtfile):
    """Write the 105 most frequent Chinese keywords of a CSV to an Excel file.

    Reads the ``keyword`` column of ``csvfile``, splits every cell on
    punctuation separators, keeps the pieces that contain Chinese characters,
    counts them, and writes the 105 most common ones to the workbook
    ``txtfile`` with three columns: keyword, frequency, and the related words
    returned by ``similarmodel.get_simiar_keyword``.

    Written for Python 2: cells are byte strings decoded from UTF-8.

    Args:
        csvfile: Path of the UTF-8 encoded CSV file; it must have a header
            row containing a ``keyword`` column.
        txtfile: Path of the ``.xlsx`` workbook to create.

    Returns:
        None. The number of collected keywords and each keyword's related
        words are printed as progress output.
    """
    # Separators between keywords inside one cell. The full-width forms were
    # presumably lost to ASCII normalisation in the scraped copy of this
    # pattern (it listed the ASCII forms several times); both are accepted.
    separators = re.compile(u'[;,;,/“”、。()()]')

    keywords = []
    with open(csvfile, 'r') as readf:
        for record in csv.DictReader(readf):
            cell = record['keyword'].decode('utf-8')
            for piece in separators.split(cell):
                if contain_zh(piece):
                    keywords.append(piece)
    print(len(keywords))

    top100 = col.Counter(keywords).most_common(105)

    # The workbook is created by xlsxwriter alone; no separate text handle is
    # opened on the same path.
    workbook = xlsxwriter.Workbook(txtfile)
    worksheet = workbook.add_worksheet()
    worksheet.write('A1', u'关键词')
    worksheet.write('B1', u'词频')
    worksheet.write('C1', u'相关词')

    si = similarmodel()
    # Row 0 holds the header, so data rows start at index 1.
    for row, (keyword, count) in enumerate(top100, 1):
        # Query the model once and reuse the answer for sheet and console.
        related = si.get_simiar_keyword(keyword)
        worksheet.write(row, 0, keyword)
        worksheet.write(row, 1, count)
        worksheet.write(row, 2, related)
        print(related)
    workbook.close()
getfromcsv(filename, filedistination)