Word2Vec/fastText/Glove训练词向量
Posted ~无关风月~
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Word2VecfastTextGlove训练词向量相关的知识,希望对你有一定的参考价值。
Word2Vec
环境:
gensim 3.5.0
python 3.6.1
训练
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Name the logger after the actual script path. The original called
# os.path.basename on a hard-coded literal, which is a no-op; the
# conventional intent is sys.argv[0].
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# Input corpus: one pre-cleaned tweet per line; output: trained model
# in gensim's native save format.
inp = './data/cleaned_tweets_text_160W.csv'
outp = 'Word2vec_model/word2vec_model_160w_200d'
n_dim = 200  # word-vector dimensionality

# gensim 3.x API: `size` is the embedding dimensionality (renamed to
# `vector_size` in gensim 4.x). Words seen fewer than 5 times are dropped.
model = Word2Vec(LineSentence(inp), size=n_dim, window=5, min_count=5,
                 workers=multiprocessing.cpu_count())
model.save(outp)
使用
import gensim

# Reload the trained model from disk.
outp = 'Word2vec_model/word2vec_model_160w_200d'
model = gensim.models.Word2Vec.load(outp)
# Look up one word's 200-d vector through the KeyedVectors on
# `model.wv`; indexing the model directly (model['happy']) is
# deprecated in gensim 3.x and removed in 4.x.
model.wv['happy']
array([ -7.10430890e-02, -5.29273868e-01, 4.72357810e-01, 6.88659430e-01, -2.42118329e-01, -9.04537499e-01, 3.61087114e-01, -3.96869183e-01, -1.67573178e+00, ... 1.18314767e+00, -1.24723041e+00, 1.19374382e+00, 3.74429256e-01, 1.68333733e+00], dtype=float32)
# Top-10 nearest neighbours of "sad" by cosine similarity.
# most_similar() on the model object is deprecated in gensim 3.x;
# query through model.wv instead.
result = model.wv.most_similar("sad", topn=10)
for e in result:
    print(e[0], e[1])
sad 0.760986328125
upset 0.7119562029838562
sadd 0.7007699012756348
depressed 0.6737781763076782
bummed 0.6687890291213989
saad 0.6555933952331543
upsetting 0.6473105549812317
devastated 0.6356861591339111
disappointed 0.6215260624885559
heartbroken 0.5960309505462646
# Analogy queries: vec(woman) + vec(boy) - vec(man) should land near vec(girl).
# man - woman = king - queen
# man - woman = husband - wife
# man - woman = boy - girl
# (query via model.wv; calling most_similar on the model is deprecated)
model.wv.most_similar(['woman', 'boy'], ['man'], topn=1)
[('girl', 0.6189800500869751)]
# Odd-one-out: pick the word least similar to the rest of the list.
# doesnt_match on the model object is deprecated; use model.wv.
print(model.wv.doesnt_match("happy sad like desk".split()))
desk
fastText
环境:
gensim 3.5.0
python 3.6.1
安装fasttext
可见:https://github.com/facebookresearch/fastText/tree/master/python
$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ pip install .
训练
from gensim.models import FastText
from gensim.models.word2vec import LineSentence
import logging
import os.path
import sys

# Name the logger after the real script path; the original applied
# os.path.basename to a hard-coded literal, which is a no-op.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# Same corpus as the Word2Vec run: one cleaned tweet per line.
inp = './data/cleaned_tweets_text_160W.csv'
outp = 'fasttext_model/fasttext_model_160w_200d'
n_dim = 200  # word-vector dimensionality

# gensim 3.x FastText: `size` is the embedding dimensionality
# (renamed to `vector_size` in gensim 4.x).
model = FastText(LineSentence(inp), size=n_dim, window=5, min_count=5)
model.save(outp)
使用
from gensim.models import FastText

# Usage: load the previously saved model from disk.
outp = '../word-Vectorization/fasttext_model/fasttext_model_160w_200d'
model = FastText.load(outp)
# Vector for a single word, via model.wv — indexing the model object
# directly is deprecated in gensim 3.x. fastText can also compose
# vectors for out-of-vocabulary words from character n-grams.
model.wv['happy']
array([ 8.10116410e-01, 5.25403082e-01, -3.36705118e-01,
-4.42356855e-01, -2.04296446e+00, -9.93114054e-01,
…
1.20443761e+00, -3.73263896e-01, 1.15821147e+00,
-1.95465660e+00, 2.11104417e+00], dtype=float32)
# "queeen" is a misspelling/OOV token; fastText still finds neighbours
# through its character n-gram vectors. Query via model.wv (calling
# most_similar on the model object is deprecated in gensim 3.x).
result = model.wv.most_similar("queeen")
for e in result:
    print(e[0], e[1])
queen 0.8662827014923096
mcqueen 0.8575066924095154
queenie 0.8428698182106018
queens 0.7900538444519043
queer 0.7410856485366821
queezy 0.7156778573989868
queenstown 0.7068071365356445
quen 0.7057324647903442
queensland 0.7055833339691162
Glove
环境:
python 3.6.1
官方glove:https://github.com/stanfordnlp/GloVe
笔者使用:https://github.com/maciejkula/glove-python
安装:
pip install glove_python
训练
# Train GloVe vectors with the glove-python package (maciejkula/glove-python).
from __future__ import print_function
import argparse
import pprint
import gensim
from glove import Glove
from glove import Corpus
from gensim.models.word2vec import LineSentence
# Input corpus: one sentence per line, streamed lazily by LineSentence.
inp='./data/processed_160Msemeval-2016-2017-task3-QAText'
outp='glove_model/QAText_200d'
n_dim=200
# Pass 1: build the word<->id dictionary and the sparse word-word
# co-occurrence matrix with a symmetric context window of 5.
corpus_model = Corpus()
corpus_model.fit(LineSentence(inp), window=5)
#corpus_model.save('corpus.model')
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)
# Pass 2: fit 200-dimensional vectors on the co-occurrence matrix,
# 10 epochs, single-threaded (no_threads=1).
glove = Glove(no_components=n_dim, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10,
no_threads=1, verbose=True)
# Attach the dictionary so the model can later be queried by word string.
glove.add_dictionary(corpus_model.dictionary)
glove.save(outp)
使用
from glove import Glove
# Reload the saved GloVe model and ask for the 10 nearest neighbours
# of "happiness" (glove-python uses `number=`, not `topn=`).
outp='../word-Vectorization/glove_model/glove_model_160w_200d'
glove = Glove.load(outp)
glove.most_similar('happiness', number=10)
[('optimism', 0.88474224538009472),
('technology', 0.87399374238355365),
('photography', 0.87293168487727013),
('jquery', 0.87160586331482937),
('excitement', 0.87015550639464156),
('mailplane', 0.86709197956363315),
('secrets', 0.86616698912523094),
('laughter', 0.86593359775060097),
('records', 0.86521431383969472)]
# Vector for a single word: map the word to its row index through the
# dictionary, then index into the embedding matrix.
glove.word_vectors[glove.dictionary['happy']]
array([ 0.1824374 , -0.15493986, -0.23131742, -0.20251903, -0.25899053,
0.16043589, -0.11017494, -0.15413852, 0.12485044, 0.28871841,
…
0.14323183, -0.21197602, -0.18841062, 0.32888953, -0.32943953,
0.15334943, -0.09995708, 0.10678763, 0.12507708, -0.26995188,
0.17373759, -0.17477675, 0.16042781, -0.3823496 , -0.21795925])
参考:
https://www.zybuluo.com/hanxiaoyang/note/472184?tdsourcetag=s_pctim_aiomsg
https://blog.csdn.net/sinat_26917383/article/details/83041424
https://blog.csdn.net/sinat_26917383/article/details/83029140
以上是关于Word2Vec/fastText/Glove训练词向量的主要内容,如果未能解决你的问题,请参考以下文章
Machine Translation无监督神经机器翻译论述