python: converting separate data files into a single dataset, getting vectors for single words or noun phrases, and getting vectors from context
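The script below builds a binary dataset for pronoun resolution: for every coreference chain it pairs the chain's noun phrase with the context vector of each of the chain's pronouns (label 1), and pairs a noun from a randomly chosen chain with the same context (label 0). Judging from the code, the input files in data_train are tab-separated, with column 3 holding the word form, column 5 comma-separated morphological tags (OpenCorpora-style: NOUN, NPRO, ADJF, ...), column 6 the coreference chain id ("-" when absent), and column 7 the mention id; a pre-trained 300-dimensional word2vec model is loaded with gensim.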
import os
import numpy as np
import pandas as pd
import codecs
from pathlib import Path
from gensim import models
import random
class DataMerger():
    # pre-trained 300-dimensional word2vec model
    word_vectors = models.Word2Vec.load("word2vec.model")
    base_path = Path(__file__).parent
    data_path = (base_path / "data_train").resolve()
    WINDOW = 3    # number of token lines before/after the pronoun
    CONTENTS = [] # all lines of the file currently being processed
    NOUNS = []    # noun-phrase vectors
    NPROS = []    # pronoun context vectors
    LABELS = []   # 1 = noun from the same chain, 0 = noun from a random chain
    UNKS = 0      # words missing from the w2v vocabulary
    # returns the sum of vectors for content words within the given window
    def getContextVector(self, npro, window):
        pos = npro[0]  # index of the pronoun's line in CONTENTS
        found = 0
        vector = np.zeros(shape=300)
        # scan a symmetric window around the pronoun, skipping the pronoun itself
        for i in range(pos - window, pos + window + 1):
            if i >= len(self.CONTENTS):
                break
            if i == pos or i < 0:
                continue
            line = self.CONTENTS[i]
            if line == "":        # sentence boundary
                if i < pos:
                    continue      # before the pronoun: keep scanning
                break             # after the pronoun: stop
            if len(line) < 2:
                continue
            if line.split("\t")[5].split(",")[0] not in ["NOUN", "ADJF", "VERB", "ADVB"]:
                continue          # only content words contribute to the context
            found += 1
            word = line.split("\t")[3].lower()
            if len(word) < 2 and word != "я":
                continue
            if word in self.word_vectors.wv.vocab:
                vector += self.word_vectors.wv[word]
            else:
                self.UNKS += 1
                print("------CONTEXT------")
                print(word)
        if found == 0:
            return None
        return vector
    # returns the summed w2v vector for a single word or a noun phrase
    def getWordVector(self, phrase):
        found = 0
        vector = np.zeros(shape=300)
        for word in phrase.split(" "):
            if len(word) < 2 and word != "я":
                continue
            word = word.lower()
            if word in self.word_vectors.wv.vocab:
                found += 1
                vector += self.word_vectors.wv[word]
            else:
                self.UNKS += 1
                print("------WORD------")
                print(word)
        if found == 0:
            return None
        return vector
    def createDataset(self):
        for filename in os.listdir(self.data_path):
            file_path = (self.data_path / filename).resolve()
            with codecs.open(file_path, "rb", encoding="utf-8", errors="ignore") as text:
                contents = text.read()
            self.CONTENTS = contents.split("\n")
            # keep each line's index into CONTENTS so context lookups stay valid
            chained_lines = [(idx, line) for idx, line in enumerate(self.CONTENTS)
                             if line.strip() and line.split("\t")[6] != "-"]  # chain column
            chains = set()
            for _, line in chained_lines:
                for ch in line.split("\t")[6].split(","):
                    chains.add(ch)
            for chain in chains:
                noun_phrase = ""
                random_noun = ""
                cur_mention = ""
                npros = []
                # all lines for the given chain (a line may belong to several chains)
                lines = [(idx, line) for idx, line in chained_lines
                         if chain in line.split("\t")[6].split(",")]
                # a random chain supplies negative examples
                # (it may occasionally coincide with the current chain)
                random_chain = random.choice(list(chains))
                random_lines = [(idx, line) for idx, line in chained_lines
                                if random_chain in line.split("\t")[6].split(",")]
                for idx, line in lines:
                    word = line.split("\t")[3]
                    mention = line.split("\t")[7]
                    morph = line.split("\t")[5].split(",")[0]
                    if morph == "NOUN":
                        if mention == cur_mention:
                            noun_phrase += " " + word
                        if noun_phrase == "":
                            noun_phrase = word
                            cur_mention = mention
                    elif morph == "NPRO":
                        npros.append((idx, word))
                cur_mention = ""
                for idx, line in random_lines:
                    word = line.split("\t")[3]
                    mention = line.split("\t")[7]
                    morph = line.split("\t")[5].split(",")[0]
                    if morph == "NOUN":
                        if mention == cur_mention:
                            random_noun += " " + word
                        if random_noun == "":
                            random_noun = word
                            cur_mention = mention
                if npros:
                    noun_vec = self.getWordVector(noun_phrase)
                    rand_vec = self.getWordVector(random_noun)
                    for npro in npros:
                        npro_vec = self.getContextVector(npro, self.WINDOW)
                        if noun_vec is not None and npro_vec is not None:
                            self.NOUNS.append(noun_vec)
                            self.NPROS.append(npro_vec)
                            self.LABELS.append(1)  # positive: noun from the same chain
                        if rand_vec is not None and npro_vec is not None:
                            self.NOUNS.append(rand_vec)
                            self.NPROS.append(npro_vec)
                            self.LABELS.append(0)  # negative: noun from a random chain
        print(self.UNKS)
    def createCSV(self):
        # note: pandas writes the numpy vectors as their string representation,
        # so reading the CSV back requires re-parsing them into arrays
        df = pd.DataFrame({"noun": self.NOUNS, "npro": self.NPROS, "label": self.LABELS})
        df.to_csv("dataset.csv", index=False)
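A minimal usage sketch, assuming a trained word2vec.model and a data_train/ directory of annotated files sit next to the script (neither is shown in the post; the merger name is just for illustration):

if __name__ == "__main__":
    merger = DataMerger()
    merger.createDataset()  # walks data_train/ and fills NOUNS, NPROS, LABELS
    merger.createCSV()      # writes dataset.csv next to the script
    print(len(merger.LABELS), "examples written")

Because the vectors end up stringified in the CSV, saving the three lists with numpy's np.savez would make reloading simpler; the CSV route is kept here because it is what the original script does.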