python: convert separate data files into a single dataset, getting vectors for single words or noun phrases and vectors from context

This post presents Python code that converts separate data files into a single dataset, getting word2vec vectors for single words or noun phrases and context vectors for pronouns; hopefully it is of some reference value.
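
Judging from the column indexes used in the code below (this layout is an assumption reconstructed from the code, not documented in the original post), the input is a data_train directory of tab-separated files with one token per line: the word form in column 3, the morphological tags in column 5 (the first tag being the part of speech, e.g. NOUN or NPRO), comma-separated coreference chain ids in column 6 ("-" for tokens outside any chain), and a mention id in column 7. Empty lines apparently act as boundaries for the context window. A pre-trained 300-dimensional gensim word2vec model is expected in word2vec.model.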

import os
import numpy as np
import pandas as pd
import codecs
from pathlib import Path
from gensim import models
import random


class DataMerger:

    word_vectors = models.Word2Vec.load("word2vec.model")
    base_path = Path(__file__).parent
    data_path = (base_path / "data_train").resolve()
    WINDOW = 3
    CONTENTS = []
    NOUNS = []
    NPROS = []
    LABELS = []
    UNKS = 0
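    # note: these are class-level attributes shared by every DataMerger instance;
    # the word2vec model is loaded once, when the class is first defined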

    # returns the sum of vectors for content words within the given window
    # around the pronoun; npro is a (line_index, word) tuple
    def getContextVector(self, npro, window):
        pos = npro[0]
        found = 0
        vector = np.zeros(shape=300)  # np.ndarray(shape=300) would leave the memory uninitialized
        for i in range(pos - window, pos + window + 1):
            if i == pos or i < 0:
                continue
            if i >= len(self.CONTENTS):
                break
            line = self.CONTENTS[i]
            if line == "":
                if i < pos:
                    continue  # empty line before the pronoun: keep scanning
                break         # empty line after the pronoun: stop at the boundary
            if len(line) < 2:
                continue
            # keep only content words: nouns, adjectives, verbs, adverbs
            if line.split("\t")[5].split(",")[0] not in ["NOUN", "ADJF", "VERB", "ADVB"]:
                continue
            word = line.split("\t")[3].lower()
            if len(word) < 2 and word != "я":  # skip one-letter tokens except "я"
                continue
            if word in self.word_vectors.wv.vocab:
                found += 1
                vector += self.word_vectors.wv[word]
            else:
                self.UNKS += 1
                print("------CONTEXT------")
                print(word)
        if found == 0:
            return None
        return vector
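    # e.g. getContextVector((12, "он"), 3) sums the vectors of content words on
    # file lines 9..15, skipping line 12 itself (hypothetical example)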
            
    # returns the summed w2v vector for a single word or a noun phrase
    def getWordVector(self, phrase):
        found = 0
        vector = np.zeros(shape=300)
        for word in phrase.split(" "):
            word = word.lower()
            if len(word) < 2 and word != "я":  # skip one-letter tokens except "я"
                continue
            if word in self.word_vectors.wv.vocab:
                found += 1
                vector += self.word_vectors.wv[word]
            else:
                self.UNKS += 1
                print("------WORD------")
                print(word)
        if found == 0:
            return None
        return vector
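    # e.g. getWordVector("синий кот") returns the sum of the two word vectors,
    # or None if neither word is in the vocabulary (hypothetical phrase)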
        
    def createDataset(self):
        for filename in os.listdir(self.data_path):
            file_path = (self.data_path / filename).resolve()
            with codecs.open(file_path, "r", encoding="utf-8", errors="ignore") as text:
                contents = text.read()
                self.CONTENTS = contents.split("\n")
                # keep only lines that belong to a coreference chain (column 6)
                chained_lines = [line for line in self.CONTENTS
                                 if line.strip() and line.split("\t")[6] != "-"]

                chains = set()    

                for line in chained_lines:
                    chain = line.split("\t")[6].split(",")
                    for ch in chain:
                        chains.add(ch)

                for chain in chains:
                    noun_phrase = ""
                    random_noun = ""
                    cur_mention = ""
                    npros = []
                    # all lines for the given chain (column 6 may list several chain ids)
                    lines = [line for line in chained_lines
                             if chain in line.split("\t")[6].split(",")]
                    filtered_lines = []
                    # chain ids are strings, so sample an id from the set instead of
                    # comparing against a random integer (which would never match);
                    # the sampled chain may occasionally coincide with the current one
                    random_chain = random.choice(list(chains))
                    random_lines = [line for line in chained_lines
                                    if random_chain in line.split("\t")[6].split(",")]

                    for line in lines:
                        word = line.split("\t")[3]
                        mention = line.split("\t")[7]
                        morph = line.split("\t")[5].split(",")[0]
                        if morph == "NOUN":
                            if noun_phrase == "":
                                # first noun of the chain starts the noun phrase
                                noun_phrase = word
                                cur_mention = mention
                                filtered_lines.append(line)
                            elif mention == cur_mention:
                                # same mention: extend the noun phrase
                                noun_phrase += " " + word
                                filtered_lines.append(line)
                        elif morph == "NPRO":
                            # store the pronoun's index in the whole file (self.CONTENTS),
                            # since getContextVector indexes into self.CONTENTS, not into
                            # this filtered list; .index returns the first matching line
                            npros.append((self.CONTENTS.index(line), word))
                            filtered_lines.append(line)

                    cur_mention = ""  # reset: it still holds the value from the loop above
                    for line in random_lines:
                        word = line.split("\t")[3]
                        mention = line.split("\t")[7]
                        morph = line.split("\t")[5].split(",")[0]
                        if morph == "NOUN":
                            if random_noun == "":  # was: noun_phrase == "", a copy-paste slip
                                random_noun = word
                                cur_mention = mention
                                filtered_lines.append(line)
                            elif mention == cur_mention:
                                random_noun += " " + word
                                filtered_lines.append(line)

                    if len(npros) > 0:
                        noun_vec = self.getWordVector(noun_phrase)
                        rand_vec = self.getWordVector(random_noun)
                        for npro in npros:
                            npro_vec = self.getContextVector(npro, self.WINDOW)
                            # positive example: pronoun paired with its own chain's noun phrase
                            if noun_vec is not None and npro_vec is not None:
                                self.NOUNS.append(noun_vec)
                                self.NPROS.append(npro_vec)
                                self.LABELS.append(1)
                            # negative example: pronoun paired with a random chain's noun phrase
                            if rand_vec is not None and npro_vec is not None:
                                self.NOUNS.append(rand_vec)
                                self.NPROS.append(npro_vec)
                                self.LABELS.append(0)
                                
        print(self.UNKS)  # total number of out-of-vocabulary tokens encountered
        
    def createCSV(self):
        # note: pandas serializes the 300-dimensional numpy vectors as strings here;
        # a binary format (e.g. np.savez) would round-trip them more faithfully
        df = pd.DataFrame({"noun": self.NOUNS, "npro": self.NPROS, "label": self.LABELS})
        df.to_csv("dataset.csv", index=False)
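
A minimal driver, as a sketch: assuming the script sits next to word2vec.model and the data_train directory described above, the whole pipeline is two calls.

if __name__ == "__main__":
    merger = DataMerger()
    merger.createDataset()  # walks data_train, fills NOUNS / NPROS / LABELS
    merger.createCSV()      # writes dataset.csv to the current directory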

