Why spaCy forgets old training data and how to fix it


I am trying to train a spaCy model for NER. I have a dataset of 2,940 rows and trained a base model on it, which I call current_model. I then received 10 more datasets, each between 200 and 530 rows, so I loaded current_model with spacy.load("current_model") and trained on each of those datasets in turn. When I run predictions on test data, the model recognizes the entities from the new datasets but seems to have forgotten the entities from the oldest dataset. I did it this way to reduce training time. Please see my code below for what I have tried.

Base model training code


import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse  # needed for the revision / fine-tune examples below
import random
from pathlib import Path
from spacy import displacy
import re
import time  # used to time the training loop
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from cytoolz import partition_all
import os
from os import path
import shutil
import json


df = pd.read_csv("new_annotations/dataset_transfer_learning1.csv")


def populate_train_data(df):
    """Turn the HTML-annotated rows into (text, {"entities": [...]}) tuples."""
    train_data = []
    i = 0
    for d_index, row in df.iterrows():
        print(row["annotations"])
        content = row["annotations"].replace("\\n", "\n").replace("\n", " ")
        content = re.sub(r"(?<=[:])(?=[^\s])", r" ", content)

        # Find tags and entities and store the character offsets in an entity list
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text()
        entities = []
        for tag in soup.find_all():
            if tag.string is None:
                # failing silently for an invalid tag
                print(f'Tagging is invalid: {row["_id"]}, {tag.name}, on row {i + 2}, skipping..')
                continue

            tag_index = content.split(str(tag))[0].count(tag.string)
            try:
                for index, match in enumerate(re.finditer(tag.string.replace("*", " "), text)):
                    if index == tag_index:
                        entities.append((match.start(), match.end(), tag.name))
            except Exception as e:
                print(e, f"at line no {i + 2}")
                continue
        i += 1
        if entities:
            train_data.append((text, {"entities": entities}))
    return train_data


def train(training_data, old_training_data=None, model_name=None):
    nlp = None
    pretrained_weights = Path('weights/model999.bin')
    if model_name is not None:
        nlp = spacy.load(model_name, weights=pretrained_weights)
    else:
        print("no model specified, using default model")
        nlp = spacy.load("en_core_web_sm")

    if "ner" not in nlp.pipe_names:
        print("there is no ner, creating ner")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        print("there is ner")
        ner = nlp.get_pipe("ner")

    # register every label that occurs in the new training data
    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    start_time = time.time()
    if model_name is not None:
        # nlp.resume_training()
        # TRAINING_DATA = populate_train_data(pd.read_csv(old_training_data))
        TRAINING_DATA = old_training_data

        # "revision" examples: run the loaded model over the old texts and keep
        # its own predictions, so the old entities are rehearsed during training
        revision_data = []
        for doc in nlp.pipe(list(zip(*TRAINING_DATA))[0]):
            tags = [w.tag_ for w in doc]
            heads = [w.head.i for w in doc]
            deps = [w.dep_ for w in doc]
            entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
            revision_data.append((doc, GoldParse(doc, entities=entities)))

        # gold examples built from the new dataset
        fine_tune_data = []
        for raw_text, entity_offsets in training_data:
            doc = nlp.make_doc(raw_text)
            try:
                gold = GoldParse(doc, entities=entity_offsets['entities'])
            except ValueError:
                pass
            fine_tune_data.append((doc, gold))

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        optimizer = nlp.entity.create_optimizer()
        with nlp.disable_pipes(*other_pipes):
            # pretrained_weights = Path('weights/model999.bin')
            # with pretrained_weights.open("rb") as file_:
            #     ner.model.tok2vec.from_bytes(file_.read())
            for i in range(20):
                example_data = revision_data + fine_tune_data
                # example_data = training_data
                losses = {}
                random.shuffle(example_data)
                for batch in partition_all(2, example_data):
                    docs, golds = zip(*batch)
                    # print(docs, golds)
                    try:
                        nlp.update(docs, golds)
                    except ValueError:
                        pass
                # print(losses)
    else:
        for i in range(20):
            random.shuffle(training_data)
            correct = 1
            for text, annotations in training_data:
                try:
                    nlp.update([text], [annotations])
                    print(correct)
                    correct += 1
                except ValueError:
                    pass
                    # print("skipping..")
            no_of_stars = i
            print("*" * no_of_stars)

    end_time = time.time()
    print("this code took {}".format(end_time - start_time))
    return nlp


def save_to_directory(nlp, directory_name):
    save_directory = directory_name
    for directory in save_directory:
        if directory is not None:
            directory_full_path = Path(directory + "_" + datetime.today().strftime('%Y_%m_%d'))
            if path.exists(directory_full_path):
                shutil.rmtree(directory_full_path)
                print("folder already existed so removed")
            if not directory_full_path.exists():
                directory_full_path.mkdir()
            nlp.to_disk(directory_full_path)
            print("Saved model to output directory", directory)


if __name__ == "__main__":
    training_data = populate_train_data(df)

    # training_data = [
    #     ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    #     ("Today is my lucky day", {"entities": [(1, 5, "DAY")]}),
    #     ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
    #     ("May Today is Best Day", {"entities": [(4, 9, "DAY")]}),
    #     ("Have a Nice Today and Every Day", {"entities": [(12, 17, "DAY")]}),
    #     ("Hey How are feeling Today", {"entities": [(20, 25, "DAY")]}),
    # ]
    # print(training_data)

    nlp = train(training_data)
    save_to_directory(nlp, ["trained_model_with_transfer_learning"])



# TODO: train using minibatches
# TODO: add a drop (dropout) rate

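A minimal sketch of what those two TODOs could look like with the spaCy v2 API: batched updates via spacy.util.minibatch with a compounding batch size, and a dropout rate passed to nlp.update. The DAY label and the sample sentences are illustrative only, taken from the commented-out examples above; this is not the exact training loop used in this post.

# Hedged sketch (spaCy v2.x): minibatches with a compounding size plus dropout.
import random

import spacy
from spacy.util import minibatch, compounding

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
ner.add_label("DAY")  # illustrative label from the commented examples above

# illustrative data in the same (text, {"entities": [...]}) format used above
training_data = [
    ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    ("Today is my lucky day", {"entities": [(0, 5, "DAY")]}),
]

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()  # keep the pretrained weights
    for itn in range(20):
        random.shuffle(training_data)
        losses = {}
        # compounding() grows the batch size from 4 up to 32 as training proceeds
        for batch in minibatch(training_data, size=compounding(4.0, 32.0, 1.001)):
            texts, annotations = zip(*batch)
            # drop=0.35 is the dropout ("drop rate") the TODO refers to
            nlp.update(texts, annotations, drop=0.35, sgd=optimizer, losses=losses)
        print(itn, losses)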

Code to train on a new dataset and save it to another directory

Note: the code below lives in a separate file.


import spacy
from spacy import displacy
import pandas as pd
from annotations_training_spacy_31_oct_2019 import populate_train_data, train, save_to_directory


# test_texts = "I Like Today and Evening"
# base_training_data = [
#     ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
#     ("Today is my lucky day", {"entities": [(1, 5, "DAY")]}),
#     ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
#     ("May Today is Best Day", {"entities": [(4, 9, "DAY")]}),
#     ("Have a Nice Today and Every Day", {"entities": [(12, 17, "DAY")]}),
#     ("Hey How are feeling Today", {"entities": [(20, 25, "DAY")]}),
# ]

test_text = test_texts


# new_data_set = [
#     ("Today is an Awsome Day", {"entities": [(1, 5, "DAY")]}),
# ]


# fine-tune the previously saved model on the new dataset, passing in the old data as well
nlp = train(training_data=new_data_set,
            old_training_data=base_training_data,
            model_name="trained_model_with_transfer_learning_8_2019_12_05")
save_to_directory(nlp, ["trained_model_with_transfer_learning_9"])


doc = nlp(test_text)
print("ENTITIES in '%s'" % test_text)
nlp.add_pipe(nlp.create_pipe('sentencizer'))
sentence = list(doc.sents)
for ent in doc.ents:
    print(ent.label_, ent.text)


displacy.serve(sentence, style='ent')


As you can see, I also tried feeding in the labels from the old dataset, but I still have the same problem. I know other people have hit this issue; if anyone has solved it, please help me out. Thanks for your help, friends.


Answer

Are you training a new model from scratch, or appending to an existing pretrained one? If you do the latter, the network's learned weights and features get unlearned and misaligned, which lowers accuracy. I say this from experience, from when I wanted to train Korean and Japanese names that spaCy could not recognize. You could also try FastText, Flair, and Polyglot and see whether they serve your purpose. Try combining the output of all of these tools and you should get good results. That is the solution I ended up using.
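The mitigation usually suggested for this kind of forgetting is to keep mixing examples from the old datasets into every update on the new data (pseudo-rehearsal), rather than fine-tuning on the new dataset alone. Below is a rough sketch of that idea with the spaCy v2 API; the model name, the DAY label and the sample sentences are illustrative stand-ins, not the setup used in the question.

# Hedged sketch: fine-tune an existing model on new data while rehearsing old data.
import random

import spacy
from spacy.util import minibatch, compounding

nlp = spacy.load("en_core_web_sm")  # stand-in for spacy.load("current_model")
ner = nlp.get_pipe("ner")

# old_data: examples from the original dataset; new_data: the newly labelled rows.
# Both are illustrative, in the (text, {"entities": [...]}) format from the question.
old_data = [
    ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
]
new_data = [
    ("Today is an Awsome Day", {"entities": [(0, 5, "DAY")]}),
]

for _, annotations in old_data + new_data:
    for start, end, label in annotations["entities"]:
        ner.add_label(label)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()  # continue from the existing weights
    for itn in range(20):
        # every epoch sees the new data plus a sample of the old data,
        # so the NER weights keep getting a training signal for the old labels
        examples = new_data + random.sample(old_data, min(len(old_data), len(new_data) * 2))
        random.shuffle(examples)
        losses = {}
        for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.35, sgd=optimizer, losses=losses)
        print(itn, losses)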
