Training Spacy NER on custom dataset gives error
I am trying to train a spaCy NER model on a custom dataset. Basically, I want to use this model to extract names, organizations, email addresses, phone numbers, and so on from resumes.

Below is the code I am using.
import json
import random
import spacy
import sys
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
from spacy.gold import biluo_tags_from_offsets
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines = []
        with open(dataturks_JSON_FilePath, encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                # only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    entities.append((point['start'], point['end'] + 1, label))
            training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None
def reformat_train_data(tokenizer, examples):
    output = []
    for i, (text, entity_offsets) in enumerate(examples):
        doc = tokenizer(text.strip())
        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets['entities'])
        words = [w.text for w in doc]
        tags = ['-'] * len(doc)
        heads = [0] * len(doc)
        deps = [''] * len(doc)
        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
        output.append((text, [(sentence, [])]))
    print("output", output)
    return output
################### Train Spacy NER.###########
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy(r"C:\Users\akjain\Downloads\Entity-Recognition-In-Resumes-SpaCy-master\traindata.json")
    nlp = spacy.blank("en")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    def get_data(): return reformat_train_data(nlp.tokenizer, TRAIN_DATA)
    optimizer = nlp.begin_training(get_data)
    for itn in range(10):
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

train_spacy()
I am getting the following error. I also came across a link (https://github.com/explosion/spaCy/issues/3558) with some suggestions for fixing this code, but I still get errors even after applying them.

I am using Python 3.6.5 and spaCy 2.2.3.

Dataset:
{"content": "Nida Khan\nTech Support Executive - Teleperformance for Microsoft\n\nJaipur, Rajasthan - Email me on Indeed: indeed.com/r/Nida-Khan/6c9160696f57efd8\n\n• To be an integral part of the organization and enhance my knowledge to utilize it in a productive\nmanner for the growth of the company and the global.\n\nINDUSTRIAL TRAINING\n\n• BHEL, (HEEP) HARIDWAR\nOn CNC System& PLC Programming.\n\nWORK EXPERIENCE\n\nTech Support Executive\n\nTeleperformance for Microsoft -\n\nSeptember 2017 to Present\n\nprocess.\n• 21 months of experience in ADFC as Phone Banker.\n\nEDUCATION\n\nBachelor of Technology in Electronics & communication Engg\n\nGNIT institute of Technology - Lucknow, Uttar Pradesh\n\n2008 to 2012\n\nClass XII\n\nU.P. Board - Bareilly, Uttar Pradesh\n\n2007\n\nClass X\n\nU.P. Board - Bareilly, Uttar Pradesh\n\n2005\n\nSKILLS\n\nMicrosoft office, excel, cisco, c language, cbs. (4 years)\n\nhttps://www.indeed.com/r/Nida-Khan/6c9160696f57efd8?isid=rex-download&ikw=download-top&co=IN","annotation":[{"label":["Email Address"],"points":[{"start":872,"end":910,"text":"indeed.com/r/Nida-Khan/6c9160696f57efd8"}]},{"label":["Skills"],"points":[{"start":800,"end":857,"text":"Microsoft office, excel, cisco, c language, cbs. (4 years)"}]},{"label":["Graduation Year"],"points":[{"start":676,"end":679,"text":"2012"}]},{"label":["College Name"],"points":[{"start":612,"end":640,"text":"GNIT institute of Technology "}]},{"label":["Degree"],"points":[{"start":552,"end":609,"text":"Bachelor of Technology in Electronics & communication Engg"}]},{"label":["Companies worked at"],"points":[{"start":420,"end":448,"text":"Teleperformance for Microsoft"}]},{"label":["Designation"],"points":[{"start":395,"end":417,"text":"\nTech Support Executive"}]},{"label":["Email Address"],"points":[{"start":106,"end":144,"text":"indeed.com/r/Nida-Khan/6c9160696f57efd8"}]},{"label":["Location"],"points":[{"start":66,"end":71,"text":"Jaipur"}]},{"label":["Companies worked at"],"points":[{"start":35,"end":63,"text":"Teleperformance for Microsoft"}]},{"label":["Designation"],"points":[{"start":10,"end":32,"text":"Tech Support Executive "}]},{"label":["Designation"],"points":[{"start":9,"end":31,"text":"\nTech Support Executive"}]},{"label":["Name"],"points":[{"start":0,"end":8,"text":"Nida Khan"}]}]}
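Note that Dataturks end offsets are inclusive, which is why convert_dataturks_to_spacy adds 1 to point['end']. As a quick illustration (my own check, not part of the original post), the first two converted spans come out as:

text = data['content']   # the resume text above
print(repr(text[0:9]))    # 'Nida Khan'               <- (0, 9, 'Name')
print(repr(text[10:33]))  # 'Tech Support Executive ' <- (10, 33, 'Designation')

The trailing space in the second span is exactly the kind of artifact the fix below has to trim.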
The problem is that you are feeding your training data to the model's optimizer via nlp.begin_training(get_data).
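Concretely, that means dropping the get_data callback (and reformat_train_data along with it) and initializing the optimizer with no arguments, as the corrected training function further below does:

# before (from the question):
optimizer = nlp.begin_training(get_data)

# after:
optimizer = nlp.begin_training()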
In addition, as described in https://github.com/explosion/spaCy/issues/3558, use the following function to remove leading and trailing white space from the entity spans.
import re

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # if there are leading spaces, move the start position to the nearest non-space character
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # likewise, move the end position back past any trailing spaces
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data
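As a quick check (my own example, not part of the original answer), feeding the 'Designation' span from the dataset above through trim_entity_spans moves its end back past the trailing space:

sample = [("Nida Khan\nTech Support Executive - Teleperformance for Microsoft",
           {'entities': [(0, 9, 'Name'), (10, 33, 'Designation')]})]
print(trim_entity_spans(sample))
# [['Nida Khan\nTech Support Executive - Teleperformance for Microsoft',
#   {'entities': [[0, 9, 'Name'], [10, 32, 'Designation']]}]]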
Then train with the function below:
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy(r"C:\Users\akjain\Downloads\Entity-Recognition-In-Resumes-SpaCy-master\traindata.json")
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
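Once training finishes, a minimal way to sanity-check the model is to run it over some text. This is my own sketch, and it assumes train_spacy is extended to end with return nlp (the version above returns nothing):

nlp = train_spacy()  # assumes 'return nlp' was added at the end of train_spacy
doc = nlp("Nida Khan\nTech Support Executive - Teleperformance for Microsoft")
for ent in doc.ents:
    print(ent.label_, repr(ent.text))

You can also persist the trained pipeline with nlp.to_disk("model_output") (any directory path works) and reload it later with spacy.load.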