当加载一个模型的时候,spacy首先分析其meta.json文件,这个文件中包含以下信息:
{
"lang": "en",
"name": "core_web_sm",
"description": "example model for spacy",
"pipeline": ["tagger", "parser", "ner"]
}
因此当我们调用nlp = spacy.load("en_core_web_sm")
的时候,实际进行的步骤是:
lang = "en"
pipeline = ["tagger", "parser", "ner"]
data_path = "path/to/en_core_web_sm"
cls = spacy.util.get_lang_class(lang)
nlp = cls()
for name in pipeline:
component = nlp.create_pipe(name)
nlp.add_pipe(component)
nlp.from_disk(data_path)
spacy模型是由三个部分组成的:
权重;
pipeline中的函数;
语言数据;
利用nlp.create_pipe
来对新的pipeline进行初始化;
需要将pipeline的名字和setting放置到meta.json中;
from spacy.pipeline import EntityRuler
ruler = EntityRuler(nlp)
nlp.add_pipe(ruler)
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
with nlp.disable_pipes("tagger", "parser"):
doc = nlp(text)
doc = nlp(text)
nlp.remove_pipe("parser")
nlp.rename_pipe("ner", "entityrecognizer")
nlp.replace_pipe("tagger", my_custom_tagger)
import spacy
def my_component(doc):
return doc
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(my_component, name="print_info",last=True)
print(nlp.pipe_names)
Language.factories
来告诉spacy自定义的管道在什么地方,具体代码表示:from spacy.language import Language
Language.factories["entity_matcher"] = lambda nlp, **cfg: EntityMatcher(nlp, **cfg)
nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL")
Doc.set_extension("hello", default=True)
assert doc._.hello
doc._.hello = False
Doc.set_extension("hello",getter=get_hello_value, setter=set_hello_value)
assert doc._.hello
doc._.hello = "Hi!"
Doc.set_extension("hello", method=lambda doc, name: "Hi {}!".format(name))
assert doc._.hello("Bob")=="Hi Bob!"
利用gold.spans_from_biluo_tags
来添加自定义的模型
import your_custom_entity_recognizer
from spacy.gold import spans_from_biluo_tags
def custom_new_wrapper(doc):
words = [token.text for tokem in doc]
custom_entities = your_custom_entity_recognizer
doc.ents = spans_from_biluo_tags(doc, custom_enitiies)
return doc
可以利用nlp.add_pipe
来添加新的wrapper到pipelines中,可以利用nlp.replace_pipe
来用一个预训练好的模型替换原本的entity recognizer。
PS:example:
可以利用Doc.from_array
来创建一个新的Doc对象。Doc.from_array需要一个由整数ID构成的numpy数组,这些整数ID可以通过StringStore查回对应的字符串标签。
import your_custom_model
from spacy.symbols import POS, TAG, DEP, HEAD
from spacy.tokens import Doc
import numpy
def cystom_model_wrapper(doc):
words = [token.text fro token in doc]
spaces = [token.whitespace for token in doc]
pos, tags, deps, heads = your_custom_model(words)
pos = [doc.vocab.strings.add(label) for label in pos]
tags = [doc.vocab.strings.add(label) for label in tags]
deps = [doc.vocab.strings.add(label) for lable in deps]
attrs = [POS, TAG, DEP, HEAD]
arr = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
new_doc = Doc(doc.vocab, words=words, spaces=spaces).from_array(attrs, arr)
return new_doc