目录
如何利用 spaCy 工具包,对一个(英文)句子进行依存句法分析,并构造相对应的邻接矩阵。
import numpy as np
import spacy
import pickle
from transformers import BertTokenizerFast
from spacy.tokens import Doc
from pdb import set_trace as stop
class WhitespaceTokenizer(object):
    """Callable tokenizer that splits text on whitespace only.

    Calling an instance with a string returns a ``Doc`` built from the
    whitespace-separated words of that string.
    """

    def __init__(self, vocab):
        # Vocabulary handed to every ``Doc`` this tokenizer creates.
        self.vocab = vocab

    def __call__(self, text):
        """Return a ``Doc`` whose words are ``text.split()``."""
        pieces = text.split()
        # Each token is flagged as being followed by a space character.
        trailing_space = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=trailing_space)
# Load the English spaCy pipeline; the trailing comment names the model the
# original author suggests for Chinese text.
nlp = spacy.load('en_core_web_sm') # zh_core_web_sm
# Swap in the whitespace-only tokenizer so the pipeline's tokens are exactly
# the result of str.split() on the input text.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# Fast BERT tokenizer loaded from a local directory.
# NOTE(review): the path is machine-specific — adjust it for your environment.
tokenizer = BertTokenizerFast.from_pretrained("/home/qtxu/PLM/bert-base-uncased")
def dependency_adj_matrix(text):
    """Build a subword-level adjacency matrix from the dependency parse of ``text``.

    ``text`` is split on whitespace into words.  The module-level ``nlp``
    pipeline parses the sentence and the module-level ``tokenizer`` breaks
    each word into subword pieces.  Entry ``[i][j]`` of the result is 1 when
    pieces ``i`` and ``j`` belong to the same word, or when the word of one
    piece is a direct dependency child of the word of the other; all other
    entries are 0.  The matrix is symmetric with ones on the main diagonal,
    so every piece keeps its own feature information.

    Args:
        text: A sentence whose words are separated by whitespace.

    Returns:
        A square ``float32`` array with one row/column per subword piece.
        A sentence that contains no words yields an empty ``(0, 0)`` array.

    Raises:
        AssertionError: If the parser and the subword tokenizer do not agree
            on the number of words in the sentence.
    """
    # https://spacy.io/docs/usage/processing-text
    words = text.split()
    if not words:
        # Nothing to parse; avoids calling max() on an empty word-id list.
        return np.zeros((0, 0), dtype='float32')

    tokens = nlp(text)
    # Feed the subword tokenizer the very same whitespace split the parser
    # sees, so word ids line up with parser token indices even when the text
    # contains repeated spaces.
    tokenized = tokenizer(words, is_split_into_words=True, add_special_tokens=False)
    word_ids = tokenized.word_ids()

    # Sanity checks: both views of the sentence must cover the same words.
    assert len(words) == len(tokens), "parser token count differs from word count"
    assert word_ids and (len(tokens) - 1) == max(word_ids), \
        "subword tokenizer word ids do not cover every parsed word"

    # Word-level adjacency: self loops plus undirected head<->child edges.
    word_adj = np.eye(len(tokens), dtype=bool)
    for head in tokens:
        for child in head.children:
            word_adj[head.i, child.i] = True
            word_adj[child.i, head.i] = True

    # Expand to subword level: pieces i and j are connected exactly when the
    # words they belong to are connected (or are the same word).
    piece_to_word = np.asarray(word_ids)
    return word_adj[np.ix_(piece_to_word, piece_to_word)].astype('float32')
def softmax(x):
    """Return the softmax of ``x`` without modifying the input.

    A 1-D input is normalised as a single vector.  An input with two or more
    dimensions is normalised independently along its last axis (for a 2-D
    matrix this means row by row).  The maximum is subtracted before
    exponentiating so large values do not overflow.

    Args:
        x: A NumPy array, or any sequence ``np.asarray`` can convert.

    Returns:
        A new array of the same shape as ``x`` whose values along the
        normalised axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Vectors (and scalars) reduce over everything; matrices and higher-rank
    # arrays reduce over the last axis only.
    axis = -1 if x.ndim > 1 else None
    # Out-of-place subtraction: the caller's array must stay untouched.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
def process(filename):
    """Build dependency adjacency matrices for every sentence in a file.

    Each line of ``filename`` is stripped and split on tabs; the first field
    is taken as the sentence.  The matrix returned by
    ``dependency_adj_matrix`` is stored under that sentence in a dict, and
    the dict is pickled to ``filename + '.graph'``.  Lines whose sentence
    field is empty are skipped.  If the same sentence occurs more than once,
    the last occurrence wins.

    Args:
        filename: Path of the UTF-8 text file to read.

    Raises:
        Exception: Whatever ``dependency_adj_matrix`` raises is re-raised
            after the file name has been printed.
    """
    with open(filename, 'r', encoding='utf-8') as fin:
        lines = fin.readlines()

    idx2graph = {}
    for line in lines:
        sentence = line.strip().split("\t")[0]
        if not sentence:
            # Blank lines carry no sentence to parse.
            continue
        try:
            adj_matrix = dependency_adj_matrix(sentence)
        except Exception:
            # Report which file was being processed, then propagate.
            print(filename)
            raise
        idx2graph[sentence] = adj_matrix

    # Open the output only after every sentence succeeded, so a failure does
    # not leave an empty or truncated .graph file behind.
    with open(filename + '.graph', 'wb') as fout:
        pickle.dump(idx2graph, fout)
# NOTE(review): machine-specific absolute path, and the call below executes
# whenever this module is run or imported.
filename = "/home/qtxu/SPN/data/Camera-COQE/dev_test.txt" # path of the file that stores the sentences
process(filename)
注意:
如果要依据依存关系为英文句子构造邻接矩阵,则使用 en_core_web_sm;
反之,如果是中文句子,则使用 zh_core_web_sm(注意:上面的代码按空格切分单词,因此中文句子需要事先分好词并用空格隔开,BERT 分词器也应换成相应的中文模型)。
千万注意!!!:
所使用的 en_core_web_sm 模型版本,一定要与已安装的 spaCy 版本相匹配。
~如果喜欢,点个赞再走呗 ~