How to Build a Dependency Adjacency Matrix with spaCy

柏明亮
2023-12-01

Contents

Problem Description:

Code Implementation:


Problem Description:

How to use the spaCy toolkit to run dependency parsing on an English sentence and build the corresponding adjacency matrix.
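
As a quick illustration of the information the adjacency matrix is built from (a minimal sketch; the sentence is an arbitrary example), spaCy exposes the dependency parse through each token's dependency label, head, and children:

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("The camera takes great pictures")
for token in doc:
    # dependency label, syntactic head, and direct children of each token
    print(token.text, token.dep_, token.head.text, [child.text for child in token.children])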

Code Implementation:

import numpy as np
import spacy
import pickle
from transformers import BertTokenizerFast
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split()
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
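
# The default tokenizer is replaced below with this whitespace tokenizer so that
# spaCy tokens line up one-to-one with text.split(); the alignment asserts in
# dependency_adj_matrix rely on this.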

nlp = spacy.load('en_core_web_sm')  # zh_core_web_sm
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
tokenizer = BertTokenizerFast.from_pretrained("/home/qtxu/PLM/bert-base-uncased")

def dependency_adj_matrix(text):
    # https://spacy.io/docs/usage/processing-text
    tokens = nlp(text)
    tokenized = tokenizer(text.split(), is_split_into_words=True, add_special_tokens=False)
    word_ids = tokenized.word_ids()
    words = text.split()
    # matrix = np.zeros((len(words), len(words))).astype('float32')
    matrix1 = np.zeros((len(word_ids), len(word_ids))).astype('float32')
    assert len(words) == len(list(tokens))  # make sure the lengths match
    assert (len(tokens) - 1) == max(word_ids)

    for i, idx in enumerate(word_ids):
        matrix1[i][i] = 1  # main diagonal is 1: every token keeps its own feature information
        for j, jdx in enumerate(word_ids):
            if tokens[jdx] in tokens[idx].children or jdx == idx:
                # tokens[jdx] in tokens[idx].children: checks for a dependency relation,
                # i.e. whether the word at jdx is a child of the word at idx
                # jdx == idx: the two word pieces belong to the same original word,
                # so the self-connection is kept as well
                matrix1[i][j] = 1
                matrix1[j][i] = 1
    return matrix1
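
# Example (hypothetical sentence): the returned matrix is square over the BERT
# word pieces, symmetric, and has ones on its diagonal, e.g.
#   adj = dependency_adj_matrix("The camera takes great pictures")
#   adj.shape            -> (n_word_pieces, n_word_pieces)
#   (adj == adj.T).all() -> True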

def softmax(x):
    if len(x.shape) > 1:
        # matrix
        tmp = np.max(x, axis=1)
        x -= tmp.reshape((x.shape[0], 1))
        x = np.exp(x)
        tmp = np.sum(x, axis=1)
        x /= tmp.reshape((x.shape[0], 1))
    else:
        # vector
        tmp = np.max(x)
        x -= tmp
        x = np.exp(x)
        tmp = np.sum(x)
        x /= tmp
    return x
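
# Note: softmax is not called anywhere in this script; if normalized edge weights
# are wanted, it could optionally be applied row-wise to the adjacency matrix.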

def process(filename):
    fin = open(filename, 'r', encoding='utf-8')
    lines = fin.readlines()
    fin.close()
    idx2graph = {}
    fout = open(filename+'.graph', 'wb')
    for line in lines:
        sentence = line.strip().split("\t")[0]
        try:
            adj_matrix = dependency_adj_matrix(sentence)
        except:
            print(filename)
            raise
        idx2graph[sentence] = adj_matrix
    pickle.dump(idx2graph, fout)
    fout.close()


filename = "/home/qtxu/SPN/data/Camera-COQE/dev_test.txt"  # path to the file containing the sentences
process(filename)
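
The pickled .graph file can later be loaded back into a dictionary that maps each sentence to its adjacency matrix. A minimal sketch (reusing the same placeholder path as above):

import pickle

with open("/home/qtxu/SPN/data/Camera-COQE/dev_test.txt.graph", 'rb') as f:
    idx2graph = pickle.load(f)

for sentence, adj in idx2graph.items():
    print(sentence, adj.shape)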

Note:

For English sentences, build the adjacency matrix from the dependency parse with en_core_web_sm.
For Chinese sentences, use zh_core_web_sm instead.
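
If a model is not installed yet, it can be downloaded from Python (a small sketch; the command-line equivalent is python -m spacy download en_core_web_sm):

from spacy.cli import download

# download the small English and Chinese pipelines if they are missing
download("en_core_web_sm")
download("zh_core_web_sm")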

Very important!!!:


The version of en_core_web_sm you install must match your installed spaCy version.
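
Running python -m spacy validate reports any installed models that are incompatible with the current spaCy version. The versions can also be inspected from Python, for example:

import spacy

print(spacy.__version__)  # installed spaCy version
nlp = spacy.load("en_core_web_sm")
print(nlp.meta["version"], nlp.meta["spacy_version"])  # model version and the spaCy range it was built for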

~ If you found this helpful, please leave a like before you go ~
