coreference resolution

柳梓

2023-12-01

介绍

共指解析，按照百度的定义如下：

众所周知，人们为了避免重复，习惯用代词、称谓和缩略语来指代前面提到的实体全称。例如，在文章开始处会写“哈尔滨工业大学”，后面可能会说“哈工大”、“工大”等，还会提到“这所大学”、“她”等。这种现象称为共指现象。

简而言之，其目的在于自动识别表示同一实体的名词短语或代词等。

举个例子：

哈尔滨工业大学，一般学生或者大众喜欢简称为哈工大，工大等，她是一所美丽的大学。

实体(entity): 应是唯一确定、且为大家所共知的对象。哈尔滨工业大学即为这句话的实体。
指称(mention): 实体在自然语言文本中的另外一种表达形式，哈工大，工大，她都是指称。
共指(coreference): 如果文本或句子中的两个或多个mention指向同一个entity，那么则称为共指。

到这里可以看出，一个复杂的句子中可能会有多个实体以及对应的多个指称共指于不同的实体，这可以是一个分类任务也可以是一个聚类任务。

根据认知，中文里面能做实体的一般是专有名词，比如清华大学，《海蒂》，各行各业有不同的专有名词。另外就是名词或者名词短语以及代词，比如这人，他，它，她等。

下面介绍一种算法。

Word-Level Coreference Resolution

论文地址: Word-Level Coreference Resolution
代码地址：wl-coref

先说个人感受，这个咋感觉更像是提升速度，topK操作，而没有那么多骚操作来提升效果。代码质量杠杠的，但是不是batch训练，又有点怪怪的～

代码对于训练部分看完了，如果有说的不对的，或者没有涵盖到重点的，非常欢迎指教！

1. 获得word level embedding

获取subtoken输入bert后得到的向量

def _bertify(self, doc: Doc) -> torch.Tensor:
    """Run BERT over the subword batches of ``doc`` and keep real subwords.

    The subword ids come from ``bert.get_subwords_batches``. Positions that
    hold a CLS, SEP or PAD id are removed from the returned tensor; PAD
    positions are additionally hidden from BERT via the attention mask.

    Args:
        doc: the document to encode

    Returns:
        The first element of the BERT output, restricted to non-special
        positions (annotated in the original as [n_subwords, bert_emb]).
    """
    tokenizer = self.tokenizer
    device = self.config.device

    batches = bert.get_subwords_batches(doc, self.config, tokenizer)

    # True at every position that is NOT a special token.
    special_ids = np.array([tokenizer.cls_token_id,
                            tokenizer.sep_token_id,
                            tokenizer.pad_token_id])
    keep = ~(np.isin(batches, special_ids))

    batches_tensor = torch.tensor(batches, device=device, dtype=torch.long)
    keep_tensor = torch.tensor(keep, device=device)

    # Only padding is masked out of attention; CLS/SEP are still attended to.
    not_padding = (batches != tokenizer.pad_token_id)
    hidden_states, _unused = self.bert(
        batches_tensor,
        attention_mask=torch.tensor(not_padding, device=device))
    # The second output is not needed; drop the reference right away.
    del _unused

    # Boolean indexing flattens the batch dimensions into one.
    return hidden_states[keep_tensor]

获取word level embedding

from typing import Tuple

import torch

from coref.config import Config
from coref.const import Doc


class WordEncoder(torch.nn.Module):  # pylint: disable=too-many-instance-attributes
    """Builds one embedding per word by attending over the word's subtokens.

    NOTE(review): ``self.device`` and ``self._cluster_ids`` are used below but
    are not defined in this excerpt; presumably they are provided elsewhere
    in the original class.
    """

    def __init__(self, features: int, config: Config):
        """
        Args:
            features: size of each subtoken vector
            config: configuration object; only ``dropout_rate`` is read here
        """
        super().__init__()
        # Scores every subtoken with a single scalar.
        self.attn = torch.nn.Linear(in_features=features, out_features=1)
        self.dropout = torch.nn.Dropout(config.dropout_rate)

    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                doc: Doc,
                x: torch.Tensor,
                ) -> Tuple[torch.Tensor, ...]:
        """Pool subtoken vectors into word vectors.

        Args:
            doc: document; ``doc["word2subword"]`` holds one (start, end)
                subtoken span per word, with ``end`` exclusive
            x: subtoken vectors, one row per subtoken

        Returns:
            A tuple of the word vectors (after dropout) and the result of
            ``self._cluster_ids(doc)``.
        """
        boundaries = torch.tensor(doc["word2subword"], device=self.device)
        # Column 0: first subtoken of each word; column 1: one past the last.
        first_subtoken, past_last_subtoken = boundaries[:, 0], boundaries[:, 1]

        # [n_words, n_subtokens] x [n_subtokens, features] -> [n_words, features]
        weights = self._attn_scores(x, first_subtoken, past_last_subtoken)
        pooled = self.dropout(weights.mm(x))

        return (pooled, self._cluster_ids(doc))

    def _attn_scores(self,
                     bert_out: torch.Tensor,
                     word_starts: torch.Tensor,
                     word_ends: torch.Tensor) -> torch.Tensor:
        """Compute, for each word, attention weights over all subtokens.

        Args:
            bert_out: subtoken vectors, one row per subtoken
            word_starts: index of the first subtoken of each word
            word_ends: index one past the last subtoken of each word

        Returns:
            Tensor of shape [n_words, n_subtokens]; each row is a softmax
            distribution that is zero outside the word's own span.
        """
        n_words = len(word_starts)
        n_subtokens = len(bert_out)
        grid_shape = (n_words, n_subtokens)

        # Row i holds 0..n_subtokens-1 so it can be compared with word i's span.
        positions = torch.arange(0, n_subtokens, device=self.device).expand(grid_shape)
        in_span = ((positions >= word_starts.unsqueeze(1))
                   * (positions < word_ends.unsqueeze(1)))
        # log(1) = 0 inside the span, log(0) = -inf outside it.
        log_mask = torch.log(in_span.to(torch.float))

        # One scalar score per subtoken, shared by all words: [1, n_subtokens]
        # broadcast to [n_words, n_subtokens].
        shared_scores = self.attn(bert_out).T.expand(grid_shape)
        masked_scores = log_mask + shared_scores
        del log_mask

        # Normalise per word; -inf entries get exactly zero weight.
        return torch.softmax(masked_scores, dim=1)  # [n_words, n_subtokens]

这部分可以看我最后简单示例，说明实现方式。

假设文档共有455个词(token)，那么这里的输出就是每个词对应一个向量的word level embedding，即形状为(455, features)的矩阵。

粗排(rough score)

还是觉得，不考虑batch_size那一维，整个代码都方便理解许多。

class RoughScorer(torch.nn.Module):
    """Cheap pairwise scorer that keeps the top-k candidates for each mention."""

    def __init__(self, features: int, config: Config):
        """
        Args:
            features: size of each mention vector
            config: configuration object; ``dropout_rate`` and ``rough_k``
                are read here
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(config.dropout_rate)
        self.bilinear = torch.nn.Linear(features, features)

        self.k = config.rough_k

    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                mentions: torch.Tensor,
                ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Score all mention pairs and prune to the best candidates per row.

        Args:
            mentions: mention vectors, one row per mention

        Returns:
            Tuple of (top scores, their column indices), as produced by
            ``_prune``.
        """
        n_mentions = mentions.shape[0]

        # Step 1: [n_mentions, n_mentions] mask that is 0 where column j comes
        # strictly before row i and -inf elsewhere, so a mention can only be
        # paired with earlier mentions.
        order = torch.arange(n_mentions)
        offsets = order.unsqueeze(1) - order.unsqueeze(0)
        earlier_only = torch.log((offsets > 0).to(torch.float))
        earlier_only = earlier_only.to(mentions.device)

        # Project the mentions through a learned linear map before the dot
        # product, so the pair score is not a plain symmetric dot product.
        projected = self.dropout(self.bilinear(mentions))
        pair_scores = projected.mm(mentions.T)

        # Step 2: keep only the best-scoring candidates for every mention.
        return self._prune(earlier_only + pair_scores)

    def _prune(self,
               rough_scores: torch.Tensor
               ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Select the top-k entries of every row of ``rough_scores``.

        k is capped at the number of rows. With ``sorted=False`` the order of
        the selected entries within a row is not guaranteed.
        """
        n_keep = min(self.k, len(rough_scores))
        best_scores, best_indices = rough_scores.topk(n_keep, dim=1, sorted=False)
        return best_scores, best_indices

获取词对特征

看到么看到么，人家到这里才开始干活～

top_indices的维度为(405,50)，表示这个句子一共有405个tokens，然后获取每个token最相关联的50个tokens的索引。

def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
            top_indices: torch.Tensor,
            doc: Doc) -> torch.Tensor:
    """Build pairwise features between each word and its candidate words.

    Three embedded features are concatenated along the last dimension:
    same-speaker flag, bucketed distance, and document genre.

    Args:
        top_indices: for every word, the indices of its candidate words
        doc: document; ``cased_words`` and ``document_id`` are read here and
            the document is passed to ``self._speaker_map``

    Returns:
        The concatenated feature embeddings after dropout.
    """
    n_words = len(doc["cased_words"])
    positions = torch.arange(0, n_words, device=self.device)

    # 1. Same-speaker feature.
    # Indexing the per-word speaker ids with ``top_indices`` looks up the
    # speaker of every candidate; comparing against a column vector broadcasts
    # each word's own speaker across its candidates.
    speakers = torch.tensor(self._speaker_map(doc), device=self.device)
    candidate_speakers = speakers[top_indices]
    speaker_match = (candidate_speakers == speakers.unsqueeze(1))
    speaker_feats = self.speaker_emb(speaker_match.to(torch.long))

    # 2. Distance feature, bucketed:
    #    distances 1..4 -> bucket ids 0..3 (values below 1 are clamped to 1);
    #    distances >= 5 -> floor(log2(d)) + 2, with the log term capped at 6,
    #    so every distance of 64 or more shares the last bucket.
    gaps = (positions.unsqueeze(1) - positions[top_indices]).clamp_min_(min=1)
    log_gaps = gaps.to(torch.float).log2().floor_()
    log_gaps = log_gaps.clamp_max_(max=6).to(torch.long)
    bucket_ids = torch.where(gaps < 5, gaps - 1, log_gaps + 2)
    distance_feats = self.distance_emb(bucket_ids)

    # 3. Genre feature: the genre key is the first two characters of the
    #    document id, and the same id is used for every pair.
    genre_key = doc["document_id"][:2]
    genre_ids = torch.tensor(self.genre2int[genre_key],
                             device=self.device).expand_as(top_indices)
    genre_feats = self.genre_emb(genre_ids)

    combined = torch.cat((speaker_feats, distance_feats, genre_feats), dim=2)
    return self.dropout(combined)

AnaphoricityScorer

这部分看着有点绕，看如下代码：

def forward(self, *,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
            all_mentions: torch.Tensor,
            mentions_batch: torch.Tensor,
            pw_batch: torch.Tensor,
            top_indices_batch: torch.Tensor,
            top_rough_scores_batch: torch.Tensor,
            ) -> torch.Tensor:
    """Refine the rough scores of a batch of mentions with a feed-forward net.

    Args:
        all_mentions: vectors of all mentions in the document
        mentions_batch: vectors of the mentions in the current batch
        pw_batch: pairwise features for the batch
        top_indices_batch: candidate indices for each mention in the batch
        top_rough_scores_batch: rough scores of those candidates

    Returns:
        Rough score plus feed-forward score for every pair, passed through
        ``utils.add_dummy(..., eps=True)``.
    """
    # [batch_size, n_ants, pair_emb]
    pair_features = self._get_pair_matrix(
        all_mentions, mentions_batch, pw_batch, top_indices_batch)

    # [batch_size, n_ants]
    refined = top_rough_scores_batch + self._ffnn(pair_features)

    return utils.add_dummy(refined, eps=True)

def _ffnn(self, x: torch.Tensor) -> torch.Tensor:
    """
    Scores every pair by applying ``self.hidden`` followed by ``self.out``.

    Args:
        x: tensor of shape [batch_size, n_ants, n_features]

    Returns:
        tensor of shape [batch_size, n_ants]
    """
    hidden_repr = self.hidden(x)
    pair_scores = self.out(hidden_repr)
    # Drop the trailing singleton score dimension.
    return pair_scores.squeeze(2)

@staticmethod
def _get_pair_matrix(all_mentions: torch.Tensor,
                     mentions_batch: torch.Tensor,
                     pw_batch: torch.Tensor,
                     top_indices_batch: torch.Tensor,
                     ) -> torch.Tensor:
    """Assemble the feature vector of every (mention, candidate) pair.

    For each pair the result concatenates, along the last dimension: the
    mention vector, the candidate vector, their element-wise product, and
    the pairwise features from ``pw_batch``.

    Args:
        all_mentions: vectors of all mentions
        mentions_batch: vectors of the mentions in the current batch
        pw_batch: pairwise features; its second dimension is the number of
            candidates per mention
        top_indices_batch: candidate indices into ``all_mentions``

    Returns:
        The concatenated pair features.
    """
    n_candidates = pw_batch.shape[1]
    vector_size = mentions_batch.shape[1]

    # Repeat each batch mention once per candidate (a view, not a copy).
    anchors = torch.unsqueeze(mentions_batch, 1).expand(-1, n_candidates, vector_size)
    # Indexing with an index matrix gathers one candidate vector per index.
    candidates = all_mentions[top_indices_batch]
    products = anchors * candidates

    return torch.cat((anchors, candidates, products, pw_batch), dim=2)

_get_pair_matrix中的b_mentions = all_mentions[top_indices_batch]这种取索引写法，可参考附录中的“取索引操作”。后续接了个ffnn，获得其最终得分。

整体代码作者挺喜欢残差网络和前馈神经网络这种操作。

loss计算

CorefLoss计算分成了两部分，一个是NLML(negative log marginal likelihood，负对数边际似然)，另外一个是BCELoss。

@staticmethod
def _nlml(input_: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Return the mean over rows of logsumexp(all) - logsumexp(gold).

    Args:
        input_: scores, one row per mention
        target: same shape as ``input_``; presumably 1 at gold candidates
            and 0 elsewhere (TODO confirm against the caller)

    Returns:
        Scalar tensor with the loss averaged over rows.
    """
    # log(target) adds 0 where target is 1 and -inf where it is 0, so only
    # the entries selected by target contribute to this logsumexp.
    log_gold_mass = (input_ + target.log()).logsumexp(dim=1)
    log_total_mass = input_.logsumexp(dim=1)
    per_row_loss = log_total_mass - log_gold_mass
    return per_row_loss.mean()

附录

word-level实现demo

# -*- coding: utf8 -*-
#

import torch


class WordEmbedding(torch.nn.Module):
    """Minimal demo: pool subtoken vectors into word vectors with attention.

    Each subtoken gets one scalar score from a linear layer. For every word,
    the scores of the subtokens inside its span are softmax-normalised and
    used as weights to average those subtokens' vectors.
    """

    def __init__(self, n_in, p):
        """
        Args:
            n_in: size of each subtoken vector
            p: stored on the instance but not used by ``forward``
        """
        super(WordEmbedding, self).__init__()
        self.n_in = n_in
        self.p = p
        self.linear = torch.nn.Linear(in_features=n_in, out_features=1)

    def forward(self, x, indices):
        """Return one pooled vector per word.

        Args:
            x: tensor of shape [n_subtokens, n_in]
            indices: one (start, end) pair per word, ``end`` exclusive,
                indexing rows of ``x``

        Returns:
            Tensor of shape [n_words, n_in].
        """
        n_words = len(indices)
        n_subtokens = x.size(0)

        spans = torch.tensor(indices, device=x.device)
        start_indices = spans[:, 0]
        end_indices = spans[:, 1]

        # Derive the grid from the inputs instead of hard-coding 3 words and
        # 5 subtokens, so the layer works for any sentence.
        positions = torch.arange(n_subtokens, device=x.device).expand(n_words, n_subtokens)

        in_span = (positions >= start_indices.unsqueeze(1)) * (positions < end_indices.unsqueeze(1))
        # 0 inside the word's span, -inf outside it.
        attn_mask = torch.log(in_span.to(torch.float))

        # [n_subtokens, 1] -> [1, n_subtokens] -> shared across all words.
        attn_scores = self.linear(x).T
        attn_scores = attn_scores.expand((n_words, n_subtokens))
        attn_scores = attn_scores + attn_mask

        # Softmax gives zero weight to -inf entries; mm performs the weighted
        # average over subtokens.
        return torch.softmax(attn_scores, dim=1).mm(x)


if __name__ == '__main__':
    # Toy input for the sentence "我 是 中国人": 5 subtokens in total, each
    # represented by a 10-dimensional embedding.
    subtoken_vectors = torch.arange(50, dtype=torch.float).reshape(5, 10)
    # (start, end) subtoken span of each of the three words; end is exclusive.
    word_spans = [(0, 1), (1, 2), (2, 5)]

    pooler = WordEmbedding(n_in=10, p=0.1)
    pooler.forward(subtoken_vectors, word_spans)

取索引操作

>>> a
tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]])

# b对应top50选出来的index
>>> b
tensor([[0, 1],
        [1, 2],
        [2, 2]])

>>> a[b]
tensor([[[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9]],

        [[ 5,  6,  7,  8,  9],
         [10, 11, 12, 13, 14]],

        [[10, 11, 12, 13, 14],
         [10, 11, 12, 13, 14]]])
>>> a[b].shape
torch.Size([3, 2, 5])

精彩例子

这种写法我挺喜欢的，比如：


>>> aa
tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
>>> mm
tensor([[ True,  True, False,  True,  True],
        [False,  True,  True,  True,  True]])

# 一般会在最后计算loss时加上mask进行降维铺平
>>> aa[mm]
tensor([0, 1, 3, 4, 6, 7, 8, 9])

# 形式1： 作者将无关的score置为-inf（注意是加上log(mask)，而不是相乘）
>>> aa + torch.log(mm.to(torch.float))
tensor([[0., 1., -inf, 3., 4.],
        [-inf, 6., 7., 8., 9.]])

# 形式2：
>>> aa * mm
tensor([[0, 1, 0, 3, 4],
        [0, 6, 7, 8, 9]])