bert-as-service 计算相似度

董洲
2023-12-01

安装python3 依赖

pip install bert_serving
pip install bert-serving-server==1.10.0 # 服务端
pip install bert-serving-client==1.10.0  # 客户端,与服务端互相独立
pip install tensorflow==1.15.0  # tensorflow 版本需 >=1.13.1
wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip # 下载中文预训练BERT模型
unzip chinese_L-12_H-768_A-12.zip

启动service(先在服务端执行:bert-serving-start -model_dir ./chinese_L-12_H-768_A-12 -num_worker=1,服务启动后再运行下面的客户端脚本)

start_service.py

# 导入bert客户端
from bert_serving.client import BertClient
import numpy as np


class SimilarModel:
    """Sentence-similarity helper built on a bert-as-service client.

    Sentences are turned into vectors by the BERT server and compared with
    cosine similarity.
    """

    def __init__(self, **client_kwargs):
        """Create the underlying ``BertClient``.

        :param client_kwargs: optional keyword arguments forwarded unchanged
            to ``BertClient``. With no arguments the client's defaults are
            used (local mode); pass e.g. ``ip=...`` when the BERT service is
            deployed on another server.
        """
        self.bert_client = BertClient(**client_kwargs)

    def close_bert(self):
        """Close the connection held by the underlying client."""
        self.bert_client.close()

    def get_sentence_vec(self, sentence):
        """Return the BERT vector of a single sentence.

        :param sentence: the sentence to encode
        :return: the first (only) row of the client's ``encode`` result
        """
        # encode() works on a batch, so wrap the sentence and unwrap the row.
        return self.bert_client.encode([sentence])[0]

    def cos_similar(self, sen_a_vec, sen_b_vec):
        """Compute the cosine similarity of two sentence vectors.

        :param sen_a_vec: first vector (any array-like; flattened to 1-D)
        :param sen_b_vec: second vector (any array-like; flattened to 1-D)
        :return: cosine similarity as a Python float; ``0.0`` when either
            vector has zero norm, since the similarity is undefined there
        :raises ValueError: if the two vectors differ in length
        """
        # Plain ndarrays instead of np.mat: the matrix alias was removed in
        # NumPy 2.0 and float() on a 1x1 matrix is deprecated.
        vector_a = np.asarray(sen_a_vec, dtype=float).ravel()
        vector_b = np.asarray(sen_b_vec, dtype=float).ravel()
        denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
        if denom == 0:
            # Avoid a division by zero that would yield nan plus a warning.
            return 0.0
        return float(np.dot(vector_a, vector_b) / denom)

if __name__ == '__main__':
    # Pick, from the candidate list, the sentence most similar to sentence_a.
    candidates = ['为什么天空是蔚蓝色的', '太空为什么是黑的?', '天空怎么是蓝色的', '明天去爬山如何']
    sentence_a = '天空为什么是蓝色的'
    bert_client = SimilarModel()
    try:
        # The query vector does not change between iterations, so request it
        # from the server once instead of once per candidate.
        sentence_a_vec = bert_client.get_sentence_vec(sentence_a)
        # Start below any possible cosine value so that a best candidate is
        # still chosen when every similarity is zero or negative.
        max_cos_similar = float('-inf')
        most_similar_sentence = ''
        for sentence_b in candidates:
            sentence_b_vec = bert_client.get_sentence_vec(sentence_b)
            cos_sim = bert_client.cos_similar(sentence_a_vec, sentence_b_vec)
            print(sentence_b_vec, cos_sim)
            if cos_sim > max_cos_similar:
                max_cos_similar = cos_sim
                most_similar_sentence = sentence_b

        print('最相似的句子:', most_similar_sentence)
    finally:
        # Release the client connection even if encoding fails midway.
        bert_client.close_bert()

# bert-as-service does not cope well with concurrent use of one client, so
# every request on the shared client must be made while holding a lock.
import threading

lock = threading.Lock()  # guards all calls made through `bc` below
# NOTE(review): BERT_CONFIG is not defined in this snippet; it is presumably a
# configparser section (it is used via .get / .getint) — confirm where it is
# loaded before running this.
bc = BertClient(
    ip=BERT_CONFIG.get("ip"),
    port=BERT_CONFIG.getint("port"),
    port_out=BERT_CONFIG.getint("port_out"),
    timeout=BERT_CONFIG.getint("timeout"),
    check_version=False,
    check_token_info=False,
)

# Whole sentences can be passed in directly; no prior tokenization is needed.
# The call is made under the lock so that threads never use `bc` at once.
with lock:
    bc.encode(['First do it', 'then do it right', 'then do it better'])

 类似资料: