Text classification is a very common task in practice, usually either multi-class or multi-label classification. For multi-label classification, see https://blog.csdn.net/weixin_42223207/article/details/115036283. This post implements text classification with Hugging Face's Transformers, using tensorflow==2.4.0 as the framework. The post covers roughly the following:
Tokenize the input character by character with the BERT tokenizer. The code is as follows:
import numpy as np


def create_inputs_targets(sentences, labels, max_len, tokenizer):
    # `tokenizer` is assumed to be a tokenizers.BertWordPieceTokenizer, whose encode()
    # returns an Encoding object with an .ids attribute (a transformers BertTokenizer
    # would return a plain list of ids instead)
    dataset_dict = {
        "input_ids": [],
        "attention_mask": [],
        "labels": []
    }
    assert len(sentences) == len(labels)
    for i in range(len(sentences)):
        input_ids = []
        for idx, word in enumerate(sentences[i]):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
        # Truncate, then add '[CLS]' (101) and '[SEP]' (102) around the sentence
        input_ids = input_ids[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        attention_mask = [1] * len(input_ids)
        # Pad to max_len; [PAD] is encoded as 0 in the vocab
        padding_len = max_len - len(input_ids)
        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        dataset_dict["labels"].append(labels[i])
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["attention_mask"],
    ]
    y = dataset_dict["labels"]
    return x, y
Here only the input_ids and attention_mask produced by the BERT tokenizer are used as features.
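For completeness, here is a minimal usage sketch of this function; the vocab path, the sample sentences, the one-hot labels and max_len are assumptions for illustration, not from the original post:

from tokenizers import BertWordPieceTokenizer

# Hypothetical example data; in practice sentences/labels come from the corpus files
sentences = ["这场比赛非常精彩", "新车的油耗表现不错"]
labels = [[1, 0], [0, 1]]  # assumed one-hot labels, matching categorical_crossentropy below
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)  # assumed vocab path
train_x, train_y = create_inputs_targets(sentences, labels, max_len=64, tokenizer=tokenizer)
print(train_x[0].shape, train_x[1].shape, train_y.shape)  # (2, 64) (2, 64) (2, 2)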
Fine-tune a BERT model. The code is as follows:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import TFBertModel


class BertTextClassifier(object):
    def __init__(self, bert_model_name, label_num):
        self.label_num = label_num
        self.bert_model_name = bert_model_name

    def get_model(self):
        bert = TFBertModel.from_pretrained(self.bert_model_name)
        input_ids = keras.Input(shape=(None,), dtype=tf.int32, name="input_ids")
        attention_mask = keras.Input(shape=(None,), dtype=tf.int32, name="attention_mask")
        # index [1] is the pooled [CLS] representation, used here for classification
        outputs = bert(input_ids, attention_mask=attention_mask)[1]
        cla_outputs = layers.Dense(self.label_num, activation='softmax')(outputs)
        model = keras.Model(
            inputs=[input_ids, attention_mask],
            outputs=[cla_outputs])
        return model
def create_model(bert_model_name, label_nums):
    model = BertTextClassifier(bert_model_name, label_nums).get_model()
    # learning_rate replaces the deprecated lr argument in TF 2.x
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # categorical_crossentropy expects one-hot encoded labels
    model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall(),
                           tf.keras.metrics.AUC()])
    return model
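Since categorical_crossentropy expects one-hot labels, integer class ids produced by a tag2id mapping need to be converted first. One way to do this (not shown in the original post) is Keras' to_categorical:

from tensorflow.keras.utils import to_categorical

# Hypothetical integer class ids, e.g. produced by tag2id lookups
labels = [0, 2, 1]
one_hot_labels = to_categorical(labels, num_classes=len(tag2id))  # tag2id: label-name -> id mapping
print(one_hot_labels.shape)  # (3, num_classes)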
The model is trained with Keras, the high-level API in TensorFlow 2.x. The code is as follows:
import os

model = create_model(args["bert_model_name"], len(tag2id))
# model.summary()
model.fit(train_x,
          train_y,
          epochs=epoch,
          verbose=1,
          batch_size=batch_size,
          validation_data=(dev_x, dev_y),
          validation_batch_size=batch_size)
# Save the trained weights
model_path = os.path.join(args["output_path"], "classification_model.h5")
model.save_weights(model_path, overwrite=True)
# Export a SavedModel (pb) for TensorFlow Serving
tf.keras.models.save_model(model, args["pb_path"],
                           save_format="tf",
                           overwrite=True)
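To reuse the trained model later (e.g. for offline evaluation), the architecture can be rebuilt and the saved .h5 weights loaded back, and the exported SavedModel can be inspected directly. A minimal sketch under the same args/tag2id as above, not part of the original post:

# Rebuild the Keras model and restore the trained weights
model = create_model(args["bert_model_name"], len(tag2id))
model.load_weights(os.path.join(args["output_path"], "classification_model.h5"))

# Inspect the SavedModel exported for TensorFlow Serving, e.g. its serving signature
loaded = tf.saved_model.load(args["pb_path"])
print(loaded.signatures["serving_default"].structured_input_signature)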
The results on the Sogou news dataset are as follows:
              precision    recall  f1-score   support

        体育       1.00      1.00      1.00       209
        健康       0.94      0.98      0.96       180
        军事       0.99      0.99      0.99       208
        教育       0.98      0.94      0.96       197
        汽车       0.98      0.99      0.99       202

    accuracy                           0.98       996
   macro avg       0.98      0.98      0.98       996
weighted avg       0.98      0.98      0.98       996
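A report in this format can be produced with scikit-learn's classification_report; a sketch assuming dev_x/dev_y are the validation inputs and one-hot labels from above, and id2tag maps ids back to label names:

import numpy as np
from sklearn.metrics import classification_report

y_pred = np.argmax(model.predict(dev_x, batch_size=batch_size), axis=1)
y_true = np.argmax(dev_y, axis=1)
target_names = [id2tag[i] for i in range(len(id2tag))]
print(classification_report(y_true, y_pred, target_names=target_names, digits=2))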
For inference, the input text must first be converted into the model's input format, i.e. the tokenizer is used to produce the input_ids and attention_mask features. The code is as follows:
def create_infer_inputs(sentences, max_len, tokenizer):
    dataset_dict = {
        "input_ids": [],
        "attention_mask": [],
    }
    for i in range(len(sentences)):
        input_ids = []
        for idx, word in enumerate(sentences[i]):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
        # Truncate, then add '[CLS]' (101) and '[SEP]' (102) around the sentence
        input_ids = input_ids[:max_len - 2]
        input_ids = [101] + input_ids + [102]
        attention_mask = [1] * len(input_ids)
        # Pad to max_len; [PAD] is encoded as 0 in the vocab
        padding_len = max_len - len(input_ids)
        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["attention_mask"].append(attention_mask)
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["attention_mask"],
    ]
    return x
Prediction is served through Flask. The code is as follows:
@app.route("/classification", methods=['POST'])
def classification_predict():
    data = json.loads(request.get_data(), encoding="utf-8")
    sentence = data["context"]
    url = data["url"]
    # create_infer_inputs expects a list of sentences, so wrap the single text
    input_ids, attention_mask = create_infer_inputs([sentence], max_len, tokenizer)
    print("input_ids: ", input_ids)
    print("attention_mask: ", attention_mask)
    # numpy arrays are not JSON serializable, so convert them to lists
    data = json.dumps({"signature_name": "serving_default",
                       "inputs": {"input_ids": input_ids.tolist(),
                                  "attention_mask": attention_mask.tolist()}})
    headers = {"content-type": "application/json"}
    result = requests.post(url, data=data, headers=headers)
    print("result: ", result)
    if result.status_code == 200:
        result = json.loads(result.text)
        logits = np.array(result["outputs"])
        pred = np.argmax(logits, axis=1).tolist()
        pred_label = id2tag[pred[0]]
        print(pred_label)
        return_result = {"code": 200,
                         "context": sentence,
                         "label": pred_label}
        return jsonify(return_result)
    else:
        return_result = {"code": result.status_code,
                         "context": sentence,
                         "label": None}
        return jsonify(return_result)
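The route above relies on several globals (app, tokenizer, max_len, id2tag) created at startup; a minimal setup sketch, with the vocab path, sequence length and port chosen here as assumptions:

import json
import numpy as np
import requests
from flask import Flask, request, jsonify
from tokenizers import BertWordPieceTokenizer

app = Flask(__name__)
max_len = 128                                                    # assumed sequence length
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)  # assumed vocab path
id2tag = {v: k for k, v in tag2id.items()}                       # tag2id: the label mapping used in training

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)                           # assumed port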
Here url points to the model service deployed with Docker + TensorFlow Serving.
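A typical way to bring up such a serving container from the exported SavedModel (the paths, model name and ports below are assumptions, not from the original post):

docker run -p 8501:8501 \
    --mount type=bind,source=/path/to/pb_model,target=/models/classification \
    -e MODEL_NAME=classification -t tensorflow/serving

With the container running, the Flask endpoint can be called like this, where the url field is TensorFlow Serving's REST predict address:

import json
import requests

resp = requests.post(
    "http://localhost:5000/classification",
    data=json.dumps({
        "context": "这场比赛非常精彩",  # hypothetical input text
        "url": "http://localhost:8501/v1/models/classification:predict"
    }),
    headers={"content-type": "application/json"})
print(resp.json())  # {"code": ..., "context": ..., "label": ...}

If you spot any problems, corrections are welcome.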