基于Hugging Face的transformers包的微调模型训练

章嘉致
2023-12-01

1、训练

transformers API参考链接:https://huggingface.co/docs/transformers/v4.21.2/en/training

train.py

from datasets import load_dataset
from transformers import AutoTokenizer,AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import json

#from datasets import load_metric

# Set CUDA_VISIBLE_DEVICES so that only devices 1-7 are visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"

# Load the train/test CSV files as one dataset with two splits, then encode
# the "label" column as class ids (the label names become available through
# the split's features).
dataset = load_dataset(
    "csv",
    data_files={
        "train": "./weibo_train.csv",
        "test": "./weibo_test.csv",
    },
    cache_dir="./cache",
).class_encode_column("label")

# Number the class labels found in the loaded dataset to build the label map
# used for training and, later, for inference and accuracy computation.
def generate_label_map(dataset):
    """Return a ``{label_name: index}`` dict for the training split.

    The names are read from ``dataset['train'].features['label'].names`` and
    numbered from 0 in the order they appear there.
    """
    names = dataset['train'].features['label'].names
    return {name: position for position, name in enumerate(names)}

def save_label_map(dataset,label_map_file):
    """Write the label-to-id map of ``dataset`` to ``label_map_file`` as JSON.

    The map is produced by ``generate_label_map``, so only the labels of the
    training split define the label set of the model. The file is opened for
    writing with UTF-8 encoding.
    """
    mapping = generate_label_map(dataset)
    with open(label_map_file, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(mapping))

# Persist the label map to disk so it can be reused after training.
label_map_file = 'label2id.json'
save_label_map(dataset, label_map_file)

# NOTE: reading the map back from the file (kept below for reference) was
# reported by the author to cause errors in multi-GPU training, so the map is
# rebuilt in memory instead.
#with open(label_map_file,'r',encoding='utf-8') as fin:
#    label2id=json.load(fin)
label2id = generate_label_map(dataset)

# Training cannot proceed without at least one label. Abort with a message and
# a non-zero exit status rather than calling the interactive-only ``exit()``
# helper, which ended the run silently with status 0.
if not label2id:
    raise SystemExit("No labels found in the training data; nothing to train.")

# Inverse mapping: class id -> label name.
id2label = {idx: label for label, idx in label2id.items()}

# Load the tokenizer from ./bert-base-chinese (a path relative to the working
# directory).
tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")


def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module tokenizer.

    The tokenizer is called with ``truncation=True`` and ``max_length=45``, so
    longer inputs are cut to at most 45 tokens. Returns whatever the tokenizer
    returns for the given texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=45)
    return encoded

# Tokenize every split, feeding the examples to preprocess_function in batches.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

#small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))
# Collator that pads the examples of a batch with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model configuration carrying the number of classes and both label mappings.
config = AutoConfig.from_pretrained(
    "./bert-base-chinese",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
# Load the pretrained BERT weights into a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(
    "./bert-base-chinese",
    config=config,
)


def compute_metrics(eval_pred):
    """Return the accuracy of one evaluation pass as ``{"accuracy": float}``.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class
    of each example is the arg-max of its logits over the last axis.
    Assumes both values are NumPy arrays, as ``Trainer`` is documented to
    pass them -- TODO confirm for the installed transformers version.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Hyper-parameters of the run; evaluation and checkpointing both happen once
# per epoch, and checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=180,
    per_device_eval_batch_size=128,
    num_train_epochs=20,
    weight_decay=0.01,
    #fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# ``compute_metrics`` was previously referenced here without being defined
# anywhere, which raised NameError before training could start.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Show one tokenized training example as a sanity check before training.
print("training sample: ", trainer.train_dataset[1])

trainer.train()

print("finished")

2、推理

参考文档:https://huggingface.co/docs/transformers/v4.21.2/en/pipeline_tutorial

infer.py

from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset
import datasets
from sklearn.metrics import accuracy_score


# Load the CSV data; only the "test" split is used below.
#dataset = load_dataset("csv", data_files={"train": "", "test": "./weibo_test.csv"}, split='test')
dataset = load_dataset(
    "csv",
    data_files={"train": "weibo_train.csv", "test": "weibo_test.csv"},
    cache_dir="./cache",
)

# Load the fine-tuned checkpoint into a text-classification pipeline
# (device=0 selects the first GPU).
model_dir = './results/checkpoint-1200'
print('using checkpoint from dir:', model_dir)
pipe = pipeline(task="text-classification", device=0, model=model_dir)

# Run the model over the "text" column of the test split in batches and keep
# the predicted label name of every example.
preds = []
for out in pipe(KeyDataset(dataset['test'], "text"), batch_size=128, truncation="only_first"):
    print(out)
    #print(out['label'])
    preds.append(out['label'])

# Optional: dump the predictions to a file, one label per line.
# with open('pred.txt','w',encoding='utf8') as fout:
#     for label in preds:
#         fout.write(label)
#         fout.write('\n')

# Compute the accuracy on the test split. The pipeline yields label names as
# strings, whereas the raw CSV column may have been parsed as integers, so the
# gold labels are converted to strings to compare like with like.
# NOTE(review): assumes the checkpoint's label names equal str(<csv value>) --
# confirm against the label2id.json written during training.
y_true = [str(label) for label in dataset['test']['label']]
acc = accuracy_score(y_true, preds)
print('Acc on test data:{:.4f}'.format(acc))
 类似资料: