Transformers API reference: https://huggingface.co/docs/transformers/v4.21.2/en/training
train.py
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"
# The Trainer below references compute_metrics; the original load_metric import was
# commented out, so this is a minimal accuracy-only metric function as a stand-in.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}
# Load the dataset (train and test data)
dataset = load_dataset("csv", data_files={"train": "./weibo_train.csv", "test": "./weibo_test.csv"}, cache_dir="./cache")
dataset = dataset.class_encode_column("label") # encode the label column; this collects the label set from the training data
# Use the loaded dataset to assign an id to each label and build a label map,
# used during training and later for inference and accuracy computation.
def generate_label_map(dataset):
    labels = dataset['train'].features['label'].names
    label2id = dict()
    for idx, label in enumerate(labels):
        label2id[label] = idx
    return label2id
def save_label_map(dataset, label_map_file):
    # Only take the labels of the training data as the label set of the model.
    label2id = generate_label_map(dataset)
    with open(label_map_file, 'w', encoding='utf-8') as fout:
        json.dump(label2id, fout)
# Save the label map
label_map_file='label2id.json'
save_label_map(dataset,label_map_file)
# Read the label map back from file (note: with multi-GPU training, reading the
# file this way may cause errors)
#label2id = {}
#with open(label_map_file, 'r', encoding='utf-8') as fin:
#    label2id = json.load(fin)
label2id = generate_label_map(dataset)
if not label2id:
    exit()
id2label={id:label for label,id in label2id.items()}
# Load the tokenizer (a hub model id would be downloaded automatically; a local copy is used here)
tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=45)
tokenized_dataset = dataset.map(preprocess_function, batched=True)
#small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
config = AutoConfig.from_pretrained("./bert-base-chinese", num_labels=len(label2id), id2label=id2label, label2id=label2id)
# Load the pretrained BERT model
model = AutoModelForSequenceClassification.from_pretrained("./bert-base-chinese", config=config)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=180,
    per_device_eval_batch_size=128,
    num_train_epochs=20,
    weight_decay=0.01,
    #fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("training sample: ", trainer.train_dataset[1])
trainer.train()
print("finished")
2. Inference
Reference: https://huggingface.co/docs/transformers/v4.21.2/en/pipeline_tutorial
infer.py
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset
import datasets
from sklearn.metrics import accuracy_score
# Load the test data
#dataset = load_dataset("csv", data_files={"train": "", "test": "./weibo_test.csv"}, split='test')
dataset=load_dataset("csv", data_files={"train": "weibo_train.csv", "test": "weibo_test.csv"}, cache_dir="./cache")
# Load the fine-tuned model
model_dir='./results/checkpoint-1200'
print('using checkpoint from dir:',model_dir)
pipe = pipeline(task="text-classification",device=0,model=model_dir)
# Run model predictions
preds=[]
for out in pipe(KeyDataset(dataset['test'], "text"), batch_size=128, truncation="only_first"):
    print(out)
    #print(out['label'])
    preds.append(out['label'])
'''
with open('pred.txt', 'w', encoding='utf8') as fout:
    for label in preds:
        fout.write(label)
        fout.write('\n')
'''
# Compute accuracy
y_true=dataset['test']['label']
acc=accuracy_score(y_true,preds)
print('Acc on test data:{:.4f}'.format(acc))
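Accuracy alone can hide per-class behavior. If a per-class breakdown is also wanted, one option (a sketch, assuming y_true and preds use the same label strings, which the accuracy computation above already relies on) is sklearn's classification_report:
from sklearn.metrics import classification_report
# Per-class precision, recall, and F1 over the test predictions
print(classification_report(y_true, preds, digits=4))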