Wrong tensor type when trying to do the HuggingFace tutorial (pytorch)

I have recently been trying to get some hands-on experience with the transformers library from Hugging Face. Since I am an absolute beginner with PyTorch (and with deep learning in general), I started from the introduction I could find here.

Here is the code that installs the dependencies:

#!pip install transformers
!pip install transformers[sentencepiece] # transformers plus the sentencepiece extra
!pip install datasets # datasets from the Hugging Face Hub
!pip install tqdm

Here is the code they suggest for fine-tuning BERT on the MRPC dataset (part of the GLUE benchmark). Each "sample" in this dataset contains two sentences, so in the tokenizer we have to pass both sentence1 and sentence2.

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"

# load the tokenizer and define how tokenization works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# tokenizer will use dynamic padding (https://huggingface.co/course/chapter3/2?fw=pt)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# remove unnecessary columns from the data and format as torch tensors
tokenized_datasets = tokenized_datasets.remove_columns(
  ["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(
  tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
  tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# loading model and training requirements
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps
)
print(num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

# training loop:
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
    # assert 1==0

This works perfectly for me in Google Colab. I wanted to do the same thing with another dataset, sst2. The code I use is very similar to the one above; the only lines that change are the ones that load the data and the tokenizer call (each example has a single sentence instead of two). I have already double-checked that the tokenizer works fine, roughly along the lines of the quick check below.
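(A minimal sketch of that check, assuming the same bert-base-uncased checkpoint; the sample sentence is only an illustration, not taken from the dataset.)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# a single-sentence call should return integer input_ids plus an attention mask
encoded = tokenizer("this movie was surprisingly good", truncation=True)
print(encoded.keys())        # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(encoded["input_ids"])  # a plain list of integer token ids

Here is the full code: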

# imports
import torch
from datasets import load_dataset # datasets from huggingface
# tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
# training
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm

# Hyperparameters
batch_size = 8
learning_rate = 5e-5
num_epochs = 3
num_warmup_steps = 0

# load the dataset and choose the checkpoint
raw_datasets = load_dataset("glue", "sst2")
checkpoint = "bert-base-uncased"
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenization of dataset
def tokenize_function(example):
  return tokenizer(example["sentence"], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels") 
tokenized_datasets.set_format("torch")

# setting DataLoader
train_dataloader = DataLoader(
  tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
  tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)

# import model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

# setup training loop
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_training_steps = num_epochs * len(train_dataloader)
print(num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
# choose device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()} 
    for k,v in batch.items():
      print(f"key={k},v.dtype={v.dtype}, type(v)={type(v)}")
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
        
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

Here is the error I get:

RuntimeError                              Traceback (most recent call last)
<ipython-input-11-7893d7715ac2> in <module>()
     69     outputs = model(**batch)
     70     loss = outputs.loss
---> 71     loss.backward()
     72 
     73     optimizer.step()

1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    147     Variable._execution_engine.run_backward(
    148         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    150 
    151 

RuntimeError: Found dtype Long but expected Float

This seems like a very silly error, but as I said, I am an absolute PyTorch beginner and I have a hard time knowing where to start fixing it. I checked the types of the values in batch.items(), and in both cases they are torch.int64 (i.e. torch.long). I tried changing the attention_mask and input_ids values to torch.float32, but I got the same error message.

Thanks in advance.

Python version and packages:

I found the source of the problem. It comes from the line

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Since the dataset has 2 classes, the correct way to instantiate the model is

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

With this change, my code now works.
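For anyone hitting the same error: as far as I understand it, with num_labels=1 the model treats the task as regression and computes an MSE loss on the logits, and MSE requires float targets, whereas the GLUE labels arrive as torch.int64 (Long), which is exactly the mismatch in the traceback. With num_labels=2 the model uses a cross-entropy loss instead, which expects Long class indices. A minimal sketch of the two cases in plain PyTorch (independent of transformers):

import torch
import torch.nn as nn

labels = torch.tensor([0, 1, 1, 0, 1, 0, 0, 1])  # torch.int64 (Long), as in the dataset

# num_labels=1 -> regression head -> MSE loss, which needs float targets:
logits_reg = torch.randn(8, 1)
# nn.MSELoss()(logits_reg.squeeze(), labels)  # raises: Found dtype Long but expected Float

# num_labels=2 -> classification head -> cross-entropy loss, which expects Long class indices:
logits_cls = torch.randn(8, 2)
loss = nn.CrossEntropyLoss()(logits_cls, labels)  # works
print(loss)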