Wrong tensor type when trying to do the HuggingFace tutorial (PyTorch)
I have recently been trying to get some hands-on experience with the transformers library from Hugging Face. Since I am an absolute beginner with PyTorch (and with deep learning in general), I started with the introduction that can be found here.
Here is the code that installs the dependencies:
#!pip install transformers
!pip install transformers[sentencepiece] # includes transformers dependencies
!pip install datasets # datasets from huggingface hub
!pip install tqdm
Here is the code they suggest for fine-tuning BERT on the MRPC dataset (used in the GLUE benchmark). This dataset contains two sentences per "sample", so in the tokenizer we have to pass both sentence1 and sentence2 (see the quick tokenizer check right below, before the full script).
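As a quick sanity check (my own hypothetical snippet, not from the tutorial), calling a BERT tokenizer on a sentence pair returns token_type_ids that mark which tokens belong to which sentence, which is what tokenize_function below relies on:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# Passing two sentences encodes them as a single pair.
enc = tok("The cat sat on the mat.", "A cat was sitting on a mat.")
print(list(enc.keys()))       # ['input_ids', 'token_type_ids', 'attention_mask']
print(enc["token_type_ids"])  # 0s for the first sentence, 1s for the second

The tutorial script itself: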
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
# functions defining how the tokenizer works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# tokenizer will use dynamic padding (https://huggingface.co/course/chapter3/2?fw=pt)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# remove unnecessary columns from data and format in torch tensors
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
# loading model and training requirements
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print(num_training_steps)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
# training loop:
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
# assert 1==0
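The eval_dataloader is defined above but never used in this snippet. For completeness, a minimal evaluation pass (my own sketch, computing plain accuracy rather than the official GLUE metric) could look like this:

# My own sketch (not from the tutorial): accuracy on the validation split.
model.eval()
correct, total = 0, 0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs.logits, dim=-1)
    correct += (predictions == batch["labels"]).sum().item()
    total += batch["labels"].size(0)
print(f"validation accuracy: {correct / total:.4f}")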
This worked perfectly for me in Google Colab. I then wanted to do the same thing with another dataset, sst2. The code I use is very similar to the one above; the only lines that change are the ones that load the data and the tokenizer (there is one sentence per feature instead of two). I have double-checked that the tokenizer works fine. Here is my code:
# imports
import torch
from datasets import load_dataset # datasets from huggingface
# tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
# training
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm
# Hyperparameters
batch_size = 8
learning_rate = 5e-5
num_epochs = 3
num_warmup_steps = 0
# load dataset and choosing checkpoint
raw_datasets = load_dataset("glue", "sst2")
checkpoint = "bert-base-uncased"
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenization of dataset
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# setting DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)
# import model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# setup training loop
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)
print(num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
# chose device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        for k, v in batch.items():
            print(f"key={k}, v.dtype={v.dtype}, type(v)={type(v)}")
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
Here is the error I get:
RuntimeError Traceback (most recent call last)
<ipython-input-11-7893d7715ac2> in <module>()
69 outputs = model(**batch)
70 loss = outputs.loss
---> 71 loss.backward()
72
73 optimizer.step()
1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
150
151
RuntimeError: Found dtype Long but expected Float
This seems like a very silly mistake, but as I said, I am an absolute PyTorch beginner and I find it hard to know where to start debugging this. I checked the types of the values in batch.items(); in both cases they are torch.int64 (i.e. torch.long). I tried changing the attention_mask and input_ids values to torch.float32, but I got the same error message.
Thanks in advance.
Python version and packages:
- python 3.7.20
- torch 1.9.0+cu102
- transformers 4.8.2
- GPU: Tesla T4 (also tried a Tesla P4)
I found the source of the problem. It comes from the line
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
Since the dataset has 2 classes, the correct way to instantiate the model is
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
With this modification, my code now works.
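For context (my own explanation, not part of the original answer): when num_labels=1, the sequence-classification head treats the task as regression and computes an MSE loss, which expects float targets, whereas the SST-2 labels are torch.long class indices; with num_labels=2 the head uses a cross-entropy loss, which expects long labels, so the dtype mismatch disappears. A hypothetical way to see the mismatch:

# Hypothetical check (mine), run after building the dataloader and the model:
sample_batch = next(iter(train_dataloader))
print(model.config.num_labels)       # 1 -> the head assumes regression (MSE loss, float targets)
print(sample_batch["labels"].dtype)  # torch.int64 -> integer class labels, as cross-entropy expects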