PyTorch: Expected all tensors to be on the same device
I moved my model and my inputs to the same device, but I still get this runtime error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Here is my code. First, my model implementation:
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature= 2, n_output= 1):
        super().__init__()
        self.hiddens = []
        n_hidden_in = n_feature
        for n_hidden in n_hiddens :
            self.hiddens.append( torch.nn.Linear(n_hidden_in, n_hidden) )  # hidden layer
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden, n_output)  # output layer

    def forward(self, x):
        for hidden in self.hiddens :
            x = F.relu(hidden(x))  # activation function for hidden layer
        x = self.predict(x)  # linear output
        return x
Then I define my data loaders. Here, X and y are numpy arrays:
from torch.utils.data import TensorDataset, DataLoader
# Split training/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 42)
X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)
X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  # create your dataset
train_dataloader = DataLoader(train_dataset, batch_size= 1000)  # create your dataloader
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)  # create your dataset
test_dataloader = DataLoader(test_dataset, batch_size= 1000)  # create your dataloader
Here is where I train my model. The error occurs at the line "outputs = regressor(inputs)":
NUM_EPOCHS = 2000
BATCH_SIZE = 1000
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device used : {device}")
# 1 hidden layer
total_num_nodes = 256
regressor = Net(n_hiddens= [total_num_nodes]).to(device)
optimizer = torch.optim.SGD(regressor.parameters(), lr=0.2, momentum= 0.1, nesterov= True)
loss_func = torch.nn.MSELoss() # this is for regression mean squared loss
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs, values = data
        inputs = inputs.float().to(device)
        values = values.float().to(device)
        optimizer.zero_grad()  # clear gradients for next train
        print(f"Input device is : cuda:{inputs.get_device()}")
        print(f"Target value device is : cuda:{values.get_device()}")
        print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")
        outputs = regressor(inputs)  # <-- This is where I have the error
        loss = loss_func(outputs, values)
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()  # apply gradients
Here is the output of my print statements:
Device used : cuda:0
Input device is : cuda:0
Target value device is : cuda:0
Is model on cuda ? :True
This should mean that my model and my tensors are all on the same device, so why do I still get this error?
The error log is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-6-5234b830bebc> in <module>()
24 print(f"Target value device is : cuda:{values.get_device()}")
25 print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")
---> 26 outputs = regressor(inputs)
27 loss = loss_func(outputs, values)
28 loss.backward() # backpropagation, compute gradients
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-4-56c54b30b771> in forward(self, x)
16 def forward(self, x):
17 for hidden in self.hiddens :
---> 18 x = F.relu(hidden(x)) # activation function for hidden layer
19 x = self.predict(x) # linear output
20 return x
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/linear.py in forward(self, input)
101
102 def forward(self, input: Tensor) -> Tensor:
--> 103 return F.linear(input, self.weight, self.bias)
104
105 def extra_repr(self) -> str:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Thank you very much.
TL;DR: use nn.ModuleList instead of a plain Python list to store the hidden layers of Net.
All your hidden layers are stored in a plain Python list (self.hiddens) inside Net. Layers kept in an ordinary list are never registered as sub-modules, so when you move the model to the GPU with .to(device), PyTorch has no way of knowing that the elements of that list should be moved as well; their parameters also never show up in regressor.parameters(). That is why your print check is misleading: next(regressor.parameters()) only returns a parameter of the registered self.predict layer, which really is on cuda:0, while the hidden Linear layers are still on the CPU.
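You can see this with a minimal standalone sketch (the class name Broken is just for illustration, it is not from your code):

import torch

class Broken(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.hiddens = [torch.nn.Linear(2, 4)]  # plain Python list -> NOT registered as sub-modules
        self.predict = torch.nn.Linear(4, 1)    # assigned directly -> registered

m = Broken()
print(len(list(m.parameters())))         # 2: only predict's weight and bias are visible
if torch.cuda.is_available():
    m = m.to('cuda:0')
    print(m.predict.weight.device)       # cuda:0
    print(m.hiddens[0].weight.device)    # cpu -- .to() never touched this layer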
However, if you instead set self.hiddens = nn.ModuleList(), PyTorch treats every element of this special list as an nn.Module and recursively moves them to the same device as the rest of Net.
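Concretely, the fix is a one-line change in the constructor. A sketch of the corrected class (the rest of your code can stay as it is):

import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature=2, n_output=1):
        super().__init__()
        self.hiddens = torch.nn.ModuleList()  # registered container instead of a plain list
        n_hidden_in = n_feature
        for n_hidden in n_hiddens:
            self.hiddens.append(torch.nn.Linear(n_hidden_in, n_hidden))  # hidden layer
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden_in, n_output)  # output layer (n_hidden_in == last hidden width)

    def forward(self, x):
        for hidden in self.hiddens:
            x = F.relu(hidden(x))  # activation function for hidden layer
        return self.predict(x)     # linear output

With this change, regressor.to(device) also moves every hidden layer, and regressor.parameters() (and therefore your SGD optimizer) now includes their weights; you can check with all(p.is_cuda for p in regressor.parameters()).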
See these answers for more details.