Using captum with nn.Embedding getting RuntimeError

I am using the captum library and I am getting the following error. Here is the complete code to reproduce it.

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = 1024
embedding_dim = 32
seq_len = 128
num_classes = 5
hidden_dim = 256

class predictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim 
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear = nn.Linear(self.seq_len*self.embedding_dim, self.num_classes)

    def forward(self, x):
        x = self.embedding(x.long())
        x = x.reshape(-1, self.seq_len*self.embedding_dim)
        x = F.relu(self.linear(x))
        return x

class wrapper_predictor(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, x):
        x = self.model(x)
        x = F.softmax(x, dim=1)
        return x
    
indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)

model = predictor().to(device)
wrapper_model = wrapper_predictor(model).to(device)

ig = IntegratedGradients(wrapper_model)
attributions, delta = ig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)

I solved the issue by using LayerIntegratedGradients instead of IntegratedGradients.
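The root cause, as far as I understand it: IntegratedGradients differentiates the output with respect to the input tensor itself, but the indices fed to nn.Embedding are cast to integers and never take part in the differentiable graph, which is exactly what the "not have been used in the graph" error complains about. A minimal sketch of the same failure outside captum (toy sizes, purely illustrative):

import torch
import torch.nn as nn

emb = nn.Embedding(10, 4)
idx = torch.tensor([1., 2., 3.], requires_grad=True)
out = emb(idx.long()).sum()    # .long() detaches idx from the graph
torch.autograd.grad(out, idx)  # raises the same "not been used in the graph" RuntimeError

LayerIntegratedGradients side-steps this by attributing with respect to the output of the embedding layer instead of the raw indices.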

Here is a link to read more and to see other possible solutions: https://captum.ai/tutorials/IMDB_TorchText_Interpret
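One alternative described there keeps plain IntegratedGradients: replace the embedding with captum's interpretable-embedding helper and attribute over pre-computed embedding vectors. A rough, untested sketch against the original model above (note that the forward would have to drop the x.long() cast in that case, because it then receives float embeddings instead of indices):

from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

# Swap model.embedding for a pass-through layer so gradients can reach the embeddings.
interpretable_emb = configure_interpretable_embedding_layer(model, 'embedding')

# Pre-compute the embeddings for the integer indices and attribute with respect to them.
input_emb = interpretable_emb.indices_to_embeddings(indexes.long())
ig = IntegratedGradients(model)
attributions = ig.attribute(inputs=input_emb, target=0)

# Restore the original nn.Embedding afterwards.
remove_interpretable_embedding_layer(model, interpretable_emb)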

Below is sample code that instantiates LayerIntegratedGradients with the model's forward function and its nn.Embedding layer, following the example given in the link:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients, LayerIntegratedGradients
from torchsummary import summary

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = 1024
embedding_dim = 1
seq_len = 128
num_classes = 5
hidden_dim = 256

class predictor(nn.Module):
    def __init__(self):
        super(predictor, self).__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim 
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

        self.embedding = nn.Sequential(
            nn.Embedding(self.vocab_size, self.embedding_dim),
        )
        # re-initialise the embedding weights; they must stay an nn.Parameter and
        # be set on the nn.Embedding itself, not on the nn.Sequential wrapper
        self.embedding[0].weight = nn.Parameter(torch.randn(self.vocab_size, self.embedding_dim))
        self.fc = nn.Sequential(
            nn.Linear(self.seq_len*self.embedding_dim, self.hidden_dim, device=device, bias=False),
            nn.Linear(self.hidden_dim, self.num_classes, device=device, bias=False),
        )
    def forward(self, x):
        x = self.embedding(x.long())
        x = x.view(-1, self.seq_len*self.embedding_dim)
        x = self.fc(x)
        return x

class wrapper_predictor(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, x):
        x = self.model(x)
        x = F.softmax(x, dim=1)  # remove this softmax from the forward pass if the attribution scores come out too low
        return x

model = predictor().to(device)

indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)
input_size = indexes.shape
summary(model=model, input_size=input_size, batch_size=-1, device='cuda' if torch.cuda.is_available() else 'cpu')

wrapper_model = wrapper_predictor(model).to(device)

# Attribute with respect to the embedding layer. Pass wrapper_model instead of
# model here if you want the attributions computed on the softmax outputs.
lig = LayerIntegratedGradients(model, model.embedding)
attributions, delta = lig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)
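After this runs, the attributions are per embedding element; the usual follow-up (the same post-processing as in the linked tutorial) is to collapse the embedding dimension and normalise so you get one score per token. A minimal sketch, assuming the shapes produced by the code above:

# One attribution score per token, normalised for comparison or plotting.
token_attr = attributions.sum(dim=-1).squeeze()
token_attr = token_attr / torch.norm(token_attr)
print(token_attr.shape, delta)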