What is the purpose of the add_loss function in Keras?
I recently stumbled upon variational autoencoders and tried to get them working on MNIST using Keras, following a tutorial I found on GitHub.
My question concerns the following lines of code:
# Build model
vae = Model(x, x_decoded_mean)
# Calculate custom loss
xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)
# Compile
vae.add_loss(vae_loss)
vae.compile(optimizer='rmsprop')
Why use add_loss instead of specifying it as a compile option? Something like vae.compile(optimizer='rmsprop', loss=vae_loss) does not seem to work and throws the following error:
ValueError: The model cannot be compiled because it has no loss to optimize.
How is this function different from a custom loss function that I could pass as an argument to Model.fit()?
Thanks in advance!
P.S.: I know there are several issues related to this on GitHub, but most of them are open and uncommented. If this has already been resolved, please share the link!
Edit 1
I removed the line that adds the loss to the model and used the loss argument of the compile function instead. It now looks like this:
# Build model
vae = Model(x, x_decoded_mean)
# Calculate custom loss
xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)
# Compile
vae.compile(optimizer='rmsprop', loss=vae_loss)
This throws a TypeError:
TypeError: Using a 'tf.Tensor' as a Python 'bool' is not allowed. Use 'if t is not None:' instead of 'if t:' to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
Edit 2
Thanks to @MarioZ's efforts, I was able to find a workaround.
# Build model
vae = Model(x, x_decoded_mean)
# Calculate custom loss in separate function
def vae_loss(x, x_decoded_mean):
    xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    vae_loss = K.mean(xent_loss + kl_loss)
    return vae_loss
# Compile
vae.compile(optimizer='rmsprop', loss=vae_loss)
...
vae.fit(x_train,
        x_train,  # <-- did not need this previously
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_test, x_test))  # <-- worked with (x_test, None) before
For some odd reason, I had to explicitly specify y and y_test while fitting the model; originally, I did not need to do this. The samples produced seem reasonable to me.
Although I could work around this, I still don't know what the differences and drawbacks of these two methods are (other than requiring a different syntax). Can someone give me more insight?
You need to change the compile line to
vae.compile(optimizer='rmsprop', loss=vae_loss)
Try this:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model, Sequential
from keras.layers import Input, Lambda, Dense, Dropout, Layer, Bidirectional, Embedding, LSTM, RepeatVector, TimeDistributed, BatchNormalization, Activation
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from keras import backend as K
from keras import metrics
from scipy.stats import norm
from keras.utils import to_categorical
from keras import initializers
from keras import objectives

np.random.seed(22)

data1 = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0], dtype='int32')
data2 = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0], dtype='int32')
data3 = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0], dtype='int32')

#train = np.zeros(shape=(992,54))
#test = np.zeros(shape=(921,54))
train = np.zeros(shape=(300, 54))
test = np.zeros(shape=(300, 54))

# Fill train/test with copies of the three patterns.
for n, i in enumerate(train):
    if n <= 100:
        train[n] = data1
    elif n > 100 and n <= 200:
        train[n] = data2
    elif n > 200:
        train[n] = data3
for n, i in enumerate(test):
    if n <= 100:
        test[n] = data1
    elif n > 100 and n <= 200:
        test[n] = data2
    elif n > 200:
        test[n] = data3

batch_size = 5
original_dim = train.shape[1]
intermediate_dim45 = 45
intermediate_dim35 = 35
intermediate_dim25 = 25
intermediate_dim15 = 15
intermediate_dim10 = 10
intermediate_dim5 = 5
latent_dim = 3
epochs = 50
epsilon_std = 1.0

# Reparameterization trick: draw z from N(z_mean, exp(z_log_var)).
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# Encoder
x = Input(shape=(original_dim,), name='first_input_mario')
h1 = Dense(intermediate_dim45, activation='relu', name='h1')(x)
hD = Dropout(0.5)(h1)
h2 = Dense(intermediate_dim25, activation='relu', name='h2')(hD)
h3 = Dense(intermediate_dim10, activation='relu', name='h3')(h2)
h = Dense(intermediate_dim5, activation='relu', name='h')(h3)  # was relu
h = Dropout(0.1)(h)
z_mean = Dense(latent_dim, activation='relu')(h)
z_log_var = Dense(latent_dim, activation='relu')(h)
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Decoder
decoder_h = Dense(latent_dim, activation='relu')
decoder_h1 = Dense(intermediate_dim5, activation='relu')
decoder_h2 = Dense(intermediate_dim10, activation='relu')
decoder_h3 = Dense(intermediate_dim25, activation='relu')
decoder_h4 = Dense(intermediate_dim45, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
h_decoded1 = decoder_h1(h_decoded)
h_decoded2 = decoder_h2(h_decoded1)
h_decoded3 = decoder_h3(h_decoded2)
h_decoded4 = decoder_h4(h_decoded3)
x_decoded_mean = decoder_mean(h_decoded4)

vae = Model(x, x_decoded_mean)

# Custom loss: reconstruction cross-entropy + KL divergence.
def vae_loss(x, x_decoded_mean):
    xent_loss = objectives.binary_crossentropy(x, x_decoded_mean)
    kl_loss = -0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
    loss = xent_loss + kl_loss
    return loss

vae.compile(optimizer='rmsprop', loss=vae_loss)
vae.fit(train, train, batch_size=batch_size, epochs=epochs, shuffle=True,
        validation_data=(test, test))

# Standalone encoder and generator (decoder) models.
encoder = Model(x, z_mean)

decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_h_decoded1 = decoder_h1(_h_decoded)
_h_decoded2 = decoder_h2(_h_decoded1)
_h_decoded3 = decoder_h3(_h_decoded2)
_h_decoded4 = decoder_h4(_h_decoded3)
_x_decoded_mean = decoder_mean(_h_decoded4)
generator = Model(decoder_input, _x_decoded_mean)
generator.summary()
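Once trained, the encoder and the generator can then be used on their own (a hypothetical usage sketch, not part of the original code):

encoded = encoder.predict(test, batch_size=batch_size)   # inputs -> latent means, shape (300, 3)
latent_points = np.random.normal(size=(5, latent_dim))   # sample points in latent space
generated = generator.predict(latent_points)             # latent points -> outputs, shape (5, 54)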
I will try to answer the original question of why model.add_loss() is being used instead of specifying a custom loss function for model.compile(loss=...).
All loss functions in Keras always take two parameters, y_true and y_pred. Have a look at the definitions of the various standard loss functions available in Keras: they all have these two parameters. They are the 'targets' (the Y variable in many textbooks) and the actual output of the model. Most standard loss functions can be written as an expression of these two tensors. But some more complex losses cannot be written that way. For your VAE example this is the case, because the loss function also depends on additional tensors, namely z_log_var and z_mean, which are not available to the loss function. Using model.add_loss() has no such restriction and allows you to write much more complex losses that depend on many other tensors, but at the price of being more dependent on the model, whereas the standard loss functions work with just any model.
(Note: The code proposed in the other answers here is somewhat cheating, since they just use global variables to sneak in the additional required dependencies. This makes the loss function not a true function in the mathematical sense. I consider this to be much less clean code and I expect it to be more error-prone.)
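As a minimal sketch, this is what the add_loss() route looks like (reusing only the tensors x, x_decoded_mean, z_mean, z_log_var and the constants from the question; nothing new is assumed):

xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

vae = Model(x, x_decoded_mean)
vae.add_loss(K.mean(xent_loss + kl_loss))  # the loss may reference any tensor in the graph
vae.compile(optimizer='rmsprop')           # note: no loss= argument
vae.fit(x_train, shuffle=True, epochs=epochs, batch_size=batch_size)  # no targets needed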
JIH's answer is right of course, but maybe it is useful to add:
model.add_loss() has no restrictions, but it also removes the comfort of using, for example, targets in model.fit().
If you have a loss that depends on additional parameters of the model, of other models, or on external variables, you can still use a Keras-style encapsulated loss function by writing an encapsulating function where you pass in all the additional parameters:
def loss_carrier(extra_param1, extra_param2):
    def loss(y_true, y_pred):
        # x = complicated math involving extra_param1, extra_param2, y_true, y_pred
        # Remember to use tensor ops, e.g. K.sum, K.square, K.mean.
        # Also remember that if extra_param1, extra_param2 are variable tensors
        # instead of simple floats, you need to have them defined as
        # inputs=(main, extra_param1, extra_param2) in your keras.Model
        # instantiation, and defined as keras.Input or tf.placeholder
        # with the right shape.
        return x
    return loss

model.compile(optimizer='adam', loss=loss_carrier(extra_param1, extra_param2))
The trick is the last line: it returns a function with only the two arguments y_true and y_pred, which is what Keras expects, while the extra parameters are captured in the closure; note that you pass the carrier called, loss_carrier(extra_param1, extra_param2), to compile.
It may look more convoluted than the model.add_loss version, but the loss stays modular.
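For instance, a hypothetical weighted MSE built with this pattern (weighted_mse and weight_factor are illustrative names, not from the original answer; model stands for any Keras model built as above):

from keras import backend as K

def weighted_mse(weight_factor):
    def loss(y_true, y_pred):
        # plain MSE scaled by an extra coefficient captured in the closure
        return weight_factor * K.mean(K.square(y_true - y_pred))
    return loss

model.compile(optimizer='adam', loss=weighted_mse(0.5))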
I was wondering about the same query, and some related things like how to add a loss function within intermediate layers. Sharing some of the observed information here, hoping it may help others. It's true that the standard keras loss functions only take two arguments, y_true and y_pred. But while experimenting, there can be cases where we need some external parameters or coefficients when computing with these two values (y_true, y_pred). This can be needed at the last layer as usual, or somewhere in the middle of the model's layers.
model.add_loss()
The accepted answer correctly describes the model.add_loss() function. It can depend on layer inputs (tensors). According to the official doc, when writing the call method of a custom layer or a subclassed model, we may want to compute scalar quantities that we want to minimize during training (e.g. regularization losses). We can use the add_loss() layer method to keep track of such loss terms. For instance, an activity regularization loss depends on the inputs passed when calling a layer. Here's an example of a layer that adds a sparsity regularization loss based on the L2 norm of the inputs:
import tensorflow as tf
from tensorflow.keras.layers import Layer

class MyActivityRegularizer(Layer):
    """Layer that creates an activity sparsity regularization loss."""

    def __init__(self, rate=1e-2):
        super(MyActivityRegularizer, self).__init__()
        self.rate = rate

    def call(self, inputs):
        # We use `add_loss` to create a regularization loss
        # that depends on the inputs.
        self.add_loss(self.rate * tf.reduce_sum(tf.square(inputs)))
        return inputs
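As a quick sanity check (a hypothetical snippet, not from the official doc), calling the layer once registers the penalty in its .losses list:

reg_layer = MyActivityRegularizer(rate=1e-2)
_ = reg_layer(tf.ones((2, 4)))   # 8 entries, each 1.0
print(reg_layer.losses)          # one scalar: 1e-2 * 8.0 = 0.08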
Loss values added via add_loss can be retrieved in the .losses list property of any Layer or Model (they are recursively retrieved from every underlying layer):
from tensorflow.keras import layers

class SparseMLP(Layer):
    """Stack of Linear layers with a sparsity regularization loss."""

    def __init__(self, output_dim):
        super(SparseMLP, self).__init__()
        self.dense_1 = layers.Dense(32, activation=tf.nn.relu)
        self.regularization = MyActivityRegularizer(1e-2)
        self.dense_2 = layers.Dense(output_dim)

    def call(self, inputs):
        x = self.dense_1(inputs)
        x = self.regularization(x)
        return self.dense_2(x)

mlp = SparseMLP(1)
y = mlp(tf.ones((10, 10)))
print(mlp.losses)  # List containing one float32 scalar
Also note that when using model.fit(), such loss terms are handled automatically. When writing a custom training loop, however, we should retrieve these terms by hand from model.losses, like this:
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

# Iterate over the batches of a dataset.
for x, y in dataset:
    with tf.GradientTape() as tape:
        # Forward pass.
        logits = model(x)
        # Loss value for this batch.
        loss_value = loss_fn(y, logits)
        # Add extra loss terms to the loss value.
        loss_value += sum(model.losses)  # < ------------- HERE ---------
    # Update the weights of the model to minimize the loss value.
    gradients = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
Custom losses
With model.add_loss(), (AFAIK) we can use it somewhere in the middle of the network, and there we are no longer bound to only two parameters, i.e. y_true and y_pred. But what if we also want to attribute external parameters or coefficients to the loss function of the network's last layer? Nric's answer is right. But it can also be achieved by subclassing the tf.keras.losses.Loss class and implementing the following two methods:
__init__(self): accepts parameters to pass during the call of your loss function
call(self, y_true, y_pred): use the targets (y_true) and the model predictions (y_pred) to compute the model's loss
Here is an example of a custom MSE, implemented by subclassing the tf.keras.losses.Loss class. And here again, we are no longer bound to only the two parameters y_true and y_pred.
import tensorflow as tf
from tensorflow import keras

class CustomMSE(keras.losses.Loss):
    def __init__(self, regularization_factor=0.1, name="custom_mse"):
        super().__init__(name=name)
        self.regularization_factor = regularization_factor

    def call(self, y_true, y_pred):
        mse = tf.math.reduce_mean(tf.square(y_true - y_pred))
        reg = tf.math.reduce_mean(tf.square(0.5 - y_pred))
        return mse + reg * self.regularization_factor

model.compile(optimizer=..., loss=CustomMSE())
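A hypothetical end-to-end sketch of using this class (the toy model and random data are placeholders, not from the original answer):

import numpy as np
toy_model = keras.Sequential([keras.layers.Dense(1, input_shape=(4,))])
toy_model.compile(optimizer=keras.optimizers.Adam(), loss=CustomMSE(regularization_factor=0.1))
toy_model.fit(np.random.rand(16, 4), np.random.rand(16, 1), epochs=2, verbose=0)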