How does one use keras add_weight() vars with tensorflow probability distributions?

I am creating a new Keras layer that takes a vector of input data and is parameterized by 2 scalars, a mean and a standard deviation. I model the input data as a normal distribution and estimate its mean and variance via gradient descent. However, when I initialize tfp.Normal(mu, sigma), where mu and sigma come from add_weight() during build(), the gradients do not propagate through mu and sigma.

The TensorFlow Probability documentation states that you can pass trainable variables in as a distribution's parameters and backpropagate through them. How do I get this to work in Keras?
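For reference, outside of Keras the pattern the TFP docs describe looks roughly like this (a minimal sketch with made-up variable names; the distribution is built inside the GradientTape so the trainable variables stay connected to the loss):

import tensorflow as tf
import tensorflow_probability as tfp

loc = tf.Variable(0.0)    # trainable mean
scale = tf.Variable(1.0)  # trainable standard deviation
data = tf.random.normal([100])

with tf.GradientTape() as tape:
    # distribution constructed from the variables inside the tape
    dist = tfp.distributions.Normal(loc, scale)
    nll = -tf.reduce_sum(dist.log_prob(data))

print(tape.gradient(nll, [loc, scale]))  # both gradients are non-None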

Below is a minimal working example.

import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
tfk = tf.keras
tfkl = tf.keras.layers
tfd = tfp.distributions
tfpl = tfp.layers
EPS = 1e-5

batch_size = 4
N = 100
x = np.random.randn(batch_size, N)

class NormalLikelihood(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihood, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight("mean", shape=[1], initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=1), dtype=tf.float32)
        self.sigma = self.add_weight("std", shape=[1], initializer=tf.keras.initializers.RandomUniform(minval=EPS, maxval=5.0, seed=None), constraint=tf.keras.constraints.non_neg(), dtype=tf.float32)
        self.distribution = tfp.distributions.Normal(self.mu[0], self.sigma[0])

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1-1e-3)
        return r

input_layer = tf.keras.layers.Input(shape=(100,))
r = NormalLikelihood()(input_layer)
r = -tf.reduce_sum(tf.math.log(r))
model = tf.keras.models.Model(input_layer, r)
model.add_loss(r)
model.compile(optimizer='rmsprop', loss=None)
model.fit(x, y=None)

This code raises builtins.ValueError: No gradients provided for any variable: ['normal_likelihood/mean:0', 'normal_likelihood/std:0'], which is not what I expect. The desired behavior is for ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] to receive gradients.

See the code in Google Colab: https://colab.research.google.com/drive/1_u4XTCIH-2qwNSgv9zkZiCG_zeCIEZGp?usp=sharing

Change tfp.distributions.Normal(self.mu[0], self.sigma[0]) to tfp.distributions.Normal(self.mu, self.sigma).

The reason this works is that, under the hood of the Keras .fit() method, the gradient computation looks for trainable variables. When you index into the model's weights, you are computing gradients with respect to constants, which breaks the chain-rule connectivity.
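You can see that effect in isolation (a small sketch with a hypothetical variable v): indexing the variable outside the tape produces a plain tensor, so nothing connects the loss back to the variable.

import tensorflow as tf

v = tf.Variable([2.0])

snapshot = v[0]  # indexed outside the tape: just a constant tensor
with tf.GradientTape() as tape:
    loss = 3.0 * snapshot
print(tape.gradient(loss, v))  # None -- the chain to `v` is broken

with tf.GradientTape() as tape:
    loss = 3.0 * v[0]  # indexing recorded on the tape keeps the connection
print(tape.gradient(loss, v))  # tf.Tensor([3.], shape=(1,), dtype=float32)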

Example

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp


EPS = 1e-5


class NormalLikelihoodYours(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihoodYours, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight(
            "mean", shape=[1],
            initializer=tf.keras.initializers.RandomNormal(
                mean=0.0, stddev=1), dtype=tf.float32)
        self.sigma = self.add_weight(
            "std", shape=[1],
            initializer=tf.keras.initializers.RandomUniform(
                minval=EPS, maxval=5.0, seed=None),
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32)
        self.distribution = tfp.distributions.Normal(self.mu[0], self.sigma[0])

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1-1e-3)
        return r
    

class NormalLikelihoodMine(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihoodMine, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight(
            "mean", shape=[1],
            initializer=tf.keras.initializers.RandomNormal(
                mean=0.0, stddev=1), dtype=tf.float32)
        self.sigma = self.add_weight(
            "std", shape=[1],
            initializer=tf.keras.initializers.RandomUniform(
                minval=EPS, maxval=5.0, seed=None),
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32)
        self.distribution = tfp.distributions.Normal(self.mu, self.sigma)

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1-1e-3)
        return r

# loss function
def calc_loss(logits):
    return -tf.math.reduce_sum(tf.math.log(logits))

# model input
input_layer = tf.keras.layers.Input(shape=(100,))
x_in = tf.random.normal([4, 100])

# your model
your_output = NormalLikelihoodYours()(input_layer)
your_model = tf.keras.models.Model(input_layer, your_output)

# my model
my_output = NormalLikelihoodMine()(input_layer)
my_model = tf.keras.models.Model(input_layer, my_output)

# yours has no gradients because the network weights are not
# included anywhere in the loss calculation. When you index them
# with `[0]` they go from being trainable variables in the network,
# to just constants.
with tf.GradientTape() as tape:
    y_hat = your_model(x_in)
    loss = calc_loss(y_hat)
    
print(tape.gradient(loss, your_model.trainable_variables))
# [None, None]

# my model has gradients because `loss` and the weights in
# `trainable_variables` are connected
with tf.GradientTape() as tape:
    y_hat = my_model(x_in)
    loss = calc_loss(y_hat)
    
print(tape.gradient(loss, my_model.trainable_variables))
# [<tf.Tensor: shape=(1,), numpy=array([43.83749], dtype=float32)>,
#  <tf.Tensor: shape=(1,), numpy=array([-37.348656], dtype=float32)>]
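With that one change, the training pipeline from the question should run as well (a sketch reusing NormalLikelihoodMine and the same add_loss setup and shapes as the question):

input_layer = tf.keras.layers.Input(shape=(100,))
r = NormalLikelihoodMine()(input_layer)
nll = -tf.reduce_sum(tf.math.log(r))
model = tf.keras.models.Model(input_layer, nll)
model.add_loss(nll)
model.compile(optimizer='rmsprop', loss=None)
model.fit(np.random.randn(4, 100), y=None)  # no "No gradients provided" error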