tf2.4 mixed_precision 使用 float16 返回 0 梯度
tf2.4 mixed_precision with float16 return 0 gradient
这个问题之前发过here,为了大家多多关注,特在此重新打开
主要问题是:在正常的 float32 环境中测试时,tensorflow 返回的梯度是合理的;但在我使用 mixed_precision.set_global_policy('mixed_float16') 切换到 float16
后,返回的梯度始终为 0。
下面是一个可以重现错误的玩具代码。
系统信息
OS 平台和分发:linux
TensorFlow 版本(使用下面的命令):tf2.4.1
重现代码
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision
import numpy as np
from tqdm import tqdm
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
mixed_precision.set_global_policy('mixed_float16')
def forward_conv(x, filters, kernels, name='forward', padding='same'):
i = 0
for flt, kernel in zip(filters, kernels):
x = layers.Conv3D(flt, kernel, activation='relu', padding=padding, dilation_rate=(1, 1, 1),
use_bias=False, name=str(i) + '_' + name)(x)
x = layers.BatchNormalization(name=str(i) + '_bn_' + name)(x)
i += 1
return x
def part_one(ipt):
l1 = forward_conv(ipt, (4, 4), (3, 3), name='enc1')
d2 = layers.MaxPool3D(pool_size=(2, 2, 2))(l1)
l2 = forward_conv(d2, (4, 4), (3, 3), name='enc2')
return l1, l2
def part_inner(ipt1, ipt2):
l1 = forward_conv(ipt1, (4, 4), (3, 3), name='enc1')
l2 = forward_conv(ipt2, (4, 4), (3, 3), name='enc2')
return l1, l2
def part_two(ipt1, ipt2):
l2 = forward_conv(ipt2, (4, 4), (3, 3), name='dec2')
u1 = layers.UpSampling3D(size=(2, 2, 2))(l2)
r1 = forward_conv(ipt1 + u1, (4, 4), (3, 3), name='dec1')
return r1
initial = tf.ones([1, 256, 368, 368, 1], dtype=tf.float16)
tf.random.set_seed(1)
with tf.GradientTape() as g:
g.watch(initial)
l1_, l2_ = part_one(initial)
for _ in range(2):
l1_, l2_ = part_inner(l1_, l2_)
opt_ = part_two(l1_, l2_)
loss = tf.reduce_mean(l1_) + tf.reduce_mean(opt_)
gd = g.gradient(loss, initial)
print('-' * 100)
print(f'loss is {loss} and grad is {np.sum(gd)} with ckpt= {ckpt}')
行为描述
当使用 tf.float32 设置时,梯度的结果是合理的,值在 0.6 左右;但是当使用 mixed_precision 切换到 tf.float16 时,梯度恒定为 0。我们是否应该预期普通 float32 模式和 mixed_precision float16 模式之间计算出的梯度差异如此之大?谢谢!
在关于 mixed_precision
的 Tensorflow documentation 中,他们谈到使用损失缩放来处理这个问题。
由于文档经常随着 tensorflow 变得过时,这里是建议的代码:
loss_scale = 1024
loss = model(inputs)
loss *= loss_scale
# Assume `grads` are float32. You do not want to divide float16 gradients.
grads = compute_gradient(loss, model.trainable_variables)
grads /= loss_scale
这应该可以解决问题。
这个问题之前发过here,为了大家多多关注,特在此重新打开
主要问题是:在正常的 float32 环境中测试时,tensorflow 返回的梯度是合理的;但在我使用 mixed_precision.set_global_policy('mixed_float16') 切换到 float16
后,返回的梯度始终为 0。
下面是一个可以重现错误的玩具代码。
系统信息
OS 平台和分发:linux TensorFlow 版本(使用下面的命令):tf2.4.1
重现代码
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision
import numpy as np
from tqdm import tqdm
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
mixed_precision.set_global_policy('mixed_float16')
def forward_conv(x, filters, kernels, name='forward', padding='same'):
i = 0
for flt, kernel in zip(filters, kernels):
x = layers.Conv3D(flt, kernel, activation='relu', padding=padding, dilation_rate=(1, 1, 1),
use_bias=False, name=str(i) + '_' + name)(x)
x = layers.BatchNormalization(name=str(i) + '_bn_' + name)(x)
i += 1
return x
def part_one(ipt):
l1 = forward_conv(ipt, (4, 4), (3, 3), name='enc1')
d2 = layers.MaxPool3D(pool_size=(2, 2, 2))(l1)
l2 = forward_conv(d2, (4, 4), (3, 3), name='enc2')
return l1, l2
def part_inner(ipt1, ipt2):
l1 = forward_conv(ipt1, (4, 4), (3, 3), name='enc1')
l2 = forward_conv(ipt2, (4, 4), (3, 3), name='enc2')
return l1, l2
def part_two(ipt1, ipt2):
l2 = forward_conv(ipt2, (4, 4), (3, 3), name='dec2')
u1 = layers.UpSampling3D(size=(2, 2, 2))(l2)
r1 = forward_conv(ipt1 + u1, (4, 4), (3, 3), name='dec1')
return r1
initial = tf.ones([1, 256, 368, 368, 1], dtype=tf.float16)
tf.random.set_seed(1)
with tf.GradientTape() as g:
g.watch(initial)
l1_, l2_ = part_one(initial)
for _ in range(2):
l1_, l2_ = part_inner(l1_, l2_)
opt_ = part_two(l1_, l2_)
loss = tf.reduce_mean(l1_) + tf.reduce_mean(opt_)
gd = g.gradient(loss, initial)
print('-' * 100)
print(f'loss is {loss} and grad is {np.sum(gd)} with ckpt= {ckpt}')
行为描述
当使用 tf.float32 设置时,梯度的结果是合理的,值在 0.6 左右;但是当使用 mixed_precision 切换到 tf.float16 时,梯度恒定为 0。我们是否应该预期普通 float32 模式和 mixed_precision float16 模式之间计算出的梯度差异如此之大?谢谢!
在关于 mixed_precision
的 Tensorflow documentation 中,他们谈到使用损失缩放来处理这个问题。
由于文档经常随着 tensorflow 变得过时,这里是建议的代码:
loss_scale = 1024
loss = model(inputs)
loss *= loss_scale
# Assume `grads` are float32. You do not want to divide float16 gradients.
grads = compute_gradient(loss, model.trainable_variables)
grads /= loss_scale
这应该可以解决问题。