TensorFlow 2.0: display progress bar in custom training loop
I'm training a CNN for an audio classification task, using TensorFlow 2.0 RC with a custom training loop (as described in this guide on the official website). I would find it really handy to have a nice progress bar, similar to the usual Keras model.fit.
Here is an outline of my training code (I'm using 4 GPUs with a mirrored distribution strategy):
strategy = tf.distribute.MirroredStrategy()

distr_train_dataset = strategy.experimental_distribute_dataset(train_dataset)
if valid_dataset:
    distr_valid_dataset = strategy.experimental_distribute_dataset(valid_dataset)

with strategy.scope():
    model = build_model()  # build the model
    optimizer = ...        # define optimizer
    train_loss = ...       # define training loss
    train_mean_loss = ...  # mean training loss
    train_metrics_1 = ...  # AUC-ROC
    train_metrics_2 = ...  # AUC-PR
    valid_mean_loss = ...  # mean validation loss
    valid_metrics_1 = ...  # AUC-ROC for validation
    valid_metrics_2 = ...  # AUC-PR for validation

    # rescale loss
    def compute_loss(labels, predictions):
        per_example_loss = train_loss(labels, predictions)
        return per_example_loss / config.batch_size

    def train_step(batch):
        audio_batch, label_batch = batch
        with tf.GradientTape() as tape:
            logits = model(audio_batch)
            loss = compute_loss(label_batch, logits)
        variables = model.trainable_variables
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(grads, variables))

        train_metrics_1.update_state(label_batch, logits)
        train_metrics_2.update_state(label_batch, logits)
        train_mean_loss.update_state(loss)
        return loss

    def valid_step(batch):
        audio_batch, label_batch = batch
        logits = model(audio_batch, training=False)
        loss = compute_loss(label_batch, logits)

        valid_metrics_1.update_state(label_batch, logits)
        valid_metrics_2.update_state(label_batch, logits)
        valid_mean_loss.update_state(loss)
        return loss

    @tf.function
    def distributed_train(dataset):
        num_batches = 0
        for batch in dataset:
            num_batches += 1
            strategy.experimental_run_v2(train_step, args=(batch,))
            # print progress here
            tf.print('Step', num_batches, '; Loss', train_mean_loss.result(),
                     '; ROC_AUC', train_metrics_1.result(),
                     '; PR_AUC', train_metrics_2.result())
            gc.collect()

    @tf.function
    def distributed_valid(dataset):
        for batch in dataset:
            strategy.experimental_run_v2(valid_step, args=(batch,))
            gc.collect()

for epoch in range(epochs):
    distributed_train(distr_train_dataset)
    gc.collect()
    train_metrics_1.reset_states()
    train_metrics_2.reset_states()
    train_mean_loss.reset_states()

    if valid_dataset:
        distributed_valid(distr_valid_dataset)
        gc.collect()
        valid_metrics_1.reset_states()
        valid_metrics_2.reset_states()
        valid_mean_loss.reset_states()
Here train_dataset and valid_dataset are two tf.data.TFRecordDatasets generated with the usual tf.data input pipeline.
TensorFlow provides a very nice tf.keras.utils.Progbar (it is indeed what you see when training with model.fit). I've looked at its source code, and it relies on numpy, so I can't use it in place of my tf.print() statement (which executes in graph mode).
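To illustrate the issue, a minimal sketch (the names here are mine, not from my actual code): a Progbar call inside a tf.function is an ordinary Python side effect, so it runs only once, while the function is being traced, not on every graph execution, unlike tf.print:

import tensorflow as tf

progbar = tf.keras.utils.Progbar(10)

@tf.function
def train_step_stub(i):
    # Ordinary Python calls like this execute only at trace time,
    # not on each graph execution -- unlike tf.print.
    progbar.add(1)
    return i + 1

for i in range(10):
    train_step_stub(tf.constant(i))  # the bar advances only once, during tracing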
How can I implement a similar progress bar in my custom training loop (with my training function running in graph mode)?
First of all, how does model.fit display a progress bar?
How can I implement a similar progress bar in my custom training loop (with my training function running in graph mode)?
Why not change the structure of the code slightly, so that the individual strategy.experimental_run_v2 calls are wrapped in tf.function-decorated functions that return the metrics you want to display, then run those in an undecorated for loop and use tf.keras.utils.Progbar?
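For instance, a minimal sketch of that restructuring, reusing the names from your question (strategy, train_step, the metric objects) and assuming the number of steps per epoch is known:

import tensorflow as tf

@tf.function
def distributed_train_step(batch):
    # Only the single distributed step runs in graph mode;
    # the Python loop below stays eager so the Progbar can run.
    strategy.experimental_run_v2(train_step, args=(batch,))

steps_per_epoch = 100  # assumed known, e.g. from the dataset cardinality

for epoch in range(epochs):
    print('\nEpoch {}/{}'.format(epoch + 1, epochs))
    progbar = tf.keras.utils.Progbar(
        steps_per_epoch, stateful_metrics=['loss', 'roc_auc', 'pr_auc'])
    for batch in distr_train_dataset:
        distributed_train_step(batch)
        # .result() is evaluated eagerly here, so .numpy() is available
        progbar.add(1, values=[
            ('loss', train_mean_loss.result().numpy()),
            ('roc_auc', train_metrics_1.result().numpy()),
            ('pr_auc', train_metrics_2.result().numpy())])
    train_metrics_1.reset_states()
    train_metrics_2.reset_states()
    train_mean_loss.reset_states()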
How does model.fit display a progress bar in the first place?
In v2, model.fit displays the progress bar by using a TrainingContext object, which wraps a Progbar together with any other specified callbacks, with methods like on_epoch_end, on_batch_begin, etc. that handle the logs. Honestly, I'm not quite sure how you would implement a similar mechanism in a custom training loop, but it's probably worth studying the default one; its source is here.
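As a rough illustration of the idea only (hypothetical names, not the actual Keras classes): the mechanism boils down to hiding a Progbar behind callback-style hooks, which the training loop fires while passing along the logs:

import tensorflow as tf

class ProgbarCallback:
    """Hypothetical stand-in for what Keras does internally:
    a Progbar hidden behind callback-style hooks."""

    def __init__(self, num_samples, metrics_names):
        self.num_samples = num_samples
        self.metrics_names = metrics_names

    def on_epoch_begin(self, epoch, epochs):
        # Start a fresh bar for each epoch
        print('\nEpoch {}/{}'.format(epoch + 1, epochs))
        self.progbar = tf.keras.utils.Progbar(
            self.num_samples, stateful_metrics=self.metrics_names)

    def on_batch_end(self, seen_samples, logs):
        # The training loop only reports progress and logs;
        # the callback decides how to render them.
        self.progbar.update(seen_samples, values=list(logs.items()))

# Usage: the loop just fires the hooks.
# cb = ProgbarCallback(num_samples, ['loss'])
# cb.on_epoch_begin(0, epochs)
# cb.on_batch_end(seen, {'loss': 0.3})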
A progress bar for a custom training loop can be generated with the following code (metrics listed in stateful_metrics are displayed as-is rather than averaged over the epoch):
from tensorflow.keras.utils import Progbar
import time
import numpy as np

metrics_names = ['acc', 'pr']

num_epochs = 5
num_training_samples = 100
batch_size = 10

for i in range(num_epochs):
    print("\nepoch {}/{}".format(i + 1, num_epochs))
    pb_i = Progbar(num_training_samples, stateful_metrics=metrics_names)
    for j in range(num_training_samples // batch_size):
        time.sleep(0.3)  # stand-in for the actual training step
        values = [('acc', np.random.random(1)), ('pr', np.random.random(1))]
        pb_i.add(batch_size, values=values)
Output:
epoch 1/5
100/100 [==============================] - 3s 30ms/step - acc: 0.2169 - pr: 0.9011
epoch 2/5
100/100 [==============================] - 3s 30ms/step - acc: 0.7815 - pr: 0.4900
epoch 3/5
100/100 [==============================] - 3s 30ms/step - acc: 0.8003 - pr: 0.9292
epoch 4/5
100/100 [==============================] - 3s 30ms/step - acc: 0.8280 - pr: 0.9113
epoch 5/5
100/100 [==============================] - 3s 30ms/step - acc: 0.8497 - pr: 0.1929
@Shubham Malaviya's answer is spot on.
I just want to extend it further for the case where you are iterating over a tf.data.Dataset. This code is also based on this answer.
import tensorflow as tf
import numpy as np
import time

# From https://www.tensorflow.org/guide/data#reading_input_data
(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.fashion_mnist.load_data()

images_train = images_train / 255
images_test = images_test / 255

dataset_train = tf.data.Dataset.from_tensor_slices((images_train, labels_train))
dataset_test = tf.data.Dataset.from_tensor_slices((images_test, labels_test))

# From @Shubham Malaviya
metrics_names = ['train_loss', 'val_loss']

num_epochs = 2
num_training_samples = images_train.shape[0]
batch_size = 10

# Loop on each epoch
for epoch in range(num_epochs):
    print("\nepoch {}/{}".format(epoch + 1, num_epochs))
    progBar = tf.keras.utils.Progbar(num_training_samples, stateful_metrics=metrics_names)

    # Loop on each batch of the train dataset
    for idX, (batch_x, batch_y) in enumerate(dataset_train.batch(batch_size)):
        # Train the model
        train_loss = np.random.random(1)
        values = [('train_loss', train_loss)]
        progBar.update(idX * batch_size, values=values)

    # Loop on each batch of the test dataset for validation
    for batch_x, batch_y in dataset_test.batch(batch_size):
        # Forward the images through the network
        # -----
        # Calc the loss
        val_loss = np.random.random(1)

    # Update progBar with val_loss and finalize the bar
    values = [('train_loss', train_loss), ('val_loss', val_loss)]
    progBar.update(num_training_samples, values=values, finalize=True)
Output:
epoch 1/2
60000/60000 [==============================] - 1s 22us/step - train_loss: 0.7019 - val_loss: 0.0658
epoch 2/2
60000/60000 [==============================] - 1s 21us/step - train_loss: 0.5561 - val_loss: 0.0324