TF2 code 10 times slower than equivalent PyTorch code for a Conv1D network

I've been trying to convert some PyTorch code to TensorFlow 2, but the TF2 code is around 10 times slower. I tried to track down where this comes from, and as far as I can tell it stems from the tape.gradient call (performance is the same with keras' .fit function). I've tried different data loaders, different ways of declaring the model, different installs... the result is consistent.

Any explanation or solution as to why this happens would be greatly appreciated.

Here is a minimal version of the TF2 code:

import time

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np


# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18, 1)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)

# Create a small model
model = tf.keras.Sequential([
    layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
    layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(256, kernel_size=1, strides=1, padding="same", activation="relu"),
    layers.GlobalAveragePooling2D(),
    layers.Flatten(),
    layers.Dense(128, use_bias=True, activation="relu"),
    layers.Dense(32, use_bias=True, activation="relu"),
    layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)


@tf.function
def train_step(data_batch, labels_batch):
    with tf.GradientTape() as tape:
        y_pred = model(data_batch)
        loss = tf.keras.losses.MSE(labels_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))


step_times = []
for epoch in range(20):
    for data_batch, labels_batch in train_dataset:
        step_start_time = time.perf_counter()
        train_step(data_batch, labels_batch)
        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")

And the PyTorch equivalent:

import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.backends.cudnn.benchmark = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 18, 120)


# Create a small model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(18, 64, kernel_size=7, stride=3, padding=3)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv1d(64, 128, kernel_size=5, stride=2, padding=2)
        self.conv4 = nn.Conv1d(128, 128, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv1d(128, 128, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv1d(128, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, inputs):
        x = F.relu(self.conv1(inputs))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = x.mean(2)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
loss_fn = torch.nn.MSELoss()

batch_size = 256
train_steps_per_epoch = train_data.shape[0] // batch_size
step_times = []
for epoch in range(20):
    for step in range(train_steps_per_epoch):
        batch_start, batch_end = step * batch_size, (step+1) * batch_size
        data_batch = torch.FloatTensor(train_data[batch_start:batch_end]).to(device)
        labels_batch = torch.FloatTensor(train_labels[batch_start:batch_end]).to(device)

        step_start_time = time.perf_counter()
        optimizer.zero_grad()
        y_pred = model(data_batch)
        loss = loss_fn(labels_batch, torch.squeeze(y_pred))
        loss.backward()
        optimizer.step()

        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")

You're using tf.GradientTape correctly, but both the model and the data are different in the snippets you provided.
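
One concrete difference worth seeing: on recent TF/Keras versions, Conv1D accepts 3+D input and treats the extra leading axes as batch dimensions, so your (batch, 120, 18, 1) data makes every convolution run over 18 steps with a single input channel, carrying the 120 axis along as an extra batch dimension. A minimal shape check (a sketch, not part of either benchmark):

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Keras Conv1D convolves over the last two axes (steps, channels) and treats
# anything before them as batch dimensions.
x = tf.constant(np.random.rand(2, 120, 18, 1), dtype=tf.float32)
y = layers.Conv1D(64, kernel_size=7, strides=3, padding="same")(x)
print(y.shape)  # (2, 120, 6, 64): the 120 axis is just carried along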

Here is the TF code using the same data and model architecture as your PyTorch model.

import time

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np


# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)

model = tf.keras.Sequential([
    layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
    layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(256, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, use_bias=True, activation="relu"),
    layers.Dense(32, use_bias=True, activation="relu"),
    layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)


@tf.function
def train_step(data_batch, labels_batch, model):
    with tf.GradientTape() as tape:
        y_pred = model(data_batch, training=True)
        loss = tf.keras.losses.MSE(labels_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))


step_times = []
for epoch in range(20):
    for data_batch, labels_batch in train_dataset:
        step_start_time = time.perf_counter()
        train_step(data_batch, labels_batch, model)
        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")

With that, TF is actually about 3 times faster than PyTorch: 0.035s vs. 0.112s per training step.
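
One caveat on the timing methodology (it applies to both snippets): GPU work is dispatched asynchronously, so reading time.perf_counter around the Python calls can misattribute kernel time between steps. On the PyTorch side, a more defensive variant synchronizes before each clock read; a minimal sketch, where timed_step is a hypothetical helper:

import time

import torch

def timed_step(step_fn):
    # Hypothetical helper: force queued CUDA work to finish before each
    # clock read so the interval covers exactly one training step.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    step_fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.perf_counter() - start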