如何使用带有 3D 张量输入的 keras 嵌入层?

How to use keras embedding layer with 3D tensor input?

我在把经过独热(one-hot)编码的输入数据用于 Keras 嵌入层时遇到了困难。

玩具代码如下

导入包

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau

输入数据是基于文本的,如下所示。

训练和测试数据

# Text inputs used to fit the model (one string per sample).
X_train_orignal = np.array([
    'OC(=O)C1=C(Cl)C=CC=C1Cl',
    'OC(=O)C1=C(Cl)C=C(Cl)C=C1Cl',
    'OC(=O)C1=CC=CC(=C1Cl)Cl',
    'OC(=O)C1=CC(=CC=C1Cl)Cl',
    'OC1=C(C=C(C=C1)[N+]([O-])=O)[N+]([O-])=O',
])

# Text inputs held out for validation.
X_test_orignal = np.array([
    'OC(=O)C1=CC=C(Cl)C=C1Cl',
    'CCOC(N)=O',
    'OC1=C(Cl)C(=C(Cl)C=C1Cl)Cl',
])

# Regression targets as column vectors: one row per sample, shape (n, 1).
Y_train = np.array([2.33, 2.59, 2.59, 2.54, 4.06]).reshape(-1, 1)

Y_test = np.array([2.20, 2.81, 2.00]).reshape(-1, 1)

创建字典

现在我创建了两个字典:一个从字符映射到索引,另一个从索引映射回字符。唯一字符的数量存储在 len(charset) 中,字符串的最大长度加上 5 个附加位置存储在 embed 中。每个字符串的开头用 ! 标记,结尾用 E 填充。

# Vocabulary: every character that occurs in the training strings plus the
# two marker characters "!" (start) and "E" (end / padding).
charset = set("".join(X_train_orignal) + "!E")
# Number the characters in sorted order so each character gets the same index
# on every run. Enumerating the raw set follows its iteration order, which can
# differ between interpreter sessions (string hash randomization) and would
# make a mapping rebuilt later disagree with previously saved weights.
char_to_int = {c: i for i, c in enumerate(sorted(charset))}
int_to_char = dict(enumerate(sorted(charset)))
# Length of the longest training string plus 5 extra positions.
embed = max(len(smile) for smile in X_train_orignal) + 5
print(str(charset))
print(len(charset), embed)

独热编码(one-hot encoding)

我将所有训练数据转换为独热编码,如下所示。

def vectorize(smiles):
    """One-hot encode an array of strings using the module-level vocabulary.

    Each string is laid out as: start marker "!", its own characters, then
    the end marker "E" repeated up to the padded length. The array is
    allocated with ``embed`` positions and the last position is dropped
    before returning.

    Args:
        smiles: NumPy array of strings (its ``shape[0]`` is the sample count).

    Returns:
        ``np.int8`` array of shape ``(len(smiles), embed - 1, len(charset))``.

    Raises:
        KeyError: if a character is missing from ``char_to_int``.
        IndexError: if a string does not fit into ``embed`` positions.
    """
    n_samples = smiles.shape[0]
    encoded = np.zeros((n_samples, embed, len(charset)), dtype=np.int8)
    for row, smile in enumerate(smiles):
        # Position 0 always holds the start marker.
        encoded[row, 0, char_to_int["!"]] = 1
        # The string's own characters occupy positions 1..len(smile).
        for pos, symbol in enumerate(smile, start=1):
            encoded[row, pos, char_to_int[symbol]] = 1
        # Everything after the string is filled with the end marker.
        first_pad = len(smile) + 1
        encoded[row, first_pad:, char_to_int["E"]] = 1

    # Drop the final position of every sample.
    return encoded[:, :-1, :]

# One-hot encode the training strings and show the resulting array shape.
X_train = vectorize(X_train_orignal)
print(X_train.shape)
# Encode the test strings with the same vectorize() call and show the shape.
X_test = vectorize(X_test_orignal)
print(X_test.shape)

将输入数据转换为独热编码后,训练集的形状变为 (5, 44, 14),测试集的形状变为 (3, 44, 14)。对于训练集:5 是样本数,44 是填充后的序列长度,14 是唯一字符的数量。字符数较少的样本会用 E 填充到最大长度。

验证填充是否正确:以下代码用于检查我们的填充是否正确。

# Decode every one-hot sample back into text to check the padding: argmax over
# the character axis yields one index per position, and int_to_char maps it
# back to the character. Iterating over the arrays (instead of the hard-coded
# range(5) / range(3)) keeps this correct for any number of samples.
mol_str_train = [
    "".join(int_to_char[idx] for idx in np.argmax(sample, axis=1))
    for sample in X_train
]

mol_str_test = [
    "".join(int_to_char[idx] for idx in np.argmax(sample, axis=1))
    for sample in X_test
]

让我们看看训练集解码后的样子。

mol_str_train

['!OC(=O)C1=C(Cl)C=CC=C1ClEEEEEEEEEEEEEEEEEEEE',
 '!OC(=O)C1=C(Cl)C=C(Cl)C=C1ClEEEEEEEEEEEEEEEE',
 '!OC(=O)C1=CC=CC(=C1Cl)ClEEEEEEEEEEEEEEEEEEEE',
 '!OC(=O)C1=CC(=CC=C1Cl)ClEEEEEEEEEEEEEEEEEEEE',
 '!OC1=C(C=C(C=C1)[N+]([O-])=O)[N+]([O-])=OEEE']

现在是建立模型的时候了。

模型

# Embedding (len(charset) indices -> 10-dimensional vectors, sequences of
# length `embed`), flattened and fed to a single linear output unit.
model = Sequential([
    Embedding(len(charset), 10, input_length=embed),
    Flatten(),
    Dense(1, activation='linear'),
])

def coeff_determination(y_true, y_pred):
    """Return the coefficient of determination (R^2) as a backend tensor.

    Computed as ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the
    sum of squared residuals and ``SS_tot`` the sum of squared deviations of
    ``y_true`` from its mean. The backend epsilon guards the division.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.
    """
    from keras import backend as K

    residual_sum = K.sum(K.square(y_true - y_pred))
    deviation = y_true - K.mean(y_true)
    total_sum = K.sum(K.square(deviation))
    r_squared = 1 - residual_sum / (total_sum + K.epsilon())
    return r_squared

def get_lr_metric(optimizer):
    """Build a metric function that reports the optimizer's learning rate.

    Args:
        optimizer: object exposing an ``lr`` attribute.

    Returns:
        A function ``lr(y_true, y_pred)`` that ignores both arguments and
        returns ``optimizer.lr`` as read at call time.
    """
    def lr(y_true, y_pred):
        # Read the attribute on every call so later changes are reflected.
        current_rate = optimizer.lr
        return current_rate

    return lr


# The same optimizer instance is compiled into the model and handed to
# get_lr_metric, so its learning rate is reported alongside the R^2 metric.
optimizer = Adam(lr=0.00025)
lr_metric = get_lr_metric(optimizer)
model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=[coeff_determination, lr_metric],
)

# Learning-rate reduction on a val_loss plateau (factor 0.5, patience 5).
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-15,
    verbose=1, mode='auto', cooldown=0,
)
# Checkpoint written to weights.best.hdf5, keeping only the best val_loss.
best_weights = ModelCheckpoint(
    filepath="weights.best.hdf5", monitor='val_loss',
    save_best_only=True, verbose=1, mode='auto',
)
callbacks_list = [lr_on_plateau, best_weights]

# X_train / X_test are still the 3-D one-hot arrays at this point; this is
# the call that produces the ValueError reported in the question.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=1,
    epochs=10,
    validation_data=(X_test, Y_test),
    callbacks=callbacks_list,
)

错误

ValueError: Error when checking input: expected embedding_3_input to have 2 dimensions, but got array with shape (5, 44, 14)

嵌入层需要二维数组。我该如何处理这个问题,才能让它接受独热向量编码的数据?

以上代码都可以运行。

Keras 嵌入层使用索引,而不是直接使用单热编码。 所以你不需要 (5,44,14),只要 (5,44) 就可以了。

例如使用 argmax 获取索引:

# Collapse the one-hot axis (axis 2) into integer character indices, giving
# 2-D arrays of shape (samples, positions).
# NOTE: after this reassignment the arrays are 2-D, so argmax over axis 2 must
# not be applied to them a second time.
X_test = X_test.argmax(axis=2)
X_train = X_train.argmax(axis=2)

尽管最好不要先对其进行单热编码 =)

除此之外,您的 'embed' 变量表示大小为 45,而您的数据大小为 44。

如果你改变这些,你的模型运行良好:

model = Sequential()
# Take the sequence length from the data itself rather than hard-coding 44:
# X_train.shape[1] is the number of positions per sample, whether X_train is
# still the 3-D one-hot array or already the 2-D index array.
model.add(Embedding(len(charset), 10, input_length=X_train.shape[1]))
model.add(Flatten())
# Single linear unit: one regression output per sample.
model.add(Dense(1, activation='linear'))

def coeff_determination(y_true, y_pred):
    """Return the coefficient of determination (R^2) as a backend tensor.

    Evaluates ``1 - SS_res / (SS_tot + epsilon)``: ``SS_res`` sums the squared
    differences between targets and predictions, ``SS_tot`` sums the squared
    differences between targets and their mean, and the backend epsilon keeps
    the denominator away from zero.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.
    """
    from keras import backend as K

    squared_residuals = K.square(y_true - y_pred)
    squared_deviations = K.square(y_true - K.mean(y_true))
    ss_res = K.sum(squared_residuals)
    ss_tot = K.sum(squared_deviations)
    return 1 - ss_res / (ss_tot + K.epsilon())

def get_lr_metric(optimizer):
    """Return a metric callable exposing ``optimizer.lr``.

    Args:
        optimizer: object with an ``lr`` attribute.

    Returns:
        Function named ``lr`` taking ``(y_true, y_pred)``; both arguments are
        ignored and the optimizer's current ``lr`` attribute is returned.
    """
    def lr(y_true, y_pred):
        # Looked up at call time, not captured when the metric is created.
        rate = optimizer.lr
        return rate

    return lr


# Compile with MSE loss; R^2 and the optimizer's learning rate are tracked
# as metrics.
optimizer = Adam(lr=0.00025)
lr_metric = get_lr_metric(optimizer)
model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=[coeff_determination, lr_metric],
)

# Both callbacks watch val_loss: one lowers the learning rate on a plateau,
# the other saves the best weights to weights.best.hdf5.
plateau_callback = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-15,
    verbose=1, mode='auto', cooldown=0,
)
checkpoint_callback = ModelCheckpoint(
    filepath="weights.best.hdf5", monitor='val_loss',
    save_best_only=True, verbose=1, mode='auto',
)
callbacks_list = [plateau_callback, checkpoint_callback]

# The one-hot arrays are converted to integer indices inline (argmax over the
# character axis), so X_train / X_test must still be 3-D when this runs.
history = model.fit(
    x=np.argmax(X_train, axis=2),
    y=Y_train,
    batch_size=1,
    epochs=10,
    validation_data=(np.argmax(X_test, axis=2), Y_test),
    callbacks=callbacks_list,
)

输入形状没有在嵌入层中正确定义。以下代码对我有效:它省去了把数据转换为 2D 的步骤,您可以直接将 3D 输入传递给嵌入层。

# Setup that was missing from the question's code.
# Flatten the targets to a 1-D vector; -1 lets NumPy infer the length instead
# of hard-coding the 5 training samples.
# NOTE(review): Y_test keeps its (n, 1) shape and is still passed as
# validation data, so this reshape may not be required - confirm.
Y_train = Y_train.reshape(-1)
# Size of the one-hot axis (number of distinct characters).
max_len = len(charset)
# Number of positions per encoded string (vectorize() drops the last one).
max_features = embed - 1
# Per-sample shape of the one-hot data, passed to the Embedding layer through
# input_shape so that it accepts the 3-D array.
inputshape = (max_features, max_len)

model = Sequential()
# Declare the per-sample input shape explicitly so the 3-D one-hot array is
# accepted by the Embedding layer.
# NOTE(review): an Embedding layer treats its inputs as integer indices, and
# the one-hot array only contains 0 and 1, so presumably only two embedding
# rows are ever looked up here - confirm this is the intended behaviour.
model.add(Embedding(max_features, 10, input_shape=inputshape))
model.add(Flatten())
model.add(Dense(1, activation='linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

# Fresh optimizer for the rebuilt model; its learning rate is exposed as a
# metric next to R^2.
optimizer = Adam(lr=0.00025)
lr_metric = get_lr_metric(optimizer)
model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=[coeff_determination, lr_metric],
)

# val_loss drives both the learning-rate reduction and the checkpointing.
reduce_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-15,
    verbose=1, mode='auto', cooldown=0,
)
save_best = ModelCheckpoint(
    filepath="weights.best.hdf5", monitor='val_loss',
    save_best_only=True, verbose=1, mode='auto',
)
callbacks_list = [reduce_on_plateau, save_best]

# The 3-D one-hot arrays are passed to fit() unchanged.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, Y_test),
    callbacks=callbacks_list,
)