连体网络输出
Siamese network output
我正在尝试在 caffe 中实现一个孪生网络,其中它由两个不共享权重的图像网络组成。所以我基本上想做的是给每个网络一个图像,最后尝试找出它们之间的相似性距离,下面是我的 prototxt。所以我的主要问题是我也应该设置 "num_output" 什么?我的训练只有 2 类,0 表示它们不相似,1 表示它们相似。
name: "Siamese_ImageNet"
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "train1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "test1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "train2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "test2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1"
type: RELU
bottom: "conv1"
top: "conv1"
}
layers {
name: "pool1"
type: POOLING
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1"
type: LRN
bottom: "pool1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2"
type: CONVOLUTION
bottom: "norm1"
top: "conv2"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2"
type: RELU
bottom: "conv2"
top: "conv2"
}
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2"
type: LRN
bottom: "pool2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3"
type: CONVOLUTION
bottom: "norm2"
top: "conv3"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3"
type: RELU
bottom: "conv3"
top: "conv3"
}
layers {
name: "conv4"
type: CONVOLUTION
bottom: "conv3"
top: "conv4"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4"
type: RELU
bottom: "conv4"
top: "conv4"
}
layers {
name: "conv5"
type: CONVOLUTION
bottom: "conv4"
top: "conv5"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5"
type: RELU
bottom: "conv5"
top: "conv5"
}
layers {
name: "pool5"
type: POOLING
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "pool5"
top: "fc6"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6"
type: RELU
bottom: "fc6"
top: "fc6"
}
layers {
name: "drop6"
type: DROPOUT
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7"
type: INNER_PRODUCT
bottom: "fc6"
top: "fc7"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7"
type: RELU
bottom: "fc7"
top: "fc7"
}
layers {
name: "drop7"
type: DROPOUT
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "conv1_p"
type: CONVOLUTION
bottom: "data_p"
top: "conv1_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1_p"
type: RELU
bottom: "conv1_p"
top: "conv1_p"
}
layers {
name: "pool1_p"
type: POOLING
bottom: "conv1_p"
top: "pool1_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1_p"
type: LRN
bottom: "pool1_p"
top: "norm1_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2_p"
type: CONVOLUTION
bottom: "norm1_p"
top: "conv2_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2_p"
type: RELU
bottom: "conv2_p"
top: "conv2_p"
}
layers {
name: "pool2_p"
type: POOLING
bottom: "conv2_p"
top: "pool2_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2_p"
type: LRN
bottom: "pool2_p"
top: "norm2_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3_p"
type: CONVOLUTION
bottom: "norm2_p"
top: "conv3_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3_p"
type: RELU
bottom: "conv3_p"
top: "conv3_p"
}
layers {
name: "conv4_p"
type: CONVOLUTION
bottom: "conv3_p"
top: "conv4_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4_p"
type: RELU
bottom: "conv4_p"
top: "conv4_p"
}
layers {
name: "conv5_p"
type: CONVOLUTION
bottom: "conv4_p"
top: "conv5_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5_p"
type: RELU
bottom: "conv5_p"
top: "conv5_p"
}
layers {
name: "pool5_p"
type: POOLING
bottom: "conv5_p"
top: "pool5_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6_p"
type: INNER_PRODUCT
bottom: "pool5_p"
top: "fc6_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6_p"
type: RELU
bottom: "fc6_p"
top: "fc6_p"
}
layers {
name: "drop6_p"
type: DROPOUT
bottom: "fc6_p"
top: "fc6_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7_p"
type: INNER_PRODUCT
bottom: "fc6_p"
top: "fc7_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7_p"
type: RELU
bottom: "fc7_p"
top: "fc7_p"
}
layers {
name: "drop7_p"
type: DROPOUT
bottom: "fc7_p"
top: "fc7_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "loss"
type: CONTRASTIVE_LOSS
contrastive_loss_param {
margin: 1.0
}
bottom: "fc7"
bottom: "fc7_p"
bottom: "label"
top: "loss"
}
我的训练文件结构:
0 不相似,1 相似
train1.txt:
/aer/img1_1.jpg 0
/aer/img1_2.jpg 1
/aer/img1_3.jpg 1
train2.txt:
/tpd/img2_1.jpg 0
/tpd/img2_2.jpg 1
/tpd/img2_3.jpg 1
What should I set my "num_output"?
在了解您应该设置多少 num_output
之前,让我们解释一下它的含义。事实上,你可以将 Simense 网络的两侧,data -> fc7
,data_p -> fc7_p
视为 2 个特征提取器。每个都从相应数据层的图像中提取特征,例如 fc7
和 fc7_p
。所以num_output
定义了提取特征向量的维度。
在训练过程中,ContrastiveLoss
层总是尝试在两个提取的特征向量代表的图像相似时(label == 1
)最小化距离,在不相似(label == 0
).即特征向量的距离越小,图像越相似。
那么特征向量的最佳维度是多少才能最好地包含指示相似性的信息?或者你应该设置什么num_output
?可能没有一个确切的值,这取决于特征提取器的编码质量(您可以将特征视为图像的代码)以及识别图像相似性的难易程度。所以基本上如果网络(特征提取器)很深并且不难识别相似性,你可以选择相对较小的 num_output
例如 200,因为特征可能会被更大的网络编码得更好并且更多歧视性的。如果不是,您可以尝试更大的值,例如500、1000 或尝试更复杂的网络。
如果你想尝试 MultinomialLogisticLoss
而不是 ContrastiveLoss
层,你应该首先使用像CONCAT
然后将其送入 SOFTMAX_LOSS
层,如下所示:
...#original layers
layers {
name: "concat"
type: CONCAT
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat" # concatenate fc7 and fc7_p along channel axis
}
layer {
name: "fc_cls"
type: INNER_PRODUCT
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2 # a binary classification problem in this case
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: ACCURACY
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: SOFTMAX_LOSS
bottom: "fc_cls"
bottom: "label"
top: "loss"
}
更新
Which is the best method to implement in order to compare similarity and use it for deploy, Constrastive Loss or SoftMax Loss?
Softmax Loss 简单易部署。但它只能给你二进制预测,即相似或不相似。它给出的 2 class(相似,不相似)的概率分布通常太硬(不均匀),例如[0.9*, 0.0*]
, [0.0*, 0.9*]
,....很多情况下并不能很好的反映真实的输入相似度
在使用 Constrastive Loss 时,您可以获得图像的判别特征向量。您可以使用该向量来计算相似性概率,就像 CVPR 2005 论文 Learning a Similarity Metric Discriminatively, with Application to Face Verification did in Section 4.1.(The key point is to compute a multivariate normal density using the feature vectors generated from the images belonging to a same subject). Also you can use a threshold to control the false positive rate and the false negative rate of the model to get a ROC curve 更好地评估模型一样。
对了,挖掘更多CNN架构来预测相似度,可以参考CVPR 2015论文Learning to Compare Image Patches via Convolutional Neural Networks.
只是为了更正 Dale 上面关于 Caffe 超级敏感语法的伟大 ,对于像我这样被卡住的菜鸟,这里有一些更正(层层到层,一些引号,加上注释的删除,以及有效的大写)
layer {
name: "concat"
type: "Concat"
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat"
}
layer {
name: "fc_cls"
type: "InnerProduct"
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc_cls"
bottom: "label"
top: "loss"
}
我相信num_output
定义了提取的特征向量的维度,然后提取的特征可以用来确定L2
距离。如果 L2
距离大于 1 那么它是一个不同的 class 如果它接近 0图像相似。休息戴尔答案是完美的。
我正在尝试在 caffe 中实现一个孪生网络,其中它由两个不共享权重的图像网络组成。所以我基本上想做的是给每个网络一个图像,最后尝试找出它们之间的相似性距离,下面是我的 prototxt。所以我的主要问题是我也应该设置 "num_output" 什么?我的训练只有 2 类,0 表示它们不相似,1 表示它们相似。
name: "Siamese_ImageNet"
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "train1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "test1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "train2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "test2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1"
type: RELU
bottom: "conv1"
top: "conv1"
}
layers {
name: "pool1"
type: POOLING
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1"
type: LRN
bottom: "pool1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2"
type: CONVOLUTION
bottom: "norm1"
top: "conv2"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2"
type: RELU
bottom: "conv2"
top: "conv2"
}
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2"
type: LRN
bottom: "pool2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3"
type: CONVOLUTION
bottom: "norm2"
top: "conv3"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3"
type: RELU
bottom: "conv3"
top: "conv3"
}
layers {
name: "conv4"
type: CONVOLUTION
bottom: "conv3"
top: "conv4"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4"
type: RELU
bottom: "conv4"
top: "conv4"
}
layers {
name: "conv5"
type: CONVOLUTION
bottom: "conv4"
top: "conv5"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5"
type: RELU
bottom: "conv5"
top: "conv5"
}
layers {
name: "pool5"
type: POOLING
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "pool5"
top: "fc6"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6"
type: RELU
bottom: "fc6"
top: "fc6"
}
layers {
name: "drop6"
type: DROPOUT
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7"
type: INNER_PRODUCT
bottom: "fc6"
top: "fc7"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7"
type: RELU
bottom: "fc7"
top: "fc7"
}
layers {
name: "drop7"
type: DROPOUT
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "conv1_p"
type: CONVOLUTION
bottom: "data_p"
top: "conv1_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1_p"
type: RELU
bottom: "conv1_p"
top: "conv1_p"
}
layers {
name: "pool1_p"
type: POOLING
bottom: "conv1_p"
top: "pool1_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1_p"
type: LRN
bottom: "pool1_p"
top: "norm1_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2_p"
type: CONVOLUTION
bottom: "norm1_p"
top: "conv2_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2_p"
type: RELU
bottom: "conv2_p"
top: "conv2_p"
}
layers {
name: "pool2_p"
type: POOLING
bottom: "conv2_p"
top: "pool2_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2_p"
type: LRN
bottom: "pool2_p"
top: "norm2_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3_p"
type: CONVOLUTION
bottom: "norm2_p"
top: "conv3_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3_p"
type: RELU
bottom: "conv3_p"
top: "conv3_p"
}
layers {
name: "conv4_p"
type: CONVOLUTION
bottom: "conv3_p"
top: "conv4_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4_p"
type: RELU
bottom: "conv4_p"
top: "conv4_p"
}
layers {
name: "conv5_p"
type: CONVOLUTION
bottom: "conv4_p"
top: "conv5_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5_p"
type: RELU
bottom: "conv5_p"
top: "conv5_p"
}
layers {
name: "pool5_p"
type: POOLING
bottom: "conv5_p"
top: "pool5_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6_p"
type: INNER_PRODUCT
bottom: "pool5_p"
top: "fc6_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6_p"
type: RELU
bottom: "fc6_p"
top: "fc6_p"
}
layers {
name: "drop6_p"
type: DROPOUT
bottom: "fc6_p"
top: "fc6_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7_p"
type: INNER_PRODUCT
bottom: "fc6_p"
top: "fc7_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7_p"
type: RELU
bottom: "fc7_p"
top: "fc7_p"
}
layers {
name: "drop7_p"
type: DROPOUT
bottom: "fc7_p"
top: "fc7_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "loss"
type: CONTRASTIVE_LOSS
contrastive_loss_param {
margin: 1.0
}
bottom: "fc7"
bottom: "fc7_p"
bottom: "label"
top: "loss"
}
我的训练文件结构: 0 不相似,1 相似
train1.txt:
/aer/img1_1.jpg 0
/aer/img1_2.jpg 1
/aer/img1_3.jpg 1
train2.txt:
/tpd/img2_1.jpg 0
/tpd/img2_2.jpg 1
/tpd/img2_3.jpg 1
What should I set my "num_output"?
在了解您应该设置多少 num_output
之前,让我们解释一下它的含义。事实上,你可以将 Simense 网络的两侧,data -> fc7
,data_p -> fc7_p
视为 2 个特征提取器。每个都从相应数据层的图像中提取特征,例如 fc7
和 fc7_p
。所以num_output
定义了提取特征向量的维度。
在训练过程中,ContrastiveLoss
层总是尝试在两个提取的特征向量代表的图像相似时(label == 1
)最小化距离,在不相似(label == 0
).即特征向量的距离越小,图像越相似。
那么特征向量的最佳维度是多少才能最好地包含指示相似性的信息?或者你应该设置什么num_output
?可能没有一个确切的值,这取决于特征提取器的编码质量(您可以将特征视为图像的代码)以及识别图像相似性的难易程度。所以基本上如果网络(特征提取器)很深并且不难识别相似性,你可以选择相对较小的 num_output
例如 200,因为特征可能会被更大的网络编码得更好并且更多歧视性的。如果不是,您可以尝试更大的值,例如500、1000 或尝试更复杂的网络。
如果你想尝试 MultinomialLogisticLoss
而不是 ContrastiveLoss
层,你应该首先使用像CONCAT
然后将其送入 SOFTMAX_LOSS
层,如下所示:
...#original layers
layers {
name: "concat"
type: CONCAT
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat" # concatenate fc7 and fc7_p along channel axis
}
layer {
name: "fc_cls"
type: INNER_PRODUCT
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2 # a binary classification problem in this case
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: ACCURACY
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: SOFTMAX_LOSS
bottom: "fc_cls"
bottom: "label"
top: "loss"
}
更新
Which is the best method to implement in order to compare similarity and use it for deploy, Constrastive Loss or SoftMax Loss?
Softmax Loss 简单易部署。但它只能给你二进制预测,即相似或不相似。它给出的 2 class(相似,不相似)的概率分布通常太硬(不均匀),例如[0.9*, 0.0*]
, [0.0*, 0.9*]
,....很多情况下并不能很好的反映真实的输入相似度
在使用 Constrastive Loss 时,您可以获得图像的判别特征向量。您可以使用该向量来计算相似性概率,就像 CVPR 2005 论文 Learning a Similarity Metric Discriminatively, with Application to Face Verification did in Section 4.1.(The key point is to compute a multivariate normal density using the feature vectors generated from the images belonging to a same subject). Also you can use a threshold to control the false positive rate and the false negative rate of the model to get a ROC curve 更好地评估模型一样。
对了,挖掘更多CNN架构来预测相似度,可以参考CVPR 2015论文Learning to Compare Image Patches via Convolutional Neural Networks.
只是为了更正 Dale 上面关于 Caffe 超级敏感语法的伟大
layer {
name: "concat"
type: "Concat"
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat"
}
layer {
name: "fc_cls"
type: "InnerProduct"
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc_cls"
bottom: "label"
top: "loss"
}
我相信num_output
定义了提取的特征向量的维度,然后提取的特征可以用来确定L2
距离。如果 L2
距离大于 1 那么它是一个不同的 class 如果它接近 0图像相似。休息戴尔答案是完美的。