Deep Learning Notes (1): Adding a New Attention LSTM Layer to Caffe, and a Close Reading of the LSTM Layer Code


    A summary of my recent work: reading the LSTM layer code, and adding a new Attention LSTM layer to Caffe.

    LSTM layer

    The key code is shown below; it is best read alongside Figure 1, which is taken from another blog post.

namespace caffe {

template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
  names->resize(2);
  (*names)[0] = "h_0";
  (*names)[1] = "c_0";  // declare the recurrent inputs h_0 and c_0
}

template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
  names->resize(2);
  (*names)[0] = "h_" + this->int_to_str(this->T_);
  (*names)[1] = "c_T";  // declare the recurrent outputs, the states at the last time step
}

template <typename Dtype>
void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
  names->resize(1);
  (*names)[0] = "h";  // the final output h
}

template <typename Dtype>
void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
  const int num_output = this->layer_param_.recurrent_param().num_output();
  CHECK_GT(num_output, 0) << "num_output must be positive";
  const FillerParameter& weight_filler =
      this->layer_param_.recurrent_param().weight_filler();
  const FillerParameter& bias_filler =
      this->layer_param_.recurrent_param().bias_filler();  // fillers for the weights W and the bias b

  // Add generic LayerParameter's (without bottoms/tops) of layer types we'll
  // use to save redundant code.
  LayerParameter hidden_param;
  hidden_param.set_type("InnerProduct");
  hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4);
  hidden_param.mutable_inner_product_param()->set_bias_term(false);
  hidden_param.mutable_inner_product_param()->set_axis(2);
  hidden_param.mutable_inner_product_param()->
      mutable_weight_filler()->CopyFrom(weight_filler);

  LayerParameter biased_hidden_param(hidden_param);
  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
  biased_hidden_param.mutable_inner_product_param()->
      mutable_bias_filler()->CopyFrom(bias_filler);

  LayerParameter sum_param;
  sum_param.set_type("Eltwise");
  sum_param.mutable_eltwise_param()->set_operation(
      EltwiseParameter_EltwiseOp_SUM);

  LayerParameter slice_param;
  slice_param.set_type("Slice");
  slice_param.mutable_slice_param()->set_axis(0);

  LayerParameter split_param;
  split_param.set_type("Split");

  BlobShape input_shape;
  input_shape.add_dim(1);  // e.g. 1 x 3 x 256: 256 is the num_output set on the LSTM layer, 3 is the buffer size (number of parallel streams N)
  input_shape.add_dim(this->N_);
  input_shape.add_dim(num_output);

  net_param->add_input("c_0");
  net_param->add_input_shape()->CopyFrom(input_shape);

  net_param->add_input("h_0");
  net_param->add_input_shape()->CopyFrom(input_shape);

  LayerParameter* cont_slice_param = net_param->add_layer();
  cont_slice_param->CopyFrom(slice_param);
  cont_slice_param->set_name("cont_slice");
  cont_slice_param->add_bottom("cont");
  cont_slice_param->mutable_slice_param()->set_axis(1);  // cont is 0 or 1; slice it into one piece per time step

  // Add layer to transform all timesteps of x to the hidden state dimension.
  //     W_xc_x = W_xc * x + b_c
  {
    LayerParameter* x_transform_param = net_param->add_layer();
    x_transform_param->CopyFrom(biased_hidden_param);
    x_transform_param->set_name("x_transform");
    x_transform_param->add_param()->set_name("W_xc");
    x_transform_param->add_param()->set_name("b_c");
    x_transform_param->add_bottom("x");
    x_transform_param->add_top("W_xc_x");
    // A fully connected layer computing W_xc * x + b_c; this is the dimension change
    // shown in the figure. The last dimension must become 4 * num_output (1024 here),
    // because W_xc * x + b_c stacks the pre-activations of all four gates,
    // i.e. it can be viewed as [[i], [f], [o], [g]].
  }
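  // Shape note (illustrative, not in the original source): assuming T_ = 10 frames,
  // N_ = 3 streams and num_output = 256, the input x is 10 x 3 x D and, after the
  // "x_transform" InnerProduct (axis = 2, num_output * 4 outputs), W_xc_x is
  // 10 x 3 x 1024, i.e. one stacked gate pre-activation vector per frame and stream.
  // This is the blob that the next Slice layer cuts along the time axis.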
  LayerParameter* x_slice_param = net_param->add_layer();
  x_slice_param->CopyFrom(slice_param);
  x_slice_param->add_bottom("W_xc_x");
  x_slice_param->set_name("W_xc_x_slice");
  // After the dimension transform, slice W_xc_x along the time axis.
  // For example, a 10-frame video is then fed into the unrolled net frame by frame (see the figure).

  LayerParameter output_concat_layer;
  output_concat_layer.set_name("h_concat");
  output_concat_layer.set_type("Concat");
  output_concat_layer.add_top("h");
  output_concat_layer.mutable_concat_param()->set_axis(0);
  // The LSTM emits h_t at every time step; whether you use only the last one or
  // all of them depends on how you write your prototxt.

  for (int t = 1; t <= this->T_; ++t) {  // unroll the LSTM over the T time steps
    string tm1s = this->int_to_str(t - 1);
    string ts = this->int_to_str(t);

    cont_slice_param->add_top("cont_" + ts);
    x_slice_param->add_top("W_xc_x_" + ts);

    // Add layers to flush the hidden state when beginning a new
    // sequence, as indicated by cont_t.
    //     h_conted_{t-1} := cont_t * h_{t-1}
    //
    // Normally, cont_t is binary (i.e., 0 or 1), so:
    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
    //                       0       otherwise
    {
      LayerParameter* cont_h_param = net_param->add_layer();
      cont_h_param->CopyFrom(sum_param);
      cont_h_param->mutable_eltwise_param()->set_coeff_blob(true);
      cont_h_param->set_name("h_conted_" + tm1s);
      cont_h_param->add_bottom("h_" + tm1s);
      cont_h_param->add_bottom("cont_" + ts);
      cont_h_param->add_top("h_conted_" + tm1s);
      // h_conted_{t-1} is produced by scaling h_{t-1} with the corresponding cont slice.
    }

    // Add layer to compute
    //     W_hc_h_{t-1} := W_hc * h_conted_{t-1}
    {
      LayerParameter* w_param = net_param->add_layer();
      w_param->CopyFrom(hidden_param);
      w_param->set_name("transform_" + ts);
      w_param->add_param()->set_name("W_hc");
      w_param->add_bottom("h_conted_" + tm1s);
      w_param->add_top("W_hc_h_" + tm1s);
      w_param->mutable_inner_product_param()->set_axis(2);  // compute W_hc * h_conted_{t-1}
    }

    // Add the outputs of the linear transformations to compute the gate input.
    //     gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
    //                   = W_hc_h_{t-1} + W_xc_x_t + b_c
    {
      LayerParameter* input_sum_layer = net_param->add_layer();
      input_sum_layer->CopyFrom(sum_param);
      input_sum_layer->set_name("gate_input_" + ts);
      input_sum_layer->add_bottom("W_hc_h_" + tm1s);
      input_sum_layer->add_bottom("W_xc_x_" + ts);
      if (this->static_input_) {
        input_sum_layer->add_bottom("W_xc_x_static");
      }
      input_sum_layer->add_top("gate_input_" + ts);
      // Per the LSTM equations, this sum is the pre-activation of the four gates.
    }

    // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t.
    // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t
    // Outputs: c_t, h_t
    //     [ i_t' ]
    //     [ f_t' ] := gate_input_t
    //     [ o_t' ]
    //     [ g_t' ]
    //         i_t := \sigmoid[i_t']
    //         f_t := \sigmoid[f_t']
    //         o_t := \sigmoid[o_t']
    //         g_t := \tanh[g_t']
    //         c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
    //         h_t := o_t .* \tanh[c_t]
    {
      LayerParameter* lstm_unit_param = net_param->add_layer();
      lstm_unit_param->set_type("LSTMUnit");
      lstm_unit_param->add_bottom("c_" + tm1s);
      lstm_unit_param->add_bottom("gate_input_" + ts);
      lstm_unit_param->add_bottom("cont_" + ts);
      lstm_unit_param->add_top("c_" + ts);
      lstm_unit_param->add_top("h_" + ts);
      lstm_unit_param->set_name("unit_" + ts);
      // The LSTMUnit applies a different nonlinearity to each of the four gate inputs.
    }
    output_concat_layer.add_bottom("h_" + ts);  // collect the hidden state of this step for the final concat
  }  // for (int t = 1; t <= this->T_; ++t)

  {
    LayerParameter* c_T_copy_param = net_param->add_layer();
    c_T_copy_param->CopyFrom(split_param);
    c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_));
    c_T_copy_param->add_top("c_T");
    // Split makes copies of c_T so that several upper layers can share this blob.
  }
  net_param->add_layer()->CopyFrom(output_concat_layer);
}

INSTANTIATE_CLASS(LSTMLayer);
REGISTER_LAYER_CLASS(LSTM);  // register the LSTM layer

}  // namespace caffe
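    As a usage reference, here is a minimal sketch (not from the original post) of how this layer might be declared in a prototxt. It assumes an input blob x of shape T x N x D and a continuation-indicator blob cont of shape T x N (0 at the first step of a sequence, 1 elsewhere); the layer name and filler settings are only illustrative.

    layer {
      name: "lstm1"
      type: "LSTM"
      bottom: "x"      # T x N x D input sequence
      bottom: "cont"   # T x N sequence-continuation indicators (0 starts a new sequence)
      top: "h"         # T x N x 256 hidden state for every time step
      recurrent_param {
        num_output: 256
        weight_filler { type: "uniform" min: -0.08 max: 0.08 }
        bias_filler { type: "constant" value: 0 }
      }
    }

    Because the top h concatenates every h_t along axis 0, a model that only needs the final state can slice h along axis 0 in the same prototxt and keep just the last time step.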

    The above is a walkthrough of the LSTM layer code; the overall flow is fairly clear. But with the rise of attention models, many architectures need an attention LSTM, and Caffe has no ALSTM layer, so I wrote one myself. Since that write-up is rather long, it is left for the next post.

    Please credit the original source when reposting: https://ju.6miu.com/read-1903.html
