/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file warpctc-inl.h
 * \brief warpctc operator
 * \author Liang Xiang
 */
#ifndef PLUGIN_WARPCTC_WARPCTC_INL_H_
#define PLUGIN_WARPCTC_WARPCTC_INL_H_

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
#include <stdio.h>
#include <ctc.h>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include <utility>
#include <iostream>
#include "../../src/operator/operator_common.h"

namespace mxnet {
namespace op {

namespace warpctc_enum {
enum CTCOpInputs {kData, kLabel};
enum CTCOpOutputs {kOut};
enum CTCTemp {kTmp};
}  // namespace warpctc_enum

struct WarpCTCParam : public dmlc::Parameter<WarpCTCParam> {
  int label_length;
  int input_length;
  DMLC_DECLARE_PARAMETER(WarpCTCParam) {
    DMLC_DECLARE_FIELD(label_length)
      .set_default(0)
      .describe("Real label length");
    DMLC_DECLARE_FIELD(input_length)
      .set_default(0)
      .describe("Input length");
  }
};

template<typename xpu>
class WarpCTCOp : public Operator {
 private:
  WarpCTCParam param_;

 public:
  explicit WarpCTCOp(WarpCTCParam p) {
    this->param_ = p;
  }

  ~WarpCTCOp() {
  }

  // Throws a std::runtime_error carrying the warp-ctc status string whenever
  // a warp-ctc call does not return CTC_STATUS_SUCCESS.
  inline void throw_on_error(ctcStatus_t status, const char* message) {
    if (status != CTC_STATUS_SUCCESS) {
      throw std::runtime_error(message
                               + (", stat = "
                                  + std::string(ctcGetStatusString(status))));
    }
  }

  // The forward pass only applies a softmax over the input activations;
  // the CTC loss itself is computed in Backward.
  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_data.size(), 2) << "CTCOutput Input: [data, label]";
    CHECK_EQ(out_data.size(), 1) << "CTCOutput Output: [output]";

    Stream<xpu> *s = ctx.get_stream<xpu>();
    TBlob data = in_data[warpctc_enum::kData];
    TBlob out = out_data[warpctc_enum::kOut];
    Tensor<xpu, 2, float> data_tensor = data.FlatTo2D<xpu, float>(s);
    Tensor<xpu, 2, float> out_tensor = out.FlatTo2D<xpu, float>(s);
    Softmax(out_tensor, data_tensor);
  }

  // Counts the non-blank labels of every sequence in the minibatch and
  // accumulates the overall number of non-blank labels in *total_length.
  std::vector<int> labelLengths(const int * flat_labels, int minibatch,
                                int size, int blank, int * total_length) {
    CHECK_EQ(param_.label_length * minibatch, size)
        << "label size should = label_length * minibatch";
    std::vector<int> ret(minibatch, 0);
    for (int i = 0; i < size; i++) {
      if (flat_labels[i] == blank) {
        continue;
      }
      int b = i / param_.label_length;
      ret[b]++;
      (*total_length)++;
    }
    return ret;
  }

  // Packs all non-blank labels from flat_labels into cpu_labels.
  void removeBlank(const int * flat_labels, int * cpu_labels,
                   int size, int blank) {
    int k = 0;
    for (int i = 0; i < size; i++) {
      if (flat_labels[i] != blank) {
        cpu_labels[k] = flat_labels[i];
        k += 1;
      }
    }
  }

  // The backward pass computes the CTC loss with baidu/warp-ctc and writes
  // the gradient w.r.t. the input activations into in_grad[kData].
  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    Stream<xpu> *s = ctx.get_stream<xpu>();
    TBlob data = in_data[warpctc_enum::kData];
    TBlob label = in_data[warpctc_enum::kLabel];
    CHECK_EQ(data.shape_.ndim(), 2) << "input data shape should be 2 (t*n, p)";
    ctcOptions info;  // please update to the latest baidu/warp-ctc NOLINT(*)
    // Device dispatch: when built without CUDA, the GPU branch below collapses
    // into LOG(FATAL), so GPU inputs are rejected at runtime.
    if (data.dev_mask() == cpu::kDevMask) {
      info.loc = CTC_CPU;
      info.num_threads = 1;
    } else if (data.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
      info.loc = CTC_GPU;
      info.stream = ctx.get_stream<gpu>()->stream_;
    } else {
#endif
      LOG(FATAL) << "Unknown device type " << data.dev_mask();
    }
    info.blank_label = 0;

    // data is laid out as (input_length * batch, alphabet), so the batch size
    // is the number of rows divided by the input length.
    int T = param_.input_length;
    int minibatch = data.shape_[0] / T;
    int alphabet_size = data.shape_[1];
    std::vector<int> input_lengths;
    for (int i = 0; i < minibatch; i++) {
      input_lengths.push_back(T);
    }

#if MXNET_USE_CUDA
    cudaError_t cuda_status;
#endif
    float* activations = static_cast<float*>(data.dptr_);
    int* flat_labels = static_cast<int*>(label.dptr_);
    int* cpu_raw_labels = flat_labels;
    float* grads = static_cast<float*>(in_grad[warpctc_enum::kData].dptr_);
    if (data.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
      // warp-ctc expects the labels on the host, so copy them off the GPU.
      cpu_raw_labels = reinterpret_cast<int*>(malloc(sizeof(int) * label.Size()));
      cuda_status = cudaMemcpyAsync(cpu_raw_labels, flat_labels,
                                    label.Size()*sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    ctx.get_stream<gpu>()->stream_);
      CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
#endif
    }

    // Drop the blank (0) padding before handing the labels to warp-ctc.
    int total_label_length = 0;
    std::vector<int> label_lengths = labelLengths(cpu_raw_labels,
                                                  minibatch,
                                                  label.Size(),
                                                  0, &total_label_length);
    int* cpu_labels = reinterpret_cast<int*>(
        malloc(sizeof(int) * total_label_length));
    removeBlank(cpu_raw_labels, cpu_labels, label.Size(), 0);

    size_t alloc_bytes;
    throw_on_error(get_workspace_size(label_lengths.data(),
                                      input_lengths.data(),
                                      alphabet_size,
                                      input_lengths.size(), info,
                                      &alloc_bytes),
                   "Error: get_workspace_size in inf_test");

    Tensor<xpu, 1> ctc_workspace = ctx.requested[warpctc_enum::kTmp].get_space<xpu>(
        mshadow::Shape1(alloc_bytes), s);

    std::vector<float> costs(minibatch);
    throw_on_error(compute_ctc_loss(activations,
                                    grads,
                                    cpu_labels,
                                    label_lengths.data(),
                                    input_lengths.data(),
                                    alphabet_size,
                                    minibatch,
                                    costs.data(),
                                    ctc_workspace.dptr_,
                                    info),
                   "Error: compute_ctc_loss");

    if (data.dev_mask() == cpu::kDevMask) {
      free(cpu_labels);
    } else if (data.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
      free(cpu_raw_labels);
      free(cpu_labels);
#endif
    }
  }
};

template<typename xpu>
Operator* CreateOp(WarpCTCParam type);


#if DMLC_USE_CXX11
class WarpCTCProp : public OperatorProperty {
 public:
  std::vector<std::string> ListArguments() const override {
    return {"data", "label"};
  }

  virtual std::vector<std::string> ListOutputs() const {
    return {"output"};
  }

  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs)
      override {
    param_.Init(kwargs);
  }

  std::map<std::string, std::string> GetParams() const override {
    return param_.__DICT__();
  }

  bool InferShape(mxnet::ShapeVector *in_shape,
                  mxnet::ShapeVector *out_shape,
                  mxnet::ShapeVector *aux_shape) const override {
    using namespace mshadow;
    CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
    const mxnet::TShape &dshape = in_shape->at(0);
    if (dshape.ndim() == 0) return false;
    mxnet::TShape label_shape(dshape.ndim() - 1, 1);
    label_shape[0] = param_.label_length * (dshape[0] / param_.input_length);
    SHAPE_ASSIGN_CHECK(*in_shape, warpctc_enum::kLabel, label_shape);

    out_shape->clear();
    out_shape->push_back(dshape);
    return true;
  }

  virtual bool InferType(std::vector<int> *in_type,
                         std::vector<int> *out_type,
                         std::vector<int> *aux_type) const {
    CHECK_LE(in_type->size(), this->ListArguments().size());
    in_type->clear();
    in_type->push_back(mshadow::kFloat32);
    in_type->push_back(mshadow::kInt32);
    out_type->clear();
    out_type->push_back(mshadow::kFloat32);
    return true;
  }

  std::vector<ResourceRequest> BackwardResource(
      const mxnet::ShapeVector &in_shape) const override {
    return {ResourceRequest::kTempSpace};
  }

  OperatorProperty* Copy() const override {
    auto ptr = new WarpCTCProp();
    ptr->param_ = param_;
    return ptr;
  }

  std::string TypeString() const override {
    return "WarpCTC";
  }

  std::vector<int> DeclareBackwardDependency(const std::vector<int> &out_grad,
                                             const std::vector<int> &in_data,
                                             const std::vector<int> &out_data)
      const override {
    return {in_data[warpctc_enum::kData],
            in_data[warpctc_enum::kLabel],
            out_data[warpctc_enum::kOut]};
  }

  Operator* CreateOperator(Context ctx) const override;

 private:
  WarpCTCParam param_;
};
#endif  // DMLC_USE_CXX11

}  // namespace op
}  // namespace mxnet

#endif  // PLUGIN_WARPCTC_WARPCTC_INL_H_