/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#pragma once

#include <tuple>
#include <cmath>
#include <limits>
#include <algorithm>
#include <numeric>

#include <dmlc/omp.h>

#include "ctc_helper.h"

namespace mxnet_warpctc {

template<typename ProbT>
class CpuCTC {
public:
    CpuCTC(int alphabet_size, int minibatch, void* workspace,
           int blank_label) :
            alphabet_size_(alphabet_size), minibatch_(minibatch),
            workspace_(workspace), blank_label_(blank_label) {
    }

    // Noncopyable
    CpuCTC(const CpuCTC&) = delete;
    CpuCTC& operator=(const CpuCTC&) = delete;

    ctcStatus_t cost_and_grad(const ProbT* const activations,
                              ProbT* grads,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

    ctcStatus_t score_forward(const ProbT* const activations,
                              ProbT* costs,
                              const int* const flat_labels,
                              const int* const label_lengths,
                              const int* const input_lengths);

private:

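    // Per-example scratch carved out of the shared workspace buffer:
    // alphas (S x T forward trellis), betas (a single column of S backward
    // values), the label sequence interleaved with blanks, the s_inc/e_inc
    // tables that drive the pruned trellis window, and an 'output' accumulator
    // of size alphabet_size used per time step. 'repeats' counts adjacent
    // duplicate labels in the transcription.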
    class CpuCTC_metadata {

    private:
        int setup_labels(const int* const labels, int blank_label, int L, int S);

    public:
        CpuCTC_metadata(int L, int S, int T, int mb, int alphabet_size,
                        void* workspace, size_t bytes_used, int blank_label,
                        const int* const labels);

        ProbT* alphas;
        ProbT* betas;
        int* labels_w_blanks;
        int* e_inc;
        int* s_inc;
        ProbT* output;
        int repeats;
    };

    int alphabet_size_; // Number of characters plus blank
    int minibatch_;
    void* workspace_;
    int blank_label_;

    void log_softmax(const ProbT* const activations, ProbT* log_probs,
                     const int* const input_lengths);

    std::tuple<ProbT, bool>
            cost_and_grad_kernel(ProbT* grad, const ProbT* const log_probs,
                                 const int* const labels, int T, int L,
                                 int mb, size_t bytes_used);

    ProbT compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
                         const int* const e_inc,
                         const int* const s_inc,
                         const int* const labels,
                         ProbT* alphas);

    ProbT compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
                                 ProbT log_partition, int repeats,
                                 int S, int T, const int* const e_inc,
                                 const int* const s_inc,
                                 const int* const labels,
                                 ProbT* alphas,
                                 ProbT* betas,
                                 ProbT* output);
};

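// Example: a minimal sketch of how a caller might drive CpuCTC. It assumes the
// workspace has already been sized and allocated by the surrounding code (the
// MXNet CTC operator does this; upstream warp-ctc exposes get_workspace_size()
// for the same purpose), so the buffer name and sizes below are illustrative
// only and not part of this header's API.
//
//   std::vector<char> workspace(workspace_bytes);            // sized elsewhere
//   CpuCTC<float> ctc(alphabet_size, minibatch, workspace.data(), 0 /*blank*/);
//   std::vector<float> grads(alphabet_size * minibatch * max_T);  // zero-initialized
//   std::vector<float> costs(minibatch);
//   ctcStatus_t status = ctc.cost_and_grad(activations, grads.data(), costs.data(),
//                                          flat_labels, label_lengths, input_lengths);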
template<typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L, int S, int T, int mb,
                                                int alphabet_size,
                                                void* workspace, size_t bytes_used,
                                                int blank_label,
                                                const int* const labels) {

    alphas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S * T;
    std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());

    betas = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * S;
    std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());

    labels_w_blanks = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    e_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    s_inc = reinterpret_cast<int *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(int) * S;

    output = reinterpret_cast<ProbT *>(static_cast<char *>(workspace) + bytes_used);
    bytes_used += sizeof(ProbT) * alphabet_size;

    repeats = setup_labels(labels, blank_label, L, S);
}

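// Worked example for the routine below: for labels {1, 1, 2} with blank_label 0,
// L = 3 and S = 7, setup_labels produces
//   labels_w_blanks = {0, 1, 0, 1, 0, 2, 0}
//   s_inc = {1, 1, 1, 2},  e_inc = {1, 1, 2, 1},  repeats = 1
// The increment tables record how far the active window over labels_w_blanks
// may advance per step: an entry of 2 where consecutive labels differ (the
// blank between them can be skipped), and two entries of 1 where a label
// repeats, since the separating blank is then mandatory.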
template<typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
                                                 int blank_label, int L, int S) {
    int e_counter = 0;
    int s_counter = 0;

    s_inc[s_counter++] = 1;

    int repeats = 0;

    for (int i = 1; i < L; ++i) {
        if (labels[i-1] == labels[i]) {
            s_inc[s_counter++] = 1;
            s_inc[s_counter++] = 1;
            e_inc[e_counter++] = 1;
            e_inc[e_counter++] = 1;
            ++repeats;
        } else {
            s_inc[s_counter++] = 2;
            e_inc[e_counter++] = 2;
        }
    }
    e_inc[e_counter++] = 1;

    for (int i = 0; i < L; ++i) {
        labels_w_blanks[2 * i] = blank_label;
        labels_w_blanks[2 * i + 1] = labels[i];
    }
    labels_w_blanks[S - 1] = blank_label;

    return repeats;
}

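// Activations are laid out time-major across the whole minibatch: the column
// for (time step c, example mb) starts at (mb + minibatch_ * c) * alphabet_size_.
// log_softmax writes log-probabilities into log_probs with the same layout,
// using the usual max-subtraction for numerical stability.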
template<typename ProbT>
void
CpuCTC<ProbT>::log_softmax(const ProbT* const activations, ProbT* log_probs,
                           const int* const input_lengths) {
#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        for (int c = 0; c < input_lengths[mb]; ++c) {
            int col_offset = (mb + minibatch_ * c) * alphabet_size_;
            ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
            for (int r = 0; r < alphabet_size_; ++r)
                max_activation = std::max(max_activation, activations[r + col_offset]);

            ProbT denom = ProbT(0.);
            for (int r = 0; r < alphabet_size_; ++r) {
                denom += std::exp(activations[r + col_offset] - max_activation);
            }

            for (int r = 0; r < alphabet_size_; ++r) {
                log_probs[r + col_offset] = activations[r + col_offset]
                                            - max_activation - std::log(denom);
            }
        }
    }
}

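// Per-example kernel: builds the blank-augmented label sequence (S = 2L + 1),
// runs the forward pass to obtain the log-likelihood, then the backward pass
// to fill in the gradient. If the sequence cannot be aligned (L + repeats > T)
// it returns a cost of 0 (see the TODO below), and it flags cases where the
// forward and backward log-likelihoods disagree by more than
// ctc_helper::threshold.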
template<typename ProbT>
std::tuple<ProbT, bool>
CpuCTC<ProbT>::cost_and_grad_kernel(ProbT* grad, const ProbT* const log_probs,
                                    const int* const labels,
                                    int T, int L, int mb, size_t bytes_used) {

    const int S = 2*L + 1; // Number of labels with blanks

    CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_, bytes_used, blank_label_, labels);

    bool over_threshold = false;

    if (L + ctcm.repeats > T) {
        return std::make_tuple(ProbT(0), over_threshold); // TODO, not right to return 0
    }

    ProbT llForward = compute_alphas(log_probs, ctcm.repeats, S, T, ctcm.e_inc,
                                     ctcm.s_inc, ctcm.labels_w_blanks,
                                     ctcm.alphas);

    ProbT llBackward = compute_betas_and_grad(grad, log_probs, llForward, ctcm.repeats,
                                              S, T, ctcm.e_inc, ctcm.s_inc,
                                              ctcm.labels_w_blanks,
                                              ctcm.alphas,
                                              ctcm.betas,
                                              ctcm.output);

    ProbT diff = std::abs(llForward - llBackward);
    if (diff > ctc_helper::threshold) {
        over_threshold = true;
    }

    return std::make_tuple(-llForward, over_threshold);
}

// Computes forward probabilities.
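// Only the reachable band of the (S, T) trellis is visited: 'start' and 'end'
// delimit the states that are both reachable from the beginning of the sequence
// and can still complete the remaining labels in the time steps left; they are
// advanced each step using the precomputed s_inc/e_inc tables. The standard CTC
// forward recurrence is applied in log space via ctc_helper::log_plus.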
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* log_probs, int repeats, int S, int T,
                                    const int* const e_inc,
                                    const int* const s_inc,
                                    const int* const labels,
                                    ProbT* alphas) {

    int start = (((S / 2) + repeats - T) < 0) ? 0 : 1,
        end = S > 1 ? 2 : 1;

    for (int i = start; i < end; ++i) {
        alphas[i] = log_probs[labels[i]];
    }

    for (int t = 1; t < T; ++t) {
        int remain = (S / 2) + repeats - (T - t);
        if (remain >= 0)
            start += s_inc[remain];
        if (t <= (S / 2) + repeats)
            end += e_inc[t - 1];
        int startloop = start;
        int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * (alphabet_size_ * minibatch_);

        if (start == 0) {
            alphas[idx1] = alphas[idx2] + log_probs[blank_label_ + idx3];
            startloop += 1;
        }

        for (int i = startloop; i < end; ++i) {
            ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2], alphas[(i-1) + idx2]);

            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i-2])
                prev_sum = ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i-2) + idx2]);

            alphas[i + idx1] = prev_sum + log_probs[labels[i] + idx3];
        }
    }

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
    }

    return loglike;
}


// Starting from t = T - 1, we sweep backward over the alpha array, computing one
// column of betas as we go. At each position we update the product alpha * beta
// (in log space) and accumulate it into the gradient associated with that label.
// NOTE: computes the gradient w.r.t. the UNNORMALIZED final-layer activations.
// The grads passed in are assumed to be already zeroed!
template<typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad, const ProbT* const log_probs,
                                            ProbT log_partition, int repeats,
                                            int S, int T, const int* const e_inc,
                                            const int* const s_inc,
                                            const int* const labels,
                                            ProbT* alphas,
                                            ProbT* betas,
                                            ProbT* output) {
    int start = S > 1 ? (S - 2) : 0,
        end = (T > (S / 2) + repeats) ? S : S - 1;

    std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

    // Set the starting values in the beta column at the very right edge.
    for (int i = start; i < end; ++i) {
        betas[i] = log_probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)];

        // Compute alpha * beta in log space at this position in (S, T) space.
        alphas[i + (T - 1) * S] += betas[i];

        // Update the gradient associated with this label,
        // essentially performing a reduce-by-key in a sequential manner.
        output[labels[i]] =
                ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S], output[labels[i]]);
    }

    // Update the gradient w.r.t. each unique label.
    for (int i = 0; i < alphabet_size_; ++i) {
        int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;

        if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
            log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
            grad[idx3] = std::exp(log_probs[idx3]);
        } else {
            grad[idx3] = std::exp(log_probs[idx3])
                         - std::exp(output[i] - log_probs[idx3] - log_partition);
        }
    }

    // Loop from the second-to-last column all the way to the left.
    for (int t = T - 2; t >= 0; --t) {
        int remain = (S / 2) + repeats - (T - t);
        if (remain >= -1)
            start -= s_inc[remain + 1];
        if (t < (S / 2) + repeats)
            end -= e_inc[t];

        int endloop = end == S ? end - 1 : end;
        int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);

        std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());

        for (int i = start; i < endloop; ++i) {
            ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i+1)]);
            // Skip two if not on blank and not on repeat.
            if (labels[i] != blank_label_ && i != (S-2) && labels[i] != labels[i+2]) {
                next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i+2)]);
            }
            betas[i] = next_sum + log_probs[labels[i] + idx3];

            // Compute alpha * beta in log space.
            alphas[i + idx1] += betas[i];

            // Update the gradient associated with this label.
            output[labels[i]] =
                    ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
        }

        if (end == S) {
            betas[(S-1)] = betas[(S-1)] + log_probs[blank_label_ + idx3];
            alphas[(S-1) + idx1] += betas[(S-1)];

            output[labels[S-1]] =
                    ctc_helper::log_plus<ProbT>()(alphas[S-1 + idx1], output[labels[S-1]]);
        }

        // Go over the unique labels and compute the final grad
        // w.r.t. each one at this time step.
        for (int i = 0; i < alphabet_size_; ++i) {

            if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
                log_probs[idx3] == ctc_helper::neg_inf<ProbT>()) {
                grad[idx3] = std::exp(log_probs[idx3]);
            } else {
                grad[idx3] = std::exp(log_probs[idx3])
                             - std::exp(output[i] - log_probs[idx3] - log_partition);
            }
            ++idx3;
        }
    }

    ProbT loglike = ctc_helper::neg_inf<ProbT>();
    for (int i = start; i < end; ++i) {
        loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
    }

    return loglike;
}

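// Workspace layout used below: the front of the shared buffer holds the
// log-softmax output for the whole minibatch (minibatch_ * alphabet_size_ * maxT
// values), followed by one fixed-size block of per-example scratch (alphas,
// betas, labels_w_blanks, e_inc, s_inc, output) per minibatch entry, so the
// OpenMP loop can process examples independently.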
template<typename ProbT>
ctcStatus_t
CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
                             ProbT* grads,
                             ProbT* costs,
                             const int* const flat_labels,
                             const int* const label_lengths,
                             const int* const input_lengths) {
    if (activations == nullptr ||
        grads == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* log_probs = static_cast<ProbT *>(workspace_);

    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // Per-minibatch memory.
    size_t per_minibatch_bytes = 0;

    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_;

    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT;

    // betas
    per_minibatch_bytes += sizeof(float) * maxS;

    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    log_softmax(activations, log_probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription

        bool mb_status;

        std::tie(costs[mb], mb_status) =
                cost_and_grad_kernel(grads + mb * alphabet_size_,
                                     log_probs + mb * alphabet_size_,
                                     flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
                                     T, L, mb,
                                     bytes_used + mb * per_minibatch_bytes);
    }

    return CTC_STATUS_SUCCESS;
}

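// Forward-only scoring: the same setup as cost_and_grad, but only the alpha
// (forward) pass is run per example, so costs are produced without gradients.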
template<typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
                                         ProbT* costs,
                                         const int* const flat_labels,
                                         const int* const label_lengths,
                                         const int* const input_lengths) {
    if (activations == nullptr ||
        costs == nullptr ||
        flat_labels == nullptr ||
        label_lengths == nullptr ||
        input_lengths == nullptr)
        return CTC_STATUS_INVALID_VALUE;

    ProbT* log_probs = static_cast<ProbT *>(workspace_);

    int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);

    size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;

    // Per-minibatch memory.
    size_t per_minibatch_bytes = 0;

    int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
    int maxS = 2 * maxL + 1;

    // output
    per_minibatch_bytes += sizeof(float) * alphabet_size_;

    // alphas
    per_minibatch_bytes += sizeof(float) * maxS * maxT;

    // betas
    per_minibatch_bytes += sizeof(float) * maxS;

    // labels w/blanks, e_inc, s_inc
    per_minibatch_bytes += 3 * sizeof(int) * maxS;

    log_softmax(activations, log_probs, input_lengths);

#pragma omp parallel for
    for (int mb = 0; mb < minibatch_; ++mb) {
        const int T = input_lengths[mb]; // Length of utterance (time)
        const int L = label_lengths[mb]; // Number of labels in transcription
        const int S = 2*L + 1;           // Number of labels with blanks

        CpuCTC_metadata ctcm(L, S, T, mb, alphabet_size_, workspace_,
                             bytes_used + mb * per_minibatch_bytes, blank_label_,
                             flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));

        if (L + ctcm.repeats > T) {
            costs[mb] = ProbT(0);
        } else {
            costs[mb] = -compute_alphas(log_probs + mb * alphabet_size_, ctcm.repeats, S, T,
                                        ctcm.e_inc, ctcm.s_inc, ctcm.labels_w_blanks,
                                        ctcm.alphas);
        }
    }

    return CTC_STATUS_SUCCESS;
}

}  // namespace mxnet_warpctc