/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <numeric>

#include "libjit_defs.h"

namespace {

template <typename ElemTy>
static void libjit_dump_tensor_console_impl(ElemTy *tensor, dim_t *dims,
                                            dim_t numDims) {
  // Check for 0-dimensional tensor.
  if (!numDims) {
    printf("[ Scalar containing: %.3f ]\n", (float)tensor[0]);
    return;
  }

  // Output shape.
  printf("shape: ( ");
  for (size_t i = 0; i < numDims; ++i) {
    printf("%zu ", (size_t)dims[i]);
  }
  printf(")\n");

  ElemTy mx = tensor[0];
  ElemTy mn = tensor[0];

  size_t size = 1;
  size_t sliceSize[numDims];
  for (size_t i = 0; i < numDims; ++i) {
    size *= dims[i];
  }

  for (ssize_t i = numDims - 1, curSliceSize = 1; i >= 0; --i) {
    sliceSize[i] = curSliceSize;
    curSliceSize *= dims[i];
  }

  for (size_t i = 0, e = size; i < e; i++) {
    mx = MAX(mx, tensor[i]);
    mn = MIN(mn, tensor[i]);
  }

  // Check for zero tensor.
  if (mn == .0 && mx == .0) {
    printf("[ Zero tensor ]\n");
    return;
  }

  // Output max and min.
  printf("max: %.3f min: %.3f\n", (float)mx, (float)mn);

  const unsigned maxNumElem = 100;

  printf("[");

  for (size_t i = 0, e = MIN(maxNumElem, size); i < e; i++) {
    // Print one open brace at the beginning of every row, slice, and tensor.
    for (size_t j = 0, e = numDims - 1; numDims > 1 && j < e; j++) {
      if (i % sliceSize[j] == 0) {
        // This iteration of the outer loop is a new row, slice or tensor.
        printf("[");
      }
    }

    // Print the value at the current index.
    printf("%.3f", (float)tensor[i]);

    // Print one closed brace at the end of every row, slice, or tensor.
    for (size_t j = 0, e = numDims - 1; numDims > 1 && j < e; j++) {
      size_t next_index = i + 1;
      if (next_index % sliceSize[j] == 0u) {
        printf("]");
      }
    }

    printf(", ");

    // Print one newline at the end of every row, slice, or tensor.
    for (size_t j = 0, e = numDims - 1; numDims > 1 && j < e; j++) {
      size_t next_index = i + 1;
      if (next_index % sliceSize[j] == 0u) {
        // The next iteration of the outer loop will be a new row, slice or
        // tensor.
        printf("\n");
      }
    }
  }

  if (size > maxNumElem) {
    printf("...");
  }

  printf("]\n");
}

template <typename ElemTy>
static void libjit_dump_tensor_txt_impl(ElemTy *tensor, size_t tensorElemSize,
                                        const char *filename,
                                        const char *header) {
  FILE *fh = fopen(filename, "w");
  if (!fh) {
    printf("ERROR opening file: '%s'!\n"
           "File name might be too long!\n",
           filename);
    return;
  }
  if (strlen(header)) {
    fprintf(fh, "%s\n", header);
  }
  for (size_t idx = 0, end = tensorElemSize; idx < end; idx++) {
    fprintf(fh, "%f, ", (double)tensor[idx]);
  }
  fclose(fh);
}
template <typename ElemTy>
static dim_t get_element_ptr(const ElemTy *tensor, const dim_t *dims,
                             dim_t numDims, const dim_t *indices,
                             dim_t numIndices) {
  dim_t index = 0;
  dim_t subdimensionSize = 1;
  for (dim_t i = numDims; i > 0; i--) {
    dim_t curIndicesValue = (i <= numIndices) ? indices[i - 1] : 0;
    index += subdimensionSize * curIndicesValue;
    subdimensionSize *= dims[i - 1];
  }
  return index;
}

template <typename ElemTy>
static void libjit_insert_tensor(ElemTy *tensor, ElemTy *slice, dim_t *offset,
                                 dim_t *tensorDim, dim_t *sliceDim,
                                 dim_t numDimsTensor, dim_t numDimsSlice,
                                 dim_t offsetDim, dim_t count, dim_t axis) {
  // Destination coordinates.
  dim_t C[5];

  // A local copy of the offsets buffer. We copy the buffer to make it clear
  // to the optimizer that the inputs don't alias. This loop is optimized away.
  dim_t offsets_cpy[5];
  for (dim_t i = 0; i < numDimsSlice; i++) {
    offsets_cpy[i] = offset[i];
  }

  if (numDimsSlice == 5) {
    for (dim_t c = 0; c < count; c++)
      for (dim_t x = 0; x < sliceDim[0]; x++)
        for (dim_t y = 0; y < sliceDim[1]; y++)
          for (dim_t z = 0; z < sliceDim[2]; z++)
            for (dim_t w = 0; w < sliceDim[3]; w++)
              for (dim_t q = 0; q < sliceDim[4]; q++) {
                const dim_t countAxisOffset = c * sliceDim[axis];
                C[0] =
                    x + offsets_cpy[0] + ((axis == 0) ? countAxisOffset : 0);
                C[1] =
                    y + offsets_cpy[1] + ((axis == 1) ? countAxisOffset : 0);
                C[2] =
                    z + offsets_cpy[2] + ((axis == 2) ? countAxisOffset : 0);
                C[3] =
                    w + offsets_cpy[3] + ((axis == 3) ? countAxisOffset : 0);
                C[4] =
                    q + offsets_cpy[4] + ((axis == 4) ? countAxisOffset : 0);
                tensor[libjit_getXYZWQ(tensorDim, C[0], C[1], C[2], C[3],
                                       C[4])] =
                    slice[libjit_getXYZWQ(sliceDim, x, y, z, w, q)];
              }
    return;
  }

  if (numDimsSlice == 4) {
    for (dim_t c = 0; c < count; c++)
      for (dim_t x = 0; x < sliceDim[0]; x++)
        for (dim_t y = 0; y < sliceDim[1]; y++)
          for (dim_t z = 0; z < sliceDim[2]; z++)
            for (dim_t w = 0; w < sliceDim[3]; w++) {
              const dim_t countAxisOffset = c * sliceDim[axis];
              C[0] = x + offsets_cpy[0] + ((axis == 0) ? countAxisOffset : 0);
              C[1] = y + offsets_cpy[1] + ((axis == 1) ? countAxisOffset : 0);
              C[2] = z + offsets_cpy[2] + ((axis == 2) ? countAxisOffset : 0);
              C[3] = w + offsets_cpy[3] + ((axis == 3) ? countAxisOffset : 0);
              tensor[libjit_getXYZW(tensorDim, C[0], C[1], C[2], C[3])] =
                  slice[libjit_getXYZW(sliceDim, x, y, z, w)];
            }
    return;
  }

  if (numDimsSlice == 3) {
    for (dim_t c = 0; c < count; c++)
      for (dim_t x = 0; x < sliceDim[0]; x++)
        for (dim_t y = 0; y < sliceDim[1]; y++)
          for (dim_t z = 0; z < sliceDim[2]; z++) {
            const dim_t countAxisOffset = c * sliceDim[axis];
            C[0] = x + offsets_cpy[0] + ((axis == 0) ? countAxisOffset : 0);
            C[1] = y + offsets_cpy[1] + ((axis == 1) ? countAxisOffset : 0);
            C[2] = z + offsets_cpy[2] + ((axis == 2) ? countAxisOffset : 0);
            tensor[libjit_getXYZ(tensorDim, C[0], C[1], C[2])] =
                slice[libjit_getXYZ(sliceDim, x, y, z)];
          }
    return;
  }

  if (numDimsSlice == 2) {
    for (dim_t c = 0; c < count; c++)
      for (dim_t x = 0; x < sliceDim[0]; x++)
        for (dim_t y = 0; y < sliceDim[1]; y++) {
          const dim_t countAxisOffset = c * sliceDim[axis];
          C[0] = x + offsets_cpy[0] + ((axis == 0) ? countAxisOffset : 0);
          C[1] = y + offsets_cpy[1] + ((axis == 1) ? countAxisOffset : 0);
          tensor[libjit_getXY(tensorDim, C[0], C[1])] =
              slice[libjit_getXY(sliceDim, x, y)];
        }
    return;
  }

  if (numDimsSlice == 1) {
    for (dim_t c = 0; c < count; c++)
      for (dim_t x = 0; x < sliceDim[0]; x++) {
        const dim_t countAxisOffset = c * sliceDim[axis];
        tensor[x + offsets_cpy[0] + ((axis == 0) ? countAxisOffset : 0)] =
            slice[x];
      }
    return;
  }
}
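// For illustration of the insert kernel above: with numDimsSlice == 2, a 2x3
// slice, offsets {1, 2}, count == 2 and axis == 0, the first copy of the
// slice lands at rows 1..2, columns 2..4 of the destination, and the second
// copy is shifted by countAxisOffset == 1 * sliceDim[0] == 2 along axis 0,
// i.e. rows 3..4.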
template <typename ElemTy>
static void libjit_extract_tensor(ElemTy *tensor, ElemTy *slice, dim_t *offset,
                                  dim_t *tensorDim, dim_t *sliceDim,
                                  dim_t numDimsTensor, dim_t numDimsSlice,
                                  dim_t offsetDim) {
  // Source coordinates.
  dim_t C[5];

  // A local copy of the offsets buffer. We copy the buffer to make it clear
  // to the optimizer that the inputs don't alias. This loop is optimized away.
  dim_t offsets_cpy[5];
  for (dim_t i = 0; i < numDimsSlice; i++) {
    offsets_cpy[i] = offset[i];
  }

  if (numDimsSlice == 5) {
    for (dim_t x = 0; x < sliceDim[0]; x++)
      for (dim_t y = 0; y < sliceDim[1]; y++)
        for (dim_t z = 0; z < sliceDim[2]; z++)
          for (dim_t w = 0; w < sliceDim[3]; w++)
            for (dim_t q = 0; q < sliceDim[4]; q++) {
              C[0] = x + offsets_cpy[0];
              C[1] = y + offsets_cpy[1];
              C[2] = z + offsets_cpy[2];
              C[3] = w + offsets_cpy[3];
              C[4] = q + offsets_cpy[4];
              slice[libjit_getXYZWQ(sliceDim, x, y, z, w, q)] =
                  tensor[libjit_getXYZWQ(tensorDim, C[0], C[1], C[2], C[3],
                                         C[4])];
            }
    return;
  }

  if (numDimsSlice == 4) {
    for (dim_t x = 0; x < sliceDim[0]; x++)
      for (dim_t y = 0; y < sliceDim[1]; y++)
        for (dim_t z = 0; z < sliceDim[2]; z++)
          for (dim_t w = 0; w < sliceDim[3]; w++) {
            C[0] = x + offsets_cpy[0];
            C[1] = y + offsets_cpy[1];
            C[2] = z + offsets_cpy[2];
            C[3] = w + offsets_cpy[3];
            slice[libjit_getXYZW(sliceDim, x, y, z, w)] =
                tensor[libjit_getXYZW(tensorDim, C[0], C[1], C[2], C[3])];
          }
    return;
  }

  if (numDimsSlice == 3) {
    for (dim_t x = 0; x < sliceDim[0]; x++)
      for (dim_t y = 0; y < sliceDim[1]; y++)
        for (dim_t z = 0; z < sliceDim[2]; z++) {
          C[0] = x + offsets_cpy[0];
          C[1] = y + offsets_cpy[1];
          C[2] = z + offsets_cpy[2];
          slice[libjit_getXYZ(sliceDim, x, y, z)] =
              tensor[libjit_getXYZ(tensorDim, C[0], C[1], C[2])];
        }
    return;
  }

  if (numDimsSlice == 2) {
    for (dim_t x = 0; x < sliceDim[0]; x++)
      for (dim_t y = 0; y < sliceDim[1]; y++) {
        C[0] = x + offsets_cpy[0];
        C[1] = y + offsets_cpy[1];
        slice[libjit_getXY(sliceDim, x, y)] =
            tensor[libjit_getXY(tensorDim, C[0], C[1])];
      }
    return;
  }

  if (numDimsSlice == 1) {
    for (dim_t x = 0; x < sliceDim[0]; x++) {
      slice[x] = tensor[x + offsets_cpy[0]];
    }
    return;
  }
}

/// Helper struct for TopK.
template <typename T, typename TI> struct value_index {
  TI index;
  T value;
};

/// Helper function for TopK.
template <typename T, typename TI>
static int value_index_sort(const void *va, const void *vb) {
  value_index<T, TI> *a = (value_index<T, TI> *)va;
  value_index<T, TI> *b = (value_index<T, TI> *)vb;
  if (a->value != b->value)
    return a->value > b->value ? -1 : 1;
  return a->index < b->index ? -1 : 1;
}

/// Generic Top-K function. Here, \p scratch is some allocated buffer space,
/// \p size is the size of the input, and \p n is the size of the last
/// dimension of the input.
template <typename T, typename TI>
static void libjit_topk(T *values, TI *indices, const T *input, void *scratch,
                        dim_t k, dim_t n, dim_t size) {
  dim_t in = 0;
  dim_t out = 0;

  // Initialize scratch with 0.
  memset(scratch, 0, 2 * n * sizeof(TI));

  value_index<T, TI> *buffer = (value_index<T, TI> *)scratch;

  // Specialize TopK for the case where K is 1.
  if (k == 1) {
    while (in < size) {
      // Find the largest value by iterating over the array instead of calling
      // 'sort'.
      value_index<T, TI> mx = {0, input[in]};
      for (TI i = 1; i < n; i++) {
        if (input[i + in] > mx.value) {
          mx = {i, input[i + in]};
        }
      }
      indices[out] = mx.index;
      values[out] = mx.value;
      out++;
      in += n;
    }
    return;
  }

  while (in < size) {
    for (dim_t i = 0; i < n; i++) {
      buffer[i].index = i;
      buffer[i].value = input[in++];
    }
    qsort(buffer, n, sizeof(value_index<T, TI>), value_index_sort<T, TI>);
    for (dim_t i = 0; i < k; i++) {
      indices[out] = buffer[i].index;
      values[out] = buffer[i].value;
      out++;
    }
  }
}
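// For illustration of the TopK kernel above: given one row
// [0.1, 0.9, 0.4, 0.9] with n == 4 and k == 2, the comparator sorts
// descending by value and breaks ties by the smaller index, so the kernel
// emits values {0.9, 0.9} and indices {1, 3}.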
template <typename T, typename IDX>
static void libjit_gather(T *dest, const T *data, const IDX *indices,
                          dim_t numIndices, dim_t sliceSize, dim_t numSamples,
                          dim_t sampleSize) {
  // The index of the slice that is being written.
  dim_t outIdx = 0;

  // For each sample in our batch:
  for (dim_t sample = 0; sample < numSamples; sample++) {
    dim_t sampleStart = sample * sampleSize;

    // For each slice that we fetch:
    for (dim_t i = 0; i < numIndices; i++) {
      dim_t slice = indices[i];

      // Copy the slice.
      memcpy(dest + outIdx * sliceSize, data + sampleStart + slice * sliceSize,
             sliceSize * sizeof(T));

      // Point to the next location in the destination tensor.
      outIdx++;
    }
  }
}

template <typename T, typename U>
static void libjit_gatherranges(T *output, U *lengths, const T *data,
                                const U *ranges, dim_t numExamples,
                                dim_t exampleSize) {
  // Indices into the output and range buffers.
  dim_t outputIdx = 0;
  dim_t rangesIdx = 0;

  // For each example:
  for (dim_t example = 0; example < numExamples; ++example) {
    // Keep track of the total length of the gathered ranges for the example.
    U totalLen = 0;

    // For each range:
    for (dim_t range = 0; range < exampleSize; ++range) {
      // Get the start and length of the range.
      const U start = ranges[rangesIdx];
      const U len = ranges[rangesIdx + 1];

      // Copy the specified elements.
      memcpy(output + outputIdx, data + start, len * sizeof(T));

      // len elements were copied, so increment the output index by len.
      outputIdx += len;

      // Each range is of the form (start, len), so increment the ranges
      // index by 2 to get to the next range.
      rangesIdx += 2;

      // Increment the total length for the example by len.
      totalLen += len;
    }

    // Record the total length of gathered ranges for the current example in
    // the lengths buffer.
    lengths[example] = totalLen;
  }
}

template <typename T, typename T2>
static void libjit_scatterdatacopy(T *data, const dim_t *dataDims,
                                   const T2 *indices, const T *slices,
                                   dim_t numIndices, dim_t indexSize,
                                   dim_t sliceSize) {
  for (dim_t i = 0; i < numIndices; i++) {
    dim_t destDataIdx = indices[i * indexSize];
    for (dim_t j = 1; j < indexSize; j++) {
      destDataIdx *= dataDims[j];
      destDataIdx += indices[i * indexSize + j];
    }
    memcpy(data + destDataIdx * sliceSize, slices + i * sliceSize,
           sliceSize * sizeof(T));
  }
}

template <typename T, typename T2>
static void libjit_scatterdataaddfloat(T *data, const dim_t *dataDims,
                                       const T2 *indices, const T *slices,
                                       dim_t numIndices, dim_t indexSize,
                                       dim_t sliceSize) {
  for (dim_t i = 0; i < numIndices; i++) {
    dim_t destDataIdx = indices[i * indexSize];
    for (dim_t j = 1; j < indexSize; j++) {
      destDataIdx *= dataDims[j];
      destDataIdx += indices[i * indexSize + j];
    }
    for (dim_t j = 0; j < sliceSize; j++) {
      data[destDataIdx * sliceSize + j] += slices[i * sliceSize + j];
    }
  }
}

template <typename T, typename T2>
static void libjit_scatterdataaddquantized(T *data, const dim_t *dataDims,
                                           const T2 *indices, const T *slices,
                                           dim_t numIndices, dim_t indexSize,
                                           dim_t sliceSize, float dataScale,
                                           int32_t dataOffset,
                                           float sliceScale,
                                           int32_t sliceOffset) {
  for (size_t i = 0; i < numIndices; i++) {
    size_t destDataIdx = indices[i * indexSize];
    for (size_t j = 1; j < indexSize; j++) {
      destDataIdx *= dataDims[j];
      destDataIdx += indices[i * indexSize + j];
    }
    for (size_t j = 0; j < sliceSize; j++) {
      float lhs = (data[destDataIdx * sliceSize + j] - dataOffset) * dataScale;
      float rhs = (slices[i * sliceSize + j] - sliceOffset) * sliceScale;
      T result = libjit_clip((lhs + rhs) / dataScale + dataOffset);
      data[destDataIdx * sliceSize + j] = result;
    }
  }
}
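// For illustration of the scatter kernels above: the index tuple is flattened
// in row-major order. With data dims {3, 4} and indexSize == 2, the tuple
// (1, 2) flattens to destDataIdx == 1 * 4 + 2 == 6, and one slice of
// sliceSize elements is copied (or accumulated) at that offset.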
template <typename T>
static void libjit_transpose_generic(const T *inW, T *outW, const dim_t *idim,
                                     const dim_t *odim, const dim_t *shuffle,
                                     dim_t numDims) {
  // Transpose 2d matrices one tile at a time. This access pattern ensures
  // that the whole tile is kept in L1 cache. When scanning the whole row at
  // once we invalidate many cache lines when we touch a single column.
  const unsigned tileSize = 64;

  // Source coordinate.
  dim_t SC[5];

  if (numDims == 5) {
    for (dim_t x = 0; x < odim[0]; x++)
      for (dim_t y = 0; y < odim[1]; y++)
        for (dim_t z = 0; z < odim[2]; z++)
          for (dim_t w = 0; w < odim[3]; w++)
            for (dim_t q = 0; q < odim[4]; q++) {
              SC[shuffle[0]] = x;
              SC[shuffle[1]] = y;
              SC[shuffle[2]] = z;
              SC[shuffle[3]] = w;
              SC[shuffle[4]] = q;
              outW[libjit_getXYZWQ(odim, x, y, z, w, q)] =
                  inW[libjit_getXYZWQ(idim, SC[0], SC[1], SC[2], SC[3],
                                      SC[4])];
            }
    return;
  }

  if (numDims == 4) {
    for (dim_t x = 0; x < odim[0]; x++)
      for (dim_t y = 0; y < odim[1]; y++)
        for (dim_t z = 0; z < odim[2]; z++)
          for (dim_t w = 0; w < odim[3]; w++) {
            SC[shuffle[0]] = x;
            SC[shuffle[1]] = y;
            SC[shuffle[2]] = z;
            SC[shuffle[3]] = w;
            outW[libjit_getXYZW(odim, x, y, z, w)] =
                inW[libjit_getXYZW(idim, SC[0], SC[1], SC[2], SC[3])];
          }
    return;
  }

  if (numDims == 3) {
    for (dim_t x = 0; x < odim[0]; x++) {
      // Process the tiles in the innermost two dimensions:
      for (dim_t sy = 0; sy < odim[1]; sy += tileSize) {
        for (dim_t sz = 0; sz < odim[2]; sz += tileSize) {
          // Process the inner tile:
          for (dim_t y = sy; y < MIN(sy + tileSize, odim[1]); y++) {
            for (dim_t z = sz; z < MIN(sz + tileSize, odim[2]); z++) {
              SC[shuffle[0]] = x;
              SC[shuffle[1]] = y;
              SC[shuffle[2]] = z;
              outW[libjit_getXYZ(odim, x, y, z)] =
                  inW[libjit_getXYZ(idim, SC[0], SC[1], SC[2])];
            }
          }
        }
      }
    }
    return;
  }

  if (numDims == 2) {
    // Process the tiles in the matrix:
    for (dim_t sx = 0; sx < odim[0]; sx += tileSize) {
      for (dim_t sy = 0; sy < odim[1]; sy += tileSize) {
        // Process the inner tile:
        for (dim_t x = sx; x < MIN(sx + tileSize, odim[0]); x++) {
          for (dim_t y = sy; y < MIN(sy + tileSize, odim[1]); y++) {
            SC[shuffle[0]] = x;
            SC[shuffle[1]] = y;
            outW[libjit_getXY(odim, x, y)] =
                inW[libjit_getXY(idim, SC[0], SC[1])];
          }
        }
      }
    }
    return;
  }
}

template <typename T>
static void libjit_flip_generic(const T *inW, T *outW, const dim_t *dims,
                                dim_t axis, dim_t numDims) {
  // Product of outer dimensions excluding the flip dimension.
  dim_t outerLen = 1;
  for (dim_t idx = 0; idx < axis; idx++) {
    outerLen *= dims[idx];
  }

  // Flip dimension.
  dim_t len = dims[axis];

  // Product of inner dimensions excluding the flip dimension.
  dim_t innerLen = 1;
  for (dim_t idx = axis + 1; idx < numDims; idx++) {
    innerLen *= dims[idx];
  }

  // Flip axis such that input data is read linearly.
  const T *inpPtr = inW;
  T *outPtr = outW + (len - 1) * innerLen;
  for (dim_t outerIdx = 0; outerIdx < outerLen; outerIdx++) {
    for (dim_t idx = 0; idx < len; idx++) {
      for (dim_t innerIdx = 0; innerIdx < innerLen; innerIdx++) {
        *outPtr++ = *inpPtr++;
      }
      outPtr -= 2 * innerLen;
    }
    outPtr += 2 * len * innerLen;
  }
}
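// For illustration of the flip kernel above: for dims {2, 3} and axis == 0,
// outerLen == 1, len == 2 and innerLen == 3, so input row 0 is written to
// output row 1 and input row 1 to output row 0, while the input is read
// strictly linearly.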
template <typename inpT, typename outT>
static void libjit_arg_max_generic(const inpT *inpW, outT *outW,
                                   const dim_t *dims, size_t numDims,
                                   size_t axis) {
  // Product of outer dimensions excluding the axis dimension.
  dim_t outerLen = 1;
  for (dim_t idx = 0; idx < axis; ++idx) {
    outerLen *= dims[idx];
  }

  // Axis dimension length.
  dim_t axisLen = dims[axis];

  // Product of inner dimensions excluding the axis dimension.
  dim_t innerLen = 1;
  for (dim_t idx = axis + 1; idx < numDims; ++idx) {
    innerLen *= dims[idx];
  }

  // Traverse data such that output is written linearly.
  const inpT *inpPtr = inpW;
  outT *outPtr = outW;
  for (dim_t outerIdx = 0; outerIdx < outerLen; ++outerIdx) {
    for (dim_t innerIdx = 0; innerIdx < innerLen; ++innerIdx) {
      inpT maxVal = std::numeric_limits<inpT>::lowest();
      outT maxIdx = 0;
      for (dim_t axisIdx = 0; axisIdx < axisLen; ++axisIdx) {
        inpT inpVal = *inpPtr;
        if (inpVal > maxVal) {
          maxVal = inpVal;
          maxIdx = axisIdx;
        }
        inpPtr += innerLen;
      }
      inpPtr = inpPtr - axisLen * innerLen + 1;
      *outPtr++ = maxIdx;
    }
    inpPtr = inpPtr - innerLen + axisLen * innerLen;
  }
}

template <typename inpT, typename outT>
static void libjit_arg_min_generic(const inpT *inpW, outT *outW,
                                   const dim_t *dims, size_t numDims,
                                   size_t axis) {
  // Product of outer dimensions excluding the axis dimension.
  dim_t outerLen = 1;
  for (dim_t idx = 0; idx < axis; ++idx) {
    outerLen *= dims[idx];
  }

  // Axis dimension length.
  dim_t axisLen = dims[axis];

  // Product of inner dimensions excluding the axis dimension.
  dim_t innerLen = 1;
  for (dim_t idx = axis + 1; idx < numDims; ++idx) {
    innerLen *= dims[idx];
  }

  // Traverse data such that output is written linearly.
  const inpT *inpPtr = inpW;
  outT *outPtr = outW;
  for (dim_t outerIdx = 0; outerIdx < outerLen; ++outerIdx) {
    for (dim_t innerIdx = 0; innerIdx < innerLen; ++innerIdx) {
      inpT minVal = std::numeric_limits<inpT>::max();
      outT minIdx = 0;
      for (dim_t axisIdx = 0; axisIdx < axisLen; ++axisIdx) {
        inpT inpVal = *inpPtr;
        if (inpVal < minVal) {
          minVal = inpVal;
          minIdx = axisIdx;
        }
        inpPtr += innerLen;
      }
      inpPtr = inpPtr - axisLen * innerLen + 1;
      *outPtr++ = minIdx;
    }
    inpPtr = inpPtr - innerLen + axisLen * innerLen;
  }
}
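// For illustration of the arg-max/arg-min traversal above: for dims {2, 3, 4}
// and axis == 1, outerLen == 2, axisLen == 3 and innerLen == 4. The input
// pointer steps by innerLen to walk along the axis, so the 2 * 4 outputs are
// produced in linear order while each reduction reads elements that are
// innerLen apart.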
template <typename T>
static void libjit_max_pool_generic(const T *inW, T *outW,
                                    const dim_t *inWdims,
                                    const dim_t *outWdims, dim_t *kernelSizes,
                                    dim_t *strides, dim_t *pads) {
  dim_t pad_t = pads[0];
  dim_t pad_l = pads[1];
  dim_t stride_h = strides[0];
  dim_t stride_w = strides[1];
  dim_t kernel_h = kernelSizes[0];
  dim_t kernel_w = kernelSizes[1];
  // For each sample in the batch:
  for (dim_t n = 0; n < outWdims[0]; n++) {
    // For each (x,y) step in the input/output tensor:
    sdim_t x = -(sdim_t)pad_t;
    for (dim_t ax = 0; ax < outWdims[1]; x += stride_h, ax++) {
      sdim_t y = -(sdim_t)pad_l;
      for (dim_t ay = 0; ay < outWdims[2]; y += stride_w, ay++) {
        // For each layer in the output tensor:
        for (dim_t z = 0; z < inWdims[3]; z++) {
          int first = 1;
          T max = 0;

          // For each element in the pool filter:
          for (dim_t fx = 0; fx < kernel_h; fx++) {
            for (dim_t fy = 0; fy < kernel_w; fy++) {
              sdim_t ox = x + fx;
              sdim_t oy = y + fy;

              // Ignore index access below zero (this is due to padding).
              if (ox < 0 || oy < 0 || ox >= (sdim_t)inWdims[1] ||
                  oy >= (sdim_t)inWdims[2]) {
                continue;
              }

              float val =
                  inW[libjit_getXYZW(inWdims, n, (dim_t)ox, (dim_t)oy, z)];
              if (first || (val >= max)) {
                first = 0;
                max = val;
              }
            }
          }

          outW[libjit_getXYZW(outWdims, n, ax, ay, z)] = max;
        } // C
      }   // W
    }     // H
  }       // N
}

template <typename T, typename T2>
static void libjit_max_pool_argmax_generic(const T *inW, T *outW, T2 *argmax,
                                           const dim_t *inWdims,
                                           const dim_t *outWdims,
                                           dim_t *kernels, dim_t *strides,
                                           dim_t *pads) {
  dim_t pad_t = pads[0];
  dim_t pad_l = pads[1];
  dim_t stride_h = strides[0];
  dim_t stride_w = strides[1];
  dim_t kernel_h = kernels[0];
  dim_t kernel_w = kernels[1];
  // For each input in the batch:
  for (dim_t n = 0; n < outWdims[0]; n++) {
    // For each (x,y) step in the input/output tensor:
    sdim_t x = -(sdim_t)pad_t;
    for (dim_t ax = 0; ax < outWdims[1]; x += stride_h, ax++) {
      sdim_t y = -(sdim_t)pad_l;
      for (dim_t ay = 0; ay < outWdims[2]; y += stride_w, ay++) {
        // For each channel in the output tensor:
        for (dim_t z = 0; z < outWdims[3]; z++) {
          int64_t argmaxNHWC = 0;
          int first = 1;
          T max = 0;

          for (dim_t kx = 0; kx < kernel_h; kx++) {
            for (dim_t ky = 0; ky < kernel_w; ky++) {
              sdim_t ox = x + kx;
              sdim_t oy = y + ky;

              if (ox < 0 || oy < 0 || ox >= (sdim_t)inWdims[1] ||
                  oy >= (sdim_t)inWdims[2]) {
                continue;
              }

              const dim_t flatIndex =
                  libjit_getXYZW(inWdims, n, (dim_t)ox, (dim_t)oy, z);
              T val = inW[flatIndex];
              if (first || (val >= max)) {
                first = 0;
                max = val;
                argmaxNHWC = flatIndex;
              }
            }
          }

          const dim_t flatIndex = libjit_getXYZW(outWdims, n, ax, ay, z);
          outW[flatIndex] = max;
          argmax[flatIndex] = argmaxNHWC;
        } // C
      }   // W
    }     // H
  }       // N
}

template <typename T>
void libjit_resizenearest_generic(T *dst, const T *src, const float *scale,
                                  const dim_t *inWdims, const dim_t *outWdims) {
  for (dim_t ob = 0; ob < outWdims[0]; ++ob) {
    auto ib = std::min(dim_t(ob / (scale[0])), inWdims[0] - 1);
    for (dim_t oh = 0; oh < outWdims[1]; ++oh) {
      auto ih = std::min(dim_t(oh / (scale[1])), inWdims[1] - 1);
      for (dim_t ow = 0; ow < outWdims[2]; ++ow) {
        auto iw = std::min(dim_t(ow / (scale[2])), inWdims[2] - 1);
        for (dim_t oc = 0; oc < outWdims[3]; ++oc) {
          auto ic = std::min(dim_t(oc / (scale[3])), inWdims[3] - 1);
          const dim_t inIndex = libjit_getXYZW(inWdims, ib, ih, iw, ic);
          const dim_t outIndex = libjit_getXYZW(outWdims, ob, oh, ow, oc);
          dst[outIndex] = src[inIndex];
        }
      }
    }
  }
}

template <typename T>
static void libjit_resizebilinear_generic(T *dst, const T *src,
                                          const float *scale,
                                          const dim_t *inWdims,
                                          const dim_t *outWdims) {
  for (dim_t ob = 0; ob < outWdims[0]; ++ob) {
    for (dim_t oh = 0; oh < outWdims[1]; ++oh) {
      for (dim_t ow = 0; ow < outWdims[2]; ++ow) {
        float ihf = oh / scale[1];
        float iwf = ow / scale[2];
        dim_t ih = dim_t(ihf);
        dim_t iw = dim_t(iwf);

        auto ih0 = std::min(ih, inWdims[1] - 1);
        auto ih1 = std::min(ih + 1, inWdims[1] - 1);
        auto iw0 = std::min(iw, inWdims[2] - 1);
        auto iw1 = std::min(iw + 1, inWdims[2] - 1);

        for (dim_t oc = 0; oc < outWdims[3]; ++oc) {
          float v00 = src[libjit_getXYZW(inWdims, ob, ih0, iw0, oc)];
          float v01 = src[libjit_getXYZW(inWdims, ob, ih0, iw1, oc)];
          float v10 = src[libjit_getXYZW(inWdims, ob, ih1, iw0, oc)];
          float v11 = src[libjit_getXYZW(inWdims, ob, ih1, iw1, oc)];

          float hd = v00 + (v10 - v00) * (ihf - ih);
          float hw = v01 + (v11 - v01) * (ihf - ih);
          float result = hd + (hw - hd) * (iwf - iw);
          dst[libjit_getXYZW(outWdims, ob, oh, ow, oc)] = result;
        }
      }
    }
  }
}
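// For illustration of the bilinear kernel above: with
// scale[1] == scale[2] == 2.0f, the output pixel (oh, ow) == (3, 1) maps to
// (ihf, iwf) == (1.5, 0.5); both interpolation weights are then 0.5 and the
// result is the plain average of the four neighbors v00, v01, v10 and v11.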
template <typename T>
static void libjit_batchedadd_quantized(int8_t *dest, const int8_t *batch,
                                        const T *slice, dim_t numSlice,
                                        dim_t sliceSize, int32_t destOffset,
                                        int32_t batchOffset,
                                        int32_t sliceOffset, int32_t batchPre,
                                        int32_t batchPost, int32_t batchScale,
                                        int32_t slicePre, int32_t slicePost,
                                        int32_t sliceScale) {
  for (dim_t n = 0; n < numSlice; n++) {
    dim_t base = n * sliceSize;
    for (dim_t i = 0; i < sliceSize; i++) {
      int32_t b = batch[base + i] - batchOffset;
      int32_t s = slice[i] - sliceOffset;
      int32_t x = libjit_scale_i32i8(b, batchPre, batchPost, batchScale, 0);
      int32_t y = libjit_scale_i32i8(s, slicePre, slicePost, sliceScale, 0);
      dest[base + i] = libjit_clip(x + y + destOffset);
    }
  }
}

static void find_min_max_f(float *tensor, dim_t size, float &min, float &max) {
  min = tensor[0];
  max = tensor[0];

  for (dim_t i = 1; i < size; ++i) {
    float tensorVal = tensor[i];
    if (tensorVal < min)
      min = tensorVal;

    if (tensorVal > max)
      max = tensorVal;

    // Sanity check for NaN and Infinity.
    assert(!std::isnan(tensor[i]) && "NaN value found!");
    assert(!std::isinf(tensor[i]) && "Infinity value found!");
  }
}

static int check_all_zeros(float *arrayToCheck, dim_t size) {
  for (dim_t i = 0; i < size; ++i) {
    if (arrayToCheck[i] != 0) {
      return 0;
    }
  }
  return 1;
}

/// Get the bin number into which \p value is inserted, for a histogram with
/// \p nBins bins of width \p binWidth starting at \p minValue.
static dim_t get_bin(dim_t nBins, float binWidth, float minValue,
                     float value) {
  dim_t result =
      binWidth == 0
          ? 0
          : MIN(static_cast<dim_t>((value - minValue) / binWidth), nBins - 1);
  return result;
}

template <typename T>
static void libjit_space_to_depth_generic(const T *inPtr, T *outPtr,
                                          dim_t blockSize, const dim_t *inDims,
                                          const dim_t *outDims) {
  dim_t inHeight = inDims[1];
  dim_t inWidth = inDims[2];
  dim_t inDepth = inDims[3];

  dim_t outBatch = outDims[0];
  dim_t outHeight = outDims[1];
  dim_t outWidth = outDims[2];
  dim_t outDepth = outDims[3];

  for (dim_t b = 0; b < outBatch; ++b) {
    for (dim_t h = 0; h < outHeight; ++h) {
      for (dim_t w = 0; w < outWidth; ++w) {
        for (dim_t c = 0; c < outDepth; ++c) {
          // NHWC
          // c +
          // w * outDepth +
          // h * outDepth * outWidth +
          // b * outDepth * outWidth * outHeight
          dim_t outIndex = c + outDepth * (w + outWidth * (h + b * outHeight));

          // Gets the block layer we are on.
          dim_t blockDepthLayer = c / inDepth;
          // At every multiple of block size we reset to 0 offset.
          dim_t iw = w * blockSize + blockDepthLayer % blockSize;
          // At every multiple of blockSize we start height traversal + 1.
          dim_t ih = h * blockSize + blockDepthLayer / blockSize;
          // At every multiple of inDepth the index into input depths resets
          // to 0.
          dim_t id = c % inDepth;

          dim_t inIndex = id + inDepth * (iw + inWidth * (ih + b * inHeight));
          outPtr[outIndex] = inPtr[inIndex];
        }
      }
    }
  }
}

template <typename DstType, typename SrcType>
static void libjit_copy_kernel_with_conversion(DstType *dstPtr,
                                               const SrcType *srcPtr,
                                               const dim_t *dims,
                                               dim_t numDims) {
  dim_t dimSize = 1;
  for (dim_t i = 0; i < numDims; ++i) {
    dimSize *= dims[i];
  }

  for (dim_t i = 0; i < dimSize; ++i) {
    dstPtr[i] = DstType(srcPtr[i]);
  }
}
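// For illustration of the space-to-depth kernel above: with blockSize == 2
// and an NHWC input of {1, 4, 4, 3}, the output is {1, 2, 2, 12}. Output
// channel c == 7 has blockDepthLayer == 7 / 3 == 2, so it reads input
// coordinates iw == 2 * w + (2 % 2) == 2 * w, ih == 2 * h + (2 / 2) == 2 * h
// + 1 and input channel id == 7 % 3 == 1.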
/// The dimensions passed in here are pre-expanded in LLVMIRGen with 1s so that
/// we can iterate over the shape here, regardless of the shape of the tensor.
template <typename T>
static void libjit_reducemin(T *dest, const T *batch, size_t destSize,
                             const dim_t *destDims, const dim_t *batchDims,
                             T init) {
  for (dim_t i = 0; i < destSize; i++) {
    dest[i] = init;
  }

  unsigned int axis[6];
  for (dim_t i = 0; i < 6; i++) {
    axis[i] = (destDims[i] > 1);
  }

  for (dim_t x = 0, dx = 0; x < batchDims[0]; x++, dx += axis[0]) {
    for (dim_t y = 0, dy = 0; y < batchDims[1]; y++, dy += axis[1]) {
      for (dim_t z = 0, dz = 0; z < batchDims[2]; z++, dz += axis[2]) {
        for (dim_t w = 0, dw = 0; w < batchDims[3]; w++, dw += axis[3]) {
          for (dim_t q = 0, dq = 0; q < batchDims[4]; q++, dq += axis[4]) {
            for (dim_t r = 0, dr = 0; r < batchDims[5]; r++, dr += axis[5]) {
              T fdest =
                  dest[libjit_getXYZWQR(destDims, dx, dy, dz, dw, dq, dr)];
              T fnew = batch[libjit_getXYZWQR(batchDims, x, y, z, w, q, r)];
              dest[libjit_getXYZWQR(destDims, dx, dy, dz, dw, dq, dr)] =
                  std::min(fdest, fnew);
            }
          }
        }
      }
    }
  }
}

template <typename T, typename T2>
static void libjit_cross_entropy_loss_generic(T *CE, T *P, T2 *labels,
                                              dim_t *dims) {
  CE[0] = 0.0;
  for (dim_t n = 0; n < dims[0]; ++n) {
    auto y = labels[n];
    auto p_n = P[libjit_getXY(dims, n, y)];
    CE[0] -= log(p_n);
  }
}

template <typename T, typename T2>
static void libjit_sparse_lengths_sum_generic(T *dest, T *data, T2 *indices,
                                              int32_t *lengths, dim_t segments,
                                              dim_t lineSize) {
  memset(dest, 0, segments * lineSize * sizeof(float));
  dim_t curIndex = 0;
  for (dim_t i = 0; i < segments; i++) {
    for (int32_t j = 0; j < lengths[i]; j++) {
      dim_t line = indices[curIndex];
      for (dim_t k = 0; k < lineSize; k++) {
        dest[i * lineSize + k] += data[line * lineSize + k];
      }
      curIndex++;
    }
  }
}

template <typename T, typename T2>
static void libjit_sparse_lengths_weighted_sum_generic(T *dest, T *data,
                                                       float *weights,
                                                       T2 *indices,
                                                       int32_t *lengths,
                                                       dim_t segments,
                                                       dim_t lineSize) {
  memset(dest, 0, segments * lineSize * sizeof(float));
  dim_t curIndex = 0;
  for (dim_t i = 0; i < segments; i++) {
    for (int32_t j = 0; j < lengths[i]; j++) {
      float weight = weights[curIndex];
      dim_t line = indices[curIndex];
      for (dim_t k = 0; k < lineSize; k++) {
        dest[i * lineSize + k] += weight * data[line * lineSize + k];
      }
      curIndex++;
    }
  }
}
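// For illustration of the SparseLengths kernels above: with
// lengths == {2, 1} and indices == {4, 7, 1}, dest row 0 is
// data row 4 + data row 7 (each optionally weighted) and dest row 1 is
// data row 1.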
template <typename T, typename T2>
static void libjit_sparse_lengths_weighted_sum_grad_generic(
    const T *destGrad, T *dataGrad, T *weightsGrad, const T *data,
    const T *weights, const T2 *indices, const int32_t *lengths,
    dim_t segments, dim_t lineSize, dim_t dataGradRawSize) {
  // The data gradients not touched by this operation should
  // be 0, so set the entire buffer to 0 to start with.
  memset(dataGrad, 0, dataGradRawSize);

  for (dim_t i = 0, curIndex = 0; i < segments; ++i) {
    for (int32_t j = 0; j < lengths[i]; ++j, ++curIndex) {
      // For each index in each segment:
      //    1) accumulate into the corresponding data gradient the product of
      //    the gradient of the result it was added to and the weight that it
      //    was multiplied by during the SparseLengthsWeightedSum operation.
      //
      //    2) accumulate into each weight gradient the reduced sum of the
      //    elementwise product of the result slice that the corresponding
      //    weight produced and the input slice that the weight was multiplied
      //    with.
      float weightGrad = 0.0f;
      float weight = weights[curIndex];
      dim_t line = indices[curIndex];
      for (dim_t k = 0; k < lineSize; ++k) {
        dataGrad[line * lineSize + k] += weight * destGrad[i * lineSize + k];
        weightGrad += destGrad[i * lineSize + k] * data[line * lineSize + k];
      }
      weightsGrad[curIndex] = weightGrad;
    }
  }
}

template <typename T, typename T2>
static void libjit_rowwise_quantized_sparse_lengths_weighted_sum_generic(
    T *dest, uint8_t *data, T *scales, T *offsets, T *weights, T2 *indices,
    int32_t *lengths, dim_t segments, dim_t lineSize) {
  memset(dest, 0, segments * lineSize * sizeof(float));
  dim_t curIndex = 0;
  for (dim_t i = 0; i < segments; i++) {
    for (int32_t j = 0; j < lengths[i]; j++) {
      const float weight = weights[curIndex];
      const dim_t line = indices[curIndex];
      const float scale = scales[line];
      const float offset = offsets[line];
      for (dim_t k = 0; k < lineSize; k++) {
        const float fData = scale * data[line * lineSize + k] + offset;
        dest[i * lineSize + k] += weight * fData;
      }
      curIndex++;
    }
  }
}

template <typename T, typename T2>
static void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_generic(
    T *dest, int8_t *data, T *weights, T2 *indices, int32_t *lengths,
    dim_t segments, dim_t inLineSize, dim_t outLineSize) {
  memset(dest, 0, segments * outLineSize * sizeof(float));
  dim_t curIndex = 0;
  for (dim_t i = 0; i < segments; i++) {
    for (int32_t j = 0, e = lengths[i]; j < e; j++) {
      const float weight = weights[curIndex];
      const dim_t line = indices[curIndex];
      const int8_t *currRowScaleOffsetPtr =
          data + ((line + 1) * inLineSize) - 2 * sizeof(float);
      float scale, offset;
      memcpy(&scale, currRowScaleOffsetPtr, sizeof(float));
      memcpy(&offset, currRowScaleOffsetPtr + sizeof(float), sizeof(float));
      for (dim_t k = 0; k < outLineSize; k++) {
        const float fData =
            (scale * (uint8_t)(data[line * inLineSize + k])) + offset;
        dest[i * outLineSize + k] += weight * fData;
      }
      curIndex++;
    }
  }
}
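// For illustration of the fused kernel above: each data row packs
// outLineSize quantized bytes followed by a float scale and a float offset,
// so inLineSize == outLineSize + 2 * sizeof(float), and the scale/offset pair
// is read back with memcpy from the last 8 bytes of the row.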
template <typename T, typename T2>
static void libjit_sparse_to_dense_generic(T *dest, const T2 *indices,
                                           const T *values, dim_t numIndices,
                                           dim_t destSize, dim_t valueSize) {
  memset(dest, 0, destSize * sizeof(float));

  for (dim_t i = 0, valuesOffset = 0; i < numIndices;
       ++i, valuesOffset += valueSize) {
    dim_t idx = indices[i];
    dim_t destOffset = idx * valueSize;
    for (size_t j = 0; j < valueSize; ++j) {
      dest[destOffset + j] += values[valuesOffset + j];
    }
  }
}

struct ClassBox {
  float score{0.0f};
  size_t index{0};
};

struct Box {
  float v0{0.0f};
  float v1{0.0f};
  float v2{0.0f};
  float v3{0.0f};
};

struct OutBox {
  float classValue{0.0f};
  size_t batchIndex{0};
  size_t classIndex{0};
  size_t boxIndex{0};
};

static void maxMin(float lhs, float rhs, float &min, float &max) {
  if (lhs >= rhs) {
    min = rhs;
    max = lhs;
  } else {
    min = lhs;
    max = rhs;
  }
}

static bool checkIOU(const Box &sb, const Box &cb, float iouThreshold,
                     size_t centerPointBox) {
  float xSMin = 0.0f;
  float ySMin = 0.0f;
  float xSMax = 0.0f;
  float ySMax = 0.0f;

  float xCMin = 0.0f;
  float yCMin = 0.0f;
  float xCMax = 0.0f;
  float yCMax = 0.0f;

  // Standardizing coordinates so that (xmin, ymin) is upper left corner of a
  // box and (xmax, ymax) is lower right corner of the box.
  if (!centerPointBox) {
    // 0 means coordinates for diagonal ends of a box.
    // Coordinates can either be absolute or normalized.
    maxMin(sb.v0, sb.v2, xSMin, xSMax);
    maxMin(sb.v1, sb.v3, ySMin, ySMax);

    maxMin(cb.v0, cb.v2, xCMin, xCMax);
    maxMin(cb.v1, cb.v3, yCMin, yCMax);
  } else {
    float halfWidthS = sb.v2 / 2.0f;
    float halfHeightS = sb.v3 / 2.0f;
    float halfWidthC = cb.v2 / 2.0f;
    float halfHeightC = cb.v3 / 2.0f;

    xSMin = sb.v0 - halfWidthS;
    ySMin = sb.v1 - halfHeightS;
    xSMax = sb.v0 + halfWidthS;
    ySMax = sb.v1 + halfHeightS;

    xCMin = cb.v0 - halfWidthC;
    yCMin = cb.v1 - halfHeightC;
    xCMax = cb.v0 + halfWidthC;
    yCMax = cb.v1 + halfHeightC;
  }

  // Finding the upper left and lower right corner of the box formed by the
  // intersection.
  float xMin = MAX(xSMin, xCMin);
  float yMin = MAX(ySMin, yCMin);
  float xMax = MIN(xSMax, xCMax);
  float yMax = MIN(ySMax, yCMax);

  float intersectionArea =
      MAX((0.0f), xMax - xMin) * MAX((0.0f), yMax - yMin);

  if (intersectionArea == 0.0f) {
    return false;
  }

  float sArea = (xSMax - xSMin) * (ySMax - ySMin);
  float cArea = (xCMax - xCMin) * (yCMax - yCMin);
  float unionArea = sArea + cArea - intersectionArea;

  return intersectionArea > iouThreshold * unionArea;
}
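// For illustration of checkIOU() above: for corner-format boxes (0, 0, 2, 2)
// and (1, 1, 3, 3), the intersection is the unit square [1, 2] x [1, 2] with
// area 1 and the union area is 4 + 4 - 1 == 7, so an overlap is reported
// exactly when iouThreshold < 1.0f / 7.0f.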
// ONNX
// Class/Score [BatchNum][ClassNum][BoxNum]
// Box [BatchNum][BoxNum][4]
// Result [BatchNum*MaxOutputPerBatch][3]
// V4
// Class/Score [BatchNum][BoxNum]
// Boxes [BatchNum][BoxNum][4]
// Result [BatchNum*MaxOutputPerBatch]
// NumberOfIndicesDetected [BatchNum*MaxOutputPerBatch]
template <typename T>
static void
libjit_nms_generic(T *indices, T *numDetected, const float *boxTensor,
                   const dim_t *boxTensorDims, dim_t boxTensorDimSize,
                   const float *scoresTensor, const dim_t *scoresTensorDims,
                   dim_t scoresTensorDimSize, const dim_t *resultTensorDims,
                   dim_t resultTensorDimSize, unsigned centerPointBox,
                   unsigned maxOutputBoxesPerClass, float iouThreshold,
                   float scoreThreshold, bool isV4) {
  int boxesBoxDim = boxTensorDimSize - 2;

  size_t numBatches = 1;
  size_t numClasses = 1;
  size_t numBoxes = boxTensorDims[boxesBoxDim];

  size_t maxOutputPerBatch = 0;

  if (!isV4) {
    int boxesBatchDim = boxTensorDimSize - 3;
    int scoresBatchDim = scoresTensorDimSize - 3;

    int scoresBoxDim = scoresTensorDimSize - 1;
    int scoresClassDim = scoresTensorDimSize - 2;

    assert(scoresTensorDims[scoresBoxDim] == boxTensorDims[boxesBoxDim] &&
           "Mismatch between number of scores and number of boxes.");
    assert(scoresTensorDims[scoresBatchDim] == boxTensorDims[boxesBatchDim] &&
           "Scores and Box Batch Dimensions don't match.");
    (void)boxesBatchDim;
    (void)scoresBoxDim;
    numBatches = scoresTensorDims[scoresBatchDim];
    numClasses = scoresTensorDims[scoresClassDim];
    numBoxes = boxTensorDims[boxesBoxDim];
    maxOutputPerBatch =
        resultTensorDims[resultTensorDimSize - 2] / numBatches;
  } else {
    maxOutputPerBatch =
        resultTensorDims[resultTensorDimSize - 1] / numBatches;
  }

  static_assert(sizeof(Box) == 4 * sizeof(float),
                "Can't reinterpret raw float data as a Box.");
  const Box *boxes = reinterpret_cast<const Box *>(boxTensor);

  auto cmpFunc = [](const ClassBox &cb1, const ClassBox &cb2) -> bool {
    return cb1.score > cb2.score;
  };

  size_t outPutBoxIndex = 0;
  for (size_t batchIndex = 0; batchIndex < numBatches; ++batchIndex) {
    int32_t detectedPerBatch = 0;
    OutBox minBox{scoresTensor[batchIndex * numClasses], batchIndex, 0, 0};
    for (size_t classIndex = 0; classIndex < numClasses; ++classIndex) {
      ClassBox selectedIndices[numBoxes];
      ClassBox potentialBoxes[numBoxes];
      size_t indexPBoxes = 0;
      const float *currClass =
          &scoresTensor[(batchIndex * numClasses + classIndex) * numBoxes];
      for (size_t boxIndex = 0; boxIndex < numBoxes; ++boxIndex) {
        float classScore = currClass[boxIndex];
        if (classScore > scoreThreshold) {
          ClassBox &b = potentialBoxes[indexPBoxes++];
          b.score = classScore;
          b.index = boxIndex;
        }
      }

      std::sort(potentialBoxes, potentialBoxes + indexPBoxes, cmpFunc);

      size_t indexSBoxes = 0;
      size_t detectedPerClass = 0;
      float tScore = minBox.classValue;
      for (unsigned int i = 0; i < indexPBoxes; ++i) {
        ClassBox &pbI = potentialBoxes[i];
        const Box &potentialBox = boxes[batchIndex * numBoxes + pbI.index];

        bool selected = true;
        for (unsigned int j = 0; j < indexSBoxes && selected; ++j) {
          ClassBox &sbI = selectedIndices[j];
          const Box &selectedBox = boxes[batchIndex * numBoxes + sbI.index];
          selected = !checkIOU(selectedBox, potentialBox, iouThreshold,
                               centerPointBox);
        }

        if (selected) {
          selectedIndices[indexSBoxes++] = pbI;
          if (isV4) {
            indices[outPutBoxIndex] = pbI.index;
          } else {
            indices[outPutBoxIndex * 3 + 0] = batchIndex;
            indices[outPutBoxIndex * 3 + 1] = classIndex;
            indices[outPutBoxIndex * 3 + 2] = pbI.index;
          }

          tScore = pbI.score;
          ++outPutBoxIndex;
          ++detectedPerClass;
          ++detectedPerBatch;
        }

        if (detectedPerClass == maxOutputBoxesPerClass) {
          break;
        }
      }

      if (tScore < minBox.classValue) {
        minBox.classValue = tScore;
        if (isV4) {
          minBox.boxIndex = indices[outPutBoxIndex - 1];
        } else {
          minBox.boxIndex = indices[(outPutBoxIndex - 1) * 3 + 2];
        }
        minBox.classIndex = classIndex;
      }
    }

    // Fill the rest of the output for this batch with the minimum value.
    for (size_t i = detectedPerBatch; i < maxOutputPerBatch; ++i) {
      if (isV4) {
        indices[outPutBoxIndex] = minBox.boxIndex;
      } else {
        indices[outPutBoxIndex * 3 + 0] = minBox.batchIndex;
        indices[outPutBoxIndex * 3 + 1] = minBox.classIndex;
        indices[outPutBoxIndex * 3 + 2] = minBox.boxIndex;
      }
      ++outPutBoxIndex;
    }

    // For ONNX NMS this is not used; for TF the batch dimension is 1.
    for (size_t i = 0; i < maxOutputBoxesPerClass; ++i) {
      numDetected[batchIndex * maxOutputBoxesPerClass + i] = detectedPerBatch;
    }
  }
}

template <typename T, typename T2>
void libjit_softmax_grad_generic(T *inG, T *outW, const T2 *selectedW,
                                 const dim_t *idim, const dim_t *selectdim) {
  for (dim_t n = 0; n < idim[0]; n++) {
    for (dim_t i = 0; i < idim[1]; i++) {
      float delta = (selectedW[libjit_getXY(selectdim, n, 0)] == i);
      inG[libjit_getXY(idim, n, i)] = outW[libjit_getXY(idim, n, i)] - delta;
    }
  }
}

template <typename T, typename T2>
void libjit_max_pool_argmax_grad_generic(T *inG, const T *outG,
                                         const T2 *argmax,
                                         const dim_t *inGdims,
                                         const dim_t *outWdims) {
  // NHWC format is assumed.
  for (dim_t n = 0; n < outWdims[0]; n++) {
    for (dim_t z = 0; z < outWdims[3]; z++) {
      // Clear inG.
      for (dim_t x = 0; x < inGdims[1]; x++) {
        for (dim_t y = 0; y < inGdims[2]; y++) {
          inG[libjit_getXYZW(inGdims, n, x, y, z)] = 0.0;
        }
      }

      for (dim_t ax = 0; ax < outWdims[1]; ax++) {
        for (dim_t ay = 0; ay < outWdims[2]; ay++) {
          // Reuse precomputed linear index of max element from argmax.
          const dim_t flatIndex = libjit_getXYZW(outWdims, n, ax, ay, z);
          float df = outG[flatIndex];
          inG[argmax[flatIndex]] += df;
        } // W
      }   // H
    }     // C
  }       // N
}
} // namespace

extern "C" {

/// Macro to define a mini-kernel for data-parallel operations. The body of the
/// kernel is auto-generated by the macro.
/// \p name the name of the kernel
/// \p type the type of the tensor elements and of the return value
/// \p body the operation to be performed
#define DEFINE_DATA_PARALLEL_KERNEL(name, type, body)                          \
  type name(dim_t idx, const type *LHS, const type *RHS, const type *op3) {   \
    return body;                                                               \
  }
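// For illustration, DEFINE_DATA_PARALLEL_KERNEL(libjit_element_add_kernel_f,
// float, LHS[idx] + RHS[idx]) expands to:
//   float libjit_element_add_kernel_f(dim_t idx, const float *LHS,
//                                     const float *RHS, const float *op3) {
//     return LHS[idx] + RHS[idx];
//   }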
/// Macro to define a mini-kernel for data-parallel operations. The body of the
/// kernel is not auto-generated by the macro.
/// \p name the name of the kernel
#define DEFINE_DATA_PARALLEL_KERNEL_FUNC(name)                                 \
  float name(dim_t idx, const float *LHS, const float *RHS, const float *op3)

/// Macro to define a mini-kernel for data-parallel operations with immediate
/// operands.
/// \p name the name of the kernel
/// \p type the type of the tensor elements and of the return value
/// \p body the operation to be performed
#define DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(name, type, body)         \
  type name(dim_t idx, type val, const type *LHS, const type *RHS) {          \
    return body;                                                               \
  }

/// Macro to define a mini-kernel for data-parallel arithmetic quantized
/// operations. The body of the kernel is auto-generated by the macro.
/// \p name the name of the kernel
/// \p type the type of the tensor elements
/// \p body the operation to be performed
#define DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED(name, type, body)                \
  type name(dim_t idx, const type *LHS, const type *RHS, int32_t destOffset,  \
            int32_t lhsOffset, int32_t rhsOffset, int32_t lhsPre,             \
            int32_t lhsPost, int32_t lhsScale, int32_t rhsPre,                \
            int32_t rhsPost, int32_t rhsScale) {                              \
    int32_t lhs = libjit_scale_i32i8(LHS[idx] - lhsOffset, lhsPre, lhsPost,   \
                                     lhsScale, 0);                            \
    int32_t rhs = libjit_scale_i32i8(RHS[idx] - rhsOffset, rhsPre, rhsPost,   \
                                     rhsScale, 0);                            \
    return libjit_clip((body) + destOffset);                                  \
  }

/// Macro to define a mini-kernel for data-parallel multiplicative quantized
/// operations. The body of the kernel is auto-generated by the macro.
/// \p name the name of the kernel
/// \p body the operation to be performed
#define DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED_M(name, body)                    \
  int8_t name(dim_t idx, const int8_t *LHS, const int8_t *RHS,                \
              int32_t destOffset, int32_t lhsOffset, int32_t rhsOffset,       \
              int32_t pre, int32_t post, int32_t scale) {                     \
    int32_t lhs = LHS[idx] - lhsOffset;                                       \
    int32_t rhs = RHS[idx] - rhsOffset;                                       \
    return libjit_clip(                                                       \
        libjit_scale_i32i8((body), pre, post, scale, destOffset));            \
  }
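// For illustration of the quantized macros above: both operands are first
// re-centered to int32 by subtracting their offsets, rescaled to the
// destination scale by libjit_scale_i32i8() (a fixed-point rescale with
// pre/post shifts, defined in libjit_defs.h), combined by the kernel body,
// and finally clipped back into the int8 range [-128, 127] by libjit_clip().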
/// Define mini-kernels for all data parallel operations. They are invoked from
/// the generated kernels for sequences of data parallel operations.
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_max_kernel_f, float,
                            MAX(LHS[idx], RHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_min_kernel_f, float,
                            MIN(LHS[idx], RHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_f, float, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_u, int64_t, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_i8, int8_t, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_i16, int16_t, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_i32, int32_t, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_copy_kernel_b, int8_t, LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_add_kernel_f, float,
                            LHS[idx] + RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_add_kernel_i32, int32_t,
                            LHS[idx] + RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_sub_kernel_f, float,
                            LHS[idx] - RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_div_kernel_f, float,
                            LHS[idx] / RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_div_kernel_u, int64_t,
                            LHS[idx] / RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_div_kernel_i32, int32_t,
                            LHS[idx] / RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_mul_kernel_f, float,
                            LHS[idx] * RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_mul_kernel_i32, int32_t,
                            LHS[idx] * RHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_pow_kernel_f, float,
                            pow(LHS[idx], RHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_log_kernel_f, float, log(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_exp_kernel_f, float, exp(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_abs_kernel_f, float,
                            std::abs(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_neg_kernel_f, float, -LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_floor_kernel_f, float,
                            std::floor(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_ceil_kernel_f, float,
                            std::ceil(LHS[idx]))
// The rounding mode required by ONNX, NumPy and TensorFlow is round-to-even,
// which rounds values with fractional part 0.5 to the nearest even integer.
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_round_kernel_f, float,
                            std::nearbyintf(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_sqrt_kernel_f, float,
                            std::sqrt(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_rsqrt_kernel_f, float,
                            1 / std::sqrt(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_reciprocal_kernel_f, float,
                            1 / LHS[idx])
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_sin_kernel_f, float,
                            std::sin(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL(libjit_element_cos_kernel_f, float,
                            std::cos(LHS[idx]))
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED(libjit_element_add_kernel_i8, int8_t,
                                      lhs + rhs)
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED(libjit_element_sub_kernel_i8, int8_t,
                                      lhs - rhs)
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED(libjit_element_max_kernel_i8, int8_t,
                                      MAX(lhs, rhs))
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED(libjit_element_min_kernel_i8, int8_t,
                                      MIN(lhs, rhs))
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED_M(libjit_element_mul_kernel_i8,
                                        lhs * rhs)
DEFINE_DATA_PARALLEL_KERNEL_QUANTIZED_M(libjit_element_div_kernel_i8,
                                        lhs / rhs)

/// These variables are used by Glow backends to determine the actual types
/// used for size_t, dim_t and int variables when libjit was compiled.
size_t libjit_sizeTVar;
dim_t libjit_dimTVar;
int libjit_intVar;
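// For illustration of the round-to-even rule used by the Round kernel above:
// std::nearbyintf() in the default rounding mode maps 0.5f to 0.0f, 1.5f to
// 2.0f and 2.5f to 2.0f, matching the ONNX/NumPy/TensorFlow semantics.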
/// Specialize the Modulo kernel into two functions based on the
/// value of SignFollowDivisor.
int64_t libjit_element_modulo_kernel_sign_follow_u(dim_t idx,
                                                   const int64_t divisor,
                                                   const int64_t *input) {
  int64_t res = input[idx] % divisor;
  if (res && ((res > 0) != (divisor > 0))) {
    res += divisor;
  }
  return res;
}

int64_t libjit_element_modulo_kernel_no_sign_follow_u(dim_t idx,
                                                      const int64_t divisor,
                                                      const int64_t *input) {
  return input[idx] % divisor;
}

int32_t libjit_element_modulo_kernel_sign_follow_i32(dim_t idx,
                                                     const int64_t divisor,
                                                     const int32_t *input) {
  int32_t res = input[idx] % divisor;
  if (res && ((res > 0) != (divisor > 0))) {
    res += divisor;
  }
  return res;
}

int32_t libjit_element_modulo_kernel_no_sign_follow_i32(dim_t idx,
                                                        const int64_t divisor,
                                                        const int32_t *input) {
  return input[idx] % divisor;
}

//===----------------------------------------------------------------------===//
//                              Logical operations
//===----------------------------------------------------------------------===//
int8_t libjit_element_not_kernel_b(dim_t idx, const bool *input) {
  return !input[idx];
}

int8_t libjit_element_and_kernel_b(dim_t idx, const bool *LHS,
                                   const bool *RHS) {
  return LHS[idx] && RHS[idx];
}

int8_t libjit_element_or_kernel_b(dim_t idx, const bool *LHS,
                                  const bool *RHS) {
  return LHS[idx] || RHS[idx];
}

int8_t libjit_element_xor_kernel_b(dim_t idx, const bool *LHS,
                                   const bool *RHS) {
  return LHS[idx] ^ RHS[idx];
}

//===----------------------------------------------------------------------===//
//                              Compare operations
//===----------------------------------------------------------------------===//
#define DEFINE_CMP_KERNEL_QUANTIZED(name, type, cmp)                           \
  int8_t name(dim_t idx, const type *LHS, const type *RHS, int32_t lhsOffset, \
              int32_t rhsOffset, int32_t pre, int32_t post, int32_t scale) {  \
    int32_t lhs = LHS[idx] - lhsOffset;                                       \
    int32_t rhs = RHS[idx] - rhsOffset;                                       \
    return (libjit_scale_i32i8(lhs, pre, post, scale, 0) cmp rhs) ? 1 : 0;    \
  }
DEFINE_CMP_KERNEL_QUANTIZED(libjit_element_cmp_eq_kernel_i8, int8_t, ==)
DEFINE_CMP_KERNEL_QUANTIZED(libjit_element_cmp_neq_kernel_i8, int8_t, !=)
DEFINE_CMP_KERNEL_QUANTIZED(libjit_element_cmp_lt_kernel_i8, int8_t, <)
DEFINE_CMP_KERNEL_QUANTIZED(libjit_element_cmp_lte_kernel_i8, int8_t, <=)
#undef DEFINE_CMP_KERNEL_QUANTIZED

#define DEFINE_CMP_KERNEL_NON_QUANTIZED(name, type, cmp)                       \
  int8_t name(dim_t idx, const type *LHS, const type *RHS) {                  \
    return (LHS[idx] cmp RHS[idx]) ? 1 : 0;                                   \
  }
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_eq_kernel_f, float, ==)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_eq_kernel_i32, int32_t, ==)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_eq_kernel_u, size_t, ==)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_neq_kernel_f, float, !=)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_neq_kernel_i32, int32_t, !=)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_neq_kernel_u, size_t, !=)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lt_kernel_f, float, <)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lt_kernel_i32, int32_t, <)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lt_kernel_u, size_t, <)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lte_kernel_f, float, <=)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lte_kernel_i32, int32_t, <=)
DEFINE_CMP_KERNEL_NON_QUANTIZED(libjit_element_cmp_lte_kernel_u, size_t, <=)
#undef DEFINE_CMP_KERNEL_NON_QUANTIZED

int8_t libjit_element_is_nan_kernel_f(dim_t idx, const float *input) {
  return std::isnan(input[idx]) ? 1 : 0;
}
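// For illustration of the Modulo kernels above: for input[idx] == -7 and
// divisor == 3, C++ gives -7 % 3 == -1, so the no_sign_follow kernels return
// -1 while the sign_follow kernels add the divisor and return 2 (the
// Python-style result, whose sign follows the divisor).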
// Tanh cannot be vectorized by LLVM yet. Therefore we use the following
// formula instead: 1 - 2 / (exp(x * 2) + 1), which is also used by Caffe2 and
// provides a good accuracy.
// Once LLVM supports the vectorization of tanh, we can replace this
// approximation by a direct tanh call.
// When the LIBJIT compile option "-ffast-math" is enabled, the intermediate
// computation expf(x) for the Tanh operator is not handled properly for very
// large positive values, which results in NaN values in the Tanh output.
// Therefore, when "-ffast-math" is enabled, we compute Tanh such that we
// avoid computing large values for the "expf" function.
#ifdef FFAST_MATH
DEFINE_DATA_PARALLEL_KERNEL_FUNC(libjit_tanh_kernel_f) {
  float inpVal = LHS[idx];
  float tanhVal = -1 + 2 / (expf(-2 * std::abs(inpVal)) + 1);
  return std::copysignf(tanhVal, inpVal);
}
#else
DEFINE_DATA_PARALLEL_KERNEL_FUNC(libjit_tanh_kernel_f) {
  return 1 - 2 / (expf(LHS[idx] * 2) + 1);
}
#endif // FFAST_MATH
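// For illustration: the formula above is an exact rewrite of tanh, since
// 1 - 2 / (exp(2x) + 1) == (exp(2x) - 1) / (exp(2x) + 1) == tanh(x); the
// FFAST_MATH variant only re-expresses it through expf(-2 * |x|) so that
// expf() never sees a large positive argument.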
int8_t libjit_intlookuptable_kernel_i8(dim_t idx, const int8_t *src,
                                       const int8_t *mapping) {
  return mapping[src[idx] + 128];
}

float libjit_elementselect_kernel_f(dim_t idx, const int8_t *cond,
                                    const float *LHS, const float *RHS) {
  return (cond[idx] != 0) ? LHS[idx] : RHS[idx];
}

int8_t libjit_elementselect_kernel_i8(dim_t idx, const int8_t *cond,
                                      const int8_t *LHS, const int8_t *RHS,
                                      int32_t destOffset, int32_t lhsOffset,
                                      int32_t rhsOffset, int32_t lhsPre,
                                      int32_t lhsPost, int32_t lhsScale,
                                      int32_t rhsPre, int32_t rhsPost,
                                      int32_t rhsScale) {
  return (cond[idx] != 0)
             ? libjit_clip(libjit_scale_i32i8(LHS[idx] - lhsOffset, lhsPre,
                                              lhsPost, lhsScale, destOffset))
             : libjit_clip(libjit_scale_i32i8(RHS[idx] - rhsOffset, rhsPre,
                                              rhsPost, rhsScale, destOffset));
}

float libjit_element_relu_f(dim_t idx, const float *src) {
  float srcVal = src[idx];
  return MAX(srcVal, 0);
}

int8_t libjit_element_relu_i8(dim_t idx, const int8_t *src, int8_t srcOffset,
                              int8_t destOffset, int32_t destPre,
                              int32_t destPost, int32_t destScale) {
  int32_t reluVal = MAX(src[idx], srcOffset);
  int32_t scaledVal = libjit_scale_i32i8(reluVal - srcOffset, destPre,
                                         destPost, destScale, destOffset);
  return libjit_clip(scaledVal);
}

float libjit_element_clip_f(dim_t idx, const float *src, float min,
                            float max) {
  float srcVal = src[idx];
  return MIN(MAX(srcVal, min), max);
}

int8_t libjit_element_clip_i8(dim_t idx, const int8_t *src, int8_t clipMin,
                              int8_t clipMax, int8_t srcOffset,
                              int8_t destOffset, int32_t destPre,
                              int32_t destPost, int32_t destScale) {
  int32_t clipVal = MIN(MAX(src[idx], clipMin), clipMax);
  int32_t scaledVal = libjit_scale_i32i8(clipVal - srcOffset, destPre,
                                         destPost, destScale, destOffset);
  return libjit_clip(scaledVal);
}

// When the LIBJIT compile option "-ffast-math" is enabled, the intermediate
// computation expf(x) for the Sigmoid operator is not handled properly for
// very large positive values, which results in NaN values in the Sigmoid
// output. Therefore, when "-ffast-math" is enabled, we compute the Sigmoid
// such that we avoid computing large values for the "expf" function.
#ifdef FFAST_MATH
DEFINE_DATA_PARALLEL_KERNEL_FUNC(libjit_sigmoid_kernel_f) {
  float inpVal = LHS[idx];
  float sigmoidVal = 1 / (1 + expf(-std::abs(inpVal)));
  return (float)(std::signbit(inpVal)) + std::copysignf(sigmoidVal, inpVal);
}
#else
DEFINE_DATA_PARALLEL_KERNEL_FUNC(libjit_sigmoid_kernel_f) {
  float e = expf(-LHS[idx]);
  return 1 / (e + 1);
}
#endif // FFAST_MATH

DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(libjit_splat_kernel_f, float, val)
DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(libjit_splat_kernel_u, int64_t,
                                             val)
DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(libjit_splat_kernel_i8, int8_t,
                                             val)
DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(libjit_splat_kernel_i32, int32_t,
                                             val)
DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND(libjit_splat_kernel_b, int8_t,
                                             val)

#undef DEFINE_DATA_PARALLEL_KERNEL
#undef DEFINE_DATA_PARALLEL_KERNEL_FUNC
#undef DEFINE_DATA_PARALLEL_KERNEL_WITH_IMM_OPERAND

void libjit_batchedadd_f(float *dest, const float *batch, const float *slice,
                         dim_t numSlice, dim_t sliceSize) {
  // For each layer in the batch:
  for (dim_t n = 0; n < numSlice; n++) {
    dim_t base = n * sliceSize;
    // For each element in the slice.
    for (dim_t i = 0; i < sliceSize; i++) {
      dest[base + i] = batch[base + i] + slice[i];
    }
  }
}

void libjit_batchedadd_i8(int8_t *dest, const int8_t *batch,
                          const int8_t *slice, dim_t numSlice, dim_t sliceSize,
                          int32_t destOffset, int32_t batchOffset,
                          int32_t sliceOffset, int32_t batchPre,
                          int32_t batchPost, int32_t batchScale,
                          int32_t slicePre, int32_t slicePost,
                          int32_t sliceScale) {
  libjit_batchedadd_quantized(dest, batch, slice, numSlice, sliceSize,
                              destOffset, batchOffset, sliceOffset, batchPre,
                              batchPost, batchScale, slicePre, slicePost,
                              sliceScale);
}

void libjit_batchedadd_i32_i8(int8_t *dest, const int8_t *batch,
                              const int32_t *slice, dim_t numSlice,
                              dim_t sliceSize, int32_t destOffset,
                              int32_t batchOffset, int32_t sliceOffset,
                              int32_t batchPre, int32_t batchPost,
                              int32_t batchScale, int32_t slicePre,
                              int32_t slicePost, int32_t sliceScale) {
  libjit_batchedadd_quantized(dest, batch, slice, numSlice, sliceSize,
                              destOffset, batchOffset, sliceOffset, batchPre,
                              batchPost, batchScale, slicePre, slicePost,
                              sliceScale);
}
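// For illustration of the BatchedAdd kernels above: one slice is broadcast
// over the whole batch. For a batch of shape {numSlice, sliceSize} == {2, 3}
// with slice == {10, 20, 30}, row 0 and row 1 of the batch each get
// {10, 20, 30} added element-wise.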
/// The dimensions passed in here are pre-expanded in LLVMIRGen with 1s so that
/// we can iterate over the shape here, regardless of the shape of the tensor.
void libjit_batchedreduceadd_f(float *dest, const float *batch, dim_t destSize,
                               const dim_t *destDims, const dim_t *batchDims,
                               dim_t axis) {
  for (dim_t i = 0; i < destSize; i++)
    dest[i] = 0.0;

  for (dim_t x = 0; x < batchDims[0]; x++)
    for (dim_t y = 0; y < batchDims[1]; y++)
      for (dim_t z = 0; z < batchDims[2]; z++)
        for (dim_t w = 0; w < batchDims[3]; w++)
          for (dim_t q = 0; q < batchDims[4]; q++)
            for (dim_t r = 0; r < batchDims[5]; r++) {
              dim_t I[] = {x, y, z, w, q, r};
              I[axis] = 0;
              dest[libjit_getXYZWQR(destDims, I[0], I[1], I[2], I[3], I[4],
                                    I[5])] +=
                  batch[libjit_getXYZWQR(batchDims, x, y, z, w, q, r)];
            }
}

void libjit_reducemin_f(float *dest, const float *batch, size_t destSize,
                        const dim_t *destDims, const dim_t *batchDims) {
  libjit_reducemin(dest, batch, destSize, destDims, batchDims,
                   std::numeric_limits<float>::max());
}

void libjit_reducemin_i32(int32_t *dest, const int32_t *batch, size_t destSize,
                          const dim_t *destDims, const dim_t *batchDims) {
  libjit_reducemin(dest, batch, destSize, destDims, batchDims,
                   std::numeric_limits<int32_t>::max());
}

void libjit_reducemin_u(int64_t *dest, const int64_t *batch, size_t destSize,
                        const dim_t *destDims, const dim_t *batchDims) {
  libjit_reducemin(dest, batch, destSize, destDims, batchDims,
                   std::numeric_limits<int64_t>::max());
}
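// For illustration of the batched reduce kernels above: the shapes are
// pre-expanded to six dims, so a {2, 3} input reduced over axis 1 arrives as
// batchDims == {2, 3, 1, 1, 1, 1} and destDims == {2, 1, 1, 1, 1, 1};
// setting I[axis] = 0 folds every y into the same destination element, giving
// dest[x] == sum over y of batch[x][y].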
/// Same as the non-quantized version, the dimensions here are pre-expanded in
/// LLVMIRGen. However, for quantization, we must accumulate in the inner-most
/// loop with higher precision (int32_t) and then clip the result back into the
/// dest tensor. Thus we add max_tensor_dimensions different cases for this to
/// ensure the axis is used as the inner-most loop.
void libjit_batchedreduceadd_i8(int8_t *dest, const int8_t *batch,
                                const dim_t *destDims, const dim_t *batchDims,
                                int32_t destOffset, int32_t batchOffset,
                                int32_t batchPre, int32_t batchPost,
                                int32_t batchScale, dim_t axis) {
  switch (axis) {
#define LOOP_AXIS_CASE(_D0, _D1, _D2, _D3, _D4, _D5_AXIS)                      \
  case _D5_AXIS:                                                               \
    for (dim_t i##_D0 = 0; i##_D0 < batchDims[_D0]; i##_D0++)                  \
      for (dim_t i##_D1 = 0; i##_D1 < batchDims[_D1]; i##_D1++)                \
        for (dim_t i##_D2 = 0; i##_D2 < batchDims[_D2]; i##_D2++)              \
          for (dim_t i##_D3 = 0; i##_D3 < batchDims[_D3]; i##_D3++)            \
            for (dim_t i##_D4 = 0; i##_D4 < batchDims[_D4]; i##_D4++) {        \
              int32_t sum = 0;                                                 \
              for (dim_t i##_D5_AXIS = 0; i##_D5_AXIS < batchDims[_D5_AXIS];   \
                   i##_D5_AXIS++) {                                            \
                sum += batch[libjit_getXYZWQR(batchDims, i0, i1, i2, i3, i4,   \
                                              i5)] -                           \
                       batchOffset;                                            \
              }                                                                \
              dim_t i##_D5_AXIS = 0;                                           \
              int32_t res = libjit_scale_i32i8(sum, batchPre, batchPost,       \
                                               batchScale, destOffset);        \
              dest[libjit_getXYZWQR(destDims, i0, i1, i2, i3, i4, i5)] =       \
                  libjit_clip(res);                                            \
            }                                                                  \
    return;

    // Each loop order, with the inner-most dimension/index equal to the axis.
    LOOP_AXIS_CASE(1, 2, 3, 4, 5, 0);
    LOOP_AXIS_CASE(0, 2, 3, 4, 5, 1);
    LOOP_AXIS_CASE(0, 1, 3, 4, 5, 2);
    LOOP_AXIS_CASE(0, 1, 2, 4, 5, 3);
    LOOP_AXIS_CASE(0, 1, 2, 3, 5, 4);
    LOOP_AXIS_CASE(0, 1, 2, 3, 4, 5);
#undef LOOP_AXIS_CASE
  }
}

void libjit_cross_entropy_loss_f_u(float *CE, float *P, size_t *labels,
                                   dim_t *dims) {
  libjit_cross_entropy_loss_generic(CE, P, labels, dims);
}

void libjit_cross_entropy_loss_f_i32(float *CE, float *P, int32_t *labels,
                                     dim_t *dims) {
  libjit_cross_entropy_loss_generic(CE, P, labels, dims);
}

void libjit_gather64_f(float *dest, const float *data, const int64_t *indices,
                       dim_t numIndices, dim_t sliceSize, dim_t numSamples,
                       dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather64_i8(int8_t *dest, const int8_t *data,
                        const int64_t *indices, dim_t numIndices,
                        dim_t sliceSize, dim_t numSamples, dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather64_u(int64_t *dest, const int64_t *data,
                       const int64_t *indices, dim_t numIndices,
                       dim_t sliceSize, dim_t numSamples, dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather32_f(float *dest, const float *data, const int32_t *indices,
                       dim_t numIndices, dim_t sliceSize, dim_t numSamples,
                       dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather32_i8(int8_t *dest, const int8_t *data,
                        const int32_t *indices, dim_t numIndices,
                        dim_t sliceSize, dim_t numSamples, dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather32_u(int64_t *dest, const int64_t *data,
                       const int32_t *indices, dim_t numIndices,
                       dim_t sliceSize, dim_t numSamples, dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}

void libjit_gather32_i32(int32_t *dest, const int32_t *data,
                         const int32_t *indices, dim_t numIndices,
                         dim_t sliceSize, dim_t numSamples, dim_t sampleSize) {
  libjit_gather(dest, data, indices, numIndices, sliceSize, numSamples,
                sampleSize);
}
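// For illustration of the gather wrappers above: data is viewed as numSamples
// blocks of sampleSize elements, each split into slices of sliceSize. With
// indices == {2, 0}, every sample contributes its slice 2 followed by its
// slice 0 to the destination, in that order.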
exampleSize); } void libjit_gatherranges32_i32(int32_t *output, int32_t *lengths, const int32_t *data, const int32_t *ranges, dim_t numExamples, dim_t exampleSize) { libjit_gatherranges(output, lengths, data, ranges, numExamples, exampleSize); } void libjit_lengths_range_fill_i32(const int32_t *lengths, int32_t *output, const dim_t lengthsSize) { dim_t curIdx = 0; for (dim_t i = 0, e = lengthsSize; i < e; i++) { for (int32_t j = 0, f = lengths[i]; j < f; j++) { output[curIdx++] = j; } } } void libjit_scatterdata_f_i32(float *data, const dim_t *dataDims, const int32_t *indices, const float *slices, dim_t numIndices, dim_t indexSize, dim_t sliceSize, bool isCumulative) { if (isCumulative) { libjit_scatterdataaddfloat(data, dataDims, indices, slices, numIndices, indexSize, sliceSize); } else { libjit_scatterdatacopy(data, dataDims, indices, slices, numIndices, indexSize, sliceSize); } } void libjit_scatterdata_i8_u(int8_t *data, const dim_t *dataDims, const int64_t *indices, const int8_t *slices, dim_t numIndices, dim_t indexSize, dim_t sliceSize, bool isCumulative, float dataScale, int32_t dataOffset, float sliceScale, int32_t sliceOffset) { if (isCumulative) { libjit_scatterdataaddquantized(data, dataDims, indices, slices, numIndices, indexSize, sliceSize, dataScale, dataOffset, sliceScale, sliceOffset); } else { libjit_scatterdatacopy(data, dataDims, indices, slices, numIndices, indexSize, sliceSize); } } void libjit_scatterdata_i8_i32(int8_t *data, const dim_t *dataDims, const int32_t *indices, const int8_t *slices, dim_t numIndices, dim_t indexSize, dim_t sliceSize, bool isCumulative, float dataScale, int32_t dataOffset, float sliceScale, int32_t sliceOffset) { if (isCumulative) { libjit_scatterdataaddquantized(data, dataDims, indices, slices, numIndices, indexSize, sliceSize, dataScale, dataOffset, sliceScale, sliceOffset); } else { libjit_scatterdatacopy(data, dataDims, indices, slices, numIndices, indexSize, sliceSize); } } void libjit_lengths_to_ranges_i32(int32_t *ranges, const int32_t *lengths, dim_t size) { int32_t offset = 0; for (dim_t i = 0; i < size; i++) { auto length = lengths[i]; ranges[i * 2] = offset; ranges[i * 2 + 1] = length; offset += length; } } void libjit_sparse_lengths_sum_f_u(float *dest, float *data, size_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_sparse_lengths_sum_generic(dest, data, indices, lengths, segments, lineSize); } void libjit_sparse_lengths_sum_f_i32(float *dest, float *data, int32_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_sparse_lengths_sum_generic(dest, data, indices, lengths, segments, lineSize); } void libjit_sparse_lengths_weighted_sum_f_u(float *dest, float *data, float *weights, size_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_sparse_lengths_weighted_sum_generic(dest, data, weights, indices, lengths, segments, lineSize); } void libjit_sparse_lengths_weighted_sum_f_i32(float *dest, float *data, float *weights, int32_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_sparse_lengths_weighted_sum_generic(dest, data, weights, indices, lengths, segments, lineSize); } void libjit_embedding_bag_f(float *dest, float *data, float *weights, size_t *indices, size_t *offsets, dim_t segments, dim_t lineSize, dim_t totalLength, bool hasEndOffset) { if (hasEndOffset) { --segments; } memset(dest, 0, segments * lineSize * sizeof(float)); dim_t curIndex = 0; for (dim_t i = 0; i < segments; i++) { int64_t start = offsets[i]; int64_t end = !hasEndOffset && i == 
segments - 1 ? totalLength : offsets[i + 1]; for (int64_t j = start; j < end; j++) { float weight = weights[curIndex]; dim_t line = indices[curIndex]; for (dim_t k = 0; k < lineSize; k++) { dest[i * lineSize + k] += weight * data[line * lineSize + k]; } curIndex++; } } } void libjit_sparse_lengths_weighted_sum_grad_f_u( const float *destGrad, float *dataGrad, float *weightsGrad, const float *data, const float *weights, const size_t *indices, const int32_t *lengths, dim_t segments, dim_t lineSize, dim_t dataGradRawSize) { libjit_sparse_lengths_weighted_sum_grad_generic( destGrad, dataGrad, weightsGrad, data, weights, indices, lengths, segments, lineSize, dataGradRawSize); } void libjit_sparse_lengths_weighted_sum_grad_f_i32( const float *destGrad, float *dataGrad, float *weightsGrad, const float *data, const float *weights, const int32_t *indices, const int32_t *lengths, dim_t segments, dim_t lineSize, dim_t dataGradRawSize) { libjit_sparse_lengths_weighted_sum_grad_generic( destGrad, dataGrad, weightsGrad, data, weights, indices, lengths, segments, lineSize, dataGradRawSize); } void libjit_rowwise_quantized_sparse_lengths_weighted_sum_f_u( float *dest, uint8_t *data, float *scales, float *offsets, float *weights, size_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_rowwise_quantized_sparse_lengths_weighted_sum_generic( dest, data, scales, offsets, weights, indices, lengths, segments, lineSize); } void libjit_rowwise_quantized_sparse_lengths_weighted_sum_f_i32( float *dest, uint8_t *data, float *scales, float *offsets, float *weights, int32_t *indices, int32_t *lengths, dim_t segments, dim_t lineSize) { libjit_rowwise_quantized_sparse_lengths_weighted_sum_generic( dest, data, scales, offsets, weights, indices, lengths, segments, lineSize); } void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f_u( float *dest, int8_t *data, float *weights, size_t *indices, int32_t *lengths, dim_t segments, dim_t inLineSize, dim_t outLineSize) { libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_generic( dest, data, weights, indices, lengths, segments, inLineSize, outLineSize); } void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f_i32( float *dest, int8_t *data, float *weights, int32_t *indices, int32_t *lengths, dim_t segments, dim_t inLineSize, dim_t outLineSize) { libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_generic( dest, data, weights, indices, lengths, segments, inLineSize, outLineSize); } void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f( float *dest, int8_t *data, float *weights, dim_t *indices, int32_t *lengths, dim_t segments, dim_t inLineSize, dim_t outLineSize) { memset(dest, 0, segments * outLineSize * sizeof(float)); dim_t curIndex = 0; for (dim_t i = 0; i < segments; i++) { for (int32_t j = 0, e = lengths[i]; j < e; j++) { const float weight = weights[curIndex]; const dim_t line = indices[curIndex]; const int8_t *currRowScaleOffsetPtr = data + ((line + 1) * inLineSize) - 2 * sizeof(float); float scale, offset; memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); memcpy(&offset, currRowScaleOffsetPtr + sizeof(float), sizeof(float)); for (dim_t k = 0; k < outLineSize; k++) { const float fData = (scale * (uint8_t)(data[line * inLineSize + k])) + offset; dest[i * outLineSize + k] += weight * fData; } curIndex++; } } } void libjit_embedding_bag_byte_rowwise_offsets_f( float *dest, int8_t *data, float *weights, size_t *indices, size_t *offsets, dim_t segments, dim_t numIndices, dim_t inLineSize, dim_t 
outLineSize, bool hasEndOffset) { if (hasEndOffset) { --segments; } memset(dest, 0, segments * outLineSize * sizeof(float)); for (dim_t i = 0; i < segments; i++) { dim_t start = offsets[i]; dim_t end = !hasEndOffset && i == segments - 1 ? numIndices : offsets[i + 1]; for (dim_t j = start; j < end; j++) { const float weight = weights[j]; const dim_t line = indices[j]; const int8_t *currRowScaleOffsetPtr = data + ((line + 1) * inLineSize) - 2 * sizeof(float); float scale, offset; memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); memcpy(&offset, currRowScaleOffsetPtr + sizeof(float), sizeof(float)); for (dim_t k = 0; k < outLineSize; k++) { const float fData = (scale * (uint8_t)(data[line * inLineSize + k])) + offset; dest[i * outLineSize + k] += weight * fData; } } } } void libjit_sparse_to_dense_f_u(float *dest, const size_t *indices, const float *values, dim_t numIndices, dim_t destSize, dim_t valueSize) { libjit_sparse_to_dense_generic(dest, indices, values, numIndices, destSize, valueSize); } void libjit_sparse_to_dense_f_i32(float *dest, const int32_t *indices, const float *values, dim_t numIndices, dim_t destSize, dim_t valueSize) { libjit_sparse_to_dense_generic(dest, indices, values, numIndices, destSize, valueSize); } void libjit_lengths_sum_f(float *dest, const float *data, const int32_t *lengths, dim_t destSize, dim_t lengthsSize, dim_t sliceSize) { memset(dest, 0, destSize * sizeof(float)); dim_t offsetOut = 0; dim_t offsetIn = 0; for (dim_t i = 0; i < lengthsSize; ++i) { for (int32_t j = 0; j < lengths[i]; ++j) { for (dim_t k = 0; k < sliceSize; ++k) { dest[offsetOut + k] += data[offsetIn + k]; } offsetIn += sliceSize; } offsetOut += sliceSize; } } void libjit_local_response_normalization_f( float *outW, const float *inW, float *scaleCache, const dim_t *outWdims, const dim_t *inWdims, dim_t halfWindow, float alpha, float beta, float k) { dim_t window = 2 * halfWindow + 1; float normedAlpha = alpha / window; for (dim_t n = 0; n < inWdims[0]; n++) { for (dim_t h = 0; h < inWdims[1]; h++) { for (dim_t w = 0; w < inWdims[2]; w++) { for (dim_t c = 0; c < inWdims[3]; c++) { float m2 = 0.0; for (dim_t i = (c >= halfWindow ? 
c - halfWindow : 0); i <= MIN(c + halfWindow, inWdims[3] - 1); i++) { float val = inW[libjit_getXYZW(inWdims, n, h, w, i)]; m2 += val * val; } float scale = k + normedAlpha * m2; scaleCache[libjit_getXYZW(inWdims, n, h, w, c)] = scale; float normFactor = pow(scale, -beta); outW[libjit_getXYZW(outWdims, n, h, w, c)] = inW[libjit_getXYZW(inWdims, n, h, w, c)] * normFactor; } // C } // W } // H } // N } void libjit_local_response_normalization_grad_f( float *inG, const float *outG, const float *inW, const float *outW, const float *scaleCache, const dim_t *outWdims, dim_t halfWindow, float alpha, float beta) { dim_t window = 2 * halfWindow + 1; float normedAlpha = alpha / window; float coeff = 2 * normedAlpha * beta; for (dim_t n = 0; n < outWdims[0]; n++) { for (dim_t h = 0; h < outWdims[1]; h++) { for (dim_t w = 0; w < outWdims[2]; w++) { // Prepare right half of sliding window based at c = 0 float sum = 0.0; for (dim_t i = 0; i < MIN(halfWindow, outWdims[3]); i++) { float outg = outG[libjit_getXYZW(outWdims, n, h, w, i)]; float outw = outW[libjit_getXYZW(outWdims, n, h, w, i)]; float scale = scaleCache[libjit_getXYZW(outWdims, n, h, w, i)]; sum += outg * (outw / scale); } for (dim_t c = 0; c < outWdims[3]; c++) { if (c > halfWindow) { dim_t j = c - halfWindow - 1; float outg = outG[libjit_getXYZW(outWdims, n, h, w, j)]; float outw = outW[libjit_getXYZW(outWdims, n, h, w, j)]; float scale = scaleCache[libjit_getXYZW(outWdims, n, h, w, j)]; sum -= outg * (outw / scale); } dim_t j = c + halfWindow; if (j < outWdims[3]) { float outg = outG[libjit_getXYZW(outWdims, n, h, w, j)]; float outw = outW[libjit_getXYZW(outWdims, n, h, w, j)]; float scale = scaleCache[libjit_getXYZW(outWdims, n, h, w, j)]; sum += outg * (outw / scale); } float outg = outG[libjit_getXYZW(outWdims, n, h, w, c)]; float inw = inW[libjit_getXYZW(outWdims, n, h, w, c)]; float scale = scaleCache[libjit_getXYZW(outWdims, n, h, w, c)]; inG[libjit_getXYZW(outWdims, n, h, w, c)] = outg * pow(scale, -beta) - coeff * inw * sum; } } // W } // H } // N } void libjit_max_pool_i8(const int8_t *inW, int8_t *outW, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernelSizes, dim_t *strides, dim_t *pads) { libjit_max_pool_generic(inW, outW, inWdims, outWdims, kernelSizes, strides, pads); } void libjit_max_pool_f(const float *inW, float *outW, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernelSizes, dim_t *strides, dim_t *pads) { libjit_max_pool_generic(inW, outW, inWdims, outWdims, kernelSizes, strides, pads); } void libjit_max_pool_argmax_i8_u(const int8_t *inW, int8_t *outW, int64_t *argmax, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernels, dim_t *strides, dim_t *pads) { libjit_max_pool_argmax_generic(inW, outW, argmax, inWdims, outWdims, kernels, strides, pads); } void libjit_max_pool_argmax_f_u(const float *inW, float *outW, int64_t *argmax, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernels, dim_t *strides, dim_t *pads) { libjit_max_pool_argmax_generic(inW, outW, argmax, inWdims, outWdims, kernels, strides, pads); } void libjit_max_pool_argmax_i8_i32(const int8_t *inW, int8_t *outW, int32_t *argmax, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernels, dim_t *strides, dim_t *pads) { libjit_max_pool_argmax_generic(inW, outW, argmax, inWdims, outWdims, kernels, strides, pads); } void libjit_max_pool_argmax_f_i32(const float *inW, float *outW, int32_t *argmax, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernels, dim_t *strides, dim_t *pads) { libjit_max_pool_argmax_generic(inW, outW, argmax, 
inWdims, outWdims, kernels, strides, pads); } void libjit_arg_max_i8_u(const int8_t *inW, int64_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_max_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_max_i8_i32(const int8_t *inW, int32_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_max_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_max_f_u(const float *inW, int64_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_max_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_max_f_i32(const float *inW, int32_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_max_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_min_i8_u(const int8_t *inW, int64_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_min_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_min_i8_i32(const int8_t *inW, int32_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_min_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_min_f_u(const float *inW, int64_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_min_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_arg_min_f_i32(const float *inW, int32_t *outW, const dim_t *inWdims, size_t inWNumDims, size_t axis) { libjit_arg_min_generic(inW, outW, inWdims, inWNumDims, axis); } void libjit_max_pool_argmax_grad_f_u(float *inG, const float *outG, const int64_t *argmax, const dim_t *inGdims, const dim_t *outWdims) { libjit_max_pool_argmax_grad_generic(inG, outG, argmax, inGdims, outWdims); } void libjit_max_pool_argmax_grad_f_i32(float *inG, const float *outG, const int32_t *argmax, const dim_t *inGdims, const dim_t *outWdims) { libjit_max_pool_argmax_grad_generic(inG, outG, argmax, inGdims, outWdims); } void libjit_resizenearest_f(float *dst, const float *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizenearest_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizenearest_i8(int8_t *dst, const int8_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizenearest_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizenearest_i32(int32_t *dst, const int32_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizenearest_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizenearest_u(int64_t *dst, const int64_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizenearest_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizebilinear_f(float *dst, const float *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizebilinear_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizebilinear_i8(int8_t *dst, const int8_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizebilinear_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizebilinear_i32(int32_t *dst, const int32_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizebilinear_generic(dst, src, scale, inWdims, outWdims); } void libjit_resizebilinear_u(int64_t *dst, const int64_t *src, const float *scale, const dim_t *inWdims, const dim_t *outWdims) { libjit_resizebilinear_generic(dst, src, scale, inWdims, outWdims); } void libjit_avg_pool_i8(const int8_t *inW, int8_t *outW, const 
dim_t *inWdims, const dim_t *outWdims, dim_t *kernelSizes, dim_t *strides, dim_t *pads, int32_t outOffset, int32_t inOffset, int32_t outPre, int32_t outPost, int32_t outScale) { dim_t pad_t = pads[0]; dim_t pad_l = pads[1]; dim_t stride_h = strides[0]; dim_t stride_w = strides[1]; dim_t kernel_h = kernelSizes[0]; dim_t kernel_w = kernelSizes[1]; // For each input in the batch: for (dim_t n = 0; n < outWdims[0]; n++) { // For each (x,y) step in the input/output tensor: sdim_t x = -sdim_t(pad_t); for (dim_t ax = 0; ax < outWdims[1]; x += stride_h, ax++) { sdim_t y = -sdim_t(pad_l); for (dim_t ay = 0; ay < outWdims[2]; y += stride_w, ay++) { // For each layer in the output tensor: for (dim_t z = 0; z < inWdims[3]; z++) { int32_t sum = 0; for (dim_t fx = 0; fx < kernel_h; fx++) { for (dim_t fy = 0; fy < kernel_w; fy++) { sdim_t ox = x + fx; sdim_t oy = y + fy; // Ignore index access below zero (this is due to padding). if (ox < 0 || oy < 0 || ox >= (sdim_t)inWdims[1] || oy >= (sdim_t)inWdims[2]) { continue; } sum += inW[libjit_getXYZW(inWdims, n, (dim_t)ox, (dim_t)oy, z)] - inOffset; } } outW[libjit_getXYZW(outWdims, n, ax, ay, z)] = libjit_clip( libjit_scale_i32i8(sum, outPre, outPost, outScale, outOffset)); } // C } // W } // H } // N } void libjit_avg_pool_f(const float *inW, float *outW, const dim_t *inWdims, const dim_t *outWdims, dim_t *kernelSizes, dim_t *strides, dim_t *pads) { dim_t pad_t = pads[0]; dim_t pad_l = pads[1]; dim_t stride_h = strides[0]; dim_t stride_w = strides[1]; dim_t kernel_h = kernelSizes[0]; dim_t kernel_w = kernelSizes[1]; float filterArea = kernel_h * kernel_w; // For each input in the batch: for (dim_t n = 0; n < outWdims[0]; n++) { // For each (x,y) step in the input/output tensor: sdim_t x = -(sdim_t)pad_t; for (dim_t ax = 0; ax < outWdims[1]; x += stride_h, ax++) { sdim_t y = -(sdim_t)pad_l; for (dim_t ay = 0; ay < outWdims[2]; y += stride_w, ay++) { // For each layer in the output tensor: for (dim_t z = 0; z < inWdims[3]; z++) { float sum = 0; for (dim_t fx = 0; fx < kernel_h; fx++) { for (dim_t fy = 0; fy < kernel_w; fy++) { sdim_t ox = x + fx; sdim_t oy = y + fy; // Ignore index access below zero (this is due to padding). 
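// Note: out-of-bounds taps are skipped below, but the division by
// filterArea still counts them, so padded positions effectively
// contribute zeros to the average.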
if (ox < 0 || oy < 0 || ox >= (sdim_t)inWdims[1] || oy >= (sdim_t)inWdims[2]) { continue; } sum += inW[libjit_getXYZW(inWdims, n, (dim_t)ox, (dim_t)oy, z)]; } } outW[libjit_getXYZW(outWdims, n, ax, ay, z)] = sum / filterArea; } // C } // W } // H } // N } void libjit_adaptive_avg_pool_f(const float *inW, float *outW, const dim_t *inWdims, const dim_t *outWdims) { // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AdaptiveAveragePooling.cpp #define START_IND(a, b, c) (size_t) std::floor((float)((a) * (c)) / (b)) #define END_IND(a, b, c) (size_t) std::ceil((float)(((a) + 1) * (c)) / (b)) // For each input in the batch: for (dim_t n = 0; n < outWdims[0]; n++) { // For each layer in the output tensor: for (dim_t z = 0; z < inWdims[3]; z++) { // For each value in the output tensor: for (dim_t ax = 0; ax < outWdims[1]; ax++) { dim_t x = START_IND(ax, outWdims[1], inWdims[1]); dim_t kH = END_IND(ax, outWdims[1], inWdims[1]) - x; for (dim_t ay = 0; ay < outWdims[2]; ay++) { dim_t y = START_IND(ay, outWdims[2], inWdims[2]); dim_t kW = END_IND(ay, outWdims[2], inWdims[2]) - y; float sum = 0; for (dim_t fx = 0; fx < kH; fx++) { for (dim_t fy = 0; fy < kW; fy++) { dim_t ox = x + fx; dim_t oy = y + fy; sum += inW[libjit_getXYZW(inWdims, n, ox, oy, z)]; } } outW[libjit_getXYZW(outWdims, n, ax, ay, z)] = (sum / kW / kH); } // W } // H } // C } // N #undef START_IND #undef END_IND } void libjit_avg_pool_grad_f(float *inG, const float *outG, const dim_t *inGdims, const dim_t *outWdims, dim_t *kernels, dim_t *strides, dim_t *pads) { dim_t pad_t = pads[0]; dim_t pad_l = pads[1]; dim_t stride_h = strides[0]; dim_t stride_w = strides[1]; dim_t kernel_h = kernels[0]; dim_t kernel_w = kernels[1]; float kernelArea = kernel_h * kernel_w; // NHWC format is assumed for (dim_t n = 0; n < outWdims[0]; n++) { for (dim_t z = 0; z < outWdims[3]; z++) { // Clear inG for (dim_t x = 0; x < inGdims[1]; x++) { for (dim_t y = 0; y < inGdims[2]; y++) { inG[libjit_getXYZW(inGdims, n, x, y, z)] = 0.0; } } sdim_t x = -(sdim_t)pad_t; for (dim_t ax = 0; ax < outWdims[1]; x += stride_h, ax++) { sdim_t y = -(sdim_t)pad_l; for (dim_t ay = 0; ay < outWdims[2]; y += stride_w, ay++) { float df = outG[libjit_getXYZW(outWdims, n, ax, ay, z)] / kernelArea; for (dim_t kx = 0; kx < kernel_h; kx++) { for (dim_t ky = 0; ky < kernel_w; ky++) { sdim_t ox = x + kx; sdim_t oy = y + ky; if (ox < 0 || oy < 0 || ox >= (sdim_t)inGdims[1] || oy >= (sdim_t)inGdims[2]) { continue; } inG[libjit_getXYZW(inGdims, n, (dim_t)ox, (dim_t)oy, z)] += df; } } } // W } // H } // C } // N } int8_t libjit_element_quantize_kernel_i8(dim_t idx, const float *inW, float scale, int32_t offset) { int32_t result = (int32_t)nearbyintf(inW[idx] / scale + offset); return (int8_t)MAX(INT8_MIN, MIN(INT8_MAX, result)); } int32_t libjit_element_quantize_kernel_i32(dim_t idx, const float *inW, float scale, int32_t offset) { int32_t result = (int32_t)nearbyintf(inW[idx] / scale + offset); return result; } float libjit_element_dequantize_kernel_f(dim_t idx, const int8_t *inW, float scale, int32_t offset) { return scale * (inW[idx] - offset); } int8_t libjit_element_rescale_kernel_i8(dim_t idx, const int8_t *inW, int32_t outOffset, int32_t inOffset, int32_t pre, int32_t post, int32_t scale) { int32_t s = libjit_scale_i32i8(inW[idx] - inOffset, pre, post, scale, outOffset); return libjit_clip(s); } void libjit_softmax_f(const float *inW, float *outW, const dim_t *idim, const dim_t *odim) { for (dim_t n = 0; n < idim[0]; n++) { float max = inW[libjit_getXY(idim, n, 
0)]; // Find Max. for (dim_t i = 1; i < idim[1]; i++) { max = MAX(max, inW[libjit_getXY(idim, n, i)]); } float sum = 0; // Compute exp. for (dim_t i = 0; i < idim[1]; i++) { float e = expf(inW[libjit_getXY(idim, n, i)] - max); sum += e; outW[libjit_getXY(odim, n, i)] = e; } // Normalize the output. for (dim_t i = 0; i < idim[1]; i++) { outW[libjit_getXY(odim, n, i)] = outW[libjit_getXY(odim, n, i)] / sum; } } // N } void libjit_softmax_grad_f_u(float *inG, float *outW, const size_t *selectedW, const dim_t *idim, const dim_t *selectdim) { libjit_softmax_grad_generic(inG, outW, selectedW, idim, selectdim); } void libjit_softmax_grad_f_i32(float *inG, float *outW, const int32_t *selectedW, const dim_t *idim, const dim_t *selectdim) { libjit_softmax_grad_generic(inG, outW, selectedW, idim, selectdim); } void libjit_topk_f_u(float *values, size_t *indices, const float *input, void *scratch, dim_t k, dim_t n, dim_t size) { libjit_topk(values, indices, input, scratch, k, n, size); } void libjit_topk_f_i32(float *values, int32_t *indices, const float *input, void *scratch, dim_t k, dim_t n, dim_t size) { libjit_topk(values, indices, input, scratch, k, n, size); } void libjit_topk_i8_u(int8_t *values, size_t *indices, const int8_t *input, void *scratch, dim_t k, dim_t n, dim_t size) { libjit_topk(values, indices, input, scratch, k, n, size); } void libjit_topk_i8_i32(int8_t *values, int32_t *indices, const int8_t *input, void *scratch, dim_t k, dim_t n, dim_t size) { libjit_topk(values, indices, input, scratch, k, n, size); } void libjit_transpose_i8(const int8_t *inW, int8_t *outW, const dim_t *idim, const dim_t *odim, const dim_t *shuffle, dim_t numDims) { libjit_transpose_generic(inW, outW, idim, odim, shuffle, numDims); } void libjit_transpose_f(const float *inW, float *outW, const dim_t *idim, const dim_t *odim, const dim_t *shuffle, dim_t numDims) { libjit_transpose_generic(inW, outW, idim, odim, shuffle, numDims); } void libjit_transpose_u(const int64_t *inW, int64_t *outW, const dim_t *idim, const dim_t *odim, const dim_t *shuffle, dim_t numDims) { libjit_transpose_generic(inW, outW, idim, odim, shuffle, numDims); } void libjit_transpose_b(const bool *inW, bool *outW, const dim_t *idim, const dim_t *odim, const dim_t *shuffle, dim_t numDims) { libjit_transpose_generic(inW, outW, idim, odim, shuffle, numDims); } void libjit_flip_i8(const int8_t *inW, int8_t *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_flip_i16(const int16_t *inW, int16_t *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_flip_i32(const int32_t *inW, int32_t *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_flip_u(const int64_t *inW, int64_t *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_flip_f(const float *inW, float *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_flip_b(const bool *inW, bool *outW, const dim_t *dims, dim_t axis, dim_t numDims) { libjit_flip_generic(inW, outW, dims, axis, numDims); } void libjit_insert_tensor_f(float *tensor, float *slice, dim_t *offset, dim_t *tensorDim, dim_t *sliceDim, dim_t numDimsTensor, dim_t numDimsSlice, dim_t offsetDim, dim_t count, dim_t axis) { libjit_insert_tensor(tensor, slice, offset, tensorDim, sliceDim, 
numDimsTensor, numDimsSlice, offsetDim, count, axis);
}

void libjit_insert_tensor_i32(int32_t *tensor, int32_t *slice, dim_t *offset,
                              dim_t *tensorDim, dim_t *sliceDim,
                              dim_t numDimsTensor, dim_t numDimsSlice,
                              dim_t offsetDim, dim_t count, dim_t axis) {
  libjit_insert_tensor(tensor, slice, offset, tensorDim, sliceDim,
                       numDimsTensor, numDimsSlice, offsetDim, count, axis);
}

void libjit_extract_tensor_f(float *tensor, float *slice, dim_t *offset,
                             dim_t *tensorDim, dim_t *sliceDim,
                             dim_t numDimsTensor, dim_t numDimsSlice,
                             dim_t offsetDim) {
  libjit_extract_tensor(tensor, slice, offset, tensorDim, sliceDim,
                        numDimsTensor, numDimsSlice, offsetDim);
}

void libjit_extract_tensor_i8(int8_t *tensor, int8_t *slice, dim_t *offset,
                              dim_t *tensorDim, dim_t *sliceDim,
                              dim_t numDimsTensor, dim_t numDimsSlice,
                              dim_t offsetDim) {
  libjit_extract_tensor(tensor, slice, offset, tensorDim, sliceDim,
                        numDimsTensor, numDimsSlice, offsetDim);
}

void libjit_extract_tensor_i32(int32_t *tensor, int32_t *slice, dim_t *offset,
                               dim_t *tensorDim, dim_t *sliceDim,
                               dim_t numDimsTensor, dim_t numDimsSlice,
                               dim_t offsetDim) {
  libjit_extract_tensor(tensor, slice, offset, tensorDim, sliceDim,
                        numDimsTensor, numDimsSlice, offsetDim);
}

void libjit_insert_tensor_u(int64_t *tensor, int64_t *slice, dim_t *offset,
                            dim_t *tensorDim, dim_t *sliceDim,
                            dim_t numDimsTensor, dim_t numDimsSlice,
                            dim_t offsetDim, dim_t count, dim_t axis) {
  libjit_insert_tensor(tensor, slice, offset, tensorDim, sliceDim,
                       numDimsTensor, numDimsSlice, offsetDim, count, axis);
}

void libjit_extract_tensor_u(int64_t *tensor, int64_t *slice, dim_t *offset,
                             dim_t *tensorDim, dim_t *sliceDim,
                             dim_t numDimsTensor, dim_t numDimsSlice,
                             dim_t offsetDim) {
  libjit_extract_tensor(tensor, slice, offset, tensorDim, sliceDim,
                        numDimsTensor, numDimsSlice, offsetDim);
}

void libjit_insert_tensor_i8(int8_t *tensor, int8_t *slice, dim_t *offset,
                             dim_t *tensorDim, dim_t *sliceDim,
                             dim_t numDimsTensor, dim_t numDimsSlice,
                             dim_t offsetDim, dim_t count, dim_t axis) {
  libjit_insert_tensor(tensor, slice, offset, tensorDim, sliceDim,
                       numDimsTensor, numDimsSlice, offsetDim, count, axis);
}

void libjit_insert_tensor_b(int8_t *tensor, int8_t *slice, dim_t *offset,
                            dim_t *tensorDim, dim_t *sliceDim,
                            dim_t numDimsTensor, dim_t numDimsSlice,
                            dim_t offsetDim, dim_t count, dim_t axis) {
  libjit_insert_tensor(tensor, slice, offset, tensorDim, sliceDim,
                       numDimsTensor, numDimsSlice, offsetDim, count, axis);
}

void libjit_space_to_depth_f(const float *inTensor, float *outTensor,
                             dim_t blockSize, const dim_t *inDims,
                             const dim_t *outDims) {
  libjit_space_to_depth_generic(inTensor, outTensor, blockSize, inDims,
                                outDims);
}

void libjit_space_to_depth_i8(const int8_t *inTensor, int8_t *outTensor,
                              dim_t blockSize, const dim_t *inDims,
                              const dim_t *outDims) {
  libjit_space_to_depth_generic(inTensor, outTensor, blockSize, inDims,
                                outDims);
}

/// Function to dump a tensor in text format in the console.
__attribute__((noinline)) void
libjit_dump_tensor_console(uint8_t *tensor, dim_t *tensorDim,
                           dim_t numDimsTensor, dim_t elemKind,
                           const char *name) {
  printf("%s\n", name);
  /// This definition should match the definition in Glow.
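  // Illustrative call with hypothetical buffers (the elemKind argument selects
  // a member of the ElemKind enum below, e.g. 0 for FloatTy):
  //   dim_t dims[2] = {2, 3};
  //   libjit_dump_tensor_console((uint8_t *)floatData, dims, 2, 0, "myTensor");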
  enum class ElemKind : unsigned char {
    FloatTy,       // 32-bit float type (float)
    Float16Ty,     // 16-bit float type (half, fp16)
    BFloat16Ty,    // 16-bit float type (bfloat16)
    Int8QTy,       // 8-bit quantized type (int8_t)
    UInt8QTy,      // unsigned 8-bit quantized type (uint8_t)
    Int16QTy,      // 16-bit quantized type (int16_t)
    Int32QTy,      // 32-bit quantized type (int32_t)
    Int32ITy,      // 32-bit index type (int32_t)
    Int64ITy,      // 64-bit index type (int64_t)
    UInt8FusedQTy, // 8-bit quantized type with fused scale/offset (uint8_t)
    BoolTy,        // Bool type (bool)
  };

  // Dump the content of a tensor.
  switch ((ElemKind)elemKind) {
  case ElemKind::FloatTy:
    libjit_dump_tensor_console_impl((float *)tensor, tensorDim, numDimsTensor);
    break;
  case ElemKind::Int64ITy:
    libjit_dump_tensor_console_impl((dim_t *)tensor, tensorDim, numDimsTensor);
    break;
  case ElemKind::Int8QTy:
    libjit_dump_tensor_console_impl((int8_t *)tensor, tensorDim,
                                    numDimsTensor);
    break;
  case ElemKind::Int32QTy:
    libjit_dump_tensor_console_impl((int32_t *)tensor, tensorDim,
                                    numDimsTensor);
    break;
  default:
    printf("Dumping this type of payload is not supported: %zu\n",
           (size_t)elemKind);
    break;
  }
  puts("");
}

/// Function to dump a tensor in binary format in a file using the raw tensor
/// data pointer \p tensor, the tensor data size \p tensorSize (in bytes) and
/// the file name \p filename. A text header \p header will also be dumped.
__attribute__((noinline)) void libjit_dump_tensor_bin(uint8_t *tensor,
                                                      size_t tensorSize,
                                                      const char *filename,
                                                      const char *header) {
  FILE *fh = fopen(filename, "wb");
  if (!fh) {
    printf("ERROR opening file: '%s'!\n"
           "File name might be too long!\n",
           filename);
    return;
  }
  // Dump header.
  fprintf(fh, "%s", header);
  // Dump tensor data.
  size_t size = fwrite(tensor, 1, tensorSize, fh);
  assert((size == tensorSize) && "Error dumping tensor to file!");
  (void)size;
  fclose(fh);
}

/// Functions to dump a tensor in text format in a file using the raw tensor
/// data pointer \p tensor, the tensor data size \p tensorElemSize (number of
/// elements) and the file name \p filename. A text header \p header will also
/// be dumped.
#define DEFINE_DUMP_TENSOR_TXT_KERNEL(type, suffix)                           \
  __attribute__((noinline)) void libjit_dump_tensor_txt_##suffix(             \
      uint8_t *tensor, size_t tensorElemSize, const char *filename,           \
      const char *header) {                                                   \
    libjit_dump_tensor_txt_impl((type *)tensor, tensorElemSize, filename,     \
                                header);                                      \
  }
DEFINE_DUMP_TENSOR_TXT_KERNEL(float, f)
DEFINE_DUMP_TENSOR_TXT_KERNEL(int8_t, i8)
DEFINE_DUMP_TENSOR_TXT_KERNEL(int16_t, i16)
DEFINE_DUMP_TENSOR_TXT_KERNEL(int32_t, i32)
DEFINE_DUMP_TENSOR_TXT_KERNEL(int64_t, u)
DEFINE_DUMP_TENSOR_TXT_KERNEL(bool, b)
#undef DEFINE_DUMP_TENSOR_TXT_KERNEL

void libjit_write_timestamp(uint64_t *tensor, dim_t offset) {
  // We are using a C++ timer here to avoid issues with gettimeofday.
  // Issue #2397 covers migrating this to a libc approach, but if you have
  // issues with a lack of C++ symbols at runtime, check there first.
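  // The timestamp below is written as a single raw uint64_t at
  // 'tensor + offset'; callers are expected to reserve one uint64_t slot per
  // recorded event.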
  uint64_t ts = std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::steady_clock::now().time_since_epoch())
                    .count();
  memcpy(tensor + offset, &ts, sizeof(uint64_t));
}

/// Copy kernels with type conversion.
void libjit_convertTo_f_b(float *dstPtr, const bool *srcPtr, const dim_t *dims,
                          dim_t numDims) {
  libjit_copy_kernel_with_conversion<float, bool>(dstPtr, srcPtr, dims,
                                                  numDims);
}

void libjit_convertTo_b_f(bool *dstPtr, const float *srcPtr, const dim_t *dims,
                          dim_t numDims) {
  libjit_copy_kernel_with_conversion<bool, float>(dstPtr, srcPtr, dims,
                                                  numDims);
}

void libjit_convertTo_f_i32(float *dstPtr, const int32_t *srcPtr,
                            const dim_t *dims, dim_t numDims) {
  libjit_copy_kernel_with_conversion<float, int32_t>(dstPtr, srcPtr, dims,
                                                     numDims);
}

void libjit_convertTo_i32_u(int32_t *dstPtr, const int64_t *srcPtr,
                            const dim_t *dims, dim_t numDims) {
  libjit_copy_kernel_with_conversion<int32_t, int64_t>(dstPtr, srcPtr, dims,
                                                       numDims);
}

void libjit_convertTo_u_i32(int64_t *dstPtr, const int32_t *srcPtr,
                            const dim_t *dims, dim_t numDims) {
  libjit_copy_kernel_with_conversion<int64_t, int32_t>(dstPtr, srcPtr, dims,
                                                       numDims);
}

/// Update min/max values \p compInfo and histogram \p existingHistogram with
/// data collected from tensor \p inputTensor.
/// Note: code ported from Profile.cpp: generateTensorHistogram
__attribute__((noinline)) void
libjit_quantization_profile(float *inputTensor, dim_t tensorSize,
                            float *compInfo, float *existingHistogram,
                            dim_t *histDim) {
  dim_t nBins = histDim[0];

  // Min/max computed from previous runs. If this is the first run, compInfo is
  // expected to be initialized as follows:
  // compInfo[0]: std::numeric_limits<float>::max()
  // compInfo[1]: std::numeric_limits<float>::lowest()
  float min = compInfo[0];
  float max = compInfo[1];

  // Min/max value for the entire current input tensor.
  float minInput;
  float maxInput;
  find_min_max_f(inputTensor, tensorSize, minInput, maxInput);

  // Update the global min/max.
  float newMin = MIN(minInput, min);
  float newMax = MAX(maxInput, max);
  compInfo[0] = newMin;
  compInfo[1] = newMax;

  // If the input histogram is empty then return.
  if (nBins == 0) {
    return;
  }

  // Initial profile.
  if (check_all_zeros(existingHistogram, nBins) == 1) {
    min = minInput;
    max = maxInput;
  }

  // If the min/max range changes, we need to rescale the histogram.
  if (newMin < min || newMax > max) {
    float destBinWidth = (newMax - newMin) / nBins;
    float srcBinWidth = (max - min) / nBins;
    float scaledHistogram[nBins];
    for (dim_t i = 0; i < nBins; ++i) {
      scaledHistogram[i] = 0.0f;
    }

    for (dim_t i = 0; i < nBins; ++i) {
      if (existingHistogram[i] == 0)
        continue;

      float srcBinBegin = min + srcBinWidth * i;
      dim_t destBin = (srcBinBegin - newMin) / destBinWidth;
      float destBinEnd = newMin + destBinWidth * (destBin + 1);

      float srcBinEnd = srcBinBegin + srcBinWidth;
      dim_t destBinToVerify = (srcBinEnd - newMin) / destBinWidth;
      // Make sure each source bin is mapped to at most 2 destination bins;
      // the redistribution percentage below relies on this.
      assert(destBinToVerify <= destBin + 2);
      (void)destBinToVerify;

      // Calculate how much we need to redistribute.
      uint64_t dstBinCnt = static_cast<uint64_t>(
          MIN(static_cast<float>(round((destBinEnd - srcBinBegin) /
                                       srcBinWidth * existingHistogram[i])),
              existingHistogram[i]));

      dim_t newBin = get_bin(nBins, destBinWidth, newMin, srcBinBegin);
      scaledHistogram[newBin] += dstBinCnt;

      if (dstBinCnt < existingHistogram[i]) {
        dim_t newBin =
            get_bin(nBins, destBinWidth, newMin, srcBinBegin + destBinWidth);
        scaledHistogram[newBin] += existingHistogram[i] - dstBinCnt;
      }
    }

    // Copy the scaled histogram back to the existing histogram.
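    // (Illustrative numbers for the rescale above: with nBins = 4, an old
    // range [0, 4) and a new range [0, 8), destBinWidth = 2; the old bin
    // i = 3 covering [3, 4) maps entirely into destination bin
    // (3 - 0) / 2 = 1, so its whole count lands in scaledHistogram[1].)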
for (dim_t i = 0, e = nBins; i < e; ++i) { existingHistogram[i] = scaledHistogram[i]; } // Update global min and max. min = newMin; max = newMax; } // Update the histogram with the values of the current input tensor. float binWidth = (max - min) / nBins; for (dim_t i = 0, e = tensorSize; i < e; ++i) { dim_t newBin = get_bin(nBins, binWidth, min, inputTensor[i]); existingHistogram[newBin]++; } } __attribute__((noinline)) void libjit_nms_u(uint64_t *indices, uint64_t *numDetected, const float *boxTensor, const dim_t *boxTensorDims, dim_t boxTensorDimSize, const float *scoresTensor, const dim_t *scoresTensorDims, dim_t scoresTensorDimSize, const dim_t *resultTensorDims, dim_t resultTensorDimSize, unsigned centerPointBox, unsigned maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, bool isV4) { libjit_nms_generic(indices, numDetected, boxTensor, boxTensorDims, boxTensorDimSize, scoresTensor, scoresTensorDims, scoresTensorDimSize, resultTensorDims, resultTensorDimSize, centerPointBox, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, isV4); } __attribute__((noinline)) void libjit_nms_i32(int32_t *indices, int32_t *numDetected, const float *boxTensor, const dim_t *boxTensorDims, dim_t boxTensorDimSize, const float *scoresTensor, const dim_t *scoresTensorDims, dim_t scoresTensorDimSize, const dim_t *resultTensorDims, dim_t resultTensorDimSize, unsigned centerPointBox, unsigned maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, bool isV4) { libjit_nms_generic(indices, numDetected, boxTensor, boxTensorDims, boxTensorDimSize, scoresTensor, scoresTensorDims, scoresTensorDimSize, resultTensorDims, resultTensorDimSize, centerPointBox, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, isV4); } /// FFT Radix2 DIT (Decimation In Time) implementation for Complex data. /// The \p input and \p output buffers have 2 * \p fftLength float /// samples corresponding to \p fftLength complex samples with real and /// imaginary parts interleaved: real[0], imag[0], real[1], imag[1], ... /// The lookup tables \p twiddleFactors and \p bitReverseIndices are /// generated at compile time. The boolean flag \p inPlace decides whether /// the FFT computation is done in-place (that is in the \p input buffer /// without writing in the \p output buffer) or out-of-place (written in /// the \p output buffer). void libjit_fft_complex_f(float *output, float *input, const float *twiddleFactors, const int32_t *bitReverseIndices, unsigned fftLength, bool inPlace) { // Bit Reverse Reordering. if (inPlace) { for (dim_t idx = 0; idx < fftLength; idx++) { int32_t bitRevIdx = bitReverseIndices[idx]; if (idx < bitRevIdx) { // Swap complex pair. float real = input[2 * idx + 0]; float imag = input[2 * idx + 1]; input[2 * idx + 0] = input[2 * bitRevIdx + 0]; input[2 * idx + 1] = input[2 * bitRevIdx + 1]; input[2 * bitRevIdx + 0] = real; input[2 * bitRevIdx + 1] = imag; } } } else { for (dim_t idx = 0; idx < fftLength; idx++) { int32_t bitRevIdx = bitReverseIndices[idx]; output[2 * idx + 0] = input[2 * bitRevIdx + 0]; output[2 * idx + 1] = input[2 * bitRevIdx + 1]; } } // FFT output pointer. float *bitRevOut = inPlace ? input : output; // Number of FFT stages. dim_t stageNum = std::log2((double)fftLength); // Number of radix2 butterfly groups for 1st stage. dim_t groupNum = fftLength / 2; // Number of radix2 butterflies per group for 1st stage. dim_t groupButterNum = 1; // Stage loop. for (dim_t stageIdx = 0; stageIdx < stageNum; stageIdx++) { // Butterfly input/output pointers. 
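    // (For intuition: with fftLength == 8 there are log2(8) == 3 stages, and
    // (groupNum, groupButterNum) evolves as (4, 1) -> (2, 2) -> (1, 4)
    // across the three stages.)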
    float *inp1Ptr = bitRevOut + 0 * groupButterNum;
    float *inp2Ptr = bitRevOut + 2 * groupButterNum;
    // Butterfly group loop.
    for (dim_t groupIdx = 0; groupIdx < groupNum; groupIdx++) {
      // Twiddle factors pointer.
      const float *twPtr = twiddleFactors;
      // Butterfly loop within group.
      for (dim_t groupButterIdx = 0; groupButterIdx < groupButterNum;
           groupButterIdx++) {
        // Radix 2 butterfly.
        float inp0_re = *inp1Ptr++;
        float inp0_im = *inp1Ptr--;
        float inp1_re = *inp2Ptr++;
        float inp1_im = *inp2Ptr--;
        float tw_re = *twPtr++;
        float tw_im = *twPtr--;
        twPtr += (2 * groupNum);
        float inp1_tw_mult_re = inp1_re * tw_re - inp1_im * tw_im;
        float inp1_tw_mult_im = inp1_re * tw_im + inp1_im * tw_re;
        *inp1Ptr++ = inp0_re + inp1_tw_mult_re;
        *inp1Ptr++ = inp0_im + inp1_tw_mult_im;
        *inp2Ptr++ = inp0_re - inp1_tw_mult_re;
        *inp2Ptr++ = inp0_im - inp1_tw_mult_im;
      }
      inp1Ptr += 2 * groupButterNum;
      inp2Ptr += 2 * groupButterNum;
    }
    // Update parameters for next stage.
    groupNum >>= 1;
    groupButterNum <<= 1;
  }
}

/// FFT Radix2 DIT (Decimation In Time) implementation for Real data.
/// The implementation uses an fftLength/2 FFT for Complex data followed
/// by a step to map the complex FFT to the real FFT by using a set
/// of complex weights \p complexToRealWeights A[k] defined as:
///   A[k] = 1/2 * (1 - j*exp(-j*2*pi*k/N)) for k = 0 .. N/4-1
/// The \p input buffer has \p fftLength float values corresponding
/// to \p fftLength real samples. Since the FFT of a real signal
/// has conjugate symmetry, the \p output buffer only contains
/// 2 * (fftLength/2+1) = fftLength + 2 float values corresponding
/// to fftLength/2+1 complex samples with real and imaginary parts
/// interleaved: real[0], imag[0], real[1], imag[1], ...
/// The lookup tables \p twiddleFactors and \p bitReverseIndices are
/// generated at compile time as if they were generated for an N/2
/// complex FFT. The boolean flag \p inPlace decides whether the FFT
/// computation is done in-place (that is, in the \p input buffer
/// without writing in the \p output buffer) or out-of-place (written in
/// the \p output buffer).
void libjit_fft_real_f(float *output, float *input,
                       const float *twiddleFactors,
                       const int32_t *bitReverseIndices,
                       const float *complexToRealWeights, unsigned fftLength,
                       bool inPlace) {
  // Perform N/2 complex FFT (in-place or out-of-place).
  // G[k] with k = 0 .. N/2-1.
  libjit_fft_complex_f(output, input, twiddleFactors, bitReverseIndices,
                       fftLength / 2, inPlace);

  // Complex to Real FFT mapping (in-place).
  //   X[k] = G[k] * A[k] + conj(G[N/2-k]) * (1 - A[k])
  // for k = 0 .. N/2 with the convention G[N/2] = G[0].
  // Particular cases:
  //   real(X[0]) = real(G[0]) + imag(G[0])
  //   imag(X[0]) = 0
  //   real(X[N/2]) = real(G[0]) - imag(G[0])
  //   imag(X[N/2]) = 0
  //   X[N/4] = conj(G[N/4])
  const float *Ak = complexToRealWeights + 2;
  float *ptr = inPlace ?
input : output; float *ptr0 = &ptr[0]; float *ptr1 = &ptr[2 * fftLength / 2 + 1]; float inp0_re = *ptr0++; float inp0_im = *ptr0--; *ptr0++ = inp0_re + inp0_im; *ptr0++ = 0; *ptr1-- = 0; *ptr1-- = inp0_re - inp0_im; for (dim_t k = 1; k < fftLength / 4; k++) { float inp0_re = *ptr0++; float inp0_im = *ptr0--; float inp1_im = *ptr1--; float inp1_re = *ptr1++; float Ak_re = *Ak++; float Ak_im = *Ak++; float dif_re = inp0_re - inp1_re; float sum_im = inp0_im + inp1_im; float prod0 = dif_re * Ak_re - sum_im * Ak_im; float prod1 = dif_re * Ak_im + sum_im * Ak_re; *ptr0++ = +prod0 + inp1_re; *ptr0++ = +prod1 - inp1_im; *ptr1-- = +prod1 - inp0_im; *ptr1-- = -prod0 + inp0_re; } if (fftLength >= 4) { *ptr1 = -*ptr1; } } /// Compute the spectrogram for the given 1D mono audio signal \p input. /// The input windows are weighted using the \p window function and the /// FFT LUTs \p twiddleFactors and \p bitReverseIndices are computed at /// compile-time. More details in Graph.h about the AudioSpectrogram node. void libjit_audio_spectrogram_f( void *winOutScratch, void *fftOutScratch, float *spectrogram, const float *input, const float *window, const float *twiddleFactors, const int32_t *bitReverseIndices, const float *complexToRealWeights, const dim_t *spectrogramDims, const dim_t inputLength, const dim_t windowSize, const dim_t windowStride, const bool magnitudeSquared) { dim_t winNum = spectrogramDims[0]; dim_t specLen = spectrogramDims[1]; dim_t fftLen = (specLen - 1) * 2; // Scratch buffers. float *winOut = (float *)winOutScratch; float *fftOut = (float *)fftOutScratch; memset(winOut, 0, fftLen * sizeof(float)); // Compute the spectrogram. for (dim_t winIdx = 0; winIdx < winNum; winIdx++) { // Windowing. for (dim_t n = 0; n < windowSize; n++) { winOut[n] = input[winIdx * windowStride + n] * window[n]; } // Compute spectrum (perform FFT for real data). libjit_fft_real_f(fftOut, winOut, twiddleFactors, bitReverseIndices, complexToRealWeights, fftLen, false /* inPlace */); // Compute spectrum magnitude/power. for (dim_t k = 0; k < specLen; k++) { float real = fftOut[2 * k + 0]; float imag = fftOut[2 * k + 1]; float power = real * real + imag * imag; if (magnitudeSquared) { *spectrogram++ = power; } else { *spectrogram++ = std::sqrt(power); } } } } /// Compute the MFCC (Mel Frequency Cepstral Coefficient) for the given /// \p spectrogram power. The lookup tables \p melWeights, \p melRanges /// and \p dctMat are computed at compile-time. More details in Graph.h /// about the MFCC node. void libjit_mfcc_f(void *scratch, float *coefficients, const float *spectrogram, const float *melWeights, const int32_t *melRanges, const float *dctMat, const dim_t *coefficientsDims, const dim_t *spectrogramDims, const dim_t filterBankCount) { // Scratch buffer. float *melBuff = (float *)scratch; // Perform MFCC for all the windows. dim_t winNum = spectrogramDims[0]; dim_t winSize = spectrogramDims[1]; dim_t numCoefficients = coefficientsDims[1]; for (dim_t winIdx = 0; winIdx < winNum; winIdx++) { // Pointers backup for this window. const float *melWeightsPtr = melWeights; const int32_t *melRangesPtr = melRanges; const float *dctMatPtr = dctMat; // Apply Mel filter bank mapping. We use sqrt for the spectrogram since we // assume the spectrogram is a power value and not a magnitude. for (dim_t melIdx = 0; melIdx < filterBankCount; melIdx++) { int32_t freqIdxStart = *melRangesPtr++; int32_t freqIdxStop = *melRangesPtr++; // Compute Mel Power. 
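      // Each filter accumulates sqrt(spectrogram power) over its
      // [freqIdxStart, freqIdxStop] frequency bins, weighted by consecutive
      // melWeights entries, before the logarithm is applied below.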
      float melPwr = 0.0f;
      for (int32_t freqIdx = freqIdxStart; freqIdx <= freqIdxStop; freqIdx++) {
        melPwr += std::sqrt(spectrogram[freqIdx]) * (*melWeightsPtr++);
      }
      // Take the logarithm in-place (avoid log(0)).
      melBuff[melIdx] = (melPwr == 0.0)
                            ? logf(std::numeric_limits<float>::min())
                            : logf(melPwr);
    }
    // Compute the DCT transform.
    for (dim_t k = 0; k < numCoefficients; k++) {
      float dctOut = 0.0f;
      for (dim_t n = 0; n < filterBankCount; n++) {
        dctOut += (*dctMatPtr++) * melBuff[n];
      }
      *coefficients++ = dctOut;
    }
    // Go to the next spectrogram window.
    spectrogram += winSize;
  }
}
} // extern "C"
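// Illustrative, compile-guarded usage sketch (the LIBJIT_EXAMPLE_MAIN guard
// and the buffer names are hypothetical, not part of the library): shows how
// a host-side caller might exercise libjit_softmax_f on a 2x3 row-major
// input and print the normalized rows.
#ifdef LIBJIT_EXAMPLE_MAIN
int main() {
  const dim_t dims[2] = {2, 3};
  float in[6] = {1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f};
  float out[6];
  libjit_softmax_f(in, out, dims, dims);
  for (dim_t n = 0; n < dims[0]; n++) {
    for (dim_t i = 0; i < dims[1]; i++) {
      printf("%.3f ", out[libjit_getXY(dims, n, i)]);
    }
    printf("\n");
  }
  return 0;
}
#endif // LIBJIT_EXAMPLE_MAIN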