/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/av1_common_int.h"
#include "av1/encoder/cnn.h"

// Clamps index a to the valid range [0, hi - 1].
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))

typedef struct {
  const float **input;
  int in_width;
  int in_height;
  int in_stride;
  const CNN_LAYER_CONFIG *layer_config;
  float **output;
  int out_stride;
  int start_idx;
  int th_step;
} CONVOLVE_OPS;

typedef float (*activation_fn)(float);

static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); }

static float relu(float x) { return (x < 0) ? 0 : x; }

static float identity(float x) { return x; }

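// Worked example (illustrative, not part of the library): for x = 3.0f,
// softsign(x) = 3 / (|3| + 1) = 0.75, squashing values into (-1, 1), while
// relu(-3.0f) = 0 and relu(3.0f) = 3. identity() is the no-op used when a
// layer's activation is NONE.
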
typedef struct {
  int allocsize;
  int channels;
  int width, height, stride;
  float *buf[CNN_MAX_CHANNELS];
} TENSOR;

static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }

static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize) {
    aom_free(tensor->buf[0]);
    tensor->buf[0] = NULL;
    tensor->allocsize = 0;
  }
}

static void realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
}

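// Memory layout note (illustrative): a TENSOR that owns its storage holds one
// contiguous allocation of channels * width * height floats, with buf[c]
// pointing at the start of channel c's plane. For example, a 2-channel 4x3
// tensor is laid out as buf[0] -> floats [0, 12) and buf[1] -> floats
// [12, 24), with stride == width. Tensors set up via assign_tensor() instead
// borrow external buffers and may carry a larger stride.
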
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  if (src->stride == dst->width && dst->stride == dst->width) {
    for (int c = 0; c < copy_channels; ++c) {
      memcpy(dst->buf[dst_offset + c], src->buf[c],
             sizeof(*dst->buf[0]) * src->width * src->height);
    }
  } else {
    for (int c = 0; c < copy_channels; ++c) {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst->buf[dst_offset + c][r * dst->stride],
               &src->buf[c][r * src->stride],
               dst->width * sizeof(*dst->buf[c]));
      }
    }
  }
}

static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  if (buf) {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
  } else {
    for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
  }
}

static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  TENSOR t = *t1;
  *t1 = *t2;
  *t2 = t;
}

// Concatenates src into dst along the channel dimension: the original dst
// channels come first, followed by the channels of src.
static void concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the dst channels first.
    realloc_tensor(&t, channels, dst->width, dst->height);
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers.
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
}

int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  return (t1->width == t2->width && t1->height == t2->height);
}

int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  return (t1->channels == t2->channels && t1->width == t2->width &&
          t1->height == t2->height);
}

void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
  if (!layer_config->deconvolve) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = (in_width + layer_config->skip_width - 1) /
                     layer_config->skip_width;
        *out_height = (in_height + layer_config->skip_height - 1) /
                      layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width =
            (in_width - layer_config->filter_width + layer_config->skip_width) /
            layer_config->skip_width;
        *out_height = (in_height - layer_config->filter_height +
                       layer_config->skip_height) /
                      layer_config->skip_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
      case PADDING_SAME_REPLICATE:
        *out_width = in_width * layer_config->skip_width;
        *out_height = in_height * layer_config->skip_height;
        break;
      case PADDING_VALID:
        *out_width = (in_width - 1) * layer_config->skip_width +
                     layer_config->filter_width;
        *out_height = (in_height - 1) * layer_config->skip_height +
                      layer_config->filter_height;
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}

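// Worked example (illustrative): for a convolution with in_width = 10,
// filter_width = 3 and skip_width = 2, SAME padding gives
// out_width = ceil(10 / 2) = 5, while VALID padding gives
// out_width = (10 - 3 + 2) / 2 = 4. For a deconvolution with the same
// parameters, SAME padding gives 10 * 2 = 20 and VALID gives
// (10 - 1) * 2 + 3 = 21.
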
void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                           int channels_per_branch[]) {
  int branch = layer_config->branch;
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      if (layer_config->branch_copy_type == BRANCH_INPUT) {
        channels_per_branch[b] = layer_config->in_channels;
      } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
        channels_per_branch[b] = layer_config->out_channels;
      } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
        channels_per_branch[b] = layer_config->out_channels;
        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
          if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
            assert(channels_per_branch[c] > 0);
            channels_per_branch[b] += channels_per_branch[c];
          }
        }
      }
    }
  }
  channels_per_branch[branch] = layer_config->out_channels;
  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
    if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
      assert(channels_per_branch[c] > 0);
      channels_per_branch[branch] += channels_per_branch[c];
    }
  }
}

#if CONFIG_DEBUG
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const int num_layers = cnn_config->num_layers;
  const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;

  for (int idx = 0; idx < num_layers; idx++) {
    if (layer_configs[idx].output_num != -1) {
      return 1;
    }
  }
  return 0;
}
#endif

void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}

activation_fn get_activation(ACTIVATION layer_activation) {
  switch (layer_activation) {
    case NONE: return identity;
    case RELU: return relu;
    case SOFTSIGN: return softsign;
    case SIGMOID:
      assert(0 && "Sigmoid has not been supported in CNN.");  // TODO
      return NULL;
    default: assert(0 && "Unknown activation type"); return NULL;
  }
}

static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int mod = (width % stride);
  const int filt_off = (filt_width - 1) / 2;
  const int dif = (mod ? mod - 1 : stride - 1);
  return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
}

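// Worked example (illustrative): with width = 10, filt_width = 3 and
// stride = 2, mod = 0, filt_off = 1 and dif = 1, so the function returns
// AOMMIN((1 + 1) / 2, 1) = 1: sampling starts at offset 1, which centers
// the strided sample grid within the image.
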
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] += add[c][i * stride + j];
  }
}

void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  activation_fn activation = get_activation(layer_activation);
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height; ++i)
      for (int j = 0; j < width; ++j)
        output[c][i * stride + j] = activation(output[c][i * stride + j]);
  }
}

static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
      // Copy layer's active tensor to output tensor of branch b if set in
      // mask. The output becomes the input of the first layer of the branch
      // because the layer of the branch is not the first layer.
      int copy_channels = branch_config->channels_to_copy > 0
                              ? branch_config->channels_to_copy
                              : layer_active_tensor->channels;
      realloc_tensor(&branch_output[b], copy_channels,
                     layer_active_tensor->width, layer_active_tensor->height);
      copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
    }
  }
}

// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

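// Note (illustrative): the three convolve_maxpool_* variants fuse convolution
// with max-pooling. Each skip_width x skip_height window of convolution
// results is reduced to a single output value: the first (hh, ww) position in
// the window writes output[i][u * out_stride + v] directly and every later
// position max-reduces into it, so no intermediate full-resolution buffer is
// needed.
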
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}

// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
// equal to 1.
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}

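// Worked equation (illustrative): with a 1x1 filter the convolution reduces
// to an element-wise matrix multiply across channels. For each sampled pixel
// p and output channel i:
//   out[i][p] = bias[i] + sum_k weights[k * out_channels + i] * in[k][p]
// which is why convolve_element_wise() needs no inner filter-tap loops.
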
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}

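// Weight layout note (illustrative): weights are stored so that the entry for
// input channel k, output channel i and filter tap (l, m) lives at offset
//   (l * filter_width + m) * cstep + k * out_channels + i,
// where cstep = in_channels * out_channels. Advancing by cstep therefore
// steps through filter taps while holding (k, i) fixed, which is how the
// convolution loops above index the weights.
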
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}

static int convolve_layer(void *arg1, void *arg2) {
  const CONVOLVE_OPS *convolve_ops = arg1;
  (void)arg2;
  av1_cnn_convolve(
      convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
      convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
      convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
  return 1;
}

static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;

  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}

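// Threading note (illustrative): convolve_layer_mt() gives worker th the pair
// (start_idx = th, step = num_workers). Inside av1_cnn_convolve() this
// partitions the work cyclically: the element-wise path splits output columns
// and the general no-maxpool paths split output channels, so the workers
// write disjoint outputs and need no locking.
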
static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
  const int dif = AOMMAX(filt_width - stride, 0);
  return dif / 2;
}

void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}

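// Worked equation (illustrative): per channel, batch normalization applies
//   y = gamma * (x - mean) / std + beta
// with the per-channel parameters loaded once outside the pixel loops. Note
// that std is used directly as a divisor, i.e. it is the precomputed standard
// deviation rather than the variance, so no square root is taken at inference
// time.
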
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}

void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Copy the channels of branch b after those already in place.
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                thread_data, &output_struct);
}

// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride) {
  int out_width = 0, out_height = 0, out_channels = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  const int output_chs[1] = { out_channels };
  const int output_strides[1] = { out_stride };
  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
                                  .output_strides = output_strides,
                                  .output_buffer = output };
  av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
                                       thread_data, bit_depth, &output_struct);
}
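
// Usage sketch (illustrative only, not part of the library): running a CNN on
// a single 8-bit plane with a single output head. Assumes the network's first
// layer takes one input channel and that output_num 0 is its only output;
// buffer sizes come from av1_find_cnn_output_size(). Kept under #if 0 so it
// is never compiled.
#if 0
static void example_run_cnn(uint8_t *plane, int width, int height, int stride,
                            const CNN_CONFIG *cnn_config) {
  int out_w = 0, out_h = 0, out_ch = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &out_w, &out_h, &out_ch);
  float *out_buf =
      (float *)aom_malloc(sizeof(*out_buf) * out_w * out_h * out_ch);
  float *out_chs[CNN_MAX_CHANNELS];
  for (int c = 0; c < out_ch; ++c) out_chs[c] = out_buf + c * out_w * out_h;
  // Single-threaded: convolve_layer_mt() is bypassed when num_workers <= 1.
  CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
  av1_cnn_predict_img(&plane, width, height, stride, cnn_config, &thread_data,
                      out_chs, out_w);
  // ... consume out_chs[0 .. out_ch) here ...
  aom_free(out_buf);
}
#endif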