1 /*
2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <math.h>
14
15 #include "aom_dsp/aom_dsp_common.h"
16 #include "av1/common/av1_common_int.h"
17 #include "av1/encoder/cnn.h"
18
// Clamp index 'a' into the valid range [0, hi - 1].
// NOTE(review): 'a' and 'hi' are evaluated more than once; do not pass
// expressions with side effects.
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))

// Argument bundle describing one worker thread's share of a layer's
// convolution (see convolve_layer / convolve_layer_mt).
typedef struct {
  const float **input;                   // per-channel input planes
  int in_width;                          // input width in samples
  int in_height;                         // input height in samples
  int in_stride;                         // row stride of each input plane
  const CNN_LAYER_CONFIG *layer_config;  // layer being convolved
  float **output;                        // per-channel output planes
  int out_stride;                        // row stride of each output plane
  int start_idx;                         // first output channel for this thread
  int th_step;                           // channel stride (number of threads)
} CONVOLVE_OPS;

// Pointwise activation applied to each output sample.
typedef float (*activation_fn)(float);
34
softsign(float x)35 static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); }
36
relu(float x)37 static float relu(float x) { return (x < 0) ? 0 : x; }
38
identity(float x)39 static float identity(float x) { return x; }
40
// A multi-channel 2D tensor. All channel planes live in one contiguous
// allocation owned by buf[0]; buf[1..channels-1] point into it. When
// allocsize == 0 the buffers are borrowed (see assign_tensor) and must
// not be freed here.
typedef struct {
  int allocsize;  // floats allocated in buf[0]; 0 when buffers are not owned
  int channels;   // number of channel planes in use
  int width, height, stride;  // plane dimensions; stride is the row pitch
  float *buf[CNN_MAX_CHANNELS];  // per-channel plane pointers
} TENSOR;
47
init_tensor(TENSOR * tensor)48 static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }
49
// Release the backing allocation (owned by buf[0]) if the tensor owns one.
static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize == 0) return;
  aom_free(tensor->buf[0]);
  tensor->buf[0] = NULL;
  tensor->allocsize = 0;
}
57
// (Re)allocate 'tensor' to hold channels * width * height floats and set
// up the per-channel plane pointers. The existing allocation is reused
// when it is already large enough. Planes are packed contiguously, so
// stride is set to width.
static void realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int newallocsize = channels * width * height;
  if (tensor->allocsize < newallocsize) {
    free_tensor(tensor);
    tensor->buf[0] =
        (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
    // An aom_malloc failure would make the plane pointers computed below
    // invalid; fail loudly rather than dereferencing NULL later.
    assert(tensor->buf[0]);
    tensor->allocsize = newallocsize;
  }
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;
  tensor->channels = channels;
  for (int c = 1; c < channels; ++c)
    tensor->buf[c] = &tensor->buf[0][c * width * height];
}
74
// Copy 'copy_channels' planes from src into dst, starting at dst channel
// 'dst_offset'. Spatial dimensions must already match. Uses one memcpy per
// plane when both layouts are contiguous, otherwise copies row by row.
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);
  const int contiguous =
      src->stride == dst->width && dst->stride == dst->width;
  for (int c = 0; c < copy_channels; ++c) {
    float *dst_plane = dst->buf[dst_offset + c];
    const float *src_plane = src->buf[c];
    if (contiguous) {
      memcpy(dst_plane, src_plane,
             sizeof(*dst_plane) * src->width * src->height);
    } else {
      for (int r = 0; r < dst->height; ++r) {
        memcpy(&dst_plane[r * dst->stride], &src_plane[r * src->stride],
               dst->width * sizeof(*dst_plane));
      }
    }
  }
}
95
// Point 'tensor' at caller-owned plane buffers without taking ownership
// (allocsize stays 0 so free_tensor() will not free them). A NULL 'buf'
// clears all plane pointers.
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  for (int c = 0; c < channels; ++c) {
    tensor->buf[c] = buf ? buf[c] : NULL;
  }
}
109
// Exchange the contents of two tensors (shallow struct swap; buffer
// ownership moves with the structs).
static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  const TENSOR tmp = *t1;
  *t1 = *t2;
  *t2 = tmp;
}
115
116 // The concatenated tensor goes into dst with first the channels in
117 // original dst followed by the channels in the src
concat_tensor(const TENSOR * src,TENSOR * dst)118 static void concat_tensor(const TENSOR *src, TENSOR *dst) {
119 assert(src->width == dst->width);
120 assert(src->height == dst->height);
121
122 const int dst_channels = dst->channels;
123 const int channels = dst->channels + src->channels;
124 const int newallocsize = channels * dst->width * dst->height;
125 if (dst->allocsize < newallocsize) {
126 TENSOR t;
127 init_tensor(&t);
128 // allocate new buffers and copy first the dst channels
129 realloc_tensor(&t, channels, dst->width, dst->height);
130 copy_tensor(dst, dst->channels, 0, &t);
131 // Swap the tensors and free the old buffers
132 swap_tensor(dst, &t);
133 free_tensor(&t);
134 }
135 for (int c = 1; c < channels; ++c)
136 dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
137 // Copy the channels in src after the first dst_channels channels.
138 copy_tensor(src, src->channels, dst_channels, dst);
139 }
140
// Returns 1 iff the two tensors have identical spatial dimensions.
int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
144
// Returns 1 iff the two tensors match in channel count, width, and height.
int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  if (t1->channels != t2->channels) return 0;
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
149
av1_find_cnn_layer_output_size(int in_width,int in_height,const CNN_LAYER_CONFIG * layer_config,int * out_width,int * out_height)150 void av1_find_cnn_layer_output_size(int in_width, int in_height,
151 const CNN_LAYER_CONFIG *layer_config,
152 int *out_width, int *out_height) {
153 if (!layer_config->deconvolve) {
154 switch (layer_config->pad) {
155 case PADDING_SAME_ZERO:
156 case PADDING_SAME_REPLICATE:
157 *out_width = (in_width + layer_config->skip_width - 1) /
158 layer_config->skip_width;
159 *out_height = (in_height + layer_config->skip_height - 1) /
160 layer_config->skip_height;
161 break;
162 case PADDING_VALID:
163 *out_width =
164 (in_width - layer_config->filter_width + layer_config->skip_width) /
165 layer_config->skip_width;
166 *out_height = (in_height - layer_config->filter_height +
167 layer_config->skip_height) /
168 layer_config->skip_height;
169 break;
170 default: assert(0 && "Unknown padding type");
171 }
172 } else {
173 switch (layer_config->pad) {
174 case PADDING_SAME_ZERO:
175 case PADDING_SAME_REPLICATE:
176 *out_width = in_width * layer_config->skip_width;
177 *out_height = in_height * layer_config->skip_height;
178 break;
179 case PADDING_VALID:
180 *out_width = (in_width - 1) * layer_config->skip_width +
181 layer_config->filter_width;
182 *out_height = (in_height - 1) * layer_config->skip_height +
183 layer_config->filter_height;
184 break;
185 default: assert(0 && "Unknown padding type");
186 }
187 }
188 }
189
find_cnn_out_channels(const CNN_LAYER_CONFIG * layer_config,int channels_per_branch[])190 void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
191 int channels_per_branch[]) {
192 int branch = layer_config->branch;
193 const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
194 for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
195 if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
196 if (layer_config->branch_copy_type == BRANCH_INPUT) {
197 channels_per_branch[b] = layer_config->in_channels;
198 } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
199 channels_per_branch[b] = layer_config->out_channels;
200 } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
201 channels_per_branch[b] = layer_config->out_channels;
202 for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
203 if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
204 assert(channels_per_branch[c] > 0);
205 channels_per_branch[b] += channels_per_branch[c];
206 }
207 }
208 }
209 }
210 }
211 channels_per_branch[branch] = layer_config->out_channels;
212 for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
213 if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
214 assert(channels_per_branch[c] > 0);
215 channels_per_branch[branch] += channels_per_branch[c];
216 }
217 }
218 }
219
220 #if CONFIG_DEBUG
cnn_has_at_least_one_output(const CNN_CONFIG * cnn_config)221 static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
222 const int num_layers = cnn_config->num_layers;
223 const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
224
225 for (int idx = 0; idx < num_layers; idx++) {
226 if (layer_configs[idx].output_num != -1) {
227 return 1;
228 }
229 }
230 return 0;
231 }
232 #endif
233
// For every output layer of the CNN, compute the output width, height and
// channel count for the given input dimensions. Walks the layers in order,
// tracking each branch's current intermediate size and honoring branch
// copies made before (BRANCH_INPUT) and after (BRANCH_OUTPUT) each layer.
// out_width/out_height/out_channels are indexed by the layer's output_num.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
  int i_width[CNN_MAX_BRANCHES] = { 0 };
  int i_height[CNN_MAX_BRANCHES] = { 0 };
  // Branch 0 starts from the (possibly border-extended) input image.
  i_width[0] = in_width + cnn_config->ext_width * 2;
  i_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  for (int i = 0; i < cnn_config->num_layers; ++i) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
    const int branch = layer_config->branch;
    int o_width = 0, o_height = 0;

    // Input copied to other branches before this layer's convolution:
    // they inherit this branch's current input size.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          assert(i_width[branch] > 0 && i_height[branch] > 0);
          i_width[b] = i_width[branch];
          i_height[b] = i_height[branch];
        }
      }
    }

    av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
                                   layer_config, &o_width, &o_height);
    i_width[branch] = o_width;
    i_height[branch] = o_height;

    // Output copied to other branches after this layer's convolution.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
          i_width[b] = o_width;
          i_height[b] = o_height;
        }
      }
    }

    // Keep the per-branch channel bookkeeping in sync with this layer.
    find_cnn_out_channels(layer_config, channels_per_branch);

    const int output_num = layer_config->output_num;
    if (output_num != -1) {  // Current layer is an output layer
      out_width[output_num] = o_width;
      out_height[output_num] = o_height;
      out_channels[output_num] = channels_per_branch[layer_config->branch];
    }
  }
}
287
get_activation(ACTIVATION layer_activation)288 activation_fn get_activation(ACTIVATION layer_activation) {
289 switch (layer_activation) {
290 case NONE: return identity;
291 case RELU: return relu;
292 case SOFTSIGN: return softsign;
293 case SIGMOID:
294 assert(0 && "Sigmoid has not been supported in CNN."); // TO DO
295 return NULL;
296 default: assert(0 && "Unknown activation type"); return NULL;
297 }
298 }
299
get_start_shift_convolve(int width,int filt_width,int stride)300 static INLINE int get_start_shift_convolve(int width, int filt_width,
301 int stride) {
302 const int mod = (width % stride);
303 const int filt_off = (filt_width - 1) / 2;
304 const int dif = (mod ? mod - 1 : stride - 1);
305 return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
306 }
307
// Element-wise accumulate: output[c] += add[c] over a width x height
// region of every channel plane, honoring the shared row stride.
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int c = 0; c < channels; ++c) {
    float *out_plane = output[c];
    const float *add_plane = add[c];
    for (int i = 0; i < height; ++i) {
      const int row = i * stride;
      for (int j = 0; j < width; ++j) out_plane[row + j] += add_plane[row + j];
    }
  }
}
316
// Apply the layer's activation function in place to every sample of every
// channel plane.
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  const activation_fn activation = get_activation(layer_activation);
  for (int c = 0; c < channels; ++c) {
    float *plane = output[c];
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        const int idx = i * stride + j;
        plane[idx] = activation(plane[idx]);
      }
    }
  }
}
326
// Copy the current layer's active tensor into the output tensor of every
// branch selected by input_to_branches (except the layer's own branch).
// That output then serves as the input of the branch's first layer.
static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    const int selected = (branch_config->input_to_branches >> b) & 1;
    if (!selected || b == branch) continue;
    // A positive channels_to_copy restricts the copy; otherwise copy all
    // channels of the active tensor.
    const int copy_channels = branch_config->channels_to_copy > 0
                                  ? branch_config->channels_to_copy
                                  : layer_active_tensor->channels;
    realloc_tensor(&branch_output[b], copy_channels,
                   layer_active_tensor->width, layer_active_tensor->height);
    copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
  }
}
345
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_ZERO.
static void convolve_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    // (u, v) indexes the output sample; (h, w) is the top-left corner of
    // the corresponding skip_height x skip_width pooling window.
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Convolve at every position inside the pooling window and keep
        // the maximum result (convolution and maxpool fused).
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              // Consecutive taps of the (k, i) filter are cstep apart in
              // the weights array.
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l - filter_height_half;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m - filter_width_half;
                  // Zero padding: taps outside the image contribute 0.
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes; later positions take max.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
387
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
static void convolve_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep, const int filter_width_half,
    const int filter_height_half) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    // (u, v) indexes the output sample; (h, w) is the top-left corner of
    // its pooling window in the input.
    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
        // Convolve at every position inside the window; keep the maximum.
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              // Consecutive taps of the (k, i) filter are cstep apart.
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // Replicate padding: clamp tap coordinates to the edge.
                const int ii =
                    CLAMPINDEX(hh + l - filter_height_half, in_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj =
                      CLAMPINDEX(ww + m - filter_width_half, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes; later positions take max.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
430
// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
// greater than 1 and padding equal to PADDING_VALID.
static void convolve_maxpool_padding_valid(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    const int cstep) {
  for (int i = 0; i < layer_config->out_channels; ++i) {
    // Valid padding: only positions where the whole filter fits are
    // visited, hence the "- filter + 1" loop bounds.
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++v) {
        // Convolve at every position inside the pooling window; keep the
        // maximum result.
        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
             ++hh) {
          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
               ++ww) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              // Consecutive taps of the (k, i) filter are cstep apart.
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int ii = hh + l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int jj = ww + m;
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            const float a = sum;
            // First window position initializes; later positions take max.
            if (h == hh && w == ww)
              output[i][u * out_stride + v] = a;
            else
              output[i][u * out_stride + v] =
                  AOMMAX(output[i][u * out_stride + v], a);
          }
        }
      }
    }
  }
}
472
// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
// equal to 1.
static void convolve_element_wise(const float **input, int in_width,
                                  int in_height, int in_stride,
                                  const CNN_LAYER_CONFIG *const layer_config,
                                  float **output, int out_stride, int start_idx,
                                  int step) {
  // 1x1 convolution: each output sample is a weighted sum over the input
  // channels at the same spatial position. Unlike the other kernels, the
  // multi-threaded split here is over output COLUMNS: this thread handles
  // columns start_idx, start_idx + step, ...
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) +
      start_idx * layer_config->skip_width;
  const int out_w_step = AOMMAX(step, 1);
  const int in_w_step = layer_config->skip_width * out_w_step;
  for (int i = 0; i < layer_config->out_channels; ++i) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int in_h = h * in_stride;
      const int out_h = u * out_stride + start_idx;
      for (int w = start_w, out_index = out_h; w < in_width;
           w += in_w_step, out_index += out_w_step) {
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          sum += layer_config->weights[k * layer_config->out_channels + i] *
                 input[k][in_h + w];
        }
        output[i][out_index] = sum;
      }
    }
  }
}
505
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_ZERO.
static void convolve_no_maxpool_padding_zero(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int filter_width_half,
    const int filter_height_half, const int ii_shift, const int jj_shift,
    const int channel_step) {
  const int start_h = get_start_shift_convolve(
      in_height, layer_config->filter_height, layer_config->skip_height);
  const int start_w = get_start_shift_convolve(
      in_width, layer_config->filter_width, layer_config->skip_width);
  const int end_ii_shift = filter_height_half + 1;
  const int end_jj_shift = filter_width_half + 1;
  // *_filter_margin stores the number of pixels along a dimension in the
  // intersection of the complement of the image in the extended image
  // and the filter.
  const int top_filter_margin = layer_config->filter_width * ii_shift;
  const int right_filter_margin = end_jj_shift - in_width;
  // Output channels are strided by channel_step starting at start_idx so
  // multiple threads can split the work.
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < in_height;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      // Weight offset skipping the taps that fall entirely above the
      // image (zero padding makes their contribution zero anyway).
      const int top_cstep =
          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
              cstep +
          i;
      const int start_ii = AOMMAX(0, h - ii_shift);
      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
      for (int w = start_w, out_index = out_h; w < in_width;
           w += layer_config->skip_width, ++out_index) {
        // Weight offsets for taps clipped off the left/right image edges.
        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
        const int start_jj = AOMMAX(0, w - jj_shift);
        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          int off = k * layer_config->out_channels + top_cstep;
          for (int ii = start_ii; ii < end_ii; ++ii) {
            off += left_cstep;
            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
            }
            off += right_cstep;
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
557
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_SAME_REPLICATE.
static void convolve_no_maxpool_padding_replicate(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
    const int channel_step) {
  // h and w are shifted to an offset coordinate system to reduce in-loop
  // computation.
  const int start_h =
      get_start_shift_convolve(in_height, layer_config->filter_height,
                               layer_config->skip_height) -
      ii_shift;
  const int start_w =
      get_start_shift_convolve(in_width, layer_config->filter_width,
                               layer_config->skip_width) -
      jj_shift;
  const int end_h = in_height - ii_shift;
  const int end_w = in_width - jj_shift;
  // Output channels are split across threads via start_idx/channel_step.
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    for (int h = start_h, u = 0; h < end_h;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = start_w, out_index = out_h; w < end_w;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          // Consecutive taps of the (k, i) filter are cstep apart.
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            // Replicate padding: clamp tap coordinates to the image edge.
            const int clamped_ii = CLAMPINDEX(ii, in_height);
            for (int jj = w; jj < upper_jj_index; ++jj) {
              const int clamped_jj = CLAMPINDEX(jj, in_width);
              assert(clamped_ii >= 0 && clamped_ii < in_height &&
                     clamped_jj >= 0 && clamped_jj < in_width);
              sum += layer_config->weights[off] *
                     input[k][clamped_ii * in_stride + clamped_jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
605
// CNNConvolve specific to maxpool set as 0 and padding equal to
// PADDING_VALID.
void av1_cnn_convolve_no_maxpool_padding_valid_c(
    const float **input, int in_width, int in_height, int in_stride,
    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
    int start_idx, int cstep, int channel_step) {
  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
         !layer_config->maxpool);
  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
  assert(layer_config->pad == PADDING_VALID);
  // Output channels are split across threads via start_idx/channel_step.
  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
    // Valid padding: iterate only where the entire filter fits.
    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
         h += layer_config->skip_height, ++u) {
      const int out_h = u * out_stride;
      const int upper_ii_index = layer_config->filter_height + h;
      for (int w = 0, out_index = out_h;
           w < in_width - layer_config->filter_width + 1;
           w += layer_config->skip_width, ++out_index) {
        const int upper_jj_index = layer_config->filter_width + w;
        float sum = layer_config->bias[i];
        for (int k = 0; k < layer_config->in_channels; ++k) {
          // Consecutive taps of the (k, i) filter are cstep apart.
          int off = k * layer_config->out_channels + i;
          for (int ii = h; ii < upper_ii_index; ++ii) {
            for (int jj = w; jj < upper_jj_index; ++jj) {
              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
              off += cstep;
            }
          }
        }
        output[i][out_index] = sum;
      }
    }
  }
}
641
// Dispatch a (non-deconvolve) convolution to the specialized kernel
// matching the layer's maxpool flag, padding mode, and filter size.
// start_idx/step partition the output work across worker threads.
static void av1_cnn_convolve(const float **input, int in_width, int in_height,
                             int in_stride,
                             const CNN_LAYER_CONFIG *layer_config,
                             float **output, int out_stride, int start_idx,
                             int step) {
  assert(!layer_config->deconvolve);
  // Distance (in weights[] entries) between consecutive taps of one filter.
  const int cstep = layer_config->in_channels * layer_config->out_channels;
  const int filter_height_half = layer_config->filter_height >> 1;
  const int filter_width_half = layer_config->filter_width >> 1;
  const int channel_step = AOMMAX(step, 1);

  if (layer_config->maxpool &&
      (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
                                      layer_config, output, out_stride, cstep,
                                      filter_width_half, filter_height_half);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, cstep, filter_width_half, filter_height_half);
        break;
      case PADDING_VALID:
        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
                                       layer_config, output, out_stride, cstep);
        break;
      default: assert(0 && "Unknown padding type");
    }
  } else {
    // Results in element-wise matrix multiplication.
    if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
      convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
                            output, out_stride, start_idx, step);
      return;
    }
    // Centering offsets accounting for even-sized filters.
    const int ii_shift =
        filter_height_half - (layer_config->filter_height - 1) % 2;
    const int jj_shift =
        filter_width_half - (layer_config->filter_width - 1) % 2;
    switch (layer_config->pad) {
      case PADDING_SAME_ZERO:
        convolve_no_maxpool_padding_zero(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, filter_width_half, filter_height_half,
            ii_shift, jj_shift, channel_step);
        break;
      case PADDING_SAME_REPLICATE:
        convolve_no_maxpool_padding_replicate(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
        break;
      case PADDING_VALID:
        av1_cnn_convolve_no_maxpool_padding_valid(
            input, in_width, in_height, in_stride, layer_config, output,
            out_stride, start_idx, cstep, channel_step);
        break;
      default: assert(0 && "Unknown padding type");
    }
  }
}
704
convolve_layer(void * arg1,void * arg2)705 static int convolve_layer(void *arg1, void *arg2) {
706 const CONVOLVE_OPS *convolve_ops = arg1;
707 (void)arg2;
708 av1_cnn_convolve(
709 convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
710 convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
711 convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
712 return 1;
713 }
714
// Run one layer's convolution on multiple workers. Worker th handles
// output channels th, th + num_workers, ... (via start_idx/th_step in
// CONVOLVE_OPS). All but the last worker are launched asynchronously; the
// last runs synchronously on the calling thread, then all are synced.
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = thread_data->num_workers;

  // Per-worker argument structs must outlive the launched jobs, so they
  // live here until the sync loop below completes.
  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input, in_width, in_height,
                                 in_stride, layer_config, output,
                                 out_stride, th, num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
749
get_start_shift_deconvolve(int filt_width,int stride)750 static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
751 const int dif = AOMMAX(filt_width - stride, 0);
752 return dif / 2;
753 }
754
// Apply batch normalization in place, channel by channel:
//   x = gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]
// for every sample of a width x height plane with the given row stride.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // Bug fix: the original assert checked 'beta' twice and never 'mean'.
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;  // advance to the next row of this plane
    }
  }
}
775
// Reference transposed-convolution (deconvolution/upsampling) implementation.
// For every output channel i and output position (u, v), accumulates bias[i]
// plus weight * input over all input channels k and filter taps (l, m) that
// map back onto a valid input sample. The three padding modes differ only in
// how out-of-range source positions are treated.
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  // Stride in the weights array between successive filter taps for a fixed
  // (input channel, output channel) pair: weights are stored tap-major with
  // all in_channels * out_channels combinations interleaved per tap.
  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
                                 &out_height);
  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      // Out-of-range source positions contribute zero (simply skipped).
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                // h is the (scaled) source row this tap reads from, shifted
                // to keep the filter centered for SAME padding.
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  // Only positions on the upsampling grid have a source
                  // sample. NOTE: for negative h/w, C's remainder can be
                  // negative and nonzero, which also (correctly) skips.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  // Map back to input coordinates and bounds-check.
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      // Out-of-range source positions are clamped to the nearest edge sample
      // (border replication) instead of being skipped.
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h =
                    u - l +
                    get_start_shift_deconvolve(layer_config->filter_height,
                                               layer_config->skip_height);
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w =
                      v - m +
                      get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);
                  // Taps not on the upsampling grid have no source sample.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  // Clamp to [0, in_height) / [0, in_width) to replicate the
                  // border sample for out-of-range positions.
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      // No centering shift is applied; taps that fall outside the input are
      // skipped.
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  // Same grid-alignment and bounds checks as above, without
                  // the SAME-padding shift.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
891
// Runs the full CNN described by 'cnn_config' on the input planes and writes
// every layer marked as an output (layer_config->output_num != -1) into the
// corresponding buffer of 'output_struct'. Per branch, tensor1[] holds the
// current layer's input and tensor2[] receives its output; the two are
// swapped between layers so a layer's output becomes the next layer's input.
void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };

  // Compute the start of each output's channel-pointer array: the per-output
  // channel pointers are laid out back-to-back in output_buffer.
  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {  // First layer
      assert(branch == 0);  // First layer must be primary branch
      // The first layer reads directly from the caller's input planes; no
      // copy is made.
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2 so the previous layer's output becomes this
      // layer's input.
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
                                   &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      // Intermediate result: (re)use internally-allocated storage.
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      // Final result: write straight into the caller-provided output buffer.
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    // Optionally fan the layer *input* out to other branches before the
    // convolution runs.
    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      // Multi-threaded path only when more than one worker is configured.
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    // Optionally fan the layer *output* out to other branches, before any
    // combining/activation below.
    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity
    if (layer_config->activation != IDENTITY)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    // Batch normalization, applied only when the layer provides parameters.
    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        // Internal storage: concat_tensor grows tensor2[branch] as needed.
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        // Writing into the caller's buffer: first widen tensor2[branch] to
        // cover the combined channel count, then copy the extra branches'
        // channels in after the existing ones.
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    // Optionally fan the *combined* result out to other branches.
    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  // Release all internally-allocated tensor storage. Tensors assigned to
  // caller buffers have allocsize == 0, so free_tensor leaves them alone.
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}
1058
1059 // Assume output already has proper allocation
1060 // Assume input image buffers all have same resolution and strides
// Converts the 8-bit image planes in 'dgd' to normalized floats (with border
// extension of ext_width/ext_height samples) and runs the CNN, writing the
// results into 'output'. Output buffers must already be allocated; all input
// planes share the same resolution and stride.
void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;  // 8-bit input: normalize samples to [0, 1].

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  // One contiguous allocation backs all input channel planes.
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  // Fixed: bail out on allocation failure instead of dereferencing NULL in
  // the conversion loops below.
  if (!input_) return;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    // 'input' points at the interior (non-extension) origin of the plane.
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      // Only samples inside (width x height) may be read from dgd; the
      // extension area is synthesized by edge replication.
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom by copying whole (already-extended) rows
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      // The caller guarantees dgd is readable across the extension area, so
      // read the extended rectangle directly.
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}
1112
1113 // Assume output already has proper allocation
1114 // Assume input image buffers all have same resolution and strides
// High-bitdepth variant of av1_cnn_predict_img_multi_out: converts the
// 16-bit image planes in 'dgd' to floats normalized by the bit depth (with
// border extension) and runs the CNN, writing results into 'output'. Output
// buffers must already be allocated; all planes share resolution and stride.
void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  // Normalize samples to [0, 1] for the given bit depth.
  const float max_val = (float)((1 << bit_depth) - 1);

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  // One contiguous allocation backs all input channel planes.
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  // Fixed: bail out on allocation failure instead of dereferencing NULL in
  // the conversion loops below.
  if (!input_) return;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    // 'input' points at the interior (non-extension) origin of the plane.
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      // Only samples inside (width x height) may be read from dgd; the
      // extension area is synthesized by edge replication.
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom by copying whole (already-extended) rows
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      // The caller guarantees dgd is readable across the extension area, so
      // read the extended rectangle directly.
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }

  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}
1169
1170 // Assume output already has proper allocation
1171 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img(uint8_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,float ** output,int out_stride)1172 void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
1173 const CNN_CONFIG *cnn_config,
1174 const CNN_THREAD_DATA *thread_data, float **output,
1175 int out_stride) {
1176 int out_width = 0, out_height = 0, out_channels = 0;
1177 av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
1178 &out_channels);
1179 const int output_chs[1] = { out_channels };
1180 const int output_strides[1] = { out_stride };
1181 CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
1182 .output_strides = output_strides,
1183 .output_buffer = output };
1184 av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
1185 thread_data, &output_struct);
1186 }
1187
1188 // Assume output already has proper allocation
1189 // Assume input image buffers all have same resolution and strides
av1_cnn_predict_img_highbd(uint16_t ** dgd,int width,int height,int stride,const CNN_CONFIG * cnn_config,const CNN_THREAD_DATA * thread_data,int bit_depth,float ** output,int out_stride)1190 void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
1191 int stride, const CNN_CONFIG *cnn_config,
1192 const CNN_THREAD_DATA *thread_data,
1193 int bit_depth, float **output, int out_stride) {
1194 int out_width = 0, out_height = 0, out_channels = 0;
1195 av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
1196 &out_channels);
1197 const int output_chs[1] = { out_channels };
1198 const int output_strides[1] = { out_stride };
1199 CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
1200 .output_strides = output_strides,
1201 .output_buffer = output };
1202 av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
1203 thread_data, bit_depth, &output_struct);
1204 }
1205