/**
 *
 *  Copyright 2016-2020 Netflix, Inc.
 *
 *     Licensed under the BSD+Patent License (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         https://opensource.org/licenses/BSDplusPatent
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
18 
19 #include <errno.h>
20 #include <math.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include <string.h>
24 
25 #include "feature/integer_motion.h"
26 #include "feature/common/alignment.h"
27 
28 #include <immintrin.h>
29 
x_convolution_16_avx512(const uint16_t * src,uint16_t * dst,unsigned width,unsigned height,ptrdiff_t src_stride,ptrdiff_t dst_stride)30 void x_convolution_16_avx512(const uint16_t *src, uint16_t *dst, unsigned width,
31                              unsigned height, ptrdiff_t src_stride,
32                              ptrdiff_t dst_stride)
33 {
34     const unsigned radius = filter_width / 2;
35     const unsigned left_edge = vmaf_ceiln(radius, 1);
36     const unsigned right_edge = vmaf_floorn(width - (filter_width - radius), 1);
37     const unsigned shift_add_round = 32768;
38     const unsigned vector_loop = (width>>5) -1;
39     uint16_t *tmpdst = dst;
40     uint16_t *src_p = (uint16_t*) src + (left_edge - radius);
41     unsigned nr = left_edge + 32 *vector_loop;
42     uint16_t *src_pt = (uint16_t*) src + nr -radius;
43     for (unsigned i = 0; i < height; ++i) {
44         for (unsigned j = 0; j < left_edge; j++) {
45             dst[i * dst_stride + j] =
46                 (edge_16(true, src, width, height, src_stride, i, j) +
47                  shift_add_round) >> 16;
48         }
49 
50     }
51 
52     for (unsigned i = 0; i < height; ++i) {
53         uint16_t *src_p1 = src_p;
54         for (unsigned j = 0; j <vector_loop; j=j+1) {
55             __m512i src1            = _mm512_loadu_si512 ((__m512i *)src_p1);
56             __m512i kernel1         = _mm512_set1_epi16( 3571);
57             __m512i kernel2         = _mm512_set1_epi16( 16004);
58             __m512i kernel3         = _mm512_set1_epi16( 26386) ;
59             __m512i result          = _mm512_mulhi_epu16(src1,kernel1);
60             __m512i resultlo        = _mm512_mullo_epi16(src1,kernel1);
61 
62             // src1= src1>>16; //shift by a  pixel
63             __m512i src2            = _mm512_loadu_si512 ((__m512i *)(src_p1+1));
64             __m512i result2         = _mm512_mulhi_epu16(src2,kernel2);
65             __m512i result2lo       = _mm512_mullo_epi16(src2,kernel2);
66             __m512i accum1_lo       = _mm512_unpacklo_epi16(resultlo, result);
67             __m512i accum1_hi       = _mm512_unpackhi_epi16(resultlo, result);
68             __m512i accum2_lo       = _mm512_unpacklo_epi16(result2lo, result2);
69             __m512i accum2_hi       = _mm512_unpackhi_epi16(result2lo, result2);
70 
71             // Filter[3] value
72             // src1= src1>>32;
73             __m512i src3            = _mm512_loadu_si512 ((__m512i *)(src_p1+2));
74             __m512i result3         = _mm512_mulhi_epu16(src3,kernel3);
75             __m512i result3lo       = _mm512_mullo_epi16(src3,kernel3);
76             __m512i accum3_lo       = _mm512_unpacklo_epi16 (result3lo, result3);
77             __m512i accum3_hi       = _mm512_unpackhi_epi16 (result3lo, result3);
78             //filter 4
79             src1      = _mm512_loadu_si512 ((__m512i *)(src_p1+3));
80             result    = _mm512_mulhi_epu16(src1,kernel2);
81             resultlo  = _mm512_mullo_epi16(src1,kernel2);
82 
83             //Filter 5
84             src2      = _mm512_loadu_si512((__m512i *)(src_p1+4));
85             result2   = _mm512_mulhi_epu16(src2,kernel1);
86             result2lo = _mm512_mullo_epi16(src2,kernel1);
87 
88             __m512i accum4_lo =_mm512_unpacklo_epi16(resultlo, result);
89             __m512i accum4_hi =_mm512_unpackhi_epi16(resultlo, result);
90             __m512i accum5_lo =_mm512_unpacklo_epi16(result2lo, result2);
91             __m512i accum5_hi =_mm512_unpackhi_epi16(result2lo, result2);
92 
93             __m512i addnum    = _mm512_set1_epi32(32768);
94             __m512i accum_lo  = _mm512_add_epi32(accum1_lo,accum2_lo);
95             __m512i accumi_lo = _mm512_add_epi32(accum3_lo,accum4_lo);
96                     accum5_lo = _mm512_add_epi32(accum5_lo,addnum);
97                     accum_lo  = _mm512_add_epi32(accum5_lo,accum_lo);
98                     accum_lo  = _mm512_add_epi32(accumi_lo,accum_lo);
99             __m512i accum_hi  = _mm512_add_epi32(accum1_hi,accum2_hi);
100             __m512i accumi_hi = _mm512_add_epi32(accum3_hi,accum4_hi);
101                     accum_hi  = _mm512_add_epi32(accum5_hi,accum_hi);
102                     accumi_hi = _mm512_add_epi32(accumi_hi,addnum);
103                     accum_hi  = _mm512_add_epi32(accumi_hi,accum_hi);
104                     accum_lo  = _mm512_srli_epi32(accum_lo, 0x10);
105                     accum_hi  = _mm512_srli_epi32(accum_hi, 0x10);
106 
107             result = _mm512_packus_epi32(accum_lo,accum_hi);
108             _mm512_storeu_si512((__m512i *) (dst+ i * dst_stride + j*32+ left_edge),result);
109             src_p1+=32;
110         }
111 
112         src_p += src_stride;
113     }
114 
115    for (unsigned i = 0; i < height; ++i) {
116         uint16_t *src_p1 = src_pt;
117         for (unsigned j = nr; j < (right_edge); j++) {
118             uint32_t accum = 0;
119             uint16_t *src_p2 = src_p1;
120             for (unsigned k = 0; k < filter_width; ++k) {
121                 accum += filter[k] * (*src_p2);
122                 src_p2++;
123             }
124             src_p1++;
125             dst[i * dst_stride + j] = (accum + shift_add_round) >> 16;
126         }
127         src_pt += src_stride;
128     }
129 
130     for (unsigned i = 0; i < height; ++i) {
131         for (unsigned j = right_edge; j < width; j++) {
132             dst[i * dst_stride + j] =
133                 (edge_16(true, src, width, height, src_stride, i, j) +
134                  shift_add_round) >> 16;
135         }
136     }
137 }
138