/**
 *
 *  Copyright 2016-2020 Netflix, Inc.
 *
 *     Licensed under the BSD+Patent License (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         https://opensource.org/licenses/BSDplusPatent
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */

#include <immintrin.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "feature/integer_motion.h"
#include "feature/common/alignment.h"

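/*
 * Horizontal 5-tap convolution of a 16-bit plane (AVX2 path).
 * The coefficients used below (3571, 16004, 26386, 16004, 3571) sum to
 * 65536, so each output is a Q16 weighted average: 32-bit products are
 * accumulated, 32768 is added for rounding, and the sum is shifted right
 * by 16. filter, filter_width and edge_16() are expected to come from
 * feature/integer_motion.h.
 */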
void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width,
                           unsigned height, ptrdiff_t src_stride,
                           ptrdiff_t dst_stride)
{
    const unsigned radius = filter_width / 2;
    const unsigned left_edge = vmaf_ceiln(radius, 1);
    const unsigned right_edge = vmaf_floorn(width - (filter_width - radius), 1);
    const unsigned shift_add_round = 32768;
    const unsigned vector_loop = (width >> 4) - 1;
    uint16_t *tmpdst = dst;

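    /* src_p points at the first tap of the first vectorized column; nr is
     * the first column after the vectorized region, and src_pt points at
     * the first tap of column nr for the scalar remainder loop. */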
    uint16_t *src_p = (uint16_t*) src + (left_edge - radius);
    unsigned nr = left_edge + 16 * vector_loop;
    uint16_t *src_pt = (uint16_t*) src + nr - radius;
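
    /* Left border: columns [0, left_edge) are computed scalar, with the
     * border handling delegated to edge_16(). */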
    for (unsigned i = 0; i < height; ++i) {
        for (unsigned j = 0; j < left_edge; j++) {
            dst[i * dst_stride + j] =
                (edge_16(true, src, width, height, src_stride, i, j) +
                 shift_add_round) >> 16;
        }
    }

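    /* Main region: columns [left_edge, left_edge + 16 * vector_loop) are
     * filtered 16 outputs at a time with AVX2. */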
    for (unsigned i = 0; i < height; ++i) {
        uint16_t *src_p1 = src_p;
        for (unsigned j = 0; j < vector_loop; j++) {
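            /* The five taps read pixels at offsets 0..4 from the current
             * column. For each tap, the full 32-bit product of the unsigned
             * 16-bit pixels and the 16-bit coefficient is formed by
             * interleaving the low halves (_mm256_mullo_epi16) with the
             * high halves (_mm256_mulhi_epu16). */

            /* Tap 1: pixel at offset 0, coefficient 3571 */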
            __m256i src1 = _mm256_loadu_si256((__m256i*) src_p1);
            __m256i kernel1 = _mm256_set1_epi16(3571);
            __m256i kernel2 = _mm256_set1_epi16(16004);
            __m256i kernel3 = _mm256_set1_epi16(26386);
            __m256i result = _mm256_mulhi_epu16(src1, kernel1);
            __m256i resultlo = _mm256_mullo_epi16(src1, kernel1);

            /* Tap 2: pixel at offset 1, coefficient 16004 */
            __m256i src2 = _mm256_loadu_si256((__m256i*) (src_p1 + 1));
            __m256i result2 = _mm256_mulhi_epu16(src2, kernel2);
            __m256i result2lo = _mm256_mullo_epi16(src2, kernel2);
            __m256i accum1_lo = _mm256_unpacklo_epi16(resultlo, result);
            __m256i accum1_hi = _mm256_unpackhi_epi16(resultlo, result);
            __m256i accum2_lo = _mm256_unpacklo_epi16(result2lo, result2);
            __m256i accum2_hi = _mm256_unpackhi_epi16(result2lo, result2);

            /* Tap 3 (center): pixel at offset 2, coefficient 26386 */
            __m256i src3 = _mm256_loadu_si256((__m256i*) (src_p1 + 2));
            __m256i result3 = _mm256_mulhi_epu16(src3, kernel3);
            __m256i result3lo = _mm256_mullo_epi16(src3, kernel3);
            __m256i accum3_lo = _mm256_unpacklo_epi16(result3lo, result3);
            __m256i accum3_hi = _mm256_unpackhi_epi16(result3lo, result3);

            /* Tap 4: pixel at offset 3, coefficient 16004 */
            src1 = _mm256_loadu_si256((__m256i*) (src_p1 + 3));
            result = _mm256_mulhi_epu16(src1, kernel2);
            resultlo = _mm256_mullo_epi16(src1, kernel2);

            /* Tap 5: pixel at offset 4, coefficient 3571 */
            src2 = _mm256_loadu_si256((__m256i*) (src_p1 + 4));
            result2 = _mm256_mulhi_epu16(src2, kernel1);
            result2lo = _mm256_mullo_epi16(src2, kernel1);

            __m256i accum4_lo = _mm256_unpacklo_epi16(resultlo, result);
            __m256i accum4_hi = _mm256_unpackhi_epi16(resultlo, result);
            __m256i accum5_lo = _mm256_unpacklo_epi16(result2lo, result2);
            __m256i accum5_hi = _mm256_unpackhi_epi16(result2lo, result2);

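            /* Sum the five taps, add the Q16 rounding constant (32768),
             * shift right by 16, then pack back to unsigned 16-bit with
             * saturation and store 16 outputs. */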
            __m256i addnum = _mm256_set1_epi32(32768);
            __m256i accum_lo = _mm256_add_epi32(accum1_lo, accum2_lo);
            __m256i accumi_lo = _mm256_add_epi32(accum3_lo, accum4_lo);
            accum5_lo = _mm256_add_epi32(accum5_lo, addnum);
            accum_lo = _mm256_add_epi32(accum5_lo, accum_lo);
            accum_lo = _mm256_add_epi32(accumi_lo, accum_lo);
            __m256i accum_hi = _mm256_add_epi32(accum1_hi, accum2_hi);
            __m256i accumi_hi = _mm256_add_epi32(accum3_hi, accum4_hi);
            accum_hi = _mm256_add_epi32(accum5_hi, accum_hi);
            accumi_hi = _mm256_add_epi32(accumi_hi, addnum);
            accum_hi = _mm256_add_epi32(accumi_hi, accum_hi);
            accum_lo = _mm256_srli_epi32(accum_lo, 0x10);
            accum_hi = _mm256_srli_epi32(accum_hi, 0x10);

            result = _mm256_packus_epi32(accum_lo, accum_hi);
            _mm256_storeu_si256(
                (__m256i*) (dst + i * dst_stride + j * 16 + left_edge), result);

            src_p1 += 16;
        }
        src_p += src_stride;
    }

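    /* Remainder: columns [nr, right_edge) that do not fill a whole vector
     * are filtered scalar with the same coefficients and rounding. */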
    for (unsigned i = 0; i < height; ++i) {
        uint16_t *src_p1 = src_pt;
        for (unsigned j = nr; j < right_edge; j++) {
            uint32_t accum = 0;
            uint16_t *src_p2 = src_p1;
            for (unsigned k = 0; k < filter_width; ++k) {
                accum += filter[k] * (*src_p2);
                src_p2++;
            }
            src_p1++;
            dst[i * dst_stride + j] = (accum + shift_add_round) >> 16;
        }
        src_pt += src_stride;
    }

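    /* Right border: columns [right_edge, width), again via edge_16(). */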
    for (unsigned i = 0; i < height; ++i) {
        for (unsigned j = right_edge; j < width; j++) {
            dst[i * dst_stride + j] =
                (edge_16(true, src, width, height, src_stride, i, j) +
                 shift_add_round) >> 16;
        }
    }
}