/**
 *
 * Copyright 2016-2020 Netflix, Inc.
 *
 * Licensed under the BSD+Patent License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://opensource.org/licenses/BSDplusPatent
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#include <immintrin.h>
#include <stdbool.h>
#include <stddef.h>

#include "feature/integer_motion.h"
#include "feature/common/alignment.h"

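/*
 * Horizontal 5-tap convolution of a 16-bit plane (AVX2 path).
 * For every pixel:
 *     dst[i][j] = (sum_k filter[k] * src[i][j - radius + k] + 32768) >> 16
 * where filter[] and filter_width are the Q16 kernel declared in
 * feature/integer_motion.h; the coefficients sum to 65536, so the result
 * fits back into 16 bits after rounding.
 */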
void x_convolution_16_avx2(const uint16_t *src, uint16_t *dst, unsigned width,
                           unsigned height, ptrdiff_t src_stride,
                           ptrdiff_t dst_stride)
{
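    /*
     * radius taps extend past the current pixel on each side; columns in
     * [left_edge, right_edge) have their full filter window inside the row.
     * shift_add_round = 2^15 rounds the Q16 accumulation before the >> 16.
     */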
    const unsigned radius = filter_width / 2;
    const unsigned left_edge = vmaf_ceiln(radius, 1);
    const unsigned right_edge = vmaf_floorn(width - (filter_width - radius), 1);
    const unsigned shift_add_round = 32768;
    const unsigned vector_loop = (width >> 4) - 1;

    uint16_t *src_p = (uint16_t*) src + (left_edge - radius);
    unsigned nr = left_edge + 16 * vector_loop;
    uint16_t *src_pt = (uint16_t*) src + nr - radius;
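    /* Left border: the filter window would read before the start of the
       row, so edge_16() computes the unrounded accumulation with edge
       handling and the rounding/shift is applied here. */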
    for (unsigned i = 0; i < height; ++i) {
        for (unsigned j = 0; j < left_edge; j++) {
            dst[i * dst_stride + j] =
                (edge_16(true, src, width, height, src_stride, i, j) +
                 shift_add_round) >> 16;
        }
    }

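    /* Vectorized middle: each inner iteration filters 16 output pixels
       starting at column left_edge, using unaligned 256-bit loads. */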
    for (unsigned i = 0; i < height; ++i) {
        uint16_t *src_p1 = src_p;
        for (unsigned j = 0; j < vector_loop; j++) {
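            // tap 0: pixels at offset 0 times filter[0] = 3571. Each
            // 16 x 16-bit product is split into its low and high halves
            // (mullo/mulhi) and interleaved below into 32-bit partial sums.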
            __m256i src1 = _mm256_loadu_si256((__m256i*) src_p1);
            __m256i kernel1 = _mm256_set1_epi16(3571);
            __m256i kernel2 = _mm256_set1_epi16(16004);
            __m256i kernel3 = _mm256_set1_epi16(26386);
            __m256i result = _mm256_mulhi_epu16(src1, kernel1);
            __m256i resultlo = _mm256_mullo_epi16(src1, kernel1);

            // tap 1: pixels at offset +1 times filter[1] = 16004
            __m256i src2 = _mm256_loadu_si256((__m256i*) (src_p1 + 1));
            __m256i result2 = _mm256_mulhi_epu16(src2, kernel2);
            __m256i result2lo = _mm256_mullo_epi16(src2, kernel2);
            __m256i accum1_lo = _mm256_unpacklo_epi16(resultlo, result);
            __m256i accum1_hi = _mm256_unpackhi_epi16(resultlo, result);
            __m256i accum2_lo = _mm256_unpacklo_epi16(result2lo, result2);
            __m256i accum2_hi = _mm256_unpackhi_epi16(result2lo, result2);

            // tap 2: pixels at offset +2 times the center coefficient
            // filter[2] = 26386
            __m256i src3 = _mm256_loadu_si256((__m256i*) (src_p1 + 2));
            __m256i result3 = _mm256_mulhi_epu16(src3, kernel3);
            __m256i result3lo = _mm256_mullo_epi16(src3, kernel3);
            __m256i accum3_lo = _mm256_unpacklo_epi16(result3lo, result3);
            __m256i accum3_hi = _mm256_unpackhi_epi16(result3lo, result3);

            // tap 3: pixels at offset +3 times filter[3] = 16004
            src1 = _mm256_loadu_si256((__m256i*) (src_p1 + 3));
            result = _mm256_mulhi_epu16(src1, kernel2);
            resultlo = _mm256_mullo_epi16(src1, kernel2);

            // tap 4: pixels at offset +4 times filter[4] = 3571
            src2 = _mm256_loadu_si256((__m256i*) (src_p1 + 4));
            result2 = _mm256_mulhi_epu16(src2, kernel1);
            result2lo = _mm256_mullo_epi16(src2, kernel1);

            __m256i accum4_lo = _mm256_unpacklo_epi16(resultlo, result);
            __m256i accum4_hi = _mm256_unpackhi_epi16(resultlo, result);
            __m256i accum5_lo = _mm256_unpacklo_epi16(result2lo, result2);
            __m256i accum5_hi = _mm256_unpackhi_epi16(result2lo, result2);

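            // Sum the five 32-bit partial products per lane, add the Q16
            // rounding constant (32768) once, then shift right by 16.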
            __m256i addnum = _mm256_set1_epi32(32768);
            __m256i accum_lo = _mm256_add_epi32(accum1_lo, accum2_lo);
            __m256i accumi_lo = _mm256_add_epi32(accum3_lo, accum4_lo);
            accum5_lo = _mm256_add_epi32(accum5_lo, addnum);
            accum_lo = _mm256_add_epi32(accum5_lo, accum_lo);
            accum_lo = _mm256_add_epi32(accumi_lo, accum_lo);
            __m256i accum_hi = _mm256_add_epi32(accum1_hi, accum2_hi);
            __m256i accumi_hi = _mm256_add_epi32(accum3_hi, accum4_hi);
            accum_hi = _mm256_add_epi32(accum5_hi, accum_hi);
            accumi_hi = _mm256_add_epi32(accumi_hi, addnum);
            accum_hi = _mm256_add_epi32(accumi_hi, accum_hi);
            accum_lo = _mm256_srli_epi32(accum_lo, 0x10);
            accum_hi = _mm256_srli_epi32(accum_hi, 0x10);

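            // Pack the 32-bit results back to 16 bits with unsigned
            // saturation and store 16 output pixels.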
            result = _mm256_packus_epi32(accum_lo, accum_hi);
            _mm256_storeu_si256(
                (__m256i*) (dst + i * dst_stride + j * 16 + left_edge), result);

            src_p1 += 16;
        }
        src_p += src_stride;
    }

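    /* Scalar pass for the columns between the end of the vectorized
       region (nr = left_edge + 16 * vector_loop) and right_edge. */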
    for (unsigned i = 0; i < height; ++i) {
        uint16_t *src_p1 = src_pt;
        for (unsigned j = nr; j < right_edge; j++) {
            uint32_t accum = 0;
            uint16_t *src_p2 = src_p1;
            for (unsigned k = 0; k < filter_width; ++k) {
                accum += filter[k] * (*src_p2);
                src_p2++;
            }
            src_p1++;
            dst[i * dst_stride + j] = (accum + shift_add_round) >> 16;
        }
        src_pt += src_stride;
    }

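    /* Right border: same edge_16() fallback as the left border. */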
    for (unsigned i = 0; i < height; ++i) {
        for (unsigned j = right_edge; j < width; j++) {
            dst[i * dst_stride + j] =
                (edge_16(true, src, width, height, src_stride, i, j) +
                 shift_add_round) >> 16;
        }
    }
}